summaryrefslogtreecommitdiff
path: root/drivers/block
diff options
context:
space:
mode:
authorAndré Fabian Silva Delgado <emulatorman@parabola.nu>2015-08-05 17:04:01 -0300
committerAndré Fabian Silva Delgado <emulatorman@parabola.nu>2015-08-05 17:04:01 -0300
commit57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree5e910f0e82173f4ef4f51111366a3f1299037a7b /drivers/block
Initial import
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/DAC960.c7241
-rw-r--r--drivers/block/DAC960.h4415
-rw-r--r--drivers/block/Kconfig573
-rw-r--r--drivers/block/Makefile50
-rw-r--r--drivers/block/amiflop.c1893
-rw-r--r--drivers/block/aoe/Makefile6
-rw-r--r--drivers/block/aoe/aoe.h240
-rw-r--r--drivers/block/aoe/aoeblk.c464
-rw-r--r--drivers/block/aoe/aoechr.c320
-rw-r--r--drivers/block/aoe/aoecmd.c1826
-rw-r--r--drivers/block/aoe/aoedev.c526
-rw-r--r--drivers/block/aoe/aoemain.c115
-rw-r--r--drivers/block/aoe/aoenet.c223
-rw-r--r--drivers/block/ataflop.c2056
-rw-r--r--drivers/block/brd.c651
-rw-r--r--drivers/block/cciss.c5392
-rw-r--r--drivers/block/cciss.h435
-rw-r--r--drivers/block/cciss_cmd.h269
-rw-r--r--drivers/block/cciss_scsi.c1712
-rw-r--r--drivers/block/cciss_scsi.h79
-rw-r--r--drivers/block/cpqarray.c1820
-rw-r--r--drivers/block/cpqarray.h126
-rw-r--r--drivers/block/cryptoloop.c216
-rw-r--r--drivers/block/drbd/Kconfig73
-rw-r--r--drivers/block/drbd/Makefile8
-rw-r--r--drivers/block/drbd/drbd_actlog.c1219
-rw-r--r--drivers/block/drbd/drbd_bitmap.c1648
-rw-r--r--drivers/block/drbd/drbd_debugfs.c958
-rw-r--r--drivers/block/drbd/drbd_debugfs.h39
-rw-r--r--drivers/block/drbd/drbd_int.h2328
-rw-r--r--drivers/block/drbd/drbd_interval.c179
-rw-r--r--drivers/block/drbd/drbd_interval.h42
-rw-r--r--drivers/block/drbd/drbd_main.c3844
-rw-r--r--drivers/block/drbd/drbd_nl.c3674
-rw-r--r--drivers/block/drbd/drbd_nla.c54
-rw-r--r--drivers/block/drbd/drbd_nla.h8
-rw-r--r--drivers/block/drbd/drbd_proc.c368
-rw-r--r--drivers/block/drbd/drbd_protocol.h307
-rw-r--r--drivers/block/drbd/drbd_receiver.c5661
-rw-r--r--drivers/block/drbd/drbd_req.c1638
-rw-r--r--drivers/block/drbd/drbd_req.h351
-rw-r--r--drivers/block/drbd/drbd_state.c1918
-rw-r--r--drivers/block/drbd/drbd_state.h166
-rw-r--r--drivers/block/drbd/drbd_strings.c118
-rw-r--r--drivers/block/drbd/drbd_strings.h9
-rw-r--r--drivers/block/drbd/drbd_vli.h351
-rw-r--r--drivers/block/drbd/drbd_worker.c2156
-rw-r--r--drivers/block/floppy.c4645
-rw-r--r--drivers/block/hd.c804
-rw-r--r--drivers/block/ida_cmd.h349
-rw-r--r--drivers/block/ida_ioctl.h87
-rw-r--r--drivers/block/loop.c1885
-rw-r--r--drivers/block/loop.h92
-rw-r--r--drivers/block/mg_disk.c1112
-rw-r--r--drivers/block/mtip32xx/Kconfig9
-rw-r--r--drivers/block/mtip32xx/Makefile5
-rw-r--r--drivers/block/mtip32xx/mtip32xx.c4745
-rw-r--r--drivers/block/mtip32xx/mtip32xx.h511
-rw-r--r--drivers/block/nbd.c896
-rw-r--r--drivers/block/null_blk.c674
-rw-r--r--drivers/block/nvme-core.c3178
-rw-r--r--drivers/block/nvme-scsi.c3070
-rw-r--r--drivers/block/osdblk.c699
-rw-r--r--drivers/block/paride/Kconfig300
-rw-r--r--drivers/block/paride/Makefile28
-rw-r--r--drivers/block/paride/Transition-notes128
-rw-r--r--drivers/block/paride/aten.c162
-rw-r--r--drivers/block/paride/bpck.c477
-rw-r--r--drivers/block/paride/bpck6.c267
-rw-r--r--drivers/block/paride/comm.c218
-rw-r--r--drivers/block/paride/dstr.c233
-rw-r--r--drivers/block/paride/epat.c340
-rw-r--r--drivers/block/paride/epia.c316
-rw-r--r--drivers/block/paride/fit2.c151
-rw-r--r--drivers/block/paride/fit3.c211
-rw-r--r--drivers/block/paride/friq.c276
-rw-r--r--drivers/block/paride/frpw.c313
-rw-r--r--drivers/block/paride/kbic.c305
-rw-r--r--drivers/block/paride/ktti.c128
-rw-r--r--drivers/block/paride/mkd30
-rw-r--r--drivers/block/paride/on20.c153
-rw-r--r--drivers/block/paride/on26.c319
-rw-r--r--drivers/block/paride/paride.c434
-rw-r--r--drivers/block/paride/paride.h170
-rw-r--r--drivers/block/paride/pcd.c991
-rw-r--r--drivers/block/paride/pd.c958
-rw-r--r--drivers/block/paride/pf.c1007
-rw-r--r--drivers/block/paride/pg.c726
-rw-r--r--drivers/block/paride/ppc6lnx.c726
-rw-r--r--drivers/block/paride/pseudo.h102
-rw-r--r--drivers/block/paride/pt.c1016
-rw-r--r--drivers/block/pktcdvd.c3029
-rw-r--r--drivers/block/pmem.c262
-rw-r--r--drivers/block/ps3disk.c589
-rw-r--r--drivers/block/ps3vram.c878
-rw-r--r--drivers/block/rbd.c5752
-rw-r--r--drivers/block/rbd_types.h81
-rw-r--r--drivers/block/rsxx/Makefile2
-rw-r--r--drivers/block/rsxx/config.c211
-rw-r--r--drivers/block/rsxx/core.c1144
-rw-r--r--drivers/block/rsxx/cregs.c804
-rw-r--r--drivers/block/rsxx/dev.c340
-rw-r--r--drivers/block/rsxx/dma.c1104
-rw-r--r--drivers/block/rsxx/rsxx.h47
-rw-r--r--drivers/block/rsxx/rsxx_cfg.h72
-rw-r--r--drivers/block/rsxx/rsxx_priv.h434
-rw-r--r--drivers/block/skd_main.c5411
-rw-r--r--drivers/block/skd_s1120.h330
-rw-r--r--drivers/block/smart1,2.h278
-rw-r--r--drivers/block/sunvdc.c1129
-rw-r--r--drivers/block/swim.c994
-rw-r--r--drivers/block/swim3.c1289
-rw-r--r--drivers/block/swim_asm.S247
-rw-r--r--drivers/block/sx8.c1749
-rw-r--r--drivers/block/umem.c1136
-rw-r--r--drivers/block/umem.h133
-rw-r--r--drivers/block/virtio_blk.c901
-rw-r--r--drivers/block/xen-blkback/Makefile3
-rw-r--r--drivers/block/xen-blkback/blkback.c1456
-rw-r--r--drivers/block/xen-blkback/common.h492
-rw-r--r--drivers/block/xen-blkback/xenbus.c934
-rw-r--r--drivers/block/xen-blkfront.c2126
-rw-r--r--drivers/block/xsysace.c1245
-rw-r--r--drivers/block/z2ram.c418
-rw-r--r--drivers/block/zram/Kconfig34
-rw-r--r--drivers/block/zram/Makefile5
-rw-r--r--drivers/block/zram/zcomp.c353
-rw-r--r--drivers/block/zram/zcomp.h68
-rw-r--r--drivers/block/zram/zcomp_lz4.c47
-rw-r--r--drivers/block/zram/zcomp_lz4.h17
-rw-r--r--drivers/block/zram/zcomp_lzo.c47
-rw-r--r--drivers/block/zram/zcomp_lzo.h17
-rw-r--r--drivers/block/zram/zram_drv.c1322
-rw-r--r--drivers/block/zram/zram_drv.h125
134 files changed, 128066 insertions, 0 deletions
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
new file mode 100644
index 000000000..811e11c82
--- /dev/null
+++ b/drivers/block/DAC960.c
@@ -0,0 +1,7241 @@
+/*
+
+ Linux Driver for Mylex DAC960/AcceleRAID/eXtremeRAID PCI RAID Controllers
+
+ Copyright 1998-2001 by Leonard N. Zubkoff <lnz@dandelion.com>
+ Portions Copyright 2002 by Mylex (An IBM Business Unit)
+
+ This program is free software; you may redistribute and/or modify it under
+ the terms of the GNU General Public License Version 2 as published by the
+ Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY, without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for complete details.
+
+*/
+
+
+#define DAC960_DriverVersion "2.5.49"
+#define DAC960_DriverDate "21 Aug 2007"
+
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/miscdevice.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/genhd.h>
+#include <linux/hdreg.h>
+#include <linux/blkpg.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/ioport.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/reboot.h>
+#include <linux/spinlock.h>
+#include <linux/timer.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+#include "DAC960.h"
+
+#define DAC960_GAM_MINOR 252
+
+
+static DEFINE_MUTEX(DAC960_mutex);
+static DAC960_Controller_T *DAC960_Controllers[DAC960_MaxControllers];
+static int DAC960_ControllerCount;
+static struct proc_dir_entry *DAC960_ProcDirectoryEntry;
+
+static long disk_size(DAC960_Controller_T *p, int drive_nr)
+{
+ if (p->FirmwareType == DAC960_V1_Controller) {
+ if (drive_nr >= p->LogicalDriveCount)
+ return 0;
+ return p->V1.LogicalDriveInformation[drive_nr].
+ LogicalDriveSize;
+ } else {
+ DAC960_V2_LogicalDeviceInfo_T *i =
+ p->V2.LogicalDeviceInformation[drive_nr];
+ if (i == NULL)
+ return 0;
+ return i->ConfigurableDeviceSize;
+ }
+}
+
+static int DAC960_open(struct block_device *bdev, fmode_t mode)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ DAC960_Controller_T *p = disk->queue->queuedata;
+ int drive_nr = (long)disk->private_data;
+ int ret = -ENXIO;
+
+ mutex_lock(&DAC960_mutex);
+ if (p->FirmwareType == DAC960_V1_Controller) {
+ if (p->V1.LogicalDriveInformation[drive_nr].
+ LogicalDriveState == DAC960_V1_LogicalDrive_Offline)
+ goto out;
+ } else {
+ DAC960_V2_LogicalDeviceInfo_T *i =
+ p->V2.LogicalDeviceInformation[drive_nr];
+ if (!i || i->LogicalDeviceState == DAC960_V2_LogicalDevice_Offline)
+ goto out;
+ }
+
+ check_disk_change(bdev);
+
+ if (!get_capacity(p->disks[drive_nr]))
+ goto out;
+ ret = 0;
+out:
+ mutex_unlock(&DAC960_mutex);
+ return ret;
+}
+
+static int DAC960_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ DAC960_Controller_T *p = disk->queue->queuedata;
+ int drive_nr = (long)disk->private_data;
+
+ if (p->FirmwareType == DAC960_V1_Controller) {
+ geo->heads = p->V1.GeometryTranslationHeads;
+ geo->sectors = p->V1.GeometryTranslationSectors;
+ geo->cylinders = p->V1.LogicalDriveInformation[drive_nr].
+ LogicalDriveSize / (geo->heads * geo->sectors);
+ } else {
+ DAC960_V2_LogicalDeviceInfo_T *i =
+ p->V2.LogicalDeviceInformation[drive_nr];
+ switch (i->DriveGeometry) {
+ case DAC960_V2_Geometry_128_32:
+ geo->heads = 128;
+ geo->sectors = 32;
+ break;
+ case DAC960_V2_Geometry_255_63:
+ geo->heads = 255;
+ geo->sectors = 63;
+ break;
+ default:
+ DAC960_Error("Illegal Logical Device Geometry %d\n",
+ p, i->DriveGeometry);
+ return -EINVAL;
+ }
+
+ geo->cylinders = i->ConfigurableDeviceSize /
+ (geo->heads * geo->sectors);
+ }
+
+ return 0;
+}
+
+static unsigned int DAC960_check_events(struct gendisk *disk,
+ unsigned int clearing)
+{
+ DAC960_Controller_T *p = disk->queue->queuedata;
+ int drive_nr = (long)disk->private_data;
+
+ if (!p->LogicalDriveInitiallyAccessible[drive_nr])
+ return DISK_EVENT_MEDIA_CHANGE;
+ return 0;
+}
+
+static int DAC960_revalidate_disk(struct gendisk *disk)
+{
+ DAC960_Controller_T *p = disk->queue->queuedata;
+ int unit = (long)disk->private_data;
+
+ set_capacity(disk, disk_size(p, unit));
+ return 0;
+}
+
+static const struct block_device_operations DAC960_BlockDeviceOperations = {
+ .owner = THIS_MODULE,
+ .open = DAC960_open,
+ .getgeo = DAC960_getgeo,
+ .check_events = DAC960_check_events,
+ .revalidate_disk = DAC960_revalidate_disk,
+};
+
+
+/*
+ DAC960_AnnounceDriver announces the Driver Version and Date, Author's Name,
+ Copyright Notice, and Electronic Mail Address.
+*/
+
+static void DAC960_AnnounceDriver(DAC960_Controller_T *Controller)
+{
+ DAC960_Announce("***** DAC960 RAID Driver Version "
+ DAC960_DriverVersion " of "
+ DAC960_DriverDate " *****\n", Controller);
+ DAC960_Announce("Copyright 1998-2001 by Leonard N. Zubkoff "
+ "<lnz@dandelion.com>\n", Controller);
+}
+
+
+/*
+ DAC960_Failure prints a standardized error message, and then returns false.
+*/
+
+static bool DAC960_Failure(DAC960_Controller_T *Controller,
+ unsigned char *ErrorMessage)
+{
+ DAC960_Error("While configuring DAC960 PCI RAID Controller at\n",
+ Controller);
+ if (Controller->IO_Address == 0)
+ DAC960_Error("PCI Bus %d Device %d Function %d I/O Address N/A "
+ "PCI Address 0x%X\n", Controller,
+ Controller->Bus, Controller->Device,
+ Controller->Function, Controller->PCI_Address);
+ else DAC960_Error("PCI Bus %d Device %d Function %d I/O Address "
+ "0x%X PCI Address 0x%X\n", Controller,
+ Controller->Bus, Controller->Device,
+ Controller->Function, Controller->IO_Address,
+ Controller->PCI_Address);
+ DAC960_Error("%s FAILED - DETACHING\n", Controller, ErrorMessage);
+ return false;
+}
+
+/*
+ init_dma_loaf() and slice_dma_loaf() are helper functions for
+ aggregating the dma-mapped memory for a well-known collection of
+ data structures that are of different lengths.
+
+ These routines don't guarantee any alignment. The caller must
+ include any space needed for alignment in the sizes of the structures
+ that are passed in.
+ */
+
+static bool init_dma_loaf(struct pci_dev *dev, struct dma_loaf *loaf,
+ size_t len)
+{
+ void *cpu_addr;
+ dma_addr_t dma_handle;
+
+ cpu_addr = pci_alloc_consistent(dev, len, &dma_handle);
+ if (cpu_addr == NULL)
+ return false;
+
+ loaf->cpu_free = loaf->cpu_base = cpu_addr;
+ loaf->dma_free =loaf->dma_base = dma_handle;
+ loaf->length = len;
+ memset(cpu_addr, 0, len);
+ return true;
+}
+
+static void *slice_dma_loaf(struct dma_loaf *loaf, size_t len,
+ dma_addr_t *dma_handle)
+{
+ void *cpu_end = loaf->cpu_free + len;
+ void *cpu_addr = loaf->cpu_free;
+
+ BUG_ON(cpu_end > loaf->cpu_base + loaf->length);
+ *dma_handle = loaf->dma_free;
+ loaf->cpu_free = cpu_end;
+ loaf->dma_free += len;
+ return cpu_addr;
+}
+
+static void free_dma_loaf(struct pci_dev *dev, struct dma_loaf *loaf_handle)
+{
+ if (loaf_handle->cpu_base != NULL)
+ pci_free_consistent(dev, loaf_handle->length,
+ loaf_handle->cpu_base, loaf_handle->dma_base);
+}
+
+
+/*
+ DAC960_CreateAuxiliaryStructures allocates and initializes the auxiliary
+ data structures for Controller. It returns true on success and false on
+ failure.
+*/
+
+static bool DAC960_CreateAuxiliaryStructures(DAC960_Controller_T *Controller)
+{
+ int CommandAllocationLength, CommandAllocationGroupSize;
+ int CommandsRemaining = 0, CommandIdentifier, CommandGroupByteCount;
+ void *AllocationPointer = NULL;
+ void *ScatterGatherCPU = NULL;
+ dma_addr_t ScatterGatherDMA;
+ struct pci_pool *ScatterGatherPool;
+ void *RequestSenseCPU = NULL;
+ dma_addr_t RequestSenseDMA;
+ struct pci_pool *RequestSensePool = NULL;
+
+ if (Controller->FirmwareType == DAC960_V1_Controller)
+ {
+ CommandAllocationLength = offsetof(DAC960_Command_T, V1.EndMarker);
+ CommandAllocationGroupSize = DAC960_V1_CommandAllocationGroupSize;
+ ScatterGatherPool = pci_pool_create("DAC960_V1_ScatterGather",
+ Controller->PCIDevice,
+ DAC960_V1_ScatterGatherLimit * sizeof(DAC960_V1_ScatterGatherSegment_T),
+ sizeof(DAC960_V1_ScatterGatherSegment_T), 0);
+ if (ScatterGatherPool == NULL)
+ return DAC960_Failure(Controller,
+ "AUXILIARY STRUCTURE CREATION (SG)");
+ Controller->ScatterGatherPool = ScatterGatherPool;
+ }
+ else
+ {
+ CommandAllocationLength = offsetof(DAC960_Command_T, V2.EndMarker);
+ CommandAllocationGroupSize = DAC960_V2_CommandAllocationGroupSize;
+ ScatterGatherPool = pci_pool_create("DAC960_V2_ScatterGather",
+ Controller->PCIDevice,
+ DAC960_V2_ScatterGatherLimit * sizeof(DAC960_V2_ScatterGatherSegment_T),
+ sizeof(DAC960_V2_ScatterGatherSegment_T), 0);
+ if (ScatterGatherPool == NULL)
+ return DAC960_Failure(Controller,
+ "AUXILIARY STRUCTURE CREATION (SG)");
+ RequestSensePool = pci_pool_create("DAC960_V2_RequestSense",
+ Controller->PCIDevice, sizeof(DAC960_SCSI_RequestSense_T),
+ sizeof(int), 0);
+ if (RequestSensePool == NULL) {
+ pci_pool_destroy(ScatterGatherPool);
+ return DAC960_Failure(Controller,
+ "AUXILIARY STRUCTURE CREATION (SG)");
+ }
+ Controller->ScatterGatherPool = ScatterGatherPool;
+ Controller->V2.RequestSensePool = RequestSensePool;
+ }
+ Controller->CommandAllocationGroupSize = CommandAllocationGroupSize;
+ Controller->FreeCommands = NULL;
+ for (CommandIdentifier = 1;
+ CommandIdentifier <= Controller->DriverQueueDepth;
+ CommandIdentifier++)
+ {
+ DAC960_Command_T *Command;
+ if (--CommandsRemaining <= 0)
+ {
+ CommandsRemaining =
+ Controller->DriverQueueDepth - CommandIdentifier + 1;
+ if (CommandsRemaining > CommandAllocationGroupSize)
+ CommandsRemaining = CommandAllocationGroupSize;
+ CommandGroupByteCount =
+ CommandsRemaining * CommandAllocationLength;
+ AllocationPointer = kzalloc(CommandGroupByteCount, GFP_ATOMIC);
+ if (AllocationPointer == NULL)
+ return DAC960_Failure(Controller,
+ "AUXILIARY STRUCTURE CREATION");
+ }
+ Command = (DAC960_Command_T *) AllocationPointer;
+ AllocationPointer += CommandAllocationLength;
+ Command->CommandIdentifier = CommandIdentifier;
+ Command->Controller = Controller;
+ Command->Next = Controller->FreeCommands;
+ Controller->FreeCommands = Command;
+ Controller->Commands[CommandIdentifier-1] = Command;
+ ScatterGatherCPU = pci_pool_alloc(ScatterGatherPool, GFP_ATOMIC,
+ &ScatterGatherDMA);
+ if (ScatterGatherCPU == NULL)
+ return DAC960_Failure(Controller, "AUXILIARY STRUCTURE CREATION");
+
+ if (RequestSensePool != NULL) {
+ RequestSenseCPU = pci_pool_alloc(RequestSensePool, GFP_ATOMIC,
+ &RequestSenseDMA);
+ if (RequestSenseCPU == NULL) {
+ pci_pool_free(ScatterGatherPool, ScatterGatherCPU,
+ ScatterGatherDMA);
+ return DAC960_Failure(Controller,
+ "AUXILIARY STRUCTURE CREATION");
+ }
+ }
+ if (Controller->FirmwareType == DAC960_V1_Controller) {
+ Command->cmd_sglist = Command->V1.ScatterList;
+ Command->V1.ScatterGatherList =
+ (DAC960_V1_ScatterGatherSegment_T *)ScatterGatherCPU;
+ Command->V1.ScatterGatherListDMA = ScatterGatherDMA;
+ sg_init_table(Command->cmd_sglist, DAC960_V1_ScatterGatherLimit);
+ } else {
+ Command->cmd_sglist = Command->V2.ScatterList;
+ Command->V2.ScatterGatherList =
+ (DAC960_V2_ScatterGatherSegment_T *)ScatterGatherCPU;
+ Command->V2.ScatterGatherListDMA = ScatterGatherDMA;
+ Command->V2.RequestSense =
+ (DAC960_SCSI_RequestSense_T *)RequestSenseCPU;
+ Command->V2.RequestSenseDMA = RequestSenseDMA;
+ sg_init_table(Command->cmd_sglist, DAC960_V2_ScatterGatherLimit);
+ }
+ }
+ return true;
+}
+
+
+/*
+ DAC960_DestroyAuxiliaryStructures deallocates the auxiliary data
+ structures for Controller.
+*/
+
+static void DAC960_DestroyAuxiliaryStructures(DAC960_Controller_T *Controller)
+{
+ int i;
+ struct pci_pool *ScatterGatherPool = Controller->ScatterGatherPool;
+ struct pci_pool *RequestSensePool = NULL;
+ void *ScatterGatherCPU;
+ dma_addr_t ScatterGatherDMA;
+ void *RequestSenseCPU;
+ dma_addr_t RequestSenseDMA;
+ DAC960_Command_T *CommandGroup = NULL;
+
+
+ if (Controller->FirmwareType == DAC960_V2_Controller)
+ RequestSensePool = Controller->V2.RequestSensePool;
+
+ Controller->FreeCommands = NULL;
+ for (i = 0; i < Controller->DriverQueueDepth; i++)
+ {
+ DAC960_Command_T *Command = Controller->Commands[i];
+
+ if (Command == NULL)
+ continue;
+
+ if (Controller->FirmwareType == DAC960_V1_Controller) {
+ ScatterGatherCPU = (void *)Command->V1.ScatterGatherList;
+ ScatterGatherDMA = Command->V1.ScatterGatherListDMA;
+ RequestSenseCPU = NULL;
+ RequestSenseDMA = (dma_addr_t)0;
+ } else {
+ ScatterGatherCPU = (void *)Command->V2.ScatterGatherList;
+ ScatterGatherDMA = Command->V2.ScatterGatherListDMA;
+ RequestSenseCPU = (void *)Command->V2.RequestSense;
+ RequestSenseDMA = Command->V2.RequestSenseDMA;
+ }
+ if (ScatterGatherCPU != NULL)
+ pci_pool_free(ScatterGatherPool, ScatterGatherCPU, ScatterGatherDMA);
+ if (RequestSenseCPU != NULL)
+ pci_pool_free(RequestSensePool, RequestSenseCPU, RequestSenseDMA);
+
+ if ((Command->CommandIdentifier
+ % Controller->CommandAllocationGroupSize) == 1) {
+ /*
+ * We can't free the group of commands until all of the
+ * request sense and scatter gather dma structures are free.
+ * Remember the beginning of the group, but don't free it
+ * until we've reached the beginning of the next group.
+ */
+ kfree(CommandGroup);
+ CommandGroup = Command;
+ }
+ Controller->Commands[i] = NULL;
+ }
+ kfree(CommandGroup);
+
+ if (Controller->CombinedStatusBuffer != NULL)
+ {
+ kfree(Controller->CombinedStatusBuffer);
+ Controller->CombinedStatusBuffer = NULL;
+ Controller->CurrentStatusBuffer = NULL;
+ }
+
+ if (ScatterGatherPool != NULL)
+ pci_pool_destroy(ScatterGatherPool);
+ if (Controller->FirmwareType == DAC960_V1_Controller)
+ return;
+
+ if (RequestSensePool != NULL)
+ pci_pool_destroy(RequestSensePool);
+
+ for (i = 0; i < DAC960_MaxLogicalDrives; i++) {
+ kfree(Controller->V2.LogicalDeviceInformation[i]);
+ Controller->V2.LogicalDeviceInformation[i] = NULL;
+ }
+
+ for (i = 0; i < DAC960_V2_MaxPhysicalDevices; i++)
+ {
+ kfree(Controller->V2.PhysicalDeviceInformation[i]);
+ Controller->V2.PhysicalDeviceInformation[i] = NULL;
+ kfree(Controller->V2.InquiryUnitSerialNumber[i]);
+ Controller->V2.InquiryUnitSerialNumber[i] = NULL;
+ }
+}
+
+
+/*
+ DAC960_V1_ClearCommand clears critical fields of Command for DAC960 V1
+ Firmware Controllers.
+*/
+
+static inline void DAC960_V1_ClearCommand(DAC960_Command_T *Command)
+{
+ DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
+ memset(CommandMailbox, 0, sizeof(DAC960_V1_CommandMailbox_T));
+ Command->V1.CommandStatus = 0;
+}
+
+
+/*
+ DAC960_V2_ClearCommand clears critical fields of Command for DAC960 V2
+ Firmware Controllers.
+*/
+
+static inline void DAC960_V2_ClearCommand(DAC960_Command_T *Command)
+{
+ DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
+ memset(CommandMailbox, 0, sizeof(DAC960_V2_CommandMailbox_T));
+ Command->V2.CommandStatus = 0;
+}
+
+
+/*
+ DAC960_AllocateCommand allocates a Command structure from Controller's
+ free list. During driver initialization, a special initialization command
+ has been placed on the free list to guarantee that command allocation can
+ never fail.
+*/
+
+static inline DAC960_Command_T *DAC960_AllocateCommand(DAC960_Controller_T
+ *Controller)
+{
+ DAC960_Command_T *Command = Controller->FreeCommands;
+ if (Command == NULL) return NULL;
+ Controller->FreeCommands = Command->Next;
+ Command->Next = NULL;
+ return Command;
+}
+
+
+/*
+ DAC960_DeallocateCommand deallocates Command, returning it to Controller's
+ free list.
+*/
+
+static inline void DAC960_DeallocateCommand(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+
+ Command->Request = NULL;
+ Command->Next = Controller->FreeCommands;
+ Controller->FreeCommands = Command;
+}
+
+
+/*
+ DAC960_WaitForCommand waits for a wake_up on Controller's Command Wait Queue.
+*/
+
+static void DAC960_WaitForCommand(DAC960_Controller_T *Controller)
+{
+ spin_unlock_irq(&Controller->queue_lock);
+ __wait_event(Controller->CommandWaitQueue, Controller->FreeCommands);
+ spin_lock_irq(&Controller->queue_lock);
+}
+
+/*
+ DAC960_GEM_QueueCommand queues Command for DAC960 GEM Series Controllers.
+*/
+
+static void DAC960_GEM_QueueCommand(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ void __iomem *ControllerBaseAddress = Controller->BaseAddress;
+ DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
+ DAC960_V2_CommandMailbox_T *NextCommandMailbox =
+ Controller->V2.NextCommandMailbox;
+
+ CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier;
+ DAC960_GEM_WriteCommandMailbox(NextCommandMailbox, CommandMailbox);
+
+ if (Controller->V2.PreviousCommandMailbox1->Words[0] == 0 ||
+ Controller->V2.PreviousCommandMailbox2->Words[0] == 0)
+ DAC960_GEM_MemoryMailboxNewCommand(ControllerBaseAddress);
+
+ Controller->V2.PreviousCommandMailbox2 =
+ Controller->V2.PreviousCommandMailbox1;
+ Controller->V2.PreviousCommandMailbox1 = NextCommandMailbox;
+
+ if (++NextCommandMailbox > Controller->V2.LastCommandMailbox)
+ NextCommandMailbox = Controller->V2.FirstCommandMailbox;
+
+ Controller->V2.NextCommandMailbox = NextCommandMailbox;
+}
+
+/*
+ DAC960_BA_QueueCommand queues Command for DAC960 BA Series Controllers.
+*/
+
+static void DAC960_BA_QueueCommand(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ void __iomem *ControllerBaseAddress = Controller->BaseAddress;
+ DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
+ DAC960_V2_CommandMailbox_T *NextCommandMailbox =
+ Controller->V2.NextCommandMailbox;
+ CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier;
+ DAC960_BA_WriteCommandMailbox(NextCommandMailbox, CommandMailbox);
+ if (Controller->V2.PreviousCommandMailbox1->Words[0] == 0 ||
+ Controller->V2.PreviousCommandMailbox2->Words[0] == 0)
+ DAC960_BA_MemoryMailboxNewCommand(ControllerBaseAddress);
+ Controller->V2.PreviousCommandMailbox2 =
+ Controller->V2.PreviousCommandMailbox1;
+ Controller->V2.PreviousCommandMailbox1 = NextCommandMailbox;
+ if (++NextCommandMailbox > Controller->V2.LastCommandMailbox)
+ NextCommandMailbox = Controller->V2.FirstCommandMailbox;
+ Controller->V2.NextCommandMailbox = NextCommandMailbox;
+}
+
+
+/*
+ DAC960_LP_QueueCommand queues Command for DAC960 LP Series Controllers.
+*/
+
+static void DAC960_LP_QueueCommand(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ void __iomem *ControllerBaseAddress = Controller->BaseAddress;
+ DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
+ DAC960_V2_CommandMailbox_T *NextCommandMailbox =
+ Controller->V2.NextCommandMailbox;
+ CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier;
+ DAC960_LP_WriteCommandMailbox(NextCommandMailbox, CommandMailbox);
+ if (Controller->V2.PreviousCommandMailbox1->Words[0] == 0 ||
+ Controller->V2.PreviousCommandMailbox2->Words[0] == 0)
+ DAC960_LP_MemoryMailboxNewCommand(ControllerBaseAddress);
+ Controller->V2.PreviousCommandMailbox2 =
+ Controller->V2.PreviousCommandMailbox1;
+ Controller->V2.PreviousCommandMailbox1 = NextCommandMailbox;
+ if (++NextCommandMailbox > Controller->V2.LastCommandMailbox)
+ NextCommandMailbox = Controller->V2.FirstCommandMailbox;
+ Controller->V2.NextCommandMailbox = NextCommandMailbox;
+}
+
+
+/*
+ DAC960_LA_QueueCommandDualMode queues Command for DAC960 LA Series
+ Controllers with Dual Mode Firmware.
+*/
+
+static void DAC960_LA_QueueCommandDualMode(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ void __iomem *ControllerBaseAddress = Controller->BaseAddress;
+ DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
+ DAC960_V1_CommandMailbox_T *NextCommandMailbox =
+ Controller->V1.NextCommandMailbox;
+ CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier;
+ DAC960_LA_WriteCommandMailbox(NextCommandMailbox, CommandMailbox);
+ if (Controller->V1.PreviousCommandMailbox1->Words[0] == 0 ||
+ Controller->V1.PreviousCommandMailbox2->Words[0] == 0)
+ DAC960_LA_MemoryMailboxNewCommand(ControllerBaseAddress);
+ Controller->V1.PreviousCommandMailbox2 =
+ Controller->V1.PreviousCommandMailbox1;
+ Controller->V1.PreviousCommandMailbox1 = NextCommandMailbox;
+ if (++NextCommandMailbox > Controller->V1.LastCommandMailbox)
+ NextCommandMailbox = Controller->V1.FirstCommandMailbox;
+ Controller->V1.NextCommandMailbox = NextCommandMailbox;
+}
+
+
+/*
+ DAC960_LA_QueueCommandSingleMode queues Command for DAC960 LA Series
+ Controllers with Single Mode Firmware.
+*/
+
+static void DAC960_LA_QueueCommandSingleMode(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ void __iomem *ControllerBaseAddress = Controller->BaseAddress;
+ DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
+ DAC960_V1_CommandMailbox_T *NextCommandMailbox =
+ Controller->V1.NextCommandMailbox;
+ CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier;
+ DAC960_LA_WriteCommandMailbox(NextCommandMailbox, CommandMailbox);
+ if (Controller->V1.PreviousCommandMailbox1->Words[0] == 0 ||
+ Controller->V1.PreviousCommandMailbox2->Words[0] == 0)
+ DAC960_LA_HardwareMailboxNewCommand(ControllerBaseAddress);
+ Controller->V1.PreviousCommandMailbox2 =
+ Controller->V1.PreviousCommandMailbox1;
+ Controller->V1.PreviousCommandMailbox1 = NextCommandMailbox;
+ if (++NextCommandMailbox > Controller->V1.LastCommandMailbox)
+ NextCommandMailbox = Controller->V1.FirstCommandMailbox;
+ Controller->V1.NextCommandMailbox = NextCommandMailbox;
+}
+
+
+/*
+ DAC960_PG_QueueCommandDualMode queues Command for DAC960 PG Series
+ Controllers with Dual Mode Firmware.
+*/
+
+static void DAC960_PG_QueueCommandDualMode(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ void __iomem *ControllerBaseAddress = Controller->BaseAddress;
+ DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
+ DAC960_V1_CommandMailbox_T *NextCommandMailbox =
+ Controller->V1.NextCommandMailbox;
+ CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier;
+ DAC960_PG_WriteCommandMailbox(NextCommandMailbox, CommandMailbox);
+ if (Controller->V1.PreviousCommandMailbox1->Words[0] == 0 ||
+ Controller->V1.PreviousCommandMailbox2->Words[0] == 0)
+ DAC960_PG_MemoryMailboxNewCommand(ControllerBaseAddress);
+ Controller->V1.PreviousCommandMailbox2 =
+ Controller->V1.PreviousCommandMailbox1;
+ Controller->V1.PreviousCommandMailbox1 = NextCommandMailbox;
+ if (++NextCommandMailbox > Controller->V1.LastCommandMailbox)
+ NextCommandMailbox = Controller->V1.FirstCommandMailbox;
+ Controller->V1.NextCommandMailbox = NextCommandMailbox;
+}
+
+
+/*
+ DAC960_PG_QueueCommandSingleMode queues Command for DAC960 PG Series
+ Controllers with Single Mode Firmware.
+*/
+
+static void DAC960_PG_QueueCommandSingleMode(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ void __iomem *ControllerBaseAddress = Controller->BaseAddress;
+ DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
+ DAC960_V1_CommandMailbox_T *NextCommandMailbox =
+ Controller->V1.NextCommandMailbox;
+ CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier;
+ DAC960_PG_WriteCommandMailbox(NextCommandMailbox, CommandMailbox);
+ if (Controller->V1.PreviousCommandMailbox1->Words[0] == 0 ||
+ Controller->V1.PreviousCommandMailbox2->Words[0] == 0)
+ DAC960_PG_HardwareMailboxNewCommand(ControllerBaseAddress);
+ Controller->V1.PreviousCommandMailbox2 =
+ Controller->V1.PreviousCommandMailbox1;
+ Controller->V1.PreviousCommandMailbox1 = NextCommandMailbox;
+ if (++NextCommandMailbox > Controller->V1.LastCommandMailbox)
+ NextCommandMailbox = Controller->V1.FirstCommandMailbox;
+ Controller->V1.NextCommandMailbox = NextCommandMailbox;
+}
+
+
+/*
+ DAC960_PD_QueueCommand queues Command for DAC960 PD Series Controllers.
+*/
+
+static void DAC960_PD_QueueCommand(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ void __iomem *ControllerBaseAddress = Controller->BaseAddress;
+ DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
+ CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier;
+ while (DAC960_PD_MailboxFullP(ControllerBaseAddress))
+ udelay(1);
+ DAC960_PD_WriteCommandMailbox(ControllerBaseAddress, CommandMailbox);
+ DAC960_PD_NewCommand(ControllerBaseAddress);
+}
+
+
+/*
+ DAC960_P_QueueCommand queues Command for DAC960 P Series Controllers.
+*/
+
+static void DAC960_P_QueueCommand(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ void __iomem *ControllerBaseAddress = Controller->BaseAddress;
+ DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
+ CommandMailbox->Common.CommandIdentifier = Command->CommandIdentifier;
+ switch (CommandMailbox->Common.CommandOpcode)
+ {
+ case DAC960_V1_Enquiry:
+ CommandMailbox->Common.CommandOpcode = DAC960_V1_Enquiry_Old;
+ break;
+ case DAC960_V1_GetDeviceState:
+ CommandMailbox->Common.CommandOpcode = DAC960_V1_GetDeviceState_Old;
+ break;
+ case DAC960_V1_Read:
+ CommandMailbox->Common.CommandOpcode = DAC960_V1_Read_Old;
+ DAC960_PD_To_P_TranslateReadWriteCommand(CommandMailbox);
+ break;
+ case DAC960_V1_Write:
+ CommandMailbox->Common.CommandOpcode = DAC960_V1_Write_Old;
+ DAC960_PD_To_P_TranslateReadWriteCommand(CommandMailbox);
+ break;
+ case DAC960_V1_ReadWithScatterGather:
+ CommandMailbox->Common.CommandOpcode =
+ DAC960_V1_ReadWithScatterGather_Old;
+ DAC960_PD_To_P_TranslateReadWriteCommand(CommandMailbox);
+ break;
+ case DAC960_V1_WriteWithScatterGather:
+ CommandMailbox->Common.CommandOpcode =
+ DAC960_V1_WriteWithScatterGather_Old;
+ DAC960_PD_To_P_TranslateReadWriteCommand(CommandMailbox);
+ break;
+ default:
+ break;
+ }
+ while (DAC960_PD_MailboxFullP(ControllerBaseAddress))
+ udelay(1);
+ DAC960_PD_WriteCommandMailbox(ControllerBaseAddress, CommandMailbox);
+ DAC960_PD_NewCommand(ControllerBaseAddress);
+}
+
+
+/*
+ DAC960_ExecuteCommand executes Command and waits for completion.
+*/
+
+static void DAC960_ExecuteCommand(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ DECLARE_COMPLETION_ONSTACK(Completion);
+ unsigned long flags;
+ Command->Completion = &Completion;
+
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ DAC960_QueueCommand(Command);
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+
+ if (in_interrupt())
+ return;
+ wait_for_completion(&Completion);
+}
+
+
+/*
+ DAC960_V1_ExecuteType3 executes a DAC960 V1 Firmware Controller Type 3
+ Command and waits for completion. It returns true on success and false
+ on failure.
+*/
+
+static bool DAC960_V1_ExecuteType3(DAC960_Controller_T *Controller,
+ DAC960_V1_CommandOpcode_T CommandOpcode,
+ dma_addr_t DataDMA)
+{
+ DAC960_Command_T *Command = DAC960_AllocateCommand(Controller);
+ DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
+ DAC960_V1_CommandStatus_T CommandStatus;
+ DAC960_V1_ClearCommand(Command);
+ Command->CommandType = DAC960_ImmediateCommand;
+ CommandMailbox->Type3.CommandOpcode = CommandOpcode;
+ CommandMailbox->Type3.BusAddress = DataDMA;
+ DAC960_ExecuteCommand(Command);
+ CommandStatus = Command->V1.CommandStatus;
+ DAC960_DeallocateCommand(Command);
+ return (CommandStatus == DAC960_V1_NormalCompletion);
+}
+
+
+/*
+ DAC960_V1_ExecuteTypeB executes a DAC960 V1 Firmware Controller Type 3B
+ Command and waits for completion. It returns true on success and false
+ on failure.
+*/
+
+static bool DAC960_V1_ExecuteType3B(DAC960_Controller_T *Controller,
+ DAC960_V1_CommandOpcode_T CommandOpcode,
+ unsigned char CommandOpcode2,
+ dma_addr_t DataDMA)
+{
+ DAC960_Command_T *Command = DAC960_AllocateCommand(Controller);
+ DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
+ DAC960_V1_CommandStatus_T CommandStatus;
+ DAC960_V1_ClearCommand(Command);
+ Command->CommandType = DAC960_ImmediateCommand;
+ CommandMailbox->Type3B.CommandOpcode = CommandOpcode;
+ CommandMailbox->Type3B.CommandOpcode2 = CommandOpcode2;
+ CommandMailbox->Type3B.BusAddress = DataDMA;
+ DAC960_ExecuteCommand(Command);
+ CommandStatus = Command->V1.CommandStatus;
+ DAC960_DeallocateCommand(Command);
+ return (CommandStatus == DAC960_V1_NormalCompletion);
+}
+
+
+/*
+ DAC960_V1_ExecuteType3D executes a DAC960 V1 Firmware Controller Type 3D
+ Command and waits for completion. It returns true on success and false
+ on failure.
+*/
+
+static bool DAC960_V1_ExecuteType3D(DAC960_Controller_T *Controller,
+ DAC960_V1_CommandOpcode_T CommandOpcode,
+ unsigned char Channel,
+ unsigned char TargetID,
+ dma_addr_t DataDMA)
+{
+ DAC960_Command_T *Command = DAC960_AllocateCommand(Controller);
+ DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
+ DAC960_V1_CommandStatus_T CommandStatus;
+ DAC960_V1_ClearCommand(Command);
+ Command->CommandType = DAC960_ImmediateCommand;
+ CommandMailbox->Type3D.CommandOpcode = CommandOpcode;
+ CommandMailbox->Type3D.Channel = Channel;
+ CommandMailbox->Type3D.TargetID = TargetID;
+ CommandMailbox->Type3D.BusAddress = DataDMA;
+ DAC960_ExecuteCommand(Command);
+ CommandStatus = Command->V1.CommandStatus;
+ DAC960_DeallocateCommand(Command);
+ return (CommandStatus == DAC960_V1_NormalCompletion);
+}
+
+
+/*
+ DAC960_V2_GeneralInfo executes a DAC960 V2 Firmware General Information
+ Reading IOCTL Command and waits for completion. It returns true on success
+ and false on failure.
+
+ Return data in The controller's HealthStatusBuffer, which is dma-able memory
+*/
+
+static bool DAC960_V2_GeneralInfo(DAC960_Controller_T *Controller)
+{
+ DAC960_Command_T *Command = DAC960_AllocateCommand(Controller);
+ DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
+ DAC960_V2_CommandStatus_T CommandStatus;
+ DAC960_V2_ClearCommand(Command);
+ Command->CommandType = DAC960_ImmediateCommand;
+ CommandMailbox->Common.CommandOpcode = DAC960_V2_IOCTL;
+ CommandMailbox->Common.CommandControlBits
+ .DataTransferControllerToHost = true;
+ CommandMailbox->Common.CommandControlBits
+ .NoAutoRequestSense = true;
+ CommandMailbox->Common.DataTransferSize = sizeof(DAC960_V2_HealthStatusBuffer_T);
+ CommandMailbox->Common.IOCTL_Opcode = DAC960_V2_GetHealthStatus;
+ CommandMailbox->Common.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentDataPointer =
+ Controller->V2.HealthStatusBufferDMA;
+ CommandMailbox->Common.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentByteCount =
+ CommandMailbox->Common.DataTransferSize;
+ DAC960_ExecuteCommand(Command);
+ CommandStatus = Command->V2.CommandStatus;
+ DAC960_DeallocateCommand(Command);
+ return (CommandStatus == DAC960_V2_NormalCompletion);
+}
+
+
+/*
+ DAC960_V2_ControllerInfo executes a DAC960 V2 Firmware Controller
+ Information Reading IOCTL Command and waits for completion. It returns
+ true on success and false on failure.
+
+ Data is returned in the controller's V2.NewControllerInformation dma-able
+ memory buffer.
+*/
+
+static bool DAC960_V2_NewControllerInfo(DAC960_Controller_T *Controller)
+{
+ DAC960_Command_T *Command = DAC960_AllocateCommand(Controller);
+ DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
+ DAC960_V2_CommandStatus_T CommandStatus;
+ DAC960_V2_ClearCommand(Command);
+ Command->CommandType = DAC960_ImmediateCommand;
+ CommandMailbox->ControllerInfo.CommandOpcode = DAC960_V2_IOCTL;
+ CommandMailbox->ControllerInfo.CommandControlBits
+ .DataTransferControllerToHost = true;
+ CommandMailbox->ControllerInfo.CommandControlBits
+ .NoAutoRequestSense = true;
+ CommandMailbox->ControllerInfo.DataTransferSize = sizeof(DAC960_V2_ControllerInfo_T);
+ CommandMailbox->ControllerInfo.ControllerNumber = 0;
+ CommandMailbox->ControllerInfo.IOCTL_Opcode = DAC960_V2_GetControllerInfo;
+ CommandMailbox->ControllerInfo.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentDataPointer =
+ Controller->V2.NewControllerInformationDMA;
+ CommandMailbox->ControllerInfo.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentByteCount =
+ CommandMailbox->ControllerInfo.DataTransferSize;
+ DAC960_ExecuteCommand(Command);
+ CommandStatus = Command->V2.CommandStatus;
+ DAC960_DeallocateCommand(Command);
+ return (CommandStatus == DAC960_V2_NormalCompletion);
+}
+
+
+/*
+ DAC960_V2_LogicalDeviceInfo executes a DAC960 V2 Firmware Controller Logical
+ Device Information Reading IOCTL Command and waits for completion. It
+ returns true on success and false on failure.
+
+ Data is returned in the controller's V2.NewLogicalDeviceInformation
+*/
+
+static bool DAC960_V2_NewLogicalDeviceInfo(DAC960_Controller_T *Controller,
+ unsigned short LogicalDeviceNumber)
+{
+ DAC960_Command_T *Command = DAC960_AllocateCommand(Controller);
+ DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
+ DAC960_V2_CommandStatus_T CommandStatus;
+
+ DAC960_V2_ClearCommand(Command);
+ Command->CommandType = DAC960_ImmediateCommand;
+ CommandMailbox->LogicalDeviceInfo.CommandOpcode =
+ DAC960_V2_IOCTL;
+ CommandMailbox->LogicalDeviceInfo.CommandControlBits
+ .DataTransferControllerToHost = true;
+ CommandMailbox->LogicalDeviceInfo.CommandControlBits
+ .NoAutoRequestSense = true;
+ CommandMailbox->LogicalDeviceInfo.DataTransferSize =
+ sizeof(DAC960_V2_LogicalDeviceInfo_T);
+ CommandMailbox->LogicalDeviceInfo.LogicalDevice.LogicalDeviceNumber =
+ LogicalDeviceNumber;
+ CommandMailbox->LogicalDeviceInfo.IOCTL_Opcode = DAC960_V2_GetLogicalDeviceInfoValid;
+ CommandMailbox->LogicalDeviceInfo.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentDataPointer =
+ Controller->V2.NewLogicalDeviceInformationDMA;
+ CommandMailbox->LogicalDeviceInfo.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentByteCount =
+ CommandMailbox->LogicalDeviceInfo.DataTransferSize;
+ DAC960_ExecuteCommand(Command);
+ CommandStatus = Command->V2.CommandStatus;
+ DAC960_DeallocateCommand(Command);
+ return (CommandStatus == DAC960_V2_NormalCompletion);
+}
+
+
+/*
+ DAC960_V2_PhysicalDeviceInfo executes a DAC960 V2 Firmware Controller "Read
+ Physical Device Information" IOCTL Command and waits for completion. It
+ returns true on success and false on failure.
+
+ The Channel, TargetID, LogicalUnit arguments should be 0 the first time
+ this function is called for a given controller. This will return data
+ for the "first" device on that controller. The returned data includes a
+ Channel, TargetID, LogicalUnit that can be passed in to this routine to
+ get data for the NEXT device on that controller.
+
+ Data is stored in the controller's V2.NewPhysicalDeviceInfo dma-able
+ memory buffer.
+
+*/
+
+static bool DAC960_V2_NewPhysicalDeviceInfo(DAC960_Controller_T *Controller,
+ unsigned char Channel,
+ unsigned char TargetID,
+ unsigned char LogicalUnit)
+{
+ DAC960_Command_T *Command = DAC960_AllocateCommand(Controller);
+ DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
+ DAC960_V2_CommandStatus_T CommandStatus;
+
+ DAC960_V2_ClearCommand(Command);
+ Command->CommandType = DAC960_ImmediateCommand;
+ CommandMailbox->PhysicalDeviceInfo.CommandOpcode = DAC960_V2_IOCTL;
+ CommandMailbox->PhysicalDeviceInfo.CommandControlBits
+ .DataTransferControllerToHost = true;
+ CommandMailbox->PhysicalDeviceInfo.CommandControlBits
+ .NoAutoRequestSense = true;
+ CommandMailbox->PhysicalDeviceInfo.DataTransferSize =
+ sizeof(DAC960_V2_PhysicalDeviceInfo_T);
+ CommandMailbox->PhysicalDeviceInfo.PhysicalDevice.LogicalUnit = LogicalUnit;
+ CommandMailbox->PhysicalDeviceInfo.PhysicalDevice.TargetID = TargetID;
+ CommandMailbox->PhysicalDeviceInfo.PhysicalDevice.Channel = Channel;
+ CommandMailbox->PhysicalDeviceInfo.IOCTL_Opcode =
+ DAC960_V2_GetPhysicalDeviceInfoValid;
+ CommandMailbox->PhysicalDeviceInfo.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentDataPointer =
+ Controller->V2.NewPhysicalDeviceInformationDMA;
+ CommandMailbox->PhysicalDeviceInfo.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentByteCount =
+ CommandMailbox->PhysicalDeviceInfo.DataTransferSize;
+ DAC960_ExecuteCommand(Command);
+ CommandStatus = Command->V2.CommandStatus;
+ DAC960_DeallocateCommand(Command);
+ return (CommandStatus == DAC960_V2_NormalCompletion);
+}
+
+
+static void DAC960_V2_ConstructNewUnitSerialNumber(
+ DAC960_Controller_T *Controller,
+ DAC960_V2_CommandMailbox_T *CommandMailbox, int Channel, int TargetID,
+ int LogicalUnit)
+{
+ CommandMailbox->SCSI_10.CommandOpcode = DAC960_V2_SCSI_10_Passthru;
+ CommandMailbox->SCSI_10.CommandControlBits
+ .DataTransferControllerToHost = true;
+ CommandMailbox->SCSI_10.CommandControlBits
+ .NoAutoRequestSense = true;
+ CommandMailbox->SCSI_10.DataTransferSize =
+ sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T);
+ CommandMailbox->SCSI_10.PhysicalDevice.LogicalUnit = LogicalUnit;
+ CommandMailbox->SCSI_10.PhysicalDevice.TargetID = TargetID;
+ CommandMailbox->SCSI_10.PhysicalDevice.Channel = Channel;
+ CommandMailbox->SCSI_10.CDBLength = 6;
+ CommandMailbox->SCSI_10.SCSI_CDB[0] = 0x12; /* INQUIRY */
+ CommandMailbox->SCSI_10.SCSI_CDB[1] = 1; /* EVPD = 1 */
+ CommandMailbox->SCSI_10.SCSI_CDB[2] = 0x80; /* Page Code */
+ CommandMailbox->SCSI_10.SCSI_CDB[3] = 0; /* Reserved */
+ CommandMailbox->SCSI_10.SCSI_CDB[4] =
+ sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T);
+ CommandMailbox->SCSI_10.SCSI_CDB[5] = 0; /* Control */
+ CommandMailbox->SCSI_10.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentDataPointer =
+ Controller->V2.NewInquiryUnitSerialNumberDMA;
+ CommandMailbox->SCSI_10.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentByteCount =
+ CommandMailbox->SCSI_10.DataTransferSize;
+}
+
+
+/*
+ DAC960_V2_NewUnitSerialNumber executes an SCSI pass-through
+ Inquiry command to a SCSI device identified by Channel number,
+ Target id, Logical Unit Number. This function Waits for completion
+ of the command.
+
+ The return data includes Unit Serial Number information for the
+ specified device.
+
+ Data is stored in the controller's V2.NewPhysicalDeviceInfo dma-able
+ memory buffer.
+*/
+
+static bool DAC960_V2_NewInquiryUnitSerialNumber(DAC960_Controller_T *Controller,
+ int Channel, int TargetID, int LogicalUnit)
+{
+ DAC960_Command_T *Command;
+ DAC960_V2_CommandMailbox_T *CommandMailbox;
+ DAC960_V2_CommandStatus_T CommandStatus;
+
+ Command = DAC960_AllocateCommand(Controller);
+ CommandMailbox = &Command->V2.CommandMailbox;
+ DAC960_V2_ClearCommand(Command);
+ Command->CommandType = DAC960_ImmediateCommand;
+
+ DAC960_V2_ConstructNewUnitSerialNumber(Controller, CommandMailbox,
+ Channel, TargetID, LogicalUnit);
+
+ DAC960_ExecuteCommand(Command);
+ CommandStatus = Command->V2.CommandStatus;
+ DAC960_DeallocateCommand(Command);
+ return (CommandStatus == DAC960_V2_NormalCompletion);
+}
+
+
+/*
+ DAC960_V2_DeviceOperation executes a DAC960 V2 Firmware Controller Device
+ Operation IOCTL Command and waits for completion. It returns true on
+ success and false on failure.
+*/
+
+static bool DAC960_V2_DeviceOperation(DAC960_Controller_T *Controller,
+ DAC960_V2_IOCTL_Opcode_T IOCTL_Opcode,
+ DAC960_V2_OperationDevice_T
+ OperationDevice)
+{
+ DAC960_Command_T *Command = DAC960_AllocateCommand(Controller);
+ DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
+ DAC960_V2_CommandStatus_T CommandStatus;
+ DAC960_V2_ClearCommand(Command);
+ Command->CommandType = DAC960_ImmediateCommand;
+ CommandMailbox->DeviceOperation.CommandOpcode = DAC960_V2_IOCTL;
+ CommandMailbox->DeviceOperation.CommandControlBits
+ .DataTransferControllerToHost = true;
+ CommandMailbox->DeviceOperation.CommandControlBits
+ .NoAutoRequestSense = true;
+ CommandMailbox->DeviceOperation.IOCTL_Opcode = IOCTL_Opcode;
+ CommandMailbox->DeviceOperation.OperationDevice = OperationDevice;
+ DAC960_ExecuteCommand(Command);
+ CommandStatus = Command->V2.CommandStatus;
+ DAC960_DeallocateCommand(Command);
+ return (CommandStatus == DAC960_V2_NormalCompletion);
+}
+
+
+/*
+ DAC960_V1_EnableMemoryMailboxInterface enables the Memory Mailbox Interface
+ for DAC960 V1 Firmware Controllers.
+
+ PD and P controller types have no memory mailbox, but still need the
+ other dma mapped memory.
+*/
+
+static bool DAC960_V1_EnableMemoryMailboxInterface(DAC960_Controller_T
+ *Controller)
+{
+ void __iomem *ControllerBaseAddress = Controller->BaseAddress;
+ DAC960_HardwareType_T hw_type = Controller->HardwareType;
+ struct pci_dev *PCI_Device = Controller->PCIDevice;
+ struct dma_loaf *DmaPages = &Controller->DmaPages;
+ size_t DmaPagesSize;
+ size_t CommandMailboxesSize;
+ size_t StatusMailboxesSize;
+
+ DAC960_V1_CommandMailbox_T *CommandMailboxesMemory;
+ dma_addr_t CommandMailboxesMemoryDMA;
+
+ DAC960_V1_StatusMailbox_T *StatusMailboxesMemory;
+ dma_addr_t StatusMailboxesMemoryDMA;
+
+ DAC960_V1_CommandMailbox_T CommandMailbox;
+ DAC960_V1_CommandStatus_T CommandStatus;
+ int TimeoutCounter;
+ int i;
+
+ memset(&CommandMailbox, 0, sizeof(DAC960_V1_CommandMailbox_T));
+
+ if (pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32)))
+ return DAC960_Failure(Controller, "DMA mask out of range");
+ Controller->BounceBufferLimit = DMA_BIT_MASK(32);
+
+ if ((hw_type == DAC960_PD_Controller) || (hw_type == DAC960_P_Controller)) {
+ CommandMailboxesSize = 0;
+ StatusMailboxesSize = 0;
+ } else {
+ CommandMailboxesSize = DAC960_V1_CommandMailboxCount * sizeof(DAC960_V1_CommandMailbox_T);
+ StatusMailboxesSize = DAC960_V1_StatusMailboxCount * sizeof(DAC960_V1_StatusMailbox_T);
+ }
+ DmaPagesSize = CommandMailboxesSize + StatusMailboxesSize +
+ sizeof(DAC960_V1_DCDB_T) + sizeof(DAC960_V1_Enquiry_T) +
+ sizeof(DAC960_V1_ErrorTable_T) + sizeof(DAC960_V1_EventLogEntry_T) +
+ sizeof(DAC960_V1_RebuildProgress_T) +
+ sizeof(DAC960_V1_LogicalDriveInformationArray_T) +
+ sizeof(DAC960_V1_BackgroundInitializationStatus_T) +
+ sizeof(DAC960_V1_DeviceState_T) + sizeof(DAC960_SCSI_Inquiry_T) +
+ sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T);
+
+ if (!init_dma_loaf(PCI_Device, DmaPages, DmaPagesSize))
+ return false;
+
+
+ if ((hw_type == DAC960_PD_Controller) || (hw_type == DAC960_P_Controller))
+ goto skip_mailboxes;
+
+ CommandMailboxesMemory = slice_dma_loaf(DmaPages,
+ CommandMailboxesSize, &CommandMailboxesMemoryDMA);
+
+ /* These are the base addresses for the command memory mailbox array */
+ Controller->V1.FirstCommandMailbox = CommandMailboxesMemory;
+ Controller->V1.FirstCommandMailboxDMA = CommandMailboxesMemoryDMA;
+
+ CommandMailboxesMemory += DAC960_V1_CommandMailboxCount - 1;
+ Controller->V1.LastCommandMailbox = CommandMailboxesMemory;
+ Controller->V1.NextCommandMailbox = Controller->V1.FirstCommandMailbox;
+ Controller->V1.PreviousCommandMailbox1 = Controller->V1.LastCommandMailbox;
+ Controller->V1.PreviousCommandMailbox2 =
+ Controller->V1.LastCommandMailbox - 1;
+
+ /* These are the base addresses for the status memory mailbox array */
+ StatusMailboxesMemory = slice_dma_loaf(DmaPages,
+ StatusMailboxesSize, &StatusMailboxesMemoryDMA);
+
+ Controller->V1.FirstStatusMailbox = StatusMailboxesMemory;
+ Controller->V1.FirstStatusMailboxDMA = StatusMailboxesMemoryDMA;
+ StatusMailboxesMemory += DAC960_V1_StatusMailboxCount - 1;
+ Controller->V1.LastStatusMailbox = StatusMailboxesMemory;
+ Controller->V1.NextStatusMailbox = Controller->V1.FirstStatusMailbox;
+
+skip_mailboxes:
+ Controller->V1.MonitoringDCDB = slice_dma_loaf(DmaPages,
+ sizeof(DAC960_V1_DCDB_T),
+ &Controller->V1.MonitoringDCDB_DMA);
+
+ Controller->V1.NewEnquiry = slice_dma_loaf(DmaPages,
+ sizeof(DAC960_V1_Enquiry_T),
+ &Controller->V1.NewEnquiryDMA);
+
+ Controller->V1.NewErrorTable = slice_dma_loaf(DmaPages,
+ sizeof(DAC960_V1_ErrorTable_T),
+ &Controller->V1.NewErrorTableDMA);
+
+ Controller->V1.EventLogEntry = slice_dma_loaf(DmaPages,
+ sizeof(DAC960_V1_EventLogEntry_T),
+ &Controller->V1.EventLogEntryDMA);
+
+ Controller->V1.RebuildProgress = slice_dma_loaf(DmaPages,
+ sizeof(DAC960_V1_RebuildProgress_T),
+ &Controller->V1.RebuildProgressDMA);
+
+ Controller->V1.NewLogicalDriveInformation = slice_dma_loaf(DmaPages,
+ sizeof(DAC960_V1_LogicalDriveInformationArray_T),
+ &Controller->V1.NewLogicalDriveInformationDMA);
+
+ Controller->V1.BackgroundInitializationStatus = slice_dma_loaf(DmaPages,
+ sizeof(DAC960_V1_BackgroundInitializationStatus_T),
+ &Controller->V1.BackgroundInitializationStatusDMA);
+
+ Controller->V1.NewDeviceState = slice_dma_loaf(DmaPages,
+ sizeof(DAC960_V1_DeviceState_T),
+ &Controller->V1.NewDeviceStateDMA);
+
+ Controller->V1.NewInquiryStandardData = slice_dma_loaf(DmaPages,
+ sizeof(DAC960_SCSI_Inquiry_T),
+ &Controller->V1.NewInquiryStandardDataDMA);
+
+ Controller->V1.NewInquiryUnitSerialNumber = slice_dma_loaf(DmaPages,
+ sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T),
+ &Controller->V1.NewInquiryUnitSerialNumberDMA);
+
+ if ((hw_type == DAC960_PD_Controller) || (hw_type == DAC960_P_Controller))
+ return true;
+
+ /* Enable the Memory Mailbox Interface. */
+ Controller->V1.DualModeMemoryMailboxInterface = true;
+ CommandMailbox.TypeX.CommandOpcode = 0x2B;
+ CommandMailbox.TypeX.CommandIdentifier = 0;
+ CommandMailbox.TypeX.CommandOpcode2 = 0x14;
+ CommandMailbox.TypeX.CommandMailboxesBusAddress =
+ Controller->V1.FirstCommandMailboxDMA;
+ CommandMailbox.TypeX.StatusMailboxesBusAddress =
+ Controller->V1.FirstStatusMailboxDMA;
+#define TIMEOUT_COUNT 1000000
+
+ for (i = 0; i < 2; i++)
+ switch (Controller->HardwareType)
+ {
+ case DAC960_LA_Controller:
+ TimeoutCounter = TIMEOUT_COUNT;
+ while (--TimeoutCounter >= 0)
+ {
+ if (!DAC960_LA_HardwareMailboxFullP(ControllerBaseAddress))
+ break;
+ udelay(10);
+ }
+ if (TimeoutCounter < 0) return false;
+ DAC960_LA_WriteHardwareMailbox(ControllerBaseAddress, &CommandMailbox);
+ DAC960_LA_HardwareMailboxNewCommand(ControllerBaseAddress);
+ TimeoutCounter = TIMEOUT_COUNT;
+ while (--TimeoutCounter >= 0)
+ {
+ if (DAC960_LA_HardwareMailboxStatusAvailableP(
+ ControllerBaseAddress))
+ break;
+ udelay(10);
+ }
+ if (TimeoutCounter < 0) return false;
+ CommandStatus = DAC960_LA_ReadStatusRegister(ControllerBaseAddress);
+ DAC960_LA_AcknowledgeHardwareMailboxInterrupt(ControllerBaseAddress);
+ DAC960_LA_AcknowledgeHardwareMailboxStatus(ControllerBaseAddress);
+ if (CommandStatus == DAC960_V1_NormalCompletion) return true;
+ Controller->V1.DualModeMemoryMailboxInterface = false;
+ CommandMailbox.TypeX.CommandOpcode2 = 0x10;
+ break;
+ case DAC960_PG_Controller:
+ TimeoutCounter = TIMEOUT_COUNT;
+ while (--TimeoutCounter >= 0)
+ {
+ if (!DAC960_PG_HardwareMailboxFullP(ControllerBaseAddress))
+ break;
+ udelay(10);
+ }
+ if (TimeoutCounter < 0) return false;
+ DAC960_PG_WriteHardwareMailbox(ControllerBaseAddress, &CommandMailbox);
+ DAC960_PG_HardwareMailboxNewCommand(ControllerBaseAddress);
+
+ TimeoutCounter = TIMEOUT_COUNT;
+ while (--TimeoutCounter >= 0)
+ {
+ if (DAC960_PG_HardwareMailboxStatusAvailableP(
+ ControllerBaseAddress))
+ break;
+ udelay(10);
+ }
+ if (TimeoutCounter < 0) return false;
+ CommandStatus = DAC960_PG_ReadStatusRegister(ControllerBaseAddress);
+ DAC960_PG_AcknowledgeHardwareMailboxInterrupt(ControllerBaseAddress);
+ DAC960_PG_AcknowledgeHardwareMailboxStatus(ControllerBaseAddress);
+ if (CommandStatus == DAC960_V1_NormalCompletion) return true;
+ Controller->V1.DualModeMemoryMailboxInterface = false;
+ CommandMailbox.TypeX.CommandOpcode2 = 0x10;
+ break;
+ default:
+ DAC960_Failure(Controller, "Unknown Controller Type\n");
+ break;
+ }
+ return false;
+}
+
+
+/*
+ DAC960_V2_EnableMemoryMailboxInterface enables the Memory Mailbox Interface
+ for DAC960 V2 Firmware Controllers.
+
+ Aggregate the space needed for the controller's memory mailbox and
+ the other data structures that will be targets of dma transfers with
+ the controller. Allocate a dma-mapped region of memory to hold these
+ structures. Then, save CPU pointers and dma_addr_t values to reference
+ the structures that are contained in that region.
+*/
+
+static bool DAC960_V2_EnableMemoryMailboxInterface(DAC960_Controller_T
+ *Controller)
+{
+ void __iomem *ControllerBaseAddress = Controller->BaseAddress;
+ struct pci_dev *PCI_Device = Controller->PCIDevice;
+ struct dma_loaf *DmaPages = &Controller->DmaPages;
+ size_t DmaPagesSize;
+ size_t CommandMailboxesSize;
+ size_t StatusMailboxesSize;
+
+ DAC960_V2_CommandMailbox_T *CommandMailboxesMemory;
+ dma_addr_t CommandMailboxesMemoryDMA;
+
+ DAC960_V2_StatusMailbox_T *StatusMailboxesMemory;
+ dma_addr_t StatusMailboxesMemoryDMA;
+
+ DAC960_V2_CommandMailbox_T *CommandMailbox;
+ dma_addr_t CommandMailboxDMA;
+ DAC960_V2_CommandStatus_T CommandStatus;
+
+ if (!pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(64)))
+ Controller->BounceBufferLimit = DMA_BIT_MASK(64);
+ else if (!pci_set_dma_mask(Controller->PCIDevice, DMA_BIT_MASK(32)))
+ Controller->BounceBufferLimit = DMA_BIT_MASK(32);
+ else
+ return DAC960_Failure(Controller, "DMA mask out of range");
+
+ /* This is a temporary dma mapping, used only in the scope of this function */
+ CommandMailbox = pci_alloc_consistent(PCI_Device,
+ sizeof(DAC960_V2_CommandMailbox_T), &CommandMailboxDMA);
+ if (CommandMailbox == NULL)
+ return false;
+
+ CommandMailboxesSize = DAC960_V2_CommandMailboxCount * sizeof(DAC960_V2_CommandMailbox_T);
+ StatusMailboxesSize = DAC960_V2_StatusMailboxCount * sizeof(DAC960_V2_StatusMailbox_T);
+ DmaPagesSize =
+ CommandMailboxesSize + StatusMailboxesSize +
+ sizeof(DAC960_V2_HealthStatusBuffer_T) +
+ sizeof(DAC960_V2_ControllerInfo_T) +
+ sizeof(DAC960_V2_LogicalDeviceInfo_T) +
+ sizeof(DAC960_V2_PhysicalDeviceInfo_T) +
+ sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T) +
+ sizeof(DAC960_V2_Event_T) +
+ sizeof(DAC960_V2_PhysicalToLogicalDevice_T);
+
+ if (!init_dma_loaf(PCI_Device, DmaPages, DmaPagesSize)) {
+ pci_free_consistent(PCI_Device, sizeof(DAC960_V2_CommandMailbox_T),
+ CommandMailbox, CommandMailboxDMA);
+ return false;
+ }
+
+ CommandMailboxesMemory = slice_dma_loaf(DmaPages,
+ CommandMailboxesSize, &CommandMailboxesMemoryDMA);
+
+ /* These are the base addresses for the command memory mailbox array */
+ Controller->V2.FirstCommandMailbox = CommandMailboxesMemory;
+ Controller->V2.FirstCommandMailboxDMA = CommandMailboxesMemoryDMA;
+
+ CommandMailboxesMemory += DAC960_V2_CommandMailboxCount - 1;
+ Controller->V2.LastCommandMailbox = CommandMailboxesMemory;
+ Controller->V2.NextCommandMailbox = Controller->V2.FirstCommandMailbox;
+ Controller->V2.PreviousCommandMailbox1 = Controller->V2.LastCommandMailbox;
+ Controller->V2.PreviousCommandMailbox2 =
+ Controller->V2.LastCommandMailbox - 1;
+
+ /* These are the base addresses for the status memory mailbox array */
+ StatusMailboxesMemory = slice_dma_loaf(DmaPages,
+ StatusMailboxesSize, &StatusMailboxesMemoryDMA);
+
+ Controller->V2.FirstStatusMailbox = StatusMailboxesMemory;
+ Controller->V2.FirstStatusMailboxDMA = StatusMailboxesMemoryDMA;
+ StatusMailboxesMemory += DAC960_V2_StatusMailboxCount - 1;
+ Controller->V2.LastStatusMailbox = StatusMailboxesMemory;
+ Controller->V2.NextStatusMailbox = Controller->V2.FirstStatusMailbox;
+
+ Controller->V2.HealthStatusBuffer = slice_dma_loaf(DmaPages,
+ sizeof(DAC960_V2_HealthStatusBuffer_T),
+ &Controller->V2.HealthStatusBufferDMA);
+
+ Controller->V2.NewControllerInformation = slice_dma_loaf(DmaPages,
+ sizeof(DAC960_V2_ControllerInfo_T),
+ &Controller->V2.NewControllerInformationDMA);
+
+ Controller->V2.NewLogicalDeviceInformation = slice_dma_loaf(DmaPages,
+ sizeof(DAC960_V2_LogicalDeviceInfo_T),
+ &Controller->V2.NewLogicalDeviceInformationDMA);
+
+ Controller->V2.NewPhysicalDeviceInformation = slice_dma_loaf(DmaPages,
+ sizeof(DAC960_V2_PhysicalDeviceInfo_T),
+ &Controller->V2.NewPhysicalDeviceInformationDMA);
+
+ Controller->V2.NewInquiryUnitSerialNumber = slice_dma_loaf(DmaPages,
+ sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T),
+ &Controller->V2.NewInquiryUnitSerialNumberDMA);
+
+ Controller->V2.Event = slice_dma_loaf(DmaPages,
+ sizeof(DAC960_V2_Event_T),
+ &Controller->V2.EventDMA);
+
+ Controller->V2.PhysicalToLogicalDevice = slice_dma_loaf(DmaPages,
+ sizeof(DAC960_V2_PhysicalToLogicalDevice_T),
+ &Controller->V2.PhysicalToLogicalDeviceDMA);
+
+ /*
+ Enable the Memory Mailbox Interface.
+
+ I don't know why we can't just use one of the memory mailboxes
+ we just allocated to do this, instead of using this temporary one.
+ Try this change later.
+ */
+ memset(CommandMailbox, 0, sizeof(DAC960_V2_CommandMailbox_T));
+ CommandMailbox->SetMemoryMailbox.CommandIdentifier = 1;
+ CommandMailbox->SetMemoryMailbox.CommandOpcode = DAC960_V2_IOCTL;
+ CommandMailbox->SetMemoryMailbox.CommandControlBits.NoAutoRequestSense = true;
+ CommandMailbox->SetMemoryMailbox.FirstCommandMailboxSizeKB =
+ (DAC960_V2_CommandMailboxCount * sizeof(DAC960_V2_CommandMailbox_T)) >> 10;
+ CommandMailbox->SetMemoryMailbox.FirstStatusMailboxSizeKB =
+ (DAC960_V2_StatusMailboxCount * sizeof(DAC960_V2_StatusMailbox_T)) >> 10;
+ CommandMailbox->SetMemoryMailbox.SecondCommandMailboxSizeKB = 0;
+ CommandMailbox->SetMemoryMailbox.SecondStatusMailboxSizeKB = 0;
+ CommandMailbox->SetMemoryMailbox.RequestSenseSize = 0;
+ CommandMailbox->SetMemoryMailbox.IOCTL_Opcode = DAC960_V2_SetMemoryMailbox;
+ CommandMailbox->SetMemoryMailbox.HealthStatusBufferSizeKB = 1;
+ CommandMailbox->SetMemoryMailbox.HealthStatusBufferBusAddress =
+ Controller->V2.HealthStatusBufferDMA;
+ CommandMailbox->SetMemoryMailbox.FirstCommandMailboxBusAddress =
+ Controller->V2.FirstCommandMailboxDMA;
+ CommandMailbox->SetMemoryMailbox.FirstStatusMailboxBusAddress =
+ Controller->V2.FirstStatusMailboxDMA;
+ switch (Controller->HardwareType)
+ {
+ case DAC960_GEM_Controller:
+ while (DAC960_GEM_HardwareMailboxFullP(ControllerBaseAddress))
+ udelay(1);
+ DAC960_GEM_WriteHardwareMailbox(ControllerBaseAddress, CommandMailboxDMA);
+ DAC960_GEM_HardwareMailboxNewCommand(ControllerBaseAddress);
+ while (!DAC960_GEM_HardwareMailboxStatusAvailableP(ControllerBaseAddress))
+ udelay(1);
+ CommandStatus = DAC960_GEM_ReadCommandStatus(ControllerBaseAddress);
+ DAC960_GEM_AcknowledgeHardwareMailboxInterrupt(ControllerBaseAddress);
+ DAC960_GEM_AcknowledgeHardwareMailboxStatus(ControllerBaseAddress);
+ break;
+ case DAC960_BA_Controller:
+ while (DAC960_BA_HardwareMailboxFullP(ControllerBaseAddress))
+ udelay(1);
+ DAC960_BA_WriteHardwareMailbox(ControllerBaseAddress, CommandMailboxDMA);
+ DAC960_BA_HardwareMailboxNewCommand(ControllerBaseAddress);
+ while (!DAC960_BA_HardwareMailboxStatusAvailableP(ControllerBaseAddress))
+ udelay(1);
+ CommandStatus = DAC960_BA_ReadCommandStatus(ControllerBaseAddress);
+ DAC960_BA_AcknowledgeHardwareMailboxInterrupt(ControllerBaseAddress);
+ DAC960_BA_AcknowledgeHardwareMailboxStatus(ControllerBaseAddress);
+ break;
+ case DAC960_LP_Controller:
+ while (DAC960_LP_HardwareMailboxFullP(ControllerBaseAddress))
+ udelay(1);
+ DAC960_LP_WriteHardwareMailbox(ControllerBaseAddress, CommandMailboxDMA);
+ DAC960_LP_HardwareMailboxNewCommand(ControllerBaseAddress);
+ while (!DAC960_LP_HardwareMailboxStatusAvailableP(ControllerBaseAddress))
+ udelay(1);
+ CommandStatus = DAC960_LP_ReadCommandStatus(ControllerBaseAddress);
+ DAC960_LP_AcknowledgeHardwareMailboxInterrupt(ControllerBaseAddress);
+ DAC960_LP_AcknowledgeHardwareMailboxStatus(ControllerBaseAddress);
+ break;
+ default:
+ DAC960_Failure(Controller, "Unknown Controller Type\n");
+ CommandStatus = DAC960_V2_AbormalCompletion;
+ break;
+ }
+ pci_free_consistent(PCI_Device, sizeof(DAC960_V2_CommandMailbox_T),
+ CommandMailbox, CommandMailboxDMA);
+ return (CommandStatus == DAC960_V2_NormalCompletion);
+}
+
+
+/*
+ DAC960_V1_ReadControllerConfiguration reads the Configuration Information
+ from DAC960 V1 Firmware Controllers and initializes the Controller structure.
+*/
+
+static bool DAC960_V1_ReadControllerConfiguration(DAC960_Controller_T
+ *Controller)
+{
+ DAC960_V1_Enquiry2_T *Enquiry2;
+ dma_addr_t Enquiry2DMA;
+ DAC960_V1_Config2_T *Config2;
+ dma_addr_t Config2DMA;
+ int LogicalDriveNumber, Channel, TargetID;
+ struct dma_loaf local_dma;
+
+ if (!init_dma_loaf(Controller->PCIDevice, &local_dma,
+ sizeof(DAC960_V1_Enquiry2_T) + sizeof(DAC960_V1_Config2_T)))
+ return DAC960_Failure(Controller, "LOGICAL DEVICE ALLOCATION");
+
+ Enquiry2 = slice_dma_loaf(&local_dma, sizeof(DAC960_V1_Enquiry2_T), &Enquiry2DMA);
+ Config2 = slice_dma_loaf(&local_dma, sizeof(DAC960_V1_Config2_T), &Config2DMA);
+
+ if (!DAC960_V1_ExecuteType3(Controller, DAC960_V1_Enquiry,
+ Controller->V1.NewEnquiryDMA)) {
+ free_dma_loaf(Controller->PCIDevice, &local_dma);
+ return DAC960_Failure(Controller, "ENQUIRY");
+ }
+ memcpy(&Controller->V1.Enquiry, Controller->V1.NewEnquiry,
+ sizeof(DAC960_V1_Enquiry_T));
+
+ if (!DAC960_V1_ExecuteType3(Controller, DAC960_V1_Enquiry2, Enquiry2DMA)) {
+ free_dma_loaf(Controller->PCIDevice, &local_dma);
+ return DAC960_Failure(Controller, "ENQUIRY2");
+ }
+
+ if (!DAC960_V1_ExecuteType3(Controller, DAC960_V1_ReadConfig2, Config2DMA)) {
+ free_dma_loaf(Controller->PCIDevice, &local_dma);
+ return DAC960_Failure(Controller, "READ CONFIG2");
+ }
+
+ if (!DAC960_V1_ExecuteType3(Controller, DAC960_V1_GetLogicalDriveInformation,
+ Controller->V1.NewLogicalDriveInformationDMA)) {
+ free_dma_loaf(Controller->PCIDevice, &local_dma);
+ return DAC960_Failure(Controller, "GET LOGICAL DRIVE INFORMATION");
+ }
+ memcpy(&Controller->V1.LogicalDriveInformation,
+ Controller->V1.NewLogicalDriveInformation,
+ sizeof(DAC960_V1_LogicalDriveInformationArray_T));
+
+ for (Channel = 0; Channel < Enquiry2->ActualChannels; Channel++)
+ for (TargetID = 0; TargetID < Enquiry2->MaxTargets; TargetID++) {
+ if (!DAC960_V1_ExecuteType3D(Controller, DAC960_V1_GetDeviceState,
+ Channel, TargetID,
+ Controller->V1.NewDeviceStateDMA)) {
+ free_dma_loaf(Controller->PCIDevice, &local_dma);
+ return DAC960_Failure(Controller, "GET DEVICE STATE");
+ }
+ memcpy(&Controller->V1.DeviceState[Channel][TargetID],
+ Controller->V1.NewDeviceState, sizeof(DAC960_V1_DeviceState_T));
+ }
+ /*
+ Initialize the Controller Model Name and Full Model Name fields.
+ */
+ switch (Enquiry2->HardwareID.SubModel)
+ {
+ case DAC960_V1_P_PD_PU:
+ if (Enquiry2->SCSICapability.BusSpeed == DAC960_V1_Ultra)
+ strcpy(Controller->ModelName, "DAC960PU");
+ else strcpy(Controller->ModelName, "DAC960PD");
+ break;
+ case DAC960_V1_PL:
+ strcpy(Controller->ModelName, "DAC960PL");
+ break;
+ case DAC960_V1_PG:
+ strcpy(Controller->ModelName, "DAC960PG");
+ break;
+ case DAC960_V1_PJ:
+ strcpy(Controller->ModelName, "DAC960PJ");
+ break;
+ case DAC960_V1_PR:
+ strcpy(Controller->ModelName, "DAC960PR");
+ break;
+ case DAC960_V1_PT:
+ strcpy(Controller->ModelName, "DAC960PT");
+ break;
+ case DAC960_V1_PTL0:
+ strcpy(Controller->ModelName, "DAC960PTL0");
+ break;
+ case DAC960_V1_PRL:
+ strcpy(Controller->ModelName, "DAC960PRL");
+ break;
+ case DAC960_V1_PTL1:
+ strcpy(Controller->ModelName, "DAC960PTL1");
+ break;
+ case DAC960_V1_1164P:
+ strcpy(Controller->ModelName, "DAC1164P");
+ break;
+ default:
+ free_dma_loaf(Controller->PCIDevice, &local_dma);
+ return DAC960_Failure(Controller, "MODEL VERIFICATION");
+ }
+ strcpy(Controller->FullModelName, "Mylex ");
+ strcat(Controller->FullModelName, Controller->ModelName);
+ /*
+ Initialize the Controller Firmware Version field and verify that it
+ is a supported firmware version. The supported firmware versions are:
+
+ DAC1164P 5.06 and above
+ DAC960PTL/PRL/PJ/PG 4.06 and above
+ DAC960PU/PD/PL 3.51 and above
+ DAC960PU/PD/PL/P 2.73 and above
+ */
+#if defined(CONFIG_ALPHA)
+ /*
+ DEC Alpha machines were often equipped with DAC960 cards that were
+ OEMed from Mylex, and had their own custom firmware. Version 2.70,
+ the last custom FW revision to be released by DEC for these older
+ controllers, appears to work quite well with this driver.
+
+ Cards tested successfully were several versions each of the PD and
+ PU, called by DEC the KZPSC and KZPAC, respectively, and having
+ the Manufacturer Numbers (from Mylex), usually on a sticker on the
+ back of the board, of:
+
+ KZPSC: D040347 (1-channel) or D040348 (2-channel) or D040349 (3-channel)
+ KZPAC: D040395 (1-channel) or D040396 (2-channel) or D040397 (3-channel)
+ */
+# define FIRMWARE_27X "2.70"
+#else
+# define FIRMWARE_27X "2.73"
+#endif
+
+ if (Enquiry2->FirmwareID.MajorVersion == 0)
+ {
+ Enquiry2->FirmwareID.MajorVersion =
+ Controller->V1.Enquiry.MajorFirmwareVersion;
+ Enquiry2->FirmwareID.MinorVersion =
+ Controller->V1.Enquiry.MinorFirmwareVersion;
+ Enquiry2->FirmwareID.FirmwareType = '0';
+ Enquiry2->FirmwareID.TurnID = 0;
+ }
+ sprintf(Controller->FirmwareVersion, "%d.%02d-%c-%02d",
+ Enquiry2->FirmwareID.MajorVersion, Enquiry2->FirmwareID.MinorVersion,
+ Enquiry2->FirmwareID.FirmwareType, Enquiry2->FirmwareID.TurnID);
+ if (!((Controller->FirmwareVersion[0] == '5' &&
+ strcmp(Controller->FirmwareVersion, "5.06") >= 0) ||
+ (Controller->FirmwareVersion[0] == '4' &&
+ strcmp(Controller->FirmwareVersion, "4.06") >= 0) ||
+ (Controller->FirmwareVersion[0] == '3' &&
+ strcmp(Controller->FirmwareVersion, "3.51") >= 0) ||
+ (Controller->FirmwareVersion[0] == '2' &&
+ strcmp(Controller->FirmwareVersion, FIRMWARE_27X) >= 0)))
+ {
+ DAC960_Failure(Controller, "FIRMWARE VERSION VERIFICATION");
+ DAC960_Error("Firmware Version = '%s'\n", Controller,
+ Controller->FirmwareVersion);
+ free_dma_loaf(Controller->PCIDevice, &local_dma);
+ return false;
+ }
+ /*
+ Initialize the Controller Channels, Targets, Memory Size, and SAF-TE
+ Enclosure Management Enabled fields.
+ */
+ Controller->Channels = Enquiry2->ActualChannels;
+ Controller->Targets = Enquiry2->MaxTargets;
+ Controller->MemorySize = Enquiry2->MemorySize >> 20;
+ Controller->V1.SAFTE_EnclosureManagementEnabled =
+ (Enquiry2->FaultManagementType == DAC960_V1_SAFTE);
+ /*
+ Initialize the Controller Queue Depth, Driver Queue Depth, Logical Drive
+ Count, Maximum Blocks per Command, Controller Scatter/Gather Limit, and
+ Driver Scatter/Gather Limit. The Driver Queue Depth must be at most one
+ less than the Controller Queue Depth to allow for an automatic drive
+ rebuild operation.
+ */
+ Controller->ControllerQueueDepth = Controller->V1.Enquiry.MaxCommands;
+ Controller->DriverQueueDepth = Controller->ControllerQueueDepth - 1;
+ if (Controller->DriverQueueDepth > DAC960_MaxDriverQueueDepth)
+ Controller->DriverQueueDepth = DAC960_MaxDriverQueueDepth;
+ Controller->LogicalDriveCount =
+ Controller->V1.Enquiry.NumberOfLogicalDrives;
+ Controller->MaxBlocksPerCommand = Enquiry2->MaxBlocksPerCommand;
+ Controller->ControllerScatterGatherLimit = Enquiry2->MaxScatterGatherEntries;
+ Controller->DriverScatterGatherLimit =
+ Controller->ControllerScatterGatherLimit;
+ if (Controller->DriverScatterGatherLimit > DAC960_V1_ScatterGatherLimit)
+ Controller->DriverScatterGatherLimit = DAC960_V1_ScatterGatherLimit;
+ /*
+ Initialize the Stripe Size, Segment Size, and Geometry Translation.
+ */
+ Controller->V1.StripeSize = Config2->BlocksPerStripe * Config2->BlockFactor
+ >> (10 - DAC960_BlockSizeBits);
+ Controller->V1.SegmentSize = Config2->BlocksPerCacheLine * Config2->BlockFactor
+ >> (10 - DAC960_BlockSizeBits);
+ switch (Config2->DriveGeometry)
+ {
+ case DAC960_V1_Geometry_128_32:
+ Controller->V1.GeometryTranslationHeads = 128;
+ Controller->V1.GeometryTranslationSectors = 32;
+ break;
+ case DAC960_V1_Geometry_255_63:
+ Controller->V1.GeometryTranslationHeads = 255;
+ Controller->V1.GeometryTranslationSectors = 63;
+ break;
+ default:
+ free_dma_loaf(Controller->PCIDevice, &local_dma);
+ return DAC960_Failure(Controller, "CONFIG2 DRIVE GEOMETRY");
+ }
+ /*
+ Initialize the Background Initialization Status.
+ */
+ if ((Controller->FirmwareVersion[0] == '4' &&
+ strcmp(Controller->FirmwareVersion, "4.08") >= 0) ||
+ (Controller->FirmwareVersion[0] == '5' &&
+ strcmp(Controller->FirmwareVersion, "5.08") >= 0))
+ {
+ Controller->V1.BackgroundInitializationStatusSupported = true;
+ DAC960_V1_ExecuteType3B(Controller,
+ DAC960_V1_BackgroundInitializationControl, 0x20,
+ Controller->
+ V1.BackgroundInitializationStatusDMA);
+ memcpy(&Controller->V1.LastBackgroundInitializationStatus,
+ Controller->V1.BackgroundInitializationStatus,
+ sizeof(DAC960_V1_BackgroundInitializationStatus_T));
+ }
+ /*
+ Initialize the Logical Drive Initially Accessible flag.
+ */
+ for (LogicalDriveNumber = 0;
+ LogicalDriveNumber < Controller->LogicalDriveCount;
+ LogicalDriveNumber++)
+ if (Controller->V1.LogicalDriveInformation
+ [LogicalDriveNumber].LogicalDriveState !=
+ DAC960_V1_LogicalDrive_Offline)
+ Controller->LogicalDriveInitiallyAccessible[LogicalDriveNumber] = true;
+ Controller->V1.LastRebuildStatus = DAC960_V1_NoRebuildOrCheckInProgress;
+ free_dma_loaf(Controller->PCIDevice, &local_dma);
+ return true;
+}
+
+
+/*
+ DAC960_V2_ReadControllerConfiguration reads the Configuration Information
+ from DAC960 V2 Firmware Controllers and initializes the Controller structure.
+*/
+
+static bool DAC960_V2_ReadControllerConfiguration(DAC960_Controller_T
+ *Controller)
+{
+ DAC960_V2_ControllerInfo_T *ControllerInfo =
+ &Controller->V2.ControllerInformation;
+ unsigned short LogicalDeviceNumber = 0;
+ int ModelNameLength;
+
+ /* Get data into dma-able area, then copy into permanent location */
+ if (!DAC960_V2_NewControllerInfo(Controller))
+ return DAC960_Failure(Controller, "GET CONTROLLER INFO");
+ memcpy(ControllerInfo, Controller->V2.NewControllerInformation,
+ sizeof(DAC960_V2_ControllerInfo_T));
+
+
+ if (!DAC960_V2_GeneralInfo(Controller))
+ return DAC960_Failure(Controller, "GET HEALTH STATUS");
+
+ /*
+ Initialize the Controller Model Name and Full Model Name fields.
+ */
+ ModelNameLength = sizeof(ControllerInfo->ControllerName);
+ if (ModelNameLength > sizeof(Controller->ModelName)-1)
+ ModelNameLength = sizeof(Controller->ModelName)-1;
+ memcpy(Controller->ModelName, ControllerInfo->ControllerName,
+ ModelNameLength);
+ ModelNameLength--;
+ while (Controller->ModelName[ModelNameLength] == ' ' ||
+ Controller->ModelName[ModelNameLength] == '\0')
+ ModelNameLength--;
+ Controller->ModelName[++ModelNameLength] = '\0';
+ strcpy(Controller->FullModelName, "Mylex ");
+ strcat(Controller->FullModelName, Controller->ModelName);
+ /*
+ Initialize the Controller Firmware Version field.
+ */
+ sprintf(Controller->FirmwareVersion, "%d.%02d-%02d",
+ ControllerInfo->FirmwareMajorVersion,
+ ControllerInfo->FirmwareMinorVersion,
+ ControllerInfo->FirmwareTurnNumber);
+ if (ControllerInfo->FirmwareMajorVersion == 6 &&
+ ControllerInfo->FirmwareMinorVersion == 0 &&
+ ControllerInfo->FirmwareTurnNumber < 1)
+ {
+ DAC960_Info("FIRMWARE VERSION %s DOES NOT PROVIDE THE CONTROLLER\n",
+ Controller, Controller->FirmwareVersion);
+ DAC960_Info("STATUS MONITORING FUNCTIONALITY NEEDED BY THIS DRIVER.\n",
+ Controller);
+ DAC960_Info("PLEASE UPGRADE TO VERSION 6.00-01 OR ABOVE.\n",
+ Controller);
+ }
+ /*
+ Initialize the Controller Channels, Targets, and Memory Size.
+ */
+ Controller->Channels = ControllerInfo->NumberOfPhysicalChannelsPresent;
+ Controller->Targets =
+ ControllerInfo->MaximumTargetsPerChannel
+ [ControllerInfo->NumberOfPhysicalChannelsPresent-1];
+ Controller->MemorySize = ControllerInfo->MemorySizeMB;
+ /*
+ Initialize the Controller Queue Depth, Driver Queue Depth, Logical Drive
+ Count, Maximum Blocks per Command, Controller Scatter/Gather Limit, and
+ Driver Scatter/Gather Limit. The Driver Queue Depth must be at most one
+ less than the Controller Queue Depth to allow for an automatic drive
+ rebuild operation.
+ */
+ Controller->ControllerQueueDepth = ControllerInfo->MaximumParallelCommands;
+ Controller->DriverQueueDepth = Controller->ControllerQueueDepth - 1;
+ if (Controller->DriverQueueDepth > DAC960_MaxDriverQueueDepth)
+ Controller->DriverQueueDepth = DAC960_MaxDriverQueueDepth;
+ Controller->LogicalDriveCount = ControllerInfo->LogicalDevicesPresent;
+ Controller->MaxBlocksPerCommand =
+ ControllerInfo->MaximumDataTransferSizeInBlocks;
+ Controller->ControllerScatterGatherLimit =
+ ControllerInfo->MaximumScatterGatherEntries;
+ Controller->DriverScatterGatherLimit =
+ Controller->ControllerScatterGatherLimit;
+ if (Controller->DriverScatterGatherLimit > DAC960_V2_ScatterGatherLimit)
+ Controller->DriverScatterGatherLimit = DAC960_V2_ScatterGatherLimit;
+ /*
+ Initialize the Logical Device Information.
+ */
+ while (true)
+ {
+ DAC960_V2_LogicalDeviceInfo_T *NewLogicalDeviceInfo =
+ Controller->V2.NewLogicalDeviceInformation;
+ DAC960_V2_LogicalDeviceInfo_T *LogicalDeviceInfo;
+ DAC960_V2_PhysicalDevice_T PhysicalDevice;
+
+ if (!DAC960_V2_NewLogicalDeviceInfo(Controller, LogicalDeviceNumber))
+ break;
+ LogicalDeviceNumber = NewLogicalDeviceInfo->LogicalDeviceNumber;
+ if (LogicalDeviceNumber >= DAC960_MaxLogicalDrives) {
+ DAC960_Error("DAC960: Logical Drive Number %d not supported\n",
+ Controller, LogicalDeviceNumber);
+ break;
+ }
+ if (NewLogicalDeviceInfo->DeviceBlockSizeInBytes != DAC960_BlockSize) {
+ DAC960_Error("DAC960: Logical Drive Block Size %d not supported\n",
+ Controller, NewLogicalDeviceInfo->DeviceBlockSizeInBytes);
+ LogicalDeviceNumber++;
+ continue;
+ }
+ PhysicalDevice.Controller = 0;
+ PhysicalDevice.Channel = NewLogicalDeviceInfo->Channel;
+ PhysicalDevice.TargetID = NewLogicalDeviceInfo->TargetID;
+ PhysicalDevice.LogicalUnit = NewLogicalDeviceInfo->LogicalUnit;
+ Controller->V2.LogicalDriveToVirtualDevice[LogicalDeviceNumber] =
+ PhysicalDevice;
+ if (NewLogicalDeviceInfo->LogicalDeviceState !=
+ DAC960_V2_LogicalDevice_Offline)
+ Controller->LogicalDriveInitiallyAccessible[LogicalDeviceNumber] = true;
+ LogicalDeviceInfo = kmalloc(sizeof(DAC960_V2_LogicalDeviceInfo_T),
+ GFP_ATOMIC);
+ if (LogicalDeviceInfo == NULL)
+ return DAC960_Failure(Controller, "LOGICAL DEVICE ALLOCATION");
+ Controller->V2.LogicalDeviceInformation[LogicalDeviceNumber] =
+ LogicalDeviceInfo;
+ memcpy(LogicalDeviceInfo, NewLogicalDeviceInfo,
+ sizeof(DAC960_V2_LogicalDeviceInfo_T));
+ LogicalDeviceNumber++;
+ }
+ return true;
+}
+
+
+/*
+ DAC960_ReportControllerConfiguration reports the Configuration Information
+ for Controller.
+*/
+
+static bool DAC960_ReportControllerConfiguration(DAC960_Controller_T
+ *Controller)
+{
+ DAC960_Info("Configuring Mylex %s PCI RAID Controller\n",
+ Controller, Controller->ModelName);
+ DAC960_Info(" Firmware Version: %s, Channels: %d, Memory Size: %dMB\n",
+ Controller, Controller->FirmwareVersion,
+ Controller->Channels, Controller->MemorySize);
+ DAC960_Info(" PCI Bus: %d, Device: %d, Function: %d, I/O Address: ",
+ Controller, Controller->Bus,
+ Controller->Device, Controller->Function);
+ if (Controller->IO_Address == 0)
+ DAC960_Info("Unassigned\n", Controller);
+ else DAC960_Info("0x%X\n", Controller, Controller->IO_Address);
+ DAC960_Info(" PCI Address: 0x%X mapped at 0x%lX, IRQ Channel: %d\n",
+ Controller, Controller->PCI_Address,
+ (unsigned long) Controller->BaseAddress,
+ Controller->IRQ_Channel);
+ DAC960_Info(" Controller Queue Depth: %d, "
+ "Maximum Blocks per Command: %d\n",
+ Controller, Controller->ControllerQueueDepth,
+ Controller->MaxBlocksPerCommand);
+ DAC960_Info(" Driver Queue Depth: %d, "
+ "Scatter/Gather Limit: %d of %d Segments\n",
+ Controller, Controller->DriverQueueDepth,
+ Controller->DriverScatterGatherLimit,
+ Controller->ControllerScatterGatherLimit);
+ if (Controller->FirmwareType == DAC960_V1_Controller)
+ {
+ DAC960_Info(" Stripe Size: %dKB, Segment Size: %dKB, "
+ "BIOS Geometry: %d/%d\n", Controller,
+ Controller->V1.StripeSize,
+ Controller->V1.SegmentSize,
+ Controller->V1.GeometryTranslationHeads,
+ Controller->V1.GeometryTranslationSectors);
+ if (Controller->V1.SAFTE_EnclosureManagementEnabled)
+ DAC960_Info(" SAF-TE Enclosure Management Enabled\n", Controller);
+ }
+ return true;
+}
+
+
+/*
+ DAC960_V1_ReadDeviceConfiguration reads the Device Configuration Information
+ for DAC960 V1 Firmware Controllers by requesting the SCSI Inquiry and SCSI
+ Inquiry Unit Serial Number information for each device connected to
+ Controller.
+*/
+
+static bool DAC960_V1_ReadDeviceConfiguration(DAC960_Controller_T
+ *Controller)
+{
+ struct dma_loaf local_dma;
+
+ dma_addr_t DCDBs_dma[DAC960_V1_MaxChannels];
+ DAC960_V1_DCDB_T *DCDBs_cpu[DAC960_V1_MaxChannels];
+
+ dma_addr_t SCSI_Inquiry_dma[DAC960_V1_MaxChannels];
+ DAC960_SCSI_Inquiry_T *SCSI_Inquiry_cpu[DAC960_V1_MaxChannels];
+
+ dma_addr_t SCSI_NewInquiryUnitSerialNumberDMA[DAC960_V1_MaxChannels];
+ DAC960_SCSI_Inquiry_UnitSerialNumber_T *SCSI_NewInquiryUnitSerialNumberCPU[DAC960_V1_MaxChannels];
+
+ struct completion Completions[DAC960_V1_MaxChannels];
+ unsigned long flags;
+ int Channel, TargetID;
+
+ if (!init_dma_loaf(Controller->PCIDevice, &local_dma,
+ DAC960_V1_MaxChannels*(sizeof(DAC960_V1_DCDB_T) +
+ sizeof(DAC960_SCSI_Inquiry_T) +
+ sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T))))
+ return DAC960_Failure(Controller,
+ "DMA ALLOCATION FAILED IN ReadDeviceConfiguration");
+
+ for (Channel = 0; Channel < Controller->Channels; Channel++) {
+ DCDBs_cpu[Channel] = slice_dma_loaf(&local_dma,
+ sizeof(DAC960_V1_DCDB_T), DCDBs_dma + Channel);
+ SCSI_Inquiry_cpu[Channel] = slice_dma_loaf(&local_dma,
+ sizeof(DAC960_SCSI_Inquiry_T),
+ SCSI_Inquiry_dma + Channel);
+ SCSI_NewInquiryUnitSerialNumberCPU[Channel] = slice_dma_loaf(&local_dma,
+ sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T),
+ SCSI_NewInquiryUnitSerialNumberDMA + Channel);
+ }
+
+ for (TargetID = 0; TargetID < Controller->Targets; TargetID++)
+ {
+ /*
+ * For each channel, submit a probe for a device on that channel.
+ * The timeout interval for a device that is present is 10 seconds.
+ * With this approach, the timeout periods can elapse in parallel
+ * on each channel.
+ */
+ for (Channel = 0; Channel < Controller->Channels; Channel++)
+ {
+ dma_addr_t NewInquiryStandardDataDMA = SCSI_Inquiry_dma[Channel];
+ DAC960_V1_DCDB_T *DCDB = DCDBs_cpu[Channel];
+ dma_addr_t DCDB_dma = DCDBs_dma[Channel];
+ DAC960_Command_T *Command = Controller->Commands[Channel];
+ struct completion *Completion = &Completions[Channel];
+
+ init_completion(Completion);
+ DAC960_V1_ClearCommand(Command);
+ Command->CommandType = DAC960_ImmediateCommand;
+ Command->Completion = Completion;
+ Command->V1.CommandMailbox.Type3.CommandOpcode = DAC960_V1_DCDB;
+ Command->V1.CommandMailbox.Type3.BusAddress = DCDB_dma;
+ DCDB->Channel = Channel;
+ DCDB->TargetID = TargetID;
+ DCDB->Direction = DAC960_V1_DCDB_DataTransferDeviceToSystem;
+ DCDB->EarlyStatus = false;
+ DCDB->Timeout = DAC960_V1_DCDB_Timeout_10_seconds;
+ DCDB->NoAutomaticRequestSense = false;
+ DCDB->DisconnectPermitted = true;
+ DCDB->TransferLength = sizeof(DAC960_SCSI_Inquiry_T);
+ DCDB->BusAddress = NewInquiryStandardDataDMA;
+ DCDB->CDBLength = 6;
+ DCDB->TransferLengthHigh4 = 0;
+ DCDB->SenseLength = sizeof(DCDB->SenseData);
+ DCDB->CDB[0] = 0x12; /* INQUIRY */
+ DCDB->CDB[1] = 0; /* EVPD = 0 */
+ DCDB->CDB[2] = 0; /* Page Code */
+ DCDB->CDB[3] = 0; /* Reserved */
+ DCDB->CDB[4] = sizeof(DAC960_SCSI_Inquiry_T);
+ DCDB->CDB[5] = 0; /* Control */
+
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ DAC960_QueueCommand(Command);
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ }
+ /*
+ * Wait for the problems submitted in the previous loop
+ * to complete. On the probes that are successful,
+ * get the serial number of the device that was found.
+ */
+ for (Channel = 0; Channel < Controller->Channels; Channel++)
+ {
+ DAC960_SCSI_Inquiry_T *InquiryStandardData =
+ &Controller->V1.InquiryStandardData[Channel][TargetID];
+ DAC960_SCSI_Inquiry_T *NewInquiryStandardData = SCSI_Inquiry_cpu[Channel];
+ dma_addr_t NewInquiryUnitSerialNumberDMA =
+ SCSI_NewInquiryUnitSerialNumberDMA[Channel];
+ DAC960_SCSI_Inquiry_UnitSerialNumber_T *NewInquiryUnitSerialNumber =
+ SCSI_NewInquiryUnitSerialNumberCPU[Channel];
+ DAC960_SCSI_Inquiry_UnitSerialNumber_T *InquiryUnitSerialNumber =
+ &Controller->V1.InquiryUnitSerialNumber[Channel][TargetID];
+ DAC960_Command_T *Command = Controller->Commands[Channel];
+ DAC960_V1_DCDB_T *DCDB = DCDBs_cpu[Channel];
+ struct completion *Completion = &Completions[Channel];
+
+ wait_for_completion(Completion);
+
+ if (Command->V1.CommandStatus != DAC960_V1_NormalCompletion) {
+ memset(InquiryStandardData, 0, sizeof(DAC960_SCSI_Inquiry_T));
+ InquiryStandardData->PeripheralDeviceType = 0x1F;
+ continue;
+ } else
+ memcpy(InquiryStandardData, NewInquiryStandardData, sizeof(DAC960_SCSI_Inquiry_T));
+
+ /* Preserve Channel and TargetID values from the previous loop */
+ Command->Completion = Completion;
+ DCDB->TransferLength = sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T);
+ DCDB->BusAddress = NewInquiryUnitSerialNumberDMA;
+ DCDB->SenseLength = sizeof(DCDB->SenseData);
+ DCDB->CDB[0] = 0x12; /* INQUIRY */
+ DCDB->CDB[1] = 1; /* EVPD = 1 */
+ DCDB->CDB[2] = 0x80; /* Page Code */
+ DCDB->CDB[3] = 0; /* Reserved */
+ DCDB->CDB[4] = sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T);
+ DCDB->CDB[5] = 0; /* Control */
+
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ DAC960_QueueCommand(Command);
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ wait_for_completion(Completion);
+
+ if (Command->V1.CommandStatus != DAC960_V1_NormalCompletion) {
+ memset(InquiryUnitSerialNumber, 0,
+ sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T));
+ InquiryUnitSerialNumber->PeripheralDeviceType = 0x1F;
+ } else
+ memcpy(InquiryUnitSerialNumber, NewInquiryUnitSerialNumber,
+ sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T));
+ }
+ }
+ free_dma_loaf(Controller->PCIDevice, &local_dma);
+ return true;
+}
+
+
+/*
+ DAC960_V2_ReadDeviceConfiguration reads the Device Configuration Information
+ for DAC960 V2 Firmware Controllers by requesting the Physical Device
+ Information and SCSI Inquiry Unit Serial Number information for each
+ device connected to Controller.
+*/
+
+static bool DAC960_V2_ReadDeviceConfiguration(DAC960_Controller_T
+ *Controller)
+{
+ unsigned char Channel = 0, TargetID = 0, LogicalUnit = 0;
+ unsigned short PhysicalDeviceIndex = 0;
+
+ while (true)
+ {
+ DAC960_V2_PhysicalDeviceInfo_T *NewPhysicalDeviceInfo =
+ Controller->V2.NewPhysicalDeviceInformation;
+ DAC960_V2_PhysicalDeviceInfo_T *PhysicalDeviceInfo;
+ DAC960_SCSI_Inquiry_UnitSerialNumber_T *NewInquiryUnitSerialNumber =
+ Controller->V2.NewInquiryUnitSerialNumber;
+ DAC960_SCSI_Inquiry_UnitSerialNumber_T *InquiryUnitSerialNumber;
+
+ if (!DAC960_V2_NewPhysicalDeviceInfo(Controller, Channel, TargetID, LogicalUnit))
+ break;
+
+ PhysicalDeviceInfo = kmalloc(sizeof(DAC960_V2_PhysicalDeviceInfo_T),
+ GFP_ATOMIC);
+ if (PhysicalDeviceInfo == NULL)
+ return DAC960_Failure(Controller, "PHYSICAL DEVICE ALLOCATION");
+ Controller->V2.PhysicalDeviceInformation[PhysicalDeviceIndex] =
+ PhysicalDeviceInfo;
+ memcpy(PhysicalDeviceInfo, NewPhysicalDeviceInfo,
+ sizeof(DAC960_V2_PhysicalDeviceInfo_T));
+
+ InquiryUnitSerialNumber = kmalloc(
+ sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T), GFP_ATOMIC);
+ if (InquiryUnitSerialNumber == NULL) {
+ kfree(PhysicalDeviceInfo);
+ return DAC960_Failure(Controller, "SERIAL NUMBER ALLOCATION");
+ }
+ Controller->V2.InquiryUnitSerialNumber[PhysicalDeviceIndex] =
+ InquiryUnitSerialNumber;
+
+ Channel = NewPhysicalDeviceInfo->Channel;
+ TargetID = NewPhysicalDeviceInfo->TargetID;
+ LogicalUnit = NewPhysicalDeviceInfo->LogicalUnit;
+
+ /*
+ Some devices do NOT have Unit Serial Numbers.
+ This command fails for them. But, we still want to
+ remember those devices are there. Construct a
+ UnitSerialNumber structure for the failure case.
+ */
+ if (!DAC960_V2_NewInquiryUnitSerialNumber(Controller, Channel, TargetID, LogicalUnit)) {
+ memset(InquiryUnitSerialNumber, 0,
+ sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T));
+ InquiryUnitSerialNumber->PeripheralDeviceType = 0x1F;
+ } else
+ memcpy(InquiryUnitSerialNumber, NewInquiryUnitSerialNumber,
+ sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T));
+
+ PhysicalDeviceIndex++;
+ LogicalUnit++;
+ }
+ return true;
+}
+
+
+/*
+ DAC960_SanitizeInquiryData sanitizes the Vendor, Model, Revision, and
+ Product Serial Number fields of the Inquiry Standard Data and Inquiry
+ Unit Serial Number structures.
+*/
+
+static void DAC960_SanitizeInquiryData(DAC960_SCSI_Inquiry_T
+ *InquiryStandardData,
+ DAC960_SCSI_Inquiry_UnitSerialNumber_T
+ *InquiryUnitSerialNumber,
+ unsigned char *Vendor,
+ unsigned char *Model,
+ unsigned char *Revision,
+ unsigned char *SerialNumber)
+{
+ int SerialNumberLength, i;
+ if (InquiryStandardData->PeripheralDeviceType == 0x1F) return;
+ for (i = 0; i < sizeof(InquiryStandardData->VendorIdentification); i++)
+ {
+ unsigned char VendorCharacter =
+ InquiryStandardData->VendorIdentification[i];
+ Vendor[i] = (VendorCharacter >= ' ' && VendorCharacter <= '~'
+ ? VendorCharacter : ' ');
+ }
+ Vendor[sizeof(InquiryStandardData->VendorIdentification)] = '\0';
+ for (i = 0; i < sizeof(InquiryStandardData->ProductIdentification); i++)
+ {
+ unsigned char ModelCharacter =
+ InquiryStandardData->ProductIdentification[i];
+ Model[i] = (ModelCharacter >= ' ' && ModelCharacter <= '~'
+ ? ModelCharacter : ' ');
+ }
+ Model[sizeof(InquiryStandardData->ProductIdentification)] = '\0';
+ for (i = 0; i < sizeof(InquiryStandardData->ProductRevisionLevel); i++)
+ {
+ unsigned char RevisionCharacter =
+ InquiryStandardData->ProductRevisionLevel[i];
+ Revision[i] = (RevisionCharacter >= ' ' && RevisionCharacter <= '~'
+ ? RevisionCharacter : ' ');
+ }
+ Revision[sizeof(InquiryStandardData->ProductRevisionLevel)] = '\0';
+ if (InquiryUnitSerialNumber->PeripheralDeviceType == 0x1F) return;
+ SerialNumberLength = InquiryUnitSerialNumber->PageLength;
+ if (SerialNumberLength >
+ sizeof(InquiryUnitSerialNumber->ProductSerialNumber))
+ SerialNumberLength = sizeof(InquiryUnitSerialNumber->ProductSerialNumber);
+ for (i = 0; i < SerialNumberLength; i++)
+ {
+ unsigned char SerialNumberCharacter =
+ InquiryUnitSerialNumber->ProductSerialNumber[i];
+ SerialNumber[i] =
+ (SerialNumberCharacter >= ' ' && SerialNumberCharacter <= '~'
+ ? SerialNumberCharacter : ' ');
+ }
+ SerialNumber[SerialNumberLength] = '\0';
+}
+
+
+/*
+ DAC960_V1_ReportDeviceConfiguration reports the Device Configuration
+ Information for DAC960 V1 Firmware Controllers.
+*/
+
+static bool DAC960_V1_ReportDeviceConfiguration(DAC960_Controller_T
+ *Controller)
+{
+ int LogicalDriveNumber, Channel, TargetID;
+ DAC960_Info(" Physical Devices:\n", Controller);
+ for (Channel = 0; Channel < Controller->Channels; Channel++)
+ for (TargetID = 0; TargetID < Controller->Targets; TargetID++)
+ {
+ DAC960_SCSI_Inquiry_T *InquiryStandardData =
+ &Controller->V1.InquiryStandardData[Channel][TargetID];
+ DAC960_SCSI_Inquiry_UnitSerialNumber_T *InquiryUnitSerialNumber =
+ &Controller->V1.InquiryUnitSerialNumber[Channel][TargetID];
+ DAC960_V1_DeviceState_T *DeviceState =
+ &Controller->V1.DeviceState[Channel][TargetID];
+ DAC960_V1_ErrorTableEntry_T *ErrorEntry =
+ &Controller->V1.ErrorTable.ErrorTableEntries[Channel][TargetID];
+ char Vendor[1+sizeof(InquiryStandardData->VendorIdentification)];
+ char Model[1+sizeof(InquiryStandardData->ProductIdentification)];
+ char Revision[1+sizeof(InquiryStandardData->ProductRevisionLevel)];
+ char SerialNumber[1+sizeof(InquiryUnitSerialNumber
+ ->ProductSerialNumber)];
+ if (InquiryStandardData->PeripheralDeviceType == 0x1F) continue;
+ DAC960_SanitizeInquiryData(InquiryStandardData, InquiryUnitSerialNumber,
+ Vendor, Model, Revision, SerialNumber);
+ DAC960_Info(" %d:%d%s Vendor: %s Model: %s Revision: %s\n",
+ Controller, Channel, TargetID, (TargetID < 10 ? " " : ""),
+ Vendor, Model, Revision);
+ if (InquiryUnitSerialNumber->PeripheralDeviceType != 0x1F)
+ DAC960_Info(" Serial Number: %s\n", Controller, SerialNumber);
+ if (DeviceState->Present &&
+ DeviceState->DeviceType == DAC960_V1_DiskType)
+ {
+ if (Controller->V1.DeviceResetCount[Channel][TargetID] > 0)
+ DAC960_Info(" Disk Status: %s, %u blocks, %d resets\n",
+ Controller,
+ (DeviceState->DeviceState == DAC960_V1_Device_Dead
+ ? "Dead"
+ : DeviceState->DeviceState
+ == DAC960_V1_Device_WriteOnly
+ ? "Write-Only"
+ : DeviceState->DeviceState
+ == DAC960_V1_Device_Online
+ ? "Online" : "Standby"),
+ DeviceState->DiskSize,
+ Controller->V1.DeviceResetCount[Channel][TargetID]);
+ else
+ DAC960_Info(" Disk Status: %s, %u blocks\n", Controller,
+ (DeviceState->DeviceState == DAC960_V1_Device_Dead
+ ? "Dead"
+ : DeviceState->DeviceState
+ == DAC960_V1_Device_WriteOnly
+ ? "Write-Only"
+ : DeviceState->DeviceState
+ == DAC960_V1_Device_Online
+ ? "Online" : "Standby"),
+ DeviceState->DiskSize);
+ }
+ if (ErrorEntry->ParityErrorCount > 0 ||
+ ErrorEntry->SoftErrorCount > 0 ||
+ ErrorEntry->HardErrorCount > 0 ||
+ ErrorEntry->MiscErrorCount > 0)
+ DAC960_Info(" Errors - Parity: %d, Soft: %d, "
+ "Hard: %d, Misc: %d\n", Controller,
+ ErrorEntry->ParityErrorCount,
+ ErrorEntry->SoftErrorCount,
+ ErrorEntry->HardErrorCount,
+ ErrorEntry->MiscErrorCount);
+ }
+ DAC960_Info(" Logical Drives:\n", Controller);
+ for (LogicalDriveNumber = 0;
+ LogicalDriveNumber < Controller->LogicalDriveCount;
+ LogicalDriveNumber++)
+ {
+ DAC960_V1_LogicalDriveInformation_T *LogicalDriveInformation =
+ &Controller->V1.LogicalDriveInformation[LogicalDriveNumber];
+ DAC960_Info(" /dev/rd/c%dd%d: RAID-%d, %s, %u blocks, %s\n",
+ Controller, Controller->ControllerNumber, LogicalDriveNumber,
+ LogicalDriveInformation->RAIDLevel,
+ (LogicalDriveInformation->LogicalDriveState
+ == DAC960_V1_LogicalDrive_Online
+ ? "Online"
+ : LogicalDriveInformation->LogicalDriveState
+ == DAC960_V1_LogicalDrive_Critical
+ ? "Critical" : "Offline"),
+ LogicalDriveInformation->LogicalDriveSize,
+ (LogicalDriveInformation->WriteBack
+ ? "Write Back" : "Write Thru"));
+ }
+ return true;
+}
+
+
+/*
+ DAC960_V2_ReportDeviceConfiguration reports the Device Configuration
+ Information for DAC960 V2 Firmware Controllers.
+*/
+
+static bool DAC960_V2_ReportDeviceConfiguration(DAC960_Controller_T
+ *Controller)
+{
+ int PhysicalDeviceIndex, LogicalDriveNumber;
+ DAC960_Info(" Physical Devices:\n", Controller);
+ for (PhysicalDeviceIndex = 0;
+ PhysicalDeviceIndex < DAC960_V2_MaxPhysicalDevices;
+ PhysicalDeviceIndex++)
+ {
+ DAC960_V2_PhysicalDeviceInfo_T *PhysicalDeviceInfo =
+ Controller->V2.PhysicalDeviceInformation[PhysicalDeviceIndex];
+ DAC960_SCSI_Inquiry_T *InquiryStandardData =
+ (DAC960_SCSI_Inquiry_T *) &PhysicalDeviceInfo->SCSI_InquiryData;
+ DAC960_SCSI_Inquiry_UnitSerialNumber_T *InquiryUnitSerialNumber =
+ Controller->V2.InquiryUnitSerialNumber[PhysicalDeviceIndex];
+ char Vendor[1+sizeof(InquiryStandardData->VendorIdentification)];
+ char Model[1+sizeof(InquiryStandardData->ProductIdentification)];
+ char Revision[1+sizeof(InquiryStandardData->ProductRevisionLevel)];
+ char SerialNumber[1+sizeof(InquiryUnitSerialNumber->ProductSerialNumber)];
+ if (PhysicalDeviceInfo == NULL) break;
+ DAC960_SanitizeInquiryData(InquiryStandardData, InquiryUnitSerialNumber,
+ Vendor, Model, Revision, SerialNumber);
+ DAC960_Info(" %d:%d%s Vendor: %s Model: %s Revision: %s\n",
+ Controller,
+ PhysicalDeviceInfo->Channel,
+ PhysicalDeviceInfo->TargetID,
+ (PhysicalDeviceInfo->TargetID < 10 ? " " : ""),
+ Vendor, Model, Revision);
+ if (PhysicalDeviceInfo->NegotiatedSynchronousMegaTransfers == 0)
+ DAC960_Info(" %sAsynchronous\n", Controller,
+ (PhysicalDeviceInfo->NegotiatedDataWidthBits == 16
+ ? "Wide " :""));
+ else
+ DAC960_Info(" %sSynchronous at %d MB/sec\n", Controller,
+ (PhysicalDeviceInfo->NegotiatedDataWidthBits == 16
+ ? "Wide " :""),
+ (PhysicalDeviceInfo->NegotiatedSynchronousMegaTransfers
+ * PhysicalDeviceInfo->NegotiatedDataWidthBits/8));
+ if (InquiryUnitSerialNumber->PeripheralDeviceType != 0x1F)
+ DAC960_Info(" Serial Number: %s\n", Controller, SerialNumber);
+ if (PhysicalDeviceInfo->PhysicalDeviceState ==
+ DAC960_V2_Device_Unconfigured)
+ continue;
+ DAC960_Info(" Disk Status: %s, %u blocks\n", Controller,
+ (PhysicalDeviceInfo->PhysicalDeviceState
+ == DAC960_V2_Device_Online
+ ? "Online"
+ : PhysicalDeviceInfo->PhysicalDeviceState
+ == DAC960_V2_Device_Rebuild
+ ? "Rebuild"
+ : PhysicalDeviceInfo->PhysicalDeviceState
+ == DAC960_V2_Device_Missing
+ ? "Missing"
+ : PhysicalDeviceInfo->PhysicalDeviceState
+ == DAC960_V2_Device_Critical
+ ? "Critical"
+ : PhysicalDeviceInfo->PhysicalDeviceState
+ == DAC960_V2_Device_Dead
+ ? "Dead"
+ : PhysicalDeviceInfo->PhysicalDeviceState
+ == DAC960_V2_Device_SuspectedDead
+ ? "Suspected-Dead"
+ : PhysicalDeviceInfo->PhysicalDeviceState
+ == DAC960_V2_Device_CommandedOffline
+ ? "Commanded-Offline"
+ : PhysicalDeviceInfo->PhysicalDeviceState
+ == DAC960_V2_Device_Standby
+ ? "Standby" : "Unknown"),
+ PhysicalDeviceInfo->ConfigurableDeviceSize);
+ if (PhysicalDeviceInfo->ParityErrors == 0 &&
+ PhysicalDeviceInfo->SoftErrors == 0 &&
+ PhysicalDeviceInfo->HardErrors == 0 &&
+ PhysicalDeviceInfo->MiscellaneousErrors == 0 &&
+ PhysicalDeviceInfo->CommandTimeouts == 0 &&
+ PhysicalDeviceInfo->Retries == 0 &&
+ PhysicalDeviceInfo->Aborts == 0 &&
+ PhysicalDeviceInfo->PredictedFailuresDetected == 0)
+ continue;
+ DAC960_Info(" Errors - Parity: %d, Soft: %d, "
+ "Hard: %d, Misc: %d\n", Controller,
+ PhysicalDeviceInfo->ParityErrors,
+ PhysicalDeviceInfo->SoftErrors,
+ PhysicalDeviceInfo->HardErrors,
+ PhysicalDeviceInfo->MiscellaneousErrors);
+ DAC960_Info(" Timeouts: %d, Retries: %d, "
+ "Aborts: %d, Predicted: %d\n", Controller,
+ PhysicalDeviceInfo->CommandTimeouts,
+ PhysicalDeviceInfo->Retries,
+ PhysicalDeviceInfo->Aborts,
+ PhysicalDeviceInfo->PredictedFailuresDetected);
+ }
+ DAC960_Info(" Logical Drives:\n", Controller);
+ for (LogicalDriveNumber = 0;
+ LogicalDriveNumber < DAC960_MaxLogicalDrives;
+ LogicalDriveNumber++)
+ {
+ DAC960_V2_LogicalDeviceInfo_T *LogicalDeviceInfo =
+ Controller->V2.LogicalDeviceInformation[LogicalDriveNumber];
+ unsigned char *ReadCacheStatus[] = { "Read Cache Disabled",
+ "Read Cache Enabled",
+ "Read Ahead Enabled",
+ "Intelligent Read Ahead Enabled",
+ "-", "-", "-", "-" };
+ unsigned char *WriteCacheStatus[] = { "Write Cache Disabled",
+ "Logical Device Read Only",
+ "Write Cache Enabled",
+ "Intelligent Write Cache Enabled",
+ "-", "-", "-", "-" };
+ unsigned char *GeometryTranslation;
+ if (LogicalDeviceInfo == NULL) continue;
+ switch (LogicalDeviceInfo->DriveGeometry)
+ {
+ case DAC960_V2_Geometry_128_32:
+ GeometryTranslation = "128/32";
+ break;
+ case DAC960_V2_Geometry_255_63:
+ GeometryTranslation = "255/63";
+ break;
+ default:
+ GeometryTranslation = "Invalid";
+ DAC960_Error("Illegal Logical Device Geometry %d\n",
+ Controller, LogicalDeviceInfo->DriveGeometry);
+ break;
+ }
+ DAC960_Info(" /dev/rd/c%dd%d: RAID-%d, %s, %u blocks\n",
+ Controller, Controller->ControllerNumber, LogicalDriveNumber,
+ LogicalDeviceInfo->RAIDLevel,
+ (LogicalDeviceInfo->LogicalDeviceState
+ == DAC960_V2_LogicalDevice_Online
+ ? "Online"
+ : LogicalDeviceInfo->LogicalDeviceState
+ == DAC960_V2_LogicalDevice_Critical
+ ? "Critical" : "Offline"),
+ LogicalDeviceInfo->ConfigurableDeviceSize);
+ DAC960_Info(" Logical Device %s, BIOS Geometry: %s\n",
+ Controller,
+ (LogicalDeviceInfo->LogicalDeviceControl
+ .LogicalDeviceInitialized
+ ? "Initialized" : "Uninitialized"),
+ GeometryTranslation);
+ if (LogicalDeviceInfo->StripeSize == 0)
+ {
+ if (LogicalDeviceInfo->CacheLineSize == 0)
+ DAC960_Info(" Stripe Size: N/A, "
+ "Segment Size: N/A\n", Controller);
+ else
+ DAC960_Info(" Stripe Size: N/A, "
+ "Segment Size: %dKB\n", Controller,
+ 1 << (LogicalDeviceInfo->CacheLineSize - 2));
+ }
+ else
+ {
+ if (LogicalDeviceInfo->CacheLineSize == 0)
+ DAC960_Info(" Stripe Size: %dKB, "
+ "Segment Size: N/A\n", Controller,
+ 1 << (LogicalDeviceInfo->StripeSize - 2));
+ else
+ DAC960_Info(" Stripe Size: %dKB, "
+ "Segment Size: %dKB\n", Controller,
+ 1 << (LogicalDeviceInfo->StripeSize - 2),
+ 1 << (LogicalDeviceInfo->CacheLineSize - 2));
+ }
+ DAC960_Info(" %s, %s\n", Controller,
+ ReadCacheStatus[
+ LogicalDeviceInfo->LogicalDeviceControl.ReadCache],
+ WriteCacheStatus[
+ LogicalDeviceInfo->LogicalDeviceControl.WriteCache]);
+ if (LogicalDeviceInfo->SoftErrors > 0 ||
+ LogicalDeviceInfo->CommandsFailed > 0 ||
+ LogicalDeviceInfo->DeferredWriteErrors)
+ DAC960_Info(" Errors - Soft: %d, Failed: %d, "
+ "Deferred Write: %d\n", Controller,
+ LogicalDeviceInfo->SoftErrors,
+ LogicalDeviceInfo->CommandsFailed,
+ LogicalDeviceInfo->DeferredWriteErrors);
+
+ }
+ return true;
+}
+
+/*
+ DAC960_RegisterBlockDevice registers the Block Device structures
+ associated with Controller.
+*/
+
+static bool DAC960_RegisterBlockDevice(DAC960_Controller_T *Controller)
+{
+ int MajorNumber = DAC960_MAJOR + Controller->ControllerNumber;
+ int n;
+
+ /*
+ Register the Block Device Major Number for this DAC960 Controller.
+ */
+ if (register_blkdev(MajorNumber, "dac960") < 0)
+ return false;
+
+ for (n = 0; n < DAC960_MaxLogicalDrives; n++) {
+ struct gendisk *disk = Controller->disks[n];
+ struct request_queue *RequestQueue;
+
+ /* for now, let all request queues share controller's lock */
+ RequestQueue = blk_init_queue(DAC960_RequestFunction,&Controller->queue_lock);
+ if (!RequestQueue) {
+ printk("DAC960: failure to allocate request queue\n");
+ continue;
+ }
+ Controller->RequestQueue[n] = RequestQueue;
+ blk_queue_bounce_limit(RequestQueue, Controller->BounceBufferLimit);
+ RequestQueue->queuedata = Controller;
+ blk_queue_max_segments(RequestQueue, Controller->DriverScatterGatherLimit);
+ blk_queue_max_hw_sectors(RequestQueue, Controller->MaxBlocksPerCommand);
+ disk->queue = RequestQueue;
+ sprintf(disk->disk_name, "rd/c%dd%d", Controller->ControllerNumber, n);
+ disk->major = MajorNumber;
+ disk->first_minor = n << DAC960_MaxPartitionsBits;
+ disk->fops = &DAC960_BlockDeviceOperations;
+ }
+ /*
+ Indicate the Block Device Registration completed successfully,
+ */
+ return true;
+}
+
+
+/*
+ DAC960_UnregisterBlockDevice unregisters the Block Device structures
+ associated with Controller.
+*/
+
+static void DAC960_UnregisterBlockDevice(DAC960_Controller_T *Controller)
+{
+ int MajorNumber = DAC960_MAJOR + Controller->ControllerNumber;
+ int disk;
+
+ /* does order matter when deleting gendisk and cleanup in request queue? */
+ for (disk = 0; disk < DAC960_MaxLogicalDrives; disk++) {
+ del_gendisk(Controller->disks[disk]);
+ blk_cleanup_queue(Controller->RequestQueue[disk]);
+ Controller->RequestQueue[disk] = NULL;
+ }
+
+ /*
+ Unregister the Block Device Major Number for this DAC960 Controller.
+ */
+ unregister_blkdev(MajorNumber, "dac960");
+}
+
+/*
+ DAC960_ComputeGenericDiskInfo computes the values for the Generic Disk
+ Information Partition Sector Counts and Block Sizes.
+*/
+
+static void DAC960_ComputeGenericDiskInfo(DAC960_Controller_T *Controller)
+{
+ int disk;
+ for (disk = 0; disk < DAC960_MaxLogicalDrives; disk++)
+ set_capacity(Controller->disks[disk], disk_size(Controller, disk));
+}
+
+/*
+ DAC960_ReportErrorStatus reports Controller BIOS Messages passed through
+ the Error Status Register when the driver performs the BIOS handshaking.
+ It returns true for fatal errors and false otherwise.
+*/
+
+static bool DAC960_ReportErrorStatus(DAC960_Controller_T *Controller,
+ unsigned char ErrorStatus,
+ unsigned char Parameter0,
+ unsigned char Parameter1)
+{
+ switch (ErrorStatus)
+ {
+ case 0x00:
+ DAC960_Notice("Physical Device %d:%d Not Responding\n",
+ Controller, Parameter1, Parameter0);
+ break;
+ case 0x08:
+ if (Controller->DriveSpinUpMessageDisplayed) break;
+ DAC960_Notice("Spinning Up Drives\n", Controller);
+ Controller->DriveSpinUpMessageDisplayed = true;
+ break;
+ case 0x30:
+ DAC960_Notice("Configuration Checksum Error\n", Controller);
+ break;
+ case 0x60:
+ DAC960_Notice("Mirror Race Recovery Failed\n", Controller);
+ break;
+ case 0x70:
+ DAC960_Notice("Mirror Race Recovery In Progress\n", Controller);
+ break;
+ case 0x90:
+ DAC960_Notice("Physical Device %d:%d COD Mismatch\n",
+ Controller, Parameter1, Parameter0);
+ break;
+ case 0xA0:
+ DAC960_Notice("Logical Drive Installation Aborted\n", Controller);
+ break;
+ case 0xB0:
+ DAC960_Notice("Mirror Race On A Critical Logical Drive\n", Controller);
+ break;
+ case 0xD0:
+ DAC960_Notice("New Controller Configuration Found\n", Controller);
+ break;
+ case 0xF0:
+ DAC960_Error("Fatal Memory Parity Error for Controller at\n", Controller);
+ return true;
+ default:
+ DAC960_Error("Unknown Initialization Error %02X for Controller at\n",
+ Controller, ErrorStatus);
+ return true;
+ }
+ return false;
+}
+
+
+/*
+ * DAC960_DetectCleanup releases the resources that were allocated
+ * during DAC960_DetectController(). DAC960_DetectController can
+ * has several internal failure points, so not ALL resources may
+ * have been allocated. It's important to free only
+ * resources that HAVE been allocated. The code below always
+ * tests that the resource has been allocated before attempting to
+ * free it.
+ */
+static void DAC960_DetectCleanup(DAC960_Controller_T *Controller)
+{
+ int i;
+
+ /* Free the memory mailbox, status, and related structures */
+ free_dma_loaf(Controller->PCIDevice, &Controller->DmaPages);
+ if (Controller->MemoryMappedAddress) {
+ switch(Controller->HardwareType)
+ {
+ case DAC960_GEM_Controller:
+ DAC960_GEM_DisableInterrupts(Controller->BaseAddress);
+ break;
+ case DAC960_BA_Controller:
+ DAC960_BA_DisableInterrupts(Controller->BaseAddress);
+ break;
+ case DAC960_LP_Controller:
+ DAC960_LP_DisableInterrupts(Controller->BaseAddress);
+ break;
+ case DAC960_LA_Controller:
+ DAC960_LA_DisableInterrupts(Controller->BaseAddress);
+ break;
+ case DAC960_PG_Controller:
+ DAC960_PG_DisableInterrupts(Controller->BaseAddress);
+ break;
+ case DAC960_PD_Controller:
+ DAC960_PD_DisableInterrupts(Controller->BaseAddress);
+ break;
+ case DAC960_P_Controller:
+ DAC960_PD_DisableInterrupts(Controller->BaseAddress);
+ break;
+ }
+ iounmap(Controller->MemoryMappedAddress);
+ }
+ if (Controller->IRQ_Channel)
+ free_irq(Controller->IRQ_Channel, Controller);
+ if (Controller->IO_Address)
+ release_region(Controller->IO_Address, 0x80);
+ pci_disable_device(Controller->PCIDevice);
+ for (i = 0; (i < DAC960_MaxLogicalDrives) && Controller->disks[i]; i++)
+ put_disk(Controller->disks[i]);
+ DAC960_Controllers[Controller->ControllerNumber] = NULL;
+ kfree(Controller);
+}
+
+
+/*
+ DAC960_DetectController detects Mylex DAC960/AcceleRAID/eXtremeRAID
+ PCI RAID Controllers by interrogating the PCI Configuration Space for
+ Controller Type.
+*/
+
+static DAC960_Controller_T *
+DAC960_DetectController(struct pci_dev *PCI_Device,
+ const struct pci_device_id *entry)
+{
+ struct DAC960_privdata *privdata =
+ (struct DAC960_privdata *)entry->driver_data;
+ irq_handler_t InterruptHandler = privdata->InterruptHandler;
+ unsigned int MemoryWindowSize = privdata->MemoryWindowSize;
+ DAC960_Controller_T *Controller = NULL;
+ unsigned char DeviceFunction = PCI_Device->devfn;
+ unsigned char ErrorStatus, Parameter0, Parameter1;
+ unsigned int IRQ_Channel;
+ void __iomem *BaseAddress;
+ int i;
+
+ Controller = kzalloc(sizeof(DAC960_Controller_T), GFP_ATOMIC);
+ if (Controller == NULL) {
+ DAC960_Error("Unable to allocate Controller structure for "
+ "Controller at\n", NULL);
+ return NULL;
+ }
+ Controller->ControllerNumber = DAC960_ControllerCount;
+ DAC960_Controllers[DAC960_ControllerCount++] = Controller;
+ Controller->Bus = PCI_Device->bus->number;
+ Controller->FirmwareType = privdata->FirmwareType;
+ Controller->HardwareType = privdata->HardwareType;
+ Controller->Device = DeviceFunction >> 3;
+ Controller->Function = DeviceFunction & 0x7;
+ Controller->PCIDevice = PCI_Device;
+ strcpy(Controller->FullModelName, "DAC960");
+
+ if (pci_enable_device(PCI_Device))
+ goto Failure;
+
+ switch (Controller->HardwareType)
+ {
+ case DAC960_GEM_Controller:
+ Controller->PCI_Address = pci_resource_start(PCI_Device, 0);
+ break;
+ case DAC960_BA_Controller:
+ Controller->PCI_Address = pci_resource_start(PCI_Device, 0);
+ break;
+ case DAC960_LP_Controller:
+ Controller->PCI_Address = pci_resource_start(PCI_Device, 0);
+ break;
+ case DAC960_LA_Controller:
+ Controller->PCI_Address = pci_resource_start(PCI_Device, 0);
+ break;
+ case DAC960_PG_Controller:
+ Controller->PCI_Address = pci_resource_start(PCI_Device, 0);
+ break;
+ case DAC960_PD_Controller:
+ Controller->IO_Address = pci_resource_start(PCI_Device, 0);
+ Controller->PCI_Address = pci_resource_start(PCI_Device, 1);
+ break;
+ case DAC960_P_Controller:
+ Controller->IO_Address = pci_resource_start(PCI_Device, 0);
+ Controller->PCI_Address = pci_resource_start(PCI_Device, 1);
+ break;
+ }
+
+ pci_set_drvdata(PCI_Device, (void *)((long)Controller->ControllerNumber));
+ for (i = 0; i < DAC960_MaxLogicalDrives; i++) {
+ Controller->disks[i] = alloc_disk(1<<DAC960_MaxPartitionsBits);
+ if (!Controller->disks[i])
+ goto Failure;
+ Controller->disks[i]->private_data = (void *)((long)i);
+ }
+ init_waitqueue_head(&Controller->CommandWaitQueue);
+ init_waitqueue_head(&Controller->HealthStatusWaitQueue);
+ spin_lock_init(&Controller->queue_lock);
+ DAC960_AnnounceDriver(Controller);
+ /*
+ Map the Controller Register Window.
+ */
+ if (MemoryWindowSize < PAGE_SIZE)
+ MemoryWindowSize = PAGE_SIZE;
+ Controller->MemoryMappedAddress =
+ ioremap_nocache(Controller->PCI_Address & PAGE_MASK, MemoryWindowSize);
+ Controller->BaseAddress =
+ Controller->MemoryMappedAddress + (Controller->PCI_Address & ~PAGE_MASK);
+ if (Controller->MemoryMappedAddress == NULL)
+ {
+ DAC960_Error("Unable to map Controller Register Window for "
+ "Controller at\n", Controller);
+ goto Failure;
+ }
+ BaseAddress = Controller->BaseAddress;
+ switch (Controller->HardwareType)
+ {
+ case DAC960_GEM_Controller:
+ DAC960_GEM_DisableInterrupts(BaseAddress);
+ DAC960_GEM_AcknowledgeHardwareMailboxStatus(BaseAddress);
+ udelay(1000);
+ while (DAC960_GEM_InitializationInProgressP(BaseAddress))
+ {
+ if (DAC960_GEM_ReadErrorStatus(BaseAddress, &ErrorStatus,
+ &Parameter0, &Parameter1) &&
+ DAC960_ReportErrorStatus(Controller, ErrorStatus,
+ Parameter0, Parameter1))
+ goto Failure;
+ udelay(10);
+ }
+ if (!DAC960_V2_EnableMemoryMailboxInterface(Controller))
+ {
+ DAC960_Error("Unable to Enable Memory Mailbox Interface "
+ "for Controller at\n", Controller);
+ goto Failure;
+ }
+ DAC960_GEM_EnableInterrupts(BaseAddress);
+ Controller->QueueCommand = DAC960_GEM_QueueCommand;
+ Controller->ReadControllerConfiguration =
+ DAC960_V2_ReadControllerConfiguration;
+ Controller->ReadDeviceConfiguration =
+ DAC960_V2_ReadDeviceConfiguration;
+ Controller->ReportDeviceConfiguration =
+ DAC960_V2_ReportDeviceConfiguration;
+ Controller->QueueReadWriteCommand =
+ DAC960_V2_QueueReadWriteCommand;
+ break;
+ case DAC960_BA_Controller:
+ DAC960_BA_DisableInterrupts(BaseAddress);
+ DAC960_BA_AcknowledgeHardwareMailboxStatus(BaseAddress);
+ udelay(1000);
+ while (DAC960_BA_InitializationInProgressP(BaseAddress))
+ {
+ if (DAC960_BA_ReadErrorStatus(BaseAddress, &ErrorStatus,
+ &Parameter0, &Parameter1) &&
+ DAC960_ReportErrorStatus(Controller, ErrorStatus,
+ Parameter0, Parameter1))
+ goto Failure;
+ udelay(10);
+ }
+ if (!DAC960_V2_EnableMemoryMailboxInterface(Controller))
+ {
+ DAC960_Error("Unable to Enable Memory Mailbox Interface "
+ "for Controller at\n", Controller);
+ goto Failure;
+ }
+ DAC960_BA_EnableInterrupts(BaseAddress);
+ Controller->QueueCommand = DAC960_BA_QueueCommand;
+ Controller->ReadControllerConfiguration =
+ DAC960_V2_ReadControllerConfiguration;
+ Controller->ReadDeviceConfiguration =
+ DAC960_V2_ReadDeviceConfiguration;
+ Controller->ReportDeviceConfiguration =
+ DAC960_V2_ReportDeviceConfiguration;
+ Controller->QueueReadWriteCommand =
+ DAC960_V2_QueueReadWriteCommand;
+ break;
+ case DAC960_LP_Controller:
+ DAC960_LP_DisableInterrupts(BaseAddress);
+ DAC960_LP_AcknowledgeHardwareMailboxStatus(BaseAddress);
+ udelay(1000);
+ while (DAC960_LP_InitializationInProgressP(BaseAddress))
+ {
+ if (DAC960_LP_ReadErrorStatus(BaseAddress, &ErrorStatus,
+ &Parameter0, &Parameter1) &&
+ DAC960_ReportErrorStatus(Controller, ErrorStatus,
+ Parameter0, Parameter1))
+ goto Failure;
+ udelay(10);
+ }
+ if (!DAC960_V2_EnableMemoryMailboxInterface(Controller))
+ {
+ DAC960_Error("Unable to Enable Memory Mailbox Interface "
+ "for Controller at\n", Controller);
+ goto Failure;
+ }
+ DAC960_LP_EnableInterrupts(BaseAddress);
+ Controller->QueueCommand = DAC960_LP_QueueCommand;
+ Controller->ReadControllerConfiguration =
+ DAC960_V2_ReadControllerConfiguration;
+ Controller->ReadDeviceConfiguration =
+ DAC960_V2_ReadDeviceConfiguration;
+ Controller->ReportDeviceConfiguration =
+ DAC960_V2_ReportDeviceConfiguration;
+ Controller->QueueReadWriteCommand =
+ DAC960_V2_QueueReadWriteCommand;
+ break;
+ case DAC960_LA_Controller:
+ DAC960_LA_DisableInterrupts(BaseAddress);
+ DAC960_LA_AcknowledgeHardwareMailboxStatus(BaseAddress);
+ udelay(1000);
+ while (DAC960_LA_InitializationInProgressP(BaseAddress))
+ {
+ if (DAC960_LA_ReadErrorStatus(BaseAddress, &ErrorStatus,
+ &Parameter0, &Parameter1) &&
+ DAC960_ReportErrorStatus(Controller, ErrorStatus,
+ Parameter0, Parameter1))
+ goto Failure;
+ udelay(10);
+ }
+ if (!DAC960_V1_EnableMemoryMailboxInterface(Controller))
+ {
+ DAC960_Error("Unable to Enable Memory Mailbox Interface "
+ "for Controller at\n", Controller);
+ goto Failure;
+ }
+ DAC960_LA_EnableInterrupts(BaseAddress);
+ if (Controller->V1.DualModeMemoryMailboxInterface)
+ Controller->QueueCommand = DAC960_LA_QueueCommandDualMode;
+ else Controller->QueueCommand = DAC960_LA_QueueCommandSingleMode;
+ Controller->ReadControllerConfiguration =
+ DAC960_V1_ReadControllerConfiguration;
+ Controller->ReadDeviceConfiguration =
+ DAC960_V1_ReadDeviceConfiguration;
+ Controller->ReportDeviceConfiguration =
+ DAC960_V1_ReportDeviceConfiguration;
+ Controller->QueueReadWriteCommand =
+ DAC960_V1_QueueReadWriteCommand;
+ break;
+ case DAC960_PG_Controller:
+ DAC960_PG_DisableInterrupts(BaseAddress);
+ DAC960_PG_AcknowledgeHardwareMailboxStatus(BaseAddress);
+ udelay(1000);
+ while (DAC960_PG_InitializationInProgressP(BaseAddress))
+ {
+ if (DAC960_PG_ReadErrorStatus(BaseAddress, &ErrorStatus,
+ &Parameter0, &Parameter1) &&
+ DAC960_ReportErrorStatus(Controller, ErrorStatus,
+ Parameter0, Parameter1))
+ goto Failure;
+ udelay(10);
+ }
+ if (!DAC960_V1_EnableMemoryMailboxInterface(Controller))
+ {
+ DAC960_Error("Unable to Enable Memory Mailbox Interface "
+ "for Controller at\n", Controller);
+ goto Failure;
+ }
+ DAC960_PG_EnableInterrupts(BaseAddress);
+ if (Controller->V1.DualModeMemoryMailboxInterface)
+ Controller->QueueCommand = DAC960_PG_QueueCommandDualMode;
+ else Controller->QueueCommand = DAC960_PG_QueueCommandSingleMode;
+ Controller->ReadControllerConfiguration =
+ DAC960_V1_ReadControllerConfiguration;
+ Controller->ReadDeviceConfiguration =
+ DAC960_V1_ReadDeviceConfiguration;
+ Controller->ReportDeviceConfiguration =
+ DAC960_V1_ReportDeviceConfiguration;
+ Controller->QueueReadWriteCommand =
+ DAC960_V1_QueueReadWriteCommand;
+ break;
+ case DAC960_PD_Controller:
+ if (!request_region(Controller->IO_Address, 0x80,
+ Controller->FullModelName)) {
+ DAC960_Error("IO port 0x%d busy for Controller at\n",
+ Controller, Controller->IO_Address);
+ goto Failure;
+ }
+ DAC960_PD_DisableInterrupts(BaseAddress);
+ DAC960_PD_AcknowledgeStatus(BaseAddress);
+ udelay(1000);
+ while (DAC960_PD_InitializationInProgressP(BaseAddress))
+ {
+ if (DAC960_PD_ReadErrorStatus(BaseAddress, &ErrorStatus,
+ &Parameter0, &Parameter1) &&
+ DAC960_ReportErrorStatus(Controller, ErrorStatus,
+ Parameter0, Parameter1))
+ goto Failure;
+ udelay(10);
+ }
+ if (!DAC960_V1_EnableMemoryMailboxInterface(Controller))
+ {
+ DAC960_Error("Unable to allocate DMA mapped memory "
+ "for Controller at\n", Controller);
+ goto Failure;
+ }
+ DAC960_PD_EnableInterrupts(BaseAddress);
+ Controller->QueueCommand = DAC960_PD_QueueCommand;
+ Controller->ReadControllerConfiguration =
+ DAC960_V1_ReadControllerConfiguration;
+ Controller->ReadDeviceConfiguration =
+ DAC960_V1_ReadDeviceConfiguration;
+ Controller->ReportDeviceConfiguration =
+ DAC960_V1_ReportDeviceConfiguration;
+ Controller->QueueReadWriteCommand =
+ DAC960_V1_QueueReadWriteCommand;
+ break;
+ case DAC960_P_Controller:
+ if (!request_region(Controller->IO_Address, 0x80,
+ Controller->FullModelName)){
+ DAC960_Error("IO port 0x%d busy for Controller at\n",
+ Controller, Controller->IO_Address);
+ goto Failure;
+ }
+ DAC960_PD_DisableInterrupts(BaseAddress);
+ DAC960_PD_AcknowledgeStatus(BaseAddress);
+ udelay(1000);
+ while (DAC960_PD_InitializationInProgressP(BaseAddress))
+ {
+ if (DAC960_PD_ReadErrorStatus(BaseAddress, &ErrorStatus,
+ &Parameter0, &Parameter1) &&
+ DAC960_ReportErrorStatus(Controller, ErrorStatus,
+ Parameter0, Parameter1))
+ goto Failure;
+ udelay(10);
+ }
+ if (!DAC960_V1_EnableMemoryMailboxInterface(Controller))
+ {
+ DAC960_Error("Unable to allocate DMA mapped memory"
+ "for Controller at\n", Controller);
+ goto Failure;
+ }
+ DAC960_PD_EnableInterrupts(BaseAddress);
+ Controller->QueueCommand = DAC960_P_QueueCommand;
+ Controller->ReadControllerConfiguration =
+ DAC960_V1_ReadControllerConfiguration;
+ Controller->ReadDeviceConfiguration =
+ DAC960_V1_ReadDeviceConfiguration;
+ Controller->ReportDeviceConfiguration =
+ DAC960_V1_ReportDeviceConfiguration;
+ Controller->QueueReadWriteCommand =
+ DAC960_V1_QueueReadWriteCommand;
+ break;
+ }
+ /*
+ Acquire shared access to the IRQ Channel.
+ */
+ IRQ_Channel = PCI_Device->irq;
+ if (request_irq(IRQ_Channel, InterruptHandler, IRQF_SHARED,
+ Controller->FullModelName, Controller) < 0)
+ {
+ DAC960_Error("Unable to acquire IRQ Channel %d for Controller at\n",
+ Controller, Controller->IRQ_Channel);
+ goto Failure;
+ }
+ Controller->IRQ_Channel = IRQ_Channel;
+ Controller->InitialCommand.CommandIdentifier = 1;
+ Controller->InitialCommand.Controller = Controller;
+ Controller->Commands[0] = &Controller->InitialCommand;
+ Controller->FreeCommands = &Controller->InitialCommand;
+ return Controller;
+
+Failure:
+ if (Controller->IO_Address == 0)
+ DAC960_Error("PCI Bus %d Device %d Function %d I/O Address N/A "
+ "PCI Address 0x%X\n", Controller,
+ Controller->Bus, Controller->Device,
+ Controller->Function, Controller->PCI_Address);
+ else
+ DAC960_Error("PCI Bus %d Device %d Function %d I/O Address "
+ "0x%X PCI Address 0x%X\n", Controller,
+ Controller->Bus, Controller->Device,
+ Controller->Function, Controller->IO_Address,
+ Controller->PCI_Address);
+ DAC960_DetectCleanup(Controller);
+ DAC960_ControllerCount--;
+ return NULL;
+}
+
+/*
+ DAC960_InitializeController initializes Controller.
+*/
+
+static bool
+DAC960_InitializeController(DAC960_Controller_T *Controller)
+{
+ if (DAC960_ReadControllerConfiguration(Controller) &&
+ DAC960_ReportControllerConfiguration(Controller) &&
+ DAC960_CreateAuxiliaryStructures(Controller) &&
+ DAC960_ReadDeviceConfiguration(Controller) &&
+ DAC960_ReportDeviceConfiguration(Controller) &&
+ DAC960_RegisterBlockDevice(Controller))
+ {
+ /*
+ Initialize the Monitoring Timer.
+ */
+ init_timer(&Controller->MonitoringTimer);
+ Controller->MonitoringTimer.expires =
+ jiffies + DAC960_MonitoringTimerInterval;
+ Controller->MonitoringTimer.data = (unsigned long) Controller;
+ Controller->MonitoringTimer.function = DAC960_MonitoringTimerFunction;
+ add_timer(&Controller->MonitoringTimer);
+ Controller->ControllerInitialized = true;
+ return true;
+ }
+ return false;
+}
+
+
+/*
+ DAC960_FinalizeController finalizes Controller.
+*/
+
+static void DAC960_FinalizeController(DAC960_Controller_T *Controller)
+{
+ if (Controller->ControllerInitialized)
+ {
+ unsigned long flags;
+
+ /*
+ * Acquiring and releasing lock here eliminates
+ * a very low probability race.
+ *
+ * The code below allocates controller command structures
+ * from the free list without holding the controller lock.
+ * This is safe assuming there is no other activity on
+ * the controller at the time.
+ *
+ * But, there might be a monitoring command still
+ * in progress. Setting the Shutdown flag while holding
+ * the lock ensures that there is no monitoring command
+ * in the interrupt handler currently, and any monitoring
+ * commands that complete from this time on will NOT return
+ * their command structure to the free list.
+ */
+
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ Controller->ShutdownMonitoringTimer = 1;
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+
+ del_timer_sync(&Controller->MonitoringTimer);
+ if (Controller->FirmwareType == DAC960_V1_Controller)
+ {
+ DAC960_Notice("Flushing Cache...", Controller);
+ DAC960_V1_ExecuteType3(Controller, DAC960_V1_Flush, 0);
+ DAC960_Notice("done\n", Controller);
+
+ if (Controller->HardwareType == DAC960_PD_Controller)
+ release_region(Controller->IO_Address, 0x80);
+ }
+ else
+ {
+ DAC960_Notice("Flushing Cache...", Controller);
+ DAC960_V2_DeviceOperation(Controller, DAC960_V2_PauseDevice,
+ DAC960_V2_RAID_Controller);
+ DAC960_Notice("done\n", Controller);
+ }
+ }
+ DAC960_UnregisterBlockDevice(Controller);
+ DAC960_DestroyAuxiliaryStructures(Controller);
+ DAC960_DestroyProcEntries(Controller);
+ DAC960_DetectCleanup(Controller);
+}
+
+
+/*
+ DAC960_Probe verifies controller's existence and
+ initializes the DAC960 Driver for that controller.
+*/
+
+static int
+DAC960_Probe(struct pci_dev *dev, const struct pci_device_id *entry)
+{
+ int disk;
+ DAC960_Controller_T *Controller;
+
+ if (DAC960_ControllerCount == DAC960_MaxControllers)
+ {
+ DAC960_Error("More than %d DAC960 Controllers detected - "
+ "ignoring from Controller at\n",
+ NULL, DAC960_MaxControllers);
+ return -ENODEV;
+ }
+
+ Controller = DAC960_DetectController(dev, entry);
+ if (!Controller)
+ return -ENODEV;
+
+ if (!DAC960_InitializeController(Controller)) {
+ DAC960_FinalizeController(Controller);
+ return -ENODEV;
+ }
+
+ for (disk = 0; disk < DAC960_MaxLogicalDrives; disk++) {
+ set_capacity(Controller->disks[disk], disk_size(Controller, disk));
+ add_disk(Controller->disks[disk]);
+ }
+ DAC960_CreateProcEntries(Controller);
+ return 0;
+}
+
+
+/*
+ DAC960_Finalize finalizes the DAC960 Driver.
+*/
+
+static void DAC960_Remove(struct pci_dev *PCI_Device)
+{
+ int Controller_Number = (long)pci_get_drvdata(PCI_Device);
+ DAC960_Controller_T *Controller = DAC960_Controllers[Controller_Number];
+ if (Controller != NULL)
+ DAC960_FinalizeController(Controller);
+}
+
+
+/*
+ DAC960_V1_QueueReadWriteCommand prepares and queues a Read/Write Command for
+ DAC960 V1 Firmware Controllers.
+*/
+
+static void DAC960_V1_QueueReadWriteCommand(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
+ DAC960_V1_ScatterGatherSegment_T *ScatterGatherList =
+ Command->V1.ScatterGatherList;
+ struct scatterlist *ScatterList = Command->V1.ScatterList;
+
+ DAC960_V1_ClearCommand(Command);
+
+ if (Command->SegmentCount == 1)
+ {
+ if (Command->DmaDirection == PCI_DMA_FROMDEVICE)
+ CommandMailbox->Type5.CommandOpcode = DAC960_V1_Read;
+ else
+ CommandMailbox->Type5.CommandOpcode = DAC960_V1_Write;
+
+ CommandMailbox->Type5.LD.TransferLength = Command->BlockCount;
+ CommandMailbox->Type5.LD.LogicalDriveNumber = Command->LogicalDriveNumber;
+ CommandMailbox->Type5.LogicalBlockAddress = Command->BlockNumber;
+ CommandMailbox->Type5.BusAddress =
+ (DAC960_BusAddress32_T)sg_dma_address(ScatterList);
+ }
+ else
+ {
+ int i;
+
+ if (Command->DmaDirection == PCI_DMA_FROMDEVICE)
+ CommandMailbox->Type5.CommandOpcode = DAC960_V1_ReadWithScatterGather;
+ else
+ CommandMailbox->Type5.CommandOpcode = DAC960_V1_WriteWithScatterGather;
+
+ CommandMailbox->Type5.LD.TransferLength = Command->BlockCount;
+ CommandMailbox->Type5.LD.LogicalDriveNumber = Command->LogicalDriveNumber;
+ CommandMailbox->Type5.LogicalBlockAddress = Command->BlockNumber;
+ CommandMailbox->Type5.BusAddress = Command->V1.ScatterGatherListDMA;
+
+ CommandMailbox->Type5.ScatterGatherCount = Command->SegmentCount;
+
+ for (i = 0; i < Command->SegmentCount; i++, ScatterList++, ScatterGatherList++) {
+ ScatterGatherList->SegmentDataPointer =
+ (DAC960_BusAddress32_T)sg_dma_address(ScatterList);
+ ScatterGatherList->SegmentByteCount =
+ (DAC960_ByteCount32_T)sg_dma_len(ScatterList);
+ }
+ }
+ DAC960_QueueCommand(Command);
+}
+
+
+/*
+ DAC960_V2_QueueReadWriteCommand prepares and queues a Read/Write Command for
+ DAC960 V2 Firmware Controllers.
+*/
+
+static void DAC960_V2_QueueReadWriteCommand(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
+ struct scatterlist *ScatterList = Command->V2.ScatterList;
+
+ DAC960_V2_ClearCommand(Command);
+
+ CommandMailbox->SCSI_10.CommandOpcode = DAC960_V2_SCSI_10;
+ CommandMailbox->SCSI_10.CommandControlBits.DataTransferControllerToHost =
+ (Command->DmaDirection == PCI_DMA_FROMDEVICE);
+ CommandMailbox->SCSI_10.DataTransferSize =
+ Command->BlockCount << DAC960_BlockSizeBits;
+ CommandMailbox->SCSI_10.RequestSenseBusAddress = Command->V2.RequestSenseDMA;
+ CommandMailbox->SCSI_10.PhysicalDevice =
+ Controller->V2.LogicalDriveToVirtualDevice[Command->LogicalDriveNumber];
+ CommandMailbox->SCSI_10.RequestSenseSize = sizeof(DAC960_SCSI_RequestSense_T);
+ CommandMailbox->SCSI_10.CDBLength = 10;
+ CommandMailbox->SCSI_10.SCSI_CDB[0] =
+ (Command->DmaDirection == PCI_DMA_FROMDEVICE ? 0x28 : 0x2A);
+ CommandMailbox->SCSI_10.SCSI_CDB[2] = Command->BlockNumber >> 24;
+ CommandMailbox->SCSI_10.SCSI_CDB[3] = Command->BlockNumber >> 16;
+ CommandMailbox->SCSI_10.SCSI_CDB[4] = Command->BlockNumber >> 8;
+ CommandMailbox->SCSI_10.SCSI_CDB[5] = Command->BlockNumber;
+ CommandMailbox->SCSI_10.SCSI_CDB[7] = Command->BlockCount >> 8;
+ CommandMailbox->SCSI_10.SCSI_CDB[8] = Command->BlockCount;
+
+ if (Command->SegmentCount == 1)
+ {
+ CommandMailbox->SCSI_10.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentDataPointer =
+ (DAC960_BusAddress64_T)sg_dma_address(ScatterList);
+ CommandMailbox->SCSI_10.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentByteCount =
+ CommandMailbox->SCSI_10.DataTransferSize;
+ }
+ else
+ {
+ DAC960_V2_ScatterGatherSegment_T *ScatterGatherList;
+ int i;
+
+ if (Command->SegmentCount > 2)
+ {
+ ScatterGatherList = Command->V2.ScatterGatherList;
+ CommandMailbox->SCSI_10.CommandControlBits
+ .AdditionalScatterGatherListMemory = true;
+ CommandMailbox->SCSI_10.DataTransferMemoryAddress
+ .ExtendedScatterGather.ScatterGatherList0Length = Command->SegmentCount;
+ CommandMailbox->SCSI_10.DataTransferMemoryAddress
+ .ExtendedScatterGather.ScatterGatherList0Address =
+ Command->V2.ScatterGatherListDMA;
+ }
+ else
+ ScatterGatherList = CommandMailbox->SCSI_10.DataTransferMemoryAddress
+ .ScatterGatherSegments;
+
+ for (i = 0; i < Command->SegmentCount; i++, ScatterList++, ScatterGatherList++) {
+ ScatterGatherList->SegmentDataPointer =
+ (DAC960_BusAddress64_T)sg_dma_address(ScatterList);
+ ScatterGatherList->SegmentByteCount =
+ (DAC960_ByteCount64_T)sg_dma_len(ScatterList);
+ }
+ }
+ DAC960_QueueCommand(Command);
+}
+
+
+static int DAC960_process_queue(DAC960_Controller_T *Controller, struct request_queue *req_q)
+{
+ struct request *Request;
+ DAC960_Command_T *Command;
+
+ while(1) {
+ Request = blk_peek_request(req_q);
+ if (!Request)
+ return 1;
+
+ Command = DAC960_AllocateCommand(Controller);
+ if (Command == NULL)
+ return 0;
+
+ if (rq_data_dir(Request) == READ) {
+ Command->DmaDirection = PCI_DMA_FROMDEVICE;
+ Command->CommandType = DAC960_ReadCommand;
+ } else {
+ Command->DmaDirection = PCI_DMA_TODEVICE;
+ Command->CommandType = DAC960_WriteCommand;
+ }
+ Command->Completion = Request->end_io_data;
+ Command->LogicalDriveNumber = (long)Request->rq_disk->private_data;
+ Command->BlockNumber = blk_rq_pos(Request);
+ Command->BlockCount = blk_rq_sectors(Request);
+ Command->Request = Request;
+ blk_start_request(Request);
+ Command->SegmentCount = blk_rq_map_sg(req_q,
+ Command->Request, Command->cmd_sglist);
+ /* pci_map_sg MAY change the value of SegCount */
+ Command->SegmentCount = pci_map_sg(Controller->PCIDevice, Command->cmd_sglist,
+ Command->SegmentCount, Command->DmaDirection);
+
+ DAC960_QueueReadWriteCommand(Command);
+ }
+}
+
+/*
+ DAC960_ProcessRequest attempts to remove one I/O Request from Controller's
+ I/O Request Queue and queues it to the Controller. WaitForCommand is true if
+ this function should wait for a Command to become available if necessary.
+ This function returns true if an I/O Request was queued and false otherwise.
+*/
+static void DAC960_ProcessRequest(DAC960_Controller_T *controller)
+{
+ int i;
+
+ if (!controller->ControllerInitialized)
+ return;
+
+ /* Do this better later! */
+ for (i = controller->req_q_index; i < DAC960_MaxLogicalDrives; i++) {
+ struct request_queue *req_q = controller->RequestQueue[i];
+
+ if (req_q == NULL)
+ continue;
+
+ if (!DAC960_process_queue(controller, req_q)) {
+ controller->req_q_index = i;
+ return;
+ }
+ }
+
+ if (controller->req_q_index == 0)
+ return;
+
+ for (i = 0; i < controller->req_q_index; i++) {
+ struct request_queue *req_q = controller->RequestQueue[i];
+
+ if (req_q == NULL)
+ continue;
+
+ if (!DAC960_process_queue(controller, req_q)) {
+ controller->req_q_index = i;
+ return;
+ }
+ }
+}
+
+
+/*
+ DAC960_queue_partial_rw extracts one bio from the request already
+ associated with argument command, and construct a new command block to retry I/O
+ only on that bio. Queue that command to the controller.
+
+ This function re-uses a previously-allocated Command,
+ there is no failure mode from trying to allocate a command.
+*/
+
+static void DAC960_queue_partial_rw(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ struct request *Request = Command->Request;
+ struct request_queue *req_q = Controller->RequestQueue[Command->LogicalDriveNumber];
+
+ if (Command->DmaDirection == PCI_DMA_FROMDEVICE)
+ Command->CommandType = DAC960_ReadRetryCommand;
+ else
+ Command->CommandType = DAC960_WriteRetryCommand;
+
+ /*
+ * We could be more efficient with these mapping requests
+ * and map only the portions that we need. But since this
+ * code should almost never be called, just go with a
+ * simple coding.
+ */
+ (void)blk_rq_map_sg(req_q, Command->Request, Command->cmd_sglist);
+
+ (void)pci_map_sg(Controller->PCIDevice, Command->cmd_sglist, 1, Command->DmaDirection);
+ /*
+ * Resubmitting the request sector at a time is really tedious.
+ * But, this should almost never happen. So, we're willing to pay
+ * this price so that in the end, as much of the transfer is completed
+ * successfully as possible.
+ */
+ Command->SegmentCount = 1;
+ Command->BlockNumber = blk_rq_pos(Request);
+ Command->BlockCount = 1;
+ DAC960_QueueReadWriteCommand(Command);
+ return;
+}
+
+/*
+ DAC960_RequestFunction is the I/O Request Function for DAC960 Controllers.
+*/
+
+static void DAC960_RequestFunction(struct request_queue *RequestQueue)
+{
+ DAC960_ProcessRequest(RequestQueue->queuedata);
+}
+
+/*
+ DAC960_ProcessCompletedBuffer performs completion processing for an
+ individual Buffer.
+*/
+
+static inline bool DAC960_ProcessCompletedRequest(DAC960_Command_T *Command,
+ bool SuccessfulIO)
+{
+ struct request *Request = Command->Request;
+ int Error = SuccessfulIO ? 0 : -EIO;
+
+ pci_unmap_sg(Command->Controller->PCIDevice, Command->cmd_sglist,
+ Command->SegmentCount, Command->DmaDirection);
+
+ if (!__blk_end_request(Request, Error, Command->BlockCount << 9)) {
+ if (Command->Completion) {
+ complete(Command->Completion);
+ Command->Completion = NULL;
+ }
+ return true;
+ }
+ return false;
+}
+
+/*
+ DAC960_V1_ReadWriteError prints an appropriate error message for Command
+ when an error occurs on a Read or Write operation.
+*/
+
+static void DAC960_V1_ReadWriteError(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ unsigned char *CommandName = "UNKNOWN";
+ switch (Command->CommandType)
+ {
+ case DAC960_ReadCommand:
+ case DAC960_ReadRetryCommand:
+ CommandName = "READ";
+ break;
+ case DAC960_WriteCommand:
+ case DAC960_WriteRetryCommand:
+ CommandName = "WRITE";
+ break;
+ case DAC960_MonitoringCommand:
+ case DAC960_ImmediateCommand:
+ case DAC960_QueuedCommand:
+ break;
+ }
+ switch (Command->V1.CommandStatus)
+ {
+ case DAC960_V1_IrrecoverableDataError:
+ DAC960_Error("Irrecoverable Data Error on %s:\n",
+ Controller, CommandName);
+ break;
+ case DAC960_V1_LogicalDriveNonexistentOrOffline:
+ DAC960_Error("Logical Drive Nonexistent or Offline on %s:\n",
+ Controller, CommandName);
+ break;
+ case DAC960_V1_AccessBeyondEndOfLogicalDrive:
+ DAC960_Error("Attempt to Access Beyond End of Logical Drive "
+ "on %s:\n", Controller, CommandName);
+ break;
+ case DAC960_V1_BadDataEncountered:
+ DAC960_Error("Bad Data Encountered on %s:\n", Controller, CommandName);
+ break;
+ default:
+ DAC960_Error("Unexpected Error Status %04X on %s:\n",
+ Controller, Command->V1.CommandStatus, CommandName);
+ break;
+ }
+ DAC960_Error(" /dev/rd/c%dd%d: absolute blocks %u..%u\n",
+ Controller, Controller->ControllerNumber,
+ Command->LogicalDriveNumber, Command->BlockNumber,
+ Command->BlockNumber + Command->BlockCount - 1);
+}
+
+
+/*
+ DAC960_V1_ProcessCompletedCommand performs completion processing for Command
+ for DAC960 V1 Firmware Controllers.
+*/
+
+static void DAC960_V1_ProcessCompletedCommand(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ DAC960_CommandType_T CommandType = Command->CommandType;
+ DAC960_V1_CommandOpcode_T CommandOpcode =
+ Command->V1.CommandMailbox.Common.CommandOpcode;
+ DAC960_V1_CommandStatus_T CommandStatus = Command->V1.CommandStatus;
+
+ if (CommandType == DAC960_ReadCommand ||
+ CommandType == DAC960_WriteCommand)
+ {
+
+#ifdef FORCE_RETRY_DEBUG
+ CommandStatus = DAC960_V1_IrrecoverableDataError;
+#endif
+
+ if (CommandStatus == DAC960_V1_NormalCompletion) {
+
+ if (!DAC960_ProcessCompletedRequest(Command, true))
+ BUG();
+
+ } else if (CommandStatus == DAC960_V1_IrrecoverableDataError ||
+ CommandStatus == DAC960_V1_BadDataEncountered)
+ {
+ /*
+ * break the command down into pieces and resubmit each
+ * piece, hoping that some of them will succeed.
+ */
+ DAC960_queue_partial_rw(Command);
+ return;
+ }
+ else
+ {
+ if (CommandStatus != DAC960_V1_LogicalDriveNonexistentOrOffline)
+ DAC960_V1_ReadWriteError(Command);
+
+ if (!DAC960_ProcessCompletedRequest(Command, false))
+ BUG();
+ }
+ }
+ else if (CommandType == DAC960_ReadRetryCommand ||
+ CommandType == DAC960_WriteRetryCommand)
+ {
+ bool normal_completion;
+#ifdef FORCE_RETRY_FAILURE_DEBUG
+ static int retry_count = 1;
+#endif
+ /*
+ Perform completion processing for the portion that was
+ retried, and submit the next portion, if any.
+ */
+ normal_completion = true;
+ if (CommandStatus != DAC960_V1_NormalCompletion) {
+ normal_completion = false;
+ if (CommandStatus != DAC960_V1_LogicalDriveNonexistentOrOffline)
+ DAC960_V1_ReadWriteError(Command);
+ }
+
+#ifdef FORCE_RETRY_FAILURE_DEBUG
+ if (!(++retry_count % 10000)) {
+ printk("V1 error retry failure test\n");
+ normal_completion = false;
+ DAC960_V1_ReadWriteError(Command);
+ }
+#endif
+
+ if (!DAC960_ProcessCompletedRequest(Command, normal_completion)) {
+ DAC960_queue_partial_rw(Command);
+ return;
+ }
+ }
+
+ else if (CommandType == DAC960_MonitoringCommand)
+ {
+ if (Controller->ShutdownMonitoringTimer)
+ return;
+ if (CommandOpcode == DAC960_V1_Enquiry)
+ {
+ DAC960_V1_Enquiry_T *OldEnquiry = &Controller->V1.Enquiry;
+ DAC960_V1_Enquiry_T *NewEnquiry = Controller->V1.NewEnquiry;
+ unsigned int OldCriticalLogicalDriveCount =
+ OldEnquiry->CriticalLogicalDriveCount;
+ unsigned int NewCriticalLogicalDriveCount =
+ NewEnquiry->CriticalLogicalDriveCount;
+ if (NewEnquiry->NumberOfLogicalDrives > Controller->LogicalDriveCount)
+ {
+ int LogicalDriveNumber = Controller->LogicalDriveCount - 1;
+ while (++LogicalDriveNumber < NewEnquiry->NumberOfLogicalDrives)
+ DAC960_Critical("Logical Drive %d (/dev/rd/c%dd%d) "
+ "Now Exists\n", Controller,
+ LogicalDriveNumber,
+ Controller->ControllerNumber,
+ LogicalDriveNumber);
+ Controller->LogicalDriveCount = NewEnquiry->NumberOfLogicalDrives;
+ DAC960_ComputeGenericDiskInfo(Controller);
+ }
+ if (NewEnquiry->NumberOfLogicalDrives < Controller->LogicalDriveCount)
+ {
+ int LogicalDriveNumber = NewEnquiry->NumberOfLogicalDrives - 1;
+ while (++LogicalDriveNumber < Controller->LogicalDriveCount)
+ DAC960_Critical("Logical Drive %d (/dev/rd/c%dd%d) "
+ "No Longer Exists\n", Controller,
+ LogicalDriveNumber,
+ Controller->ControllerNumber,
+ LogicalDriveNumber);
+ Controller->LogicalDriveCount = NewEnquiry->NumberOfLogicalDrives;
+ DAC960_ComputeGenericDiskInfo(Controller);
+ }
+ if (NewEnquiry->StatusFlags.DeferredWriteError !=
+ OldEnquiry->StatusFlags.DeferredWriteError)
+ DAC960_Critical("Deferred Write Error Flag is now %s\n", Controller,
+ (NewEnquiry->StatusFlags.DeferredWriteError
+ ? "TRUE" : "FALSE"));
+ if ((NewCriticalLogicalDriveCount > 0 ||
+ NewCriticalLogicalDriveCount != OldCriticalLogicalDriveCount) ||
+ (NewEnquiry->OfflineLogicalDriveCount > 0 ||
+ NewEnquiry->OfflineLogicalDriveCount !=
+ OldEnquiry->OfflineLogicalDriveCount) ||
+ (NewEnquiry->DeadDriveCount > 0 ||
+ NewEnquiry->DeadDriveCount !=
+ OldEnquiry->DeadDriveCount) ||
+ (NewEnquiry->EventLogSequenceNumber !=
+ OldEnquiry->EventLogSequenceNumber) ||
+ Controller->MonitoringTimerCount == 0 ||
+ time_after_eq(jiffies, Controller->SecondaryMonitoringTime
+ + DAC960_SecondaryMonitoringInterval))
+ {
+ Controller->V1.NeedLogicalDriveInformation = true;
+ Controller->V1.NewEventLogSequenceNumber =
+ NewEnquiry->EventLogSequenceNumber;
+ Controller->V1.NeedErrorTableInformation = true;
+ Controller->V1.NeedDeviceStateInformation = true;
+ Controller->V1.StartDeviceStateScan = true;
+ Controller->V1.NeedBackgroundInitializationStatus =
+ Controller->V1.BackgroundInitializationStatusSupported;
+ Controller->SecondaryMonitoringTime = jiffies;
+ }
+ if (NewEnquiry->RebuildFlag == DAC960_V1_StandbyRebuildInProgress ||
+ NewEnquiry->RebuildFlag
+ == DAC960_V1_BackgroundRebuildInProgress ||
+ OldEnquiry->RebuildFlag == DAC960_V1_StandbyRebuildInProgress ||
+ OldEnquiry->RebuildFlag == DAC960_V1_BackgroundRebuildInProgress)
+ {
+ Controller->V1.NeedRebuildProgress = true;
+ Controller->V1.RebuildProgressFirst =
+ (NewEnquiry->CriticalLogicalDriveCount <
+ OldEnquiry->CriticalLogicalDriveCount);
+ }
+ if (OldEnquiry->RebuildFlag == DAC960_V1_BackgroundCheckInProgress)
+ switch (NewEnquiry->RebuildFlag)
+ {
+ case DAC960_V1_NoStandbyRebuildOrCheckInProgress:
+ DAC960_Progress("Consistency Check Completed Successfully\n",
+ Controller);
+ break;
+ case DAC960_V1_StandbyRebuildInProgress:
+ case DAC960_V1_BackgroundRebuildInProgress:
+ break;
+ case DAC960_V1_BackgroundCheckInProgress:
+ Controller->V1.NeedConsistencyCheckProgress = true;
+ break;
+ case DAC960_V1_StandbyRebuildCompletedWithError:
+ DAC960_Progress("Consistency Check Completed with Error\n",
+ Controller);
+ break;
+ case DAC960_V1_BackgroundRebuildOrCheckFailed_DriveFailed:
+ DAC960_Progress("Consistency Check Failed - "
+ "Physical Device Failed\n", Controller);
+ break;
+ case DAC960_V1_BackgroundRebuildOrCheckFailed_LogicalDriveFailed:
+ DAC960_Progress("Consistency Check Failed - "
+ "Logical Drive Failed\n", Controller);
+ break;
+ case DAC960_V1_BackgroundRebuildOrCheckFailed_OtherCauses:
+ DAC960_Progress("Consistency Check Failed - Other Causes\n",
+ Controller);
+ break;
+ case DAC960_V1_BackgroundRebuildOrCheckSuccessfullyTerminated:
+ DAC960_Progress("Consistency Check Successfully Terminated\n",
+ Controller);
+ break;
+ }
+ else if (NewEnquiry->RebuildFlag
+ == DAC960_V1_BackgroundCheckInProgress)
+ Controller->V1.NeedConsistencyCheckProgress = true;
+ Controller->MonitoringAlertMode =
+ (NewEnquiry->CriticalLogicalDriveCount > 0 ||
+ NewEnquiry->OfflineLogicalDriveCount > 0 ||
+ NewEnquiry->DeadDriveCount > 0);
+ if (NewEnquiry->RebuildFlag > DAC960_V1_BackgroundCheckInProgress)
+ {
+ Controller->V1.PendingRebuildFlag = NewEnquiry->RebuildFlag;
+ Controller->V1.RebuildFlagPending = true;
+ }
+ memcpy(&Controller->V1.Enquiry, &Controller->V1.NewEnquiry,
+ sizeof(DAC960_V1_Enquiry_T));
+ }
+ else if (CommandOpcode == DAC960_V1_PerformEventLogOperation)
+ {
+ static char
+ *DAC960_EventMessages[] =
+ { "killed because write recovery failed",
+ "killed because of SCSI bus reset failure",
+ "killed because of double check condition",
+ "killed because it was removed",
+ "killed because of gross error on SCSI chip",
+ "killed because of bad tag returned from drive",
+ "killed because of timeout on SCSI command",
+ "killed because of reset SCSI command issued from system",
+ "killed because busy or parity error count exceeded limit",
+ "killed because of 'kill drive' command from system",
+ "killed because of selection timeout",
+ "killed due to SCSI phase sequence error",
+ "killed due to unknown status" };
+ DAC960_V1_EventLogEntry_T *EventLogEntry =
+ Controller->V1.EventLogEntry;
+ if (EventLogEntry->SequenceNumber ==
+ Controller->V1.OldEventLogSequenceNumber)
+ {
+ unsigned char SenseKey = EventLogEntry->SenseKey;
+ unsigned char AdditionalSenseCode =
+ EventLogEntry->AdditionalSenseCode;
+ unsigned char AdditionalSenseCodeQualifier =
+ EventLogEntry->AdditionalSenseCodeQualifier;
+ if (SenseKey == DAC960_SenseKey_VendorSpecific &&
+ AdditionalSenseCode == 0x80 &&
+ AdditionalSenseCodeQualifier <
+ ARRAY_SIZE(DAC960_EventMessages))
+ DAC960_Critical("Physical Device %d:%d %s\n", Controller,
+ EventLogEntry->Channel,
+ EventLogEntry->TargetID,
+ DAC960_EventMessages[
+ AdditionalSenseCodeQualifier]);
+ else if (SenseKey == DAC960_SenseKey_UnitAttention &&
+ AdditionalSenseCode == 0x29)
+ {
+ if (Controller->MonitoringTimerCount > 0)
+ Controller->V1.DeviceResetCount[EventLogEntry->Channel]
+ [EventLogEntry->TargetID]++;
+ }
+ else if (!(SenseKey == DAC960_SenseKey_NoSense ||
+ (SenseKey == DAC960_SenseKey_NotReady &&
+ AdditionalSenseCode == 0x04 &&
+ (AdditionalSenseCodeQualifier == 0x01 ||
+ AdditionalSenseCodeQualifier == 0x02))))
+ {
+ DAC960_Critical("Physical Device %d:%d Error Log: "
+ "Sense Key = %X, ASC = %02X, ASCQ = %02X\n",
+ Controller,
+ EventLogEntry->Channel,
+ EventLogEntry->TargetID,
+ SenseKey,
+ AdditionalSenseCode,
+ AdditionalSenseCodeQualifier);
+ DAC960_Critical("Physical Device %d:%d Error Log: "
+ "Information = %02X%02X%02X%02X "
+ "%02X%02X%02X%02X\n",
+ Controller,
+ EventLogEntry->Channel,
+ EventLogEntry->TargetID,
+ EventLogEntry->Information[0],
+ EventLogEntry->Information[1],
+ EventLogEntry->Information[2],
+ EventLogEntry->Information[3],
+ EventLogEntry->CommandSpecificInformation[0],
+ EventLogEntry->CommandSpecificInformation[1],
+ EventLogEntry->CommandSpecificInformation[2],
+ EventLogEntry->CommandSpecificInformation[3]);
+ }
+ }
+ Controller->V1.OldEventLogSequenceNumber++;
+ }
+ else if (CommandOpcode == DAC960_V1_GetErrorTable)
+ {
+ DAC960_V1_ErrorTable_T *OldErrorTable = &Controller->V1.ErrorTable;
+ DAC960_V1_ErrorTable_T *NewErrorTable = Controller->V1.NewErrorTable;
+ int Channel, TargetID;
+ for (Channel = 0; Channel < Controller->Channels; Channel++)
+ for (TargetID = 0; TargetID < Controller->Targets; TargetID++)
+ {
+ DAC960_V1_ErrorTableEntry_T *NewErrorEntry =
+ &NewErrorTable->ErrorTableEntries[Channel][TargetID];
+ DAC960_V1_ErrorTableEntry_T *OldErrorEntry =
+ &OldErrorTable->ErrorTableEntries[Channel][TargetID];
+ if ((NewErrorEntry->ParityErrorCount !=
+ OldErrorEntry->ParityErrorCount) ||
+ (NewErrorEntry->SoftErrorCount !=
+ OldErrorEntry->SoftErrorCount) ||
+ (NewErrorEntry->HardErrorCount !=
+ OldErrorEntry->HardErrorCount) ||
+ (NewErrorEntry->MiscErrorCount !=
+ OldErrorEntry->MiscErrorCount))
+ DAC960_Critical("Physical Device %d:%d Errors: "
+ "Parity = %d, Soft = %d, "
+ "Hard = %d, Misc = %d\n",
+ Controller, Channel, TargetID,
+ NewErrorEntry->ParityErrorCount,
+ NewErrorEntry->SoftErrorCount,
+ NewErrorEntry->HardErrorCount,
+ NewErrorEntry->MiscErrorCount);
+ }
+ memcpy(&Controller->V1.ErrorTable, Controller->V1.NewErrorTable,
+ sizeof(DAC960_V1_ErrorTable_T));
+ }
+ else if (CommandOpcode == DAC960_V1_GetDeviceState)
+ {
+ DAC960_V1_DeviceState_T *OldDeviceState =
+ &Controller->V1.DeviceState[Controller->V1.DeviceStateChannel]
+ [Controller->V1.DeviceStateTargetID];
+ DAC960_V1_DeviceState_T *NewDeviceState =
+ Controller->V1.NewDeviceState;
+ if (NewDeviceState->DeviceState != OldDeviceState->DeviceState)
+ DAC960_Critical("Physical Device %d:%d is now %s\n", Controller,
+ Controller->V1.DeviceStateChannel,
+ Controller->V1.DeviceStateTargetID,
+ (NewDeviceState->DeviceState
+ == DAC960_V1_Device_Dead
+ ? "DEAD"
+ : NewDeviceState->DeviceState
+ == DAC960_V1_Device_WriteOnly
+ ? "WRITE-ONLY"
+ : NewDeviceState->DeviceState
+ == DAC960_V1_Device_Online
+ ? "ONLINE" : "STANDBY"));
+ if (OldDeviceState->DeviceState == DAC960_V1_Device_Dead &&
+ NewDeviceState->DeviceState != DAC960_V1_Device_Dead)
+ {
+ Controller->V1.NeedDeviceInquiryInformation = true;
+ Controller->V1.NeedDeviceSerialNumberInformation = true;
+ Controller->V1.DeviceResetCount
+ [Controller->V1.DeviceStateChannel]
+ [Controller->V1.DeviceStateTargetID] = 0;
+ }
+ memcpy(OldDeviceState, NewDeviceState,
+ sizeof(DAC960_V1_DeviceState_T));
+ }
+ else if (CommandOpcode == DAC960_V1_GetLogicalDriveInformation)
+ {
+ int LogicalDriveNumber;
+ for (LogicalDriveNumber = 0;
+ LogicalDriveNumber < Controller->LogicalDriveCount;
+ LogicalDriveNumber++)
+ {
+ DAC960_V1_LogicalDriveInformation_T *OldLogicalDriveInformation =
+ &Controller->V1.LogicalDriveInformation[LogicalDriveNumber];
+ DAC960_V1_LogicalDriveInformation_T *NewLogicalDriveInformation =
+ &(*Controller->V1.NewLogicalDriveInformation)[LogicalDriveNumber];
+ if (NewLogicalDriveInformation->LogicalDriveState !=
+ OldLogicalDriveInformation->LogicalDriveState)
+ DAC960_Critical("Logical Drive %d (/dev/rd/c%dd%d) "
+ "is now %s\n", Controller,
+ LogicalDriveNumber,
+ Controller->ControllerNumber,
+ LogicalDriveNumber,
+ (NewLogicalDriveInformation->LogicalDriveState
+ == DAC960_V1_LogicalDrive_Online
+ ? "ONLINE"
+ : NewLogicalDriveInformation->LogicalDriveState
+ == DAC960_V1_LogicalDrive_Critical
+ ? "CRITICAL" : "OFFLINE"));
+ if (NewLogicalDriveInformation->WriteBack !=
+ OldLogicalDriveInformation->WriteBack)
+ DAC960_Critical("Logical Drive %d (/dev/rd/c%dd%d) "
+ "is now %s\n", Controller,
+ LogicalDriveNumber,
+ Controller->ControllerNumber,
+ LogicalDriveNumber,
+ (NewLogicalDriveInformation->WriteBack
+ ? "WRITE BACK" : "WRITE THRU"));
+ }
+ memcpy(&Controller->V1.LogicalDriveInformation,
+ Controller->V1.NewLogicalDriveInformation,
+ sizeof(DAC960_V1_LogicalDriveInformationArray_T));
+ }
+ else if (CommandOpcode == DAC960_V1_GetRebuildProgress)
+ {
+ unsigned int LogicalDriveNumber =
+ Controller->V1.RebuildProgress->LogicalDriveNumber;
+ unsigned int LogicalDriveSize =
+ Controller->V1.RebuildProgress->LogicalDriveSize;
+ unsigned int BlocksCompleted =
+ LogicalDriveSize - Controller->V1.RebuildProgress->RemainingBlocks;
+ if (CommandStatus == DAC960_V1_NoRebuildOrCheckInProgress &&
+ Controller->V1.LastRebuildStatus == DAC960_V1_NormalCompletion)
+ CommandStatus = DAC960_V1_RebuildSuccessful;
+ switch (CommandStatus)
+ {
+ case DAC960_V1_NormalCompletion:
+ Controller->EphemeralProgressMessage = true;
+ DAC960_Progress("Rebuild in Progress: "
+ "Logical Drive %d (/dev/rd/c%dd%d) "
+ "%d%% completed\n",
+ Controller, LogicalDriveNumber,
+ Controller->ControllerNumber,
+ LogicalDriveNumber,
+ (100 * (BlocksCompleted >> 7))
+ / (LogicalDriveSize >> 7));
+ Controller->EphemeralProgressMessage = false;
+ break;
+ case DAC960_V1_RebuildFailed_LogicalDriveFailure:
+ DAC960_Progress("Rebuild Failed due to "
+ "Logical Drive Failure\n", Controller);
+ break;
+ case DAC960_V1_RebuildFailed_BadBlocksOnOther:
+ DAC960_Progress("Rebuild Failed due to "
+ "Bad Blocks on Other Drives\n", Controller);
+ break;
+ case DAC960_V1_RebuildFailed_NewDriveFailed:
+ DAC960_Progress("Rebuild Failed due to "
+ "Failure of Drive Being Rebuilt\n", Controller);
+ break;
+ case DAC960_V1_NoRebuildOrCheckInProgress:
+ break;
+ case DAC960_V1_RebuildSuccessful:
+ DAC960_Progress("Rebuild Completed Successfully\n", Controller);
+ break;
+ case DAC960_V1_RebuildSuccessfullyTerminated:
+ DAC960_Progress("Rebuild Successfully Terminated\n", Controller);
+ break;
+ }
+ Controller->V1.LastRebuildStatus = CommandStatus;
+ if (CommandType != DAC960_MonitoringCommand &&
+ Controller->V1.RebuildStatusPending)
+ {
+ Command->V1.CommandStatus = Controller->V1.PendingRebuildStatus;
+ Controller->V1.RebuildStatusPending = false;
+ }
+ else if (CommandType == DAC960_MonitoringCommand &&
+ CommandStatus != DAC960_V1_NormalCompletion &&
+ CommandStatus != DAC960_V1_NoRebuildOrCheckInProgress)
+ {
+ Controller->V1.PendingRebuildStatus = CommandStatus;
+ Controller->V1.RebuildStatusPending = true;
+ }
+ }
+ else if (CommandOpcode == DAC960_V1_RebuildStat)
+ {
+ unsigned int LogicalDriveNumber =
+ Controller->V1.RebuildProgress->LogicalDriveNumber;
+ unsigned int LogicalDriveSize =
+ Controller->V1.RebuildProgress->LogicalDriveSize;
+ unsigned int BlocksCompleted =
+ LogicalDriveSize - Controller->V1.RebuildProgress->RemainingBlocks;
+ if (CommandStatus == DAC960_V1_NormalCompletion)
+ {
+ Controller->EphemeralProgressMessage = true;
+ DAC960_Progress("Consistency Check in Progress: "
+ "Logical Drive %d (/dev/rd/c%dd%d) "
+ "%d%% completed\n",
+ Controller, LogicalDriveNumber,
+ Controller->ControllerNumber,
+ LogicalDriveNumber,
+ (100 * (BlocksCompleted >> 7))
+ / (LogicalDriveSize >> 7));
+ Controller->EphemeralProgressMessage = false;
+ }
+ }
+ else if (CommandOpcode == DAC960_V1_BackgroundInitializationControl)
+ {
+ unsigned int LogicalDriveNumber =
+ Controller->V1.BackgroundInitializationStatus->LogicalDriveNumber;
+ unsigned int LogicalDriveSize =
+ Controller->V1.BackgroundInitializationStatus->LogicalDriveSize;
+ unsigned int BlocksCompleted =
+ Controller->V1.BackgroundInitializationStatus->BlocksCompleted;
+ switch (CommandStatus)
+ {
+ case DAC960_V1_NormalCompletion:
+ switch (Controller->V1.BackgroundInitializationStatus->Status)
+ {
+ case DAC960_V1_BackgroundInitializationInvalid:
+ break;
+ case DAC960_V1_BackgroundInitializationStarted:
+ DAC960_Progress("Background Initialization Started\n",
+ Controller);
+ break;
+ case DAC960_V1_BackgroundInitializationInProgress:
+ if (BlocksCompleted ==
+ Controller->V1.LastBackgroundInitializationStatus.
+ BlocksCompleted &&
+ LogicalDriveNumber ==
+ Controller->V1.LastBackgroundInitializationStatus.
+ LogicalDriveNumber)
+ break;
+ Controller->EphemeralProgressMessage = true;
+ DAC960_Progress("Background Initialization in Progress: "
+ "Logical Drive %d (/dev/rd/c%dd%d) "
+ "%d%% completed\n",
+ Controller, LogicalDriveNumber,
+ Controller->ControllerNumber,
+ LogicalDriveNumber,
+ (100 * (BlocksCompleted >> 7))
+ / (LogicalDriveSize >> 7));
+ Controller->EphemeralProgressMessage = false;
+ break;
+ case DAC960_V1_BackgroundInitializationSuspended:
+ DAC960_Progress("Background Initialization Suspended\n",
+ Controller);
+ break;
+ case DAC960_V1_BackgroundInitializationCancelled:
+ DAC960_Progress("Background Initialization Cancelled\n",
+ Controller);
+ break;
+ }
+ memcpy(&Controller->V1.LastBackgroundInitializationStatus,
+ Controller->V1.BackgroundInitializationStatus,
+ sizeof(DAC960_V1_BackgroundInitializationStatus_T));
+ break;
+ case DAC960_V1_BackgroundInitSuccessful:
+ if (Controller->V1.BackgroundInitializationStatus->Status ==
+ DAC960_V1_BackgroundInitializationInProgress)
+ DAC960_Progress("Background Initialization "
+ "Completed Successfully\n", Controller);
+ Controller->V1.BackgroundInitializationStatus->Status =
+ DAC960_V1_BackgroundInitializationInvalid;
+ break;
+ case DAC960_V1_BackgroundInitAborted:
+ if (Controller->V1.BackgroundInitializationStatus->Status ==
+ DAC960_V1_BackgroundInitializationInProgress)
+ DAC960_Progress("Background Initialization Aborted\n",
+ Controller);
+ Controller->V1.BackgroundInitializationStatus->Status =
+ DAC960_V1_BackgroundInitializationInvalid;
+ break;
+ case DAC960_V1_NoBackgroundInitInProgress:
+ break;
+ }
+ }
+ else if (CommandOpcode == DAC960_V1_DCDB)
+ {
+ /*
+ This is a bit ugly.
+
+ The InquiryStandardData and
+ the InquiryUntitSerialNumber information
+ retrieval operations BOTH use the DAC960_V1_DCDB
+ commands. the test above can't distinguish between
+ these two cases.
+
+ Instead, we rely on the order of code later in this
+ function to ensure that DeviceInquiryInformation commands
+ are submitted before DeviceSerialNumber commands.
+ */
+ if (Controller->V1.NeedDeviceInquiryInformation)
+ {
+ DAC960_SCSI_Inquiry_T *InquiryStandardData =
+ &Controller->V1.InquiryStandardData
+ [Controller->V1.DeviceStateChannel]
+ [Controller->V1.DeviceStateTargetID];
+ if (CommandStatus != DAC960_V1_NormalCompletion)
+ {
+ memset(InquiryStandardData, 0,
+ sizeof(DAC960_SCSI_Inquiry_T));
+ InquiryStandardData->PeripheralDeviceType = 0x1F;
+ }
+ else
+ memcpy(InquiryStandardData,
+ Controller->V1.NewInquiryStandardData,
+ sizeof(DAC960_SCSI_Inquiry_T));
+ Controller->V1.NeedDeviceInquiryInformation = false;
+ }
+ else if (Controller->V1.NeedDeviceSerialNumberInformation)
+ {
+ DAC960_SCSI_Inquiry_UnitSerialNumber_T *InquiryUnitSerialNumber =
+ &Controller->V1.InquiryUnitSerialNumber
+ [Controller->V1.DeviceStateChannel]
+ [Controller->V1.DeviceStateTargetID];
+ if (CommandStatus != DAC960_V1_NormalCompletion)
+ {
+ memset(InquiryUnitSerialNumber, 0,
+ sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T));
+ InquiryUnitSerialNumber->PeripheralDeviceType = 0x1F;
+ }
+ else
+ memcpy(InquiryUnitSerialNumber,
+ Controller->V1.NewInquiryUnitSerialNumber,
+ sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T));
+ Controller->V1.NeedDeviceSerialNumberInformation = false;
+ }
+ }
+ /*
+ Begin submitting new monitoring commands.
+ */
+ if (Controller->V1.NewEventLogSequenceNumber
+ - Controller->V1.OldEventLogSequenceNumber > 0)
+ {
+ Command->V1.CommandMailbox.Type3E.CommandOpcode =
+ DAC960_V1_PerformEventLogOperation;
+ Command->V1.CommandMailbox.Type3E.OperationType =
+ DAC960_V1_GetEventLogEntry;
+ Command->V1.CommandMailbox.Type3E.OperationQualifier = 1;
+ Command->V1.CommandMailbox.Type3E.SequenceNumber =
+ Controller->V1.OldEventLogSequenceNumber;
+ Command->V1.CommandMailbox.Type3E.BusAddress =
+ Controller->V1.EventLogEntryDMA;
+ DAC960_QueueCommand(Command);
+ return;
+ }
+ if (Controller->V1.NeedErrorTableInformation)
+ {
+ Controller->V1.NeedErrorTableInformation = false;
+ Command->V1.CommandMailbox.Type3.CommandOpcode =
+ DAC960_V1_GetErrorTable;
+ Command->V1.CommandMailbox.Type3.BusAddress =
+ Controller->V1.NewErrorTableDMA;
+ DAC960_QueueCommand(Command);
+ return;
+ }
+ if (Controller->V1.NeedRebuildProgress &&
+ Controller->V1.RebuildProgressFirst)
+ {
+ Controller->V1.NeedRebuildProgress = false;
+ Command->V1.CommandMailbox.Type3.CommandOpcode =
+ DAC960_V1_GetRebuildProgress;
+ Command->V1.CommandMailbox.Type3.BusAddress =
+ Controller->V1.RebuildProgressDMA;
+ DAC960_QueueCommand(Command);
+ return;
+ }
+ if (Controller->V1.NeedDeviceStateInformation)
+ {
+ if (Controller->V1.NeedDeviceInquiryInformation)
+ {
+ DAC960_V1_DCDB_T *DCDB = Controller->V1.MonitoringDCDB;
+ dma_addr_t DCDB_DMA = Controller->V1.MonitoringDCDB_DMA;
+
+ dma_addr_t NewInquiryStandardDataDMA =
+ Controller->V1.NewInquiryStandardDataDMA;
+
+ Command->V1.CommandMailbox.Type3.CommandOpcode = DAC960_V1_DCDB;
+ Command->V1.CommandMailbox.Type3.BusAddress = DCDB_DMA;
+ DCDB->Channel = Controller->V1.DeviceStateChannel;
+ DCDB->TargetID = Controller->V1.DeviceStateTargetID;
+ DCDB->Direction = DAC960_V1_DCDB_DataTransferDeviceToSystem;
+ DCDB->EarlyStatus = false;
+ DCDB->Timeout = DAC960_V1_DCDB_Timeout_10_seconds;
+ DCDB->NoAutomaticRequestSense = false;
+ DCDB->DisconnectPermitted = true;
+ DCDB->TransferLength = sizeof(DAC960_SCSI_Inquiry_T);
+ DCDB->BusAddress = NewInquiryStandardDataDMA;
+ DCDB->CDBLength = 6;
+ DCDB->TransferLengthHigh4 = 0;
+ DCDB->SenseLength = sizeof(DCDB->SenseData);
+ DCDB->CDB[0] = 0x12; /* INQUIRY */
+ DCDB->CDB[1] = 0; /* EVPD = 0 */
+ DCDB->CDB[2] = 0; /* Page Code */
+ DCDB->CDB[3] = 0; /* Reserved */
+ DCDB->CDB[4] = sizeof(DAC960_SCSI_Inquiry_T);
+ DCDB->CDB[5] = 0; /* Control */
+ DAC960_QueueCommand(Command);
+ return;
+ }
+ if (Controller->V1.NeedDeviceSerialNumberInformation)
+ {
+ DAC960_V1_DCDB_T *DCDB = Controller->V1.MonitoringDCDB;
+ dma_addr_t DCDB_DMA = Controller->V1.MonitoringDCDB_DMA;
+ dma_addr_t NewInquiryUnitSerialNumberDMA =
+ Controller->V1.NewInquiryUnitSerialNumberDMA;
+
+ Command->V1.CommandMailbox.Type3.CommandOpcode = DAC960_V1_DCDB;
+ Command->V1.CommandMailbox.Type3.BusAddress = DCDB_DMA;
+ DCDB->Channel = Controller->V1.DeviceStateChannel;
+ DCDB->TargetID = Controller->V1.DeviceStateTargetID;
+ DCDB->Direction = DAC960_V1_DCDB_DataTransferDeviceToSystem;
+ DCDB->EarlyStatus = false;
+ DCDB->Timeout = DAC960_V1_DCDB_Timeout_10_seconds;
+ DCDB->NoAutomaticRequestSense = false;
+ DCDB->DisconnectPermitted = true;
+ DCDB->TransferLength =
+ sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T);
+ DCDB->BusAddress = NewInquiryUnitSerialNumberDMA;
+ DCDB->CDBLength = 6;
+ DCDB->TransferLengthHigh4 = 0;
+ DCDB->SenseLength = sizeof(DCDB->SenseData);
+ DCDB->CDB[0] = 0x12; /* INQUIRY */
+ DCDB->CDB[1] = 1; /* EVPD = 1 */
+ DCDB->CDB[2] = 0x80; /* Page Code */
+ DCDB->CDB[3] = 0; /* Reserved */
+ DCDB->CDB[4] = sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T);
+ DCDB->CDB[5] = 0; /* Control */
+ DAC960_QueueCommand(Command);
+ return;
+ }
+ if (Controller->V1.StartDeviceStateScan)
+ {
+ Controller->V1.DeviceStateChannel = 0;
+ Controller->V1.DeviceStateTargetID = 0;
+ Controller->V1.StartDeviceStateScan = false;
+ }
+ else if (++Controller->V1.DeviceStateTargetID == Controller->Targets)
+ {
+ Controller->V1.DeviceStateChannel++;
+ Controller->V1.DeviceStateTargetID = 0;
+ }
+ if (Controller->V1.DeviceStateChannel < Controller->Channels)
+ {
+ Controller->V1.NewDeviceState->DeviceState =
+ DAC960_V1_Device_Dead;
+ Command->V1.CommandMailbox.Type3D.CommandOpcode =
+ DAC960_V1_GetDeviceState;
+ Command->V1.CommandMailbox.Type3D.Channel =
+ Controller->V1.DeviceStateChannel;
+ Command->V1.CommandMailbox.Type3D.TargetID =
+ Controller->V1.DeviceStateTargetID;
+ Command->V1.CommandMailbox.Type3D.BusAddress =
+ Controller->V1.NewDeviceStateDMA;
+ DAC960_QueueCommand(Command);
+ return;
+ }
+ Controller->V1.NeedDeviceStateInformation = false;
+ }
+ if (Controller->V1.NeedLogicalDriveInformation)
+ {
+ Controller->V1.NeedLogicalDriveInformation = false;
+ Command->V1.CommandMailbox.Type3.CommandOpcode =
+ DAC960_V1_GetLogicalDriveInformation;
+ Command->V1.CommandMailbox.Type3.BusAddress =
+ Controller->V1.NewLogicalDriveInformationDMA;
+ DAC960_QueueCommand(Command);
+ return;
+ }
+ if (Controller->V1.NeedRebuildProgress)
+ {
+ Controller->V1.NeedRebuildProgress = false;
+ Command->V1.CommandMailbox.Type3.CommandOpcode =
+ DAC960_V1_GetRebuildProgress;
+ Command->V1.CommandMailbox.Type3.BusAddress =
+ Controller->V1.RebuildProgressDMA;
+ DAC960_QueueCommand(Command);
+ return;
+ }
+ if (Controller->V1.NeedConsistencyCheckProgress)
+ {
+ Controller->V1.NeedConsistencyCheckProgress = false;
+ Command->V1.CommandMailbox.Type3.CommandOpcode =
+ DAC960_V1_RebuildStat;
+ Command->V1.CommandMailbox.Type3.BusAddress =
+ Controller->V1.RebuildProgressDMA;
+ DAC960_QueueCommand(Command);
+ return;
+ }
+ if (Controller->V1.NeedBackgroundInitializationStatus)
+ {
+ Controller->V1.NeedBackgroundInitializationStatus = false;
+ Command->V1.CommandMailbox.Type3B.CommandOpcode =
+ DAC960_V1_BackgroundInitializationControl;
+ Command->V1.CommandMailbox.Type3B.CommandOpcode2 = 0x20;
+ Command->V1.CommandMailbox.Type3B.BusAddress =
+ Controller->V1.BackgroundInitializationStatusDMA;
+ DAC960_QueueCommand(Command);
+ return;
+ }
+ Controller->MonitoringTimerCount++;
+ Controller->MonitoringTimer.expires =
+ jiffies + DAC960_MonitoringTimerInterval;
+ add_timer(&Controller->MonitoringTimer);
+ }
+ if (CommandType == DAC960_ImmediateCommand)
+ {
+ complete(Command->Completion);
+ Command->Completion = NULL;
+ return;
+ }
+ if (CommandType == DAC960_QueuedCommand)
+ {
+ DAC960_V1_KernelCommand_T *KernelCommand = Command->V1.KernelCommand;
+ KernelCommand->CommandStatus = Command->V1.CommandStatus;
+ Command->V1.KernelCommand = NULL;
+ if (CommandOpcode == DAC960_V1_DCDB)
+ Controller->V1.DirectCommandActive[KernelCommand->DCDB->Channel]
+ [KernelCommand->DCDB->TargetID] =
+ false;
+ DAC960_DeallocateCommand(Command);
+ KernelCommand->CompletionFunction(KernelCommand);
+ return;
+ }
+ /*
+ Queue a Status Monitoring Command to the Controller using the just
+ completed Command if one was deferred previously due to lack of a
+ free Command when the Monitoring Timer Function was called.
+ */
+ if (Controller->MonitoringCommandDeferred)
+ {
+ Controller->MonitoringCommandDeferred = false;
+ DAC960_V1_QueueMonitoringCommand(Command);
+ return;
+ }
+ /*
+ Deallocate the Command.
+ */
+ DAC960_DeallocateCommand(Command);
+ /*
+ Wake up any processes waiting on a free Command.
+ */
+ wake_up(&Controller->CommandWaitQueue);
+}
+
+
+/*
+ DAC960_V2_ReadWriteError prints an appropriate error message for Command
+ when an error occurs on a Read or Write operation.
+*/
+
+static void DAC960_V2_ReadWriteError(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ unsigned char *SenseErrors[] = { "NO SENSE", "RECOVERED ERROR",
+ "NOT READY", "MEDIUM ERROR",
+ "HARDWARE ERROR", "ILLEGAL REQUEST",
+ "UNIT ATTENTION", "DATA PROTECT",
+ "BLANK CHECK", "VENDOR-SPECIFIC",
+ "COPY ABORTED", "ABORTED COMMAND",
+ "EQUAL", "VOLUME OVERFLOW",
+ "MISCOMPARE", "RESERVED" };
+ unsigned char *CommandName = "UNKNOWN";
+ switch (Command->CommandType)
+ {
+ case DAC960_ReadCommand:
+ case DAC960_ReadRetryCommand:
+ CommandName = "READ";
+ break;
+ case DAC960_WriteCommand:
+ case DAC960_WriteRetryCommand:
+ CommandName = "WRITE";
+ break;
+ case DAC960_MonitoringCommand:
+ case DAC960_ImmediateCommand:
+ case DAC960_QueuedCommand:
+ break;
+ }
+ DAC960_Error("Error Condition %s on %s:\n", Controller,
+ SenseErrors[Command->V2.RequestSense->SenseKey], CommandName);
+ DAC960_Error(" /dev/rd/c%dd%d: absolute blocks %u..%u\n",
+ Controller, Controller->ControllerNumber,
+ Command->LogicalDriveNumber, Command->BlockNumber,
+ Command->BlockNumber + Command->BlockCount - 1);
+}
+
+
+/*
+ DAC960_V2_ReportEvent prints an appropriate message when a Controller Event
+ occurs.
+*/
+
+static void DAC960_V2_ReportEvent(DAC960_Controller_T *Controller,
+ DAC960_V2_Event_T *Event)
+{
+ DAC960_SCSI_RequestSense_T *RequestSense =
+ (DAC960_SCSI_RequestSense_T *) &Event->RequestSenseData;
+ unsigned char MessageBuffer[DAC960_LineBufferSize];
+ static struct { int EventCode; unsigned char *EventMessage; } EventList[] =
+ { /* Physical Device Events (0x0000 - 0x007F) */
+ { 0x0001, "P Online" },
+ { 0x0002, "P Standby" },
+ { 0x0005, "P Automatic Rebuild Started" },
+ { 0x0006, "P Manual Rebuild Started" },
+ { 0x0007, "P Rebuild Completed" },
+ { 0x0008, "P Rebuild Cancelled" },
+ { 0x0009, "P Rebuild Failed for Unknown Reasons" },
+ { 0x000A, "P Rebuild Failed due to New Physical Device" },
+ { 0x000B, "P Rebuild Failed due to Logical Drive Failure" },
+ { 0x000C, "S Offline" },
+ { 0x000D, "P Found" },
+ { 0x000E, "P Removed" },
+ { 0x000F, "P Unconfigured" },
+ { 0x0010, "P Expand Capacity Started" },
+ { 0x0011, "P Expand Capacity Completed" },
+ { 0x0012, "P Expand Capacity Failed" },
+ { 0x0013, "P Command Timed Out" },
+ { 0x0014, "P Command Aborted" },
+ { 0x0015, "P Command Retried" },
+ { 0x0016, "P Parity Error" },
+ { 0x0017, "P Soft Error" },
+ { 0x0018, "P Miscellaneous Error" },
+ { 0x0019, "P Reset" },
+ { 0x001A, "P Active Spare Found" },
+ { 0x001B, "P Warm Spare Found" },
+ { 0x001C, "S Sense Data Received" },
+ { 0x001D, "P Initialization Started" },
+ { 0x001E, "P Initialization Completed" },
+ { 0x001F, "P Initialization Failed" },
+ { 0x0020, "P Initialization Cancelled" },
+ { 0x0021, "P Failed because Write Recovery Failed" },
+ { 0x0022, "P Failed because SCSI Bus Reset Failed" },
+ { 0x0023, "P Failed because of Double Check Condition" },
+ { 0x0024, "P Failed because Device Cannot Be Accessed" },
+ { 0x0025, "P Failed because of Gross Error on SCSI Processor" },
+ { 0x0026, "P Failed because of Bad Tag from Device" },
+ { 0x0027, "P Failed because of Command Timeout" },
+ { 0x0028, "P Failed because of System Reset" },
+ { 0x0029, "P Failed because of Busy Status or Parity Error" },
+ { 0x002A, "P Failed because Host Set Device to Failed State" },
+ { 0x002B, "P Failed because of Selection Timeout" },
+ { 0x002C, "P Failed because of SCSI Bus Phase Error" },
+ { 0x002D, "P Failed because Device Returned Unknown Status" },
+ { 0x002E, "P Failed because Device Not Ready" },
+ { 0x002F, "P Failed because Device Not Found at Startup" },
+ { 0x0030, "P Failed because COD Write Operation Failed" },
+ { 0x0031, "P Failed because BDT Write Operation Failed" },
+ { 0x0039, "P Missing at Startup" },
+ { 0x003A, "P Start Rebuild Failed due to Physical Drive Too Small" },
+ { 0x003C, "P Temporarily Offline Device Automatically Made Online" },
+ { 0x003D, "P Standby Rebuild Started" },
+ /* Logical Device Events (0x0080 - 0x00FF) */
+ { 0x0080, "M Consistency Check Started" },
+ { 0x0081, "M Consistency Check Completed" },
+ { 0x0082, "M Consistency Check Cancelled" },
+ { 0x0083, "M Consistency Check Completed With Errors" },
+ { 0x0084, "M Consistency Check Failed due to Logical Drive Failure" },
+ { 0x0085, "M Consistency Check Failed due to Physical Device Failure" },
+ { 0x0086, "L Offline" },
+ { 0x0087, "L Critical" },
+ { 0x0088, "L Online" },
+ { 0x0089, "M Automatic Rebuild Started" },
+ { 0x008A, "M Manual Rebuild Started" },
+ { 0x008B, "M Rebuild Completed" },
+ { 0x008C, "M Rebuild Cancelled" },
+ { 0x008D, "M Rebuild Failed for Unknown Reasons" },
+ { 0x008E, "M Rebuild Failed due to New Physical Device" },
+ { 0x008F, "M Rebuild Failed due to Logical Drive Failure" },
+ { 0x0090, "M Initialization Started" },
+ { 0x0091, "M Initialization Completed" },
+ { 0x0092, "M Initialization Cancelled" },
+ { 0x0093, "M Initialization Failed" },
+ { 0x0094, "L Found" },
+ { 0x0095, "L Deleted" },
+ { 0x0096, "M Expand Capacity Started" },
+ { 0x0097, "M Expand Capacity Completed" },
+ { 0x0098, "M Expand Capacity Failed" },
+ { 0x0099, "L Bad Block Found" },
+ { 0x009A, "L Size Changed" },
+ { 0x009B, "L Type Changed" },
+ { 0x009C, "L Bad Data Block Found" },
+ { 0x009E, "L Read of Data Block in BDT" },
+ { 0x009F, "L Write Back Data for Disk Block Lost" },
+ { 0x00A0, "L Temporarily Offline RAID-5/3 Drive Made Online" },
+ { 0x00A1, "L Temporarily Offline RAID-6/1/0/7 Drive Made Online" },
+ { 0x00A2, "L Standby Rebuild Started" },
+ /* Fault Management Events (0x0100 - 0x017F) */
+ { 0x0140, "E Fan %d Failed" },
+ { 0x0141, "E Fan %d OK" },
+ { 0x0142, "E Fan %d Not Present" },
+ { 0x0143, "E Power Supply %d Failed" },
+ { 0x0144, "E Power Supply %d OK" },
+ { 0x0145, "E Power Supply %d Not Present" },
+ { 0x0146, "E Temperature Sensor %d Temperature Exceeds Safe Limit" },
+ { 0x0147, "E Temperature Sensor %d Temperature Exceeds Working Limit" },
+ { 0x0148, "E Temperature Sensor %d Temperature Normal" },
+ { 0x0149, "E Temperature Sensor %d Not Present" },
+ { 0x014A, "E Enclosure Management Unit %d Access Critical" },
+ { 0x014B, "E Enclosure Management Unit %d Access OK" },
+ { 0x014C, "E Enclosure Management Unit %d Access Offline" },
+ /* Controller Events (0x0180 - 0x01FF) */
+ { 0x0181, "C Cache Write Back Error" },
+ { 0x0188, "C Battery Backup Unit Found" },
+ { 0x0189, "C Battery Backup Unit Charge Level Low" },
+ { 0x018A, "C Battery Backup Unit Charge Level OK" },
+ { 0x0193, "C Installation Aborted" },
+ { 0x0195, "C Battery Backup Unit Physically Removed" },
+ { 0x0196, "C Memory Error During Warm Boot" },
+ { 0x019E, "C Memory Soft ECC Error Corrected" },
+ { 0x019F, "C Memory Hard ECC Error Corrected" },
+ { 0x01A2, "C Battery Backup Unit Failed" },
+ { 0x01AB, "C Mirror Race Recovery Failed" },
+ { 0x01AC, "C Mirror Race on Critical Drive" },
+ /* Controller Internal Processor Events */
+ { 0x0380, "C Internal Controller Hung" },
+ { 0x0381, "C Internal Controller Firmware Breakpoint" },
+ { 0x0390, "C Internal Controller i960 Processor Specific Error" },
+ { 0x03A0, "C Internal Controller StrongARM Processor Specific Error" },
+ { 0, "" } };
+ int EventListIndex = 0, EventCode;
+ unsigned char EventType, *EventMessage;
+ if (Event->EventCode == 0x1C &&
+ RequestSense->SenseKey == DAC960_SenseKey_VendorSpecific &&
+ (RequestSense->AdditionalSenseCode == 0x80 ||
+ RequestSense->AdditionalSenseCode == 0x81))
+ Event->EventCode = ((RequestSense->AdditionalSenseCode - 0x80) << 8) |
+ RequestSense->AdditionalSenseCodeQualifier;
+ while (true)
+ {
+ EventCode = EventList[EventListIndex].EventCode;
+ if (EventCode == Event->EventCode || EventCode == 0) break;
+ EventListIndex++;
+ }
+ EventType = EventList[EventListIndex].EventMessage[0];
+ EventMessage = &EventList[EventListIndex].EventMessage[2];
+ if (EventCode == 0)
+ {
+ DAC960_Critical("Unknown Controller Event Code %04X\n",
+ Controller, Event->EventCode);
+ return;
+ }
+ switch (EventType)
+ {
+ case 'P':
+ DAC960_Critical("Physical Device %d:%d %s\n", Controller,
+ Event->Channel, Event->TargetID, EventMessage);
+ break;
+ case 'L':
+ DAC960_Critical("Logical Drive %d (/dev/rd/c%dd%d) %s\n", Controller,
+ Event->LogicalUnit, Controller->ControllerNumber,
+ Event->LogicalUnit, EventMessage);
+ break;
+ case 'M':
+ DAC960_Progress("Logical Drive %d (/dev/rd/c%dd%d) %s\n", Controller,
+ Event->LogicalUnit, Controller->ControllerNumber,
+ Event->LogicalUnit, EventMessage);
+ break;
+ case 'S':
+ if (RequestSense->SenseKey == DAC960_SenseKey_NoSense ||
+ (RequestSense->SenseKey == DAC960_SenseKey_NotReady &&
+ RequestSense->AdditionalSenseCode == 0x04 &&
+ (RequestSense->AdditionalSenseCodeQualifier == 0x01 ||
+ RequestSense->AdditionalSenseCodeQualifier == 0x02)))
+ break;
+ DAC960_Critical("Physical Device %d:%d %s\n", Controller,
+ Event->Channel, Event->TargetID, EventMessage);
+ DAC960_Critical("Physical Device %d:%d Request Sense: "
+ "Sense Key = %X, ASC = %02X, ASCQ = %02X\n",
+ Controller,
+ Event->Channel,
+ Event->TargetID,
+ RequestSense->SenseKey,
+ RequestSense->AdditionalSenseCode,
+ RequestSense->AdditionalSenseCodeQualifier);
+ DAC960_Critical("Physical Device %d:%d Request Sense: "
+ "Information = %02X%02X%02X%02X "
+ "%02X%02X%02X%02X\n",
+ Controller,
+ Event->Channel,
+ Event->TargetID,
+ RequestSense->Information[0],
+ RequestSense->Information[1],
+ RequestSense->Information[2],
+ RequestSense->Information[3],
+ RequestSense->CommandSpecificInformation[0],
+ RequestSense->CommandSpecificInformation[1],
+ RequestSense->CommandSpecificInformation[2],
+ RequestSense->CommandSpecificInformation[3]);
+ break;
+ case 'E':
+ if (Controller->SuppressEnclosureMessages) break;
+ sprintf(MessageBuffer, EventMessage, Event->LogicalUnit);
+ DAC960_Critical("Enclosure %d %s\n", Controller,
+ Event->TargetID, MessageBuffer);
+ break;
+ case 'C':
+ DAC960_Critical("Controller %s\n", Controller, EventMessage);
+ break;
+ default:
+ DAC960_Critical("Unknown Controller Event Code %04X\n",
+ Controller, Event->EventCode);
+ break;
+ }
+}
+
+
+/*
+ DAC960_V2_ReportProgress prints an appropriate progress message for
+ Logical Device Long Operations.
+*/
+
+static void DAC960_V2_ReportProgress(DAC960_Controller_T *Controller,
+ unsigned char *MessageString,
+ unsigned int LogicalDeviceNumber,
+ unsigned long BlocksCompleted,
+ unsigned long LogicalDeviceSize)
+{
+ Controller->EphemeralProgressMessage = true;
+ DAC960_Progress("%s in Progress: Logical Drive %d (/dev/rd/c%dd%d) "
+ "%d%% completed\n", Controller,
+ MessageString,
+ LogicalDeviceNumber,
+ Controller->ControllerNumber,
+ LogicalDeviceNumber,
+ (100 * (BlocksCompleted >> 7)) / (LogicalDeviceSize >> 7));
+ Controller->EphemeralProgressMessage = false;
+}
+
+
+/*
+ DAC960_V2_ProcessCompletedCommand performs completion processing for Command
+ for DAC960 V2 Firmware Controllers.
+*/
+
+static void DAC960_V2_ProcessCompletedCommand(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ DAC960_CommandType_T CommandType = Command->CommandType;
+ DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
+ DAC960_V2_IOCTL_Opcode_T IOCTLOpcode = CommandMailbox->Common.IOCTL_Opcode;
+ DAC960_V2_CommandOpcode_T CommandOpcode = CommandMailbox->SCSI_10.CommandOpcode;
+ DAC960_V2_CommandStatus_T CommandStatus = Command->V2.CommandStatus;
+
+ if (CommandType == DAC960_ReadCommand ||
+ CommandType == DAC960_WriteCommand)
+ {
+
+#ifdef FORCE_RETRY_DEBUG
+ CommandStatus = DAC960_V2_AbormalCompletion;
+#endif
+ Command->V2.RequestSense->SenseKey = DAC960_SenseKey_MediumError;
+
+ if (CommandStatus == DAC960_V2_NormalCompletion) {
+
+ if (!DAC960_ProcessCompletedRequest(Command, true))
+ BUG();
+
+ } else if (Command->V2.RequestSense->SenseKey == DAC960_SenseKey_MediumError)
+ {
+ /*
+ * break the command down into pieces and resubmit each
+ * piece, hoping that some of them will succeed.
+ */
+ DAC960_queue_partial_rw(Command);
+ return;
+ }
+ else
+ {
+ if (Command->V2.RequestSense->SenseKey != DAC960_SenseKey_NotReady)
+ DAC960_V2_ReadWriteError(Command);
+ /*
+ Perform completion processing for all buffers in this I/O Request.
+ */
+ (void)DAC960_ProcessCompletedRequest(Command, false);
+ }
+ }
+ else if (CommandType == DAC960_ReadRetryCommand ||
+ CommandType == DAC960_WriteRetryCommand)
+ {
+ bool normal_completion;
+
+#ifdef FORCE_RETRY_FAILURE_DEBUG
+ static int retry_count = 1;
+#endif
+ /*
+ Perform completion processing for the portion that was
+ retried, and submit the next portion, if any.
+ */
+ normal_completion = true;
+ if (CommandStatus != DAC960_V2_NormalCompletion) {
+ normal_completion = false;
+ if (Command->V2.RequestSense->SenseKey != DAC960_SenseKey_NotReady)
+ DAC960_V2_ReadWriteError(Command);
+ }
+
+#ifdef FORCE_RETRY_FAILURE_DEBUG
+ if (!(++retry_count % 10000)) {
+ printk("V2 error retry failure test\n");
+ normal_completion = false;
+ DAC960_V2_ReadWriteError(Command);
+ }
+#endif
+
+ if (!DAC960_ProcessCompletedRequest(Command, normal_completion)) {
+ DAC960_queue_partial_rw(Command);
+ return;
+ }
+ }
+ else if (CommandType == DAC960_MonitoringCommand)
+ {
+ if (Controller->ShutdownMonitoringTimer)
+ return;
+ if (IOCTLOpcode == DAC960_V2_GetControllerInfo)
+ {
+ DAC960_V2_ControllerInfo_T *NewControllerInfo =
+ Controller->V2.NewControllerInformation;
+ DAC960_V2_ControllerInfo_T *ControllerInfo =
+ &Controller->V2.ControllerInformation;
+ Controller->LogicalDriveCount =
+ NewControllerInfo->LogicalDevicesPresent;
+ Controller->V2.NeedLogicalDeviceInformation = true;
+ Controller->V2.NeedPhysicalDeviceInformation = true;
+ Controller->V2.StartLogicalDeviceInformationScan = true;
+ Controller->V2.StartPhysicalDeviceInformationScan = true;
+ Controller->MonitoringAlertMode =
+ (NewControllerInfo->LogicalDevicesCritical > 0 ||
+ NewControllerInfo->LogicalDevicesOffline > 0 ||
+ NewControllerInfo->PhysicalDisksCritical > 0 ||
+ NewControllerInfo->PhysicalDisksOffline > 0);
+ memcpy(ControllerInfo, NewControllerInfo,
+ sizeof(DAC960_V2_ControllerInfo_T));
+ }
+ else if (IOCTLOpcode == DAC960_V2_GetEvent)
+ {
+ if (CommandStatus == DAC960_V2_NormalCompletion) {
+ DAC960_V2_ReportEvent(Controller, Controller->V2.Event);
+ }
+ Controller->V2.NextEventSequenceNumber++;
+ }
+ else if (IOCTLOpcode == DAC960_V2_GetPhysicalDeviceInfoValid &&
+ CommandStatus == DAC960_V2_NormalCompletion)
+ {
+ DAC960_V2_PhysicalDeviceInfo_T *NewPhysicalDeviceInfo =
+ Controller->V2.NewPhysicalDeviceInformation;
+ unsigned int PhysicalDeviceIndex = Controller->V2.PhysicalDeviceIndex;
+ DAC960_V2_PhysicalDeviceInfo_T *PhysicalDeviceInfo =
+ Controller->V2.PhysicalDeviceInformation[PhysicalDeviceIndex];
+ DAC960_SCSI_Inquiry_UnitSerialNumber_T *InquiryUnitSerialNumber =
+ Controller->V2.InquiryUnitSerialNumber[PhysicalDeviceIndex];
+ unsigned int DeviceIndex;
+ while (PhysicalDeviceInfo != NULL &&
+ (NewPhysicalDeviceInfo->Channel >
+ PhysicalDeviceInfo->Channel ||
+ (NewPhysicalDeviceInfo->Channel ==
+ PhysicalDeviceInfo->Channel &&
+ (NewPhysicalDeviceInfo->TargetID >
+ PhysicalDeviceInfo->TargetID ||
+ (NewPhysicalDeviceInfo->TargetID ==
+ PhysicalDeviceInfo->TargetID &&
+ NewPhysicalDeviceInfo->LogicalUnit >
+ PhysicalDeviceInfo->LogicalUnit)))))
+ {
+ DAC960_Critical("Physical Device %d:%d No Longer Exists\n",
+ Controller,
+ PhysicalDeviceInfo->Channel,
+ PhysicalDeviceInfo->TargetID);
+ Controller->V2.PhysicalDeviceInformation
+ [PhysicalDeviceIndex] = NULL;
+ Controller->V2.InquiryUnitSerialNumber
+ [PhysicalDeviceIndex] = NULL;
+ kfree(PhysicalDeviceInfo);
+ kfree(InquiryUnitSerialNumber);
+ for (DeviceIndex = PhysicalDeviceIndex;
+ DeviceIndex < DAC960_V2_MaxPhysicalDevices - 1;
+ DeviceIndex++)
+ {
+ Controller->V2.PhysicalDeviceInformation[DeviceIndex] =
+ Controller->V2.PhysicalDeviceInformation[DeviceIndex+1];
+ Controller->V2.InquiryUnitSerialNumber[DeviceIndex] =
+ Controller->V2.InquiryUnitSerialNumber[DeviceIndex+1];
+ }
+ Controller->V2.PhysicalDeviceInformation
+ [DAC960_V2_MaxPhysicalDevices-1] = NULL;
+ Controller->V2.InquiryUnitSerialNumber
+ [DAC960_V2_MaxPhysicalDevices-1] = NULL;
+ PhysicalDeviceInfo =
+ Controller->V2.PhysicalDeviceInformation[PhysicalDeviceIndex];
+ InquiryUnitSerialNumber =
+ Controller->V2.InquiryUnitSerialNumber[PhysicalDeviceIndex];
+ }
+ if (PhysicalDeviceInfo == NULL ||
+ (NewPhysicalDeviceInfo->Channel !=
+ PhysicalDeviceInfo->Channel) ||
+ (NewPhysicalDeviceInfo->TargetID !=
+ PhysicalDeviceInfo->TargetID) ||
+ (NewPhysicalDeviceInfo->LogicalUnit !=
+ PhysicalDeviceInfo->LogicalUnit))
+ {
+ PhysicalDeviceInfo =
+ kmalloc(sizeof(DAC960_V2_PhysicalDeviceInfo_T), GFP_ATOMIC);
+ InquiryUnitSerialNumber =
+ kmalloc(sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T),
+ GFP_ATOMIC);
+ if (InquiryUnitSerialNumber == NULL ||
+ PhysicalDeviceInfo == NULL)
+ {
+ kfree(InquiryUnitSerialNumber);
+ InquiryUnitSerialNumber = NULL;
+ kfree(PhysicalDeviceInfo);
+ PhysicalDeviceInfo = NULL;
+ }
+ DAC960_Critical("Physical Device %d:%d Now Exists%s\n",
+ Controller,
+ NewPhysicalDeviceInfo->Channel,
+ NewPhysicalDeviceInfo->TargetID,
+ (PhysicalDeviceInfo != NULL
+ ? "" : " - Allocation Failed"));
+ if (PhysicalDeviceInfo != NULL)
+ {
+ memset(PhysicalDeviceInfo, 0,
+ sizeof(DAC960_V2_PhysicalDeviceInfo_T));
+ PhysicalDeviceInfo->PhysicalDeviceState =
+ DAC960_V2_Device_InvalidState;
+ memset(InquiryUnitSerialNumber, 0,
+ sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T));
+ InquiryUnitSerialNumber->PeripheralDeviceType = 0x1F;
+ for (DeviceIndex = DAC960_V2_MaxPhysicalDevices - 1;
+ DeviceIndex > PhysicalDeviceIndex;
+ DeviceIndex--)
+ {
+ Controller->V2.PhysicalDeviceInformation[DeviceIndex] =
+ Controller->V2.PhysicalDeviceInformation[DeviceIndex-1];
+ Controller->V2.InquiryUnitSerialNumber[DeviceIndex] =
+ Controller->V2.InquiryUnitSerialNumber[DeviceIndex-1];
+ }
+ Controller->V2.PhysicalDeviceInformation
+ [PhysicalDeviceIndex] =
+ PhysicalDeviceInfo;
+ Controller->V2.InquiryUnitSerialNumber
+ [PhysicalDeviceIndex] =
+ InquiryUnitSerialNumber;
+ Controller->V2.NeedDeviceSerialNumberInformation = true;
+ }
+ }
+ if (PhysicalDeviceInfo != NULL)
+ {
+ if (NewPhysicalDeviceInfo->PhysicalDeviceState !=
+ PhysicalDeviceInfo->PhysicalDeviceState)
+ DAC960_Critical(
+ "Physical Device %d:%d is now %s\n", Controller,
+ NewPhysicalDeviceInfo->Channel,
+ NewPhysicalDeviceInfo->TargetID,
+ (NewPhysicalDeviceInfo->PhysicalDeviceState
+ == DAC960_V2_Device_Online
+ ? "ONLINE"
+ : NewPhysicalDeviceInfo->PhysicalDeviceState
+ == DAC960_V2_Device_Rebuild
+ ? "REBUILD"
+ : NewPhysicalDeviceInfo->PhysicalDeviceState
+ == DAC960_V2_Device_Missing
+ ? "MISSING"
+ : NewPhysicalDeviceInfo->PhysicalDeviceState
+ == DAC960_V2_Device_Critical
+ ? "CRITICAL"
+ : NewPhysicalDeviceInfo->PhysicalDeviceState
+ == DAC960_V2_Device_Dead
+ ? "DEAD"
+ : NewPhysicalDeviceInfo->PhysicalDeviceState
+ == DAC960_V2_Device_SuspectedDead
+ ? "SUSPECTED-DEAD"
+ : NewPhysicalDeviceInfo->PhysicalDeviceState
+ == DAC960_V2_Device_CommandedOffline
+ ? "COMMANDED-OFFLINE"
+ : NewPhysicalDeviceInfo->PhysicalDeviceState
+ == DAC960_V2_Device_Standby
+ ? "STANDBY" : "UNKNOWN"));
+ if ((NewPhysicalDeviceInfo->ParityErrors !=
+ PhysicalDeviceInfo->ParityErrors) ||
+ (NewPhysicalDeviceInfo->SoftErrors !=
+ PhysicalDeviceInfo->SoftErrors) ||
+ (NewPhysicalDeviceInfo->HardErrors !=
+ PhysicalDeviceInfo->HardErrors) ||
+ (NewPhysicalDeviceInfo->MiscellaneousErrors !=
+ PhysicalDeviceInfo->MiscellaneousErrors) ||
+ (NewPhysicalDeviceInfo->CommandTimeouts !=
+ PhysicalDeviceInfo->CommandTimeouts) ||
+ (NewPhysicalDeviceInfo->Retries !=
+ PhysicalDeviceInfo->Retries) ||
+ (NewPhysicalDeviceInfo->Aborts !=
+ PhysicalDeviceInfo->Aborts) ||
+ (NewPhysicalDeviceInfo->PredictedFailuresDetected !=
+ PhysicalDeviceInfo->PredictedFailuresDetected))
+ {
+ DAC960_Critical("Physical Device %d:%d Errors: "
+ "Parity = %d, Soft = %d, "
+ "Hard = %d, Misc = %d\n",
+ Controller,
+ NewPhysicalDeviceInfo->Channel,
+ NewPhysicalDeviceInfo->TargetID,
+ NewPhysicalDeviceInfo->ParityErrors,
+ NewPhysicalDeviceInfo->SoftErrors,
+ NewPhysicalDeviceInfo->HardErrors,
+ NewPhysicalDeviceInfo->MiscellaneousErrors);
+ DAC960_Critical("Physical Device %d:%d Errors: "
+ "Timeouts = %d, Retries = %d, "
+ "Aborts = %d, Predicted = %d\n",
+ Controller,
+ NewPhysicalDeviceInfo->Channel,
+ NewPhysicalDeviceInfo->TargetID,
+ NewPhysicalDeviceInfo->CommandTimeouts,
+ NewPhysicalDeviceInfo->Retries,
+ NewPhysicalDeviceInfo->Aborts,
+ NewPhysicalDeviceInfo
+ ->PredictedFailuresDetected);
+ }
+ if ((PhysicalDeviceInfo->PhysicalDeviceState
+ == DAC960_V2_Device_Dead ||
+ PhysicalDeviceInfo->PhysicalDeviceState
+ == DAC960_V2_Device_InvalidState) &&
+ NewPhysicalDeviceInfo->PhysicalDeviceState
+ != DAC960_V2_Device_Dead)
+ Controller->V2.NeedDeviceSerialNumberInformation = true;
+ memcpy(PhysicalDeviceInfo, NewPhysicalDeviceInfo,
+ sizeof(DAC960_V2_PhysicalDeviceInfo_T));
+ }
+ NewPhysicalDeviceInfo->LogicalUnit++;
+ Controller->V2.PhysicalDeviceIndex++;
+ }
+ else if (IOCTLOpcode == DAC960_V2_GetPhysicalDeviceInfoValid)
+ {
+ unsigned int DeviceIndex;
+ for (DeviceIndex = Controller->V2.PhysicalDeviceIndex;
+ DeviceIndex < DAC960_V2_MaxPhysicalDevices;
+ DeviceIndex++)
+ {
+ DAC960_V2_PhysicalDeviceInfo_T *PhysicalDeviceInfo =
+ Controller->V2.PhysicalDeviceInformation[DeviceIndex];
+ DAC960_SCSI_Inquiry_UnitSerialNumber_T *InquiryUnitSerialNumber =
+ Controller->V2.InquiryUnitSerialNumber[DeviceIndex];
+ if (PhysicalDeviceInfo == NULL) break;
+ DAC960_Critical("Physical Device %d:%d No Longer Exists\n",
+ Controller,
+ PhysicalDeviceInfo->Channel,
+ PhysicalDeviceInfo->TargetID);
+ Controller->V2.PhysicalDeviceInformation[DeviceIndex] = NULL;
+ Controller->V2.InquiryUnitSerialNumber[DeviceIndex] = NULL;
+ kfree(PhysicalDeviceInfo);
+ kfree(InquiryUnitSerialNumber);
+ }
+ Controller->V2.NeedPhysicalDeviceInformation = false;
+ }
+ else if (IOCTLOpcode == DAC960_V2_GetLogicalDeviceInfoValid &&
+ CommandStatus == DAC960_V2_NormalCompletion)
+ {
+ DAC960_V2_LogicalDeviceInfo_T *NewLogicalDeviceInfo =
+ Controller->V2.NewLogicalDeviceInformation;
+ unsigned short LogicalDeviceNumber =
+ NewLogicalDeviceInfo->LogicalDeviceNumber;
+ DAC960_V2_LogicalDeviceInfo_T *LogicalDeviceInfo =
+ Controller->V2.LogicalDeviceInformation[LogicalDeviceNumber];
+ if (LogicalDeviceInfo == NULL)
+ {
+ DAC960_V2_PhysicalDevice_T PhysicalDevice;
+ PhysicalDevice.Controller = 0;
+ PhysicalDevice.Channel = NewLogicalDeviceInfo->Channel;
+ PhysicalDevice.TargetID = NewLogicalDeviceInfo->TargetID;
+ PhysicalDevice.LogicalUnit = NewLogicalDeviceInfo->LogicalUnit;
+ Controller->V2.LogicalDriveToVirtualDevice[LogicalDeviceNumber] =
+ PhysicalDevice;
+ LogicalDeviceInfo = kmalloc(sizeof(DAC960_V2_LogicalDeviceInfo_T),
+ GFP_ATOMIC);
+ Controller->V2.LogicalDeviceInformation[LogicalDeviceNumber] =
+ LogicalDeviceInfo;
+ DAC960_Critical("Logical Drive %d (/dev/rd/c%dd%d) "
+ "Now Exists%s\n", Controller,
+ LogicalDeviceNumber,
+ Controller->ControllerNumber,
+ LogicalDeviceNumber,
+ (LogicalDeviceInfo != NULL
+ ? "" : " - Allocation Failed"));
+ if (LogicalDeviceInfo != NULL)
+ {
+ memset(LogicalDeviceInfo, 0,
+ sizeof(DAC960_V2_LogicalDeviceInfo_T));
+ DAC960_ComputeGenericDiskInfo(Controller);
+ }
+ }
+ if (LogicalDeviceInfo != NULL)
+ {
+ unsigned long LogicalDeviceSize =
+ NewLogicalDeviceInfo->ConfigurableDeviceSize;
+ if (NewLogicalDeviceInfo->LogicalDeviceState !=
+ LogicalDeviceInfo->LogicalDeviceState)
+ DAC960_Critical("Logical Drive %d (/dev/rd/c%dd%d) "
+ "is now %s\n", Controller,
+ LogicalDeviceNumber,
+ Controller->ControllerNumber,
+ LogicalDeviceNumber,
+ (NewLogicalDeviceInfo->LogicalDeviceState
+ == DAC960_V2_LogicalDevice_Online
+ ? "ONLINE"
+ : NewLogicalDeviceInfo->LogicalDeviceState
+ == DAC960_V2_LogicalDevice_Critical
+ ? "CRITICAL" : "OFFLINE"));
+ if ((NewLogicalDeviceInfo->SoftErrors !=
+ LogicalDeviceInfo->SoftErrors) ||
+ (NewLogicalDeviceInfo->CommandsFailed !=
+ LogicalDeviceInfo->CommandsFailed) ||
+ (NewLogicalDeviceInfo->DeferredWriteErrors !=
+ LogicalDeviceInfo->DeferredWriteErrors))
+ DAC960_Critical("Logical Drive %d (/dev/rd/c%dd%d) Errors: "
+ "Soft = %d, Failed = %d, Deferred Write = %d\n",
+ Controller, LogicalDeviceNumber,
+ Controller->ControllerNumber,
+ LogicalDeviceNumber,
+ NewLogicalDeviceInfo->SoftErrors,
+ NewLogicalDeviceInfo->CommandsFailed,
+ NewLogicalDeviceInfo->DeferredWriteErrors);
+ if (NewLogicalDeviceInfo->ConsistencyCheckInProgress)
+ DAC960_V2_ReportProgress(Controller,
+ "Consistency Check",
+ LogicalDeviceNumber,
+ NewLogicalDeviceInfo
+ ->ConsistencyCheckBlockNumber,
+ LogicalDeviceSize);
+ else if (NewLogicalDeviceInfo->RebuildInProgress)
+ DAC960_V2_ReportProgress(Controller,
+ "Rebuild",
+ LogicalDeviceNumber,
+ NewLogicalDeviceInfo
+ ->RebuildBlockNumber,
+ LogicalDeviceSize);
+ else if (NewLogicalDeviceInfo->BackgroundInitializationInProgress)
+ DAC960_V2_ReportProgress(Controller,
+ "Background Initialization",
+ LogicalDeviceNumber,
+ NewLogicalDeviceInfo
+ ->BackgroundInitializationBlockNumber,
+ LogicalDeviceSize);
+ else if (NewLogicalDeviceInfo->ForegroundInitializationInProgress)
+ DAC960_V2_ReportProgress(Controller,
+ "Foreground Initialization",
+ LogicalDeviceNumber,
+ NewLogicalDeviceInfo
+ ->ForegroundInitializationBlockNumber,
+ LogicalDeviceSize);
+ else if (NewLogicalDeviceInfo->DataMigrationInProgress)
+ DAC960_V2_ReportProgress(Controller,
+ "Data Migration",
+ LogicalDeviceNumber,
+ NewLogicalDeviceInfo
+ ->DataMigrationBlockNumber,
+ LogicalDeviceSize);
+ else if (NewLogicalDeviceInfo->PatrolOperationInProgress)
+ DAC960_V2_ReportProgress(Controller,
+ "Patrol Operation",
+ LogicalDeviceNumber,
+ NewLogicalDeviceInfo
+ ->PatrolOperationBlockNumber,
+ LogicalDeviceSize);
+ if (LogicalDeviceInfo->BackgroundInitializationInProgress &&
+ !NewLogicalDeviceInfo->BackgroundInitializationInProgress)
+ DAC960_Progress("Logical Drive %d (/dev/rd/c%dd%d) "
+ "Background Initialization %s\n",
+ Controller,
+ LogicalDeviceNumber,
+ Controller->ControllerNumber,
+ LogicalDeviceNumber,
+ (NewLogicalDeviceInfo->LogicalDeviceControl
+ .LogicalDeviceInitialized
+ ? "Completed" : "Failed"));
+ memcpy(LogicalDeviceInfo, NewLogicalDeviceInfo,
+ sizeof(DAC960_V2_LogicalDeviceInfo_T));
+ }
+ Controller->V2.LogicalDriveFoundDuringScan
+ [LogicalDeviceNumber] = true;
+ NewLogicalDeviceInfo->LogicalDeviceNumber++;
+ }
+ else if (IOCTLOpcode == DAC960_V2_GetLogicalDeviceInfoValid)
+ {
+ int LogicalDriveNumber;
+ for (LogicalDriveNumber = 0;
+ LogicalDriveNumber < DAC960_MaxLogicalDrives;
+ LogicalDriveNumber++)
+ {
+ DAC960_V2_LogicalDeviceInfo_T *LogicalDeviceInfo =
+ Controller->V2.LogicalDeviceInformation[LogicalDriveNumber];
+ if (LogicalDeviceInfo == NULL ||
+ Controller->V2.LogicalDriveFoundDuringScan
+ [LogicalDriveNumber])
+ continue;
+ DAC960_Critical("Logical Drive %d (/dev/rd/c%dd%d) "
+ "No Longer Exists\n", Controller,
+ LogicalDriveNumber,
+ Controller->ControllerNumber,
+ LogicalDriveNumber);
+ Controller->V2.LogicalDeviceInformation
+ [LogicalDriveNumber] = NULL;
+ kfree(LogicalDeviceInfo);
+ Controller->LogicalDriveInitiallyAccessible
+ [LogicalDriveNumber] = false;
+ DAC960_ComputeGenericDiskInfo(Controller);
+ }
+ Controller->V2.NeedLogicalDeviceInformation = false;
+ }
+ else if (CommandOpcode == DAC960_V2_SCSI_10_Passthru)
+ {
+ DAC960_SCSI_Inquiry_UnitSerialNumber_T *InquiryUnitSerialNumber =
+ Controller->V2.InquiryUnitSerialNumber[Controller->V2.PhysicalDeviceIndex - 1];
+
+ if (CommandStatus != DAC960_V2_NormalCompletion) {
+ memset(InquiryUnitSerialNumber,
+ 0, sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T));
+ InquiryUnitSerialNumber->PeripheralDeviceType = 0x1F;
+ } else
+ memcpy(InquiryUnitSerialNumber,
+ Controller->V2.NewInquiryUnitSerialNumber,
+ sizeof(DAC960_SCSI_Inquiry_UnitSerialNumber_T));
+
+ Controller->V2.NeedDeviceSerialNumberInformation = false;
+ }
+
+ if (Controller->V2.HealthStatusBuffer->NextEventSequenceNumber
+ - Controller->V2.NextEventSequenceNumber > 0)
+ {
+ CommandMailbox->GetEvent.CommandOpcode = DAC960_V2_IOCTL;
+ CommandMailbox->GetEvent.DataTransferSize = sizeof(DAC960_V2_Event_T);
+ CommandMailbox->GetEvent.EventSequenceNumberHigh16 =
+ Controller->V2.NextEventSequenceNumber >> 16;
+ CommandMailbox->GetEvent.ControllerNumber = 0;
+ CommandMailbox->GetEvent.IOCTL_Opcode =
+ DAC960_V2_GetEvent;
+ CommandMailbox->GetEvent.EventSequenceNumberLow16 =
+ Controller->V2.NextEventSequenceNumber & 0xFFFF;
+ CommandMailbox->GetEvent.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentDataPointer =
+ Controller->V2.EventDMA;
+ CommandMailbox->GetEvent.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentByteCount =
+ CommandMailbox->GetEvent.DataTransferSize;
+ DAC960_QueueCommand(Command);
+ return;
+ }
+ if (Controller->V2.NeedPhysicalDeviceInformation)
+ {
+ if (Controller->V2.NeedDeviceSerialNumberInformation)
+ {
+ DAC960_SCSI_Inquiry_UnitSerialNumber_T *InquiryUnitSerialNumber =
+ Controller->V2.NewInquiryUnitSerialNumber;
+ InquiryUnitSerialNumber->PeripheralDeviceType = 0x1F;
+
+ DAC960_V2_ConstructNewUnitSerialNumber(Controller, CommandMailbox,
+ Controller->V2.NewPhysicalDeviceInformation->Channel,
+ Controller->V2.NewPhysicalDeviceInformation->TargetID,
+ Controller->V2.NewPhysicalDeviceInformation->LogicalUnit - 1);
+
+
+ DAC960_QueueCommand(Command);
+ return;
+ }
+ if (Controller->V2.StartPhysicalDeviceInformationScan)
+ {
+ Controller->V2.PhysicalDeviceIndex = 0;
+ Controller->V2.NewPhysicalDeviceInformation->Channel = 0;
+ Controller->V2.NewPhysicalDeviceInformation->TargetID = 0;
+ Controller->V2.NewPhysicalDeviceInformation->LogicalUnit = 0;
+ Controller->V2.StartPhysicalDeviceInformationScan = false;
+ }
+ CommandMailbox->PhysicalDeviceInfo.CommandOpcode = DAC960_V2_IOCTL;
+ CommandMailbox->PhysicalDeviceInfo.DataTransferSize =
+ sizeof(DAC960_V2_PhysicalDeviceInfo_T);
+ CommandMailbox->PhysicalDeviceInfo.PhysicalDevice.LogicalUnit =
+ Controller->V2.NewPhysicalDeviceInformation->LogicalUnit;
+ CommandMailbox->PhysicalDeviceInfo.PhysicalDevice.TargetID =
+ Controller->V2.NewPhysicalDeviceInformation->TargetID;
+ CommandMailbox->PhysicalDeviceInfo.PhysicalDevice.Channel =
+ Controller->V2.NewPhysicalDeviceInformation->Channel;
+ CommandMailbox->PhysicalDeviceInfo.IOCTL_Opcode =
+ DAC960_V2_GetPhysicalDeviceInfoValid;
+ CommandMailbox->PhysicalDeviceInfo.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentDataPointer =
+ Controller->V2.NewPhysicalDeviceInformationDMA;
+ CommandMailbox->PhysicalDeviceInfo.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentByteCount =
+ CommandMailbox->PhysicalDeviceInfo.DataTransferSize;
+ DAC960_QueueCommand(Command);
+ return;
+ }
+ if (Controller->V2.NeedLogicalDeviceInformation)
+ {
+ if (Controller->V2.StartLogicalDeviceInformationScan)
+ {
+ int LogicalDriveNumber;
+ for (LogicalDriveNumber = 0;
+ LogicalDriveNumber < DAC960_MaxLogicalDrives;
+ LogicalDriveNumber++)
+ Controller->V2.LogicalDriveFoundDuringScan
+ [LogicalDriveNumber] = false;
+ Controller->V2.NewLogicalDeviceInformation->LogicalDeviceNumber = 0;
+ Controller->V2.StartLogicalDeviceInformationScan = false;
+ }
+ CommandMailbox->LogicalDeviceInfo.CommandOpcode = DAC960_V2_IOCTL;
+ CommandMailbox->LogicalDeviceInfo.DataTransferSize =
+ sizeof(DAC960_V2_LogicalDeviceInfo_T);
+ CommandMailbox->LogicalDeviceInfo.LogicalDevice.LogicalDeviceNumber =
+ Controller->V2.NewLogicalDeviceInformation->LogicalDeviceNumber;
+ CommandMailbox->LogicalDeviceInfo.IOCTL_Opcode =
+ DAC960_V2_GetLogicalDeviceInfoValid;
+ CommandMailbox->LogicalDeviceInfo.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentDataPointer =
+ Controller->V2.NewLogicalDeviceInformationDMA;
+ CommandMailbox->LogicalDeviceInfo.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentByteCount =
+ CommandMailbox->LogicalDeviceInfo.DataTransferSize;
+ DAC960_QueueCommand(Command);
+ return;
+ }
+ Controller->MonitoringTimerCount++;
+ Controller->MonitoringTimer.expires =
+ jiffies + DAC960_HealthStatusMonitoringInterval;
+ add_timer(&Controller->MonitoringTimer);
+ }
+ if (CommandType == DAC960_ImmediateCommand)
+ {
+ complete(Command->Completion);
+ Command->Completion = NULL;
+ return;
+ }
+ if (CommandType == DAC960_QueuedCommand)
+ {
+ DAC960_V2_KernelCommand_T *KernelCommand = Command->V2.KernelCommand;
+ KernelCommand->CommandStatus = CommandStatus;
+ KernelCommand->RequestSenseLength = Command->V2.RequestSenseLength;
+ KernelCommand->DataTransferLength = Command->V2.DataTransferResidue;
+ Command->V2.KernelCommand = NULL;
+ DAC960_DeallocateCommand(Command);
+ KernelCommand->CompletionFunction(KernelCommand);
+ return;
+ }
+ /*
+ Queue a Status Monitoring Command to the Controller using the just
+ completed Command if one was deferred previously due to lack of a
+ free Command when the Monitoring Timer Function was called.
+ */
+ if (Controller->MonitoringCommandDeferred)
+ {
+ Controller->MonitoringCommandDeferred = false;
+ DAC960_V2_QueueMonitoringCommand(Command);
+ return;
+ }
+ /*
+ Deallocate the Command.
+ */
+ DAC960_DeallocateCommand(Command);
+ /*
+ Wake up any processes waiting on a free Command.
+ */
+ wake_up(&Controller->CommandWaitQueue);
+}
+
+/*
+ DAC960_GEM_InterruptHandler handles hardware interrupts from DAC960 GEM Series
+ Controllers.
+*/
+
+static irqreturn_t DAC960_GEM_InterruptHandler(int IRQ_Channel,
+ void *DeviceIdentifier)
+{
+ DAC960_Controller_T *Controller = DeviceIdentifier;
+ void __iomem *ControllerBaseAddress = Controller->BaseAddress;
+ DAC960_V2_StatusMailbox_T *NextStatusMailbox;
+ unsigned long flags;
+
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ DAC960_GEM_AcknowledgeInterrupt(ControllerBaseAddress);
+ NextStatusMailbox = Controller->V2.NextStatusMailbox;
+ while (NextStatusMailbox->Fields.CommandIdentifier > 0)
+ {
+ DAC960_V2_CommandIdentifier_T CommandIdentifier =
+ NextStatusMailbox->Fields.CommandIdentifier;
+ DAC960_Command_T *Command = Controller->Commands[CommandIdentifier-1];
+ Command->V2.CommandStatus = NextStatusMailbox->Fields.CommandStatus;
+ Command->V2.RequestSenseLength =
+ NextStatusMailbox->Fields.RequestSenseLength;
+ Command->V2.DataTransferResidue =
+ NextStatusMailbox->Fields.DataTransferResidue;
+ NextStatusMailbox->Words[0] = 0;
+ if (++NextStatusMailbox > Controller->V2.LastStatusMailbox)
+ NextStatusMailbox = Controller->V2.FirstStatusMailbox;
+ DAC960_V2_ProcessCompletedCommand(Command);
+ }
+ Controller->V2.NextStatusMailbox = NextStatusMailbox;
+ /*
+ Attempt to remove additional I/O Requests from the Controller's
+ I/O Request Queue and queue them to the Controller.
+ */
+ DAC960_ProcessRequest(Controller);
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ return IRQ_HANDLED;
+}
+
+/*
+ DAC960_BA_InterruptHandler handles hardware interrupts from DAC960 BA Series
+ Controllers.
+*/
+
+static irqreturn_t DAC960_BA_InterruptHandler(int IRQ_Channel,
+ void *DeviceIdentifier)
+{
+ DAC960_Controller_T *Controller = DeviceIdentifier;
+ void __iomem *ControllerBaseAddress = Controller->BaseAddress;
+ DAC960_V2_StatusMailbox_T *NextStatusMailbox;
+ unsigned long flags;
+
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ DAC960_BA_AcknowledgeInterrupt(ControllerBaseAddress);
+ NextStatusMailbox = Controller->V2.NextStatusMailbox;
+ while (NextStatusMailbox->Fields.CommandIdentifier > 0)
+ {
+ DAC960_V2_CommandIdentifier_T CommandIdentifier =
+ NextStatusMailbox->Fields.CommandIdentifier;
+ DAC960_Command_T *Command = Controller->Commands[CommandIdentifier-1];
+ Command->V2.CommandStatus = NextStatusMailbox->Fields.CommandStatus;
+ Command->V2.RequestSenseLength =
+ NextStatusMailbox->Fields.RequestSenseLength;
+ Command->V2.DataTransferResidue =
+ NextStatusMailbox->Fields.DataTransferResidue;
+ NextStatusMailbox->Words[0] = 0;
+ if (++NextStatusMailbox > Controller->V2.LastStatusMailbox)
+ NextStatusMailbox = Controller->V2.FirstStatusMailbox;
+ DAC960_V2_ProcessCompletedCommand(Command);
+ }
+ Controller->V2.NextStatusMailbox = NextStatusMailbox;
+ /*
+ Attempt to remove additional I/O Requests from the Controller's
+ I/O Request Queue and queue them to the Controller.
+ */
+ DAC960_ProcessRequest(Controller);
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ return IRQ_HANDLED;
+}
+
+
+/*
+ DAC960_LP_InterruptHandler handles hardware interrupts from DAC960 LP Series
+ Controllers.
+*/
+
+static irqreturn_t DAC960_LP_InterruptHandler(int IRQ_Channel,
+ void *DeviceIdentifier)
+{
+ DAC960_Controller_T *Controller = DeviceIdentifier;
+ void __iomem *ControllerBaseAddress = Controller->BaseAddress;
+ DAC960_V2_StatusMailbox_T *NextStatusMailbox;
+ unsigned long flags;
+
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ DAC960_LP_AcknowledgeInterrupt(ControllerBaseAddress);
+ NextStatusMailbox = Controller->V2.NextStatusMailbox;
+ while (NextStatusMailbox->Fields.CommandIdentifier > 0)
+ {
+ DAC960_V2_CommandIdentifier_T CommandIdentifier =
+ NextStatusMailbox->Fields.CommandIdentifier;
+ DAC960_Command_T *Command = Controller->Commands[CommandIdentifier-1];
+ Command->V2.CommandStatus = NextStatusMailbox->Fields.CommandStatus;
+ Command->V2.RequestSenseLength =
+ NextStatusMailbox->Fields.RequestSenseLength;
+ Command->V2.DataTransferResidue =
+ NextStatusMailbox->Fields.DataTransferResidue;
+ NextStatusMailbox->Words[0] = 0;
+ if (++NextStatusMailbox > Controller->V2.LastStatusMailbox)
+ NextStatusMailbox = Controller->V2.FirstStatusMailbox;
+ DAC960_V2_ProcessCompletedCommand(Command);
+ }
+ Controller->V2.NextStatusMailbox = NextStatusMailbox;
+ /*
+ Attempt to remove additional I/O Requests from the Controller's
+ I/O Request Queue and queue them to the Controller.
+ */
+ DAC960_ProcessRequest(Controller);
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ return IRQ_HANDLED;
+}
+
+
+/*
+ DAC960_LA_InterruptHandler handles hardware interrupts from DAC960 LA Series
+ Controllers.
+*/
+
+static irqreturn_t DAC960_LA_InterruptHandler(int IRQ_Channel,
+ void *DeviceIdentifier)
+{
+ DAC960_Controller_T *Controller = DeviceIdentifier;
+ void __iomem *ControllerBaseAddress = Controller->BaseAddress;
+ DAC960_V1_StatusMailbox_T *NextStatusMailbox;
+ unsigned long flags;
+
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ DAC960_LA_AcknowledgeInterrupt(ControllerBaseAddress);
+ NextStatusMailbox = Controller->V1.NextStatusMailbox;
+ while (NextStatusMailbox->Fields.Valid)
+ {
+ DAC960_V1_CommandIdentifier_T CommandIdentifier =
+ NextStatusMailbox->Fields.CommandIdentifier;
+ DAC960_Command_T *Command = Controller->Commands[CommandIdentifier-1];
+ Command->V1.CommandStatus = NextStatusMailbox->Fields.CommandStatus;
+ NextStatusMailbox->Word = 0;
+ if (++NextStatusMailbox > Controller->V1.LastStatusMailbox)
+ NextStatusMailbox = Controller->V1.FirstStatusMailbox;
+ DAC960_V1_ProcessCompletedCommand(Command);
+ }
+ Controller->V1.NextStatusMailbox = NextStatusMailbox;
+ /*
+ Attempt to remove additional I/O Requests from the Controller's
+ I/O Request Queue and queue them to the Controller.
+ */
+ DAC960_ProcessRequest(Controller);
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ return IRQ_HANDLED;
+}
+
+
+/*
+ DAC960_PG_InterruptHandler handles hardware interrupts from DAC960 PG Series
+ Controllers.
+*/
+
+static irqreturn_t DAC960_PG_InterruptHandler(int IRQ_Channel,
+ void *DeviceIdentifier)
+{
+ DAC960_Controller_T *Controller = DeviceIdentifier;
+ void __iomem *ControllerBaseAddress = Controller->BaseAddress;
+ DAC960_V1_StatusMailbox_T *NextStatusMailbox;
+ unsigned long flags;
+
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ DAC960_PG_AcknowledgeInterrupt(ControllerBaseAddress);
+ NextStatusMailbox = Controller->V1.NextStatusMailbox;
+ while (NextStatusMailbox->Fields.Valid)
+ {
+ DAC960_V1_CommandIdentifier_T CommandIdentifier =
+ NextStatusMailbox->Fields.CommandIdentifier;
+ DAC960_Command_T *Command = Controller->Commands[CommandIdentifier-1];
+ Command->V1.CommandStatus = NextStatusMailbox->Fields.CommandStatus;
+ NextStatusMailbox->Word = 0;
+ if (++NextStatusMailbox > Controller->V1.LastStatusMailbox)
+ NextStatusMailbox = Controller->V1.FirstStatusMailbox;
+ DAC960_V1_ProcessCompletedCommand(Command);
+ }
+ Controller->V1.NextStatusMailbox = NextStatusMailbox;
+ /*
+ Attempt to remove additional I/O Requests from the Controller's
+ I/O Request Queue and queue them to the Controller.
+ */
+ DAC960_ProcessRequest(Controller);
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ return IRQ_HANDLED;
+}
+
+
+/*
+ DAC960_PD_InterruptHandler handles hardware interrupts from DAC960 PD Series
+ Controllers.
+*/
+
+static irqreturn_t DAC960_PD_InterruptHandler(int IRQ_Channel,
+ void *DeviceIdentifier)
+{
+ DAC960_Controller_T *Controller = DeviceIdentifier;
+ void __iomem *ControllerBaseAddress = Controller->BaseAddress;
+ unsigned long flags;
+
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ while (DAC960_PD_StatusAvailableP(ControllerBaseAddress))
+ {
+ DAC960_V1_CommandIdentifier_T CommandIdentifier =
+ DAC960_PD_ReadStatusCommandIdentifier(ControllerBaseAddress);
+ DAC960_Command_T *Command = Controller->Commands[CommandIdentifier-1];
+ Command->V1.CommandStatus =
+ DAC960_PD_ReadStatusRegister(ControllerBaseAddress);
+ DAC960_PD_AcknowledgeInterrupt(ControllerBaseAddress);
+ DAC960_PD_AcknowledgeStatus(ControllerBaseAddress);
+ DAC960_V1_ProcessCompletedCommand(Command);
+ }
+ /*
+ Attempt to remove additional I/O Requests from the Controller's
+ I/O Request Queue and queue them to the Controller.
+ */
+ DAC960_ProcessRequest(Controller);
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ return IRQ_HANDLED;
+}
+
+
+/*
+ DAC960_P_InterruptHandler handles hardware interrupts from DAC960 P Series
+ Controllers.
+
+ Translations of DAC960_V1_Enquiry and DAC960_V1_GetDeviceState rely
+ on the data having been placed into DAC960_Controller_T, rather than
+ an arbitrary buffer.
+*/
+
+static irqreturn_t DAC960_P_InterruptHandler(int IRQ_Channel,
+ void *DeviceIdentifier)
+{
+ DAC960_Controller_T *Controller = DeviceIdentifier;
+ void __iomem *ControllerBaseAddress = Controller->BaseAddress;
+ unsigned long flags;
+
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ while (DAC960_PD_StatusAvailableP(ControllerBaseAddress))
+ {
+ DAC960_V1_CommandIdentifier_T CommandIdentifier =
+ DAC960_PD_ReadStatusCommandIdentifier(ControllerBaseAddress);
+ DAC960_Command_T *Command = Controller->Commands[CommandIdentifier-1];
+ DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
+ DAC960_V1_CommandOpcode_T CommandOpcode =
+ CommandMailbox->Common.CommandOpcode;
+ Command->V1.CommandStatus =
+ DAC960_PD_ReadStatusRegister(ControllerBaseAddress);
+ DAC960_PD_AcknowledgeInterrupt(ControllerBaseAddress);
+ DAC960_PD_AcknowledgeStatus(ControllerBaseAddress);
+ switch (CommandOpcode)
+ {
+ case DAC960_V1_Enquiry_Old:
+ Command->V1.CommandMailbox.Common.CommandOpcode = DAC960_V1_Enquiry;
+ DAC960_P_To_PD_TranslateEnquiry(Controller->V1.NewEnquiry);
+ break;
+ case DAC960_V1_GetDeviceState_Old:
+ Command->V1.CommandMailbox.Common.CommandOpcode =
+ DAC960_V1_GetDeviceState;
+ DAC960_P_To_PD_TranslateDeviceState(Controller->V1.NewDeviceState);
+ break;
+ case DAC960_V1_Read_Old:
+ Command->V1.CommandMailbox.Common.CommandOpcode = DAC960_V1_Read;
+ DAC960_P_To_PD_TranslateReadWriteCommand(CommandMailbox);
+ break;
+ case DAC960_V1_Write_Old:
+ Command->V1.CommandMailbox.Common.CommandOpcode = DAC960_V1_Write;
+ DAC960_P_To_PD_TranslateReadWriteCommand(CommandMailbox);
+ break;
+ case DAC960_V1_ReadWithScatterGather_Old:
+ Command->V1.CommandMailbox.Common.CommandOpcode =
+ DAC960_V1_ReadWithScatterGather;
+ DAC960_P_To_PD_TranslateReadWriteCommand(CommandMailbox);
+ break;
+ case DAC960_V1_WriteWithScatterGather_Old:
+ Command->V1.CommandMailbox.Common.CommandOpcode =
+ DAC960_V1_WriteWithScatterGather;
+ DAC960_P_To_PD_TranslateReadWriteCommand(CommandMailbox);
+ break;
+ default:
+ break;
+ }
+ DAC960_V1_ProcessCompletedCommand(Command);
+ }
+ /*
+ Attempt to remove additional I/O Requests from the Controller's
+ I/O Request Queue and queue them to the Controller.
+ */
+ DAC960_ProcessRequest(Controller);
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ return IRQ_HANDLED;
+}
+
+
+/*
+ DAC960_V1_QueueMonitoringCommand queues a Monitoring Command to DAC960 V1
+ Firmware Controllers.
+*/
+
+static void DAC960_V1_QueueMonitoringCommand(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
+ DAC960_V1_ClearCommand(Command);
+ Command->CommandType = DAC960_MonitoringCommand;
+ CommandMailbox->Type3.CommandOpcode = DAC960_V1_Enquiry;
+ CommandMailbox->Type3.BusAddress = Controller->V1.NewEnquiryDMA;
+ DAC960_QueueCommand(Command);
+}
+
+
+/*
+ DAC960_V2_QueueMonitoringCommand queues a Monitoring Command to DAC960 V2
+ Firmware Controllers.
+*/
+
+static void DAC960_V2_QueueMonitoringCommand(DAC960_Command_T *Command)
+{
+ DAC960_Controller_T *Controller = Command->Controller;
+ DAC960_V2_CommandMailbox_T *CommandMailbox = &Command->V2.CommandMailbox;
+ DAC960_V2_ClearCommand(Command);
+ Command->CommandType = DAC960_MonitoringCommand;
+ CommandMailbox->ControllerInfo.CommandOpcode = DAC960_V2_IOCTL;
+ CommandMailbox->ControllerInfo.CommandControlBits
+ .DataTransferControllerToHost = true;
+ CommandMailbox->ControllerInfo.CommandControlBits
+ .NoAutoRequestSense = true;
+ CommandMailbox->ControllerInfo.DataTransferSize =
+ sizeof(DAC960_V2_ControllerInfo_T);
+ CommandMailbox->ControllerInfo.ControllerNumber = 0;
+ CommandMailbox->ControllerInfo.IOCTL_Opcode = DAC960_V2_GetControllerInfo;
+ CommandMailbox->ControllerInfo.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentDataPointer =
+ Controller->V2.NewControllerInformationDMA;
+ CommandMailbox->ControllerInfo.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentByteCount =
+ CommandMailbox->ControllerInfo.DataTransferSize;
+ DAC960_QueueCommand(Command);
+}
+
+
+/*
+ DAC960_MonitoringTimerFunction is the timer function for monitoring
+ the status of DAC960 Controllers.
+*/
+
+static void DAC960_MonitoringTimerFunction(unsigned long TimerData)
+{
+ DAC960_Controller_T *Controller = (DAC960_Controller_T *) TimerData;
+ DAC960_Command_T *Command;
+ unsigned long flags;
+
+ if (Controller->FirmwareType == DAC960_V1_Controller)
+ {
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ /*
+ Queue a Status Monitoring Command to Controller.
+ */
+ Command = DAC960_AllocateCommand(Controller);
+ if (Command != NULL)
+ DAC960_V1_QueueMonitoringCommand(Command);
+ else Controller->MonitoringCommandDeferred = true;
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ }
+ else
+ {
+ DAC960_V2_ControllerInfo_T *ControllerInfo =
+ &Controller->V2.ControllerInformation;
+ unsigned int StatusChangeCounter =
+ Controller->V2.HealthStatusBuffer->StatusChangeCounter;
+ bool ForceMonitoringCommand = false;
+ if (time_after(jiffies, Controller->SecondaryMonitoringTime
+ + DAC960_SecondaryMonitoringInterval))
+ {
+ int LogicalDriveNumber;
+ for (LogicalDriveNumber = 0;
+ LogicalDriveNumber < DAC960_MaxLogicalDrives;
+ LogicalDriveNumber++)
+ {
+ DAC960_V2_LogicalDeviceInfo_T *LogicalDeviceInfo =
+ Controller->V2.LogicalDeviceInformation[LogicalDriveNumber];
+ if (LogicalDeviceInfo == NULL) continue;
+ if (!LogicalDeviceInfo->LogicalDeviceControl
+ .LogicalDeviceInitialized)
+ {
+ ForceMonitoringCommand = true;
+ break;
+ }
+ }
+ Controller->SecondaryMonitoringTime = jiffies;
+ }
+ if (StatusChangeCounter == Controller->V2.StatusChangeCounter &&
+ Controller->V2.HealthStatusBuffer->NextEventSequenceNumber
+ == Controller->V2.NextEventSequenceNumber &&
+ (ControllerInfo->BackgroundInitializationsActive +
+ ControllerInfo->LogicalDeviceInitializationsActive +
+ ControllerInfo->PhysicalDeviceInitializationsActive +
+ ControllerInfo->ConsistencyChecksActive +
+ ControllerInfo->RebuildsActive +
+ ControllerInfo->OnlineExpansionsActive == 0 ||
+ time_before(jiffies, Controller->PrimaryMonitoringTime
+ + DAC960_MonitoringTimerInterval)) &&
+ !ForceMonitoringCommand)
+ {
+ Controller->MonitoringTimer.expires =
+ jiffies + DAC960_HealthStatusMonitoringInterval;
+ add_timer(&Controller->MonitoringTimer);
+ return;
+ }
+ Controller->V2.StatusChangeCounter = StatusChangeCounter;
+ Controller->PrimaryMonitoringTime = jiffies;
+
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ /*
+ Queue a Status Monitoring Command to Controller.
+ */
+ Command = DAC960_AllocateCommand(Controller);
+ if (Command != NULL)
+ DAC960_V2_QueueMonitoringCommand(Command);
+ else Controller->MonitoringCommandDeferred = true;
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ /*
+ Wake up any processes waiting on a Health Status Buffer change.
+ */
+ wake_up(&Controller->HealthStatusWaitQueue);
+ }
+}
+
+/*
+ DAC960_CheckStatusBuffer verifies that there is room to hold ByteCount
+ additional bytes in the Combined Status Buffer and grows the buffer if
+ necessary. It returns true if there is enough room and false otherwise.
+*/
+
+static bool DAC960_CheckStatusBuffer(DAC960_Controller_T *Controller,
+ unsigned int ByteCount)
+{
+ unsigned char *NewStatusBuffer;
+ if (Controller->InitialStatusLength + 1 +
+ Controller->CurrentStatusLength + ByteCount + 1 <=
+ Controller->CombinedStatusBufferLength)
+ return true;
+ if (Controller->CombinedStatusBufferLength == 0)
+ {
+ unsigned int NewStatusBufferLength = DAC960_InitialStatusBufferSize;
+ while (NewStatusBufferLength < ByteCount)
+ NewStatusBufferLength *= 2;
+ Controller->CombinedStatusBuffer = kmalloc(NewStatusBufferLength,
+ GFP_ATOMIC);
+ if (Controller->CombinedStatusBuffer == NULL) return false;
+ Controller->CombinedStatusBufferLength = NewStatusBufferLength;
+ return true;
+ }
+ NewStatusBuffer = kmalloc(2 * Controller->CombinedStatusBufferLength,
+ GFP_ATOMIC);
+ if (NewStatusBuffer == NULL)
+ {
+ DAC960_Warning("Unable to expand Combined Status Buffer - Truncating\n",
+ Controller);
+ return false;
+ }
+ memcpy(NewStatusBuffer, Controller->CombinedStatusBuffer,
+ Controller->CombinedStatusBufferLength);
+ kfree(Controller->CombinedStatusBuffer);
+ Controller->CombinedStatusBuffer = NewStatusBuffer;
+ Controller->CombinedStatusBufferLength *= 2;
+ Controller->CurrentStatusBuffer =
+ &NewStatusBuffer[Controller->InitialStatusLength + 1];
+ return true;
+}
+
+
+/*
+ DAC960_Message prints Driver Messages.
+*/
+
+static void DAC960_Message(DAC960_MessageLevel_T MessageLevel,
+ unsigned char *Format,
+ DAC960_Controller_T *Controller,
+ ...)
+{
+ static unsigned char Buffer[DAC960_LineBufferSize];
+ static bool BeginningOfLine = true;
+ va_list Arguments;
+ int Length = 0;
+ va_start(Arguments, Controller);
+ Length = vsprintf(Buffer, Format, Arguments);
+ va_end(Arguments);
+ if (Controller == NULL)
+ printk("%sDAC960#%d: %s", DAC960_MessageLevelMap[MessageLevel],
+ DAC960_ControllerCount, Buffer);
+ else if (MessageLevel == DAC960_AnnounceLevel ||
+ MessageLevel == DAC960_InfoLevel)
+ {
+ if (!Controller->ControllerInitialized)
+ {
+ if (DAC960_CheckStatusBuffer(Controller, Length))
+ {
+ strcpy(&Controller->CombinedStatusBuffer
+ [Controller->InitialStatusLength],
+ Buffer);
+ Controller->InitialStatusLength += Length;
+ Controller->CurrentStatusBuffer =
+ &Controller->CombinedStatusBuffer
+ [Controller->InitialStatusLength + 1];
+ }
+ if (MessageLevel == DAC960_AnnounceLevel)
+ {
+ static int AnnouncementLines = 0;
+ if (++AnnouncementLines <= 2)
+ printk("%sDAC960: %s", DAC960_MessageLevelMap[MessageLevel],
+ Buffer);
+ }
+ else
+ {
+ if (BeginningOfLine)
+ {
+ if (Buffer[0] != '\n' || Length > 1)
+ printk("%sDAC960#%d: %s",
+ DAC960_MessageLevelMap[MessageLevel],
+ Controller->ControllerNumber, Buffer);
+ }
+ else printk("%s", Buffer);
+ }
+ }
+ else if (DAC960_CheckStatusBuffer(Controller, Length))
+ {
+ strcpy(&Controller->CurrentStatusBuffer[
+ Controller->CurrentStatusLength], Buffer);
+ Controller->CurrentStatusLength += Length;
+ }
+ }
+ else if (MessageLevel == DAC960_ProgressLevel)
+ {
+ strcpy(Controller->ProgressBuffer, Buffer);
+ Controller->ProgressBufferLength = Length;
+ if (Controller->EphemeralProgressMessage)
+ {
+ if (time_after_eq(jiffies, Controller->LastProgressReportTime
+ + DAC960_ProgressReportingInterval))
+ {
+ printk("%sDAC960#%d: %s", DAC960_MessageLevelMap[MessageLevel],
+ Controller->ControllerNumber, Buffer);
+ Controller->LastProgressReportTime = jiffies;
+ }
+ }
+ else printk("%sDAC960#%d: %s", DAC960_MessageLevelMap[MessageLevel],
+ Controller->ControllerNumber, Buffer);
+ }
+ else if (MessageLevel == DAC960_UserCriticalLevel)
+ {
+ strcpy(&Controller->UserStatusBuffer[Controller->UserStatusLength],
+ Buffer);
+ Controller->UserStatusLength += Length;
+ if (Buffer[0] != '\n' || Length > 1)
+ printk("%sDAC960#%d: %s", DAC960_MessageLevelMap[MessageLevel],
+ Controller->ControllerNumber, Buffer);
+ }
+ else
+ {
+ if (BeginningOfLine)
+ printk("%sDAC960#%d: %s", DAC960_MessageLevelMap[MessageLevel],
+ Controller->ControllerNumber, Buffer);
+ else printk("%s", Buffer);
+ }
+ BeginningOfLine = (Buffer[Length-1] == '\n');
+}
+
+
+/*
+ DAC960_ParsePhysicalDevice parses spaces followed by a Physical Device
+ Channel:TargetID specification from a User Command string. It updates
+ Channel and TargetID and returns true on success and false on failure.
+*/
+
+static bool DAC960_ParsePhysicalDevice(DAC960_Controller_T *Controller,
+ char *UserCommandString,
+ unsigned char *Channel,
+ unsigned char *TargetID)
+{
+ char *NewUserCommandString = UserCommandString;
+ unsigned long XChannel, XTargetID;
+ while (*UserCommandString == ' ') UserCommandString++;
+ if (UserCommandString == NewUserCommandString)
+ return false;
+ XChannel = simple_strtoul(UserCommandString, &NewUserCommandString, 10);
+ if (NewUserCommandString == UserCommandString ||
+ *NewUserCommandString != ':' ||
+ XChannel >= Controller->Channels)
+ return false;
+ UserCommandString = ++NewUserCommandString;
+ XTargetID = simple_strtoul(UserCommandString, &NewUserCommandString, 10);
+ if (NewUserCommandString == UserCommandString ||
+ *NewUserCommandString != '\0' ||
+ XTargetID >= Controller->Targets)
+ return false;
+ *Channel = XChannel;
+ *TargetID = XTargetID;
+ return true;
+}
+
+
+/*
+ DAC960_ParseLogicalDrive parses spaces followed by a Logical Drive Number
+ specification from a User Command string. It updates LogicalDriveNumber and
+ returns true on success and false on failure.
+*/
+
+static bool DAC960_ParseLogicalDrive(DAC960_Controller_T *Controller,
+ char *UserCommandString,
+ unsigned char *LogicalDriveNumber)
+{
+ char *NewUserCommandString = UserCommandString;
+ unsigned long XLogicalDriveNumber;
+ while (*UserCommandString == ' ') UserCommandString++;
+ if (UserCommandString == NewUserCommandString)
+ return false;
+ XLogicalDriveNumber =
+ simple_strtoul(UserCommandString, &NewUserCommandString, 10);
+ if (NewUserCommandString == UserCommandString ||
+ *NewUserCommandString != '\0' ||
+ XLogicalDriveNumber > DAC960_MaxLogicalDrives - 1)
+ return false;
+ *LogicalDriveNumber = XLogicalDriveNumber;
+ return true;
+}
+
+
+/*
+ DAC960_V1_SetDeviceState sets the Device State for a Physical Device for
+ DAC960 V1 Firmware Controllers.
+*/
+
+static void DAC960_V1_SetDeviceState(DAC960_Controller_T *Controller,
+ DAC960_Command_T *Command,
+ unsigned char Channel,
+ unsigned char TargetID,
+ DAC960_V1_PhysicalDeviceState_T
+ DeviceState,
+ const unsigned char *DeviceStateString)
+{
+ DAC960_V1_CommandMailbox_T *CommandMailbox = &Command->V1.CommandMailbox;
+ CommandMailbox->Type3D.CommandOpcode = DAC960_V1_StartDevice;
+ CommandMailbox->Type3D.Channel = Channel;
+ CommandMailbox->Type3D.TargetID = TargetID;
+ CommandMailbox->Type3D.DeviceState = DeviceState;
+ CommandMailbox->Type3D.Modifier = 0;
+ DAC960_ExecuteCommand(Command);
+ switch (Command->V1.CommandStatus)
+ {
+ case DAC960_V1_NormalCompletion:
+ DAC960_UserCritical("%s of Physical Device %d:%d Succeeded\n", Controller,
+ DeviceStateString, Channel, TargetID);
+ break;
+ case DAC960_V1_UnableToStartDevice:
+ DAC960_UserCritical("%s of Physical Device %d:%d Failed - "
+ "Unable to Start Device\n", Controller,
+ DeviceStateString, Channel, TargetID);
+ break;
+ case DAC960_V1_NoDeviceAtAddress:
+ DAC960_UserCritical("%s of Physical Device %d:%d Failed - "
+ "No Device at Address\n", Controller,
+ DeviceStateString, Channel, TargetID);
+ break;
+ case DAC960_V1_InvalidChannelOrTargetOrModifier:
+ DAC960_UserCritical("%s of Physical Device %d:%d Failed - "
+ "Invalid Channel or Target or Modifier\n",
+ Controller, DeviceStateString, Channel, TargetID);
+ break;
+ case DAC960_V1_ChannelBusy:
+ DAC960_UserCritical("%s of Physical Device %d:%d Failed - "
+ "Channel Busy\n", Controller,
+ DeviceStateString, Channel, TargetID);
+ break;
+ default:
+ DAC960_UserCritical("%s of Physical Device %d:%d Failed - "
+ "Unexpected Status %04X\n", Controller,
+ DeviceStateString, Channel, TargetID,
+ Command->V1.CommandStatus);
+ break;
+ }
+}
+
+
+/*
+ DAC960_V1_ExecuteUserCommand executes a User Command for DAC960 V1 Firmware
+ Controllers.
+*/
+
+static bool DAC960_V1_ExecuteUserCommand(DAC960_Controller_T *Controller,
+ unsigned char *UserCommand)
+{
+ DAC960_Command_T *Command;
+ DAC960_V1_CommandMailbox_T *CommandMailbox;
+ unsigned long flags;
+ unsigned char Channel, TargetID, LogicalDriveNumber;
+
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ while ((Command = DAC960_AllocateCommand(Controller)) == NULL)
+ DAC960_WaitForCommand(Controller);
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ Controller->UserStatusLength = 0;
+ DAC960_V1_ClearCommand(Command);
+ Command->CommandType = DAC960_ImmediateCommand;
+ CommandMailbox = &Command->V1.CommandMailbox;
+ if (strcmp(UserCommand, "flush-cache") == 0)
+ {
+ CommandMailbox->Type3.CommandOpcode = DAC960_V1_Flush;
+ DAC960_ExecuteCommand(Command);
+ DAC960_UserCritical("Cache Flush Completed\n", Controller);
+ }
+ else if (strncmp(UserCommand, "kill", 4) == 0 &&
+ DAC960_ParsePhysicalDevice(Controller, &UserCommand[4],
+ &Channel, &TargetID))
+ {
+ DAC960_V1_DeviceState_T *DeviceState =
+ &Controller->V1.DeviceState[Channel][TargetID];
+ if (DeviceState->Present &&
+ DeviceState->DeviceType == DAC960_V1_DiskType &&
+ DeviceState->DeviceState != DAC960_V1_Device_Dead)
+ DAC960_V1_SetDeviceState(Controller, Command, Channel, TargetID,
+ DAC960_V1_Device_Dead, "Kill");
+ else DAC960_UserCritical("Kill of Physical Device %d:%d Illegal\n",
+ Controller, Channel, TargetID);
+ }
+ else if (strncmp(UserCommand, "make-online", 11) == 0 &&
+ DAC960_ParsePhysicalDevice(Controller, &UserCommand[11],
+ &Channel, &TargetID))
+ {
+ DAC960_V1_DeviceState_T *DeviceState =
+ &Controller->V1.DeviceState[Channel][TargetID];
+ if (DeviceState->Present &&
+ DeviceState->DeviceType == DAC960_V1_DiskType &&
+ DeviceState->DeviceState == DAC960_V1_Device_Dead)
+ DAC960_V1_SetDeviceState(Controller, Command, Channel, TargetID,
+ DAC960_V1_Device_Online, "Make Online");
+ else DAC960_UserCritical("Make Online of Physical Device %d:%d Illegal\n",
+ Controller, Channel, TargetID);
+
+ }
+ else if (strncmp(UserCommand, "make-standby", 12) == 0 &&
+ DAC960_ParsePhysicalDevice(Controller, &UserCommand[12],
+ &Channel, &TargetID))
+ {
+ DAC960_V1_DeviceState_T *DeviceState =
+ &Controller->V1.DeviceState[Channel][TargetID];
+ if (DeviceState->Present &&
+ DeviceState->DeviceType == DAC960_V1_DiskType &&
+ DeviceState->DeviceState == DAC960_V1_Device_Dead)
+ DAC960_V1_SetDeviceState(Controller, Command, Channel, TargetID,
+ DAC960_V1_Device_Standby, "Make Standby");
+ else DAC960_UserCritical("Make Standby of Physical "
+ "Device %d:%d Illegal\n",
+ Controller, Channel, TargetID);
+ }
+ else if (strncmp(UserCommand, "rebuild", 7) == 0 &&
+ DAC960_ParsePhysicalDevice(Controller, &UserCommand[7],
+ &Channel, &TargetID))
+ {
+ CommandMailbox->Type3D.CommandOpcode = DAC960_V1_RebuildAsync;
+ CommandMailbox->Type3D.Channel = Channel;
+ CommandMailbox->Type3D.TargetID = TargetID;
+ DAC960_ExecuteCommand(Command);
+ switch (Command->V1.CommandStatus)
+ {
+ case DAC960_V1_NormalCompletion:
+ DAC960_UserCritical("Rebuild of Physical Device %d:%d Initiated\n",
+ Controller, Channel, TargetID);
+ break;
+ case DAC960_V1_AttemptToRebuildOnlineDrive:
+ DAC960_UserCritical("Rebuild of Physical Device %d:%d Failed - "
+ "Attempt to Rebuild Online or "
+ "Unresponsive Drive\n",
+ Controller, Channel, TargetID);
+ break;
+ case DAC960_V1_NewDiskFailedDuringRebuild:
+ DAC960_UserCritical("Rebuild of Physical Device %d:%d Failed - "
+ "New Disk Failed During Rebuild\n",
+ Controller, Channel, TargetID);
+ break;
+ case DAC960_V1_InvalidDeviceAddress:
+ DAC960_UserCritical("Rebuild of Physical Device %d:%d Failed - "
+ "Invalid Device Address\n",
+ Controller, Channel, TargetID);
+ break;
+ case DAC960_V1_RebuildOrCheckAlreadyInProgress:
+ DAC960_UserCritical("Rebuild of Physical Device %d:%d Failed - "
+ "Rebuild or Consistency Check Already "
+ "in Progress\n", Controller, Channel, TargetID);
+ break;
+ default:
+ DAC960_UserCritical("Rebuild of Physical Device %d:%d Failed - "
+ "Unexpected Status %04X\n", Controller,
+ Channel, TargetID, Command->V1.CommandStatus);
+ break;
+ }
+ }
+ else if (strncmp(UserCommand, "check-consistency", 17) == 0 &&
+ DAC960_ParseLogicalDrive(Controller, &UserCommand[17],
+ &LogicalDriveNumber))
+ {
+ CommandMailbox->Type3C.CommandOpcode = DAC960_V1_CheckConsistencyAsync;
+ CommandMailbox->Type3C.LogicalDriveNumber = LogicalDriveNumber;
+ CommandMailbox->Type3C.AutoRestore = true;
+ DAC960_ExecuteCommand(Command);
+ switch (Command->V1.CommandStatus)
+ {
+ case DAC960_V1_NormalCompletion:
+ DAC960_UserCritical("Consistency Check of Logical Drive %d "
+ "(/dev/rd/c%dd%d) Initiated\n",
+ Controller, LogicalDriveNumber,
+ Controller->ControllerNumber,
+ LogicalDriveNumber);
+ break;
+ case DAC960_V1_DependentDiskIsDead:
+ DAC960_UserCritical("Consistency Check of Logical Drive %d "
+ "(/dev/rd/c%dd%d) Failed - "
+ "Dependent Physical Device is DEAD\n",
+ Controller, LogicalDriveNumber,
+ Controller->ControllerNumber,
+ LogicalDriveNumber);
+ break;
+ case DAC960_V1_InvalidOrNonredundantLogicalDrive:
+ DAC960_UserCritical("Consistency Check of Logical Drive %d "
+ "(/dev/rd/c%dd%d) Failed - "
+ "Invalid or Nonredundant Logical Drive\n",
+ Controller, LogicalDriveNumber,
+ Controller->ControllerNumber,
+ LogicalDriveNumber);
+ break;
+ case DAC960_V1_RebuildOrCheckAlreadyInProgress:
+ DAC960_UserCritical("Consistency Check of Logical Drive %d "
+ "(/dev/rd/c%dd%d) Failed - Rebuild or "
+ "Consistency Check Already in Progress\n",
+ Controller, LogicalDriveNumber,
+ Controller->ControllerNumber,
+ LogicalDriveNumber);
+ break;
+ default:
+ DAC960_UserCritical("Consistency Check of Logical Drive %d "
+ "(/dev/rd/c%dd%d) Failed - "
+ "Unexpected Status %04X\n",
+ Controller, LogicalDriveNumber,
+ Controller->ControllerNumber,
+ LogicalDriveNumber, Command->V1.CommandStatus);
+ break;
+ }
+ }
+ else if (strcmp(UserCommand, "cancel-rebuild") == 0 ||
+ strcmp(UserCommand, "cancel-consistency-check") == 0)
+ {
+ /*
+ the OldRebuildRateConstant is never actually used
+ once its value is retrieved from the controller.
+ */
+ unsigned char *OldRebuildRateConstant;
+ dma_addr_t OldRebuildRateConstantDMA;
+
+ OldRebuildRateConstant = pci_alloc_consistent( Controller->PCIDevice,
+ sizeof(char), &OldRebuildRateConstantDMA);
+ if (OldRebuildRateConstant == NULL) {
+ DAC960_UserCritical("Cancellation of Rebuild or "
+ "Consistency Check Failed - "
+ "Out of Memory",
+ Controller);
+ goto failure;
+ }
+ CommandMailbox->Type3R.CommandOpcode = DAC960_V1_RebuildControl;
+ CommandMailbox->Type3R.RebuildRateConstant = 0xFF;
+ CommandMailbox->Type3R.BusAddress = OldRebuildRateConstantDMA;
+ DAC960_ExecuteCommand(Command);
+ switch (Command->V1.CommandStatus)
+ {
+ case DAC960_V1_NormalCompletion:
+ DAC960_UserCritical("Rebuild or Consistency Check Cancelled\n",
+ Controller);
+ break;
+ default:
+ DAC960_UserCritical("Cancellation of Rebuild or "
+ "Consistency Check Failed - "
+ "Unexpected Status %04X\n",
+ Controller, Command->V1.CommandStatus);
+ break;
+ }
+failure:
+ pci_free_consistent(Controller->PCIDevice, sizeof(char),
+ OldRebuildRateConstant, OldRebuildRateConstantDMA);
+ }
+ else DAC960_UserCritical("Illegal User Command: '%s'\n",
+ Controller, UserCommand);
+
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ DAC960_DeallocateCommand(Command);
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ return true;
+}
+
+
+/*
+ DAC960_V2_TranslatePhysicalDevice translates a Physical Device Channel and
+ TargetID into a Logical Device. It returns true on success and false
+ on failure.
+*/
+
+static bool DAC960_V2_TranslatePhysicalDevice(DAC960_Command_T *Command,
+ unsigned char Channel,
+ unsigned char TargetID,
+ unsigned short
+ *LogicalDeviceNumber)
+{
+ DAC960_V2_CommandMailbox_T SavedCommandMailbox, *CommandMailbox;
+ DAC960_Controller_T *Controller = Command->Controller;
+
+ CommandMailbox = &Command->V2.CommandMailbox;
+ memcpy(&SavedCommandMailbox, CommandMailbox,
+ sizeof(DAC960_V2_CommandMailbox_T));
+
+ CommandMailbox->PhysicalDeviceInfo.CommandOpcode = DAC960_V2_IOCTL;
+ CommandMailbox->PhysicalDeviceInfo.CommandControlBits
+ .DataTransferControllerToHost = true;
+ CommandMailbox->PhysicalDeviceInfo.CommandControlBits
+ .NoAutoRequestSense = true;
+ CommandMailbox->PhysicalDeviceInfo.DataTransferSize =
+ sizeof(DAC960_V2_PhysicalToLogicalDevice_T);
+ CommandMailbox->PhysicalDeviceInfo.PhysicalDevice.TargetID = TargetID;
+ CommandMailbox->PhysicalDeviceInfo.PhysicalDevice.Channel = Channel;
+ CommandMailbox->PhysicalDeviceInfo.IOCTL_Opcode =
+ DAC960_V2_TranslatePhysicalToLogicalDevice;
+ CommandMailbox->Common.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentDataPointer =
+ Controller->V2.PhysicalToLogicalDeviceDMA;
+ CommandMailbox->Common.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentByteCount =
+ CommandMailbox->Common.DataTransferSize;
+
+ DAC960_ExecuteCommand(Command);
+ *LogicalDeviceNumber = Controller->V2.PhysicalToLogicalDevice->LogicalDeviceNumber;
+
+ memcpy(CommandMailbox, &SavedCommandMailbox,
+ sizeof(DAC960_V2_CommandMailbox_T));
+ return (Command->V2.CommandStatus == DAC960_V2_NormalCompletion);
+}
+
+
+/*
+ DAC960_V2_ExecuteUserCommand executes a User Command for DAC960 V2 Firmware
+ Controllers.
+*/
+
+static bool DAC960_V2_ExecuteUserCommand(DAC960_Controller_T *Controller,
+ unsigned char *UserCommand)
+{
+ DAC960_Command_T *Command;
+ DAC960_V2_CommandMailbox_T *CommandMailbox;
+ unsigned long flags;
+ unsigned char Channel, TargetID, LogicalDriveNumber;
+ unsigned short LogicalDeviceNumber;
+
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ while ((Command = DAC960_AllocateCommand(Controller)) == NULL)
+ DAC960_WaitForCommand(Controller);
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ Controller->UserStatusLength = 0;
+ DAC960_V2_ClearCommand(Command);
+ Command->CommandType = DAC960_ImmediateCommand;
+ CommandMailbox = &Command->V2.CommandMailbox;
+ CommandMailbox->Common.CommandOpcode = DAC960_V2_IOCTL;
+ CommandMailbox->Common.CommandControlBits.DataTransferControllerToHost = true;
+ CommandMailbox->Common.CommandControlBits.NoAutoRequestSense = true;
+ if (strcmp(UserCommand, "flush-cache") == 0)
+ {
+ CommandMailbox->DeviceOperation.IOCTL_Opcode = DAC960_V2_PauseDevice;
+ CommandMailbox->DeviceOperation.OperationDevice =
+ DAC960_V2_RAID_Controller;
+ DAC960_ExecuteCommand(Command);
+ DAC960_UserCritical("Cache Flush Completed\n", Controller);
+ }
+ else if (strncmp(UserCommand, "kill", 4) == 0 &&
+ DAC960_ParsePhysicalDevice(Controller, &UserCommand[4],
+ &Channel, &TargetID) &&
+ DAC960_V2_TranslatePhysicalDevice(Command, Channel, TargetID,
+ &LogicalDeviceNumber))
+ {
+ CommandMailbox->SetDeviceState.LogicalDevice.LogicalDeviceNumber =
+ LogicalDeviceNumber;
+ CommandMailbox->SetDeviceState.IOCTL_Opcode =
+ DAC960_V2_SetDeviceState;
+ CommandMailbox->SetDeviceState.DeviceState.PhysicalDeviceState =
+ DAC960_V2_Device_Dead;
+ DAC960_ExecuteCommand(Command);
+ DAC960_UserCritical("Kill of Physical Device %d:%d %s\n",
+ Controller, Channel, TargetID,
+ (Command->V2.CommandStatus
+ == DAC960_V2_NormalCompletion
+ ? "Succeeded" : "Failed"));
+ }
+ else if (strncmp(UserCommand, "make-online", 11) == 0 &&
+ DAC960_ParsePhysicalDevice(Controller, &UserCommand[11],
+ &Channel, &TargetID) &&
+ DAC960_V2_TranslatePhysicalDevice(Command, Channel, TargetID,
+ &LogicalDeviceNumber))
+ {
+ CommandMailbox->SetDeviceState.LogicalDevice.LogicalDeviceNumber =
+ LogicalDeviceNumber;
+ CommandMailbox->SetDeviceState.IOCTL_Opcode =
+ DAC960_V2_SetDeviceState;
+ CommandMailbox->SetDeviceState.DeviceState.PhysicalDeviceState =
+ DAC960_V2_Device_Online;
+ DAC960_ExecuteCommand(Command);
+ DAC960_UserCritical("Make Online of Physical Device %d:%d %s\n",
+ Controller, Channel, TargetID,
+ (Command->V2.CommandStatus
+ == DAC960_V2_NormalCompletion
+ ? "Succeeded" : "Failed"));
+ }
+ else if (strncmp(UserCommand, "make-standby", 12) == 0 &&
+ DAC960_ParsePhysicalDevice(Controller, &UserCommand[12],
+ &Channel, &TargetID) &&
+ DAC960_V2_TranslatePhysicalDevice(Command, Channel, TargetID,
+ &LogicalDeviceNumber))
+ {
+ CommandMailbox->SetDeviceState.LogicalDevice.LogicalDeviceNumber =
+ LogicalDeviceNumber;
+ CommandMailbox->SetDeviceState.IOCTL_Opcode =
+ DAC960_V2_SetDeviceState;
+ CommandMailbox->SetDeviceState.DeviceState.PhysicalDeviceState =
+ DAC960_V2_Device_Standby;
+ DAC960_ExecuteCommand(Command);
+ DAC960_UserCritical("Make Standby of Physical Device %d:%d %s\n",
+ Controller, Channel, TargetID,
+ (Command->V2.CommandStatus
+ == DAC960_V2_NormalCompletion
+ ? "Succeeded" : "Failed"));
+ }
+ else if (strncmp(UserCommand, "rebuild", 7) == 0 &&
+ DAC960_ParsePhysicalDevice(Controller, &UserCommand[7],
+ &Channel, &TargetID) &&
+ DAC960_V2_TranslatePhysicalDevice(Command, Channel, TargetID,
+ &LogicalDeviceNumber))
+ {
+ CommandMailbox->LogicalDeviceInfo.LogicalDevice.LogicalDeviceNumber =
+ LogicalDeviceNumber;
+ CommandMailbox->LogicalDeviceInfo.IOCTL_Opcode =
+ DAC960_V2_RebuildDeviceStart;
+ DAC960_ExecuteCommand(Command);
+ DAC960_UserCritical("Rebuild of Physical Device %d:%d %s\n",
+ Controller, Channel, TargetID,
+ (Command->V2.CommandStatus
+ == DAC960_V2_NormalCompletion
+ ? "Initiated" : "Not Initiated"));
+ }
+ else if (strncmp(UserCommand, "cancel-rebuild", 14) == 0 &&
+ DAC960_ParsePhysicalDevice(Controller, &UserCommand[14],
+ &Channel, &TargetID) &&
+ DAC960_V2_TranslatePhysicalDevice(Command, Channel, TargetID,
+ &LogicalDeviceNumber))
+ {
+ CommandMailbox->LogicalDeviceInfo.LogicalDevice.LogicalDeviceNumber =
+ LogicalDeviceNumber;
+ CommandMailbox->LogicalDeviceInfo.IOCTL_Opcode =
+ DAC960_V2_RebuildDeviceStop;
+ DAC960_ExecuteCommand(Command);
+ DAC960_UserCritical("Rebuild of Physical Device %d:%d %s\n",
+ Controller, Channel, TargetID,
+ (Command->V2.CommandStatus
+ == DAC960_V2_NormalCompletion
+ ? "Cancelled" : "Not Cancelled"));
+ }
+ else if (strncmp(UserCommand, "check-consistency", 17) == 0 &&
+ DAC960_ParseLogicalDrive(Controller, &UserCommand[17],
+ &LogicalDriveNumber))
+ {
+ CommandMailbox->ConsistencyCheck.LogicalDevice.LogicalDeviceNumber =
+ LogicalDriveNumber;
+ CommandMailbox->ConsistencyCheck.IOCTL_Opcode =
+ DAC960_V2_ConsistencyCheckStart;
+ CommandMailbox->ConsistencyCheck.RestoreConsistency = true;
+ CommandMailbox->ConsistencyCheck.InitializedAreaOnly = false;
+ DAC960_ExecuteCommand(Command);
+ DAC960_UserCritical("Consistency Check of Logical Drive %d "
+ "(/dev/rd/c%dd%d) %s\n",
+ Controller, LogicalDriveNumber,
+ Controller->ControllerNumber,
+ LogicalDriveNumber,
+ (Command->V2.CommandStatus
+ == DAC960_V2_NormalCompletion
+ ? "Initiated" : "Not Initiated"));
+ }
+ else if (strncmp(UserCommand, "cancel-consistency-check", 24) == 0 &&
+ DAC960_ParseLogicalDrive(Controller, &UserCommand[24],
+ &LogicalDriveNumber))
+ {
+ CommandMailbox->ConsistencyCheck.LogicalDevice.LogicalDeviceNumber =
+ LogicalDriveNumber;
+ CommandMailbox->ConsistencyCheck.IOCTL_Opcode =
+ DAC960_V2_ConsistencyCheckStop;
+ DAC960_ExecuteCommand(Command);
+ DAC960_UserCritical("Consistency Check of Logical Drive %d "
+ "(/dev/rd/c%dd%d) %s\n",
+ Controller, LogicalDriveNumber,
+ Controller->ControllerNumber,
+ LogicalDriveNumber,
+ (Command->V2.CommandStatus
+ == DAC960_V2_NormalCompletion
+ ? "Cancelled" : "Not Cancelled"));
+ }
+ else if (strcmp(UserCommand, "perform-discovery") == 0)
+ {
+ CommandMailbox->Common.IOCTL_Opcode = DAC960_V2_StartDiscovery;
+ DAC960_ExecuteCommand(Command);
+ DAC960_UserCritical("Discovery %s\n", Controller,
+ (Command->V2.CommandStatus
+ == DAC960_V2_NormalCompletion
+ ? "Initiated" : "Not Initiated"));
+ if (Command->V2.CommandStatus == DAC960_V2_NormalCompletion)
+ {
+ CommandMailbox->ControllerInfo.CommandOpcode = DAC960_V2_IOCTL;
+ CommandMailbox->ControllerInfo.CommandControlBits
+ .DataTransferControllerToHost = true;
+ CommandMailbox->ControllerInfo.CommandControlBits
+ .NoAutoRequestSense = true;
+ CommandMailbox->ControllerInfo.DataTransferSize =
+ sizeof(DAC960_V2_ControllerInfo_T);
+ CommandMailbox->ControllerInfo.ControllerNumber = 0;
+ CommandMailbox->ControllerInfo.IOCTL_Opcode =
+ DAC960_V2_GetControllerInfo;
+ /*
+ * How does this NOT race with the queued Monitoring
+ * usage of this structure?
+ */
+ CommandMailbox->ControllerInfo.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentDataPointer =
+ Controller->V2.NewControllerInformationDMA;
+ CommandMailbox->ControllerInfo.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentByteCount =
+ CommandMailbox->ControllerInfo.DataTransferSize;
+ while (1) {
+ DAC960_ExecuteCommand(Command);
+ if (!Controller->V2.NewControllerInformation->PhysicalScanActive)
+ break;
+ msleep(1000);
+ }
+ DAC960_UserCritical("Discovery Completed\n", Controller);
+ }
+ }
+ else if (strcmp(UserCommand, "suppress-enclosure-messages") == 0)
+ Controller->SuppressEnclosureMessages = true;
+ else DAC960_UserCritical("Illegal User Command: '%s'\n",
+ Controller, UserCommand);
+
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ DAC960_DeallocateCommand(Command);
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ return true;
+}
+
+static int dac960_proc_show(struct seq_file *m, void *v)
+{
+ unsigned char *StatusMessage = "OK\n";
+ int ControllerNumber;
+ for (ControllerNumber = 0;
+ ControllerNumber < DAC960_ControllerCount;
+ ControllerNumber++)
+ {
+ DAC960_Controller_T *Controller = DAC960_Controllers[ControllerNumber];
+ if (Controller == NULL) continue;
+ if (Controller->MonitoringAlertMode)
+ {
+ StatusMessage = "ALERT\n";
+ break;
+ }
+ }
+ seq_puts(m, StatusMessage);
+ return 0;
+}
+
+static int dac960_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, dac960_proc_show, NULL);
+}
+
+static const struct file_operations dac960_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = dac960_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int dac960_initial_status_proc_show(struct seq_file *m, void *v)
+{
+ DAC960_Controller_T *Controller = (DAC960_Controller_T *)m->private;
+ seq_printf(m, "%.*s", Controller->InitialStatusLength, Controller->CombinedStatusBuffer);
+ return 0;
+}
+
+static int dac960_initial_status_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, dac960_initial_status_proc_show, PDE_DATA(inode));
+}
+
+static const struct file_operations dac960_initial_status_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = dac960_initial_status_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int dac960_current_status_proc_show(struct seq_file *m, void *v)
+{
+ DAC960_Controller_T *Controller = (DAC960_Controller_T *) m->private;
+ unsigned char *StatusMessage =
+ "No Rebuild or Consistency Check in Progress\n";
+ int ProgressMessageLength = strlen(StatusMessage);
+ if (jiffies != Controller->LastCurrentStatusTime)
+ {
+ Controller->CurrentStatusLength = 0;
+ DAC960_AnnounceDriver(Controller);
+ DAC960_ReportControllerConfiguration(Controller);
+ DAC960_ReportDeviceConfiguration(Controller);
+ if (Controller->ProgressBufferLength > 0)
+ ProgressMessageLength = Controller->ProgressBufferLength;
+ if (DAC960_CheckStatusBuffer(Controller, 2 + ProgressMessageLength))
+ {
+ unsigned char *CurrentStatusBuffer = Controller->CurrentStatusBuffer;
+ CurrentStatusBuffer[Controller->CurrentStatusLength++] = ' ';
+ CurrentStatusBuffer[Controller->CurrentStatusLength++] = ' ';
+ if (Controller->ProgressBufferLength > 0)
+ strcpy(&CurrentStatusBuffer[Controller->CurrentStatusLength],
+ Controller->ProgressBuffer);
+ else
+ strcpy(&CurrentStatusBuffer[Controller->CurrentStatusLength],
+ StatusMessage);
+ Controller->CurrentStatusLength += ProgressMessageLength;
+ }
+ Controller->LastCurrentStatusTime = jiffies;
+ }
+ seq_printf(m, "%.*s", Controller->CurrentStatusLength, Controller->CurrentStatusBuffer);
+ return 0;
+}
+
+static int dac960_current_status_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, dac960_current_status_proc_show, PDE_DATA(inode));
+}
+
+static const struct file_operations dac960_current_status_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = dac960_current_status_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int dac960_user_command_proc_show(struct seq_file *m, void *v)
+{
+ DAC960_Controller_T *Controller = (DAC960_Controller_T *)m->private;
+
+ seq_printf(m, "%.*s", Controller->UserStatusLength, Controller->UserStatusBuffer);
+ return 0;
+}
+
+static int dac960_user_command_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, dac960_user_command_proc_show, PDE_DATA(inode));
+}
+
+static ssize_t dac960_user_command_proc_write(struct file *file,
+ const char __user *Buffer,
+ size_t Count, loff_t *pos)
+{
+ DAC960_Controller_T *Controller = PDE_DATA(file_inode(file));
+ unsigned char CommandBuffer[80];
+ int Length;
+ if (Count > sizeof(CommandBuffer)-1) return -EINVAL;
+ if (copy_from_user(CommandBuffer, Buffer, Count)) return -EFAULT;
+ CommandBuffer[Count] = '\0';
+ Length = strlen(CommandBuffer);
+ if (Length > 0 && CommandBuffer[Length-1] == '\n')
+ CommandBuffer[--Length] = '\0';
+ if (Controller->FirmwareType == DAC960_V1_Controller)
+ return (DAC960_V1_ExecuteUserCommand(Controller, CommandBuffer)
+ ? Count : -EBUSY);
+ else
+ return (DAC960_V2_ExecuteUserCommand(Controller, CommandBuffer)
+ ? Count : -EBUSY);
+}
+
+static const struct file_operations dac960_user_command_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = dac960_user_command_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = dac960_user_command_proc_write,
+};
+
+/*
+ DAC960_CreateProcEntries creates the /proc/rd/... entries for the
+ DAC960 Driver.
+*/
+
+static void DAC960_CreateProcEntries(DAC960_Controller_T *Controller)
+{
+ struct proc_dir_entry *ControllerProcEntry;
+
+ if (DAC960_ProcDirectoryEntry == NULL) {
+ DAC960_ProcDirectoryEntry = proc_mkdir("rd", NULL);
+ proc_create("status", 0, DAC960_ProcDirectoryEntry,
+ &dac960_proc_fops);
+ }
+
+ sprintf(Controller->ControllerName, "c%d", Controller->ControllerNumber);
+ ControllerProcEntry = proc_mkdir(Controller->ControllerName,
+ DAC960_ProcDirectoryEntry);
+ proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller);
+ proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller);
+ proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller);
+ Controller->ControllerProcEntry = ControllerProcEntry;
+}
+
+
+/*
+ DAC960_DestroyProcEntries destroys the /proc/rd/... entries for the
+ DAC960 Driver.
+*/
+
+static void DAC960_DestroyProcEntries(DAC960_Controller_T *Controller)
+{
+ if (Controller->ControllerProcEntry == NULL)
+ return;
+ remove_proc_entry("initial_status", Controller->ControllerProcEntry);
+ remove_proc_entry("current_status", Controller->ControllerProcEntry);
+ remove_proc_entry("user_command", Controller->ControllerProcEntry);
+ remove_proc_entry(Controller->ControllerName, DAC960_ProcDirectoryEntry);
+ Controller->ControllerProcEntry = NULL;
+}
+
+#ifdef DAC960_GAM_MINOR
+
+/*
+ * DAC960_gam_ioctl is the ioctl function for performing RAID operations.
+*/
+
+static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
+ unsigned long Argument)
+{
+ long ErrorCode = 0;
+ if (!capable(CAP_SYS_ADMIN)) return -EACCES;
+
+ mutex_lock(&DAC960_mutex);
+ switch (Request)
+ {
+ case DAC960_IOCTL_GET_CONTROLLER_COUNT:
+ ErrorCode = DAC960_ControllerCount;
+ break;
+ case DAC960_IOCTL_GET_CONTROLLER_INFO:
+ {
+ DAC960_ControllerInfo_T __user *UserSpaceControllerInfo =
+ (DAC960_ControllerInfo_T __user *) Argument;
+ DAC960_ControllerInfo_T ControllerInfo;
+ DAC960_Controller_T *Controller;
+ int ControllerNumber;
+ if (UserSpaceControllerInfo == NULL)
+ ErrorCode = -EINVAL;
+ else ErrorCode = get_user(ControllerNumber,
+ &UserSpaceControllerInfo->ControllerNumber);
+ if (ErrorCode != 0)
+ break;
+ ErrorCode = -ENXIO;
+ if (ControllerNumber < 0 ||
+ ControllerNumber > DAC960_ControllerCount - 1) {
+ break;
+ }
+ Controller = DAC960_Controllers[ControllerNumber];
+ if (Controller == NULL)
+ break;
+ memset(&ControllerInfo, 0, sizeof(DAC960_ControllerInfo_T));
+ ControllerInfo.ControllerNumber = ControllerNumber;
+ ControllerInfo.FirmwareType = Controller->FirmwareType;
+ ControllerInfo.Channels = Controller->Channels;
+ ControllerInfo.Targets = Controller->Targets;
+ ControllerInfo.PCI_Bus = Controller->Bus;
+ ControllerInfo.PCI_Device = Controller->Device;
+ ControllerInfo.PCI_Function = Controller->Function;
+ ControllerInfo.IRQ_Channel = Controller->IRQ_Channel;
+ ControllerInfo.PCI_Address = Controller->PCI_Address;
+ strcpy(ControllerInfo.ModelName, Controller->ModelName);
+ strcpy(ControllerInfo.FirmwareVersion, Controller->FirmwareVersion);
+ ErrorCode = (copy_to_user(UserSpaceControllerInfo, &ControllerInfo,
+ sizeof(DAC960_ControllerInfo_T)) ? -EFAULT : 0);
+ break;
+ }
+ case DAC960_IOCTL_V1_EXECUTE_COMMAND:
+ {
+ DAC960_V1_UserCommand_T __user *UserSpaceUserCommand =
+ (DAC960_V1_UserCommand_T __user *) Argument;
+ DAC960_V1_UserCommand_T UserCommand;
+ DAC960_Controller_T *Controller;
+ DAC960_Command_T *Command = NULL;
+ DAC960_V1_CommandOpcode_T CommandOpcode;
+ DAC960_V1_CommandStatus_T CommandStatus;
+ DAC960_V1_DCDB_T DCDB;
+ DAC960_V1_DCDB_T *DCDB_IOBUF = NULL;
+ dma_addr_t DCDB_IOBUFDMA;
+ unsigned long flags;
+ int ControllerNumber, DataTransferLength;
+ unsigned char *DataTransferBuffer = NULL;
+ dma_addr_t DataTransferBufferDMA;
+ if (UserSpaceUserCommand == NULL) {
+ ErrorCode = -EINVAL;
+ break;
+ }
+ if (copy_from_user(&UserCommand, UserSpaceUserCommand,
+ sizeof(DAC960_V1_UserCommand_T))) {
+ ErrorCode = -EFAULT;
+ break;
+ }
+ ControllerNumber = UserCommand.ControllerNumber;
+ ErrorCode = -ENXIO;
+ if (ControllerNumber < 0 ||
+ ControllerNumber > DAC960_ControllerCount - 1)
+ break;
+ Controller = DAC960_Controllers[ControllerNumber];
+ if (Controller == NULL)
+ break;
+ ErrorCode = -EINVAL;
+ if (Controller->FirmwareType != DAC960_V1_Controller)
+ break;
+ CommandOpcode = UserCommand.CommandMailbox.Common.CommandOpcode;
+ DataTransferLength = UserCommand.DataTransferLength;
+ if (CommandOpcode & 0x80)
+ break;
+ if (CommandOpcode == DAC960_V1_DCDB)
+ {
+ if (copy_from_user(&DCDB, UserCommand.DCDB,
+ sizeof(DAC960_V1_DCDB_T))) {
+ ErrorCode = -EFAULT;
+ break;
+ }
+ if (DCDB.Channel >= DAC960_V1_MaxChannels)
+ break;
+ if (!((DataTransferLength == 0 &&
+ DCDB.Direction
+ == DAC960_V1_DCDB_NoDataTransfer) ||
+ (DataTransferLength > 0 &&
+ DCDB.Direction
+ == DAC960_V1_DCDB_DataTransferDeviceToSystem) ||
+ (DataTransferLength < 0 &&
+ DCDB.Direction
+ == DAC960_V1_DCDB_DataTransferSystemToDevice)))
+ break;
+ if (((DCDB.TransferLengthHigh4 << 16) | DCDB.TransferLength)
+ != abs(DataTransferLength))
+ break;
+ DCDB_IOBUF = pci_alloc_consistent(Controller->PCIDevice,
+ sizeof(DAC960_V1_DCDB_T), &DCDB_IOBUFDMA);
+ if (DCDB_IOBUF == NULL) {
+ ErrorCode = -ENOMEM;
+ break;
+ }
+ }
+ ErrorCode = -ENOMEM;
+ if (DataTransferLength > 0)
+ {
+ DataTransferBuffer = pci_zalloc_consistent(Controller->PCIDevice,
+ DataTransferLength,
+ &DataTransferBufferDMA);
+ if (DataTransferBuffer == NULL)
+ break;
+ }
+ else if (DataTransferLength < 0)
+ {
+ DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice,
+ -DataTransferLength, &DataTransferBufferDMA);
+ if (DataTransferBuffer == NULL)
+ break;
+ if (copy_from_user(DataTransferBuffer,
+ UserCommand.DataTransferBuffer,
+ -DataTransferLength)) {
+ ErrorCode = -EFAULT;
+ break;
+ }
+ }
+ if (CommandOpcode == DAC960_V1_DCDB)
+ {
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ while ((Command = DAC960_AllocateCommand(Controller)) == NULL)
+ DAC960_WaitForCommand(Controller);
+ while (Controller->V1.DirectCommandActive[DCDB.Channel]
+ [DCDB.TargetID])
+ {
+ spin_unlock_irq(&Controller->queue_lock);
+ __wait_event(Controller->CommandWaitQueue,
+ !Controller->V1.DirectCommandActive
+ [DCDB.Channel][DCDB.TargetID]);
+ spin_lock_irq(&Controller->queue_lock);
+ }
+ Controller->V1.DirectCommandActive[DCDB.Channel]
+ [DCDB.TargetID] = true;
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ DAC960_V1_ClearCommand(Command);
+ Command->CommandType = DAC960_ImmediateCommand;
+ memcpy(&Command->V1.CommandMailbox, &UserCommand.CommandMailbox,
+ sizeof(DAC960_V1_CommandMailbox_T));
+ Command->V1.CommandMailbox.Type3.BusAddress = DCDB_IOBUFDMA;
+ DCDB.BusAddress = DataTransferBufferDMA;
+ memcpy(DCDB_IOBUF, &DCDB, sizeof(DAC960_V1_DCDB_T));
+ }
+ else
+ {
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ while ((Command = DAC960_AllocateCommand(Controller)) == NULL)
+ DAC960_WaitForCommand(Controller);
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ DAC960_V1_ClearCommand(Command);
+ Command->CommandType = DAC960_ImmediateCommand;
+ memcpy(&Command->V1.CommandMailbox, &UserCommand.CommandMailbox,
+ sizeof(DAC960_V1_CommandMailbox_T));
+ if (DataTransferBuffer != NULL)
+ Command->V1.CommandMailbox.Type3.BusAddress =
+ DataTransferBufferDMA;
+ }
+ DAC960_ExecuteCommand(Command);
+ CommandStatus = Command->V1.CommandStatus;
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ DAC960_DeallocateCommand(Command);
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ if (DataTransferLength > 0)
+ {
+ if (copy_to_user(UserCommand.DataTransferBuffer,
+ DataTransferBuffer, DataTransferLength)) {
+ ErrorCode = -EFAULT;
+ goto Failure1;
+ }
+ }
+ if (CommandOpcode == DAC960_V1_DCDB)
+ {
+ /*
+ I don't believe Target or Channel in the DCDB_IOBUF
+ should be any different from the contents of DCDB.
+ */
+ Controller->V1.DirectCommandActive[DCDB.Channel]
+ [DCDB.TargetID] = false;
+ if (copy_to_user(UserCommand.DCDB, DCDB_IOBUF,
+ sizeof(DAC960_V1_DCDB_T))) {
+ ErrorCode = -EFAULT;
+ goto Failure1;
+ }
+ }
+ ErrorCode = CommandStatus;
+ Failure1:
+ if (DataTransferBuffer != NULL)
+ pci_free_consistent(Controller->PCIDevice, abs(DataTransferLength),
+ DataTransferBuffer, DataTransferBufferDMA);
+ if (DCDB_IOBUF != NULL)
+ pci_free_consistent(Controller->PCIDevice, sizeof(DAC960_V1_DCDB_T),
+ DCDB_IOBUF, DCDB_IOBUFDMA);
+ break;
+ }
+ case DAC960_IOCTL_V2_EXECUTE_COMMAND:
+ {
+ DAC960_V2_UserCommand_T __user *UserSpaceUserCommand =
+ (DAC960_V2_UserCommand_T __user *) Argument;
+ DAC960_V2_UserCommand_T UserCommand;
+ DAC960_Controller_T *Controller;
+ DAC960_Command_T *Command = NULL;
+ DAC960_V2_CommandMailbox_T *CommandMailbox;
+ DAC960_V2_CommandStatus_T CommandStatus;
+ unsigned long flags;
+ int ControllerNumber, DataTransferLength;
+ int DataTransferResidue, RequestSenseLength;
+ unsigned char *DataTransferBuffer = NULL;
+ dma_addr_t DataTransferBufferDMA;
+ unsigned char *RequestSenseBuffer = NULL;
+ dma_addr_t RequestSenseBufferDMA;
+
+ ErrorCode = -EINVAL;
+ if (UserSpaceUserCommand == NULL)
+ break;
+ if (copy_from_user(&UserCommand, UserSpaceUserCommand,
+ sizeof(DAC960_V2_UserCommand_T))) {
+ ErrorCode = -EFAULT;
+ break;
+ }
+ ErrorCode = -ENXIO;
+ ControllerNumber = UserCommand.ControllerNumber;
+ if (ControllerNumber < 0 ||
+ ControllerNumber > DAC960_ControllerCount - 1)
+ break;
+ Controller = DAC960_Controllers[ControllerNumber];
+ if (Controller == NULL)
+ break;
+ if (Controller->FirmwareType != DAC960_V2_Controller){
+ ErrorCode = -EINVAL;
+ break;
+ }
+ DataTransferLength = UserCommand.DataTransferLength;
+ ErrorCode = -ENOMEM;
+ if (DataTransferLength > 0)
+ {
+ DataTransferBuffer = pci_zalloc_consistent(Controller->PCIDevice,
+ DataTransferLength,
+ &DataTransferBufferDMA);
+ if (DataTransferBuffer == NULL)
+ break;
+ }
+ else if (DataTransferLength < 0)
+ {
+ DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice,
+ -DataTransferLength, &DataTransferBufferDMA);
+ if (DataTransferBuffer == NULL)
+ break;
+ if (copy_from_user(DataTransferBuffer,
+ UserCommand.DataTransferBuffer,
+ -DataTransferLength)) {
+ ErrorCode = -EFAULT;
+ goto Failure2;
+ }
+ }
+ RequestSenseLength = UserCommand.RequestSenseLength;
+ if (RequestSenseLength > 0)
+ {
+ RequestSenseBuffer = pci_zalloc_consistent(Controller->PCIDevice,
+ RequestSenseLength,
+ &RequestSenseBufferDMA);
+ if (RequestSenseBuffer == NULL)
+ {
+ ErrorCode = -ENOMEM;
+ goto Failure2;
+ }
+ }
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ while ((Command = DAC960_AllocateCommand(Controller)) == NULL)
+ DAC960_WaitForCommand(Controller);
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ DAC960_V2_ClearCommand(Command);
+ Command->CommandType = DAC960_ImmediateCommand;
+ CommandMailbox = &Command->V2.CommandMailbox;
+ memcpy(CommandMailbox, &UserCommand.CommandMailbox,
+ sizeof(DAC960_V2_CommandMailbox_T));
+ CommandMailbox->Common.CommandControlBits
+ .AdditionalScatterGatherListMemory = false;
+ CommandMailbox->Common.CommandControlBits
+ .NoAutoRequestSense = true;
+ CommandMailbox->Common.DataTransferSize = 0;
+ CommandMailbox->Common.DataTransferPageNumber = 0;
+ memset(&CommandMailbox->Common.DataTransferMemoryAddress, 0,
+ sizeof(DAC960_V2_DataTransferMemoryAddress_T));
+ if (DataTransferLength != 0)
+ {
+ if (DataTransferLength > 0)
+ {
+ CommandMailbox->Common.CommandControlBits
+ .DataTransferControllerToHost = true;
+ CommandMailbox->Common.DataTransferSize = DataTransferLength;
+ }
+ else
+ {
+ CommandMailbox->Common.CommandControlBits
+ .DataTransferControllerToHost = false;
+ CommandMailbox->Common.DataTransferSize = -DataTransferLength;
+ }
+ CommandMailbox->Common.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentDataPointer = DataTransferBufferDMA;
+ CommandMailbox->Common.DataTransferMemoryAddress
+ .ScatterGatherSegments[0]
+ .SegmentByteCount =
+ CommandMailbox->Common.DataTransferSize;
+ }
+ if (RequestSenseLength > 0)
+ {
+ CommandMailbox->Common.CommandControlBits
+ .NoAutoRequestSense = false;
+ CommandMailbox->Common.RequestSenseSize = RequestSenseLength;
+ CommandMailbox->Common.RequestSenseBusAddress =
+ RequestSenseBufferDMA;
+ }
+ DAC960_ExecuteCommand(Command);
+ CommandStatus = Command->V2.CommandStatus;
+ RequestSenseLength = Command->V2.RequestSenseLength;
+ DataTransferResidue = Command->V2.DataTransferResidue;
+ spin_lock_irqsave(&Controller->queue_lock, flags);
+ DAC960_DeallocateCommand(Command);
+ spin_unlock_irqrestore(&Controller->queue_lock, flags);
+ if (RequestSenseLength > UserCommand.RequestSenseLength)
+ RequestSenseLength = UserCommand.RequestSenseLength;
+ if (copy_to_user(&UserSpaceUserCommand->DataTransferLength,
+ &DataTransferResidue,
+ sizeof(DataTransferResidue))) {
+ ErrorCode = -EFAULT;
+ goto Failure2;
+ }
+ if (copy_to_user(&UserSpaceUserCommand->RequestSenseLength,
+ &RequestSenseLength, sizeof(RequestSenseLength))) {
+ ErrorCode = -EFAULT;
+ goto Failure2;
+ }
+ if (DataTransferLength > 0)
+ {
+ if (copy_to_user(UserCommand.DataTransferBuffer,
+ DataTransferBuffer, DataTransferLength)) {
+ ErrorCode = -EFAULT;
+ goto Failure2;
+ }
+ }
+ if (RequestSenseLength > 0)
+ {
+ if (copy_to_user(UserCommand.RequestSenseBuffer,
+ RequestSenseBuffer, RequestSenseLength)) {
+ ErrorCode = -EFAULT;
+ goto Failure2;
+ }
+ }
+ ErrorCode = CommandStatus;
+ Failure2:
+ pci_free_consistent(Controller->PCIDevice, abs(DataTransferLength),
+ DataTransferBuffer, DataTransferBufferDMA);
+ if (RequestSenseBuffer != NULL)
+ pci_free_consistent(Controller->PCIDevice, RequestSenseLength,
+ RequestSenseBuffer, RequestSenseBufferDMA);
+ break;
+ }
+ case DAC960_IOCTL_V2_GET_HEALTH_STATUS:
+ {
+ DAC960_V2_GetHealthStatus_T __user *UserSpaceGetHealthStatus =
+ (DAC960_V2_GetHealthStatus_T __user *) Argument;
+ DAC960_V2_GetHealthStatus_T GetHealthStatus;
+ DAC960_V2_HealthStatusBuffer_T HealthStatusBuffer;
+ DAC960_Controller_T *Controller;
+ int ControllerNumber;
+ if (UserSpaceGetHealthStatus == NULL) {
+ ErrorCode = -EINVAL;
+ break;
+ }
+ if (copy_from_user(&GetHealthStatus, UserSpaceGetHealthStatus,
+ sizeof(DAC960_V2_GetHealthStatus_T))) {
+ ErrorCode = -EFAULT;
+ break;
+ }
+ ErrorCode = -ENXIO;
+ ControllerNumber = GetHealthStatus.ControllerNumber;
+ if (ControllerNumber < 0 ||
+ ControllerNumber > DAC960_ControllerCount - 1)
+ break;
+ Controller = DAC960_Controllers[ControllerNumber];
+ if (Controller == NULL)
+ break;
+ if (Controller->FirmwareType != DAC960_V2_Controller) {
+ ErrorCode = -EINVAL;
+ break;
+ }
+ if (copy_from_user(&HealthStatusBuffer,
+ GetHealthStatus.HealthStatusBuffer,
+ sizeof(DAC960_V2_HealthStatusBuffer_T))) {
+ ErrorCode = -EFAULT;
+ break;
+ }
+ ErrorCode = wait_event_interruptible_timeout(Controller->HealthStatusWaitQueue,
+ !(Controller->V2.HealthStatusBuffer->StatusChangeCounter
+ == HealthStatusBuffer.StatusChangeCounter &&
+ Controller->V2.HealthStatusBuffer->NextEventSequenceNumber
+ == HealthStatusBuffer.NextEventSequenceNumber),
+ DAC960_MonitoringTimerInterval);
+ if (ErrorCode == -ERESTARTSYS) {
+ ErrorCode = -EINTR;
+ break;
+ }
+ if (copy_to_user(GetHealthStatus.HealthStatusBuffer,
+ Controller->V2.HealthStatusBuffer,
+ sizeof(DAC960_V2_HealthStatusBuffer_T)))
+ ErrorCode = -EFAULT;
+ else
+ ErrorCode = 0;
+ }
+ break;
+ default:
+ ErrorCode = -ENOTTY;
+ }
+ mutex_unlock(&DAC960_mutex);
+ return ErrorCode;
+}
+
+static const struct file_operations DAC960_gam_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = DAC960_gam_ioctl,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice DAC960_gam_dev = {
+ DAC960_GAM_MINOR,
+ "dac960_gam",
+ &DAC960_gam_fops
+};
+
+static int DAC960_gam_init(void)
+{
+ int ret;
+
+ ret = misc_register(&DAC960_gam_dev);
+ if (ret)
+ printk(KERN_ERR "DAC960_gam: can't misc_register on minor %d\n", DAC960_GAM_MINOR);
+ return ret;
+}
+
+static void DAC960_gam_cleanup(void)
+{
+ misc_deregister(&DAC960_gam_dev);
+}
+
+#endif /* DAC960_GAM_MINOR */
+
+static struct DAC960_privdata DAC960_GEM_privdata = {
+ .HardwareType = DAC960_GEM_Controller,
+ .FirmwareType = DAC960_V2_Controller,
+ .InterruptHandler = DAC960_GEM_InterruptHandler,
+ .MemoryWindowSize = DAC960_GEM_RegisterWindowSize,
+};
+
+
+static struct DAC960_privdata DAC960_BA_privdata = {
+ .HardwareType = DAC960_BA_Controller,
+ .FirmwareType = DAC960_V2_Controller,
+ .InterruptHandler = DAC960_BA_InterruptHandler,
+ .MemoryWindowSize = DAC960_BA_RegisterWindowSize,
+};
+
+static struct DAC960_privdata DAC960_LP_privdata = {
+ .HardwareType = DAC960_LP_Controller,
+ .FirmwareType = DAC960_V2_Controller,
+ .InterruptHandler = DAC960_LP_InterruptHandler,
+ .MemoryWindowSize = DAC960_LP_RegisterWindowSize,
+};
+
+static struct DAC960_privdata DAC960_LA_privdata = {
+ .HardwareType = DAC960_LA_Controller,
+ .FirmwareType = DAC960_V1_Controller,
+ .InterruptHandler = DAC960_LA_InterruptHandler,
+ .MemoryWindowSize = DAC960_LA_RegisterWindowSize,
+};
+
+static struct DAC960_privdata DAC960_PG_privdata = {
+ .HardwareType = DAC960_PG_Controller,
+ .FirmwareType = DAC960_V1_Controller,
+ .InterruptHandler = DAC960_PG_InterruptHandler,
+ .MemoryWindowSize = DAC960_PG_RegisterWindowSize,
+};
+
+static struct DAC960_privdata DAC960_PD_privdata = {
+ .HardwareType = DAC960_PD_Controller,
+ .FirmwareType = DAC960_V1_Controller,
+ .InterruptHandler = DAC960_PD_InterruptHandler,
+ .MemoryWindowSize = DAC960_PD_RegisterWindowSize,
+};
+
+static struct DAC960_privdata DAC960_P_privdata = {
+ .HardwareType = DAC960_P_Controller,
+ .FirmwareType = DAC960_V1_Controller,
+ .InterruptHandler = DAC960_P_InterruptHandler,
+ .MemoryWindowSize = DAC960_PD_RegisterWindowSize,
+};
+
+static const struct pci_device_id DAC960_id_table[] = {
+ {
+ .vendor = PCI_VENDOR_ID_MYLEX,
+ .device = PCI_DEVICE_ID_MYLEX_DAC960_GEM,
+ .subvendor = PCI_VENDOR_ID_MYLEX,
+ .subdevice = PCI_ANY_ID,
+ .driver_data = (unsigned long) &DAC960_GEM_privdata,
+ },
+ {
+ .vendor = PCI_VENDOR_ID_MYLEX,
+ .device = PCI_DEVICE_ID_MYLEX_DAC960_BA,
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ .driver_data = (unsigned long) &DAC960_BA_privdata,
+ },
+ {
+ .vendor = PCI_VENDOR_ID_MYLEX,
+ .device = PCI_DEVICE_ID_MYLEX_DAC960_LP,
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ .driver_data = (unsigned long) &DAC960_LP_privdata,
+ },
+ {
+ .vendor = PCI_VENDOR_ID_DEC,
+ .device = PCI_DEVICE_ID_DEC_21285,
+ .subvendor = PCI_VENDOR_ID_MYLEX,
+ .subdevice = PCI_DEVICE_ID_MYLEX_DAC960_LA,
+ .driver_data = (unsigned long) &DAC960_LA_privdata,
+ },
+ {
+ .vendor = PCI_VENDOR_ID_MYLEX,
+ .device = PCI_DEVICE_ID_MYLEX_DAC960_PG,
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ .driver_data = (unsigned long) &DAC960_PG_privdata,
+ },
+ {
+ .vendor = PCI_VENDOR_ID_MYLEX,
+ .device = PCI_DEVICE_ID_MYLEX_DAC960_PD,
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ .driver_data = (unsigned long) &DAC960_PD_privdata,
+ },
+ {
+ .vendor = PCI_VENDOR_ID_MYLEX,
+ .device = PCI_DEVICE_ID_MYLEX_DAC960_P,
+ .subvendor = PCI_ANY_ID,
+ .subdevice = PCI_ANY_ID,
+ .driver_data = (unsigned long) &DAC960_P_privdata,
+ },
+ {0, },
+};
+
+MODULE_DEVICE_TABLE(pci, DAC960_id_table);
+
+static struct pci_driver DAC960_pci_driver = {
+ .name = "DAC960",
+ .id_table = DAC960_id_table,
+ .probe = DAC960_Probe,
+ .remove = DAC960_Remove,
+};
+
+static int __init DAC960_init_module(void)
+{
+ int ret;
+
+ ret = pci_register_driver(&DAC960_pci_driver);
+#ifdef DAC960_GAM_MINOR
+ if (!ret)
+ DAC960_gam_init();
+#endif
+ return ret;
+}
+
+static void __exit DAC960_cleanup_module(void)
+{
+ int i;
+
+#ifdef DAC960_GAM_MINOR
+ DAC960_gam_cleanup();
+#endif
+
+ for (i = 0; i < DAC960_ControllerCount; i++) {
+ DAC960_Controller_T *Controller = DAC960_Controllers[i];
+ if (Controller == NULL)
+ continue;
+ DAC960_FinalizeController(Controller);
+ }
+ if (DAC960_ProcDirectoryEntry != NULL) {
+ remove_proc_entry("rd/status", NULL);
+ remove_proc_entry("rd", NULL);
+ }
+ DAC960_ControllerCount = 0;
+ pci_unregister_driver(&DAC960_pci_driver);
+}
+
+module_init(DAC960_init_module);
+module_exit(DAC960_cleanup_module);
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/DAC960.h b/drivers/block/DAC960.h
new file mode 100644
index 000000000..85fa9bb63
--- /dev/null
+++ b/drivers/block/DAC960.h
@@ -0,0 +1,4415 @@
+/*
+
+ Linux Driver for Mylex DAC960/AcceleRAID/eXtremeRAID PCI RAID Controllers
+
+ Copyright 1998-2001 by Leonard N. Zubkoff <lnz@dandelion.com>
+
+ This program is free software; you may redistribute and/or modify it under
+ the terms of the GNU General Public License Version 2 as published by the
+ Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY, without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for complete details.
+
+ The author respectfully requests that any modifications to this software be
+ sent directly to him for evaluation and testing.
+
+*/
+
+
+/*
+ Define the maximum number of DAC960 Controllers supported by this driver.
+*/
+
+#define DAC960_MaxControllers 8
+
+
+/*
+ Define the maximum number of Controller Channels supported by DAC960
+ V1 and V2 Firmware Controllers.
+*/
+
+#define DAC960_V1_MaxChannels 3
+#define DAC960_V2_MaxChannels 4
+
+
+/*
+ Define the maximum number of Targets per Channel supported by DAC960
+ V1 and V2 Firmware Controllers.
+*/
+
+#define DAC960_V1_MaxTargets 16
+#define DAC960_V2_MaxTargets 128
+
+
+/*
+ Define the maximum number of Logical Drives supported by DAC960
+ V1 and V2 Firmware Controllers.
+*/
+
+#define DAC960_MaxLogicalDrives 32
+
+
+/*
+ Define the maximum number of Physical Devices supported by DAC960
+ V1 and V2 Firmware Controllers.
+*/
+
+#define DAC960_V1_MaxPhysicalDevices 45
+#define DAC960_V2_MaxPhysicalDevices 272
+
+/*
+ Define a 32/64 bit I/O Address data type.
+*/
+
+typedef unsigned long DAC960_IO_Address_T;
+
+
+/*
+ Define a 32/64 bit PCI Bus Address data type.
+*/
+
+typedef unsigned long DAC960_PCI_Address_T;
+
+
+/*
+ Define a 32 bit Bus Address data type.
+*/
+
+typedef unsigned int DAC960_BusAddress32_T;
+
+
+/*
+ Define a 64 bit Bus Address data type.
+*/
+
+typedef unsigned long long DAC960_BusAddress64_T;
+
+
+/*
+ Define a 32 bit Byte Count data type.
+*/
+
+typedef unsigned int DAC960_ByteCount32_T;
+
+
+/*
+ Define a 64 bit Byte Count data type.
+*/
+
+typedef unsigned long long DAC960_ByteCount64_T;
+
+
+/*
+ dma_loaf is used by helper routines to divide a region of
+ dma mapped memory into smaller pieces, where those pieces
+ are not of uniform size.
+ */
+
+struct dma_loaf {
+ void *cpu_base;
+ dma_addr_t dma_base;
+ size_t length;
+ void *cpu_free;
+ dma_addr_t dma_free;
+};
+
+/*
+ Define the SCSI INQUIRY Standard Data structure.
+*/
+
+typedef struct DAC960_SCSI_Inquiry
+{
+ unsigned char PeripheralDeviceType:5; /* Byte 0 Bits 0-4 */
+ unsigned char PeripheralQualifier:3; /* Byte 0 Bits 5-7 */
+ unsigned char DeviceTypeModifier:7; /* Byte 1 Bits 0-6 */
+ bool RMB:1; /* Byte 1 Bit 7 */
+ unsigned char ANSI_ApprovedVersion:3; /* Byte 2 Bits 0-2 */
+ unsigned char ECMA_Version:3; /* Byte 2 Bits 3-5 */
+ unsigned char ISO_Version:2; /* Byte 2 Bits 6-7 */
+ unsigned char ResponseDataFormat:4; /* Byte 3 Bits 0-3 */
+ unsigned char :2; /* Byte 3 Bits 4-5 */
+ bool TrmIOP:1; /* Byte 3 Bit 6 */
+ bool AENC:1; /* Byte 3 Bit 7 */
+ unsigned char AdditionalLength; /* Byte 4 */
+ unsigned char :8; /* Byte 5 */
+ unsigned char :8; /* Byte 6 */
+ bool SftRe:1; /* Byte 7 Bit 0 */
+ bool CmdQue:1; /* Byte 7 Bit 1 */
+ bool :1; /* Byte 7 Bit 2 */
+ bool Linked:1; /* Byte 7 Bit 3 */
+ bool Sync:1; /* Byte 7 Bit 4 */
+ bool WBus16:1; /* Byte 7 Bit 5 */
+ bool WBus32:1; /* Byte 7 Bit 6 */
+ bool RelAdr:1; /* Byte 7 Bit 7 */
+ unsigned char VendorIdentification[8]; /* Bytes 8-15 */
+ unsigned char ProductIdentification[16]; /* Bytes 16-31 */
+ unsigned char ProductRevisionLevel[4]; /* Bytes 32-35 */
+}
+DAC960_SCSI_Inquiry_T;
+
+
+/*
+ Define the SCSI INQUIRY Unit Serial Number structure.
+*/
+
+typedef struct DAC960_SCSI_Inquiry_UnitSerialNumber
+{
+ unsigned char PeripheralDeviceType:5; /* Byte 0 Bits 0-4 */
+ unsigned char PeripheralQualifier:3; /* Byte 0 Bits 5-7 */
+ unsigned char PageCode; /* Byte 1 */
+ unsigned char :8; /* Byte 2 */
+ unsigned char PageLength; /* Byte 3 */
+ unsigned char ProductSerialNumber[28]; /* Bytes 4-31 */
+}
+DAC960_SCSI_Inquiry_UnitSerialNumber_T;
+
+
+/*
+ Define the SCSI REQUEST SENSE Sense Key type.
+*/
+
+typedef enum
+{
+ DAC960_SenseKey_NoSense = 0x0,
+ DAC960_SenseKey_RecoveredError = 0x1,
+ DAC960_SenseKey_NotReady = 0x2,
+ DAC960_SenseKey_MediumError = 0x3,
+ DAC960_SenseKey_HardwareError = 0x4,
+ DAC960_SenseKey_IllegalRequest = 0x5,
+ DAC960_SenseKey_UnitAttention = 0x6,
+ DAC960_SenseKey_DataProtect = 0x7,
+ DAC960_SenseKey_BlankCheck = 0x8,
+ DAC960_SenseKey_VendorSpecific = 0x9,
+ DAC960_SenseKey_CopyAborted = 0xA,
+ DAC960_SenseKey_AbortedCommand = 0xB,
+ DAC960_SenseKey_Equal = 0xC,
+ DAC960_SenseKey_VolumeOverflow = 0xD,
+ DAC960_SenseKey_Miscompare = 0xE,
+ DAC960_SenseKey_Reserved = 0xF
+}
+__attribute__ ((packed))
+DAC960_SCSI_RequestSenseKey_T;
+
+
+/*
+ Define the SCSI REQUEST SENSE structure.
+*/
+
+typedef struct DAC960_SCSI_RequestSense
+{
+ unsigned char ErrorCode:7; /* Byte 0 Bits 0-6 */
+ bool Valid:1; /* Byte 0 Bit 7 */
+ unsigned char SegmentNumber; /* Byte 1 */
+ DAC960_SCSI_RequestSenseKey_T SenseKey:4; /* Byte 2 Bits 0-3 */
+ unsigned char :1; /* Byte 2 Bit 4 */
+ bool ILI:1; /* Byte 2 Bit 5 */
+ bool EOM:1; /* Byte 2 Bit 6 */
+ bool Filemark:1; /* Byte 2 Bit 7 */
+ unsigned char Information[4]; /* Bytes 3-6 */
+ unsigned char AdditionalSenseLength; /* Byte 7 */
+ unsigned char CommandSpecificInformation[4]; /* Bytes 8-11 */
+ unsigned char AdditionalSenseCode; /* Byte 12 */
+ unsigned char AdditionalSenseCodeQualifier; /* Byte 13 */
+}
+DAC960_SCSI_RequestSense_T;
+
+
+/*
+ Define the DAC960 V1 Firmware Command Opcodes.
+*/
+
+typedef enum
+{
+ /* I/O Commands */
+ DAC960_V1_ReadExtended = 0x33,
+ DAC960_V1_WriteExtended = 0x34,
+ DAC960_V1_ReadAheadExtended = 0x35,
+ DAC960_V1_ReadExtendedWithScatterGather = 0xB3,
+ DAC960_V1_WriteExtendedWithScatterGather = 0xB4,
+ DAC960_V1_Read = 0x36,
+ DAC960_V1_ReadWithScatterGather = 0xB6,
+ DAC960_V1_Write = 0x37,
+ DAC960_V1_WriteWithScatterGather = 0xB7,
+ DAC960_V1_DCDB = 0x04,
+ DAC960_V1_DCDBWithScatterGather = 0x84,
+ DAC960_V1_Flush = 0x0A,
+ /* Controller Status Related Commands */
+ DAC960_V1_Enquiry = 0x53,
+ DAC960_V1_Enquiry2 = 0x1C,
+ DAC960_V1_GetLogicalDriveElement = 0x55,
+ DAC960_V1_GetLogicalDriveInformation = 0x19,
+ DAC960_V1_IOPortRead = 0x39,
+ DAC960_V1_IOPortWrite = 0x3A,
+ DAC960_V1_GetSDStats = 0x3E,
+ DAC960_V1_GetPDStats = 0x3F,
+ DAC960_V1_PerformEventLogOperation = 0x72,
+ /* Device Related Commands */
+ DAC960_V1_StartDevice = 0x10,
+ DAC960_V1_GetDeviceState = 0x50,
+ DAC960_V1_StopChannel = 0x13,
+ DAC960_V1_StartChannel = 0x12,
+ DAC960_V1_ResetChannel = 0x1A,
+ /* Commands Associated with Data Consistency and Errors */
+ DAC960_V1_Rebuild = 0x09,
+ DAC960_V1_RebuildAsync = 0x16,
+ DAC960_V1_CheckConsistency = 0x0F,
+ DAC960_V1_CheckConsistencyAsync = 0x1E,
+ DAC960_V1_RebuildStat = 0x0C,
+ DAC960_V1_GetRebuildProgress = 0x27,
+ DAC960_V1_RebuildControl = 0x1F,
+ DAC960_V1_ReadBadBlockTable = 0x0B,
+ DAC960_V1_ReadBadDataTable = 0x25,
+ DAC960_V1_ClearBadDataTable = 0x26,
+ DAC960_V1_GetErrorTable = 0x17,
+ DAC960_V1_AddCapacityAsync = 0x2A,
+ DAC960_V1_BackgroundInitializationControl = 0x2B,
+ /* Configuration Related Commands */
+ DAC960_V1_ReadConfig2 = 0x3D,
+ DAC960_V1_WriteConfig2 = 0x3C,
+ DAC960_V1_ReadConfigurationOnDisk = 0x4A,
+ DAC960_V1_WriteConfigurationOnDisk = 0x4B,
+ DAC960_V1_ReadConfiguration = 0x4E,
+ DAC960_V1_ReadBackupConfiguration = 0x4D,
+ DAC960_V1_WriteConfiguration = 0x4F,
+ DAC960_V1_AddConfiguration = 0x4C,
+ DAC960_V1_ReadConfigurationLabel = 0x48,
+ DAC960_V1_WriteConfigurationLabel = 0x49,
+ /* Firmware Upgrade Related Commands */
+ DAC960_V1_LoadImage = 0x20,
+ DAC960_V1_StoreImage = 0x21,
+ DAC960_V1_ProgramImage = 0x22,
+ /* Diagnostic Commands */
+ DAC960_V1_SetDiagnosticMode = 0x31,
+ DAC960_V1_RunDiagnostic = 0x32,
+ /* Subsystem Service Commands */
+ DAC960_V1_GetSubsystemData = 0x70,
+ DAC960_V1_SetSubsystemParameters = 0x71,
+ /* Version 2.xx Firmware Commands */
+ DAC960_V1_Enquiry_Old = 0x05,
+ DAC960_V1_GetDeviceState_Old = 0x14,
+ DAC960_V1_Read_Old = 0x02,
+ DAC960_V1_Write_Old = 0x03,
+ DAC960_V1_ReadWithScatterGather_Old = 0x82,
+ DAC960_V1_WriteWithScatterGather_Old = 0x83
+}
+__attribute__ ((packed))
+DAC960_V1_CommandOpcode_T;
+
+
+/*
+ Define the DAC960 V1 Firmware Command Identifier type.
+*/
+
+typedef unsigned char DAC960_V1_CommandIdentifier_T;
+
+
+/*
+ Define the DAC960 V1 Firmware Command Status Codes.
+*/
+
+#define DAC960_V1_NormalCompletion 0x0000 /* Common */
+#define DAC960_V1_CheckConditionReceived 0x0002 /* Common */
+#define DAC960_V1_NoDeviceAtAddress 0x0102 /* Common */
+#define DAC960_V1_InvalidDeviceAddress 0x0105 /* Common */
+#define DAC960_V1_InvalidParameter 0x0105 /* Common */
+#define DAC960_V1_IrrecoverableDataError 0x0001 /* I/O */
+#define DAC960_V1_LogicalDriveNonexistentOrOffline 0x0002 /* I/O */
+#define DAC960_V1_AccessBeyondEndOfLogicalDrive 0x0105 /* I/O */
+#define DAC960_V1_BadDataEncountered 0x010C /* I/O */
+#define DAC960_V1_DeviceBusy 0x0008 /* DCDB */
+#define DAC960_V1_DeviceNonresponsive 0x000E /* DCDB */
+#define DAC960_V1_CommandTerminatedAbnormally 0x000F /* DCDB */
+#define DAC960_V1_UnableToStartDevice 0x0002 /* Device */
+#define DAC960_V1_InvalidChannelOrTargetOrModifier 0x0105 /* Device */
+#define DAC960_V1_ChannelBusy 0x0106 /* Device */
+#define DAC960_V1_ChannelNotStopped 0x0002 /* Device */
+#define DAC960_V1_AttemptToRebuildOnlineDrive 0x0002 /* Consistency */
+#define DAC960_V1_RebuildBadBlocksEncountered 0x0003 /* Consistency */
+#define DAC960_V1_NewDiskFailedDuringRebuild 0x0004 /* Consistency */
+#define DAC960_V1_RebuildOrCheckAlreadyInProgress 0x0106 /* Consistency */
+#define DAC960_V1_DependentDiskIsDead 0x0002 /* Consistency */
+#define DAC960_V1_InconsistentBlocksFound 0x0003 /* Consistency */
+#define DAC960_V1_InvalidOrNonredundantLogicalDrive 0x0105 /* Consistency */
+#define DAC960_V1_NoRebuildOrCheckInProgress 0x0105 /* Consistency */
+#define DAC960_V1_RebuildInProgress_DataValid 0x0000 /* Consistency */
+#define DAC960_V1_RebuildFailed_LogicalDriveFailure 0x0002 /* Consistency */
+#define DAC960_V1_RebuildFailed_BadBlocksOnOther 0x0003 /* Consistency */
+#define DAC960_V1_RebuildFailed_NewDriveFailed 0x0004 /* Consistency */
+#define DAC960_V1_RebuildSuccessful 0x0100 /* Consistency */
+#define DAC960_V1_RebuildSuccessfullyTerminated 0x0107 /* Consistency */
+#define DAC960_V1_BackgroundInitSuccessful 0x0100 /* Consistency */
+#define DAC960_V1_BackgroundInitAborted 0x0005 /* Consistency */
+#define DAC960_V1_NoBackgroundInitInProgress 0x0105 /* Consistency */
+#define DAC960_V1_AddCapacityInProgress 0x0004 /* Consistency */
+#define DAC960_V1_AddCapacityFailedOrSuspended 0x00F4 /* Consistency */
+#define DAC960_V1_Config2ChecksumError 0x0002 /* Configuration */
+#define DAC960_V1_ConfigurationSuspended 0x0106 /* Configuration */
+#define DAC960_V1_FailedToConfigureNVRAM 0x0105 /* Configuration */
+#define DAC960_V1_ConfigurationNotSavedStateChange 0x0106 /* Configuration */
+#define DAC960_V1_SubsystemNotInstalled 0x0001 /* Subsystem */
+#define DAC960_V1_SubsystemFailed 0x0002 /* Subsystem */
+#define DAC960_V1_SubsystemBusy 0x0106 /* Subsystem */
+
+typedef unsigned short DAC960_V1_CommandStatus_T;
+
+
+/*
+ Define the DAC960 V1 Firmware Enquiry Command reply structure.
+*/
+
+typedef struct DAC960_V1_Enquiry
+{
+ unsigned char NumberOfLogicalDrives; /* Byte 0 */
+ unsigned int :24; /* Bytes 1-3 */
+ unsigned int LogicalDriveSizes[32]; /* Bytes 4-131 */
+ unsigned short FlashAge; /* Bytes 132-133 */
+ struct {
+ bool DeferredWriteError:1; /* Byte 134 Bit 0 */
+ bool BatteryLow:1; /* Byte 134 Bit 1 */
+ unsigned char :6; /* Byte 134 Bits 2-7 */
+ } StatusFlags;
+ unsigned char :8; /* Byte 135 */
+ unsigned char MinorFirmwareVersion; /* Byte 136 */
+ unsigned char MajorFirmwareVersion; /* Byte 137 */
+ enum {
+ DAC960_V1_NoStandbyRebuildOrCheckInProgress = 0x00,
+ DAC960_V1_StandbyRebuildInProgress = 0x01,
+ DAC960_V1_BackgroundRebuildInProgress = 0x02,
+ DAC960_V1_BackgroundCheckInProgress = 0x03,
+ DAC960_V1_StandbyRebuildCompletedWithError = 0xFF,
+ DAC960_V1_BackgroundRebuildOrCheckFailed_DriveFailed = 0xF0,
+ DAC960_V1_BackgroundRebuildOrCheckFailed_LogicalDriveFailed = 0xF1,
+ DAC960_V1_BackgroundRebuildOrCheckFailed_OtherCauses = 0xF2,
+ DAC960_V1_BackgroundRebuildOrCheckSuccessfullyTerminated = 0xF3
+ } __attribute__ ((packed)) RebuildFlag; /* Byte 138 */
+ unsigned char MaxCommands; /* Byte 139 */
+ unsigned char OfflineLogicalDriveCount; /* Byte 140 */
+ unsigned char :8; /* Byte 141 */
+ unsigned short EventLogSequenceNumber; /* Bytes 142-143 */
+ unsigned char CriticalLogicalDriveCount; /* Byte 144 */
+ unsigned int :24; /* Bytes 145-147 */
+ unsigned char DeadDriveCount; /* Byte 148 */
+ unsigned char :8; /* Byte 149 */
+ unsigned char RebuildCount; /* Byte 150 */
+ struct {
+ unsigned char :3; /* Byte 151 Bits 0-2 */
+ bool BatteryBackupUnitPresent:1; /* Byte 151 Bit 3 */
+ unsigned char :3; /* Byte 151 Bits 4-6 */
+ unsigned char :1; /* Byte 151 Bit 7 */
+ } MiscFlags;
+ struct {
+ unsigned char TargetID;
+ unsigned char Channel;
+ } DeadDrives[21]; /* Bytes 152-194 */
+ unsigned char Reserved[62]; /* Bytes 195-255 */
+}
+__attribute__ ((packed))
+DAC960_V1_Enquiry_T;
+
+
+/*
+ Define the DAC960 V1 Firmware Enquiry2 Command reply structure.
+*/
+
+typedef struct DAC960_V1_Enquiry2
+{
+ struct {
+ enum {
+ DAC960_V1_P_PD_PU = 0x01,
+ DAC960_V1_PL = 0x02,
+ DAC960_V1_PG = 0x10,
+ DAC960_V1_PJ = 0x11,
+ DAC960_V1_PR = 0x12,
+ DAC960_V1_PT = 0x13,
+ DAC960_V1_PTL0 = 0x14,
+ DAC960_V1_PRL = 0x15,
+ DAC960_V1_PTL1 = 0x16,
+ DAC960_V1_1164P = 0x20
+ } __attribute__ ((packed)) SubModel; /* Byte 0 */
+ unsigned char ActualChannels; /* Byte 1 */
+ enum {
+ DAC960_V1_FiveChannelBoard = 0x01,
+ DAC960_V1_ThreeChannelBoard = 0x02,
+ DAC960_V1_TwoChannelBoard = 0x03,
+ DAC960_V1_ThreeChannelASIC_DAC = 0x04
+ } __attribute__ ((packed)) Model; /* Byte 2 */
+ enum {
+ DAC960_V1_EISA_Controller = 0x01,
+ DAC960_V1_MicroChannel_Controller = 0x02,
+ DAC960_V1_PCI_Controller = 0x03,
+ DAC960_V1_SCSItoSCSI_Controller = 0x08
+ } __attribute__ ((packed)) ProductFamily; /* Byte 3 */
+ } HardwareID; /* Bytes 0-3 */
+ /* MajorVersion.MinorVersion-FirmwareType-TurnID */
+ struct {
+ unsigned char MajorVersion; /* Byte 4 */
+ unsigned char MinorVersion; /* Byte 5 */
+ unsigned char TurnID; /* Byte 6 */
+ char FirmwareType; /* Byte 7 */
+ } FirmwareID; /* Bytes 4-7 */
+ unsigned char :8; /* Byte 8 */
+ unsigned int :24; /* Bytes 9-11 */
+ unsigned char ConfiguredChannels; /* Byte 12 */
+ unsigned char ActualChannels; /* Byte 13 */
+ unsigned char MaxTargets; /* Byte 14 */
+ unsigned char MaxTags; /* Byte 15 */
+ unsigned char MaxLogicalDrives; /* Byte 16 */
+ unsigned char MaxArms; /* Byte 17 */
+ unsigned char MaxSpans; /* Byte 18 */
+ unsigned char :8; /* Byte 19 */
+ unsigned int :32; /* Bytes 20-23 */
+ unsigned int MemorySize; /* Bytes 24-27 */
+ unsigned int CacheSize; /* Bytes 28-31 */
+ unsigned int FlashMemorySize; /* Bytes 32-35 */
+ unsigned int NonVolatileMemorySize; /* Bytes 36-39 */
+ struct {
+ enum {
+ DAC960_V1_RamType_DRAM = 0x0,
+ DAC960_V1_RamType_EDO = 0x1,
+ DAC960_V1_RamType_SDRAM = 0x2,
+ DAC960_V1_RamType_Last = 0x7
+ } __attribute__ ((packed)) RamType:3; /* Byte 40 Bits 0-2 */
+ enum {
+ DAC960_V1_ErrorCorrection_None = 0x0,
+ DAC960_V1_ErrorCorrection_Parity = 0x1,
+ DAC960_V1_ErrorCorrection_ECC = 0x2,
+ DAC960_V1_ErrorCorrection_Last = 0x7
+ } __attribute__ ((packed)) ErrorCorrection:3; /* Byte 40 Bits 3-5 */
+ bool FastPageMode:1; /* Byte 40 Bit 6 */
+ bool LowPowerMemory:1; /* Byte 40 Bit 7 */
+ unsigned char :8; /* Bytes 41 */
+ } MemoryType;
+ unsigned short ClockSpeed; /* Bytes 42-43 */
+ unsigned short MemorySpeed; /* Bytes 44-45 */
+ unsigned short HardwareSpeed; /* Bytes 46-47 */
+ unsigned int :32; /* Bytes 48-51 */
+ unsigned int :32; /* Bytes 52-55 */
+ unsigned char :8; /* Byte 56 */
+ unsigned char :8; /* Byte 57 */
+ unsigned short :16; /* Bytes 58-59 */
+ unsigned short MaxCommands; /* Bytes 60-61 */
+ unsigned short MaxScatterGatherEntries; /* Bytes 62-63 */
+ unsigned short MaxDriveCommands; /* Bytes 64-65 */
+ unsigned short MaxIODescriptors; /* Bytes 66-67 */
+ unsigned short MaxCombinedSectors; /* Bytes 68-69 */
+ unsigned char Latency; /* Byte 70 */
+ unsigned char :8; /* Byte 71 */
+ unsigned char SCSITimeout; /* Byte 72 */
+ unsigned char :8; /* Byte 73 */
+ unsigned short MinFreeLines; /* Bytes 74-75 */
+ unsigned int :32; /* Bytes 76-79 */
+ unsigned int :32; /* Bytes 80-83 */
+ unsigned char RebuildRateConstant; /* Byte 84 */
+ unsigned char :8; /* Byte 85 */
+ unsigned char :8; /* Byte 86 */
+ unsigned char :8; /* Byte 87 */
+ unsigned int :32; /* Bytes 88-91 */
+ unsigned int :32; /* Bytes 92-95 */
+ unsigned short PhysicalDriveBlockSize; /* Bytes 96-97 */
+ unsigned short LogicalDriveBlockSize; /* Bytes 98-99 */
+ unsigned short MaxBlocksPerCommand; /* Bytes 100-101 */
+ unsigned short BlockFactor; /* Bytes 102-103 */
+ unsigned short CacheLineSize; /* Bytes 104-105 */
+ struct {
+ enum {
+ DAC960_V1_Narrow_8bit = 0x0,
+ DAC960_V1_Wide_16bit = 0x1,
+ DAC960_V1_Wide_32bit = 0x2
+ } __attribute__ ((packed)) BusWidth:2; /* Byte 106 Bits 0-1 */
+ enum {
+ DAC960_V1_Fast = 0x0,
+ DAC960_V1_Ultra = 0x1,
+ DAC960_V1_Ultra2 = 0x2
+ } __attribute__ ((packed)) BusSpeed:2; /* Byte 106 Bits 2-3 */
+ bool Differential:1; /* Byte 106 Bit 4 */
+ unsigned char :3; /* Byte 106 Bits 5-7 */
+ } SCSICapability;
+ unsigned char :8; /* Byte 107 */
+ unsigned int :32; /* Bytes 108-111 */
+ unsigned short FirmwareBuildNumber; /* Bytes 112-113 */
+ enum {
+ DAC960_V1_AEMI = 0x01,
+ DAC960_V1_OEM1 = 0x02,
+ DAC960_V1_OEM2 = 0x04,
+ DAC960_V1_OEM3 = 0x08,
+ DAC960_V1_Conner = 0x10,
+ DAC960_V1_SAFTE = 0x20
+ } __attribute__ ((packed)) FaultManagementType; /* Byte 114 */
+ unsigned char :8; /* Byte 115 */
+ struct {
+ bool Clustering:1; /* Byte 116 Bit 0 */
+ bool MylexOnlineRAIDExpansion:1; /* Byte 116 Bit 1 */
+ bool ReadAhead:1; /* Byte 116 Bit 2 */
+ bool BackgroundInitialization:1; /* Byte 116 Bit 3 */
+ unsigned int :28; /* Bytes 116-119 */
+ } FirmwareFeatures;
+ unsigned int :32; /* Bytes 120-123 */
+ unsigned int :32; /* Bytes 124-127 */
+}
+DAC960_V1_Enquiry2_T;
+
+
+/*
+ Define the DAC960 V1 Firmware Logical Drive State type.
+*/
+
+typedef enum
+{
+ DAC960_V1_LogicalDrive_Online = 0x03,
+ DAC960_V1_LogicalDrive_Critical = 0x04,
+ DAC960_V1_LogicalDrive_Offline = 0xFF
+}
+__attribute__ ((packed))
+DAC960_V1_LogicalDriveState_T;
+
+
+/*
+ Define the DAC960 V1 Firmware Logical Drive Information structure.
+*/
+
+typedef struct DAC960_V1_LogicalDriveInformation
+{
+ unsigned int LogicalDriveSize; /* Bytes 0-3 */
+ DAC960_V1_LogicalDriveState_T LogicalDriveState; /* Byte 4 */
+ unsigned char RAIDLevel:7; /* Byte 5 Bits 0-6 */
+ bool WriteBack:1; /* Byte 5 Bit 7 */
+ unsigned short :16; /* Bytes 6-7 */
+}
+DAC960_V1_LogicalDriveInformation_T;
+
+
+/*
+ Define the DAC960 V1 Firmware Get Logical Drive Information Command
+ reply structure.
+*/
+
+typedef DAC960_V1_LogicalDriveInformation_T
+ DAC960_V1_LogicalDriveInformationArray_T[DAC960_MaxLogicalDrives];
+
+
+/*
+ Define the DAC960 V1 Firmware Perform Event Log Operation Types.
+*/
+
+typedef enum
+{
+ DAC960_V1_GetEventLogEntry = 0x00
+}
+__attribute__ ((packed))
+DAC960_V1_PerformEventLogOpType_T;
+
+
+/*
+ Define the DAC960 V1 Firmware Get Event Log Entry Command reply structure.
+*/
+
+typedef struct DAC960_V1_EventLogEntry
+{
+ unsigned char MessageType; /* Byte 0 */
+ unsigned char MessageLength; /* Byte 1 */
+ unsigned char TargetID:5; /* Byte 2 Bits 0-4 */
+ unsigned char Channel:3; /* Byte 2 Bits 5-7 */
+ unsigned char LogicalUnit:6; /* Byte 3 Bits 0-5 */
+ unsigned char :2; /* Byte 3 Bits 6-7 */
+ unsigned short SequenceNumber; /* Bytes 4-5 */
+ unsigned char ErrorCode:7; /* Byte 6 Bits 0-6 */
+ bool Valid:1; /* Byte 6 Bit 7 */
+ unsigned char SegmentNumber; /* Byte 7 */
+ DAC960_SCSI_RequestSenseKey_T SenseKey:4; /* Byte 8 Bits 0-3 */
+ unsigned char :1; /* Byte 8 Bit 4 */
+ bool ILI:1; /* Byte 8 Bit 5 */
+ bool EOM:1; /* Byte 8 Bit 6 */
+ bool Filemark:1; /* Byte 8 Bit 7 */
+ unsigned char Information[4]; /* Bytes 9-12 */
+ unsigned char AdditionalSenseLength; /* Byte 13 */
+ unsigned char CommandSpecificInformation[4]; /* Bytes 14-17 */
+ unsigned char AdditionalSenseCode; /* Byte 18 */
+ unsigned char AdditionalSenseCodeQualifier; /* Byte 19 */
+ unsigned char Dummy[12]; /* Bytes 20-31 */
+}
+DAC960_V1_EventLogEntry_T;
+
+
+/*
+ Define the DAC960 V1 Firmware Physical Device State type.
+*/
+
+typedef enum
+{
+ DAC960_V1_Device_Dead = 0x00,
+ DAC960_V1_Device_WriteOnly = 0x02,
+ DAC960_V1_Device_Online = 0x03,
+ DAC960_V1_Device_Standby = 0x10
+}
+__attribute__ ((packed))
+DAC960_V1_PhysicalDeviceState_T;
+
+
+/*
+ Define the DAC960 V1 Firmware Get Device State Command reply structure.
+ The structure is padded by 2 bytes for compatibility with Version 2.xx
+ Firmware.
+*/
+
+typedef struct DAC960_V1_DeviceState
+{
+ bool Present:1; /* Byte 0 Bit 0 */
+ unsigned char :7; /* Byte 0 Bits 1-7 */
+ enum {
+ DAC960_V1_OtherType = 0x0,
+ DAC960_V1_DiskType = 0x1,
+ DAC960_V1_SequentialType = 0x2,
+ DAC960_V1_CDROM_or_WORM_Type = 0x3
+ } __attribute__ ((packed)) DeviceType:2; /* Byte 1 Bits 0-1 */
+ bool :1; /* Byte 1 Bit 2 */
+ bool Fast20:1; /* Byte 1 Bit 3 */
+ bool Sync:1; /* Byte 1 Bit 4 */
+ bool Fast:1; /* Byte 1 Bit 5 */
+ bool Wide:1; /* Byte 1 Bit 6 */
+ bool TaggedQueuingSupported:1; /* Byte 1 Bit 7 */
+ DAC960_V1_PhysicalDeviceState_T DeviceState; /* Byte 2 */
+ unsigned char :8; /* Byte 3 */
+ unsigned char SynchronousMultiplier; /* Byte 4 */
+ unsigned char SynchronousOffset:5; /* Byte 5 Bits 0-4 */
+ unsigned char :3; /* Byte 5 Bits 5-7 */
+ unsigned int DiskSize __attribute__ ((packed)); /* Bytes 6-9 */
+ unsigned short :16; /* Bytes 10-11 */
+}
+DAC960_V1_DeviceState_T;
+
+
+/*
+ Define the DAC960 V1 Firmware Get Rebuild Progress Command reply structure.
+*/
+
+typedef struct DAC960_V1_RebuildProgress
+{
+ unsigned int LogicalDriveNumber; /* Bytes 0-3 */
+ unsigned int LogicalDriveSize; /* Bytes 4-7 */
+ unsigned int RemainingBlocks; /* Bytes 8-11 */
+}
+DAC960_V1_RebuildProgress_T;
+
+
+/*
+ Define the DAC960 V1 Firmware Background Initialization Status Command
+ reply structure.
+*/
+
+typedef struct DAC960_V1_BackgroundInitializationStatus
+{
+ unsigned int LogicalDriveSize; /* Bytes 0-3 */
+ unsigned int BlocksCompleted; /* Bytes 4-7 */
+ unsigned char Reserved1[12]; /* Bytes 8-19 */
+ unsigned int LogicalDriveNumber; /* Bytes 20-23 */
+ unsigned char RAIDLevel; /* Byte 24 */
+ enum {
+ DAC960_V1_BackgroundInitializationInvalid = 0x00,
+ DAC960_V1_BackgroundInitializationStarted = 0x02,
+ DAC960_V1_BackgroundInitializationInProgress = 0x04,
+ DAC960_V1_BackgroundInitializationSuspended = 0x05,
+ DAC960_V1_BackgroundInitializationCancelled = 0x06
+ } __attribute__ ((packed)) Status; /* Byte 25 */
+ unsigned char Reserved2[6]; /* Bytes 26-31 */
+}
+DAC960_V1_BackgroundInitializationStatus_T;
+
+
+/*
+ Define the DAC960 V1 Firmware Error Table Entry structure.
+*/
+
+typedef struct DAC960_V1_ErrorTableEntry
+{
+ unsigned char ParityErrorCount; /* Byte 0 */
+ unsigned char SoftErrorCount; /* Byte 1 */
+ unsigned char HardErrorCount; /* Byte 2 */
+ unsigned char MiscErrorCount; /* Byte 3 */
+}
+DAC960_V1_ErrorTableEntry_T;
+
+
+/*
+ Define the DAC960 V1 Firmware Get Error Table Command reply structure.
+*/
+
+typedef struct DAC960_V1_ErrorTable
+{
+ DAC960_V1_ErrorTableEntry_T
+ ErrorTableEntries[DAC960_V1_MaxChannels][DAC960_V1_MaxTargets];
+}
+DAC960_V1_ErrorTable_T;
+
+
+/*
+ Define the DAC960 V1 Firmware Read Config2 Command reply structure.
+*/
+
+typedef struct DAC960_V1_Config2
+{
+ unsigned char :1; /* Byte 0 Bit 0 */
+ bool ActiveNegationEnabled:1; /* Byte 0 Bit 1 */
+ unsigned char :5; /* Byte 0 Bits 2-6 */
+ bool NoRescanIfResetReceivedDuringScan:1; /* Byte 0 Bit 7 */
+ bool StorageWorksSupportEnabled:1; /* Byte 1 Bit 0 */
+ bool HewlettPackardSupportEnabled:1; /* Byte 1 Bit 1 */
+ bool NoDisconnectOnFirstCommand:1; /* Byte 1 Bit 2 */
+ unsigned char :2; /* Byte 1 Bits 3-4 */
+ bool AEMI_ARM:1; /* Byte 1 Bit 5 */
+ bool AEMI_OFM:1; /* Byte 1 Bit 6 */
+ unsigned char :1; /* Byte 1 Bit 7 */
+ enum {
+ DAC960_V1_OEMID_Mylex = 0x00,
+ DAC960_V1_OEMID_IBM = 0x08,
+ DAC960_V1_OEMID_HP = 0x0A,
+ DAC960_V1_OEMID_DEC = 0x0C,
+ DAC960_V1_OEMID_Siemens = 0x10,
+ DAC960_V1_OEMID_Intel = 0x12
+ } __attribute__ ((packed)) OEMID; /* Byte 2 */
+ unsigned char OEMModelNumber; /* Byte 3 */
+ unsigned char PhysicalSector; /* Byte 4 */
+ unsigned char LogicalSector; /* Byte 5 */
+ unsigned char BlockFactor; /* Byte 6 */
+ bool ReadAheadEnabled:1; /* Byte 7 Bit 0 */
+ bool LowBIOSDelay:1; /* Byte 7 Bit 1 */
+ unsigned char :2; /* Byte 7 Bits 2-3 */
+ bool ReassignRestrictedToOneSector:1; /* Byte 7 Bit 4 */
+ unsigned char :1; /* Byte 7 Bit 5 */
+ bool ForceUnitAccessDuringWriteRecovery:1; /* Byte 7 Bit 6 */
+ bool EnableLeftSymmetricRAID5Algorithm:1; /* Byte 7 Bit 7 */
+ unsigned char DefaultRebuildRate; /* Byte 8 */
+ unsigned char :8; /* Byte 9 */
+ unsigned char BlocksPerCacheLine; /* Byte 10 */
+ unsigned char BlocksPerStripe; /* Byte 11 */
+ struct {
+ enum {
+ DAC960_V1_Async = 0x0,
+ DAC960_V1_Sync_8MHz = 0x1,
+ DAC960_V1_Sync_5MHz = 0x2,
+ DAC960_V1_Sync_10or20MHz = 0x3 /* Byte 11 Bits 0-1 */
+ } __attribute__ ((packed)) Speed:2;
+ bool Force8Bit:1; /* Byte 11 Bit 2 */
+ bool DisableFast20:1; /* Byte 11 Bit 3 */
+ unsigned char :3; /* Byte 11 Bits 4-6 */
+ bool EnableTaggedQueuing:1; /* Byte 11 Bit 7 */
+ } __attribute__ ((packed)) ChannelParameters[6]; /* Bytes 12-17 */
+ unsigned char SCSIInitiatorID; /* Byte 18 */
+ unsigned char :8; /* Byte 19 */
+ enum {
+ DAC960_V1_StartupMode_ControllerSpinUp = 0x00,
+ DAC960_V1_StartupMode_PowerOnSpinUp = 0x01
+ } __attribute__ ((packed)) StartupMode; /* Byte 20 */
+ unsigned char SimultaneousDeviceSpinUpCount; /* Byte 21 */
+ unsigned char SecondsDelayBetweenSpinUps; /* Byte 22 */
+ unsigned char Reserved1[29]; /* Bytes 23-51 */
+ bool BIOSDisabled:1; /* Byte 52 Bit 0 */
+ bool CDROMBootEnabled:1; /* Byte 52 Bit 1 */
+ unsigned char :3; /* Byte 52 Bits 2-4 */
+ enum {
+ DAC960_V1_Geometry_128_32 = 0x0,
+ DAC960_V1_Geometry_255_63 = 0x1,
+ DAC960_V1_Geometry_Reserved1 = 0x2,
+ DAC960_V1_Geometry_Reserved2 = 0x3
+ } __attribute__ ((packed)) DriveGeometry:2; /* Byte 52 Bits 5-6 */
+ unsigned char :1; /* Byte 52 Bit 7 */
+ unsigned char Reserved2[9]; /* Bytes 53-61 */
+ unsigned short Checksum; /* Bytes 62-63 */
+}
+DAC960_V1_Config2_T;
+
+
+/*
+ Define the DAC960 V1 Firmware DCDB request structure.
+*/
+
+typedef struct DAC960_V1_DCDB
+{
+ unsigned char TargetID:4; /* Byte 0 Bits 0-3 */
+ unsigned char Channel:4; /* Byte 0 Bits 4-7 */
+ enum {
+ DAC960_V1_DCDB_NoDataTransfer = 0,
+ DAC960_V1_DCDB_DataTransferDeviceToSystem = 1,
+ DAC960_V1_DCDB_DataTransferSystemToDevice = 2,
+ DAC960_V1_DCDB_IllegalDataTransfer = 3
+ } __attribute__ ((packed)) Direction:2; /* Byte 1 Bits 0-1 */
+ bool EarlyStatus:1; /* Byte 1 Bit 2 */
+ unsigned char :1; /* Byte 1 Bit 3 */
+ enum {
+ DAC960_V1_DCDB_Timeout_24_hours = 0,
+ DAC960_V1_DCDB_Timeout_10_seconds = 1,
+ DAC960_V1_DCDB_Timeout_60_seconds = 2,
+ DAC960_V1_DCDB_Timeout_10_minutes = 3
+ } __attribute__ ((packed)) Timeout:2; /* Byte 1 Bits 4-5 */
+ bool NoAutomaticRequestSense:1; /* Byte 1 Bit 6 */
+ bool DisconnectPermitted:1; /* Byte 1 Bit 7 */
+ unsigned short TransferLength; /* Bytes 2-3 */
+ DAC960_BusAddress32_T BusAddress; /* Bytes 4-7 */
+ unsigned char CDBLength:4; /* Byte 8 Bits 0-3 */
+ unsigned char TransferLengthHigh4:4; /* Byte 8 Bits 4-7 */
+ unsigned char SenseLength; /* Byte 9 */
+ unsigned char CDB[12]; /* Bytes 10-21 */
+ unsigned char SenseData[64]; /* Bytes 22-85 */
+ unsigned char Status; /* Byte 86 */
+ unsigned char :8; /* Byte 87 */
+}
+DAC960_V1_DCDB_T;
+
+
+/*
+ Define the DAC960 V1 Firmware Scatter/Gather List Type 1 32 Bit Address
+ 32 Bit Byte Count structure.
+*/
+
+typedef struct DAC960_V1_ScatterGatherSegment
+{
+ DAC960_BusAddress32_T SegmentDataPointer; /* Bytes 0-3 */
+ DAC960_ByteCount32_T SegmentByteCount; /* Bytes 4-7 */
+}
+DAC960_V1_ScatterGatherSegment_T;
+
+
+/*
+ Define the 13 Byte DAC960 V1 Firmware Command Mailbox structure. Bytes 13-15
+ are not used. The Command Mailbox structure is padded to 16 bytes for
+ efficient access.
+*/
+
+typedef union DAC960_V1_CommandMailbox
+{
+ unsigned int Words[4]; /* Words 0-3 */
+ unsigned char Bytes[16]; /* Bytes 0-15 */
+ struct {
+ DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */
+ DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */
+ unsigned char Dummy[14]; /* Bytes 2-15 */
+ } __attribute__ ((packed)) Common;
+ struct {
+ DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */
+ DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */
+ unsigned char Dummy1[6]; /* Bytes 2-7 */
+ DAC960_BusAddress32_T BusAddress; /* Bytes 8-11 */
+ unsigned char Dummy2[4]; /* Bytes 12-15 */
+ } __attribute__ ((packed)) Type3;
+ struct {
+ DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */
+ DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */
+ unsigned char CommandOpcode2; /* Byte 2 */
+ unsigned char Dummy1[5]; /* Bytes 3-7 */
+ DAC960_BusAddress32_T BusAddress; /* Bytes 8-11 */
+ unsigned char Dummy2[4]; /* Bytes 12-15 */
+ } __attribute__ ((packed)) Type3B;
+ struct {
+ DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */
+ DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */
+ unsigned char Dummy1[5]; /* Bytes 2-6 */
+ unsigned char LogicalDriveNumber:6; /* Byte 7 Bits 0-6 */
+ bool AutoRestore:1; /* Byte 7 Bit 7 */
+ unsigned char Dummy2[8]; /* Bytes 8-15 */
+ } __attribute__ ((packed)) Type3C;
+ struct {
+ DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */
+ DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */
+ unsigned char Channel; /* Byte 2 */
+ unsigned char TargetID; /* Byte 3 */
+ DAC960_V1_PhysicalDeviceState_T DeviceState:5; /* Byte 4 Bits 0-4 */
+ unsigned char Modifier:3; /* Byte 4 Bits 5-7 */
+ unsigned char Dummy1[3]; /* Bytes 5-7 */
+ DAC960_BusAddress32_T BusAddress; /* Bytes 8-11 */
+ unsigned char Dummy2[4]; /* Bytes 12-15 */
+ } __attribute__ ((packed)) Type3D;
+ struct {
+ DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */
+ DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */
+ DAC960_V1_PerformEventLogOpType_T OperationType; /* Byte 2 */
+ unsigned char OperationQualifier; /* Byte 3 */
+ unsigned short SequenceNumber; /* Bytes 4-5 */
+ unsigned char Dummy1[2]; /* Bytes 6-7 */
+ DAC960_BusAddress32_T BusAddress; /* Bytes 8-11 */
+ unsigned char Dummy2[4]; /* Bytes 12-15 */
+ } __attribute__ ((packed)) Type3E;
+ struct {
+ DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */
+ DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */
+ unsigned char Dummy1[2]; /* Bytes 2-3 */
+ unsigned char RebuildRateConstant; /* Byte 4 */
+ unsigned char Dummy2[3]; /* Bytes 5-7 */
+ DAC960_BusAddress32_T BusAddress; /* Bytes 8-11 */
+ unsigned char Dummy3[4]; /* Bytes 12-15 */
+ } __attribute__ ((packed)) Type3R;
+ struct {
+ DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */
+ DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */
+ unsigned short TransferLength; /* Bytes 2-3 */
+ unsigned int LogicalBlockAddress; /* Bytes 4-7 */
+ DAC960_BusAddress32_T BusAddress; /* Bytes 8-11 */
+ unsigned char LogicalDriveNumber; /* Byte 12 */
+ unsigned char Dummy[3]; /* Bytes 13-15 */
+ } __attribute__ ((packed)) Type4;
+ struct {
+ DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */
+ DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */
+ struct {
+ unsigned short TransferLength:11; /* Bytes 2-3 */
+ unsigned char LogicalDriveNumber:5; /* Byte 3 Bits 3-7 */
+ } __attribute__ ((packed)) LD;
+ unsigned int LogicalBlockAddress; /* Bytes 4-7 */
+ DAC960_BusAddress32_T BusAddress; /* Bytes 8-11 */
+ unsigned char ScatterGatherCount:6; /* Byte 12 Bits 0-5 */
+ enum {
+ DAC960_V1_ScatterGather_32BitAddress_32BitByteCount = 0x0,
+ DAC960_V1_ScatterGather_32BitAddress_16BitByteCount = 0x1,
+ DAC960_V1_ScatterGather_32BitByteCount_32BitAddress = 0x2,
+ DAC960_V1_ScatterGather_16BitByteCount_32BitAddress = 0x3
+ } __attribute__ ((packed)) ScatterGatherType:2; /* Byte 12 Bits 6-7 */
+ unsigned char Dummy[3]; /* Bytes 13-15 */
+ } __attribute__ ((packed)) Type5;
+ struct {
+ DAC960_V1_CommandOpcode_T CommandOpcode; /* Byte 0 */
+ DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 1 */
+ unsigned char CommandOpcode2; /* Byte 2 */
+ unsigned char :8; /* Byte 3 */
+ DAC960_BusAddress32_T CommandMailboxesBusAddress; /* Bytes 4-7 */
+ DAC960_BusAddress32_T StatusMailboxesBusAddress; /* Bytes 8-11 */
+ unsigned char Dummy[4]; /* Bytes 12-15 */
+ } __attribute__ ((packed)) TypeX;
+}
+DAC960_V1_CommandMailbox_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Command Opcodes.
+*/
+
+typedef enum
+{
+ DAC960_V2_MemCopy = 0x01,
+ DAC960_V2_SCSI_10_Passthru = 0x02,
+ DAC960_V2_SCSI_255_Passthru = 0x03,
+ DAC960_V2_SCSI_10 = 0x04,
+ DAC960_V2_SCSI_256 = 0x05,
+ DAC960_V2_IOCTL = 0x20
+}
+__attribute__ ((packed))
+DAC960_V2_CommandOpcode_T;
+
+
+/*
+ Define the DAC960 V2 Firmware IOCTL Opcodes.
+*/
+
+typedef enum
+{
+ DAC960_V2_GetControllerInfo = 0x01,
+ DAC960_V2_GetLogicalDeviceInfoValid = 0x03,
+ DAC960_V2_GetPhysicalDeviceInfoValid = 0x05,
+ DAC960_V2_GetHealthStatus = 0x11,
+ DAC960_V2_GetEvent = 0x15,
+ DAC960_V2_StartDiscovery = 0x81,
+ DAC960_V2_SetDeviceState = 0x82,
+ DAC960_V2_RebuildDeviceStart = 0x88,
+ DAC960_V2_RebuildDeviceStop = 0x89,
+ DAC960_V2_ConsistencyCheckStart = 0x8C,
+ DAC960_V2_ConsistencyCheckStop = 0x8D,
+ DAC960_V2_SetMemoryMailbox = 0x8E,
+ DAC960_V2_PauseDevice = 0x92,
+ DAC960_V2_TranslatePhysicalToLogicalDevice = 0xC5
+}
+__attribute__ ((packed))
+DAC960_V2_IOCTL_Opcode_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Command Identifier type.
+*/
+
+typedef unsigned short DAC960_V2_CommandIdentifier_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Command Status Codes.
+*/
+
+#define DAC960_V2_NormalCompletion 0x00
+#define DAC960_V2_AbormalCompletion 0x02
+#define DAC960_V2_DeviceBusy 0x08
+#define DAC960_V2_DeviceNonresponsive 0x0E
+#define DAC960_V2_DeviceNonresponsive2 0x0F
+#define DAC960_V2_DeviceRevervationConflict 0x18
+
+typedef unsigned char DAC960_V2_CommandStatus_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Memory Type structure.
+*/
+
+typedef struct DAC960_V2_MemoryType
+{
+ enum {
+ DAC960_V2_MemoryType_Reserved = 0x00,
+ DAC960_V2_MemoryType_DRAM = 0x01,
+ DAC960_V2_MemoryType_EDRAM = 0x02,
+ DAC960_V2_MemoryType_EDO = 0x03,
+ DAC960_V2_MemoryType_SDRAM = 0x04,
+ DAC960_V2_MemoryType_Last = 0x1F
+ } __attribute__ ((packed)) MemoryType:5; /* Byte 0 Bits 0-4 */
+ bool :1; /* Byte 0 Bit 5 */
+ bool MemoryParity:1; /* Byte 0 Bit 6 */
+ bool MemoryECC:1; /* Byte 0 Bit 7 */
+}
+DAC960_V2_MemoryType_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Processor Type structure.
+*/
+
+typedef enum
+{
+ DAC960_V2_ProcessorType_i960CA = 0x01,
+ DAC960_V2_ProcessorType_i960RD = 0x02,
+ DAC960_V2_ProcessorType_i960RN = 0x03,
+ DAC960_V2_ProcessorType_i960RP = 0x04,
+ DAC960_V2_ProcessorType_NorthBay = 0x05,
+ DAC960_V2_ProcessorType_StrongArm = 0x06,
+ DAC960_V2_ProcessorType_i960RM = 0x07
+}
+__attribute__ ((packed))
+DAC960_V2_ProcessorType_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Get Controller Info reply structure.
+*/
+
+typedef struct DAC960_V2_ControllerInfo
+{
+ unsigned char :8; /* Byte 0 */
+ enum {
+ DAC960_V2_SCSI_Bus = 0x00,
+ DAC960_V2_Fibre_Bus = 0x01,
+ DAC960_V2_PCI_Bus = 0x03
+ } __attribute__ ((packed)) BusInterfaceType; /* Byte 1 */
+ enum {
+ DAC960_V2_DAC960E = 0x01,
+ DAC960_V2_DAC960M = 0x08,
+ DAC960_V2_DAC960PD = 0x10,
+ DAC960_V2_DAC960PL = 0x11,
+ DAC960_V2_DAC960PU = 0x12,
+ DAC960_V2_DAC960PE = 0x13,
+ DAC960_V2_DAC960PG = 0x14,
+ DAC960_V2_DAC960PJ = 0x15,
+ DAC960_V2_DAC960PTL0 = 0x16,
+ DAC960_V2_DAC960PR = 0x17,
+ DAC960_V2_DAC960PRL = 0x18,
+ DAC960_V2_DAC960PT = 0x19,
+ DAC960_V2_DAC1164P = 0x1A,
+ DAC960_V2_DAC960PTL1 = 0x1B,
+ DAC960_V2_EXR2000P = 0x1C,
+ DAC960_V2_EXR3000P = 0x1D,
+ DAC960_V2_AcceleRAID352 = 0x1E,
+ DAC960_V2_AcceleRAID170 = 0x1F,
+ DAC960_V2_AcceleRAID160 = 0x20,
+ DAC960_V2_DAC960S = 0x60,
+ DAC960_V2_DAC960SU = 0x61,
+ DAC960_V2_DAC960SX = 0x62,
+ DAC960_V2_DAC960SF = 0x63,
+ DAC960_V2_DAC960SS = 0x64,
+ DAC960_V2_DAC960FL = 0x65,
+ DAC960_V2_DAC960LL = 0x66,
+ DAC960_V2_DAC960FF = 0x67,
+ DAC960_V2_DAC960HP = 0x68,
+ DAC960_V2_RAIDBRICK = 0x69,
+ DAC960_V2_METEOR_FL = 0x6A,
+ DAC960_V2_METEOR_FF = 0x6B
+ } __attribute__ ((packed)) ControllerType; /* Byte 2 */
+ unsigned char :8; /* Byte 3 */
+ unsigned short BusInterfaceSpeedMHz; /* Bytes 4-5 */
+ unsigned char BusWidthBits; /* Byte 6 */
+ unsigned char FlashCodeTypeOrProductID; /* Byte 7 */
+ unsigned char NumberOfHostPortsPresent; /* Byte 8 */
+ unsigned char Reserved1[7]; /* Bytes 9-15 */
+ unsigned char BusInterfaceName[16]; /* Bytes 16-31 */
+ unsigned char ControllerName[16]; /* Bytes 32-47 */
+ unsigned char Reserved2[16]; /* Bytes 48-63 */
+ /* Firmware Release Information */
+ unsigned char FirmwareMajorVersion; /* Byte 64 */
+ unsigned char FirmwareMinorVersion; /* Byte 65 */
+ unsigned char FirmwareTurnNumber; /* Byte 66 */
+ unsigned char FirmwareBuildNumber; /* Byte 67 */
+ unsigned char FirmwareReleaseDay; /* Byte 68 */
+ unsigned char FirmwareReleaseMonth; /* Byte 69 */
+ unsigned char FirmwareReleaseYearHigh2Digits; /* Byte 70 */
+ unsigned char FirmwareReleaseYearLow2Digits; /* Byte 71 */
+ /* Hardware Release Information */
+ unsigned char HardwareRevision; /* Byte 72 */
+ unsigned int :24; /* Bytes 73-75 */
+ unsigned char HardwareReleaseDay; /* Byte 76 */
+ unsigned char HardwareReleaseMonth; /* Byte 77 */
+ unsigned char HardwareReleaseYearHigh2Digits; /* Byte 78 */
+ unsigned char HardwareReleaseYearLow2Digits; /* Byte 79 */
+ /* Hardware Manufacturing Information */
+ unsigned char ManufacturingBatchNumber; /* Byte 80 */
+ unsigned char :8; /* Byte 81 */
+ unsigned char ManufacturingPlantNumber; /* Byte 82 */
+ unsigned char :8; /* Byte 83 */
+ unsigned char HardwareManufacturingDay; /* Byte 84 */
+ unsigned char HardwareManufacturingMonth; /* Byte 85 */
+ unsigned char HardwareManufacturingYearHigh2Digits; /* Byte 86 */
+ unsigned char HardwareManufacturingYearLow2Digits; /* Byte 87 */
+ unsigned char MaximumNumberOfPDDperXLD; /* Byte 88 */
+ unsigned char MaximumNumberOfILDperXLD; /* Byte 89 */
+ unsigned short NonvolatileMemorySizeKB; /* Bytes 90-91 */
+ unsigned char MaximumNumberOfXLD; /* Byte 92 */
+ unsigned int :24; /* Bytes 93-95 */
+ /* Unique Information per Controller */
+ unsigned char ControllerSerialNumber[16]; /* Bytes 96-111 */
+ unsigned char Reserved3[16]; /* Bytes 112-127 */
+ /* Vendor Information */
+ unsigned int :24; /* Bytes 128-130 */
+ unsigned char OEM_Code; /* Byte 131 */
+ unsigned char VendorName[16]; /* Bytes 132-147 */
+ /* Other Physical/Controller/Operation Information */
+ bool BBU_Present:1; /* Byte 148 Bit 0 */
+ bool ActiveActiveClusteringMode:1; /* Byte 148 Bit 1 */
+ unsigned char :6; /* Byte 148 Bits 2-7 */
+ unsigned char :8; /* Byte 149 */
+ unsigned short :16; /* Bytes 150-151 */
+ /* Physical Device Scan Information */
+ bool PhysicalScanActive:1; /* Byte 152 Bit 0 */
+ unsigned char :7; /* Byte 152 Bits 1-7 */
+ unsigned char PhysicalDeviceChannelNumber; /* Byte 153 */
+ unsigned char PhysicalDeviceTargetID; /* Byte 154 */
+ unsigned char PhysicalDeviceLogicalUnit; /* Byte 155 */
+ /* Maximum Command Data Transfer Sizes */
+ unsigned short MaximumDataTransferSizeInBlocks; /* Bytes 156-157 */
+ unsigned short MaximumScatterGatherEntries; /* Bytes 158-159 */
+ /* Logical/Physical Device Counts */
+ unsigned short LogicalDevicesPresent; /* Bytes 160-161 */
+ unsigned short LogicalDevicesCritical; /* Bytes 162-163 */
+ unsigned short LogicalDevicesOffline; /* Bytes 164-165 */
+ unsigned short PhysicalDevicesPresent; /* Bytes 166-167 */
+ unsigned short PhysicalDisksPresent; /* Bytes 168-169 */
+ unsigned short PhysicalDisksCritical; /* Bytes 170-171 */
+ unsigned short PhysicalDisksOffline; /* Bytes 172-173 */
+ unsigned short MaximumParallelCommands; /* Bytes 174-175 */
+ /* Channel and Target ID Information */
+ unsigned char NumberOfPhysicalChannelsPresent; /* Byte 176 */
+ unsigned char NumberOfVirtualChannelsPresent; /* Byte 177 */
+ unsigned char NumberOfPhysicalChannelsPossible; /* Byte 178 */
+ unsigned char NumberOfVirtualChannelsPossible; /* Byte 179 */
+ unsigned char MaximumTargetsPerChannel[16]; /* Bytes 180-195 */
+ unsigned char Reserved4[12]; /* Bytes 196-207 */
+ /* Memory/Cache Information */
+ unsigned short MemorySizeMB; /* Bytes 208-209 */
+ unsigned short CacheSizeMB; /* Bytes 210-211 */
+ unsigned int ValidCacheSizeInBytes; /* Bytes 212-215 */
+ unsigned int DirtyCacheSizeInBytes; /* Bytes 216-219 */
+ unsigned short MemorySpeedMHz; /* Bytes 220-221 */
+ unsigned char MemoryDataWidthBits; /* Byte 222 */
+ DAC960_V2_MemoryType_T MemoryType; /* Byte 223 */
+ unsigned char CacheMemoryTypeName[16]; /* Bytes 224-239 */
+ /* Execution Memory Information */
+ unsigned short ExecutionMemorySizeMB; /* Bytes 240-241 */
+ unsigned short ExecutionL2CacheSizeMB; /* Bytes 242-243 */
+ unsigned char Reserved5[8]; /* Bytes 244-251 */
+ unsigned short ExecutionMemorySpeedMHz; /* Bytes 252-253 */
+ unsigned char ExecutionMemoryDataWidthBits; /* Byte 254 */
+ DAC960_V2_MemoryType_T ExecutionMemoryType; /* Byte 255 */
+ unsigned char ExecutionMemoryTypeName[16]; /* Bytes 256-271 */
+ /* First CPU Type Information */
+ unsigned short FirstProcessorSpeedMHz; /* Bytes 272-273 */
+ DAC960_V2_ProcessorType_T FirstProcessorType; /* Byte 274 */
+ unsigned char FirstProcessorCount; /* Byte 275 */
+ unsigned char Reserved6[12]; /* Bytes 276-287 */
+ unsigned char FirstProcessorName[16]; /* Bytes 288-303 */
+ /* Second CPU Type Information */
+ unsigned short SecondProcessorSpeedMHz; /* Bytes 304-305 */
+ DAC960_V2_ProcessorType_T SecondProcessorType; /* Byte 306 */
+ unsigned char SecondProcessorCount; /* Byte 307 */
+ unsigned char Reserved7[12]; /* Bytes 308-319 */
+ unsigned char SecondProcessorName[16]; /* Bytes 320-335 */
+ /* Debugging/Profiling/Command Time Tracing Information */
+ unsigned short CurrentProfilingDataPageNumber; /* Bytes 336-337 */
+ unsigned short ProgramsAwaitingProfilingData; /* Bytes 338-339 */
+ unsigned short CurrentCommandTimeTraceDataPageNumber; /* Bytes 340-341 */
+ unsigned short ProgramsAwaitingCommandTimeTraceData; /* Bytes 342-343 */
+ unsigned char Reserved8[8]; /* Bytes 344-351 */
+ /* Error Counters on Physical Devices */
+ unsigned short PhysicalDeviceBusResets; /* Bytes 352-353 */
+ unsigned short PhysicalDeviceParityErrors; /* Bytes 355-355 */
+ unsigned short PhysicalDeviceSoftErrors; /* Bytes 356-357 */
+ unsigned short PhysicalDeviceCommandsFailed; /* Bytes 358-359 */
+ unsigned short PhysicalDeviceMiscellaneousErrors; /* Bytes 360-361 */
+ unsigned short PhysicalDeviceCommandTimeouts; /* Bytes 362-363 */
+ unsigned short PhysicalDeviceSelectionTimeouts; /* Bytes 364-365 */
+ unsigned short PhysicalDeviceRetriesDone; /* Bytes 366-367 */
+ unsigned short PhysicalDeviceAbortsDone; /* Bytes 368-369 */
+ unsigned short PhysicalDeviceHostCommandAbortsDone; /* Bytes 370-371 */
+ unsigned short PhysicalDevicePredictedFailuresDetected; /* Bytes 372-373 */
+ unsigned short PhysicalDeviceHostCommandsFailed; /* Bytes 374-375 */
+ unsigned short PhysicalDeviceHardErrors; /* Bytes 376-377 */
+ unsigned char Reserved9[6]; /* Bytes 378-383 */
+ /* Error Counters on Logical Devices */
+ unsigned short LogicalDeviceSoftErrors; /* Bytes 384-385 */
+ unsigned short LogicalDeviceCommandsFailed; /* Bytes 386-387 */
+ unsigned short LogicalDeviceHostCommandAbortsDone; /* Bytes 388-389 */
+ unsigned short :16; /* Bytes 390-391 */
+ /* Error Counters on Controller */
+ unsigned short ControllerMemoryErrors; /* Bytes 392-393 */
+ unsigned short ControllerHostCommandAbortsDone; /* Bytes 394-395 */
+ unsigned int :32; /* Bytes 396-399 */
+ /* Long Duration Activity Information */
+ unsigned short BackgroundInitializationsActive; /* Bytes 400-401 */
+ unsigned short LogicalDeviceInitializationsActive; /* Bytes 402-403 */
+ unsigned short PhysicalDeviceInitializationsActive; /* Bytes 404-405 */
+ unsigned short ConsistencyChecksActive; /* Bytes 406-407 */
+ unsigned short RebuildsActive; /* Bytes 408-409 */
+ unsigned short OnlineExpansionsActive; /* Bytes 410-411 */
+ unsigned short PatrolActivitiesActive; /* Bytes 412-413 */
+ unsigned short :16; /* Bytes 414-415 */
+ /* Flash ROM Information */
+ unsigned char FlashType; /* Byte 416 */
+ unsigned char :8; /* Byte 417 */
+ unsigned short FlashSizeMB; /* Bytes 418-419 */
+ unsigned int FlashLimit; /* Bytes 420-423 */
+ unsigned int FlashCount; /* Bytes 424-427 */
+ unsigned int :32; /* Bytes 428-431 */
+ unsigned char FlashTypeName[16]; /* Bytes 432-447 */
+ /* Firmware Run Time Information */
+ unsigned char RebuildRate; /* Byte 448 */
+ unsigned char BackgroundInitializationRate; /* Byte 449 */
+ unsigned char ForegroundInitializationRate; /* Byte 450 */
+ unsigned char ConsistencyCheckRate; /* Byte 451 */
+ unsigned int :32; /* Bytes 452-455 */
+ unsigned int MaximumDP; /* Bytes 456-459 */
+ unsigned int FreeDP; /* Bytes 460-463 */
+ unsigned int MaximumIOP; /* Bytes 464-467 */
+ unsigned int FreeIOP; /* Bytes 468-471 */
+ unsigned short MaximumCombLengthInBlocks; /* Bytes 472-473 */
+ unsigned short NumberOfConfigurationGroups; /* Bytes 474-475 */
+ bool InstallationAbortStatus:1; /* Byte 476 Bit 0 */
+ bool MaintenanceModeStatus:1; /* Byte 476 Bit 1 */
+ unsigned int :24; /* Bytes 476-479 */
+ unsigned char Reserved10[32]; /* Bytes 480-511 */
+ unsigned char Reserved11[512]; /* Bytes 512-1023 */
+}
+DAC960_V2_ControllerInfo_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Logical Device State type.
+*/
+
+typedef enum
+{
+ DAC960_V2_LogicalDevice_Online = 0x01,
+ DAC960_V2_LogicalDevice_Offline = 0x08,
+ DAC960_V2_LogicalDevice_Critical = 0x09
+}
+__attribute__ ((packed))
+DAC960_V2_LogicalDeviceState_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Get Logical Device Info reply structure.
+*/
+
+typedef struct DAC960_V2_LogicalDeviceInfo
+{
+ unsigned char :8; /* Byte 0 */
+ unsigned char Channel; /* Byte 1 */
+ unsigned char TargetID; /* Byte 2 */
+ unsigned char LogicalUnit; /* Byte 3 */
+ DAC960_V2_LogicalDeviceState_T LogicalDeviceState; /* Byte 4 */
+ unsigned char RAIDLevel; /* Byte 5 */
+ unsigned char StripeSize; /* Byte 6 */
+ unsigned char CacheLineSize; /* Byte 7 */
+ struct {
+ enum {
+ DAC960_V2_ReadCacheDisabled = 0x0,
+ DAC960_V2_ReadCacheEnabled = 0x1,
+ DAC960_V2_ReadAheadEnabled = 0x2,
+ DAC960_V2_IntelligentReadAheadEnabled = 0x3,
+ DAC960_V2_ReadCache_Last = 0x7
+ } __attribute__ ((packed)) ReadCache:3; /* Byte 8 Bits 0-2 */
+ enum {
+ DAC960_V2_WriteCacheDisabled = 0x0,
+ DAC960_V2_LogicalDeviceReadOnly = 0x1,
+ DAC960_V2_WriteCacheEnabled = 0x2,
+ DAC960_V2_IntelligentWriteCacheEnabled = 0x3,
+ DAC960_V2_WriteCache_Last = 0x7
+ } __attribute__ ((packed)) WriteCache:3; /* Byte 8 Bits 3-5 */
+ bool :1; /* Byte 8 Bit 6 */
+ bool LogicalDeviceInitialized:1; /* Byte 8 Bit 7 */
+ } LogicalDeviceControl; /* Byte 8 */
+ /* Logical Device Operations Status */
+ bool ConsistencyCheckInProgress:1; /* Byte 9 Bit 0 */
+ bool RebuildInProgress:1; /* Byte 9 Bit 1 */
+ bool BackgroundInitializationInProgress:1; /* Byte 9 Bit 2 */
+ bool ForegroundInitializationInProgress:1; /* Byte 9 Bit 3 */
+ bool DataMigrationInProgress:1; /* Byte 9 Bit 4 */
+ bool PatrolOperationInProgress:1; /* Byte 9 Bit 5 */
+ unsigned char :2; /* Byte 9 Bits 6-7 */
+ unsigned char RAID5WriteUpdate; /* Byte 10 */
+ unsigned char RAID5Algorithm; /* Byte 11 */
+ unsigned short LogicalDeviceNumber; /* Bytes 12-13 */
+ /* BIOS Info */
+ bool BIOSDisabled:1; /* Byte 14 Bit 0 */
+ bool CDROMBootEnabled:1; /* Byte 14 Bit 1 */
+ bool DriveCoercionEnabled:1; /* Byte 14 Bit 2 */
+ bool WriteSameDisabled:1; /* Byte 14 Bit 3 */
+ bool HBA_ModeEnabled:1; /* Byte 14 Bit 4 */
+ enum {
+ DAC960_V2_Geometry_128_32 = 0x0,
+ DAC960_V2_Geometry_255_63 = 0x1,
+ DAC960_V2_Geometry_Reserved1 = 0x2,
+ DAC960_V2_Geometry_Reserved2 = 0x3
+ } __attribute__ ((packed)) DriveGeometry:2; /* Byte 14 Bits 5-6 */
+ bool SuperReadAheadEnabled:1; /* Byte 14 Bit 7 */
+ unsigned char :8; /* Byte 15 */
+ /* Error Counters */
+ unsigned short SoftErrors; /* Bytes 16-17 */
+ unsigned short CommandsFailed; /* Bytes 18-19 */
+ unsigned short HostCommandAbortsDone; /* Bytes 20-21 */
+ unsigned short DeferredWriteErrors; /* Bytes 22-23 */
+ unsigned int :32; /* Bytes 24-27 */
+ unsigned int :32; /* Bytes 28-31 */
+ /* Device Size Information */
+ unsigned short :16; /* Bytes 32-33 */
+ unsigned short DeviceBlockSizeInBytes; /* Bytes 34-35 */
+ unsigned int OriginalDeviceSize; /* Bytes 36-39 */
+ unsigned int ConfigurableDeviceSize; /* Bytes 40-43 */
+ unsigned int :32; /* Bytes 44-47 */
+ unsigned char LogicalDeviceName[32]; /* Bytes 48-79 */
+ unsigned char SCSI_InquiryData[36]; /* Bytes 80-115 */
+ unsigned char Reserved1[12]; /* Bytes 116-127 */
+ DAC960_ByteCount64_T LastReadBlockNumber; /* Bytes 128-135 */
+ DAC960_ByteCount64_T LastWrittenBlockNumber; /* Bytes 136-143 */
+ DAC960_ByteCount64_T ConsistencyCheckBlockNumber; /* Bytes 144-151 */
+ DAC960_ByteCount64_T RebuildBlockNumber; /* Bytes 152-159 */
+ DAC960_ByteCount64_T BackgroundInitializationBlockNumber; /* Bytes 160-167 */
+ DAC960_ByteCount64_T ForegroundInitializationBlockNumber; /* Bytes 168-175 */
+ DAC960_ByteCount64_T DataMigrationBlockNumber; /* Bytes 176-183 */
+ DAC960_ByteCount64_T PatrolOperationBlockNumber; /* Bytes 184-191 */
+ unsigned char Reserved2[64]; /* Bytes 192-255 */
+}
+DAC960_V2_LogicalDeviceInfo_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Physical Device State type.
+*/
+
+typedef enum
+{
+ DAC960_V2_Device_Unconfigured = 0x00,
+ DAC960_V2_Device_Online = 0x01,
+ DAC960_V2_Device_Rebuild = 0x03,
+ DAC960_V2_Device_Missing = 0x04,
+ DAC960_V2_Device_Critical = 0x05,
+ DAC960_V2_Device_Dead = 0x08,
+ DAC960_V2_Device_SuspectedDead = 0x0C,
+ DAC960_V2_Device_CommandedOffline = 0x10,
+ DAC960_V2_Device_Standby = 0x21,
+ DAC960_V2_Device_InvalidState = 0xFF
+}
+__attribute__ ((packed))
+DAC960_V2_PhysicalDeviceState_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Get Physical Device Info reply structure.
+*/
+
+typedef struct DAC960_V2_PhysicalDeviceInfo
+{
+ unsigned char :8; /* Byte 0 */
+ unsigned char Channel; /* Byte 1 */
+ unsigned char TargetID; /* Byte 2 */
+ unsigned char LogicalUnit; /* Byte 3 */
+ /* Configuration Status Bits */
+ bool PhysicalDeviceFaultTolerant:1; /* Byte 4 Bit 0 */
+ bool PhysicalDeviceConnected:1; /* Byte 4 Bit 1 */
+ bool PhysicalDeviceLocalToController:1; /* Byte 4 Bit 2 */
+ unsigned char :5; /* Byte 4 Bits 3-7 */
+ /* Multiple Host/Controller Status Bits */
+ bool RemoteHostSystemDead:1; /* Byte 5 Bit 0 */
+ bool RemoteControllerDead:1; /* Byte 5 Bit 1 */
+ unsigned char :6; /* Byte 5 Bits 2-7 */
+ DAC960_V2_PhysicalDeviceState_T PhysicalDeviceState; /* Byte 6 */
+ unsigned char NegotiatedDataWidthBits; /* Byte 7 */
+ unsigned short NegotiatedSynchronousMegaTransfers; /* Bytes 8-9 */
+ /* Multiported Physical Device Information */
+ unsigned char NumberOfPortConnections; /* Byte 10 */
+ unsigned char DriveAccessibilityBitmap; /* Byte 11 */
+ unsigned int :32; /* Bytes 12-15 */
+ unsigned char NetworkAddress[16]; /* Bytes 16-31 */
+ unsigned short MaximumTags; /* Bytes 32-33 */
+ /* Physical Device Operations Status */
+ bool ConsistencyCheckInProgress:1; /* Byte 34 Bit 0 */
+ bool RebuildInProgress:1; /* Byte 34 Bit 1 */
+ bool MakingDataConsistentInProgress:1; /* Byte 34 Bit 2 */
+ bool PhysicalDeviceInitializationInProgress:1; /* Byte 34 Bit 3 */
+ bool DataMigrationInProgress:1; /* Byte 34 Bit 4 */
+ bool PatrolOperationInProgress:1; /* Byte 34 Bit 5 */
+ unsigned char :2; /* Byte 34 Bits 6-7 */
+ unsigned char LongOperationStatus; /* Byte 35 */
+ unsigned char ParityErrors; /* Byte 36 */
+ unsigned char SoftErrors; /* Byte 37 */
+ unsigned char HardErrors; /* Byte 38 */
+ unsigned char MiscellaneousErrors; /* Byte 39 */
+ unsigned char CommandTimeouts; /* Byte 40 */
+ unsigned char Retries; /* Byte 41 */
+ unsigned char Aborts; /* Byte 42 */
+ unsigned char PredictedFailuresDetected; /* Byte 43 */
+ unsigned int :32; /* Bytes 44-47 */
+ unsigned short :16; /* Bytes 48-49 */
+ unsigned short DeviceBlockSizeInBytes; /* Bytes 50-51 */
+ unsigned int OriginalDeviceSize; /* Bytes 52-55 */
+ unsigned int ConfigurableDeviceSize; /* Bytes 56-59 */
+ unsigned int :32; /* Bytes 60-63 */
+ unsigned char PhysicalDeviceName[16]; /* Bytes 64-79 */
+ unsigned char Reserved1[16]; /* Bytes 80-95 */
+ unsigned char Reserved2[32]; /* Bytes 96-127 */
+ unsigned char SCSI_InquiryData[36]; /* Bytes 128-163 */
+ unsigned char Reserved3[20]; /* Bytes 164-183 */
+ unsigned char Reserved4[8]; /* Bytes 184-191 */
+ DAC960_ByteCount64_T LastReadBlockNumber; /* Bytes 192-199 */
+ DAC960_ByteCount64_T LastWrittenBlockNumber; /* Bytes 200-207 */
+ DAC960_ByteCount64_T ConsistencyCheckBlockNumber; /* Bytes 208-215 */
+ DAC960_ByteCount64_T RebuildBlockNumber; /* Bytes 216-223 */
+ DAC960_ByteCount64_T MakingDataConsistentBlockNumber; /* Bytes 224-231 */
+ DAC960_ByteCount64_T DeviceInitializationBlockNumber; /* Bytes 232-239 */
+ DAC960_ByteCount64_T DataMigrationBlockNumber; /* Bytes 240-247 */
+ DAC960_ByteCount64_T PatrolOperationBlockNumber; /* Bytes 248-255 */
+ unsigned char Reserved5[256]; /* Bytes 256-511 */
+}
+DAC960_V2_PhysicalDeviceInfo_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Health Status Buffer structure.
+*/
+
+typedef struct DAC960_V2_HealthStatusBuffer
+{
+ unsigned int MicrosecondsFromControllerStartTime; /* Bytes 0-3 */
+ unsigned int MillisecondsFromControllerStartTime; /* Bytes 4-7 */
+ unsigned int SecondsFrom1January1970; /* Bytes 8-11 */
+ unsigned int :32; /* Bytes 12-15 */
+ unsigned int StatusChangeCounter; /* Bytes 16-19 */
+ unsigned int :32; /* Bytes 20-23 */
+ unsigned int DebugOutputMessageBufferIndex; /* Bytes 24-27 */
+ unsigned int CodedMessageBufferIndex; /* Bytes 28-31 */
+ unsigned int CurrentTimeTracePageNumber; /* Bytes 32-35 */
+ unsigned int CurrentProfilerPageNumber; /* Bytes 36-39 */
+ unsigned int NextEventSequenceNumber; /* Bytes 40-43 */
+ unsigned int :32; /* Bytes 44-47 */
+ unsigned char Reserved1[16]; /* Bytes 48-63 */
+ unsigned char Reserved2[64]; /* Bytes 64-127 */
+}
+DAC960_V2_HealthStatusBuffer_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Get Event reply structure.
+*/
+
+typedef struct DAC960_V2_Event
+{
+ unsigned int EventSequenceNumber; /* Bytes 0-3 */
+ unsigned int EventTime; /* Bytes 4-7 */
+ unsigned int EventCode; /* Bytes 8-11 */
+ unsigned char :8; /* Byte 12 */
+ unsigned char Channel; /* Byte 13 */
+ unsigned char TargetID; /* Byte 14 */
+ unsigned char LogicalUnit; /* Byte 15 */
+ unsigned int :32; /* Bytes 16-19 */
+ unsigned int EventSpecificParameter; /* Bytes 20-23 */
+ unsigned char RequestSenseData[40]; /* Bytes 24-63 */
+}
+DAC960_V2_Event_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Command Control Bits structure.
+*/
+
+typedef struct DAC960_V2_CommandControlBits
+{
+ bool ForceUnitAccess:1; /* Byte 0 Bit 0 */
+ bool DisablePageOut:1; /* Byte 0 Bit 1 */
+ bool :1; /* Byte 0 Bit 2 */
+ bool AdditionalScatterGatherListMemory:1; /* Byte 0 Bit 3 */
+ bool DataTransferControllerToHost:1; /* Byte 0 Bit 4 */
+ bool :1; /* Byte 0 Bit 5 */
+ bool NoAutoRequestSense:1; /* Byte 0 Bit 6 */
+ bool DisconnectProhibited:1; /* Byte 0 Bit 7 */
+}
+DAC960_V2_CommandControlBits_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Command Timeout structure.
+*/
+
+typedef struct DAC960_V2_CommandTimeout
+{
+ unsigned char TimeoutValue:6; /* Byte 0 Bits 0-5 */
+ enum {
+ DAC960_V2_TimeoutScale_Seconds = 0,
+ DAC960_V2_TimeoutScale_Minutes = 1,
+ DAC960_V2_TimeoutScale_Hours = 2,
+ DAC960_V2_TimeoutScale_Reserved = 3
+ } __attribute__ ((packed)) TimeoutScale:2; /* Byte 0 Bits 6-7 */
+}
+DAC960_V2_CommandTimeout_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Physical Device structure.
+*/
+
+typedef struct DAC960_V2_PhysicalDevice
+{
+ unsigned char LogicalUnit; /* Byte 0 */
+ unsigned char TargetID; /* Byte 1 */
+ unsigned char Channel:3; /* Byte 2 Bits 0-2 */
+ unsigned char Controller:5; /* Byte 2 Bits 3-7 */
+}
+__attribute__ ((packed))
+DAC960_V2_PhysicalDevice_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Logical Device structure.
+*/
+
+typedef struct DAC960_V2_LogicalDevice
+{
+ unsigned short LogicalDeviceNumber; /* Bytes 0-1 */
+ unsigned char :3; /* Byte 2 Bits 0-2 */
+ unsigned char Controller:5; /* Byte 2 Bits 3-7 */
+}
+__attribute__ ((packed))
+DAC960_V2_LogicalDevice_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Operation Device type.
+*/
+
+typedef enum
+{
+ DAC960_V2_Physical_Device = 0x00,
+ DAC960_V2_RAID_Device = 0x01,
+ DAC960_V2_Physical_Channel = 0x02,
+ DAC960_V2_RAID_Channel = 0x03,
+ DAC960_V2_Physical_Controller = 0x04,
+ DAC960_V2_RAID_Controller = 0x05,
+ DAC960_V2_Configuration_Group = 0x10,
+ DAC960_V2_Enclosure = 0x11
+}
+__attribute__ ((packed))
+DAC960_V2_OperationDevice_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Translate Physical To Logical Device structure.
+*/
+
+typedef struct DAC960_V2_PhysicalToLogicalDevice
+{
+ unsigned short LogicalDeviceNumber; /* Bytes 0-1 */
+ unsigned short :16; /* Bytes 2-3 */
+ unsigned char PreviousBootController; /* Byte 4 */
+ unsigned char PreviousBootChannel; /* Byte 5 */
+ unsigned char PreviousBootTargetID; /* Byte 6 */
+ unsigned char PreviousBootLogicalUnit; /* Byte 7 */
+}
+DAC960_V2_PhysicalToLogicalDevice_T;
+
+
+
+/*
+ Define the DAC960 V2 Firmware Scatter/Gather List Entry structure.
+*/
+
+typedef struct DAC960_V2_ScatterGatherSegment
+{
+ DAC960_BusAddress64_T SegmentDataPointer; /* Bytes 0-7 */
+ DAC960_ByteCount64_T SegmentByteCount; /* Bytes 8-15 */
+}
+DAC960_V2_ScatterGatherSegment_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Data Transfer Memory Address structure.
+*/
+
+typedef union DAC960_V2_DataTransferMemoryAddress
+{
+ DAC960_V2_ScatterGatherSegment_T ScatterGatherSegments[2]; /* Bytes 0-31 */
+ struct {
+ unsigned short ScatterGatherList0Length; /* Bytes 0-1 */
+ unsigned short ScatterGatherList1Length; /* Bytes 2-3 */
+ unsigned short ScatterGatherList2Length; /* Bytes 4-5 */
+ unsigned short :16; /* Bytes 6-7 */
+ DAC960_BusAddress64_T ScatterGatherList0Address; /* Bytes 8-15 */
+ DAC960_BusAddress64_T ScatterGatherList1Address; /* Bytes 16-23 */
+ DAC960_BusAddress64_T ScatterGatherList2Address; /* Bytes 24-31 */
+ } ExtendedScatterGather;
+}
+DAC960_V2_DataTransferMemoryAddress_T;
+
+
+/*
+ Define the 64 Byte DAC960 V2 Firmware Command Mailbox structure.
+*/
+
+typedef union DAC960_V2_CommandMailbox
+{
+ unsigned int Words[16]; /* Words 0-15 */
+ struct {
+ DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */
+ DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */
+ DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */
+ DAC960_ByteCount32_T DataTransferSize:24; /* Bytes 4-6 */
+ unsigned char DataTransferPageNumber; /* Byte 7 */
+ DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */
+ unsigned int :24; /* Bytes 16-18 */
+ DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */
+ unsigned char RequestSenseSize; /* Byte 20 */
+ unsigned char IOCTL_Opcode; /* Byte 21 */
+ unsigned char Reserved[10]; /* Bytes 22-31 */
+ DAC960_V2_DataTransferMemoryAddress_T
+ DataTransferMemoryAddress; /* Bytes 32-63 */
+ } Common;
+ struct {
+ DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */
+ DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */
+ DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */
+ DAC960_ByteCount32_T DataTransferSize; /* Bytes 4-7 */
+ DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */
+ DAC960_V2_PhysicalDevice_T PhysicalDevice; /* Bytes 16-18 */
+ DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */
+ unsigned char RequestSenseSize; /* Byte 20 */
+ unsigned char CDBLength; /* Byte 21 */
+ unsigned char SCSI_CDB[10]; /* Bytes 22-31 */
+ DAC960_V2_DataTransferMemoryAddress_T
+ DataTransferMemoryAddress; /* Bytes 32-63 */
+ } SCSI_10;
+ struct {
+ DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */
+ DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */
+ DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */
+ DAC960_ByteCount32_T DataTransferSize; /* Bytes 4-7 */
+ DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */
+ DAC960_V2_PhysicalDevice_T PhysicalDevice; /* Bytes 16-18 */
+ DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */
+ unsigned char RequestSenseSize; /* Byte 20 */
+ unsigned char CDBLength; /* Byte 21 */
+ unsigned short :16; /* Bytes 22-23 */
+ DAC960_BusAddress64_T SCSI_CDB_BusAddress; /* Bytes 24-31 */
+ DAC960_V2_DataTransferMemoryAddress_T
+ DataTransferMemoryAddress; /* Bytes 32-63 */
+ } SCSI_255;
+ struct {
+ DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */
+ DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */
+ DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */
+ DAC960_ByteCount32_T DataTransferSize:24; /* Bytes 4-6 */
+ unsigned char DataTransferPageNumber; /* Byte 7 */
+ DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */
+ unsigned short :16; /* Bytes 16-17 */
+ unsigned char ControllerNumber; /* Byte 18 */
+ DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */
+ unsigned char RequestSenseSize; /* Byte 20 */
+ unsigned char IOCTL_Opcode; /* Byte 21 */
+ unsigned char Reserved[10]; /* Bytes 22-31 */
+ DAC960_V2_DataTransferMemoryAddress_T
+ DataTransferMemoryAddress; /* Bytes 32-63 */
+ } ControllerInfo;
+ struct {
+ DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */
+ DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */
+ DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */
+ DAC960_ByteCount32_T DataTransferSize:24; /* Bytes 4-6 */
+ unsigned char DataTransferPageNumber; /* Byte 7 */
+ DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */
+ DAC960_V2_LogicalDevice_T LogicalDevice; /* Bytes 16-18 */
+ DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */
+ unsigned char RequestSenseSize; /* Byte 20 */
+ unsigned char IOCTL_Opcode; /* Byte 21 */
+ unsigned char Reserved[10]; /* Bytes 22-31 */
+ DAC960_V2_DataTransferMemoryAddress_T
+ DataTransferMemoryAddress; /* Bytes 32-63 */
+ } LogicalDeviceInfo;
+ struct {
+ DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */
+ DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */
+ DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */
+ DAC960_ByteCount32_T DataTransferSize:24; /* Bytes 4-6 */
+ unsigned char DataTransferPageNumber; /* Byte 7 */
+ DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */
+ DAC960_V2_PhysicalDevice_T PhysicalDevice; /* Bytes 16-18 */
+ DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */
+ unsigned char RequestSenseSize; /* Byte 20 */
+ unsigned char IOCTL_Opcode; /* Byte 21 */
+ unsigned char Reserved[10]; /* Bytes 22-31 */
+ DAC960_V2_DataTransferMemoryAddress_T
+ DataTransferMemoryAddress; /* Bytes 32-63 */
+ } PhysicalDeviceInfo;
+ struct {
+ DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */
+ DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */
+ DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */
+ DAC960_ByteCount32_T DataTransferSize:24; /* Bytes 4-6 */
+ unsigned char DataTransferPageNumber; /* Byte 7 */
+ DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */
+ unsigned short EventSequenceNumberHigh16; /* Bytes 16-17 */
+ unsigned char ControllerNumber; /* Byte 18 */
+ DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */
+ unsigned char RequestSenseSize; /* Byte 20 */
+ unsigned char IOCTL_Opcode; /* Byte 21 */
+ unsigned short EventSequenceNumberLow16; /* Bytes 22-23 */
+ unsigned char Reserved[8]; /* Bytes 24-31 */
+ DAC960_V2_DataTransferMemoryAddress_T
+ DataTransferMemoryAddress; /* Bytes 32-63 */
+ } GetEvent;
+ struct {
+ DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */
+ DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */
+ DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */
+ DAC960_ByteCount32_T DataTransferSize:24; /* Bytes 4-6 */
+ unsigned char DataTransferPageNumber; /* Byte 7 */
+ DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */
+ DAC960_V2_LogicalDevice_T LogicalDevice; /* Bytes 16-18 */
+ DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */
+ unsigned char RequestSenseSize; /* Byte 20 */
+ unsigned char IOCTL_Opcode; /* Byte 21 */
+ union {
+ DAC960_V2_LogicalDeviceState_T LogicalDeviceState;
+ DAC960_V2_PhysicalDeviceState_T PhysicalDeviceState;
+ } DeviceState; /* Byte 22 */
+ unsigned char Reserved[9]; /* Bytes 23-31 */
+ DAC960_V2_DataTransferMemoryAddress_T
+ DataTransferMemoryAddress; /* Bytes 32-63 */
+ } SetDeviceState;
+ struct {
+ DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */
+ DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */
+ DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */
+ DAC960_ByteCount32_T DataTransferSize:24; /* Bytes 4-6 */
+ unsigned char DataTransferPageNumber; /* Byte 7 */
+ DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */
+ DAC960_V2_LogicalDevice_T LogicalDevice; /* Bytes 16-18 */
+ DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */
+ unsigned char RequestSenseSize; /* Byte 20 */
+ unsigned char IOCTL_Opcode; /* Byte 21 */
+ bool RestoreConsistency:1; /* Byte 22 Bit 0 */
+ bool InitializedAreaOnly:1; /* Byte 22 Bit 1 */
+ unsigned char :6; /* Byte 22 Bits 2-7 */
+ unsigned char Reserved[9]; /* Bytes 23-31 */
+ DAC960_V2_DataTransferMemoryAddress_T
+ DataTransferMemoryAddress; /* Bytes 32-63 */
+ } ConsistencyCheck;
+ struct {
+ DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */
+ DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */
+ DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */
+ unsigned char FirstCommandMailboxSizeKB; /* Byte 4 */
+ unsigned char FirstStatusMailboxSizeKB; /* Byte 5 */
+ unsigned char SecondCommandMailboxSizeKB; /* Byte 6 */
+ unsigned char SecondStatusMailboxSizeKB; /* Byte 7 */
+ DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */
+ unsigned int :24; /* Bytes 16-18 */
+ DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */
+ unsigned char RequestSenseSize; /* Byte 20 */
+ unsigned char IOCTL_Opcode; /* Byte 21 */
+ unsigned char HealthStatusBufferSizeKB; /* Byte 22 */
+ unsigned char :8; /* Byte 23 */
+ DAC960_BusAddress64_T HealthStatusBufferBusAddress; /* Bytes 24-31 */
+ DAC960_BusAddress64_T FirstCommandMailboxBusAddress; /* Bytes 32-39 */
+ DAC960_BusAddress64_T FirstStatusMailboxBusAddress; /* Bytes 40-47 */
+ DAC960_BusAddress64_T SecondCommandMailboxBusAddress; /* Bytes 48-55 */
+ DAC960_BusAddress64_T SecondStatusMailboxBusAddress; /* Bytes 56-63 */
+ } SetMemoryMailbox;
+ struct {
+ DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */
+ DAC960_V2_CommandOpcode_T CommandOpcode; /* Byte 2 */
+ DAC960_V2_CommandControlBits_T CommandControlBits; /* Byte 3 */
+ DAC960_ByteCount32_T DataTransferSize:24; /* Bytes 4-6 */
+ unsigned char DataTransferPageNumber; /* Byte 7 */
+ DAC960_BusAddress64_T RequestSenseBusAddress; /* Bytes 8-15 */
+ DAC960_V2_PhysicalDevice_T PhysicalDevice; /* Bytes 16-18 */
+ DAC960_V2_CommandTimeout_T CommandTimeout; /* Byte 19 */
+ unsigned char RequestSenseSize; /* Byte 20 */
+ unsigned char IOCTL_Opcode; /* Byte 21 */
+ DAC960_V2_OperationDevice_T OperationDevice; /* Byte 22 */
+ unsigned char Reserved[9]; /* Bytes 23-31 */
+ DAC960_V2_DataTransferMemoryAddress_T
+ DataTransferMemoryAddress; /* Bytes 32-63 */
+ } DeviceOperation;
+}
+DAC960_V2_CommandMailbox_T;
+
+
+/*
+ Define the DAC960 Driver IOCTL requests.
+*/
+
+#define DAC960_IOCTL_GET_CONTROLLER_COUNT 0xDAC001
+#define DAC960_IOCTL_GET_CONTROLLER_INFO 0xDAC002
+#define DAC960_IOCTL_V1_EXECUTE_COMMAND 0xDAC003
+#define DAC960_IOCTL_V2_EXECUTE_COMMAND 0xDAC004
+#define DAC960_IOCTL_V2_GET_HEALTH_STATUS 0xDAC005
+
+
+/*
+ Define the DAC960_IOCTL_GET_CONTROLLER_INFO reply structure.
+*/
+
+typedef struct DAC960_ControllerInfo
+{
+ unsigned char ControllerNumber;
+ unsigned char FirmwareType;
+ unsigned char Channels;
+ unsigned char Targets;
+ unsigned char PCI_Bus;
+ unsigned char PCI_Device;
+ unsigned char PCI_Function;
+ unsigned char IRQ_Channel;
+ DAC960_PCI_Address_T PCI_Address;
+ unsigned char ModelName[20];
+ unsigned char FirmwareVersion[12];
+}
+DAC960_ControllerInfo_T;
+
+
+/*
+ Define the User Mode DAC960_IOCTL_V1_EXECUTE_COMMAND request structure.
+*/
+
+typedef struct DAC960_V1_UserCommand
+{
+ unsigned char ControllerNumber;
+ DAC960_V1_CommandMailbox_T CommandMailbox;
+ int DataTransferLength;
+ void __user *DataTransferBuffer;
+ DAC960_V1_DCDB_T __user *DCDB;
+}
+DAC960_V1_UserCommand_T;
+
+
+/*
+ Define the Kernel Mode DAC960_IOCTL_V1_EXECUTE_COMMAND request structure.
+*/
+
+typedef struct DAC960_V1_KernelCommand
+{
+ unsigned char ControllerNumber;
+ DAC960_V1_CommandMailbox_T CommandMailbox;
+ int DataTransferLength;
+ void *DataTransferBuffer;
+ DAC960_V1_DCDB_T *DCDB;
+ DAC960_V1_CommandStatus_T CommandStatus;
+ void (*CompletionFunction)(struct DAC960_V1_KernelCommand *);
+ void *CompletionData;
+}
+DAC960_V1_KernelCommand_T;
+
+
+/*
+ Define the User Mode DAC960_IOCTL_V2_EXECUTE_COMMAND request structure.
+*/
+
+typedef struct DAC960_V2_UserCommand
+{
+ unsigned char ControllerNumber;
+ DAC960_V2_CommandMailbox_T CommandMailbox;
+ int DataTransferLength;
+ int RequestSenseLength;
+ void __user *DataTransferBuffer;
+ void __user *RequestSenseBuffer;
+}
+DAC960_V2_UserCommand_T;
+
+
+/*
+ Define the Kernel Mode DAC960_IOCTL_V2_EXECUTE_COMMAND request structure.
+*/
+
+typedef struct DAC960_V2_KernelCommand
+{
+ unsigned char ControllerNumber;
+ DAC960_V2_CommandMailbox_T CommandMailbox;
+ int DataTransferLength;
+ int RequestSenseLength;
+ void *DataTransferBuffer;
+ void *RequestSenseBuffer;
+ DAC960_V2_CommandStatus_T CommandStatus;
+ void (*CompletionFunction)(struct DAC960_V2_KernelCommand *);
+ void *CompletionData;
+}
+DAC960_V2_KernelCommand_T;
+
+
+/*
+ Define the User Mode DAC960_IOCTL_V2_GET_HEALTH_STATUS request structure.
+*/
+
+typedef struct DAC960_V2_GetHealthStatus
+{
+ unsigned char ControllerNumber;
+ DAC960_V2_HealthStatusBuffer_T __user *HealthStatusBuffer;
+}
+DAC960_V2_GetHealthStatus_T;
+
+
+/*
+ Import the Kernel Mode IOCTL interface.
+*/
+
+extern int DAC960_KernelIOCTL(unsigned int Request, void *Argument);
+
+
+/*
+ DAC960_DriverVersion protects the private portion of this file.
+*/
+
+#ifdef DAC960_DriverVersion
+
+
+/*
+ Define the maximum Driver Queue Depth and Controller Queue Depth supported
+ by DAC960 V1 and V2 Firmware Controllers.
+*/
+
+#define DAC960_MaxDriverQueueDepth 511
+#define DAC960_MaxControllerQueueDepth 512
+
+
+/*
+ Define the maximum number of Scatter/Gather Segments supported for any
+ DAC960 V1 and V2 Firmware controller.
+*/
+
+#define DAC960_V1_ScatterGatherLimit 33
+#define DAC960_V2_ScatterGatherLimit 128
+
+
+/*
+ Define the number of Command Mailboxes and Status Mailboxes used by the
+ DAC960 V1 and V2 Firmware Memory Mailbox Interface.
+*/
+
+#define DAC960_V1_CommandMailboxCount 256
+#define DAC960_V1_StatusMailboxCount 1024
+#define DAC960_V2_CommandMailboxCount 512
+#define DAC960_V2_StatusMailboxCount 512
+
+
+/*
+ Define the DAC960 Controller Monitoring Timer Interval.
+*/
+
+#define DAC960_MonitoringTimerInterval (10 * HZ)
+
+
+/*
+ Define the DAC960 Controller Secondary Monitoring Interval.
+*/
+
+#define DAC960_SecondaryMonitoringInterval (60 * HZ)
+
+
+/*
+ Define the DAC960 Controller Health Status Monitoring Interval.
+*/
+
+#define DAC960_HealthStatusMonitoringInterval (1 * HZ)
+
+
+/*
+ Define the DAC960 Controller Progress Reporting Interval.
+*/
+
+#define DAC960_ProgressReportingInterval (60 * HZ)
+
+
+/*
+ Define the maximum number of Partitions allowed for each Logical Drive.
+*/
+
+#define DAC960_MaxPartitions 8
+#define DAC960_MaxPartitionsBits 3
+
+/*
+ Define the DAC960 Controller fixed Block Size and Block Size Bits.
+*/
+
+#define DAC960_BlockSize 512
+#define DAC960_BlockSizeBits 9
+
+
+/*
+ Define the number of Command structures that should be allocated as a
+ group to optimize kernel memory allocation.
+*/
+
+#define DAC960_V1_CommandAllocationGroupSize 11
+#define DAC960_V2_CommandAllocationGroupSize 29
+
+
+/*
+ Define the Controller Line Buffer, Progress Buffer, User Message, and
+ Initial Status Buffer sizes.
+*/
+
+#define DAC960_LineBufferSize 100
+#define DAC960_ProgressBufferSize 200
+#define DAC960_UserMessageSize 200
+#define DAC960_InitialStatusBufferSize (8192-32)
+
+
+/*
+ Define the DAC960 Controller Firmware Types.
+*/
+
+typedef enum
+{
+ DAC960_V1_Controller = 1,
+ DAC960_V2_Controller = 2
+}
+DAC960_FirmwareType_T;
+
+
+/*
+ Define the DAC960 Controller Hardware Types.
+*/
+
+typedef enum
+{
+ DAC960_BA_Controller = 1, /* eXtremeRAID 2000 */
+ DAC960_LP_Controller = 2, /* AcceleRAID 352 */
+ DAC960_LA_Controller = 3, /* DAC1164P */
+ DAC960_PG_Controller = 4, /* DAC960PTL/PJ/PG */
+ DAC960_PD_Controller = 5, /* DAC960PU/PD/PL/P */
+ DAC960_P_Controller = 6, /* DAC960PU/PD/PL/P */
+ DAC960_GEM_Controller = 7, /* AcceleRAID 4/5/600 */
+}
+DAC960_HardwareType_T;
+
+
+/*
+ Define the Driver Message Levels.
+*/
+
+typedef enum DAC960_MessageLevel
+{
+ DAC960_AnnounceLevel = 0,
+ DAC960_InfoLevel = 1,
+ DAC960_NoticeLevel = 2,
+ DAC960_WarningLevel = 3,
+ DAC960_ErrorLevel = 4,
+ DAC960_ProgressLevel = 5,
+ DAC960_CriticalLevel = 6,
+ DAC960_UserCriticalLevel = 7
+}
+DAC960_MessageLevel_T;
+
+static char
+ *DAC960_MessageLevelMap[] =
+ { KERN_NOTICE, KERN_NOTICE, KERN_NOTICE, KERN_WARNING,
+ KERN_ERR, KERN_CRIT, KERN_CRIT, KERN_CRIT };
+
+
+/*
+ Define Driver Message macros.
+*/
+
+#define DAC960_Announce(Format, Arguments...) \
+ DAC960_Message(DAC960_AnnounceLevel, Format, ##Arguments)
+
+#define DAC960_Info(Format, Arguments...) \
+ DAC960_Message(DAC960_InfoLevel, Format, ##Arguments)
+
+#define DAC960_Notice(Format, Arguments...) \
+ DAC960_Message(DAC960_NoticeLevel, Format, ##Arguments)
+
+#define DAC960_Warning(Format, Arguments...) \
+ DAC960_Message(DAC960_WarningLevel, Format, ##Arguments)
+
+#define DAC960_Error(Format, Arguments...) \
+ DAC960_Message(DAC960_ErrorLevel, Format, ##Arguments)
+
+#define DAC960_Progress(Format, Arguments...) \
+ DAC960_Message(DAC960_ProgressLevel, Format, ##Arguments)
+
+#define DAC960_Critical(Format, Arguments...) \
+ DAC960_Message(DAC960_CriticalLevel, Format, ##Arguments)
+
+#define DAC960_UserCritical(Format, Arguments...) \
+ DAC960_Message(DAC960_UserCriticalLevel, Format, ##Arguments)
+
+
+struct DAC960_privdata {
+ DAC960_HardwareType_T HardwareType;
+ DAC960_FirmwareType_T FirmwareType;
+ irq_handler_t InterruptHandler;
+ unsigned int MemoryWindowSize;
+};
+
+
+/*
+ Define the DAC960 V1 Firmware Controller Status Mailbox structure.
+*/
+
+typedef union DAC960_V1_StatusMailbox
+{
+ unsigned int Word; /* Word 0 */
+ struct {
+ DAC960_V1_CommandIdentifier_T CommandIdentifier; /* Byte 0 */
+ unsigned char :7; /* Byte 1 Bits 0-6 */
+ bool Valid:1; /* Byte 1 Bit 7 */
+ DAC960_V1_CommandStatus_T CommandStatus; /* Bytes 2-3 */
+ } Fields;
+}
+DAC960_V1_StatusMailbox_T;
+
+
+/*
+ Define the DAC960 V2 Firmware Controller Status Mailbox structure.
+*/
+
+typedef union DAC960_V2_StatusMailbox
+{
+ unsigned int Words[2]; /* Words 0-1 */
+ struct {
+ DAC960_V2_CommandIdentifier_T CommandIdentifier; /* Bytes 0-1 */
+ DAC960_V2_CommandStatus_T CommandStatus; /* Byte 2 */
+ unsigned char RequestSenseLength; /* Byte 3 */
+ int DataTransferResidue; /* Bytes 4-7 */
+ } Fields;
+}
+DAC960_V2_StatusMailbox_T;
+
+
+/*
+ Define the DAC960 Driver Command Types.
+*/
+
+typedef enum
+{
+ DAC960_ReadCommand = 1,
+ DAC960_WriteCommand = 2,
+ DAC960_ReadRetryCommand = 3,
+ DAC960_WriteRetryCommand = 4,
+ DAC960_MonitoringCommand = 5,
+ DAC960_ImmediateCommand = 6,
+ DAC960_QueuedCommand = 7
+}
+DAC960_CommandType_T;
+
+
+/*
+ Define the DAC960 Driver Command structure.
+*/
+
+typedef struct DAC960_Command
+{
+ int CommandIdentifier;
+ DAC960_CommandType_T CommandType;
+ struct DAC960_Controller *Controller;
+ struct DAC960_Command *Next;
+ struct completion *Completion;
+ unsigned int LogicalDriveNumber;
+ unsigned int BlockNumber;
+ unsigned int BlockCount;
+ unsigned int SegmentCount;
+ int DmaDirection;
+ struct scatterlist *cmd_sglist;
+ struct request *Request;
+ union {
+ struct {
+ DAC960_V1_CommandMailbox_T CommandMailbox;
+ DAC960_V1_KernelCommand_T *KernelCommand;
+ DAC960_V1_CommandStatus_T CommandStatus;
+ DAC960_V1_ScatterGatherSegment_T *ScatterGatherList;
+ dma_addr_t ScatterGatherListDMA;
+ struct scatterlist ScatterList[DAC960_V1_ScatterGatherLimit];
+ unsigned int EndMarker[0];
+ } V1;
+ struct {
+ DAC960_V2_CommandMailbox_T CommandMailbox;
+ DAC960_V2_KernelCommand_T *KernelCommand;
+ DAC960_V2_CommandStatus_T CommandStatus;
+ unsigned char RequestSenseLength;
+ int DataTransferResidue;
+ DAC960_V2_ScatterGatherSegment_T *ScatterGatherList;
+ dma_addr_t ScatterGatherListDMA;
+ DAC960_SCSI_RequestSense_T *RequestSense;
+ dma_addr_t RequestSenseDMA;
+ struct scatterlist ScatterList[DAC960_V2_ScatterGatherLimit];
+ unsigned int EndMarker[0];
+ } V2;
+ } FW;
+}
+DAC960_Command_T;
+
+
+/*
+ Define the DAC960 Driver Controller structure.
+*/
+
+typedef struct DAC960_Controller
+{
+ void __iomem *BaseAddress;
+ void __iomem *MemoryMappedAddress;
+ DAC960_FirmwareType_T FirmwareType;
+ DAC960_HardwareType_T HardwareType;
+ DAC960_IO_Address_T IO_Address;
+ DAC960_PCI_Address_T PCI_Address;
+ struct pci_dev *PCIDevice;
+ unsigned char ControllerNumber;
+ unsigned char ControllerName[4];
+ unsigned char ModelName[20];
+ unsigned char FullModelName[28];
+ unsigned char FirmwareVersion[12];
+ unsigned char Bus;
+ unsigned char Device;
+ unsigned char Function;
+ unsigned char IRQ_Channel;
+ unsigned char Channels;
+ unsigned char Targets;
+ unsigned char MemorySize;
+ unsigned char LogicalDriveCount;
+ unsigned short CommandAllocationGroupSize;
+ unsigned short ControllerQueueDepth;
+ unsigned short DriverQueueDepth;
+ unsigned short MaxBlocksPerCommand;
+ unsigned short ControllerScatterGatherLimit;
+ unsigned short DriverScatterGatherLimit;
+ u64 BounceBufferLimit;
+ unsigned int CombinedStatusBufferLength;
+ unsigned int InitialStatusLength;
+ unsigned int CurrentStatusLength;
+ unsigned int ProgressBufferLength;
+ unsigned int UserStatusLength;
+ struct dma_loaf DmaPages;
+ unsigned long MonitoringTimerCount;
+ unsigned long PrimaryMonitoringTime;
+ unsigned long SecondaryMonitoringTime;
+ unsigned long ShutdownMonitoringTimer;
+ unsigned long LastProgressReportTime;
+ unsigned long LastCurrentStatusTime;
+ bool ControllerInitialized;
+ bool MonitoringCommandDeferred;
+ bool EphemeralProgressMessage;
+ bool DriveSpinUpMessageDisplayed;
+ bool MonitoringAlertMode;
+ bool SuppressEnclosureMessages;
+ struct timer_list MonitoringTimer;
+ struct gendisk *disks[DAC960_MaxLogicalDrives];
+ struct pci_pool *ScatterGatherPool;
+ DAC960_Command_T *FreeCommands;
+ unsigned char *CombinedStatusBuffer;
+ unsigned char *CurrentStatusBuffer;
+ struct request_queue *RequestQueue[DAC960_MaxLogicalDrives];
+ int req_q_index;
+ spinlock_t queue_lock;
+ wait_queue_head_t CommandWaitQueue;
+ wait_queue_head_t HealthStatusWaitQueue;
+ DAC960_Command_T InitialCommand;
+ DAC960_Command_T *Commands[DAC960_MaxDriverQueueDepth];
+ struct proc_dir_entry *ControllerProcEntry;
+ bool LogicalDriveInitiallyAccessible[DAC960_MaxLogicalDrives];
+ void (*QueueCommand)(DAC960_Command_T *Command);
+ bool (*ReadControllerConfiguration)(struct DAC960_Controller *);
+ bool (*ReadDeviceConfiguration)(struct DAC960_Controller *);
+ bool (*ReportDeviceConfiguration)(struct DAC960_Controller *);
+ void (*QueueReadWriteCommand)(DAC960_Command_T *Command);
+ union {
+ struct {
+ unsigned char GeometryTranslationHeads;
+ unsigned char GeometryTranslationSectors;
+ unsigned char PendingRebuildFlag;
+ unsigned short StripeSize;
+ unsigned short SegmentSize;
+ unsigned short NewEventLogSequenceNumber;
+ unsigned short OldEventLogSequenceNumber;
+ unsigned short DeviceStateChannel;
+ unsigned short DeviceStateTargetID;
+ bool DualModeMemoryMailboxInterface;
+ bool BackgroundInitializationStatusSupported;
+ bool SAFTE_EnclosureManagementEnabled;
+ bool NeedLogicalDriveInformation;
+ bool NeedErrorTableInformation;
+ bool NeedDeviceStateInformation;
+ bool NeedDeviceInquiryInformation;
+ bool NeedDeviceSerialNumberInformation;
+ bool NeedRebuildProgress;
+ bool NeedConsistencyCheckProgress;
+ bool NeedBackgroundInitializationStatus;
+ bool StartDeviceStateScan;
+ bool RebuildProgressFirst;
+ bool RebuildFlagPending;
+ bool RebuildStatusPending;
+
+ dma_addr_t FirstCommandMailboxDMA;
+ DAC960_V1_CommandMailbox_T *FirstCommandMailbox;
+ DAC960_V1_CommandMailbox_T *LastCommandMailbox;
+ DAC960_V1_CommandMailbox_T *NextCommandMailbox;
+ DAC960_V1_CommandMailbox_T *PreviousCommandMailbox1;
+ DAC960_V1_CommandMailbox_T *PreviousCommandMailbox2;
+
+ dma_addr_t FirstStatusMailboxDMA;
+ DAC960_V1_StatusMailbox_T *FirstStatusMailbox;
+ DAC960_V1_StatusMailbox_T *LastStatusMailbox;
+ DAC960_V1_StatusMailbox_T *NextStatusMailbox;
+
+ DAC960_V1_DCDB_T *MonitoringDCDB;
+ dma_addr_t MonitoringDCDB_DMA;
+
+ DAC960_V1_Enquiry_T Enquiry;
+ DAC960_V1_Enquiry_T *NewEnquiry;
+ dma_addr_t NewEnquiryDMA;
+
+ DAC960_V1_ErrorTable_T ErrorTable;
+ DAC960_V1_ErrorTable_T *NewErrorTable;
+ dma_addr_t NewErrorTableDMA;
+
+ DAC960_V1_EventLogEntry_T *EventLogEntry;
+ dma_addr_t EventLogEntryDMA;
+
+ DAC960_V1_RebuildProgress_T *RebuildProgress;
+ dma_addr_t RebuildProgressDMA;
+ DAC960_V1_CommandStatus_T LastRebuildStatus;
+ DAC960_V1_CommandStatus_T PendingRebuildStatus;
+
+ DAC960_V1_LogicalDriveInformationArray_T LogicalDriveInformation;
+ DAC960_V1_LogicalDriveInformationArray_T *NewLogicalDriveInformation;
+ dma_addr_t NewLogicalDriveInformationDMA;
+
+ DAC960_V1_BackgroundInitializationStatus_T
+ *BackgroundInitializationStatus;
+ dma_addr_t BackgroundInitializationStatusDMA;
+ DAC960_V1_BackgroundInitializationStatus_T
+ LastBackgroundInitializationStatus;
+
+ DAC960_V1_DeviceState_T
+ DeviceState[DAC960_V1_MaxChannels][DAC960_V1_MaxTargets];
+ DAC960_V1_DeviceState_T *NewDeviceState;
+ dma_addr_t NewDeviceStateDMA;
+
+ DAC960_SCSI_Inquiry_T
+ InquiryStandardData[DAC960_V1_MaxChannels][DAC960_V1_MaxTargets];
+ DAC960_SCSI_Inquiry_T *NewInquiryStandardData;
+ dma_addr_t NewInquiryStandardDataDMA;
+
+ DAC960_SCSI_Inquiry_UnitSerialNumber_T
+ InquiryUnitSerialNumber[DAC960_V1_MaxChannels][DAC960_V1_MaxTargets];
+ DAC960_SCSI_Inquiry_UnitSerialNumber_T *NewInquiryUnitSerialNumber;
+ dma_addr_t NewInquiryUnitSerialNumberDMA;
+
+ int DeviceResetCount[DAC960_V1_MaxChannels][DAC960_V1_MaxTargets];
+ bool DirectCommandActive[DAC960_V1_MaxChannels][DAC960_V1_MaxTargets];
+ } V1;
+ struct {
+ unsigned int StatusChangeCounter;
+ unsigned int NextEventSequenceNumber;
+ unsigned int PhysicalDeviceIndex;
+ bool NeedLogicalDeviceInformation;
+ bool NeedPhysicalDeviceInformation;
+ bool NeedDeviceSerialNumberInformation;
+ bool StartLogicalDeviceInformationScan;
+ bool StartPhysicalDeviceInformationScan;
+ struct pci_pool *RequestSensePool;
+
+ dma_addr_t FirstCommandMailboxDMA;
+ DAC960_V2_CommandMailbox_T *FirstCommandMailbox;
+ DAC960_V2_CommandMailbox_T *LastCommandMailbox;
+ DAC960_V2_CommandMailbox_T *NextCommandMailbox;
+ DAC960_V2_CommandMailbox_T *PreviousCommandMailbox1;
+ DAC960_V2_CommandMailbox_T *PreviousCommandMailbox2;
+
+ dma_addr_t FirstStatusMailboxDMA;
+ DAC960_V2_StatusMailbox_T *FirstStatusMailbox;
+ DAC960_V2_StatusMailbox_T *LastStatusMailbox;
+ DAC960_V2_StatusMailbox_T *NextStatusMailbox;
+
+ dma_addr_t HealthStatusBufferDMA;
+ DAC960_V2_HealthStatusBuffer_T *HealthStatusBuffer;
+
+ DAC960_V2_ControllerInfo_T ControllerInformation;
+ DAC960_V2_ControllerInfo_T *NewControllerInformation;
+ dma_addr_t NewControllerInformationDMA;
+
+ DAC960_V2_LogicalDeviceInfo_T
+ *LogicalDeviceInformation[DAC960_MaxLogicalDrives];
+ DAC960_V2_LogicalDeviceInfo_T *NewLogicalDeviceInformation;
+ dma_addr_t NewLogicalDeviceInformationDMA;
+
+ DAC960_V2_PhysicalDeviceInfo_T
+ *PhysicalDeviceInformation[DAC960_V2_MaxPhysicalDevices];
+ DAC960_V2_PhysicalDeviceInfo_T *NewPhysicalDeviceInformation;
+ dma_addr_t NewPhysicalDeviceInformationDMA;
+
+ DAC960_SCSI_Inquiry_UnitSerialNumber_T *NewInquiryUnitSerialNumber;
+ dma_addr_t NewInquiryUnitSerialNumberDMA;
+ DAC960_SCSI_Inquiry_UnitSerialNumber_T
+ *InquiryUnitSerialNumber[DAC960_V2_MaxPhysicalDevices];
+
+ DAC960_V2_Event_T *Event;
+ dma_addr_t EventDMA;
+
+ DAC960_V2_PhysicalToLogicalDevice_T *PhysicalToLogicalDevice;
+ dma_addr_t PhysicalToLogicalDeviceDMA;
+
+ DAC960_V2_PhysicalDevice_T
+ LogicalDriveToVirtualDevice[DAC960_MaxLogicalDrives];
+ bool LogicalDriveFoundDuringScan[DAC960_MaxLogicalDrives];
+ } V2;
+ } FW;
+ unsigned char ProgressBuffer[DAC960_ProgressBufferSize];
+ unsigned char UserStatusBuffer[DAC960_UserMessageSize];
+}
+DAC960_Controller_T;
+
+
+/*
+ Simplify access to Firmware Version Dependent Data Structure Components
+ and Functions.
+*/
+
+#define V1 FW.V1
+#define V2 FW.V2
+#define DAC960_QueueCommand(Command) \
+ (Controller->QueueCommand)(Command)
+#define DAC960_ReadControllerConfiguration(Controller) \
+ (Controller->ReadControllerConfiguration)(Controller)
+#define DAC960_ReadDeviceConfiguration(Controller) \
+ (Controller->ReadDeviceConfiguration)(Controller)
+#define DAC960_ReportDeviceConfiguration(Controller) \
+ (Controller->ReportDeviceConfiguration)(Controller)
+#define DAC960_QueueReadWriteCommand(Command) \
+ (Controller->QueueReadWriteCommand)(Command)
+
+/*
+ * dma_addr_writeql is provided to write dma_addr_t types
+ * to a 64-bit pci address space register. The controller
+ * will accept having the register written as two 32-bit
+ * values.
+ *
+ * In HIGHMEM kernels, dma_addr_t is a 64-bit value.
+ * without HIGHMEM, dma_addr_t is a 32-bit value.
+ *
+ * The compiler should always fix up the assignment
+ * to u.wq appropriately, depending upon the size of
+ * dma_addr_t.
+ */
+static inline
+void dma_addr_writeql(dma_addr_t addr, void __iomem *write_address)
+{
+ union {
+ u64 wq;
+ uint wl[2];
+ } u;
+
+ u.wq = addr;
+
+ writel(u.wl[0], write_address);
+ writel(u.wl[1], write_address + 4);
+}
+
+/*
+ Define the DAC960 GEM Series Controller Interface Register Offsets.
+ */
+
+#define DAC960_GEM_RegisterWindowSize 0x600
+
+typedef enum
+{
+ DAC960_GEM_InboundDoorBellRegisterReadSetOffset = 0x214,
+ DAC960_GEM_InboundDoorBellRegisterClearOffset = 0x218,
+ DAC960_GEM_OutboundDoorBellRegisterReadSetOffset = 0x224,
+ DAC960_GEM_OutboundDoorBellRegisterClearOffset = 0x228,
+ DAC960_GEM_InterruptStatusRegisterOffset = 0x208,
+ DAC960_GEM_InterruptMaskRegisterReadSetOffset = 0x22C,
+ DAC960_GEM_InterruptMaskRegisterClearOffset = 0x230,
+ DAC960_GEM_CommandMailboxBusAddressOffset = 0x510,
+ DAC960_GEM_CommandStatusOffset = 0x518,
+ DAC960_GEM_ErrorStatusRegisterReadSetOffset = 0x224,
+ DAC960_GEM_ErrorStatusRegisterClearOffset = 0x228,
+}
+DAC960_GEM_RegisterOffsets_T;
+
+/*
+ Define the structure of the DAC960 GEM Series Inbound Door Bell
+ */
+
+typedef union DAC960_GEM_InboundDoorBellRegister
+{
+ unsigned int All;
+ struct {
+ unsigned int :24;
+ bool HardwareMailboxNewCommand:1;
+ bool AcknowledgeHardwareMailboxStatus:1;
+ bool GenerateInterrupt:1;
+ bool ControllerReset:1;
+ bool MemoryMailboxNewCommand:1;
+ unsigned int :3;
+ } Write;
+ struct {
+ unsigned int :24;
+ bool HardwareMailboxFull:1;
+ bool InitializationInProgress:1;
+ unsigned int :6;
+ } Read;
+}
+DAC960_GEM_InboundDoorBellRegister_T;
+
+/*
+ Define the structure of the DAC960 GEM Series Outbound Door Bell Register.
+ */
+typedef union DAC960_GEM_OutboundDoorBellRegister
+{
+ unsigned int All;
+ struct {
+ unsigned int :24;
+ bool AcknowledgeHardwareMailboxInterrupt:1;
+ bool AcknowledgeMemoryMailboxInterrupt:1;
+ unsigned int :6;
+ } Write;
+ struct {
+ unsigned int :24;
+ bool HardwareMailboxStatusAvailable:1;
+ bool MemoryMailboxStatusAvailable:1;
+ unsigned int :6;
+ } Read;
+}
+DAC960_GEM_OutboundDoorBellRegister_T;
+
+/*
+ Define the structure of the DAC960 GEM Series Interrupt Mask Register.
+ */
+typedef union DAC960_GEM_InterruptMaskRegister
+{
+ unsigned int All;
+ struct {
+ unsigned int :16;
+ unsigned int :8;
+ unsigned int HardwareMailboxInterrupt:1;
+ unsigned int MemoryMailboxInterrupt:1;
+ unsigned int :6;
+ } Bits;
+}
+DAC960_GEM_InterruptMaskRegister_T;
+
+/*
+ Define the structure of the DAC960 GEM Series Error Status Register.
+ */
+
+typedef union DAC960_GEM_ErrorStatusRegister
+{
+ unsigned int All;
+ struct {
+ unsigned int :24;
+ unsigned int :5;
+ bool ErrorStatusPending:1;
+ unsigned int :2;
+ } Bits;
+}
+DAC960_GEM_ErrorStatusRegister_T;
+
+/*
+ Define inline functions to provide an abstraction for reading and writing the
+ DAC960 GEM Series Controller Interface Registers.
+*/
+
+static inline
+void DAC960_GEM_HardwareMailboxNewCommand(void __iomem *ControllerBaseAddress)
+{
+ DAC960_GEM_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.HardwareMailboxNewCommand = true;
+ writel(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_GEM_InboundDoorBellRegisterReadSetOffset);
+}
+
+static inline
+void DAC960_GEM_AcknowledgeHardwareMailboxStatus(void __iomem *ControllerBaseAddress)
+{
+ DAC960_GEM_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.AcknowledgeHardwareMailboxStatus = true;
+ writel(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_GEM_InboundDoorBellRegisterClearOffset);
+}
+
+static inline
+void DAC960_GEM_GenerateInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_GEM_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.GenerateInterrupt = true;
+ writel(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_GEM_InboundDoorBellRegisterReadSetOffset);
+}
+
+static inline
+void DAC960_GEM_ControllerReset(void __iomem *ControllerBaseAddress)
+{
+ DAC960_GEM_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.ControllerReset = true;
+ writel(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_GEM_InboundDoorBellRegisterReadSetOffset);
+}
+
+static inline
+void DAC960_GEM_MemoryMailboxNewCommand(void __iomem *ControllerBaseAddress)
+{
+ DAC960_GEM_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.MemoryMailboxNewCommand = true;
+ writel(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_GEM_InboundDoorBellRegisterReadSetOffset);
+}
+
+static inline
+bool DAC960_GEM_HardwareMailboxFullP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_GEM_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All =
+ readl(ControllerBaseAddress +
+ DAC960_GEM_InboundDoorBellRegisterReadSetOffset);
+ return InboundDoorBellRegister.Read.HardwareMailboxFull;
+}
+
+static inline
+bool DAC960_GEM_InitializationInProgressP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_GEM_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All =
+ readl(ControllerBaseAddress +
+ DAC960_GEM_InboundDoorBellRegisterReadSetOffset);
+ return InboundDoorBellRegister.Read.InitializationInProgress;
+}
+
+static inline
+void DAC960_GEM_AcknowledgeHardwareMailboxInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_GEM_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All = 0;
+ OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true;
+ writel(OutboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_GEM_OutboundDoorBellRegisterClearOffset);
+}
+
+static inline
+void DAC960_GEM_AcknowledgeMemoryMailboxInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_GEM_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All = 0;
+ OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true;
+ writel(OutboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_GEM_OutboundDoorBellRegisterClearOffset);
+}
+
+static inline
+void DAC960_GEM_AcknowledgeInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_GEM_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All = 0;
+ OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true;
+ OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true;
+ writel(OutboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_GEM_OutboundDoorBellRegisterClearOffset);
+}
+
+static inline
+bool DAC960_GEM_HardwareMailboxStatusAvailableP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_GEM_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All =
+ readl(ControllerBaseAddress +
+ DAC960_GEM_OutboundDoorBellRegisterReadSetOffset);
+ return OutboundDoorBellRegister.Read.HardwareMailboxStatusAvailable;
+}
+
+static inline
+bool DAC960_GEM_MemoryMailboxStatusAvailableP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_GEM_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All =
+ readl(ControllerBaseAddress +
+ DAC960_GEM_OutboundDoorBellRegisterReadSetOffset);
+ return OutboundDoorBellRegister.Read.MemoryMailboxStatusAvailable;
+}
+
+static inline
+void DAC960_GEM_EnableInterrupts(void __iomem *ControllerBaseAddress)
+{
+ DAC960_GEM_InterruptMaskRegister_T InterruptMaskRegister;
+ InterruptMaskRegister.All = 0;
+ InterruptMaskRegister.Bits.HardwareMailboxInterrupt = true;
+ InterruptMaskRegister.Bits.MemoryMailboxInterrupt = true;
+ writel(InterruptMaskRegister.All,
+ ControllerBaseAddress + DAC960_GEM_InterruptMaskRegisterClearOffset);
+}
+
+static inline
+void DAC960_GEM_DisableInterrupts(void __iomem *ControllerBaseAddress)
+{
+ DAC960_GEM_InterruptMaskRegister_T InterruptMaskRegister;
+ InterruptMaskRegister.All = 0;
+ InterruptMaskRegister.Bits.HardwareMailboxInterrupt = true;
+ InterruptMaskRegister.Bits.MemoryMailboxInterrupt = true;
+ writel(InterruptMaskRegister.All,
+ ControllerBaseAddress + DAC960_GEM_InterruptMaskRegisterReadSetOffset);
+}
+
+static inline
+bool DAC960_GEM_InterruptsEnabledP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_GEM_InterruptMaskRegister_T InterruptMaskRegister;
+ InterruptMaskRegister.All =
+ readl(ControllerBaseAddress +
+ DAC960_GEM_InterruptMaskRegisterReadSetOffset);
+ return !(InterruptMaskRegister.Bits.HardwareMailboxInterrupt ||
+ InterruptMaskRegister.Bits.MemoryMailboxInterrupt);
+}
+
+static inline
+void DAC960_GEM_WriteCommandMailbox(DAC960_V2_CommandMailbox_T
+ *MemoryCommandMailbox,
+ DAC960_V2_CommandMailbox_T
+ *CommandMailbox)
+{
+ memcpy(&MemoryCommandMailbox->Words[1], &CommandMailbox->Words[1],
+ sizeof(DAC960_V2_CommandMailbox_T) - sizeof(unsigned int));
+ wmb();
+ MemoryCommandMailbox->Words[0] = CommandMailbox->Words[0];
+ mb();
+}
+
+static inline
+void DAC960_GEM_WriteHardwareMailbox(void __iomem *ControllerBaseAddress,
+ dma_addr_t CommandMailboxDMA)
+{
+ dma_addr_writeql(CommandMailboxDMA,
+ ControllerBaseAddress +
+ DAC960_GEM_CommandMailboxBusAddressOffset);
+}
+
+static inline DAC960_V2_CommandIdentifier_T
+DAC960_GEM_ReadCommandIdentifier(void __iomem *ControllerBaseAddress)
+{
+ return readw(ControllerBaseAddress + DAC960_GEM_CommandStatusOffset);
+}
+
+static inline DAC960_V2_CommandStatus_T
+DAC960_GEM_ReadCommandStatus(void __iomem *ControllerBaseAddress)
+{
+ return readw(ControllerBaseAddress + DAC960_GEM_CommandStatusOffset + 2);
+}
+
+static inline bool
+DAC960_GEM_ReadErrorStatus(void __iomem *ControllerBaseAddress,
+ unsigned char *ErrorStatus,
+ unsigned char *Parameter0,
+ unsigned char *Parameter1)
+{
+ DAC960_GEM_ErrorStatusRegister_T ErrorStatusRegister;
+ ErrorStatusRegister.All =
+ readl(ControllerBaseAddress + DAC960_GEM_ErrorStatusRegisterReadSetOffset);
+ if (!ErrorStatusRegister.Bits.ErrorStatusPending) return false;
+ ErrorStatusRegister.Bits.ErrorStatusPending = false;
+ *ErrorStatus = ErrorStatusRegister.All;
+ *Parameter0 =
+ readb(ControllerBaseAddress + DAC960_GEM_CommandMailboxBusAddressOffset + 0);
+ *Parameter1 =
+ readb(ControllerBaseAddress + DAC960_GEM_CommandMailboxBusAddressOffset + 1);
+ writel(0x03000000, ControllerBaseAddress +
+ DAC960_GEM_ErrorStatusRegisterClearOffset);
+ return true;
+}
+
+/*
+ Define the DAC960 BA Series Controller Interface Register Offsets.
+*/
+
+#define DAC960_BA_RegisterWindowSize 0x80
+
+typedef enum
+{
+ DAC960_BA_InboundDoorBellRegisterOffset = 0x60,
+ DAC960_BA_OutboundDoorBellRegisterOffset = 0x61,
+ DAC960_BA_InterruptStatusRegisterOffset = 0x30,
+ DAC960_BA_InterruptMaskRegisterOffset = 0x34,
+ DAC960_BA_CommandMailboxBusAddressOffset = 0x50,
+ DAC960_BA_CommandStatusOffset = 0x58,
+ DAC960_BA_ErrorStatusRegisterOffset = 0x63
+}
+DAC960_BA_RegisterOffsets_T;
+
+
+/*
+ Define the structure of the DAC960 BA Series Inbound Door Bell Register.
+*/
+
+typedef union DAC960_BA_InboundDoorBellRegister
+{
+ unsigned char All;
+ struct {
+ bool HardwareMailboxNewCommand:1; /* Bit 0 */
+ bool AcknowledgeHardwareMailboxStatus:1; /* Bit 1 */
+ bool GenerateInterrupt:1; /* Bit 2 */
+ bool ControllerReset:1; /* Bit 3 */
+ bool MemoryMailboxNewCommand:1; /* Bit 4 */
+ unsigned char :3; /* Bits 5-7 */
+ } Write;
+ struct {
+ bool HardwareMailboxEmpty:1; /* Bit 0 */
+ bool InitializationNotInProgress:1; /* Bit 1 */
+ unsigned char :6; /* Bits 2-7 */
+ } Read;
+}
+DAC960_BA_InboundDoorBellRegister_T;
+
+
+/*
+ Define the structure of the DAC960 BA Series Outbound Door Bell Register.
+*/
+
+typedef union DAC960_BA_OutboundDoorBellRegister
+{
+ unsigned char All;
+ struct {
+ bool AcknowledgeHardwareMailboxInterrupt:1; /* Bit 0 */
+ bool AcknowledgeMemoryMailboxInterrupt:1; /* Bit 1 */
+ unsigned char :6; /* Bits 2-7 */
+ } Write;
+ struct {
+ bool HardwareMailboxStatusAvailable:1; /* Bit 0 */
+ bool MemoryMailboxStatusAvailable:1; /* Bit 1 */
+ unsigned char :6; /* Bits 2-7 */
+ } Read;
+}
+DAC960_BA_OutboundDoorBellRegister_T;
+
+
+/*
+ Define the structure of the DAC960 BA Series Interrupt Mask Register.
+*/
+
+typedef union DAC960_BA_InterruptMaskRegister
+{
+ unsigned char All;
+ struct {
+ unsigned int :2; /* Bits 0-1 */
+ bool DisableInterrupts:1; /* Bit 2 */
+ bool DisableInterruptsI2O:1; /* Bit 3 */
+ unsigned int :4; /* Bits 4-7 */
+ } Bits;
+}
+DAC960_BA_InterruptMaskRegister_T;
+
+
+/*
+ Define the structure of the DAC960 BA Series Error Status Register.
+*/
+
+typedef union DAC960_BA_ErrorStatusRegister
+{
+ unsigned char All;
+ struct {
+ unsigned int :2; /* Bits 0-1 */
+ bool ErrorStatusPending:1; /* Bit 2 */
+ unsigned int :5; /* Bits 3-7 */
+ } Bits;
+}
+DAC960_BA_ErrorStatusRegister_T;
+
+
+/*
+ Define inline functions to provide an abstraction for reading and writing the
+ DAC960 BA Series Controller Interface Registers.
+*/
+
+static inline
+void DAC960_BA_HardwareMailboxNewCommand(void __iomem *ControllerBaseAddress)
+{
+ DAC960_BA_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.HardwareMailboxNewCommand = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_BA_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_BA_AcknowledgeHardwareMailboxStatus(void __iomem *ControllerBaseAddress)
+{
+ DAC960_BA_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.AcknowledgeHardwareMailboxStatus = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_BA_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_BA_GenerateInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_BA_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.GenerateInterrupt = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_BA_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_BA_ControllerReset(void __iomem *ControllerBaseAddress)
+{
+ DAC960_BA_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.ControllerReset = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_BA_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_BA_MemoryMailboxNewCommand(void __iomem *ControllerBaseAddress)
+{
+ DAC960_BA_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.MemoryMailboxNewCommand = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_BA_InboundDoorBellRegisterOffset);
+}
+
+static inline
+bool DAC960_BA_HardwareMailboxFullP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_BA_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All =
+ readb(ControllerBaseAddress + DAC960_BA_InboundDoorBellRegisterOffset);
+ return !InboundDoorBellRegister.Read.HardwareMailboxEmpty;
+}
+
+static inline
+bool DAC960_BA_InitializationInProgressP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_BA_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All =
+ readb(ControllerBaseAddress + DAC960_BA_InboundDoorBellRegisterOffset);
+ return !InboundDoorBellRegister.Read.InitializationNotInProgress;
+}
+
+static inline
+void DAC960_BA_AcknowledgeHardwareMailboxInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_BA_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All = 0;
+ OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true;
+ writeb(OutboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_BA_OutboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_BA_AcknowledgeMemoryMailboxInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_BA_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All = 0;
+ OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true;
+ writeb(OutboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_BA_OutboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_BA_AcknowledgeInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_BA_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All = 0;
+ OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true;
+ OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true;
+ writeb(OutboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_BA_OutboundDoorBellRegisterOffset);
+}
+
+static inline
+bool DAC960_BA_HardwareMailboxStatusAvailableP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_BA_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All =
+ readb(ControllerBaseAddress + DAC960_BA_OutboundDoorBellRegisterOffset);
+ return OutboundDoorBellRegister.Read.HardwareMailboxStatusAvailable;
+}
+
+static inline
+bool DAC960_BA_MemoryMailboxStatusAvailableP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_BA_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All =
+ readb(ControllerBaseAddress + DAC960_BA_OutboundDoorBellRegisterOffset);
+ return OutboundDoorBellRegister.Read.MemoryMailboxStatusAvailable;
+}
+
+static inline
+void DAC960_BA_EnableInterrupts(void __iomem *ControllerBaseAddress)
+{
+ DAC960_BA_InterruptMaskRegister_T InterruptMaskRegister;
+ InterruptMaskRegister.All = 0xFF;
+ InterruptMaskRegister.Bits.DisableInterrupts = false;
+ InterruptMaskRegister.Bits.DisableInterruptsI2O = true;
+ writeb(InterruptMaskRegister.All,
+ ControllerBaseAddress + DAC960_BA_InterruptMaskRegisterOffset);
+}
+
+static inline
+void DAC960_BA_DisableInterrupts(void __iomem *ControllerBaseAddress)
+{
+ DAC960_BA_InterruptMaskRegister_T InterruptMaskRegister;
+ InterruptMaskRegister.All = 0xFF;
+ InterruptMaskRegister.Bits.DisableInterrupts = true;
+ InterruptMaskRegister.Bits.DisableInterruptsI2O = true;
+ writeb(InterruptMaskRegister.All,
+ ControllerBaseAddress + DAC960_BA_InterruptMaskRegisterOffset);
+}
+
+static inline
+bool DAC960_BA_InterruptsEnabledP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_BA_InterruptMaskRegister_T InterruptMaskRegister;
+ InterruptMaskRegister.All =
+ readb(ControllerBaseAddress + DAC960_BA_InterruptMaskRegisterOffset);
+ return !InterruptMaskRegister.Bits.DisableInterrupts;
+}
+
+static inline
+void DAC960_BA_WriteCommandMailbox(DAC960_V2_CommandMailbox_T
+ *MemoryCommandMailbox,
+ DAC960_V2_CommandMailbox_T
+ *CommandMailbox)
+{
+ memcpy(&MemoryCommandMailbox->Words[1], &CommandMailbox->Words[1],
+ sizeof(DAC960_V2_CommandMailbox_T) - sizeof(unsigned int));
+ wmb();
+ MemoryCommandMailbox->Words[0] = CommandMailbox->Words[0];
+ mb();
+}
+
+
+static inline
+void DAC960_BA_WriteHardwareMailbox(void __iomem *ControllerBaseAddress,
+ dma_addr_t CommandMailboxDMA)
+{
+ dma_addr_writeql(CommandMailboxDMA,
+ ControllerBaseAddress +
+ DAC960_BA_CommandMailboxBusAddressOffset);
+}
+
+static inline DAC960_V2_CommandIdentifier_T
+DAC960_BA_ReadCommandIdentifier(void __iomem *ControllerBaseAddress)
+{
+ return readw(ControllerBaseAddress + DAC960_BA_CommandStatusOffset);
+}
+
+static inline DAC960_V2_CommandStatus_T
+DAC960_BA_ReadCommandStatus(void __iomem *ControllerBaseAddress)
+{
+ return readw(ControllerBaseAddress + DAC960_BA_CommandStatusOffset + 2);
+}
+
+static inline bool
+DAC960_BA_ReadErrorStatus(void __iomem *ControllerBaseAddress,
+ unsigned char *ErrorStatus,
+ unsigned char *Parameter0,
+ unsigned char *Parameter1)
+{
+ DAC960_BA_ErrorStatusRegister_T ErrorStatusRegister;
+ ErrorStatusRegister.All =
+ readb(ControllerBaseAddress + DAC960_BA_ErrorStatusRegisterOffset);
+ if (!ErrorStatusRegister.Bits.ErrorStatusPending) return false;
+ ErrorStatusRegister.Bits.ErrorStatusPending = false;
+ *ErrorStatus = ErrorStatusRegister.All;
+ *Parameter0 =
+ readb(ControllerBaseAddress + DAC960_BA_CommandMailboxBusAddressOffset + 0);
+ *Parameter1 =
+ readb(ControllerBaseAddress + DAC960_BA_CommandMailboxBusAddressOffset + 1);
+ writeb(0xFF, ControllerBaseAddress + DAC960_BA_ErrorStatusRegisterOffset);
+ return true;
+}
+
+
+/*
+ Define the DAC960 LP Series Controller Interface Register Offsets.
+*/
+
+#define DAC960_LP_RegisterWindowSize 0x80
+
+typedef enum
+{
+ DAC960_LP_InboundDoorBellRegisterOffset = 0x20,
+ DAC960_LP_OutboundDoorBellRegisterOffset = 0x2C,
+ DAC960_LP_InterruptStatusRegisterOffset = 0x30,
+ DAC960_LP_InterruptMaskRegisterOffset = 0x34,
+ DAC960_LP_CommandMailboxBusAddressOffset = 0x10,
+ DAC960_LP_CommandStatusOffset = 0x18,
+ DAC960_LP_ErrorStatusRegisterOffset = 0x2E
+}
+DAC960_LP_RegisterOffsets_T;
+
+
+/*
+ Define the structure of the DAC960 LP Series Inbound Door Bell Register.
+*/
+
+typedef union DAC960_LP_InboundDoorBellRegister
+{
+ unsigned char All;
+ struct {
+ bool HardwareMailboxNewCommand:1; /* Bit 0 */
+ bool AcknowledgeHardwareMailboxStatus:1; /* Bit 1 */
+ bool GenerateInterrupt:1; /* Bit 2 */
+ bool ControllerReset:1; /* Bit 3 */
+ bool MemoryMailboxNewCommand:1; /* Bit 4 */
+ unsigned char :3; /* Bits 5-7 */
+ } Write;
+ struct {
+ bool HardwareMailboxFull:1; /* Bit 0 */
+ bool InitializationInProgress:1; /* Bit 1 */
+ unsigned char :6; /* Bits 2-7 */
+ } Read;
+}
+DAC960_LP_InboundDoorBellRegister_T;
+
+
+/*
+ Define the structure of the DAC960 LP Series Outbound Door Bell Register.
+*/
+
+typedef union DAC960_LP_OutboundDoorBellRegister
+{
+ unsigned char All;
+ struct {
+ bool AcknowledgeHardwareMailboxInterrupt:1; /* Bit 0 */
+ bool AcknowledgeMemoryMailboxInterrupt:1; /* Bit 1 */
+ unsigned char :6; /* Bits 2-7 */
+ } Write;
+ struct {
+ bool HardwareMailboxStatusAvailable:1; /* Bit 0 */
+ bool MemoryMailboxStatusAvailable:1; /* Bit 1 */
+ unsigned char :6; /* Bits 2-7 */
+ } Read;
+}
+DAC960_LP_OutboundDoorBellRegister_T;
+
+
+/*
+ Define the structure of the DAC960 LP Series Interrupt Mask Register.
+*/
+
+typedef union DAC960_LP_InterruptMaskRegister
+{
+ unsigned char All;
+ struct {
+ unsigned int :2; /* Bits 0-1 */
+ bool DisableInterrupts:1; /* Bit 2 */
+ unsigned int :5; /* Bits 3-7 */
+ } Bits;
+}
+DAC960_LP_InterruptMaskRegister_T;
+
+
+/*
+ Define the structure of the DAC960 LP Series Error Status Register.
+*/
+
+typedef union DAC960_LP_ErrorStatusRegister
+{
+ unsigned char All;
+ struct {
+ unsigned int :2; /* Bits 0-1 */
+ bool ErrorStatusPending:1; /* Bit 2 */
+ unsigned int :5; /* Bits 3-7 */
+ } Bits;
+}
+DAC960_LP_ErrorStatusRegister_T;
+
+
+/*
+ Define inline functions to provide an abstraction for reading and writing the
+ DAC960 LP Series Controller Interface Registers.
+*/
+
+static inline
+void DAC960_LP_HardwareMailboxNewCommand(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LP_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.HardwareMailboxNewCommand = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_LP_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_LP_AcknowledgeHardwareMailboxStatus(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LP_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.AcknowledgeHardwareMailboxStatus = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_LP_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_LP_GenerateInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LP_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.GenerateInterrupt = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_LP_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_LP_ControllerReset(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LP_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.ControllerReset = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_LP_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_LP_MemoryMailboxNewCommand(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LP_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.MemoryMailboxNewCommand = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_LP_InboundDoorBellRegisterOffset);
+}
+
+static inline
+bool DAC960_LP_HardwareMailboxFullP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LP_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All =
+ readb(ControllerBaseAddress + DAC960_LP_InboundDoorBellRegisterOffset);
+ return InboundDoorBellRegister.Read.HardwareMailboxFull;
+}
+
+static inline
+bool DAC960_LP_InitializationInProgressP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LP_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All =
+ readb(ControllerBaseAddress + DAC960_LP_InboundDoorBellRegisterOffset);
+ return InboundDoorBellRegister.Read.InitializationInProgress;
+}
+
+static inline
+void DAC960_LP_AcknowledgeHardwareMailboxInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LP_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All = 0;
+ OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true;
+ writeb(OutboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_LP_OutboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_LP_AcknowledgeMemoryMailboxInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LP_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All = 0;
+ OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true;
+ writeb(OutboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_LP_OutboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_LP_AcknowledgeInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LP_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All = 0;
+ OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true;
+ OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true;
+ writeb(OutboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_LP_OutboundDoorBellRegisterOffset);
+}
+
+static inline
+bool DAC960_LP_HardwareMailboxStatusAvailableP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LP_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All =
+ readb(ControllerBaseAddress + DAC960_LP_OutboundDoorBellRegisterOffset);
+ return OutboundDoorBellRegister.Read.HardwareMailboxStatusAvailable;
+}
+
+static inline
+bool DAC960_LP_MemoryMailboxStatusAvailableP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LP_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All =
+ readb(ControllerBaseAddress + DAC960_LP_OutboundDoorBellRegisterOffset);
+ return OutboundDoorBellRegister.Read.MemoryMailboxStatusAvailable;
+}
+
+static inline
+void DAC960_LP_EnableInterrupts(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LP_InterruptMaskRegister_T InterruptMaskRegister;
+ InterruptMaskRegister.All = 0xFF;
+ InterruptMaskRegister.Bits.DisableInterrupts = false;
+ writeb(InterruptMaskRegister.All,
+ ControllerBaseAddress + DAC960_LP_InterruptMaskRegisterOffset);
+}
+
+static inline
+void DAC960_LP_DisableInterrupts(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LP_InterruptMaskRegister_T InterruptMaskRegister;
+ InterruptMaskRegister.All = 0xFF;
+ InterruptMaskRegister.Bits.DisableInterrupts = true;
+ writeb(InterruptMaskRegister.All,
+ ControllerBaseAddress + DAC960_LP_InterruptMaskRegisterOffset);
+}
+
+static inline
+bool DAC960_LP_InterruptsEnabledP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LP_InterruptMaskRegister_T InterruptMaskRegister;
+ InterruptMaskRegister.All =
+ readb(ControllerBaseAddress + DAC960_LP_InterruptMaskRegisterOffset);
+ return !InterruptMaskRegister.Bits.DisableInterrupts;
+}
+
+static inline
+void DAC960_LP_WriteCommandMailbox(DAC960_V2_CommandMailbox_T
+ *MemoryCommandMailbox,
+ DAC960_V2_CommandMailbox_T
+ *CommandMailbox)
+{
+ memcpy(&MemoryCommandMailbox->Words[1], &CommandMailbox->Words[1],
+ sizeof(DAC960_V2_CommandMailbox_T) - sizeof(unsigned int));
+ wmb();
+ MemoryCommandMailbox->Words[0] = CommandMailbox->Words[0];
+ mb();
+}
+
+static inline
+void DAC960_LP_WriteHardwareMailbox(void __iomem *ControllerBaseAddress,
+ dma_addr_t CommandMailboxDMA)
+{
+ dma_addr_writeql(CommandMailboxDMA,
+ ControllerBaseAddress +
+ DAC960_LP_CommandMailboxBusAddressOffset);
+}
+
+static inline DAC960_V2_CommandIdentifier_T
+DAC960_LP_ReadCommandIdentifier(void __iomem *ControllerBaseAddress)
+{
+ return readw(ControllerBaseAddress + DAC960_LP_CommandStatusOffset);
+}
+
+static inline DAC960_V2_CommandStatus_T
+DAC960_LP_ReadCommandStatus(void __iomem *ControllerBaseAddress)
+{
+ return readw(ControllerBaseAddress + DAC960_LP_CommandStatusOffset + 2);
+}
+
+static inline bool
+DAC960_LP_ReadErrorStatus(void __iomem *ControllerBaseAddress,
+ unsigned char *ErrorStatus,
+ unsigned char *Parameter0,
+ unsigned char *Parameter1)
+{
+ DAC960_LP_ErrorStatusRegister_T ErrorStatusRegister;
+ ErrorStatusRegister.All =
+ readb(ControllerBaseAddress + DAC960_LP_ErrorStatusRegisterOffset);
+ if (!ErrorStatusRegister.Bits.ErrorStatusPending) return false;
+ ErrorStatusRegister.Bits.ErrorStatusPending = false;
+ *ErrorStatus = ErrorStatusRegister.All;
+ *Parameter0 =
+ readb(ControllerBaseAddress + DAC960_LP_CommandMailboxBusAddressOffset + 0);
+ *Parameter1 =
+ readb(ControllerBaseAddress + DAC960_LP_CommandMailboxBusAddressOffset + 1);
+ writeb(0xFF, ControllerBaseAddress + DAC960_LP_ErrorStatusRegisterOffset);
+ return true;
+}
+
+
+/*
+ Define the DAC960 LA Series Controller Interface Register Offsets.
+*/
+
+#define DAC960_LA_RegisterWindowSize 0x80
+
+typedef enum
+{
+ DAC960_LA_InboundDoorBellRegisterOffset = 0x60,
+ DAC960_LA_OutboundDoorBellRegisterOffset = 0x61,
+ DAC960_LA_InterruptMaskRegisterOffset = 0x34,
+ DAC960_LA_CommandOpcodeRegisterOffset = 0x50,
+ DAC960_LA_CommandIdentifierRegisterOffset = 0x51,
+ DAC960_LA_MailboxRegister2Offset = 0x52,
+ DAC960_LA_MailboxRegister3Offset = 0x53,
+ DAC960_LA_MailboxRegister4Offset = 0x54,
+ DAC960_LA_MailboxRegister5Offset = 0x55,
+ DAC960_LA_MailboxRegister6Offset = 0x56,
+ DAC960_LA_MailboxRegister7Offset = 0x57,
+ DAC960_LA_MailboxRegister8Offset = 0x58,
+ DAC960_LA_MailboxRegister9Offset = 0x59,
+ DAC960_LA_MailboxRegister10Offset = 0x5A,
+ DAC960_LA_MailboxRegister11Offset = 0x5B,
+ DAC960_LA_MailboxRegister12Offset = 0x5C,
+ DAC960_LA_StatusCommandIdentifierRegOffset = 0x5D,
+ DAC960_LA_StatusRegisterOffset = 0x5E,
+ DAC960_LA_ErrorStatusRegisterOffset = 0x63
+}
+DAC960_LA_RegisterOffsets_T;
+
+
+/*
+ Define the structure of the DAC960 LA Series Inbound Door Bell Register.
+*/
+
+typedef union DAC960_LA_InboundDoorBellRegister
+{
+ unsigned char All;
+ struct {
+ bool HardwareMailboxNewCommand:1; /* Bit 0 */
+ bool AcknowledgeHardwareMailboxStatus:1; /* Bit 1 */
+ bool GenerateInterrupt:1; /* Bit 2 */
+ bool ControllerReset:1; /* Bit 3 */
+ bool MemoryMailboxNewCommand:1; /* Bit 4 */
+ unsigned char :3; /* Bits 5-7 */
+ } Write;
+ struct {
+ bool HardwareMailboxEmpty:1; /* Bit 0 */
+ bool InitializationNotInProgress:1; /* Bit 1 */
+ unsigned char :6; /* Bits 2-7 */
+ } Read;
+}
+DAC960_LA_InboundDoorBellRegister_T;
+
+
+/*
+ Define the structure of the DAC960 LA Series Outbound Door Bell Register.
+*/
+
+typedef union DAC960_LA_OutboundDoorBellRegister
+{
+ unsigned char All;
+ struct {
+ bool AcknowledgeHardwareMailboxInterrupt:1; /* Bit 0 */
+ bool AcknowledgeMemoryMailboxInterrupt:1; /* Bit 1 */
+ unsigned char :6; /* Bits 2-7 */
+ } Write;
+ struct {
+ bool HardwareMailboxStatusAvailable:1; /* Bit 0 */
+ bool MemoryMailboxStatusAvailable:1; /* Bit 1 */
+ unsigned char :6; /* Bits 2-7 */
+ } Read;
+}
+DAC960_LA_OutboundDoorBellRegister_T;
+
+
+/*
+ Define the structure of the DAC960 LA Series Interrupt Mask Register.
+*/
+
+typedef union DAC960_LA_InterruptMaskRegister
+{
+ unsigned char All;
+ struct {
+ unsigned char :2; /* Bits 0-1 */
+ bool DisableInterrupts:1; /* Bit 2 */
+ unsigned char :5; /* Bits 3-7 */
+ } Bits;
+}
+DAC960_LA_InterruptMaskRegister_T;
+
+
+/*
+ Define the structure of the DAC960 LA Series Error Status Register.
+*/
+
+typedef union DAC960_LA_ErrorStatusRegister
+{
+ unsigned char All;
+ struct {
+ unsigned int :2; /* Bits 0-1 */
+ bool ErrorStatusPending:1; /* Bit 2 */
+ unsigned int :5; /* Bits 3-7 */
+ } Bits;
+}
+DAC960_LA_ErrorStatusRegister_T;
+
+
+/*
+ Define inline functions to provide an abstraction for reading and writing the
+ DAC960 LA Series Controller Interface Registers.
+*/
+
+static inline
+void DAC960_LA_HardwareMailboxNewCommand(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LA_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.HardwareMailboxNewCommand = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_LA_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_LA_AcknowledgeHardwareMailboxStatus(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LA_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.AcknowledgeHardwareMailboxStatus = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_LA_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_LA_GenerateInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LA_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.GenerateInterrupt = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_LA_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_LA_ControllerReset(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LA_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.ControllerReset = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_LA_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_LA_MemoryMailboxNewCommand(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LA_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.MemoryMailboxNewCommand = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_LA_InboundDoorBellRegisterOffset);
+}
+
+static inline
+bool DAC960_LA_HardwareMailboxFullP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LA_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All =
+ readb(ControllerBaseAddress + DAC960_LA_InboundDoorBellRegisterOffset);
+ return !InboundDoorBellRegister.Read.HardwareMailboxEmpty;
+}
+
+static inline
+bool DAC960_LA_InitializationInProgressP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LA_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All =
+ readb(ControllerBaseAddress + DAC960_LA_InboundDoorBellRegisterOffset);
+ return !InboundDoorBellRegister.Read.InitializationNotInProgress;
+}
+
+static inline
+void DAC960_LA_AcknowledgeHardwareMailboxInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LA_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All = 0;
+ OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true;
+ writeb(OutboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_LA_OutboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_LA_AcknowledgeMemoryMailboxInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LA_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All = 0;
+ OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true;
+ writeb(OutboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_LA_OutboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_LA_AcknowledgeInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LA_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All = 0;
+ OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true;
+ OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true;
+ writeb(OutboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_LA_OutboundDoorBellRegisterOffset);
+}
+
+static inline
+bool DAC960_LA_HardwareMailboxStatusAvailableP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LA_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All =
+ readb(ControllerBaseAddress + DAC960_LA_OutboundDoorBellRegisterOffset);
+ return OutboundDoorBellRegister.Read.HardwareMailboxStatusAvailable;
+}
+
+static inline
+bool DAC960_LA_MemoryMailboxStatusAvailableP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LA_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All =
+ readb(ControllerBaseAddress + DAC960_LA_OutboundDoorBellRegisterOffset);
+ return OutboundDoorBellRegister.Read.MemoryMailboxStatusAvailable;
+}
+
+static inline
+void DAC960_LA_EnableInterrupts(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LA_InterruptMaskRegister_T InterruptMaskRegister;
+ InterruptMaskRegister.All = 0xFF;
+ InterruptMaskRegister.Bits.DisableInterrupts = false;
+ writeb(InterruptMaskRegister.All,
+ ControllerBaseAddress + DAC960_LA_InterruptMaskRegisterOffset);
+}
+
+static inline
+void DAC960_LA_DisableInterrupts(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LA_InterruptMaskRegister_T InterruptMaskRegister;
+ InterruptMaskRegister.All = 0xFF;
+ InterruptMaskRegister.Bits.DisableInterrupts = true;
+ writeb(InterruptMaskRegister.All,
+ ControllerBaseAddress + DAC960_LA_InterruptMaskRegisterOffset);
+}
+
+static inline
+bool DAC960_LA_InterruptsEnabledP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_LA_InterruptMaskRegister_T InterruptMaskRegister;
+ InterruptMaskRegister.All =
+ readb(ControllerBaseAddress + DAC960_LA_InterruptMaskRegisterOffset);
+ return !InterruptMaskRegister.Bits.DisableInterrupts;
+}
+
+static inline
+void DAC960_LA_WriteCommandMailbox(DAC960_V1_CommandMailbox_T
+ *MemoryCommandMailbox,
+ DAC960_V1_CommandMailbox_T
+ *CommandMailbox)
+{
+ MemoryCommandMailbox->Words[1] = CommandMailbox->Words[1];
+ MemoryCommandMailbox->Words[2] = CommandMailbox->Words[2];
+ MemoryCommandMailbox->Words[3] = CommandMailbox->Words[3];
+ wmb();
+ MemoryCommandMailbox->Words[0] = CommandMailbox->Words[0];
+ mb();
+}
+
+static inline
+void DAC960_LA_WriteHardwareMailbox(void __iomem *ControllerBaseAddress,
+ DAC960_V1_CommandMailbox_T *CommandMailbox)
+{
+ writel(CommandMailbox->Words[0],
+ ControllerBaseAddress + DAC960_LA_CommandOpcodeRegisterOffset);
+ writel(CommandMailbox->Words[1],
+ ControllerBaseAddress + DAC960_LA_MailboxRegister4Offset);
+ writel(CommandMailbox->Words[2],
+ ControllerBaseAddress + DAC960_LA_MailboxRegister8Offset);
+ writeb(CommandMailbox->Bytes[12],
+ ControllerBaseAddress + DAC960_LA_MailboxRegister12Offset);
+}
+
+static inline DAC960_V1_CommandIdentifier_T
+DAC960_LA_ReadStatusCommandIdentifier(void __iomem *ControllerBaseAddress)
+{
+ return readb(ControllerBaseAddress
+ + DAC960_LA_StatusCommandIdentifierRegOffset);
+}
+
+static inline DAC960_V1_CommandStatus_T
+DAC960_LA_ReadStatusRegister(void __iomem *ControllerBaseAddress)
+{
+ return readw(ControllerBaseAddress + DAC960_LA_StatusRegisterOffset);
+}
+
+static inline bool
+DAC960_LA_ReadErrorStatus(void __iomem *ControllerBaseAddress,
+ unsigned char *ErrorStatus,
+ unsigned char *Parameter0,
+ unsigned char *Parameter1)
+{
+ DAC960_LA_ErrorStatusRegister_T ErrorStatusRegister;
+ ErrorStatusRegister.All =
+ readb(ControllerBaseAddress + DAC960_LA_ErrorStatusRegisterOffset);
+ if (!ErrorStatusRegister.Bits.ErrorStatusPending) return false;
+ ErrorStatusRegister.Bits.ErrorStatusPending = false;
+ *ErrorStatus = ErrorStatusRegister.All;
+ *Parameter0 =
+ readb(ControllerBaseAddress + DAC960_LA_CommandOpcodeRegisterOffset);
+ *Parameter1 =
+ readb(ControllerBaseAddress + DAC960_LA_CommandIdentifierRegisterOffset);
+ writeb(0xFF, ControllerBaseAddress + DAC960_LA_ErrorStatusRegisterOffset);
+ return true;
+}
+
+/*
+ Define the DAC960 PG Series Controller Interface Register Offsets.
+*/
+
+#define DAC960_PG_RegisterWindowSize 0x2000
+
+typedef enum
+{
+ DAC960_PG_InboundDoorBellRegisterOffset = 0x0020,
+ DAC960_PG_OutboundDoorBellRegisterOffset = 0x002C,
+ DAC960_PG_InterruptMaskRegisterOffset = 0x0034,
+ DAC960_PG_CommandOpcodeRegisterOffset = 0x1000,
+ DAC960_PG_CommandIdentifierRegisterOffset = 0x1001,
+ DAC960_PG_MailboxRegister2Offset = 0x1002,
+ DAC960_PG_MailboxRegister3Offset = 0x1003,
+ DAC960_PG_MailboxRegister4Offset = 0x1004,
+ DAC960_PG_MailboxRegister5Offset = 0x1005,
+ DAC960_PG_MailboxRegister6Offset = 0x1006,
+ DAC960_PG_MailboxRegister7Offset = 0x1007,
+ DAC960_PG_MailboxRegister8Offset = 0x1008,
+ DAC960_PG_MailboxRegister9Offset = 0x1009,
+ DAC960_PG_MailboxRegister10Offset = 0x100A,
+ DAC960_PG_MailboxRegister11Offset = 0x100B,
+ DAC960_PG_MailboxRegister12Offset = 0x100C,
+ DAC960_PG_StatusCommandIdentifierRegOffset = 0x1018,
+ DAC960_PG_StatusRegisterOffset = 0x101A,
+ DAC960_PG_ErrorStatusRegisterOffset = 0x103F
+}
+DAC960_PG_RegisterOffsets_T;
+
+
+/*
+ Define the structure of the DAC960 PG Series Inbound Door Bell Register.
+*/
+
+typedef union DAC960_PG_InboundDoorBellRegister
+{
+ unsigned int All;
+ struct {
+ bool HardwareMailboxNewCommand:1; /* Bit 0 */
+ bool AcknowledgeHardwareMailboxStatus:1; /* Bit 1 */
+ bool GenerateInterrupt:1; /* Bit 2 */
+ bool ControllerReset:1; /* Bit 3 */
+ bool MemoryMailboxNewCommand:1; /* Bit 4 */
+ unsigned int :27; /* Bits 5-31 */
+ } Write;
+ struct {
+ bool HardwareMailboxFull:1; /* Bit 0 */
+ bool InitializationInProgress:1; /* Bit 1 */
+ unsigned int :30; /* Bits 2-31 */
+ } Read;
+}
+DAC960_PG_InboundDoorBellRegister_T;
+
+
+/*
+ Define the structure of the DAC960 PG Series Outbound Door Bell Register.
+*/
+
+typedef union DAC960_PG_OutboundDoorBellRegister
+{
+ unsigned int All;
+ struct {
+ bool AcknowledgeHardwareMailboxInterrupt:1; /* Bit 0 */
+ bool AcknowledgeMemoryMailboxInterrupt:1; /* Bit 1 */
+ unsigned int :30; /* Bits 2-31 */
+ } Write;
+ struct {
+ bool HardwareMailboxStatusAvailable:1; /* Bit 0 */
+ bool MemoryMailboxStatusAvailable:1; /* Bit 1 */
+ unsigned int :30; /* Bits 2-31 */
+ } Read;
+}
+DAC960_PG_OutboundDoorBellRegister_T;
+
+
+/*
+ Define the structure of the DAC960 PG Series Interrupt Mask Register.
+*/
+
+typedef union DAC960_PG_InterruptMaskRegister
+{
+ unsigned int All;
+ struct {
+ unsigned int MessageUnitInterruptMask1:2; /* Bits 0-1 */
+ bool DisableInterrupts:1; /* Bit 2 */
+ unsigned int MessageUnitInterruptMask2:5; /* Bits 3-7 */
+ unsigned int Reserved0:24; /* Bits 8-31 */
+ } Bits;
+}
+DAC960_PG_InterruptMaskRegister_T;
+
+
+/*
+ Define the structure of the DAC960 PG Series Error Status Register.
+*/
+
+typedef union DAC960_PG_ErrorStatusRegister
+{
+ unsigned char All;
+ struct {
+ unsigned int :2; /* Bits 0-1 */
+ bool ErrorStatusPending:1; /* Bit 2 */
+ unsigned int :5; /* Bits 3-7 */
+ } Bits;
+}
+DAC960_PG_ErrorStatusRegister_T;
+
+
+/*
+ Define inline functions to provide an abstraction for reading and writing the
+ DAC960 PG Series Controller Interface Registers.
+*/
+
+static inline
+void DAC960_PG_HardwareMailboxNewCommand(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PG_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.HardwareMailboxNewCommand = true;
+ writel(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_PG_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_PG_AcknowledgeHardwareMailboxStatus(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PG_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.AcknowledgeHardwareMailboxStatus = true;
+ writel(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_PG_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_PG_GenerateInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PG_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.GenerateInterrupt = true;
+ writel(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_PG_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_PG_ControllerReset(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PG_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.ControllerReset = true;
+ writel(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_PG_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_PG_MemoryMailboxNewCommand(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PG_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.MemoryMailboxNewCommand = true;
+ writel(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_PG_InboundDoorBellRegisterOffset);
+}
+
+static inline
+bool DAC960_PG_HardwareMailboxFullP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PG_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All =
+ readl(ControllerBaseAddress + DAC960_PG_InboundDoorBellRegisterOffset);
+ return InboundDoorBellRegister.Read.HardwareMailboxFull;
+}
+
+static inline
+bool DAC960_PG_InitializationInProgressP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PG_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All =
+ readl(ControllerBaseAddress + DAC960_PG_InboundDoorBellRegisterOffset);
+ return InboundDoorBellRegister.Read.InitializationInProgress;
+}
+
+static inline
+void DAC960_PG_AcknowledgeHardwareMailboxInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PG_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All = 0;
+ OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true;
+ writel(OutboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_PG_OutboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_PG_AcknowledgeMemoryMailboxInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PG_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All = 0;
+ OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true;
+ writel(OutboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_PG_OutboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_PG_AcknowledgeInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PG_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All = 0;
+ OutboundDoorBellRegister.Write.AcknowledgeHardwareMailboxInterrupt = true;
+ OutboundDoorBellRegister.Write.AcknowledgeMemoryMailboxInterrupt = true;
+ writel(OutboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_PG_OutboundDoorBellRegisterOffset);
+}
+
+static inline
+bool DAC960_PG_HardwareMailboxStatusAvailableP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PG_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All =
+ readl(ControllerBaseAddress + DAC960_PG_OutboundDoorBellRegisterOffset);
+ return OutboundDoorBellRegister.Read.HardwareMailboxStatusAvailable;
+}
+
+static inline
+bool DAC960_PG_MemoryMailboxStatusAvailableP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PG_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All =
+ readl(ControllerBaseAddress + DAC960_PG_OutboundDoorBellRegisterOffset);
+ return OutboundDoorBellRegister.Read.MemoryMailboxStatusAvailable;
+}
+
+static inline
+void DAC960_PG_EnableInterrupts(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PG_InterruptMaskRegister_T InterruptMaskRegister;
+ InterruptMaskRegister.All = 0;
+ InterruptMaskRegister.Bits.MessageUnitInterruptMask1 = 0x3;
+ InterruptMaskRegister.Bits.DisableInterrupts = false;
+ InterruptMaskRegister.Bits.MessageUnitInterruptMask2 = 0x1F;
+ writel(InterruptMaskRegister.All,
+ ControllerBaseAddress + DAC960_PG_InterruptMaskRegisterOffset);
+}
+
+static inline
+void DAC960_PG_DisableInterrupts(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PG_InterruptMaskRegister_T InterruptMaskRegister;
+ InterruptMaskRegister.All = 0;
+ InterruptMaskRegister.Bits.MessageUnitInterruptMask1 = 0x3;
+ InterruptMaskRegister.Bits.DisableInterrupts = true;
+ InterruptMaskRegister.Bits.MessageUnitInterruptMask2 = 0x1F;
+ writel(InterruptMaskRegister.All,
+ ControllerBaseAddress + DAC960_PG_InterruptMaskRegisterOffset);
+}
+
+static inline
+bool DAC960_PG_InterruptsEnabledP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PG_InterruptMaskRegister_T InterruptMaskRegister;
+ InterruptMaskRegister.All =
+ readl(ControllerBaseAddress + DAC960_PG_InterruptMaskRegisterOffset);
+ return !InterruptMaskRegister.Bits.DisableInterrupts;
+}
+
+static inline
+void DAC960_PG_WriteCommandMailbox(DAC960_V1_CommandMailbox_T
+ *MemoryCommandMailbox,
+ DAC960_V1_CommandMailbox_T
+ *CommandMailbox)
+{
+ MemoryCommandMailbox->Words[1] = CommandMailbox->Words[1];
+ MemoryCommandMailbox->Words[2] = CommandMailbox->Words[2];
+ MemoryCommandMailbox->Words[3] = CommandMailbox->Words[3];
+ wmb();
+ MemoryCommandMailbox->Words[0] = CommandMailbox->Words[0];
+ mb();
+}
+
+static inline
+void DAC960_PG_WriteHardwareMailbox(void __iomem *ControllerBaseAddress,
+ DAC960_V1_CommandMailbox_T *CommandMailbox)
+{
+ writel(CommandMailbox->Words[0],
+ ControllerBaseAddress + DAC960_PG_CommandOpcodeRegisterOffset);
+ writel(CommandMailbox->Words[1],
+ ControllerBaseAddress + DAC960_PG_MailboxRegister4Offset);
+ writel(CommandMailbox->Words[2],
+ ControllerBaseAddress + DAC960_PG_MailboxRegister8Offset);
+ writeb(CommandMailbox->Bytes[12],
+ ControllerBaseAddress + DAC960_PG_MailboxRegister12Offset);
+}
+
+static inline DAC960_V1_CommandIdentifier_T
+DAC960_PG_ReadStatusCommandIdentifier(void __iomem *ControllerBaseAddress)
+{
+ return readb(ControllerBaseAddress
+ + DAC960_PG_StatusCommandIdentifierRegOffset);
+}
+
+static inline DAC960_V1_CommandStatus_T
+DAC960_PG_ReadStatusRegister(void __iomem *ControllerBaseAddress)
+{
+ return readw(ControllerBaseAddress + DAC960_PG_StatusRegisterOffset);
+}
+
+static inline bool
+DAC960_PG_ReadErrorStatus(void __iomem *ControllerBaseAddress,
+ unsigned char *ErrorStatus,
+ unsigned char *Parameter0,
+ unsigned char *Parameter1)
+{
+ DAC960_PG_ErrorStatusRegister_T ErrorStatusRegister;
+ ErrorStatusRegister.All =
+ readb(ControllerBaseAddress + DAC960_PG_ErrorStatusRegisterOffset);
+ if (!ErrorStatusRegister.Bits.ErrorStatusPending) return false;
+ ErrorStatusRegister.Bits.ErrorStatusPending = false;
+ *ErrorStatus = ErrorStatusRegister.All;
+ *Parameter0 =
+ readb(ControllerBaseAddress + DAC960_PG_CommandOpcodeRegisterOffset);
+ *Parameter1 =
+ readb(ControllerBaseAddress + DAC960_PG_CommandIdentifierRegisterOffset);
+ writeb(0, ControllerBaseAddress + DAC960_PG_ErrorStatusRegisterOffset);
+ return true;
+}
+
+/*
+ Define the DAC960 PD Series Controller Interface Register Offsets.
+*/
+
+#define DAC960_PD_RegisterWindowSize 0x80
+
+typedef enum
+{
+ DAC960_PD_CommandOpcodeRegisterOffset = 0x00,
+ DAC960_PD_CommandIdentifierRegisterOffset = 0x01,
+ DAC960_PD_MailboxRegister2Offset = 0x02,
+ DAC960_PD_MailboxRegister3Offset = 0x03,
+ DAC960_PD_MailboxRegister4Offset = 0x04,
+ DAC960_PD_MailboxRegister5Offset = 0x05,
+ DAC960_PD_MailboxRegister6Offset = 0x06,
+ DAC960_PD_MailboxRegister7Offset = 0x07,
+ DAC960_PD_MailboxRegister8Offset = 0x08,
+ DAC960_PD_MailboxRegister9Offset = 0x09,
+ DAC960_PD_MailboxRegister10Offset = 0x0A,
+ DAC960_PD_MailboxRegister11Offset = 0x0B,
+ DAC960_PD_MailboxRegister12Offset = 0x0C,
+ DAC960_PD_StatusCommandIdentifierRegOffset = 0x0D,
+ DAC960_PD_StatusRegisterOffset = 0x0E,
+ DAC960_PD_ErrorStatusRegisterOffset = 0x3F,
+ DAC960_PD_InboundDoorBellRegisterOffset = 0x40,
+ DAC960_PD_OutboundDoorBellRegisterOffset = 0x41,
+ DAC960_PD_InterruptEnableRegisterOffset = 0x43
+}
+DAC960_PD_RegisterOffsets_T;
+
+
+/*
+ Define the structure of the DAC960 PD Series Inbound Door Bell Register.
+*/
+
+typedef union DAC960_PD_InboundDoorBellRegister
+{
+ unsigned char All;
+ struct {
+ bool NewCommand:1; /* Bit 0 */
+ bool AcknowledgeStatus:1; /* Bit 1 */
+ bool GenerateInterrupt:1; /* Bit 2 */
+ bool ControllerReset:1; /* Bit 3 */
+ unsigned char :4; /* Bits 4-7 */
+ } Write;
+ struct {
+ bool MailboxFull:1; /* Bit 0 */
+ bool InitializationInProgress:1; /* Bit 1 */
+ unsigned char :6; /* Bits 2-7 */
+ } Read;
+}
+DAC960_PD_InboundDoorBellRegister_T;
+
+
+/*
+ Define the structure of the DAC960 PD Series Outbound Door Bell Register.
+*/
+
+typedef union DAC960_PD_OutboundDoorBellRegister
+{
+ unsigned char All;
+ struct {
+ bool AcknowledgeInterrupt:1; /* Bit 0 */
+ unsigned char :7; /* Bits 1-7 */
+ } Write;
+ struct {
+ bool StatusAvailable:1; /* Bit 0 */
+ unsigned char :7; /* Bits 1-7 */
+ } Read;
+}
+DAC960_PD_OutboundDoorBellRegister_T;
+
+
+/*
+ Define the structure of the DAC960 PD Series Interrupt Enable Register.
+*/
+
+typedef union DAC960_PD_InterruptEnableRegister
+{
+ unsigned char All;
+ struct {
+ bool EnableInterrupts:1; /* Bit 0 */
+ unsigned char :7; /* Bits 1-7 */
+ } Bits;
+}
+DAC960_PD_InterruptEnableRegister_T;
+
+
+/*
+ Define the structure of the DAC960 PD Series Error Status Register.
+*/
+
+typedef union DAC960_PD_ErrorStatusRegister
+{
+ unsigned char All;
+ struct {
+ unsigned int :2; /* Bits 0-1 */
+ bool ErrorStatusPending:1; /* Bit 2 */
+ unsigned int :5; /* Bits 3-7 */
+ } Bits;
+}
+DAC960_PD_ErrorStatusRegister_T;
+
+
+/*
+ Define inline functions to provide an abstraction for reading and writing the
+ DAC960 PD Series Controller Interface Registers.
+*/
+
+static inline
+void DAC960_PD_NewCommand(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PD_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.NewCommand = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_PD_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_PD_AcknowledgeStatus(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PD_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.AcknowledgeStatus = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_PD_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_PD_GenerateInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PD_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.GenerateInterrupt = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_PD_InboundDoorBellRegisterOffset);
+}
+
+static inline
+void DAC960_PD_ControllerReset(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PD_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All = 0;
+ InboundDoorBellRegister.Write.ControllerReset = true;
+ writeb(InboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_PD_InboundDoorBellRegisterOffset);
+}
+
+static inline
+bool DAC960_PD_MailboxFullP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PD_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All =
+ readb(ControllerBaseAddress + DAC960_PD_InboundDoorBellRegisterOffset);
+ return InboundDoorBellRegister.Read.MailboxFull;
+}
+
+static inline
+bool DAC960_PD_InitializationInProgressP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PD_InboundDoorBellRegister_T InboundDoorBellRegister;
+ InboundDoorBellRegister.All =
+ readb(ControllerBaseAddress + DAC960_PD_InboundDoorBellRegisterOffset);
+ return InboundDoorBellRegister.Read.InitializationInProgress;
+}
+
+static inline
+void DAC960_PD_AcknowledgeInterrupt(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PD_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All = 0;
+ OutboundDoorBellRegister.Write.AcknowledgeInterrupt = true;
+ writeb(OutboundDoorBellRegister.All,
+ ControllerBaseAddress + DAC960_PD_OutboundDoorBellRegisterOffset);
+}
+
+static inline
+bool DAC960_PD_StatusAvailableP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PD_OutboundDoorBellRegister_T OutboundDoorBellRegister;
+ OutboundDoorBellRegister.All =
+ readb(ControllerBaseAddress + DAC960_PD_OutboundDoorBellRegisterOffset);
+ return OutboundDoorBellRegister.Read.StatusAvailable;
+}
+
+static inline
+void DAC960_PD_EnableInterrupts(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PD_InterruptEnableRegister_T InterruptEnableRegister;
+ InterruptEnableRegister.All = 0;
+ InterruptEnableRegister.Bits.EnableInterrupts = true;
+ writeb(InterruptEnableRegister.All,
+ ControllerBaseAddress + DAC960_PD_InterruptEnableRegisterOffset);
+}
+
+static inline
+void DAC960_PD_DisableInterrupts(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PD_InterruptEnableRegister_T InterruptEnableRegister;
+ InterruptEnableRegister.All = 0;
+ InterruptEnableRegister.Bits.EnableInterrupts = false;
+ writeb(InterruptEnableRegister.All,
+ ControllerBaseAddress + DAC960_PD_InterruptEnableRegisterOffset);
+}
+
+static inline
+bool DAC960_PD_InterruptsEnabledP(void __iomem *ControllerBaseAddress)
+{
+ DAC960_PD_InterruptEnableRegister_T InterruptEnableRegister;
+ InterruptEnableRegister.All =
+ readb(ControllerBaseAddress + DAC960_PD_InterruptEnableRegisterOffset);
+ return InterruptEnableRegister.Bits.EnableInterrupts;
+}
+
+static inline
+void DAC960_PD_WriteCommandMailbox(void __iomem *ControllerBaseAddress,
+ DAC960_V1_CommandMailbox_T *CommandMailbox)
+{
+ writel(CommandMailbox->Words[0],
+ ControllerBaseAddress + DAC960_PD_CommandOpcodeRegisterOffset);
+ writel(CommandMailbox->Words[1],
+ ControllerBaseAddress + DAC960_PD_MailboxRegister4Offset);
+ writel(CommandMailbox->Words[2],
+ ControllerBaseAddress + DAC960_PD_MailboxRegister8Offset);
+ writeb(CommandMailbox->Bytes[12],
+ ControllerBaseAddress + DAC960_PD_MailboxRegister12Offset);
+}
+
+static inline DAC960_V1_CommandIdentifier_T
+DAC960_PD_ReadStatusCommandIdentifier(void __iomem *ControllerBaseAddress)
+{
+ return readb(ControllerBaseAddress
+ + DAC960_PD_StatusCommandIdentifierRegOffset);
+}
+
+static inline DAC960_V1_CommandStatus_T
+DAC960_PD_ReadStatusRegister(void __iomem *ControllerBaseAddress)
+{
+ return readw(ControllerBaseAddress + DAC960_PD_StatusRegisterOffset);
+}
+
+static inline bool
+DAC960_PD_ReadErrorStatus(void __iomem *ControllerBaseAddress,
+ unsigned char *ErrorStatus,
+ unsigned char *Parameter0,
+ unsigned char *Parameter1)
+{
+ DAC960_PD_ErrorStatusRegister_T ErrorStatusRegister;
+ ErrorStatusRegister.All =
+ readb(ControllerBaseAddress + DAC960_PD_ErrorStatusRegisterOffset);
+ if (!ErrorStatusRegister.Bits.ErrorStatusPending) return false;
+ ErrorStatusRegister.Bits.ErrorStatusPending = false;
+ *ErrorStatus = ErrorStatusRegister.All;
+ *Parameter0 =
+ readb(ControllerBaseAddress + DAC960_PD_CommandOpcodeRegisterOffset);
+ *Parameter1 =
+ readb(ControllerBaseAddress + DAC960_PD_CommandIdentifierRegisterOffset);
+ writeb(0, ControllerBaseAddress + DAC960_PD_ErrorStatusRegisterOffset);
+ return true;
+}
+
+static inline void DAC960_P_To_PD_TranslateEnquiry(void *Enquiry)
+{
+ memcpy(Enquiry + 132, Enquiry + 36, 64);
+ memset(Enquiry + 36, 0, 96);
+}
+
+static inline void DAC960_P_To_PD_TranslateDeviceState(void *DeviceState)
+{
+ memcpy(DeviceState + 2, DeviceState + 3, 1);
+ memmove(DeviceState + 4, DeviceState + 5, 2);
+ memmove(DeviceState + 6, DeviceState + 8, 4);
+}
+
+static inline
+void DAC960_PD_To_P_TranslateReadWriteCommand(DAC960_V1_CommandMailbox_T
+ *CommandMailbox)
+{
+ int LogicalDriveNumber = CommandMailbox->Type5.LD.LogicalDriveNumber;
+ CommandMailbox->Bytes[3] &= 0x7;
+ CommandMailbox->Bytes[3] |= CommandMailbox->Bytes[7] << 6;
+ CommandMailbox->Bytes[7] = LogicalDriveNumber;
+}
+
+static inline
+void DAC960_P_To_PD_TranslateReadWriteCommand(DAC960_V1_CommandMailbox_T
+ *CommandMailbox)
+{
+ int LogicalDriveNumber = CommandMailbox->Bytes[7];
+ CommandMailbox->Bytes[7] = CommandMailbox->Bytes[3] >> 6;
+ CommandMailbox->Bytes[3] &= 0x7;
+ CommandMailbox->Bytes[3] |= LogicalDriveNumber << 3;
+}
+
+
+/*
+ Define prototypes for the forward referenced DAC960 Driver Internal Functions.
+*/
+
+static void DAC960_FinalizeController(DAC960_Controller_T *);
+static void DAC960_V1_QueueReadWriteCommand(DAC960_Command_T *);
+static void DAC960_V2_QueueReadWriteCommand(DAC960_Command_T *);
+static void DAC960_RequestFunction(struct request_queue *);
+static irqreturn_t DAC960_BA_InterruptHandler(int, void *);
+static irqreturn_t DAC960_LP_InterruptHandler(int, void *);
+static irqreturn_t DAC960_LA_InterruptHandler(int, void *);
+static irqreturn_t DAC960_PG_InterruptHandler(int, void *);
+static irqreturn_t DAC960_PD_InterruptHandler(int, void *);
+static irqreturn_t DAC960_P_InterruptHandler(int, void *);
+static void DAC960_V1_QueueMonitoringCommand(DAC960_Command_T *);
+static void DAC960_V2_QueueMonitoringCommand(DAC960_Command_T *);
+static void DAC960_MonitoringTimerFunction(unsigned long);
+static void DAC960_Message(DAC960_MessageLevel_T, unsigned char *,
+ DAC960_Controller_T *, ...);
+static void DAC960_CreateProcEntries(DAC960_Controller_T *);
+static void DAC960_DestroyProcEntries(DAC960_Controller_T *);
+
+#endif /* DAC960_DriverVersion */
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
new file mode 100644
index 000000000..3ccef9eba
--- /dev/null
+++ b/drivers/block/Kconfig
@@ -0,0 +1,573 @@
+#
+# Block device driver configuration
+#
+
+menuconfig BLK_DEV
+ bool "Block devices"
+ depends on BLOCK
+ default y
+ ---help---
+ Say Y here to get to see options for various different block device
+ drivers. This option alone does not add any kernel code.
+
+ If you say N, all options in this submenu will be skipped and disabled;
+ only do this if you know what you are doing.
+
+if BLK_DEV
+
+config BLK_DEV_NULL_BLK
+ tristate "Null test block driver"
+
+config BLK_DEV_FD
+ tristate "Normal floppy disk support"
+ depends on ARCH_MAY_HAVE_PC_FDC
+ ---help---
+ If you want to use the floppy disk drive(s) of your PC under Linux,
+ say Y. Information about this driver, especially important for IBM
+ Thinkpad users, is contained in
+ <file:Documentation/blockdev/floppy.txt>.
+ That file also contains the location of the Floppy driver FAQ as
+ well as location of the fdutils package used to configure additional
+ parameters of the driver at run time.
+
+ To compile this driver as a module, choose M here: the
+ module will be called floppy.
+
+config AMIGA_FLOPPY
+ tristate "Amiga floppy support"
+ depends on AMIGA
+
+config ATARI_FLOPPY
+ tristate "Atari floppy support"
+ depends on ATARI
+
+config MAC_FLOPPY
+ tristate "Support for PowerMac floppy"
+ depends on PPC_PMAC && !PPC_PMAC64
+ help
+ If you have a SWIM-3 (Super Woz Integrated Machine 3; from Apple)
+ floppy controller, say Y here. Most commonly found in PowerMacs.
+
+config BLK_DEV_SWIM
+ tristate "Support for SWIM Macintosh floppy"
+ depends on M68K && MAC
+ help
+ You should select this option if you want floppy support
+ and you don't have a II, IIfx, Q900, Q950 or AV series.
+
+config AMIGA_Z2RAM
+ tristate "Amiga Zorro II ramdisk support"
+ depends on ZORRO
+ help
+ This enables support for using Chip RAM and Zorro II RAM as a
+ ramdisk or as a swap partition. Say Y if you want to include this
+ driver in the kernel.
+
+ To compile this driver as a module, choose M here: the
+ module will be called z2ram.
+
+config GDROM
+ tristate "SEGA Dreamcast GD-ROM drive"
+ depends on SH_DREAMCAST
+ help
+ A standard SEGA Dreamcast comes with a modified CD ROM drive called a
+ "GD-ROM" by SEGA to signify it is capable of reading special disks
+ with up to 1 GB of data. This drive will also read standard CD ROM
+ disks. Select this option to access any disks in your GD ROM drive.
+ Most users will want to say "Y" here.
+ You can also build this as a module which will be called gdrom.
+
+config PARIDE
+ tristate "Parallel port IDE device support"
+ depends on PARPORT_PC
+ ---help---
+ There are many external CD-ROM and disk devices that connect through
+ your computer's parallel port. Most of them are actually IDE devices
+ using a parallel port IDE adapter. This option enables the PARIDE
+ subsystem which contains drivers for many of these external drives.
+ Read <file:Documentation/blockdev/paride.txt> for more information.
+
+ If you have said Y to the "Parallel-port support" configuration
+ option, you may share a single port between your printer and other
+ parallel port devices. Answer Y to build PARIDE support into your
+ kernel, or M if you would like to build it as a loadable module. If
+ your parallel port support is in a loadable module, you must build
+ PARIDE as a module. If you built PARIDE support into your kernel,
+ you may still build the individual protocol modules and high-level
+ drivers as loadable modules. If you build this support as a module,
+ it will be called paride.
+
+ To use the PARIDE support, you must say Y or M here and also to at
+ least one high-level driver (e.g. "Parallel port IDE disks",
+ "Parallel port ATAPI CD-ROMs", "Parallel port ATAPI disks" etc.) and
+ to at least one protocol driver (e.g. "ATEN EH-100 protocol",
+ "MicroSolutions backpack protocol", "DataStor Commuter protocol"
+ etc.).
+
+source "drivers/block/paride/Kconfig"
+
+source "drivers/block/mtip32xx/Kconfig"
+
+source "drivers/block/zram/Kconfig"
+
+config BLK_CPQ_DA
+ tristate "Compaq SMART2 support"
+ depends on PCI && VIRT_TO_BUS && 0
+ help
+ This is the driver for Compaq Smart Array controllers. Everyone
+ using these boards should say Y here. See the file
+ <file:Documentation/blockdev/cpqarray.txt> for the current list of
+ boards supported by this driver, and for further information on the
+ use of this driver.
+
+config BLK_CPQ_CISS_DA
+ tristate "Compaq Smart Array 5xxx support"
+ depends on PCI
+ select CHECK_SIGNATURE
+ help
+ This is the driver for Compaq Smart Array 5xxx controllers.
+ Everyone using these boards should say Y here.
+ See <file:Documentation/blockdev/cciss.txt> for the current list of
+ boards supported by this driver, and for further information
+ on the use of this driver.
+
+config CISS_SCSI_TAPE
+ bool "SCSI tape drive support for Smart Array 5xxx"
+ depends on BLK_CPQ_CISS_DA && PROC_FS
+ depends on SCSI=y || SCSI=BLK_CPQ_CISS_DA
+ help
+ When enabled (Y), this option allows SCSI tape drives and SCSI medium
+ changers (tape robots) to be accessed via a Compaq 5xxx array
+ controller. (See <file:Documentation/blockdev/cciss.txt> for more details.)
+
+ "SCSI support" and "SCSI tape support" must also be enabled for this
+ option to work.
+
+ When this option is disabled (N), the SCSI portion of the driver
+ is not compiled.
+
+config BLK_DEV_DAC960
+ tristate "Mylex DAC960/DAC1100 PCI RAID Controller support"
+ depends on PCI
+ help
+ This driver adds support for the Mylex DAC960, AcceleRAID, and
+ eXtremeRAID PCI RAID controllers. See the file
+ <file:Documentation/blockdev/README.DAC960> for further information
+ about this driver.
+
+ To compile this driver as a module, choose M here: the
+ module will be called DAC960.
+
+config BLK_DEV_UMEM
+ tristate "Micro Memory MM5415 Battery Backed RAM support"
+ depends on PCI
+ ---help---
+ Saying Y here will include support for the MM5415 family of
+ battery backed (Non-volatile) RAM cards.
+ <http://www.umem.com/>
+
+ The cards appear as block devices that can be partitioned into
+ as many as 15 partitions.
+
+ To compile this driver as a module, choose M here: the
+ module will be called umem.
+
+ The umem driver has not yet been allocated a MAJOR number, so
+ one is chosen dynamically.
+
+config BLK_DEV_UBD
+ bool "Virtual block device"
+ depends on UML
+ ---help---
+ The User-Mode Linux port includes a driver called UBD which will let
+ you access arbitrary files on the host computer as block devices.
+ Unless you know that you do not need such virtual block devices say
+ Y here.
+
+config BLK_DEV_UBD_SYNC
+ bool "Always do synchronous disk IO for UBD"
+ depends on BLK_DEV_UBD
+ ---help---
+ Writes to the virtual block device are not immediately written to the
+ host's disk; this may cause problems if, for example, the User-Mode
+ Linux 'Virtual Machine' uses a journalling filesystem and the host
+ computer crashes.
+
+ Synchronous operation (i.e. always writing data to the host's disk
+ immediately) is configurable on a per-UBD basis by using a special
+ kernel command line option. Alternatively, you can say Y here to
+ turn on synchronous operation by default for all block devices.
+
+ If you're running a journalling file system (like reiserfs, for
+ example) in your virtual machine, you will want to say Y here. If
+ you care for the safety of the data in your virtual machine, Y is a
+ wise choice too. In all other cases (for example, if you're just
+ playing around with User-Mode Linux) you can choose N.
+
+config BLK_DEV_COW_COMMON
+ bool
+ default BLK_DEV_UBD
+
+config BLK_DEV_LOOP
+ tristate "Loopback device support"
+ ---help---
+ Saying Y here will allow you to use a regular file as a block
+ device; you can then create a file system on that block device and
+ mount it just as you would mount other block devices such as hard
+ drive partitions, CD-ROM drives or floppy drives. The loop devices
+ are block special device files with major number 7 and typically
+ called /dev/loop0, /dev/loop1 etc.
+
+ This is useful if you want to check an ISO 9660 file system before
+ burning the CD, or if you want to use floppy images without first
+ writing them to floppy. Furthermore, some Linux distributions avoid
+ the need for a dedicated Linux partition by keeping their complete
+ root file system inside a DOS FAT file using this loop device
+ driver.
+
+ To use the loop device, you need the losetup utility, found in the
+ util-linux package, see
+ <ftp://ftp.kernel.org/pub/linux/utils/util-linux/>.
+
+ The loop device driver can also be used to "hide" a file system in
+ a disk partition, floppy, or regular file, either using encryption
+ (scrambling the data) or steganography (hiding the data in the low
+ bits of, say, a sound file). This is also safe if the file resides
+ on a remote file server.
+
+ There are several ways of encrypting disks. Some of these require
+ kernel patches. The vanilla kernel offers the cryptoloop option
+ and a Device Mapper target (which is superior, as it supports all
+ file systems). If you want to use the cryptoloop, say Y to both
+ LOOP and CRYPTOLOOP, and make sure you have a recent (version 2.12
+ or later) version of util-linux. Additionally, be aware that
+ the cryptoloop is not safe for storing journaled filesystems.
+
+ Note that this loop device has nothing to do with the loopback
+ device used for network connections from the machine to itself.
+
+ To compile this driver as a module, choose M here: the
+ module will be called loop.
+
+ Most users will answer N here.
+
+config BLK_DEV_LOOP_MIN_COUNT
+ int "Number of loop devices to pre-create at init time"
+ depends on BLK_DEV_LOOP
+ default 8
+ help
+ Static number of loop devices to be unconditionally pre-created
+ at init time.
+
+ This default value can be overwritten on the kernel command
+ line or with module-parameter loop.max_loop.
+
+ The historic default is 8. If a late 2011 version of losetup(8)
+ is used, it can be set to 0, since needed loop devices can be
+ dynamically allocated with the /dev/loop-control interface.
+
+config BLK_DEV_CRYPTOLOOP
+ tristate "Cryptoloop Support"
+ select CRYPTO
+ select CRYPTO_CBC
+ depends on BLK_DEV_LOOP
+ ---help---
+ Say Y here if you want to be able to use the ciphers that are
+ provided by the CryptoAPI as loop transformation. This might be
+ used as hard disk encryption.
+
+ WARNING: This device is not safe for journaled file systems like
+ ext3 or Reiserfs. Please use the Device Mapper crypto module
+ instead, which can be configured to be on-disk compatible with the
+ cryptoloop device.
+
+source "drivers/block/drbd/Kconfig"
+
+config BLK_DEV_NBD
+ tristate "Network block device support"
+ depends on NET
+ ---help---
+ Saying Y here will allow your computer to be a client for network
+ block devices, i.e. it will be able to use block devices exported by
+ servers (mount file systems on them etc.). Communication between
+ client and server works over TCP/IP networking, but to the client
+ program this is hidden: it looks like a regular local file access to
+ a block device special file such as /dev/nd0.
+
+ Network block devices also allows you to run a block-device in
+ userland (making server and client physically the same computer,
+ communicating using the loopback network device).
+
+ Read <file:Documentation/blockdev/nbd.txt> for more information,
+ especially about where to find the server code, which runs in user
+ space and does not need special kernel support.
+
+ Note that this has nothing to do with the network file systems NFS
+ or Coda; you can say N here even if you intend to use NFS or Coda.
+
+ To compile this driver as a module, choose M here: the
+ module will be called nbd.
+
+ If unsure, say N.
+
+config BLK_DEV_NVME
+ tristate "NVM Express block device"
+ depends on PCI
+ ---help---
+ The NVM Express driver is for solid state drives directly
+ connected to the PCI or PCI Express bus. If you know you
+ don't have one of these, it is safe to answer N.
+
+ To compile this driver as a module, choose M here: the
+ module will be called nvme.
+
+config BLK_DEV_SKD
+ tristate "STEC S1120 Block Driver"
+ depends on PCI
+ depends on 64BIT
+ ---help---
+ Saying Y or M here will enable support for the
+ STEC, Inc. S1120 PCIe SSD.
+
+ Use device /dev/skd$N amd /dev/skd$Np$M.
+
+config BLK_DEV_OSD
+ tristate "OSD object-as-blkdev support"
+ depends on SCSI_OSD_ULD
+ ---help---
+ Saying Y or M here will allow the exporting of a single SCSI
+ OSD (object-based storage) object as a Linux block device.
+
+ For example, if you create a 2G object on an OSD device,
+ you can then use this module to present that 2G object as
+ a Linux block device.
+
+ To compile this driver as a module, choose M here: the
+ module will be called osdblk.
+
+ If unsure, say N.
+
+config BLK_DEV_SX8
+ tristate "Promise SATA SX8 support"
+ depends on PCI
+ ---help---
+ Saying Y or M here will enable support for the
+ Promise SATA SX8 controllers.
+
+ Use devices /dev/sx8/$N and /dev/sx8/$Np$M.
+
+config BLK_DEV_RAM
+ tristate "RAM block device support"
+ ---help---
+ Saying Y here will allow you to use a portion of your RAM memory as
+ a block device, so that you can make file systems on it, read and
+ write to it and do all the other things that you can do with normal
+ block devices (such as hard drives). It is usually used to load and
+ store a copy of a minimal root file system off of a floppy into RAM
+ during the initial install of Linux.
+
+ Note that the kernel command line option "ramdisk=XX" is now obsolete.
+ For details, read <file:Documentation/blockdev/ramdisk.txt>.
+
+ To compile this driver as a module, choose M here: the
+ module will be called brd. An alias "rd" has been defined
+ for historical reasons.
+
+ Most normal users won't need the RAM disk functionality, and can
+ thus say N here.
+
+config BLK_DEV_RAM_COUNT
+ int "Default number of RAM disks"
+ default "16"
+ depends on BLK_DEV_RAM
+ help
+ The default value is 16 RAM disks. Change this if you know what you
+ are doing. If you boot from a filesystem that needs to be extracted
+ in memory, you will need at least one RAM disk (e.g. root on cramfs).
+
+config BLK_DEV_RAM_SIZE
+ int "Default RAM disk size (kbytes)"
+ depends on BLK_DEV_RAM
+ default "4096"
+ help
+ The default value is 4096 kilobytes. Only change this if you know
+ what you are doing.
+
+config BLK_DEV_RAM_DAX
+ bool "Support Direct Access (DAX) to RAM block devices"
+ depends on BLK_DEV_RAM && FS_DAX
+ default n
+ help
+ Support filesystems using DAX to access RAM block devices. This
+ avoids double-buffering data in the page cache before copying it
+ to the block device. Answering Y will slightly enlarge the kernel,
+ and will prevent RAM block device backing store memory from being
+ allocated from highmem (only a problem for highmem systems).
+
+config BLK_DEV_PMEM
+ tristate "Persistent memory block device support"
+ depends on HAS_IOMEM
+ help
+ Saying Y here will allow you to use a contiguous range of reserved
+ memory as one or more persistent block devices.
+
+ To compile this driver as a module, choose M here: the module will be
+ called 'pmem'.
+
+ If unsure, say N.
+
+config CDROM_PKTCDVD
+ tristate "Packet writing on CD/DVD media"
+ depends on !UML
+ help
+ If you have a CDROM/DVD drive that supports packet writing, say
+ Y to include support. It should work with any MMC/Mt Fuji
+ compliant ATAPI or SCSI drive, which is just about any newer
+ DVD/CD writer.
+
+ Currently only writing to CD-RW, DVD-RW, DVD+RW and DVDRAM discs
+ is possible.
+ DVD-RW disks must be in restricted overwrite mode.
+
+ See the file <file:Documentation/cdrom/packet-writing.txt>
+ for further information on the use of this driver.
+
+ To compile this driver as a module, choose M here: the
+ module will be called pktcdvd.
+
+config CDROM_PKTCDVD_BUFFERS
+ int "Free buffers for data gathering"
+ depends on CDROM_PKTCDVD
+ default "8"
+ help
+ This controls the maximum number of active concurrent packets. More
+ concurrent packets can increase write performance, but also require
+ more memory. Each concurrent packet will require approximately 64Kb
+ of non-swappable kernel memory, memory which will be allocated when
+ a disc is opened for writing.
+
+config CDROM_PKTCDVD_WCACHE
+ bool "Enable write caching"
+ depends on CDROM_PKTCDVD
+ help
+ If enabled, write caching will be set for the CD-R/W device. For now
+ this option is dangerous unless the CD-RW media is known good, as we
+ don't do deferred write error handling yet.
+
+config ATA_OVER_ETH
+ tristate "ATA over Ethernet support"
+ depends on NET
+ help
+ This driver provides Support for ATA over Ethernet block
+ devices like the Coraid EtherDrive (R) Storage Blade.
+
+config MG_DISK
+ tristate "mGine mflash, gflash support"
+ depends on ARM && GPIOLIB
+ help
+ mGine mFlash(gFlash) block device driver
+
+config MG_DISK_RES
+ int "Size of reserved area before MBR"
+ depends on MG_DISK
+ default 0
+ help
+ Define size of reserved area that usually used for boot. Unit is KB.
+ All of the block device operation will be taken this value as start
+ offset
+ Examples:
+ 1024 => 1 MB
+
+config SUNVDC
+ tristate "Sun Virtual Disk Client support"
+ depends on SUN_LDOMS
+ help
+ Support for virtual disk devices as a client under Sun
+ Logical Domains.
+
+source "drivers/s390/block/Kconfig"
+
+config XILINX_SYSACE
+ tristate "Xilinx SystemACE support"
+ depends on 4xx || MICROBLAZE
+ help
+ Include support for the Xilinx SystemACE CompactFlash interface
+
+config XEN_BLKDEV_FRONTEND
+ tristate "Xen virtual block device support"
+ depends on XEN
+ default y
+ select XEN_XENBUS_FRONTEND
+ help
+ This driver implements the front-end of the Xen virtual
+ block device driver. It communicates with a back-end driver
+ in another domain which drives the actual block device.
+
+config XEN_BLKDEV_BACKEND
+ tristate "Xen block-device backend driver"
+ depends on XEN_BACKEND
+ help
+ The block-device backend driver allows the kernel to export its
+ block devices to other guests via a high-performance shared-memory
+ interface.
+
+ The corresponding Linux frontend driver is enabled by the
+ CONFIG_XEN_BLKDEV_FRONTEND configuration option.
+
+ The backend driver attaches itself to a any block device specified
+ in the XenBus configuration. There are no limits to what the block
+ device as long as it has a major and minor.
+
+ If you are compiling a kernel to run in a Xen block backend driver
+ domain (often this is domain 0) you should say Y here. To
+ compile this driver as a module, chose M here: the module
+ will be called xen-blkback.
+
+
+config VIRTIO_BLK
+ tristate "Virtio block driver"
+ depends on VIRTIO
+ ---help---
+ This is the virtual block driver for virtio. It can be used with
+ lguest or QEMU based VMMs (like KVM or Xen). Say Y or M.
+
+config BLK_DEV_HD
+ bool "Very old hard disk (MFM/RLL/IDE) driver"
+ depends on HAVE_IDE
+ depends on !ARM || ARCH_RPC || BROKEN
+ help
+ This is a very old hard disk driver that lacks the enhanced
+ functionality of the newer ones.
+
+ It is required for systems with ancient MFM/RLL/ESDI drives.
+
+ If unsure, say N.
+
+config BLK_DEV_RBD
+ tristate "Rados block device (RBD)"
+ depends on INET && BLOCK
+ select CEPH_LIB
+ select LIBCRC32C
+ select CRYPTO_AES
+ select CRYPTO
+ default n
+ help
+ Say Y here if you want include the Rados block device, which stripes
+ a block device over objects stored in the Ceph distributed object
+ store.
+
+ More information at http://ceph.newdream.net/.
+
+ If unsure, say N.
+
+config BLK_DEV_RSXX
+ tristate "IBM Flash Adapter 900GB Full Height PCIe Device Driver"
+ depends on PCI
+ help
+ Device driver for IBM's high speed PCIe SSD
+ storage device: Flash Adapter 900GB Full Height.
+
+ To compile this driver as a module, choose M here: the
+ module will be called rsxx.
+
+endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
new file mode 100644
index 000000000..9cc6c18a1
--- /dev/null
+++ b/drivers/block/Makefile
@@ -0,0 +1,50 @@
+#
+# Makefile for the kernel block device drivers.
+#
+# 12 June 2000, Christoph Hellwig <hch@infradead.org>
+# Rewritten to use lists instead of if-statements.
+#
+
+obj-$(CONFIG_MAC_FLOPPY) += swim3.o
+obj-$(CONFIG_BLK_DEV_SWIM) += swim_mod.o
+obj-$(CONFIG_BLK_DEV_FD) += floppy.o
+obj-$(CONFIG_AMIGA_FLOPPY) += amiflop.o
+obj-$(CONFIG_PS3_DISK) += ps3disk.o
+obj-$(CONFIG_PS3_VRAM) += ps3vram.o
+obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o
+obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o
+obj-$(CONFIG_BLK_DEV_RAM) += brd.o
+obj-$(CONFIG_BLK_DEV_PMEM) += pmem.o
+obj-$(CONFIG_BLK_DEV_LOOP) += loop.o
+obj-$(CONFIG_BLK_CPQ_DA) += cpqarray.o
+obj-$(CONFIG_BLK_CPQ_CISS_DA) += cciss.o
+obj-$(CONFIG_BLK_DEV_DAC960) += DAC960.o
+obj-$(CONFIG_XILINX_SYSACE) += xsysace.o
+obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o
+obj-$(CONFIG_MG_DISK) += mg_disk.o
+obj-$(CONFIG_SUNVDC) += sunvdc.o
+obj-$(CONFIG_BLK_DEV_NVME) += nvme.o
+obj-$(CONFIG_BLK_DEV_SKD) += skd.o
+obj-$(CONFIG_BLK_DEV_OSD) += osdblk.o
+
+obj-$(CONFIG_BLK_DEV_UMEM) += umem.o
+obj-$(CONFIG_BLK_DEV_NBD) += nbd.o
+obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryptoloop.o
+obj-$(CONFIG_VIRTIO_BLK) += virtio_blk.o
+
+obj-$(CONFIG_BLK_DEV_SX8) += sx8.o
+obj-$(CONFIG_BLK_DEV_HD) += hd.o
+
+obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
+obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/
+obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
+obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
+obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/
+
+obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
+obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o
+obj-$(CONFIG_ZRAM) += zram/
+
+nvme-y := nvme-core.o nvme-scsi.o
+skd-y := skd_main.o
+swim_mod-y := swim.o swim_asm.o
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
new file mode 100644
index 000000000..5fd50a284
--- /dev/null
+++ b/drivers/block/amiflop.c
@@ -0,0 +1,1893 @@
+/*
+ * linux/amiga/amiflop.c
+ *
+ * Copyright (C) 1993 Greg Harp
+ * Portions of this driver are based on code contributed by Brad Pepers
+ *
+ * revised 28.5.95 by Joerg Dorchain
+ * - now no bugs(?) any more for both HD & DD
+ * - added support for 40 Track 5.25" drives, 80-track hopefully behaves
+ * like 3.5" dd (no way to test - are there any 5.25" drives out there
+ * that work on an A4000?)
+ * - wrote formatting routine (maybe dirty, but works)
+ *
+ * june/july 1995 added ms-dos support by Joerg Dorchain
+ * (portions based on messydos.device and various contributors)
+ * - currently only 9 and 18 sector disks
+ *
+ * - fixed a bug with the internal trackbuffer when using multiple
+ * disks the same time
+ * - made formatting a bit safer
+ * - added command line and machine based default for "silent" df0
+ *
+ * december 1995 adapted for 1.2.13pl4 by Joerg Dorchain
+ * - works but I think it's inefficient. (look in redo_fd_request)
+ * But the changes were very efficient. (only three and a half lines)
+ *
+ * january 1996 added special ioctl for tracking down read/write problems
+ * - usage ioctl(d, RAW_TRACK, ptr); the raw track buffer (MFM-encoded data
+ * is copied to area. (area should be large enough since no checking is
+ * done - 30K is currently sufficient). return the actual size of the
+ * trackbuffer
+ * - replaced udelays() by a timer (CIAA timer B) for the waits
+ * needed for the disk mechanic.
+ *
+ * february 1996 fixed error recovery and multiple disk access
+ * - both got broken the first time I tampered with the driver :-(
+ * - still not safe, but better than before
+ *
+ * revised Marts 3rd, 1996 by Jes Sorensen for use in the 1.3.28 kernel.
+ * - Minor changes to accept the kdev_t.
+ * - Replaced some more udelays with ms_delays. Udelay is just a loop,
+ * and so the delay will be different depending on the given
+ * processor :-(
+ * - The driver could use a major cleanup because of the new
+ * major/minor handling that came with kdev_t. It seems to work for
+ * the time being, but I can't guarantee that it will stay like
+ * that when we start using 16 (24?) bit minors.
+ *
+ * restructured jan 1997 by Joerg Dorchain
+ * - Fixed Bug accessing multiple disks
+ * - some code cleanup
+ * - added trackbuffer for each drive to speed things up
+ * - fixed some race conditions (who finds the next may send it to me ;-)
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/fd.h>
+#include <linux/hdreg.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/amifdreg.h>
+#include <linux/amifd.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/elevator.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+
+#include <asm/setup.h>
+#include <asm/uaccess.h>
+#include <asm/amigahw.h>
+#include <asm/amigaints.h>
+#include <asm/irq.h>
+
+#undef DEBUG /* print _LOTS_ of infos */
+
+#define RAW_IOCTL
+#ifdef RAW_IOCTL
+#define IOCTL_RAW_TRACK 0x5254524B /* 'RTRK' */
+#endif
+
+/*
+ * Defines
+ */
+
+/*
+ * Error codes
+ */
+#define FD_OK 0 /* operation succeeded */
+#define FD_ERROR -1 /* general error (seek, read, write, etc) */
+#define FD_NOUNIT 1 /* unit does not exist */
+#define FD_UNITBUSY 2 /* unit already active */
+#define FD_NOTACTIVE 3 /* unit is not active */
+#define FD_NOTREADY 4 /* unit is not ready (motor not on/no disk) */
+
+#define MFM_NOSYNC 1
+#define MFM_HEADER 2
+#define MFM_DATA 3
+#define MFM_TRACK 4
+
+/*
+ * Floppy ID values
+ */
+#define FD_NODRIVE 0x00000000 /* response when no unit is present */
+#define FD_DD_3 0xffffffff /* double-density 3.5" (880K) drive */
+#define FD_HD_3 0x55555555 /* high-density 3.5" (1760K) drive */
+#define FD_DD_5 0xaaaaaaaa /* double-density 5.25" (440K) drive */
+
+static DEFINE_MUTEX(amiflop_mutex);
+static unsigned long int fd_def_df0 = FD_DD_3; /* default for df0 if it doesn't identify */
+
+module_param(fd_def_df0, ulong, 0);
+MODULE_LICENSE("GPL");
+
+/*
+ * Macros
+ */
+#define MOTOR_ON (ciab.prb &= ~DSKMOTOR)
+#define MOTOR_OFF (ciab.prb |= DSKMOTOR)
+#define SELECT(mask) (ciab.prb &= ~mask)
+#define DESELECT(mask) (ciab.prb |= mask)
+#define SELMASK(drive) (1 << (3 + (drive & 3)))
+
+static struct fd_drive_type drive_types[] = {
+/* code name tr he rdsz wrsz sm pc1 pc2 sd st st*/
+/* warning: times are now in milliseconds (ms) */
+{ FD_DD_3, "DD 3.5", 80, 2, 14716, 13630, 1, 80,161, 3, 18, 1},
+{ FD_HD_3, "HD 3.5", 80, 2, 28344, 27258, 2, 80,161, 3, 18, 1},
+{ FD_DD_5, "DD 5.25", 40, 2, 14716, 13630, 1, 40, 81, 6, 30, 2},
+{ FD_NODRIVE, "No Drive", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
+};
+static int num_dr_types = ARRAY_SIZE(drive_types);
+
+static int amiga_read(int), dos_read(int);
+static void amiga_write(int), dos_write(int);
+static struct fd_data_type data_types[] = {
+ { "Amiga", 11 , amiga_read, amiga_write},
+ { "MS-Dos", 9, dos_read, dos_write}
+};
+
+/* current info on each unit */
+static struct amiga_floppy_struct unit[FD_MAX_UNITS];
+
+static struct timer_list flush_track_timer[FD_MAX_UNITS];
+static struct timer_list post_write_timer;
+static struct timer_list motor_on_timer;
+static struct timer_list motor_off_timer[FD_MAX_UNITS];
+static int on_attempts;
+
+/* Synchronization of FDC access */
+/* request loop (trackbuffer) */
+static volatile int fdc_busy = -1;
+static volatile int fdc_nested;
+static DECLARE_WAIT_QUEUE_HEAD(fdc_wait);
+
+static DECLARE_COMPLETION(motor_on_completion);
+
+static volatile int selected = -1; /* currently selected drive */
+
+static int writepending;
+static int writefromint;
+static char *raw_buf;
+static int fdc_queue;
+
+static DEFINE_SPINLOCK(amiflop_lock);
+
+#define RAW_BUF_SIZE 30000 /* size of raw disk data */
+
+/*
+ * These are global variables, as that's the easiest way to give
+ * information to interrupts. They are the data used for the current
+ * request.
+ */
+static volatile char block_flag;
+static DECLARE_WAIT_QUEUE_HEAD(wait_fd_block);
+
+/* MS-Dos MFM Coding tables (should go quick and easy) */
+static unsigned char mfmencode[16]={
+ 0x2a, 0x29, 0x24, 0x25, 0x12, 0x11, 0x14, 0x15,
+ 0x4a, 0x49, 0x44, 0x45, 0x52, 0x51, 0x54, 0x55
+};
+static unsigned char mfmdecode[128];
+
+/* floppy internal millisecond timer stuff */
+static DECLARE_COMPLETION(ms_wait_completion);
+#define MS_TICKS ((amiga_eclock+50)/1000)
+
+/*
+ * Note that MAX_ERRORS=X doesn't imply that we retry every bad read
+ * max X times - some types of errors increase the errorcount by 2 or
+ * even 3, so we might actually retry only X/2 times before giving up.
+ */
+#define MAX_ERRORS 12
+
+#define custom amiga_custom
+
+/* Prevent "aliased" accesses. */
+static int fd_ref[4] = { 0,0,0,0 };
+static int fd_device[4] = { 0, 0, 0, 0 };
+
+/*
+ * Here come the actual hardware access and helper functions.
+ * They are not reentrant and single threaded because all drives
+ * share the same hardware and the same trackbuffer.
+ */
+
+/* Milliseconds timer */
+
+static irqreturn_t ms_isr(int irq, void *dummy)
+{
+ complete(&ms_wait_completion);
+ return IRQ_HANDLED;
+}
+
+/* all waits are queued up
+ A more generic routine would do a schedule a la timer.device */
+static void ms_delay(int ms)
+{
+ int ticks;
+ static DEFINE_MUTEX(mutex);
+
+ if (ms > 0) {
+ mutex_lock(&mutex);
+ ticks = MS_TICKS*ms-1;
+ ciaa.tblo=ticks%256;
+ ciaa.tbhi=ticks/256;
+ ciaa.crb=0x19; /*count eclock, force load, one-shoot, start */
+ wait_for_completion(&ms_wait_completion);
+ mutex_unlock(&mutex);
+ }
+}
+
+/* Hardware semaphore */
+
+/* returns true when we would get the semaphore */
+static inline int try_fdc(int drive)
+{
+ drive &= 3;
+ return ((fdc_busy < 0) || (fdc_busy == drive));
+}
+
+static void get_fdc(int drive)
+{
+ unsigned long flags;
+
+ drive &= 3;
+#ifdef DEBUG
+ printk("get_fdc: drive %d fdc_busy %d fdc_nested %d\n",drive,fdc_busy,fdc_nested);
+#endif
+ local_irq_save(flags);
+ wait_event(fdc_wait, try_fdc(drive));
+ fdc_busy = drive;
+ fdc_nested++;
+ local_irq_restore(flags);
+}
+
+static inline void rel_fdc(void)
+{
+#ifdef DEBUG
+ if (fdc_nested == 0)
+ printk("fd: unmatched rel_fdc\n");
+ printk("rel_fdc: fdc_busy %d fdc_nested %d\n",fdc_busy,fdc_nested);
+#endif
+ fdc_nested--;
+ if (fdc_nested == 0) {
+ fdc_busy = -1;
+ wake_up(&fdc_wait);
+ }
+}
+
+static void fd_select (int drive)
+{
+ unsigned char prb = ~0;
+
+ drive&=3;
+#ifdef DEBUG
+ printk("selecting %d\n",drive);
+#endif
+ if (drive == selected)
+ return;
+ get_fdc(drive);
+ selected = drive;
+
+ if (unit[drive].track % 2 != 0)
+ prb &= ~DSKSIDE;
+ if (unit[drive].motor == 1)
+ prb &= ~DSKMOTOR;
+ ciab.prb |= (SELMASK(0)|SELMASK(1)|SELMASK(2)|SELMASK(3));
+ ciab.prb = prb;
+ prb &= ~SELMASK(drive);
+ ciab.prb = prb;
+ rel_fdc();
+}
+
+static void fd_deselect (int drive)
+{
+ unsigned char prb;
+ unsigned long flags;
+
+ drive&=3;
+#ifdef DEBUG
+ printk("deselecting %d\n",drive);
+#endif
+ if (drive != selected) {
+ printk(KERN_WARNING "Deselecting drive %d while %d was selected!\n",drive,selected);
+ return;
+ }
+
+ get_fdc(drive);
+ local_irq_save(flags);
+
+ selected = -1;
+
+ prb = ciab.prb;
+ prb |= (SELMASK(0)|SELMASK(1)|SELMASK(2)|SELMASK(3));
+ ciab.prb = prb;
+
+ local_irq_restore (flags);
+ rel_fdc();
+
+}
+
+static void motor_on_callback(unsigned long nr)
+{
+ if (!(ciaa.pra & DSKRDY) || --on_attempts == 0) {
+ complete_all(&motor_on_completion);
+ } else {
+ motor_on_timer.expires = jiffies + HZ/10;
+ add_timer(&motor_on_timer);
+ }
+}
+
+static int fd_motor_on(int nr)
+{
+ nr &= 3;
+
+ del_timer(motor_off_timer + nr);
+
+ if (!unit[nr].motor) {
+ unit[nr].motor = 1;
+ fd_select(nr);
+
+ reinit_completion(&motor_on_completion);
+ motor_on_timer.data = nr;
+ mod_timer(&motor_on_timer, jiffies + HZ/2);
+
+ on_attempts = 10;
+ wait_for_completion(&motor_on_completion);
+ fd_deselect(nr);
+ }
+
+ if (on_attempts == 0) {
+ on_attempts = -1;
+#if 0
+ printk (KERN_ERR "motor_on failed, turning motor off\n");
+ fd_motor_off (nr);
+ return 0;
+#else
+ printk (KERN_WARNING "DSKRDY not set after 1.5 seconds - assuming drive is spinning notwithstanding\n");
+#endif
+ }
+
+ return 1;
+}
+
+static void fd_motor_off(unsigned long drive)
+{
+ long calledfromint;
+#ifdef MODULE
+ long decusecount;
+
+ decusecount = drive & 0x40000000;
+#endif
+ calledfromint = drive & 0x80000000;
+ drive&=3;
+ if (calledfromint && !try_fdc(drive)) {
+ /* We would be blocked in an interrupt, so try again later */
+ motor_off_timer[drive].expires = jiffies + 1;
+ add_timer(motor_off_timer + drive);
+ return;
+ }
+ unit[drive].motor = 0;
+ fd_select(drive);
+ udelay (1);
+ fd_deselect(drive);
+}
+
+static void floppy_off (unsigned int nr)
+{
+ int drive;
+
+ drive = nr & 3;
+ /* called this way it is always from interrupt */
+ motor_off_timer[drive].data = nr | 0x80000000;
+ mod_timer(motor_off_timer + drive, jiffies + 3*HZ);
+}
+
+static int fd_calibrate(int drive)
+{
+ unsigned char prb;
+ int n;
+
+ drive &= 3;
+ get_fdc(drive);
+ if (!fd_motor_on (drive))
+ return 0;
+ fd_select (drive);
+ prb = ciab.prb;
+ prb |= DSKSIDE;
+ prb &= ~DSKDIREC;
+ ciab.prb = prb;
+ for (n = unit[drive].type->tracks/2; n != 0; --n) {
+ if (ciaa.pra & DSKTRACK0)
+ break;
+ prb &= ~DSKSTEP;
+ ciab.prb = prb;
+ prb |= DSKSTEP;
+ udelay (2);
+ ciab.prb = prb;
+ ms_delay(unit[drive].type->step_delay);
+ }
+ ms_delay (unit[drive].type->settle_time);
+ prb |= DSKDIREC;
+ n = unit[drive].type->tracks + 20;
+ for (;;) {
+ prb &= ~DSKSTEP;
+ ciab.prb = prb;
+ prb |= DSKSTEP;
+ udelay (2);
+ ciab.prb = prb;
+ ms_delay(unit[drive].type->step_delay + 1);
+ if ((ciaa.pra & DSKTRACK0) == 0)
+ break;
+ if (--n == 0) {
+ printk (KERN_ERR "fd%d: calibrate failed, turning motor off\n", drive);
+ fd_motor_off (drive);
+ unit[drive].track = -1;
+ rel_fdc();
+ return 0;
+ }
+ }
+ unit[drive].track = 0;
+ ms_delay(unit[drive].type->settle_time);
+
+ rel_fdc();
+ fd_deselect(drive);
+ return 1;
+}
+
+static int fd_seek(int drive, int track)
+{
+ unsigned char prb;
+ int cnt;
+
+#ifdef DEBUG
+ printk("seeking drive %d to track %d\n",drive,track);
+#endif
+ drive &= 3;
+ get_fdc(drive);
+ if (unit[drive].track == track) {
+ rel_fdc();
+ return 1;
+ }
+ if (!fd_motor_on(drive)) {
+ rel_fdc();
+ return 0;
+ }
+ if (unit[drive].track < 0 && !fd_calibrate(drive)) {
+ rel_fdc();
+ return 0;
+ }
+
+ fd_select (drive);
+ cnt = unit[drive].track/2 - track/2;
+ prb = ciab.prb;
+ prb |= DSKSIDE | DSKDIREC;
+ if (track % 2 != 0)
+ prb &= ~DSKSIDE;
+ if (cnt < 0) {
+ cnt = - cnt;
+ prb &= ~DSKDIREC;
+ }
+ ciab.prb = prb;
+ if (track % 2 != unit[drive].track % 2)
+ ms_delay (unit[drive].type->side_time);
+ unit[drive].track = track;
+ if (cnt == 0) {
+ rel_fdc();
+ fd_deselect(drive);
+ return 1;
+ }
+ do {
+ prb &= ~DSKSTEP;
+ ciab.prb = prb;
+ prb |= DSKSTEP;
+ udelay (1);
+ ciab.prb = prb;
+ ms_delay (unit[drive].type->step_delay);
+ } while (--cnt != 0);
+ ms_delay (unit[drive].type->settle_time);
+
+ rel_fdc();
+ fd_deselect(drive);
+ return 1;
+}
+
+static unsigned long fd_get_drive_id(int drive)
+{
+ int i;
+ ulong id = 0;
+
+ drive&=3;
+ get_fdc(drive);
+ /* set up for ID */
+ MOTOR_ON;
+ udelay(2);
+ SELECT(SELMASK(drive));
+ udelay(2);
+ DESELECT(SELMASK(drive));
+ udelay(2);
+ MOTOR_OFF;
+ udelay(2);
+ SELECT(SELMASK(drive));
+ udelay(2);
+ DESELECT(SELMASK(drive));
+ udelay(2);
+
+ /* loop and read disk ID */
+ for (i=0; i<32; i++) {
+ SELECT(SELMASK(drive));
+ udelay(2);
+
+ /* read and store value of DSKRDY */
+ id <<= 1;
+ id |= (ciaa.pra & DSKRDY) ? 0 : 1; /* cia regs are low-active! */
+
+ DESELECT(SELMASK(drive));
+ }
+
+ rel_fdc();
+
+ /*
+ * RB: At least A500/A2000's df0: don't identify themselves.
+ * As every (real) Amiga has at least a 3.5" DD drive as df0:
+ * we default to that if df0: doesn't identify as a certain
+ * type.
+ */
+ if(drive == 0 && id == FD_NODRIVE)
+ {
+ id = fd_def_df0;
+ printk(KERN_NOTICE "fd: drive 0 didn't identify, setting default %08lx\n", (ulong)fd_def_df0);
+ }
+ /* return the ID value */
+ return (id);
+}
+
+static irqreturn_t fd_block_done(int irq, void *dummy)
+{
+ if (block_flag)
+ custom.dsklen = 0x4000;
+
+ if (block_flag == 2) { /* writing */
+ writepending = 2;
+ post_write_timer.expires = jiffies + 1; /* at least 2 ms */
+ post_write_timer.data = selected;
+ add_timer(&post_write_timer);
+ }
+ else { /* reading */
+ block_flag = 0;
+ wake_up (&wait_fd_block);
+ }
+ return IRQ_HANDLED;
+}
+
+static void raw_read(int drive)
+{
+ drive&=3;
+ get_fdc(drive);
+ wait_event(wait_fd_block, !block_flag);
+ fd_select(drive);
+ /* setup adkcon bits correctly */
+ custom.adkcon = ADK_MSBSYNC;
+ custom.adkcon = ADK_SETCLR|ADK_WORDSYNC|ADK_FAST;
+
+ custom.dsksync = MFM_SYNC;
+
+ custom.dsklen = 0;
+ custom.dskptr = (u_char *)ZTWO_PADDR((u_char *)raw_buf);
+ custom.dsklen = unit[drive].type->read_size/sizeof(short) | DSKLEN_DMAEN;
+ custom.dsklen = unit[drive].type->read_size/sizeof(short) | DSKLEN_DMAEN;
+
+ block_flag = 1;
+
+ wait_event(wait_fd_block, !block_flag);
+
+ custom.dsklen = 0;
+ fd_deselect(drive);
+ rel_fdc();
+}
+
+static int raw_write(int drive)
+{
+ ushort adk;
+
+ drive&=3;
+ get_fdc(drive); /* corresponds to rel_fdc() in post_write() */
+ if ((ciaa.pra & DSKPROT) == 0) {
+ rel_fdc();
+ return 0;
+ }
+ wait_event(wait_fd_block, !block_flag);
+ fd_select(drive);
+ /* clear adkcon bits */
+ custom.adkcon = ADK_PRECOMP1|ADK_PRECOMP0|ADK_WORDSYNC|ADK_MSBSYNC;
+ /* set appropriate adkcon bits */
+ adk = ADK_SETCLR|ADK_FAST;
+ if ((ulong)unit[drive].track >= unit[drive].type->precomp2)
+ adk |= ADK_PRECOMP1;
+ else if ((ulong)unit[drive].track >= unit[drive].type->precomp1)
+ adk |= ADK_PRECOMP0;
+ custom.adkcon = adk;
+
+ custom.dsklen = DSKLEN_WRITE;
+ custom.dskptr = (u_char *)ZTWO_PADDR((u_char *)raw_buf);
+ custom.dsklen = unit[drive].type->write_size/sizeof(short) | DSKLEN_DMAEN|DSKLEN_WRITE;
+ custom.dsklen = unit[drive].type->write_size/sizeof(short) | DSKLEN_DMAEN|DSKLEN_WRITE;
+
+ block_flag = 2;
+ return 1;
+}
+
+/*
+ * to be called at least 2ms after the write has finished but before any
+ * other access to the hardware.
+ */
+static void post_write (unsigned long drive)
+{
+#ifdef DEBUG
+ printk("post_write for drive %ld\n",drive);
+#endif
+ drive &= 3;
+ custom.dsklen = 0;
+ block_flag = 0;
+ writepending = 0;
+ writefromint = 0;
+ unit[drive].dirty = 0;
+ wake_up(&wait_fd_block);
+ fd_deselect(drive);
+ rel_fdc(); /* corresponds to get_fdc() in raw_write */
+}
+
+
+/*
+ * The following functions are to convert the block contents into raw data
+ * written to disk and vice versa.
+ * (Add other formats here ;-))
+ */
+
+static unsigned long scan_sync(unsigned long raw, unsigned long end)
+{
+ ushort *ptr = (ushort *)raw, *endp = (ushort *)end;
+
+ while (ptr < endp && *ptr++ != 0x4489)
+ ;
+ if (ptr < endp) {
+ while (*ptr == 0x4489 && ptr < endp)
+ ptr++;
+ return (ulong)ptr;
+ }
+ return 0;
+}
+
+static inline unsigned long checksum(unsigned long *addr, int len)
+{
+ unsigned long csum = 0;
+
+ len /= sizeof(*addr);
+ while (len-- > 0)
+ csum ^= *addr++;
+ csum = ((csum>>1) & 0x55555555) ^ (csum & 0x55555555);
+
+ return csum;
+}
+
+static unsigned long decode (unsigned long *data, unsigned long *raw,
+ int len)
+{
+ ulong *odd, *even;
+
+ /* convert length from bytes to longwords */
+ len >>= 2;
+ odd = raw;
+ even = odd + len;
+
+ /* prepare return pointer */
+ raw += len * 2;
+
+ do {
+ *data++ = ((*odd++ & 0x55555555) << 1) | (*even++ & 0x55555555);
+ } while (--len != 0);
+
+ return (ulong)raw;
+}
+
+struct header {
+ unsigned char magic;
+ unsigned char track;
+ unsigned char sect;
+ unsigned char ord;
+ unsigned char labels[16];
+ unsigned long hdrchk;
+ unsigned long datachk;
+};
+
+static int amiga_read(int drive)
+{
+ unsigned long raw;
+ unsigned long end;
+ int scnt;
+ unsigned long csum;
+ struct header hdr;
+
+ drive&=3;
+ raw = (long) raw_buf;
+ end = raw + unit[drive].type->read_size;
+
+ for (scnt = 0;scnt < unit[drive].dtype->sects * unit[drive].type->sect_mult; scnt++) {
+ if (!(raw = scan_sync(raw, end))) {
+ printk (KERN_INFO "can't find sync for sector %d\n", scnt);
+ return MFM_NOSYNC;
+ }
+
+ raw = decode ((ulong *)&hdr.magic, (ulong *)raw, 4);
+ raw = decode ((ulong *)&hdr.labels, (ulong *)raw, 16);
+ raw = decode ((ulong *)&hdr.hdrchk, (ulong *)raw, 4);
+ raw = decode ((ulong *)&hdr.datachk, (ulong *)raw, 4);
+ csum = checksum((ulong *)&hdr,
+ (char *)&hdr.hdrchk-(char *)&hdr);
+
+#ifdef DEBUG
+ printk ("(%x,%d,%d,%d) (%lx,%lx,%lx,%lx) %lx %lx\n",
+ hdr.magic, hdr.track, hdr.sect, hdr.ord,
+ *(ulong *)&hdr.labels[0], *(ulong *)&hdr.labels[4],
+ *(ulong *)&hdr.labels[8], *(ulong *)&hdr.labels[12],
+ hdr.hdrchk, hdr.datachk);
+#endif
+
+ if (hdr.hdrchk != csum) {
+ printk(KERN_INFO "MFM_HEADER: %08lx,%08lx\n", hdr.hdrchk, csum);
+ return MFM_HEADER;
+ }
+
+ /* verify track */
+ if (hdr.track != unit[drive].track) {
+ printk(KERN_INFO "MFM_TRACK: %d, %d\n", hdr.track, unit[drive].track);
+ return MFM_TRACK;
+ }
+
+ raw = decode ((ulong *)(unit[drive].trackbuf + hdr.sect*512),
+ (ulong *)raw, 512);
+ csum = checksum((ulong *)(unit[drive].trackbuf + hdr.sect*512), 512);
+
+ if (hdr.datachk != csum) {
+ printk(KERN_INFO "MFM_DATA: (%x:%d:%d:%d) sc=%d %lx, %lx\n",
+ hdr.magic, hdr.track, hdr.sect, hdr.ord, scnt,
+ hdr.datachk, csum);
+ printk (KERN_INFO "data=(%lx,%lx,%lx,%lx)\n",
+ ((ulong *)(unit[drive].trackbuf+hdr.sect*512))[0],
+ ((ulong *)(unit[drive].trackbuf+hdr.sect*512))[1],
+ ((ulong *)(unit[drive].trackbuf+hdr.sect*512))[2],
+ ((ulong *)(unit[drive].trackbuf+hdr.sect*512))[3]);
+ return MFM_DATA;
+ }
+ }
+
+ return 0;
+}
+
+static void encode(unsigned long data, unsigned long *dest)
+{
+ unsigned long data2;
+
+ data &= 0x55555555;
+ data2 = data ^ 0x55555555;
+ data |= ((data2 >> 1) | 0x80000000) & (data2 << 1);
+
+ if (*(dest - 1) & 0x00000001)
+ data &= 0x7FFFFFFF;
+
+ *dest = data;
+}
+
+static void encode_block(unsigned long *dest, unsigned long *src, int len)
+{
+ int cnt, to_cnt = 0;
+ unsigned long data;
+
+ /* odd bits */
+ for (cnt = 0; cnt < len / 4; cnt++) {
+ data = src[cnt] >> 1;
+ encode(data, dest + to_cnt++);
+ }
+
+ /* even bits */
+ for (cnt = 0; cnt < len / 4; cnt++) {
+ data = src[cnt];
+ encode(data, dest + to_cnt++);
+ }
+}
+
+static unsigned long *putsec(int disk, unsigned long *raw, int cnt)
+{
+ struct header hdr;
+ int i;
+
+ disk&=3;
+ *raw = (raw[-1]&1) ? 0x2AAAAAAA : 0xAAAAAAAA;
+ raw++;
+ *raw++ = 0x44894489;
+
+ hdr.magic = 0xFF;
+ hdr.track = unit[disk].track;
+ hdr.sect = cnt;
+ hdr.ord = unit[disk].dtype->sects * unit[disk].type->sect_mult - cnt;
+ for (i = 0; i < 16; i++)
+ hdr.labels[i] = 0;
+ hdr.hdrchk = checksum((ulong *)&hdr,
+ (char *)&hdr.hdrchk-(char *)&hdr);
+ hdr.datachk = checksum((ulong *)(unit[disk].trackbuf+cnt*512), 512);
+
+ encode_block(raw, (ulong *)&hdr.magic, 4);
+ raw += 2;
+ encode_block(raw, (ulong *)&hdr.labels, 16);
+ raw += 8;
+ encode_block(raw, (ulong *)&hdr.hdrchk, 4);
+ raw += 2;
+ encode_block(raw, (ulong *)&hdr.datachk, 4);
+ raw += 2;
+ encode_block(raw, (ulong *)(unit[disk].trackbuf+cnt*512), 512);
+ raw += 256;
+
+ return raw;
+}
+
+static void amiga_write(int disk)
+{
+ unsigned int cnt;
+ unsigned long *ptr = (unsigned long *)raw_buf;
+
+ disk&=3;
+ /* gap space */
+ for (cnt = 0; cnt < 415 * unit[disk].type->sect_mult; cnt++)
+ *ptr++ = 0xaaaaaaaa;
+
+ /* sectors */
+ for (cnt = 0; cnt < unit[disk].dtype->sects * unit[disk].type->sect_mult; cnt++)
+ ptr = putsec (disk, ptr, cnt);
+ *(ushort *)ptr = (ptr[-1]&1) ? 0x2AA8 : 0xAAA8;
+}
+
+
+struct dos_header {
+ unsigned char track, /* 0-80 */
+ side, /* 0-1 */
+ sec, /* 0-...*/
+ len_desc;/* 2 */
+ unsigned short crc; /* on 68000 we got an alignment problem,
+ but this compiler solves it by adding silently
+ adding a pad byte so data won't fit
+ and this took about 3h to discover.... */
+ unsigned char gap1[22]; /* for longword-alignedness (0x4e) */
+};
+
+/* crc routines are borrowed from the messydos-handler */
+
+/* excerpt from the messydos-device
+; The CRC is computed not only over the actual data, but including
+; the SYNC mark (3 * $a1) and the 'ID/DATA - Address Mark' ($fe/$fb).
+; As we don't read or encode these fields into our buffers, we have to
+; preload the registers containing the CRC with the values they would have
+; after stepping over these fields.
+;
+; How CRCs "really" work:
+;
+; First, you should regard a bitstring as a series of coefficients of
+; polynomials. We calculate with these polynomials in modulo-2
+; arithmetic, in which both add and subtract are done the same as
+; exclusive-or. Now, we modify our data (a very long polynomial) in
+; such a way that it becomes divisible by the CCITT-standard 16-bit
+; 16 12 5
+; polynomial: x + x + x + 1, represented by $11021. The easiest
+; way to do this would be to multiply (using proper arithmetic) our
+; datablock with $11021. So we have:
+; data * $11021 =
+; data * ($10000 + $1021) =
+; data * $10000 + data * $1021
+; The left part of this is simple: Just add two 0 bytes. But then
+; the right part (data $1021) remains difficult and even could have
+; a carry into the left part. The solution is to use a modified
+; multiplication, which has a result that is not correct, but with
+; a difference of any multiple of $11021. We then only need to keep
+; the 16 least significant bits of the result.
+;
+; The following algorithm does this for us:
+;
+; unsigned char *data, c, crclo, crchi;
+; while (not done) {
+; c = *data++ + crchi;
+; crchi = (@ c) >> 8 + crclo;
+; crclo = @ c;
+; }
+;
+; Remember, + is done with EOR, the @ operator is in two tables (high
+; and low byte separately), which is calculated as
+;
+; $1021 * (c & $F0)
+; xor $1021 * (c & $0F)
+; xor $1021 * (c >> 4) (* is regular multiplication)
+;
+;
+; Anyway, the end result is the same as the remainder of the division of
+; the data by $11021. I am afraid I need to study theory a bit more...
+
+
+my only works was to code this from manx to C....
+
+*/
+
+static ushort dos_crc(void * data_a3, int data_d0, int data_d1, int data_d3)
+{
+ static unsigned char CRCTable1[] = {
+ 0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70,0x81,0x91,0xa1,0xb1,0xc1,0xd1,0xe1,0xf1,
+ 0x12,0x02,0x32,0x22,0x52,0x42,0x72,0x62,0x93,0x83,0xb3,0xa3,0xd3,0xc3,0xf3,0xe3,
+ 0x24,0x34,0x04,0x14,0x64,0x74,0x44,0x54,0xa5,0xb5,0x85,0x95,0xe5,0xf5,0xc5,0xd5,
+ 0x36,0x26,0x16,0x06,0x76,0x66,0x56,0x46,0xb7,0xa7,0x97,0x87,0xf7,0xe7,0xd7,0xc7,
+ 0x48,0x58,0x68,0x78,0x08,0x18,0x28,0x38,0xc9,0xd9,0xe9,0xf9,0x89,0x99,0xa9,0xb9,
+ 0x5a,0x4a,0x7a,0x6a,0x1a,0x0a,0x3a,0x2a,0xdb,0xcb,0xfb,0xeb,0x9b,0x8b,0xbb,0xab,
+ 0x6c,0x7c,0x4c,0x5c,0x2c,0x3c,0x0c,0x1c,0xed,0xfd,0xcd,0xdd,0xad,0xbd,0x8d,0x9d,
+ 0x7e,0x6e,0x5e,0x4e,0x3e,0x2e,0x1e,0x0e,0xff,0xef,0xdf,0xcf,0xbf,0xaf,0x9f,0x8f,
+ 0x91,0x81,0xb1,0xa1,0xd1,0xc1,0xf1,0xe1,0x10,0x00,0x30,0x20,0x50,0x40,0x70,0x60,
+ 0x83,0x93,0xa3,0xb3,0xc3,0xd3,0xe3,0xf3,0x02,0x12,0x22,0x32,0x42,0x52,0x62,0x72,
+ 0xb5,0xa5,0x95,0x85,0xf5,0xe5,0xd5,0xc5,0x34,0x24,0x14,0x04,0x74,0x64,0x54,0x44,
+ 0xa7,0xb7,0x87,0x97,0xe7,0xf7,0xc7,0xd7,0x26,0x36,0x06,0x16,0x66,0x76,0x46,0x56,
+ 0xd9,0xc9,0xf9,0xe9,0x99,0x89,0xb9,0xa9,0x58,0x48,0x78,0x68,0x18,0x08,0x38,0x28,
+ 0xcb,0xdb,0xeb,0xfb,0x8b,0x9b,0xab,0xbb,0x4a,0x5a,0x6a,0x7a,0x0a,0x1a,0x2a,0x3a,
+ 0xfd,0xed,0xdd,0xcd,0xbd,0xad,0x9d,0x8d,0x7c,0x6c,0x5c,0x4c,0x3c,0x2c,0x1c,0x0c,
+ 0xef,0xff,0xcf,0xdf,0xaf,0xbf,0x8f,0x9f,0x6e,0x7e,0x4e,0x5e,0x2e,0x3e,0x0e,0x1e
+ };
+
+ static unsigned char CRCTable2[] = {
+ 0x00,0x21,0x42,0x63,0x84,0xa5,0xc6,0xe7,0x08,0x29,0x4a,0x6b,0x8c,0xad,0xce,0xef,
+ 0x31,0x10,0x73,0x52,0xb5,0x94,0xf7,0xd6,0x39,0x18,0x7b,0x5a,0xbd,0x9c,0xff,0xde,
+ 0x62,0x43,0x20,0x01,0xe6,0xc7,0xa4,0x85,0x6a,0x4b,0x28,0x09,0xee,0xcf,0xac,0x8d,
+ 0x53,0x72,0x11,0x30,0xd7,0xf6,0x95,0xb4,0x5b,0x7a,0x19,0x38,0xdf,0xfe,0x9d,0xbc,
+ 0xc4,0xe5,0x86,0xa7,0x40,0x61,0x02,0x23,0xcc,0xed,0x8e,0xaf,0x48,0x69,0x0a,0x2b,
+ 0xf5,0xd4,0xb7,0x96,0x71,0x50,0x33,0x12,0xfd,0xdc,0xbf,0x9e,0x79,0x58,0x3b,0x1a,
+ 0xa6,0x87,0xe4,0xc5,0x22,0x03,0x60,0x41,0xae,0x8f,0xec,0xcd,0x2a,0x0b,0x68,0x49,
+ 0x97,0xb6,0xd5,0xf4,0x13,0x32,0x51,0x70,0x9f,0xbe,0xdd,0xfc,0x1b,0x3a,0x59,0x78,
+ 0x88,0xa9,0xca,0xeb,0x0c,0x2d,0x4e,0x6f,0x80,0xa1,0xc2,0xe3,0x04,0x25,0x46,0x67,
+ 0xb9,0x98,0xfb,0xda,0x3d,0x1c,0x7f,0x5e,0xb1,0x90,0xf3,0xd2,0x35,0x14,0x77,0x56,
+ 0xea,0xcb,0xa8,0x89,0x6e,0x4f,0x2c,0x0d,0xe2,0xc3,0xa0,0x81,0x66,0x47,0x24,0x05,
+ 0xdb,0xfa,0x99,0xb8,0x5f,0x7e,0x1d,0x3c,0xd3,0xf2,0x91,0xb0,0x57,0x76,0x15,0x34,
+ 0x4c,0x6d,0x0e,0x2f,0xc8,0xe9,0x8a,0xab,0x44,0x65,0x06,0x27,0xc0,0xe1,0x82,0xa3,
+ 0x7d,0x5c,0x3f,0x1e,0xf9,0xd8,0xbb,0x9a,0x75,0x54,0x37,0x16,0xf1,0xd0,0xb3,0x92,
+ 0x2e,0x0f,0x6c,0x4d,0xaa,0x8b,0xe8,0xc9,0x26,0x07,0x64,0x45,0xa2,0x83,0xe0,0xc1,
+ 0x1f,0x3e,0x5d,0x7c,0x9b,0xba,0xd9,0xf8,0x17,0x36,0x55,0x74,0x93,0xb2,0xd1,0xf0
+ };
+
+/* look at the asm-code - what looks in C a bit strange is almost as good as handmade */
+ register int i;
+ register unsigned char *CRCT1, *CRCT2, *data, c, crch, crcl;
+
+ CRCT1=CRCTable1;
+ CRCT2=CRCTable2;
+ data=data_a3;
+ crcl=data_d1;
+ crch=data_d0;
+ for (i=data_d3; i>=0; i--) {
+ c = (*data++) ^ crch;
+ crch = CRCT1[c] ^ crcl;
+ crcl = CRCT2[c];
+ }
+ return (crch<<8)|crcl;
+}
+
+static inline ushort dos_hdr_crc (struct dos_header *hdr)
+{
+ return dos_crc(&(hdr->track), 0xb2, 0x30, 3); /* precomputed magic */
+}
+
+static inline ushort dos_data_crc(unsigned char *data)
+{
+ return dos_crc(data, 0xe2, 0x95 ,511); /* precomputed magic */
+}
+
+static inline unsigned char dos_decode_byte(ushort word)
+{
+ register ushort w2;
+ register unsigned char byte;
+ register unsigned char *dec = mfmdecode;
+
+ w2=word;
+ w2>>=8;
+ w2&=127;
+ byte = dec[w2];
+ byte <<= 4;
+ w2 = word & 127;
+ byte |= dec[w2];
+ return byte;
+}
+
+static unsigned long dos_decode(unsigned char *data, unsigned short *raw, int len)
+{
+ int i;
+
+ for (i = 0; i < len; i++)
+ *data++=dos_decode_byte(*raw++);
+ return ((ulong)raw);
+}
+
+#ifdef DEBUG
+static void dbg(unsigned long ptr)
+{
+ printk("raw data @%08lx: %08lx, %08lx ,%08lx, %08lx\n", ptr,
+ ((ulong *)ptr)[0], ((ulong *)ptr)[1],
+ ((ulong *)ptr)[2], ((ulong *)ptr)[3]);
+}
+#endif
+
+static int dos_read(int drive)
+{
+ unsigned long end;
+ unsigned long raw;
+ int scnt;
+ unsigned short crc,data_crc[2];
+ struct dos_header hdr;
+
+ drive&=3;
+ raw = (long) raw_buf;
+ end = raw + unit[drive].type->read_size;
+
+ for (scnt=0; scnt < unit[drive].dtype->sects * unit[drive].type->sect_mult; scnt++) {
+ do { /* search for the right sync of each sec-hdr */
+ if (!(raw = scan_sync (raw, end))) {
+ printk(KERN_INFO "dos_read: no hdr sync on "
+ "track %d, unit %d for sector %d\n",
+ unit[drive].track,drive,scnt);
+ return MFM_NOSYNC;
+ }
+#ifdef DEBUG
+ dbg(raw);
+#endif
+ } while (*((ushort *)raw)!=0x5554); /* loop usually only once done */
+ raw+=2; /* skip over headermark */
+ raw = dos_decode((unsigned char *)&hdr,(ushort *) raw,8);
+ crc = dos_hdr_crc(&hdr);
+
+#ifdef DEBUG
+ printk("(%3d,%d,%2d,%d) %x\n", hdr.track, hdr.side,
+ hdr.sec, hdr.len_desc, hdr.crc);
+#endif
+
+ if (crc != hdr.crc) {
+ printk(KERN_INFO "dos_read: MFM_HEADER %04x,%04x\n",
+ hdr.crc, crc);
+ return MFM_HEADER;
+ }
+ if (hdr.track != unit[drive].track/unit[drive].type->heads) {
+ printk(KERN_INFO "dos_read: MFM_TRACK %d, %d\n",
+ hdr.track,
+ unit[drive].track/unit[drive].type->heads);
+ return MFM_TRACK;
+ }
+
+ if (hdr.side != unit[drive].track%unit[drive].type->heads) {
+ printk(KERN_INFO "dos_read: MFM_SIDE %d, %d\n",
+ hdr.side,
+ unit[drive].track%unit[drive].type->heads);
+ return MFM_TRACK;
+ }
+
+ if (hdr.len_desc != 2) {
+ printk(KERN_INFO "dos_read: unknown sector len "
+ "descriptor %d\n", hdr.len_desc);
+ return MFM_DATA;
+ }
+#ifdef DEBUG
+ printk("hdr accepted\n");
+#endif
+ if (!(raw = scan_sync (raw, end))) {
+ printk(KERN_INFO "dos_read: no data sync on track "
+ "%d, unit %d for sector%d, disk sector %d\n",
+ unit[drive].track, drive, scnt, hdr.sec);
+ return MFM_NOSYNC;
+ }
+#ifdef DEBUG
+ dbg(raw);
+#endif
+
+ if (*((ushort *)raw)!=0x5545) {
+ printk(KERN_INFO "dos_read: no data mark after "
+ "sync (%d,%d,%d,%d) sc=%d\n",
+ hdr.track,hdr.side,hdr.sec,hdr.len_desc,scnt);
+ return MFM_NOSYNC;
+ }
+
+ raw+=2; /* skip data mark (included in checksum) */
+ raw = dos_decode((unsigned char *)(unit[drive].trackbuf + (hdr.sec - 1) * 512), (ushort *) raw, 512);
+ raw = dos_decode((unsigned char *)data_crc,(ushort *) raw,4);
+ crc = dos_data_crc(unit[drive].trackbuf + (hdr.sec - 1) * 512);
+
+ if (crc != data_crc[0]) {
+ printk(KERN_INFO "dos_read: MFM_DATA (%d,%d,%d,%d) "
+ "sc=%d, %x %x\n", hdr.track, hdr.side,
+ hdr.sec, hdr.len_desc, scnt,data_crc[0], crc);
+ printk(KERN_INFO "data=(%lx,%lx,%lx,%lx,...)\n",
+ ((ulong *)(unit[drive].trackbuf+(hdr.sec-1)*512))[0],
+ ((ulong *)(unit[drive].trackbuf+(hdr.sec-1)*512))[1],
+ ((ulong *)(unit[drive].trackbuf+(hdr.sec-1)*512))[2],
+ ((ulong *)(unit[drive].trackbuf+(hdr.sec-1)*512))[3]);
+ return MFM_DATA;
+ }
+ }
+ return 0;
+}
+
+static inline ushort dos_encode_byte(unsigned char byte)
+{
+ register unsigned char *enc, b2, b1;
+ register ushort word;
+
+ enc=mfmencode;
+ b1=byte;
+ b2=b1>>4;
+ b1&=15;
+ word=enc[b2] <<8 | enc [b1];
+ return (word|((word&(256|64)) ? 0: 128));
+}
+
+static void dos_encode_block(ushort *dest, unsigned char *src, int len)
+{
+ int i;
+
+ for (i = 0; i < len; i++) {
+ *dest=dos_encode_byte(*src++);
+ *dest|=((dest[-1]&1)||(*dest&0x4000))? 0: 0x8000;
+ dest++;
+ }
+}
+
+static unsigned long *ms_putsec(int drive, unsigned long *raw, int cnt)
+{
+ static struct dos_header hdr={0,0,0,2,0,
+ {78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78}};
+ int i;
+ static ushort crc[2]={0,0x4e4e};
+
+ drive&=3;
+/* id gap 1 */
+/* the MFM word before is always 9254 */
+ for(i=0;i<6;i++)
+ *raw++=0xaaaaaaaa;
+/* 3 sync + 1 headermark */
+ *raw++=0x44894489;
+ *raw++=0x44895554;
+
+/* fill in the variable parts of the header */
+ hdr.track=unit[drive].track/unit[drive].type->heads;
+ hdr.side=unit[drive].track%unit[drive].type->heads;
+ hdr.sec=cnt+1;
+ hdr.crc=dos_hdr_crc(&hdr);
+
+/* header (without "magic") and id gap 2*/
+ dos_encode_block((ushort *)raw,(unsigned char *) &hdr.track,28);
+ raw+=14;
+
+/*id gap 3 */
+ for(i=0;i<6;i++)
+ *raw++=0xaaaaaaaa;
+
+/* 3 syncs and 1 datamark */
+ *raw++=0x44894489;
+ *raw++=0x44895545;
+
+/* data */
+ dos_encode_block((ushort *)raw,
+ (unsigned char *)unit[drive].trackbuf+cnt*512,512);
+ raw+=256;
+
+/*data crc + jd's special gap (long words :-/) */
+ crc[0]=dos_data_crc(unit[drive].trackbuf+cnt*512);
+ dos_encode_block((ushort *) raw,(unsigned char *)crc,4);
+ raw+=2;
+
+/* data gap */
+ for(i=0;i<38;i++)
+ *raw++=0x92549254;
+
+ return raw; /* wrote 652 MFM words */
+}
+
+static void dos_write(int disk)
+{
+ int cnt;
+ unsigned long raw = (unsigned long) raw_buf;
+ unsigned long *ptr=(unsigned long *)raw;
+
+ disk&=3;
+/* really gap4 + indexgap , but we write it first and round it up */
+ for (cnt=0;cnt<425;cnt++)
+ *ptr++=0x92549254;
+
+/* the following is just guessed */
+ if (unit[disk].type->sect_mult==2) /* check for HD-Disks */
+ for(cnt=0;cnt<473;cnt++)
+ *ptr++=0x92549254;
+
+/* now the index marks...*/
+ for (cnt=0;cnt<20;cnt++)
+ *ptr++=0x92549254;
+ for (cnt=0;cnt<6;cnt++)
+ *ptr++=0xaaaaaaaa;
+ *ptr++=0x52245224;
+ *ptr++=0x52245552;
+ for (cnt=0;cnt<20;cnt++)
+ *ptr++=0x92549254;
+
+/* sectors */
+ for(cnt = 0; cnt < unit[disk].dtype->sects * unit[disk].type->sect_mult; cnt++)
+ ptr=ms_putsec(disk,ptr,cnt);
+
+ *(ushort *)ptr = 0xaaa8; /* MFM word before is always 0x9254 */
+}
+
+/*
+ * Here comes the high level stuff (i.e. the filesystem interface)
+ * and helper functions.
+ * Normally this should be the only part that has to be adapted to
+ * different kernel versions.
+ */
+
+/* FIXME: this assumes the drive is still spinning -
+ * which is only true if we complete writing a track within three seconds
+ */
+static void flush_track_callback(unsigned long nr)
+{
+ nr&=3;
+ writefromint = 1;
+ if (!try_fdc(nr)) {
+ /* we might block in an interrupt, so try again later */
+ flush_track_timer[nr].expires = jiffies + 1;
+ add_timer(flush_track_timer + nr);
+ return;
+ }
+ get_fdc(nr);
+ (*unit[nr].dtype->write_fkt)(nr);
+ if (!raw_write(nr)) {
+ printk (KERN_NOTICE "floppy disk write protected\n");
+ writefromint = 0;
+ writepending = 0;
+ }
+ rel_fdc();
+}
+
+static int non_int_flush_track (unsigned long nr)
+{
+ unsigned long flags;
+
+ nr&=3;
+ writefromint = 0;
+ del_timer(&post_write_timer);
+ get_fdc(nr);
+ if (!fd_motor_on(nr)) {
+ writepending = 0;
+ rel_fdc();
+ return 0;
+ }
+ local_irq_save(flags);
+ if (writepending != 2) {
+ local_irq_restore(flags);
+ (*unit[nr].dtype->write_fkt)(nr);
+ if (!raw_write(nr)) {
+ printk (KERN_NOTICE "floppy disk write protected "
+ "in write!\n");
+ writepending = 0;
+ return 0;
+ }
+ wait_event(wait_fd_block, block_flag != 2);
+ }
+ else {
+ local_irq_restore(flags);
+ ms_delay(2); /* 2 ms post_write delay */
+ post_write(nr);
+ }
+ rel_fdc();
+ return 1;
+}
+
+static int get_track(int drive, int track)
+{
+ int error, errcnt;
+
+ drive&=3;
+ if (unit[drive].track == track)
+ return 0;
+ get_fdc(drive);
+ if (!fd_motor_on(drive)) {
+ rel_fdc();
+ return -1;
+ }
+
+ if (unit[drive].dirty == 1) {
+ del_timer (flush_track_timer + drive);
+ non_int_flush_track (drive);
+ }
+ errcnt = 0;
+ while (errcnt < MAX_ERRORS) {
+ if (!fd_seek(drive, track))
+ return -1;
+ raw_read(drive);
+ error = (*unit[drive].dtype->read_fkt)(drive);
+ if (error == 0) {
+ rel_fdc();
+ return 0;
+ }
+ /* Read Error Handling: recalibrate and try again */
+ unit[drive].track = -1;
+ errcnt++;
+ }
+ rel_fdc();
+ return -1;
+}
+
+/*
+ * Round-robin between our available drives, doing one request from each
+ */
+static struct request *set_next_request(void)
+{
+ struct request_queue *q;
+ int cnt = FD_MAX_UNITS;
+ struct request *rq = NULL;
+
+ /* Find next queue we can dispatch from */
+ fdc_queue = fdc_queue + 1;
+ if (fdc_queue == FD_MAX_UNITS)
+ fdc_queue = 0;
+
+ for(cnt = FD_MAX_UNITS; cnt > 0; cnt--) {
+
+ if (unit[fdc_queue].type->code == FD_NODRIVE) {
+ if (++fdc_queue == FD_MAX_UNITS)
+ fdc_queue = 0;
+ continue;
+ }
+
+ q = unit[fdc_queue].gendisk->queue;
+ if (q) {
+ rq = blk_fetch_request(q);
+ if (rq)
+ break;
+ }
+
+ if (++fdc_queue == FD_MAX_UNITS)
+ fdc_queue = 0;
+ }
+
+ return rq;
+}
+
+static void redo_fd_request(void)
+{
+ struct request *rq;
+ unsigned int cnt, block, track, sector;
+ int drive;
+ struct amiga_floppy_struct *floppy;
+ char *data;
+ unsigned long flags;
+ int err;
+
+next_req:
+ rq = set_next_request();
+ if (!rq) {
+ /* Nothing left to do */
+ return;
+ }
+
+ floppy = rq->rq_disk->private_data;
+ drive = floppy - unit;
+
+next_segment:
+ /* Here someone could investigate to be more efficient */
+ for (cnt = 0, err = 0; cnt < blk_rq_cur_sectors(rq); cnt++) {
+#ifdef DEBUG
+ printk("fd: sector %ld + %d requested for %s\n",
+ blk_rq_pos(rq), cnt,
+ (rq_data_dir(rq) == READ) ? "read" : "write");
+#endif
+ block = blk_rq_pos(rq) + cnt;
+ if ((int)block > floppy->blocks) {
+ err = -EIO;
+ break;
+ }
+
+ track = block / (floppy->dtype->sects * floppy->type->sect_mult);
+ sector = block % (floppy->dtype->sects * floppy->type->sect_mult);
+ data = bio_data(rq->bio) + 512 * cnt;
+#ifdef DEBUG
+ printk("access to track %d, sector %d, with buffer at "
+ "0x%08lx\n", track, sector, data);
+#endif
+
+ if (get_track(drive, track) == -1) {
+ err = -EIO;
+ break;
+ }
+
+ if (rq_data_dir(rq) == READ) {
+ memcpy(data, floppy->trackbuf + sector * 512, 512);
+ } else {
+ memcpy(floppy->trackbuf + sector * 512, data, 512);
+
+ /* keep the drive spinning while writes are scheduled */
+ if (!fd_motor_on(drive)) {
+ err = -EIO;
+ break;
+ }
+ /*
+ * setup a callback to write the track buffer
+ * after a short (1 tick) delay.
+ */
+ local_irq_save(flags);
+
+ floppy->dirty = 1;
+ /* reset the timer */
+ mod_timer (flush_track_timer + drive, jiffies + 1);
+ local_irq_restore(flags);
+ }
+ }
+
+ if (__blk_end_request_cur(rq, err))
+ goto next_segment;
+ goto next_req;
+}
+
+static void do_fd_request(struct request_queue * q)
+{
+ redo_fd_request();
+}
+
+static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ int drive = MINOR(bdev->bd_dev) & 3;
+
+ geo->heads = unit[drive].type->heads;
+ geo->sectors = unit[drive].dtype->sects * unit[drive].type->sect_mult;
+ geo->cylinders = unit[drive].type->tracks;
+ return 0;
+}
+
+static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long param)
+{
+ struct amiga_floppy_struct *p = bdev->bd_disk->private_data;
+ int drive = p - unit;
+ static struct floppy_struct getprm;
+ void __user *argp = (void __user *)param;
+
+ switch(cmd){
+ case FDFMTBEG:
+ get_fdc(drive);
+ if (fd_ref[drive] > 1) {
+ rel_fdc();
+ return -EBUSY;
+ }
+ fsync_bdev(bdev);
+ if (fd_motor_on(drive) == 0) {
+ rel_fdc();
+ return -ENODEV;
+ }
+ if (fd_calibrate(drive) == 0) {
+ rel_fdc();
+ return -ENXIO;
+ }
+ floppy_off(drive);
+ rel_fdc();
+ break;
+ case FDFMTTRK:
+ if (param < p->type->tracks * p->type->heads)
+ {
+ get_fdc(drive);
+ if (fd_seek(drive,param) != 0){
+ memset(p->trackbuf, FD_FILL_BYTE,
+ p->dtype->sects * p->type->sect_mult * 512);
+ non_int_flush_track(drive);
+ }
+ floppy_off(drive);
+ rel_fdc();
+ }
+ else
+ return -EINVAL;
+ break;
+ case FDFMTEND:
+ floppy_off(drive);
+ invalidate_bdev(bdev);
+ break;
+ case FDGETPRM:
+ memset((void *)&getprm, 0, sizeof (getprm));
+ getprm.track=p->type->tracks;
+ getprm.head=p->type->heads;
+ getprm.sect=p->dtype->sects * p->type->sect_mult;
+ getprm.size=p->blocks;
+ if (copy_to_user(argp, &getprm, sizeof(struct floppy_struct)))
+ return -EFAULT;
+ break;
+ case FDSETPRM:
+ case FDDEFPRM:
+ return -EINVAL;
+ case FDFLUSH: /* unconditionally, even if not needed */
+ del_timer (flush_track_timer + drive);
+ non_int_flush_track(drive);
+ break;
+#ifdef RAW_IOCTL
+ case IOCTL_RAW_TRACK:
+ if (copy_to_user(argp, raw_buf, p->type->read_size))
+ return -EFAULT;
+ else
+ return p->type->read_size;
+#endif
+ default:
+ printk(KERN_DEBUG "fd_ioctl: unknown cmd %d for drive %d.",
+ cmd, drive);
+ return -ENOSYS;
+ }
+ return 0;
+}
+
+static int fd_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long param)
+{
+ int ret;
+
+ mutex_lock(&amiflop_mutex);
+ ret = fd_locked_ioctl(bdev, mode, cmd, param);
+ mutex_unlock(&amiflop_mutex);
+
+ return ret;
+}
+
+static void fd_probe(int dev)
+{
+ unsigned long code;
+ int type;
+ int drive;
+
+ drive = dev & 3;
+ code = fd_get_drive_id(drive);
+
+ /* get drive type */
+ for (type = 0; type < num_dr_types; type++)
+ if (drive_types[type].code == code)
+ break;
+
+ if (type >= num_dr_types) {
+ printk(KERN_WARNING "fd_probe: unsupported drive type "
+ "%08lx found\n", code);
+ unit[drive].type = &drive_types[num_dr_types-1]; /* FD_NODRIVE */
+ return;
+ }
+
+ unit[drive].type = drive_types + type;
+ unit[drive].track = -1;
+
+ unit[drive].disk = -1;
+ unit[drive].motor = 0;
+ unit[drive].busy = 0;
+ unit[drive].status = -1;
+}
+
+/*
+ * floppy_open check for aliasing (/dev/fd0 can be the same as
+ * /dev/PS0 etc), and disallows simultaneous access to the same
+ * drive with different device numbers.
+ */
+static int floppy_open(struct block_device *bdev, fmode_t mode)
+{
+ int drive = MINOR(bdev->bd_dev) & 3;
+ int system = (MINOR(bdev->bd_dev) & 4) >> 2;
+ int old_dev;
+ unsigned long flags;
+
+ mutex_lock(&amiflop_mutex);
+ old_dev = fd_device[drive];
+
+ if (fd_ref[drive] && old_dev != system) {
+ mutex_unlock(&amiflop_mutex);
+ return -EBUSY;
+ }
+
+ if (mode & (FMODE_READ|FMODE_WRITE)) {
+ check_disk_change(bdev);
+ if (mode & FMODE_WRITE) {
+ int wrprot;
+
+ get_fdc(drive);
+ fd_select (drive);
+ wrprot = !(ciaa.pra & DSKPROT);
+ fd_deselect (drive);
+ rel_fdc();
+
+ if (wrprot) {
+ mutex_unlock(&amiflop_mutex);
+ return -EROFS;
+ }
+ }
+ }
+
+ local_irq_save(flags);
+ fd_ref[drive]++;
+ fd_device[drive] = system;
+ local_irq_restore(flags);
+
+ unit[drive].dtype=&data_types[system];
+ unit[drive].blocks=unit[drive].type->heads*unit[drive].type->tracks*
+ data_types[system].sects*unit[drive].type->sect_mult;
+ set_capacity(unit[drive].gendisk, unit[drive].blocks);
+
+ printk(KERN_INFO "fd%d: accessing %s-disk with %s-layout\n",drive,
+ unit[drive].type->name, data_types[system].name);
+
+ mutex_unlock(&amiflop_mutex);
+ return 0;
+}
+
+static void floppy_release(struct gendisk *disk, fmode_t mode)
+{
+ struct amiga_floppy_struct *p = disk->private_data;
+ int drive = p - unit;
+
+ mutex_lock(&amiflop_mutex);
+ if (unit[drive].dirty == 1) {
+ del_timer (flush_track_timer + drive);
+ non_int_flush_track (drive);
+ }
+
+ if (!fd_ref[drive]--) {
+ printk(KERN_CRIT "floppy_release with fd_ref == 0");
+ fd_ref[drive] = 0;
+ }
+#ifdef MODULE
+/* the mod_use counter is handled this way */
+ floppy_off (drive | 0x40000000);
+#endif
+ mutex_unlock(&amiflop_mutex);
+}
+
+/*
+ * check_events is never called from an interrupt, so we can relax a bit
+ * here, sleep etc. Note that floppy-on tries to set current_DOR to point
+ * to the desired drive, but it will probably not survive the sleep if
+ * several floppies are used at the same time: thus the loop.
+ */
+static unsigned amiga_check_events(struct gendisk *disk, unsigned int clearing)
+{
+ struct amiga_floppy_struct *p = disk->private_data;
+ int drive = p - unit;
+ int changed;
+ static int first_time = 1;
+
+ if (first_time)
+ changed = first_time--;
+ else {
+ get_fdc(drive);
+ fd_select (drive);
+ changed = !(ciaa.pra & DSKCHANGE);
+ fd_deselect (drive);
+ rel_fdc();
+ }
+
+ if (changed) {
+ fd_probe(drive);
+ p->track = -1;
+ p->dirty = 0;
+ writepending = 0; /* if this was true before, too bad! */
+ writefromint = 0;
+ return DISK_EVENT_MEDIA_CHANGE;
+ }
+ return 0;
+}
+
+static const struct block_device_operations floppy_fops = {
+ .owner = THIS_MODULE,
+ .open = floppy_open,
+ .release = floppy_release,
+ .ioctl = fd_ioctl,
+ .getgeo = fd_getgeo,
+ .check_events = amiga_check_events,
+};
+
+static int __init fd_probe_drives(void)
+{
+ int drive,drives,nomem;
+
+ printk(KERN_INFO "FD: probing units\nfound ");
+ drives=0;
+ nomem=0;
+ for(drive=0;drive<FD_MAX_UNITS;drive++) {
+ struct gendisk *disk;
+ fd_probe(drive);
+ if (unit[drive].type->code == FD_NODRIVE)
+ continue;
+ disk = alloc_disk(1);
+ if (!disk) {
+ unit[drive].type->code = FD_NODRIVE;
+ continue;
+ }
+ unit[drive].gendisk = disk;
+
+ disk->queue = blk_init_queue(do_fd_request, &amiflop_lock);
+ if (!disk->queue) {
+ unit[drive].type->code = FD_NODRIVE;
+ continue;
+ }
+
+ drives++;
+ if ((unit[drive].trackbuf = kmalloc(FLOPPY_MAX_SECTORS * 512, GFP_KERNEL)) == NULL) {
+ printk("no mem for ");
+ unit[drive].type = &drive_types[num_dr_types - 1]; /* FD_NODRIVE */
+ drives--;
+ nomem = 1;
+ }
+ printk("fd%d ",drive);
+ disk->major = FLOPPY_MAJOR;
+ disk->first_minor = drive;
+ disk->fops = &floppy_fops;
+ sprintf(disk->disk_name, "fd%d", drive);
+ disk->private_data = &unit[drive];
+ set_capacity(disk, 880*2);
+ add_disk(disk);
+ }
+ if ((drives > 0) || (nomem == 0)) {
+ if (drives == 0)
+ printk("no drives");
+ printk("\n");
+ return drives;
+ }
+ printk("\n");
+ return -ENOMEM;
+}
+
+static struct kobject *floppy_find(dev_t dev, int *part, void *data)
+{
+ int drive = *part & 3;
+ if (unit[drive].type->code == FD_NODRIVE)
+ return NULL;
+ *part = 0;
+ return get_disk(unit[drive].gendisk);
+}
+
+static int __init amiga_floppy_probe(struct platform_device *pdev)
+{
+ int i, ret;
+
+ if (register_blkdev(FLOPPY_MAJOR,"fd"))
+ return -EBUSY;
+
+ ret = -ENOMEM;
+ raw_buf = amiga_chip_alloc(RAW_BUF_SIZE, "Floppy");
+ if (!raw_buf) {
+ printk("fd: cannot get chip mem buffer\n");
+ goto out_blkdev;
+ }
+
+ ret = -EBUSY;
+ if (request_irq(IRQ_AMIGA_DSKBLK, fd_block_done, 0, "floppy_dma", NULL)) {
+ printk("fd: cannot get irq for dma\n");
+ goto out_irq;
+ }
+
+ if (request_irq(IRQ_AMIGA_CIAA_TB, ms_isr, 0, "floppy_timer", NULL)) {
+ printk("fd: cannot get irq for timer\n");
+ goto out_irq2;
+ }
+
+ ret = -ENODEV;
+ if (fd_probe_drives() < 1) /* No usable drives */
+ goto out_probe;
+
+ blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE,
+ floppy_find, NULL, NULL);
+
+ /* initialize variables */
+ init_timer(&motor_on_timer);
+ motor_on_timer.expires = 0;
+ motor_on_timer.data = 0;
+ motor_on_timer.function = motor_on_callback;
+ for (i = 0; i < FD_MAX_UNITS; i++) {
+ init_timer(&motor_off_timer[i]);
+ motor_off_timer[i].expires = 0;
+ motor_off_timer[i].data = i|0x80000000;
+ motor_off_timer[i].function = fd_motor_off;
+ init_timer(&flush_track_timer[i]);
+ flush_track_timer[i].expires = 0;
+ flush_track_timer[i].data = i;
+ flush_track_timer[i].function = flush_track_callback;
+
+ unit[i].track = -1;
+ }
+
+ init_timer(&post_write_timer);
+ post_write_timer.expires = 0;
+ post_write_timer.data = 0;
+ post_write_timer.function = post_write;
+
+ for (i = 0; i < 128; i++)
+ mfmdecode[i]=255;
+ for (i = 0; i < 16; i++)
+ mfmdecode[mfmencode[i]]=i;
+
+ /* make sure that disk DMA is enabled */
+ custom.dmacon = DMAF_SETCLR | DMAF_DISK;
+
+ /* init ms timer */
+ ciaa.crb = 8; /* one-shot, stop */
+ return 0;
+
+out_probe:
+ free_irq(IRQ_AMIGA_CIAA_TB, NULL);
+out_irq2:
+ free_irq(IRQ_AMIGA_DSKBLK, NULL);
+out_irq:
+ amiga_chip_free(raw_buf);
+out_blkdev:
+ unregister_blkdev(FLOPPY_MAJOR,"fd");
+ return ret;
+}
+
+#if 0 /* not safe to unload */
+static int __exit amiga_floppy_remove(struct platform_device *pdev)
+{
+ int i;
+
+ for( i = 0; i < FD_MAX_UNITS; i++) {
+ if (unit[i].type->code != FD_NODRIVE) {
+ struct request_queue *q = unit[i].gendisk->queue;
+ del_gendisk(unit[i].gendisk);
+ put_disk(unit[i].gendisk);
+ kfree(unit[i].trackbuf);
+ if (q)
+ blk_cleanup_queue(q);
+ }
+ }
+ blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
+ free_irq(IRQ_AMIGA_CIAA_TB, NULL);
+ free_irq(IRQ_AMIGA_DSKBLK, NULL);
+ custom.dmacon = DMAF_DISK; /* disable DMA */
+ amiga_chip_free(raw_buf);
+ unregister_blkdev(FLOPPY_MAJOR, "fd");
+}
+#endif
+
+static struct platform_driver amiga_floppy_driver = {
+ .driver = {
+ .name = "amiga-floppy",
+ },
+};
+
+static int __init amiga_floppy_init(void)
+{
+ return platform_driver_probe(&amiga_floppy_driver, amiga_floppy_probe);
+}
+
+module_init(amiga_floppy_init);
+
+#ifndef MODULE
+static int __init amiga_floppy_setup (char *str)
+{
+ int n;
+ if (!MACH_IS_AMIGA)
+ return 0;
+ if (!get_option(&str, &n))
+ return 0;
+ printk (KERN_INFO "amiflop: Setting default df0 to %x\n", n);
+ fd_def_df0 = n;
+ return 1;
+}
+
+__setup("floppy=", amiga_floppy_setup);
+#endif
+
+MODULE_ALIAS("platform:amiga-floppy");
diff --git a/drivers/block/aoe/Makefile b/drivers/block/aoe/Makefile
new file mode 100644
index 000000000..06ea82cdf
--- /dev/null
+++ b/drivers/block/aoe/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for ATA over Ethernet
+#
+
+obj-$(CONFIG_ATA_OVER_ETH) += aoe.o
+aoe-y := aoeblk.o aoechr.o aoecmd.o aoedev.o aoemain.o aoenet.o
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
new file mode 100644
index 000000000..9220f8e83
--- /dev/null
+++ b/drivers/block/aoe/aoe.h
@@ -0,0 +1,240 @@
+/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */
+#define VERSION "85"
+#define AOE_MAJOR 152
+#define DEVICE_NAME "aoe"
+
+/* set AOE_PARTITIONS to 1 to use whole-disks only
+ * default is 16, which is 15 partitions plus the whole disk
+ */
+#ifndef AOE_PARTITIONS
+#define AOE_PARTITIONS (16)
+#endif
+
+#define WHITESPACE " \t\v\f\n,"
+
+enum {
+ AOECMD_ATA,
+ AOECMD_CFG,
+ AOECMD_VEND_MIN = 0xf0,
+
+ AOEFL_RSP = (1<<3),
+ AOEFL_ERR = (1<<2),
+
+ AOEAFL_EXT = (1<<6),
+ AOEAFL_DEV = (1<<4),
+ AOEAFL_ASYNC = (1<<1),
+ AOEAFL_WRITE = (1<<0),
+
+ AOECCMD_READ = 0,
+ AOECCMD_TEST,
+ AOECCMD_PTEST,
+ AOECCMD_SET,
+ AOECCMD_FSET,
+
+ AOE_HVER = 0x10,
+};
+
+struct aoe_hdr {
+ unsigned char dst[6];
+ unsigned char src[6];
+ __be16 type;
+ unsigned char verfl;
+ unsigned char err;
+ __be16 major;
+ unsigned char minor;
+ unsigned char cmd;
+ __be32 tag;
+};
+
+struct aoe_atahdr {
+ unsigned char aflags;
+ unsigned char errfeat;
+ unsigned char scnt;
+ unsigned char cmdstat;
+ unsigned char lba0;
+ unsigned char lba1;
+ unsigned char lba2;
+ unsigned char lba3;
+ unsigned char lba4;
+ unsigned char lba5;
+ unsigned char res[2];
+};
+
+struct aoe_cfghdr {
+ __be16 bufcnt;
+ __be16 fwver;
+ unsigned char scnt;
+ unsigned char aoeccmd;
+ unsigned char cslen[2];
+};
+
+enum {
+ DEVFL_UP = 1, /* device is installed in system and ready for AoE->ATA commands */
+ DEVFL_TKILL = (1<<1), /* flag for timer to know when to kill self */
+ DEVFL_EXT = (1<<2), /* device accepts lba48 commands */
+ DEVFL_GDALLOC = (1<<3), /* need to alloc gendisk */
+ DEVFL_GD_NOW = (1<<4), /* allocating gendisk */
+ DEVFL_KICKME = (1<<5), /* slow polling network card catch */
+ DEVFL_NEWSIZE = (1<<6), /* need to update dev size in block layer */
+ DEVFL_FREEING = (1<<7), /* set when device is being cleaned up */
+ DEVFL_FREED = (1<<8), /* device has been cleaned up */
+};
+
+enum {
+ DEFAULTBCNT = 2 * 512, /* 2 sectors */
+ MIN_BUFS = 16,
+ NTARGETS = 4,
+ NAOEIFS = 8,
+ NSKBPOOLMAX = 256,
+ NFACTIVE = 61,
+
+ TIMERTICK = HZ / 10,
+ RTTSCALE = 8,
+ RTTDSCALE = 3,
+ RTTAVG_INIT = USEC_PER_SEC / 4 << RTTSCALE,
+ RTTDEV_INIT = RTTAVG_INIT / 4,
+
+ HARD_SCORN_SECS = 10, /* try another remote port after this */
+ MAX_TAINT = 1000, /* cap on aoetgt taint */
+};
+
+struct buf {
+ ulong nframesout;
+ struct bio *bio;
+ struct bvec_iter iter;
+ struct request *rq;
+};
+
+enum frame_flags {
+ FFL_PROBE = 1,
+};
+
+struct frame {
+ struct list_head head;
+ u32 tag;
+ struct timeval sent; /* high-res time packet was sent */
+ u32 sent_jiffs; /* low-res jiffies-based sent time */
+ ulong waited;
+ ulong waited_total;
+ struct aoetgt *t; /* parent target I belong to */
+ struct sk_buff *skb; /* command skb freed on module exit */
+ struct sk_buff *r_skb; /* response skb for async processing */
+ struct buf *buf;
+ struct bvec_iter iter;
+ char flags;
+};
+
+struct aoeif {
+ struct net_device *nd;
+ ulong lost;
+ int bcnt;
+};
+
+struct aoetgt {
+ unsigned char addr[6];
+ ushort nframes; /* cap on frames to use */
+ struct aoedev *d; /* parent device I belong to */
+ struct list_head ffree; /* list of free frames */
+ struct aoeif ifs[NAOEIFS];
+ struct aoeif *ifp; /* current aoeif in use */
+ ushort nout; /* number of AoE commands outstanding */
+ ushort maxout; /* current value for max outstanding */
+ ushort next_cwnd; /* incr maxout after decrementing to zero */
+ ushort ssthresh; /* slow start threshold */
+ ulong falloc; /* number of allocated frames */
+ int taint; /* how much we want to avoid this aoetgt */
+ int minbcnt;
+ int wpkts, rpkts;
+ char nout_probes;
+};
+
+struct aoedev {
+ struct aoedev *next;
+ ulong sysminor;
+ ulong aoemajor;
+ u32 rttavg; /* scaled AoE round trip time average */
+ u32 rttdev; /* scaled round trip time mean deviation */
+ u16 aoeminor;
+ u16 flags;
+ u16 nopen; /* (bd_openers isn't available without sleeping) */
+ u16 fw_ver; /* version of blade's firmware */
+ u16 lasttag; /* last tag sent */
+ u16 useme;
+ ulong ref;
+ struct work_struct work;/* disk create work struct */
+ struct gendisk *gd;
+ struct dentry *debugfs;
+ struct request_queue *blkq;
+ struct hd_geometry geo;
+ sector_t ssize;
+ struct timer_list timer;
+ spinlock_t lock;
+ struct sk_buff_head skbpool;
+ mempool_t *bufpool; /* for deadlock-free Buf allocation */
+ struct { /* pointers to work in progress */
+ struct buf *buf;
+ struct bio *nxbio;
+ struct request *rq;
+ } ip;
+ ulong maxbcnt;
+ struct list_head factive[NFACTIVE]; /* hash of active frames */
+ struct list_head rexmitq; /* deferred retransmissions */
+ struct aoetgt **targets;
+ ulong ntargets; /* number of allocated aoetgt pointers */
+ struct aoetgt **tgt; /* target in use when working */
+ ulong kicked;
+ char ident[512];
+};
+
+/* kthread tracking */
+struct ktstate {
+ struct completion rendez;
+ struct task_struct *task;
+ wait_queue_head_t *waitq;
+ int (*fn) (int);
+ char name[12];
+ spinlock_t *lock;
+ int id;
+ int active;
+};
+
+int aoeblk_init(void);
+void aoeblk_exit(void);
+void aoeblk_gdalloc(void *);
+void aoedisk_rm_debugfs(struct aoedev *d);
+void aoedisk_rm_sysfs(struct aoedev *d);
+
+int aoechr_init(void);
+void aoechr_exit(void);
+void aoechr_error(char *);
+
+void aoecmd_work(struct aoedev *d);
+void aoecmd_cfg(ushort aoemajor, unsigned char aoeminor);
+struct sk_buff *aoecmd_ata_rsp(struct sk_buff *);
+void aoecmd_cfg_rsp(struct sk_buff *);
+void aoecmd_sleepwork(struct work_struct *);
+void aoecmd_wreset(struct aoetgt *t);
+void aoecmd_cleanslate(struct aoedev *);
+void aoecmd_exit(void);
+int aoecmd_init(void);
+struct sk_buff *aoecmd_ata_id(struct aoedev *);
+void aoe_freetframe(struct frame *);
+void aoe_flush_iocq(void);
+void aoe_flush_iocq_by_index(int);
+void aoe_end_request(struct aoedev *, struct request *, int);
+int aoe_ktstart(struct ktstate *k);
+void aoe_ktstop(struct ktstate *k);
+
+int aoedev_init(void);
+void aoedev_exit(void);
+struct aoedev *aoedev_by_aoeaddr(ulong maj, int min, int do_alloc);
+void aoedev_downdev(struct aoedev *d);
+int aoedev_flush(const char __user *str, size_t size);
+void aoe_failbuf(struct aoedev *, struct buf *);
+void aoedev_put(struct aoedev *);
+
+int aoenet_init(void);
+void aoenet_exit(void);
+void aoenet_xmit(struct sk_buff_head *);
+int is_aoe_netif(struct net_device *ifp);
+int set_aoe_iflist(const char __user *str, size_t size);
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
new file mode 100644
index 000000000..46c282fff
--- /dev/null
+++ b/drivers/block/aoe/aoeblk.c
@@ -0,0 +1,464 @@
+/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */
+/*
+ * aoeblk.c
+ * block device routines
+ */
+
+#include <linux/kernel.h>
+#include <linux/hdreg.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/ioctl.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
+#include <linux/genhd.h>
+#include <linux/netdevice.h>
+#include <linux/mutex.h>
+#include <linux/export.h>
+#include <linux/moduleparam.h>
+#include <linux/debugfs.h>
+#include <scsi/sg.h>
+#include "aoe.h"
+
+static DEFINE_MUTEX(aoeblk_mutex);
+static struct kmem_cache *buf_pool_cache;
+static struct dentry *aoe_debugfs_dir;
+
+/* GPFS needs a larger value than the default. */
+static int aoe_maxsectors;
+module_param(aoe_maxsectors, int, 0644);
+MODULE_PARM_DESC(aoe_maxsectors,
+ "When nonzero, set the maximum number of sectors per I/O request");
+
+static ssize_t aoedisk_show_state(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct aoedev *d = disk->private_data;
+
+ return snprintf(page, PAGE_SIZE,
+ "%s%s\n",
+ (d->flags & DEVFL_UP) ? "up" : "down",
+ (d->flags & DEVFL_KICKME) ? ",kickme" :
+ (d->nopen && !(d->flags & DEVFL_UP)) ? ",closewait" : "");
+ /* I'd rather see nopen exported so we can ditch closewait */
+}
+static ssize_t aoedisk_show_mac(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct aoedev *d = disk->private_data;
+ struct aoetgt *t = d->targets[0];
+
+ if (t == NULL)
+ return snprintf(page, PAGE_SIZE, "none\n");
+ return snprintf(page, PAGE_SIZE, "%pm\n", t->addr);
+}
+static ssize_t aoedisk_show_netif(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct aoedev *d = disk->private_data;
+ struct net_device *nds[8], **nd, **nnd, **ne;
+ struct aoetgt **t, **te;
+ struct aoeif *ifp, *e;
+ char *p;
+
+ memset(nds, 0, sizeof nds);
+ nd = nds;
+ ne = nd + ARRAY_SIZE(nds);
+ t = d->targets;
+ te = t + d->ntargets;
+ for (; t < te && *t; t++) {
+ ifp = (*t)->ifs;
+ e = ifp + NAOEIFS;
+ for (; ifp < e && ifp->nd; ifp++) {
+ for (nnd = nds; nnd < nd; nnd++)
+ if (*nnd == ifp->nd)
+ break;
+ if (nnd == nd && nd != ne)
+ *nd++ = ifp->nd;
+ }
+ }
+
+ ne = nd;
+ nd = nds;
+ if (*nd == NULL)
+ return snprintf(page, PAGE_SIZE, "none\n");
+ for (p = page; nd < ne; nd++)
+ p += snprintf(p, PAGE_SIZE - (p-page), "%s%s",
+ p == page ? "" : ",", (*nd)->name);
+ p += snprintf(p, PAGE_SIZE - (p-page), "\n");
+ return p-page;
+}
+/* firmware version */
+static ssize_t aoedisk_show_fwver(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct aoedev *d = disk->private_data;
+
+ return snprintf(page, PAGE_SIZE, "0x%04x\n", (unsigned int) d->fw_ver);
+}
+static ssize_t aoedisk_show_payload(struct device *dev,
+ struct device_attribute *attr, char *page)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct aoedev *d = disk->private_data;
+
+ return snprintf(page, PAGE_SIZE, "%lu\n", d->maxbcnt);
+}
+
+static int aoedisk_debugfs_show(struct seq_file *s, void *ignored)
+{
+ struct aoedev *d;
+ struct aoetgt **t, **te;
+ struct aoeif *ifp, *ife;
+ unsigned long flags;
+ char c;
+
+ d = s->private;
+ seq_printf(s, "rttavg: %d rttdev: %d\n",
+ d->rttavg >> RTTSCALE,
+ d->rttdev >> RTTDSCALE);
+ seq_printf(s, "nskbpool: %d\n", skb_queue_len(&d->skbpool));
+ seq_printf(s, "kicked: %ld\n", d->kicked);
+ seq_printf(s, "maxbcnt: %ld\n", d->maxbcnt);
+ seq_printf(s, "ref: %ld\n", d->ref);
+
+ spin_lock_irqsave(&d->lock, flags);
+ t = d->targets;
+ te = t + d->ntargets;
+ for (; t < te && *t; t++) {
+ c = '\t';
+ seq_printf(s, "falloc: %ld\n", (*t)->falloc);
+ seq_printf(s, "ffree: %p\n",
+ list_empty(&(*t)->ffree) ? NULL : (*t)->ffree.next);
+ seq_printf(s, "%pm:%d:%d:%d\n", (*t)->addr, (*t)->nout,
+ (*t)->maxout, (*t)->nframes);
+ seq_printf(s, "\tssthresh:%d\n", (*t)->ssthresh);
+ seq_printf(s, "\ttaint:%d\n", (*t)->taint);
+ seq_printf(s, "\tr:%d\n", (*t)->rpkts);
+ seq_printf(s, "\tw:%d\n", (*t)->wpkts);
+ ifp = (*t)->ifs;
+ ife = ifp + ARRAY_SIZE((*t)->ifs);
+ for (; ifp->nd && ifp < ife; ifp++) {
+ seq_printf(s, "%c%s", c, ifp->nd->name);
+ c = ',';
+ }
+ seq_puts(s, "\n");
+ }
+ spin_unlock_irqrestore(&d->lock, flags);
+
+ return 0;
+}
+
+static int aoe_debugfs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, aoedisk_debugfs_show, inode->i_private);
+}
+
+static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL);
+static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL);
+static DEVICE_ATTR(netif, S_IRUGO, aoedisk_show_netif, NULL);
+static struct device_attribute dev_attr_firmware_version = {
+ .attr = { .name = "firmware-version", .mode = S_IRUGO },
+ .show = aoedisk_show_fwver,
+};
+static DEVICE_ATTR(payload, S_IRUGO, aoedisk_show_payload, NULL);
+
+static struct attribute *aoe_attrs[] = {
+ &dev_attr_state.attr,
+ &dev_attr_mac.attr,
+ &dev_attr_netif.attr,
+ &dev_attr_firmware_version.attr,
+ &dev_attr_payload.attr,
+ NULL,
+};
+
+static const struct attribute_group attr_group = {
+ .attrs = aoe_attrs,
+};
+
+static const struct file_operations aoe_debugfs_fops = {
+ .open = aoe_debugfs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void
+aoedisk_add_debugfs(struct aoedev *d)
+{
+ struct dentry *entry;
+ char *p;
+
+ if (aoe_debugfs_dir == NULL)
+ return;
+ p = strchr(d->gd->disk_name, '/');
+ if (p == NULL)
+ p = d->gd->disk_name;
+ else
+ p++;
+ BUG_ON(*p == '\0');
+ entry = debugfs_create_file(p, 0444, aoe_debugfs_dir, d,
+ &aoe_debugfs_fops);
+ if (IS_ERR_OR_NULL(entry)) {
+ pr_info("aoe: cannot create debugfs file for %s\n",
+ d->gd->disk_name);
+ return;
+ }
+ BUG_ON(d->debugfs);
+ d->debugfs = entry;
+}
+void
+aoedisk_rm_debugfs(struct aoedev *d)
+{
+ debugfs_remove(d->debugfs);
+ d->debugfs = NULL;
+}
+
+static int
+aoedisk_add_sysfs(struct aoedev *d)
+{
+ return sysfs_create_group(&disk_to_dev(d->gd)->kobj, &attr_group);
+}
+void
+aoedisk_rm_sysfs(struct aoedev *d)
+{
+ sysfs_remove_group(&disk_to_dev(d->gd)->kobj, &attr_group);
+}
+
+static int
+aoeblk_open(struct block_device *bdev, fmode_t mode)
+{
+ struct aoedev *d = bdev->bd_disk->private_data;
+ ulong flags;
+
+ if (!virt_addr_valid(d)) {
+ pr_crit("aoe: invalid device pointer in %s\n",
+ __func__);
+ WARN_ON(1);
+ return -ENODEV;
+ }
+ if (!(d->flags & DEVFL_UP) || d->flags & DEVFL_TKILL)
+ return -ENODEV;
+
+ mutex_lock(&aoeblk_mutex);
+ spin_lock_irqsave(&d->lock, flags);
+ if (d->flags & DEVFL_UP && !(d->flags & DEVFL_TKILL)) {
+ d->nopen++;
+ spin_unlock_irqrestore(&d->lock, flags);
+ mutex_unlock(&aoeblk_mutex);
+ return 0;
+ }
+ spin_unlock_irqrestore(&d->lock, flags);
+ mutex_unlock(&aoeblk_mutex);
+ return -ENODEV;
+}
+
+static void
+aoeblk_release(struct gendisk *disk, fmode_t mode)
+{
+ struct aoedev *d = disk->private_data;
+ ulong flags;
+
+ spin_lock_irqsave(&d->lock, flags);
+
+ if (--d->nopen == 0) {
+ spin_unlock_irqrestore(&d->lock, flags);
+ aoecmd_cfg(d->aoemajor, d->aoeminor);
+ return;
+ }
+ spin_unlock_irqrestore(&d->lock, flags);
+}
+
+static void
+aoeblk_request(struct request_queue *q)
+{
+ struct aoedev *d;
+ struct request *rq;
+
+ d = q->queuedata;
+ if ((d->flags & DEVFL_UP) == 0) {
+ pr_info_ratelimited("aoe: device %ld.%d is not up\n",
+ d->aoemajor, d->aoeminor);
+ while ((rq = blk_peek_request(q))) {
+ blk_start_request(rq);
+ aoe_end_request(d, rq, 1);
+ }
+ return;
+ }
+ aoecmd_work(d);
+}
+
+static int
+aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ struct aoedev *d = bdev->bd_disk->private_data;
+
+ if ((d->flags & DEVFL_UP) == 0) {
+ printk(KERN_ERR "aoe: disk not up\n");
+ return -ENODEV;
+ }
+
+ geo->cylinders = d->geo.cylinders;
+ geo->heads = d->geo.heads;
+ geo->sectors = d->geo.sectors;
+ return 0;
+}
+
+static int
+aoeblk_ioctl(struct block_device *bdev, fmode_t mode, uint cmd, ulong arg)
+{
+ struct aoedev *d;
+
+ if (!arg)
+ return -EINVAL;
+
+ d = bdev->bd_disk->private_data;
+ if ((d->flags & DEVFL_UP) == 0) {
+ pr_err("aoe: disk not up\n");
+ return -ENODEV;
+ }
+
+ if (cmd == HDIO_GET_IDENTITY) {
+ if (!copy_to_user((void __user *) arg, &d->ident,
+ sizeof(d->ident)))
+ return 0;
+ return -EFAULT;
+ }
+
+ /* udev calls scsi_id, which uses SG_IO, resulting in noise */
+ if (cmd != SG_IO)
+ pr_info("aoe: unknown ioctl 0x%x\n", cmd);
+
+ return -ENOTTY;
+}
+
+static const struct block_device_operations aoe_bdops = {
+ .open = aoeblk_open,
+ .release = aoeblk_release,
+ .ioctl = aoeblk_ioctl,
+ .getgeo = aoeblk_getgeo,
+ .owner = THIS_MODULE,
+};
+
+/* alloc_disk and add_disk can sleep */
+void
+aoeblk_gdalloc(void *vp)
+{
+ struct aoedev *d = vp;
+ struct gendisk *gd;
+ mempool_t *mp;
+ struct request_queue *q;
+ enum { KB = 1024, MB = KB * KB, READ_AHEAD = 2 * MB, };
+ ulong flags;
+ int late = 0;
+
+ spin_lock_irqsave(&d->lock, flags);
+ if (d->flags & DEVFL_GDALLOC
+ && !(d->flags & DEVFL_TKILL)
+ && !(d->flags & DEVFL_GD_NOW))
+ d->flags |= DEVFL_GD_NOW;
+ else
+ late = 1;
+ spin_unlock_irqrestore(&d->lock, flags);
+ if (late)
+ return;
+
+ gd = alloc_disk(AOE_PARTITIONS);
+ if (gd == NULL) {
+ pr_err("aoe: cannot allocate disk structure for %ld.%d\n",
+ d->aoemajor, d->aoeminor);
+ goto err;
+ }
+
+ mp = mempool_create(MIN_BUFS, mempool_alloc_slab, mempool_free_slab,
+ buf_pool_cache);
+ if (mp == NULL) {
+ printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n",
+ d->aoemajor, d->aoeminor);
+ goto err_disk;
+ }
+ q = blk_init_queue(aoeblk_request, &d->lock);
+ if (q == NULL) {
+ pr_err("aoe: cannot allocate block queue for %ld.%d\n",
+ d->aoemajor, d->aoeminor);
+ goto err_mempool;
+ }
+
+ spin_lock_irqsave(&d->lock, flags);
+ WARN_ON(!(d->flags & DEVFL_GD_NOW));
+ WARN_ON(!(d->flags & DEVFL_GDALLOC));
+ WARN_ON(d->flags & DEVFL_TKILL);
+ WARN_ON(d->gd);
+ WARN_ON(d->flags & DEVFL_UP);
+ blk_queue_max_hw_sectors(q, 1024);
+ q->backing_dev_info.name = "aoe";
+ q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE;
+ d->bufpool = mp;
+ d->blkq = gd->queue = q;
+ q->queuedata = d;
+ d->gd = gd;
+ if (aoe_maxsectors)
+ blk_queue_max_hw_sectors(q, aoe_maxsectors);
+ gd->major = AOE_MAJOR;
+ gd->first_minor = d->sysminor;
+ gd->fops = &aoe_bdops;
+ gd->private_data = d;
+ set_capacity(gd, d->ssize);
+ snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d",
+ d->aoemajor, d->aoeminor);
+
+ d->flags &= ~DEVFL_GDALLOC;
+ d->flags |= DEVFL_UP;
+
+ spin_unlock_irqrestore(&d->lock, flags);
+
+ add_disk(gd);
+ aoedisk_add_sysfs(d);
+ aoedisk_add_debugfs(d);
+
+ spin_lock_irqsave(&d->lock, flags);
+ WARN_ON(!(d->flags & DEVFL_GD_NOW));
+ d->flags &= ~DEVFL_GD_NOW;
+ spin_unlock_irqrestore(&d->lock, flags);
+ return;
+
+err_mempool:
+ mempool_destroy(mp);
+err_disk:
+ put_disk(gd);
+err:
+ spin_lock_irqsave(&d->lock, flags);
+ d->flags &= ~DEVFL_GD_NOW;
+ schedule_work(&d->work);
+ spin_unlock_irqrestore(&d->lock, flags);
+}
+
+void
+aoeblk_exit(void)
+{
+ debugfs_remove_recursive(aoe_debugfs_dir);
+ aoe_debugfs_dir = NULL;
+ kmem_cache_destroy(buf_pool_cache);
+}
+
+int __init
+aoeblk_init(void)
+{
+ buf_pool_cache = kmem_cache_create("aoe_bufs",
+ sizeof(struct buf),
+ 0, 0, NULL);
+ if (buf_pool_cache == NULL)
+ return -ENOMEM;
+ aoe_debugfs_dir = debugfs_create_dir("aoe", NULL);
+ if (IS_ERR_OR_NULL(aoe_debugfs_dir)) {
+ pr_info("aoe: cannot create debugfs directory\n");
+ aoe_debugfs_dir = NULL;
+ }
+ return 0;
+}
+
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
new file mode 100644
index 000000000..ab41be625
--- /dev/null
+++ b/drivers/block/aoe/aoechr.c
@@ -0,0 +1,320 @@
+/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */
+/*
+ * aoechr.c
+ * AoE character device driver
+ */
+
+#include <linux/hdreg.h>
+#include <linux/blkdev.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/skbuff.h>
+#include <linux/export.h>
+#include "aoe.h"
+
+enum {
+ //MINOR_STAT = 1, (moved to sysfs)
+ MINOR_ERR = 2,
+ MINOR_DISCOVER,
+ MINOR_INTERFACES,
+ MINOR_REVALIDATE,
+ MINOR_FLUSH,
+ MSGSZ = 2048,
+ NMSG = 100, /* message backlog to retain */
+};
+
+struct aoe_chardev {
+ ulong minor;
+ char name[32];
+};
+
+enum { EMFL_VALID = 1 };
+
+struct ErrMsg {
+ short flags;
+ short len;
+ char *msg;
+};
+
+static DEFINE_MUTEX(aoechr_mutex);
+
+/* A ring buffer of error messages, to be read through
+ * "/dev/etherd/err". When no messages are present,
+ * readers will block waiting for messages to appear.
+ */
+static struct ErrMsg emsgs[NMSG];
+static int emsgs_head_idx, emsgs_tail_idx;
+static struct completion emsgs_comp;
+static spinlock_t emsgs_lock;
+static int nblocked_emsgs_readers;
+static struct class *aoe_class;
+static struct aoe_chardev chardevs[] = {
+ { MINOR_ERR, "err" },
+ { MINOR_DISCOVER, "discover" },
+ { MINOR_INTERFACES, "interfaces" },
+ { MINOR_REVALIDATE, "revalidate" },
+ { MINOR_FLUSH, "flush" },
+};
+
+static int
+discover(void)
+{
+ aoecmd_cfg(0xffff, 0xff);
+ return 0;
+}
+
+static int
+interfaces(const char __user *str, size_t size)
+{
+ if (set_aoe_iflist(str, size)) {
+ printk(KERN_ERR
+ "aoe: could not set interface list: too many interfaces\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int
+revalidate(const char __user *str, size_t size)
+{
+ int major, minor, n;
+ ulong flags;
+ struct aoedev *d;
+ struct sk_buff *skb;
+ char buf[16];
+
+ if (size >= sizeof buf)
+ return -EINVAL;
+ buf[sizeof buf - 1] = '\0';
+ if (copy_from_user(buf, str, size))
+ return -EFAULT;
+
+ n = sscanf(buf, "e%d.%d", &major, &minor);
+ if (n != 2) {
+ pr_err("aoe: invalid device specification %s\n", buf);
+ return -EINVAL;
+ }
+ d = aoedev_by_aoeaddr(major, minor, 0);
+ if (!d)
+ return -EINVAL;
+ spin_lock_irqsave(&d->lock, flags);
+ aoecmd_cleanslate(d);
+ aoecmd_cfg(major, minor);
+loop:
+ skb = aoecmd_ata_id(d);
+ spin_unlock_irqrestore(&d->lock, flags);
+ /* try again if we are able to sleep a bit,
+ * otherwise give up this revalidation
+ */
+ if (!skb && !msleep_interruptible(250)) {
+ spin_lock_irqsave(&d->lock, flags);
+ goto loop;
+ }
+ aoedev_put(d);
+ if (skb) {
+ struct sk_buff_head queue;
+ __skb_queue_head_init(&queue);
+ __skb_queue_tail(&queue, skb);
+ aoenet_xmit(&queue);
+ }
+ return 0;
+}
+
+void
+aoechr_error(char *msg)
+{
+ struct ErrMsg *em;
+ char *mp;
+ ulong flags, n;
+
+ n = strlen(msg);
+
+ spin_lock_irqsave(&emsgs_lock, flags);
+
+ em = emsgs + emsgs_tail_idx;
+ if ((em->flags & EMFL_VALID)) {
+bail: spin_unlock_irqrestore(&emsgs_lock, flags);
+ return;
+ }
+
+ mp = kmemdup(msg, n, GFP_ATOMIC);
+ if (mp == NULL) {
+ printk(KERN_ERR "aoe: allocation failure, len=%ld\n", n);
+ goto bail;
+ }
+
+ em->msg = mp;
+ em->flags |= EMFL_VALID;
+ em->len = n;
+
+ emsgs_tail_idx++;
+ emsgs_tail_idx %= ARRAY_SIZE(emsgs);
+
+ spin_unlock_irqrestore(&emsgs_lock, flags);
+
+ if (nblocked_emsgs_readers)
+ complete(&emsgs_comp);
+}
+
+static ssize_t
+aoechr_write(struct file *filp, const char __user *buf, size_t cnt, loff_t *offp)
+{
+ int ret = -EINVAL;
+
+ switch ((unsigned long) filp->private_data) {
+ default:
+ printk(KERN_INFO "aoe: can't write to that file.\n");
+ break;
+ case MINOR_DISCOVER:
+ ret = discover();
+ break;
+ case MINOR_INTERFACES:
+ ret = interfaces(buf, cnt);
+ break;
+ case MINOR_REVALIDATE:
+ ret = revalidate(buf, cnt);
+ break;
+ case MINOR_FLUSH:
+ ret = aoedev_flush(buf, cnt);
+ break;
+ }
+ if (ret == 0)
+ ret = cnt;
+ return ret;
+}
+
+static int
+aoechr_open(struct inode *inode, struct file *filp)
+{
+ int n, i;
+
+ mutex_lock(&aoechr_mutex);
+ n = iminor(inode);
+ filp->private_data = (void *) (unsigned long) n;
+
+ for (i = 0; i < ARRAY_SIZE(chardevs); ++i)
+ if (chardevs[i].minor == n) {
+ mutex_unlock(&aoechr_mutex);
+ return 0;
+ }
+ mutex_unlock(&aoechr_mutex);
+ return -EINVAL;
+}
+
+static int
+aoechr_rel(struct inode *inode, struct file *filp)
+{
+ return 0;
+}
+
+static ssize_t
+aoechr_read(struct file *filp, char __user *buf, size_t cnt, loff_t *off)
+{
+ unsigned long n;
+ char *mp;
+ struct ErrMsg *em;
+ ssize_t len;
+ ulong flags;
+
+ n = (unsigned long) filp->private_data;
+ if (n != MINOR_ERR)
+ return -EFAULT;
+
+ spin_lock_irqsave(&emsgs_lock, flags);
+
+ for (;;) {
+ em = emsgs + emsgs_head_idx;
+ if ((em->flags & EMFL_VALID) != 0)
+ break;
+ if (filp->f_flags & O_NDELAY) {
+ spin_unlock_irqrestore(&emsgs_lock, flags);
+ return -EAGAIN;
+ }
+ nblocked_emsgs_readers++;
+
+ spin_unlock_irqrestore(&emsgs_lock, flags);
+
+ n = wait_for_completion_interruptible(&emsgs_comp);
+
+ spin_lock_irqsave(&emsgs_lock, flags);
+
+ nblocked_emsgs_readers--;
+
+ if (n) {
+ spin_unlock_irqrestore(&emsgs_lock, flags);
+ return -ERESTARTSYS;
+ }
+ }
+ if (em->len > cnt) {
+ spin_unlock_irqrestore(&emsgs_lock, flags);
+ return -EAGAIN;
+ }
+ mp = em->msg;
+ len = em->len;
+ em->msg = NULL;
+ em->flags &= ~EMFL_VALID;
+
+ emsgs_head_idx++;
+ emsgs_head_idx %= ARRAY_SIZE(emsgs);
+
+ spin_unlock_irqrestore(&emsgs_lock, flags);
+
+ n = copy_to_user(buf, mp, len);
+ kfree(mp);
+ return n == 0 ? len : -EFAULT;
+}
+
+static const struct file_operations aoe_fops = {
+ .write = aoechr_write,
+ .read = aoechr_read,
+ .open = aoechr_open,
+ .release = aoechr_rel,
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+};
+
+static char *aoe_devnode(struct device *dev, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, "etherd/%s", dev_name(dev));
+}
+
+int __init
+aoechr_init(void)
+{
+ int n, i;
+
+ n = register_chrdev(AOE_MAJOR, "aoechr", &aoe_fops);
+ if (n < 0) {
+ printk(KERN_ERR "aoe: can't register char device\n");
+ return n;
+ }
+ init_completion(&emsgs_comp);
+ spin_lock_init(&emsgs_lock);
+ aoe_class = class_create(THIS_MODULE, "aoe");
+ if (IS_ERR(aoe_class)) {
+ unregister_chrdev(AOE_MAJOR, "aoechr");
+ return PTR_ERR(aoe_class);
+ }
+ aoe_class->devnode = aoe_devnode;
+
+ for (i = 0; i < ARRAY_SIZE(chardevs); ++i)
+ device_create(aoe_class, NULL,
+ MKDEV(AOE_MAJOR, chardevs[i].minor), NULL,
+ chardevs[i].name);
+
+ return 0;
+}
+
+void
+aoechr_exit(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(chardevs); ++i)
+ device_destroy(aoe_class, MKDEV(AOE_MAJOR, chardevs[i].minor));
+ class_destroy(aoe_class);
+ unregister_chrdev(AOE_MAJOR, "aoechr");
+}
+
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
new file mode 100644
index 000000000..422b7d84f
--- /dev/null
+++ b/drivers/block/aoe/aoecmd.c
@@ -0,0 +1,1826 @@
+/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */
+/*
+ * aoecmd.c
+ * Filesystem request handling methods
+ */
+
+#include <linux/ata.h>
+#include <linux/slab.h>
+#include <linux/hdreg.h>
+#include <linux/blkdev.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/genhd.h>
+#include <linux/moduleparam.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <net/net_namespace.h>
+#include <asm/unaligned.h>
+#include <linux/uio.h>
+#include "aoe.h"
+
+#define MAXIOC (8192) /* default meant to avoid most soft lockups */
+
+static void ktcomplete(struct frame *, struct sk_buff *);
+static int count_targets(struct aoedev *d, int *untainted);
+
+static struct buf *nextbuf(struct aoedev *);
+
+static int aoe_deadsecs = 60 * 3;
+module_param(aoe_deadsecs, int, 0644);
+MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev.");
+
+static int aoe_maxout = 64;
+module_param(aoe_maxout, int, 0644);
+MODULE_PARM_DESC(aoe_maxout,
+ "Only aoe_maxout outstanding packets for every MAC on eX.Y.");
+
+/* The number of online cpus during module initialization gives us a
+ * convenient heuristic cap on the parallelism used for ktio threads
+ * doing I/O completion. It is not important that the cap equal the
+ * actual number of running CPUs at any given time, but because of CPU
+ * hotplug, we take care to use ncpus instead of using
+ * num_online_cpus() after module initialization.
+ */
+static int ncpus;
+
+/* mutex lock used for synchronization while thread spawning */
+static DEFINE_MUTEX(ktio_spawn_lock);
+
+static wait_queue_head_t *ktiowq;
+static struct ktstate *kts;
+
+/* io completion queue */
+struct iocq_ktio {
+ struct list_head head;
+ spinlock_t lock;
+};
+static struct iocq_ktio *iocq;
+
+static struct page *empty_page;
+
+static struct sk_buff *
+new_skb(ulong len)
+{
+ struct sk_buff *skb;
+
+ skb = alloc_skb(len + MAX_HEADER, GFP_ATOMIC);
+ if (skb) {
+ skb_reserve(skb, MAX_HEADER);
+ skb_reset_mac_header(skb);
+ skb_reset_network_header(skb);
+ skb->protocol = __constant_htons(ETH_P_AOE);
+ skb_checksum_none_assert(skb);
+ }
+ return skb;
+}
+
+static struct frame *
+getframe_deferred(struct aoedev *d, u32 tag)
+{
+ struct list_head *head, *pos, *nx;
+ struct frame *f;
+
+ head = &d->rexmitq;
+ list_for_each_safe(pos, nx, head) {
+ f = list_entry(pos, struct frame, head);
+ if (f->tag == tag) {
+ list_del(pos);
+ return f;
+ }
+ }
+ return NULL;
+}
+
+static struct frame *
+getframe(struct aoedev *d, u32 tag)
+{
+ struct frame *f;
+ struct list_head *head, *pos, *nx;
+ u32 n;
+
+ n = tag % NFACTIVE;
+ head = &d->factive[n];
+ list_for_each_safe(pos, nx, head) {
+ f = list_entry(pos, struct frame, head);
+ if (f->tag == tag) {
+ list_del(pos);
+ return f;
+ }
+ }
+ return NULL;
+}
+
+/*
+ * Leave the top bit clear so we have tagspace for userland.
+ * The bottom 16 bits are the xmit tick for rexmit/rttavg processing.
+ * This driver reserves tag -1 to mean "unused frame."
+ */
+static int
+newtag(struct aoedev *d)
+{
+ register ulong n;
+
+ n = jiffies & 0xffff;
+ return n |= (++d->lasttag & 0x7fff) << 16;
+}
+
+static u32
+aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h)
+{
+ u32 host_tag = newtag(d);
+
+ memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
+ memcpy(h->dst, t->addr, sizeof h->dst);
+ h->type = __constant_cpu_to_be16(ETH_P_AOE);
+ h->verfl = AOE_HVER;
+ h->major = cpu_to_be16(d->aoemajor);
+ h->minor = d->aoeminor;
+ h->cmd = AOECMD_ATA;
+ h->tag = cpu_to_be32(host_tag);
+
+ return host_tag;
+}
+
+static inline void
+put_lba(struct aoe_atahdr *ah, sector_t lba)
+{
+ ah->lba0 = lba;
+ ah->lba1 = lba >>= 8;
+ ah->lba2 = lba >>= 8;
+ ah->lba3 = lba >>= 8;
+ ah->lba4 = lba >>= 8;
+ ah->lba5 = lba >>= 8;
+}
+
+static struct aoeif *
+ifrotate(struct aoetgt *t)
+{
+ struct aoeif *ifp;
+
+ ifp = t->ifp;
+ ifp++;
+ if (ifp >= &t->ifs[NAOEIFS] || ifp->nd == NULL)
+ ifp = t->ifs;
+ if (ifp->nd == NULL)
+ return NULL;
+ return t->ifp = ifp;
+}
+
+static void
+skb_pool_put(struct aoedev *d, struct sk_buff *skb)
+{
+ __skb_queue_tail(&d->skbpool, skb);
+}
+
+static struct sk_buff *
+skb_pool_get(struct aoedev *d)
+{
+ struct sk_buff *skb = skb_peek(&d->skbpool);
+
+ if (skb && atomic_read(&skb_shinfo(skb)->dataref) == 1) {
+ __skb_unlink(skb, &d->skbpool);
+ return skb;
+ }
+ if (skb_queue_len(&d->skbpool) < NSKBPOOLMAX &&
+ (skb = new_skb(ETH_ZLEN)))
+ return skb;
+
+ return NULL;
+}
+
+void
+aoe_freetframe(struct frame *f)
+{
+ struct aoetgt *t;
+
+ t = f->t;
+ f->buf = NULL;
+ memset(&f->iter, 0, sizeof(f->iter));
+ f->r_skb = NULL;
+ f->flags = 0;
+ list_add(&f->head, &t->ffree);
+}
+
+static struct frame *
+newtframe(struct aoedev *d, struct aoetgt *t)
+{
+ struct frame *f;
+ struct sk_buff *skb;
+ struct list_head *pos;
+
+ if (list_empty(&t->ffree)) {
+ if (t->falloc >= NSKBPOOLMAX*2)
+ return NULL;
+ f = kcalloc(1, sizeof(*f), GFP_ATOMIC);
+ if (f == NULL)
+ return NULL;
+ t->falloc++;
+ f->t = t;
+ } else {
+ pos = t->ffree.next;
+ list_del(pos);
+ f = list_entry(pos, struct frame, head);
+ }
+
+ skb = f->skb;
+ if (skb == NULL) {
+ f->skb = skb = new_skb(ETH_ZLEN);
+ if (!skb) {
+bail: aoe_freetframe(f);
+ return NULL;
+ }
+ }
+
+ if (atomic_read(&skb_shinfo(skb)->dataref) != 1) {
+ skb = skb_pool_get(d);
+ if (skb == NULL)
+ goto bail;
+ skb_pool_put(d, f->skb);
+ f->skb = skb;
+ }
+
+ skb->truesize -= skb->data_len;
+ skb_shinfo(skb)->nr_frags = skb->data_len = 0;
+ skb_trim(skb, 0);
+ return f;
+}
+
+static struct frame *
+newframe(struct aoedev *d)
+{
+ struct frame *f;
+ struct aoetgt *t, **tt;
+ int totout = 0;
+ int use_tainted;
+ int has_untainted;
+
+ if (!d->targets || !d->targets[0]) {
+ printk(KERN_ERR "aoe: NULL TARGETS!\n");
+ return NULL;
+ }
+ tt = d->tgt; /* last used target */
+ for (use_tainted = 0, has_untainted = 0;;) {
+ tt++;
+ if (tt >= &d->targets[d->ntargets] || !*tt)
+ tt = d->targets;
+ t = *tt;
+ if (!t->taint) {
+ has_untainted = 1;
+ totout += t->nout;
+ }
+ if (t->nout < t->maxout
+ && (use_tainted || !t->taint)
+ && t->ifp->nd) {
+ f = newtframe(d, t);
+ if (f) {
+ ifrotate(t);
+ d->tgt = tt;
+ return f;
+ }
+ }
+ if (tt == d->tgt) { /* we've looped and found nada */
+ if (!use_tainted && !has_untainted)
+ use_tainted = 1;
+ else
+ break;
+ }
+ }
+ if (totout == 0) {
+ d->kicked++;
+ d->flags |= DEVFL_KICKME;
+ }
+ return NULL;
+}
+
+static void
+skb_fillup(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter)
+{
+ int frag = 0;
+ struct bio_vec bv;
+
+ __bio_for_each_segment(bv, bio, iter, iter)
+ skb_fill_page_desc(skb, frag++, bv.bv_page,
+ bv.bv_offset, bv.bv_len);
+}
+
+static void
+fhash(struct frame *f)
+{
+ struct aoedev *d = f->t->d;
+ u32 n;
+
+ n = f->tag % NFACTIVE;
+ list_add_tail(&f->head, &d->factive[n]);
+}
+
+static void
+ata_rw_frameinit(struct frame *f)
+{
+ struct aoetgt *t;
+ struct aoe_hdr *h;
+ struct aoe_atahdr *ah;
+ struct sk_buff *skb;
+ char writebit, extbit;
+
+ skb = f->skb;
+ h = (struct aoe_hdr *) skb_mac_header(skb);
+ ah = (struct aoe_atahdr *) (h + 1);
+ skb_put(skb, sizeof(*h) + sizeof(*ah));
+ memset(h, 0, skb->len);
+
+ writebit = 0x10;
+ extbit = 0x4;
+
+ t = f->t;
+ f->tag = aoehdr_atainit(t->d, t, h);
+ fhash(f);
+ t->nout++;
+ f->waited = 0;
+ f->waited_total = 0;
+
+ /* set up ata header */
+ ah->scnt = f->iter.bi_size >> 9;
+ put_lba(ah, f->iter.bi_sector);
+ if (t->d->flags & DEVFL_EXT) {
+ ah->aflags |= AOEAFL_EXT;
+ } else {
+ extbit = 0;
+ ah->lba3 &= 0x0f;
+ ah->lba3 |= 0xe0; /* LBA bit + obsolete 0xa0 */
+ }
+ if (f->buf && bio_data_dir(f->buf->bio) == WRITE) {
+ skb_fillup(skb, f->buf->bio, f->iter);
+ ah->aflags |= AOEAFL_WRITE;
+ skb->len += f->iter.bi_size;
+ skb->data_len = f->iter.bi_size;
+ skb->truesize += f->iter.bi_size;
+ t->wpkts++;
+ } else {
+ t->rpkts++;
+ writebit = 0;
+ }
+
+ ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit;
+ skb->dev = t->ifp->nd;
+}
+
+static int
+aoecmd_ata_rw(struct aoedev *d)
+{
+ struct frame *f;
+ struct buf *buf;
+ struct sk_buff *skb;
+ struct sk_buff_head queue;
+
+ buf = nextbuf(d);
+ if (buf == NULL)
+ return 0;
+ f = newframe(d);
+ if (f == NULL)
+ return 0;
+
+ /* initialize the headers & frame */
+ f->buf = buf;
+ f->iter = buf->iter;
+ f->iter.bi_size = min_t(unsigned long,
+ d->maxbcnt ?: DEFAULTBCNT,
+ f->iter.bi_size);
+ bio_advance_iter(buf->bio, &buf->iter, f->iter.bi_size);
+
+ if (!buf->iter.bi_size)
+ d->ip.buf = NULL;
+
+ /* mark all tracking fields and load out */
+ buf->nframesout += 1;
+
+ ata_rw_frameinit(f);
+
+ skb = skb_clone(f->skb, GFP_ATOMIC);
+ if (skb) {
+ do_gettimeofday(&f->sent);
+ f->sent_jiffs = (u32) jiffies;
+ __skb_queue_head_init(&queue);
+ __skb_queue_tail(&queue, skb);
+ aoenet_xmit(&queue);
+ }
+ return 1;
+}
+
+/* some callers cannot sleep, and they can call this function,
+ * transmitting the packets later, when interrupts are on
+ */
+static void
+aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *queue)
+{
+ struct aoe_hdr *h;
+ struct aoe_cfghdr *ch;
+ struct sk_buff *skb;
+ struct net_device *ifp;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, ifp) {
+ dev_hold(ifp);
+ if (!is_aoe_netif(ifp))
+ goto cont;
+
+ skb = new_skb(sizeof *h + sizeof *ch);
+ if (skb == NULL) {
+ printk(KERN_INFO "aoe: skb alloc failure\n");
+ goto cont;
+ }
+ skb_put(skb, sizeof *h + sizeof *ch);
+ skb->dev = ifp;
+ __skb_queue_tail(queue, skb);
+ h = (struct aoe_hdr *) skb_mac_header(skb);
+ memset(h, 0, sizeof *h + sizeof *ch);
+
+ memset(h->dst, 0xff, sizeof h->dst);
+ memcpy(h->src, ifp->dev_addr, sizeof h->src);
+ h->type = __constant_cpu_to_be16(ETH_P_AOE);
+ h->verfl = AOE_HVER;
+ h->major = cpu_to_be16(aoemajor);
+ h->minor = aoeminor;
+ h->cmd = AOECMD_CFG;
+
+cont:
+ dev_put(ifp);
+ }
+ rcu_read_unlock();
+}
+
+static void
+resend(struct aoedev *d, struct frame *f)
+{
+ struct sk_buff *skb;
+ struct sk_buff_head queue;
+ struct aoe_hdr *h;
+ struct aoetgt *t;
+ char buf[128];
+ u32 n;
+
+ t = f->t;
+ n = newtag(d);
+ skb = f->skb;
+ if (ifrotate(t) == NULL) {
+ /* probably can't happen, but set it up to fail anyway */
+ pr_info("aoe: resend: no interfaces to rotate to.\n");
+ ktcomplete(f, NULL);
+ return;
+ }
+ h = (struct aoe_hdr *) skb_mac_header(skb);
+
+ if (!(f->flags & FFL_PROBE)) {
+ snprintf(buf, sizeof(buf),
+ "%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x s=%pm d=%pm nout=%d\n",
+ "retransmit", d->aoemajor, d->aoeminor,
+ f->tag, jiffies, n,
+ h->src, h->dst, t->nout);
+ aoechr_error(buf);
+ }
+
+ f->tag = n;
+ fhash(f);
+ h->tag = cpu_to_be32(n);
+ memcpy(h->dst, t->addr, sizeof h->dst);
+ memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
+
+ skb->dev = t->ifp->nd;
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (skb == NULL)
+ return;
+ do_gettimeofday(&f->sent);
+ f->sent_jiffs = (u32) jiffies;
+ __skb_queue_head_init(&queue);
+ __skb_queue_tail(&queue, skb);
+ aoenet_xmit(&queue);
+}
+
+static int
+tsince_hr(struct frame *f)
+{
+ struct timeval now;
+ int n;
+
+ do_gettimeofday(&now);
+ n = now.tv_usec - f->sent.tv_usec;
+ n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC;
+
+ if (n < 0)
+ n = -n;
+
+ /* For relatively long periods, use jiffies to avoid
+ * discrepancies caused by updates to the system time.
+ *
+ * On system with HZ of 1000, 32-bits is over 49 days
+ * worth of jiffies, or over 71 minutes worth of usecs.
+ *
+ * Jiffies overflow is handled by subtraction of unsigned ints:
+ * (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe
+ * $3 = 4
+ * (gdb)
+ */
+ if (n > USEC_PER_SEC / 4) {
+ n = ((u32) jiffies) - f->sent_jiffs;
+ n *= USEC_PER_SEC / HZ;
+ }
+
+ return n;
+}
+
+static int
+tsince(u32 tag)
+{
+ int n;
+
+ n = jiffies & 0xffff;
+ n -= tag & 0xffff;
+ if (n < 0)
+ n += 1<<16;
+ return jiffies_to_usecs(n + 1);
+}
+
+static struct aoeif *
+getif(struct aoetgt *t, struct net_device *nd)
+{
+ struct aoeif *p, *e;
+
+ p = t->ifs;
+ e = p + NAOEIFS;
+ for (; p < e; p++)
+ if (p->nd == nd)
+ return p;
+ return NULL;
+}
+
+static void
+ejectif(struct aoetgt *t, struct aoeif *ifp)
+{
+ struct aoeif *e;
+ struct net_device *nd;
+ ulong n;
+
+ nd = ifp->nd;
+ e = t->ifs + NAOEIFS - 1;
+ n = (e - ifp) * sizeof *ifp;
+ memmove(ifp, ifp+1, n);
+ e->nd = NULL;
+ dev_put(nd);
+}
+
+static struct frame *
+reassign_frame(struct frame *f)
+{
+ struct frame *nf;
+ struct sk_buff *skb;
+
+ nf = newframe(f->t->d);
+ if (!nf)
+ return NULL;
+ if (nf->t == f->t) {
+ aoe_freetframe(nf);
+ return NULL;
+ }
+
+ skb = nf->skb;
+ nf->skb = f->skb;
+ nf->buf = f->buf;
+ nf->iter = f->iter;
+ nf->waited = 0;
+ nf->waited_total = f->waited_total;
+ nf->sent = f->sent;
+ nf->sent_jiffs = f->sent_jiffs;
+ f->skb = skb;
+
+ return nf;
+}
+
+static void
+probe(struct aoetgt *t)
+{
+ struct aoedev *d;
+ struct frame *f;
+ struct sk_buff *skb;
+ struct sk_buff_head queue;
+ size_t n, m;
+ int frag;
+
+ d = t->d;
+ f = newtframe(d, t);
+ if (!f) {
+ pr_err("%s %pm for e%ld.%d: %s\n",
+ "aoe: cannot probe remote address",
+ t->addr,
+ (long) d->aoemajor, d->aoeminor,
+ "no frame available");
+ return;
+ }
+ f->flags |= FFL_PROBE;
+ ifrotate(t);
+ f->iter.bi_size = t->d->maxbcnt ? t->d->maxbcnt : DEFAULTBCNT;
+ ata_rw_frameinit(f);
+ skb = f->skb;
+ for (frag = 0, n = f->iter.bi_size; n > 0; ++frag, n -= m) {
+ if (n < PAGE_SIZE)
+ m = n;
+ else
+ m = PAGE_SIZE;
+ skb_fill_page_desc(skb, frag, empty_page, 0, m);
+ }
+ skb->len += f->iter.bi_size;
+ skb->data_len = f->iter.bi_size;
+ skb->truesize += f->iter.bi_size;
+
+ skb = skb_clone(f->skb, GFP_ATOMIC);
+ if (skb) {
+ do_gettimeofday(&f->sent);
+ f->sent_jiffs = (u32) jiffies;
+ __skb_queue_head_init(&queue);
+ __skb_queue_tail(&queue, skb);
+ aoenet_xmit(&queue);
+ }
+}
+
+static long
+rto(struct aoedev *d)
+{
+ long t;
+
+ t = 2 * d->rttavg >> RTTSCALE;
+ t += 8 * d->rttdev >> RTTDSCALE;
+ if (t == 0)
+ t = 1;
+
+ return t;
+}
+
+static void
+rexmit_deferred(struct aoedev *d)
+{
+ struct aoetgt *t;
+ struct frame *f;
+ struct frame *nf;
+ struct list_head *pos, *nx, *head;
+ int since;
+ int untainted;
+
+ count_targets(d, &untainted);
+
+ head = &d->rexmitq;
+ list_for_each_safe(pos, nx, head) {
+ f = list_entry(pos, struct frame, head);
+ t = f->t;
+ if (t->taint) {
+ if (!(f->flags & FFL_PROBE)) {
+ nf = reassign_frame(f);
+ if (nf) {
+ if (t->nout_probes == 0
+ && untainted > 0) {
+ probe(t);
+ t->nout_probes++;
+ }
+ list_replace(&f->head, &nf->head);
+ pos = &nf->head;
+ aoe_freetframe(f);
+ f = nf;
+ t = f->t;
+ }
+ } else if (untainted < 1) {
+ /* don't probe w/o other untainted aoetgts */
+ goto stop_probe;
+ } else if (tsince_hr(f) < t->taint * rto(d)) {
+ /* reprobe slowly when taint is high */
+ continue;
+ }
+ } else if (f->flags & FFL_PROBE) {
+stop_probe: /* don't probe untainted aoetgts */
+ list_del(pos);
+ aoe_freetframe(f);
+ /* leaving d->kicked, because this is routine */
+ f->t->d->flags |= DEVFL_KICKME;
+ continue;
+ }
+ if (t->nout >= t->maxout)
+ continue;
+ list_del(pos);
+ t->nout++;
+ if (f->flags & FFL_PROBE)
+ t->nout_probes++;
+ since = tsince_hr(f);
+ f->waited += since;
+ f->waited_total += since;
+ resend(d, f);
+ }
+}
+
+/* An aoetgt accumulates demerits quickly, and successful
+ * probing redeems the aoetgt slowly.
+ */
+static void
+scorn(struct aoetgt *t)
+{
+ int n;
+
+ n = t->taint++;
+ t->taint += t->taint * 2;
+ if (n > t->taint)
+ t->taint = n;
+ if (t->taint > MAX_TAINT)
+ t->taint = MAX_TAINT;
+}
+
+static int
+count_targets(struct aoedev *d, int *untainted)
+{
+ int i, good;
+
+ for (i = good = 0; i < d->ntargets && d->targets[i]; ++i)
+ if (d->targets[i]->taint == 0)
+ good++;
+
+ if (untainted)
+ *untainted = good;
+ return i;
+}
+
+static void
+rexmit_timer(ulong vp)
+{
+ struct aoedev *d;
+ struct aoetgt *t;
+ struct aoeif *ifp;
+ struct frame *f;
+ struct list_head *head, *pos, *nx;
+ LIST_HEAD(flist);
+ register long timeout;
+ ulong flags, n;
+ int i;
+ int utgts; /* number of aoetgt descriptors (not slots) */
+ int since;
+
+ d = (struct aoedev *) vp;
+
+ spin_lock_irqsave(&d->lock, flags);
+
+ /* timeout based on observed timings and variations */
+ timeout = rto(d);
+
+ utgts = count_targets(d, NULL);
+
+ if (d->flags & DEVFL_TKILL) {
+ spin_unlock_irqrestore(&d->lock, flags);
+ return;
+ }
+
+ /* collect all frames to rexmit into flist */
+ for (i = 0; i < NFACTIVE; i++) {
+ head = &d->factive[i];
+ list_for_each_safe(pos, nx, head) {
+ f = list_entry(pos, struct frame, head);
+ if (tsince_hr(f) < timeout)
+ break; /* end of expired frames */
+ /* move to flist for later processing */
+ list_move_tail(pos, &flist);
+ }
+ }
+
+ /* process expired frames */
+ while (!list_empty(&flist)) {
+ pos = flist.next;
+ f = list_entry(pos, struct frame, head);
+ since = tsince_hr(f);
+ n = f->waited_total + since;
+ n /= USEC_PER_SEC;
+ if (aoe_deadsecs
+ && n > aoe_deadsecs
+ && !(f->flags & FFL_PROBE)) {
+ /* Waited too long. Device failure.
+ * Hang all frames on first hash bucket for downdev
+ * to clean up.
+ */
+ list_splice(&flist, &d->factive[0]);
+ aoedev_downdev(d);
+ goto out;
+ }
+
+ t = f->t;
+ n = f->waited + since;
+ n /= USEC_PER_SEC;
+ if (aoe_deadsecs && utgts > 0
+ && (n > aoe_deadsecs / utgts || n > HARD_SCORN_SECS))
+ scorn(t); /* avoid this target */
+
+ if (t->maxout != 1) {
+ t->ssthresh = t->maxout / 2;
+ t->maxout = 1;
+ }
+
+ if (f->flags & FFL_PROBE) {
+ t->nout_probes--;
+ } else {
+ ifp = getif(t, f->skb->dev);
+ if (ifp && ++ifp->lost > (t->nframes << 1)
+ && (ifp != t->ifs || t->ifs[1].nd)) {
+ ejectif(t, ifp);
+ ifp = NULL;
+ }
+ }
+ list_move_tail(pos, &d->rexmitq);
+ t->nout--;
+ }
+ rexmit_deferred(d);
+
+out:
+ if ((d->flags & DEVFL_KICKME) && d->blkq) {
+ d->flags &= ~DEVFL_KICKME;
+ d->blkq->request_fn(d->blkq);
+ }
+
+ d->timer.expires = jiffies + TIMERTICK;
+ add_timer(&d->timer);
+
+ spin_unlock_irqrestore(&d->lock, flags);
+}
+
+static unsigned long
+rqbiocnt(struct request *r)
+{
+ struct bio *bio;
+ unsigned long n = 0;
+
+ __rq_for_each_bio(bio, r)
+ n++;
+ return n;
+}
+
+/* This can be removed if we are certain that no users of the block
+ * layer will ever use zero-count pages in bios. Otherwise we have to
+ * protect against the put_page sometimes done by the network layer.
+ *
+ * See http://oss.sgi.com/archives/xfs/2007-01/msg00594.html for
+ * discussion.
+ *
+ * We cannot use get_page in the workaround, because it insists on a
+ * positive page count as a precondition. So we use _count directly.
+ */
+static void
+bio_pageinc(struct bio *bio)
+{
+ struct bio_vec bv;
+ struct page *page;
+ struct bvec_iter iter;
+
+ bio_for_each_segment(bv, bio, iter) {
+ /* Non-zero page count for non-head members of
+ * compound pages is no longer allowed by the kernel.
+ */
+ page = compound_head(bv.bv_page);
+ atomic_inc(&page->_count);
+ }
+}
+
+static void
+bio_pagedec(struct bio *bio)
+{
+ struct page *page;
+ struct bio_vec bv;
+ struct bvec_iter iter;
+
+ bio_for_each_segment(bv, bio, iter) {
+ page = compound_head(bv.bv_page);
+ atomic_dec(&page->_count);
+ }
+}
+
+static void
+bufinit(struct buf *buf, struct request *rq, struct bio *bio)
+{
+ memset(buf, 0, sizeof(*buf));
+ buf->rq = rq;
+ buf->bio = bio;
+ buf->iter = bio->bi_iter;
+ bio_pageinc(bio);
+}
+
+static struct buf *
+nextbuf(struct aoedev *d)
+{
+ struct request *rq;
+ struct request_queue *q;
+ struct buf *buf;
+ struct bio *bio;
+
+ q = d->blkq;
+ if (q == NULL)
+ return NULL; /* initializing */
+ if (d->ip.buf)
+ return d->ip.buf;
+ rq = d->ip.rq;
+ if (rq == NULL) {
+ rq = blk_peek_request(q);
+ if (rq == NULL)
+ return NULL;
+ blk_start_request(rq);
+ d->ip.rq = rq;
+ d->ip.nxbio = rq->bio;
+ rq->special = (void *) rqbiocnt(rq);
+ }
+ buf = mempool_alloc(d->bufpool, GFP_ATOMIC);
+ if (buf == NULL) {
+ pr_err("aoe: nextbuf: unable to mempool_alloc!\n");
+ return NULL;
+ }
+ bio = d->ip.nxbio;
+ bufinit(buf, rq, bio);
+ bio = bio->bi_next;
+ d->ip.nxbio = bio;
+ if (bio == NULL)
+ d->ip.rq = NULL;
+ return d->ip.buf = buf;
+}
+
+/* enters with d->lock held */
+void
+aoecmd_work(struct aoedev *d)
+{
+ rexmit_deferred(d);
+ while (aoecmd_ata_rw(d))
+ ;
+}
+
+/* this function performs work that has been deferred until sleeping is OK
+ */
+void
+aoecmd_sleepwork(struct work_struct *work)
+{
+ struct aoedev *d = container_of(work, struct aoedev, work);
+ struct block_device *bd;
+ u64 ssize;
+
+ if (d->flags & DEVFL_GDALLOC)
+ aoeblk_gdalloc(d);
+
+ if (d->flags & DEVFL_NEWSIZE) {
+ ssize = get_capacity(d->gd);
+ bd = bdget_disk(d->gd, 0);
+ if (bd) {
+ mutex_lock(&bd->bd_inode->i_mutex);
+ i_size_write(bd->bd_inode, (loff_t)ssize<<9);
+ mutex_unlock(&bd->bd_inode->i_mutex);
+ bdput(bd);
+ }
+ spin_lock_irq(&d->lock);
+ d->flags |= DEVFL_UP;
+ d->flags &= ~DEVFL_NEWSIZE;
+ spin_unlock_irq(&d->lock);
+ }
+}
+
+static void
+ata_ident_fixstring(u16 *id, int ns)
+{
+ u16 s;
+
+ while (ns-- > 0) {
+ s = *id;
+ *id++ = s >> 8 | s << 8;
+ }
+}
+
+static void
+ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
+{
+ u64 ssize;
+ u16 n;
+
+ /* word 83: command set supported */
+ n = get_unaligned_le16(&id[83 << 1]);
+
+ /* word 86: command set/feature enabled */
+ n |= get_unaligned_le16(&id[86 << 1]);
+
+ if (n & (1<<10)) { /* bit 10: LBA 48 */
+ d->flags |= DEVFL_EXT;
+
+ /* word 100: number lba48 sectors */
+ ssize = get_unaligned_le64(&id[100 << 1]);
+
+ /* set as in ide-disk.c:init_idedisk_capacity */
+ d->geo.cylinders = ssize;
+ d->geo.cylinders /= (255 * 63);
+ d->geo.heads = 255;
+ d->geo.sectors = 63;
+ } else {
+ d->flags &= ~DEVFL_EXT;
+
+ /* number lba28 sectors */
+ ssize = get_unaligned_le32(&id[60 << 1]);
+
+ /* NOTE: obsolete in ATA 6 */
+ d->geo.cylinders = get_unaligned_le16(&id[54 << 1]);
+ d->geo.heads = get_unaligned_le16(&id[55 << 1]);
+ d->geo.sectors = get_unaligned_le16(&id[56 << 1]);
+ }
+
+ ata_ident_fixstring((u16 *) &id[10<<1], 10); /* serial */
+ ata_ident_fixstring((u16 *) &id[23<<1], 4); /* firmware */
+ ata_ident_fixstring((u16 *) &id[27<<1], 20); /* model */
+ memcpy(d->ident, id, sizeof(d->ident));
+
+ if (d->ssize != ssize)
+ printk(KERN_INFO
+ "aoe: %pm e%ld.%d v%04x has %llu sectors\n",
+ t->addr,
+ d->aoemajor, d->aoeminor,
+ d->fw_ver, (long long)ssize);
+ d->ssize = ssize;
+ d->geo.start = 0;
+ if (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
+ return;
+ if (d->gd != NULL) {
+ set_capacity(d->gd, ssize);
+ d->flags |= DEVFL_NEWSIZE;
+ } else
+ d->flags |= DEVFL_GDALLOC;
+ schedule_work(&d->work);
+}
+
+static void
+calc_rttavg(struct aoedev *d, struct aoetgt *t, int rtt)
+{
+ register long n;
+
+ n = rtt;
+
+ /* cf. Congestion Avoidance and Control, Jacobson & Karels, 1988 */
+ n -= d->rttavg >> RTTSCALE;
+ d->rttavg += n;
+ if (n < 0)
+ n = -n;
+ n -= d->rttdev >> RTTDSCALE;
+ d->rttdev += n;
+
+ if (!t || t->maxout >= t->nframes)
+ return;
+ if (t->maxout < t->ssthresh)
+ t->maxout += 1;
+ else if (t->nout == t->maxout && t->next_cwnd-- == 0) {
+ t->maxout += 1;
+ t->next_cwnd = t->maxout;
+ }
+}
+
+static struct aoetgt *
+gettgt(struct aoedev *d, char *addr)
+{
+ struct aoetgt **t, **e;
+
+ t = d->targets;
+ e = t + d->ntargets;
+ for (; t < e && *t; t++)
+ if (memcmp((*t)->addr, addr, sizeof((*t)->addr)) == 0)
+ return *t;
+ return NULL;
+}
+
+static void
+bvcpy(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter, long cnt)
+{
+ int soff = 0;
+ struct bio_vec bv;
+
+ iter.bi_size = cnt;
+
+ __bio_for_each_segment(bv, bio, iter, iter) {
+ char *p = page_address(bv.bv_page) + bv.bv_offset;
+ skb_copy_bits(skb, soff, p, bv.bv_len);
+ soff += bv.bv_len;
+ }
+}
+
+void
+aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
+{
+ struct bio *bio;
+ int bok;
+ struct request_queue *q;
+
+ q = d->blkq;
+ if (rq == d->ip.rq)
+ d->ip.rq = NULL;
+ do {
+ bio = rq->bio;
+ bok = !fastfail && test_bit(BIO_UPTODATE, &bio->bi_flags);
+ } while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_iter.bi_size));
+
+ /* cf. http://lkml.org/lkml/2006/10/31/28 */
+ if (!fastfail)
+ __blk_run_queue(q);
+}
+
+static void
+aoe_end_buf(struct aoedev *d, struct buf *buf)
+{
+ struct request *rq;
+ unsigned long n;
+
+ if (buf == d->ip.buf)
+ d->ip.buf = NULL;
+ rq = buf->rq;
+ bio_pagedec(buf->bio);
+ mempool_free(buf, d->bufpool);
+ n = (unsigned long) rq->special;
+ rq->special = (void *) --n;
+ if (n == 0)
+ aoe_end_request(d, rq, 0);
+}
+
+static void
+ktiocomplete(struct frame *f)
+{
+ struct aoe_hdr *hin, *hout;
+ struct aoe_atahdr *ahin, *ahout;
+ struct buf *buf;
+ struct sk_buff *skb;
+ struct aoetgt *t;
+ struct aoeif *ifp;
+ struct aoedev *d;
+ long n;
+ int untainted;
+
+ if (f == NULL)
+ return;
+
+ t = f->t;
+ d = t->d;
+ skb = f->r_skb;
+ buf = f->buf;
+ if (f->flags & FFL_PROBE)
+ goto out;
+ if (!skb) /* just fail the buf. */
+ goto noskb;
+
+ hout = (struct aoe_hdr *) skb_mac_header(f->skb);
+ ahout = (struct aoe_atahdr *) (hout+1);
+
+ hin = (struct aoe_hdr *) skb->data;
+ skb_pull(skb, sizeof(*hin));
+ ahin = (struct aoe_atahdr *) skb->data;
+ skb_pull(skb, sizeof(*ahin));
+ if (ahin->cmdstat & 0xa9) { /* these bits cleared on success */
+ pr_err("aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
+ ahout->cmdstat, ahin->cmdstat,
+ d->aoemajor, d->aoeminor);
+noskb: if (buf)
+ clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+ goto out;
+ }
+
+ n = ahout->scnt << 9;
+ switch (ahout->cmdstat) {
+ case ATA_CMD_PIO_READ:
+ case ATA_CMD_PIO_READ_EXT:
+ if (skb->len < n) {
+ pr_err("%s e%ld.%d. skb->len=%d need=%ld\n",
+ "aoe: runt data size in read from",
+ (long) d->aoemajor, d->aoeminor,
+ skb->len, n);
+ clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+ break;
+ }
+ if (n > f->iter.bi_size) {
+ pr_err_ratelimited("%s e%ld.%d. bytes=%ld need=%u\n",
+ "aoe: too-large data size in read from",
+ (long) d->aoemajor, d->aoeminor,
+ n, f->iter.bi_size);
+ clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+ break;
+ }
+ bvcpy(skb, f->buf->bio, f->iter, n);
+ case ATA_CMD_PIO_WRITE:
+ case ATA_CMD_PIO_WRITE_EXT:
+ spin_lock_irq(&d->lock);
+ ifp = getif(t, skb->dev);
+ if (ifp)
+ ifp->lost = 0;
+ spin_unlock_irq(&d->lock);
+ break;
+ case ATA_CMD_ID_ATA:
+ if (skb->len < 512) {
+ pr_info("%s e%ld.%d. skb->len=%d need=512\n",
+ "aoe: runt data size in ataid from",
+ (long) d->aoemajor, d->aoeminor,
+ skb->len);
+ break;
+ }
+ if (skb_linearize(skb))
+ break;
+ spin_lock_irq(&d->lock);
+ ataid_complete(d, t, skb->data);
+ spin_unlock_irq(&d->lock);
+ break;
+ default:
+ pr_info("aoe: unrecognized ata command %2.2Xh for %d.%d\n",
+ ahout->cmdstat,
+ be16_to_cpu(get_unaligned(&hin->major)),
+ hin->minor);
+ }
+out:
+ spin_lock_irq(&d->lock);
+ if (t->taint > 0
+ && --t->taint > 0
+ && t->nout_probes == 0) {
+ count_targets(d, &untainted);
+ if (untainted > 0) {
+ probe(t);
+ t->nout_probes++;
+ }
+ }
+
+ aoe_freetframe(f);
+
+ if (buf && --buf->nframesout == 0 && buf->iter.bi_size == 0)
+ aoe_end_buf(d, buf);
+
+ spin_unlock_irq(&d->lock);
+ aoedev_put(d);
+ dev_kfree_skb(skb);
+}
+
+/* Enters with iocq.lock held.
+ * Returns true iff responses needing processing remain.
+ */
+static int
+ktio(int id)
+{
+ struct frame *f;
+ struct list_head *pos;
+ int i;
+ int actual_id;
+
+ for (i = 0; ; ++i) {
+ if (i == MAXIOC)
+ return 1;
+ if (list_empty(&iocq[id].head))
+ return 0;
+ pos = iocq[id].head.next;
+ list_del(pos);
+ f = list_entry(pos, struct frame, head);
+ spin_unlock_irq(&iocq[id].lock);
+ ktiocomplete(f);
+
+ /* Figure out if extra threads are required. */
+ actual_id = f->t->d->aoeminor % ncpus;
+
+ if (!kts[actual_id].active) {
+ BUG_ON(id != 0);
+ mutex_lock(&ktio_spawn_lock);
+ if (!kts[actual_id].active
+ && aoe_ktstart(&kts[actual_id]) == 0)
+ kts[actual_id].active = 1;
+ mutex_unlock(&ktio_spawn_lock);
+ }
+ spin_lock_irq(&iocq[id].lock);
+ }
+}
+
+static int
+kthread(void *vp)
+{
+ struct ktstate *k;
+ DECLARE_WAITQUEUE(wait, current);
+ int more;
+
+ k = vp;
+ current->flags |= PF_NOFREEZE;
+ set_user_nice(current, -10);
+ complete(&k->rendez); /* tell spawner we're running */
+ do {
+ spin_lock_irq(k->lock);
+ more = k->fn(k->id);
+ if (!more) {
+ add_wait_queue(k->waitq, &wait);
+ __set_current_state(TASK_INTERRUPTIBLE);
+ }
+ spin_unlock_irq(k->lock);
+ if (!more) {
+ schedule();
+ remove_wait_queue(k->waitq, &wait);
+ } else
+ cond_resched();
+ } while (!kthread_should_stop());
+ complete(&k->rendez); /* tell spawner we're stopping */
+ return 0;
+}
+
+void
+aoe_ktstop(struct ktstate *k)
+{
+ kthread_stop(k->task);
+ wait_for_completion(&k->rendez);
+}
+
+int
+aoe_ktstart(struct ktstate *k)
+{
+ struct task_struct *task;
+
+ init_completion(&k->rendez);
+ task = kthread_run(kthread, k, "%s", k->name);
+ if (task == NULL || IS_ERR(task))
+ return -ENOMEM;
+ k->task = task;
+ wait_for_completion(&k->rendez); /* allow kthread to start */
+ init_completion(&k->rendez); /* for waiting for exit later */
+ return 0;
+}
+
+/* pass it off to kthreads for processing */
+static void
+ktcomplete(struct frame *f, struct sk_buff *skb)
+{
+ int id;
+ ulong flags;
+
+ f->r_skb = skb;
+ id = f->t->d->aoeminor % ncpus;
+ spin_lock_irqsave(&iocq[id].lock, flags);
+ if (!kts[id].active) {
+ spin_unlock_irqrestore(&iocq[id].lock, flags);
+ /* The thread with id has not been spawned yet,
+ * so delegate the work to the main thread and
+ * try spawning a new thread.
+ */
+ id = 0;
+ spin_lock_irqsave(&iocq[id].lock, flags);
+ }
+ list_add_tail(&f->head, &iocq[id].head);
+ spin_unlock_irqrestore(&iocq[id].lock, flags);
+ wake_up(&ktiowq[id]);
+}
+
+struct sk_buff *
+aoecmd_ata_rsp(struct sk_buff *skb)
+{
+ struct aoedev *d;
+ struct aoe_hdr *h;
+ struct frame *f;
+ u32 n;
+ ulong flags;
+ char ebuf[128];
+ u16 aoemajor;
+
+ h = (struct aoe_hdr *) skb->data;
+ aoemajor = be16_to_cpu(get_unaligned(&h->major));
+ d = aoedev_by_aoeaddr(aoemajor, h->minor, 0);
+ if (d == NULL) {
+ snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response "
+ "for unknown device %d.%d\n",
+ aoemajor, h->minor);
+ aoechr_error(ebuf);
+ return skb;
+ }
+
+ spin_lock_irqsave(&d->lock, flags);
+
+ n = be32_to_cpu(get_unaligned(&h->tag));
+ f = getframe(d, n);
+ if (f) {
+ calc_rttavg(d, f->t, tsince_hr(f));
+ f->t->nout--;
+ if (f->flags & FFL_PROBE)
+ f->t->nout_probes--;
+ } else {
+ f = getframe_deferred(d, n);
+ if (f) {
+ calc_rttavg(d, NULL, tsince_hr(f));
+ } else {
+ calc_rttavg(d, NULL, tsince(n));
+ spin_unlock_irqrestore(&d->lock, flags);
+ aoedev_put(d);
+ snprintf(ebuf, sizeof(ebuf),
+ "%15s e%d.%d tag=%08x@%08lx s=%pm d=%pm\n",
+ "unexpected rsp",
+ get_unaligned_be16(&h->major),
+ h->minor,
+ get_unaligned_be32(&h->tag),
+ jiffies,
+ h->src,
+ h->dst);
+ aoechr_error(ebuf);
+ return skb;
+ }
+ }
+ aoecmd_work(d);
+
+ spin_unlock_irqrestore(&d->lock, flags);
+
+ ktcomplete(f, skb);
+
+ /*
+ * Note here that we do not perform an aoedev_put, as we are
+ * leaving this reference for the ktio to release.
+ */
+ return NULL;
+}
+
+void
+aoecmd_cfg(ushort aoemajor, unsigned char aoeminor)
+{
+ struct sk_buff_head queue;
+
+ __skb_queue_head_init(&queue);
+ aoecmd_cfg_pkts(aoemajor, aoeminor, &queue);
+ aoenet_xmit(&queue);
+}
+
+struct sk_buff *
+aoecmd_ata_id(struct aoedev *d)
+{
+ struct aoe_hdr *h;
+ struct aoe_atahdr *ah;
+ struct frame *f;
+ struct sk_buff *skb;
+ struct aoetgt *t;
+
+ f = newframe(d);
+ if (f == NULL)
+ return NULL;
+
+ t = *d->tgt;
+
+ /* initialize the headers & frame */
+ skb = f->skb;
+ h = (struct aoe_hdr *) skb_mac_header(skb);
+ ah = (struct aoe_atahdr *) (h+1);
+ skb_put(skb, sizeof *h + sizeof *ah);
+ memset(h, 0, skb->len);
+ f->tag = aoehdr_atainit(d, t, h);
+ fhash(f);
+ t->nout++;
+ f->waited = 0;
+ f->waited_total = 0;
+
+ /* set up ata header */
+ ah->scnt = 1;
+ ah->cmdstat = ATA_CMD_ID_ATA;
+ ah->lba3 = 0xa0;
+
+ skb->dev = t->ifp->nd;
+
+ d->rttavg = RTTAVG_INIT;
+ d->rttdev = RTTDEV_INIT;
+ d->timer.function = rexmit_timer;
+
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (skb) {
+ do_gettimeofday(&f->sent);
+ f->sent_jiffs = (u32) jiffies;
+ }
+
+ return skb;
+}
+
+static struct aoetgt **
+grow_targets(struct aoedev *d)
+{
+ ulong oldn, newn;
+ struct aoetgt **tt;
+
+ oldn = d->ntargets;
+ newn = oldn * 2;
+ tt = kcalloc(newn, sizeof(*d->targets), GFP_ATOMIC);
+ if (!tt)
+ return NULL;
+ memmove(tt, d->targets, sizeof(*d->targets) * oldn);
+ d->tgt = tt + (d->tgt - d->targets);
+ kfree(d->targets);
+ d->targets = tt;
+ d->ntargets = newn;
+
+ return &d->targets[oldn];
+}
+
+static struct aoetgt *
+addtgt(struct aoedev *d, char *addr, ulong nframes)
+{
+ struct aoetgt *t, **tt, **te;
+
+ tt = d->targets;
+ te = tt + d->ntargets;
+ for (; tt < te && *tt; tt++)
+ ;
+
+ if (tt == te) {
+ tt = grow_targets(d);
+ if (!tt)
+ goto nomem;
+ }
+ t = kzalloc(sizeof(*t), GFP_ATOMIC);
+ if (!t)
+ goto nomem;
+ t->nframes = nframes;
+ t->d = d;
+ memcpy(t->addr, addr, sizeof t->addr);
+ t->ifp = t->ifs;
+ aoecmd_wreset(t);
+ t->maxout = t->nframes / 2;
+ INIT_LIST_HEAD(&t->ffree);
+ return *tt = t;
+
+ nomem:
+ pr_info("aoe: cannot allocate memory to add target\n");
+ return NULL;
+}
+
+static void
+setdbcnt(struct aoedev *d)
+{
+ struct aoetgt **t, **e;
+ int bcnt = 0;
+
+ t = d->targets;
+ e = t + d->ntargets;
+ for (; t < e && *t; t++)
+ if (bcnt == 0 || bcnt > (*t)->minbcnt)
+ bcnt = (*t)->minbcnt;
+ if (bcnt != d->maxbcnt) {
+ d->maxbcnt = bcnt;
+ pr_info("aoe: e%ld.%d: setting %d byte data frames\n",
+ d->aoemajor, d->aoeminor, bcnt);
+ }
+}
+
+static void
+setifbcnt(struct aoetgt *t, struct net_device *nd, int bcnt)
+{
+ struct aoedev *d;
+ struct aoeif *p, *e;
+ int minbcnt;
+
+ d = t->d;
+ minbcnt = bcnt;
+ p = t->ifs;
+ e = p + NAOEIFS;
+ for (; p < e; p++) {
+ if (p->nd == NULL)
+ break; /* end of the valid interfaces */
+ if (p->nd == nd) {
+ p->bcnt = bcnt; /* we're updating */
+ nd = NULL;
+ } else if (minbcnt > p->bcnt)
+ minbcnt = p->bcnt; /* find the min interface */
+ }
+ if (nd) {
+ if (p == e) {
+ pr_err("aoe: device setifbcnt failure; too many interfaces.\n");
+ return;
+ }
+ dev_hold(nd);
+ p->nd = nd;
+ p->bcnt = bcnt;
+ }
+ t->minbcnt = minbcnt;
+ setdbcnt(d);
+}
+
+void
+aoecmd_cfg_rsp(struct sk_buff *skb)
+{
+ struct aoedev *d;
+ struct aoe_hdr *h;
+ struct aoe_cfghdr *ch;
+ struct aoetgt *t;
+ ulong flags, aoemajor;
+ struct sk_buff *sl;
+ struct sk_buff_head queue;
+ u16 n;
+
+ sl = NULL;
+ h = (struct aoe_hdr *) skb_mac_header(skb);
+ ch = (struct aoe_cfghdr *) (h+1);
+
+ /*
+ * Enough people have their dip switches set backwards to
+ * warrant a loud message for this special case.
+ */
+ aoemajor = get_unaligned_be16(&h->major);
+ if (aoemajor == 0xfff) {
+ printk(KERN_ERR "aoe: Warning: shelf address is all ones. "
+ "Check shelf dip switches.\n");
+ return;
+ }
+ if (aoemajor == 0xffff) {
+ pr_info("aoe: e%ld.%d: broadcast shelf number invalid\n",
+ aoemajor, (int) h->minor);
+ return;
+ }
+ if (h->minor == 0xff) {
+ pr_info("aoe: e%ld.%d: broadcast slot number invalid\n",
+ aoemajor, (int) h->minor);
+ return;
+ }
+
+ n = be16_to_cpu(ch->bufcnt);
+ if (n > aoe_maxout) /* keep it reasonable */
+ n = aoe_maxout;
+
+ d = aoedev_by_aoeaddr(aoemajor, h->minor, 1);
+ if (d == NULL) {
+ pr_info("aoe: device allocation failure\n");
+ return;
+ }
+
+ spin_lock_irqsave(&d->lock, flags);
+
+ t = gettgt(d, h->src);
+ if (t) {
+ t->nframes = n;
+ if (n < t->maxout)
+ aoecmd_wreset(t);
+ } else {
+ t = addtgt(d, h->src, n);
+ if (!t)
+ goto bail;
+ }
+ n = skb->dev->mtu;
+ n -= sizeof(struct aoe_hdr) + sizeof(struct aoe_atahdr);
+ n /= 512;
+ if (n > ch->scnt)
+ n = ch->scnt;
+ n = n ? n * 512 : DEFAULTBCNT;
+ setifbcnt(t, skb->dev, n);
+
+ /* don't change users' perspective */
+ if (d->nopen == 0) {
+ d->fw_ver = be16_to_cpu(ch->fwver);
+ sl = aoecmd_ata_id(d);
+ }
+bail:
+ spin_unlock_irqrestore(&d->lock, flags);
+ aoedev_put(d);
+ if (sl) {
+ __skb_queue_head_init(&queue);
+ __skb_queue_tail(&queue, sl);
+ aoenet_xmit(&queue);
+ }
+}
+
+void
+aoecmd_wreset(struct aoetgt *t)
+{
+ t->maxout = 1;
+ t->ssthresh = t->nframes / 2;
+ t->next_cwnd = t->nframes;
+}
+
+void
+aoecmd_cleanslate(struct aoedev *d)
+{
+ struct aoetgt **t, **te;
+
+ d->rttavg = RTTAVG_INIT;
+ d->rttdev = RTTDEV_INIT;
+ d->maxbcnt = 0;
+
+ t = d->targets;
+ te = t + d->ntargets;
+ for (; t < te && *t; t++)
+ aoecmd_wreset(*t);
+}
+
+void
+aoe_failbuf(struct aoedev *d, struct buf *buf)
+{
+ if (buf == NULL)
+ return;
+ buf->iter.bi_size = 0;
+ clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+ if (buf->nframesout == 0)
+ aoe_end_buf(d, buf);
+}
+
+void
+aoe_flush_iocq(void)
+{
+ int i;
+
+ for (i = 0; i < ncpus; i++) {
+ if (kts[i].active)
+ aoe_flush_iocq_by_index(i);
+ }
+}
+
+void
+aoe_flush_iocq_by_index(int id)
+{
+ struct frame *f;
+ struct aoedev *d;
+ LIST_HEAD(flist);
+ struct list_head *pos;
+ struct sk_buff *skb;
+ ulong flags;
+
+ spin_lock_irqsave(&iocq[id].lock, flags);
+ list_splice_init(&iocq[id].head, &flist);
+ spin_unlock_irqrestore(&iocq[id].lock, flags);
+ while (!list_empty(&flist)) {
+ pos = flist.next;
+ list_del(pos);
+ f = list_entry(pos, struct frame, head);
+ d = f->t->d;
+ skb = f->r_skb;
+ spin_lock_irqsave(&d->lock, flags);
+ if (f->buf) {
+ f->buf->nframesout--;
+ aoe_failbuf(d, f->buf);
+ }
+ aoe_freetframe(f);
+ spin_unlock_irqrestore(&d->lock, flags);
+ dev_kfree_skb(skb);
+ aoedev_put(d);
+ }
+}
+
+int __init
+aoecmd_init(void)
+{
+ void *p;
+ int i;
+ int ret;
+
+ /* get_zeroed_page returns page with ref count 1 */
+ p = (void *) get_zeroed_page(GFP_KERNEL | __GFP_REPEAT);
+ if (!p)
+ return -ENOMEM;
+ empty_page = virt_to_page(p);
+
+ ncpus = num_online_cpus();
+
+ iocq = kcalloc(ncpus, sizeof(struct iocq_ktio), GFP_KERNEL);
+ if (!iocq)
+ return -ENOMEM;
+
+ kts = kcalloc(ncpus, sizeof(struct ktstate), GFP_KERNEL);
+ if (!kts) {
+ ret = -ENOMEM;
+ goto kts_fail;
+ }
+
+ ktiowq = kcalloc(ncpus, sizeof(wait_queue_head_t), GFP_KERNEL);
+ if (!ktiowq) {
+ ret = -ENOMEM;
+ goto ktiowq_fail;
+ }
+
+ mutex_init(&ktio_spawn_lock);
+
+ for (i = 0; i < ncpus; i++) {
+ INIT_LIST_HEAD(&iocq[i].head);
+ spin_lock_init(&iocq[i].lock);
+ init_waitqueue_head(&ktiowq[i]);
+ snprintf(kts[i].name, sizeof(kts[i].name), "aoe_ktio%d", i);
+ kts[i].fn = ktio;
+ kts[i].waitq = &ktiowq[i];
+ kts[i].lock = &iocq[i].lock;
+ kts[i].id = i;
+ kts[i].active = 0;
+ }
+ kts[0].active = 1;
+ if (aoe_ktstart(&kts[0])) {
+ ret = -ENOMEM;
+ goto ktstart_fail;
+ }
+ return 0;
+
+ktstart_fail:
+ kfree(ktiowq);
+ktiowq_fail:
+ kfree(kts);
+kts_fail:
+ kfree(iocq);
+
+ return ret;
+}
+
+void
+aoecmd_exit(void)
+{
+ int i;
+
+ for (i = 0; i < ncpus; i++)
+ if (kts[i].active)
+ aoe_ktstop(&kts[i]);
+
+ aoe_flush_iocq();
+
+ /* Free up the iocq and thread speicific configuration
+ * allocated during startup.
+ */
+ kfree(iocq);
+ kfree(kts);
+ kfree(ktiowq);
+
+ free_page((unsigned long) page_address(empty_page));
+ empty_page = NULL;
+}
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
new file mode 100644
index 000000000..e774c50b6
--- /dev/null
+++ b/drivers/block/aoe/aoedev.c
@@ -0,0 +1,526 @@
+/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */
+/*
+ * aoedev.c
+ * AoE device utility functions; maintains device list.
+ */
+
+#include <linux/hdreg.h>
+#include <linux/blkdev.h>
+#include <linux/netdevice.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h>
+#include <linux/kdev_t.h>
+#include <linux/moduleparam.h>
+#include <linux/string.h>
+#include "aoe.h"
+
+static void dummy_timer(ulong);
+static void freetgt(struct aoedev *d, struct aoetgt *t);
+static void skbpoolfree(struct aoedev *d);
+
+static int aoe_dyndevs = 1;
+module_param(aoe_dyndevs, int, 0644);
+MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices.");
+
+static struct aoedev *devlist;
+static DEFINE_SPINLOCK(devlist_lock);
+
+/* Because some systems will have one, many, or no
+ * - partitions,
+ * - slots per shelf,
+ * - or shelves,
+ * we need some flexibility in the way the minor numbers
+ * are allocated. So they are dynamic.
+ */
+#define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS)
+
+static DEFINE_SPINLOCK(used_minors_lock);
+static DECLARE_BITMAP(used_minors, N_DEVS);
+
+static int
+minor_get_dyn(ulong *sysminor)
+{
+ ulong flags;
+ ulong n;
+ int error = 0;
+
+ spin_lock_irqsave(&used_minors_lock, flags);
+ n = find_first_zero_bit(used_minors, N_DEVS);
+ if (n < N_DEVS)
+ set_bit(n, used_minors);
+ else
+ error = -1;
+ spin_unlock_irqrestore(&used_minors_lock, flags);
+
+ *sysminor = n * AOE_PARTITIONS;
+ return error;
+}
+
+static int
+minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin)
+{
+ ulong flags;
+ ulong n;
+ int error = 0;
+ enum {
+ /* for backwards compatibility when !aoe_dyndevs,
+ * a static number of supported slots per shelf */
+ NPERSHELF = 16,
+ };
+
+ if (aoemin >= NPERSHELF) {
+ pr_err("aoe: %s %d slots per shelf\n",
+ "static minor device numbers support only",
+ NPERSHELF);
+ error = -1;
+ goto out;
+ }
+
+ n = aoemaj * NPERSHELF + aoemin;
+ if (n >= N_DEVS) {
+ pr_err("aoe: %s with e%ld.%d\n",
+ "cannot use static minor device numbers",
+ aoemaj, aoemin);
+ error = -1;
+ goto out;
+ }
+
+ spin_lock_irqsave(&used_minors_lock, flags);
+ if (test_bit(n, used_minors)) {
+ pr_err("aoe: %s %lu\n",
+ "existing device already has static minor number",
+ n);
+ error = -1;
+ } else
+ set_bit(n, used_minors);
+ spin_unlock_irqrestore(&used_minors_lock, flags);
+ *sysminor = n * AOE_PARTITIONS;
+out:
+ return error;
+}
+
+static int
+minor_get(ulong *sysminor, ulong aoemaj, int aoemin)
+{
+ if (aoe_dyndevs)
+ return minor_get_dyn(sysminor);
+ else
+ return minor_get_static(sysminor, aoemaj, aoemin);
+}
+
+static void
+minor_free(ulong minor)
+{
+ ulong flags;
+
+ minor /= AOE_PARTITIONS;
+ BUG_ON(minor >= N_DEVS);
+
+ spin_lock_irqsave(&used_minors_lock, flags);
+ BUG_ON(!test_bit(minor, used_minors));
+ clear_bit(minor, used_minors);
+ spin_unlock_irqrestore(&used_minors_lock, flags);
+}
+
+/*
+ * Users who grab a pointer to the device with aoedev_by_aoeaddr
+ * automatically get a reference count and must be responsible
+ * for performing a aoedev_put. With the addition of async
+ * kthread processing I'm no longer confident that we can
+ * guarantee consistency in the face of device flushes.
+ *
+ * For the time being, we only bother to add extra references for
+ * frames sitting on the iocq. When the kthreads finish processing
+ * these frames, they will aoedev_put the device.
+ */
+
+void
+aoedev_put(struct aoedev *d)
+{
+ ulong flags;
+
+ spin_lock_irqsave(&devlist_lock, flags);
+ d->ref--;
+ spin_unlock_irqrestore(&devlist_lock, flags);
+}
+
+static void
+dummy_timer(ulong vp)
+{
+ struct aoedev *d;
+
+ d = (struct aoedev *)vp;
+ if (d->flags & DEVFL_TKILL)
+ return;
+ d->timer.expires = jiffies + HZ;
+ add_timer(&d->timer);
+}
+
+static void
+aoe_failip(struct aoedev *d)
+{
+ struct request *rq;
+ struct bio *bio;
+ unsigned long n;
+
+ aoe_failbuf(d, d->ip.buf);
+
+ rq = d->ip.rq;
+ if (rq == NULL)
+ return;
+ while ((bio = d->ip.nxbio)) {
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ d->ip.nxbio = bio->bi_next;
+ n = (unsigned long) rq->special;
+ rq->special = (void *) --n;
+ }
+ if ((unsigned long) rq->special == 0)
+ aoe_end_request(d, rq, 0);
+}
+
+static void
+downdev_frame(struct list_head *pos)
+{
+ struct frame *f;
+
+ f = list_entry(pos, struct frame, head);
+ list_del(pos);
+ if (f->buf) {
+ f->buf->nframesout--;
+ aoe_failbuf(f->t->d, f->buf);
+ }
+ aoe_freetframe(f);
+}
+
+void
+aoedev_downdev(struct aoedev *d)
+{
+ struct aoetgt *t, **tt, **te;
+ struct list_head *head, *pos, *nx;
+ struct request *rq;
+ int i;
+
+ d->flags &= ~DEVFL_UP;
+
+ /* clean out active and to-be-retransmitted buffers */
+ for (i = 0; i < NFACTIVE; i++) {
+ head = &d->factive[i];
+ list_for_each_safe(pos, nx, head)
+ downdev_frame(pos);
+ }
+ head = &d->rexmitq;
+ list_for_each_safe(pos, nx, head)
+ downdev_frame(pos);
+
+ /* reset window dressings */
+ tt = d->targets;
+ te = tt + d->ntargets;
+ for (; tt < te && (t = *tt); tt++) {
+ aoecmd_wreset(t);
+ t->nout = 0;
+ }
+
+ /* clean out the in-process request (if any) */
+ aoe_failip(d);
+
+ /* fast fail all pending I/O */
+ if (d->blkq) {
+ while ((rq = blk_peek_request(d->blkq))) {
+ blk_start_request(rq);
+ aoe_end_request(d, rq, 1);
+ }
+ }
+
+ if (d->gd)
+ set_capacity(d->gd, 0);
+}
+
+/* return whether the user asked for this particular
+ * device to be flushed
+ */
+static int
+user_req(char *s, size_t slen, struct aoedev *d)
+{
+ const char *p;
+ size_t lim;
+
+ if (!d->gd)
+ return 0;
+ p = kbasename(d->gd->disk_name);
+ lim = sizeof(d->gd->disk_name);
+ lim -= p - d->gd->disk_name;
+ if (slen < lim)
+ lim = slen;
+
+ return !strncmp(s, p, lim);
+}
+
+static void
+freedev(struct aoedev *d)
+{
+ struct aoetgt **t, **e;
+ int freeing = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&d->lock, flags);
+ if (d->flags & DEVFL_TKILL
+ && !(d->flags & DEVFL_FREEING)) {
+ d->flags |= DEVFL_FREEING;
+ freeing = 1;
+ }
+ spin_unlock_irqrestore(&d->lock, flags);
+ if (!freeing)
+ return;
+
+ del_timer_sync(&d->timer);
+ if (d->gd) {
+ aoedisk_rm_debugfs(d);
+ aoedisk_rm_sysfs(d);
+ del_gendisk(d->gd);
+ put_disk(d->gd);
+ blk_cleanup_queue(d->blkq);
+ }
+ t = d->targets;
+ e = t + d->ntargets;
+ for (; t < e && *t; t++)
+ freetgt(d, *t);
+ if (d->bufpool)
+ mempool_destroy(d->bufpool);
+ skbpoolfree(d);
+ minor_free(d->sysminor);
+
+ spin_lock_irqsave(&d->lock, flags);
+ d->flags |= DEVFL_FREED;
+ spin_unlock_irqrestore(&d->lock, flags);
+}
+
+enum flush_parms {
+ NOT_EXITING = 0,
+ EXITING = 1,
+};
+
+static int
+flush(const char __user *str, size_t cnt, int exiting)
+{
+ ulong flags;
+ struct aoedev *d, **dd;
+ char buf[16];
+ int all = 0;
+ int specified = 0; /* flush a specific device */
+ unsigned int skipflags;
+
+ skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL;
+
+ if (!exiting && cnt >= 3) {
+ if (cnt > sizeof buf)
+ cnt = sizeof buf;
+ if (copy_from_user(buf, str, cnt))
+ return -EFAULT;
+ all = !strncmp(buf, "all", 3);
+ if (!all)
+ specified = 1;
+ }
+
+ flush_scheduled_work();
+ /* pass one: without sleeping, do aoedev_downdev */
+ spin_lock_irqsave(&devlist_lock, flags);
+ for (d = devlist; d; d = d->next) {
+ spin_lock(&d->lock);
+ if (exiting) {
+ /* unconditionally take each device down */
+ } else if (specified) {
+ if (!user_req(buf, cnt, d))
+ goto cont;
+ } else if ((!all && (d->flags & DEVFL_UP))
+ || d->flags & skipflags
+ || d->nopen
+ || d->ref)
+ goto cont;
+
+ aoedev_downdev(d);
+ d->flags |= DEVFL_TKILL;
+cont:
+ spin_unlock(&d->lock);
+ }
+ spin_unlock_irqrestore(&devlist_lock, flags);
+
+ /* pass two: call freedev, which might sleep,
+ * for aoedevs marked with DEVFL_TKILL
+ */
+restart:
+ spin_lock_irqsave(&devlist_lock, flags);
+ for (d = devlist; d; d = d->next) {
+ spin_lock(&d->lock);
+ if (d->flags & DEVFL_TKILL
+ && !(d->flags & DEVFL_FREEING)) {
+ spin_unlock(&d->lock);
+ spin_unlock_irqrestore(&devlist_lock, flags);
+ freedev(d);
+ goto restart;
+ }
+ spin_unlock(&d->lock);
+ }
+
+ /* pass three: remove aoedevs marked with DEVFL_FREED */
+ for (dd = &devlist, d = *dd; d; d = *dd) {
+ struct aoedev *doomed = NULL;
+
+ spin_lock(&d->lock);
+ if (d->flags & DEVFL_FREED) {
+ *dd = d->next;
+ doomed = d;
+ } else {
+ dd = &d->next;
+ }
+ spin_unlock(&d->lock);
+ if (doomed)
+ kfree(doomed->targets);
+ kfree(doomed);
+ }
+ spin_unlock_irqrestore(&devlist_lock, flags);
+
+ return 0;
+}
+
+int
+aoedev_flush(const char __user *str, size_t cnt)
+{
+ return flush(str, cnt, NOT_EXITING);
+}
+
+/* This has been confirmed to occur once with Tms=3*1000 due to the
+ * driver changing link and not processing its transmit ring. The
+ * problem is hard enough to solve by returning an error that I'm
+ * still punting on "solving" this.
+ */
+static void
+skbfree(struct sk_buff *skb)
+{
+ enum { Sms = 250, Tms = 30 * 1000};
+ int i = Tms / Sms;
+
+ if (skb == NULL)
+ return;
+ while (atomic_read(&skb_shinfo(skb)->dataref) != 1 && i-- > 0)
+ msleep(Sms);
+ if (i < 0) {
+ printk(KERN_ERR
+ "aoe: %s holds ref: %s\n",
+ skb->dev ? skb->dev->name : "netif",
+ "cannot free skb -- memory leaked.");
+ return;
+ }
+ skb->truesize -= skb->data_len;
+ skb_shinfo(skb)->nr_frags = skb->data_len = 0;
+ skb_trim(skb, 0);
+ dev_kfree_skb(skb);
+}
+
+static void
+skbpoolfree(struct aoedev *d)
+{
+ struct sk_buff *skb, *tmp;
+
+ skb_queue_walk_safe(&d->skbpool, skb, tmp)
+ skbfree(skb);
+
+ __skb_queue_head_init(&d->skbpool);
+}
+
+/* find it or allocate it */
+struct aoedev *
+aoedev_by_aoeaddr(ulong maj, int min, int do_alloc)
+{
+ struct aoedev *d;
+ int i;
+ ulong flags;
+ ulong sysminor = 0;
+
+ spin_lock_irqsave(&devlist_lock, flags);
+
+ for (d=devlist; d; d=d->next)
+ if (d->aoemajor == maj && d->aoeminor == min) {
+ spin_lock(&d->lock);
+ if (d->flags & DEVFL_TKILL) {
+ spin_unlock(&d->lock);
+ d = NULL;
+ goto out;
+ }
+ d->ref++;
+ spin_unlock(&d->lock);
+ break;
+ }
+ if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0)
+ goto out;
+ d = kcalloc(1, sizeof *d, GFP_ATOMIC);
+ if (!d)
+ goto out;
+ d->targets = kcalloc(NTARGETS, sizeof(*d->targets), GFP_ATOMIC);
+ if (!d->targets) {
+ kfree(d);
+ d = NULL;
+ goto out;
+ }
+ d->ntargets = NTARGETS;
+ INIT_WORK(&d->work, aoecmd_sleepwork);
+ spin_lock_init(&d->lock);
+ skb_queue_head_init(&d->skbpool);
+ init_timer(&d->timer);
+ d->timer.data = (ulong) d;
+ d->timer.function = dummy_timer;
+ d->timer.expires = jiffies + HZ;
+ add_timer(&d->timer);
+ d->bufpool = NULL; /* defer to aoeblk_gdalloc */
+ d->tgt = d->targets;
+ d->ref = 1;
+ for (i = 0; i < NFACTIVE; i++)
+ INIT_LIST_HEAD(&d->factive[i]);
+ INIT_LIST_HEAD(&d->rexmitq);
+ d->sysminor = sysminor;
+ d->aoemajor = maj;
+ d->aoeminor = min;
+ d->rttavg = RTTAVG_INIT;
+ d->rttdev = RTTDEV_INIT;
+ d->next = devlist;
+ devlist = d;
+ out:
+ spin_unlock_irqrestore(&devlist_lock, flags);
+ return d;
+}
+
+static void
+freetgt(struct aoedev *d, struct aoetgt *t)
+{
+ struct frame *f;
+ struct list_head *pos, *nx, *head;
+ struct aoeif *ifp;
+
+ for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) {
+ if (!ifp->nd)
+ break;
+ dev_put(ifp->nd);
+ }
+
+ head = &t->ffree;
+ list_for_each_safe(pos, nx, head) {
+ list_del(pos);
+ f = list_entry(pos, struct frame, head);
+ skbfree(f->skb);
+ kfree(f);
+ }
+ kfree(t);
+}
+
+void
+aoedev_exit(void)
+{
+ flush_scheduled_work();
+ flush(NULL, 0, EXITING);
+}
+
+int __init
+aoedev_init(void)
+{
+ return 0;
+}
diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c
new file mode 100644
index 000000000..4b987c2fe
--- /dev/null
+++ b/drivers/block/aoe/aoemain.c
@@ -0,0 +1,115 @@
+/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */
+/*
+ * aoemain.c
+ * Module initialization routines, discover timer
+ */
+
+#include <linux/hdreg.h>
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include "aoe.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Sam Hopkins <sah@coraid.com>");
+MODULE_DESCRIPTION("AoE block/char driver for 2.6.2 and newer 2.6 kernels");
+MODULE_VERSION(VERSION);
+
+enum { TINIT, TRUN, TKILL };
+
+static void
+discover_timer(ulong vp)
+{
+ static struct timer_list t;
+ static volatile ulong die;
+ static spinlock_t lock;
+ ulong flags;
+ enum { DTIMERTICK = HZ * 60 }; /* one minute */
+
+ switch (vp) {
+ case TINIT:
+ init_timer(&t);
+ spin_lock_init(&lock);
+ t.data = TRUN;
+ t.function = discover_timer;
+ die = 0;
+ case TRUN:
+ spin_lock_irqsave(&lock, flags);
+ if (!die) {
+ t.expires = jiffies + DTIMERTICK;
+ add_timer(&t);
+ }
+ spin_unlock_irqrestore(&lock, flags);
+
+ aoecmd_cfg(0xffff, 0xff);
+ return;
+ case TKILL:
+ spin_lock_irqsave(&lock, flags);
+ die = 1;
+ spin_unlock_irqrestore(&lock, flags);
+
+ del_timer_sync(&t);
+ default:
+ return;
+ }
+}
+
+static void
+aoe_exit(void)
+{
+ discover_timer(TKILL);
+
+ aoenet_exit();
+ unregister_blkdev(AOE_MAJOR, DEVICE_NAME);
+ aoecmd_exit();
+ aoechr_exit();
+ aoedev_exit();
+ aoeblk_exit(); /* free cache after de-allocating bufs */
+}
+
+static int __init
+aoe_init(void)
+{
+ int ret;
+
+ ret = aoedev_init();
+ if (ret)
+ return ret;
+ ret = aoechr_init();
+ if (ret)
+ goto chr_fail;
+ ret = aoeblk_init();
+ if (ret)
+ goto blk_fail;
+ ret = aoenet_init();
+ if (ret)
+ goto net_fail;
+ ret = aoecmd_init();
+ if (ret)
+ goto cmd_fail;
+ ret = register_blkdev(AOE_MAJOR, DEVICE_NAME);
+ if (ret < 0) {
+ printk(KERN_ERR "aoe: can't register major\n");
+ goto blkreg_fail;
+ }
+ printk(KERN_INFO "aoe: AoE v%s initialised.\n", VERSION);
+ discover_timer(TINIT);
+ return 0;
+ blkreg_fail:
+ aoecmd_exit();
+ cmd_fail:
+ aoenet_exit();
+ net_fail:
+ aoeblk_exit();
+ blk_fail:
+ aoechr_exit();
+ chr_fail:
+ aoedev_exit();
+
+ printk(KERN_INFO "aoe: initialisation failure.\n");
+ return ret;
+}
+
+module_init(aoe_init);
+module_exit(aoe_exit);
+
diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
new file mode 100644
index 000000000..63773a905
--- /dev/null
+++ b/drivers/block/aoe/aoenet.c
@@ -0,0 +1,223 @@
+/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */
+/*
+ * aoenet.c
+ * Ethernet portion of AoE driver
+ */
+
+#include <linux/gfp.h>
+#include <linux/hdreg.h>
+#include <linux/blkdev.h>
+#include <linux/netdevice.h>
+#include <linux/moduleparam.h>
+#include <net/net_namespace.h>
+#include <asm/unaligned.h>
+#include "aoe.h"
+
+#define NECODES 5
+
+static char *aoe_errlist[] =
+{
+ "no such error",
+ "unrecognized command code",
+ "bad argument parameter",
+ "device unavailable",
+ "config string present",
+ "unsupported version"
+};
+
+enum {
+ IFLISTSZ = 1024,
+};
+
+static char aoe_iflist[IFLISTSZ];
+module_param_string(aoe_iflist, aoe_iflist, IFLISTSZ, 0600);
+MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=dev1[,dev2...]");
+
+static wait_queue_head_t txwq;
+static struct ktstate kts;
+
+#ifndef MODULE
+static int __init aoe_iflist_setup(char *str)
+{
+ strncpy(aoe_iflist, str, IFLISTSZ);
+ aoe_iflist[IFLISTSZ - 1] = '\0';
+ return 1;
+}
+
+__setup("aoe_iflist=", aoe_iflist_setup);
+#endif
+
+static spinlock_t txlock;
+static struct sk_buff_head skbtxq;
+
+/* enters with txlock held */
+static int
+tx(int id) __must_hold(&txlock)
+{
+ struct sk_buff *skb;
+ struct net_device *ifp;
+
+ while ((skb = skb_dequeue(&skbtxq))) {
+ spin_unlock_irq(&txlock);
+ ifp = skb->dev;
+ if (dev_queue_xmit(skb) == NET_XMIT_DROP && net_ratelimit())
+ pr_warn("aoe: packet could not be sent on %s. %s\n",
+ ifp ? ifp->name : "netif",
+ "consider increasing tx_queue_len");
+ spin_lock_irq(&txlock);
+ }
+ return 0;
+}
+
+int
+is_aoe_netif(struct net_device *ifp)
+{
+ register char *p, *q;
+ register int len;
+
+ if (aoe_iflist[0] == '\0')
+ return 1;
+
+ p = aoe_iflist + strspn(aoe_iflist, WHITESPACE);
+ for (; *p; p = q + strspn(q, WHITESPACE)) {
+ q = p + strcspn(p, WHITESPACE);
+ if (q != p)
+ len = q - p;
+ else
+ len = strlen(p); /* last token in aoe_iflist */
+
+ if (strlen(ifp->name) == len && !strncmp(ifp->name, p, len))
+ return 1;
+ if (q == p)
+ break;
+ }
+
+ return 0;
+}
+
+int
+set_aoe_iflist(const char __user *user_str, size_t size)
+{
+ if (size >= IFLISTSZ)
+ return -EINVAL;
+
+ if (copy_from_user(aoe_iflist, user_str, size)) {
+ printk(KERN_INFO "aoe: copy from user failed\n");
+ return -EFAULT;
+ }
+ aoe_iflist[size] = 0x00;
+ return 0;
+}
+
+void
+aoenet_xmit(struct sk_buff_head *queue)
+{
+ struct sk_buff *skb, *tmp;
+ ulong flags;
+
+ skb_queue_walk_safe(queue, skb, tmp) {
+ __skb_unlink(skb, queue);
+ spin_lock_irqsave(&txlock, flags);
+ skb_queue_tail(&skbtxq, skb);
+ spin_unlock_irqrestore(&txlock, flags);
+ wake_up(&txwq);
+ }
+}
+
+/*
+ * (1) len doesn't include the header by default. I want this.
+ */
+static int
+aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev)
+{
+ struct aoe_hdr *h;
+ struct aoe_atahdr *ah;
+ u32 n;
+ int sn;
+
+ if (dev_net(ifp) != &init_net)
+ goto exit;
+
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (skb == NULL)
+ return 0;
+ if (!is_aoe_netif(ifp))
+ goto exit;
+ skb_push(skb, ETH_HLEN); /* (1) */
+ sn = sizeof(*h) + sizeof(*ah);
+ if (skb->len >= sn) {
+ sn -= skb_headlen(skb);
+ if (sn > 0 && !__pskb_pull_tail(skb, sn))
+ goto exit;
+ }
+ h = (struct aoe_hdr *) skb->data;
+ n = get_unaligned_be32(&h->tag);
+ if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31))
+ goto exit;
+
+ if (h->verfl & AOEFL_ERR) {
+ n = h->err;
+ if (n > NECODES)
+ n = 0;
+ if (net_ratelimit())
+ printk(KERN_ERR
+ "%s%d.%d@%s; ecode=%d '%s'\n",
+ "aoe: error packet from ",
+ get_unaligned_be16(&h->major),
+ h->minor, skb->dev->name,
+ h->err, aoe_errlist[n]);
+ goto exit;
+ }
+
+ switch (h->cmd) {
+ case AOECMD_ATA:
+ /* ata_rsp may keep skb for later processing or give it back */
+ skb = aoecmd_ata_rsp(skb);
+ break;
+ case AOECMD_CFG:
+ aoecmd_cfg_rsp(skb);
+ break;
+ default:
+ if (h->cmd >= AOECMD_VEND_MIN)
+ break; /* don't complain about vendor commands */
+ pr_info("aoe: unknown AoE command type 0x%02x\n", h->cmd);
+ break;
+ }
+
+ if (!skb)
+ return 0;
+exit:
+ dev_kfree_skb(skb);
+ return 0;
+}
+
+static struct packet_type aoe_pt __read_mostly = {
+ .type = __constant_htons(ETH_P_AOE),
+ .func = aoenet_rcv,
+};
+
+int __init
+aoenet_init(void)
+{
+ skb_queue_head_init(&skbtxq);
+ init_waitqueue_head(&txwq);
+ spin_lock_init(&txlock);
+ kts.lock = &txlock;
+ kts.fn = tx;
+ kts.waitq = &txwq;
+ kts.id = 0;
+ snprintf(kts.name, sizeof(kts.name), "aoe_tx%d", kts.id);
+ if (aoe_ktstart(&kts))
+ return -EAGAIN;
+ dev_add_pack(&aoe_pt);
+ return 0;
+}
+
+void
+aoenet_exit(void)
+{
+ aoe_ktstop(&kts);
+ skb_queue_purge(&skbtxq);
+ dev_remove_pack(&aoe_pt);
+}
+
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
new file mode 100644
index 000000000..2104b1b4c
--- /dev/null
+++ b/drivers/block/ataflop.c
@@ -0,0 +1,2056 @@
+/*
+ * drivers/block/ataflop.c
+ *
+ * Copyright (C) 1993 Greg Harp
+ * Atari Support by Bjoern Brauel, Roman Hodek
+ *
+ * Big cleanup Sep 11..14 1994 Roman Hodek:
+ * - Driver now works interrupt driven
+ * - Support for two drives; should work, but I cannot test that :-(
+ * - Reading is done in whole tracks and buffered to speed up things
+ * - Disk change detection and drive deselecting after motor-off
+ * similar to TOS
+ * - Autodetection of disk format (DD/HD); untested yet, because I
+ * don't have an HD drive :-(
+ *
+ * Fixes Nov 13 1994 Martin Schaller:
+ * - Autodetection works now
+ * - Support for 5 1/4'' disks
+ * - Removed drive type (unknown on atari)
+ * - Do seeks with 8 Mhz
+ *
+ * Changes by Andreas Schwab:
+ * - After errors in multiple read mode try again reading single sectors
+ * (Feb 1995):
+ * - Clean up error handling
+ * - Set blk_size for proper size checking
+ * - Initialize track register when testing presence of floppy
+ * - Implement some ioctl's
+ *
+ * Changes by Torsten Lang:
+ * - When probing the floppies we should add the FDCCMDADD_H flag since
+ * the FDC will otherwise wait forever when no disk is inserted...
+ *
+ * ++ Freddi Aschwanden (fa) 20.9.95 fixes for medusa:
+ * - MFPDELAY() after each FDC access -> atari
+ * - more/other disk formats
+ * - DMA to the block buffer directly if we have a 32bit DMA
+ * - for medusa, the step rate is always 3ms
+ * - on medusa, use only cache_push()
+ * Roman:
+ * - Make disk format numbering independent from minors
+ * - Let user set max. supported drive type (speeds up format
+ * detection, saves buffer space)
+ *
+ * Roman 10/15/95:
+ * - implement some more ioctls
+ * - disk formatting
+ *
+ * Andreas 95/12/12:
+ * - increase gap size at start of track for HD/ED disks
+ *
+ * Michael (MSch) 11/07/96:
+ * - implemented FDSETPRM and FDDEFPRM ioctl
+ *
+ * Andreas (97/03/19):
+ * - implemented missing BLK* ioctls
+ *
+ * Things left to do:
+ * - Formatting
+ * - Maybe a better strategy for disk change detection (does anyone
+ * know one?)
+ */
+
+#include <linux/module.h>
+
+#include <linux/fd.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include <linux/completion.h>
+#include <linux/wait.h>
+
+#include <asm/atafd.h>
+#include <asm/atafdreg.h>
+#include <asm/atariints.h>
+#include <asm/atari_stdma.h>
+#include <asm/atari_stram.h>
+
+#define FD_MAX_UNITS 2
+
+#undef DEBUG
+
+static DEFINE_MUTEX(ataflop_mutex);
+static struct request *fd_request;
+static int fdc_queue;
+
+/* Disk types: DD, HD, ED */
+static struct atari_disk_type {
+ const char *name;
+ unsigned spt; /* sectors per track */
+ unsigned blocks; /* total number of blocks */
+ unsigned fdc_speed; /* fdc_speed setting */
+ unsigned stretch; /* track doubling ? */
+} atari_disk_type[] = {
+ { "d360", 9, 720, 0, 0}, /* 0: 360kB diskette */
+ { "D360", 9, 720, 0, 1}, /* 1: 360kb in 720k or 1.2MB drive */
+ { "D720", 9,1440, 0, 0}, /* 2: 720kb in 720k or 1.2MB drive */
+ { "D820", 10,1640, 0, 0}, /* 3: DD disk with 82 tracks/10 sectors */
+/* formats above are probed for type DD */
+#define MAX_TYPE_DD 3
+ { "h1200",15,2400, 3, 0}, /* 4: 1.2MB diskette */
+ { "H1440",18,2880, 3, 0}, /* 5: 1.4 MB diskette (HD) */
+ { "H1640",20,3280, 3, 0}, /* 6: 1.64MB diskette (fat HD) 82 tr 20 sec */
+/* formats above are probed for types DD and HD */
+#define MAX_TYPE_HD 6
+ { "E2880",36,5760, 3, 0}, /* 7: 2.8 MB diskette (ED) */
+ { "E3280",40,6560, 3, 0}, /* 8: 3.2 MB diskette (fat ED) 82 tr 40 sec */
+/* formats above are probed for types DD, HD and ED */
+#define MAX_TYPE_ED 8
+/* types below are never autoprobed */
+ { "H1680",21,3360, 3, 0}, /* 9: 1.68MB diskette (fat HD) 80 tr 21 sec */
+ { "h410",10,820, 0, 1}, /* 10: 410k diskette 41 tr 10 sec, stretch */
+ { "h1476",18,2952, 3, 0}, /* 11: 1.48MB diskette 82 tr 18 sec */
+ { "H1722",21,3444, 3, 0}, /* 12: 1.72MB diskette 82 tr 21 sec */
+ { "h420",10,840, 0, 1}, /* 13: 420k diskette 42 tr 10 sec, stretch */
+ { "H830",10,1660, 0, 0}, /* 14: 820k diskette 83 tr 10 sec */
+ { "h1494",18,2952, 3, 0}, /* 15: 1.49MB diskette 83 tr 18 sec */
+ { "H1743",21,3486, 3, 0}, /* 16: 1.74MB diskette 83 tr 21 sec */
+ { "h880",11,1760, 0, 0}, /* 17: 880k diskette 80 tr 11 sec */
+ { "D1040",13,2080, 0, 0}, /* 18: 1.04MB diskette 80 tr 13 sec */
+ { "D1120",14,2240, 0, 0}, /* 19: 1.12MB diskette 80 tr 14 sec */
+ { "h1600",20,3200, 3, 0}, /* 20: 1.60MB diskette 80 tr 20 sec */
+ { "H1760",22,3520, 3, 0}, /* 21: 1.76MB diskette 80 tr 22 sec */
+ { "H1920",24,3840, 3, 0}, /* 22: 1.92MB diskette 80 tr 24 sec */
+ { "E3200",40,6400, 3, 0}, /* 23: 3.2MB diskette 80 tr 40 sec */
+ { "E3520",44,7040, 3, 0}, /* 24: 3.52MB diskette 80 tr 44 sec */
+ { "E3840",48,7680, 3, 0}, /* 25: 3.84MB diskette 80 tr 48 sec */
+ { "H1840",23,3680, 3, 0}, /* 26: 1.84MB diskette 80 tr 23 sec */
+ { "D800",10,1600, 0, 0}, /* 27: 800k diskette 80 tr 10 sec */
+};
+
+static int StartDiskType[] = {
+ MAX_TYPE_DD,
+ MAX_TYPE_HD,
+ MAX_TYPE_ED
+};
+
+#define TYPE_DD 0
+#define TYPE_HD 1
+#define TYPE_ED 2
+
+static int DriveType = TYPE_HD;
+
+static DEFINE_SPINLOCK(ataflop_lock);
+
+/* Array for translating minors into disk formats */
+static struct {
+ int index;
+ unsigned drive_types;
+} minor2disktype[] = {
+ { 0, TYPE_DD }, /* 1: d360 */
+ { 4, TYPE_HD }, /* 2: h1200 */
+ { 1, TYPE_DD }, /* 3: D360 */
+ { 2, TYPE_DD }, /* 4: D720 */
+ { 1, TYPE_DD }, /* 5: h360 = D360 */
+ { 2, TYPE_DD }, /* 6: h720 = D720 */
+ { 5, TYPE_HD }, /* 7: H1440 */
+ { 7, TYPE_ED }, /* 8: E2880 */
+/* some PC formats :-) */
+ { 8, TYPE_ED }, /* 9: E3280 <- was "CompaQ" == E2880 for PC */
+ { 5, TYPE_HD }, /* 10: h1440 = H1440 */
+ { 9, TYPE_HD }, /* 11: H1680 */
+ { 10, TYPE_DD }, /* 12: h410 */
+ { 3, TYPE_DD }, /* 13: H820 <- == D820, 82x10 */
+ { 11, TYPE_HD }, /* 14: h1476 */
+ { 12, TYPE_HD }, /* 15: H1722 */
+ { 13, TYPE_DD }, /* 16: h420 */
+ { 14, TYPE_DD }, /* 17: H830 */
+ { 15, TYPE_HD }, /* 18: h1494 */
+ { 16, TYPE_HD }, /* 19: H1743 */
+ { 17, TYPE_DD }, /* 20: h880 */
+ { 18, TYPE_DD }, /* 21: D1040 */
+ { 19, TYPE_DD }, /* 22: D1120 */
+ { 20, TYPE_HD }, /* 23: h1600 */
+ { 21, TYPE_HD }, /* 24: H1760 */
+ { 22, TYPE_HD }, /* 25: H1920 */
+ { 23, TYPE_ED }, /* 26: E3200 */
+ { 24, TYPE_ED }, /* 27: E3520 */
+ { 25, TYPE_ED }, /* 28: E3840 */
+ { 26, TYPE_HD }, /* 29: H1840 */
+ { 27, TYPE_DD }, /* 30: D800 */
+ { 6, TYPE_HD }, /* 31: H1640 <- was H1600 == h1600 for PC */
+};
+
+#define NUM_DISK_MINORS ARRAY_SIZE(minor2disktype)
+
+/*
+ * Maximum disk size (in kilobytes). This default is used whenever the
+ * current disk size is unknown.
+ */
+#define MAX_DISK_SIZE 3280
+
+/*
+ * MSch: User-provided type information. 'drive' points to
+ * the respective entry of this array. Set by FDSETPRM ioctls.
+ */
+static struct atari_disk_type user_params[FD_MAX_UNITS];
+
+/*
+ * User-provided permanent type information. 'drive' points to
+ * the respective entry of this array. Set by FDDEFPRM ioctls,
+ * restored upon disk change by floppy_revalidate() if valid (as seen by
+ * default_params[].blocks > 0 - a bit in unit[].flags might be used for this?)
+ */
+static struct atari_disk_type default_params[FD_MAX_UNITS];
+
+/* current info on each unit */
+static struct atari_floppy_struct {
+ int connected; /* !=0 : drive is connected */
+ int autoprobe; /* !=0 : do autoprobe */
+
+ struct atari_disk_type *disktype; /* current type of disk */
+
+ int track; /* current head position or -1 if
+ unknown */
+ unsigned int steprate; /* steprate setting */
+ unsigned int wpstat; /* current state of WP signal (for
+ disk change detection) */
+ int flags; /* flags */
+ struct gendisk *disk;
+ int ref;
+ int type;
+} unit[FD_MAX_UNITS];
+
+#define UD unit[drive]
+#define UDT unit[drive].disktype
+#define SUD unit[SelectedDrive]
+#define SUDT unit[SelectedDrive].disktype
+
+
+#define FDC_READ(reg) ({ \
+ /* unsigned long __flags; */ \
+ unsigned short __val; \
+ /* local_irq_save(__flags); */ \
+ dma_wd.dma_mode_status = 0x80 | (reg); \
+ udelay(25); \
+ __val = dma_wd.fdc_acces_seccount; \
+ MFPDELAY(); \
+ /* local_irq_restore(__flags); */ \
+ __val & 0xff; \
+})
+
+#define FDC_WRITE(reg,val) \
+ do { \
+ /* unsigned long __flags; */ \
+ /* local_irq_save(__flags); */ \
+ dma_wd.dma_mode_status = 0x80 | (reg); \
+ udelay(25); \
+ dma_wd.fdc_acces_seccount = (val); \
+ MFPDELAY(); \
+ /* local_irq_restore(__flags); */ \
+ } while(0)
+
+
+/* Buffering variables:
+ * First, there is a DMA buffer in ST-RAM that is used for floppy DMA
+ * operations. Second, a track buffer is used to cache a whole track
+ * of the disk to save read operations. These are two separate buffers
+ * because that allows write operations without clearing the track buffer.
+ */
+
+static int MaxSectors[] = {
+ 11, 22, 44
+};
+static int BufferSize[] = {
+ 15*512, 30*512, 60*512
+};
+
+#define BUFFER_SIZE (BufferSize[DriveType])
+
+unsigned char *DMABuffer; /* buffer for writes */
+static unsigned long PhysDMABuffer; /* physical address */
+
+static int UseTrackbuffer = -1; /* Do track buffering? */
+module_param(UseTrackbuffer, int, 0);
+
+unsigned char *TrackBuffer; /* buffer for reads */
+static unsigned long PhysTrackBuffer; /* physical address */
+static int BufferDrive, BufferSide, BufferTrack;
+static int read_track; /* non-zero if we are reading whole tracks */
+
+#define SECTOR_BUFFER(sec) (TrackBuffer + ((sec)-1)*512)
+#define IS_BUFFERED(drive,side,track) \
+ (BufferDrive == (drive) && BufferSide == (side) && BufferTrack == (track))
+
+/*
+ * These are global variables, as that's the easiest way to give
+ * information to interrupts. They are the data used for the current
+ * request.
+ */
+static int SelectedDrive = 0;
+static int ReqCmd, ReqBlock;
+static int ReqSide, ReqTrack, ReqSector, ReqCnt;
+static int HeadSettleFlag = 0;
+static unsigned char *ReqData, *ReqBuffer;
+static int MotorOn = 0, MotorOffTrys;
+static int IsFormatting = 0, FormatError;
+
+static int UserSteprate[FD_MAX_UNITS] = { -1, -1 };
+module_param_array(UserSteprate, int, NULL, 0);
+
+/* Synchronization of FDC access. */
+static volatile int fdc_busy = 0;
+static DECLARE_WAIT_QUEUE_HEAD(fdc_wait);
+static DECLARE_COMPLETION(format_wait);
+
+static unsigned long changed_floppies = 0xff, fake_change = 0;
+#define CHECK_CHANGE_DELAY HZ/2
+
+#define FD_MOTOR_OFF_DELAY (3*HZ)
+#define FD_MOTOR_OFF_MAXTRY (10*20)
+
+#define FLOPPY_TIMEOUT (6*HZ)
+#define RECALIBRATE_ERRORS 4 /* After this many errors the drive
+ * will be recalibrated. */
+#define MAX_ERRORS 8 /* After this many errors the driver
+ * will give up. */
+
+
+/*
+ * The driver is trying to determine the correct media format
+ * while Probing is set. fd_rwsec_done() clears it after a
+ * successful access.
+ */
+static int Probing = 0;
+
+/* This flag is set when a dummy seek is necessary to make the WP
+ * status bit accessible.
+ */
+static int NeedSeek = 0;
+
+
+#ifdef DEBUG
+#define DPRINT(a) printk a
+#else
+#define DPRINT(a)
+#endif
+
+/***************************** Prototypes *****************************/
+
+static void fd_select_side( int side );
+static void fd_select_drive( int drive );
+static void fd_deselect( void );
+static void fd_motor_off_timer( unsigned long dummy );
+static void check_change( unsigned long dummy );
+static irqreturn_t floppy_irq (int irq, void *dummy);
+static void fd_error( void );
+static int do_format(int drive, int type, struct atari_format_descr *desc);
+static void do_fd_action( int drive );
+static void fd_calibrate( void );
+static void fd_calibrate_done( int status );
+static void fd_seek( void );
+static void fd_seek_done( int status );
+static void fd_rwsec( void );
+static void fd_readtrack_check( unsigned long dummy );
+static void fd_rwsec_done( int status );
+static void fd_rwsec_done1(int status);
+static void fd_writetrack( void );
+static void fd_writetrack_done( int status );
+static void fd_times_out( unsigned long dummy );
+static void finish_fdc( void );
+static void finish_fdc_done( int dummy );
+static void setup_req_params( int drive );
+static void redo_fd_request( void);
+static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
+ cmd, unsigned long param);
+static void fd_probe( int drive );
+static int fd_test_drive_present( int drive );
+static void config_types( void );
+static int floppy_open(struct block_device *bdev, fmode_t mode);
+static void floppy_release(struct gendisk *disk, fmode_t mode);
+
+/************************* End of Prototypes **************************/
+
+static DEFINE_TIMER(motor_off_timer, fd_motor_off_timer, 0, 0);
+static DEFINE_TIMER(readtrack_timer, fd_readtrack_check, 0, 0);
+static DEFINE_TIMER(timeout_timer, fd_times_out, 0, 0);
+static DEFINE_TIMER(fd_timer, check_change, 0, 0);
+
+static void fd_end_request_cur(int err)
+{
+ if (!__blk_end_request_cur(fd_request, err))
+ fd_request = NULL;
+}
+
+static inline void start_motor_off_timer(void)
+{
+ mod_timer(&motor_off_timer, jiffies + FD_MOTOR_OFF_DELAY);
+ MotorOffTrys = 0;
+}
+
+static inline void start_check_change_timer( void )
+{
+ mod_timer(&fd_timer, jiffies + CHECK_CHANGE_DELAY);
+}
+
+static inline void start_timeout(void)
+{
+ mod_timer(&timeout_timer, jiffies + FLOPPY_TIMEOUT);
+}
+
+static inline void stop_timeout(void)
+{
+ del_timer(&timeout_timer);
+}
+
+/* Select the side to use. */
+
+static void fd_select_side( int side )
+{
+ unsigned long flags;
+
+ /* protect against various other ints mucking around with the PSG */
+ local_irq_save(flags);
+
+ sound_ym.rd_data_reg_sel = 14; /* Select PSG Port A */
+ sound_ym.wd_data = (side == 0) ? sound_ym.rd_data_reg_sel | 0x01 :
+ sound_ym.rd_data_reg_sel & 0xfe;
+
+ local_irq_restore(flags);
+}
+
+
+/* Select a drive, update the FDC's track register and set the correct
+ * clock speed for this disk's type.
+ */
+
+static void fd_select_drive( int drive )
+{
+ unsigned long flags;
+ unsigned char tmp;
+
+ if (drive == SelectedDrive)
+ return;
+
+ /* protect against various other ints mucking around with the PSG */
+ local_irq_save(flags);
+ sound_ym.rd_data_reg_sel = 14; /* Select PSG Port A */
+ tmp = sound_ym.rd_data_reg_sel;
+ sound_ym.wd_data = (tmp | DSKDRVNONE) & ~(drive == 0 ? DSKDRV0 : DSKDRV1);
+ atari_dont_touch_floppy_select = 1;
+ local_irq_restore(flags);
+
+ /* restore track register to saved value */
+ FDC_WRITE( FDCREG_TRACK, UD.track );
+ udelay(25);
+
+ /* select 8/16 MHz */
+ if (UDT)
+ if (ATARIHW_PRESENT(FDCSPEED))
+ dma_wd.fdc_speed = UDT->fdc_speed;
+
+ SelectedDrive = drive;
+}
+
+
+/* Deselect both drives. */
+
+static void fd_deselect( void )
+{
+ unsigned long flags;
+
+ /* protect against various other ints mucking around with the PSG */
+ local_irq_save(flags);
+ atari_dont_touch_floppy_select = 0;
+ sound_ym.rd_data_reg_sel=14; /* Select PSG Port A */
+ sound_ym.wd_data = (sound_ym.rd_data_reg_sel |
+ (MACH_IS_FALCON ? 3 : 7)); /* no drives selected */
+ /* On Falcon, the drive B select line is used on the printer port, so
+ * leave it alone... */
+ SelectedDrive = -1;
+ local_irq_restore(flags);
+}
+
+
+/* This timer function deselects the drives when the FDC switched the
+ * motor off. The deselection cannot happen earlier because the FDC
+ * counts the index signals, which arrive only if one drive is selected.
+ */
+
+static void fd_motor_off_timer( unsigned long dummy )
+{
+ unsigned char status;
+
+ if (SelectedDrive < 0)
+ /* no drive selected, needn't deselect anyone */
+ return;
+
+ if (stdma_islocked())
+ goto retry;
+
+ status = FDC_READ( FDCREG_STATUS );
+
+ if (!(status & 0x80)) {
+ /* motor already turned off by FDC -> deselect drives */
+ MotorOn = 0;
+ fd_deselect();
+ return;
+ }
+ /* not yet off, try again */
+
+ retry:
+ /* Test again later; if tested too often, it seems there is no disk
+ * in the drive and the FDC will leave the motor on forever (or,
+ * at least until a disk is inserted). So we'll test only twice
+ * per second from then on...
+ */
+ mod_timer(&motor_off_timer,
+ jiffies + (MotorOffTrys++ < FD_MOTOR_OFF_MAXTRY ? HZ/20 : HZ/2));
+}
+
+
+/* This function is repeatedly called to detect disk changes (as good
+ * as possible) and keep track of the current state of the write protection.
+ */
+
+static void check_change( unsigned long dummy )
+{
+ static int drive = 0;
+
+ unsigned long flags;
+ unsigned char old_porta;
+ int stat;
+
+ if (++drive > 1 || !UD.connected)
+ drive = 0;
+
+ /* protect against various other ints mucking around with the PSG */
+ local_irq_save(flags);
+
+ if (!stdma_islocked()) {
+ sound_ym.rd_data_reg_sel = 14;
+ old_porta = sound_ym.rd_data_reg_sel;
+ sound_ym.wd_data = (old_porta | DSKDRVNONE) &
+ ~(drive == 0 ? DSKDRV0 : DSKDRV1);
+ stat = !!(FDC_READ( FDCREG_STATUS ) & FDCSTAT_WPROT);
+ sound_ym.wd_data = old_porta;
+
+ if (stat != UD.wpstat) {
+ DPRINT(( "wpstat[%d] = %d\n", drive, stat ));
+ UD.wpstat = stat;
+ set_bit (drive, &changed_floppies);
+ }
+ }
+ local_irq_restore(flags);
+
+ start_check_change_timer();
+}
+
+
+/* Handling of the Head Settling Flag: This flag should be set after each
+ * seek operation, because we don't use seeks with verify.
+ */
+
+static inline void set_head_settle_flag(void)
+{
+ HeadSettleFlag = FDCCMDADD_E;
+}
+
+static inline int get_head_settle_flag(void)
+{
+ int tmp = HeadSettleFlag;
+ HeadSettleFlag = 0;
+ return( tmp );
+}
+
+static inline void copy_buffer(void *from, void *to)
+{
+ ulong *p1 = (ulong *)from, *p2 = (ulong *)to;
+ int cnt;
+
+ for (cnt = 512/4; cnt; cnt--)
+ *p2++ = *p1++;
+}
+
+
+
+
+/* General Interrupt Handling */
+
+static void (*FloppyIRQHandler)( int status ) = NULL;
+
+static irqreturn_t floppy_irq (int irq, void *dummy)
+{
+ unsigned char status;
+ void (*handler)( int );
+
+ handler = xchg(&FloppyIRQHandler, NULL);
+
+ if (handler) {
+ nop();
+ status = FDC_READ( FDCREG_STATUS );
+ DPRINT(("FDC irq, status = %02x handler = %08lx\n",status,(unsigned long)handler));
+ handler( status );
+ }
+ else {
+ DPRINT(("FDC irq, no handler\n"));
+ }
+ return IRQ_HANDLED;
+}
+
+
+/* Error handling: If some error happened, retry some times, then
+ * recalibrate, then try again, and fail after MAX_ERRORS.
+ */
+
+static void fd_error( void )
+{
+ if (IsFormatting) {
+ IsFormatting = 0;
+ FormatError = 1;
+ complete(&format_wait);
+ return;
+ }
+
+ if (!fd_request)
+ return;
+
+ fd_request->errors++;
+ if (fd_request->errors >= MAX_ERRORS) {
+ printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive );
+ fd_end_request_cur(-EIO);
+ }
+ else if (fd_request->errors == RECALIBRATE_ERRORS) {
+ printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive );
+ if (SelectedDrive != -1)
+ SUD.track = -1;
+ }
+ redo_fd_request();
+}
+
+
+
+#define SET_IRQ_HANDLER(proc) do { FloppyIRQHandler = (proc); } while(0)
+
+
+/* ---------- Formatting ---------- */
+
+#define FILL(n,val) \
+ do { \
+ memset( p, val, n ); \
+ p += n; \
+ } while(0)
+
+static int do_format(int drive, int type, struct atari_format_descr *desc)
+{
+ unsigned char *p;
+ int sect, nsect;
+ unsigned long flags;
+
+ DPRINT(("do_format( dr=%d tr=%d he=%d offs=%d )\n",
+ drive, desc->track, desc->head, desc->sect_offset ));
+
+ wait_event(fdc_wait, cmpxchg(&fdc_busy, 0, 1) == 0);
+ local_irq_save(flags);
+ stdma_lock(floppy_irq, NULL);
+ atari_turnon_irq( IRQ_MFP_FDC ); /* should be already, just to be sure */
+ local_irq_restore(flags);
+
+ if (type) {
+ if (--type >= NUM_DISK_MINORS ||
+ minor2disktype[type].drive_types > DriveType) {
+ redo_fd_request();
+ return -EINVAL;
+ }
+ type = minor2disktype[type].index;
+ UDT = &atari_disk_type[type];
+ }
+
+ if (!UDT || desc->track >= UDT->blocks/UDT->spt/2 || desc->head >= 2) {
+ redo_fd_request();
+ return -EINVAL;
+ }
+
+ nsect = UDT->spt;
+ p = TrackBuffer;
+ /* The track buffer is used for the raw track data, so its
+ contents become invalid! */
+ BufferDrive = -1;
+ /* stop deselect timer */
+ del_timer( &motor_off_timer );
+
+ FILL( 60 * (nsect / 9), 0x4e );
+ for( sect = 0; sect < nsect; ++sect ) {
+ FILL( 12, 0 );
+ FILL( 3, 0xf5 );
+ *p++ = 0xfe;
+ *p++ = desc->track;
+ *p++ = desc->head;
+ *p++ = (nsect + sect - desc->sect_offset) % nsect + 1;
+ *p++ = 2;
+ *p++ = 0xf7;
+ FILL( 22, 0x4e );
+ FILL( 12, 0 );
+ FILL( 3, 0xf5 );
+ *p++ = 0xfb;
+ FILL( 512, 0xe5 );
+ *p++ = 0xf7;
+ FILL( 40, 0x4e );
+ }
+ FILL( TrackBuffer+BUFFER_SIZE-p, 0x4e );
+
+ IsFormatting = 1;
+ FormatError = 0;
+ ReqTrack = desc->track;
+ ReqSide = desc->head;
+ do_fd_action( drive );
+
+ wait_for_completion(&format_wait);
+
+ redo_fd_request();
+ return( FormatError ? -EIO : 0 );
+}
+
+
+/* do_fd_action() is the general procedure for a fd request: All
+ * required parameter settings (drive select, side select, track
+ * position) are checked and set if needed. For each of these
+ * parameters and the actual reading or writing exist two functions:
+ * one that starts the setting (or skips it if possible) and one
+ * callback for the "done" interrupt. Each done func calls the next
+ * set function to propagate the request down to fd_rwsec_done().
+ */
+
+static void do_fd_action( int drive )
+{
+ DPRINT(("do_fd_action\n"));
+
+ if (UseTrackbuffer && !IsFormatting) {
+ repeat:
+ if (IS_BUFFERED( drive, ReqSide, ReqTrack )) {
+ if (ReqCmd == READ) {
+ copy_buffer( SECTOR_BUFFER(ReqSector), ReqData );
+ if (++ReqCnt < blk_rq_cur_sectors(fd_request)) {
+ /* read next sector */
+ setup_req_params( drive );
+ goto repeat;
+ }
+ else {
+ /* all sectors finished */
+ fd_end_request_cur(0);
+ redo_fd_request();
+ return;
+ }
+ }
+ else {
+ /* cmd == WRITE, pay attention to track buffer
+ * consistency! */
+ copy_buffer( ReqData, SECTOR_BUFFER(ReqSector) );
+ }
+ }
+ }
+
+ if (SelectedDrive != drive)
+ fd_select_drive( drive );
+
+ if (UD.track == -1)
+ fd_calibrate();
+ else if (UD.track != ReqTrack << UDT->stretch)
+ fd_seek();
+ else if (IsFormatting)
+ fd_writetrack();
+ else
+ fd_rwsec();
+}
+
+
+/* Seek to track 0 if the current track is unknown */
+
+static void fd_calibrate( void )
+{
+ if (SUD.track >= 0) {
+ fd_calibrate_done( 0 );
+ return;
+ }
+
+ if (ATARIHW_PRESENT(FDCSPEED))
+ dma_wd.fdc_speed = 0; /* always seek with 8 Mhz */;
+ DPRINT(("fd_calibrate\n"));
+ SET_IRQ_HANDLER( fd_calibrate_done );
+ /* we can't verify, since the speed may be incorrect */
+ FDC_WRITE( FDCREG_CMD, FDCCMD_RESTORE | SUD.steprate );
+
+ NeedSeek = 1;
+ MotorOn = 1;
+ start_timeout();
+ /* wait for IRQ */
+}
+
+
+static void fd_calibrate_done( int status )
+{
+ DPRINT(("fd_calibrate_done()\n"));
+ stop_timeout();
+
+ /* set the correct speed now */
+ if (ATARIHW_PRESENT(FDCSPEED))
+ dma_wd.fdc_speed = SUDT->fdc_speed;
+ if (status & FDCSTAT_RECNF) {
+ printk(KERN_ERR "fd%d: restore failed\n", SelectedDrive );
+ fd_error();
+ }
+ else {
+ SUD.track = 0;
+ fd_seek();
+ }
+}
+
+
+/* Seek the drive to the requested track. The drive must have been
+ * calibrated at some point before this.
+ */
+
+static void fd_seek( void )
+{
+ if (SUD.track == ReqTrack << SUDT->stretch) {
+ fd_seek_done( 0 );
+ return;
+ }
+
+ if (ATARIHW_PRESENT(FDCSPEED)) {
+ dma_wd.fdc_speed = 0; /* always seek witch 8 Mhz */
+ MFPDELAY();
+ }
+
+ DPRINT(("fd_seek() to track %d\n",ReqTrack));
+ FDC_WRITE( FDCREG_DATA, ReqTrack << SUDT->stretch);
+ udelay(25);
+ SET_IRQ_HANDLER( fd_seek_done );
+ FDC_WRITE( FDCREG_CMD, FDCCMD_SEEK | SUD.steprate );
+
+ MotorOn = 1;
+ set_head_settle_flag();
+ start_timeout();
+ /* wait for IRQ */
+}
+
+
+static void fd_seek_done( int status )
+{
+ DPRINT(("fd_seek_done()\n"));
+ stop_timeout();
+
+ /* set the correct speed */
+ if (ATARIHW_PRESENT(FDCSPEED))
+ dma_wd.fdc_speed = SUDT->fdc_speed;
+ if (status & FDCSTAT_RECNF) {
+ printk(KERN_ERR "fd%d: seek error (to track %d)\n",
+ SelectedDrive, ReqTrack );
+ /* we don't know exactly which track we are on now! */
+ SUD.track = -1;
+ fd_error();
+ }
+ else {
+ SUD.track = ReqTrack << SUDT->stretch;
+ NeedSeek = 0;
+ if (IsFormatting)
+ fd_writetrack();
+ else
+ fd_rwsec();
+ }
+}
+
+
+/* This does the actual reading/writing after positioning the head
+ * over the correct track.
+ */
+
+static int MultReadInProgress = 0;
+
+
+static void fd_rwsec( void )
+{
+ unsigned long paddr, flags;
+ unsigned int rwflag, old_motoron;
+ unsigned int track;
+
+ DPRINT(("fd_rwsec(), Sec=%d, Access=%c\n",ReqSector, ReqCmd == WRITE ? 'w' : 'r' ));
+ if (ReqCmd == WRITE) {
+ if (ATARIHW_PRESENT(EXTD_DMA)) {
+ paddr = virt_to_phys(ReqData);
+ }
+ else {
+ copy_buffer( ReqData, DMABuffer );
+ paddr = PhysDMABuffer;
+ }
+ dma_cache_maintenance( paddr, 512, 1 );
+ rwflag = 0x100;
+ }
+ else {
+ if (read_track)
+ paddr = PhysTrackBuffer;
+ else
+ paddr = ATARIHW_PRESENT(EXTD_DMA) ?
+ virt_to_phys(ReqData) : PhysDMABuffer;
+ rwflag = 0;
+ }
+
+ fd_select_side( ReqSide );
+
+ /* Start sector of this operation */
+ FDC_WRITE( FDCREG_SECTOR, read_track ? 1 : ReqSector );
+ MFPDELAY();
+ /* Cheat for track if stretch != 0 */
+ if (SUDT->stretch) {
+ track = FDC_READ( FDCREG_TRACK);
+ MFPDELAY();
+ FDC_WRITE( FDCREG_TRACK, track >> SUDT->stretch);
+ }
+ udelay(25);
+
+ /* Setup DMA */
+ local_irq_save(flags);
+ dma_wd.dma_lo = (unsigned char)paddr;
+ MFPDELAY();
+ paddr >>= 8;
+ dma_wd.dma_md = (unsigned char)paddr;
+ MFPDELAY();
+ paddr >>= 8;
+ if (ATARIHW_PRESENT(EXTD_DMA))
+ st_dma_ext_dmahi = (unsigned short)paddr;
+ else
+ dma_wd.dma_hi = (unsigned char)paddr;
+ MFPDELAY();
+ local_irq_restore(flags);
+
+ /* Clear FIFO and switch DMA to correct mode */
+ dma_wd.dma_mode_status = 0x90 | rwflag;
+ MFPDELAY();
+ dma_wd.dma_mode_status = 0x90 | (rwflag ^ 0x100);
+ MFPDELAY();
+ dma_wd.dma_mode_status = 0x90 | rwflag;
+ MFPDELAY();
+
+ /* How many sectors for DMA */
+ dma_wd.fdc_acces_seccount = read_track ? SUDT->spt : 1;
+
+ udelay(25);
+
+ /* Start operation */
+ dma_wd.dma_mode_status = FDCSELREG_STP | rwflag;
+ udelay(25);
+ SET_IRQ_HANDLER( fd_rwsec_done );
+ dma_wd.fdc_acces_seccount =
+ (get_head_settle_flag() |
+ (rwflag ? FDCCMD_WRSEC : (FDCCMD_RDSEC | (read_track ? FDCCMDADD_M : 0))));
+
+ old_motoron = MotorOn;
+ MotorOn = 1;
+ NeedSeek = 1;
+ /* wait for interrupt */
+
+ if (read_track) {
+ /* If reading a whole track, wait about one disk rotation and
+ * then check if all sectors are read. The FDC will even
+ * search for the first non-existent sector and need 1 sec to
+ * recognise that it isn't present :-(
+ */
+ MultReadInProgress = 1;
+ mod_timer(&readtrack_timer,
+ /* 1 rot. + 5 rot.s if motor was off */
+ jiffies + HZ/5 + (old_motoron ? 0 : HZ));
+ }
+ start_timeout();
+}
+
+
+static void fd_readtrack_check( unsigned long dummy )
+{
+ unsigned long flags, addr, addr2;
+
+ local_irq_save(flags);
+
+ if (!MultReadInProgress) {
+ /* This prevents a race condition that could arise if the
+ * interrupt is triggered while the calling of this timer
+ * callback function takes place. The IRQ function then has
+ * already cleared 'MultReadInProgress' when flow of control
+ * gets here.
+ */
+ local_irq_restore(flags);
+ return;
+ }
+
+ /* get the current DMA address */
+ /* ++ f.a. read twice to avoid being fooled by switcher */
+ addr = 0;
+ do {
+ addr2 = addr;
+ addr = dma_wd.dma_lo & 0xff;
+ MFPDELAY();
+ addr |= (dma_wd.dma_md & 0xff) << 8;
+ MFPDELAY();
+ if (ATARIHW_PRESENT( EXTD_DMA ))
+ addr |= (st_dma_ext_dmahi & 0xffff) << 16;
+ else
+ addr |= (dma_wd.dma_hi & 0xff) << 16;
+ MFPDELAY();
+ } while(addr != addr2);
+
+ if (addr >= PhysTrackBuffer + SUDT->spt*512) {
+ /* already read enough data, force an FDC interrupt to stop
+ * the read operation
+ */
+ SET_IRQ_HANDLER( NULL );
+ MultReadInProgress = 0;
+ local_irq_restore(flags);
+ DPRINT(("fd_readtrack_check(): done\n"));
+ FDC_WRITE( FDCREG_CMD, FDCCMD_FORCI );
+ udelay(25);
+
+ /* No error until now -- the FDC would have interrupted
+ * otherwise!
+ */
+ fd_rwsec_done1(0);
+ }
+ else {
+ /* not yet finished, wait another tenth rotation */
+ local_irq_restore(flags);
+ DPRINT(("fd_readtrack_check(): not yet finished\n"));
+ mod_timer(&readtrack_timer, jiffies + HZ/5/10);
+ }
+}
+
+
+static void fd_rwsec_done( int status )
+{
+ DPRINT(("fd_rwsec_done()\n"));
+
+ if (read_track) {
+ del_timer(&readtrack_timer);
+ if (!MultReadInProgress)
+ return;
+ MultReadInProgress = 0;
+ }
+ fd_rwsec_done1(status);
+}
+
+static void fd_rwsec_done1(int status)
+{
+ unsigned int track;
+
+ stop_timeout();
+
+ /* Correct the track if stretch != 0 */
+ if (SUDT->stretch) {
+ track = FDC_READ( FDCREG_TRACK);
+ MFPDELAY();
+ FDC_WRITE( FDCREG_TRACK, track << SUDT->stretch);
+ }
+
+ if (!UseTrackbuffer) {
+ dma_wd.dma_mode_status = 0x90;
+ MFPDELAY();
+ if (!(dma_wd.dma_mode_status & 0x01)) {
+ printk(KERN_ERR "fd%d: DMA error\n", SelectedDrive );
+ goto err_end;
+ }
+ }
+ MFPDELAY();
+
+ if (ReqCmd == WRITE && (status & FDCSTAT_WPROT)) {
+ printk(KERN_NOTICE "fd%d: is write protected\n", SelectedDrive );
+ goto err_end;
+ }
+ if ((status & FDCSTAT_RECNF) &&
+ /* RECNF is no error after a multiple read when the FDC
+ searched for a non-existent sector! */
+ !(read_track && FDC_READ(FDCREG_SECTOR) > SUDT->spt)) {
+ if (Probing) {
+ if (SUDT > atari_disk_type) {
+ if (SUDT[-1].blocks > ReqBlock) {
+ /* try another disk type */
+ SUDT--;
+ set_capacity(unit[SelectedDrive].disk,
+ SUDT->blocks);
+ } else
+ Probing = 0;
+ }
+ else {
+ if (SUD.flags & FTD_MSG)
+ printk(KERN_INFO "fd%d: Auto-detected floppy type %s\n",
+ SelectedDrive, SUDT->name );
+ Probing=0;
+ }
+ } else {
+/* record not found, but not probing. Maybe stretch wrong ? Restart probing */
+ if (SUD.autoprobe) {
+ SUDT = atari_disk_type + StartDiskType[DriveType];
+ set_capacity(unit[SelectedDrive].disk,
+ SUDT->blocks);
+ Probing = 1;
+ }
+ }
+ if (Probing) {
+ if (ATARIHW_PRESENT(FDCSPEED)) {
+ dma_wd.fdc_speed = SUDT->fdc_speed;
+ MFPDELAY();
+ }
+ setup_req_params( SelectedDrive );
+ BufferDrive = -1;
+ do_fd_action( SelectedDrive );
+ return;
+ }
+
+ printk(KERN_ERR "fd%d: sector %d not found (side %d, track %d)\n",
+ SelectedDrive, FDC_READ (FDCREG_SECTOR), ReqSide, ReqTrack );
+ goto err_end;
+ }
+ if (status & FDCSTAT_CRC) {
+ printk(KERN_ERR "fd%d: CRC error (side %d, track %d, sector %d)\n",
+ SelectedDrive, ReqSide, ReqTrack, FDC_READ (FDCREG_SECTOR) );
+ goto err_end;
+ }
+ if (status & FDCSTAT_LOST) {
+ printk(KERN_ERR "fd%d: lost data (side %d, track %d, sector %d)\n",
+ SelectedDrive, ReqSide, ReqTrack, FDC_READ (FDCREG_SECTOR) );
+ goto err_end;
+ }
+
+ Probing = 0;
+
+ if (ReqCmd == READ) {
+ if (!read_track) {
+ void *addr;
+ addr = ATARIHW_PRESENT( EXTD_DMA ) ? ReqData : DMABuffer;
+ dma_cache_maintenance( virt_to_phys(addr), 512, 0 );
+ if (!ATARIHW_PRESENT( EXTD_DMA ))
+ copy_buffer (addr, ReqData);
+ } else {
+ dma_cache_maintenance( PhysTrackBuffer, MaxSectors[DriveType] * 512, 0 );
+ BufferDrive = SelectedDrive;
+ BufferSide = ReqSide;
+ BufferTrack = ReqTrack;
+ copy_buffer (SECTOR_BUFFER (ReqSector), ReqData);
+ }
+ }
+
+ if (++ReqCnt < blk_rq_cur_sectors(fd_request)) {
+ /* read next sector */
+ setup_req_params( SelectedDrive );
+ do_fd_action( SelectedDrive );
+ }
+ else {
+ /* all sectors finished */
+ fd_end_request_cur(0);
+ redo_fd_request();
+ }
+ return;
+
+ err_end:
+ BufferDrive = -1;
+ fd_error();
+}
+
+
+static void fd_writetrack( void )
+{
+ unsigned long paddr, flags;
+ unsigned int track;
+
+ DPRINT(("fd_writetrack() Tr=%d Si=%d\n", ReqTrack, ReqSide ));
+
+ paddr = PhysTrackBuffer;
+ dma_cache_maintenance( paddr, BUFFER_SIZE, 1 );
+
+ fd_select_side( ReqSide );
+
+ /* Cheat for track if stretch != 0 */
+ if (SUDT->stretch) {
+ track = FDC_READ( FDCREG_TRACK);
+ MFPDELAY();
+ FDC_WRITE(FDCREG_TRACK,track >> SUDT->stretch);
+ }
+ udelay(40);
+
+ /* Setup DMA */
+ local_irq_save(flags);
+ dma_wd.dma_lo = (unsigned char)paddr;
+ MFPDELAY();
+ paddr >>= 8;
+ dma_wd.dma_md = (unsigned char)paddr;
+ MFPDELAY();
+ paddr >>= 8;
+ if (ATARIHW_PRESENT( EXTD_DMA ))
+ st_dma_ext_dmahi = (unsigned short)paddr;
+ else
+ dma_wd.dma_hi = (unsigned char)paddr;
+ MFPDELAY();
+ local_irq_restore(flags);
+
+ /* Clear FIFO and switch DMA to correct mode */
+ dma_wd.dma_mode_status = 0x190;
+ MFPDELAY();
+ dma_wd.dma_mode_status = 0x90;
+ MFPDELAY();
+ dma_wd.dma_mode_status = 0x190;
+ MFPDELAY();
+
+ /* How many sectors for DMA */
+ dma_wd.fdc_acces_seccount = BUFFER_SIZE/512;
+ udelay(40);
+
+ /* Start operation */
+ dma_wd.dma_mode_status = FDCSELREG_STP | 0x100;
+ udelay(40);
+ SET_IRQ_HANDLER( fd_writetrack_done );
+ dma_wd.fdc_acces_seccount = FDCCMD_WRTRA | get_head_settle_flag();
+
+ MotorOn = 1;
+ start_timeout();
+ /* wait for interrupt */
+}
+
+
+static void fd_writetrack_done( int status )
+{
+ DPRINT(("fd_writetrack_done()\n"));
+
+ stop_timeout();
+
+ if (status & FDCSTAT_WPROT) {
+ printk(KERN_NOTICE "fd%d: is write protected\n", SelectedDrive );
+ goto err_end;
+ }
+ if (status & FDCSTAT_LOST) {
+ printk(KERN_ERR "fd%d: lost data (side %d, track %d)\n",
+ SelectedDrive, ReqSide, ReqTrack );
+ goto err_end;
+ }
+
+ complete(&format_wait);
+ return;
+
+ err_end:
+ fd_error();
+}
+
+static void fd_times_out( unsigned long dummy )
+{
+ atari_disable_irq( IRQ_MFP_FDC );
+ if (!FloppyIRQHandler) goto end; /* int occurred after timer was fired, but
+ * before we came here... */
+
+ SET_IRQ_HANDLER( NULL );
+ /* If the timeout occurred while the readtrack_check timer was
+ * active, we need to cancel it, else bad things will happen */
+ if (UseTrackbuffer)
+ del_timer( &readtrack_timer );
+ FDC_WRITE( FDCREG_CMD, FDCCMD_FORCI );
+ udelay( 25 );
+
+ printk(KERN_ERR "floppy timeout\n" );
+ fd_error();
+ end:
+ atari_enable_irq( IRQ_MFP_FDC );
+}
+
+
+/* The (noop) seek operation here is needed to make the WP bit in the
+ * FDC status register accessible for check_change. If the last disk
+ * operation would have been a RDSEC, this bit would always read as 0
+ * no matter what :-( To save time, the seek goes to the track we're
+ * already on.
+ */
+
+static void finish_fdc( void )
+{
+ if (!NeedSeek) {
+ finish_fdc_done( 0 );
+ }
+ else {
+ DPRINT(("finish_fdc: dummy seek started\n"));
+ FDC_WRITE (FDCREG_DATA, SUD.track);
+ SET_IRQ_HANDLER( finish_fdc_done );
+ FDC_WRITE (FDCREG_CMD, FDCCMD_SEEK);
+ MotorOn = 1;
+ start_timeout();
+ /* we must wait for the IRQ here, because the ST-DMA
+ is released immediately afterwards and the interrupt
+ may be delivered to the wrong driver. */
+ }
+}
+
+
+static void finish_fdc_done( int dummy )
+{
+ unsigned long flags;
+
+ DPRINT(("finish_fdc_done entered\n"));
+ stop_timeout();
+ NeedSeek = 0;
+
+ if (timer_pending(&fd_timer) && time_before(fd_timer.expires, jiffies + 5))
+ /* If the check for a disk change is done too early after this
+ * last seek command, the WP bit still reads wrong :-((
+ */
+ mod_timer(&fd_timer, jiffies + 5);
+ else
+ start_check_change_timer();
+ start_motor_off_timer();
+
+ local_irq_save(flags);
+ stdma_release();
+ fdc_busy = 0;
+ wake_up( &fdc_wait );
+ local_irq_restore(flags);
+
+ DPRINT(("finish_fdc() finished\n"));
+}
+
+/* The detection of disk changes is a dark chapter in Atari history :-(
+ * Because the "Drive ready" signal isn't present in the Atari
+ * hardware, one has to rely on the "Write Protect". This works fine,
+ * as long as no write protected disks are used. TOS solves this
+ * problem by introducing tri-state logic ("maybe changed") and
+ * looking at the serial number in block 0. This isn't possible for
+ * Linux, since the floppy driver can't make assumptions about the
+ * filesystem used on the disk and thus the contents of block 0. I've
+ * chosen the method to always say "The disk was changed" if it is
+ * unsure whether it was. This implies that every open or mount
+ * invalidates the disk buffers if you work with write protected
+ * disks. But at least this is better than working with incorrect data
+ * due to unrecognised disk changes.
+ */
+
+static unsigned int floppy_check_events(struct gendisk *disk,
+ unsigned int clearing)
+{
+ struct atari_floppy_struct *p = disk->private_data;
+ unsigned int drive = p - unit;
+ if (test_bit (drive, &fake_change)) {
+ /* simulated change (e.g. after formatting) */
+ return DISK_EVENT_MEDIA_CHANGE;
+ }
+ if (test_bit (drive, &changed_floppies)) {
+ /* surely changed (the WP signal changed at least once) */
+ return DISK_EVENT_MEDIA_CHANGE;
+ }
+ if (UD.wpstat) {
+ /* WP is on -> could be changed: to be sure, buffers should be
+ * invalidated...
+ */
+ return DISK_EVENT_MEDIA_CHANGE;
+ }
+
+ return 0;
+}
+
+static int floppy_revalidate(struct gendisk *disk)
+{
+ struct atari_floppy_struct *p = disk->private_data;
+ unsigned int drive = p - unit;
+
+ if (test_bit(drive, &changed_floppies) ||
+ test_bit(drive, &fake_change) ||
+ p->disktype == 0) {
+ if (UD.flags & FTD_MSG)
+ printk(KERN_ERR "floppy: clear format %p!\n", UDT);
+ BufferDrive = -1;
+ clear_bit(drive, &fake_change);
+ clear_bit(drive, &changed_floppies);
+ /* MSch: clearing geometry makes sense only for autoprobe
+ formats, for 'permanent user-defined' parameter:
+ restore default_params[] here if flagged valid! */
+ if (default_params[drive].blocks == 0)
+ UDT = NULL;
+ else
+ UDT = &default_params[drive];
+ }
+ return 0;
+}
+
+
+/* This sets up the global variables describing the current request. */
+
+static void setup_req_params( int drive )
+{
+ int block = ReqBlock + ReqCnt;
+
+ ReqTrack = block / UDT->spt;
+ ReqSector = block - ReqTrack * UDT->spt + 1;
+ ReqSide = ReqTrack & 1;
+ ReqTrack >>= 1;
+ ReqData = ReqBuffer + 512 * ReqCnt;
+
+ if (UseTrackbuffer)
+ read_track = (ReqCmd == READ && fd_request->errors == 0);
+ else
+ read_track = 0;
+
+ DPRINT(("Request params: Si=%d Tr=%d Se=%d Data=%08lx\n",ReqSide,
+ ReqTrack, ReqSector, (unsigned long)ReqData ));
+}
+
+/*
+ * Round-robin between our available drives, doing one request from each
+ */
+static struct request *set_next_request(void)
+{
+ struct request_queue *q;
+ int old_pos = fdc_queue;
+ struct request *rq = NULL;
+
+ do {
+ q = unit[fdc_queue].disk->queue;
+ if (++fdc_queue == FD_MAX_UNITS)
+ fdc_queue = 0;
+ if (q) {
+ rq = blk_fetch_request(q);
+ if (rq)
+ break;
+ }
+ } while (fdc_queue != old_pos);
+
+ return rq;
+}
+
+
+static void redo_fd_request(void)
+{
+ int drive, type;
+ struct atari_floppy_struct *floppy;
+
+ DPRINT(("redo_fd_request: fd_request=%p dev=%s fd_request->sector=%ld\n",
+ fd_request, fd_request ? fd_request->rq_disk->disk_name : "",
+ fd_request ? blk_rq_pos(fd_request) : 0 ));
+
+ IsFormatting = 0;
+
+repeat:
+ if (!fd_request) {
+ fd_request = set_next_request();
+ if (!fd_request)
+ goto the_end;
+ }
+
+ floppy = fd_request->rq_disk->private_data;
+ drive = floppy - unit;
+ type = floppy->type;
+
+ if (!UD.connected) {
+ /* drive not connected */
+ printk(KERN_ERR "Unknown Device: fd%d\n", drive );
+ fd_end_request_cur(-EIO);
+ goto repeat;
+ }
+
+ if (type == 0) {
+ if (!UDT) {
+ Probing = 1;
+ UDT = atari_disk_type + StartDiskType[DriveType];
+ set_capacity(floppy->disk, UDT->blocks);
+ UD.autoprobe = 1;
+ }
+ }
+ else {
+ /* user supplied disk type */
+ if (--type >= NUM_DISK_MINORS) {
+ printk(KERN_WARNING "fd%d: invalid disk format", drive );
+ fd_end_request_cur(-EIO);
+ goto repeat;
+ }
+ if (minor2disktype[type].drive_types > DriveType) {
+ printk(KERN_WARNING "fd%d: unsupported disk format", drive );
+ fd_end_request_cur(-EIO);
+ goto repeat;
+ }
+ type = minor2disktype[type].index;
+ UDT = &atari_disk_type[type];
+ set_capacity(floppy->disk, UDT->blocks);
+ UD.autoprobe = 0;
+ }
+
+ if (blk_rq_pos(fd_request) + 1 > UDT->blocks) {
+ fd_end_request_cur(-EIO);
+ goto repeat;
+ }
+
+ /* stop deselect timer */
+ del_timer( &motor_off_timer );
+
+ ReqCnt = 0;
+ ReqCmd = rq_data_dir(fd_request);
+ ReqBlock = blk_rq_pos(fd_request);
+ ReqBuffer = bio_data(fd_request->bio);
+ setup_req_params( drive );
+ do_fd_action( drive );
+
+ return;
+
+ the_end:
+ finish_fdc();
+}
+
+
+void do_fd_request(struct request_queue * q)
+{
+ DPRINT(("do_fd_request for pid %d\n",current->pid));
+ wait_event(fdc_wait, cmpxchg(&fdc_busy, 0, 1) == 0);
+ stdma_lock(floppy_irq, NULL);
+
+ atari_disable_irq( IRQ_MFP_FDC );
+ redo_fd_request();
+ atari_enable_irq( IRQ_MFP_FDC );
+}
+
+static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long param)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ struct atari_floppy_struct *floppy = disk->private_data;
+ int drive = floppy - unit;
+ int type = floppy->type;
+ struct atari_format_descr fmt_desc;
+ struct atari_disk_type *dtp;
+ struct floppy_struct getprm;
+ int settype;
+ struct floppy_struct setprm;
+ void __user *argp = (void __user *)param;
+
+ switch (cmd) {
+ case FDGETPRM:
+ if (type) {
+ if (--type >= NUM_DISK_MINORS)
+ return -ENODEV;
+ if (minor2disktype[type].drive_types > DriveType)
+ return -ENODEV;
+ type = minor2disktype[type].index;
+ dtp = &atari_disk_type[type];
+ if (UD.flags & FTD_MSG)
+ printk (KERN_ERR "floppy%d: found dtp %p name %s!\n",
+ drive, dtp, dtp->name);
+ }
+ else {
+ if (!UDT)
+ return -ENXIO;
+ else
+ dtp = UDT;
+ }
+ memset((void *)&getprm, 0, sizeof(getprm));
+ getprm.size = dtp->blocks;
+ getprm.sect = dtp->spt;
+ getprm.head = 2;
+ getprm.track = dtp->blocks/dtp->spt/2;
+ getprm.stretch = dtp->stretch;
+ if (copy_to_user(argp, &getprm, sizeof(getprm)))
+ return -EFAULT;
+ return 0;
+ }
+ switch (cmd) {
+ case FDSETPRM:
+ case FDDEFPRM:
+ /*
+ * MSch 7/96: simple 'set geometry' case: just set the
+ * 'default' device params (minor == 0).
+ * Currently, the drive geometry is cleared after each
+ * disk change and subsequent revalidate()! simple
+ * implementation of FDDEFPRM: save geometry from a
+ * FDDEFPRM call and restore it in floppy_revalidate() !
+ */
+
+ /* get the parameters from user space */
+ if (floppy->ref != 1 && floppy->ref != -1)
+ return -EBUSY;
+ if (copy_from_user(&setprm, argp, sizeof(setprm)))
+ return -EFAULT;
+ /*
+ * first of all: check for floppy change and revalidate,
+ * or the next access will revalidate - and clear UDT :-(
+ */
+
+ if (floppy_check_events(disk, 0))
+ floppy_revalidate(disk);
+
+ if (UD.flags & FTD_MSG)
+ printk (KERN_INFO "floppy%d: setting size %d spt %d str %d!\n",
+ drive, setprm.size, setprm.sect, setprm.stretch);
+
+ /* what if type > 0 here? Overwrite specified entry ? */
+ if (type) {
+ /* refuse to re-set a predefined type for now */
+ redo_fd_request();
+ return -EINVAL;
+ }
+
+ /*
+ * type == 0: first look for a matching entry in the type list,
+ * and set the UD.disktype field to use the perdefined entry.
+ * TODO: add user-defined format to head of autoprobe list ?
+ * Useful to include the user-type for future autodetection!
+ */
+
+ for (settype = 0; settype < NUM_DISK_MINORS; settype++) {
+ int setidx = 0;
+ if (minor2disktype[settype].drive_types > DriveType) {
+ /* skip this one, invalid for drive ... */
+ continue;
+ }
+ setidx = minor2disktype[settype].index;
+ dtp = &atari_disk_type[setidx];
+
+ /* found matching entry ?? */
+ if ( dtp->blocks == setprm.size
+ && dtp->spt == setprm.sect
+ && dtp->stretch == setprm.stretch ) {
+ if (UD.flags & FTD_MSG)
+ printk (KERN_INFO "floppy%d: setting %s %p!\n",
+ drive, dtp->name, dtp);
+ UDT = dtp;
+ set_capacity(floppy->disk, UDT->blocks);
+
+ if (cmd == FDDEFPRM) {
+ /* save settings as permanent default type */
+ default_params[drive].name = dtp->name;
+ default_params[drive].spt = dtp->spt;
+ default_params[drive].blocks = dtp->blocks;
+ default_params[drive].fdc_speed = dtp->fdc_speed;
+ default_params[drive].stretch = dtp->stretch;
+ }
+
+ return 0;
+ }
+
+ }
+
+ /* no matching disk type found above - setting user_params */
+
+ if (cmd == FDDEFPRM) {
+ /* set permanent type */
+ dtp = &default_params[drive];
+ } else
+ /* set user type (reset by disk change!) */
+ dtp = &user_params[drive];
+
+ dtp->name = "user format";
+ dtp->blocks = setprm.size;
+ dtp->spt = setprm.sect;
+ if (setprm.sect > 14)
+ dtp->fdc_speed = 3;
+ else
+ dtp->fdc_speed = 0;
+ dtp->stretch = setprm.stretch;
+
+ if (UD.flags & FTD_MSG)
+ printk (KERN_INFO "floppy%d: blk %d spt %d str %d!\n",
+ drive, dtp->blocks, dtp->spt, dtp->stretch);
+
+ /* sanity check */
+ if (setprm.track != dtp->blocks/dtp->spt/2 ||
+ setprm.head != 2) {
+ redo_fd_request();
+ return -EINVAL;
+ }
+
+ UDT = dtp;
+ set_capacity(floppy->disk, UDT->blocks);
+
+ return 0;
+ case FDMSGON:
+ UD.flags |= FTD_MSG;
+ return 0;
+ case FDMSGOFF:
+ UD.flags &= ~FTD_MSG;
+ return 0;
+ case FDSETEMSGTRESH:
+ return -EINVAL;
+ case FDFMTBEG:
+ return 0;
+ case FDFMTTRK:
+ if (floppy->ref != 1 && floppy->ref != -1)
+ return -EBUSY;
+ if (copy_from_user(&fmt_desc, argp, sizeof(fmt_desc)))
+ return -EFAULT;
+ return do_format(drive, type, &fmt_desc);
+ case FDCLRPRM:
+ UDT = NULL;
+ /* MSch: invalidate default_params */
+ default_params[drive].blocks = 0;
+ set_capacity(floppy->disk, MAX_DISK_SIZE * 2);
+ case FDFMTEND:
+ case FDFLUSH:
+ /* invalidate the buffer track to force a reread */
+ BufferDrive = -1;
+ set_bit(drive, &fake_change);
+ check_disk_change(bdev);
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+static int fd_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ int ret;
+
+ mutex_lock(&ataflop_mutex);
+ ret = fd_locked_ioctl(bdev, mode, cmd, arg);
+ mutex_unlock(&ataflop_mutex);
+
+ return ret;
+}
+
+/* Initialize the 'unit' variable for drive 'drive' */
+
+static void __init fd_probe( int drive )
+{
+ UD.connected = 0;
+ UDT = NULL;
+
+ if (!fd_test_drive_present( drive ))
+ return;
+
+ UD.connected = 1;
+ UD.track = 0;
+ switch( UserSteprate[drive] ) {
+ case 2:
+ UD.steprate = FDCSTEP_2;
+ break;
+ case 3:
+ UD.steprate = FDCSTEP_3;
+ break;
+ case 6:
+ UD.steprate = FDCSTEP_6;
+ break;
+ case 12:
+ UD.steprate = FDCSTEP_12;
+ break;
+ default: /* should be -1 for "not set by user" */
+ if (ATARIHW_PRESENT( FDCSPEED ) || MACH_IS_MEDUSA)
+ UD.steprate = FDCSTEP_3;
+ else
+ UD.steprate = FDCSTEP_6;
+ break;
+ }
+ MotorOn = 1; /* from probe restore operation! */
+}
+
+
+/* This function tests the physical presence of a floppy drive (not
+ * whether a disk is inserted). This is done by issuing a restore
+ * command, waiting max. 2 seconds (that should be enough to move the
+ * head across the whole disk) and looking at the state of the "TR00"
+ * signal. This should now be raised if there is a drive connected
+ * (and there is no hardware failure :-) Otherwise, the drive is
+ * declared absent.
+ */
+
+static int __init fd_test_drive_present( int drive )
+{
+ unsigned long timeout;
+ unsigned char status;
+ int ok;
+
+ if (drive >= (MACH_IS_FALCON ? 1 : 2)) return( 0 );
+ fd_select_drive( drive );
+
+ /* disable interrupt temporarily */
+ atari_turnoff_irq( IRQ_MFP_FDC );
+ FDC_WRITE (FDCREG_TRACK, 0xff00);
+ FDC_WRITE( FDCREG_CMD, FDCCMD_RESTORE | FDCCMDADD_H | FDCSTEP_6 );
+
+ timeout = jiffies + 2*HZ+HZ/2;
+ while (time_before(jiffies, timeout))
+ if (!(st_mfp.par_dt_reg & 0x20))
+ break;
+
+ status = FDC_READ( FDCREG_STATUS );
+ ok = (status & FDCSTAT_TR00) != 0;
+
+ /* force interrupt to abort restore operation (FDC would try
+ * about 50 seconds!) */
+ FDC_WRITE( FDCREG_CMD, FDCCMD_FORCI );
+ udelay(500);
+ status = FDC_READ( FDCREG_STATUS );
+ udelay(20);
+
+ if (ok) {
+ /* dummy seek command to make WP bit accessible */
+ FDC_WRITE( FDCREG_DATA, 0 );
+ FDC_WRITE( FDCREG_CMD, FDCCMD_SEEK );
+ while( st_mfp.par_dt_reg & 0x20 )
+ ;
+ status = FDC_READ( FDCREG_STATUS );
+ }
+
+ atari_turnon_irq( IRQ_MFP_FDC );
+ return( ok );
+}
+
+
+/* Look how many and which kind of drives are connected. If there are
+ * floppies, additionally start the disk-change and motor-off timers.
+ */
+
+static void __init config_types( void )
+{
+ int drive, cnt = 0;
+
+ /* for probing drives, set the FDC speed to 8 MHz */
+ if (ATARIHW_PRESENT(FDCSPEED))
+ dma_wd.fdc_speed = 0;
+
+ printk(KERN_INFO "Probing floppy drive(s):\n");
+ for( drive = 0; drive < FD_MAX_UNITS; drive++ ) {
+ fd_probe( drive );
+ if (UD.connected) {
+ printk(KERN_INFO "fd%d\n", drive);
+ ++cnt;
+ }
+ }
+
+ if (FDC_READ( FDCREG_STATUS ) & FDCSTAT_BUSY) {
+ /* If FDC is still busy from probing, give it another FORCI
+ * command to abort the operation. If this isn't done, the FDC
+ * will interrupt later and its IRQ line stays low, because
+ * the status register isn't read. And this will block any
+ * interrupts on this IRQ line :-(
+ */
+ FDC_WRITE( FDCREG_CMD, FDCCMD_FORCI );
+ udelay(500);
+ FDC_READ( FDCREG_STATUS );
+ udelay(20);
+ }
+
+ if (cnt > 0) {
+ start_motor_off_timer();
+ if (cnt == 1) fd_select_drive( 0 );
+ start_check_change_timer();
+ }
+}
+
+/*
+ * floppy_open check for aliasing (/dev/fd0 can be the same as
+ * /dev/PS0 etc), and disallows simultaneous access to the same
+ * drive with different device numbers.
+ */
+
+static int floppy_open(struct block_device *bdev, fmode_t mode)
+{
+ struct atari_floppy_struct *p = bdev->bd_disk->private_data;
+ int type = MINOR(bdev->bd_dev) >> 2;
+
+ DPRINT(("fd_open: type=%d\n",type));
+ if (p->ref && p->type != type)
+ return -EBUSY;
+
+ if (p->ref == -1 || (p->ref && mode & FMODE_EXCL))
+ return -EBUSY;
+
+ if (mode & FMODE_EXCL)
+ p->ref = -1;
+ else
+ p->ref++;
+
+ p->type = type;
+
+ if (mode & FMODE_NDELAY)
+ return 0;
+
+ if (mode & (FMODE_READ|FMODE_WRITE)) {
+ check_disk_change(bdev);
+ if (mode & FMODE_WRITE) {
+ if (p->wpstat) {
+ if (p->ref < 0)
+ p->ref = 0;
+ else
+ p->ref--;
+ return -EROFS;
+ }
+ }
+ }
+ return 0;
+}
+
+static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
+{
+ int ret;
+
+ mutex_lock(&ataflop_mutex);
+ ret = floppy_open(bdev, mode);
+ mutex_unlock(&ataflop_mutex);
+
+ return ret;
+}
+
+static void floppy_release(struct gendisk *disk, fmode_t mode)
+{
+ struct atari_floppy_struct *p = disk->private_data;
+ mutex_lock(&ataflop_mutex);
+ if (p->ref < 0)
+ p->ref = 0;
+ else if (!p->ref--) {
+ printk(KERN_ERR "floppy_release with fd_ref == 0");
+ p->ref = 0;
+ }
+ mutex_unlock(&ataflop_mutex);
+}
+
+static const struct block_device_operations floppy_fops = {
+ .owner = THIS_MODULE,
+ .open = floppy_unlocked_open,
+ .release = floppy_release,
+ .ioctl = fd_ioctl,
+ .check_events = floppy_check_events,
+ .revalidate_disk= floppy_revalidate,
+};
+
+static struct kobject *floppy_find(dev_t dev, int *part, void *data)
+{
+ int drive = *part & 3;
+ int type = *part >> 2;
+ if (drive >= FD_MAX_UNITS || type > NUM_DISK_MINORS)
+ return NULL;
+ *part = 0;
+ return get_disk(unit[drive].disk);
+}
+
+static int __init atari_floppy_init (void)
+{
+ int i;
+
+ if (!MACH_IS_ATARI)
+ /* Amiga, Mac, ... don't have Atari-compatible floppy :-) */
+ return -ENODEV;
+
+ if (register_blkdev(FLOPPY_MAJOR,"fd"))
+ return -EBUSY;
+
+ for (i = 0; i < FD_MAX_UNITS; i++) {
+ unit[i].disk = alloc_disk(1);
+ if (!unit[i].disk)
+ goto Enomem;
+ }
+
+ if (UseTrackbuffer < 0)
+ /* not set by user -> use default: for now, we turn
+ track buffering off for all Medusas, though it
+ could be used with ones that have a counter
+ card. But the test is too hard :-( */
+ UseTrackbuffer = !MACH_IS_MEDUSA;
+
+ /* initialize variables */
+ SelectedDrive = -1;
+ BufferDrive = -1;
+
+ DMABuffer = atari_stram_alloc(BUFFER_SIZE+512, "ataflop");
+ if (!DMABuffer) {
+ printk(KERN_ERR "atari_floppy_init: cannot get dma buffer\n");
+ goto Enomem;
+ }
+ TrackBuffer = DMABuffer + 512;
+ PhysDMABuffer = atari_stram_to_phys(DMABuffer);
+ PhysTrackBuffer = virt_to_phys(TrackBuffer);
+ BufferDrive = BufferSide = BufferTrack = -1;
+
+ for (i = 0; i < FD_MAX_UNITS; i++) {
+ unit[i].track = -1;
+ unit[i].flags = 0;
+ unit[i].disk->major = FLOPPY_MAJOR;
+ unit[i].disk->first_minor = i;
+ sprintf(unit[i].disk->disk_name, "fd%d", i);
+ unit[i].disk->fops = &floppy_fops;
+ unit[i].disk->private_data = &unit[i];
+ unit[i].disk->queue = blk_init_queue(do_fd_request,
+ &ataflop_lock);
+ if (!unit[i].disk->queue)
+ goto Enomem;
+ set_capacity(unit[i].disk, MAX_DISK_SIZE * 2);
+ add_disk(unit[i].disk);
+ }
+
+ blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE,
+ floppy_find, NULL, NULL);
+
+ printk(KERN_INFO "Atari floppy driver: max. %cD, %strack buffering\n",
+ DriveType == 0 ? 'D' : DriveType == 1 ? 'H' : 'E',
+ UseTrackbuffer ? "" : "no ");
+ config_types();
+
+ return 0;
+Enomem:
+ while (i--) {
+ struct request_queue *q = unit[i].disk->queue;
+
+ put_disk(unit[i].disk);
+ if (q)
+ blk_cleanup_queue(q);
+ }
+
+ unregister_blkdev(FLOPPY_MAJOR, "fd");
+ return -ENOMEM;
+}
+
+#ifndef MODULE
+static int __init atari_floppy_setup(char *str)
+{
+ int ints[3 + FD_MAX_UNITS];
+ int i;
+
+ if (!MACH_IS_ATARI)
+ return 0;
+
+ str = get_options(str, 3 + FD_MAX_UNITS, ints);
+
+ if (ints[0] < 1) {
+ printk(KERN_ERR "ataflop_setup: no arguments!\n" );
+ return 0;
+ }
+ else if (ints[0] > 2+FD_MAX_UNITS) {
+ printk(KERN_ERR "ataflop_setup: too many arguments\n" );
+ }
+
+ if (ints[1] < 0 || ints[1] > 2)
+ printk(KERN_ERR "ataflop_setup: bad drive type\n" );
+ else
+ DriveType = ints[1];
+
+ if (ints[0] >= 2)
+ UseTrackbuffer = (ints[2] > 0);
+
+ for( i = 3; i <= ints[0] && i-3 < FD_MAX_UNITS; ++i ) {
+ if (ints[i] != 2 && ints[i] != 3 && ints[i] != 6 && ints[i] != 12)
+ printk(KERN_ERR "ataflop_setup: bad steprate\n" );
+ else
+ UserSteprate[i-3] = ints[i];
+ }
+ return 1;
+}
+
+__setup("floppy=", atari_floppy_setup);
+#endif
+
+static void __exit atari_floppy_exit(void)
+{
+ int i;
+ blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
+ for (i = 0; i < FD_MAX_UNITS; i++) {
+ struct request_queue *q = unit[i].disk->queue;
+
+ del_gendisk(unit[i].disk);
+ put_disk(unit[i].disk);
+ blk_cleanup_queue(q);
+ }
+ unregister_blkdev(FLOPPY_MAJOR, "fd");
+
+ del_timer_sync(&fd_timer);
+ atari_stram_free( DMABuffer );
+}
+
+module_init(atari_floppy_init)
+module_exit(atari_floppy_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
new file mode 100644
index 000000000..64ab4951e
--- /dev/null
+++ b/drivers/block/brd.c
@@ -0,0 +1,651 @@
+/*
+ * Ram backed block device driver.
+ *
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ *
+ * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
+ * of their respective owners.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/major.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/highmem.h>
+#include <linux/mutex.h>
+#include <linux/radix-tree.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+
+#define SECTOR_SHIFT 9
+#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
+#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT)
+
+/*
+ * Each block ramdisk device has a radix_tree brd_pages of pages that stores
+ * the pages containing the block device's contents. A brd page's ->index is
+ * its offset in PAGE_SIZE units. This is similar to, but in no way connected
+ * with, the kernel's pagecache or buffer cache (which sit above our block
+ * device).
+ */
+struct brd_device {
+ int brd_number;
+
+ struct request_queue *brd_queue;
+ struct gendisk *brd_disk;
+ struct list_head brd_list;
+
+ /*
+ * Backing store of pages and lock to protect it. This is the contents
+ * of the block device.
+ */
+ spinlock_t brd_lock;
+ struct radix_tree_root brd_pages;
+};
+
+/*
+ * Look up and return a brd's page for a given sector.
+ */
+static DEFINE_MUTEX(brd_mutex);
+static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
+{
+ pgoff_t idx;
+ struct page *page;
+
+ /*
+ * The page lifetime is protected by the fact that we have opened the
+ * device node -- brd pages will never be deleted under us, so we
+ * don't need any further locking or refcounting.
+ *
+ * This is strictly true for the radix-tree nodes as well (ie. we
+ * don't actually need the rcu_read_lock()), however that is not a
+ * documented feature of the radix-tree API so it is better to be
+ * safe here (we don't have total exclusion from radix tree updates
+ * here, only deletes).
+ */
+ rcu_read_lock();
+ idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
+ page = radix_tree_lookup(&brd->brd_pages, idx);
+ rcu_read_unlock();
+
+ BUG_ON(page && page->index != idx);
+
+ return page;
+}
+
+/*
+ * Look up and return a brd's page for a given sector.
+ * If one does not exist, allocate an empty page, and insert that. Then
+ * return it.
+ */
+static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
+{
+ pgoff_t idx;
+ struct page *page;
+ gfp_t gfp_flags;
+
+ page = brd_lookup_page(brd, sector);
+ if (page)
+ return page;
+
+ /*
+ * Must use NOIO because we don't want to recurse back into the
+ * block or filesystem layers from page reclaim.
+ *
+ * Cannot support DAX and highmem, because our ->direct_access
+ * routine for DAX must return memory that is always addressable.
+ * If DAX was reworked to use pfns and kmap throughout, this
+ * restriction might be able to be lifted.
+ */
+ gfp_flags = GFP_NOIO | __GFP_ZERO;
+#ifndef CONFIG_BLK_DEV_RAM_DAX
+ gfp_flags |= __GFP_HIGHMEM;
+#endif
+ page = alloc_page(gfp_flags);
+ if (!page)
+ return NULL;
+
+ if (radix_tree_preload(GFP_NOIO)) {
+ __free_page(page);
+ return NULL;
+ }
+
+ spin_lock(&brd->brd_lock);
+ idx = sector >> PAGE_SECTORS_SHIFT;
+ page->index = idx;
+ if (radix_tree_insert(&brd->brd_pages, idx, page)) {
+ __free_page(page);
+ page = radix_tree_lookup(&brd->brd_pages, idx);
+ BUG_ON(!page);
+ BUG_ON(page->index != idx);
+ }
+ spin_unlock(&brd->brd_lock);
+
+ radix_tree_preload_end();
+
+ return page;
+}
+
+static void brd_free_page(struct brd_device *brd, sector_t sector)
+{
+ struct page *page;
+ pgoff_t idx;
+
+ spin_lock(&brd->brd_lock);
+ idx = sector >> PAGE_SECTORS_SHIFT;
+ page = radix_tree_delete(&brd->brd_pages, idx);
+ spin_unlock(&brd->brd_lock);
+ if (page)
+ __free_page(page);
+}
+
+static void brd_zero_page(struct brd_device *brd, sector_t sector)
+{
+ struct page *page;
+
+ page = brd_lookup_page(brd, sector);
+ if (page)
+ clear_highpage(page);
+}
+
+/*
+ * Free all backing store pages and radix tree. This must only be called when
+ * there are no other users of the device.
+ */
+#define FREE_BATCH 16
+static void brd_free_pages(struct brd_device *brd)
+{
+ unsigned long pos = 0;
+ struct page *pages[FREE_BATCH];
+ int nr_pages;
+
+ do {
+ int i;
+
+ nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
+ (void **)pages, pos, FREE_BATCH);
+
+ for (i = 0; i < nr_pages; i++) {
+ void *ret;
+
+ BUG_ON(pages[i]->index < pos);
+ pos = pages[i]->index;
+ ret = radix_tree_delete(&brd->brd_pages, pos);
+ BUG_ON(!ret || ret != pages[i]);
+ __free_page(pages[i]);
+ }
+
+ pos++;
+
+ /*
+ * This assumes radix_tree_gang_lookup always returns as
+ * many pages as possible. If the radix-tree code changes,
+ * so will this have to.
+ */
+ } while (nr_pages == FREE_BATCH);
+}
+
+/*
+ * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
+ */
+static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
+{
+ unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
+ size_t copy;
+
+ copy = min_t(size_t, n, PAGE_SIZE - offset);
+ if (!brd_insert_page(brd, sector))
+ return -ENOSPC;
+ if (copy < n) {
+ sector += copy >> SECTOR_SHIFT;
+ if (!brd_insert_page(brd, sector))
+ return -ENOSPC;
+ }
+ return 0;
+}
+
+static void discard_from_brd(struct brd_device *brd,
+ sector_t sector, size_t n)
+{
+ while (n >= PAGE_SIZE) {
+ /*
+ * Don't want to actually discard pages here because
+ * re-allocating the pages can result in writeback
+ * deadlocks under heavy load.
+ */
+ if (0)
+ brd_free_page(brd, sector);
+ else
+ brd_zero_page(brd, sector);
+ sector += PAGE_SIZE >> SECTOR_SHIFT;
+ n -= PAGE_SIZE;
+ }
+}
+
+/*
+ * Copy n bytes from src to the brd starting at sector. Does not sleep.
+ */
+static void copy_to_brd(struct brd_device *brd, const void *src,
+ sector_t sector, size_t n)
+{
+ struct page *page;
+ void *dst;
+ unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
+ size_t copy;
+
+ copy = min_t(size_t, n, PAGE_SIZE - offset);
+ page = brd_lookup_page(brd, sector);
+ BUG_ON(!page);
+
+ dst = kmap_atomic(page);
+ memcpy(dst + offset, src, copy);
+ kunmap_atomic(dst);
+
+ if (copy < n) {
+ src += copy;
+ sector += copy >> SECTOR_SHIFT;
+ copy = n - copy;
+ page = brd_lookup_page(brd, sector);
+ BUG_ON(!page);
+
+ dst = kmap_atomic(page);
+ memcpy(dst, src, copy);
+ kunmap_atomic(dst);
+ }
+}
+
+/*
+ * Copy n bytes to dst from the brd starting at sector. Does not sleep.
+ */
+static void copy_from_brd(void *dst, struct brd_device *brd,
+ sector_t sector, size_t n)
+{
+ struct page *page;
+ void *src;
+ unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
+ size_t copy;
+
+ copy = min_t(size_t, n, PAGE_SIZE - offset);
+ page = brd_lookup_page(brd, sector);
+ if (page) {
+ src = kmap_atomic(page);
+ memcpy(dst, src + offset, copy);
+ kunmap_atomic(src);
+ } else
+ memset(dst, 0, copy);
+
+ if (copy < n) {
+ dst += copy;
+ sector += copy >> SECTOR_SHIFT;
+ copy = n - copy;
+ page = brd_lookup_page(brd, sector);
+ if (page) {
+ src = kmap_atomic(page);
+ memcpy(dst, src, copy);
+ kunmap_atomic(src);
+ } else
+ memset(dst, 0, copy);
+ }
+}
+
+/*
+ * Process a single bvec of a bio.
+ */
+static int brd_do_bvec(struct brd_device *brd, struct page *page,
+ unsigned int len, unsigned int off, int rw,
+ sector_t sector)
+{
+ void *mem;
+ int err = 0;
+
+ if (rw != READ) {
+ err = copy_to_brd_setup(brd, sector, len);
+ if (err)
+ goto out;
+ }
+
+ mem = kmap_atomic(page);
+ if (rw == READ) {
+ copy_from_brd(mem + off, brd, sector, len);
+ flush_dcache_page(page);
+ } else {
+ flush_dcache_page(page);
+ copy_to_brd(brd, mem + off, sector, len);
+ }
+ kunmap_atomic(mem);
+
+out:
+ return err;
+}
+
+static void brd_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct block_device *bdev = bio->bi_bdev;
+ struct brd_device *brd = bdev->bd_disk->private_data;
+ int rw;
+ struct bio_vec bvec;
+ sector_t sector;
+ struct bvec_iter iter;
+ int err = -EIO;
+
+ sector = bio->bi_iter.bi_sector;
+ if (bio_end_sector(bio) > get_capacity(bdev->bd_disk))
+ goto out;
+
+ if (unlikely(bio->bi_rw & REQ_DISCARD)) {
+ err = 0;
+ discard_from_brd(brd, sector, bio->bi_iter.bi_size);
+ goto out;
+ }
+
+ rw = bio_rw(bio);
+ if (rw == READA)
+ rw = READ;
+
+ bio_for_each_segment(bvec, bio, iter) {
+ unsigned int len = bvec.bv_len;
+ err = brd_do_bvec(brd, bvec.bv_page, len,
+ bvec.bv_offset, rw, sector);
+ if (err)
+ break;
+ sector += len >> SECTOR_SHIFT;
+ }
+
+out:
+ bio_endio(bio, err);
+}
+
+static int brd_rw_page(struct block_device *bdev, sector_t sector,
+ struct page *page, int rw)
+{
+ struct brd_device *brd = bdev->bd_disk->private_data;
+ int err = brd_do_bvec(brd, page, PAGE_CACHE_SIZE, 0, rw, sector);
+ page_endio(page, rw & WRITE, err);
+ return err;
+}
+
+#ifdef CONFIG_BLK_DEV_RAM_DAX
+static long brd_direct_access(struct block_device *bdev, sector_t sector,
+ void **kaddr, unsigned long *pfn, long size)
+{
+ struct brd_device *brd = bdev->bd_disk->private_data;
+ struct page *page;
+
+ if (!brd)
+ return -ENODEV;
+ page = brd_insert_page(brd, sector);
+ if (!page)
+ return -ENOSPC;
+ *kaddr = page_address(page);
+ *pfn = page_to_pfn(page);
+
+ /*
+ * TODO: If size > PAGE_SIZE, we could look to see if the next page in
+ * the file happens to be mapped to the next page of physical RAM.
+ */
+ return PAGE_SIZE;
+}
+#else
+#define brd_direct_access NULL
+#endif
+
+static int brd_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ int error;
+ struct brd_device *brd = bdev->bd_disk->private_data;
+
+ if (cmd != BLKFLSBUF)
+ return -ENOTTY;
+
+ /*
+ * ram device BLKFLSBUF has special semantics, we want to actually
+ * release and destroy the ramdisk data.
+ */
+ mutex_lock(&brd_mutex);
+ mutex_lock(&bdev->bd_mutex);
+ error = -EBUSY;
+ if (bdev->bd_openers <= 1) {
+ /*
+ * Kill the cache first, so it isn't written back to the
+ * device.
+ *
+ * Another thread might instantiate more buffercache here,
+ * but there is not much we can do to close that race.
+ */
+ kill_bdev(bdev);
+ brd_free_pages(brd);
+ error = 0;
+ }
+ mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&brd_mutex);
+
+ return error;
+}
+
+static const struct block_device_operations brd_fops = {
+ .owner = THIS_MODULE,
+ .rw_page = brd_rw_page,
+ .ioctl = brd_ioctl,
+ .direct_access = brd_direct_access,
+};
+
+/*
+ * And now the modules code and kernel interface.
+ */
+static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
+module_param(rd_nr, int, S_IRUGO);
+MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
+
+int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
+module_param(rd_size, int, S_IRUGO);
+MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
+
+static int max_part = 1;
+module_param(max_part, int, S_IRUGO);
+MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
+MODULE_ALIAS("rd");
+
+#ifndef MODULE
+/* Legacy boot options - nonmodular */
+static int __init ramdisk_size(char *str)
+{
+ rd_size = simple_strtol(str, NULL, 0);
+ return 1;
+}
+__setup("ramdisk_size=", ramdisk_size);
+#endif
+
+/*
+ * The device scheme is derived from loop.c. Keep them in synch where possible
+ * (should share code eventually).
+ */
+static LIST_HEAD(brd_devices);
+static DEFINE_MUTEX(brd_devices_mutex);
+
+static struct brd_device *brd_alloc(int i)
+{
+ struct brd_device *brd;
+ struct gendisk *disk;
+
+ brd = kzalloc(sizeof(*brd), GFP_KERNEL);
+ if (!brd)
+ goto out;
+ brd->brd_number = i;
+ spin_lock_init(&brd->brd_lock);
+ INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
+
+ brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
+ if (!brd->brd_queue)
+ goto out_free_dev;
+
+ blk_queue_make_request(brd->brd_queue, brd_make_request);
+ blk_queue_max_hw_sectors(brd->brd_queue, 1024);
+ blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
+
+ /* This is so fdisk will align partitions on 4k, because of
+ * direct_access API needing 4k alignment, returning a PFN
+ * (This is only a problem on very small devices <= 4M,
+ * otherwise fdisk will align on 1M. Regardless this call
+ * is harmless)
+ */
+ blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
+
+ brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
+ brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
+ brd->brd_queue->limits.discard_zeroes_data = 1;
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
+
+ disk = brd->brd_disk = alloc_disk(max_part);
+ if (!disk)
+ goto out_free_queue;
+ disk->major = RAMDISK_MAJOR;
+ disk->first_minor = i * max_part;
+ disk->fops = &brd_fops;
+ disk->private_data = brd;
+ disk->queue = brd->brd_queue;
+ disk->flags = GENHD_FL_EXT_DEVT;
+ sprintf(disk->disk_name, "ram%d", i);
+ set_capacity(disk, rd_size * 2);
+
+ return brd;
+
+out_free_queue:
+ blk_cleanup_queue(brd->brd_queue);
+out_free_dev:
+ kfree(brd);
+out:
+ return NULL;
+}
+
+static void brd_free(struct brd_device *brd)
+{
+ put_disk(brd->brd_disk);
+ blk_cleanup_queue(brd->brd_queue);
+ brd_free_pages(brd);
+ kfree(brd);
+}
+
+static struct brd_device *brd_init_one(int i, bool *new)
+{
+ struct brd_device *brd;
+
+ *new = false;
+ list_for_each_entry(brd, &brd_devices, brd_list) {
+ if (brd->brd_number == i)
+ goto out;
+ }
+
+ brd = brd_alloc(i);
+ if (brd) {
+ add_disk(brd->brd_disk);
+ list_add_tail(&brd->brd_list, &brd_devices);
+ }
+ *new = true;
+out:
+ return brd;
+}
+
+static void brd_del_one(struct brd_device *brd)
+{
+ list_del(&brd->brd_list);
+ del_gendisk(brd->brd_disk);
+ brd_free(brd);
+}
+
+static struct kobject *brd_probe(dev_t dev, int *part, void *data)
+{
+ struct brd_device *brd;
+ struct kobject *kobj;
+ bool new;
+
+ mutex_lock(&brd_devices_mutex);
+ brd = brd_init_one(MINOR(dev) / max_part, &new);
+ kobj = brd ? get_disk(brd->brd_disk) : NULL;
+ mutex_unlock(&brd_devices_mutex);
+
+ if (new)
+ *part = 0;
+
+ return kobj;
+}
+
+static int __init brd_init(void)
+{
+ struct brd_device *brd, *next;
+ int i;
+
+ /*
+ * brd module now has a feature to instantiate underlying device
+ * structure on-demand, provided that there is an access dev node.
+ *
+ * (1) if rd_nr is specified, create that many upfront. else
+ * it defaults to CONFIG_BLK_DEV_RAM_COUNT
+ * (2) User can further extend brd devices by create dev node themselves
+ * and have kernel automatically instantiate actual device
+ * on-demand. Example:
+ * mknod /path/devnod_name b 1 X # 1 is the rd major
+ * fdisk -l /path/devnod_name
+ * If (X / max_part) was not already created it will be created
+ * dynamically.
+ */
+
+ if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
+ return -EIO;
+
+ if (unlikely(!max_part))
+ max_part = 1;
+
+ for (i = 0; i < rd_nr; i++) {
+ brd = brd_alloc(i);
+ if (!brd)
+ goto out_free;
+ list_add_tail(&brd->brd_list, &brd_devices);
+ }
+
+ /* point of no return */
+
+ list_for_each_entry(brd, &brd_devices, brd_list)
+ add_disk(brd->brd_disk);
+
+ blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS,
+ THIS_MODULE, brd_probe, NULL, NULL);
+
+ pr_info("brd: module loaded\n");
+ return 0;
+
+out_free:
+ list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
+ list_del(&brd->brd_list);
+ brd_free(brd);
+ }
+ unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+
+ pr_info("brd: module NOT loaded !!!\n");
+ return -ENOMEM;
+}
+
+static void __exit brd_exit(void)
+{
+ struct brd_device *brd, *next;
+
+ list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
+ brd_del_one(brd);
+
+ blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS);
+ unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+
+ pr_info("brd: module unloaded\n");
+}
+
+module_init(brd_init);
+module_exit(brd_exit);
+
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
new file mode 100644
index 000000000..ff20f192b
--- /dev/null
+++ b/drivers/block/cciss.c
@@ -0,0 +1,5392 @@
+/*
+ * Disk Array driver for HP Smart Array controllers.
+ * (C) Copyright 2000, 2007 Hewlett-Packard Development Company, L.P.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307, USA.
+ *
+ * Questions/Comments/Bugfixes to iss_storagedev@hp.com
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+#include <linux/pci-aspm.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/major.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/blkpg.h>
+#include <linux/timer.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/hdreg.h>
+#include <linux/spinlock.h>
+#include <linux/compat.h>
+#include <linux/mutex.h>
+#include <linux/bitmap.h>
+#include <linux/io.h>
+#include <asm/uaccess.h>
+
+#include <linux/dma-mapping.h>
+#include <linux/blkdev.h>
+#include <linux/genhd.h>
+#include <linux/completion.h>
+#include <scsi/scsi.h>
+#include <scsi/sg.h>
+#include <scsi/scsi_ioctl.h>
+#include <linux/cdrom.h>
+#include <linux/scatterlist.h>
+#include <linux/kthread.h>
+
+#define CCISS_DRIVER_VERSION(maj,min,submin) ((maj<<16)|(min<<8)|(submin))
+#define DRIVER_NAME "HP CISS Driver (v 3.6.26)"
+#define DRIVER_VERSION CCISS_DRIVER_VERSION(3, 6, 26)
+
+/* Embedded module documentation macros - see modules.h */
+MODULE_AUTHOR("Hewlett-Packard Company");
+MODULE_DESCRIPTION("Driver for HP Smart Array Controllers");
+MODULE_SUPPORTED_DEVICE("HP Smart Array Controllers");
+MODULE_VERSION("3.6.26");
+MODULE_LICENSE("GPL");
+static int cciss_tape_cmds = 6;
+module_param(cciss_tape_cmds, int, 0644);
+MODULE_PARM_DESC(cciss_tape_cmds,
+ "number of commands to allocate for tape devices (default: 6)");
+static int cciss_simple_mode;
+module_param(cciss_simple_mode, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cciss_simple_mode,
+ "Use 'simple mode' rather than 'performant mode'");
+
+static int cciss_allow_hpsa;
+module_param(cciss_allow_hpsa, int, S_IRUGO|S_IWUSR);
+MODULE_PARM_DESC(cciss_allow_hpsa,
+ "Prevent cciss driver from accessing hardware known to be "
+ " supported by the hpsa driver");
+
+static DEFINE_MUTEX(cciss_mutex);
+static struct proc_dir_entry *proc_cciss;
+
+#include "cciss_cmd.h"
+#include "cciss.h"
+#include <linux/cciss_ioctl.h>
+
+/* define the PCI info for the cards we can control */
+static const struct pci_device_id cciss_pci_device_id[] = {
+ {PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_CISS, 0x0E11, 0x4070},
+ {PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_CISSB, 0x0E11, 0x4080},
+ {PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_CISSB, 0x0E11, 0x4082},
+ {PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_CISSB, 0x0E11, 0x4083},
+ {PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_CISSC, 0x0E11, 0x4091},
+ {PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_CISSC, 0x0E11, 0x409A},
+ {PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_CISSC, 0x0E11, 0x409B},
+ {PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_CISSC, 0x0E11, 0x409C},
+ {PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_CISSC, 0x0E11, 0x409D},
+ {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSA, 0x103C, 0x3225},
+ {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x3223},
+ {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x3234},
+ {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x3235},
+ {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSD, 0x103C, 0x3211},
+ {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSD, 0x103C, 0x3212},
+ {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSD, 0x103C, 0x3213},
+ {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSD, 0x103C, 0x3214},
+ {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSD, 0x103C, 0x3215},
+ {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x3237},
+ {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x323D},
+ {0,}
+};
+
+MODULE_DEVICE_TABLE(pci, cciss_pci_device_id);
+
+/* board_id = Subsystem Device ID & Vendor ID
+ * product = Marketing Name for the board
+ * access = Address of the struct of function pointers
+ */
+static struct board_type products[] = {
+ {0x40700E11, "Smart Array 5300", &SA5_access},
+ {0x40800E11, "Smart Array 5i", &SA5B_access},
+ {0x40820E11, "Smart Array 532", &SA5B_access},
+ {0x40830E11, "Smart Array 5312", &SA5B_access},
+ {0x409A0E11, "Smart Array 641", &SA5_access},
+ {0x409B0E11, "Smart Array 642", &SA5_access},
+ {0x409C0E11, "Smart Array 6400", &SA5_access},
+ {0x409D0E11, "Smart Array 6400 EM", &SA5_access},
+ {0x40910E11, "Smart Array 6i", &SA5_access},
+ {0x3225103C, "Smart Array P600", &SA5_access},
+ {0x3223103C, "Smart Array P800", &SA5_access},
+ {0x3234103C, "Smart Array P400", &SA5_access},
+ {0x3235103C, "Smart Array P400i", &SA5_access},
+ {0x3211103C, "Smart Array E200i", &SA5_access},
+ {0x3212103C, "Smart Array E200", &SA5_access},
+ {0x3213103C, "Smart Array E200i", &SA5_access},
+ {0x3214103C, "Smart Array E200i", &SA5_access},
+ {0x3215103C, "Smart Array E200i", &SA5_access},
+ {0x3237103C, "Smart Array E500", &SA5_access},
+ {0x3223103C, "Smart Array P800", &SA5_access},
+ {0x3234103C, "Smart Array P400", &SA5_access},
+ {0x323D103C, "Smart Array P700m", &SA5_access},
+};
+
+/* How long to wait (in milliseconds) for board to go into simple mode */
+#define MAX_CONFIG_WAIT 30000
+#define MAX_IOCTL_CONFIG_WAIT 1000
+
+/*define how many times we will try a command because of bus resets */
+#define MAX_CMD_RETRIES 3
+
+#define MAX_CTLR 32
+
+/* Originally cciss driver only supports 8 major numbers */
+#define MAX_CTLR_ORIG 8
+
+static ctlr_info_t *hba[MAX_CTLR];
+
+static struct task_struct *cciss_scan_thread;
+static DEFINE_MUTEX(scan_mutex);
+static LIST_HEAD(scan_q);
+
+static void do_cciss_request(struct request_queue *q);
+static irqreturn_t do_cciss_intx(int irq, void *dev_id);
+static irqreturn_t do_cciss_msix_intr(int irq, void *dev_id);
+static int cciss_open(struct block_device *bdev, fmode_t mode);
+static int cciss_unlocked_open(struct block_device *bdev, fmode_t mode);
+static void cciss_release(struct gendisk *disk, fmode_t mode);
+static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg);
+static int cciss_getgeo(struct block_device *bdev, struct hd_geometry *geo);
+
+static int cciss_revalidate(struct gendisk *disk);
+static int rebuild_lun_table(ctlr_info_t *h, int first_time, int via_ioctl);
+static int deregister_disk(ctlr_info_t *h, int drv_index,
+ int clear_all, int via_ioctl);
+
+static void cciss_read_capacity(ctlr_info_t *h, int logvol,
+ sector_t *total_size, unsigned int *block_size);
+static void cciss_read_capacity_16(ctlr_info_t *h, int logvol,
+ sector_t *total_size, unsigned int *block_size);
+static void cciss_geometry_inquiry(ctlr_info_t *h, int logvol,
+ sector_t total_size,
+ unsigned int block_size, InquiryData_struct *inq_buff,
+ drive_info_struct *drv);
+static void cciss_interrupt_mode(ctlr_info_t *);
+static int cciss_enter_simple_mode(struct ctlr_info *h);
+static void start_io(ctlr_info_t *h);
+static int sendcmd_withirq(ctlr_info_t *h, __u8 cmd, void *buff, size_t size,
+ __u8 page_code, unsigned char scsi3addr[],
+ int cmd_type);
+static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c,
+ int attempt_retry);
+static int process_sendcmd_error(ctlr_info_t *h, CommandList_struct *c);
+
+static int add_to_scan_list(struct ctlr_info *h);
+static int scan_thread(void *data);
+static int check_for_unit_attention(ctlr_info_t *h, CommandList_struct *c);
+static void cciss_hba_release(struct device *dev);
+static void cciss_device_release(struct device *dev);
+static void cciss_free_gendisk(ctlr_info_t *h, int drv_index);
+static void cciss_free_drive_info(ctlr_info_t *h, int drv_index);
+static inline u32 next_command(ctlr_info_t *h);
+static int cciss_find_cfg_addrs(struct pci_dev *pdev, void __iomem *vaddr,
+ u32 *cfg_base_addr, u64 *cfg_base_addr_index,
+ u64 *cfg_offset);
+static int cciss_pci_find_memory_BAR(struct pci_dev *pdev,
+ unsigned long *memory_bar);
+static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag);
+static int write_driver_ver_to_cfgtable(CfgTable_struct __iomem *cfgtable);
+
+/* performant mode helper functions */
+static void calc_bucket_map(int *bucket, int num_buckets, int nsgs,
+ int *bucket_map);
+static void cciss_put_controller_into_performant_mode(ctlr_info_t *h);
+
+#ifdef CONFIG_PROC_FS
+static void cciss_procinit(ctlr_info_t *h);
+#else
+static void cciss_procinit(ctlr_info_t *h)
+{
+}
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_COMPAT
+static int cciss_compat_ioctl(struct block_device *, fmode_t,
+ unsigned, unsigned long);
+#endif
+
+static const struct block_device_operations cciss_fops = {
+ .owner = THIS_MODULE,
+ .open = cciss_unlocked_open,
+ .release = cciss_release,
+ .ioctl = cciss_ioctl,
+ .getgeo = cciss_getgeo,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = cciss_compat_ioctl,
+#endif
+ .revalidate_disk = cciss_revalidate,
+};
+
+/* set_performant_mode: Modify the tag for cciss performant
+ * set bit 0 for pull model, bits 3-1 for block fetch
+ * register number
+ */
+static void set_performant_mode(ctlr_info_t *h, CommandList_struct *c)
+{
+ if (likely(h->transMethod & CFGTBL_Trans_Performant))
+ c->busaddr |= 1 | (h->blockFetchTable[c->Header.SGList] << 1);
+}
+
+/*
+ * Enqueuing and dequeuing functions for cmdlists.
+ */
+static inline void addQ(struct list_head *list, CommandList_struct *c)
+{
+ list_add_tail(&c->list, list);
+}
+
+static inline void removeQ(CommandList_struct *c)
+{
+ /*
+ * After kexec/dump some commands might still
+ * be in flight, which the firmware will try
+ * to complete. Resetting the firmware doesn't work
+ * with old fw revisions, so we have to mark
+ * them off as 'stale' to prevent the driver from
+ * falling over.
+ */
+ if (WARN_ON(list_empty(&c->list))) {
+ c->cmd_type = CMD_MSG_STALE;
+ return;
+ }
+
+ list_del_init(&c->list);
+}
+
+static void enqueue_cmd_and_start_io(ctlr_info_t *h,
+ CommandList_struct *c)
+{
+ unsigned long flags;
+ set_performant_mode(h, c);
+ spin_lock_irqsave(&h->lock, flags);
+ addQ(&h->reqQ, c);
+ h->Qdepth++;
+ if (h->Qdepth > h->maxQsinceinit)
+ h->maxQsinceinit = h->Qdepth;
+ start_io(h);
+ spin_unlock_irqrestore(&h->lock, flags);
+}
+
+static void cciss_free_sg_chain_blocks(SGDescriptor_struct **cmd_sg_list,
+ int nr_cmds)
+{
+ int i;
+
+ if (!cmd_sg_list)
+ return;
+ for (i = 0; i < nr_cmds; i++) {
+ kfree(cmd_sg_list[i]);
+ cmd_sg_list[i] = NULL;
+ }
+ kfree(cmd_sg_list);
+}
+
+static SGDescriptor_struct **cciss_allocate_sg_chain_blocks(
+ ctlr_info_t *h, int chainsize, int nr_cmds)
+{
+ int j;
+ SGDescriptor_struct **cmd_sg_list;
+
+ if (chainsize <= 0)
+ return NULL;
+
+ cmd_sg_list = kmalloc(sizeof(*cmd_sg_list) * nr_cmds, GFP_KERNEL);
+ if (!cmd_sg_list)
+ return NULL;
+
+ /* Build up chain blocks for each command */
+ for (j = 0; j < nr_cmds; j++) {
+ /* Need a block of chainsized s/g elements. */
+ cmd_sg_list[j] = kmalloc((chainsize *
+ sizeof(*cmd_sg_list[j])), GFP_KERNEL);
+ if (!cmd_sg_list[j]) {
+ dev_err(&h->pdev->dev, "Cannot get memory "
+ "for s/g chains.\n");
+ goto clean;
+ }
+ }
+ return cmd_sg_list;
+clean:
+ cciss_free_sg_chain_blocks(cmd_sg_list, nr_cmds);
+ return NULL;
+}
+
+static void cciss_unmap_sg_chain_block(ctlr_info_t *h, CommandList_struct *c)
+{
+ SGDescriptor_struct *chain_sg;
+ u64bit temp64;
+
+ if (c->Header.SGTotal <= h->max_cmd_sgentries)
+ return;
+
+ chain_sg = &c->SG[h->max_cmd_sgentries - 1];
+ temp64.val32.lower = chain_sg->Addr.lower;
+ temp64.val32.upper = chain_sg->Addr.upper;
+ pci_unmap_single(h->pdev, temp64.val, chain_sg->Len, PCI_DMA_TODEVICE);
+}
+
+static void cciss_map_sg_chain_block(ctlr_info_t *h, CommandList_struct *c,
+ SGDescriptor_struct *chain_block, int len)
+{
+ SGDescriptor_struct *chain_sg;
+ u64bit temp64;
+
+ chain_sg = &c->SG[h->max_cmd_sgentries - 1];
+ chain_sg->Ext = CCISS_SG_CHAIN;
+ chain_sg->Len = len;
+ temp64.val = pci_map_single(h->pdev, chain_block, len,
+ PCI_DMA_TODEVICE);
+ chain_sg->Addr.lower = temp64.val32.lower;
+ chain_sg->Addr.upper = temp64.val32.upper;
+}
+
+#include "cciss_scsi.c" /* For SCSI tape support */
+
+static const char *raid_label[] = { "0", "4", "1(1+0)", "5", "5+1", "ADG",
+ "UNKNOWN"
+};
+#define RAID_UNKNOWN (ARRAY_SIZE(raid_label)-1)
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ * Report information about this controller.
+ */
+#define ENG_GIG 1000000000
+#define ENG_GIG_FACTOR (ENG_GIG/512)
+#define ENGAGE_SCSI "engage scsi"
+
+static void cciss_seq_show_header(struct seq_file *seq)
+{
+ ctlr_info_t *h = seq->private;
+
+ seq_printf(seq, "%s: HP %s Controller\n"
+ "Board ID: 0x%08lx\n"
+ "Firmware Version: %c%c%c%c\n"
+ "IRQ: %d\n"
+ "Logical drives: %d\n"
+ "Current Q depth: %d\n"
+ "Current # commands on controller: %d\n"
+ "Max Q depth since init: %d\n"
+ "Max # commands on controller since init: %d\n"
+ "Max SG entries since init: %d\n",
+ h->devname,
+ h->product_name,
+ (unsigned long)h->board_id,
+ h->firm_ver[0], h->firm_ver[1], h->firm_ver[2],
+ h->firm_ver[3], (unsigned int)h->intr[h->intr_mode],
+ h->num_luns,
+ h->Qdepth, h->commands_outstanding,
+ h->maxQsinceinit, h->max_outstanding, h->maxSG);
+
+#ifdef CONFIG_CISS_SCSI_TAPE
+ cciss_seq_tape_report(seq, h);
+#endif /* CONFIG_CISS_SCSI_TAPE */
+}
+
+static void *cciss_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ ctlr_info_t *h = seq->private;
+ unsigned long flags;
+
+ /* prevent displaying bogus info during configuration
+ * or deconfiguration of a logical volume
+ */
+ spin_lock_irqsave(&h->lock, flags);
+ if (h->busy_configuring) {
+ spin_unlock_irqrestore(&h->lock, flags);
+ return ERR_PTR(-EBUSY);
+ }
+ h->busy_configuring = 1;
+ spin_unlock_irqrestore(&h->lock, flags);
+
+ if (*pos == 0)
+ cciss_seq_show_header(seq);
+
+ return pos;
+}
+
+static int cciss_seq_show(struct seq_file *seq, void *v)
+{
+ sector_t vol_sz, vol_sz_frac;
+ ctlr_info_t *h = seq->private;
+ unsigned ctlr = h->ctlr;
+ loff_t *pos = v;
+ drive_info_struct *drv = h->drv[*pos];
+
+ if (*pos > h->highest_lun)
+ return 0;
+
+ if (drv == NULL) /* it's possible for h->drv[] to have holes. */
+ return 0;
+
+ if (drv->heads == 0)
+ return 0;
+
+ vol_sz = drv->nr_blocks;
+ vol_sz_frac = sector_div(vol_sz, ENG_GIG_FACTOR);
+ vol_sz_frac *= 100;
+ sector_div(vol_sz_frac, ENG_GIG_FACTOR);
+
+ if (drv->raid_level < 0 || drv->raid_level > RAID_UNKNOWN)
+ drv->raid_level = RAID_UNKNOWN;
+ seq_printf(seq, "cciss/c%dd%d:"
+ "\t%4u.%02uGB\tRAID %s\n",
+ ctlr, (int) *pos, (int)vol_sz, (int)vol_sz_frac,
+ raid_label[drv->raid_level]);
+ return 0;
+}
+
+static void *cciss_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ctlr_info_t *h = seq->private;
+
+ if (*pos > h->highest_lun)
+ return NULL;
+ *pos += 1;
+
+ return pos;
+}
+
+static void cciss_seq_stop(struct seq_file *seq, void *v)
+{
+ ctlr_info_t *h = seq->private;
+
+ /* Only reset h->busy_configuring if we succeeded in setting
+ * it during cciss_seq_start. */
+ if (v == ERR_PTR(-EBUSY))
+ return;
+
+ h->busy_configuring = 0;
+}
+
+static const struct seq_operations cciss_seq_ops = {
+ .start = cciss_seq_start,
+ .show = cciss_seq_show,
+ .next = cciss_seq_next,
+ .stop = cciss_seq_stop,
+};
+
+static int cciss_seq_open(struct inode *inode, struct file *file)
+{
+ int ret = seq_open(file, &cciss_seq_ops);
+ struct seq_file *seq = file->private_data;
+
+ if (!ret)
+ seq->private = PDE_DATA(inode);
+
+ return ret;
+}
+
+static ssize_t
+cciss_proc_write(struct file *file, const char __user *buf,
+ size_t length, loff_t *ppos)
+{
+ int err;
+ char *buffer;
+
+#ifndef CONFIG_CISS_SCSI_TAPE
+ return -EINVAL;
+#endif
+
+ if (!buf || length > PAGE_SIZE - 1)
+ return -EINVAL;
+
+ buffer = (char *)__get_free_page(GFP_KERNEL);
+ if (!buffer)
+ return -ENOMEM;
+
+ err = -EFAULT;
+ if (copy_from_user(buffer, buf, length))
+ goto out;
+ buffer[length] = '\0';
+
+#ifdef CONFIG_CISS_SCSI_TAPE
+ if (strncmp(ENGAGE_SCSI, buffer, sizeof ENGAGE_SCSI - 1) == 0) {
+ struct seq_file *seq = file->private_data;
+ ctlr_info_t *h = seq->private;
+
+ err = cciss_engage_scsi(h);
+ if (err == 0)
+ err = length;
+ } else
+#endif /* CONFIG_CISS_SCSI_TAPE */
+ err = -EINVAL;
+ /* might be nice to have "disengage" too, but it's not
+ safely possible. (only 1 module use count, lock issues.) */
+
+out:
+ free_page((unsigned long)buffer);
+ return err;
+}
+
+static const struct file_operations cciss_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = cciss_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+ .write = cciss_proc_write,
+};
+
+static void cciss_procinit(ctlr_info_t *h)
+{
+ struct proc_dir_entry *pde;
+
+ if (proc_cciss == NULL)
+ proc_cciss = proc_mkdir("driver/cciss", NULL);
+ if (!proc_cciss)
+ return;
+ pde = proc_create_data(h->devname, S_IWUSR | S_IRUSR | S_IRGRP |
+ S_IROTH, proc_cciss,
+ &cciss_proc_fops, h);
+}
+#endif /* CONFIG_PROC_FS */
+
+#define MAX_PRODUCT_NAME_LEN 19
+
+#define to_hba(n) container_of(n, struct ctlr_info, dev)
+#define to_drv(n) container_of(n, drive_info_struct, dev)
+
+/* List of controllers which cannot be hard reset on kexec with reset_devices */
+static u32 unresettable_controller[] = {
+ 0x324a103C, /* Smart Array P712m */
+ 0x324b103C, /* SmartArray P711m */
+ 0x3223103C, /* Smart Array P800 */
+ 0x3234103C, /* Smart Array P400 */
+ 0x3235103C, /* Smart Array P400i */
+ 0x3211103C, /* Smart Array E200i */
+ 0x3212103C, /* Smart Array E200 */
+ 0x3213103C, /* Smart Array E200i */
+ 0x3214103C, /* Smart Array E200i */
+ 0x3215103C, /* Smart Array E200i */
+ 0x3237103C, /* Smart Array E500 */
+ 0x323D103C, /* Smart Array P700m */
+ 0x409C0E11, /* Smart Array 6400 */
+ 0x409D0E11, /* Smart Array 6400 EM */
+};
+
+/* List of controllers which cannot even be soft reset */
+static u32 soft_unresettable_controller[] = {
+ 0x409C0E11, /* Smart Array 6400 */
+ 0x409D0E11, /* Smart Array 6400 EM */
+};
+
+static int ctlr_is_hard_resettable(u32 board_id)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(unresettable_controller); i++)
+ if (unresettable_controller[i] == board_id)
+ return 0;
+ return 1;
+}
+
+static int ctlr_is_soft_resettable(u32 board_id)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(soft_unresettable_controller); i++)
+ if (soft_unresettable_controller[i] == board_id)
+ return 0;
+ return 1;
+}
+
+static int ctlr_is_resettable(u32 board_id)
+{
+ return ctlr_is_hard_resettable(board_id) ||
+ ctlr_is_soft_resettable(board_id);
+}
+
+static ssize_t host_show_resettable(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ctlr_info *h = to_hba(dev);
+
+ return snprintf(buf, 20, "%d\n", ctlr_is_resettable(h->board_id));
+}
+static DEVICE_ATTR(resettable, S_IRUGO, host_show_resettable, NULL);
+
+static ssize_t host_store_rescan(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ctlr_info *h = to_hba(dev);
+
+ add_to_scan_list(h);
+ wake_up_process(cciss_scan_thread);
+ wait_for_completion_interruptible(&h->scan_wait);
+
+ return count;
+}
+static DEVICE_ATTR(rescan, S_IWUSR, NULL, host_store_rescan);
+
+static ssize_t host_show_transport_mode(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ctlr_info *h = to_hba(dev);
+
+ return snprintf(buf, 20, "%s\n",
+ h->transMethod & CFGTBL_Trans_Performant ?
+ "performant" : "simple");
+}
+static DEVICE_ATTR(transport_mode, S_IRUGO, host_show_transport_mode, NULL);
+
+static ssize_t dev_show_unique_id(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ drive_info_struct *drv = to_drv(dev);
+ struct ctlr_info *h = to_hba(drv->dev.parent);
+ __u8 sn[16];
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&h->lock, flags);
+ if (h->busy_configuring)
+ ret = -EBUSY;
+ else
+ memcpy(sn, drv->serial_no, sizeof(sn));
+ spin_unlock_irqrestore(&h->lock, flags);
+
+ if (ret)
+ return ret;
+ else
+ return snprintf(buf, 16 * 2 + 2,
+ "%02X%02X%02X%02X%02X%02X%02X%02X"
+ "%02X%02X%02X%02X%02X%02X%02X%02X\n",
+ sn[0], sn[1], sn[2], sn[3],
+ sn[4], sn[5], sn[6], sn[7],
+ sn[8], sn[9], sn[10], sn[11],
+ sn[12], sn[13], sn[14], sn[15]);
+}
+static DEVICE_ATTR(unique_id, S_IRUGO, dev_show_unique_id, NULL);
+
+static ssize_t dev_show_vendor(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ drive_info_struct *drv = to_drv(dev);
+ struct ctlr_info *h = to_hba(drv->dev.parent);
+ char vendor[VENDOR_LEN + 1];
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&h->lock, flags);
+ if (h->busy_configuring)
+ ret = -EBUSY;
+ else
+ memcpy(vendor, drv->vendor, VENDOR_LEN + 1);
+ spin_unlock_irqrestore(&h->lock, flags);
+
+ if (ret)
+ return ret;
+ else
+ return snprintf(buf, sizeof(vendor) + 1, "%s\n", drv->vendor);
+}
+static DEVICE_ATTR(vendor, S_IRUGO, dev_show_vendor, NULL);
+
+static ssize_t dev_show_model(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ drive_info_struct *drv = to_drv(dev);
+ struct ctlr_info *h = to_hba(drv->dev.parent);
+ char model[MODEL_LEN + 1];
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&h->lock, flags);
+ if (h->busy_configuring)
+ ret = -EBUSY;
+ else
+ memcpy(model, drv->model, MODEL_LEN + 1);
+ spin_unlock_irqrestore(&h->lock, flags);
+
+ if (ret)
+ return ret;
+ else
+ return snprintf(buf, sizeof(model) + 1, "%s\n", drv->model);
+}
+static DEVICE_ATTR(model, S_IRUGO, dev_show_model, NULL);
+
+static ssize_t dev_show_rev(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ drive_info_struct *drv = to_drv(dev);
+ struct ctlr_info *h = to_hba(drv->dev.parent);
+ char rev[REV_LEN + 1];
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&h->lock, flags);
+ if (h->busy_configuring)
+ ret = -EBUSY;
+ else
+ memcpy(rev, drv->rev, REV_LEN + 1);
+ spin_unlock_irqrestore(&h->lock, flags);
+
+ if (ret)
+ return ret;
+ else
+ return snprintf(buf, sizeof(rev) + 1, "%s\n", drv->rev);
+}
+static DEVICE_ATTR(rev, S_IRUGO, dev_show_rev, NULL);
+
+static ssize_t cciss_show_lunid(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ drive_info_struct *drv = to_drv(dev);
+ struct ctlr_info *h = to_hba(drv->dev.parent);
+ unsigned long flags;
+ unsigned char lunid[8];
+
+ spin_lock_irqsave(&h->lock, flags);
+ if (h->busy_configuring) {
+ spin_unlock_irqrestore(&h->lock, flags);
+ return -EBUSY;
+ }
+ if (!drv->heads) {
+ spin_unlock_irqrestore(&h->lock, flags);
+ return -ENOTTY;
+ }
+ memcpy(lunid, drv->LunID, sizeof(lunid));
+ spin_unlock_irqrestore(&h->lock, flags);
+ return snprintf(buf, 20, "0x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+ lunid[0], lunid[1], lunid[2], lunid[3],
+ lunid[4], lunid[5], lunid[6], lunid[7]);
+}
+static DEVICE_ATTR(lunid, S_IRUGO, cciss_show_lunid, NULL);
+
+static ssize_t cciss_show_raid_level(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ drive_info_struct *drv = to_drv(dev);
+ struct ctlr_info *h = to_hba(drv->dev.parent);
+ int raid;
+ unsigned long flags;
+
+ spin_lock_irqsave(&h->lock, flags);
+ if (h->busy_configuring) {
+ spin_unlock_irqrestore(&h->lock, flags);
+ return -EBUSY;
+ }
+ raid = drv->raid_level;
+ spin_unlock_irqrestore(&h->lock, flags);
+ if (raid < 0 || raid > RAID_UNKNOWN)
+ raid = RAID_UNKNOWN;
+
+ return snprintf(buf, strlen(raid_label[raid]) + 7, "RAID %s\n",
+ raid_label[raid]);
+}
+static DEVICE_ATTR(raid_level, S_IRUGO, cciss_show_raid_level, NULL);
+
+static ssize_t cciss_show_usage_count(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ drive_info_struct *drv = to_drv(dev);
+ struct ctlr_info *h = to_hba(drv->dev.parent);
+ unsigned long flags;
+ int count;
+
+ spin_lock_irqsave(&h->lock, flags);
+ if (h->busy_configuring) {
+ spin_unlock_irqrestore(&h->lock, flags);
+ return -EBUSY;
+ }
+ count = drv->usage_count;
+ spin_unlock_irqrestore(&h->lock, flags);
+ return snprintf(buf, 20, "%d\n", count);
+}
+static DEVICE_ATTR(usage_count, S_IRUGO, cciss_show_usage_count, NULL);
+
+static struct attribute *cciss_host_attrs[] = {
+ &dev_attr_rescan.attr,
+ &dev_attr_resettable.attr,
+ &dev_attr_transport_mode.attr,
+ NULL
+};
+
+static struct attribute_group cciss_host_attr_group = {
+ .attrs = cciss_host_attrs,
+};
+
+static const struct attribute_group *cciss_host_attr_groups[] = {
+ &cciss_host_attr_group,
+ NULL
+};
+
+static struct device_type cciss_host_type = {
+ .name = "cciss_host",
+ .groups = cciss_host_attr_groups,
+ .release = cciss_hba_release,
+};
+
+static struct attribute *cciss_dev_attrs[] = {
+ &dev_attr_unique_id.attr,
+ &dev_attr_model.attr,
+ &dev_attr_vendor.attr,
+ &dev_attr_rev.attr,
+ &dev_attr_lunid.attr,
+ &dev_attr_raid_level.attr,
+ &dev_attr_usage_count.attr,
+ NULL
+};
+
+static struct attribute_group cciss_dev_attr_group = {
+ .attrs = cciss_dev_attrs,
+};
+
+static const struct attribute_group *cciss_dev_attr_groups[] = {
+ &cciss_dev_attr_group,
+ NULL
+};
+
+static struct device_type cciss_dev_type = {
+ .name = "cciss_device",
+ .groups = cciss_dev_attr_groups,
+ .release = cciss_device_release,
+};
+
+static struct bus_type cciss_bus_type = {
+ .name = "cciss",
+};
+
+/*
+ * cciss_hba_release is called when the reference count
+ * of h->dev goes to zero.
+ */
+static void cciss_hba_release(struct device *dev)
+{
+ /*
+ * nothing to do, but need this to avoid a warning
+ * about not having a release handler from lib/kref.c.
+ */
+}
+
+/*
+ * Initialize sysfs entry for each controller. This sets up and registers
+ * the 'cciss#' directory for each individual controller under
+ * /sys/bus/pci/devices/<dev>/.
+ */
+static int cciss_create_hba_sysfs_entry(struct ctlr_info *h)
+{
+ device_initialize(&h->dev);
+ h->dev.type = &cciss_host_type;
+ h->dev.bus = &cciss_bus_type;
+ dev_set_name(&h->dev, "%s", h->devname);
+ h->dev.parent = &h->pdev->dev;
+
+ return device_add(&h->dev);
+}
+
+/*
+ * Remove sysfs entries for an hba.
+ */
+static void cciss_destroy_hba_sysfs_entry(struct ctlr_info *h)
+{
+ device_del(&h->dev);
+ put_device(&h->dev); /* final put. */
+}
+
+/* cciss_device_release is called when the reference count
+ * of h->drv[x]dev goes to zero.
+ */
+static void cciss_device_release(struct device *dev)
+{
+ drive_info_struct *drv = to_drv(dev);
+ kfree(drv);
+}
+
+/*
+ * Initialize sysfs for each logical drive. This sets up and registers
+ * the 'c#d#' directory for each individual logical drive under
+ * /sys/bus/pci/devices/<dev/ccis#/. We also create a link from
+ * /sys/block/cciss!c#d# to this entry.
+ */
+static long cciss_create_ld_sysfs_entry(struct ctlr_info *h,
+ int drv_index)
+{
+ struct device *dev;
+
+ if (h->drv[drv_index]->device_initialized)
+ return 0;
+
+ dev = &h->drv[drv_index]->dev;
+ device_initialize(dev);
+ dev->type = &cciss_dev_type;
+ dev->bus = &cciss_bus_type;
+ dev_set_name(dev, "c%dd%d", h->ctlr, drv_index);
+ dev->parent = &h->dev;
+ h->drv[drv_index]->device_initialized = 1;
+ return device_add(dev);
+}
+
+/*
+ * Remove sysfs entries for a logical drive.
+ */
+static void cciss_destroy_ld_sysfs_entry(struct ctlr_info *h, int drv_index,
+ int ctlr_exiting)
+{
+ struct device *dev = &h->drv[drv_index]->dev;
+
+ /* special case for c*d0, we only destroy it on controller exit */
+ if (drv_index == 0 && !ctlr_exiting)
+ return;
+
+ device_del(dev);
+ put_device(dev); /* the "final" put. */
+ h->drv[drv_index] = NULL;
+}
+
+/*
+ * For operations that cannot sleep, a command block is allocated at init,
+ * and managed by cmd_alloc() and cmd_free() using a simple bitmap to track
+ * which ones are free or in use.
+ */
+static CommandList_struct *cmd_alloc(ctlr_info_t *h)
+{
+ CommandList_struct *c;
+ int i;
+ u64bit temp64;
+ dma_addr_t cmd_dma_handle, err_dma_handle;
+
+ do {
+ i = find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds);
+ if (i == h->nr_cmds)
+ return NULL;
+ } while (test_and_set_bit(i, h->cmd_pool_bits) != 0);
+ c = h->cmd_pool + i;
+ memset(c, 0, sizeof(CommandList_struct));
+ cmd_dma_handle = h->cmd_pool_dhandle + i * sizeof(CommandList_struct);
+ c->err_info = h->errinfo_pool + i;
+ memset(c->err_info, 0, sizeof(ErrorInfo_struct));
+ err_dma_handle = h->errinfo_pool_dhandle
+ + i * sizeof(ErrorInfo_struct);
+ h->nr_allocs++;
+
+ c->cmdindex = i;
+
+ INIT_LIST_HEAD(&c->list);
+ c->busaddr = (__u32) cmd_dma_handle;
+ temp64.val = (__u64) err_dma_handle;
+ c->ErrDesc.Addr.lower = temp64.val32.lower;
+ c->ErrDesc.Addr.upper = temp64.val32.upper;
+ c->ErrDesc.Len = sizeof(ErrorInfo_struct);
+
+ c->ctlr = h->ctlr;
+ return c;
+}
+
+/* allocate a command using pci_alloc_consistent, used for ioctls,
+ * etc., not for the main i/o path.
+ */
+static CommandList_struct *cmd_special_alloc(ctlr_info_t *h)
+{
+ CommandList_struct *c;
+ u64bit temp64;
+ dma_addr_t cmd_dma_handle, err_dma_handle;
+
+ c = pci_zalloc_consistent(h->pdev, sizeof(CommandList_struct),
+ &cmd_dma_handle);
+ if (c == NULL)
+ return NULL;
+
+ c->cmdindex = -1;
+
+ c->err_info = pci_zalloc_consistent(h->pdev, sizeof(ErrorInfo_struct),
+ &err_dma_handle);
+
+ if (c->err_info == NULL) {
+ pci_free_consistent(h->pdev,
+ sizeof(CommandList_struct), c, cmd_dma_handle);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&c->list);
+ c->busaddr = (__u32) cmd_dma_handle;
+ temp64.val = (__u64) err_dma_handle;
+ c->ErrDesc.Addr.lower = temp64.val32.lower;
+ c->ErrDesc.Addr.upper = temp64.val32.upper;
+ c->ErrDesc.Len = sizeof(ErrorInfo_struct);
+
+ c->ctlr = h->ctlr;
+ return c;
+}
+
+static void cmd_free(ctlr_info_t *h, CommandList_struct *c)
+{
+ int i;
+
+ i = c - h->cmd_pool;
+ clear_bit(i, h->cmd_pool_bits);
+ h->nr_frees++;
+}
+
+static void cmd_special_free(ctlr_info_t *h, CommandList_struct *c)
+{
+ u64bit temp64;
+
+ temp64.val32.lower = c->ErrDesc.Addr.lower;
+ temp64.val32.upper = c->ErrDesc.Addr.upper;
+ pci_free_consistent(h->pdev, sizeof(ErrorInfo_struct),
+ c->err_info, (dma_addr_t) temp64.val);
+ pci_free_consistent(h->pdev, sizeof(CommandList_struct), c,
+ (dma_addr_t) cciss_tag_discard_error_bits(h, (u32) c->busaddr));
+}
+
+static inline ctlr_info_t *get_host(struct gendisk *disk)
+{
+ return disk->queue->queuedata;
+}
+
+static inline drive_info_struct *get_drv(struct gendisk *disk)
+{
+ return disk->private_data;
+}
+
+/*
+ * Open. Make sure the device is really there.
+ */
+static int cciss_open(struct block_device *bdev, fmode_t mode)
+{
+ ctlr_info_t *h = get_host(bdev->bd_disk);
+ drive_info_struct *drv = get_drv(bdev->bd_disk);
+
+ dev_dbg(&h->pdev->dev, "cciss_open %s\n", bdev->bd_disk->disk_name);
+ if (drv->busy_configuring)
+ return -EBUSY;
+ /*
+ * Root is allowed to open raw volume zero even if it's not configured
+ * so array config can still work. Root is also allowed to open any
+ * volume that has a LUN ID, so it can issue IOCTL to reread the
+ * disk information. I don't think I really like this
+ * but I'm already using way to many device nodes to claim another one
+ * for "raw controller".
+ */
+ if (drv->heads == 0) {
+ if (MINOR(bdev->bd_dev) != 0) { /* not node 0? */
+ /* if not node 0 make sure it is a partition = 0 */
+ if (MINOR(bdev->bd_dev) & 0x0f) {
+ return -ENXIO;
+ /* if it is, make sure we have a LUN ID */
+ } else if (memcmp(drv->LunID, CTLR_LUNID,
+ sizeof(drv->LunID))) {
+ return -ENXIO;
+ }
+ }
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ }
+ drv->usage_count++;
+ h->usage_count++;
+ return 0;
+}
+
+static int cciss_unlocked_open(struct block_device *bdev, fmode_t mode)
+{
+ int ret;
+
+ mutex_lock(&cciss_mutex);
+ ret = cciss_open(bdev, mode);
+ mutex_unlock(&cciss_mutex);
+
+ return ret;
+}
+
+/*
+ * Close. Sync first.
+ */
+static void cciss_release(struct gendisk *disk, fmode_t mode)
+{
+ ctlr_info_t *h;
+ drive_info_struct *drv;
+
+ mutex_lock(&cciss_mutex);
+ h = get_host(disk);
+ drv = get_drv(disk);
+ dev_dbg(&h->pdev->dev, "cciss_release %s\n", disk->disk_name);
+ drv->usage_count--;
+ h->usage_count--;
+ mutex_unlock(&cciss_mutex);
+}
+
+#ifdef CONFIG_COMPAT
+
+static int cciss_ioctl32_passthru(struct block_device *bdev, fmode_t mode,
+ unsigned cmd, unsigned long arg);
+static int cciss_ioctl32_big_passthru(struct block_device *bdev, fmode_t mode,
+ unsigned cmd, unsigned long arg);
+
+static int cciss_compat_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case CCISS_GETPCIINFO:
+ case CCISS_GETINTINFO:
+ case CCISS_SETINTINFO:
+ case CCISS_GETNODENAME:
+ case CCISS_SETNODENAME:
+ case CCISS_GETHEARTBEAT:
+ case CCISS_GETBUSTYPES:
+ case CCISS_GETFIRMVER:
+ case CCISS_GETDRIVVER:
+ case CCISS_REVALIDVOLS:
+ case CCISS_DEREGDISK:
+ case CCISS_REGNEWDISK:
+ case CCISS_REGNEWD:
+ case CCISS_RESCANDISK:
+ case CCISS_GETLUNINFO:
+ return cciss_ioctl(bdev, mode, cmd, arg);
+
+ case CCISS_PASSTHRU32:
+ return cciss_ioctl32_passthru(bdev, mode, cmd, arg);
+ case CCISS_BIG_PASSTHRU32:
+ return cciss_ioctl32_big_passthru(bdev, mode, cmd, arg);
+
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
+static int cciss_ioctl32_passthru(struct block_device *bdev, fmode_t mode,
+ unsigned cmd, unsigned long arg)
+{
+ IOCTL32_Command_struct __user *arg32 =
+ (IOCTL32_Command_struct __user *) arg;
+ IOCTL_Command_struct arg64;
+ IOCTL_Command_struct __user *p = compat_alloc_user_space(sizeof(arg64));
+ int err;
+ u32 cp;
+
+ memset(&arg64, 0, sizeof(arg64));
+ err = 0;
+ err |=
+ copy_from_user(&arg64.LUN_info, &arg32->LUN_info,
+ sizeof(arg64.LUN_info));
+ err |=
+ copy_from_user(&arg64.Request, &arg32->Request,
+ sizeof(arg64.Request));
+ err |=
+ copy_from_user(&arg64.error_info, &arg32->error_info,
+ sizeof(arg64.error_info));
+ err |= get_user(arg64.buf_size, &arg32->buf_size);
+ err |= get_user(cp, &arg32->buf);
+ arg64.buf = compat_ptr(cp);
+ err |= copy_to_user(p, &arg64, sizeof(arg64));
+
+ if (err)
+ return -EFAULT;
+
+ err = cciss_ioctl(bdev, mode, CCISS_PASSTHRU, (unsigned long)p);
+ if (err)
+ return err;
+ err |=
+ copy_in_user(&arg32->error_info, &p->error_info,
+ sizeof(arg32->error_info));
+ if (err)
+ return -EFAULT;
+ return err;
+}
+
+static int cciss_ioctl32_big_passthru(struct block_device *bdev, fmode_t mode,
+ unsigned cmd, unsigned long arg)
+{
+ BIG_IOCTL32_Command_struct __user *arg32 =
+ (BIG_IOCTL32_Command_struct __user *) arg;
+ BIG_IOCTL_Command_struct arg64;
+ BIG_IOCTL_Command_struct __user *p =
+ compat_alloc_user_space(sizeof(arg64));
+ int err;
+ u32 cp;
+
+ memset(&arg64, 0, sizeof(arg64));
+ err = 0;
+ err |=
+ copy_from_user(&arg64.LUN_info, &arg32->LUN_info,
+ sizeof(arg64.LUN_info));
+ err |=
+ copy_from_user(&arg64.Request, &arg32->Request,
+ sizeof(arg64.Request));
+ err |=
+ copy_from_user(&arg64.error_info, &arg32->error_info,
+ sizeof(arg64.error_info));
+ err |= get_user(arg64.buf_size, &arg32->buf_size);
+ err |= get_user(arg64.malloc_size, &arg32->malloc_size);
+ err |= get_user(cp, &arg32->buf);
+ arg64.buf = compat_ptr(cp);
+ err |= copy_to_user(p, &arg64, sizeof(arg64));
+
+ if (err)
+ return -EFAULT;
+
+ err = cciss_ioctl(bdev, mode, CCISS_BIG_PASSTHRU, (unsigned long)p);
+ if (err)
+ return err;
+ err |=
+ copy_in_user(&arg32->error_info, &p->error_info,
+ sizeof(arg32->error_info));
+ if (err)
+ return -EFAULT;
+ return err;
+}
+#endif
+
+static int cciss_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ drive_info_struct *drv = get_drv(bdev->bd_disk);
+
+ if (!drv->cylinders)
+ return -ENXIO;
+
+ geo->heads = drv->heads;
+ geo->sectors = drv->sectors;
+ geo->cylinders = drv->cylinders;
+ return 0;
+}
+
+static void check_ioctl_unit_attention(ctlr_info_t *h, CommandList_struct *c)
+{
+ if (c->err_info->CommandStatus == CMD_TARGET_STATUS &&
+ c->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION)
+ (void)check_for_unit_attention(h, c);
+}
+
+static int cciss_getpciinfo(ctlr_info_t *h, void __user *argp)
+{
+ cciss_pci_info_struct pciinfo;
+
+ if (!argp)
+ return -EINVAL;
+ pciinfo.domain = pci_domain_nr(h->pdev->bus);
+ pciinfo.bus = h->pdev->bus->number;
+ pciinfo.dev_fn = h->pdev->devfn;
+ pciinfo.board_id = h->board_id;
+ if (copy_to_user(argp, &pciinfo, sizeof(cciss_pci_info_struct)))
+ return -EFAULT;
+ return 0;
+}
+
+static int cciss_getintinfo(ctlr_info_t *h, void __user *argp)
+{
+ cciss_coalint_struct intinfo;
+ unsigned long flags;
+
+ if (!argp)
+ return -EINVAL;
+ spin_lock_irqsave(&h->lock, flags);
+ intinfo.delay = readl(&h->cfgtable->HostWrite.CoalIntDelay);
+ intinfo.count = readl(&h->cfgtable->HostWrite.CoalIntCount);
+ spin_unlock_irqrestore(&h->lock, flags);
+ if (copy_to_user
+ (argp, &intinfo, sizeof(cciss_coalint_struct)))
+ return -EFAULT;
+ return 0;
+}
+
+static int cciss_setintinfo(ctlr_info_t *h, void __user *argp)
+{
+ cciss_coalint_struct intinfo;
+ unsigned long flags;
+ int i;
+
+ if (!argp)
+ return -EINVAL;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&intinfo, argp, sizeof(intinfo)))
+ return -EFAULT;
+ if ((intinfo.delay == 0) && (intinfo.count == 0))
+ return -EINVAL;
+ spin_lock_irqsave(&h->lock, flags);
+ /* Update the field, and then ring the doorbell */
+ writel(intinfo.delay, &(h->cfgtable->HostWrite.CoalIntDelay));
+ writel(intinfo.count, &(h->cfgtable->HostWrite.CoalIntCount));
+ writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
+
+ for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
+ if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
+ break;
+ udelay(1000); /* delay and try again */
+ }
+ spin_unlock_irqrestore(&h->lock, flags);
+ if (i >= MAX_IOCTL_CONFIG_WAIT)
+ return -EAGAIN;
+ return 0;
+}
+
+static int cciss_getnodename(ctlr_info_t *h, void __user *argp)
+{
+ NodeName_type NodeName;
+ unsigned long flags;
+ int i;
+
+ if (!argp)
+ return -EINVAL;
+ spin_lock_irqsave(&h->lock, flags);
+ for (i = 0; i < 16; i++)
+ NodeName[i] = readb(&h->cfgtable->ServerName[i]);
+ spin_unlock_irqrestore(&h->lock, flags);
+ if (copy_to_user(argp, NodeName, sizeof(NodeName_type)))
+ return -EFAULT;
+ return 0;
+}
+
+static int cciss_setnodename(ctlr_info_t *h, void __user *argp)
+{
+ NodeName_type NodeName;
+ unsigned long flags;
+ int i;
+
+ if (!argp)
+ return -EINVAL;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (copy_from_user(NodeName, argp, sizeof(NodeName_type)))
+ return -EFAULT;
+ spin_lock_irqsave(&h->lock, flags);
+ /* Update the field, and then ring the doorbell */
+ for (i = 0; i < 16; i++)
+ writeb(NodeName[i], &h->cfgtable->ServerName[i]);
+ writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
+ for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
+ if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
+ break;
+ udelay(1000); /* delay and try again */
+ }
+ spin_unlock_irqrestore(&h->lock, flags);
+ if (i >= MAX_IOCTL_CONFIG_WAIT)
+ return -EAGAIN;
+ return 0;
+}
+
+static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp)
+{
+ Heartbeat_type heartbeat;
+ unsigned long flags;
+
+ if (!argp)
+ return -EINVAL;
+ spin_lock_irqsave(&h->lock, flags);
+ heartbeat = readl(&h->cfgtable->HeartBeat);
+ spin_unlock_irqrestore(&h->lock, flags);
+ if (copy_to_user(argp, &heartbeat, sizeof(Heartbeat_type)))
+ return -EFAULT;
+ return 0;
+}
+
+static int cciss_getbustypes(ctlr_info_t *h, void __user *argp)
+{
+ BusTypes_type BusTypes;
+ unsigned long flags;
+
+ if (!argp)
+ return -EINVAL;
+ spin_lock_irqsave(&h->lock, flags);
+ BusTypes = readl(&h->cfgtable->BusTypes);
+ spin_unlock_irqrestore(&h->lock, flags);
+ if (copy_to_user(argp, &BusTypes, sizeof(BusTypes_type)))
+ return -EFAULT;
+ return 0;
+}
+
+static int cciss_getfirmver(ctlr_info_t *h, void __user *argp)
+{
+ FirmwareVer_type firmware;
+
+ if (!argp)
+ return -EINVAL;
+ memcpy(firmware, h->firm_ver, 4);
+
+ if (copy_to_user
+ (argp, firmware, sizeof(FirmwareVer_type)))
+ return -EFAULT;
+ return 0;
+}
+
+static int cciss_getdrivver(ctlr_info_t *h, void __user *argp)
+{
+ DriverVer_type DriverVer = DRIVER_VERSION;
+
+ if (!argp)
+ return -EINVAL;
+ if (copy_to_user(argp, &DriverVer, sizeof(DriverVer_type)))
+ return -EFAULT;
+ return 0;
+}
+
+static int cciss_getluninfo(ctlr_info_t *h,
+ struct gendisk *disk, void __user *argp)
+{
+ LogvolInfo_struct luninfo;
+ drive_info_struct *drv = get_drv(disk);
+
+ if (!argp)
+ return -EINVAL;
+ memcpy(&luninfo.LunID, drv->LunID, sizeof(luninfo.LunID));
+ luninfo.num_opens = drv->usage_count;
+ luninfo.num_parts = 0;
+ if (copy_to_user(argp, &luninfo, sizeof(LogvolInfo_struct)))
+ return -EFAULT;
+ return 0;
+}
+
+static int cciss_passthru(ctlr_info_t *h, void __user *argp)
+{
+ IOCTL_Command_struct iocommand;
+ CommandList_struct *c;
+ char *buff = NULL;
+ u64bit temp64;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ if (!argp)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
+ if (copy_from_user
+ (&iocommand, argp, sizeof(IOCTL_Command_struct)))
+ return -EFAULT;
+ if ((iocommand.buf_size < 1) &&
+ (iocommand.Request.Type.Direction != XFER_NONE)) {
+ return -EINVAL;
+ }
+ if (iocommand.buf_size > 0) {
+ buff = kmalloc(iocommand.buf_size, GFP_KERNEL);
+ if (buff == NULL)
+ return -EFAULT;
+ }
+ if (iocommand.Request.Type.Direction == XFER_WRITE) {
+ /* Copy the data into the buffer we created */
+ if (copy_from_user(buff, iocommand.buf, iocommand.buf_size)) {
+ kfree(buff);
+ return -EFAULT;
+ }
+ } else {
+ memset(buff, 0, iocommand.buf_size);
+ }
+ c = cmd_special_alloc(h);
+ if (!c) {
+ kfree(buff);
+ return -ENOMEM;
+ }
+ /* Fill in the command type */
+ c->cmd_type = CMD_IOCTL_PEND;
+ /* Fill in Command Header */
+ c->Header.ReplyQueue = 0; /* unused in simple mode */
+ if (iocommand.buf_size > 0) { /* buffer to fill */
+ c->Header.SGList = 1;
+ c->Header.SGTotal = 1;
+ } else { /* no buffers to fill */
+ c->Header.SGList = 0;
+ c->Header.SGTotal = 0;
+ }
+ c->Header.LUN = iocommand.LUN_info;
+ /* use the kernel address the cmd block for tag */
+ c->Header.Tag.lower = c->busaddr;
+
+ /* Fill in Request block */
+ c->Request = iocommand.Request;
+
+ /* Fill in the scatter gather information */
+ if (iocommand.buf_size > 0) {
+ temp64.val = pci_map_single(h->pdev, buff,
+ iocommand.buf_size, PCI_DMA_BIDIRECTIONAL);
+ c->SG[0].Addr.lower = temp64.val32.lower;
+ c->SG[0].Addr.upper = temp64.val32.upper;
+ c->SG[0].Len = iocommand.buf_size;
+ c->SG[0].Ext = 0; /* we are not chaining */
+ }
+ c->waiting = &wait;
+
+ enqueue_cmd_and_start_io(h, c);
+ wait_for_completion(&wait);
+
+ /* unlock the buffers from DMA */
+ temp64.val32.lower = c->SG[0].Addr.lower;
+ temp64.val32.upper = c->SG[0].Addr.upper;
+ pci_unmap_single(h->pdev, (dma_addr_t) temp64.val, iocommand.buf_size,
+ PCI_DMA_BIDIRECTIONAL);
+ check_ioctl_unit_attention(h, c);
+
+ /* Copy the error information out */
+ iocommand.error_info = *(c->err_info);
+ if (copy_to_user(argp, &iocommand, sizeof(IOCTL_Command_struct))) {
+ kfree(buff);
+ cmd_special_free(h, c);
+ return -EFAULT;
+ }
+
+ if (iocommand.Request.Type.Direction == XFER_READ) {
+ /* Copy the data out of the buffer we created */
+ if (copy_to_user(iocommand.buf, buff, iocommand.buf_size)) {
+ kfree(buff);
+ cmd_special_free(h, c);
+ return -EFAULT;
+ }
+ }
+ kfree(buff);
+ cmd_special_free(h, c);
+ return 0;
+}
+
+static int cciss_bigpassthru(ctlr_info_t *h, void __user *argp)
+{
+ BIG_IOCTL_Command_struct *ioc;
+ CommandList_struct *c;
+ unsigned char **buff = NULL;
+ int *buff_size = NULL;
+ u64bit temp64;
+ BYTE sg_used = 0;
+ int status = 0;
+ int i;
+ DECLARE_COMPLETION_ONSTACK(wait);
+ __u32 left;
+ __u32 sz;
+ BYTE __user *data_ptr;
+
+ if (!argp)
+ return -EINVAL;
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+ ioc = kmalloc(sizeof(*ioc), GFP_KERNEL);
+ if (!ioc) {
+ status = -ENOMEM;
+ goto cleanup1;
+ }
+ if (copy_from_user(ioc, argp, sizeof(*ioc))) {
+ status = -EFAULT;
+ goto cleanup1;
+ }
+ if ((ioc->buf_size < 1) &&
+ (ioc->Request.Type.Direction != XFER_NONE)) {
+ status = -EINVAL;
+ goto cleanup1;
+ }
+ /* Check kmalloc limits using all SGs */
+ if (ioc->malloc_size > MAX_KMALLOC_SIZE) {
+ status = -EINVAL;
+ goto cleanup1;
+ }
+ if (ioc->buf_size > ioc->malloc_size * MAXSGENTRIES) {
+ status = -EINVAL;
+ goto cleanup1;
+ }
+ buff = kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL);
+ if (!buff) {
+ status = -ENOMEM;
+ goto cleanup1;
+ }
+ buff_size = kmalloc(MAXSGENTRIES * sizeof(int), GFP_KERNEL);
+ if (!buff_size) {
+ status = -ENOMEM;
+ goto cleanup1;
+ }
+ left = ioc->buf_size;
+ data_ptr = ioc->buf;
+ while (left) {
+ sz = (left > ioc->malloc_size) ? ioc->malloc_size : left;
+ buff_size[sg_used] = sz;
+ buff[sg_used] = kmalloc(sz, GFP_KERNEL);
+ if (buff[sg_used] == NULL) {
+ status = -ENOMEM;
+ goto cleanup1;
+ }
+ if (ioc->Request.Type.Direction == XFER_WRITE) {
+ if (copy_from_user(buff[sg_used], data_ptr, sz)) {
+ status = -EFAULT;
+ goto cleanup1;
+ }
+ } else {
+ memset(buff[sg_used], 0, sz);
+ }
+ left -= sz;
+ data_ptr += sz;
+ sg_used++;
+ }
+ c = cmd_special_alloc(h);
+ if (!c) {
+ status = -ENOMEM;
+ goto cleanup1;
+ }
+ c->cmd_type = CMD_IOCTL_PEND;
+ c->Header.ReplyQueue = 0;
+ c->Header.SGList = sg_used;
+ c->Header.SGTotal = sg_used;
+ c->Header.LUN = ioc->LUN_info;
+ c->Header.Tag.lower = c->busaddr;
+
+ c->Request = ioc->Request;
+ for (i = 0; i < sg_used; i++) {
+ temp64.val = pci_map_single(h->pdev, buff[i], buff_size[i],
+ PCI_DMA_BIDIRECTIONAL);
+ c->SG[i].Addr.lower = temp64.val32.lower;
+ c->SG[i].Addr.upper = temp64.val32.upper;
+ c->SG[i].Len = buff_size[i];
+ c->SG[i].Ext = 0; /* we are not chaining */
+ }
+ c->waiting = &wait;
+ enqueue_cmd_and_start_io(h, c);
+ wait_for_completion(&wait);
+ /* unlock the buffers from DMA */
+ for (i = 0; i < sg_used; i++) {
+ temp64.val32.lower = c->SG[i].Addr.lower;
+ temp64.val32.upper = c->SG[i].Addr.upper;
+ pci_unmap_single(h->pdev,
+ (dma_addr_t) temp64.val, buff_size[i],
+ PCI_DMA_BIDIRECTIONAL);
+ }
+ check_ioctl_unit_attention(h, c);
+ /* Copy the error information out */
+ ioc->error_info = *(c->err_info);
+ if (copy_to_user(argp, ioc, sizeof(*ioc))) {
+ cmd_special_free(h, c);
+ status = -EFAULT;
+ goto cleanup1;
+ }
+ if (ioc->Request.Type.Direction == XFER_READ) {
+ /* Copy the data out of the buffer we created */
+ BYTE __user *ptr = ioc->buf;
+ for (i = 0; i < sg_used; i++) {
+ if (copy_to_user(ptr, buff[i], buff_size[i])) {
+ cmd_special_free(h, c);
+ status = -EFAULT;
+ goto cleanup1;
+ }
+ ptr += buff_size[i];
+ }
+ }
+ cmd_special_free(h, c);
+ status = 0;
+cleanup1:
+ if (buff) {
+ for (i = 0; i < sg_used; i++)
+ kfree(buff[i]);
+ kfree(buff);
+ }
+ kfree(buff_size);
+ kfree(ioc);
+ return status;
+}
+
+static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ ctlr_info_t *h = get_host(disk);
+ void __user *argp = (void __user *)arg;
+
+ dev_dbg(&h->pdev->dev, "cciss_ioctl: Called with cmd=%x %lx\n",
+ cmd, arg);
+ switch (cmd) {
+ case CCISS_GETPCIINFO:
+ return cciss_getpciinfo(h, argp);
+ case CCISS_GETINTINFO:
+ return cciss_getintinfo(h, argp);
+ case CCISS_SETINTINFO:
+ return cciss_setintinfo(h, argp);
+ case CCISS_GETNODENAME:
+ return cciss_getnodename(h, argp);
+ case CCISS_SETNODENAME:
+ return cciss_setnodename(h, argp);
+ case CCISS_GETHEARTBEAT:
+ return cciss_getheartbeat(h, argp);
+ case CCISS_GETBUSTYPES:
+ return cciss_getbustypes(h, argp);
+ case CCISS_GETFIRMVER:
+ return cciss_getfirmver(h, argp);
+ case CCISS_GETDRIVVER:
+ return cciss_getdrivver(h, argp);
+ case CCISS_DEREGDISK:
+ case CCISS_REGNEWD:
+ case CCISS_REVALIDVOLS:
+ return rebuild_lun_table(h, 0, 1);
+ case CCISS_GETLUNINFO:
+ return cciss_getluninfo(h, disk, argp);
+ case CCISS_PASSTHRU:
+ return cciss_passthru(h, argp);
+ case CCISS_BIG_PASSTHRU:
+ return cciss_bigpassthru(h, argp);
+
+ /* scsi_cmd_blk_ioctl handles these, below, though some are not */
+ /* very meaningful for cciss. SG_IO is the main one people want. */
+
+ case SG_GET_VERSION_NUM:
+ case SG_SET_TIMEOUT:
+ case SG_GET_TIMEOUT:
+ case SG_GET_RESERVED_SIZE:
+ case SG_SET_RESERVED_SIZE:
+ case SG_EMULATED_HOST:
+ case SG_IO:
+ case SCSI_IOCTL_SEND_COMMAND:
+ return scsi_cmd_blk_ioctl(bdev, mode, cmd, argp);
+
+ /* scsi_cmd_blk_ioctl would normally handle these, below, but */
+ /* they aren't a good fit for cciss, as CD-ROMs are */
+ /* not supported, and we don't have any bus/target/lun */
+ /* which we present to the kernel. */
+
+ case CDROM_SEND_PACKET:
+ case CDROMCLOSETRAY:
+ case CDROMEJECT:
+ case SCSI_IOCTL_GET_IDLUN:
+ case SCSI_IOCTL_GET_BUS_NUMBER:
+ default:
+ return -ENOTTY;
+ }
+}
+
+static void cciss_check_queues(ctlr_info_t *h)
+{
+ int start_queue = h->next_to_run;
+ int i;
+
+ /* check to see if we have maxed out the number of commands that can
+ * be placed on the queue. If so then exit. We do this check here
+ * in case the interrupt we serviced was from an ioctl and did not
+ * free any new commands.
+ */
+ if ((find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds)) == h->nr_cmds)
+ return;
+
+ /* We have room on the queue for more commands. Now we need to queue
+ * them up. We will also keep track of the next queue to run so
+ * that every queue gets a chance to be started first.
+ */
+ for (i = 0; i < h->highest_lun + 1; i++) {
+ int curr_queue = (start_queue + i) % (h->highest_lun + 1);
+ /* make sure the disk has been added and the drive is real
+ * because this can be called from the middle of init_one.
+ */
+ if (!h->drv[curr_queue])
+ continue;
+ if (!(h->drv[curr_queue]->queue) ||
+ !(h->drv[curr_queue]->heads))
+ continue;
+ blk_start_queue(h->gendisk[curr_queue]->queue);
+
+ /* check to see if we have maxed out the number of commands
+ * that can be placed on the queue.
+ */
+ if ((find_first_zero_bit(h->cmd_pool_bits, h->nr_cmds)) == h->nr_cmds) {
+ if (curr_queue == start_queue) {
+ h->next_to_run =
+ (start_queue + 1) % (h->highest_lun + 1);
+ break;
+ } else {
+ h->next_to_run = curr_queue;
+ break;
+ }
+ }
+ }
+}
+
+static void cciss_softirq_done(struct request *rq)
+{
+ CommandList_struct *c = rq->completion_data;
+ ctlr_info_t *h = hba[c->ctlr];
+ SGDescriptor_struct *curr_sg = c->SG;
+ u64bit temp64;
+ unsigned long flags;
+ int i, ddir;
+ int sg_index = 0;
+
+ if (c->Request.Type.Direction == XFER_READ)
+ ddir = PCI_DMA_FROMDEVICE;
+ else
+ ddir = PCI_DMA_TODEVICE;
+
+ /* command did not need to be retried */
+ /* unmap the DMA mapping for all the scatter gather elements */
+ for (i = 0; i < c->Header.SGList; i++) {
+ if (curr_sg[sg_index].Ext == CCISS_SG_CHAIN) {
+ cciss_unmap_sg_chain_block(h, c);
+ /* Point to the next block */
+ curr_sg = h->cmd_sg_list[c->cmdindex];
+ sg_index = 0;
+ }
+ temp64.val32.lower = curr_sg[sg_index].Addr.lower;
+ temp64.val32.upper = curr_sg[sg_index].Addr.upper;
+ pci_unmap_page(h->pdev, temp64.val, curr_sg[sg_index].Len,
+ ddir);
+ ++sg_index;
+ }
+
+ dev_dbg(&h->pdev->dev, "Done with %p\n", rq);
+
+ /* set the residual count for pc requests */
+ if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
+ rq->resid_len = c->err_info->ResidualCnt;
+
+ blk_end_request_all(rq, (rq->errors == 0) ? 0 : -EIO);
+
+ spin_lock_irqsave(&h->lock, flags);
+ cmd_free(h, c);
+ cciss_check_queues(h);
+ spin_unlock_irqrestore(&h->lock, flags);
+}
+
+static inline void log_unit_to_scsi3addr(ctlr_info_t *h,
+ unsigned char scsi3addr[], uint32_t log_unit)
+{
+ memcpy(scsi3addr, h->drv[log_unit]->LunID,
+ sizeof(h->drv[log_unit]->LunID));
+}
+
+/* This function gets the SCSI vendor, model, and revision of a logical drive
+ * via the inquiry page 0. Model, vendor, and rev are set to empty strings if
+ * they cannot be read.
+ */
+static void cciss_get_device_descr(ctlr_info_t *h, int logvol,
+ char *vendor, char *model, char *rev)
+{
+ int rc;
+ InquiryData_struct *inq_buf;
+ unsigned char scsi3addr[8];
+
+ *vendor = '\0';
+ *model = '\0';
+ *rev = '\0';
+
+ inq_buf = kzalloc(sizeof(InquiryData_struct), GFP_KERNEL);
+ if (!inq_buf)
+ return;
+
+ log_unit_to_scsi3addr(h, scsi3addr, logvol);
+ rc = sendcmd_withirq(h, CISS_INQUIRY, inq_buf, sizeof(*inq_buf), 0,
+ scsi3addr, TYPE_CMD);
+ if (rc == IO_OK) {
+ memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN);
+ vendor[VENDOR_LEN] = '\0';
+ memcpy(model, &inq_buf->data_byte[16], MODEL_LEN);
+ model[MODEL_LEN] = '\0';
+ memcpy(rev, &inq_buf->data_byte[32], REV_LEN);
+ rev[REV_LEN] = '\0';
+ }
+
+ kfree(inq_buf);
+ return;
+}
+
+/* This function gets the serial number of a logical drive via
+ * inquiry page 0x83. Serial no. is 16 bytes. If the serial
+ * number cannot be had, for whatever reason, 16 bytes of 0xff
+ * are returned instead.
+ */
+static void cciss_get_serial_no(ctlr_info_t *h, int logvol,
+ unsigned char *serial_no, int buflen)
+{
+#define PAGE_83_INQ_BYTES 64
+ int rc;
+ unsigned char *buf;
+ unsigned char scsi3addr[8];
+
+ if (buflen > 16)
+ buflen = 16;
+ memset(serial_no, 0xff, buflen);
+ buf = kzalloc(PAGE_83_INQ_BYTES, GFP_KERNEL);
+ if (!buf)
+ return;
+ memset(serial_no, 0, buflen);
+ log_unit_to_scsi3addr(h, scsi3addr, logvol);
+ rc = sendcmd_withirq(h, CISS_INQUIRY, buf,
+ PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
+ if (rc == IO_OK)
+ memcpy(serial_no, &buf[8], buflen);
+ kfree(buf);
+ return;
+}
+
+/*
+ * cciss_add_disk sets up the block device queue for a logical drive
+ */
+static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
+ int drv_index)
+{
+ disk->queue = blk_init_queue(do_cciss_request, &h->lock);
+ if (!disk->queue)
+ goto init_queue_failure;
+ sprintf(disk->disk_name, "cciss/c%dd%d", h->ctlr, drv_index);
+ disk->major = h->major;
+ disk->first_minor = drv_index << NWD_SHIFT;
+ disk->fops = &cciss_fops;
+ if (cciss_create_ld_sysfs_entry(h, drv_index))
+ goto cleanup_queue;
+ disk->private_data = h->drv[drv_index];
+ disk->driverfs_dev = &h->drv[drv_index]->dev;
+
+ /* Set up queue information */
+ blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask);
+
+ /* This is a hardware imposed limit. */
+ blk_queue_max_segments(disk->queue, h->maxsgentries);
+
+ blk_queue_max_hw_sectors(disk->queue, h->cciss_max_sectors);
+
+ blk_queue_softirq_done(disk->queue, cciss_softirq_done);
+
+ disk->queue->queuedata = h;
+
+ blk_queue_logical_block_size(disk->queue,
+ h->drv[drv_index]->block_size);
+
+ /* Make sure all queue data is written out before */
+ /* setting h->drv[drv_index]->queue, as setting this */
+ /* allows the interrupt handler to start the queue */
+ wmb();
+ h->drv[drv_index]->queue = disk->queue;
+ add_disk(disk);
+ return 0;
+
+cleanup_queue:
+ blk_cleanup_queue(disk->queue);
+ disk->queue = NULL;
+init_queue_failure:
+ return -1;
+}
+
+/* This function will check the usage_count of the drive to be updated/added.
+ * If the usage_count is zero and it is a heretofore unknown drive, or,
+ * the drive's capacity, geometry, or serial number has changed,
+ * then the drive information will be updated and the disk will be
+ * re-registered with the kernel. If these conditions don't hold,
+ * then it will be left alone for the next reboot. The exception to this
+ * is disk 0 which will always be left registered with the kernel since it
+ * is also the controller node. Any changes to disk 0 will show up on
+ * the next reboot.
+ */
+static void cciss_update_drive_info(ctlr_info_t *h, int drv_index,
+ int first_time, int via_ioctl)
+{
+ struct gendisk *disk;
+ InquiryData_struct *inq_buff = NULL;
+ unsigned int block_size;
+ sector_t total_size;
+ unsigned long flags = 0;
+ int ret = 0;
+ drive_info_struct *drvinfo;
+
+ /* Get information about the disk and modify the driver structure */
+ inq_buff = kmalloc(sizeof(InquiryData_struct), GFP_KERNEL);
+ drvinfo = kzalloc(sizeof(*drvinfo), GFP_KERNEL);
+ if (inq_buff == NULL || drvinfo == NULL)
+ goto mem_msg;
+
+ /* testing to see if 16-byte CDBs are already being used */
+ if (h->cciss_read == CCISS_READ_16) {
+ cciss_read_capacity_16(h, drv_index,
+ &total_size, &block_size);
+
+ } else {
+ cciss_read_capacity(h, drv_index, &total_size, &block_size);
+ /* if read_capacity returns all F's this volume is >2TB */
+ /* in size so we switch to 16-byte CDB's for all */
+ /* read/write ops */
+ if (total_size == 0xFFFFFFFFULL) {
+ cciss_read_capacity_16(h, drv_index,
+ &total_size, &block_size);
+ h->cciss_read = CCISS_READ_16;
+ h->cciss_write = CCISS_WRITE_16;
+ } else {
+ h->cciss_read = CCISS_READ_10;
+ h->cciss_write = CCISS_WRITE_10;
+ }
+ }
+
+ cciss_geometry_inquiry(h, drv_index, total_size, block_size,
+ inq_buff, drvinfo);
+ drvinfo->block_size = block_size;
+ drvinfo->nr_blocks = total_size + 1;
+
+ cciss_get_device_descr(h, drv_index, drvinfo->vendor,
+ drvinfo->model, drvinfo->rev);
+ cciss_get_serial_no(h, drv_index, drvinfo->serial_no,
+ sizeof(drvinfo->serial_no));
+ /* Save the lunid in case we deregister the disk, below. */
+ memcpy(drvinfo->LunID, h->drv[drv_index]->LunID,
+ sizeof(drvinfo->LunID));
+
+ /* Is it the same disk we already know, and nothing's changed? */
+ if (h->drv[drv_index]->raid_level != -1 &&
+ ((memcmp(drvinfo->serial_no,
+ h->drv[drv_index]->serial_no, 16) == 0) &&
+ drvinfo->block_size == h->drv[drv_index]->block_size &&
+ drvinfo->nr_blocks == h->drv[drv_index]->nr_blocks &&
+ drvinfo->heads == h->drv[drv_index]->heads &&
+ drvinfo->sectors == h->drv[drv_index]->sectors &&
+ drvinfo->cylinders == h->drv[drv_index]->cylinders))
+ /* The disk is unchanged, nothing to update */
+ goto freeret;
+
+ /* If we get here it's not the same disk, or something's changed,
+ * so we need to * deregister it, and re-register it, if it's not
+ * in use.
+ * If the disk already exists then deregister it before proceeding
+ * (unless it's the first disk (for the controller node).
+ */
+ if (h->drv[drv_index]->raid_level != -1 && drv_index != 0) {
+ dev_warn(&h->pdev->dev, "disk %d has changed.\n", drv_index);
+ spin_lock_irqsave(&h->lock, flags);
+ h->drv[drv_index]->busy_configuring = 1;
+ spin_unlock_irqrestore(&h->lock, flags);
+
+ /* deregister_disk sets h->drv[drv_index]->queue = NULL
+ * which keeps the interrupt handler from starting
+ * the queue.
+ */
+ ret = deregister_disk(h, drv_index, 0, via_ioctl);
+ }
+
+ /* If the disk is in use return */
+ if (ret)
+ goto freeret;
+
+ /* Save the new information from cciss_geometry_inquiry
+ * and serial number inquiry. If the disk was deregistered
+ * above, then h->drv[drv_index] will be NULL.
+ */
+ if (h->drv[drv_index] == NULL) {
+ drvinfo->device_initialized = 0;
+ h->drv[drv_index] = drvinfo;
+ drvinfo = NULL; /* so it won't be freed below. */
+ } else {
+ /* special case for cxd0 */
+ h->drv[drv_index]->block_size = drvinfo->block_size;
+ h->drv[drv_index]->nr_blocks = drvinfo->nr_blocks;
+ h->drv[drv_index]->heads = drvinfo->heads;
+ h->drv[drv_index]->sectors = drvinfo->sectors;
+ h->drv[drv_index]->cylinders = drvinfo->cylinders;
+ h->drv[drv_index]->raid_level = drvinfo->raid_level;
+ memcpy(h->drv[drv_index]->serial_no, drvinfo->serial_no, 16);
+ memcpy(h->drv[drv_index]->vendor, drvinfo->vendor,
+ VENDOR_LEN + 1);
+ memcpy(h->drv[drv_index]->model, drvinfo->model, MODEL_LEN + 1);
+ memcpy(h->drv[drv_index]->rev, drvinfo->rev, REV_LEN + 1);
+ }
+
+ ++h->num_luns;
+ disk = h->gendisk[drv_index];
+ set_capacity(disk, h->drv[drv_index]->nr_blocks);
+
+ /* If it's not disk 0 (drv_index != 0)
+ * or if it was disk 0, but there was previously
+ * no actual corresponding configured logical drive
+ * (raid_leve == -1) then we want to update the
+ * logical drive's information.
+ */
+ if (drv_index || first_time) {
+ if (cciss_add_disk(h, disk, drv_index) != 0) {
+ cciss_free_gendisk(h, drv_index);
+ cciss_free_drive_info(h, drv_index);
+ dev_warn(&h->pdev->dev, "could not update disk %d\n",
+ drv_index);
+ --h->num_luns;
+ }
+ }
+
+freeret:
+ kfree(inq_buff);
+ kfree(drvinfo);
+ return;
+mem_msg:
+ dev_err(&h->pdev->dev, "out of memory\n");
+ goto freeret;
+}
+
+/* This function will find the first index of the controllers drive array
+ * that has a null drv pointer and allocate the drive info struct and
+ * will return that index This is where new drives will be added.
+ * If the index to be returned is greater than the highest_lun index for
+ * the controller then highest_lun is set * to this new index.
+ * If there are no available indexes or if tha allocation fails, then -1
+ * is returned. * "controller_node" is used to know if this is a real
+ * logical drive, or just the controller node, which determines if this
+ * counts towards highest_lun.
+ */
+static int cciss_alloc_drive_info(ctlr_info_t *h, int controller_node)
+{
+ int i;
+ drive_info_struct *drv;
+
+ /* Search for an empty slot for our drive info */
+ for (i = 0; i < CISS_MAX_LUN; i++) {
+
+ /* if not cxd0 case, and it's occupied, skip it. */
+ if (h->drv[i] && i != 0)
+ continue;
+ /*
+ * If it's cxd0 case, and drv is alloc'ed already, and a
+ * disk is configured there, skip it.
+ */
+ if (i == 0 && h->drv[i] && h->drv[i]->raid_level != -1)
+ continue;
+
+ /*
+ * We've found an empty slot. Update highest_lun
+ * provided this isn't just the fake cxd0 controller node.
+ */
+ if (i > h->highest_lun && !controller_node)
+ h->highest_lun = i;
+
+ /* If adding a real disk at cxd0, and it's already alloc'ed */
+ if (i == 0 && h->drv[i] != NULL)
+ return i;
+
+ /*
+ * Found an empty slot, not already alloc'ed. Allocate it.
+ * Mark it with raid_level == -1, so we know it's new later on.
+ */
+ drv = kzalloc(sizeof(*drv), GFP_KERNEL);
+ if (!drv)
+ return -1;
+ drv->raid_level = -1; /* so we know it's new */
+ h->drv[i] = drv;
+ return i;
+ }
+ return -1;
+}
+
+static void cciss_free_drive_info(ctlr_info_t *h, int drv_index)
+{
+ kfree(h->drv[drv_index]);
+ h->drv[drv_index] = NULL;
+}
+
+static void cciss_free_gendisk(ctlr_info_t *h, int drv_index)
+{
+ put_disk(h->gendisk[drv_index]);
+ h->gendisk[drv_index] = NULL;
+}
+
+/* cciss_add_gendisk finds a free hba[]->drv structure
+ * and allocates a gendisk if needed, and sets the lunid
+ * in the drvinfo structure. It returns the index into
+ * the ->drv[] array, or -1 if none are free.
+ * is_controller_node indicates whether highest_lun should
+ * count this disk, or if it's only being added to provide
+ * a means to talk to the controller in case no logical
+ * drives have yet been configured.
+ */
+static int cciss_add_gendisk(ctlr_info_t *h, unsigned char lunid[],
+ int controller_node)
+{
+ int drv_index;
+
+ drv_index = cciss_alloc_drive_info(h, controller_node);
+ if (drv_index == -1)
+ return -1;
+
+ /*Check if the gendisk needs to be allocated */
+ if (!h->gendisk[drv_index]) {
+ h->gendisk[drv_index] =
+ alloc_disk(1 << NWD_SHIFT);
+ if (!h->gendisk[drv_index]) {
+ dev_err(&h->pdev->dev,
+ "could not allocate a new disk %d\n",
+ drv_index);
+ goto err_free_drive_info;
+ }
+ }
+ memcpy(h->drv[drv_index]->LunID, lunid,
+ sizeof(h->drv[drv_index]->LunID));
+ if (cciss_create_ld_sysfs_entry(h, drv_index))
+ goto err_free_disk;
+ /* Don't need to mark this busy because nobody */
+ /* else knows about this disk yet to contend */
+ /* for access to it. */
+ h->drv[drv_index]->busy_configuring = 0;
+ wmb();
+ return drv_index;
+
+err_free_disk:
+ cciss_free_gendisk(h, drv_index);
+err_free_drive_info:
+ cciss_free_drive_info(h, drv_index);
+ return -1;
+}
+
+/* This is for the special case of a controller which
+ * has no logical drives. In this case, we still need
+ * to register a disk so the controller can be accessed
+ * by the Array Config Utility.
+ */
+static void cciss_add_controller_node(ctlr_info_t *h)
+{
+ struct gendisk *disk;
+ int drv_index;
+
+ if (h->gendisk[0] != NULL) /* already did this? Then bail. */
+ return;
+
+ drv_index = cciss_add_gendisk(h, CTLR_LUNID, 1);
+ if (drv_index == -1)
+ goto error;
+ h->drv[drv_index]->block_size = 512;
+ h->drv[drv_index]->nr_blocks = 0;
+ h->drv[drv_index]->heads = 0;
+ h->drv[drv_index]->sectors = 0;
+ h->drv[drv_index]->cylinders = 0;
+ h->drv[drv_index]->raid_level = -1;
+ memset(h->drv[drv_index]->serial_no, 0, 16);
+ disk = h->gendisk[drv_index];
+ if (cciss_add_disk(h, disk, drv_index) == 0)
+ return;
+ cciss_free_gendisk(h, drv_index);
+ cciss_free_drive_info(h, drv_index);
+error:
+ dev_warn(&h->pdev->dev, "could not add disk 0.\n");
+ return;
+}
+
+/* This function will add and remove logical drives from the Logical
+ * drive array of the controller and maintain persistency of ordering
+ * so that mount points are preserved until the next reboot. This allows
+ * for the removal of logical drives in the middle of the drive array
+ * without a re-ordering of those drives.
+ * INPUT
+ * h = The controller to perform the operations on
+ */
+static int rebuild_lun_table(ctlr_info_t *h, int first_time,
+ int via_ioctl)
+{
+ int num_luns;
+ ReportLunData_struct *ld_buff = NULL;
+ int return_code;
+ int listlength = 0;
+ int i;
+ int drv_found;
+ int drv_index = 0;
+ unsigned char lunid[8] = CTLR_LUNID;
+ unsigned long flags;
+
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
+ /* Set busy_configuring flag for this operation */
+ spin_lock_irqsave(&h->lock, flags);
+ if (h->busy_configuring) {
+ spin_unlock_irqrestore(&h->lock, flags);
+ return -EBUSY;
+ }
+ h->busy_configuring = 1;
+ spin_unlock_irqrestore(&h->lock, flags);
+
+ ld_buff = kzalloc(sizeof(ReportLunData_struct), GFP_KERNEL);
+ if (ld_buff == NULL)
+ goto mem_msg;
+
+ return_code = sendcmd_withirq(h, CISS_REPORT_LOG, ld_buff,
+ sizeof(ReportLunData_struct),
+ 0, CTLR_LUNID, TYPE_CMD);
+
+ if (return_code == IO_OK)
+ listlength = be32_to_cpu(*(__be32 *) ld_buff->LUNListLength);
+ else { /* reading number of logical volumes failed */
+ dev_warn(&h->pdev->dev,
+ "report logical volume command failed\n");
+ listlength = 0;
+ goto freeret;
+ }
+
+ num_luns = listlength / 8; /* 8 bytes per entry */
+ if (num_luns > CISS_MAX_LUN) {
+ num_luns = CISS_MAX_LUN;
+ dev_warn(&h->pdev->dev, "more luns configured"
+ " on controller than can be handled by"
+ " this driver.\n");
+ }
+
+ if (num_luns == 0)
+ cciss_add_controller_node(h);
+
+ /* Compare controller drive array to driver's drive array
+ * to see if any drives are missing on the controller due
+ * to action of Array Config Utility (user deletes drive)
+ * and deregister logical drives which have disappeared.
+ */
+ for (i = 0; i <= h->highest_lun; i++) {
+ int j;
+ drv_found = 0;
+
+ /* skip holes in the array from already deleted drives */
+ if (h->drv[i] == NULL)
+ continue;
+
+ for (j = 0; j < num_luns; j++) {
+ memcpy(lunid, &ld_buff->LUN[j][0], sizeof(lunid));
+ if (memcmp(h->drv[i]->LunID, lunid,
+ sizeof(lunid)) == 0) {
+ drv_found = 1;
+ break;
+ }
+ }
+ if (!drv_found) {
+ /* Deregister it from the OS, it's gone. */
+ spin_lock_irqsave(&h->lock, flags);
+ h->drv[i]->busy_configuring = 1;
+ spin_unlock_irqrestore(&h->lock, flags);
+ return_code = deregister_disk(h, i, 1, via_ioctl);
+ if (h->drv[i] != NULL)
+ h->drv[i]->busy_configuring = 0;
+ }
+ }
+
+ /* Compare controller drive array to driver's drive array.
+ * Check for updates in the drive information and any new drives
+ * on the controller due to ACU adding logical drives, or changing
+ * a logical drive's size, etc. Reregister any new/changed drives
+ */
+ for (i = 0; i < num_luns; i++) {
+ int j;
+
+ drv_found = 0;
+
+ memcpy(lunid, &ld_buff->LUN[i][0], sizeof(lunid));
+ /* Find if the LUN is already in the drive array
+ * of the driver. If so then update its info
+ * if not in use. If it does not exist then find
+ * the first free index and add it.
+ */
+ for (j = 0; j <= h->highest_lun; j++) {
+ if (h->drv[j] != NULL &&
+ memcmp(h->drv[j]->LunID, lunid,
+ sizeof(h->drv[j]->LunID)) == 0) {
+ drv_index = j;
+ drv_found = 1;
+ break;
+ }
+ }
+
+ /* check if the drive was found already in the array */
+ if (!drv_found) {
+ drv_index = cciss_add_gendisk(h, lunid, 0);
+ if (drv_index == -1)
+ goto freeret;
+ }
+ cciss_update_drive_info(h, drv_index, first_time, via_ioctl);
+ } /* end for */
+
+freeret:
+ kfree(ld_buff);
+ h->busy_configuring = 0;
+ /* We return -1 here to tell the ACU that we have registered/updated
+ * all of the drives that we can and to keep it from calling us
+ * additional times.
+ */
+ return -1;
+mem_msg:
+ dev_err(&h->pdev->dev, "out of memory\n");
+ h->busy_configuring = 0;
+ goto freeret;
+}
+
+static void cciss_clear_drive_info(drive_info_struct *drive_info)
+{
+ /* zero out the disk size info */
+ drive_info->nr_blocks = 0;
+ drive_info->block_size = 0;
+ drive_info->heads = 0;
+ drive_info->sectors = 0;
+ drive_info->cylinders = 0;
+ drive_info->raid_level = -1;
+ memset(drive_info->serial_no, 0, sizeof(drive_info->serial_no));
+ memset(drive_info->model, 0, sizeof(drive_info->model));
+ memset(drive_info->rev, 0, sizeof(drive_info->rev));
+ memset(drive_info->vendor, 0, sizeof(drive_info->vendor));
+ /*
+ * don't clear the LUNID though, we need to remember which
+ * one this one is.
+ */
+}
+
+/* This function will deregister the disk and it's queue from the
+ * kernel. It must be called with the controller lock held and the
+ * drv structures busy_configuring flag set. It's parameters are:
+ *
+ * disk = This is the disk to be deregistered
+ * drv = This is the drive_info_struct associated with the disk to be
+ * deregistered. It contains information about the disk used
+ * by the driver.
+ * clear_all = This flag determines whether or not the disk information
+ * is going to be completely cleared out and the highest_lun
+ * reset. Sometimes we want to clear out information about
+ * the disk in preparation for re-adding it. In this case
+ * the highest_lun should be left unchanged and the LunID
+ * should not be cleared.
+ * via_ioctl
+ * This indicates whether we've reached this path via ioctl.
+ * This affects the maximum usage count allowed for c0d0 to be messed with.
+ * If this path is reached via ioctl(), then the max_usage_count will
+ * be 1, as the process calling ioctl() has got to have the device open.
+ * If we get here via sysfs, then the max usage count will be zero.
+*/
+static int deregister_disk(ctlr_info_t *h, int drv_index,
+ int clear_all, int via_ioctl)
+{
+ int i;
+ struct gendisk *disk;
+ drive_info_struct *drv;
+ int recalculate_highest_lun;
+
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
+ drv = h->drv[drv_index];
+ disk = h->gendisk[drv_index];
+
+ /* make sure logical volume is NOT is use */
+ if (clear_all || (h->gendisk[0] == disk)) {
+ if (drv->usage_count > via_ioctl)
+ return -EBUSY;
+ } else if (drv->usage_count > 0)
+ return -EBUSY;
+
+ recalculate_highest_lun = (drv == h->drv[h->highest_lun]);
+
+ /* invalidate the devices and deregister the disk. If it is disk
+ * zero do not deregister it but just zero out it's values. This
+ * allows us to delete disk zero but keep the controller registered.
+ */
+ if (h->gendisk[0] != disk) {
+ struct request_queue *q = disk->queue;
+ if (disk->flags & GENHD_FL_UP) {
+ cciss_destroy_ld_sysfs_entry(h, drv_index, 0);
+ del_gendisk(disk);
+ }
+ if (q)
+ blk_cleanup_queue(q);
+ /* If clear_all is set then we are deleting the logical
+ * drive, not just refreshing its info. For drives
+ * other than disk 0 we will call put_disk. We do not
+ * do this for disk 0 as we need it to be able to
+ * configure the controller.
+ */
+ if (clear_all){
+ /* This isn't pretty, but we need to find the
+ * disk in our array and NULL our the pointer.
+ * This is so that we will call alloc_disk if
+ * this index is used again later.
+ */
+ for (i=0; i < CISS_MAX_LUN; i++){
+ if (h->gendisk[i] == disk) {
+ h->gendisk[i] = NULL;
+ break;
+ }
+ }
+ put_disk(disk);
+ }
+ } else {
+ set_capacity(disk, 0);
+ cciss_clear_drive_info(drv);
+ }
+
+ --h->num_luns;
+
+ /* if it was the last disk, find the new hightest lun */
+ if (clear_all && recalculate_highest_lun) {
+ int newhighest = -1;
+ for (i = 0; i <= h->highest_lun; i++) {
+ /* if the disk has size > 0, it is available */
+ if (h->drv[i] && h->drv[i]->heads)
+ newhighest = i;
+ }
+ h->highest_lun = newhighest;
+ }
+ return 0;
+}
+
+static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff,
+ size_t size, __u8 page_code, unsigned char *scsi3addr,
+ int cmd_type)
+{
+ u64bit buff_dma_handle;
+ int status = IO_OK;
+
+ c->cmd_type = CMD_IOCTL_PEND;
+ c->Header.ReplyQueue = 0;
+ if (buff != NULL) {
+ c->Header.SGList = 1;
+ c->Header.SGTotal = 1;
+ } else {
+ c->Header.SGList = 0;
+ c->Header.SGTotal = 0;
+ }
+ c->Header.Tag.lower = c->busaddr;
+ memcpy(c->Header.LUN.LunAddrBytes, scsi3addr, 8);
+
+ c->Request.Type.Type = cmd_type;
+ if (cmd_type == TYPE_CMD) {
+ switch (cmd) {
+ case CISS_INQUIRY:
+ /* are we trying to read a vital product page */
+ if (page_code != 0) {
+ c->Request.CDB[1] = 0x01;
+ c->Request.CDB[2] = page_code;
+ }
+ c->Request.CDBLen = 6;
+ c->Request.Type.Attribute = ATTR_SIMPLE;
+ c->Request.Type.Direction = XFER_READ;
+ c->Request.Timeout = 0;
+ c->Request.CDB[0] = CISS_INQUIRY;
+ c->Request.CDB[4] = size & 0xFF;
+ break;
+ case CISS_REPORT_LOG:
+ case CISS_REPORT_PHYS:
+ /* Talking to controller so It's a physical command
+ mode = 00 target = 0. Nothing to write.
+ */
+ c->Request.CDBLen = 12;
+ c->Request.Type.Attribute = ATTR_SIMPLE;
+ c->Request.Type.Direction = XFER_READ;
+ c->Request.Timeout = 0;
+ c->Request.CDB[0] = cmd;
+ c->Request.CDB[6] = (size >> 24) & 0xFF; /* MSB */
+ c->Request.CDB[7] = (size >> 16) & 0xFF;
+ c->Request.CDB[8] = (size >> 8) & 0xFF;
+ c->Request.CDB[9] = size & 0xFF;
+ break;
+
+ case CCISS_READ_CAPACITY:
+ c->Request.CDBLen = 10;
+ c->Request.Type.Attribute = ATTR_SIMPLE;
+ c->Request.Type.Direction = XFER_READ;
+ c->Request.Timeout = 0;
+ c->Request.CDB[0] = cmd;
+ break;
+ case CCISS_READ_CAPACITY_16:
+ c->Request.CDBLen = 16;
+ c->Request.Type.Attribute = ATTR_SIMPLE;
+ c->Request.Type.Direction = XFER_READ;
+ c->Request.Timeout = 0;
+ c->Request.CDB[0] = cmd;
+ c->Request.CDB[1] = 0x10;
+ c->Request.CDB[10] = (size >> 24) & 0xFF;
+ c->Request.CDB[11] = (size >> 16) & 0xFF;
+ c->Request.CDB[12] = (size >> 8) & 0xFF;
+ c->Request.CDB[13] = size & 0xFF;
+ c->Request.Timeout = 0;
+ c->Request.CDB[0] = cmd;
+ break;
+ case CCISS_CACHE_FLUSH:
+ c->Request.CDBLen = 12;
+ c->Request.Type.Attribute = ATTR_SIMPLE;
+ c->Request.Type.Direction = XFER_WRITE;
+ c->Request.Timeout = 0;
+ c->Request.CDB[0] = BMIC_WRITE;
+ c->Request.CDB[6] = BMIC_CACHE_FLUSH;
+ c->Request.CDB[7] = (size >> 8) & 0xFF;
+ c->Request.CDB[8] = size & 0xFF;
+ break;
+ case TEST_UNIT_READY:
+ c->Request.CDBLen = 6;
+ c->Request.Type.Attribute = ATTR_SIMPLE;
+ c->Request.Type.Direction = XFER_NONE;
+ c->Request.Timeout = 0;
+ break;
+ default:
+ dev_warn(&h->pdev->dev, "Unknown Command 0x%c\n", cmd);
+ return IO_ERROR;
+ }
+ } else if (cmd_type == TYPE_MSG) {
+ switch (cmd) {
+ case CCISS_ABORT_MSG:
+ c->Request.CDBLen = 12;
+ c->Request.Type.Attribute = ATTR_SIMPLE;
+ c->Request.Type.Direction = XFER_WRITE;
+ c->Request.Timeout = 0;
+ c->Request.CDB[0] = cmd; /* abort */
+ c->Request.CDB[1] = 0; /* abort a command */
+ /* buff contains the tag of the command to abort */
+ memcpy(&c->Request.CDB[4], buff, 8);
+ break;
+ case CCISS_RESET_MSG:
+ c->Request.CDBLen = 16;
+ c->Request.Type.Attribute = ATTR_SIMPLE;
+ c->Request.Type.Direction = XFER_NONE;
+ c->Request.Timeout = 0;
+ memset(&c->Request.CDB[0], 0, sizeof(c->Request.CDB));
+ c->Request.CDB[0] = cmd; /* reset */
+ c->Request.CDB[1] = CCISS_RESET_TYPE_TARGET;
+ break;
+ case CCISS_NOOP_MSG:
+ c->Request.CDBLen = 1;
+ c->Request.Type.Attribute = ATTR_SIMPLE;
+ c->Request.Type.Direction = XFER_WRITE;
+ c->Request.Timeout = 0;
+ c->Request.CDB[0] = cmd;
+ break;
+ default:
+ dev_warn(&h->pdev->dev,
+ "unknown message type %d\n", cmd);
+ return IO_ERROR;
+ }
+ } else {
+ dev_warn(&h->pdev->dev, "unknown command type %d\n", cmd_type);
+ return IO_ERROR;
+ }
+ /* Fill in the scatter gather information */
+ if (size > 0) {
+ buff_dma_handle.val = (__u64) pci_map_single(h->pdev,
+ buff, size,
+ PCI_DMA_BIDIRECTIONAL);
+ c->SG[0].Addr.lower = buff_dma_handle.val32.lower;
+ c->SG[0].Addr.upper = buff_dma_handle.val32.upper;
+ c->SG[0].Len = size;
+ c->SG[0].Ext = 0; /* we are not chaining */
+ }
+ return status;
+}
+
+static int cciss_send_reset(ctlr_info_t *h, unsigned char *scsi3addr,
+ u8 reset_type)
+{
+ CommandList_struct *c;
+ int return_status;
+
+ c = cmd_alloc(h);
+ if (!c)
+ return -ENOMEM;
+ return_status = fill_cmd(h, c, CCISS_RESET_MSG, NULL, 0, 0,
+ CTLR_LUNID, TYPE_MSG);
+ c->Request.CDB[1] = reset_type; /* fill_cmd defaults to target reset */
+ if (return_status != IO_OK) {
+ cmd_special_free(h, c);
+ return return_status;
+ }
+ c->waiting = NULL;
+ enqueue_cmd_and_start_io(h, c);
+ /* Don't wait for completion, the reset won't complete. Don't free
+ * the command either. This is the last command we will send before
+ * re-initializing everything, so it doesn't matter and won't leak.
+ */
+ return 0;
+}
+
+static int check_target_status(ctlr_info_t *h, CommandList_struct *c)
+{
+ switch (c->err_info->ScsiStatus) {
+ case SAM_STAT_GOOD:
+ return IO_OK;
+ case SAM_STAT_CHECK_CONDITION:
+ switch (0xf & c->err_info->SenseInfo[2]) {
+ case 0: return IO_OK; /* no sense */
+ case 1: return IO_OK; /* recovered error */
+ default:
+ if (check_for_unit_attention(h, c))
+ return IO_NEEDS_RETRY;
+ dev_warn(&h->pdev->dev, "cmd 0x%02x "
+ "check condition, sense key = 0x%02x\n",
+ c->Request.CDB[0], c->err_info->SenseInfo[2]);
+ }
+ break;
+ default:
+ dev_warn(&h->pdev->dev, "cmd 0x%02x"
+ "scsi status = 0x%02x\n",
+ c->Request.CDB[0], c->err_info->ScsiStatus);
+ break;
+ }
+ return IO_ERROR;
+}
+
+static int process_sendcmd_error(ctlr_info_t *h, CommandList_struct *c)
+{
+ int return_status = IO_OK;
+
+ if (c->err_info->CommandStatus == CMD_SUCCESS)
+ return IO_OK;
+
+ switch (c->err_info->CommandStatus) {
+ case CMD_TARGET_STATUS:
+ return_status = check_target_status(h, c);
+ break;
+ case CMD_DATA_UNDERRUN:
+ case CMD_DATA_OVERRUN:
+ /* expected for inquiry and report lun commands */
+ break;
+ case CMD_INVALID:
+ dev_warn(&h->pdev->dev, "cmd 0x%02x is "
+ "reported invalid\n", c->Request.CDB[0]);
+ return_status = IO_ERROR;
+ break;
+ case CMD_PROTOCOL_ERR:
+ dev_warn(&h->pdev->dev, "cmd 0x%02x has "
+ "protocol error\n", c->Request.CDB[0]);
+ return_status = IO_ERROR;
+ break;
+ case CMD_HARDWARE_ERR:
+ dev_warn(&h->pdev->dev, "cmd 0x%02x had "
+ " hardware error\n", c->Request.CDB[0]);
+ return_status = IO_ERROR;
+ break;
+ case CMD_CONNECTION_LOST:
+ dev_warn(&h->pdev->dev, "cmd 0x%02x had "
+ "connection lost\n", c->Request.CDB[0]);
+ return_status = IO_ERROR;
+ break;
+ case CMD_ABORTED:
+ dev_warn(&h->pdev->dev, "cmd 0x%02x was "
+ "aborted\n", c->Request.CDB[0]);
+ return_status = IO_ERROR;
+ break;
+ case CMD_ABORT_FAILED:
+ dev_warn(&h->pdev->dev, "cmd 0x%02x reports "
+ "abort failed\n", c->Request.CDB[0]);
+ return_status = IO_ERROR;
+ break;
+ case CMD_UNSOLICITED_ABORT:
+ dev_warn(&h->pdev->dev, "unsolicited abort 0x%02x\n",
+ c->Request.CDB[0]);
+ return_status = IO_NEEDS_RETRY;
+ break;
+ case CMD_UNABORTABLE:
+ dev_warn(&h->pdev->dev, "cmd unabortable\n");
+ return_status = IO_ERROR;
+ break;
+ default:
+ dev_warn(&h->pdev->dev, "cmd 0x%02x returned "
+ "unknown status %x\n", c->Request.CDB[0],
+ c->err_info->CommandStatus);
+ return_status = IO_ERROR;
+ }
+ return return_status;
+}
+
+static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c,
+ int attempt_retry)
+{
+ DECLARE_COMPLETION_ONSTACK(wait);
+ u64bit buff_dma_handle;
+ int return_status = IO_OK;
+
+resend_cmd2:
+ c->waiting = &wait;
+ enqueue_cmd_and_start_io(h, c);
+
+ wait_for_completion(&wait);
+
+ if (c->err_info->CommandStatus == 0 || !attempt_retry)
+ goto command_done;
+
+ return_status = process_sendcmd_error(h, c);
+
+ if (return_status == IO_NEEDS_RETRY &&
+ c->retry_count < MAX_CMD_RETRIES) {
+ dev_warn(&h->pdev->dev, "retrying 0x%02x\n",
+ c->Request.CDB[0]);
+ c->retry_count++;
+ /* erase the old error information */
+ memset(c->err_info, 0, sizeof(ErrorInfo_struct));
+ return_status = IO_OK;
+ reinit_completion(&wait);
+ goto resend_cmd2;
+ }
+
+command_done:
+ /* unlock the buffers from DMA */
+ buff_dma_handle.val32.lower = c->SG[0].Addr.lower;
+ buff_dma_handle.val32.upper = c->SG[0].Addr.upper;
+ pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val,
+ c->SG[0].Len, PCI_DMA_BIDIRECTIONAL);
+ return return_status;
+}
+
+static int sendcmd_withirq(ctlr_info_t *h, __u8 cmd, void *buff, size_t size,
+ __u8 page_code, unsigned char scsi3addr[],
+ int cmd_type)
+{
+ CommandList_struct *c;
+ int return_status;
+
+ c = cmd_special_alloc(h);
+ if (!c)
+ return -ENOMEM;
+ return_status = fill_cmd(h, c, cmd, buff, size, page_code,
+ scsi3addr, cmd_type);
+ if (return_status == IO_OK)
+ return_status = sendcmd_withirq_core(h, c, 1);
+
+ cmd_special_free(h, c);
+ return return_status;
+}
+
+static void cciss_geometry_inquiry(ctlr_info_t *h, int logvol,
+ sector_t total_size,
+ unsigned int block_size,
+ InquiryData_struct *inq_buff,
+ drive_info_struct *drv)
+{
+ int return_code;
+ unsigned long t;
+ unsigned char scsi3addr[8];
+
+ memset(inq_buff, 0, sizeof(InquiryData_struct));
+ log_unit_to_scsi3addr(h, scsi3addr, logvol);
+ return_code = sendcmd_withirq(h, CISS_INQUIRY, inq_buff,
+ sizeof(*inq_buff), 0xC1, scsi3addr, TYPE_CMD);
+ if (return_code == IO_OK) {
+ if (inq_buff->data_byte[8] == 0xFF) {
+ dev_warn(&h->pdev->dev,
+ "reading geometry failed, volume "
+ "does not support reading geometry\n");
+ drv->heads = 255;
+ drv->sectors = 32; /* Sectors per track */
+ drv->cylinders = total_size + 1;
+ drv->raid_level = RAID_UNKNOWN;
+ } else {
+ drv->heads = inq_buff->data_byte[6];
+ drv->sectors = inq_buff->data_byte[7];
+ drv->cylinders = (inq_buff->data_byte[4] & 0xff) << 8;
+ drv->cylinders += inq_buff->data_byte[5];
+ drv->raid_level = inq_buff->data_byte[8];
+ }
+ drv->block_size = block_size;
+ drv->nr_blocks = total_size + 1;
+ t = drv->heads * drv->sectors;
+ if (t > 1) {
+ sector_t real_size = total_size + 1;
+ unsigned long rem = sector_div(real_size, t);
+ if (rem)
+ real_size++;
+ drv->cylinders = real_size;
+ }
+ } else { /* Get geometry failed */
+ dev_warn(&h->pdev->dev, "reading geometry failed\n");
+ }
+}
+
+static void
+cciss_read_capacity(ctlr_info_t *h, int logvol, sector_t *total_size,
+ unsigned int *block_size)
+{
+ ReadCapdata_struct *buf;
+ int return_code;
+ unsigned char scsi3addr[8];
+
+ buf = kzalloc(sizeof(ReadCapdata_struct), GFP_KERNEL);
+ if (!buf) {
+ dev_warn(&h->pdev->dev, "out of memory\n");
+ return;
+ }
+
+ log_unit_to_scsi3addr(h, scsi3addr, logvol);
+ return_code = sendcmd_withirq(h, CCISS_READ_CAPACITY, buf,
+ sizeof(ReadCapdata_struct), 0, scsi3addr, TYPE_CMD);
+ if (return_code == IO_OK) {
+ *total_size = be32_to_cpu(*(__be32 *) buf->total_size);
+ *block_size = be32_to_cpu(*(__be32 *) buf->block_size);
+ } else { /* read capacity command failed */
+ dev_warn(&h->pdev->dev, "read capacity failed\n");
+ *total_size = 0;
+ *block_size = BLOCK_SIZE;
+ }
+ kfree(buf);
+}
+
+static void cciss_read_capacity_16(ctlr_info_t *h, int logvol,
+ sector_t *total_size, unsigned int *block_size)
+{
+ ReadCapdata_struct_16 *buf;
+ int return_code;
+ unsigned char scsi3addr[8];
+
+ buf = kzalloc(sizeof(ReadCapdata_struct_16), GFP_KERNEL);
+ if (!buf) {
+ dev_warn(&h->pdev->dev, "out of memory\n");
+ return;
+ }
+
+ log_unit_to_scsi3addr(h, scsi3addr, logvol);
+ return_code = sendcmd_withirq(h, CCISS_READ_CAPACITY_16,
+ buf, sizeof(ReadCapdata_struct_16),
+ 0, scsi3addr, TYPE_CMD);
+ if (return_code == IO_OK) {
+ *total_size = be64_to_cpu(*(__be64 *) buf->total_size);
+ *block_size = be32_to_cpu(*(__be32 *) buf->block_size);
+ } else { /* read capacity command failed */
+ dev_warn(&h->pdev->dev, "read capacity failed\n");
+ *total_size = 0;
+ *block_size = BLOCK_SIZE;
+ }
+ dev_info(&h->pdev->dev, " blocks= %llu block_size= %d\n",
+ (unsigned long long)*total_size+1, *block_size);
+ kfree(buf);
+}
+
+static int cciss_revalidate(struct gendisk *disk)
+{
+ ctlr_info_t *h = get_host(disk);
+ drive_info_struct *drv = get_drv(disk);
+ int logvol;
+ int FOUND = 0;
+ unsigned int block_size;
+ sector_t total_size;
+ InquiryData_struct *inq_buff = NULL;
+
+ for (logvol = 0; logvol <= h->highest_lun; logvol++) {
+ if (!h->drv[logvol])
+ continue;
+ if (memcmp(h->drv[logvol]->LunID, drv->LunID,
+ sizeof(drv->LunID)) == 0) {
+ FOUND = 1;
+ break;
+ }
+ }
+
+ if (!FOUND)
+ return 1;
+
+ inq_buff = kmalloc(sizeof(InquiryData_struct), GFP_KERNEL);
+ if (inq_buff == NULL) {
+ dev_warn(&h->pdev->dev, "out of memory\n");
+ return 1;
+ }
+ if (h->cciss_read == CCISS_READ_10) {
+ cciss_read_capacity(h, logvol,
+ &total_size, &block_size);
+ } else {
+ cciss_read_capacity_16(h, logvol,
+ &total_size, &block_size);
+ }
+ cciss_geometry_inquiry(h, logvol, total_size, block_size,
+ inq_buff, drv);
+
+ blk_queue_logical_block_size(drv->queue, drv->block_size);
+ set_capacity(disk, drv->nr_blocks);
+
+ kfree(inq_buff);
+ return 0;
+}
+
+/*
+ * Map (physical) PCI mem into (virtual) kernel space
+ */
+static void __iomem *remap_pci_mem(ulong base, ulong size)
+{
+ ulong page_base = ((ulong) base) & PAGE_MASK;
+ ulong page_offs = ((ulong) base) - page_base;
+ void __iomem *page_remapped = ioremap(page_base, page_offs + size);
+
+ return page_remapped ? (page_remapped + page_offs) : NULL;
+}
+
+/*
+ * Takes jobs of the Q and sends them to the hardware, then puts it on
+ * the Q to wait for completion.
+ */
+static void start_io(ctlr_info_t *h)
+{
+ CommandList_struct *c;
+
+ while (!list_empty(&h->reqQ)) {
+ c = list_entry(h->reqQ.next, CommandList_struct, list);
+ /* can't do anything if fifo is full */
+ if ((h->access.fifo_full(h))) {
+ dev_warn(&h->pdev->dev, "fifo full\n");
+ break;
+ }
+
+ /* Get the first entry from the Request Q */
+ removeQ(c);
+ h->Qdepth--;
+
+ /* Tell the controller execute command */
+ h->access.submit_command(h, c);
+
+ /* Put job onto the completed Q */
+ addQ(&h->cmpQ, c);
+ }
+}
+
+/* Assumes that h->lock is held. */
+/* Zeros out the error record and then resends the command back */
+/* to the controller */
+static inline void resend_cciss_cmd(ctlr_info_t *h, CommandList_struct *c)
+{
+ /* erase the old error information */
+ memset(c->err_info, 0, sizeof(ErrorInfo_struct));
+
+ /* add it to software queue and then send it to the controller */
+ addQ(&h->reqQ, c);
+ h->Qdepth++;
+ if (h->Qdepth > h->maxQsinceinit)
+ h->maxQsinceinit = h->Qdepth;
+
+ start_io(h);
+}
+
+static inline unsigned int make_status_bytes(unsigned int scsi_status_byte,
+ unsigned int msg_byte, unsigned int host_byte,
+ unsigned int driver_byte)
+{
+ /* inverse of macros in scsi.h */
+ return (scsi_status_byte & 0xff) |
+ ((msg_byte & 0xff) << 8) |
+ ((host_byte & 0xff) << 16) |
+ ((driver_byte & 0xff) << 24);
+}
+
+static inline int evaluate_target_status(ctlr_info_t *h,
+ CommandList_struct *cmd, int *retry_cmd)
+{
+ unsigned char sense_key;
+ unsigned char status_byte, msg_byte, host_byte, driver_byte;
+ int error_value;
+
+ *retry_cmd = 0;
+ /* If we get in here, it means we got "target status", that is, scsi status */
+ status_byte = cmd->err_info->ScsiStatus;
+ driver_byte = DRIVER_OK;
+ msg_byte = cmd->err_info->CommandStatus; /* correct? seems too device specific */
+
+ if (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC)
+ host_byte = DID_PASSTHROUGH;
+ else
+ host_byte = DID_OK;
+
+ error_value = make_status_bytes(status_byte, msg_byte,
+ host_byte, driver_byte);
+
+ if (cmd->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION) {
+ if (cmd->rq->cmd_type != REQ_TYPE_BLOCK_PC)
+ dev_warn(&h->pdev->dev, "cmd %p "
+ "has SCSI Status 0x%x\n",
+ cmd, cmd->err_info->ScsiStatus);
+ return error_value;
+ }
+
+ /* check the sense key */
+ sense_key = 0xf & cmd->err_info->SenseInfo[2];
+ /* no status or recovered error */
+ if (((sense_key == 0x0) || (sense_key == 0x1)) &&
+ (cmd->rq->cmd_type != REQ_TYPE_BLOCK_PC))
+ error_value = 0;
+
+ if (check_for_unit_attention(h, cmd)) {
+ *retry_cmd = !(cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC);
+ return 0;
+ }
+
+ /* Not SG_IO or similar? */
+ if (cmd->rq->cmd_type != REQ_TYPE_BLOCK_PC) {
+ if (error_value != 0)
+ dev_warn(&h->pdev->dev, "cmd %p has CHECK CONDITION"
+ " sense key = 0x%x\n", cmd, sense_key);
+ return error_value;
+ }
+
+ /* SG_IO or similar, copy sense data back */
+ if (cmd->rq->sense) {
+ if (cmd->rq->sense_len > cmd->err_info->SenseLen)
+ cmd->rq->sense_len = cmd->err_info->SenseLen;
+ memcpy(cmd->rq->sense, cmd->err_info->SenseInfo,
+ cmd->rq->sense_len);
+ } else
+ cmd->rq->sense_len = 0;
+
+ return error_value;
+}
+
+/* checks the status of the job and calls complete buffers to mark all
+ * buffers for the completed job. Note that this function does not need
+ * to hold the hba/queue lock.
+ */
+static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
+ int timeout)
+{
+ int retry_cmd = 0;
+ struct request *rq = cmd->rq;
+
+ rq->errors = 0;
+
+ if (timeout)
+ rq->errors = make_status_bytes(0, 0, 0, DRIVER_TIMEOUT);
+
+ if (cmd->err_info->CommandStatus == 0) /* no error has occurred */
+ goto after_error_processing;
+
+ switch (cmd->err_info->CommandStatus) {
+ case CMD_TARGET_STATUS:
+ rq->errors = evaluate_target_status(h, cmd, &retry_cmd);
+ break;
+ case CMD_DATA_UNDERRUN:
+ if (cmd->rq->cmd_type == REQ_TYPE_FS) {
+ dev_warn(&h->pdev->dev, "cmd %p has"
+ " completed with data underrun "
+ "reported\n", cmd);
+ cmd->rq->resid_len = cmd->err_info->ResidualCnt;
+ }
+ break;
+ case CMD_DATA_OVERRUN:
+ if (cmd->rq->cmd_type == REQ_TYPE_FS)
+ dev_warn(&h->pdev->dev, "cciss: cmd %p has"
+ " completed with data overrun "
+ "reported\n", cmd);
+ break;
+ case CMD_INVALID:
+ dev_warn(&h->pdev->dev, "cciss: cmd %p is "
+ "reported invalid\n", cmd);
+ rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ cmd->err_info->CommandStatus, DRIVER_OK,
+ (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+ DID_PASSTHROUGH : DID_ERROR);
+ break;
+ case CMD_PROTOCOL_ERR:
+ dev_warn(&h->pdev->dev, "cciss: cmd %p has "
+ "protocol error\n", cmd);
+ rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ cmd->err_info->CommandStatus, DRIVER_OK,
+ (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+ DID_PASSTHROUGH : DID_ERROR);
+ break;
+ case CMD_HARDWARE_ERR:
+ dev_warn(&h->pdev->dev, "cciss: cmd %p had "
+ " hardware error\n", cmd);
+ rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ cmd->err_info->CommandStatus, DRIVER_OK,
+ (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+ DID_PASSTHROUGH : DID_ERROR);
+ break;
+ case CMD_CONNECTION_LOST:
+ dev_warn(&h->pdev->dev, "cciss: cmd %p had "
+ "connection lost\n", cmd);
+ rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ cmd->err_info->CommandStatus, DRIVER_OK,
+ (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+ DID_PASSTHROUGH : DID_ERROR);
+ break;
+ case CMD_ABORTED:
+ dev_warn(&h->pdev->dev, "cciss: cmd %p was "
+ "aborted\n", cmd);
+ rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ cmd->err_info->CommandStatus, DRIVER_OK,
+ (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+ DID_PASSTHROUGH : DID_ABORT);
+ break;
+ case CMD_ABORT_FAILED:
+ dev_warn(&h->pdev->dev, "cciss: cmd %p reports "
+ "abort failed\n", cmd);
+ rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ cmd->err_info->CommandStatus, DRIVER_OK,
+ (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+ DID_PASSTHROUGH : DID_ERROR);
+ break;
+ case CMD_UNSOLICITED_ABORT:
+ dev_warn(&h->pdev->dev, "cciss%d: unsolicited "
+ "abort %p\n", h->ctlr, cmd);
+ if (cmd->retry_count < MAX_CMD_RETRIES) {
+ retry_cmd = 1;
+ dev_warn(&h->pdev->dev, "retrying %p\n", cmd);
+ cmd->retry_count++;
+ } else
+ dev_warn(&h->pdev->dev,
+ "%p retried too many times\n", cmd);
+ rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ cmd->err_info->CommandStatus, DRIVER_OK,
+ (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+ DID_PASSTHROUGH : DID_ABORT);
+ break;
+ case CMD_TIMEOUT:
+ dev_warn(&h->pdev->dev, "cmd %p timedout\n", cmd);
+ rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ cmd->err_info->CommandStatus, DRIVER_OK,
+ (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+ DID_PASSTHROUGH : DID_ERROR);
+ break;
+ case CMD_UNABORTABLE:
+ dev_warn(&h->pdev->dev, "cmd %p unabortable\n", cmd);
+ rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ cmd->err_info->CommandStatus, DRIVER_OK,
+ cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC ?
+ DID_PASSTHROUGH : DID_ERROR);
+ break;
+ default:
+ dev_warn(&h->pdev->dev, "cmd %p returned "
+ "unknown status %x\n", cmd,
+ cmd->err_info->CommandStatus);
+ rq->errors = make_status_bytes(SAM_STAT_GOOD,
+ cmd->err_info->CommandStatus, DRIVER_OK,
+ (cmd->rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
+ DID_PASSTHROUGH : DID_ERROR);
+ }
+
+after_error_processing:
+
+ /* We need to return this command */
+ if (retry_cmd) {
+ resend_cciss_cmd(h, cmd);
+ return;
+ }
+ cmd->rq->completion_data = cmd;
+ blk_complete_request(cmd->rq);
+}
+
+static inline u32 cciss_tag_contains_index(u32 tag)
+{
+#define DIRECT_LOOKUP_BIT 0x10
+ return tag & DIRECT_LOOKUP_BIT;
+}
+
+static inline u32 cciss_tag_to_index(u32 tag)
+{
+#define DIRECT_LOOKUP_SHIFT 5
+ return tag >> DIRECT_LOOKUP_SHIFT;
+}
+
+static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag)
+{
+#define CCISS_PERF_ERROR_BITS ((1 << DIRECT_LOOKUP_SHIFT) - 1)
+#define CCISS_SIMPLE_ERROR_BITS 0x03
+ if (likely(h->transMethod & CFGTBL_Trans_Performant))
+ return tag & ~CCISS_PERF_ERROR_BITS;
+ return tag & ~CCISS_SIMPLE_ERROR_BITS;
+}
+
+static inline void cciss_mark_tag_indexed(u32 *tag)
+{
+ *tag |= DIRECT_LOOKUP_BIT;
+}
+
+static inline void cciss_set_tag_index(u32 *tag, u32 index)
+{
+ *tag |= (index << DIRECT_LOOKUP_SHIFT);
+}
+
+/*
+ * Get a request and submit it to the controller.
+ */
+static void do_cciss_request(struct request_queue *q)
+{
+ ctlr_info_t *h = q->queuedata;
+ CommandList_struct *c;
+ sector_t start_blk;
+ int seg;
+ struct request *creq;
+ u64bit temp64;
+ struct scatterlist *tmp_sg;
+ SGDescriptor_struct *curr_sg;
+ drive_info_struct *drv;
+ int i, dir;
+ int sg_index = 0;
+ int chained = 0;
+
+ queue:
+ creq = blk_peek_request(q);
+ if (!creq)
+ goto startio;
+
+ BUG_ON(creq->nr_phys_segments > h->maxsgentries);
+
+ c = cmd_alloc(h);
+ if (!c)
+ goto full;
+
+ blk_start_request(creq);
+
+ tmp_sg = h->scatter_list[c->cmdindex];
+ spin_unlock_irq(q->queue_lock);
+
+ c->cmd_type = CMD_RWREQ;
+ c->rq = creq;
+
+ /* fill in the request */
+ drv = creq->rq_disk->private_data;
+ c->Header.ReplyQueue = 0; /* unused in simple mode */
+ /* got command from pool, so use the command block index instead */
+ /* for direct lookups. */
+ /* The first 2 bits are reserved for controller error reporting. */
+ cciss_set_tag_index(&c->Header.Tag.lower, c->cmdindex);
+ cciss_mark_tag_indexed(&c->Header.Tag.lower);
+ memcpy(&c->Header.LUN, drv->LunID, sizeof(drv->LunID));
+ c->Request.CDBLen = 10; /* 12 byte commands not in FW yet; */
+ c->Request.Type.Type = TYPE_CMD; /* It is a command. */
+ c->Request.Type.Attribute = ATTR_SIMPLE;
+ c->Request.Type.Direction =
+ (rq_data_dir(creq) == READ) ? XFER_READ : XFER_WRITE;
+ c->Request.Timeout = 0; /* Don't time out */
+ c->Request.CDB[0] =
+ (rq_data_dir(creq) == READ) ? h->cciss_read : h->cciss_write;
+ start_blk = blk_rq_pos(creq);
+ dev_dbg(&h->pdev->dev, "sector =%d nr_sectors=%d\n",
+ (int)blk_rq_pos(creq), (int)blk_rq_sectors(creq));
+ sg_init_table(tmp_sg, h->maxsgentries);
+ seg = blk_rq_map_sg(q, creq, tmp_sg);
+
+ /* get the DMA records for the setup */
+ if (c->Request.Type.Direction == XFER_READ)
+ dir = PCI_DMA_FROMDEVICE;
+ else
+ dir = PCI_DMA_TODEVICE;
+
+ curr_sg = c->SG;
+ sg_index = 0;
+ chained = 0;
+
+ for (i = 0; i < seg; i++) {
+ if (((sg_index+1) == (h->max_cmd_sgentries)) &&
+ !chained && ((seg - i) > 1)) {
+ /* Point to next chain block. */
+ curr_sg = h->cmd_sg_list[c->cmdindex];
+ sg_index = 0;
+ chained = 1;
+ }
+ curr_sg[sg_index].Len = tmp_sg[i].length;
+ temp64.val = (__u64) pci_map_page(h->pdev, sg_page(&tmp_sg[i]),
+ tmp_sg[i].offset,
+ tmp_sg[i].length, dir);
+ curr_sg[sg_index].Addr.lower = temp64.val32.lower;
+ curr_sg[sg_index].Addr.upper = temp64.val32.upper;
+ curr_sg[sg_index].Ext = 0; /* we are not chaining */
+ ++sg_index;
+ }
+ if (chained)
+ cciss_map_sg_chain_block(h, c, h->cmd_sg_list[c->cmdindex],
+ (seg - (h->max_cmd_sgentries - 1)) *
+ sizeof(SGDescriptor_struct));
+
+ /* track how many SG entries we are using */
+ if (seg > h->maxSG)
+ h->maxSG = seg;
+
+ dev_dbg(&h->pdev->dev, "Submitting %u sectors in %d segments "
+ "chained[%d]\n",
+ blk_rq_sectors(creq), seg, chained);
+
+ c->Header.SGTotal = seg + chained;
+ if (seg <= h->max_cmd_sgentries)
+ c->Header.SGList = c->Header.SGTotal;
+ else
+ c->Header.SGList = h->max_cmd_sgentries;
+ set_performant_mode(h, c);
+
+ if (likely(creq->cmd_type == REQ_TYPE_FS)) {
+ if(h->cciss_read == CCISS_READ_10) {
+ c->Request.CDB[1] = 0;
+ c->Request.CDB[2] = (start_blk >> 24) & 0xff; /* MSB */
+ c->Request.CDB[3] = (start_blk >> 16) & 0xff;
+ c->Request.CDB[4] = (start_blk >> 8) & 0xff;
+ c->Request.CDB[5] = start_blk & 0xff;
+ c->Request.CDB[6] = 0; /* (sect >> 24) & 0xff; MSB */
+ c->Request.CDB[7] = (blk_rq_sectors(creq) >> 8) & 0xff;
+ c->Request.CDB[8] = blk_rq_sectors(creq) & 0xff;
+ c->Request.CDB[9] = c->Request.CDB[11] = c->Request.CDB[12] = 0;
+ } else {
+ u32 upper32 = upper_32_bits(start_blk);
+
+ c->Request.CDBLen = 16;
+ c->Request.CDB[1]= 0;
+ c->Request.CDB[2]= (upper32 >> 24) & 0xff; /* MSB */
+ c->Request.CDB[3]= (upper32 >> 16) & 0xff;
+ c->Request.CDB[4]= (upper32 >> 8) & 0xff;
+ c->Request.CDB[5]= upper32 & 0xff;
+ c->Request.CDB[6]= (start_blk >> 24) & 0xff;
+ c->Request.CDB[7]= (start_blk >> 16) & 0xff;
+ c->Request.CDB[8]= (start_blk >> 8) & 0xff;
+ c->Request.CDB[9]= start_blk & 0xff;
+ c->Request.CDB[10]= (blk_rq_sectors(creq) >> 24) & 0xff;
+ c->Request.CDB[11]= (blk_rq_sectors(creq) >> 16) & 0xff;
+ c->Request.CDB[12]= (blk_rq_sectors(creq) >> 8) & 0xff;
+ c->Request.CDB[13]= blk_rq_sectors(creq) & 0xff;
+ c->Request.CDB[14] = c->Request.CDB[15] = 0;
+ }
+ } else if (creq->cmd_type == REQ_TYPE_BLOCK_PC) {
+ c->Request.CDBLen = creq->cmd_len;
+ memcpy(c->Request.CDB, creq->cmd, BLK_MAX_CDB);
+ } else {
+ dev_warn(&h->pdev->dev, "bad request type %d\n",
+ creq->cmd_type);
+ BUG();
+ }
+
+ spin_lock_irq(q->queue_lock);
+
+ addQ(&h->reqQ, c);
+ h->Qdepth++;
+ if (h->Qdepth > h->maxQsinceinit)
+ h->maxQsinceinit = h->Qdepth;
+
+ goto queue;
+full:
+ blk_stop_queue(q);
+startio:
+ /* We will already have the driver lock here so not need
+ * to lock it.
+ */
+ start_io(h);
+}
+
+static inline unsigned long get_next_completion(ctlr_info_t *h)
+{
+ return h->access.command_completed(h);
+}
+
+static inline int interrupt_pending(ctlr_info_t *h)
+{
+ return h->access.intr_pending(h);
+}
+
+static inline long interrupt_not_for_us(ctlr_info_t *h)
+{
+ return ((h->access.intr_pending(h) == 0) ||
+ (h->interrupts_enabled == 0));
+}
+
+static inline int bad_tag(ctlr_info_t *h, u32 tag_index,
+ u32 raw_tag)
+{
+ if (unlikely(tag_index >= h->nr_cmds)) {
+ dev_warn(&h->pdev->dev, "bad tag 0x%08x ignored.\n", raw_tag);
+ return 1;
+ }
+ return 0;
+}
+
+static inline void finish_cmd(ctlr_info_t *h, CommandList_struct *c,
+ u32 raw_tag)
+{
+ removeQ(c);
+ if (likely(c->cmd_type == CMD_RWREQ))
+ complete_command(h, c, 0);
+ else if (c->cmd_type == CMD_IOCTL_PEND)
+ complete(c->waiting);
+#ifdef CONFIG_CISS_SCSI_TAPE
+ else if (c->cmd_type == CMD_SCSI)
+ complete_scsi_command(c, 0, raw_tag);
+#endif
+}
+
+static inline u32 next_command(ctlr_info_t *h)
+{
+ u32 a;
+
+ if (unlikely(!(h->transMethod & CFGTBL_Trans_Performant)))
+ return h->access.command_completed(h);
+
+ if ((*(h->reply_pool_head) & 1) == (h->reply_pool_wraparound)) {
+ a = *(h->reply_pool_head); /* Next cmd in ring buffer */
+ (h->reply_pool_head)++;
+ h->commands_outstanding--;
+ } else {
+ a = FIFO_EMPTY;
+ }
+ /* Check for wraparound */
+ if (h->reply_pool_head == (h->reply_pool + h->max_commands)) {
+ h->reply_pool_head = h->reply_pool;
+ h->reply_pool_wraparound ^= 1;
+ }
+ return a;
+}
+
+/* process completion of an indexed ("direct lookup") command */
+static inline u32 process_indexed_cmd(ctlr_info_t *h, u32 raw_tag)
+{
+ u32 tag_index;
+ CommandList_struct *c;
+
+ tag_index = cciss_tag_to_index(raw_tag);
+ if (bad_tag(h, tag_index, raw_tag))
+ return next_command(h);
+ c = h->cmd_pool + tag_index;
+ finish_cmd(h, c, raw_tag);
+ return next_command(h);
+}
+
+/* process completion of a non-indexed command */
+static inline u32 process_nonindexed_cmd(ctlr_info_t *h, u32 raw_tag)
+{
+ CommandList_struct *c = NULL;
+ __u32 busaddr_masked, tag_masked;
+
+ tag_masked = cciss_tag_discard_error_bits(h, raw_tag);
+ list_for_each_entry(c, &h->cmpQ, list) {
+ busaddr_masked = cciss_tag_discard_error_bits(h, c->busaddr);
+ if (busaddr_masked == tag_masked) {
+ finish_cmd(h, c, raw_tag);
+ return next_command(h);
+ }
+ }
+ bad_tag(h, h->nr_cmds + 1, raw_tag);
+ return next_command(h);
+}
+
+/* Some controllers, like p400, will give us one interrupt
+ * after a soft reset, even if we turned interrupts off.
+ * Only need to check for this in the cciss_xxx_discard_completions
+ * functions.
+ */
+static int ignore_bogus_interrupt(ctlr_info_t *h)
+{
+ if (likely(!reset_devices))
+ return 0;
+
+ if (likely(h->interrupts_enabled))
+ return 0;
+
+ dev_info(&h->pdev->dev, "Received interrupt while interrupts disabled "
+ "(known firmware bug.) Ignoring.\n");
+
+ return 1;
+}
+
+static irqreturn_t cciss_intx_discard_completions(int irq, void *dev_id)
+{
+ ctlr_info_t *h = dev_id;
+ unsigned long flags;
+ u32 raw_tag;
+
+ if (ignore_bogus_interrupt(h))
+ return IRQ_NONE;
+
+ if (interrupt_not_for_us(h))
+ return IRQ_NONE;
+ spin_lock_irqsave(&h->lock, flags);
+ while (interrupt_pending(h)) {
+ raw_tag = get_next_completion(h);
+ while (raw_tag != FIFO_EMPTY)
+ raw_tag = next_command(h);
+ }
+ spin_unlock_irqrestore(&h->lock, flags);
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t cciss_msix_discard_completions(int irq, void *dev_id)
+{
+ ctlr_info_t *h = dev_id;
+ unsigned long flags;
+ u32 raw_tag;
+
+ if (ignore_bogus_interrupt(h))
+ return IRQ_NONE;
+
+ spin_lock_irqsave(&h->lock, flags);
+ raw_tag = get_next_completion(h);
+ while (raw_tag != FIFO_EMPTY)
+ raw_tag = next_command(h);
+ spin_unlock_irqrestore(&h->lock, flags);
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t do_cciss_intx(int irq, void *dev_id)
+{
+ ctlr_info_t *h = dev_id;
+ unsigned long flags;
+ u32 raw_tag;
+
+ if (interrupt_not_for_us(h))
+ return IRQ_NONE;
+ spin_lock_irqsave(&h->lock, flags);
+ while (interrupt_pending(h)) {
+ raw_tag = get_next_completion(h);
+ while (raw_tag != FIFO_EMPTY) {
+ if (cciss_tag_contains_index(raw_tag))
+ raw_tag = process_indexed_cmd(h, raw_tag);
+ else
+ raw_tag = process_nonindexed_cmd(h, raw_tag);
+ }
+ }
+ spin_unlock_irqrestore(&h->lock, flags);
+ return IRQ_HANDLED;
+}
+
+/* Add a second interrupt handler for MSI/MSI-X mode. In this mode we never
+ * check the interrupt pending register because it is not set.
+ */
+static irqreturn_t do_cciss_msix_intr(int irq, void *dev_id)
+{
+ ctlr_info_t *h = dev_id;
+ unsigned long flags;
+ u32 raw_tag;
+
+ spin_lock_irqsave(&h->lock, flags);
+ raw_tag = get_next_completion(h);
+ while (raw_tag != FIFO_EMPTY) {
+ if (cciss_tag_contains_index(raw_tag))
+ raw_tag = process_indexed_cmd(h, raw_tag);
+ else
+ raw_tag = process_nonindexed_cmd(h, raw_tag);
+ }
+ spin_unlock_irqrestore(&h->lock, flags);
+ return IRQ_HANDLED;
+}
+
+/**
+ * add_to_scan_list() - add controller to rescan queue
+ * @h: Pointer to the controller.
+ *
+ * Adds the controller to the rescan queue if not already on the queue.
+ *
+ * returns 1 if added to the queue, 0 if skipped (could be on the
+ * queue already, or the controller could be initializing or shutting
+ * down).
+ **/
+static int add_to_scan_list(struct ctlr_info *h)
+{
+ struct ctlr_info *test_h;
+ int found = 0;
+ int ret = 0;
+
+ if (h->busy_initializing)
+ return 0;
+
+ if (!mutex_trylock(&h->busy_shutting_down))
+ return 0;
+
+ mutex_lock(&scan_mutex);
+ list_for_each_entry(test_h, &scan_q, scan_list) {
+ if (test_h == h) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found && !h->busy_scanning) {
+ reinit_completion(&h->scan_wait);
+ list_add_tail(&h->scan_list, &scan_q);
+ ret = 1;
+ }
+ mutex_unlock(&scan_mutex);
+ mutex_unlock(&h->busy_shutting_down);
+
+ return ret;
+}
+
+/**
+ * remove_from_scan_list() - remove controller from rescan queue
+ * @h: Pointer to the controller.
+ *
+ * Removes the controller from the rescan queue if present. Blocks if
+ * the controller is currently conducting a rescan. The controller
+ * can be in one of three states:
+ * 1. Doesn't need a scan
+ * 2. On the scan list, but not scanning yet (we remove it)
+ * 3. Busy scanning (and not on the list). In this case we want to wait for
+ * the scan to complete to make sure the scanning thread for this
+ * controller is completely idle.
+ **/
+static void remove_from_scan_list(struct ctlr_info *h)
+{
+ struct ctlr_info *test_h, *tmp_h;
+
+ mutex_lock(&scan_mutex);
+ list_for_each_entry_safe(test_h, tmp_h, &scan_q, scan_list) {
+ if (test_h == h) { /* state 2. */
+ list_del(&h->scan_list);
+ complete_all(&h->scan_wait);
+ mutex_unlock(&scan_mutex);
+ return;
+ }
+ }
+ if (h->busy_scanning) { /* state 3. */
+ mutex_unlock(&scan_mutex);
+ wait_for_completion(&h->scan_wait);
+ } else { /* state 1, nothing to do. */
+ mutex_unlock(&scan_mutex);
+ }
+}
+
+/**
+ * scan_thread() - kernel thread used to rescan controllers
+ * @data: Ignored.
+ *
+ * A kernel thread used scan for drive topology changes on
+ * controllers. The thread processes only one controller at a time
+ * using a queue. Controllers are added to the queue using
+ * add_to_scan_list() and removed from the queue either after done
+ * processing or using remove_from_scan_list().
+ *
+ * returns 0.
+ **/
+static int scan_thread(void *data)
+{
+ struct ctlr_info *h;
+
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ if (kthread_should_stop())
+ break;
+
+ while (1) {
+ mutex_lock(&scan_mutex);
+ if (list_empty(&scan_q)) {
+ mutex_unlock(&scan_mutex);
+ break;
+ }
+
+ h = list_entry(scan_q.next,
+ struct ctlr_info,
+ scan_list);
+ list_del(&h->scan_list);
+ h->busy_scanning = 1;
+ mutex_unlock(&scan_mutex);
+
+ rebuild_lun_table(h, 0, 0);
+ complete_all(&h->scan_wait);
+ mutex_lock(&scan_mutex);
+ h->busy_scanning = 0;
+ mutex_unlock(&scan_mutex);
+ }
+ }
+
+ return 0;
+}
+
+static int check_for_unit_attention(ctlr_info_t *h, CommandList_struct *c)
+{
+ if (c->err_info->SenseInfo[2] != UNIT_ATTENTION)
+ return 0;
+
+ switch (c->err_info->SenseInfo[12]) {
+ case STATE_CHANGED:
+ dev_warn(&h->pdev->dev, "a state change "
+ "detected, command retried\n");
+ return 1;
+ break;
+ case LUN_FAILED:
+ dev_warn(&h->pdev->dev, "LUN failure "
+ "detected, action required\n");
+ return 1;
+ break;
+ case REPORT_LUNS_CHANGED:
+ dev_warn(&h->pdev->dev, "report LUN data changed\n");
+ /*
+ * Here, we could call add_to_scan_list and wake up the scan thread,
+ * except that it's quite likely that we will get more than one
+ * REPORT_LUNS_CHANGED condition in quick succession, which means
+ * that those which occur after the first one will likely happen
+ * *during* the scan_thread's rescan. And the rescan code is not
+ * robust enough to restart in the middle, undoing what it has already
+ * done, and it's not clear that it's even possible to do this, since
+ * part of what it does is notify the block layer, which starts
+ * doing it's own i/o to read partition tables and so on, and the
+ * driver doesn't have visibility to know what might need undoing.
+ * In any event, if possible, it is horribly complicated to get right
+ * so we just don't do it for now.
+ *
+ * Note: this REPORT_LUNS_CHANGED condition only occurs on the MSA2012.
+ */
+ return 1;
+ break;
+ case POWER_OR_RESET:
+ dev_warn(&h->pdev->dev,
+ "a power on or device reset detected\n");
+ return 1;
+ break;
+ case UNIT_ATTENTION_CLEARED:
+ dev_warn(&h->pdev->dev,
+ "unit attention cleared by another initiator\n");
+ return 1;
+ break;
+ default:
+ dev_warn(&h->pdev->dev, "unknown unit attention detected\n");
+ return 1;
+ }
+}
+
+/*
+ * We cannot read the structure directly, for portability we must use
+ * the io functions.
+ * This is for debug only.
+ */
+static void print_cfg_table(ctlr_info_t *h)
+{
+ int i;
+ char temp_name[17];
+ CfgTable_struct *tb = h->cfgtable;
+
+ dev_dbg(&h->pdev->dev, "Controller Configuration information\n");
+ dev_dbg(&h->pdev->dev, "------------------------------------\n");
+ for (i = 0; i < 4; i++)
+ temp_name[i] = readb(&(tb->Signature[i]));
+ temp_name[4] = '\0';
+ dev_dbg(&h->pdev->dev, " Signature = %s\n", temp_name);
+ dev_dbg(&h->pdev->dev, " Spec Number = %d\n",
+ readl(&(tb->SpecValence)));
+ dev_dbg(&h->pdev->dev, " Transport methods supported = 0x%x\n",
+ readl(&(tb->TransportSupport)));
+ dev_dbg(&h->pdev->dev, " Transport methods active = 0x%x\n",
+ readl(&(tb->TransportActive)));
+ dev_dbg(&h->pdev->dev, " Requested transport Method = 0x%x\n",
+ readl(&(tb->HostWrite.TransportRequest)));
+ dev_dbg(&h->pdev->dev, " Coalesce Interrupt Delay = 0x%x\n",
+ readl(&(tb->HostWrite.CoalIntDelay)));
+ dev_dbg(&h->pdev->dev, " Coalesce Interrupt Count = 0x%x\n",
+ readl(&(tb->HostWrite.CoalIntCount)));
+ dev_dbg(&h->pdev->dev, " Max outstanding commands = 0x%d\n",
+ readl(&(tb->CmdsOutMax)));
+ dev_dbg(&h->pdev->dev, " Bus Types = 0x%x\n",
+ readl(&(tb->BusTypes)));
+ for (i = 0; i < 16; i++)
+ temp_name[i] = readb(&(tb->ServerName[i]));
+ temp_name[16] = '\0';
+ dev_dbg(&h->pdev->dev, " Server Name = %s\n", temp_name);
+ dev_dbg(&h->pdev->dev, " Heartbeat Counter = 0x%x\n\n\n",
+ readl(&(tb->HeartBeat)));
+}
+
+static int find_PCI_BAR_index(struct pci_dev *pdev, unsigned long pci_bar_addr)
+{
+ int i, offset, mem_type, bar_type;
+ if (pci_bar_addr == PCI_BASE_ADDRESS_0) /* looking for BAR zero? */
+ return 0;
+ offset = 0;
+ for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+ bar_type = pci_resource_flags(pdev, i) & PCI_BASE_ADDRESS_SPACE;
+ if (bar_type == PCI_BASE_ADDRESS_SPACE_IO)
+ offset += 4;
+ else {
+ mem_type = pci_resource_flags(pdev, i) &
+ PCI_BASE_ADDRESS_MEM_TYPE_MASK;
+ switch (mem_type) {
+ case PCI_BASE_ADDRESS_MEM_TYPE_32:
+ case PCI_BASE_ADDRESS_MEM_TYPE_1M:
+ offset += 4; /* 32 bit */
+ break;
+ case PCI_BASE_ADDRESS_MEM_TYPE_64:
+ offset += 8;
+ break;
+ default: /* reserved in PCI 2.2 */
+ dev_warn(&pdev->dev,
+ "Base address is invalid\n");
+ return -1;
+ break;
+ }
+ }
+ if (offset == pci_bar_addr - PCI_BASE_ADDRESS_0)
+ return i + 1;
+ }
+ return -1;
+}
+
+/* Fill in bucket_map[], given nsgs (the max number of
+ * scatter gather elements supported) and bucket[],
+ * which is an array of 8 integers. The bucket[] array
+ * contains 8 different DMA transfer sizes (in 16
+ * byte increments) which the controller uses to fetch
+ * commands. This function fills in bucket_map[], which
+ * maps a given number of scatter gather elements to one of
+ * the 8 DMA transfer sizes. The point of it is to allow the
+ * controller to only do as much DMA as needed to fetch the
+ * command, with the DMA transfer size encoded in the lower
+ * bits of the command address.
+ */
+static void calc_bucket_map(int bucket[], int num_buckets,
+ int nsgs, int *bucket_map)
+{
+ int i, j, b, size;
+
+ /* even a command with 0 SGs requires 4 blocks */
+#define MINIMUM_TRANSFER_BLOCKS 4
+#define NUM_BUCKETS 8
+ /* Note, bucket_map must have nsgs+1 entries. */
+ for (i = 0; i <= nsgs; i++) {
+ /* Compute size of a command with i SG entries */
+ size = i + MINIMUM_TRANSFER_BLOCKS;
+ b = num_buckets; /* Assume the biggest bucket */
+ /* Find the bucket that is just big enough */
+ for (j = 0; j < 8; j++) {
+ if (bucket[j] >= size) {
+ b = j;
+ break;
+ }
+ }
+ /* for a command with i SG entries, use bucket b. */
+ bucket_map[i] = b;
+ }
+}
+
+static void cciss_wait_for_mode_change_ack(ctlr_info_t *h)
+{
+ int i;
+
+ /* under certain very rare conditions, this can take awhile.
+ * (e.g.: hot replace a failed 144GB drive in a RAID 5 set right
+ * as we enter this code.) */
+ for (i = 0; i < MAX_CONFIG_WAIT; i++) {
+ if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
+ break;
+ usleep_range(10000, 20000);
+ }
+}
+
+static void cciss_enter_performant_mode(ctlr_info_t *h, u32 use_short_tags)
+{
+ /* This is a bit complicated. There are 8 registers on
+ * the controller which we write to to tell it 8 different
+ * sizes of commands which there may be. It's a way of
+ * reducing the DMA done to fetch each command. Encoded into
+ * each command's tag are 3 bits which communicate to the controller
+ * which of the eight sizes that command fits within. The size of
+ * each command depends on how many scatter gather entries there are.
+ * Each SG entry requires 16 bytes. The eight registers are programmed
+ * with the number of 16-byte blocks a command of that size requires.
+ * The smallest command possible requires 5 such 16 byte blocks.
+ * the largest command possible requires MAXSGENTRIES + 4 16-byte
+ * blocks. Note, this only extends to the SG entries contained
+ * within the command block, and does not extend to chained blocks
+ * of SG elements. bft[] contains the eight values we write to
+ * the registers. They are not evenly distributed, but have more
+ * sizes for small commands, and fewer sizes for larger commands.
+ */
+ __u32 trans_offset;
+ int bft[8] = { 5, 6, 8, 10, 12, 20, 28, MAXSGENTRIES + 4};
+ /*
+ * 5 = 1 s/g entry or 4k
+ * 6 = 2 s/g entry or 8k
+ * 8 = 4 s/g entry or 16k
+ * 10 = 6 s/g entry or 24k
+ */
+ unsigned long register_value;
+ BUILD_BUG_ON(28 > MAXSGENTRIES + 4);
+
+ h->reply_pool_wraparound = 1; /* spec: init to 1 */
+
+ /* Controller spec: zero out this buffer. */
+ memset(h->reply_pool, 0, h->max_commands * sizeof(__u64));
+ h->reply_pool_head = h->reply_pool;
+
+ trans_offset = readl(&(h->cfgtable->TransMethodOffset));
+ calc_bucket_map(bft, ARRAY_SIZE(bft), h->maxsgentries,
+ h->blockFetchTable);
+ writel(bft[0], &h->transtable->BlockFetch0);
+ writel(bft[1], &h->transtable->BlockFetch1);
+ writel(bft[2], &h->transtable->BlockFetch2);
+ writel(bft[3], &h->transtable->BlockFetch3);
+ writel(bft[4], &h->transtable->BlockFetch4);
+ writel(bft[5], &h->transtable->BlockFetch5);
+ writel(bft[6], &h->transtable->BlockFetch6);
+ writel(bft[7], &h->transtable->BlockFetch7);
+
+ /* size of controller ring buffer */
+ writel(h->max_commands, &h->transtable->RepQSize);
+ writel(1, &h->transtable->RepQCount);
+ writel(0, &h->transtable->RepQCtrAddrLow32);
+ writel(0, &h->transtable->RepQCtrAddrHigh32);
+ writel(h->reply_pool_dhandle, &h->transtable->RepQAddr0Low32);
+ writel(0, &h->transtable->RepQAddr0High32);
+ writel(CFGTBL_Trans_Performant | use_short_tags,
+ &(h->cfgtable->HostWrite.TransportRequest));
+
+ writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
+ cciss_wait_for_mode_change_ack(h);
+ register_value = readl(&(h->cfgtable->TransportActive));
+ if (!(register_value & CFGTBL_Trans_Performant))
+ dev_warn(&h->pdev->dev, "cciss: unable to get board into"
+ " performant mode\n");
+}
+
+static void cciss_put_controller_into_performant_mode(ctlr_info_t *h)
+{
+ __u32 trans_support;
+
+ if (cciss_simple_mode)
+ return;
+
+ dev_dbg(&h->pdev->dev, "Trying to put board into Performant mode\n");
+ /* Attempt to put controller into performant mode if supported */
+ /* Does board support performant mode? */
+ trans_support = readl(&(h->cfgtable->TransportSupport));
+ if (!(trans_support & PERFORMANT_MODE))
+ return;
+
+ dev_dbg(&h->pdev->dev, "Placing controller into performant mode\n");
+ /* Performant mode demands commands on a 32 byte boundary
+ * pci_alloc_consistent aligns on page boundarys already.
+ * Just need to check if divisible by 32
+ */
+ if ((sizeof(CommandList_struct) % 32) != 0) {
+ dev_warn(&h->pdev->dev, "%s %d %s\n",
+ "cciss info: command size[",
+ (int)sizeof(CommandList_struct),
+ "] not divisible by 32, no performant mode..\n");
+ return;
+ }
+
+ /* Performant mode ring buffer and supporting data structures */
+ h->reply_pool = (__u64 *)pci_alloc_consistent(
+ h->pdev, h->max_commands * sizeof(__u64),
+ &(h->reply_pool_dhandle));
+
+ /* Need a block fetch table for performant mode */
+ h->blockFetchTable = kmalloc(((h->maxsgentries+1) *
+ sizeof(__u32)), GFP_KERNEL);
+
+ if ((h->reply_pool == NULL) || (h->blockFetchTable == NULL))
+ goto clean_up;
+
+ cciss_enter_performant_mode(h,
+ trans_support & CFGTBL_Trans_use_short_tags);
+
+ /* Change the access methods to the performant access methods */
+ h->access = SA5_performant_access;
+ h->transMethod = CFGTBL_Trans_Performant;
+
+ return;
+clean_up:
+ kfree(h->blockFetchTable);
+ if (h->reply_pool)
+ pci_free_consistent(h->pdev,
+ h->max_commands * sizeof(__u64),
+ h->reply_pool,
+ h->reply_pool_dhandle);
+ return;
+
+} /* cciss_put_controller_into_performant_mode */
+
+/* If MSI/MSI-X is supported by the kernel we will try to enable it on
+ * controllers that are capable. If not, we use IO-APIC mode.
+ */
+
+static void cciss_interrupt_mode(ctlr_info_t *h)
+{
+#ifdef CONFIG_PCI_MSI
+ int err;
+ struct msix_entry cciss_msix_entries[4] = { {0, 0}, {0, 1},
+ {0, 2}, {0, 3}
+ };
+
+ /* Some boards advertise MSI but don't really support it */
+ if ((h->board_id == 0x40700E11) || (h->board_id == 0x40800E11) ||
+ (h->board_id == 0x40820E11) || (h->board_id == 0x40830E11))
+ goto default_int_mode;
+
+ if (pci_find_capability(h->pdev, PCI_CAP_ID_MSIX)) {
+ err = pci_enable_msix_exact(h->pdev, cciss_msix_entries, 4);
+ if (!err) {
+ h->intr[0] = cciss_msix_entries[0].vector;
+ h->intr[1] = cciss_msix_entries[1].vector;
+ h->intr[2] = cciss_msix_entries[2].vector;
+ h->intr[3] = cciss_msix_entries[3].vector;
+ h->msix_vector = 1;
+ return;
+ } else {
+ dev_warn(&h->pdev->dev,
+ "MSI-X init failed %d\n", err);
+ }
+ }
+ if (pci_find_capability(h->pdev, PCI_CAP_ID_MSI)) {
+ if (!pci_enable_msi(h->pdev))
+ h->msi_vector = 1;
+ else
+ dev_warn(&h->pdev->dev, "MSI init failed\n");
+ }
+default_int_mode:
+#endif /* CONFIG_PCI_MSI */
+ /* if we get here we're going to use the default interrupt mode */
+ h->intr[h->intr_mode] = h->pdev->irq;
+ return;
+}
+
+static int cciss_lookup_board_id(struct pci_dev *pdev, u32 *board_id)
+{
+ int i;
+ u32 subsystem_vendor_id, subsystem_device_id;
+
+ subsystem_vendor_id = pdev->subsystem_vendor;
+ subsystem_device_id = pdev->subsystem_device;
+ *board_id = ((subsystem_device_id << 16) & 0xffff0000) |
+ subsystem_vendor_id;
+
+ for (i = 0; i < ARRAY_SIZE(products); i++) {
+ /* Stand aside for hpsa driver on request */
+ if (cciss_allow_hpsa)
+ return -ENODEV;
+ if (*board_id == products[i].board_id)
+ return i;
+ }
+ dev_warn(&pdev->dev, "unrecognized board ID: 0x%08x, ignoring.\n",
+ *board_id);
+ return -ENODEV;
+}
+
+static inline bool cciss_board_disabled(ctlr_info_t *h)
+{
+ u16 command;
+
+ (void) pci_read_config_word(h->pdev, PCI_COMMAND, &command);
+ return ((command & PCI_COMMAND_MEMORY) == 0);
+}
+
+static int cciss_pci_find_memory_BAR(struct pci_dev *pdev,
+ unsigned long *memory_bar)
+{
+ int i;
+
+ for (i = 0; i < DEVICE_COUNT_RESOURCE; i++)
+ if (pci_resource_flags(pdev, i) & IORESOURCE_MEM) {
+ /* addressing mode bits already removed */
+ *memory_bar = pci_resource_start(pdev, i);
+ dev_dbg(&pdev->dev, "memory BAR = %lx\n",
+ *memory_bar);
+ return 0;
+ }
+ dev_warn(&pdev->dev, "no memory BAR found\n");
+ return -ENODEV;
+}
+
+static int cciss_wait_for_board_state(struct pci_dev *pdev,
+ void __iomem *vaddr, int wait_for_ready)
+#define BOARD_READY 1
+#define BOARD_NOT_READY 0
+{
+ int i, iterations;
+ u32 scratchpad;
+
+ if (wait_for_ready)
+ iterations = CCISS_BOARD_READY_ITERATIONS;
+ else
+ iterations = CCISS_BOARD_NOT_READY_ITERATIONS;
+
+ for (i = 0; i < iterations; i++) {
+ scratchpad = readl(vaddr + SA5_SCRATCHPAD_OFFSET);
+ if (wait_for_ready) {
+ if (scratchpad == CCISS_FIRMWARE_READY)
+ return 0;
+ } else {
+ if (scratchpad != CCISS_FIRMWARE_READY)
+ return 0;
+ }
+ msleep(CCISS_BOARD_READY_POLL_INTERVAL_MSECS);
+ }
+ dev_warn(&pdev->dev, "board not ready, timed out.\n");
+ return -ENODEV;
+}
+
+static int cciss_find_cfg_addrs(struct pci_dev *pdev, void __iomem *vaddr,
+ u32 *cfg_base_addr, u64 *cfg_base_addr_index,
+ u64 *cfg_offset)
+{
+ *cfg_base_addr = readl(vaddr + SA5_CTCFG_OFFSET);
+ *cfg_offset = readl(vaddr + SA5_CTMEM_OFFSET);
+ *cfg_base_addr &= (u32) 0x0000ffff;
+ *cfg_base_addr_index = find_PCI_BAR_index(pdev, *cfg_base_addr);
+ if (*cfg_base_addr_index == -1) {
+ dev_warn(&pdev->dev, "cannot find cfg_base_addr_index, "
+ "*cfg_base_addr = 0x%08x\n", *cfg_base_addr);
+ return -ENODEV;
+ }
+ return 0;
+}
+
+static int cciss_find_cfgtables(ctlr_info_t *h)
+{
+ u64 cfg_offset;
+ u32 cfg_base_addr;
+ u64 cfg_base_addr_index;
+ u32 trans_offset;
+ int rc;
+
+ rc = cciss_find_cfg_addrs(h->pdev, h->vaddr, &cfg_base_addr,
+ &cfg_base_addr_index, &cfg_offset);
+ if (rc)
+ return rc;
+ h->cfgtable = remap_pci_mem(pci_resource_start(h->pdev,
+ cfg_base_addr_index) + cfg_offset, sizeof(*h->cfgtable));
+ if (!h->cfgtable)
+ return -ENOMEM;
+ rc = write_driver_ver_to_cfgtable(h->cfgtable);
+ if (rc)
+ return rc;
+ /* Find performant mode table. */
+ trans_offset = readl(&h->cfgtable->TransMethodOffset);
+ h->transtable = remap_pci_mem(pci_resource_start(h->pdev,
+ cfg_base_addr_index)+cfg_offset+trans_offset,
+ sizeof(*h->transtable));
+ if (!h->transtable)
+ return -ENOMEM;
+ return 0;
+}
+
+static void cciss_get_max_perf_mode_cmds(struct ctlr_info *h)
+{
+ h->max_commands = readl(&(h->cfgtable->MaxPerformantModeCommands));
+
+ /* Limit commands in memory limited kdump scenario. */
+ if (reset_devices && h->max_commands > 32)
+ h->max_commands = 32;
+
+ if (h->max_commands < 16) {
+ dev_warn(&h->pdev->dev, "Controller reports "
+ "max supported commands of %d, an obvious lie. "
+ "Using 16. Ensure that firmware is up to date.\n",
+ h->max_commands);
+ h->max_commands = 16;
+ }
+}
+
+/* Interrogate the hardware for some limits:
+ * max commands, max SG elements without chaining, and with chaining,
+ * SG chain block size, etc.
+ */
+static void cciss_find_board_params(ctlr_info_t *h)
+{
+ cciss_get_max_perf_mode_cmds(h);
+ h->nr_cmds = h->max_commands - 4 - cciss_tape_cmds;
+ h->maxsgentries = readl(&(h->cfgtable->MaxSGElements));
+ /*
+ * The P600 may exhibit poor performnace under some workloads
+ * if we use the value in the configuration table. Limit this
+ * controller to MAXSGENTRIES (32) instead.
+ */
+ if (h->board_id == 0x3225103C)
+ h->maxsgentries = MAXSGENTRIES;
+ /*
+ * Limit in-command s/g elements to 32 save dma'able memory.
+ * Howvever spec says if 0, use 31
+ */
+ h->max_cmd_sgentries = 31;
+ if (h->maxsgentries > 512) {
+ h->max_cmd_sgentries = 32;
+ h->chainsize = h->maxsgentries - h->max_cmd_sgentries + 1;
+ h->maxsgentries--; /* save one for chain pointer */
+ } else {
+ h->maxsgentries = 31; /* default to traditional values */
+ h->chainsize = 0;
+ }
+}
+
+static inline bool CISS_signature_present(ctlr_info_t *h)
+{
+ if (!check_signature(h->cfgtable->Signature, "CISS", 4)) {
+ dev_warn(&h->pdev->dev, "not a valid CISS config table\n");
+ return false;
+ }
+ return true;
+}
+
+/* Need to enable prefetch in the SCSI core for 6400 in x86 */
+static inline void cciss_enable_scsi_prefetch(ctlr_info_t *h)
+{
+#ifdef CONFIG_X86
+ u32 prefetch;
+
+ prefetch = readl(&(h->cfgtable->SCSI_Prefetch));
+ prefetch |= 0x100;
+ writel(prefetch, &(h->cfgtable->SCSI_Prefetch));
+#endif
+}
+
+/* Disable DMA prefetch for the P600. Otherwise an ASIC bug may result
+ * in a prefetch beyond physical memory.
+ */
+static inline void cciss_p600_dma_prefetch_quirk(ctlr_info_t *h)
+{
+ u32 dma_prefetch;
+ __u32 dma_refetch;
+
+ if (h->board_id != 0x3225103C)
+ return;
+ dma_prefetch = readl(h->vaddr + I2O_DMA1_CFG);
+ dma_prefetch |= 0x8000;
+ writel(dma_prefetch, h->vaddr + I2O_DMA1_CFG);
+ pci_read_config_dword(h->pdev, PCI_COMMAND_PARITY, &dma_refetch);
+ dma_refetch |= 0x1;
+ pci_write_config_dword(h->pdev, PCI_COMMAND_PARITY, dma_refetch);
+}
+
+static int cciss_pci_init(ctlr_info_t *h)
+{
+ int prod_index, err;
+
+ prod_index = cciss_lookup_board_id(h->pdev, &h->board_id);
+ if (prod_index < 0)
+ return -ENODEV;
+ h->product_name = products[prod_index].product_name;
+ h->access = *(products[prod_index].access);
+
+ if (cciss_board_disabled(h)) {
+ dev_warn(&h->pdev->dev, "controller appears to be disabled\n");
+ return -ENODEV;
+ }
+
+ pci_disable_link_state(h->pdev, PCIE_LINK_STATE_L0S |
+ PCIE_LINK_STATE_L1 | PCIE_LINK_STATE_CLKPM);
+
+ err = pci_enable_device(h->pdev);
+ if (err) {
+ dev_warn(&h->pdev->dev, "Unable to Enable PCI device\n");
+ return err;
+ }
+
+ err = pci_request_regions(h->pdev, "cciss");
+ if (err) {
+ dev_warn(&h->pdev->dev,
+ "Cannot obtain PCI resources, aborting\n");
+ return err;
+ }
+
+ dev_dbg(&h->pdev->dev, "irq = %x\n", h->pdev->irq);
+ dev_dbg(&h->pdev->dev, "board_id = %x\n", h->board_id);
+
+/* If the kernel supports MSI/MSI-X we will try to enable that functionality,
+ * else we use the IO-APIC interrupt assigned to us by system ROM.
+ */
+ cciss_interrupt_mode(h);
+ err = cciss_pci_find_memory_BAR(h->pdev, &h->paddr);
+ if (err)
+ goto err_out_free_res;
+ h->vaddr = remap_pci_mem(h->paddr, 0x250);
+ if (!h->vaddr) {
+ err = -ENOMEM;
+ goto err_out_free_res;
+ }
+ err = cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_READY);
+ if (err)
+ goto err_out_free_res;
+ err = cciss_find_cfgtables(h);
+ if (err)
+ goto err_out_free_res;
+ print_cfg_table(h);
+ cciss_find_board_params(h);
+
+ if (!CISS_signature_present(h)) {
+ err = -ENODEV;
+ goto err_out_free_res;
+ }
+ cciss_enable_scsi_prefetch(h);
+ cciss_p600_dma_prefetch_quirk(h);
+ err = cciss_enter_simple_mode(h);
+ if (err)
+ goto err_out_free_res;
+ cciss_put_controller_into_performant_mode(h);
+ return 0;
+
+err_out_free_res:
+ /*
+ * Deliberately omit pci_disable_device(): it does something nasty to
+ * Smart Array controllers that pci_enable_device does not undo
+ */
+ if (h->transtable)
+ iounmap(h->transtable);
+ if (h->cfgtable)
+ iounmap(h->cfgtable);
+ if (h->vaddr)
+ iounmap(h->vaddr);
+ pci_release_regions(h->pdev);
+ return err;
+}
+
+/* Function to find the first free pointer into our hba[] array
+ * Returns -1 if no free entries are left.
+ */
+static int alloc_cciss_hba(struct pci_dev *pdev)
+{
+ int i;
+
+ for (i = 0; i < MAX_CTLR; i++) {
+ if (!hba[i]) {
+ ctlr_info_t *h;
+
+ h = kzalloc(sizeof(ctlr_info_t), GFP_KERNEL);
+ if (!h)
+ goto Enomem;
+ hba[i] = h;
+ return i;
+ }
+ }
+ dev_warn(&pdev->dev, "This driver supports a maximum"
+ " of %d controllers.\n", MAX_CTLR);
+ return -1;
+Enomem:
+ dev_warn(&pdev->dev, "out of memory.\n");
+ return -1;
+}
+
+static void free_hba(ctlr_info_t *h)
+{
+ int i;
+
+ hba[h->ctlr] = NULL;
+ for (i = 0; i < h->highest_lun + 1; i++)
+ if (h->gendisk[i] != NULL)
+ put_disk(h->gendisk[i]);
+ kfree(h);
+}
+
+/* Send a message CDB to the firmware. */
+static int cciss_message(struct pci_dev *pdev, unsigned char opcode,
+ unsigned char type)
+{
+ typedef struct {
+ CommandListHeader_struct CommandHeader;
+ RequestBlock_struct Request;
+ ErrDescriptor_struct ErrorDescriptor;
+ } Command;
+ static const size_t cmd_sz = sizeof(Command) + sizeof(ErrorInfo_struct);
+ Command *cmd;
+ dma_addr_t paddr64;
+ uint32_t paddr32, tag;
+ void __iomem *vaddr;
+ int i, err;
+
+ vaddr = ioremap_nocache(pci_resource_start(pdev, 0), pci_resource_len(pdev, 0));
+ if (vaddr == NULL)
+ return -ENOMEM;
+
+ /* The Inbound Post Queue only accepts 32-bit physical addresses for the
+ CCISS commands, so they must be allocated from the lower 4GiB of
+ memory. */
+ err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+ if (err) {
+ iounmap(vaddr);
+ return -ENOMEM;
+ }
+
+ cmd = pci_alloc_consistent(pdev, cmd_sz, &paddr64);
+ if (cmd == NULL) {
+ iounmap(vaddr);
+ return -ENOMEM;
+ }
+
+ /* This must fit, because of the 32-bit consistent DMA mask. Also,
+ although there's no guarantee, we assume that the address is at
+ least 4-byte aligned (most likely, it's page-aligned). */
+ paddr32 = paddr64;
+
+ cmd->CommandHeader.ReplyQueue = 0;
+ cmd->CommandHeader.SGList = 0;
+ cmd->CommandHeader.SGTotal = 0;
+ cmd->CommandHeader.Tag.lower = paddr32;
+ cmd->CommandHeader.Tag.upper = 0;
+ memset(&cmd->CommandHeader.LUN.LunAddrBytes, 0, 8);
+
+ cmd->Request.CDBLen = 16;
+ cmd->Request.Type.Type = TYPE_MSG;
+ cmd->Request.Type.Attribute = ATTR_HEADOFQUEUE;
+ cmd->Request.Type.Direction = XFER_NONE;
+ cmd->Request.Timeout = 0; /* Don't time out */
+ cmd->Request.CDB[0] = opcode;
+ cmd->Request.CDB[1] = type;
+ memset(&cmd->Request.CDB[2], 0, 14); /* the rest of the CDB is reserved */
+
+ cmd->ErrorDescriptor.Addr.lower = paddr32 + sizeof(Command);
+ cmd->ErrorDescriptor.Addr.upper = 0;
+ cmd->ErrorDescriptor.Len = sizeof(ErrorInfo_struct);
+
+ writel(paddr32, vaddr + SA5_REQUEST_PORT_OFFSET);
+
+ for (i = 0; i < 10; i++) {
+ tag = readl(vaddr + SA5_REPLY_PORT_OFFSET);
+ if ((tag & ~3) == paddr32)
+ break;
+ msleep(CCISS_POST_RESET_NOOP_TIMEOUT_MSECS);
+ }
+
+ iounmap(vaddr);
+
+ /* we leak the DMA buffer here ... no choice since the controller could
+ still complete the command. */
+ if (i == 10) {
+ dev_err(&pdev->dev,
+ "controller message %02x:%02x timed out\n",
+ opcode, type);
+ return -ETIMEDOUT;
+ }
+
+ pci_free_consistent(pdev, cmd_sz, cmd, paddr64);
+
+ if (tag & 2) {
+ dev_err(&pdev->dev, "controller message %02x:%02x failed\n",
+ opcode, type);
+ return -EIO;
+ }
+
+ dev_info(&pdev->dev, "controller message %02x:%02x succeeded\n",
+ opcode, type);
+ return 0;
+}
+
+#define cciss_noop(p) cciss_message(p, 3, 0)
+
+static int cciss_controller_hard_reset(struct pci_dev *pdev,
+ void * __iomem vaddr, u32 use_doorbell)
+{
+ u16 pmcsr;
+ int pos;
+
+ if (use_doorbell) {
+ /* For everything after the P600, the PCI power state method
+ * of resetting the controller doesn't work, so we have this
+ * other way using the doorbell register.
+ */
+ dev_info(&pdev->dev, "using doorbell to reset controller\n");
+ writel(use_doorbell, vaddr + SA5_DOORBELL);
+ } else { /* Try to do it the PCI power state way */
+
+ /* Quoting from the Open CISS Specification: "The Power
+ * Management Control/Status Register (CSR) controls the power
+ * state of the device. The normal operating state is D0,
+ * CSR=00h. The software off state is D3, CSR=03h. To reset
+ * the controller, place the interface device in D3 then to D0,
+ * this causes a secondary PCI reset which will reset the
+ * controller." */
+
+ pos = pci_find_capability(pdev, PCI_CAP_ID_PM);
+ if (pos == 0) {
+ dev_err(&pdev->dev,
+ "cciss_controller_hard_reset: "
+ "PCI PM not supported\n");
+ return -ENODEV;
+ }
+ dev_info(&pdev->dev, "using PCI PM to reset controller\n");
+ /* enter the D3hot power management state */
+ pci_read_config_word(pdev, pos + PCI_PM_CTRL, &pmcsr);
+ pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
+ pmcsr |= PCI_D3hot;
+ pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr);
+
+ msleep(500);
+
+ /* enter the D0 power management state */
+ pmcsr &= ~PCI_PM_CTRL_STATE_MASK;
+ pmcsr |= PCI_D0;
+ pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr);
+
+ /*
+ * The P600 requires a small delay when changing states.
+ * Otherwise we may think the board did not reset and we bail.
+ * This for kdump only and is particular to the P600.
+ */
+ msleep(500);
+ }
+ return 0;
+}
+
+static void init_driver_version(char *driver_version, int len)
+{
+ memset(driver_version, 0, len);
+ strncpy(driver_version, "cciss " DRIVER_NAME, len - 1);
+}
+
+static int write_driver_ver_to_cfgtable(CfgTable_struct __iomem *cfgtable)
+{
+ char *driver_version;
+ int i, size = sizeof(cfgtable->driver_version);
+
+ driver_version = kmalloc(size, GFP_KERNEL);
+ if (!driver_version)
+ return -ENOMEM;
+
+ init_driver_version(driver_version, size);
+ for (i = 0; i < size; i++)
+ writeb(driver_version[i], &cfgtable->driver_version[i]);
+ kfree(driver_version);
+ return 0;
+}
+
+static void read_driver_ver_from_cfgtable(CfgTable_struct __iomem *cfgtable,
+ unsigned char *driver_ver)
+{
+ int i;
+
+ for (i = 0; i < sizeof(cfgtable->driver_version); i++)
+ driver_ver[i] = readb(&cfgtable->driver_version[i]);
+}
+
+static int controller_reset_failed(CfgTable_struct __iomem *cfgtable)
+{
+
+ char *driver_ver, *old_driver_ver;
+ int rc, size = sizeof(cfgtable->driver_version);
+
+ old_driver_ver = kmalloc(2 * size, GFP_KERNEL);
+ if (!old_driver_ver)
+ return -ENOMEM;
+ driver_ver = old_driver_ver + size;
+
+ /* After a reset, the 32 bytes of "driver version" in the cfgtable
+ * should have been changed, otherwise we know the reset failed.
+ */
+ init_driver_version(old_driver_ver, size);
+ read_driver_ver_from_cfgtable(cfgtable, driver_ver);
+ rc = !memcmp(driver_ver, old_driver_ver, size);
+ kfree(old_driver_ver);
+ return rc;
+}
+
+/* This does a hard reset of the controller using PCI power management
+ * states or using the doorbell register. */
+static int cciss_kdump_hard_reset_controller(struct pci_dev *pdev)
+{
+ u64 cfg_offset;
+ u32 cfg_base_addr;
+ u64 cfg_base_addr_index;
+ void __iomem *vaddr;
+ unsigned long paddr;
+ u32 misc_fw_support;
+ int rc;
+ CfgTable_struct __iomem *cfgtable;
+ u32 use_doorbell;
+ u32 board_id;
+ u16 command_register;
+
+ /* For controllers as old a the p600, this is very nearly
+ * the same thing as
+ *
+ * pci_save_state(pci_dev);
+ * pci_set_power_state(pci_dev, PCI_D3hot);
+ * pci_set_power_state(pci_dev, PCI_D0);
+ * pci_restore_state(pci_dev);
+ *
+ * For controllers newer than the P600, the pci power state
+ * method of resetting doesn't work so we have another way
+ * using the doorbell register.
+ */
+
+ /* Exclude 640x boards. These are two pci devices in one slot
+ * which share a battery backed cache module. One controls the
+ * cache, the other accesses the cache through the one that controls
+ * it. If we reset the one controlling the cache, the other will
+ * likely not be happy. Just forbid resetting this conjoined mess.
+ */
+ cciss_lookup_board_id(pdev, &board_id);
+ if (!ctlr_is_resettable(board_id)) {
+ dev_warn(&pdev->dev, "Cannot reset Smart Array 640x "
+ "due to shared cache module.");
+ return -ENODEV;
+ }
+
+ /* if controller is soft- but not hard resettable... */
+ if (!ctlr_is_hard_resettable(board_id))
+ return -ENOTSUPP; /* try soft reset later. */
+
+ /* Save the PCI command register */
+ pci_read_config_word(pdev, 4, &command_register);
+ /* Turn the board off. This is so that later pci_restore_state()
+ * won't turn the board on before the rest of config space is ready.
+ */
+ pci_disable_device(pdev);
+ pci_save_state(pdev);
+
+ /* find the first memory BAR, so we can find the cfg table */
+ rc = cciss_pci_find_memory_BAR(pdev, &paddr);
+ if (rc)
+ return rc;
+ vaddr = remap_pci_mem(paddr, 0x250);
+ if (!vaddr)
+ return -ENOMEM;
+
+ /* find cfgtable in order to check if reset via doorbell is supported */
+ rc = cciss_find_cfg_addrs(pdev, vaddr, &cfg_base_addr,
+ &cfg_base_addr_index, &cfg_offset);
+ if (rc)
+ goto unmap_vaddr;
+ cfgtable = remap_pci_mem(pci_resource_start(pdev,
+ cfg_base_addr_index) + cfg_offset, sizeof(*cfgtable));
+ if (!cfgtable) {
+ rc = -ENOMEM;
+ goto unmap_vaddr;
+ }
+ rc = write_driver_ver_to_cfgtable(cfgtable);
+ if (rc)
+ goto unmap_vaddr;
+
+ /* If reset via doorbell register is supported, use that.
+ * There are two such methods. Favor the newest method.
+ */
+ misc_fw_support = readl(&cfgtable->misc_fw_support);
+ use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET2;
+ if (use_doorbell) {
+ use_doorbell = DOORBELL_CTLR_RESET2;
+ } else {
+ use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET;
+ if (use_doorbell) {
+ dev_warn(&pdev->dev, "Controller claims that "
+ "'Bit 2 doorbell reset' is "
+ "supported, but not 'bit 5 doorbell reset'. "
+ "Firmware update is recommended.\n");
+ rc = -ENOTSUPP; /* use the soft reset */
+ goto unmap_cfgtable;
+ }
+ }
+
+ rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell);
+ if (rc)
+ goto unmap_cfgtable;
+ pci_restore_state(pdev);
+ rc = pci_enable_device(pdev);
+ if (rc) {
+ dev_warn(&pdev->dev, "failed to enable device.\n");
+ goto unmap_cfgtable;
+ }
+ pci_write_config_word(pdev, 4, command_register);
+
+ /* Some devices (notably the HP Smart Array 5i Controller)
+ need a little pause here */
+ msleep(CCISS_POST_RESET_PAUSE_MSECS);
+
+ /* Wait for board to become not ready, then ready. */
+ dev_info(&pdev->dev, "Waiting for board to reset.\n");
+ rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY);
+ if (rc) {
+ dev_warn(&pdev->dev, "Failed waiting for board to hard reset."
+ " Will try soft reset.\n");
+ rc = -ENOTSUPP; /* Not expected, but try soft reset later */
+ goto unmap_cfgtable;
+ }
+ rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_READY);
+ if (rc) {
+ dev_warn(&pdev->dev,
+ "failed waiting for board to become ready "
+ "after hard reset\n");
+ goto unmap_cfgtable;
+ }
+
+ rc = controller_reset_failed(vaddr);
+ if (rc < 0)
+ goto unmap_cfgtable;
+ if (rc) {
+ dev_warn(&pdev->dev, "Unable to successfully hard reset "
+ "controller. Will try soft reset.\n");
+ rc = -ENOTSUPP; /* Not expected, but try soft reset later */
+ } else {
+ dev_info(&pdev->dev, "Board ready after hard reset.\n");
+ }
+
+unmap_cfgtable:
+ iounmap(cfgtable);
+
+unmap_vaddr:
+ iounmap(vaddr);
+ return rc;
+}
+
+static int cciss_init_reset_devices(struct pci_dev *pdev)
+{
+ int rc, i;
+
+ if (!reset_devices)
+ return 0;
+
+ /* Reset the controller with a PCI power-cycle or via doorbell */
+ rc = cciss_kdump_hard_reset_controller(pdev);
+
+ /* -ENOTSUPP here means we cannot reset the controller
+ * but it's already (and still) up and running in
+ * "performant mode". Or, it might be 640x, which can't reset
+ * due to concerns about shared bbwc between 6402/6404 pair.
+ */
+ if (rc == -ENOTSUPP)
+ return rc; /* just try to do the kdump anyhow. */
+ if (rc)
+ return -ENODEV;
+
+ /* Now try to get the controller to respond to a no-op */
+ dev_warn(&pdev->dev, "Waiting for controller to respond to no-op\n");
+ for (i = 0; i < CCISS_POST_RESET_NOOP_RETRIES; i++) {
+ if (cciss_noop(pdev) == 0)
+ break;
+ else
+ dev_warn(&pdev->dev, "no-op failed%s\n",
+ (i < CCISS_POST_RESET_NOOP_RETRIES - 1 ?
+ "; re-trying" : ""));
+ msleep(CCISS_POST_RESET_NOOP_INTERVAL_MSECS);
+ }
+ return 0;
+}
+
+static int cciss_allocate_cmd_pool(ctlr_info_t *h)
+{
+ h->cmd_pool_bits = kmalloc(BITS_TO_LONGS(h->nr_cmds) *
+ sizeof(unsigned long), GFP_KERNEL);
+ h->cmd_pool = pci_alloc_consistent(h->pdev,
+ h->nr_cmds * sizeof(CommandList_struct),
+ &(h->cmd_pool_dhandle));
+ h->errinfo_pool = pci_alloc_consistent(h->pdev,
+ h->nr_cmds * sizeof(ErrorInfo_struct),
+ &(h->errinfo_pool_dhandle));
+ if ((h->cmd_pool_bits == NULL)
+ || (h->cmd_pool == NULL)
+ || (h->errinfo_pool == NULL)) {
+ dev_err(&h->pdev->dev, "out of memory");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static int cciss_allocate_scatterlists(ctlr_info_t *h)
+{
+ int i;
+
+ /* zero it, so that on free we need not know how many were alloc'ed */
+ h->scatter_list = kzalloc(h->max_commands *
+ sizeof(struct scatterlist *), GFP_KERNEL);
+ if (!h->scatter_list)
+ return -ENOMEM;
+
+ for (i = 0; i < h->nr_cmds; i++) {
+ h->scatter_list[i] = kmalloc(sizeof(struct scatterlist) *
+ h->maxsgentries, GFP_KERNEL);
+ if (h->scatter_list[i] == NULL) {
+ dev_err(&h->pdev->dev, "could not allocate "
+ "s/g lists\n");
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+static void cciss_free_scatterlists(ctlr_info_t *h)
+{
+ int i;
+
+ if (h->scatter_list) {
+ for (i = 0; i < h->nr_cmds; i++)
+ kfree(h->scatter_list[i]);
+ kfree(h->scatter_list);
+ }
+}
+
+static void cciss_free_cmd_pool(ctlr_info_t *h)
+{
+ kfree(h->cmd_pool_bits);
+ if (h->cmd_pool)
+ pci_free_consistent(h->pdev,
+ h->nr_cmds * sizeof(CommandList_struct),
+ h->cmd_pool, h->cmd_pool_dhandle);
+ if (h->errinfo_pool)
+ pci_free_consistent(h->pdev,
+ h->nr_cmds * sizeof(ErrorInfo_struct),
+ h->errinfo_pool, h->errinfo_pool_dhandle);
+}
+
+static int cciss_request_irq(ctlr_info_t *h,
+ irqreturn_t (*msixhandler)(int, void *),
+ irqreturn_t (*intxhandler)(int, void *))
+{
+ if (h->msix_vector || h->msi_vector) {
+ if (!request_irq(h->intr[h->intr_mode], msixhandler,
+ 0, h->devname, h))
+ return 0;
+ dev_err(&h->pdev->dev, "Unable to get msi irq %d"
+ " for %s\n", h->intr[h->intr_mode],
+ h->devname);
+ return -1;
+ }
+
+ if (!request_irq(h->intr[h->intr_mode], intxhandler,
+ IRQF_SHARED, h->devname, h))
+ return 0;
+ dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n",
+ h->intr[h->intr_mode], h->devname);
+ return -1;
+}
+
+static int cciss_kdump_soft_reset(ctlr_info_t *h)
+{
+ if (cciss_send_reset(h, CTLR_LUNID, CCISS_RESET_TYPE_CONTROLLER)) {
+ dev_warn(&h->pdev->dev, "Resetting array controller failed.\n");
+ return -EIO;
+ }
+
+ dev_info(&h->pdev->dev, "Waiting for board to soft reset.\n");
+ if (cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_NOT_READY)) {
+ dev_warn(&h->pdev->dev, "Soft reset had no effect.\n");
+ return -1;
+ }
+
+ dev_info(&h->pdev->dev, "Board reset, awaiting READY status.\n");
+ if (cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_READY)) {
+ dev_warn(&h->pdev->dev, "Board failed to become ready "
+ "after soft reset.\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static void cciss_undo_allocations_after_kdump_soft_reset(ctlr_info_t *h)
+{
+ int ctlr = h->ctlr;
+
+ free_irq(h->intr[h->intr_mode], h);
+#ifdef CONFIG_PCI_MSI
+ if (h->msix_vector)
+ pci_disable_msix(h->pdev);
+ else if (h->msi_vector)
+ pci_disable_msi(h->pdev);
+#endif /* CONFIG_PCI_MSI */
+ cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds);
+ cciss_free_scatterlists(h);
+ cciss_free_cmd_pool(h);
+ kfree(h->blockFetchTable);
+ if (h->reply_pool)
+ pci_free_consistent(h->pdev, h->max_commands * sizeof(__u64),
+ h->reply_pool, h->reply_pool_dhandle);
+ if (h->transtable)
+ iounmap(h->transtable);
+ if (h->cfgtable)
+ iounmap(h->cfgtable);
+ if (h->vaddr)
+ iounmap(h->vaddr);
+ unregister_blkdev(h->major, h->devname);
+ cciss_destroy_hba_sysfs_entry(h);
+ pci_release_regions(h->pdev);
+ kfree(h);
+ hba[ctlr] = NULL;
+}
+
+/*
+ * This is it. Find all the controllers and register them. I really hate
+ * stealing all these major device numbers.
+ * returns the number of block devices registered.
+ */
+static int cciss_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+ int i;
+ int j = 0;
+ int rc;
+ int try_soft_reset = 0;
+ int dac, return_code;
+ InquiryData_struct *inq_buff;
+ ctlr_info_t *h;
+ unsigned long flags;
+
+ /*
+ * By default the cciss driver is used for all older HP Smart Array
+ * controllers. There are module paramaters that allow a user to
+ * override this behavior and instead use the hpsa SCSI driver. If
+ * this is the case cciss may be loaded first from the kdump initrd
+ * image and cause a kernel panic. So if reset_devices is true and
+ * cciss_allow_hpsa is set just bail.
+ */
+ if ((reset_devices) && (cciss_allow_hpsa == 1))
+ return -ENODEV;
+ rc = cciss_init_reset_devices(pdev);
+ if (rc) {
+ if (rc != -ENOTSUPP)
+ return rc;
+ /* If the reset fails in a particular way (it has no way to do
+ * a proper hard reset, so returns -ENOTSUPP) we can try to do
+ * a soft reset once we get the controller configured up to the
+ * point that it can accept a command.
+ */
+ try_soft_reset = 1;
+ rc = 0;
+ }
+
+reinit_after_soft_reset:
+
+ i = alloc_cciss_hba(pdev);
+ if (i < 0)
+ return -ENOMEM;
+
+ h = hba[i];
+ h->pdev = pdev;
+ h->busy_initializing = 1;
+ h->intr_mode = cciss_simple_mode ? SIMPLE_MODE_INT : PERF_MODE_INT;
+ INIT_LIST_HEAD(&h->cmpQ);
+ INIT_LIST_HEAD(&h->reqQ);
+ mutex_init(&h->busy_shutting_down);
+
+ if (cciss_pci_init(h) != 0)
+ goto clean_no_release_regions;
+
+ sprintf(h->devname, "cciss%d", i);
+ h->ctlr = i;
+
+ if (cciss_tape_cmds < 2)
+ cciss_tape_cmds = 2;
+ if (cciss_tape_cmds > 16)
+ cciss_tape_cmds = 16;
+
+ init_completion(&h->scan_wait);
+
+ if (cciss_create_hba_sysfs_entry(h))
+ goto clean0;
+
+ /* configure PCI DMA stuff */
+ if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64)))
+ dac = 1;
+ else if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32)))
+ dac = 0;
+ else {
+ dev_err(&h->pdev->dev, "no suitable DMA available\n");
+ goto clean1;
+ }
+
+ /*
+ * register with the major number, or get a dynamic major number
+ * by passing 0 as argument. This is done for greater than
+ * 8 controller support.
+ */
+ if (i < MAX_CTLR_ORIG)
+ h->major = COMPAQ_CISS_MAJOR + i;
+ rc = register_blkdev(h->major, h->devname);
+ if (rc == -EBUSY || rc == -EINVAL) {
+ dev_err(&h->pdev->dev,
+ "Unable to get major number %d for %s "
+ "on hba %d\n", h->major, h->devname, i);
+ goto clean1;
+ } else {
+ if (i >= MAX_CTLR_ORIG)
+ h->major = rc;
+ }
+
+ /* make sure the board interrupts are off */
+ h->access.set_intr_mask(h, CCISS_INTR_OFF);
+ rc = cciss_request_irq(h, do_cciss_msix_intr, do_cciss_intx);
+ if (rc)
+ goto clean2;
+
+ dev_info(&h->pdev->dev, "%s: <0x%x> at PCI %s IRQ %d%s using DAC\n",
+ h->devname, pdev->device, pci_name(pdev),
+ h->intr[h->intr_mode], dac ? "" : " not");
+
+ if (cciss_allocate_cmd_pool(h))
+ goto clean4;
+
+ if (cciss_allocate_scatterlists(h))
+ goto clean4;
+
+ h->cmd_sg_list = cciss_allocate_sg_chain_blocks(h,
+ h->chainsize, h->nr_cmds);
+ if (!h->cmd_sg_list && h->chainsize > 0)
+ goto clean4;
+
+ spin_lock_init(&h->lock);
+
+ /* Initialize the pdev driver private data.
+ have it point to h. */
+ pci_set_drvdata(pdev, h);
+ /* command and error info recs zeroed out before
+ they are used */
+ bitmap_zero(h->cmd_pool_bits, h->nr_cmds);
+
+ h->num_luns = 0;
+ h->highest_lun = -1;
+ for (j = 0; j < CISS_MAX_LUN; j++) {
+ h->drv[j] = NULL;
+ h->gendisk[j] = NULL;
+ }
+
+ /* At this point, the controller is ready to take commands.
+ * Now, if reset_devices and the hard reset didn't work, try
+ * the soft reset and see if that works.
+ */
+ if (try_soft_reset) {
+
+ /* This is kind of gross. We may or may not get a completion
+ * from the soft reset command, and if we do, then the value
+ * from the fifo may or may not be valid. So, we wait 10 secs
+ * after the reset throwing away any completions we get during
+ * that time. Unregister the interrupt handler and register
+ * fake ones to scoop up any residual completions.
+ */
+ spin_lock_irqsave(&h->lock, flags);
+ h->access.set_intr_mask(h, CCISS_INTR_OFF);
+ spin_unlock_irqrestore(&h->lock, flags);
+ free_irq(h->intr[h->intr_mode], h);
+ rc = cciss_request_irq(h, cciss_msix_discard_completions,
+ cciss_intx_discard_completions);
+ if (rc) {
+ dev_warn(&h->pdev->dev, "Failed to request_irq after "
+ "soft reset.\n");
+ goto clean4;
+ }
+
+ rc = cciss_kdump_soft_reset(h);
+ if (rc) {
+ dev_warn(&h->pdev->dev, "Soft reset failed.\n");
+ goto clean4;
+ }
+
+ dev_info(&h->pdev->dev, "Board READY.\n");
+ dev_info(&h->pdev->dev,
+ "Waiting for stale completions to drain.\n");
+ h->access.set_intr_mask(h, CCISS_INTR_ON);
+ msleep(10000);
+ h->access.set_intr_mask(h, CCISS_INTR_OFF);
+
+ rc = controller_reset_failed(h->cfgtable);
+ if (rc)
+ dev_info(&h->pdev->dev,
+ "Soft reset appears to have failed.\n");
+
+ /* since the controller's reset, we have to go back and re-init
+ * everything. Easiest to just forget what we've done and do it
+ * all over again.
+ */
+ cciss_undo_allocations_after_kdump_soft_reset(h);
+ try_soft_reset = 0;
+ if (rc)
+ /* don't go to clean4, we already unallocated */
+ return -ENODEV;
+
+ goto reinit_after_soft_reset;
+ }
+
+ cciss_scsi_setup(h);
+
+ /* Turn the interrupts on so we can service requests */
+ h->access.set_intr_mask(h, CCISS_INTR_ON);
+
+ /* Get the firmware version */
+ inq_buff = kzalloc(sizeof(InquiryData_struct), GFP_KERNEL);
+ if (inq_buff == NULL) {
+ dev_err(&h->pdev->dev, "out of memory\n");
+ goto clean4;
+ }
+
+ return_code = sendcmd_withirq(h, CISS_INQUIRY, inq_buff,
+ sizeof(InquiryData_struct), 0, CTLR_LUNID, TYPE_CMD);
+ if (return_code == IO_OK) {
+ h->firm_ver[0] = inq_buff->data_byte[32];
+ h->firm_ver[1] = inq_buff->data_byte[33];
+ h->firm_ver[2] = inq_buff->data_byte[34];
+ h->firm_ver[3] = inq_buff->data_byte[35];
+ } else { /* send command failed */
+ dev_warn(&h->pdev->dev, "unable to determine firmware"
+ " version of controller\n");
+ }
+ kfree(inq_buff);
+
+ cciss_procinit(h);
+
+ h->cciss_max_sectors = 8192;
+
+ rebuild_lun_table(h, 1, 0);
+ cciss_engage_scsi(h);
+ h->busy_initializing = 0;
+ return 0;
+
+clean4:
+ cciss_free_cmd_pool(h);
+ cciss_free_scatterlists(h);
+ cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds);
+ free_irq(h->intr[h->intr_mode], h);
+clean2:
+ unregister_blkdev(h->major, h->devname);
+clean1:
+ cciss_destroy_hba_sysfs_entry(h);
+clean0:
+ pci_release_regions(pdev);
+clean_no_release_regions:
+ h->busy_initializing = 0;
+
+ /*
+ * Deliberately omit pci_disable_device(): it does something nasty to
+ * Smart Array controllers that pci_enable_device does not undo
+ */
+ pci_set_drvdata(pdev, NULL);
+ free_hba(h);
+ return -ENODEV;
+}
+
+static void cciss_shutdown(struct pci_dev *pdev)
+{
+ ctlr_info_t *h;
+ char *flush_buf;
+ int return_code;
+
+ h = pci_get_drvdata(pdev);
+ flush_buf = kzalloc(4, GFP_KERNEL);
+ if (!flush_buf) {
+ dev_warn(&h->pdev->dev, "cache not flushed, out of memory.\n");
+ return;
+ }
+ /* write all data in the battery backed cache to disk */
+ return_code = sendcmd_withirq(h, CCISS_CACHE_FLUSH, flush_buf,
+ 4, 0, CTLR_LUNID, TYPE_CMD);
+ kfree(flush_buf);
+ if (return_code != IO_OK)
+ dev_warn(&h->pdev->dev, "Error flushing cache\n");
+ h->access.set_intr_mask(h, CCISS_INTR_OFF);
+ free_irq(h->intr[h->intr_mode], h);
+}
+
+static int cciss_enter_simple_mode(struct ctlr_info *h)
+{
+ u32 trans_support;
+
+ trans_support = readl(&(h->cfgtable->TransportSupport));
+ if (!(trans_support & SIMPLE_MODE))
+ return -ENOTSUPP;
+
+ h->max_commands = readl(&(h->cfgtable->CmdsOutMax));
+ writel(CFGTBL_Trans_Simple, &(h->cfgtable->HostWrite.TransportRequest));
+ writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
+ cciss_wait_for_mode_change_ack(h);
+ print_cfg_table(h);
+ if (!(readl(&(h->cfgtable->TransportActive)) & CFGTBL_Trans_Simple)) {
+ dev_warn(&h->pdev->dev, "unable to get board into simple mode\n");
+ return -ENODEV;
+ }
+ h->transMethod = CFGTBL_Trans_Simple;
+ return 0;
+}
+
+
+static void cciss_remove_one(struct pci_dev *pdev)
+{
+ ctlr_info_t *h;
+ int i, j;
+
+ if (pci_get_drvdata(pdev) == NULL) {
+ dev_err(&pdev->dev, "Unable to remove device\n");
+ return;
+ }
+
+ h = pci_get_drvdata(pdev);
+ i = h->ctlr;
+ if (hba[i] == NULL) {
+ dev_err(&pdev->dev, "device appears to already be removed\n");
+ return;
+ }
+
+ mutex_lock(&h->busy_shutting_down);
+
+ remove_from_scan_list(h);
+ remove_proc_entry(h->devname, proc_cciss);
+ unregister_blkdev(h->major, h->devname);
+
+ /* remove it from the disk list */
+ for (j = 0; j < CISS_MAX_LUN; j++) {
+ struct gendisk *disk = h->gendisk[j];
+ if (disk) {
+ struct request_queue *q = disk->queue;
+
+ if (disk->flags & GENHD_FL_UP) {
+ cciss_destroy_ld_sysfs_entry(h, j, 1);
+ del_gendisk(disk);
+ }
+ if (q)
+ blk_cleanup_queue(q);
+ }
+ }
+
+#ifdef CONFIG_CISS_SCSI_TAPE
+ cciss_unregister_scsi(h); /* unhook from SCSI subsystem */
+#endif
+
+ cciss_shutdown(pdev);
+
+#ifdef CONFIG_PCI_MSI
+ if (h->msix_vector)
+ pci_disable_msix(h->pdev);
+ else if (h->msi_vector)
+ pci_disable_msi(h->pdev);
+#endif /* CONFIG_PCI_MSI */
+
+ iounmap(h->transtable);
+ iounmap(h->cfgtable);
+ iounmap(h->vaddr);
+
+ cciss_free_cmd_pool(h);
+ /* Free up sg elements */
+ for (j = 0; j < h->nr_cmds; j++)
+ kfree(h->scatter_list[j]);
+ kfree(h->scatter_list);
+ cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds);
+ kfree(h->blockFetchTable);
+ if (h->reply_pool)
+ pci_free_consistent(h->pdev, h->max_commands * sizeof(__u64),
+ h->reply_pool, h->reply_pool_dhandle);
+ /*
+ * Deliberately omit pci_disable_device(): it does something nasty to
+ * Smart Array controllers that pci_enable_device does not undo
+ */
+ pci_release_regions(pdev);
+ pci_set_drvdata(pdev, NULL);
+ cciss_destroy_hba_sysfs_entry(h);
+ mutex_unlock(&h->busy_shutting_down);
+ free_hba(h);
+}
+
+static struct pci_driver cciss_pci_driver = {
+ .name = "cciss",
+ .probe = cciss_init_one,
+ .remove = cciss_remove_one,
+ .id_table = cciss_pci_device_id, /* id_table */
+ .shutdown = cciss_shutdown,
+};
+
+/*
+ * This is it. Register the PCI driver information for the cards we control
+ * the OS will call our registered routines when it finds one of our cards.
+ */
+static int __init cciss_init(void)
+{
+ int err;
+
+ /*
+ * The hardware requires that commands are aligned on a 64-bit
+ * boundary. Given that we use pci_alloc_consistent() to allocate an
+ * array of them, the size must be a multiple of 8 bytes.
+ */
+ BUILD_BUG_ON(sizeof(CommandList_struct) % COMMANDLIST_ALIGNMENT);
+ printk(KERN_INFO DRIVER_NAME "\n");
+
+ err = bus_register(&cciss_bus_type);
+ if (err)
+ return err;
+
+ /* Start the scan thread */
+ cciss_scan_thread = kthread_run(scan_thread, NULL, "cciss_scan");
+ if (IS_ERR(cciss_scan_thread)) {
+ err = PTR_ERR(cciss_scan_thread);
+ goto err_bus_unregister;
+ }
+
+ /* Register for our PCI devices */
+ err = pci_register_driver(&cciss_pci_driver);
+ if (err)
+ goto err_thread_stop;
+
+ return err;
+
+err_thread_stop:
+ kthread_stop(cciss_scan_thread);
+err_bus_unregister:
+ bus_unregister(&cciss_bus_type);
+
+ return err;
+}
+
+static void __exit cciss_cleanup(void)
+{
+ int i;
+
+ pci_unregister_driver(&cciss_pci_driver);
+ /* double check that all controller entrys have been removed */
+ for (i = 0; i < MAX_CTLR; i++) {
+ if (hba[i] != NULL) {
+ dev_warn(&hba[i]->pdev->dev,
+ "had to remove controller\n");
+ cciss_remove_one(hba[i]->pdev);
+ }
+ }
+ kthread_stop(cciss_scan_thread);
+ if (proc_cciss)
+ remove_proc_entry("driver/cciss", NULL);
+ bus_unregister(&cciss_bus_type);
+}
+
+module_init(cciss_init);
+module_exit(cciss_cleanup);
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
new file mode 100644
index 000000000..7fda30e4a
--- /dev/null
+++ b/drivers/block/cciss.h
@@ -0,0 +1,435 @@
+#ifndef CCISS_H
+#define CCISS_H
+
+#include <linux/genhd.h>
+#include <linux/mutex.h>
+
+#include "cciss_cmd.h"
+
+
+#define NWD_SHIFT 4
+#define MAX_PART (1 << NWD_SHIFT)
+
+#define IO_OK 0
+#define IO_ERROR 1
+#define IO_NEEDS_RETRY 3
+
+#define VENDOR_LEN 8
+#define MODEL_LEN 16
+#define REV_LEN 4
+
+struct ctlr_info;
+typedef struct ctlr_info ctlr_info_t;
+
+struct access_method {
+ void (*submit_command)(ctlr_info_t *h, CommandList_struct *c);
+ void (*set_intr_mask)(ctlr_info_t *h, unsigned long val);
+ unsigned long (*fifo_full)(ctlr_info_t *h);
+ bool (*intr_pending)(ctlr_info_t *h);
+ unsigned long (*command_completed)(ctlr_info_t *h);
+};
+typedef struct _drive_info_struct
+{
+ unsigned char LunID[8];
+ int usage_count;
+ struct request_queue *queue;
+ sector_t nr_blocks;
+ int block_size;
+ int heads;
+ int sectors;
+ int cylinders;
+ int raid_level; /* set to -1 to indicate that
+ * the drive is not in use/configured
+ */
+ int busy_configuring; /* This is set when a drive is being removed
+ * to prevent it from being opened or it's
+ * queue from being started.
+ */
+ struct device dev;
+ __u8 serial_no[16]; /* from inquiry page 0x83,
+ * not necc. null terminated.
+ */
+ char vendor[VENDOR_LEN + 1]; /* SCSI vendor string */
+ char model[MODEL_LEN + 1]; /* SCSI model string */
+ char rev[REV_LEN + 1]; /* SCSI revision string */
+ char device_initialized; /* indicates whether dev is initialized */
+} drive_info_struct;
+
+struct ctlr_info
+{
+ int ctlr;
+ char devname[8];
+ char *product_name;
+ char firm_ver[4]; /* Firmware version */
+ struct pci_dev *pdev;
+ __u32 board_id;
+ void __iomem *vaddr;
+ unsigned long paddr;
+ int nr_cmds; /* Number of commands allowed on this controller */
+ CfgTable_struct __iomem *cfgtable;
+ int interrupts_enabled;
+ int major;
+ int max_commands;
+ int commands_outstanding;
+ int max_outstanding; /* Debug */
+ int num_luns;
+ int highest_lun;
+ int usage_count; /* number of opens all all minor devices */
+ /* Need space for temp sg list
+ * number of scatter/gathers supported
+ * number of scatter/gathers in chained block
+ */
+ struct scatterlist **scatter_list;
+ int maxsgentries;
+ int chainsize;
+ int max_cmd_sgentries;
+ SGDescriptor_struct **cmd_sg_list;
+
+# define PERF_MODE_INT 0
+# define DOORBELL_INT 1
+# define SIMPLE_MODE_INT 2
+# define MEMQ_MODE_INT 3
+ unsigned int intr[4];
+ unsigned int msix_vector;
+ unsigned int msi_vector;
+ int intr_mode;
+ int cciss_max_sectors;
+ BYTE cciss_read;
+ BYTE cciss_write;
+ BYTE cciss_read_capacity;
+
+ /* information about each logical volume */
+ drive_info_struct *drv[CISS_MAX_LUN];
+
+ struct access_method access;
+
+ /* queue and queue Info */
+ struct list_head reqQ;
+ struct list_head cmpQ;
+ unsigned int Qdepth;
+ unsigned int maxQsinceinit;
+ unsigned int maxSG;
+ spinlock_t lock;
+
+ /* pointers to command and error info pool */
+ CommandList_struct *cmd_pool;
+ dma_addr_t cmd_pool_dhandle;
+ ErrorInfo_struct *errinfo_pool;
+ dma_addr_t errinfo_pool_dhandle;
+ unsigned long *cmd_pool_bits;
+ int nr_allocs;
+ int nr_frees;
+ int busy_configuring;
+ int busy_initializing;
+ int busy_scanning;
+ struct mutex busy_shutting_down;
+
+ /* This element holds the zero based queue number of the last
+ * queue to be started. It is used for fairness.
+ */
+ int next_to_run;
+
+ /* Disk structures we need to pass back */
+ struct gendisk *gendisk[CISS_MAX_LUN];
+#ifdef CONFIG_CISS_SCSI_TAPE
+ struct cciss_scsi_adapter_data_t *scsi_ctlr;
+#endif
+ unsigned char alive;
+ struct list_head scan_list;
+ struct completion scan_wait;
+ struct device dev;
+ /*
+ * Performant mode tables.
+ */
+ u32 trans_support;
+ u32 trans_offset;
+ struct TransTable_struct *transtable;
+ unsigned long transMethod;
+
+ /*
+ * Performant mode completion buffer
+ */
+ u64 *reply_pool;
+ dma_addr_t reply_pool_dhandle;
+ u64 *reply_pool_head;
+ size_t reply_pool_size;
+ unsigned char reply_pool_wraparound;
+ u32 *blockFetchTable;
+};
+
+/* Defining the diffent access_methods
+ *
+ * Memory mapped FIFO interface (SMART 53xx cards)
+ */
+#define SA5_DOORBELL 0x20
+#define SA5_REQUEST_PORT_OFFSET 0x40
+#define SA5_REPLY_INTR_MASK_OFFSET 0x34
+#define SA5_REPLY_PORT_OFFSET 0x44
+#define SA5_INTR_STATUS 0x30
+#define SA5_SCRATCHPAD_OFFSET 0xB0
+
+#define SA5_CTCFG_OFFSET 0xB4
+#define SA5_CTMEM_OFFSET 0xB8
+
+#define SA5_INTR_OFF 0x08
+#define SA5B_INTR_OFF 0x04
+#define SA5_INTR_PENDING 0x08
+#define SA5B_INTR_PENDING 0x04
+#define FIFO_EMPTY 0xffffffff
+#define CCISS_FIRMWARE_READY 0xffff0000 /* value in scratchpad register */
+/* Perf. mode flags */
+#define SA5_PERF_INTR_PENDING 0x04
+#define SA5_PERF_INTR_OFF 0x05
+#define SA5_OUTDB_STATUS_PERF_BIT 0x01
+#define SA5_OUTDB_CLEAR_PERF_BIT 0x01
+#define SA5_OUTDB_CLEAR 0xA0
+#define SA5_OUTDB_CLEAR_PERF_BIT 0x01
+#define SA5_OUTDB_STATUS 0x9C
+
+
+#define CISS_ERROR_BIT 0x02
+
+#define CCISS_INTR_ON 1
+#define CCISS_INTR_OFF 0
+
+
+/* CCISS_BOARD_READY_WAIT_SECS is how long to wait for a board
+ * to become ready, in seconds, before giving up on it.
+ * CCISS_BOARD_READY_POLL_INTERVAL_MSECS * is how long to wait
+ * between polling the board to see if it is ready, in
+ * milliseconds. CCISS_BOARD_READY_ITERATIONS is derived
+ * the above.
+ */
+#define CCISS_BOARD_READY_WAIT_SECS (120)
+#define CCISS_BOARD_NOT_READY_WAIT_SECS (100)
+#define CCISS_BOARD_READY_POLL_INTERVAL_MSECS (100)
+#define CCISS_BOARD_READY_ITERATIONS \
+ ((CCISS_BOARD_READY_WAIT_SECS * 1000) / \
+ CCISS_BOARD_READY_POLL_INTERVAL_MSECS)
+#define CCISS_BOARD_NOT_READY_ITERATIONS \
+ ((CCISS_BOARD_NOT_READY_WAIT_SECS * 1000) / \
+ CCISS_BOARD_READY_POLL_INTERVAL_MSECS)
+#define CCISS_POST_RESET_PAUSE_MSECS (3000)
+#define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (4000)
+#define CCISS_POST_RESET_NOOP_RETRIES (12)
+#define CCISS_POST_RESET_NOOP_TIMEOUT_MSECS (10000)
+
+/*
+ Send the command to the hardware
+*/
+static void SA5_submit_command( ctlr_info_t *h, CommandList_struct *c)
+{
+#ifdef CCISS_DEBUG
+ printk(KERN_WARNING "cciss%d: Sending %08x - down to controller\n",
+ h->ctlr, c->busaddr);
+#endif /* CCISS_DEBUG */
+ writel(c->busaddr, h->vaddr + SA5_REQUEST_PORT_OFFSET);
+ readl(h->vaddr + SA5_SCRATCHPAD_OFFSET);
+ h->commands_outstanding++;
+ if ( h->commands_outstanding > h->max_outstanding)
+ h->max_outstanding = h->commands_outstanding;
+}
+
+/*
+ * This card is the opposite of the other cards.
+ * 0 turns interrupts on...
+ * 0x08 turns them off...
+ */
+static void SA5_intr_mask(ctlr_info_t *h, unsigned long val)
+{
+ if (val)
+ { /* Turn interrupts on */
+ h->interrupts_enabled = 1;
+ writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+ (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+ } else /* Turn them off */
+ {
+ h->interrupts_enabled = 0;
+ writel( SA5_INTR_OFF,
+ h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+ (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+ }
+}
+/*
+ * This card is the opposite of the other cards.
+ * 0 turns interrupts on...
+ * 0x04 turns them off...
+ */
+static void SA5B_intr_mask(ctlr_info_t *h, unsigned long val)
+{
+ if (val)
+ { /* Turn interrupts on */
+ h->interrupts_enabled = 1;
+ writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+ (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+ } else /* Turn them off */
+ {
+ h->interrupts_enabled = 0;
+ writel( SA5B_INTR_OFF,
+ h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+ (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+ }
+}
+
+/* Performant mode intr_mask */
+static void SA5_performant_intr_mask(ctlr_info_t *h, unsigned long val)
+{
+ if (val) { /* turn on interrupts */
+ h->interrupts_enabled = 1;
+ writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+ (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+ } else {
+ h->interrupts_enabled = 0;
+ writel(SA5_PERF_INTR_OFF,
+ h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+ (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET);
+ }
+}
+
+/*
+ * Returns true if fifo is full.
+ *
+ */
+static unsigned long SA5_fifo_full(ctlr_info_t *h)
+{
+ if( h->commands_outstanding >= h->max_commands)
+ return(1);
+ else
+ return(0);
+
+}
+/*
+ * returns value read from hardware.
+ * returns FIFO_EMPTY if there is nothing to read
+ */
+static unsigned long SA5_completed(ctlr_info_t *h)
+{
+ unsigned long register_value
+ = readl(h->vaddr + SA5_REPLY_PORT_OFFSET);
+ if(register_value != FIFO_EMPTY)
+ {
+ h->commands_outstanding--;
+#ifdef CCISS_DEBUG
+ printk("cciss: Read %lx back from board\n", register_value);
+#endif /* CCISS_DEBUG */
+ }
+#ifdef CCISS_DEBUG
+ else
+ {
+ printk("cciss: FIFO Empty read\n");
+ }
+#endif
+ return ( register_value);
+
+}
+
+/* Performant mode command completed */
+static unsigned long SA5_performant_completed(ctlr_info_t *h)
+{
+ unsigned long register_value = FIFO_EMPTY;
+
+ /* flush the controller write of the reply queue by reading
+ * outbound doorbell status register.
+ */
+ register_value = readl(h->vaddr + SA5_OUTDB_STATUS);
+ /* msi auto clears the interrupt pending bit. */
+ if (!(h->msi_vector || h->msix_vector)) {
+ writel(SA5_OUTDB_CLEAR_PERF_BIT, h->vaddr + SA5_OUTDB_CLEAR);
+ /* Do a read in order to flush the write to the controller
+ * (as per spec.)
+ */
+ register_value = readl(h->vaddr + SA5_OUTDB_STATUS);
+ }
+
+ if ((*(h->reply_pool_head) & 1) == (h->reply_pool_wraparound)) {
+ register_value = *(h->reply_pool_head);
+ (h->reply_pool_head)++;
+ h->commands_outstanding--;
+ } else {
+ register_value = FIFO_EMPTY;
+ }
+ /* Check for wraparound */
+ if (h->reply_pool_head == (h->reply_pool + h->max_commands)) {
+ h->reply_pool_head = h->reply_pool;
+ h->reply_pool_wraparound ^= 1;
+ }
+
+ return register_value;
+}
+/*
+ * Returns true if an interrupt is pending..
+ */
+static bool SA5_intr_pending(ctlr_info_t *h)
+{
+ unsigned long register_value =
+ readl(h->vaddr + SA5_INTR_STATUS);
+#ifdef CCISS_DEBUG
+ printk("cciss: intr_pending %lx\n", register_value);
+#endif /* CCISS_DEBUG */
+ if( register_value & SA5_INTR_PENDING)
+ return 1;
+ return 0 ;
+}
+
+/*
+ * Returns true if an interrupt is pending..
+ */
+static bool SA5B_intr_pending(ctlr_info_t *h)
+{
+ unsigned long register_value =
+ readl(h->vaddr + SA5_INTR_STATUS);
+#ifdef CCISS_DEBUG
+ printk("cciss: intr_pending %lx\n", register_value);
+#endif /* CCISS_DEBUG */
+ if( register_value & SA5B_INTR_PENDING)
+ return 1;
+ return 0 ;
+}
+
+static bool SA5_performant_intr_pending(ctlr_info_t *h)
+{
+ unsigned long register_value = readl(h->vaddr + SA5_INTR_STATUS);
+
+ if (!register_value)
+ return false;
+
+ if (h->msi_vector || h->msix_vector)
+ return true;
+
+ /* Read outbound doorbell to flush */
+ register_value = readl(h->vaddr + SA5_OUTDB_STATUS);
+ return register_value & SA5_OUTDB_STATUS_PERF_BIT;
+}
+
+static struct access_method SA5_access = {
+ SA5_submit_command,
+ SA5_intr_mask,
+ SA5_fifo_full,
+ SA5_intr_pending,
+ SA5_completed,
+};
+
+static struct access_method SA5B_access = {
+ SA5_submit_command,
+ SA5B_intr_mask,
+ SA5_fifo_full,
+ SA5B_intr_pending,
+ SA5_completed,
+};
+
+static struct access_method SA5_performant_access = {
+ SA5_submit_command,
+ SA5_performant_intr_mask,
+ SA5_fifo_full,
+ SA5_performant_intr_pending,
+ SA5_performant_completed,
+};
+
+struct board_type {
+ __u32 board_id;
+ char *product_name;
+ struct access_method *access;
+ int nr_cmds; /* Max cmds this kind of ctlr can handle. */
+};
+
+#endif /* CCISS_H */
diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h
new file mode 100644
index 000000000..d9be6b4d4
--- /dev/null
+++ b/drivers/block/cciss_cmd.h
@@ -0,0 +1,269 @@
+#ifndef CCISS_CMD_H
+#define CCISS_CMD_H
+
+#include <linux/cciss_defs.h>
+
+/* DEFINES */
+#define CISS_VERSION "1.00"
+
+/* general boundary definitions */
+#define MAXSGENTRIES 32
+#define CCISS_SG_CHAIN 0x80000000
+#define MAXREPLYQS 256
+
+/* Unit Attentions ASC's as defined for the MSA2012sa */
+#define POWER_OR_RESET 0x29
+#define STATE_CHANGED 0x2a
+#define UNIT_ATTENTION_CLEARED 0x2f
+#define LUN_FAILED 0x3e
+#define REPORT_LUNS_CHANGED 0x3f
+
+/* Unit Attentions ASCQ's as defined for the MSA2012sa */
+
+ /* These ASCQ's defined for ASC = POWER_OR_RESET */
+#define POWER_ON_RESET 0x00
+#define POWER_ON_REBOOT 0x01
+#define SCSI_BUS_RESET 0x02
+#define MSA_TARGET_RESET 0x03
+#define CONTROLLER_FAILOVER 0x04
+#define TRANSCEIVER_SE 0x05
+#define TRANSCEIVER_LVD 0x06
+
+ /* These ASCQ's defined for ASC = STATE_CHANGED */
+#define RESERVATION_PREEMPTED 0x03
+#define ASYM_ACCESS_CHANGED 0x06
+#define LUN_CAPACITY_CHANGED 0x09
+
+/* config space register offsets */
+#define CFG_VENDORID 0x00
+#define CFG_DEVICEID 0x02
+#define CFG_I2OBAR 0x10
+#define CFG_MEM1BAR 0x14
+
+/* i2o space register offsets */
+#define I2O_IBDB_SET 0x20
+#define I2O_IBDB_CLEAR 0x70
+#define I2O_INT_STATUS 0x30
+#define I2O_INT_MASK 0x34
+#define I2O_IBPOST_Q 0x40
+#define I2O_OBPOST_Q 0x44
+#define I2O_DMA1_CFG 0x214
+
+/* Configuration Table */
+#define CFGTBL_ChangeReq 0x00000001l
+#define CFGTBL_AccCmds 0x00000001l
+#define DOORBELL_CTLR_RESET 0x00000004l
+#define DOORBELL_CTLR_RESET2 0x00000020l
+
+#define CFGTBL_Trans_Simple 0x00000002l
+#define CFGTBL_Trans_Performant 0x00000004l
+#define CFGTBL_Trans_use_short_tags 0x20000000l
+
+#define CFGTBL_BusType_Ultra2 0x00000001l
+#define CFGTBL_BusType_Ultra3 0x00000002l
+#define CFGTBL_BusType_Fibre1G 0x00000100l
+#define CFGTBL_BusType_Fibre2G 0x00000200l
+typedef struct _vals32
+{
+ __u32 lower;
+ __u32 upper;
+} vals32;
+
+typedef union _u64bit
+{
+ vals32 val32;
+ __u64 val;
+} u64bit;
+
+/* Type defs used in the following structs */
+#define QWORD vals32
+
+/* STRUCTURES */
+#define CISS_MAX_PHYS_LUN 1024
+/* SCSI-3 Cmmands */
+
+#pragma pack(1)
+
+#define CISS_INQUIRY 0x12
+/* Date returned */
+typedef struct _InquiryData_struct
+{
+ BYTE data_byte[36];
+} InquiryData_struct;
+
+#define CISS_REPORT_LOG 0xc2 /* Report Logical LUNs */
+#define CISS_REPORT_PHYS 0xc3 /* Report Physical LUNs */
+/* Data returned */
+typedef struct _ReportLUNdata_struct
+{
+ BYTE LUNListLength[4];
+ DWORD reserved;
+ BYTE LUN[CISS_MAX_LUN][8];
+} ReportLunData_struct;
+
+#define CCISS_READ_CAPACITY 0x25 /* Read Capacity */
+typedef struct _ReadCapdata_struct
+{
+ BYTE total_size[4]; /* Total size in blocks */
+ BYTE block_size[4]; /* Size of blocks in bytes */
+} ReadCapdata_struct;
+
+#define CCISS_READ_CAPACITY_16 0x9e /* Read Capacity 16 */
+
+/* service action to differentiate a 16 byte read capacity from
+ other commands that use the 0x9e SCSI op code */
+
+#define CCISS_READ_CAPACITY_16_SERVICE_ACT 0x10
+
+typedef struct _ReadCapdata_struct_16
+{
+ BYTE total_size[8]; /* Total size in blocks */
+ BYTE block_size[4]; /* Size of blocks in bytes */
+ BYTE prot_en:1; /* protection enable bit */
+ BYTE rto_en:1; /* reference tag own enable bit */
+ BYTE reserved:6; /* reserved bits */
+ BYTE reserved2[18]; /* reserved bytes per spec */
+} ReadCapdata_struct_16;
+
+/* Define the supported read/write commands for cciss based controllers */
+
+#define CCISS_READ_10 0x28 /* Read(10) */
+#define CCISS_WRITE_10 0x2a /* Write(10) */
+#define CCISS_READ_16 0x88 /* Read(16) */
+#define CCISS_WRITE_16 0x8a /* Write(16) */
+
+/* Define the CDB lengths supported by cciss based controllers */
+
+#define CDB_LEN10 10
+#define CDB_LEN16 16
+
+/* BMIC commands */
+#define BMIC_READ 0x26
+#define BMIC_WRITE 0x27
+#define BMIC_CACHE_FLUSH 0xc2
+#define CCISS_CACHE_FLUSH 0x01 /* C2 was already being used by CCISS */
+
+#define CCISS_ABORT_MSG 0x00
+#define CCISS_RESET_MSG 0x01
+#define CCISS_RESET_TYPE_CONTROLLER 0x00
+#define CCISS_RESET_TYPE_BUS 0x01
+#define CCISS_RESET_TYPE_TARGET 0x03
+#define CCISS_RESET_TYPE_LUN 0x04
+#define CCISS_NOOP_MSG 0x03
+
+/* Command List Structure */
+#define CTLR_LUNID "\0\0\0\0\0\0\0\0"
+
+typedef struct _CommandListHeader_struct {
+ BYTE ReplyQueue;
+ BYTE SGList;
+ HWORD SGTotal;
+ QWORD Tag;
+ LUNAddr_struct LUN;
+} CommandListHeader_struct;
+typedef struct _ErrDescriptor_struct {
+ QWORD Addr;
+ DWORD Len;
+} ErrDescriptor_struct;
+typedef struct _SGDescriptor_struct {
+ QWORD Addr;
+ DWORD Len;
+ DWORD Ext;
+} SGDescriptor_struct;
+
+/* Command types */
+#define CMD_RWREQ 0x00
+#define CMD_IOCTL_PEND 0x01
+#define CMD_SCSI 0x03
+#define CMD_MSG_DONE 0x04
+#define CMD_MSG_TIMEOUT 0x05
+#define CMD_MSG_STALE 0xff
+
+/* This structure needs to be divisible by COMMANDLIST_ALIGNMENT
+ * because low bits of the address are used to to indicate that
+ * whether the tag contains an index or an address. PAD_32 and
+ * PAD_64 can be adjusted independently as needed for 32-bit
+ * and 64-bits systems.
+ */
+#define COMMANDLIST_ALIGNMENT (32)
+#define IS_64_BIT ((sizeof(long) - 4)/4)
+#define IS_32_BIT (!IS_64_BIT)
+#define PAD_32 (0)
+#define PAD_64 (4)
+#define PADSIZE (IS_32_BIT * PAD_32 + IS_64_BIT * PAD_64)
+#define DIRECT_LOOKUP_BIT 0x10
+#define DIRECT_LOOKUP_SHIFT 5
+
+typedef struct _CommandList_struct {
+ CommandListHeader_struct Header;
+ RequestBlock_struct Request;
+ ErrDescriptor_struct ErrDesc;
+ SGDescriptor_struct SG[MAXSGENTRIES];
+ /* information associated with the command */
+ __u32 busaddr; /* physical address of this record */
+ ErrorInfo_struct * err_info; /* pointer to the allocated mem */
+ int ctlr;
+ int cmd_type;
+ long cmdindex;
+ struct list_head list;
+ struct request * rq;
+ struct completion *waiting;
+ int retry_count;
+ void * scsi_cmd;
+ char pad[PADSIZE];
+} CommandList_struct;
+
+/* Configuration Table Structure */
+typedef struct _HostWrite_struct {
+ DWORD TransportRequest;
+ DWORD Reserved;
+ DWORD CoalIntDelay;
+ DWORD CoalIntCount;
+} HostWrite_struct;
+
+typedef struct _CfgTable_struct {
+ BYTE Signature[4];
+ DWORD SpecValence;
+#define SIMPLE_MODE 0x02
+#define PERFORMANT_MODE 0x04
+#define MEMQ_MODE 0x08
+ DWORD TransportSupport;
+ DWORD TransportActive;
+ HostWrite_struct HostWrite;
+ DWORD CmdsOutMax;
+ DWORD BusTypes;
+ DWORD TransMethodOffset;
+ BYTE ServerName[16];
+ DWORD HeartBeat;
+ DWORD SCSI_Prefetch;
+ DWORD MaxSGElements;
+ DWORD MaxLogicalUnits;
+ DWORD MaxPhysicalDrives;
+ DWORD MaxPhysicalDrivesPerLogicalUnit;
+ DWORD MaxPerformantModeCommands;
+ u8 reserved[0x78 - 0x58];
+ u32 misc_fw_support; /* offset 0x78 */
+#define MISC_FW_DOORBELL_RESET (0x02)
+#define MISC_FW_DOORBELL_RESET2 (0x10)
+ u8 driver_version[32];
+} CfgTable_struct;
+
+struct TransTable_struct {
+ u32 BlockFetch0;
+ u32 BlockFetch1;
+ u32 BlockFetch2;
+ u32 BlockFetch3;
+ u32 BlockFetch4;
+ u32 BlockFetch5;
+ u32 BlockFetch6;
+ u32 BlockFetch7;
+ u32 RepQSize;
+ u32 RepQCount;
+ u32 RepQCtrAddrLow32;
+ u32 RepQCtrAddrHigh32;
+ u32 RepQAddr0Low32;
+ u32 RepQAddr0High32;
+};
+
+#pragma pack()
+#endif /* CCISS_CMD_H */
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
new file mode 100644
index 000000000..ecd845cd2
--- /dev/null
+++ b/drivers/block/cciss_scsi.c
@@ -0,0 +1,1712 @@
+/*
+ * Disk Array driver for HP Smart Array controllers, SCSI Tape module.
+ * (C) Copyright 2001, 2007 Hewlett-Packard Development Company, L.P.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 300, Boston, MA
+ * 02111-1307, USA.
+ *
+ * Questions/Comments/Bugfixes to iss_storagedev@hp.com
+ *
+ * Author: Stephen M. Cameron
+ */
+#ifdef CONFIG_CISS_SCSI_TAPE
+
+/* Here we have code to present the driver as a scsi driver
+ as it is simultaneously presented as a block driver. The
+ reason for doing this is to allow access to SCSI tape drives
+ through the array controller. Note in particular, neither
+ physical nor logical disks are presented through the scsi layer. */
+
+#include <linux/timer.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include <linux/atomic.h>
+
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_host.h>
+
+#include "cciss_scsi.h"
+
+#define CCISS_ABORT_MSG 0x00
+#define CCISS_RESET_MSG 0x01
+
+static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff,
+ size_t size,
+ __u8 page_code, unsigned char *scsi3addr,
+ int cmd_type);
+
+static CommandList_struct *cmd_alloc(ctlr_info_t *h);
+static CommandList_struct *cmd_special_alloc(ctlr_info_t *h);
+static void cmd_free(ctlr_info_t *h, CommandList_struct *c);
+static void cmd_special_free(ctlr_info_t *h, CommandList_struct *c);
+
+static int cciss_scsi_write_info(struct Scsi_Host *sh,
+ char *buffer, /* data buffer */
+ int length); /* length of data in buffer */
+static int cciss_scsi_show_info(struct seq_file *m,
+ struct Scsi_Host *sh);
+
+static int cciss_scsi_queue_command (struct Scsi_Host *h,
+ struct scsi_cmnd *cmd);
+static int cciss_eh_device_reset_handler(struct scsi_cmnd *);
+static int cciss_eh_abort_handler(struct scsi_cmnd *);
+
+static struct cciss_scsi_hba_t ccissscsi[MAX_CTLR] = {
+ { .name = "cciss0", .ndevices = 0 },
+ { .name = "cciss1", .ndevices = 0 },
+ { .name = "cciss2", .ndevices = 0 },
+ { .name = "cciss3", .ndevices = 0 },
+ { .name = "cciss4", .ndevices = 0 },
+ { .name = "cciss5", .ndevices = 0 },
+ { .name = "cciss6", .ndevices = 0 },
+ { .name = "cciss7", .ndevices = 0 },
+};
+
+static struct scsi_host_template cciss_driver_template = {
+ .module = THIS_MODULE,
+ .name = "cciss",
+ .proc_name = "cciss",
+ .write_info = cciss_scsi_write_info,
+ .show_info = cciss_scsi_show_info,
+ .queuecommand = cciss_scsi_queue_command,
+ .this_id = 7,
+ .cmd_per_lun = 1,
+ .use_clustering = DISABLE_CLUSTERING,
+ /* Can't have eh_bus_reset_handler or eh_host_reset_handler for cciss */
+ .eh_device_reset_handler= cciss_eh_device_reset_handler,
+ .eh_abort_handler = cciss_eh_abort_handler,
+};
+
+#pragma pack(1)
+
+#define SCSI_PAD_32 8
+#define SCSI_PAD_64 8
+
+struct cciss_scsi_cmd_stack_elem_t {
+ CommandList_struct cmd;
+ ErrorInfo_struct Err;
+ __u32 busaddr;
+ int cmdindex;
+ u8 pad[IS_32_BIT * SCSI_PAD_32 + IS_64_BIT * SCSI_PAD_64];
+};
+
+#pragma pack()
+
+#pragma pack(1)
+struct cciss_scsi_cmd_stack_t {
+ struct cciss_scsi_cmd_stack_elem_t *pool;
+ struct cciss_scsi_cmd_stack_elem_t **elem;
+ dma_addr_t cmd_pool_handle;
+ int top;
+ int nelems;
+};
+#pragma pack()
+
+struct cciss_scsi_adapter_data_t {
+ struct Scsi_Host *scsi_host;
+ struct cciss_scsi_cmd_stack_t cmd_stack;
+ SGDescriptor_struct **cmd_sg_list;
+ int registered;
+ spinlock_t lock; // to protect ccissscsi[ctlr];
+};
+
+#define CPQ_TAPE_LOCK(h, flags) spin_lock_irqsave( \
+ &h->scsi_ctlr->lock, flags);
+#define CPQ_TAPE_UNLOCK(h, flags) spin_unlock_irqrestore( \
+ &h->scsi_ctlr->lock, flags);
+
+static CommandList_struct *
+scsi_cmd_alloc(ctlr_info_t *h)
+{
+ /* assume only one process in here at a time, locking done by caller. */
+ /* use h->lock */
+ /* might be better to rewrite how we allocate scsi commands in a way that */
+ /* needs no locking at all. */
+
+ /* take the top memory chunk off the stack and return it, if any. */
+ struct cciss_scsi_cmd_stack_elem_t *c;
+ struct cciss_scsi_adapter_data_t *sa;
+ struct cciss_scsi_cmd_stack_t *stk;
+ u64bit temp64;
+
+ sa = h->scsi_ctlr;
+ stk = &sa->cmd_stack;
+
+ if (stk->top < 0)
+ return NULL;
+ c = stk->elem[stk->top];
+ /* memset(c, 0, sizeof(*c)); */
+ memset(&c->cmd, 0, sizeof(c->cmd));
+ memset(&c->Err, 0, sizeof(c->Err));
+ /* set physical addr of cmd and addr of scsi parameters */
+ c->cmd.busaddr = c->busaddr;
+ c->cmd.cmdindex = c->cmdindex;
+ /* (__u32) (stk->cmd_pool_handle +
+ (sizeof(struct cciss_scsi_cmd_stack_elem_t)*stk->top)); */
+
+ temp64.val = (__u64) (c->busaddr + sizeof(CommandList_struct));
+ /* (__u64) (stk->cmd_pool_handle +
+ (sizeof(struct cciss_scsi_cmd_stack_elem_t)*stk->top) +
+ sizeof(CommandList_struct)); */
+ stk->top--;
+ c->cmd.ErrDesc.Addr.lower = temp64.val32.lower;
+ c->cmd.ErrDesc.Addr.upper = temp64.val32.upper;
+ c->cmd.ErrDesc.Len = sizeof(ErrorInfo_struct);
+
+ c->cmd.ctlr = h->ctlr;
+ c->cmd.err_info = &c->Err;
+
+ return (CommandList_struct *) c;
+}
+
+static void
+scsi_cmd_free(ctlr_info_t *h, CommandList_struct *c)
+{
+ /* assume only one process in here at a time, locking done by caller. */
+ /* use h->lock */
+ /* drop the free memory chunk on top of the stack. */
+
+ struct cciss_scsi_adapter_data_t *sa;
+ struct cciss_scsi_cmd_stack_t *stk;
+
+ sa = h->scsi_ctlr;
+ stk = &sa->cmd_stack;
+ stk->top++;
+ if (stk->top >= stk->nelems) {
+ dev_err(&h->pdev->dev,
+ "scsi_cmd_free called too many times.\n");
+ BUG();
+ }
+ stk->elem[stk->top] = (struct cciss_scsi_cmd_stack_elem_t *) c;
+}
+
+static int
+scsi_cmd_stack_setup(ctlr_info_t *h, struct cciss_scsi_adapter_data_t *sa)
+{
+ int i;
+ struct cciss_scsi_cmd_stack_t *stk;
+ size_t size;
+
+ stk = &sa->cmd_stack;
+ stk->nelems = cciss_tape_cmds + 2;
+ sa->cmd_sg_list = cciss_allocate_sg_chain_blocks(h,
+ h->chainsize, stk->nelems);
+ if (!sa->cmd_sg_list && h->chainsize > 0)
+ return -ENOMEM;
+
+ size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * stk->nelems;
+
+ /* Check alignment, see cciss_cmd.h near CommandList_struct def. */
+ BUILD_BUG_ON((sizeof(*stk->pool) % COMMANDLIST_ALIGNMENT) != 0);
+ /* pci_alloc_consistent guarantees 32-bit DMA address will be used */
+ stk->pool = (struct cciss_scsi_cmd_stack_elem_t *)
+ pci_alloc_consistent(h->pdev, size, &stk->cmd_pool_handle);
+
+ if (stk->pool == NULL) {
+ cciss_free_sg_chain_blocks(sa->cmd_sg_list, stk->nelems);
+ sa->cmd_sg_list = NULL;
+ return -ENOMEM;
+ }
+ stk->elem = kmalloc(sizeof(stk->elem[0]) * stk->nelems, GFP_KERNEL);
+ if (!stk->elem) {
+ pci_free_consistent(h->pdev, size, stk->pool,
+ stk->cmd_pool_handle);
+ return -1;
+ }
+ for (i = 0; i < stk->nelems; i++) {
+ stk->elem[i] = &stk->pool[i];
+ stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle +
+ (sizeof(struct cciss_scsi_cmd_stack_elem_t) * i));
+ stk->elem[i]->cmdindex = i;
+ }
+ stk->top = stk->nelems-1;
+ return 0;
+}
+
+static void
+scsi_cmd_stack_free(ctlr_info_t *h)
+{
+ struct cciss_scsi_adapter_data_t *sa;
+ struct cciss_scsi_cmd_stack_t *stk;
+ size_t size;
+
+ sa = h->scsi_ctlr;
+ stk = &sa->cmd_stack;
+ if (stk->top != stk->nelems-1) {
+ dev_warn(&h->pdev->dev,
+ "bug: %d scsi commands are still outstanding.\n",
+ stk->nelems - stk->top);
+ }
+ size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * stk->nelems;
+
+ pci_free_consistent(h->pdev, size, stk->pool, stk->cmd_pool_handle);
+ stk->pool = NULL;
+ cciss_free_sg_chain_blocks(sa->cmd_sg_list, stk->nelems);
+ kfree(stk->elem);
+ stk->elem = NULL;
+}
+
+#if 0
+static int xmargin=8;
+static int amargin=60;
+
+static void
+print_bytes (unsigned char *c, int len, int hex, int ascii)
+{
+
+ int i;
+ unsigned char *x;
+
+ if (hex)
+ {
+ x = c;
+ for (i=0;i<len;i++)
+ {
+ if ((i % xmargin) == 0 && i>0) printk("\n");
+ if ((i % xmargin) == 0) printk("0x%04x:", i);
+ printk(" %02x", *x);
+ x++;
+ }
+ printk("\n");
+ }
+ if (ascii)
+ {
+ x = c;
+ for (i=0;i<len;i++)
+ {
+ if ((i % amargin) == 0 && i>0) printk("\n");
+ if ((i % amargin) == 0) printk("0x%04x:", i);
+ if (*x > 26 && *x < 128) printk("%c", *x);
+ else printk(".");
+ x++;
+ }
+ printk("\n");
+ }
+}
+
+static void
+print_cmd(CommandList_struct *cp)
+{
+ printk("queue:%d\n", cp->Header.ReplyQueue);
+ printk("sglist:%d\n", cp->Header.SGList);
+ printk("sgtot:%d\n", cp->Header.SGTotal);
+ printk("Tag:0x%08x/0x%08x\n", cp->Header.Tag.upper,
+ cp->Header.Tag.lower);
+ printk("LUN:0x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+ cp->Header.LUN.LunAddrBytes[0],
+ cp->Header.LUN.LunAddrBytes[1],
+ cp->Header.LUN.LunAddrBytes[2],
+ cp->Header.LUN.LunAddrBytes[3],
+ cp->Header.LUN.LunAddrBytes[4],
+ cp->Header.LUN.LunAddrBytes[5],
+ cp->Header.LUN.LunAddrBytes[6],
+ cp->Header.LUN.LunAddrBytes[7]);
+ printk("CDBLen:%d\n", cp->Request.CDBLen);
+ printk("Type:%d\n",cp->Request.Type.Type);
+ printk("Attr:%d\n",cp->Request.Type.Attribute);
+ printk(" Dir:%d\n",cp->Request.Type.Direction);
+ printk("Timeout:%d\n",cp->Request.Timeout);
+ printk( "CDB: %02x %02x %02x %02x %02x %02x %02x %02x"
+ " %02x %02x %02x %02x %02x %02x %02x %02x\n",
+ cp->Request.CDB[0], cp->Request.CDB[1],
+ cp->Request.CDB[2], cp->Request.CDB[3],
+ cp->Request.CDB[4], cp->Request.CDB[5],
+ cp->Request.CDB[6], cp->Request.CDB[7],
+ cp->Request.CDB[8], cp->Request.CDB[9],
+ cp->Request.CDB[10], cp->Request.CDB[11],
+ cp->Request.CDB[12], cp->Request.CDB[13],
+ cp->Request.CDB[14], cp->Request.CDB[15]),
+ printk("edesc.Addr: 0x%08x/0%08x, Len = %d\n",
+ cp->ErrDesc.Addr.upper, cp->ErrDesc.Addr.lower,
+ cp->ErrDesc.Len);
+ printk("sgs..........Errorinfo:\n");
+ printk("scsistatus:%d\n", cp->err_info->ScsiStatus);
+ printk("senselen:%d\n", cp->err_info->SenseLen);
+ printk("cmd status:%d\n", cp->err_info->CommandStatus);
+ printk("resid cnt:%d\n", cp->err_info->ResidualCnt);
+ printk("offense size:%d\n", cp->err_info->MoreErrInfo.Invalid_Cmd.offense_size);
+ printk("offense byte:%d\n", cp->err_info->MoreErrInfo.Invalid_Cmd.offense_num);
+ printk("offense value:%d\n", cp->err_info->MoreErrInfo.Invalid_Cmd.offense_value);
+
+}
+
+#endif
+
+static int
+find_bus_target_lun(ctlr_info_t *h, int *bus, int *target, int *lun)
+{
+ /* finds an unused bus, target, lun for a new device */
+ /* assumes h->scsi_ctlr->lock is held */
+ int i, found=0;
+ unsigned char target_taken[CCISS_MAX_SCSI_DEVS_PER_HBA];
+
+ memset(&target_taken[0], 0, CCISS_MAX_SCSI_DEVS_PER_HBA);
+
+ target_taken[SELF_SCSI_ID] = 1;
+ for (i = 0; i < ccissscsi[h->ctlr].ndevices; i++)
+ target_taken[ccissscsi[h->ctlr].dev[i].target] = 1;
+
+ for (i = 0; i < CCISS_MAX_SCSI_DEVS_PER_HBA; i++) {
+ if (!target_taken[i]) {
+ *bus = 0; *target=i; *lun = 0; found=1;
+ break;
+ }
+ }
+ return (!found);
+}
+struct scsi2map {
+ char scsi3addr[8];
+ int bus, target, lun;
+};
+
+static int
+cciss_scsi_add_entry(ctlr_info_t *h, int hostno,
+ struct cciss_scsi_dev_t *device,
+ struct scsi2map *added, int *nadded)
+{
+ /* assumes h->scsi_ctlr->lock is held */
+ int n = ccissscsi[h->ctlr].ndevices;
+ struct cciss_scsi_dev_t *sd;
+ int i, bus, target, lun;
+ unsigned char addr1[8], addr2[8];
+
+ if (n >= CCISS_MAX_SCSI_DEVS_PER_HBA) {
+ dev_warn(&h->pdev->dev, "Too many devices, "
+ "some will be inaccessible.\n");
+ return -1;
+ }
+
+ bus = target = -1;
+ lun = 0;
+ /* Is this device a non-zero lun of a multi-lun device */
+ /* byte 4 of the 8-byte LUN addr will contain the logical unit no. */
+ if (device->scsi3addr[4] != 0) {
+ /* Search through our list and find the device which */
+ /* has the same 8 byte LUN address, excepting byte 4. */
+ /* Assign the same bus and target for this new LUN. */
+ /* Use the logical unit number from the firmware. */
+ memcpy(addr1, device->scsi3addr, 8);
+ addr1[4] = 0;
+ for (i = 0; i < n; i++) {
+ sd = &ccissscsi[h->ctlr].dev[i];
+ memcpy(addr2, sd->scsi3addr, 8);
+ addr2[4] = 0;
+ /* differ only in byte 4? */
+ if (memcmp(addr1, addr2, 8) == 0) {
+ bus = sd->bus;
+ target = sd->target;
+ lun = device->scsi3addr[4];
+ break;
+ }
+ }
+ }
+
+ sd = &ccissscsi[h->ctlr].dev[n];
+ if (lun == 0) {
+ if (find_bus_target_lun(h,
+ &sd->bus, &sd->target, &sd->lun) != 0)
+ return -1;
+ } else {
+ sd->bus = bus;
+ sd->target = target;
+ sd->lun = lun;
+ }
+ added[*nadded].bus = sd->bus;
+ added[*nadded].target = sd->target;
+ added[*nadded].lun = sd->lun;
+ (*nadded)++;
+
+ memcpy(sd->scsi3addr, device->scsi3addr, 8);
+ memcpy(sd->vendor, device->vendor, sizeof(sd->vendor));
+ memcpy(sd->revision, device->revision, sizeof(sd->revision));
+ memcpy(sd->device_id, device->device_id, sizeof(sd->device_id));
+ sd->devtype = device->devtype;
+
+ ccissscsi[h->ctlr].ndevices++;
+
+ /* initially, (before registering with scsi layer) we don't
+ know our hostno and we don't want to print anything first
+ time anyway (the scsi layer's inquiries will show that info) */
+ if (hostno != -1)
+ dev_info(&h->pdev->dev, "%s device c%db%dt%dl%d added.\n",
+ scsi_device_type(sd->devtype), hostno,
+ sd->bus, sd->target, sd->lun);
+ return 0;
+}
+
+static void
+cciss_scsi_remove_entry(ctlr_info_t *h, int hostno, int entry,
+ struct scsi2map *removed, int *nremoved)
+{
+ /* assumes h->ctlr]->scsi_ctlr->lock is held */
+ int i;
+ struct cciss_scsi_dev_t sd;
+
+ if (entry < 0 || entry >= CCISS_MAX_SCSI_DEVS_PER_HBA) return;
+ sd = ccissscsi[h->ctlr].dev[entry];
+ removed[*nremoved].bus = sd.bus;
+ removed[*nremoved].target = sd.target;
+ removed[*nremoved].lun = sd.lun;
+ (*nremoved)++;
+ for (i = entry; i < ccissscsi[h->ctlr].ndevices-1; i++)
+ ccissscsi[h->ctlr].dev[i] = ccissscsi[h->ctlr].dev[i+1];
+ ccissscsi[h->ctlr].ndevices--;
+ dev_info(&h->pdev->dev, "%s device c%db%dt%dl%d removed.\n",
+ scsi_device_type(sd.devtype), hostno,
+ sd.bus, sd.target, sd.lun);
+}
+
+
+#define SCSI3ADDR_EQ(a,b) ( \
+ (a)[7] == (b)[7] && \
+ (a)[6] == (b)[6] && \
+ (a)[5] == (b)[5] && \
+ (a)[4] == (b)[4] && \
+ (a)[3] == (b)[3] && \
+ (a)[2] == (b)[2] && \
+ (a)[1] == (b)[1] && \
+ (a)[0] == (b)[0])
+
+static void fixup_botched_add(ctlr_info_t *h, char *scsi3addr)
+{
+ /* called when scsi_add_device fails in order to re-adjust */
+ /* ccissscsi[] to match the mid layer's view. */
+ unsigned long flags;
+ int i, j;
+ CPQ_TAPE_LOCK(h, flags);
+ for (i = 0; i < ccissscsi[h->ctlr].ndevices; i++) {
+ if (memcmp(scsi3addr,
+ ccissscsi[h->ctlr].dev[i].scsi3addr, 8) == 0) {
+ for (j = i; j < ccissscsi[h->ctlr].ndevices-1; j++)
+ ccissscsi[h->ctlr].dev[j] =
+ ccissscsi[h->ctlr].dev[j+1];
+ ccissscsi[h->ctlr].ndevices--;
+ break;
+ }
+ }
+ CPQ_TAPE_UNLOCK(h, flags);
+}
+
+static int device_is_the_same(struct cciss_scsi_dev_t *dev1,
+ struct cciss_scsi_dev_t *dev2)
+{
+ return dev1->devtype == dev2->devtype &&
+ memcmp(dev1->scsi3addr, dev2->scsi3addr,
+ sizeof(dev1->scsi3addr)) == 0 &&
+ memcmp(dev1->device_id, dev2->device_id,
+ sizeof(dev1->device_id)) == 0 &&
+ memcmp(dev1->vendor, dev2->vendor,
+ sizeof(dev1->vendor)) == 0 &&
+ memcmp(dev1->model, dev2->model,
+ sizeof(dev1->model)) == 0 &&
+ memcmp(dev1->revision, dev2->revision,
+ sizeof(dev1->revision)) == 0;
+}
+
+static int
+adjust_cciss_scsi_table(ctlr_info_t *h, int hostno,
+ struct cciss_scsi_dev_t sd[], int nsds)
+{
+ /* sd contains scsi3 addresses and devtypes, but
+ bus target and lun are not filled in. This funciton
+ takes what's in sd to be the current and adjusts
+ ccissscsi[] to be in line with what's in sd. */
+
+ int i,j, found, changes=0;
+ struct cciss_scsi_dev_t *csd;
+ unsigned long flags;
+ struct scsi2map *added, *removed;
+ int nadded, nremoved;
+ struct Scsi_Host *sh = NULL;
+
+ added = kzalloc(sizeof(*added) * CCISS_MAX_SCSI_DEVS_PER_HBA,
+ GFP_KERNEL);
+ removed = kzalloc(sizeof(*removed) * CCISS_MAX_SCSI_DEVS_PER_HBA,
+ GFP_KERNEL);
+
+ if (!added || !removed) {
+ dev_warn(&h->pdev->dev,
+ "Out of memory in adjust_cciss_scsi_table\n");
+ goto free_and_out;
+ }
+
+ CPQ_TAPE_LOCK(h, flags);
+
+ if (hostno != -1) /* if it's not the first time... */
+ sh = h->scsi_ctlr->scsi_host;
+
+ /* find any devices in ccissscsi[] that are not in
+ sd[] and remove them from ccissscsi[] */
+
+ i = 0;
+ nremoved = 0;
+ nadded = 0;
+ while (i < ccissscsi[h->ctlr].ndevices) {
+ csd = &ccissscsi[h->ctlr].dev[i];
+ found=0;
+ for (j=0;j<nsds;j++) {
+ if (SCSI3ADDR_EQ(sd[j].scsi3addr,
+ csd->scsi3addr)) {
+ if (device_is_the_same(&sd[j], csd))
+ found=2;
+ else
+ found=1;
+ break;
+ }
+ }
+
+ if (found == 0) { /* device no longer present. */
+ changes++;
+ cciss_scsi_remove_entry(h, hostno, i,
+ removed, &nremoved);
+ /* remove ^^^, hence i not incremented */
+ } else if (found == 1) { /* device is different in some way */
+ changes++;
+ dev_info(&h->pdev->dev,
+ "device c%db%dt%dl%d has changed.\n",
+ hostno, csd->bus, csd->target, csd->lun);
+ cciss_scsi_remove_entry(h, hostno, i,
+ removed, &nremoved);
+ /* remove ^^^, hence i not incremented */
+ if (cciss_scsi_add_entry(h, hostno, &sd[j],
+ added, &nadded) != 0)
+ /* we just removed one, so add can't fail. */
+ BUG();
+ csd->devtype = sd[j].devtype;
+ memcpy(csd->device_id, sd[j].device_id,
+ sizeof(csd->device_id));
+ memcpy(csd->vendor, sd[j].vendor,
+ sizeof(csd->vendor));
+ memcpy(csd->model, sd[j].model,
+ sizeof(csd->model));
+ memcpy(csd->revision, sd[j].revision,
+ sizeof(csd->revision));
+ } else /* device is same as it ever was, */
+ i++; /* so just move along. */
+ }
+
+ /* Now, make sure every device listed in sd[] is also
+ listed in ccissscsi[], adding them if they aren't found */
+
+ for (i=0;i<nsds;i++) {
+ found=0;
+ for (j = 0; j < ccissscsi[h->ctlr].ndevices; j++) {
+ csd = &ccissscsi[h->ctlr].dev[j];
+ if (SCSI3ADDR_EQ(sd[i].scsi3addr,
+ csd->scsi3addr)) {
+ if (device_is_the_same(&sd[i], csd))
+ found=2; /* found device */
+ else
+ found=1; /* found a bug. */
+ break;
+ }
+ }
+ if (!found) {
+ changes++;
+ if (cciss_scsi_add_entry(h, hostno, &sd[i],
+ added, &nadded) != 0)
+ break;
+ } else if (found == 1) {
+ /* should never happen... */
+ changes++;
+ dev_warn(&h->pdev->dev,
+ "device unexpectedly changed\n");
+ /* but if it does happen, we just ignore that device */
+ }
+ }
+ CPQ_TAPE_UNLOCK(h, flags);
+
+ /* Don't notify scsi mid layer of any changes the first time through */
+ /* (or if there are no changes) scsi_scan_host will do it later the */
+ /* first time through. */
+ if (hostno == -1 || !changes)
+ goto free_and_out;
+
+ /* Notify scsi mid layer of any removed devices */
+ for (i = 0; i < nremoved; i++) {
+ struct scsi_device *sdev =
+ scsi_device_lookup(sh, removed[i].bus,
+ removed[i].target, removed[i].lun);
+ if (sdev != NULL) {
+ scsi_remove_device(sdev);
+ scsi_device_put(sdev);
+ } else {
+ /* We don't expect to get here. */
+ /* future cmds to this device will get selection */
+ /* timeout as if the device was gone. */
+ dev_warn(&h->pdev->dev, "didn't find "
+ "c%db%dt%dl%d\n for removal.",
+ hostno, removed[i].bus,
+ removed[i].target, removed[i].lun);
+ }
+ }
+
+ /* Notify scsi mid layer of any added devices */
+ for (i = 0; i < nadded; i++) {
+ int rc;
+ rc = scsi_add_device(sh, added[i].bus,
+ added[i].target, added[i].lun);
+ if (rc == 0)
+ continue;
+ dev_warn(&h->pdev->dev, "scsi_add_device "
+ "c%db%dt%dl%d failed, device not added.\n",
+ hostno, added[i].bus, added[i].target, added[i].lun);
+ /* now we have to remove it from ccissscsi, */
+ /* since it didn't get added to scsi mid layer */
+ fixup_botched_add(h, added[i].scsi3addr);
+ }
+
+free_and_out:
+ kfree(added);
+ kfree(removed);
+ return 0;
+}
+
+static int
+lookup_scsi3addr(ctlr_info_t *h, int bus, int target, int lun, char *scsi3addr)
+{
+ int i;
+ struct cciss_scsi_dev_t *sd;
+ unsigned long flags;
+
+ CPQ_TAPE_LOCK(h, flags);
+ for (i = 0; i < ccissscsi[h->ctlr].ndevices; i++) {
+ sd = &ccissscsi[h->ctlr].dev[i];
+ if (sd->bus == bus &&
+ sd->target == target &&
+ sd->lun == lun) {
+ memcpy(scsi3addr, &sd->scsi3addr[0], 8);
+ CPQ_TAPE_UNLOCK(h, flags);
+ return 0;
+ }
+ }
+ CPQ_TAPE_UNLOCK(h, flags);
+ return -1;
+}
+
+static void
+cciss_scsi_setup(ctlr_info_t *h)
+{
+ struct cciss_scsi_adapter_data_t * shba;
+
+ ccissscsi[h->ctlr].ndevices = 0;
+ shba = (struct cciss_scsi_adapter_data_t *)
+ kmalloc(sizeof(*shba), GFP_KERNEL);
+ if (shba == NULL)
+ return;
+ shba->scsi_host = NULL;
+ spin_lock_init(&shba->lock);
+ shba->registered = 0;
+ if (scsi_cmd_stack_setup(h, shba) != 0) {
+ kfree(shba);
+ shba = NULL;
+ }
+ h->scsi_ctlr = shba;
+ return;
+}
+
+static void complete_scsi_command(CommandList_struct *c, int timeout,
+ __u32 tag)
+{
+ struct scsi_cmnd *cmd;
+ ctlr_info_t *h;
+ ErrorInfo_struct *ei;
+
+ ei = c->err_info;
+
+ /* First, see if it was a message rather than a command */
+ if (c->Request.Type.Type == TYPE_MSG) {
+ c->cmd_type = CMD_MSG_DONE;
+ return;
+ }
+
+ cmd = (struct scsi_cmnd *) c->scsi_cmd;
+ h = hba[c->ctlr];
+
+ scsi_dma_unmap(cmd);
+ if (c->Header.SGTotal > h->max_cmd_sgentries)
+ cciss_unmap_sg_chain_block(h, c);
+
+ cmd->result = (DID_OK << 16); /* host byte */
+ cmd->result |= (COMMAND_COMPLETE << 8); /* msg byte */
+ /* cmd->result |= (GOOD < 1); */ /* status byte */
+
+ cmd->result |= (ei->ScsiStatus);
+ /* printk("Scsistatus is 0x%02x\n", ei->ScsiStatus); */
+
+ /* copy the sense data whether we need to or not. */
+
+ memcpy(cmd->sense_buffer, ei->SenseInfo,
+ ei->SenseLen > SCSI_SENSE_BUFFERSIZE ?
+ SCSI_SENSE_BUFFERSIZE :
+ ei->SenseLen);
+ scsi_set_resid(cmd, ei->ResidualCnt);
+
+ if(ei->CommandStatus != 0)
+ { /* an error has occurred */
+ switch(ei->CommandStatus)
+ {
+ case CMD_TARGET_STATUS:
+ /* Pass it up to the upper layers... */
+ if (!ei->ScsiStatus) {
+
+ /* Ordinarily, this case should never happen, but there is a bug
+ in some released firmware revisions that allows it to happen
+ if, for example, a 4100 backplane loses power and the tape
+ drive is in it. We assume that it's a fatal error of some
+ kind because we can't show that it wasn't. We will make it
+ look like selection timeout since that is the most common
+ reason for this to occur, and it's severe enough. */
+
+ cmd->result = DID_NO_CONNECT << 16;
+ }
+ break;
+ case CMD_DATA_UNDERRUN: /* let mid layer handle it. */
+ break;
+ case CMD_DATA_OVERRUN:
+ dev_warn(&h->pdev->dev, "%p has"
+ " completed with data overrun "
+ "reported\n", c);
+ break;
+ case CMD_INVALID: {
+ /* print_bytes(c, sizeof(*c), 1, 0);
+ print_cmd(c); */
+ /* We get CMD_INVALID if you address a non-existent tape drive instead
+ of a selection timeout (no response). You will see this if you yank
+ out a tape drive, then try to access it. This is kind of a shame
+ because it means that any other CMD_INVALID (e.g. driver bug) will
+ get interpreted as a missing target. */
+ cmd->result = DID_NO_CONNECT << 16;
+ }
+ break;
+ case CMD_PROTOCOL_ERR:
+ cmd->result = DID_ERROR << 16;
+ dev_warn(&h->pdev->dev,
+ "%p has protocol error\n", c);
+ break;
+ case CMD_HARDWARE_ERR:
+ cmd->result = DID_ERROR << 16;
+ dev_warn(&h->pdev->dev,
+ "%p had hardware error\n", c);
+ break;
+ case CMD_CONNECTION_LOST:
+ cmd->result = DID_ERROR << 16;
+ dev_warn(&h->pdev->dev,
+ "%p had connection lost\n", c);
+ break;
+ case CMD_ABORTED:
+ cmd->result = DID_ABORT << 16;
+ dev_warn(&h->pdev->dev, "%p was aborted\n", c);
+ break;
+ case CMD_ABORT_FAILED:
+ cmd->result = DID_ERROR << 16;
+ dev_warn(&h->pdev->dev,
+ "%p reports abort failed\n", c);
+ break;
+ case CMD_UNSOLICITED_ABORT:
+ cmd->result = DID_ABORT << 16;
+ dev_warn(&h->pdev->dev, "%p aborted due to an "
+ "unsolicited abort\n", c);
+ break;
+ case CMD_TIMEOUT:
+ cmd->result = DID_TIME_OUT << 16;
+ dev_warn(&h->pdev->dev, "%p timedout\n", c);
+ break;
+ case CMD_UNABORTABLE:
+ cmd->result = DID_ERROR << 16;
+ dev_warn(&h->pdev->dev, "c %p command "
+ "unabortable\n", c);
+ break;
+ default:
+ cmd->result = DID_ERROR << 16;
+ dev_warn(&h->pdev->dev,
+ "%p returned unknown status %x\n", c,
+ ei->CommandStatus);
+ }
+ }
+ cmd->scsi_done(cmd);
+ scsi_cmd_free(h, c);
+}
+
+static int
+cciss_scsi_detect(ctlr_info_t *h)
+{
+ struct Scsi_Host *sh;
+ int error;
+
+ sh = scsi_host_alloc(&cciss_driver_template, sizeof(struct ctlr_info *));
+ if (sh == NULL)
+ goto fail;
+ sh->io_port = 0; // good enough? FIXME,
+ sh->n_io_port = 0; // I don't think we use these two...
+ sh->this_id = SELF_SCSI_ID;
+ sh->can_queue = cciss_tape_cmds;
+ sh->sg_tablesize = h->maxsgentries;
+ sh->max_cmd_len = MAX_COMMAND_SIZE;
+ sh->max_sectors = h->cciss_max_sectors;
+
+ ((struct cciss_scsi_adapter_data_t *)
+ h->scsi_ctlr)->scsi_host = sh;
+ sh->hostdata[0] = (unsigned long) h;
+ sh->irq = h->intr[SIMPLE_MODE_INT];
+ sh->unique_id = sh->irq;
+ error = scsi_add_host(sh, &h->pdev->dev);
+ if (error)
+ goto fail_host_put;
+ scsi_scan_host(sh);
+ return 1;
+
+ fail_host_put:
+ scsi_host_put(sh);
+ fail:
+ return 0;
+}
+
+static void
+cciss_unmap_one(struct pci_dev *pdev,
+ CommandList_struct *c,
+ size_t buflen,
+ int data_direction)
+{
+ u64bit addr64;
+
+ addr64.val32.lower = c->SG[0].Addr.lower;
+ addr64.val32.upper = c->SG[0].Addr.upper;
+ pci_unmap_single(pdev, (dma_addr_t) addr64.val, buflen, data_direction);
+}
+
+static void
+cciss_map_one(struct pci_dev *pdev,
+ CommandList_struct *c,
+ unsigned char *buf,
+ size_t buflen,
+ int data_direction)
+{
+ __u64 addr64;
+
+ addr64 = (__u64) pci_map_single(pdev, buf, buflen, data_direction);
+ c->SG[0].Addr.lower =
+ (__u32) (addr64 & (__u64) 0x00000000FFFFFFFF);
+ c->SG[0].Addr.upper =
+ (__u32) ((addr64 >> 32) & (__u64) 0x00000000FFFFFFFF);
+ c->SG[0].Len = buflen;
+ c->Header.SGList = (__u8) 1; /* no. SGs contig in this cmd */
+ c->Header.SGTotal = (__u16) 1; /* total sgs in this cmd list */
+}
+
+static int
+cciss_scsi_do_simple_cmd(ctlr_info_t *h,
+ CommandList_struct *c,
+ unsigned char *scsi3addr,
+ unsigned char *cdb,
+ unsigned char cdblen,
+ unsigned char *buf, int bufsize,
+ int direction)
+{
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ c->cmd_type = CMD_IOCTL_PEND; /* treat this like an ioctl */
+ c->scsi_cmd = NULL;
+ c->Header.ReplyQueue = 0; /* unused in simple mode */
+ memcpy(&c->Header.LUN, scsi3addr, sizeof(c->Header.LUN));
+ c->Header.Tag.lower = c->busaddr; /* Use k. address of cmd as tag */
+ // Fill in the request block...
+
+ /* printk("Using scsi3addr 0x%02x%0x2%0x2%0x2%0x2%0x2%0x2%0x2\n",
+ scsi3addr[0], scsi3addr[1], scsi3addr[2], scsi3addr[3],
+ scsi3addr[4], scsi3addr[5], scsi3addr[6], scsi3addr[7]); */
+
+ memset(c->Request.CDB, 0, sizeof(c->Request.CDB));
+ memcpy(c->Request.CDB, cdb, cdblen);
+ c->Request.Timeout = 0;
+ c->Request.CDBLen = cdblen;
+ c->Request.Type.Type = TYPE_CMD;
+ c->Request.Type.Attribute = ATTR_SIMPLE;
+ c->Request.Type.Direction = direction;
+
+ /* Fill in the SG list and do dma mapping */
+ cciss_map_one(h->pdev, c, (unsigned char *) buf,
+ bufsize, DMA_FROM_DEVICE);
+
+ c->waiting = &wait;
+ enqueue_cmd_and_start_io(h, c);
+ wait_for_completion(&wait);
+
+ /* undo the dma mapping */
+ cciss_unmap_one(h->pdev, c, bufsize, DMA_FROM_DEVICE);
+ return(0);
+}
+
+static void
+cciss_scsi_interpret_error(ctlr_info_t *h, CommandList_struct *c)
+{
+ ErrorInfo_struct *ei;
+
+ ei = c->err_info;
+ switch(ei->CommandStatus)
+ {
+ case CMD_TARGET_STATUS:
+ dev_warn(&h->pdev->dev,
+ "cmd %p has completed with errors\n", c);
+ dev_warn(&h->pdev->dev,
+ "cmd %p has SCSI Status = %x\n",
+ c, ei->ScsiStatus);
+ if (ei->ScsiStatus == 0)
+ dev_warn(&h->pdev->dev,
+ "SCSI status is abnormally zero. "
+ "(probably indicates selection timeout "
+ "reported incorrectly due to a known "
+ "firmware bug, circa July, 2001.)\n");
+ break;
+ case CMD_DATA_UNDERRUN: /* let mid layer handle it. */
+ dev_info(&h->pdev->dev, "UNDERRUN\n");
+ break;
+ case CMD_DATA_OVERRUN:
+ dev_warn(&h->pdev->dev, "%p has"
+ " completed with data overrun "
+ "reported\n", c);
+ break;
+ case CMD_INVALID: {
+ /* controller unfortunately reports SCSI passthru's */
+ /* to non-existent targets as invalid commands. */
+ dev_warn(&h->pdev->dev,
+ "%p is reported invalid (probably means "
+ "target device no longer present)\n", c);
+ /* print_bytes((unsigned char *) c, sizeof(*c), 1, 0);
+ print_cmd(c); */
+ }
+ break;
+ case CMD_PROTOCOL_ERR:
+ dev_warn(&h->pdev->dev, "%p has protocol error\n", c);
+ break;
+ case CMD_HARDWARE_ERR:
+ /* cmd->result = DID_ERROR << 16; */
+ dev_warn(&h->pdev->dev, "%p had hardware error\n", c);
+ break;
+ case CMD_CONNECTION_LOST:
+ dev_warn(&h->pdev->dev, "%p had connection lost\n", c);
+ break;
+ case CMD_ABORTED:
+ dev_warn(&h->pdev->dev, "%p was aborted\n", c);
+ break;
+ case CMD_ABORT_FAILED:
+ dev_warn(&h->pdev->dev,
+ "%p reports abort failed\n", c);
+ break;
+ case CMD_UNSOLICITED_ABORT:
+ dev_warn(&h->pdev->dev,
+ "%p aborted due to an unsolicited abort\n", c);
+ break;
+ case CMD_TIMEOUT:
+ dev_warn(&h->pdev->dev, "%p timedout\n", c);
+ break;
+ case CMD_UNABORTABLE:
+ dev_warn(&h->pdev->dev,
+ "%p unabortable\n", c);
+ break;
+ default:
+ dev_warn(&h->pdev->dev,
+ "%p returned unknown status %x\n",
+ c, ei->CommandStatus);
+ }
+}
+
+static int
+cciss_scsi_do_inquiry(ctlr_info_t *h, unsigned char *scsi3addr,
+ unsigned char page, unsigned char *buf,
+ unsigned char bufsize)
+{
+ int rc;
+ CommandList_struct *c;
+ char cdb[6];
+ ErrorInfo_struct *ei;
+ unsigned long flags;
+
+ spin_lock_irqsave(&h->lock, flags);
+ c = scsi_cmd_alloc(h);
+ spin_unlock_irqrestore(&h->lock, flags);
+
+ if (c == NULL) { /* trouble... */
+ printk("cmd_alloc returned NULL!\n");
+ return -1;
+ }
+
+ ei = c->err_info;
+
+ cdb[0] = CISS_INQUIRY;
+ cdb[1] = (page != 0);
+ cdb[2] = page;
+ cdb[3] = 0;
+ cdb[4] = bufsize;
+ cdb[5] = 0;
+ rc = cciss_scsi_do_simple_cmd(h, c, scsi3addr, cdb,
+ 6, buf, bufsize, XFER_READ);
+
+ if (rc != 0) return rc; /* something went wrong */
+
+ if (ei->CommandStatus != 0 &&
+ ei->CommandStatus != CMD_DATA_UNDERRUN) {
+ cciss_scsi_interpret_error(h, c);
+ rc = -1;
+ }
+ spin_lock_irqsave(&h->lock, flags);
+ scsi_cmd_free(h, c);
+ spin_unlock_irqrestore(&h->lock, flags);
+ return rc;
+}
+
+/* Get the device id from inquiry page 0x83 */
+static int cciss_scsi_get_device_id(ctlr_info_t *h, unsigned char *scsi3addr,
+ unsigned char *device_id, int buflen)
+{
+ int rc;
+ unsigned char *buf;
+
+ if (buflen > 16)
+ buflen = 16;
+ buf = kzalloc(64, GFP_KERNEL);
+ if (!buf)
+ return -1;
+ rc = cciss_scsi_do_inquiry(h, scsi3addr, 0x83, buf, 64);
+ if (rc == 0)
+ memcpy(device_id, &buf[8], buflen);
+ kfree(buf);
+ return rc != 0;
+}
+
+static int
+cciss_scsi_do_report_phys_luns(ctlr_info_t *h,
+ ReportLunData_struct *buf, int bufsize)
+{
+ int rc;
+ CommandList_struct *c;
+ unsigned char cdb[12];
+ unsigned char scsi3addr[8];
+ ErrorInfo_struct *ei;
+ unsigned long flags;
+
+ spin_lock_irqsave(&h->lock, flags);
+ c = scsi_cmd_alloc(h);
+ spin_unlock_irqrestore(&h->lock, flags);
+ if (c == NULL) { /* trouble... */
+ printk("cmd_alloc returned NULL!\n");
+ return -1;
+ }
+
+ memset(&scsi3addr[0], 0, 8); /* address the controller */
+ cdb[0] = CISS_REPORT_PHYS;
+ cdb[1] = 0;
+ cdb[2] = 0;
+ cdb[3] = 0;
+ cdb[4] = 0;
+ cdb[5] = 0;
+ cdb[6] = (bufsize >> 24) & 0xFF; //MSB
+ cdb[7] = (bufsize >> 16) & 0xFF;
+ cdb[8] = (bufsize >> 8) & 0xFF;
+ cdb[9] = bufsize & 0xFF;
+ cdb[10] = 0;
+ cdb[11] = 0;
+
+ rc = cciss_scsi_do_simple_cmd(h, c, scsi3addr,
+ cdb, 12,
+ (unsigned char *) buf,
+ bufsize, XFER_READ);
+
+ if (rc != 0) return rc; /* something went wrong */
+
+ ei = c->err_info;
+ if (ei->CommandStatus != 0 &&
+ ei->CommandStatus != CMD_DATA_UNDERRUN) {
+ cciss_scsi_interpret_error(h, c);
+ rc = -1;
+ }
+ spin_lock_irqsave(&h->lock, flags);
+ scsi_cmd_free(h, c);
+ spin_unlock_irqrestore(&h->lock, flags);
+ return rc;
+}
+
+static void
+cciss_update_non_disk_devices(ctlr_info_t *h, int hostno)
+{
+ /* the idea here is we could get notified from /proc
+ that some devices have changed, so we do a report
+ physical luns cmd, and adjust our list of devices
+ accordingly. (We can't rely on the scsi-mid layer just
+ doing inquiries, because the "busses" that the scsi
+ mid-layer probes are totally fabricated by this driver,
+ so new devices wouldn't show up.
+
+ the scsi3addr's of devices won't change so long as the
+ adapter is not reset. That means we can rescan and
+ tell which devices we already know about, vs. new
+ devices, vs. disappearing devices.
+
+ Also, if you yank out a tape drive, then put in a disk
+ in it's place, (say, a configured volume from another
+ array controller for instance) _don't_ poke this driver
+ (so it thinks it's still a tape, but _do_ poke the scsi
+ mid layer, so it does an inquiry... the scsi mid layer
+ will see the physical disk. This would be bad. Need to
+ think about how to prevent that. One idea would be to
+ snoop all scsi responses and if an inquiry repsonse comes
+ back that reports a disk, chuck it an return selection
+ timeout instead and adjust our table... Not sure i like
+ that though.
+
+ */
+#define OBDR_TAPE_INQ_SIZE 49
+#define OBDR_TAPE_SIG "$DR-10"
+ ReportLunData_struct *ld_buff;
+ unsigned char *inq_buff;
+ unsigned char scsi3addr[8];
+ __u32 num_luns=0;
+ unsigned char *ch;
+ struct cciss_scsi_dev_t *currentsd, *this_device;
+ int ncurrent=0;
+ int reportlunsize = sizeof(*ld_buff) + CISS_MAX_PHYS_LUN * 8;
+ int i;
+
+ ld_buff = kzalloc(reportlunsize, GFP_KERNEL);
+ inq_buff = kmalloc(OBDR_TAPE_INQ_SIZE, GFP_KERNEL);
+ currentsd = kzalloc(sizeof(*currentsd) *
+ (CCISS_MAX_SCSI_DEVS_PER_HBA+1), GFP_KERNEL);
+ if (ld_buff == NULL || inq_buff == NULL || currentsd == NULL) {
+ printk(KERN_ERR "cciss: out of memory\n");
+ goto out;
+ }
+ this_device = &currentsd[CCISS_MAX_SCSI_DEVS_PER_HBA];
+ if (cciss_scsi_do_report_phys_luns(h, ld_buff, reportlunsize) == 0) {
+ ch = &ld_buff->LUNListLength[0];
+ num_luns = ((ch[0]<<24) | (ch[1]<<16) | (ch[2]<<8) | ch[3]) / 8;
+ if (num_luns > CISS_MAX_PHYS_LUN) {
+ printk(KERN_WARNING
+ "cciss: Maximum physical LUNs (%d) exceeded. "
+ "%d LUNs ignored.\n", CISS_MAX_PHYS_LUN,
+ num_luns - CISS_MAX_PHYS_LUN);
+ num_luns = CISS_MAX_PHYS_LUN;
+ }
+ }
+ else {
+ printk(KERN_ERR "cciss: Report physical LUNs failed.\n");
+ goto out;
+ }
+
+
+ /* adjust our table of devices */
+ for (i = 0; i < num_luns; i++) {
+ /* for each physical lun, do an inquiry */
+ if (ld_buff->LUN[i][3] & 0xC0) continue;
+ memset(inq_buff, 0, OBDR_TAPE_INQ_SIZE);
+ memcpy(&scsi3addr[0], &ld_buff->LUN[i][0], 8);
+
+ if (cciss_scsi_do_inquiry(h, scsi3addr, 0, inq_buff,
+ (unsigned char) OBDR_TAPE_INQ_SIZE) != 0)
+ /* Inquiry failed (msg printed already) */
+ continue; /* so we will skip this device. */
+
+ this_device->devtype = (inq_buff[0] & 0x1f);
+ this_device->bus = -1;
+ this_device->target = -1;
+ this_device->lun = -1;
+ memcpy(this_device->scsi3addr, scsi3addr, 8);
+ memcpy(this_device->vendor, &inq_buff[8],
+ sizeof(this_device->vendor));
+ memcpy(this_device->model, &inq_buff[16],
+ sizeof(this_device->model));
+ memcpy(this_device->revision, &inq_buff[32],
+ sizeof(this_device->revision));
+ memset(this_device->device_id, 0,
+ sizeof(this_device->device_id));
+ cciss_scsi_get_device_id(h, scsi3addr,
+ this_device->device_id, sizeof(this_device->device_id));
+
+ switch (this_device->devtype)
+ {
+ case 0x05: /* CD-ROM */ {
+
+ /* We don't *really* support actual CD-ROM devices,
+ * just this "One Button Disaster Recovery" tape drive
+ * which temporarily pretends to be a CD-ROM drive.
+ * So we check that the device is really an OBDR tape
+ * device by checking for "$DR-10" in bytes 43-48 of
+ * the inquiry data.
+ */
+ char obdr_sig[7];
+
+ strncpy(obdr_sig, &inq_buff[43], 6);
+ obdr_sig[6] = '\0';
+ if (strncmp(obdr_sig, OBDR_TAPE_SIG, 6) != 0)
+ /* Not OBDR device, ignore it. */
+ break;
+ }
+ /* fall through . . . */
+ case 0x01: /* sequential access, (tape) */
+ case 0x08: /* medium changer */
+ if (ncurrent >= CCISS_MAX_SCSI_DEVS_PER_HBA) {
+ printk(KERN_INFO "cciss%d: %s ignored, "
+ "too many devices.\n", h->ctlr,
+ scsi_device_type(this_device->devtype));
+ break;
+ }
+ currentsd[ncurrent] = *this_device;
+ ncurrent++;
+ break;
+ default:
+ break;
+ }
+ }
+
+ adjust_cciss_scsi_table(h, hostno, currentsd, ncurrent);
+out:
+ kfree(inq_buff);
+ kfree(ld_buff);
+ kfree(currentsd);
+ return;
+}
+
+static int
+is_keyword(char *ptr, int len, char *verb) // Thanks to ncr53c8xx.c
+{
+ int verb_len = strlen(verb);
+ if (len >= verb_len && !memcmp(verb,ptr,verb_len))
+ return verb_len;
+ else
+ return 0;
+}
+
+static int
+cciss_scsi_user_command(ctlr_info_t *h, int hostno, char *buffer, int length)
+{
+ int arg_len;
+
+ if ((arg_len = is_keyword(buffer, length, "rescan")) != 0)
+ cciss_update_non_disk_devices(h, hostno);
+ else
+ return -EINVAL;
+ return length;
+}
+
+static int
+cciss_scsi_write_info(struct Scsi_Host *sh,
+ char *buffer, /* data buffer */
+ int length) /* length of data in buffer */
+{
+ ctlr_info_t *h = (ctlr_info_t *) sh->hostdata[0];
+ if (h == NULL) /* This really shouldn't ever happen. */
+ return -EINVAL;
+
+ return cciss_scsi_user_command(h, sh->host_no,
+ buffer, length);
+}
+
+static int
+cciss_scsi_show_info(struct seq_file *m, struct Scsi_Host *sh)
+{
+
+ ctlr_info_t *h = (ctlr_info_t *) sh->hostdata[0];
+ int i;
+
+ if (h == NULL) /* This really shouldn't ever happen. */
+ return -EINVAL;
+
+ seq_printf(m, "cciss%d: SCSI host: %d\n",
+ h->ctlr, sh->host_no);
+
+ /* this information is needed by apps to know which cciss
+ device corresponds to which scsi host number without
+ having to open a scsi target device node. The device
+ information is not a duplicate of /proc/scsi/scsi because
+ the two may be out of sync due to scsi hotplug, rather
+ this info is for an app to be able to use to know how to
+ get them back in sync. */
+
+ for (i = 0; i < ccissscsi[h->ctlr].ndevices; i++) {
+ struct cciss_scsi_dev_t *sd =
+ &ccissscsi[h->ctlr].dev[i];
+ seq_printf(m, "c%db%dt%dl%d %02d "
+ "0x%02x%02x%02x%02x%02x%02x%02x%02x\n",
+ sh->host_no, sd->bus, sd->target, sd->lun,
+ sd->devtype,
+ sd->scsi3addr[0], sd->scsi3addr[1],
+ sd->scsi3addr[2], sd->scsi3addr[3],
+ sd->scsi3addr[4], sd->scsi3addr[5],
+ sd->scsi3addr[6], sd->scsi3addr[7]);
+ }
+ return 0;
+}
+
+/* cciss_scatter_gather takes a struct scsi_cmnd, (cmd), and does the pci
+ dma mapping and fills in the scatter gather entries of the
+ cciss command, c. */
+
+static void cciss_scatter_gather(ctlr_info_t *h, CommandList_struct *c,
+ struct scsi_cmnd *cmd)
+{
+ unsigned int len;
+ struct scatterlist *sg;
+ __u64 addr64;
+ int request_nsgs, i, chained, sg_index;
+ struct cciss_scsi_adapter_data_t *sa = h->scsi_ctlr;
+ SGDescriptor_struct *curr_sg;
+
+ BUG_ON(scsi_sg_count(cmd) > h->maxsgentries);
+
+ chained = 0;
+ sg_index = 0;
+ curr_sg = c->SG;
+ request_nsgs = scsi_dma_map(cmd);
+ if (request_nsgs) {
+ scsi_for_each_sg(cmd, sg, request_nsgs, i) {
+ if (sg_index + 1 == h->max_cmd_sgentries &&
+ !chained && request_nsgs - i > 1) {
+ chained = 1;
+ sg_index = 0;
+ curr_sg = sa->cmd_sg_list[c->cmdindex];
+ }
+ addr64 = (__u64) sg_dma_address(sg);
+ len = sg_dma_len(sg);
+ curr_sg[sg_index].Addr.lower =
+ (__u32) (addr64 & 0x0FFFFFFFFULL);
+ curr_sg[sg_index].Addr.upper =
+ (__u32) ((addr64 >> 32) & 0x0FFFFFFFFULL);
+ curr_sg[sg_index].Len = len;
+ curr_sg[sg_index].Ext = 0;
+ ++sg_index;
+ }
+ if (chained)
+ cciss_map_sg_chain_block(h, c,
+ sa->cmd_sg_list[c->cmdindex],
+ (request_nsgs - (h->max_cmd_sgentries - 1)) *
+ sizeof(SGDescriptor_struct));
+ }
+ /* track how many SG entries we are using */
+ if (request_nsgs > h->maxSG)
+ h->maxSG = request_nsgs;
+ c->Header.SGTotal = (u16) request_nsgs + chained;
+ if (request_nsgs > h->max_cmd_sgentries)
+ c->Header.SGList = h->max_cmd_sgentries;
+ else
+ c->Header.SGList = c->Header.SGTotal;
+ return;
+}
+
+
+static int
+cciss_scsi_queue_command_lck(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
+{
+ ctlr_info_t *h;
+ int rc;
+ unsigned char scsi3addr[8];
+ CommandList_struct *c;
+ unsigned long flags;
+
+ // Get the ptr to our adapter structure (hba[i]) out of cmd->host.
+ // We violate cmd->host privacy here. (Is there another way?)
+ h = (ctlr_info_t *) cmd->device->host->hostdata[0];
+
+ rc = lookup_scsi3addr(h, cmd->device->channel, cmd->device->id,
+ cmd->device->lun, scsi3addr);
+ if (rc != 0) {
+ /* the scsi nexus does not match any that we presented... */
+ /* pretend to mid layer that we got selection timeout */
+ cmd->result = DID_NO_CONNECT << 16;
+ done(cmd);
+ /* we might want to think about registering controller itself
+ as a processor device on the bus so sg binds to it. */
+ return 0;
+ }
+
+ /* Ok, we have a reasonable scsi nexus, so send the cmd down, and
+ see what the device thinks of it. */
+
+ spin_lock_irqsave(&h->lock, flags);
+ c = scsi_cmd_alloc(h);
+ spin_unlock_irqrestore(&h->lock, flags);
+ if (c == NULL) { /* trouble... */
+ dev_warn(&h->pdev->dev, "scsi_cmd_alloc returned NULL!\n");
+ /* FIXME: next 3 lines are -> BAD! <- */
+ cmd->result = DID_NO_CONNECT << 16;
+ done(cmd);
+ return 0;
+ }
+
+ // Fill in the command list header
+
+ cmd->scsi_done = done; // save this for use by completion code
+
+ /* save c in case we have to abort it */
+ cmd->host_scribble = (unsigned char *) c;
+
+ c->cmd_type = CMD_SCSI;
+ c->scsi_cmd = cmd;
+ c->Header.ReplyQueue = 0; /* unused in simple mode */
+ memcpy(&c->Header.LUN.LunAddrBytes[0], &scsi3addr[0], 8);
+ c->Header.Tag.lower = c->busaddr; /* Use k. address of cmd as tag */
+
+ // Fill in the request block...
+
+ c->Request.Timeout = 0;
+ memset(c->Request.CDB, 0, sizeof(c->Request.CDB));
+ BUG_ON(cmd->cmd_len > sizeof(c->Request.CDB));
+ c->Request.CDBLen = cmd->cmd_len;
+ memcpy(c->Request.CDB, cmd->cmnd, cmd->cmd_len);
+ c->Request.Type.Type = TYPE_CMD;
+ c->Request.Type.Attribute = ATTR_SIMPLE;
+ switch(cmd->sc_data_direction)
+ {
+ case DMA_TO_DEVICE:
+ c->Request.Type.Direction = XFER_WRITE;
+ break;
+ case DMA_FROM_DEVICE:
+ c->Request.Type.Direction = XFER_READ;
+ break;
+ case DMA_NONE:
+ c->Request.Type.Direction = XFER_NONE;
+ break;
+ case DMA_BIDIRECTIONAL:
+ // This can happen if a buggy application does a scsi passthru
+ // and sets both inlen and outlen to non-zero. ( see
+ // ../scsi/scsi_ioctl.c:scsi_ioctl_send_command() )
+
+ c->Request.Type.Direction = XFER_RSVD;
+ // This is technically wrong, and cciss controllers should
+ // reject it with CMD_INVALID, which is the most correct
+ // response, but non-fibre backends appear to let it
+ // slide by, and give the same results as if this field
+ // were set correctly. Either way is acceptable for
+ // our purposes here.
+
+ break;
+
+ default:
+ dev_warn(&h->pdev->dev, "unknown data direction: %d\n",
+ cmd->sc_data_direction);
+ BUG();
+ break;
+ }
+ cciss_scatter_gather(h, c, cmd);
+ enqueue_cmd_and_start_io(h, c);
+ /* the cmd'll come back via intr handler in complete_scsi_command() */
+ return 0;
+}
+
+static DEF_SCSI_QCMD(cciss_scsi_queue_command)
+
+static void cciss_unregister_scsi(ctlr_info_t *h)
+{
+ struct cciss_scsi_adapter_data_t *sa;
+ struct cciss_scsi_cmd_stack_t *stk;
+ unsigned long flags;
+
+ /* we are being forcibly unloaded, and may not refuse. */
+
+ spin_lock_irqsave(&h->lock, flags);
+ sa = h->scsi_ctlr;
+ stk = &sa->cmd_stack;
+
+ /* if we weren't ever actually registered, don't unregister */
+ if (sa->registered) {
+ spin_unlock_irqrestore(&h->lock, flags);
+ scsi_remove_host(sa->scsi_host);
+ scsi_host_put(sa->scsi_host);
+ spin_lock_irqsave(&h->lock, flags);
+ }
+
+ /* set scsi_host to NULL so our detect routine will
+ find us on register */
+ sa->scsi_host = NULL;
+ spin_unlock_irqrestore(&h->lock, flags);
+ scsi_cmd_stack_free(h);
+ kfree(sa);
+}
+
+static int cciss_engage_scsi(ctlr_info_t *h)
+{
+ struct cciss_scsi_adapter_data_t *sa;
+ struct cciss_scsi_cmd_stack_t *stk;
+ unsigned long flags;
+
+ spin_lock_irqsave(&h->lock, flags);
+ sa = h->scsi_ctlr;
+ stk = &sa->cmd_stack;
+
+ if (sa->registered) {
+ dev_info(&h->pdev->dev, "SCSI subsystem already engaged.\n");
+ spin_unlock_irqrestore(&h->lock, flags);
+ return -ENXIO;
+ }
+ sa->registered = 1;
+ spin_unlock_irqrestore(&h->lock, flags);
+ cciss_update_non_disk_devices(h, -1);
+ cciss_scsi_detect(h);
+ return 0;
+}
+
+static void
+cciss_seq_tape_report(struct seq_file *seq, ctlr_info_t *h)
+{
+ unsigned long flags;
+
+ CPQ_TAPE_LOCK(h, flags);
+ seq_printf(seq,
+ "Sequential access devices: %d\n\n",
+ ccissscsi[h->ctlr].ndevices);
+ CPQ_TAPE_UNLOCK(h, flags);
+}
+
+static int wait_for_device_to_become_ready(ctlr_info_t *h,
+ unsigned char lunaddr[])
+{
+ int rc;
+ int count = 0;
+ int waittime = HZ;
+ CommandList_struct *c;
+
+ c = cmd_alloc(h);
+ if (!c) {
+ dev_warn(&h->pdev->dev, "out of memory in "
+ "wait_for_device_to_become_ready.\n");
+ return IO_ERROR;
+ }
+
+ /* Send test unit ready until device ready, or give up. */
+ while (count < 20) {
+
+ /* Wait for a bit. do this first, because if we send
+ * the TUR right away, the reset will just abort it.
+ */
+ schedule_timeout_uninterruptible(waittime);
+ count++;
+
+ /* Increase wait time with each try, up to a point. */
+ if (waittime < (HZ * 30))
+ waittime = waittime * 2;
+
+ /* Send the Test Unit Ready */
+ rc = fill_cmd(h, c, TEST_UNIT_READY, NULL, 0, 0,
+ lunaddr, TYPE_CMD);
+ if (rc == 0)
+ rc = sendcmd_withirq_core(h, c, 0);
+
+ (void) process_sendcmd_error(h, c);
+
+ if (rc != 0)
+ goto retry_tur;
+
+ if (c->err_info->CommandStatus == CMD_SUCCESS)
+ break;
+
+ if (c->err_info->CommandStatus == CMD_TARGET_STATUS &&
+ c->err_info->ScsiStatus == SAM_STAT_CHECK_CONDITION) {
+ if (c->err_info->SenseInfo[2] == NO_SENSE)
+ break;
+ if (c->err_info->SenseInfo[2] == UNIT_ATTENTION) {
+ unsigned char asc;
+ asc = c->err_info->SenseInfo[12];
+ check_for_unit_attention(h, c);
+ if (asc == POWER_OR_RESET)
+ break;
+ }
+ }
+retry_tur:
+ dev_warn(&h->pdev->dev, "Waiting %d secs "
+ "for device to become ready.\n",
+ waittime / HZ);
+ rc = 1; /* device not ready. */
+ }
+
+ if (rc)
+ dev_warn(&h->pdev->dev, "giving up on device.\n");
+ else
+ dev_warn(&h->pdev->dev, "device is ready.\n");
+
+ cmd_free(h, c);
+ return rc;
+}
+
+/* Need at least one of these error handlers to keep ../scsi/hosts.c from
+ * complaining. Doing a host- or bus-reset can't do anything good here.
+ * Despite what it might say in scsi_error.c, there may well be commands
+ * on the controller, as the cciss driver registers twice, once as a block
+ * device for the logical drives, and once as a scsi device, for any tape
+ * drives. So we know there are no commands out on the tape drives, but we
+ * don't know there are no commands on the controller, and it is likely
+ * that there probably are, as the cciss block device is most commonly used
+ * as a boot device (embedded controller on HP/Compaq systems.)
+*/
+
+static int cciss_eh_device_reset_handler(struct scsi_cmnd *scsicmd)
+{
+ int rc;
+ CommandList_struct *cmd_in_trouble;
+ unsigned char lunaddr[8];
+ ctlr_info_t *h;
+
+ /* find the controller to which the command to be aborted was sent */
+ h = (ctlr_info_t *) scsicmd->device->host->hostdata[0];
+ if (h == NULL) /* paranoia */
+ return FAILED;
+ dev_warn(&h->pdev->dev, "resetting tape drive or medium changer.\n");
+ /* find the command that's giving us trouble */
+ cmd_in_trouble = (CommandList_struct *) scsicmd->host_scribble;
+ if (cmd_in_trouble == NULL) /* paranoia */
+ return FAILED;
+ memcpy(lunaddr, &cmd_in_trouble->Header.LUN.LunAddrBytes[0], 8);
+ /* send a reset to the SCSI LUN which the command was sent to */
+ rc = sendcmd_withirq(h, CCISS_RESET_MSG, NULL, 0, 0, lunaddr,
+ TYPE_MSG);
+ if (rc == 0 && wait_for_device_to_become_ready(h, lunaddr) == 0)
+ return SUCCESS;
+ dev_warn(&h->pdev->dev, "resetting device failed.\n");
+ return FAILED;
+}
+
+static int cciss_eh_abort_handler(struct scsi_cmnd *scsicmd)
+{
+ int rc;
+ CommandList_struct *cmd_to_abort;
+ unsigned char lunaddr[8];
+ ctlr_info_t *h;
+
+ /* find the controller to which the command to be aborted was sent */
+ h = (ctlr_info_t *) scsicmd->device->host->hostdata[0];
+ if (h == NULL) /* paranoia */
+ return FAILED;
+ dev_warn(&h->pdev->dev, "aborting tardy SCSI cmd\n");
+
+ /* find the command to be aborted */
+ cmd_to_abort = (CommandList_struct *) scsicmd->host_scribble;
+ if (cmd_to_abort == NULL) /* paranoia */
+ return FAILED;
+ memcpy(lunaddr, &cmd_to_abort->Header.LUN.LunAddrBytes[0], 8);
+ rc = sendcmd_withirq(h, CCISS_ABORT_MSG, &cmd_to_abort->Header.Tag,
+ 0, 0, lunaddr, TYPE_MSG);
+ if (rc == 0)
+ return SUCCESS;
+ return FAILED;
+
+}
+
+#else /* no CONFIG_CISS_SCSI_TAPE */
+
+/* If no tape support, then these become defined out of existence */
+
+#define cciss_scsi_setup(cntl_num)
+#define cciss_engage_scsi(h)
+
+#endif /* CONFIG_CISS_SCSI_TAPE */
diff --git a/drivers/block/cciss_scsi.h b/drivers/block/cciss_scsi.h
new file mode 100644
index 000000000..e71d98672
--- /dev/null
+++ b/drivers/block/cciss_scsi.h
@@ -0,0 +1,79 @@
+/*
+ * Disk Array driver for HP Smart Array controllers, SCSI Tape module.
+ * (C) Copyright 2001, 2007 Hewlett-Packard Development Company, L.P.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 300, Boston, MA
+ * 02111-1307, USA.
+ *
+ * Questions/Comments/Bugfixes to iss_storagedev@hp.com
+ *
+ */
+#ifdef CONFIG_CISS_SCSI_TAPE
+#ifndef _CCISS_SCSI_H_
+#define _CCISS_SCSI_H_
+
+#include <scsi/scsicam.h> /* possibly irrelevant, since we don't show disks */
+
+ /* the scsi id of the adapter... */
+#define SELF_SCSI_ID 15
+ /* 15 is somewhat arbitrary, since the scsi-2 bus
+ that's presented by the driver to the OS is
+ fabricated. The "real" scsi-3 bus the
+ hardware presents is fabricated too.
+ The actual, honest-to-goodness physical
+ bus that the devices are attached to is not
+ addressible natively, and may in fact turn
+ out to be not scsi at all. */
+
+
+/*
+
+If the upper scsi layer tries to track how many commands we have
+outstanding, it will be operating under the misapprehension that it is
+the only one sending us requests. We also have the block interface,
+which is where most requests must surely come from, so the upper layer's
+notion of how many requests we have outstanding will be wrong most or
+all of the time.
+
+Note, the normal SCSI mid-layer error handling doesn't work well
+for this driver because 1) it takes the io_request_lock before
+calling error handlers and uses a local variable to store flags,
+so the io_request_lock cannot be released and interrupts enabled
+inside the error handlers, and, the error handlers cannot poll
+for command completion because they might get commands from the
+block half of the driver completing, and not know what to do
+with them. That's what we get for making a hybrid scsi/block
+driver, I suppose.
+
+*/
+
+struct cciss_scsi_dev_t {
+ int devtype;
+ int bus, target, lun; /* as presented to the OS */
+ unsigned char scsi3addr[8]; /* as presented to the HW */
+ unsigned char device_id[16]; /* from inquiry pg. 0x83 */
+ unsigned char vendor[8]; /* bytes 8-15 of inquiry data */
+ unsigned char model[16]; /* bytes 16-31 of inquiry data */
+ unsigned char revision[4]; /* bytes 32-35 of inquiry data */
+};
+
+struct cciss_scsi_hba_t {
+ char *name;
+ int ndevices;
+#define CCISS_MAX_SCSI_DEVS_PER_HBA 16
+ struct cciss_scsi_dev_t dev[CCISS_MAX_SCSI_DEVS_PER_HBA];
+};
+
+#endif /* _CCISS_SCSI_H_ */
+#endif /* CONFIG_CISS_SCSI_TAPE */
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
new file mode 100644
index 000000000..f749df9e1
--- /dev/null
+++ b/drivers/block/cpqarray.c
@@ -0,0 +1,1820 @@
+/*
+ * Disk Array driver for Compaq SMART2 Controllers
+ * Copyright 1998 Compaq Computer Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Questions/Comments/Bugfixes to iss_storagedev@hp.com
+ *
+ */
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+#include <linux/bio.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/major.h>
+#include <linux/fs.h>
+#include <linux/blkpg.h>
+#include <linux/timer.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/hdreg.h>
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/genhd.h>
+#include <linux/scatterlist.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+
+#define SMART2_DRIVER_VERSION(maj,min,submin) ((maj<<16)|(min<<8)|(submin))
+
+#define DRIVER_NAME "Compaq SMART2 Driver (v 2.6.0)"
+#define DRIVER_VERSION SMART2_DRIVER_VERSION(2,6,0)
+
+/* Embedded module documentation macros - see modules.h */
+/* Original author Chris Frantz - Compaq Computer Corporation */
+MODULE_AUTHOR("Compaq Computer Corporation");
+MODULE_DESCRIPTION("Driver for Compaq Smart2 Array Controllers version 2.6.0");
+MODULE_LICENSE("GPL");
+
+#include "cpqarray.h"
+#include "ida_cmd.h"
+#include "smart1,2.h"
+#include "ida_ioctl.h"
+
+#define READ_AHEAD 128
+#define NR_CMDS 128 /* This could probably go as high as ~400 */
+
+#define MAX_CTLR 8
+#define CTLR_SHIFT 8
+
+#define CPQARRAY_DMA_MASK 0xFFFFFFFF /* 32 bit DMA */
+
+static DEFINE_MUTEX(cpqarray_mutex);
+static int nr_ctlr;
+static ctlr_info_t *hba[MAX_CTLR];
+
+static int eisa[8];
+
+#define NR_PRODUCTS ARRAY_SIZE(products)
+
+/* board_id = Subsystem Device ID & Vendor ID
+ * product = Marketing Name for the board
+ * access = Address of the struct of function pointers
+ */
+static struct board_type products[] = {
+ { 0x0040110E, "IDA", &smart1_access },
+ { 0x0140110E, "IDA-2", &smart1_access },
+ { 0x1040110E, "IAES", &smart1_access },
+ { 0x2040110E, "SMART", &smart1_access },
+ { 0x3040110E, "SMART-2/E", &smart2e_access },
+ { 0x40300E11, "SMART-2/P", &smart2_access },
+ { 0x40310E11, "SMART-2SL", &smart2_access },
+ { 0x40320E11, "Smart Array 3200", &smart2_access },
+ { 0x40330E11, "Smart Array 3100ES", &smart2_access },
+ { 0x40340E11, "Smart Array 221", &smart2_access },
+ { 0x40400E11, "Integrated Array", &smart4_access },
+ { 0x40480E11, "Compaq Raid LC2", &smart4_access },
+ { 0x40500E11, "Smart Array 4200", &smart4_access },
+ { 0x40510E11, "Smart Array 4250ES", &smart4_access },
+ { 0x40580E11, "Smart Array 431", &smart4_access },
+};
+
+/* define the PCI info for the PCI cards this driver can control */
+static const struct pci_device_id cpqarray_pci_device_id[] =
+{
+ { PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_COMPAQ_42XX,
+ 0x0E11, 0x4058, 0, 0, 0}, /* SA431 */
+ { PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_COMPAQ_42XX,
+ 0x0E11, 0x4051, 0, 0, 0}, /* SA4250ES */
+ { PCI_VENDOR_ID_DEC, PCI_DEVICE_ID_COMPAQ_42XX,
+ 0x0E11, 0x4050, 0, 0, 0}, /* SA4200 */
+ { PCI_VENDOR_ID_NCR, PCI_DEVICE_ID_NCR_53C1510,
+ 0x0E11, 0x4048, 0, 0, 0}, /* LC2 */
+ { PCI_VENDOR_ID_NCR, PCI_DEVICE_ID_NCR_53C1510,
+ 0x0E11, 0x4040, 0, 0, 0}, /* Integrated Array */
+ { PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_SMART2P,
+ 0x0E11, 0x4034, 0, 0, 0}, /* SA 221 */
+ { PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_SMART2P,
+ 0x0E11, 0x4033, 0, 0, 0}, /* SA 3100ES*/
+ { PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_SMART2P,
+ 0x0E11, 0x4032, 0, 0, 0}, /* SA 3200*/
+ { PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_SMART2P,
+ 0x0E11, 0x4031, 0, 0, 0}, /* SA 2SL*/
+ { PCI_VENDOR_ID_COMPAQ, PCI_DEVICE_ID_COMPAQ_SMART2P,
+ 0x0E11, 0x4030, 0, 0, 0}, /* SA 2P */
+ { 0 }
+};
+
+MODULE_DEVICE_TABLE(pci, cpqarray_pci_device_id);
+
+static struct gendisk *ida_gendisk[MAX_CTLR][NWD];
+
+/* Debug... */
+#define DBG(s) do { s } while(0)
+/* Debug (general info)... */
+#define DBGINFO(s) do { } while(0)
+/* Debug Paranoid... */
+#define DBGP(s) do { } while(0)
+/* Debug Extra Paranoid... */
+#define DBGPX(s) do { } while(0)
+
+static int cpqarray_pci_init(ctlr_info_t *c, struct pci_dev *pdev);
+static void __iomem *remap_pci_mem(ulong base, ulong size);
+static int cpqarray_eisa_detect(void);
+static int pollcomplete(int ctlr);
+static void getgeometry(int ctlr);
+static void start_fwbk(int ctlr);
+
+static cmdlist_t * cmd_alloc(ctlr_info_t *h, int get_from_pool);
+static void cmd_free(ctlr_info_t *h, cmdlist_t *c, int got_from_pool);
+
+static void free_hba(int i);
+static int alloc_cpqarray_hba(void);
+
+static int sendcmd(
+ __u8 cmd,
+ int ctlr,
+ void *buff,
+ size_t size,
+ unsigned int blk,
+ unsigned int blkcnt,
+ unsigned int log_unit );
+
+static int ida_unlocked_open(struct block_device *bdev, fmode_t mode);
+static void ida_release(struct gendisk *disk, fmode_t mode);
+static int ida_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg);
+static int ida_getgeo(struct block_device *bdev, struct hd_geometry *geo);
+static int ida_ctlr_ioctl(ctlr_info_t *h, int dsk, ida_ioctl_t *io);
+
+static void do_ida_request(struct request_queue *q);
+static void start_io(ctlr_info_t *h);
+
+static inline void addQ(cmdlist_t **Qptr, cmdlist_t *c);
+static inline cmdlist_t *removeQ(cmdlist_t **Qptr, cmdlist_t *c);
+static inline void complete_command(cmdlist_t *cmd, int timeout);
+
+static irqreturn_t do_ida_intr(int irq, void *dev_id);
+static void ida_timer(unsigned long tdata);
+static int ida_revalidate(struct gendisk *disk);
+static int revalidate_allvol(ctlr_info_t *host);
+static int cpqarray_register_ctlr(int ctlr, struct pci_dev *pdev);
+
+#ifdef CONFIG_PROC_FS
+static void ida_procinit(int i);
+#else
+static void ida_procinit(int i) {}
+#endif
+
+static inline drv_info_t *get_drv(struct gendisk *disk)
+{
+ return disk->private_data;
+}
+
+static inline ctlr_info_t *get_host(struct gendisk *disk)
+{
+ return disk->queue->queuedata;
+}
+
+
+static const struct block_device_operations ida_fops = {
+ .owner = THIS_MODULE,
+ .open = ida_unlocked_open,
+ .release = ida_release,
+ .ioctl = ida_ioctl,
+ .getgeo = ida_getgeo,
+ .revalidate_disk= ida_revalidate,
+};
+
+
+#ifdef CONFIG_PROC_FS
+
+static struct proc_dir_entry *proc_array;
+static const struct file_operations ida_proc_fops;
+
+/*
+ * Get us a file in /proc/array that says something about each controller.
+ * Create /proc/array if it doesn't exist yet.
+ */
+static void __init ida_procinit(int i)
+{
+ if (proc_array == NULL) {
+ proc_array = proc_mkdir("driver/cpqarray", NULL);
+ if (!proc_array) return;
+ }
+
+ proc_create_data(hba[i]->devname, 0, proc_array, &ida_proc_fops, hba[i]);
+}
+
+/*
+ * Report information about this controller.
+ */
+static int ida_proc_show(struct seq_file *m, void *v)
+{
+ int i, ctlr;
+ ctlr_info_t *h = (ctlr_info_t*)m->private;
+ drv_info_t *drv;
+#ifdef CPQ_PROC_PRINT_QUEUES
+ cmdlist_t *c;
+ unsigned long flags;
+#endif
+
+ ctlr = h->ctlr;
+ seq_printf(m, "%s: Compaq %s Controller\n"
+ " Board ID: 0x%08lx\n"
+ " Firmware Revision: %c%c%c%c\n"
+ " Controller Sig: 0x%08lx\n"
+ " Memory Address: 0x%08lx\n"
+ " I/O Port: 0x%04x\n"
+ " IRQ: %d\n"
+ " Logical drives: %d\n"
+ " Physical drives: %d\n\n"
+ " Current Q depth: %d\n"
+ " Max Q depth since init: %d\n\n",
+ h->devname,
+ h->product_name,
+ (unsigned long)h->board_id,
+ h->firm_rev[0], h->firm_rev[1], h->firm_rev[2], h->firm_rev[3],
+ (unsigned long)h->ctlr_sig, (unsigned long)h->vaddr,
+ (unsigned int) h->io_mem_addr, (unsigned int)h->intr,
+ h->log_drives, h->phys_drives,
+ h->Qdepth, h->maxQsinceinit);
+
+ seq_puts(m, "Logical Drive Info:\n");
+
+ for(i=0; i<h->log_drives; i++) {
+ drv = &h->drv[i];
+ seq_printf(m, "ida/c%dd%d: blksz=%d nr_blks=%d\n",
+ ctlr, i, drv->blk_size, drv->nr_blks);
+ }
+
+#ifdef CPQ_PROC_PRINT_QUEUES
+ spin_lock_irqsave(IDA_LOCK(h->ctlr), flags);
+ seq_puts(m, "\nCurrent Queues:\n");
+
+ c = h->reqQ;
+ seq_printf(m, "reqQ = %p", c);
+ if (c) c=c->next;
+ while(c && c != h->reqQ) {
+ seq_printf(m, "->%p", c);
+ c=c->next;
+ }
+
+ c = h->cmpQ;
+ seq_printf(m, "\ncmpQ = %p", c);
+ if (c) c=c->next;
+ while(c && c != h->cmpQ) {
+ seq_printf(m, "->%p", c);
+ c=c->next;
+ }
+
+ seq_putc(m, '\n');
+ spin_unlock_irqrestore(IDA_LOCK(h->ctlr), flags);
+#endif
+ seq_printf(m, "nr_allocs = %d\nnr_frees = %d\n",
+ h->nr_allocs, h->nr_frees);
+ return 0;
+}
+
+static int ida_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, ida_proc_show, PDE_DATA(inode));
+}
+
+static const struct file_operations ida_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = ida_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+#endif /* CONFIG_PROC_FS */
+
+module_param_array(eisa, int, NULL, 0);
+
+static void release_io_mem(ctlr_info_t *c)
+{
+ /* if IO mem was not protected do nothing */
+ if( c->io_mem_addr == 0)
+ return;
+ release_region(c->io_mem_addr, c->io_mem_length);
+ c->io_mem_addr = 0;
+ c->io_mem_length = 0;
+}
+
+static void cpqarray_remove_one(int i)
+{
+ int j;
+ char buff[4];
+
+ /* sendcmd will turn off interrupt, and send the flush...
+ * To write all data in the battery backed cache to disks
+ * no data returned, but don't want to send NULL to sendcmd */
+ if( sendcmd(FLUSH_CACHE, i, buff, 4, 0, 0, 0))
+ {
+ printk(KERN_WARNING "Unable to flush cache on controller %d\n",
+ i);
+ }
+ free_irq(hba[i]->intr, hba[i]);
+ iounmap(hba[i]->vaddr);
+ unregister_blkdev(COMPAQ_SMART2_MAJOR+i, hba[i]->devname);
+ del_timer(&hba[i]->timer);
+ remove_proc_entry(hba[i]->devname, proc_array);
+ pci_free_consistent(hba[i]->pci_dev,
+ NR_CMDS * sizeof(cmdlist_t), (hba[i]->cmd_pool),
+ hba[i]->cmd_pool_dhandle);
+ kfree(hba[i]->cmd_pool_bits);
+ for(j = 0; j < NWD; j++) {
+ if (ida_gendisk[i][j]->flags & GENHD_FL_UP)
+ del_gendisk(ida_gendisk[i][j]);
+ put_disk(ida_gendisk[i][j]);
+ }
+ blk_cleanup_queue(hba[i]->queue);
+ release_io_mem(hba[i]);
+ free_hba(i);
+}
+
+static void cpqarray_remove_one_pci(struct pci_dev *pdev)
+{
+ int i;
+ ctlr_info_t *tmp_ptr;
+
+ if (pci_get_drvdata(pdev) == NULL) {
+ printk( KERN_ERR "cpqarray: Unable to remove device \n");
+ return;
+ }
+
+ tmp_ptr = pci_get_drvdata(pdev);
+ i = tmp_ptr->ctlr;
+ if (hba[i] == NULL) {
+ printk(KERN_ERR "cpqarray: controller %d appears to have"
+ "already been removed \n", i);
+ return;
+ }
+ pci_set_drvdata(pdev, NULL);
+
+ cpqarray_remove_one(i);
+}
+
+/* removing an instance that was not removed automatically..
+ * must be an eisa card.
+ */
+static void cpqarray_remove_one_eisa(int i)
+{
+ if (hba[i] == NULL) {
+ printk(KERN_ERR "cpqarray: controller %d appears to have"
+ "already been removed \n", i);
+ return;
+ }
+ cpqarray_remove_one(i);
+}
+
+/* pdev is NULL for eisa */
+static int cpqarray_register_ctlr(int i, struct pci_dev *pdev)
+{
+ struct request_queue *q;
+ int j;
+
+ /*
+ * register block devices
+ * Find disks and fill in structs
+ * Get an interrupt, set the Q depth and get into /proc
+ */
+
+ /* If this successful it should insure that we are the only */
+ /* instance of the driver */
+ if (register_blkdev(COMPAQ_SMART2_MAJOR+i, hba[i]->devname)) {
+ goto Enomem4;
+ }
+ hba[i]->access.set_intr_mask(hba[i], 0);
+ if (request_irq(hba[i]->intr, do_ida_intr, IRQF_SHARED,
+ hba[i]->devname, hba[i]))
+ {
+ printk(KERN_ERR "cpqarray: Unable to get irq %d for %s\n",
+ hba[i]->intr, hba[i]->devname);
+ goto Enomem3;
+ }
+
+ for (j=0; j<NWD; j++) {
+ ida_gendisk[i][j] = alloc_disk(1 << NWD_SHIFT);
+ if (!ida_gendisk[i][j])
+ goto Enomem2;
+ }
+
+ hba[i]->cmd_pool = pci_alloc_consistent(
+ hba[i]->pci_dev, NR_CMDS * sizeof(cmdlist_t),
+ &(hba[i]->cmd_pool_dhandle));
+ hba[i]->cmd_pool_bits = kcalloc(
+ DIV_ROUND_UP(NR_CMDS, BITS_PER_LONG), sizeof(unsigned long),
+ GFP_KERNEL);
+
+ if (!hba[i]->cmd_pool_bits || !hba[i]->cmd_pool)
+ goto Enomem1;
+
+ memset(hba[i]->cmd_pool, 0, NR_CMDS * sizeof(cmdlist_t));
+ printk(KERN_INFO "cpqarray: Finding drives on %s",
+ hba[i]->devname);
+
+ spin_lock_init(&hba[i]->lock);
+ q = blk_init_queue(do_ida_request, &hba[i]->lock);
+ if (!q)
+ goto Enomem1;
+
+ hba[i]->queue = q;
+ q->queuedata = hba[i];
+
+ getgeometry(i);
+ start_fwbk(i);
+
+ ida_procinit(i);
+
+ if (pdev)
+ blk_queue_bounce_limit(q, hba[i]->pci_dev->dma_mask);
+
+ /* This is a hardware imposed limit. */
+ blk_queue_max_segments(q, SG_MAX);
+
+ init_timer(&hba[i]->timer);
+ hba[i]->timer.expires = jiffies + IDA_TIMER;
+ hba[i]->timer.data = (unsigned long)hba[i];
+ hba[i]->timer.function = ida_timer;
+ add_timer(&hba[i]->timer);
+
+ /* Enable IRQ now that spinlock and rate limit timer are set up */
+ hba[i]->access.set_intr_mask(hba[i], FIFO_NOT_EMPTY);
+
+ for(j=0; j<NWD; j++) {
+ struct gendisk *disk = ida_gendisk[i][j];
+ drv_info_t *drv = &hba[i]->drv[j];
+ sprintf(disk->disk_name, "ida/c%dd%d", i, j);
+ disk->major = COMPAQ_SMART2_MAJOR + i;
+ disk->first_minor = j<<NWD_SHIFT;
+ disk->fops = &ida_fops;
+ if (j && !drv->nr_blks)
+ continue;
+ blk_queue_logical_block_size(hba[i]->queue, drv->blk_size);
+ set_capacity(disk, drv->nr_blks);
+ disk->queue = hba[i]->queue;
+ disk->private_data = drv;
+ add_disk(disk);
+ }
+
+ /* done ! */
+ return(i);
+
+Enomem1:
+ nr_ctlr = i;
+ kfree(hba[i]->cmd_pool_bits);
+ if (hba[i]->cmd_pool)
+ pci_free_consistent(hba[i]->pci_dev, NR_CMDS*sizeof(cmdlist_t),
+ hba[i]->cmd_pool, hba[i]->cmd_pool_dhandle);
+Enomem2:
+ while (j--) {
+ put_disk(ida_gendisk[i][j]);
+ ida_gendisk[i][j] = NULL;
+ }
+ free_irq(hba[i]->intr, hba[i]);
+Enomem3:
+ unregister_blkdev(COMPAQ_SMART2_MAJOR+i, hba[i]->devname);
+Enomem4:
+ if (pdev)
+ pci_set_drvdata(pdev, NULL);
+ release_io_mem(hba[i]);
+ free_hba(i);
+
+ printk( KERN_ERR "cpqarray: out of memory");
+
+ return -1;
+}
+
+static int cpqarray_init_one(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ int i;
+
+ printk(KERN_DEBUG "cpqarray: Device 0x%x has been found at"
+ " bus %d dev %d func %d\n",
+ pdev->device, pdev->bus->number, PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn));
+ i = alloc_cpqarray_hba();
+ if( i < 0 )
+ return (-1);
+ memset(hba[i], 0, sizeof(ctlr_info_t));
+ sprintf(hba[i]->devname, "ida%d", i);
+ hba[i]->ctlr = i;
+ /* Initialize the pdev driver private data */
+ pci_set_drvdata(pdev, hba[i]);
+
+ if (cpqarray_pci_init(hba[i], pdev) != 0) {
+ pci_set_drvdata(pdev, NULL);
+ release_io_mem(hba[i]);
+ free_hba(i);
+ return -1;
+ }
+
+ return (cpqarray_register_ctlr(i, pdev));
+}
+
+static struct pci_driver cpqarray_pci_driver = {
+ .name = "cpqarray",
+ .probe = cpqarray_init_one,
+ .remove = cpqarray_remove_one_pci,
+ .id_table = cpqarray_pci_device_id,
+};
+
+/*
+ * This is it. Find all the controllers and register them.
+ * returns the number of block devices registered.
+ */
+static int __init cpqarray_init(void)
+{
+ int num_cntlrs_reg = 0;
+ int i;
+ int rc = 0;
+
+ /* detect controllers */
+ printk(DRIVER_NAME "\n");
+
+ rc = pci_register_driver(&cpqarray_pci_driver);
+ if (rc)
+ return rc;
+ cpqarray_eisa_detect();
+
+ for (i=0; i < MAX_CTLR; i++) {
+ if (hba[i] != NULL)
+ num_cntlrs_reg++;
+ }
+
+ if (num_cntlrs_reg)
+ return 0;
+ else {
+ pci_unregister_driver(&cpqarray_pci_driver);
+ return -ENODEV;
+ }
+}
+
+/* Function to find the first free pointer into our hba[] array */
+/* Returns -1 if no free entries are left. */
+static int alloc_cpqarray_hba(void)
+{
+ int i;
+
+ for(i=0; i< MAX_CTLR; i++) {
+ if (hba[i] == NULL) {
+ hba[i] = kmalloc(sizeof(ctlr_info_t), GFP_KERNEL);
+ if(hba[i]==NULL) {
+ printk(KERN_ERR "cpqarray: out of memory.\n");
+ return (-1);
+ }
+ return (i);
+ }
+ }
+ printk(KERN_WARNING "cpqarray: This driver supports a maximum"
+ " of 8 controllers.\n");
+ return(-1);
+}
+
+static void free_hba(int i)
+{
+ kfree(hba[i]);
+ hba[i]=NULL;
+}
+
+/*
+ * Find the IO address of the controller, its IRQ and so forth. Fill
+ * in some basic stuff into the ctlr_info_t structure.
+ */
+static int cpqarray_pci_init(ctlr_info_t *c, struct pci_dev *pdev)
+{
+ ushort vendor_id, device_id, command;
+ unchar cache_line_size, latency_timer;
+ unchar irq, revision;
+ unsigned long addr[6];
+ __u32 board_id;
+
+ int i;
+
+ c->pci_dev = pdev;
+ pci_set_master(pdev);
+ if (pci_enable_device(pdev)) {
+ printk(KERN_ERR "cpqarray: Unable to Enable PCI device\n");
+ return -1;
+ }
+ vendor_id = pdev->vendor;
+ device_id = pdev->device;
+ revision = pdev->revision;
+ irq = pdev->irq;
+
+ for(i=0; i<6; i++)
+ addr[i] = pci_resource_start(pdev, i);
+
+ if (pci_set_dma_mask(pdev, CPQARRAY_DMA_MASK) != 0)
+ {
+ printk(KERN_ERR "cpqarray: Unable to set DMA mask\n");
+ return -1;
+ }
+
+ pci_read_config_word(pdev, PCI_COMMAND, &command);
+ pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &cache_line_size);
+ pci_read_config_byte(pdev, PCI_LATENCY_TIMER, &latency_timer);
+
+ pci_read_config_dword(pdev, 0x2c, &board_id);
+
+ /* check to see if controller has been disabled */
+ if(!(command & 0x02)) {
+ printk(KERN_WARNING
+ "cpqarray: controller appears to be disabled\n");
+ return(-1);
+ }
+
+DBGINFO(
+ printk("vendor_id = %x\n", vendor_id);
+ printk("device_id = %x\n", device_id);
+ printk("command = %x\n", command);
+ for(i=0; i<6; i++)
+ printk("addr[%d] = %lx\n", i, addr[i]);
+ printk("revision = %x\n", revision);
+ printk("irq = %x\n", irq);
+ printk("cache_line_size = %x\n", cache_line_size);
+ printk("latency_timer = %x\n", latency_timer);
+ printk("board_id = %x\n", board_id);
+);
+
+ c->intr = irq;
+
+ for(i=0; i<6; i++) {
+ if (pci_resource_flags(pdev, i) & PCI_BASE_ADDRESS_SPACE_IO)
+ { /* IO space */
+ c->io_mem_addr = addr[i];
+ c->io_mem_length = pci_resource_end(pdev, i)
+ - pci_resource_start(pdev, i) + 1;
+ if(!request_region( c->io_mem_addr, c->io_mem_length,
+ "cpqarray"))
+ {
+ printk( KERN_WARNING "cpqarray I/O memory range already in use addr %lx length = %ld\n", c->io_mem_addr, c->io_mem_length);
+ c->io_mem_addr = 0;
+ c->io_mem_length = 0;
+ }
+ break;
+ }
+ }
+
+ c->paddr = 0;
+ for(i=0; i<6; i++)
+ if (!(pci_resource_flags(pdev, i) &
+ PCI_BASE_ADDRESS_SPACE_IO)) {
+ c->paddr = pci_resource_start (pdev, i);
+ break;
+ }
+ if (!c->paddr)
+ return -1;
+ c->vaddr = remap_pci_mem(c->paddr, 128);
+ if (!c->vaddr)
+ return -1;
+ c->board_id = board_id;
+
+ for(i=0; i<NR_PRODUCTS; i++) {
+ if (board_id == products[i].board_id) {
+ c->product_name = products[i].product_name;
+ c->access = *(products[i].access);
+ break;
+ }
+ }
+ if (i == NR_PRODUCTS) {
+ printk(KERN_WARNING "cpqarray: Sorry, I don't know how"
+ " to access the SMART Array controller %08lx\n",
+ (unsigned long)board_id);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Map (physical) PCI mem into (virtual) kernel space
+ */
+static void __iomem *remap_pci_mem(ulong base, ulong size)
+{
+ ulong page_base = ((ulong) base) & PAGE_MASK;
+ ulong page_offs = ((ulong) base) - page_base;
+ void __iomem *page_remapped = ioremap(page_base, page_offs+size);
+
+ return (page_remapped ? (page_remapped + page_offs) : NULL);
+}
+
+#ifndef MODULE
+/*
+ * Config string is a comma separated set of i/o addresses of EISA cards.
+ */
+static int cpqarray_setup(char *str)
+{
+ int i, ints[9];
+
+ (void)get_options(str, ARRAY_SIZE(ints), ints);
+
+ for(i=0; i<ints[0] && i<8; i++)
+ eisa[i] = ints[i+1];
+ return 1;
+}
+
+__setup("smart2=", cpqarray_setup);
+
+#endif
+
+/*
+ * Find an EISA controller's signature. Set up an hba if we find it.
+ */
+static int cpqarray_eisa_detect(void)
+{
+ int i=0, j;
+ __u32 board_id;
+ int intr;
+ int ctlr;
+ int num_ctlr = 0;
+
+ while(i<8 && eisa[i]) {
+ ctlr = alloc_cpqarray_hba();
+ if(ctlr == -1)
+ break;
+ board_id = inl(eisa[i]+0xC80);
+ for(j=0; j < NR_PRODUCTS; j++)
+ if (board_id == products[j].board_id)
+ break;
+
+ if (j == NR_PRODUCTS) {
+ printk(KERN_WARNING "cpqarray: Sorry, I don't know how"
+ " to access the SMART Array controller %08lx\n", (unsigned long)board_id);
+ continue;
+ }
+
+ memset(hba[ctlr], 0, sizeof(ctlr_info_t));
+ hba[ctlr]->io_mem_addr = eisa[i];
+ hba[ctlr]->io_mem_length = 0x7FF;
+ if(!request_region(hba[ctlr]->io_mem_addr,
+ hba[ctlr]->io_mem_length,
+ "cpqarray"))
+ {
+ printk(KERN_WARNING "cpqarray: I/O range already in "
+ "use addr = %lx length = %ld\n",
+ hba[ctlr]->io_mem_addr,
+ hba[ctlr]->io_mem_length);
+ free_hba(ctlr);
+ continue;
+ }
+
+ /*
+ * Read the config register to find our interrupt
+ */
+ intr = inb(eisa[i]+0xCC0) >> 4;
+ if (intr & 1) intr = 11;
+ else if (intr & 2) intr = 10;
+ else if (intr & 4) intr = 14;
+ else if (intr & 8) intr = 15;
+
+ hba[ctlr]->intr = intr;
+ sprintf(hba[ctlr]->devname, "ida%d", nr_ctlr);
+ hba[ctlr]->product_name = products[j].product_name;
+ hba[ctlr]->access = *(products[j].access);
+ hba[ctlr]->ctlr = ctlr;
+ hba[ctlr]->board_id = board_id;
+ hba[ctlr]->pci_dev = NULL; /* not PCI */
+
+DBGINFO(
+ printk("i = %d, j = %d\n", i, j);
+ printk("irq = %x\n", intr);
+ printk("product name = %s\n", products[j].product_name);
+ printk("board_id = %x\n", board_id);
+);
+
+ num_ctlr++;
+ i++;
+
+ if (cpqarray_register_ctlr(ctlr, NULL) == -1)
+ printk(KERN_WARNING
+ "cpqarray: Can't register EISA controller %d\n",
+ ctlr);
+
+ }
+
+ return num_ctlr;
+}
+
+/*
+ * Open. Make sure the device is really there.
+ */
+static int ida_open(struct block_device *bdev, fmode_t mode)
+{
+ drv_info_t *drv = get_drv(bdev->bd_disk);
+ ctlr_info_t *host = get_host(bdev->bd_disk);
+
+ DBGINFO(printk("ida_open %s\n", bdev->bd_disk->disk_name));
+ /*
+ * Root is allowed to open raw volume zero even if it's not configured
+ * so array config can still work. I don't think I really like this,
+ * but I'm already using way to many device nodes to claim another one
+ * for "raw controller".
+ */
+ if (!drv->nr_blks) {
+ if (!capable(CAP_SYS_RAWIO))
+ return -ENXIO;
+ if (!capable(CAP_SYS_ADMIN) && drv != host->drv)
+ return -ENXIO;
+ }
+ host->usage_count++;
+ return 0;
+}
+
+static int ida_unlocked_open(struct block_device *bdev, fmode_t mode)
+{
+ int ret;
+
+ mutex_lock(&cpqarray_mutex);
+ ret = ida_open(bdev, mode);
+ mutex_unlock(&cpqarray_mutex);
+
+ return ret;
+}
+
+/*
+ * Close. Sync first.
+ */
+static void ida_release(struct gendisk *disk, fmode_t mode)
+{
+ ctlr_info_t *host;
+
+ mutex_lock(&cpqarray_mutex);
+ host = get_host(disk);
+ host->usage_count--;
+ mutex_unlock(&cpqarray_mutex);
+}
+
+/*
+ * Enqueuing and dequeuing functions for cmdlists.
+ */
+static inline void addQ(cmdlist_t **Qptr, cmdlist_t *c)
+{
+ if (*Qptr == NULL) {
+ *Qptr = c;
+ c->next = c->prev = c;
+ } else {
+ c->prev = (*Qptr)->prev;
+ c->next = (*Qptr);
+ (*Qptr)->prev->next = c;
+ (*Qptr)->prev = c;
+ }
+}
+
+static inline cmdlist_t *removeQ(cmdlist_t **Qptr, cmdlist_t *c)
+{
+ if (c && c->next != c) {
+ if (*Qptr == c) *Qptr = c->next;
+ c->prev->next = c->next;
+ c->next->prev = c->prev;
+ } else {
+ *Qptr = NULL;
+ }
+ return c;
+}
+
+/*
+ * Get a request and submit it to the controller.
+ * This routine needs to grab all the requests it possibly can from the
+ * req Q and submit them. Interrupts are off (and need to be off) when you
+ * are in here (either via the dummy do_ida_request functions or by being
+ * called from the interrupt handler
+ */
+static void do_ida_request(struct request_queue *q)
+{
+ ctlr_info_t *h = q->queuedata;
+ cmdlist_t *c;
+ struct request *creq;
+ struct scatterlist tmp_sg[SG_MAX];
+ int i, dir, seg;
+
+queue_next:
+ creq = blk_peek_request(q);
+ if (!creq)
+ goto startio;
+
+ BUG_ON(creq->nr_phys_segments > SG_MAX);
+
+ if ((c = cmd_alloc(h,1)) == NULL)
+ goto startio;
+
+ blk_start_request(creq);
+
+ c->ctlr = h->ctlr;
+ c->hdr.unit = (drv_info_t *)(creq->rq_disk->private_data) - h->drv;
+ c->hdr.size = sizeof(rblk_t) >> 2;
+ c->size += sizeof(rblk_t);
+
+ c->req.hdr.blk = blk_rq_pos(creq);
+ c->rq = creq;
+DBGPX(
+ printk("sector=%d, nr_sectors=%u\n",
+ blk_rq_pos(creq), blk_rq_sectors(creq));
+);
+ sg_init_table(tmp_sg, SG_MAX);
+ seg = blk_rq_map_sg(q, creq, tmp_sg);
+
+ /* Now do all the DMA Mappings */
+ if (rq_data_dir(creq) == READ)
+ dir = PCI_DMA_FROMDEVICE;
+ else
+ dir = PCI_DMA_TODEVICE;
+ for( i=0; i < seg; i++)
+ {
+ c->req.sg[i].size = tmp_sg[i].length;
+ c->req.sg[i].addr = (__u32) pci_map_page(h->pci_dev,
+ sg_page(&tmp_sg[i]),
+ tmp_sg[i].offset,
+ tmp_sg[i].length, dir);
+ }
+DBGPX( printk("Submitting %u sectors in %d segments\n", blk_rq_sectors(creq), seg); );
+ c->req.hdr.sg_cnt = seg;
+ c->req.hdr.blk_cnt = blk_rq_sectors(creq);
+ c->req.hdr.cmd = (rq_data_dir(creq) == READ) ? IDA_READ : IDA_WRITE;
+ c->type = CMD_RWREQ;
+
+ /* Put the request on the tail of the request queue */
+ addQ(&h->reqQ, c);
+ h->Qdepth++;
+ if (h->Qdepth > h->maxQsinceinit)
+ h->maxQsinceinit = h->Qdepth;
+
+ goto queue_next;
+
+startio:
+ start_io(h);
+}
+
+/*
+ * start_io submits everything on a controller's request queue
+ * and moves it to the completion queue.
+ *
+ * Interrupts had better be off if you're in here
+ */
+static void start_io(ctlr_info_t *h)
+{
+ cmdlist_t *c;
+
+ while((c = h->reqQ) != NULL) {
+ /* Can't do anything if we're busy */
+ if (h->access.fifo_full(h) == 0)
+ return;
+
+ /* Get the first entry from the request Q */
+ removeQ(&h->reqQ, c);
+ h->Qdepth--;
+
+ /* Tell the controller to do our bidding */
+ h->access.submit_command(h, c);
+
+ /* Get onto the completion Q */
+ addQ(&h->cmpQ, c);
+ }
+}
+
+/*
+ * Mark all buffers that cmd was responsible for
+ */
+static inline void complete_command(cmdlist_t *cmd, int timeout)
+{
+ struct request *rq = cmd->rq;
+ int error = 0;
+ int i, ddir;
+
+ if (cmd->req.hdr.rcode & RCODE_NONFATAL &&
+ (hba[cmd->ctlr]->misc_tflags & MISC_NONFATAL_WARN) == 0) {
+ printk(KERN_NOTICE "Non Fatal error on ida/c%dd%d\n",
+ cmd->ctlr, cmd->hdr.unit);
+ hba[cmd->ctlr]->misc_tflags |= MISC_NONFATAL_WARN;
+ }
+ if (cmd->req.hdr.rcode & RCODE_FATAL) {
+ printk(KERN_WARNING "Fatal error on ida/c%dd%d\n",
+ cmd->ctlr, cmd->hdr.unit);
+ error = -EIO;
+ }
+ if (cmd->req.hdr.rcode & RCODE_INVREQ) {
+ printk(KERN_WARNING "Invalid request on ida/c%dd%d = (cmd=%x sect=%d cnt=%d sg=%d ret=%x)\n",
+ cmd->ctlr, cmd->hdr.unit, cmd->req.hdr.cmd,
+ cmd->req.hdr.blk, cmd->req.hdr.blk_cnt,
+ cmd->req.hdr.sg_cnt, cmd->req.hdr.rcode);
+ error = -EIO;
+ }
+ if (timeout)
+ error = -EIO;
+ /* unmap the DMA mapping for all the scatter gather elements */
+ if (cmd->req.hdr.cmd == IDA_READ)
+ ddir = PCI_DMA_FROMDEVICE;
+ else
+ ddir = PCI_DMA_TODEVICE;
+ for(i=0; i<cmd->req.hdr.sg_cnt; i++)
+ pci_unmap_page(hba[cmd->ctlr]->pci_dev, cmd->req.sg[i].addr,
+ cmd->req.sg[i].size, ddir);
+
+ DBGPX(printk("Done with %p\n", rq););
+ __blk_end_request_all(rq, error);
+}
+
+/*
+ * The controller will interrupt us upon completion of commands.
+ * Find the command on the completion queue, remove it, tell the OS and
+ * try to queue up more IO
+ */
+static irqreturn_t do_ida_intr(int irq, void *dev_id)
+{
+ ctlr_info_t *h = dev_id;
+ cmdlist_t *c;
+ unsigned long istat;
+ unsigned long flags;
+ __u32 a,a1;
+
+ istat = h->access.intr_pending(h);
+ /* Is this interrupt for us? */
+ if (istat == 0)
+ return IRQ_NONE;
+
+ /*
+ * If there are completed commands in the completion queue,
+ * we had better do something about it.
+ */
+ spin_lock_irqsave(IDA_LOCK(h->ctlr), flags);
+ if (istat & FIFO_NOT_EMPTY) {
+ while((a = h->access.command_completed(h))) {
+ a1 = a; a &= ~3;
+ if ((c = h->cmpQ) == NULL)
+ {
+ printk(KERN_WARNING "cpqarray: Completion of %08lx ignored\n", (unsigned long)a1);
+ continue;
+ }
+ while(c->busaddr != a) {
+ c = c->next;
+ if (c == h->cmpQ)
+ break;
+ }
+ /*
+ * If we've found the command, take it off the
+ * completion Q and free it
+ */
+ if (c->busaddr == a) {
+ removeQ(&h->cmpQ, c);
+ /* Check for invalid command.
+ * Controller returns command error,
+ * But rcode = 0.
+ */
+
+ if((a1 & 0x03) && (c->req.hdr.rcode == 0))
+ {
+ c->req.hdr.rcode = RCODE_INVREQ;
+ }
+ if (c->type == CMD_RWREQ) {
+ complete_command(c, 0);
+ cmd_free(h, c, 1);
+ } else if (c->type == CMD_IOCTL_PEND) {
+ c->type = CMD_IOCTL_DONE;
+ }
+ continue;
+ }
+ }
+ }
+
+ /*
+ * See if we can queue up some more IO
+ */
+ do_ida_request(h->queue);
+ spin_unlock_irqrestore(IDA_LOCK(h->ctlr), flags);
+ return IRQ_HANDLED;
+}
+
+/*
+ * This timer was for timing out requests that haven't happened after
+ * IDA_TIMEOUT. That wasn't such a good idea. This timer is used to
+ * reset a flags structure so we don't flood the user with
+ * "Non-Fatal error" messages.
+ */
+static void ida_timer(unsigned long tdata)
+{
+ ctlr_info_t *h = (ctlr_info_t*)tdata;
+
+ h->timer.expires = jiffies + IDA_TIMER;
+ add_timer(&h->timer);
+ h->misc_tflags = 0;
+}
+
+static int ida_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ drv_info_t *drv = get_drv(bdev->bd_disk);
+
+ if (drv->cylinders) {
+ geo->heads = drv->heads;
+ geo->sectors = drv->sectors;
+ geo->cylinders = drv->cylinders;
+ } else {
+ geo->heads = 0xff;
+ geo->sectors = 0x3f;
+ geo->cylinders = drv->nr_blks / (0xff*0x3f);
+ }
+
+ return 0;
+}
+
+/*
+ * ida_ioctl does some miscellaneous stuff like reporting drive geometry,
+ * setting readahead and submitting commands from userspace to the controller.
+ */
+static int ida_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
+{
+ drv_info_t *drv = get_drv(bdev->bd_disk);
+ ctlr_info_t *host = get_host(bdev->bd_disk);
+ int error;
+ ida_ioctl_t __user *io = (ida_ioctl_t __user *)arg;
+ ida_ioctl_t *my_io;
+
+ switch(cmd) {
+ case IDAGETDRVINFO:
+ if (copy_to_user(&io->c.drv, drv, sizeof(drv_info_t)))
+ return -EFAULT;
+ return 0;
+ case IDAPASSTHRU:
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+ my_io = kmalloc(sizeof(ida_ioctl_t), GFP_KERNEL);
+ if (!my_io)
+ return -ENOMEM;
+ error = -EFAULT;
+ if (copy_from_user(my_io, io, sizeof(*my_io)))
+ goto out_passthru;
+ error = ida_ctlr_ioctl(host, drv - host->drv, my_io);
+ if (error)
+ goto out_passthru;
+ error = -EFAULT;
+ if (copy_to_user(io, my_io, sizeof(*my_io)))
+ goto out_passthru;
+ error = 0;
+out_passthru:
+ kfree(my_io);
+ return error;
+ case IDAGETCTLRSIG:
+ if (!arg) return -EINVAL;
+ if (put_user(host->ctlr_sig, (int __user *)arg))
+ return -EFAULT;
+ return 0;
+ case IDAREVALIDATEVOLS:
+ if (MINOR(bdev->bd_dev) != 0)
+ return -ENXIO;
+ return revalidate_allvol(host);
+ case IDADRIVERVERSION:
+ if (!arg) return -EINVAL;
+ if (put_user(DRIVER_VERSION, (unsigned long __user *)arg))
+ return -EFAULT;
+ return 0;
+ case IDAGETPCIINFO:
+ {
+
+ ida_pci_info_struct pciinfo;
+
+ if (!arg) return -EINVAL;
+ memset(&pciinfo, 0, sizeof(pciinfo));
+ pciinfo.bus = host->pci_dev->bus->number;
+ pciinfo.dev_fn = host->pci_dev->devfn;
+ pciinfo.board_id = host->board_id;
+ if(copy_to_user((void __user *) arg, &pciinfo,
+ sizeof( ida_pci_info_struct)))
+ return -EFAULT;
+ return(0);
+ }
+
+ default:
+ return -EINVAL;
+ }
+
+}
+
+static int ida_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long param)
+{
+ int ret;
+
+ mutex_lock(&cpqarray_mutex);
+ ret = ida_locked_ioctl(bdev, mode, cmd, param);
+ mutex_unlock(&cpqarray_mutex);
+
+ return ret;
+}
+
+/*
+ * ida_ctlr_ioctl is for passing commands to the controller from userspace.
+ * The command block (io) has already been copied to kernel space for us,
+ * however, any elements in the sglist need to be copied to kernel space
+ * or copied back to userspace.
+ *
+ * Only root may perform a controller passthru command, however I'm not doing
+ * any serious sanity checking on the arguments. Doing an IDA_WRITE_MEDIA and
+ * putting a 64M buffer in the sglist is probably a *bad* idea.
+ */
+static int ida_ctlr_ioctl(ctlr_info_t *h, int dsk, ida_ioctl_t *io)
+{
+ int ctlr = h->ctlr;
+ cmdlist_t *c;
+ void *p = NULL;
+ unsigned long flags;
+ int error;
+
+ if ((c = cmd_alloc(h, 0)) == NULL)
+ return -ENOMEM;
+ c->ctlr = ctlr;
+ c->hdr.unit = (io->unit & UNITVALID) ? (io->unit & ~UNITVALID) : dsk;
+ c->hdr.size = sizeof(rblk_t) >> 2;
+ c->size += sizeof(rblk_t);
+
+ c->req.hdr.cmd = io->cmd;
+ c->req.hdr.blk = io->blk;
+ c->req.hdr.blk_cnt = io->blk_cnt;
+ c->type = CMD_IOCTL_PEND;
+
+ /* Pre submit processing */
+ switch(io->cmd) {
+ case PASSTHRU_A:
+ p = memdup_user(io->sg[0].addr, io->sg[0].size);
+ if (IS_ERR(p)) {
+ error = PTR_ERR(p);
+ cmd_free(h, c, 0);
+ return error;
+ }
+ c->req.hdr.blk = pci_map_single(h->pci_dev, &(io->c),
+ sizeof(ida_ioctl_t),
+ PCI_DMA_BIDIRECTIONAL);
+ c->req.sg[0].size = io->sg[0].size;
+ c->req.sg[0].addr = pci_map_single(h->pci_dev, p,
+ c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL);
+ c->req.hdr.sg_cnt = 1;
+ break;
+ case IDA_READ:
+ case READ_FLASH_ROM:
+ case SENSE_CONTROLLER_PERFORMANCE:
+ p = kmalloc(io->sg[0].size, GFP_KERNEL);
+ if (!p)
+ {
+ error = -ENOMEM;
+ cmd_free(h, c, 0);
+ return(error);
+ }
+
+ c->req.sg[0].size = io->sg[0].size;
+ c->req.sg[0].addr = pci_map_single(h->pci_dev, p,
+ c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL);
+ c->req.hdr.sg_cnt = 1;
+ break;
+ case IDA_WRITE:
+ case IDA_WRITE_MEDIA:
+ case DIAG_PASS_THRU:
+ case COLLECT_BUFFER:
+ case WRITE_FLASH_ROM:
+ p = memdup_user(io->sg[0].addr, io->sg[0].size);
+ if (IS_ERR(p)) {
+ error = PTR_ERR(p);
+ cmd_free(h, c, 0);
+ return error;
+ }
+ c->req.sg[0].size = io->sg[0].size;
+ c->req.sg[0].addr = pci_map_single(h->pci_dev, p,
+ c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL);
+ c->req.hdr.sg_cnt = 1;
+ break;
+ default:
+ c->req.sg[0].size = sizeof(io->c);
+ c->req.sg[0].addr = pci_map_single(h->pci_dev,&io->c,
+ c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL);
+ c->req.hdr.sg_cnt = 1;
+ }
+
+ /* Put the request on the tail of the request queue */
+ spin_lock_irqsave(IDA_LOCK(ctlr), flags);
+ addQ(&h->reqQ, c);
+ h->Qdepth++;
+ start_io(h);
+ spin_unlock_irqrestore(IDA_LOCK(ctlr), flags);
+
+ /* Wait for completion */
+ while(c->type != CMD_IOCTL_DONE)
+ schedule();
+
+ /* Unmap the DMA */
+ pci_unmap_single(h->pci_dev, c->req.sg[0].addr, c->req.sg[0].size,
+ PCI_DMA_BIDIRECTIONAL);
+ /* Post submit processing */
+ switch(io->cmd) {
+ case PASSTHRU_A:
+ pci_unmap_single(h->pci_dev, c->req.hdr.blk,
+ sizeof(ida_ioctl_t),
+ PCI_DMA_BIDIRECTIONAL);
+ case IDA_READ:
+ case DIAG_PASS_THRU:
+ case SENSE_CONTROLLER_PERFORMANCE:
+ case READ_FLASH_ROM:
+ if (copy_to_user(io->sg[0].addr, p, io->sg[0].size)) {
+ kfree(p);
+ return -EFAULT;
+ }
+ /* fall through and free p */
+ case IDA_WRITE:
+ case IDA_WRITE_MEDIA:
+ case COLLECT_BUFFER:
+ case WRITE_FLASH_ROM:
+ kfree(p);
+ break;
+ default:;
+ /* Nothing to do */
+ }
+
+ io->rcode = c->req.hdr.rcode;
+ cmd_free(h, c, 0);
+ return(0);
+}
+
+/*
+ * Commands are pre-allocated in a large block. Here we use a simple bitmap
+ * scheme to suballocte them to the driver. Operations that are not time
+ * critical (and can wait for kmalloc and possibly sleep) can pass in NULL
+ * as the first argument to get a new command.
+ */
+static cmdlist_t * cmd_alloc(ctlr_info_t *h, int get_from_pool)
+{
+ cmdlist_t * c;
+ int i;
+ dma_addr_t cmd_dhandle;
+
+ if (!get_from_pool) {
+ c = (cmdlist_t*)pci_alloc_consistent(h->pci_dev,
+ sizeof(cmdlist_t), &cmd_dhandle);
+ if(c==NULL)
+ return NULL;
+ } else {
+ do {
+ i = find_first_zero_bit(h->cmd_pool_bits, NR_CMDS);
+ if (i == NR_CMDS)
+ return NULL;
+ } while(test_and_set_bit(i&(BITS_PER_LONG-1), h->cmd_pool_bits+(i/BITS_PER_LONG)) != 0);
+ c = h->cmd_pool + i;
+ cmd_dhandle = h->cmd_pool_dhandle + i*sizeof(cmdlist_t);
+ h->nr_allocs++;
+ }
+
+ memset(c, 0, sizeof(cmdlist_t));
+ c->busaddr = cmd_dhandle;
+ return c;
+}
+
+static void cmd_free(ctlr_info_t *h, cmdlist_t *c, int got_from_pool)
+{
+ int i;
+
+ if (!got_from_pool) {
+ pci_free_consistent(h->pci_dev, sizeof(cmdlist_t), c,
+ c->busaddr);
+ } else {
+ i = c - h->cmd_pool;
+ clear_bit(i&(BITS_PER_LONG-1), h->cmd_pool_bits+(i/BITS_PER_LONG));
+ h->nr_frees++;
+ }
+}
+
+/***********************************************************************
+ name: sendcmd
+ Send a command to an IDA using the memory mapped FIFO interface
+ and wait for it to complete.
+ This routine should only be called at init time.
+***********************************************************************/
+static int sendcmd(
+ __u8 cmd,
+ int ctlr,
+ void *buff,
+ size_t size,
+ unsigned int blk,
+ unsigned int blkcnt,
+ unsigned int log_unit )
+{
+ cmdlist_t *c;
+ int complete;
+ unsigned long temp;
+ unsigned long i;
+ ctlr_info_t *info_p = hba[ctlr];
+
+ c = cmd_alloc(info_p, 1);
+ if(!c)
+ return IO_ERROR;
+ c->ctlr = ctlr;
+ c->hdr.unit = log_unit;
+ c->hdr.prio = 0;
+ c->hdr.size = sizeof(rblk_t) >> 2;
+ c->size += sizeof(rblk_t);
+
+ /* The request information. */
+ c->req.hdr.next = 0;
+ c->req.hdr.rcode = 0;
+ c->req.bp = 0;
+ c->req.hdr.sg_cnt = 1;
+ c->req.hdr.reserved = 0;
+
+ if (size == 0)
+ c->req.sg[0].size = 512;
+ else
+ c->req.sg[0].size = size;
+
+ c->req.hdr.blk = blk;
+ c->req.hdr.blk_cnt = blkcnt;
+ c->req.hdr.cmd = (unsigned char) cmd;
+ c->req.sg[0].addr = (__u32) pci_map_single(info_p->pci_dev,
+ buff, c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL);
+ /*
+ * Disable interrupt
+ */
+ info_p->access.set_intr_mask(info_p, 0);
+ /* Make sure there is room in the command FIFO */
+ /* Actually it should be completely empty at this time. */
+ for (i = 200000; i > 0; i--) {
+ temp = info_p->access.fifo_full(info_p);
+ if (temp != 0) {
+ break;
+ }
+ udelay(10);
+DBG(
+ printk(KERN_WARNING "cpqarray ida%d: idaSendPciCmd FIFO full,"
+ " waiting!\n", ctlr);
+);
+ }
+ /*
+ * Send the cmd
+ */
+ info_p->access.submit_command(info_p, c);
+ complete = pollcomplete(ctlr);
+
+ pci_unmap_single(info_p->pci_dev, (dma_addr_t) c->req.sg[0].addr,
+ c->req.sg[0].size, PCI_DMA_BIDIRECTIONAL);
+ if (complete != 1) {
+ if (complete != c->busaddr) {
+ printk( KERN_WARNING
+ "cpqarray ida%d: idaSendPciCmd "
+ "Invalid command list address returned! (%08lx)\n",
+ ctlr, (unsigned long)complete);
+ cmd_free(info_p, c, 1);
+ return (IO_ERROR);
+ }
+ } else {
+ printk( KERN_WARNING
+ "cpqarray ida%d: idaSendPciCmd Timeout out, "
+ "No command list address returned!\n",
+ ctlr);
+ cmd_free(info_p, c, 1);
+ return (IO_ERROR);
+ }
+
+ if (c->req.hdr.rcode & 0x00FE) {
+ if (!(c->req.hdr.rcode & BIG_PROBLEM)) {
+ printk( KERN_WARNING
+ "cpqarray ida%d: idaSendPciCmd, error: "
+ "Controller failed at init time "
+ "cmd: 0x%x, return code = 0x%x\n",
+ ctlr, c->req.hdr.cmd, c->req.hdr.rcode);
+
+ cmd_free(info_p, c, 1);
+ return (IO_ERROR);
+ }
+ }
+ cmd_free(info_p, c, 1);
+ return (IO_OK);
+}
+
+/*
+ * revalidate_allvol is for online array config utilities. After a
+ * utility reconfigures the drives in the array, it can use this function
+ * (through an ioctl) to make the driver zap any previous disk structs for
+ * that controller and get new ones.
+ *
+ * Right now I'm using the getgeometry() function to do this, but this
+ * function should probably be finer grained and allow you to revalidate one
+ * particualar logical volume (instead of all of them on a particular
+ * controller).
+ */
+static int revalidate_allvol(ctlr_info_t *host)
+{
+ int ctlr = host->ctlr;
+ int i;
+ unsigned long flags;
+
+ spin_lock_irqsave(IDA_LOCK(ctlr), flags);
+ if (host->usage_count > 1) {
+ spin_unlock_irqrestore(IDA_LOCK(ctlr), flags);
+ printk(KERN_WARNING "cpqarray: Device busy for volume"
+ " revalidation (usage=%d)\n", host->usage_count);
+ return -EBUSY;
+ }
+ host->usage_count++;
+ spin_unlock_irqrestore(IDA_LOCK(ctlr), flags);
+
+ /*
+ * Set the partition and block size structures for all volumes
+ * on this controller to zero. We will reread all of this data
+ */
+ set_capacity(ida_gendisk[ctlr][0], 0);
+ for (i = 1; i < NWD; i++) {
+ struct gendisk *disk = ida_gendisk[ctlr][i];
+ if (disk->flags & GENHD_FL_UP)
+ del_gendisk(disk);
+ }
+ memset(host->drv, 0, sizeof(drv_info_t)*NWD);
+
+ /*
+ * Tell the array controller not to give us any interrupts while
+ * we check the new geometry. Then turn interrupts back on when
+ * we're done.
+ */
+ host->access.set_intr_mask(host, 0);
+ getgeometry(ctlr);
+ host->access.set_intr_mask(host, FIFO_NOT_EMPTY);
+
+ for(i=0; i<NWD; i++) {
+ struct gendisk *disk = ida_gendisk[ctlr][i];
+ drv_info_t *drv = &host->drv[i];
+ if (i && !drv->nr_blks)
+ continue;
+ blk_queue_logical_block_size(host->queue, drv->blk_size);
+ set_capacity(disk, drv->nr_blks);
+ disk->queue = host->queue;
+ disk->private_data = drv;
+ if (i)
+ add_disk(disk);
+ }
+
+ host->usage_count--;
+ return 0;
+}
+
+static int ida_revalidate(struct gendisk *disk)
+{
+ drv_info_t *drv = disk->private_data;
+ set_capacity(disk, drv->nr_blks);
+ return 0;
+}
+
+/********************************************************************
+ name: pollcomplete
+ Wait polling for a command to complete.
+ The memory mapped FIFO is polled for the completion.
+ Used only at init time, interrupts disabled.
+ ********************************************************************/
+static int pollcomplete(int ctlr)
+{
+ int done;
+ int i;
+
+ /* Wait (up to 2 seconds) for a command to complete */
+
+ for (i = 200000; i > 0; i--) {
+ done = hba[ctlr]->access.command_completed(hba[ctlr]);
+ if (done == 0) {
+ udelay(10); /* a short fixed delay */
+ } else
+ return (done);
+ }
+ /* Invalid address to tell caller we ran out of time */
+ return 1;
+}
+/*****************************************************************
+ start_fwbk
+ Starts controller firmwares background processing.
+ Currently only the Integrated Raid controller needs this done.
+ If the PCI mem address registers are written to after this,
+ data corruption may occur
+*****************************************************************/
+static void start_fwbk(int ctlr)
+{
+ id_ctlr_t *id_ctlr_buf;
+ int ret_code;
+
+ if( (hba[ctlr]->board_id != 0x40400E11)
+ && (hba[ctlr]->board_id != 0x40480E11) )
+
+ /* Not a Integrated Raid, so there is nothing for us to do */
+ return;
+ printk(KERN_DEBUG "cpqarray: Starting firmware's background"
+ " processing\n");
+ /* Command does not return anything, but idasend command needs a
+ buffer */
+ id_ctlr_buf = kmalloc(sizeof(id_ctlr_t), GFP_KERNEL);
+ if(id_ctlr_buf==NULL)
+ {
+ printk(KERN_WARNING "cpqarray: Out of memory. "
+ "Unable to start background processing.\n");
+ return;
+ }
+ ret_code = sendcmd(RESUME_BACKGROUND_ACTIVITY, ctlr,
+ id_ctlr_buf, 0, 0, 0, 0);
+ if(ret_code != IO_OK)
+ printk(KERN_WARNING "cpqarray: Unable to start"
+ " background processing\n");
+
+ kfree(id_ctlr_buf);
+}
+/*****************************************************************
+ getgeometry
+ Get ida logical volume geometry from the controller
+ This is a large bit of code which once existed in two flavors,
+ It is used only at init time.
+*****************************************************************/
+static void getgeometry(int ctlr)
+{
+ id_log_drv_t *id_ldrive;
+ id_ctlr_t *id_ctlr_buf;
+ sense_log_drv_stat_t *id_lstatus_buf;
+ config_t *sense_config_buf;
+ unsigned int log_unit, log_index;
+ int ret_code, size;
+ drv_info_t *drv;
+ ctlr_info_t *info_p = hba[ctlr];
+ int i;
+
+ info_p->log_drv_map = 0;
+
+ id_ldrive = kzalloc(sizeof(id_log_drv_t), GFP_KERNEL);
+ if (!id_ldrive) {
+ printk( KERN_ERR "cpqarray: out of memory.\n");
+ goto err_0;
+ }
+
+ id_ctlr_buf = kzalloc(sizeof(id_ctlr_t), GFP_KERNEL);
+ if (!id_ctlr_buf) {
+ printk( KERN_ERR "cpqarray: out of memory.\n");
+ goto err_1;
+ }
+
+ id_lstatus_buf = kzalloc(sizeof(sense_log_drv_stat_t), GFP_KERNEL);
+ if (!id_lstatus_buf) {
+ printk( KERN_ERR "cpqarray: out of memory.\n");
+ goto err_2;
+ }
+
+ sense_config_buf = kzalloc(sizeof(config_t), GFP_KERNEL);
+ if (!sense_config_buf) {
+ printk( KERN_ERR "cpqarray: out of memory.\n");
+ goto err_3;
+ }
+
+ info_p->phys_drives = 0;
+ info_p->log_drv_map = 0;
+ info_p->drv_assign_map = 0;
+ info_p->drv_spare_map = 0;
+ info_p->mp_failed_drv_map = 0; /* only initialized here */
+ /* Get controllers info for this logical drive */
+ ret_code = sendcmd(ID_CTLR, ctlr, id_ctlr_buf, 0, 0, 0, 0);
+ if (ret_code == IO_ERROR) {
+ /*
+ * If can't get controller info, set the logical drive map to 0,
+ * so the idastubopen will fail on all logical drives
+ * on the controller.
+ */
+ printk(KERN_ERR "cpqarray: error sending ID controller\n");
+ goto err_4;
+ }
+
+ info_p->log_drives = id_ctlr_buf->nr_drvs;
+ for(i=0;i<4;i++)
+ info_p->firm_rev[i] = id_ctlr_buf->firm_rev[i];
+ info_p->ctlr_sig = id_ctlr_buf->cfg_sig;
+
+ printk(" (%s)\n", info_p->product_name);
+ /*
+ * Initialize logical drive map to zero
+ */
+ log_index = 0;
+ /*
+ * Get drive geometry for all logical drives
+ */
+ if (id_ctlr_buf->nr_drvs > 16)
+ printk(KERN_WARNING "cpqarray ida%d: This driver supports "
+ "16 logical drives per controller.\n. "
+ " Additional drives will not be "
+ "detected\n", ctlr);
+
+ for (log_unit = 0;
+ (log_index < id_ctlr_buf->nr_drvs)
+ && (log_unit < NWD);
+ log_unit++) {
+ size = sizeof(sense_log_drv_stat_t);
+
+ /*
+ Send "Identify logical drive status" cmd
+ */
+ ret_code = sendcmd(SENSE_LOG_DRV_STAT,
+ ctlr, id_lstatus_buf, size, 0, 0, log_unit);
+ if (ret_code == IO_ERROR) {
+ /*
+ If can't get logical drive status, set
+ the logical drive map to 0, so the
+ idastubopen will fail for all logical drives
+ on the controller.
+ */
+ info_p->log_drv_map = 0;
+ printk( KERN_WARNING
+ "cpqarray ida%d: idaGetGeometry - Controller"
+ " failed to report status of logical drive %d\n"
+ "Access to this controller has been disabled\n",
+ ctlr, log_unit);
+ goto err_4;
+ }
+ /*
+ Make sure the logical drive is configured
+ */
+ if (id_lstatus_buf->status != LOG_NOT_CONF) {
+ ret_code = sendcmd(ID_LOG_DRV, ctlr, id_ldrive,
+ sizeof(id_log_drv_t), 0, 0, log_unit);
+ /*
+ If error, the bit for this
+ logical drive won't be set and
+ idastubopen will return error.
+ */
+ if (ret_code != IO_ERROR) {
+ drv = &info_p->drv[log_unit];
+ drv->blk_size = id_ldrive->blk_size;
+ drv->nr_blks = id_ldrive->nr_blks;
+ drv->cylinders = id_ldrive->drv.cyl;
+ drv->heads = id_ldrive->drv.heads;
+ drv->sectors = id_ldrive->drv.sect_per_track;
+ info_p->log_drv_map |= (1 << log_unit);
+
+ printk(KERN_INFO "cpqarray ida/c%dd%d: blksz=%d nr_blks=%d\n",
+ ctlr, log_unit, drv->blk_size, drv->nr_blks);
+ ret_code = sendcmd(SENSE_CONFIG,
+ ctlr, sense_config_buf,
+ sizeof(config_t), 0, 0, log_unit);
+ if (ret_code == IO_ERROR) {
+ info_p->log_drv_map = 0;
+ printk(KERN_ERR "cpqarray: error sending sense config\n");
+ goto err_4;
+ }
+
+ info_p->phys_drives =
+ sense_config_buf->ctlr_phys_drv;
+ info_p->drv_assign_map
+ |= sense_config_buf->drv_asgn_map;
+ info_p->drv_assign_map
+ |= sense_config_buf->spare_asgn_map;
+ info_p->drv_spare_map
+ |= sense_config_buf->spare_asgn_map;
+ } /* end of if no error on id_ldrive */
+ log_index = log_index + 1;
+ } /* end of if logical drive configured */
+ } /* end of for log_unit */
+
+ /* Free all the buffers and return */
+err_4:
+ kfree(sense_config_buf);
+err_3:
+ kfree(id_lstatus_buf);
+err_2:
+ kfree(id_ctlr_buf);
+err_1:
+ kfree(id_ldrive);
+err_0:
+ return;
+}
+
+static void __exit cpqarray_exit(void)
+{
+ int i;
+
+ pci_unregister_driver(&cpqarray_pci_driver);
+
+ /* Double check that all controller entries have been removed */
+ for(i=0; i<MAX_CTLR; i++) {
+ if (hba[i] != NULL) {
+ printk(KERN_WARNING "cpqarray: Removing EISA "
+ "controller %d\n", i);
+ cpqarray_remove_one_eisa(i);
+ }
+ }
+
+ remove_proc_entry("driver/cpqarray", NULL);
+}
+
+module_init(cpqarray_init)
+module_exit(cpqarray_exit)
diff --git a/drivers/block/cpqarray.h b/drivers/block/cpqarray.h
new file mode 100644
index 000000000..be73e9d57
--- /dev/null
+++ b/drivers/block/cpqarray.h
@@ -0,0 +1,126 @@
+/*
+ * Disk Array driver for Compaq SMART2 Controllers
+ * Copyright 1998 Compaq Computer Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Questions/Comments/Bugfixes to iss_storagedev@hp.com
+ *
+ * If you want to make changes, improve or add functionality to this
+ * driver, you'll probably need the Compaq Array Controller Interface
+ * Specificiation (Document number ECG086/1198)
+ */
+#ifndef CPQARRAY_H
+#define CPQARRAY_H
+
+#ifdef __KERNEL__
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <linux/timer.h>
+#endif
+
+#include "ida_cmd.h"
+
+#define IO_OK 0
+#define IO_ERROR 1
+#define NWD 16
+#define NWD_SHIFT 4
+
+#define IDA_TIMER (5*HZ)
+#define IDA_TIMEOUT (10*HZ)
+
+#define MISC_NONFATAL_WARN 0x01
+
+typedef struct {
+ unsigned blk_size;
+ unsigned nr_blks;
+ unsigned cylinders;
+ unsigned heads;
+ unsigned sectors;
+ int usage_count;
+} drv_info_t;
+
+#ifdef __KERNEL__
+
+struct ctlr_info;
+typedef struct ctlr_info ctlr_info_t;
+
+struct access_method {
+ void (*submit_command)(ctlr_info_t *h, cmdlist_t *c);
+ void (*set_intr_mask)(ctlr_info_t *h, unsigned long val);
+ unsigned long (*fifo_full)(ctlr_info_t *h);
+ unsigned long (*intr_pending)(ctlr_info_t *h);
+ unsigned long (*command_completed)(ctlr_info_t *h);
+};
+
+struct board_type {
+ __u32 board_id;
+ char *product_name;
+ struct access_method *access;
+};
+
+struct ctlr_info {
+ int ctlr;
+ char devname[8];
+ __u32 log_drv_map;
+ __u32 drv_assign_map;
+ __u32 drv_spare_map;
+ __u32 mp_failed_drv_map;
+
+ char firm_rev[4];
+ int ctlr_sig;
+
+ int log_drives;
+ int phys_drives;
+
+ struct pci_dev *pci_dev; /* NULL if EISA */
+ __u32 board_id;
+ char *product_name;
+
+ void __iomem *vaddr;
+ unsigned long paddr;
+ unsigned long io_mem_addr;
+ unsigned long io_mem_length;
+ int intr;
+ int usage_count;
+ drv_info_t drv[NWD];
+ struct proc_dir_entry *proc;
+
+ struct access_method access;
+
+ cmdlist_t *reqQ;
+ cmdlist_t *cmpQ;
+ cmdlist_t *cmd_pool;
+ dma_addr_t cmd_pool_dhandle;
+ unsigned long *cmd_pool_bits;
+ struct request_queue *queue;
+ spinlock_t lock;
+
+ unsigned int Qdepth;
+ unsigned int maxQsinceinit;
+
+ unsigned int nr_requests;
+ unsigned int nr_allocs;
+ unsigned int nr_frees;
+ struct timer_list timer;
+ unsigned int misc_tflags;
+};
+
+#define IDA_LOCK(i) (&hba[i]->lock)
+
+#endif
+
+#endif /* CPQARRAY_H */
diff --git a/drivers/block/cryptoloop.c b/drivers/block/cryptoloop.c
new file mode 100644
index 000000000..99e773cb7
--- /dev/null
+++ b/drivers/block/cryptoloop.c
@@ -0,0 +1,216 @@
+/*
+ Linux loop encryption enabling module
+
+ Copyright (C) 2002 Herbert Valerio Riedel <hvr@gnu.org>
+ Copyright (C) 2003 Fruhwirth Clemens <clemens@endorphin.org>
+
+ This module is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This module is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this module; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/module.h>
+
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/crypto.h>
+#include <linux/blkdev.h>
+#include <linux/scatterlist.h>
+#include <asm/uaccess.h>
+#include "loop.h"
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("loop blockdevice transferfunction adaptor / CryptoAPI");
+MODULE_AUTHOR("Herbert Valerio Riedel <hvr@gnu.org>");
+
+#define LOOP_IV_SECTOR_BITS 9
+#define LOOP_IV_SECTOR_SIZE (1 << LOOP_IV_SECTOR_BITS)
+
+static int
+cryptoloop_init(struct loop_device *lo, const struct loop_info64 *info)
+{
+ int err = -EINVAL;
+ int cipher_len;
+ int mode_len;
+ char cms[LO_NAME_SIZE]; /* cipher-mode string */
+ char *cipher;
+ char *mode;
+ char *cmsp = cms; /* c-m string pointer */
+ struct crypto_blkcipher *tfm;
+
+ /* encryption breaks for non sector aligned offsets */
+
+ if (info->lo_offset % LOOP_IV_SECTOR_SIZE)
+ goto out;
+
+ strncpy(cms, info->lo_crypt_name, LO_NAME_SIZE);
+ cms[LO_NAME_SIZE - 1] = 0;
+
+ cipher = cmsp;
+ cipher_len = strcspn(cmsp, "-");
+
+ mode = cmsp + cipher_len;
+ mode_len = 0;
+ if (*mode) {
+ mode++;
+ mode_len = strcspn(mode, "-");
+ }
+
+ if (!mode_len) {
+ mode = "cbc";
+ mode_len = 3;
+ }
+
+ if (cipher_len + mode_len + 3 > LO_NAME_SIZE)
+ return -EINVAL;
+
+ memmove(cms, mode, mode_len);
+ cmsp = cms + mode_len;
+ *cmsp++ = '(';
+ memcpy(cmsp, info->lo_crypt_name, cipher_len);
+ cmsp += cipher_len;
+ *cmsp++ = ')';
+ *cmsp = 0;
+
+ tfm = crypto_alloc_blkcipher(cms, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ err = crypto_blkcipher_setkey(tfm, info->lo_encrypt_key,
+ info->lo_encrypt_key_size);
+
+ if (err != 0)
+ goto out_free_tfm;
+
+ lo->key_data = tfm;
+ return 0;
+
+ out_free_tfm:
+ crypto_free_blkcipher(tfm);
+
+ out:
+ return err;
+}
+
+
+typedef int (*encdec_cbc_t)(struct blkcipher_desc *desc,
+ struct scatterlist *sg_out,
+ struct scatterlist *sg_in,
+ unsigned int nsg);
+
+static int
+cryptoloop_transfer(struct loop_device *lo, int cmd,
+ struct page *raw_page, unsigned raw_off,
+ struct page *loop_page, unsigned loop_off,
+ int size, sector_t IV)
+{
+ struct crypto_blkcipher *tfm = lo->key_data;
+ struct blkcipher_desc desc = {
+ .tfm = tfm,
+ .flags = CRYPTO_TFM_REQ_MAY_SLEEP,
+ };
+ struct scatterlist sg_out;
+ struct scatterlist sg_in;
+
+ encdec_cbc_t encdecfunc;
+ struct page *in_page, *out_page;
+ unsigned in_offs, out_offs;
+ int err;
+
+ sg_init_table(&sg_out, 1);
+ sg_init_table(&sg_in, 1);
+
+ if (cmd == READ) {
+ in_page = raw_page;
+ in_offs = raw_off;
+ out_page = loop_page;
+ out_offs = loop_off;
+ encdecfunc = crypto_blkcipher_crt(tfm)->decrypt;
+ } else {
+ in_page = loop_page;
+ in_offs = loop_off;
+ out_page = raw_page;
+ out_offs = raw_off;
+ encdecfunc = crypto_blkcipher_crt(tfm)->encrypt;
+ }
+
+ while (size > 0) {
+ const int sz = min(size, LOOP_IV_SECTOR_SIZE);
+ u32 iv[4] = { 0, };
+ iv[0] = cpu_to_le32(IV & 0xffffffff);
+
+ sg_set_page(&sg_in, in_page, sz, in_offs);
+ sg_set_page(&sg_out, out_page, sz, out_offs);
+
+ desc.info = iv;
+ err = encdecfunc(&desc, &sg_out, &sg_in, sz);
+ if (err)
+ return err;
+
+ IV++;
+ size -= sz;
+ in_offs += sz;
+ out_offs += sz;
+ }
+
+ return 0;
+}
+
+static int
+cryptoloop_ioctl(struct loop_device *lo, int cmd, unsigned long arg)
+{
+ return -EINVAL;
+}
+
+static int
+cryptoloop_release(struct loop_device *lo)
+{
+ struct crypto_blkcipher *tfm = lo->key_data;
+ if (tfm != NULL) {
+ crypto_free_blkcipher(tfm);
+ lo->key_data = NULL;
+ return 0;
+ }
+ printk(KERN_ERR "cryptoloop_release(): tfm == NULL?\n");
+ return -EINVAL;
+}
+
+static struct loop_func_table cryptoloop_funcs = {
+ .number = LO_CRYPT_CRYPTOAPI,
+ .init = cryptoloop_init,
+ .ioctl = cryptoloop_ioctl,
+ .transfer = cryptoloop_transfer,
+ .release = cryptoloop_release,
+ .owner = THIS_MODULE
+};
+
+static int __init
+init_cryptoloop(void)
+{
+ int rc = loop_register_transfer(&cryptoloop_funcs);
+
+ if (rc)
+ printk(KERN_ERR "cryptoloop: loop_register_transfer failed\n");
+ return rc;
+}
+
+static void __exit
+cleanup_cryptoloop(void)
+{
+ if (loop_unregister_transfer(LO_CRYPT_CRYPTOAPI))
+ printk(KERN_ERR
+ "cryptoloop: loop_unregister_transfer failed\n");
+}
+
+module_init(init_cryptoloop);
+module_exit(cleanup_cryptoloop);
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig
new file mode 100644
index 000000000..7845bd6ee
--- /dev/null
+++ b/drivers/block/drbd/Kconfig
@@ -0,0 +1,73 @@
+#
+# DRBD device driver configuration
+#
+
+comment "DRBD disabled because PROC_FS or INET not selected"
+ depends on PROC_FS='n' || INET='n'
+
+config BLK_DEV_DRBD
+ tristate "DRBD Distributed Replicated Block Device support"
+ depends on PROC_FS && INET
+ select LRU_CACHE
+ select LIBCRC32C
+ default n
+ help
+
+ NOTE: In order to authenticate connections you have to select
+ CRYPTO_HMAC and a hash function as well.
+
+ DRBD is a shared-nothing, synchronously replicated block device. It
+ is designed to serve as a building block for high availability
+ clusters and in this context, is a "drop-in" replacement for shared
+ storage. Simplistically, you could see it as a network RAID 1.
+
+ Each minor device has a role, which can be 'primary' or 'secondary'.
+ On the node with the primary device the application is supposed to
+ run and to access the device (/dev/drbdX). Every write is sent to
+ the local 'lower level block device' and, across the network, to the
+ node with the device in 'secondary' state. The secondary device
+ simply writes the data to its lower level block device.
+
+ DRBD can also be used in dual-Primary mode (device writable on both
+ nodes), which means it can exhibit shared disk semantics in a
+ shared-nothing cluster. Needless to say, on top of dual-Primary
+ DRBD utilizing a cluster file system is necessary to maintain for
+ cache coherency.
+
+ For automatic failover you need a cluster manager (e.g. heartbeat).
+ See also: http://www.drbd.org/, http://www.linux-ha.org
+
+ If unsure, say N.
+
+config DRBD_FAULT_INJECTION
+ bool "DRBD fault injection"
+ depends on BLK_DEV_DRBD
+ help
+
+ Say Y here if you want to simulate IO errors, in order to test DRBD's
+ behavior.
+
+ The actual simulation of IO errors is done by writing 3 values to
+ /sys/module/drbd/parameters/
+
+ enable_faults: bitmask of...
+ 1 meta data write
+ 2 read
+ 4 resync data write
+ 8 read
+ 16 data write
+ 32 data read
+ 64 read ahead
+ 128 kmalloc of bitmap
+ 256 allocation of peer_requests
+ 512 insert data corruption on receiving side
+
+ fault_devs: bitmask of minor numbers
+ fault_rate: frequency in percent
+
+ Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
+ echo 16 > /sys/module/drbd/parameters/enable_faults
+ echo 1 > /sys/module/drbd/parameters/fault_devs
+ echo 5 > /sys/module/drbd/parameters/fault_rate
+
+ If unsure, say N.
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
new file mode 100644
index 000000000..4464e353c
--- /dev/null
+++ b/drivers/block/drbd/Makefile
@@ -0,0 +1,8 @@
+drbd-y := drbd_bitmap.o drbd_proc.o
+drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
+drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
+drbd-y += drbd_interval.o drbd_state.o
+drbd-y += drbd_nla.o
+drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o
+
+obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
new file mode 100644
index 000000000..1318e3217
--- /dev/null
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -0,0 +1,1219 @@
+/*
+ drbd_actlog.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/slab.h>
+#include <linux/crc32c.h>
+#include <linux/drbd.h>
+#include <linux/drbd_limits.h>
+#include <linux/dynamic_debug.h>
+#include "drbd_int.h"
+
+
+enum al_transaction_types {
+ AL_TR_UPDATE = 0,
+ AL_TR_INITIALIZED = 0xffff
+};
+/* all fields on disc in big endian */
+struct __packed al_transaction_on_disk {
+ /* don't we all like magic */
+ __be32 magic;
+
+ /* to identify the most recent transaction block
+ * in the on disk ring buffer */
+ __be32 tr_number;
+
+ /* checksum on the full 4k block, with this field set to 0. */
+ __be32 crc32c;
+
+ /* type of transaction, special transaction types like:
+ * purge-all, set-all-idle, set-all-active, ... to-be-defined
+ * see also enum al_transaction_types */
+ __be16 transaction_type;
+
+ /* we currently allow only a few thousand extents,
+ * so 16bit will be enough for the slot number. */
+
+ /* how many updates in this transaction */
+ __be16 n_updates;
+
+ /* maximum slot number, "al-extents" in drbd.conf speak.
+ * Having this in each transaction should make reconfiguration
+ * of that parameter easier. */
+ __be16 context_size;
+
+ /* slot number the context starts with */
+ __be16 context_start_slot_nr;
+
+ /* Some reserved bytes. Expected usage is a 64bit counter of
+ * sectors-written since device creation, and other data generation tag
+ * supporting usage */
+ __be32 __reserved[4];
+
+ /* --- 36 byte used --- */
+
+ /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes
+ * in one transaction, then use the remaining byte in the 4k block for
+ * context information. "Flexible" number of updates per transaction
+ * does not help, as we have to account for the case when all update
+ * slots are used anyways, so it would only complicate code without
+ * additional benefit.
+ */
+ __be16 update_slot_nr[AL_UPDATES_PER_TRANSACTION];
+
+ /* but the extent number is 32bit, which at an extent size of 4 MiB
+ * allows to cover device sizes of up to 2**54 Byte (16 PiB) */
+ __be32 update_extent_nr[AL_UPDATES_PER_TRANSACTION];
+
+ /* --- 420 bytes used (36 + 64*6) --- */
+
+ /* 4096 - 420 = 3676 = 919 * 4 */
+ __be32 context[AL_CONTEXT_PER_TRANSACTION];
+};
+
+void *drbd_md_get_buffer(struct drbd_device *device, const char *intent)
+{
+ int r;
+
+ wait_event(device->misc_wait,
+ (r = atomic_cmpxchg(&device->md_io.in_use, 0, 1)) == 0 ||
+ device->state.disk <= D_FAILED);
+
+ if (r)
+ return NULL;
+
+ device->md_io.current_use = intent;
+ device->md_io.start_jif = jiffies;
+ device->md_io.submit_jif = device->md_io.start_jif - 1;
+ return page_address(device->md_io.page);
+}
+
+void drbd_md_put_buffer(struct drbd_device *device)
+{
+ if (atomic_dec_and_test(&device->md_io.in_use))
+ wake_up(&device->misc_wait);
+}
+
+void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_backing_dev *bdev,
+ unsigned int *done)
+{
+ long dt;
+
+ rcu_read_lock();
+ dt = rcu_dereference(bdev->disk_conf)->disk_timeout;
+ rcu_read_unlock();
+ dt = dt * HZ / 10;
+ if (dt == 0)
+ dt = MAX_SCHEDULE_TIMEOUT;
+
+ dt = wait_event_timeout(device->misc_wait,
+ *done || test_bit(FORCE_DETACH, &device->flags), dt);
+ if (dt == 0) {
+ drbd_err(device, "meta-data IO operation timed out\n");
+ drbd_chk_io_error(device, 1, DRBD_FORCE_DETACH);
+ }
+}
+
+static int _drbd_md_sync_page_io(struct drbd_device *device,
+ struct drbd_backing_dev *bdev,
+ sector_t sector, int rw)
+{
+ struct bio *bio;
+ /* we do all our meta data IO in aligned 4k blocks. */
+ const int size = 4096;
+ int err;
+
+ device->md_io.done = 0;
+ device->md_io.error = -ENODEV;
+
+ if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags))
+ rw |= REQ_FUA | REQ_FLUSH;
+ rw |= REQ_SYNC | REQ_NOIDLE;
+
+ bio = bio_alloc_drbd(GFP_NOIO);
+ bio->bi_bdev = bdev->md_bdev;
+ bio->bi_iter.bi_sector = sector;
+ err = -EIO;
+ if (bio_add_page(bio, device->md_io.page, size, 0) != size)
+ goto out;
+ bio->bi_private = device;
+ bio->bi_end_io = drbd_md_endio;
+ bio->bi_rw = rw;
+
+ if (!(rw & WRITE) && device->state.disk == D_DISKLESS && device->ldev == NULL)
+ /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
+ ;
+ else if (!get_ldev_if_state(device, D_ATTACHING)) {
+ /* Corresponding put_ldev in drbd_md_endio() */
+ drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
+ err = -ENODEV;
+ goto out;
+ }
+
+ bio_get(bio); /* one bio_put() is in the completion handler */
+ atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */
+ device->md_io.submit_jif = jiffies;
+ if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
+ bio_endio(bio, -EIO);
+ else
+ submit_bio(rw, bio);
+ wait_until_done_or_force_detached(device, bdev, &device->md_io.done);
+ if (bio_flagged(bio, BIO_UPTODATE))
+ err = device->md_io.error;
+
+ out:
+ bio_put(bio);
+ return err;
+}
+
+int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev,
+ sector_t sector, int rw)
+{
+ int err;
+ D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1);
+
+ BUG_ON(!bdev->md_bdev);
+
+ dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
+ current->comm, current->pid, __func__,
+ (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ",
+ (void*)_RET_IP_ );
+
+ if (sector < drbd_md_first_sector(bdev) ||
+ sector + 7 > drbd_md_last_sector(bdev))
+ drbd_alert(device, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
+ current->comm, current->pid, __func__,
+ (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
+
+ err = _drbd_md_sync_page_io(device, bdev, sector, rw);
+ if (err) {
+ drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
+ (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
+ }
+ return err;
+}
+
+static struct bm_extent *find_active_resync_extent(struct drbd_device *device, unsigned int enr)
+{
+ struct lc_element *tmp;
+ tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT);
+ if (unlikely(tmp != NULL)) {
+ struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+ if (test_bit(BME_NO_WRITES, &bm_ext->flags))
+ return bm_ext;
+ }
+ return NULL;
+}
+
+static struct lc_element *_al_get(struct drbd_device *device, unsigned int enr, bool nonblock)
+{
+ struct lc_element *al_ext;
+ struct bm_extent *bm_ext;
+ int wake;
+
+ spin_lock_irq(&device->al_lock);
+ bm_ext = find_active_resync_extent(device, enr);
+ if (bm_ext) {
+ wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
+ spin_unlock_irq(&device->al_lock);
+ if (wake)
+ wake_up(&device->al_wait);
+ return NULL;
+ }
+ if (nonblock)
+ al_ext = lc_try_get(device->act_log, enr);
+ else
+ al_ext = lc_get(device->act_log, enr);
+ spin_unlock_irq(&device->al_lock);
+ return al_ext;
+}
+
+bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i)
+{
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+
+ D_ASSERT(device, (unsigned)(last - first) <= 1);
+ D_ASSERT(device, atomic_read(&device->local_cnt) > 0);
+
+ /* FIXME figure out a fast path for bios crossing AL extent boundaries */
+ if (first != last)
+ return false;
+
+ return _al_get(device, first, true);
+}
+
+bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i)
+{
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+ unsigned enr;
+ bool need_transaction = false;
+
+ D_ASSERT(device, first <= last);
+ D_ASSERT(device, atomic_read(&device->local_cnt) > 0);
+
+ for (enr = first; enr <= last; enr++) {
+ struct lc_element *al_ext;
+ wait_event(device->al_wait,
+ (al_ext = _al_get(device, enr, false)) != NULL);
+ if (al_ext->lc_number != enr)
+ need_transaction = true;
+ }
+ return need_transaction;
+}
+
+static int al_write_transaction(struct drbd_device *device);
+
+void drbd_al_begin_io_commit(struct drbd_device *device)
+{
+ bool locked = false;
+
+ /* Serialize multiple transactions.
+ * This uses test_and_set_bit, memory barrier is implicit.
+ */
+ wait_event(device->al_wait,
+ device->act_log->pending_changes == 0 ||
+ (locked = lc_try_lock_for_transaction(device->act_log)));
+
+ if (locked) {
+ /* Double check: it may have been committed by someone else,
+ * while we have been waiting for the lock. */
+ if (device->act_log->pending_changes) {
+ bool write_al_updates;
+
+ rcu_read_lock();
+ write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
+ rcu_read_unlock();
+
+ if (write_al_updates)
+ al_write_transaction(device);
+ spin_lock_irq(&device->al_lock);
+ /* FIXME
+ if (err)
+ we need an "lc_cancel" here;
+ */
+ lc_committed(device->act_log);
+ spin_unlock_irq(&device->al_lock);
+ }
+ lc_unlock(device->act_log);
+ wake_up(&device->al_wait);
+ }
+}
+
+/*
+ * @delegate: delegate activity log I/O to the worker thread
+ */
+void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i)
+{
+ if (drbd_al_begin_io_prepare(device, i))
+ drbd_al_begin_io_commit(device);
+}
+
+int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i)
+{
+ struct lru_cache *al = device->act_log;
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+ unsigned nr_al_extents;
+ unsigned available_update_slots;
+ unsigned enr;
+
+ D_ASSERT(device, first <= last);
+
+ nr_al_extents = 1 + last - first; /* worst case: all touched extends are cold. */
+ available_update_slots = min(al->nr_elements - al->used,
+ al->max_pending_changes - al->pending_changes);
+
+ /* We want all necessary updates for a given request within the same transaction
+ * We could first check how many updates are *actually* needed,
+ * and use that instead of the worst-case nr_al_extents */
+ if (available_update_slots < nr_al_extents) {
+ /* Too many activity log extents are currently "hot".
+ *
+ * If we have accumulated pending changes already,
+ * we made progress.
+ *
+ * If we cannot get even a single pending change through,
+ * stop the fast path until we made some progress,
+ * or requests to "cold" extents could be starved. */
+ if (!al->pending_changes)
+ __set_bit(__LC_STARVING, &device->act_log->flags);
+ return -ENOBUFS;
+ }
+
+ /* Is resync active in this area? */
+ for (enr = first; enr <= last; enr++) {
+ struct lc_element *tmp;
+ tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT);
+ if (unlikely(tmp != NULL)) {
+ struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+ if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
+ if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags))
+ return -EBUSY;
+ return -EWOULDBLOCK;
+ }
+ }
+ }
+
+ /* Checkout the refcounts.
+ * Given that we checked for available elements and update slots above,
+ * this has to be successful. */
+ for (enr = first; enr <= last; enr++) {
+ struct lc_element *al_ext;
+ al_ext = lc_get_cumulative(device->act_log, enr);
+ if (!al_ext)
+ drbd_info(device, "LOGIC BUG for enr=%u\n", enr);
+ }
+ return 0;
+}
+
+void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i)
+{
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+ unsigned enr;
+ struct lc_element *extent;
+ unsigned long flags;
+
+ D_ASSERT(device, first <= last);
+ spin_lock_irqsave(&device->al_lock, flags);
+
+ for (enr = first; enr <= last; enr++) {
+ extent = lc_find(device->act_log, enr);
+ if (!extent) {
+ drbd_err(device, "al_complete_io() called on inactive extent %u\n", enr);
+ continue;
+ }
+ lc_put(device->act_log, extent);
+ }
+ spin_unlock_irqrestore(&device->al_lock, flags);
+ wake_up(&device->al_wait);
+}
+
+#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
+/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
+ * are still coupled, or assume too much about their relation.
+ * Code below will not work if this is violated.
+ * Will be cleaned up with some followup patch.
+ */
+# error FIXME
+#endif
+
+static unsigned int al_extent_to_bm_page(unsigned int al_enr)
+{
+ return al_enr >>
+ /* bit to page */
+ ((PAGE_SHIFT + 3) -
+ /* al extent number to bit */
+ (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
+}
+
+static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
+{
+ const unsigned int stripes = device->ldev->md.al_stripes;
+ const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k;
+
+ /* transaction number, modulo on-disk ring buffer wrap around */
+ unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k);
+
+ /* ... to aligned 4k on disk block */
+ t = ((t % stripes) * stripe_size_4kB) + t/stripes;
+
+ /* ... to 512 byte sector in activity log */
+ t *= 8;
+
+ /* ... plus offset to the on disk position */
+ return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
+}
+
+int al_write_transaction(struct drbd_device *device)
+{
+ struct al_transaction_on_disk *buffer;
+ struct lc_element *e;
+ sector_t sector;
+ int i, mx;
+ unsigned extent_nr;
+ unsigned crc = 0;
+ int err = 0;
+
+ if (!get_ldev(device)) {
+ drbd_err(device, "disk is %s, cannot start al transaction\n",
+ drbd_disk_str(device->state.disk));
+ return -EIO;
+ }
+
+ /* The bitmap write may have failed, causing a state change. */
+ if (device->state.disk < D_INCONSISTENT) {
+ drbd_err(device,
+ "disk is %s, cannot write al transaction\n",
+ drbd_disk_str(device->state.disk));
+ put_ldev(device);
+ return -EIO;
+ }
+
+ /* protects md_io_buffer, al_tr_cycle, ... */
+ buffer = drbd_md_get_buffer(device, __func__);
+ if (!buffer) {
+ drbd_err(device, "disk failed while waiting for md_io buffer\n");
+ put_ldev(device);
+ return -ENODEV;
+ }
+
+ memset(buffer, 0, sizeof(*buffer));
+ buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
+ buffer->tr_number = cpu_to_be32(device->al_tr_number);
+
+ i = 0;
+
+ /* Even though no one can start to change this list
+ * once we set the LC_LOCKED -- from drbd_al_begin_io(),
+ * lc_try_lock_for_transaction() --, someone may still
+ * be in the process of changing it. */
+ spin_lock_irq(&device->al_lock);
+ list_for_each_entry(e, &device->act_log->to_be_changed, list) {
+ if (i == AL_UPDATES_PER_TRANSACTION) {
+ i++;
+ break;
+ }
+ buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
+ buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
+ if (e->lc_number != LC_FREE)
+ drbd_bm_mark_for_writeout(device,
+ al_extent_to_bm_page(e->lc_number));
+ i++;
+ }
+ spin_unlock_irq(&device->al_lock);
+ BUG_ON(i > AL_UPDATES_PER_TRANSACTION);
+
+ buffer->n_updates = cpu_to_be16(i);
+ for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
+ buffer->update_slot_nr[i] = cpu_to_be16(-1);
+ buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
+ }
+
+ buffer->context_size = cpu_to_be16(device->act_log->nr_elements);
+ buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle);
+
+ mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
+ device->act_log->nr_elements - device->al_tr_cycle);
+ for (i = 0; i < mx; i++) {
+ unsigned idx = device->al_tr_cycle + i;
+ extent_nr = lc_element_by_index(device->act_log, idx)->lc_number;
+ buffer->context[i] = cpu_to_be32(extent_nr);
+ }
+ for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
+ buffer->context[i] = cpu_to_be32(LC_FREE);
+
+ device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
+ if (device->al_tr_cycle >= device->act_log->nr_elements)
+ device->al_tr_cycle = 0;
+
+ sector = al_tr_number_to_on_disk_sector(device);
+
+ crc = crc32c(0, buffer, 4096);
+ buffer->crc32c = cpu_to_be32(crc);
+
+ if (drbd_bm_write_hinted(device))
+ err = -EIO;
+ else {
+ bool write_al_updates;
+ rcu_read_lock();
+ write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
+ rcu_read_unlock();
+ if (write_al_updates) {
+ if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
+ err = -EIO;
+ drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+ } else {
+ device->al_tr_number++;
+ device->al_writ_cnt++;
+ }
+ }
+ }
+
+ drbd_md_put_buffer(device);
+ put_ldev(device);
+
+ return err;
+}
+
+static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext)
+{
+ int rv;
+
+ spin_lock_irq(&device->al_lock);
+ rv = (al_ext->refcnt == 0);
+ if (likely(rv))
+ lc_del(device->act_log, al_ext);
+ spin_unlock_irq(&device->al_lock);
+
+ return rv;
+}
+
+/**
+ * drbd_al_shrink() - Removes all active extents form the activity log
+ * @device: DRBD device.
+ *
+ * Removes all active extents form the activity log, waiting until
+ * the reference count of each entry dropped to 0 first, of course.
+ *
+ * You need to lock device->act_log with lc_try_lock() / lc_unlock()
+ */
+void drbd_al_shrink(struct drbd_device *device)
+{
+ struct lc_element *al_ext;
+ int i;
+
+ D_ASSERT(device, test_bit(__LC_LOCKED, &device->act_log->flags));
+
+ for (i = 0; i < device->act_log->nr_elements; i++) {
+ al_ext = lc_element_by_index(device->act_log, i);
+ if (al_ext->lc_number == LC_FREE)
+ continue;
+ wait_event(device->al_wait, _try_lc_del(device, al_ext));
+ }
+
+ wake_up(&device->al_wait);
+}
+
+int drbd_initialize_al(struct drbd_device *device, void *buffer)
+{
+ struct al_transaction_on_disk *al = buffer;
+ struct drbd_md *md = &device->ldev->md;
+ sector_t al_base = md->md_offset + md->al_offset;
+ int al_size_4k = md->al_stripes * md->al_stripe_size_4k;
+ int i;
+
+ memset(al, 0, 4096);
+ al->magic = cpu_to_be32(DRBD_AL_MAGIC);
+ al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED);
+ al->crc32c = cpu_to_be32(crc32c(0, al, 4096));
+
+ for (i = 0; i < al_size_4k; i++) {
+ int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static const char *drbd_change_sync_fname[] = {
+ [RECORD_RS_FAILED] = "drbd_rs_failed_io",
+ [SET_IN_SYNC] = "drbd_set_in_sync",
+ [SET_OUT_OF_SYNC] = "drbd_set_out_of_sync"
+};
+
+/* ATTENTION. The AL's extents are 4MB each, while the extents in the
+ * resync LRU-cache are 16MB each.
+ * The caller of this function has to hold an get_ldev() reference.
+ *
+ * Adjusts the caching members ->rs_left (success) or ->rs_failed (!success),
+ * potentially pulling in (and recounting the corresponding bits)
+ * this resync extent into the resync extent lru cache.
+ *
+ * Returns whether all bits have been cleared for this resync extent,
+ * precisely: (rs_left <= rs_failed)
+ *
+ * TODO will be obsoleted once we have a caching lru of the on disk bitmap
+ */
+static bool update_rs_extent(struct drbd_device *device,
+ unsigned int enr, int count,
+ enum update_sync_bits_mode mode)
+{
+ struct lc_element *e;
+
+ D_ASSERT(device, atomic_read(&device->local_cnt));
+
+ /* When setting out-of-sync bits,
+ * we don't need it cached (lc_find).
+ * But if it is present in the cache,
+ * we should update the cached bit count.
+ * Otherwise, that extent should be in the resync extent lru cache
+ * already -- or we want to pull it in if necessary -- (lc_get),
+ * then update and check rs_left and rs_failed. */
+ if (mode == SET_OUT_OF_SYNC)
+ e = lc_find(device->resync, enr);
+ else
+ e = lc_get(device->resync, enr);
+ if (e) {
+ struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
+ if (ext->lce.lc_number == enr) {
+ if (mode == SET_IN_SYNC)
+ ext->rs_left -= count;
+ else if (mode == SET_OUT_OF_SYNC)
+ ext->rs_left += count;
+ else
+ ext->rs_failed += count;
+ if (ext->rs_left < ext->rs_failed) {
+ drbd_warn(device, "BAD! enr=%u rs_left=%d "
+ "rs_failed=%d count=%d cstate=%s\n",
+ ext->lce.lc_number, ext->rs_left,
+ ext->rs_failed, count,
+ drbd_conn_str(device->state.conn));
+
+ /* We don't expect to be able to clear more bits
+ * than have been set when we originally counted
+ * the set bits to cache that value in ext->rs_left.
+ * Whatever the reason (disconnect during resync,
+ * delayed local completion of an application write),
+ * try to fix it up by recounting here. */
+ ext->rs_left = drbd_bm_e_weight(device, enr);
+ }
+ } else {
+ /* Normally this element should be in the cache,
+ * since drbd_rs_begin_io() pulled it already in.
+ *
+ * But maybe an application write finished, and we set
+ * something outside the resync lru_cache in sync.
+ */
+ int rs_left = drbd_bm_e_weight(device, enr);
+ if (ext->flags != 0) {
+ drbd_warn(device, "changing resync lce: %d[%u;%02lx]"
+ " -> %d[%u;00]\n",
+ ext->lce.lc_number, ext->rs_left,
+ ext->flags, enr, rs_left);
+ ext->flags = 0;
+ }
+ if (ext->rs_failed) {
+ drbd_warn(device, "Kicking resync_lru element enr=%u "
+ "out with rs_failed=%d\n",
+ ext->lce.lc_number, ext->rs_failed);
+ }
+ ext->rs_left = rs_left;
+ ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0;
+ /* we don't keep a persistent log of the resync lru,
+ * we can commit any change right away. */
+ lc_committed(device->resync);
+ }
+ if (mode != SET_OUT_OF_SYNC)
+ lc_put(device->resync, &ext->lce);
+ /* no race, we are within the al_lock! */
+
+ if (ext->rs_left <= ext->rs_failed) {
+ ext->rs_failed = 0;
+ return true;
+ }
+ } else if (mode != SET_OUT_OF_SYNC) {
+ /* be quiet if lc_find() did not find it. */
+ drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n",
+ device->resync_locked,
+ device->resync->nr_elements,
+ device->resync->flags);
+ }
+ return false;
+}
+
+void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go)
+{
+ unsigned long now = jiffies;
+ unsigned long last = device->rs_mark_time[device->rs_last_mark];
+ int next = (device->rs_last_mark + 1) % DRBD_SYNC_MARKS;
+ if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
+ if (device->rs_mark_left[device->rs_last_mark] != still_to_go &&
+ device->state.conn != C_PAUSED_SYNC_T &&
+ device->state.conn != C_PAUSED_SYNC_S) {
+ device->rs_mark_time[next] = now;
+ device->rs_mark_left[next] = still_to_go;
+ device->rs_last_mark = next;
+ }
+ }
+}
+
+/* It is called lazy update, so don't do write-out too often. */
+static bool lazy_bitmap_update_due(struct drbd_device *device)
+{
+ return time_after(jiffies, device->rs_last_bcast + 2*HZ);
+}
+
+static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done)
+{
+ if (rs_done)
+ set_bit(RS_DONE, &device->flags);
+ /* and also set RS_PROGRESS below */
+ else if (!lazy_bitmap_update_due(device))
+ return;
+
+ drbd_device_post_work(device, RS_PROGRESS);
+}
+
+static int update_sync_bits(struct drbd_device *device,
+ unsigned long sbnr, unsigned long ebnr,
+ enum update_sync_bits_mode mode)
+{
+ /*
+ * We keep a count of set bits per resync-extent in the ->rs_left
+ * caching member, so we need to loop and work within the resync extent
+ * alignment. Typically this loop will execute exactly once.
+ */
+ unsigned long flags;
+ unsigned long count = 0;
+ unsigned int cleared = 0;
+ while (sbnr <= ebnr) {
+ /* set temporary boundary bit number to last bit number within
+ * the resync extent of the current start bit number,
+ * but cap at provided end bit number */
+ unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK);
+ unsigned long c;
+
+ if (mode == RECORD_RS_FAILED)
+ /* Only called from drbd_rs_failed_io(), bits
+ * supposedly still set. Recount, maybe some
+ * of the bits have been successfully cleared
+ * by application IO meanwhile.
+ */
+ c = drbd_bm_count_bits(device, sbnr, tbnr);
+ else if (mode == SET_IN_SYNC)
+ c = drbd_bm_clear_bits(device, sbnr, tbnr);
+ else /* if (mode == SET_OUT_OF_SYNC) */
+ c = drbd_bm_set_bits(device, sbnr, tbnr);
+
+ if (c) {
+ spin_lock_irqsave(&device->al_lock, flags);
+ cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode);
+ spin_unlock_irqrestore(&device->al_lock, flags);
+ count += c;
+ }
+ sbnr = tbnr + 1;
+ }
+ if (count) {
+ if (mode == SET_IN_SYNC) {
+ unsigned long still_to_go = drbd_bm_total_weight(device);
+ bool rs_is_done = (still_to_go <= device->rs_failed);
+ drbd_advance_rs_marks(device, still_to_go);
+ if (cleared || rs_is_done)
+ maybe_schedule_on_disk_bitmap_update(device, rs_is_done);
+ } else if (mode == RECORD_RS_FAILED)
+ device->rs_failed += count;
+ wake_up(&device->al_wait);
+ }
+ return count;
+}
+
+/* clear the bit corresponding to the piece of storage in question:
+ * size byte of data starting from sector. Only clear a bits of the affected
+ * one ore more _aligned_ BM_BLOCK_SIZE blocks.
+ *
+ * called by worker on C_SYNC_TARGET and receiver on SyncSource.
+ *
+ */
+int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
+ enum update_sync_bits_mode mode)
+{
+ /* Is called from worker and receiver context _only_ */
+ unsigned long sbnr, ebnr, lbnr;
+ unsigned long count = 0;
+ sector_t esector, nr_sectors;
+
+ /* This would be an empty REQ_FLUSH, be silent. */
+ if ((mode == SET_OUT_OF_SYNC) && size == 0)
+ return 0;
+
+ if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) {
+ drbd_err(device, "%s: sector=%llus size=%d nonsense!\n",
+ drbd_change_sync_fname[mode],
+ (unsigned long long)sector, size);
+ return 0;
+ }
+
+ if (!get_ldev(device))
+ return 0; /* no disk, no metadata, no bitmap to manipulate bits in */
+
+ nr_sectors = drbd_get_capacity(device->this_bdev);
+ esector = sector + (size >> 9) - 1;
+
+ if (!expect(sector < nr_sectors))
+ goto out;
+ if (!expect(esector < nr_sectors))
+ esector = nr_sectors - 1;
+
+ lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+ if (mode == SET_IN_SYNC) {
+ /* Round up start sector, round down end sector. We make sure
+ * we only clear full, aligned, BM_BLOCK_SIZE blocks. */
+ if (unlikely(esector < BM_SECT_PER_BIT-1))
+ goto out;
+ if (unlikely(esector == (nr_sectors-1)))
+ ebnr = lbnr;
+ else
+ ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+ sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+ } else {
+ /* We set it out of sync, or record resync failure.
+ * Should not round anything here. */
+ sbnr = BM_SECT_TO_BIT(sector);
+ ebnr = BM_SECT_TO_BIT(esector);
+ }
+
+ count = update_sync_bits(device, sbnr, ebnr, mode);
+out:
+ put_ldev(device);
+ return count;
+}
+
+static
+struct bm_extent *_bme_get(struct drbd_device *device, unsigned int enr)
+{
+ struct lc_element *e;
+ struct bm_extent *bm_ext;
+ int wakeup = 0;
+ unsigned long rs_flags;
+
+ spin_lock_irq(&device->al_lock);
+ if (device->resync_locked > device->resync->nr_elements/2) {
+ spin_unlock_irq(&device->al_lock);
+ return NULL;
+ }
+ e = lc_get(device->resync, enr);
+ bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+ if (bm_ext) {
+ if (bm_ext->lce.lc_number != enr) {
+ bm_ext->rs_left = drbd_bm_e_weight(device, enr);
+ bm_ext->rs_failed = 0;
+ lc_committed(device->resync);
+ wakeup = 1;
+ }
+ if (bm_ext->lce.refcnt == 1)
+ device->resync_locked++;
+ set_bit(BME_NO_WRITES, &bm_ext->flags);
+ }
+ rs_flags = device->resync->flags;
+ spin_unlock_irq(&device->al_lock);
+ if (wakeup)
+ wake_up(&device->al_wait);
+
+ if (!bm_ext) {
+ if (rs_flags & LC_STARVING)
+ drbd_warn(device, "Have to wait for element"
+ " (resync LRU too small?)\n");
+ BUG_ON(rs_flags & LC_LOCKED);
+ }
+
+ return bm_ext;
+}
+
+static int _is_in_al(struct drbd_device *device, unsigned int enr)
+{
+ int rv;
+
+ spin_lock_irq(&device->al_lock);
+ rv = lc_is_used(device->act_log, enr);
+ spin_unlock_irq(&device->al_lock);
+
+ return rv;
+}
+
+/**
+ * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
+ * @device: DRBD device.
+ * @sector: The sector number.
+ *
+ * This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted.
+ */
+int drbd_rs_begin_io(struct drbd_device *device, sector_t sector)
+{
+ unsigned int enr = BM_SECT_TO_EXT(sector);
+ struct bm_extent *bm_ext;
+ int i, sig;
+ bool sa;
+
+retry:
+ sig = wait_event_interruptible(device->al_wait,
+ (bm_ext = _bme_get(device, enr)));
+ if (sig)
+ return -EINTR;
+
+ if (test_bit(BME_LOCKED, &bm_ext->flags))
+ return 0;
+
+ /* step aside only while we are above c-min-rate; unless disabled. */
+ sa = drbd_rs_c_min_rate_throttle(device);
+
+ for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
+ sig = wait_event_interruptible(device->al_wait,
+ !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) ||
+ (sa && test_bit(BME_PRIORITY, &bm_ext->flags)));
+
+ if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) {
+ spin_lock_irq(&device->al_lock);
+ if (lc_put(device->resync, &bm_ext->lce) == 0) {
+ bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */
+ device->resync_locked--;
+ wake_up(&device->al_wait);
+ }
+ spin_unlock_irq(&device->al_lock);
+ if (sig)
+ return -EINTR;
+ if (schedule_timeout_interruptible(HZ/10))
+ return -EINTR;
+ goto retry;
+ }
+ }
+ set_bit(BME_LOCKED, &bm_ext->flags);
+ return 0;
+}
+
+/**
+ * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
+ * @device: DRBD device.
+ * @sector: The sector number.
+ *
+ * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
+ * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
+ * if there is still application IO going on in this area.
+ */
+int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
+{
+ unsigned int enr = BM_SECT_TO_EXT(sector);
+ const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
+ struct lc_element *e;
+ struct bm_extent *bm_ext;
+ int i;
+ bool throttle = drbd_rs_should_slow_down(device, sector, true);
+
+ /* If we need to throttle, a half-locked (only marked BME_NO_WRITES,
+ * not yet BME_LOCKED) extent needs to be kicked out explicitly if we
+ * need to throttle. There is at most one such half-locked extent,
+ * which is remembered in resync_wenr. */
+
+ if (throttle && device->resync_wenr != enr)
+ return -EAGAIN;
+
+ spin_lock_irq(&device->al_lock);
+ if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) {
+ /* in case you have very heavy scattered io, it may
+ * stall the syncer undefined if we give up the ref count
+ * when we try again and requeue.
+ *
+ * if we don't give up the refcount, but the next time
+ * we are scheduled this extent has been "synced" by new
+ * application writes, we'd miss the lc_put on the
+ * extent we keep the refcount on.
+ * so we remembered which extent we had to try again, and
+ * if the next requested one is something else, we do
+ * the lc_put here...
+ * we also have to wake_up
+ */
+ e = lc_find(device->resync, device->resync_wenr);
+ bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+ if (bm_ext) {
+ D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+ D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
+ clear_bit(BME_NO_WRITES, &bm_ext->flags);
+ device->resync_wenr = LC_FREE;
+ if (lc_put(device->resync, &bm_ext->lce) == 0) {
+ bm_ext->flags = 0;
+ device->resync_locked--;
+ }
+ wake_up(&device->al_wait);
+ } else {
+ drbd_alert(device, "LOGIC BUG\n");
+ }
+ }
+ /* TRY. */
+ e = lc_try_get(device->resync, enr);
+ bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+ if (bm_ext) {
+ if (test_bit(BME_LOCKED, &bm_ext->flags))
+ goto proceed;
+ if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
+ device->resync_locked++;
+ } else {
+ /* we did set the BME_NO_WRITES,
+ * but then could not set BME_LOCKED,
+ * so we tried again.
+ * drop the extra reference. */
+ bm_ext->lce.refcnt--;
+ D_ASSERT(device, bm_ext->lce.refcnt > 0);
+ }
+ goto check_al;
+ } else {
+ /* do we rather want to try later? */
+ if (device->resync_locked > device->resync->nr_elements-3)
+ goto try_again;
+ /* Do or do not. There is no try. -- Yoda */
+ e = lc_get(device->resync, enr);
+ bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+ if (!bm_ext) {
+ const unsigned long rs_flags = device->resync->flags;
+ if (rs_flags & LC_STARVING)
+ drbd_warn(device, "Have to wait for element"
+ " (resync LRU too small?)\n");
+ BUG_ON(rs_flags & LC_LOCKED);
+ goto try_again;
+ }
+ if (bm_ext->lce.lc_number != enr) {
+ bm_ext->rs_left = drbd_bm_e_weight(device, enr);
+ bm_ext->rs_failed = 0;
+ lc_committed(device->resync);
+ wake_up(&device->al_wait);
+ D_ASSERT(device, test_bit(BME_LOCKED, &bm_ext->flags) == 0);
+ }
+ set_bit(BME_NO_WRITES, &bm_ext->flags);
+ D_ASSERT(device, bm_ext->lce.refcnt == 1);
+ device->resync_locked++;
+ goto check_al;
+ }
+check_al:
+ for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
+ if (lc_is_used(device->act_log, al_enr+i))
+ goto try_again;
+ }
+ set_bit(BME_LOCKED, &bm_ext->flags);
+proceed:
+ device->resync_wenr = LC_FREE;
+ spin_unlock_irq(&device->al_lock);
+ return 0;
+
+try_again:
+ if (bm_ext) {
+ if (throttle) {
+ D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+ D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
+ clear_bit(BME_NO_WRITES, &bm_ext->flags);
+ device->resync_wenr = LC_FREE;
+ if (lc_put(device->resync, &bm_ext->lce) == 0) {
+ bm_ext->flags = 0;
+ device->resync_locked--;
+ }
+ wake_up(&device->al_wait);
+ } else
+ device->resync_wenr = enr;
+ }
+ spin_unlock_irq(&device->al_lock);
+ return -EAGAIN;
+}
+
+void drbd_rs_complete_io(struct drbd_device *device, sector_t sector)
+{
+ unsigned int enr = BM_SECT_TO_EXT(sector);
+ struct lc_element *e;
+ struct bm_extent *bm_ext;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device->al_lock, flags);
+ e = lc_find(device->resync, enr);
+ bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+ if (!bm_ext) {
+ spin_unlock_irqrestore(&device->al_lock, flags);
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "drbd_rs_complete_io() called, but extent not found\n");
+ return;
+ }
+
+ if (bm_ext->lce.refcnt == 0) {
+ spin_unlock_irqrestore(&device->al_lock, flags);
+ drbd_err(device, "drbd_rs_complete_io(,%llu [=%u]) called, "
+ "but refcnt is 0!?\n",
+ (unsigned long long)sector, enr);
+ return;
+ }
+
+ if (lc_put(device->resync, &bm_ext->lce) == 0) {
+ bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */
+ device->resync_locked--;
+ wake_up(&device->al_wait);
+ }
+
+ spin_unlock_irqrestore(&device->al_lock, flags);
+}
+
+/**
+ * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
+ * @device: DRBD device.
+ */
+void drbd_rs_cancel_all(struct drbd_device *device)
+{
+ spin_lock_irq(&device->al_lock);
+
+ if (get_ldev_if_state(device, D_FAILED)) { /* Makes sure ->resync is there. */
+ lc_reset(device->resync);
+ put_ldev(device);
+ }
+ device->resync_locked = 0;
+ device->resync_wenr = LC_FREE;
+ spin_unlock_irq(&device->al_lock);
+ wake_up(&device->al_wait);
+}
+
+/**
+ * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
+ * @device: DRBD device.
+ *
+ * Returns 0 upon success, -EAGAIN if at least one reference count was
+ * not zero.
+ */
+int drbd_rs_del_all(struct drbd_device *device)
+{
+ struct lc_element *e;
+ struct bm_extent *bm_ext;
+ int i;
+
+ spin_lock_irq(&device->al_lock);
+
+ if (get_ldev_if_state(device, D_FAILED)) {
+ /* ok, ->resync is there. */
+ for (i = 0; i < device->resync->nr_elements; i++) {
+ e = lc_element_by_index(device->resync, i);
+ bm_ext = lc_entry(e, struct bm_extent, lce);
+ if (bm_ext->lce.lc_number == LC_FREE)
+ continue;
+ if (bm_ext->lce.lc_number == device->resync_wenr) {
+ drbd_info(device, "dropping %u in drbd_rs_del_all, apparently"
+ " got 'synced' by application io\n",
+ device->resync_wenr);
+ D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+ D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
+ clear_bit(BME_NO_WRITES, &bm_ext->flags);
+ device->resync_wenr = LC_FREE;
+ lc_put(device->resync, &bm_ext->lce);
+ }
+ if (bm_ext->lce.refcnt != 0) {
+ drbd_info(device, "Retrying drbd_rs_del_all() later. "
+ "refcnt=%d\n", bm_ext->lce.refcnt);
+ put_ldev(device);
+ spin_unlock_irq(&device->al_lock);
+ return -EAGAIN;
+ }
+ D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+ D_ASSERT(device, !test_bit(BME_NO_WRITES, &bm_ext->flags));
+ lc_del(device->resync, &bm_ext->lce);
+ }
+ D_ASSERT(device, device->resync->used == 0);
+ put_ldev(device);
+ }
+ spin_unlock_irq(&device->al_lock);
+ wake_up(&device->al_wait);
+
+ return 0;
+}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
new file mode 100644
index 000000000..434c77dcc
--- /dev/null
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -0,0 +1,1648 @@
+/*
+ drbd_bitmap.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bitops.h>
+#include <linux/vmalloc.h>
+#include <linux/string.h>
+#include <linux/drbd.h>
+#include <linux/slab.h>
+#include <asm/kmap_types.h>
+
+#include "drbd_int.h"
+
+
+/* OPAQUE outside this file!
+ * interface defined in drbd_int.h
+
+ * convention:
+ * function name drbd_bm_... => used elsewhere, "public".
+ * function name bm_... => internal to implementation, "private".
+ */
+
+
+/*
+ * LIMITATIONS:
+ * We want to support >= peta byte of backend storage, while for now still using
+ * a granularity of one bit per 4KiB of storage.
+ * 1 << 50 bytes backend storage (1 PiB)
+ * 1 << (50 - 12) bits needed
+ * 38 --> we need u64 to index and count bits
+ * 1 << (38 - 3) bitmap bytes needed
+ * 35 --> we still need u64 to index and count bytes
+ * (that's 32 GiB of bitmap for 1 PiB storage)
+ * 1 << (35 - 2) 32bit longs needed
+ * 33 --> we'd even need u64 to index and count 32bit long words.
+ * 1 << (35 - 3) 64bit longs needed
+ * 32 --> we could get away with a 32bit unsigned int to index and count
+ * 64bit long words, but I rather stay with unsigned long for now.
+ * We probably should neither count nor point to bytes or long words
+ * directly, but either by bitnumber, or by page index and offset.
+ * 1 << (35 - 12)
+ * 22 --> we need that much 4KiB pages of bitmap.
+ * 1 << (22 + 3) --> on a 64bit arch,
+ * we need 32 MiB to store the array of page pointers.
+ *
+ * Because I'm lazy, and because the resulting patch was too large, too ugly
+ * and still incomplete, on 32bit we still "only" support 16 TiB (minus some),
+ * (1 << 32) bits * 4k storage.
+ *
+
+ * bitmap storage and IO:
+ * Bitmap is stored little endian on disk, and is kept little endian in
+ * core memory. Currently we still hold the full bitmap in core as long
+ * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
+ * seems excessive.
+ *
+ * We plan to reduce the amount of in-core bitmap pages by paging them in
+ * and out against their on-disk location as necessary, but need to make
+ * sure we don't cause too much meta data IO, and must not deadlock in
+ * tight memory situations. This needs some more work.
+ */
+
+/*
+ * NOTE
+ * Access to the *bm_pages is protected by bm_lock.
+ * It is safe to read the other members within the lock.
+ *
+ * drbd_bm_set_bits is called from bio_endio callbacks,
+ * We may be called with irq already disabled,
+ * so we need spin_lock_irqsave().
+ * And we need the kmap_atomic.
+ */
+struct drbd_bitmap {
+ struct page **bm_pages;
+ spinlock_t bm_lock;
+
+ /* see LIMITATIONS: above */
+
+ unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */
+ unsigned long bm_bits;
+ size_t bm_words;
+ size_t bm_number_of_pages;
+ sector_t bm_dev_capacity;
+ struct mutex bm_change; /* serializes resize operations */
+
+ wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
+
+ enum bm_flag bm_flags;
+
+ /* debugging aid, in case we are still racy somewhere */
+ char *bm_why;
+ struct task_struct *bm_task;
+};
+
+#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
+static void __bm_print_lock_info(struct drbd_device *device, const char *func)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ if (!__ratelimit(&drbd_ratelimit_state))
+ return;
+ drbd_err(device, "FIXME %s[%d] in %s, bitmap locked for '%s' by %s[%d]\n",
+ current->comm, task_pid_nr(current),
+ func, b->bm_why ?: "?",
+ b->bm_task->comm, task_pid_nr(b->bm_task));
+}
+
+void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ int trylock_failed;
+
+ if (!b) {
+ drbd_err(device, "FIXME no bitmap in drbd_bm_lock!?\n");
+ return;
+ }
+
+ trylock_failed = !mutex_trylock(&b->bm_change);
+
+ if (trylock_failed) {
+ drbd_warn(device, "%s[%d] going to '%s' but bitmap already locked for '%s' by %s[%d]\n",
+ current->comm, task_pid_nr(current),
+ why, b->bm_why ?: "?",
+ b->bm_task->comm, task_pid_nr(b->bm_task));
+ mutex_lock(&b->bm_change);
+ }
+ if (BM_LOCKED_MASK & b->bm_flags)
+ drbd_err(device, "FIXME bitmap already locked in bm_lock\n");
+ b->bm_flags |= flags & BM_LOCKED_MASK;
+
+ b->bm_why = why;
+ b->bm_task = current;
+}
+
+void drbd_bm_unlock(struct drbd_device *device)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ if (!b) {
+ drbd_err(device, "FIXME no bitmap in drbd_bm_unlock!?\n");
+ return;
+ }
+
+ if (!(BM_LOCKED_MASK & device->bitmap->bm_flags))
+ drbd_err(device, "FIXME bitmap not locked in bm_unlock\n");
+
+ b->bm_flags &= ~BM_LOCKED_MASK;
+ b->bm_why = NULL;
+ b->bm_task = NULL;
+ mutex_unlock(&b->bm_change);
+}
+
+/* we store some "meta" info about our pages in page->private */
+/* at a granularity of 4k storage per bitmap bit:
+ * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks
+ * 1<<38 bits,
+ * 1<<23 4k bitmap pages.
+ * Use 24 bits as page index, covers 2 peta byte storage
+ * at a granularity of 4k per bit.
+ * Used to report the failed page idx on io error from the endio handlers.
+ */
+#define BM_PAGE_IDX_MASK ((1UL<<24)-1)
+/* this page is currently read in, or written back */
+#define BM_PAGE_IO_LOCK 31
+/* if there has been an IO error for this page */
+#define BM_PAGE_IO_ERROR 30
+/* this is to be able to intelligently skip disk IO,
+ * set if bits have been set since last IO. */
+#define BM_PAGE_NEED_WRITEOUT 29
+/* to mark for lazy writeout once syncer cleared all clearable bits,
+ * we if bits have been cleared since last IO. */
+#define BM_PAGE_LAZY_WRITEOUT 28
+/* pages marked with this "HINT" will be considered for writeout
+ * on activity log transactions */
+#define BM_PAGE_HINT_WRITEOUT 27
+
+/* store_page_idx uses non-atomic assignment. It is only used directly after
+ * allocating the page. All other bm_set_page_* and bm_clear_page_* need to
+ * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
+ * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
+ * requires it all to be atomic as well. */
+static void bm_store_page_idx(struct page *page, unsigned long idx)
+{
+ BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
+ set_page_private(page, idx);
+}
+
+static unsigned long bm_page_to_idx(struct page *page)
+{
+ return page_private(page) & BM_PAGE_IDX_MASK;
+}
+
+/* As is very unlikely that the same page is under IO from more than one
+ * context, we can get away with a bit per page and one wait queue per bitmap.
+ */
+static void bm_page_lock_io(struct drbd_device *device, int page_nr)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ void *addr = &page_private(b->bm_pages[page_nr]);
+ wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
+}
+
+static void bm_page_unlock_io(struct drbd_device *device, int page_nr)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ void *addr = &page_private(b->bm_pages[page_nr]);
+ clear_bit_unlock(BM_PAGE_IO_LOCK, addr);
+ wake_up(&device->bitmap->bm_io_wait);
+}
+
+/* set _before_ submit_io, so it may be reset due to being changed
+ * while this page is in flight... will get submitted later again */
+static void bm_set_page_unchanged(struct page *page)
+{
+ /* use cmpxchg? */
+ clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
+ clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+static void bm_set_page_need_writeout(struct page *page)
+{
+ set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
+}
+
+/**
+ * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout
+ * @device: DRBD device.
+ * @page_nr: the bitmap page to mark with the "hint" flag
+ *
+ * From within an activity log transaction, we mark a few pages with these
+ * hints, then call drbd_bm_write_hinted(), which will only write out changed
+ * pages which are flagged with this mark.
+ */
+void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
+{
+ struct page *page;
+ if (page_nr >= device->bitmap->bm_number_of_pages) {
+ drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n",
+ page_nr, (int)device->bitmap->bm_number_of_pages);
+ return;
+ }
+ page = device->bitmap->bm_pages[page_nr];
+ set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page));
+}
+
+static int bm_test_page_unchanged(struct page *page)
+{
+ volatile const unsigned long *addr = &page_private(page);
+ return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
+}
+
+static void bm_set_page_io_err(struct page *page)
+{
+ set_bit(BM_PAGE_IO_ERROR, &page_private(page));
+}
+
+static void bm_clear_page_io_err(struct page *page)
+{
+ clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
+}
+
+static void bm_set_page_lazy_writeout(struct page *page)
+{
+ set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+static int bm_test_page_lazy_writeout(struct page *page)
+{
+ return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+/* on a 32bit box, this would allow for exactly (2<<38) bits. */
+static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
+{
+ /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
+ unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
+ BUG_ON(page_nr >= b->bm_number_of_pages);
+ return page_nr;
+}
+
+static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
+{
+ /* page_nr = (bitnr/8) >> PAGE_SHIFT; */
+ unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
+ BUG_ON(page_nr >= b->bm_number_of_pages);
+ return page_nr;
+}
+
+static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
+{
+ struct page *page = b->bm_pages[idx];
+ return (unsigned long *) kmap_atomic(page);
+}
+
+static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
+{
+ return __bm_map_pidx(b, idx);
+}
+
+static void __bm_unmap(unsigned long *p_addr)
+{
+ kunmap_atomic(p_addr);
+};
+
+static void bm_unmap(unsigned long *p_addr)
+{
+ return __bm_unmap(p_addr);
+}
+
+/* long word offset of _bitmap_ sector */
+#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
+/* word offset from start of bitmap to word number _in_page_
+ * modulo longs per page
+#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long))
+ hm, well, Philipp thinks gcc might not optimize the % into & (... - 1)
+ so do it explicitly:
+ */
+#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
+
+/* Long words per page */
+#define LWPP (PAGE_SIZE/sizeof(long))
+
+/*
+ * actually most functions herein should take a struct drbd_bitmap*, not a
+ * struct drbd_device*, but for the debug macros I like to have the device around
+ * to be able to report device specific.
+ */
+
+
+static void bm_free_pages(struct page **pages, unsigned long number)
+{
+ unsigned long i;
+ if (!pages)
+ return;
+
+ for (i = 0; i < number; i++) {
+ if (!pages[i]) {
+ pr_alert("bm_free_pages tried to free a NULL pointer; i=%lu n=%lu\n",
+ i, number);
+ continue;
+ }
+ __free_page(pages[i]);
+ pages[i] = NULL;
+ }
+}
+
+static void bm_vk_free(void *ptr, int v)
+{
+ if (v)
+ vfree(ptr);
+ else
+ kfree(ptr);
+}
+
+/*
+ * "have" and "want" are NUMBER OF PAGES.
+ */
+static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
+{
+ struct page **old_pages = b->bm_pages;
+ struct page **new_pages, *page;
+ unsigned int i, bytes, vmalloced = 0;
+ unsigned long have = b->bm_number_of_pages;
+
+ BUG_ON(have == 0 && old_pages != NULL);
+ BUG_ON(have != 0 && old_pages == NULL);
+
+ if (have == want)
+ return old_pages;
+
+ /* Trying kmalloc first, falling back to vmalloc.
+ * GFP_NOIO, as this is called while drbd IO is "suspended",
+ * and during resize or attach on diskless Primary,
+ * we must not block on IO to ourselves.
+ * Context is receiver thread or dmsetup. */
+ bytes = sizeof(struct page *)*want;
+ new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN);
+ if (!new_pages) {
+ new_pages = __vmalloc(bytes,
+ GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO,
+ PAGE_KERNEL);
+ if (!new_pages)
+ return NULL;
+ vmalloced = 1;
+ }
+
+ if (want >= have) {
+ for (i = 0; i < have; i++)
+ new_pages[i] = old_pages[i];
+ for (; i < want; i++) {
+ page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
+ if (!page) {
+ bm_free_pages(new_pages + have, i - have);
+ bm_vk_free(new_pages, vmalloced);
+ return NULL;
+ }
+ /* we want to know which page it is
+ * from the endio handlers */
+ bm_store_page_idx(page, i);
+ new_pages[i] = page;
+ }
+ } else {
+ for (i = 0; i < want; i++)
+ new_pages[i] = old_pages[i];
+ /* NOT HERE, we are outside the spinlock!
+ bm_free_pages(old_pages + want, have - want);
+ */
+ }
+
+ if (vmalloced)
+ b->bm_flags |= BM_P_VMALLOCED;
+ else
+ b->bm_flags &= ~BM_P_VMALLOCED;
+
+ return new_pages;
+}
+
+/*
+ * called on driver init only. TODO call when a device is created.
+ * allocates the drbd_bitmap, and stores it in device->bitmap.
+ */
+int drbd_bm_init(struct drbd_device *device)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ WARN_ON(b != NULL);
+ b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
+ if (!b)
+ return -ENOMEM;
+ spin_lock_init(&b->bm_lock);
+ mutex_init(&b->bm_change);
+ init_waitqueue_head(&b->bm_io_wait);
+
+ device->bitmap = b;
+
+ return 0;
+}
+
+sector_t drbd_bm_capacity(struct drbd_device *device)
+{
+ if (!expect(device->bitmap))
+ return 0;
+ return device->bitmap->bm_dev_capacity;
+}
+
+/* called on driver unload. TODO: call when a device is destroyed.
+ */
+void drbd_bm_cleanup(struct drbd_device *device)
+{
+ if (!expect(device->bitmap))
+ return;
+ bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages);
+ bm_vk_free(device->bitmap->bm_pages, (BM_P_VMALLOCED & device->bitmap->bm_flags));
+ kfree(device->bitmap);
+ device->bitmap = NULL;
+}
+
+/*
+ * since (b->bm_bits % BITS_PER_LONG) != 0,
+ * this masks out the remaining bits.
+ * Returns the number of bits cleared.
+ */
+#define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3))
+#define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1)
+#define BITS_PER_LONG_MASK (BITS_PER_LONG - 1)
+static int bm_clear_surplus(struct drbd_bitmap *b)
+{
+ unsigned long mask;
+ unsigned long *p_addr, *bm;
+ int tmp;
+ int cleared = 0;
+
+ /* number of bits modulo bits per page */
+ tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
+ /* mask the used bits of the word containing the last bit */
+ mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
+ /* bitmap is always stored little endian,
+ * on disk and in core memory alike */
+ mask = cpu_to_lel(mask);
+
+ p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
+ bm = p_addr + (tmp/BITS_PER_LONG);
+ if (mask) {
+ /* If mask != 0, we are not exactly aligned, so bm now points
+ * to the long containing the last bit.
+ * If mask == 0, bm already points to the word immediately
+ * after the last (long word aligned) bit. */
+ cleared = hweight_long(*bm & ~mask);
+ *bm &= mask;
+ bm++;
+ }
+
+ if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
+ /* on a 32bit arch, we may need to zero out
+ * a padding long to align with a 64bit remote */
+ cleared += hweight_long(*bm);
+ *bm = 0;
+ }
+ bm_unmap(p_addr);
+ return cleared;
+}
+
+static void bm_set_surplus(struct drbd_bitmap *b)
+{
+ unsigned long mask;
+ unsigned long *p_addr, *bm;
+ int tmp;
+
+ /* number of bits modulo bits per page */
+ tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
+ /* mask the used bits of the word containing the last bit */
+ mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
+ /* bitmap is always stored little endian,
+ * on disk and in core memory alike */
+ mask = cpu_to_lel(mask);
+
+ p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
+ bm = p_addr + (tmp/BITS_PER_LONG);
+ if (mask) {
+ /* If mask != 0, we are not exactly aligned, so bm now points
+ * to the long containing the last bit.
+ * If mask == 0, bm already points to the word immediately
+ * after the last (long word aligned) bit. */
+ *bm |= ~mask;
+ bm++;
+ }
+
+ if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
+ /* on a 32bit arch, we may need to zero out
+ * a padding long to align with a 64bit remote */
+ *bm = ~0UL;
+ }
+ bm_unmap(p_addr);
+}
+
+/* you better not modify the bitmap while this is running,
+ * or its results will be stale */
+static unsigned long bm_count_bits(struct drbd_bitmap *b)
+{
+ unsigned long *p_addr;
+ unsigned long bits = 0;
+ unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;
+ int idx, i, last_word;
+
+ /* all but last page */
+ for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
+ p_addr = __bm_map_pidx(b, idx);
+ for (i = 0; i < LWPP; i++)
+ bits += hweight_long(p_addr[i]);
+ __bm_unmap(p_addr);
+ cond_resched();
+ }
+ /* last (or only) page */
+ last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
+ p_addr = __bm_map_pidx(b, idx);
+ for (i = 0; i < last_word; i++)
+ bits += hweight_long(p_addr[i]);
+ p_addr[last_word] &= cpu_to_lel(mask);
+ bits += hweight_long(p_addr[last_word]);
+ /* 32bit arch, may have an unused padding long */
+ if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
+ p_addr[last_word+1] = 0;
+ __bm_unmap(p_addr);
+ return bits;
+}
+
+/* offset and len in long words.*/
+static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
+{
+ unsigned long *p_addr, *bm;
+ unsigned int idx;
+ size_t do_now, end;
+
+ end = offset + len;
+
+ if (end > b->bm_words) {
+ pr_alert("bm_memset end > bm_words\n");
+ return;
+ }
+
+ while (offset < end) {
+ do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
+ idx = bm_word_to_page_idx(b, offset);
+ p_addr = bm_map_pidx(b, idx);
+ bm = p_addr + MLPP(offset);
+ if (bm+do_now > p_addr + LWPP) {
+ pr_alert("BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
+ p_addr, bm, (int)do_now);
+ } else
+ memset(bm, c, do_now * sizeof(long));
+ bm_unmap(p_addr);
+ bm_set_page_need_writeout(b->bm_pages[idx]);
+ offset += do_now;
+ }
+}
+
+/* For the layout, see comment above drbd_md_set_sector_offsets(). */
+static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev)
+{
+ u64 bitmap_sectors;
+ if (ldev->md.al_offset == 8)
+ bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset;
+ else
+ bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset;
+ return bitmap_sectors << (9 + 3);
+}
+
+/*
+ * make sure the bitmap has enough room for the attached storage,
+ * if necessary, resize.
+ * called whenever we may have changed the device size.
+ * returns -ENOMEM if we could not allocate enough memory, 0 on success.
+ * In case this is actually a resize, we copy the old bitmap into the new one.
+ * Otherwise, the bitmap is initialized to all bits set.
+ */
+int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bits)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long bits, words, owords, obits;
+ unsigned long want, have, onpages; /* number of pages */
+ struct page **npages, **opages = NULL;
+ int err = 0, growing;
+ int opages_vmalloced;
+
+ if (!expect(b))
+ return -ENOMEM;
+
+ drbd_bm_lock(device, "resize", BM_LOCKED_MASK);
+
+ drbd_info(device, "drbd_bm_resize called with capacity == %llu\n",
+ (unsigned long long)capacity);
+
+ if (capacity == b->bm_dev_capacity)
+ goto out;
+
+ opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags);
+
+ if (capacity == 0) {
+ spin_lock_irq(&b->bm_lock);
+ opages = b->bm_pages;
+ onpages = b->bm_number_of_pages;
+ owords = b->bm_words;
+ b->bm_pages = NULL;
+ b->bm_number_of_pages =
+ b->bm_set =
+ b->bm_bits =
+ b->bm_words =
+ b->bm_dev_capacity = 0;
+ spin_unlock_irq(&b->bm_lock);
+ bm_free_pages(opages, onpages);
+ bm_vk_free(opages, opages_vmalloced);
+ goto out;
+ }
+ bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
+
+ /* if we would use
+ words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
+ a 32bit host could present the wrong number of words
+ to a 64bit host.
+ */
+ words = ALIGN(bits, 64) >> LN2_BPL;
+
+ if (get_ldev(device)) {
+ u64 bits_on_disk = drbd_md_on_disk_bits(device->ldev);
+ put_ldev(device);
+ if (bits > bits_on_disk) {
+ drbd_info(device, "bits = %lu\n", bits);
+ drbd_info(device, "bits_on_disk = %llu\n", bits_on_disk);
+ err = -ENOSPC;
+ goto out;
+ }
+ }
+
+ want = ALIGN(words*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT;
+ have = b->bm_number_of_pages;
+ if (want == have) {
+ D_ASSERT(device, b->bm_pages != NULL);
+ npages = b->bm_pages;
+ } else {
+ if (drbd_insert_fault(device, DRBD_FAULT_BM_ALLOC))
+ npages = NULL;
+ else
+ npages = bm_realloc_pages(b, want);
+ }
+
+ if (!npages) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ spin_lock_irq(&b->bm_lock);
+ opages = b->bm_pages;
+ owords = b->bm_words;
+ obits = b->bm_bits;
+
+ growing = bits > obits;
+ if (opages && growing && set_new_bits)
+ bm_set_surplus(b);
+
+ b->bm_pages = npages;
+ b->bm_number_of_pages = want;
+ b->bm_bits = bits;
+ b->bm_words = words;
+ b->bm_dev_capacity = capacity;
+
+ if (growing) {
+ if (set_new_bits) {
+ bm_memset(b, owords, 0xff, words-owords);
+ b->bm_set += bits - obits;
+ } else
+ bm_memset(b, owords, 0x00, words-owords);
+
+ }
+
+ if (want < have) {
+ /* implicit: (opages != NULL) && (opages != npages) */
+ bm_free_pages(opages + want, have - want);
+ }
+
+ (void)bm_clear_surplus(b);
+
+ spin_unlock_irq(&b->bm_lock);
+ if (opages != npages)
+ bm_vk_free(opages, opages_vmalloced);
+ if (!growing)
+ b->bm_set = bm_count_bits(b);
+ drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
+
+ out:
+ drbd_bm_unlock(device);
+ return err;
+}
+
+/* inherently racy:
+ * if not protected by other means, return value may be out of date when
+ * leaving this function...
+ * we still need to lock it, since it is important that this returns
+ * bm_set == 0 precisely.
+ *
+ * maybe bm_set should be atomic_t ?
+ */
+unsigned long _drbd_bm_total_weight(struct drbd_device *device)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long s;
+ unsigned long flags;
+
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
+
+ spin_lock_irqsave(&b->bm_lock, flags);
+ s = b->bm_set;
+ spin_unlock_irqrestore(&b->bm_lock, flags);
+
+ return s;
+}
+
+unsigned long drbd_bm_total_weight(struct drbd_device *device)
+{
+ unsigned long s;
+ /* if I don't have a disk, I don't know about out-of-sync status */
+ if (!get_ldev_if_state(device, D_NEGOTIATING))
+ return 0;
+ s = _drbd_bm_total_weight(device);
+ put_ldev(device);
+ return s;
+}
+
+size_t drbd_bm_words(struct drbd_device *device)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
+
+ return b->bm_words;
+}
+
+unsigned long drbd_bm_bits(struct drbd_device *device)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ if (!expect(b))
+ return 0;
+
+ return b->bm_bits;
+}
+
+/* merge number words from buffer into the bitmap starting at offset.
+ * buffer[i] is expected to be little endian unsigned long.
+ * bitmap must be locked by drbd_bm_lock.
+ * currently only used from receive_bitmap.
+ */
+void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, size_t number,
+ unsigned long *buffer)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long *p_addr, *bm;
+ unsigned long word, bits;
+ unsigned int idx;
+ size_t end, do_now;
+
+ end = offset + number;
+
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
+ if (number == 0)
+ return;
+ WARN_ON(offset >= b->bm_words);
+ WARN_ON(end > b->bm_words);
+
+ spin_lock_irq(&b->bm_lock);
+ while (offset < end) {
+ do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
+ idx = bm_word_to_page_idx(b, offset);
+ p_addr = bm_map_pidx(b, idx);
+ bm = p_addr + MLPP(offset);
+ offset += do_now;
+ while (do_now--) {
+ bits = hweight_long(*bm);
+ word = *bm | *buffer++;
+ *bm++ = word;
+ b->bm_set += hweight_long(word) - bits;
+ }
+ bm_unmap(p_addr);
+ bm_set_page_need_writeout(b->bm_pages[idx]);
+ }
+ /* with 32bit <-> 64bit cross-platform connect
+ * this is only correct for current usage,
+ * where we _know_ that we are 64 bit aligned,
+ * and know that this function is used in this way, too...
+ */
+ if (end == b->bm_words)
+ b->bm_set -= bm_clear_surplus(b);
+ spin_unlock_irq(&b->bm_lock);
+}
+
+/* copy number words from the bitmap starting at offset into the buffer.
+ * buffer[i] will be little endian unsigned long.
+ */
+void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number,
+ unsigned long *buffer)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long *p_addr, *bm;
+ size_t end, do_now;
+
+ end = offset + number;
+
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
+
+ spin_lock_irq(&b->bm_lock);
+ if ((offset >= b->bm_words) ||
+ (end > b->bm_words) ||
+ (number <= 0))
+ drbd_err(device, "offset=%lu number=%lu bm_words=%lu\n",
+ (unsigned long) offset,
+ (unsigned long) number,
+ (unsigned long) b->bm_words);
+ else {
+ while (offset < end) {
+ do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
+ p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
+ bm = p_addr + MLPP(offset);
+ offset += do_now;
+ while (do_now--)
+ *buffer++ = *bm++;
+ bm_unmap(p_addr);
+ }
+ }
+ spin_unlock_irq(&b->bm_lock);
+}
+
+/* set all bits in the bitmap */
+void drbd_bm_set_all(struct drbd_device *device)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
+
+ spin_lock_irq(&b->bm_lock);
+ bm_memset(b, 0, 0xff, b->bm_words);
+ (void)bm_clear_surplus(b);
+ b->bm_set = b->bm_bits;
+ spin_unlock_irq(&b->bm_lock);
+}
+
+/* clear all bits in the bitmap */
+void drbd_bm_clear_all(struct drbd_device *device)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
+
+ spin_lock_irq(&b->bm_lock);
+ bm_memset(b, 0, 0, b->bm_words);
+ b->bm_set = 0;
+ spin_unlock_irq(&b->bm_lock);
+}
+
+static void drbd_bm_aio_ctx_destroy(struct kref *kref)
+{
+ struct drbd_bm_aio_ctx *ctx = container_of(kref, struct drbd_bm_aio_ctx, kref);
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->device->resource->req_lock, flags);
+ list_del(&ctx->list);
+ spin_unlock_irqrestore(&ctx->device->resource->req_lock, flags);
+ put_ldev(ctx->device);
+ kfree(ctx);
+}
+
+/* bv_page may be a copy, or may be the original */
+static void drbd_bm_endio(struct bio *bio, int error)
+{
+ struct drbd_bm_aio_ctx *ctx = bio->bi_private;
+ struct drbd_device *device = ctx->device;
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
+ int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+
+ /* strange behavior of some lower level drivers...
+ * fail the request by clearing the uptodate flag,
+ * but do not return any error?!
+ * do we want to WARN() on this? */
+ if (!error && !uptodate)
+ error = -EIO;
+
+ if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
+ !bm_test_page_unchanged(b->bm_pages[idx]))
+ drbd_warn(device, "bitmap page idx %u changed during IO!\n", idx);
+
+ if (error) {
+ /* ctx error will hold the completed-last non-zero error code,
+ * in case error codes differ. */
+ ctx->error = error;
+ bm_set_page_io_err(b->bm_pages[idx]);
+ /* Not identical to on disk version of it.
+ * Is BM_PAGE_IO_ERROR enough? */
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "IO ERROR %d on bitmap page idx %u\n",
+ error, idx);
+ } else {
+ bm_clear_page_io_err(b->bm_pages[idx]);
+ dynamic_drbd_dbg(device, "bitmap page idx %u completed\n", idx);
+ }
+
+ bm_page_unlock_io(device, idx);
+
+ if (ctx->flags & BM_AIO_COPY_PAGES)
+ mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool);
+
+ bio_put(bio);
+
+ if (atomic_dec_and_test(&ctx->in_flight)) {
+ ctx->done = 1;
+ wake_up(&device->misc_wait);
+ kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
+ }
+}
+
+static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
+{
+ struct bio *bio = bio_alloc_drbd(GFP_NOIO);
+ struct drbd_device *device = ctx->device;
+ struct drbd_bitmap *b = device->bitmap;
+ struct page *page;
+ unsigned int len;
+ unsigned int rw = (ctx->flags & BM_AIO_READ) ? READ : WRITE;
+
+ sector_t on_disk_sector =
+ device->ldev->md.md_offset + device->ldev->md.bm_offset;
+ on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9);
+
+ /* this might happen with very small
+ * flexible external meta data device,
+ * or with PAGE_SIZE > 4k */
+ len = min_t(unsigned int, PAGE_SIZE,
+ (drbd_md_last_sector(device->ldev) - on_disk_sector + 1)<<9);
+
+ /* serialize IO on this page */
+ bm_page_lock_io(device, page_nr);
+ /* before memcpy and submit,
+ * so it can be redirtied any time */
+ bm_set_page_unchanged(b->bm_pages[page_nr]);
+
+ if (ctx->flags & BM_AIO_COPY_PAGES) {
+ page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
+ copy_highpage(page, b->bm_pages[page_nr]);
+ bm_store_page_idx(page, page_nr);
+ } else
+ page = b->bm_pages[page_nr];
+ bio->bi_bdev = device->ldev->md_bdev;
+ bio->bi_iter.bi_sector = on_disk_sector;
+ /* bio_add_page of a single page to an empty bio will always succeed,
+ * according to api. Do we want to assert that? */
+ bio_add_page(bio, page, len, 0);
+ bio->bi_private = ctx;
+ bio->bi_end_io = drbd_bm_endio;
+
+ if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
+ bio->bi_rw |= rw;
+ bio_endio(bio, -EIO);
+ } else {
+ submit_bio(rw, bio);
+ /* this should not count as user activity and cause the
+ * resync to throttle -- see drbd_rs_should_slow_down(). */
+ atomic_add(len >> 9, &device->rs_sect_ev);
+ }
+}
+
+/*
+ * bm_rw: read/write the whole bitmap from/to its on disk location.
+ */
+static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
+{
+ struct drbd_bm_aio_ctx *ctx;
+ struct drbd_bitmap *b = device->bitmap;
+ int num_pages, i, count = 0;
+ unsigned long now;
+ char ppb[10];
+ int err = 0;
+
+ /*
+ * We are protected against bitmap disappearing/resizing by holding an
+ * ldev reference (caller must have called get_ldev()).
+ * For read/write, we are protected against changes to the bitmap by
+ * the bitmap lock (see drbd_bitmap_io).
+ * For lazy writeout, we don't care for ongoing changes to the bitmap,
+ * as we submit copies of pages anyways.
+ */
+
+ ctx = kmalloc(sizeof(struct drbd_bm_aio_ctx), GFP_NOIO);
+ if (!ctx)
+ return -ENOMEM;
+
+ *ctx = (struct drbd_bm_aio_ctx) {
+ .device = device,
+ .start_jif = jiffies,
+ .in_flight = ATOMIC_INIT(1),
+ .done = 0,
+ .flags = flags,
+ .error = 0,
+ .kref = { ATOMIC_INIT(2) },
+ };
+
+ if (!get_ldev_if_state(device, D_ATTACHING)) { /* put is in drbd_bm_aio_ctx_destroy() */
+ drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
+ kfree(ctx);
+ return -ENODEV;
+ }
+ /* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from
+ drbd_adm_attach(), after device->ldev was assigned. */
+
+ if (0 == (ctx->flags & ~BM_AIO_READ))
+ WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
+
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&ctx->list, &device->pending_bitmap_io);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ num_pages = b->bm_number_of_pages;
+
+ now = jiffies;
+
+ /* let the layers below us try to merge these bios... */
+ for (i = 0; i < num_pages; i++) {
+ /* ignore completely unchanged pages */
+ if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
+ break;
+ if (!(flags & BM_AIO_READ)) {
+ if ((flags & BM_AIO_WRITE_HINTED) &&
+ !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
+ &page_private(b->bm_pages[i])))
+ continue;
+
+ if (!(flags & BM_AIO_WRITE_ALL_PAGES) &&
+ bm_test_page_unchanged(b->bm_pages[i])) {
+ dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
+ continue;
+ }
+ /* during lazy writeout,
+ * ignore those pages not marked for lazy writeout. */
+ if (lazy_writeout_upper_idx &&
+ !bm_test_page_lazy_writeout(b->bm_pages[i])) {
+ dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i);
+ continue;
+ }
+ }
+ atomic_inc(&ctx->in_flight);
+ bm_page_io_async(ctx, i);
+ ++count;
+ cond_resched();
+ }
+
+ /*
+ * We initialize ctx->in_flight to one to make sure drbd_bm_endio
+ * will not set ctx->done early, and decrement / test it here. If there
+ * are still some bios in flight, we need to wait for them here.
+ * If all IO is done already (or nothing had been submitted), there is
+ * no need to wait. Still, we need to put the kref associated with the
+ * "in_flight reached zero, all done" event.
+ */
+ if (!atomic_dec_and_test(&ctx->in_flight))
+ wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
+ else
+ kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
+
+ /* summary for global bitmap IO */
+ if (flags == 0)
+ drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n",
+ (flags & BM_AIO_READ) ? "READ" : "WRITE",
+ count, jiffies - now);
+
+ if (ctx->error) {
+ drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n");
+ drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+ err = -EIO; /* ctx->error ? */
+ }
+
+ if (atomic_read(&ctx->in_flight))
+ err = -EIO; /* Disk timeout/force-detach during IO... */
+
+ now = jiffies;
+ if (flags & BM_AIO_READ) {
+ b->bm_set = bm_count_bits(b);
+ drbd_info(device, "recounting of set bits took additional %lu jiffies\n",
+ jiffies - now);
+ }
+ now = b->bm_set;
+
+ if ((flags & ~BM_AIO_READ) == 0)
+ drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
+ ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
+
+ kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
+ return err;
+}
+
+/**
+ * drbd_bm_read() - Read the whole bitmap from its on disk location.
+ * @device: DRBD device.
+ */
+int drbd_bm_read(struct drbd_device *device) __must_hold(local)
+{
+ return bm_rw(device, BM_AIO_READ, 0);
+}
+
+/**
+ * drbd_bm_write() - Write the whole bitmap to its on disk location.
+ * @device: DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
+ */
+int drbd_bm_write(struct drbd_device *device) __must_hold(local)
+{
+ return bm_rw(device, 0, 0);
+}
+
+/**
+ * drbd_bm_write_all() - Write the whole bitmap to its on disk location.
+ * @device: DRBD device.
+ *
+ * Will write all pages.
+ */
+int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
+{
+ return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0);
+}
+
+/**
+ * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
+ * @device: DRBD device.
+ * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages
+ */
+int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local)
+{
+ return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx);
+}
+
+/**
+ * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
+ * @device: DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
+ * In contrast to drbd_bm_write(), this will copy the bitmap pages
+ * to temporary writeout pages. It is intended to trigger a full write-out
+ * while still allowing the bitmap to change, for example if a resync or online
+ * verify is aborted due to a failed peer disk, while local IO continues, or
+ * pending resync acks are still being processed.
+ */
+int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
+{
+ return bm_rw(device, BM_AIO_COPY_PAGES, 0);
+}
+
+/**
+ * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed.
+ * @device: DRBD device.
+ */
+int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local)
+{
+ return bm_rw(device, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
+}
+
+/* NOTE
+ * find_first_bit returns int, we return unsigned long.
+ * For this to work on 32bit arch with bitnumbers > (1<<32),
+ * we'd need to return u64, and get a whole lot of other places
+ * fixed where we still use unsigned long.
+ *
+ * this returns a bit number, NOT a sector!
+ */
+static unsigned long __bm_find_next(struct drbd_device *device, unsigned long bm_fo,
+ const int find_zero_bit)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long *p_addr;
+ unsigned long bit_offset;
+ unsigned i;
+
+
+ if (bm_fo > b->bm_bits) {
+ drbd_err(device, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
+ bm_fo = DRBD_END_OF_BITMAP;
+ } else {
+ while (bm_fo < b->bm_bits) {
+ /* bit offset of the first bit in the page */
+ bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
+ p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo));
+
+ if (find_zero_bit)
+ i = find_next_zero_bit_le(p_addr,
+ PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
+ else
+ i = find_next_bit_le(p_addr,
+ PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
+
+ __bm_unmap(p_addr);
+ if (i < PAGE_SIZE*8) {
+ bm_fo = bit_offset + i;
+ if (bm_fo >= b->bm_bits)
+ break;
+ goto found;
+ }
+ bm_fo = bit_offset + PAGE_SIZE*8;
+ }
+ bm_fo = DRBD_END_OF_BITMAP;
+ }
+ found:
+ return bm_fo;
+}
+
+static unsigned long bm_find_next(struct drbd_device *device,
+ unsigned long bm_fo, const int find_zero_bit)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long i = DRBD_END_OF_BITMAP;
+
+ if (!expect(b))
+ return i;
+ if (!expect(b->bm_pages))
+ return i;
+
+ spin_lock_irq(&b->bm_lock);
+ if (BM_DONT_TEST & b->bm_flags)
+ bm_print_lock_info(device);
+
+ i = __bm_find_next(device, bm_fo, find_zero_bit);
+
+ spin_unlock_irq(&b->bm_lock);
+ return i;
+}
+
+unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo)
+{
+ return bm_find_next(device, bm_fo, 0);
+}
+
+#if 0
+/* not yet needed for anything. */
+unsigned long drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo)
+{
+ return bm_find_next(device, bm_fo, 1);
+}
+#endif
+
+/* does not spin_lock_irqsave.
+ * you must take drbd_bm_lock() first */
+unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo)
+{
+ /* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */
+ return __bm_find_next(device, bm_fo, 0);
+}
+
+unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo)
+{
+ /* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */
+ return __bm_find_next(device, bm_fo, 1);
+}
+
+/* returns number of bits actually changed.
+ * for val != 0, we change 0 -> 1, return code positive
+ * for val == 0, we change 1 -> 0, return code negative
+ * wants bitnr, not sector.
+ * expected to be called for only a few bits (e - s about BITS_PER_LONG).
+ * Must hold bitmap lock already. */
+static int __bm_change_bits_to(struct drbd_device *device, const unsigned long s,
+ unsigned long e, int val)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long *p_addr = NULL;
+ unsigned long bitnr;
+ unsigned int last_page_nr = -1U;
+ int c = 0;
+ int changed_total = 0;
+
+ if (e >= b->bm_bits) {
+ drbd_err(device, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
+ s, e, b->bm_bits);
+ e = b->bm_bits ? b->bm_bits -1 : 0;
+ }
+ for (bitnr = s; bitnr <= e; bitnr++) {
+ unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
+ if (page_nr != last_page_nr) {
+ if (p_addr)
+ __bm_unmap(p_addr);
+ if (c < 0)
+ bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+ else if (c > 0)
+ bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+ changed_total += c;
+ c = 0;
+ p_addr = __bm_map_pidx(b, page_nr);
+ last_page_nr = page_nr;
+ }
+ if (val)
+ c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
+ else
+ c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
+ }
+ if (p_addr)
+ __bm_unmap(p_addr);
+ if (c < 0)
+ bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+ else if (c > 0)
+ bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+ changed_total += c;
+ b->bm_set += changed_total;
+ return changed_total;
+}
+
+/* returns number of bits actually changed.
+ * for val != 0, we change 0 -> 1, return code positive
+ * for val == 0, we change 1 -> 0, return code negative
+ * wants bitnr, not sector */
+static int bm_change_bits_to(struct drbd_device *device, const unsigned long s,
+ const unsigned long e, int val)
+{
+ unsigned long flags;
+ struct drbd_bitmap *b = device->bitmap;
+ int c = 0;
+
+ if (!expect(b))
+ return 1;
+ if (!expect(b->bm_pages))
+ return 0;
+
+ spin_lock_irqsave(&b->bm_lock, flags);
+ if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
+ bm_print_lock_info(device);
+
+ c = __bm_change_bits_to(device, s, e, val);
+
+ spin_unlock_irqrestore(&b->bm_lock, flags);
+ return c;
+}
+
+/* returns number of bits changed 0 -> 1 */
+int drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
+{
+ return bm_change_bits_to(device, s, e, 1);
+}
+
+/* returns number of bits changed 1 -> 0 */
+int drbd_bm_clear_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
+{
+ return -bm_change_bits_to(device, s, e, 0);
+}
+
+/* sets all bits in full words,
+ * from first_word up to, but not including, last_word */
+static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
+ int page_nr, int first_word, int last_word)
+{
+ int i;
+ int bits;
+ int changed = 0;
+ unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
+ for (i = first_word; i < last_word; i++) {
+ bits = hweight_long(paddr[i]);
+ paddr[i] = ~0UL;
+ changed += BITS_PER_LONG - bits;
+ }
+ kunmap_atomic(paddr);
+ if (changed) {
+ /* We only need lazy writeout, the information is still in the
+ * remote bitmap as well, and is reconstructed during the next
+ * bitmap exchange, if lost locally due to a crash. */
+ bm_set_page_lazy_writeout(b->bm_pages[page_nr]);
+ b->bm_set += changed;
+ }
+}
+
+/* Same thing as drbd_bm_set_bits,
+ * but more efficient for a large bit range.
+ * You must first drbd_bm_lock().
+ * Can be called to set the whole bitmap in one go.
+ * Sets bits from s to e _inclusive_. */
+void _drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
+{
+ /* First set_bit from the first bit (s)
+ * up to the next long boundary (sl),
+ * then assign full words up to the last long boundary (el),
+ * then set_bit up to and including the last bit (e).
+ *
+ * Do not use memset, because we must account for changes,
+ * so we need to loop over the words with hweight() anyways.
+ */
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long sl = ALIGN(s,BITS_PER_LONG);
+ unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
+ int first_page;
+ int last_page;
+ int page_nr;
+ int first_word;
+ int last_word;
+
+ if (e - s <= 3*BITS_PER_LONG) {
+ /* don't bother; el and sl may even be wrong. */
+ spin_lock_irq(&b->bm_lock);
+ __bm_change_bits_to(device, s, e, 1);
+ spin_unlock_irq(&b->bm_lock);
+ return;
+ }
+
+ /* difference is large enough that we can trust sl and el */
+
+ spin_lock_irq(&b->bm_lock);
+
+ /* bits filling the current long */
+ if (sl)
+ __bm_change_bits_to(device, s, sl-1, 1);
+
+ first_page = sl >> (3 + PAGE_SHIFT);
+ last_page = el >> (3 + PAGE_SHIFT);
+
+ /* MLPP: modulo longs per page */
+ /* LWPP: long words per page */
+ first_word = MLPP(sl >> LN2_BPL);
+ last_word = LWPP;
+
+ /* first and full pages, unless first page == last page */
+ for (page_nr = first_page; page_nr < last_page; page_nr++) {
+ bm_set_full_words_within_one_page(device->bitmap, page_nr, first_word, last_word);
+ spin_unlock_irq(&b->bm_lock);
+ cond_resched();
+ first_word = 0;
+ spin_lock_irq(&b->bm_lock);
+ }
+ /* last page (respectively only page, for first page == last page) */
+ last_word = MLPP(el >> LN2_BPL);
+
+ /* consider bitmap->bm_bits = 32768, bitmap->bm_number_of_pages = 1. (or multiples).
+ * ==> e = 32767, el = 32768, last_page = 2,
+ * and now last_word = 0.
+ * We do not want to touch last_page in this case,
+ * as we did not allocate it, it is not present in bitmap->bm_pages.
+ */
+ if (last_word)
+ bm_set_full_words_within_one_page(device->bitmap, last_page, first_word, last_word);
+
+ /* possibly trailing bits.
+ * example: (e & 63) == 63, el will be e+1.
+ * if that even was the very last bit,
+ * it would trigger an assert in __bm_change_bits_to()
+ */
+ if (el <= e)
+ __bm_change_bits_to(device, el, e, 1);
+ spin_unlock_irq(&b->bm_lock);
+}
+
+/* returns bit state
+ * wants bitnr, NOT sector.
+ * inherently racy... area needs to be locked by means of {al,rs}_lru
+ * 1 ... bit set
+ * 0 ... bit not set
+ * -1 ... first out of bounds access, stop testing for bits!
+ */
+int drbd_bm_test_bit(struct drbd_device *device, const unsigned long bitnr)
+{
+ unsigned long flags;
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long *p_addr;
+ int i;
+
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
+
+ spin_lock_irqsave(&b->bm_lock, flags);
+ if (BM_DONT_TEST & b->bm_flags)
+ bm_print_lock_info(device);
+ if (bitnr < b->bm_bits) {
+ p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
+ i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0;
+ bm_unmap(p_addr);
+ } else if (bitnr == b->bm_bits) {
+ i = -1;
+ } else { /* (bitnr > b->bm_bits) */
+ drbd_err(device, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
+ i = 0;
+ }
+
+ spin_unlock_irqrestore(&b->bm_lock, flags);
+ return i;
+}
+
+/* returns number of bits set in the range [s, e] */
+int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
+{
+ unsigned long flags;
+ struct drbd_bitmap *b = device->bitmap;
+ unsigned long *p_addr = NULL;
+ unsigned long bitnr;
+ unsigned int page_nr = -1U;
+ int c = 0;
+
+ /* If this is called without a bitmap, that is a bug. But just to be
+ * robust in case we screwed up elsewhere, in that case pretend there
+ * was one dirty bit in the requested area, so we won't try to do a
+ * local read there (no bitmap probably implies no disk) */
+ if (!expect(b))
+ return 1;
+ if (!expect(b->bm_pages))
+ return 1;
+
+ spin_lock_irqsave(&b->bm_lock, flags);
+ if (BM_DONT_TEST & b->bm_flags)
+ bm_print_lock_info(device);
+ for (bitnr = s; bitnr <= e; bitnr++) {
+ unsigned int idx = bm_bit_to_page_idx(b, bitnr);
+ if (page_nr != idx) {
+ page_nr = idx;
+ if (p_addr)
+ bm_unmap(p_addr);
+ p_addr = bm_map_pidx(b, idx);
+ }
+ if (expect(bitnr < b->bm_bits))
+ c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
+ else
+ drbd_err(device, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
+ }
+ if (p_addr)
+ bm_unmap(p_addr);
+ spin_unlock_irqrestore(&b->bm_lock, flags);
+ return c;
+}
+
+
+/* inherently racy...
+ * return value may be already out-of-date when this function returns.
+ * but the general usage is that this is only use during a cstate when bits are
+ * only cleared, not set, and typically only care for the case when the return
+ * value is zero, or we already "locked" this "bitmap extent" by other means.
+ *
+ * enr is bm-extent number, since we chose to name one sector (512 bytes)
+ * worth of the bitmap a "bitmap extent".
+ *
+ * TODO
+ * I think since we use it like a reference count, we should use the real
+ * reference count of some bitmap extent element from some lru instead...
+ *
+ */
+int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr)
+{
+ struct drbd_bitmap *b = device->bitmap;
+ int count, s, e;
+ unsigned long flags;
+ unsigned long *p_addr, *bm;
+
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
+
+ spin_lock_irqsave(&b->bm_lock, flags);
+ if (BM_DONT_TEST & b->bm_flags)
+ bm_print_lock_info(device);
+
+ s = S2W(enr);
+ e = min((size_t)S2W(enr+1), b->bm_words);
+ count = 0;
+ if (s < b->bm_words) {
+ int n = e-s;
+ p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
+ bm = p_addr + MLPP(s);
+ while (n--)
+ count += hweight_long(*bm++);
+ bm_unmap(p_addr);
+ } else {
+ drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s);
+ }
+ spin_unlock_irqrestore(&b->bm_lock, flags);
+ return count;
+}
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
new file mode 100644
index 000000000..a6ee3d750
--- /dev/null
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -0,0 +1,958 @@
+#define pr_fmt(fmt) "drbd debugfs: " fmt
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/stat.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+
+#include "drbd_int.h"
+#include "drbd_req.h"
+#include "drbd_debugfs.h"
+
+
+/**********************************************************************
+ * Whenever you change the file format, remember to bump the version. *
+ **********************************************************************/
+
+static struct dentry *drbd_debugfs_root;
+static struct dentry *drbd_debugfs_version;
+static struct dentry *drbd_debugfs_resources;
+static struct dentry *drbd_debugfs_minors;
+
+static void seq_print_age_or_dash(struct seq_file *m, bool valid, unsigned long dt)
+{
+ if (valid)
+ seq_printf(m, "\t%d", jiffies_to_msecs(dt));
+ else
+ seq_printf(m, "\t-");
+}
+
+static void __seq_print_rq_state_bit(struct seq_file *m,
+ bool is_set, char *sep, const char *set_name, const char *unset_name)
+{
+ if (is_set && set_name) {
+ seq_putc(m, *sep);
+ seq_puts(m, set_name);
+ *sep = '|';
+ } else if (!is_set && unset_name) {
+ seq_putc(m, *sep);
+ seq_puts(m, unset_name);
+ *sep = '|';
+ }
+}
+
+static void seq_print_rq_state_bit(struct seq_file *m,
+ bool is_set, char *sep, const char *set_name)
+{
+ __seq_print_rq_state_bit(m, is_set, sep, set_name, NULL);
+}
+
+/* pretty print enum drbd_req_state_bits req->rq_state */
+static void seq_print_request_state(struct seq_file *m, struct drbd_request *req)
+{
+ unsigned int s = req->rq_state;
+ char sep = ' ';
+ seq_printf(m, "\t0x%08x", s);
+ seq_printf(m, "\tmaster: %s", req->master_bio ? "pending" : "completed");
+
+ /* RQ_WRITE ignored, already reported */
+ seq_puts(m, "\tlocal:");
+ seq_print_rq_state_bit(m, s & RQ_IN_ACT_LOG, &sep, "in-AL");
+ seq_print_rq_state_bit(m, s & RQ_POSTPONED, &sep, "postponed");
+ seq_print_rq_state_bit(m, s & RQ_COMPLETION_SUSP, &sep, "suspended");
+ sep = ' ';
+ seq_print_rq_state_bit(m, s & RQ_LOCAL_PENDING, &sep, "pending");
+ seq_print_rq_state_bit(m, s & RQ_LOCAL_COMPLETED, &sep, "completed");
+ seq_print_rq_state_bit(m, s & RQ_LOCAL_ABORTED, &sep, "aborted");
+ seq_print_rq_state_bit(m, s & RQ_LOCAL_OK, &sep, "ok");
+ if (sep == ' ')
+ seq_puts(m, " -");
+
+ /* for_each_connection ... */
+ seq_printf(m, "\tnet:");
+ sep = ' ';
+ seq_print_rq_state_bit(m, s & RQ_NET_PENDING, &sep, "pending");
+ seq_print_rq_state_bit(m, s & RQ_NET_QUEUED, &sep, "queued");
+ seq_print_rq_state_bit(m, s & RQ_NET_SENT, &sep, "sent");
+ seq_print_rq_state_bit(m, s & RQ_NET_DONE, &sep, "done");
+ seq_print_rq_state_bit(m, s & RQ_NET_SIS, &sep, "sis");
+ seq_print_rq_state_bit(m, s & RQ_NET_OK, &sep, "ok");
+ if (sep == ' ')
+ seq_puts(m, " -");
+
+ seq_printf(m, " :");
+ sep = ' ';
+ seq_print_rq_state_bit(m, s & RQ_EXP_RECEIVE_ACK, &sep, "B");
+ seq_print_rq_state_bit(m, s & RQ_EXP_WRITE_ACK, &sep, "C");
+ seq_print_rq_state_bit(m, s & RQ_EXP_BARR_ACK, &sep, "barr");
+ if (sep == ' ')
+ seq_puts(m, " -");
+ seq_printf(m, "\n");
+}
+
+static void seq_print_one_request(struct seq_file *m, struct drbd_request *req, unsigned long now)
+{
+ /* change anything here, fixup header below! */
+ unsigned int s = req->rq_state;
+
+#define RQ_HDR_1 "epoch\tsector\tsize\trw"
+ seq_printf(m, "0x%x\t%llu\t%u\t%s",
+ req->epoch,
+ (unsigned long long)req->i.sector, req->i.size >> 9,
+ (s & RQ_WRITE) ? "W" : "R");
+
+#define RQ_HDR_2 "\tstart\tin AL\tsubmit"
+ seq_printf(m, "\t%d", jiffies_to_msecs(now - req->start_jif));
+ seq_print_age_or_dash(m, s & RQ_IN_ACT_LOG, now - req->in_actlog_jif);
+ seq_print_age_or_dash(m, s & RQ_LOCAL_PENDING, now - req->pre_submit_jif);
+
+#define RQ_HDR_3 "\tsent\tacked\tdone"
+ seq_print_age_or_dash(m, s & RQ_NET_SENT, now - req->pre_send_jif);
+ seq_print_age_or_dash(m, (s & RQ_NET_SENT) && !(s & RQ_NET_PENDING), now - req->acked_jif);
+ seq_print_age_or_dash(m, s & RQ_NET_DONE, now - req->net_done_jif);
+
+#define RQ_HDR_4 "\tstate\n"
+ seq_print_request_state(m, req);
+}
+#define RQ_HDR RQ_HDR_1 RQ_HDR_2 RQ_HDR_3 RQ_HDR_4
+
+static void seq_print_minor_vnr_req(struct seq_file *m, struct drbd_request *req, unsigned long now)
+{
+ seq_printf(m, "%u\t%u\t", req->device->minor, req->device->vnr);
+ seq_print_one_request(m, req, now);
+}
+
+static void seq_print_resource_pending_meta_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+ struct drbd_device *device;
+ unsigned int i;
+
+ seq_puts(m, "minor\tvnr\tstart\tsubmit\tintent\n");
+ rcu_read_lock();
+ idr_for_each_entry(&resource->devices, device, i) {
+ struct drbd_md_io tmp;
+ /* In theory this is racy,
+ * in the sense that there could have been a
+ * drbd_md_put_buffer(); drbd_md_get_buffer();
+ * between accessing these members here. */
+ tmp = device->md_io;
+ if (atomic_read(&tmp.in_use)) {
+ seq_printf(m, "%u\t%u\t%d\t",
+ device->minor, device->vnr,
+ jiffies_to_msecs(now - tmp.start_jif));
+ if (time_before(tmp.submit_jif, tmp.start_jif))
+ seq_puts(m, "-\t");
+ else
+ seq_printf(m, "%d\t", jiffies_to_msecs(now - tmp.submit_jif));
+ seq_printf(m, "%s\n", tmp.current_use);
+ }
+ }
+ rcu_read_unlock();
+}
+
+static void seq_print_waiting_for_AL(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+ struct drbd_device *device;
+ unsigned int i;
+
+ seq_puts(m, "minor\tvnr\tage\t#waiting\n");
+ rcu_read_lock();
+ idr_for_each_entry(&resource->devices, device, i) {
+ unsigned long jif;
+ struct drbd_request *req;
+ int n = atomic_read(&device->ap_actlog_cnt);
+ if (n) {
+ spin_lock_irq(&device->resource->req_lock);
+ req = list_first_entry_or_null(&device->pending_master_completion[1],
+ struct drbd_request, req_pending_master_completion);
+ /* if the oldest request does not wait for the activity log
+ * it is not interesting for us here */
+ if (req && !(req->rq_state & RQ_IN_ACT_LOG))
+ jif = req->start_jif;
+ else
+ req = NULL;
+ spin_unlock_irq(&device->resource->req_lock);
+ }
+ if (n) {
+ seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
+ if (req)
+ seq_printf(m, "%u\t", jiffies_to_msecs(now - jif));
+ else
+ seq_puts(m, "-\t");
+ seq_printf(m, "%u\n", n);
+ }
+ }
+ rcu_read_unlock();
+}
+
+static void seq_print_device_bitmap_io(struct seq_file *m, struct drbd_device *device, unsigned long now)
+{
+ struct drbd_bm_aio_ctx *ctx;
+ unsigned long start_jif;
+ unsigned int in_flight;
+ unsigned int flags;
+ spin_lock_irq(&device->resource->req_lock);
+ ctx = list_first_entry_or_null(&device->pending_bitmap_io, struct drbd_bm_aio_ctx, list);
+ if (ctx && ctx->done)
+ ctx = NULL;
+ if (ctx) {
+ start_jif = ctx->start_jif;
+ in_flight = atomic_read(&ctx->in_flight);
+ flags = ctx->flags;
+ }
+ spin_unlock_irq(&device->resource->req_lock);
+ if (ctx) {
+ seq_printf(m, "%u\t%u\t%c\t%u\t%u\n",
+ device->minor, device->vnr,
+ (flags & BM_AIO_READ) ? 'R' : 'W',
+ jiffies_to_msecs(now - start_jif),
+ in_flight);
+ }
+}
+
+static void seq_print_resource_pending_bitmap_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+ struct drbd_device *device;
+ unsigned int i;
+
+ seq_puts(m, "minor\tvnr\trw\tage\t#in-flight\n");
+ rcu_read_lock();
+ idr_for_each_entry(&resource->devices, device, i) {
+ seq_print_device_bitmap_io(m, device, now);
+ }
+ rcu_read_unlock();
+}
+
+/* pretty print enum peer_req->flags */
+static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_request *peer_req)
+{
+ unsigned long f = peer_req->flags;
+ char sep = ' ';
+
+ __seq_print_rq_state_bit(m, f & EE_SUBMITTED, &sep, "submitted", "preparing");
+ __seq_print_rq_state_bit(m, f & EE_APPLICATION, &sep, "application", "internal");
+ seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL");
+ seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
+ seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
+
+ if (f & EE_IS_TRIM) {
+ seq_putc(m, sep);
+ sep = '|';
+ if (f & EE_IS_TRIM_USE_ZEROOUT)
+ seq_puts(m, "zero-out");
+ else
+ seq_puts(m, "trim");
+ }
+ seq_putc(m, '\n');
+}
+
+static void seq_print_peer_request(struct seq_file *m,
+ struct drbd_device *device, struct list_head *lh,
+ unsigned long now)
+{
+ bool reported_preparing = false;
+ struct drbd_peer_request *peer_req;
+ list_for_each_entry(peer_req, lh, w.list) {
+ if (reported_preparing && !(peer_req->flags & EE_SUBMITTED))
+ continue;
+
+ if (device)
+ seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
+
+ seq_printf(m, "%llu\t%u\t%c\t%u\t",
+ (unsigned long long)peer_req->i.sector, peer_req->i.size >> 9,
+ (peer_req->flags & EE_WRITE) ? 'W' : 'R',
+ jiffies_to_msecs(now - peer_req->submit_jif));
+ seq_print_peer_request_flags(m, peer_req);
+ if (peer_req->flags & EE_SUBMITTED)
+ break;
+ else
+ reported_preparing = true;
+ }
+}
+
+static void seq_print_device_peer_requests(struct seq_file *m,
+ struct drbd_device *device, unsigned long now)
+{
+ seq_puts(m, "minor\tvnr\tsector\tsize\trw\tage\tflags\n");
+ spin_lock_irq(&device->resource->req_lock);
+ seq_print_peer_request(m, device, &device->active_ee, now);
+ seq_print_peer_request(m, device, &device->read_ee, now);
+ seq_print_peer_request(m, device, &device->sync_ee, now);
+ spin_unlock_irq(&device->resource->req_lock);
+ if (test_bit(FLUSH_PENDING, &device->flags)) {
+ seq_printf(m, "%u\t%u\t-\t-\tF\t%u\tflush\n",
+ device->minor, device->vnr,
+ jiffies_to_msecs(now - device->flush_jif));
+ }
+}
+
+static void seq_print_resource_pending_peer_requests(struct seq_file *m,
+ struct drbd_resource *resource, unsigned long now)
+{
+ struct drbd_device *device;
+ unsigned int i;
+
+ rcu_read_lock();
+ idr_for_each_entry(&resource->devices, device, i) {
+ seq_print_device_peer_requests(m, device, now);
+ }
+ rcu_read_unlock();
+}
+
+static void seq_print_resource_transfer_log_summary(struct seq_file *m,
+ struct drbd_resource *resource,
+ struct drbd_connection *connection,
+ unsigned long now)
+{
+ struct drbd_request *req;
+ unsigned int count = 0;
+ unsigned int show_state = 0;
+
+ seq_puts(m, "n\tdevice\tvnr\t" RQ_HDR);
+ spin_lock_irq(&resource->req_lock);
+ list_for_each_entry(req, &connection->transfer_log, tl_requests) {
+ unsigned int tmp = 0;
+ unsigned int s;
+ ++count;
+
+ /* don't disable irq "forever" */
+ if (!(count & 0x1ff)) {
+ struct drbd_request *req_next;
+ kref_get(&req->kref);
+ spin_unlock_irq(&resource->req_lock);
+ cond_resched();
+ spin_lock_irq(&resource->req_lock);
+ req_next = list_next_entry(req, tl_requests);
+ if (kref_put(&req->kref, drbd_req_destroy))
+ req = req_next;
+ if (&req->tl_requests == &connection->transfer_log)
+ break;
+ }
+
+ s = req->rq_state;
+
+ /* This is meant to summarize timing issues, to be able to tell
+ * local disk problems from network problems.
+ * Skip requests, if we have shown an even older request with
+ * similar aspects already. */
+ if (req->master_bio == NULL)
+ tmp |= 1;
+ if ((s & RQ_LOCAL_MASK) && (s & RQ_LOCAL_PENDING))
+ tmp |= 2;
+ if (s & RQ_NET_MASK) {
+ if (!(s & RQ_NET_SENT))
+ tmp |= 4;
+ if (s & RQ_NET_PENDING)
+ tmp |= 8;
+ if (!(s & RQ_NET_DONE))
+ tmp |= 16;
+ }
+ if ((tmp & show_state) == tmp)
+ continue;
+ show_state |= tmp;
+ seq_printf(m, "%u\t", count);
+ seq_print_minor_vnr_req(m, req, now);
+ if (show_state == 0x1f)
+ break;
+ }
+ spin_unlock_irq(&resource->req_lock);
+}
+
+/* TODO: transfer_log and friends should be moved to resource */
+static int in_flight_summary_show(struct seq_file *m, void *pos)
+{
+ struct drbd_resource *resource = m->private;
+ struct drbd_connection *connection;
+ unsigned long jif = jiffies;
+
+ connection = first_connection(resource);
+ /* This does not happen, actually.
+ * But be robust and prepare for future code changes. */
+ if (!connection || !kref_get_unless_zero(&connection->kref))
+ return -ESTALE;
+
+ /* BUMP me if you change the file format/content/presentation */
+ seq_printf(m, "v: %u\n\n", 0);
+
+ seq_puts(m, "oldest bitmap IO\n");
+ seq_print_resource_pending_bitmap_io(m, resource, jif);
+ seq_putc(m, '\n');
+
+ seq_puts(m, "meta data IO\n");
+ seq_print_resource_pending_meta_io(m, resource, jif);
+ seq_putc(m, '\n');
+
+ seq_puts(m, "socket buffer stats\n");
+ /* for each connection ... once we have more than one */
+ rcu_read_lock();
+ if (connection->data.socket) {
+ /* open coded SIOCINQ, the "relevant" part */
+ struct tcp_sock *tp = tcp_sk(connection->data.socket->sk);
+ int answ = tp->rcv_nxt - tp->copied_seq;
+ seq_printf(m, "unread receive buffer: %u Byte\n", answ);
+ /* open coded SIOCOUTQ, the "relevant" part */
+ answ = tp->write_seq - tp->snd_una;
+ seq_printf(m, "unacked send buffer: %u Byte\n", answ);
+ }
+ rcu_read_unlock();
+ seq_putc(m, '\n');
+
+ seq_puts(m, "oldest peer requests\n");
+ seq_print_resource_pending_peer_requests(m, resource, jif);
+ seq_putc(m, '\n');
+
+ seq_puts(m, "application requests waiting for activity log\n");
+ seq_print_waiting_for_AL(m, resource, jif);
+ seq_putc(m, '\n');
+
+ seq_puts(m, "oldest application requests\n");
+ seq_print_resource_transfer_log_summary(m, resource, connection, jif);
+ seq_putc(m, '\n');
+
+ jif = jiffies - jif;
+ if (jif)
+ seq_printf(m, "generated in %d ms\n", jiffies_to_msecs(jif));
+ kref_put(&connection->kref, drbd_destroy_connection);
+ return 0;
+}
+
+/* simple_positive(file->f_path.dentry) respectively debugfs_positive(),
+ * but neither is "reachable" from here.
+ * So we have our own inline version of it above. :-( */
+static inline int debugfs_positive(struct dentry *dentry)
+{
+ return d_really_is_positive(dentry) && !d_unhashed(dentry);
+}
+
+/* make sure at *open* time that the respective object won't go away. */
+static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *),
+ void *data, struct kref *kref,
+ void (*release)(struct kref *))
+{
+ struct dentry *parent;
+ int ret = -ESTALE;
+
+ /* Are we still linked,
+ * or has debugfs_remove() already been called? */
+ parent = file->f_path.dentry->d_parent;
+ /* not sure if this can happen: */
+ if (!parent || d_really_is_negative(parent))
+ goto out;
+ /* serialize with d_delete() */
+ mutex_lock(&d_inode(parent)->i_mutex);
+ /* Make sure the object is still alive */
+ if (debugfs_positive(file->f_path.dentry)
+ && kref_get_unless_zero(kref))
+ ret = 0;
+ mutex_unlock(&d_inode(parent)->i_mutex);
+ if (!ret) {
+ ret = single_open(file, show, data);
+ if (ret)
+ kref_put(kref, release);
+ }
+out:
+ return ret;
+}
+
+static int in_flight_summary_open(struct inode *inode, struct file *file)
+{
+ struct drbd_resource *resource = inode->i_private;
+ return drbd_single_open(file, in_flight_summary_show, resource,
+ &resource->kref, drbd_destroy_resource);
+}
+
+static int in_flight_summary_release(struct inode *inode, struct file *file)
+{
+ struct drbd_resource *resource = inode->i_private;
+ kref_put(&resource->kref, drbd_destroy_resource);
+ return single_release(inode, file);
+}
+
+static const struct file_operations in_flight_summary_fops = {
+ .owner = THIS_MODULE,
+ .open = in_flight_summary_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = in_flight_summary_release,
+};
+
+void drbd_debugfs_resource_add(struct drbd_resource *resource)
+{
+ struct dentry *dentry;
+ if (!drbd_debugfs_resources)
+ return;
+
+ dentry = debugfs_create_dir(resource->name, drbd_debugfs_resources);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ resource->debugfs_res = dentry;
+
+ dentry = debugfs_create_dir("volumes", resource->debugfs_res);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ resource->debugfs_res_volumes = dentry;
+
+ dentry = debugfs_create_dir("connections", resource->debugfs_res);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ resource->debugfs_res_connections = dentry;
+
+ dentry = debugfs_create_file("in_flight_summary", S_IRUSR|S_IRGRP,
+ resource->debugfs_res, resource,
+ &in_flight_summary_fops);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ resource->debugfs_res_in_flight_summary = dentry;
+ return;
+
+fail:
+ drbd_debugfs_resource_cleanup(resource);
+ drbd_err(resource, "failed to create debugfs dentry\n");
+}
+
+static void drbd_debugfs_remove(struct dentry **dp)
+{
+ debugfs_remove(*dp);
+ *dp = NULL;
+}
+
+void drbd_debugfs_resource_cleanup(struct drbd_resource *resource)
+{
+ /* it is ok to call debugfs_remove(NULL) */
+ drbd_debugfs_remove(&resource->debugfs_res_in_flight_summary);
+ drbd_debugfs_remove(&resource->debugfs_res_connections);
+ drbd_debugfs_remove(&resource->debugfs_res_volumes);
+ drbd_debugfs_remove(&resource->debugfs_res);
+}
+
+static void seq_print_one_timing_detail(struct seq_file *m,
+ const struct drbd_thread_timing_details *tdp,
+ unsigned long now)
+{
+ struct drbd_thread_timing_details td;
+ /* No locking...
+ * use temporary assignment to get at consistent data. */
+ do {
+ td = *tdp;
+ } while (td.cb_nr != tdp->cb_nr);
+ if (!td.cb_addr)
+ return;
+ seq_printf(m, "%u\t%d\t%s:%u\t%ps\n",
+ td.cb_nr,
+ jiffies_to_msecs(now - td.start_jif),
+ td.caller_fn, td.line,
+ td.cb_addr);
+}
+
+static void seq_print_timing_details(struct seq_file *m,
+ const char *title,
+ unsigned int cb_nr, struct drbd_thread_timing_details *tdp, unsigned long now)
+{
+ unsigned int start_idx;
+ unsigned int i;
+
+ seq_printf(m, "%s\n", title);
+ /* If not much is going on, this will result in natural ordering.
+ * If it is very busy, we will possibly skip events, or even see wrap
+ * arounds, which could only be avoided with locking.
+ */
+ start_idx = cb_nr % DRBD_THREAD_DETAILS_HIST;
+ for (i = start_idx; i < DRBD_THREAD_DETAILS_HIST; i++)
+ seq_print_one_timing_detail(m, tdp+i, now);
+ for (i = 0; i < start_idx; i++)
+ seq_print_one_timing_detail(m, tdp+i, now);
+}
+
+static int callback_history_show(struct seq_file *m, void *ignored)
+{
+ struct drbd_connection *connection = m->private;
+ unsigned long jif = jiffies;
+
+ /* BUMP me if you change the file format/content/presentation */
+ seq_printf(m, "v: %u\n\n", 0);
+
+ seq_puts(m, "n\tage\tcallsite\tfn\n");
+ seq_print_timing_details(m, "worker", connection->w_cb_nr, connection->w_timing_details, jif);
+ seq_print_timing_details(m, "receiver", connection->r_cb_nr, connection->r_timing_details, jif);
+ return 0;
+}
+
+static int callback_history_open(struct inode *inode, struct file *file)
+{
+ struct drbd_connection *connection = inode->i_private;
+ return drbd_single_open(file, callback_history_show, connection,
+ &connection->kref, drbd_destroy_connection);
+}
+
+static int callback_history_release(struct inode *inode, struct file *file)
+{
+ struct drbd_connection *connection = inode->i_private;
+ kref_put(&connection->kref, drbd_destroy_connection);
+ return single_release(inode, file);
+}
+
+static const struct file_operations connection_callback_history_fops = {
+ .owner = THIS_MODULE,
+ .open = callback_history_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = callback_history_release,
+};
+
+static int connection_oldest_requests_show(struct seq_file *m, void *ignored)
+{
+ struct drbd_connection *connection = m->private;
+ unsigned long now = jiffies;
+ struct drbd_request *r1, *r2;
+
+ /* BUMP me if you change the file format/content/presentation */
+ seq_printf(m, "v: %u\n\n", 0);
+
+ spin_lock_irq(&connection->resource->req_lock);
+ r1 = connection->req_next;
+ if (r1)
+ seq_print_minor_vnr_req(m, r1, now);
+ r2 = connection->req_ack_pending;
+ if (r2 && r2 != r1) {
+ r1 = r2;
+ seq_print_minor_vnr_req(m, r1, now);
+ }
+ r2 = connection->req_not_net_done;
+ if (r2 && r2 != r1)
+ seq_print_minor_vnr_req(m, r2, now);
+ spin_unlock_irq(&connection->resource->req_lock);
+ return 0;
+}
+
+static int connection_oldest_requests_open(struct inode *inode, struct file *file)
+{
+ struct drbd_connection *connection = inode->i_private;
+ return drbd_single_open(file, connection_oldest_requests_show, connection,
+ &connection->kref, drbd_destroy_connection);
+}
+
+static int connection_oldest_requests_release(struct inode *inode, struct file *file)
+{
+ struct drbd_connection *connection = inode->i_private;
+ kref_put(&connection->kref, drbd_destroy_connection);
+ return single_release(inode, file);
+}
+
+static const struct file_operations connection_oldest_requests_fops = {
+ .owner = THIS_MODULE,
+ .open = connection_oldest_requests_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = connection_oldest_requests_release,
+};
+
+void drbd_debugfs_connection_add(struct drbd_connection *connection)
+{
+ struct dentry *conns_dir = connection->resource->debugfs_res_connections;
+ struct dentry *dentry;
+ if (!conns_dir)
+ return;
+
+ /* Once we enable mutliple peers,
+ * these connections will have descriptive names.
+ * For now, it is just the one connection to the (only) "peer". */
+ dentry = debugfs_create_dir("peer", conns_dir);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ connection->debugfs_conn = dentry;
+
+ dentry = debugfs_create_file("callback_history", S_IRUSR|S_IRGRP,
+ connection->debugfs_conn, connection,
+ &connection_callback_history_fops);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ connection->debugfs_conn_callback_history = dentry;
+
+ dentry = debugfs_create_file("oldest_requests", S_IRUSR|S_IRGRP,
+ connection->debugfs_conn, connection,
+ &connection_oldest_requests_fops);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ connection->debugfs_conn_oldest_requests = dentry;
+ return;
+
+fail:
+ drbd_debugfs_connection_cleanup(connection);
+ drbd_err(connection, "failed to create debugfs dentry\n");
+}
+
+void drbd_debugfs_connection_cleanup(struct drbd_connection *connection)
+{
+ drbd_debugfs_remove(&connection->debugfs_conn_callback_history);
+ drbd_debugfs_remove(&connection->debugfs_conn_oldest_requests);
+ drbd_debugfs_remove(&connection->debugfs_conn);
+}
+
+static void resync_dump_detail(struct seq_file *m, struct lc_element *e)
+{
+ struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
+
+ seq_printf(m, "%5d %s %s %s", bme->rs_left,
+ test_bit(BME_NO_WRITES, &bme->flags) ? "NO_WRITES" : "---------",
+ test_bit(BME_LOCKED, &bme->flags) ? "LOCKED" : "------",
+ test_bit(BME_PRIORITY, &bme->flags) ? "PRIORITY" : "--------"
+ );
+}
+
+static int device_resync_extents_show(struct seq_file *m, void *ignored)
+{
+ struct drbd_device *device = m->private;
+
+ /* BUMP me if you change the file format/content/presentation */
+ seq_printf(m, "v: %u\n\n", 0);
+
+ if (get_ldev_if_state(device, D_FAILED)) {
+ lc_seq_printf_stats(m, device->resync);
+ lc_seq_dump_details(m, device->resync, "rs_left flags", resync_dump_detail);
+ put_ldev(device);
+ }
+ return 0;
+}
+
+static int device_act_log_extents_show(struct seq_file *m, void *ignored)
+{
+ struct drbd_device *device = m->private;
+
+ /* BUMP me if you change the file format/content/presentation */
+ seq_printf(m, "v: %u\n\n", 0);
+
+ if (get_ldev_if_state(device, D_FAILED)) {
+ lc_seq_printf_stats(m, device->act_log);
+ lc_seq_dump_details(m, device->act_log, "", NULL);
+ put_ldev(device);
+ }
+ return 0;
+}
+
+static int device_oldest_requests_show(struct seq_file *m, void *ignored)
+{
+ struct drbd_device *device = m->private;
+ struct drbd_resource *resource = device->resource;
+ unsigned long now = jiffies;
+ struct drbd_request *r1, *r2;
+ int i;
+
+ /* BUMP me if you change the file format/content/presentation */
+ seq_printf(m, "v: %u\n\n", 0);
+
+ seq_puts(m, RQ_HDR);
+ spin_lock_irq(&resource->req_lock);
+ /* WRITE, then READ */
+ for (i = 1; i >= 0; --i) {
+ r1 = list_first_entry_or_null(&device->pending_master_completion[i],
+ struct drbd_request, req_pending_master_completion);
+ r2 = list_first_entry_or_null(&device->pending_completion[i],
+ struct drbd_request, req_pending_local);
+ if (r1)
+ seq_print_one_request(m, r1, now);
+ if (r2 && r2 != r1)
+ seq_print_one_request(m, r2, now);
+ }
+ spin_unlock_irq(&resource->req_lock);
+ return 0;
+}
+
+static int device_data_gen_id_show(struct seq_file *m, void *ignored)
+{
+ struct drbd_device *device = m->private;
+ struct drbd_md *md;
+ enum drbd_uuid_index idx;
+
+ if (!get_ldev_if_state(device, D_FAILED))
+ return -ENODEV;
+
+ md = &device->ldev->md;
+ spin_lock_irq(&md->uuid_lock);
+ for (idx = UI_CURRENT; idx <= UI_HISTORY_END; idx++) {
+ seq_printf(m, "0x%016llX\n", md->uuid[idx]);
+ }
+ spin_unlock_irq(&md->uuid_lock);
+ put_ldev(device);
+ return 0;
+}
+
+#define drbd_debugfs_device_attr(name) \
+static int device_ ## name ## _open(struct inode *inode, struct file *file) \
+{ \
+ struct drbd_device *device = inode->i_private; \
+ return drbd_single_open(file, device_ ## name ## _show, device, \
+ &device->kref, drbd_destroy_device); \
+} \
+static int device_ ## name ## _release(struct inode *inode, struct file *file) \
+{ \
+ struct drbd_device *device = inode->i_private; \
+ kref_put(&device->kref, drbd_destroy_device); \
+ return single_release(inode, file); \
+} \
+static const struct file_operations device_ ## name ## _fops = { \
+ .owner = THIS_MODULE, \
+ .open = device_ ## name ## _open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = device_ ## name ## _release, \
+};
+
+drbd_debugfs_device_attr(oldest_requests)
+drbd_debugfs_device_attr(act_log_extents)
+drbd_debugfs_device_attr(resync_extents)
+drbd_debugfs_device_attr(data_gen_id)
+
+void drbd_debugfs_device_add(struct drbd_device *device)
+{
+ struct dentry *vols_dir = device->resource->debugfs_res_volumes;
+ char minor_buf[8]; /* MINORMASK, MINORBITS == 20; */
+ char vnr_buf[8]; /* volume number vnr is even 16 bit only; */
+ char *slink_name = NULL;
+
+ struct dentry *dentry;
+ if (!vols_dir || !drbd_debugfs_minors)
+ return;
+
+ snprintf(vnr_buf, sizeof(vnr_buf), "%u", device->vnr);
+ dentry = debugfs_create_dir(vnr_buf, vols_dir);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ device->debugfs_vol = dentry;
+
+ snprintf(minor_buf, sizeof(minor_buf), "%u", device->minor);
+ slink_name = kasprintf(GFP_KERNEL, "../resources/%s/volumes/%u",
+ device->resource->name, device->vnr);
+ if (!slink_name)
+ goto fail;
+ dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name);
+ kfree(slink_name);
+ slink_name = NULL;
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ device->debugfs_minor = dentry;
+
+#define DCF(name) do { \
+ dentry = debugfs_create_file(#name, S_IRUSR|S_IRGRP, \
+ device->debugfs_vol, device, \
+ &device_ ## name ## _fops); \
+ if (IS_ERR_OR_NULL(dentry)) \
+ goto fail; \
+ device->debugfs_vol_ ## name = dentry; \
+ } while (0)
+
+ DCF(oldest_requests);
+ DCF(act_log_extents);
+ DCF(resync_extents);
+ DCF(data_gen_id);
+#undef DCF
+ return;
+
+fail:
+ drbd_debugfs_device_cleanup(device);
+ drbd_err(device, "failed to create debugfs entries\n");
+}
+
+void drbd_debugfs_device_cleanup(struct drbd_device *device)
+{
+ drbd_debugfs_remove(&device->debugfs_minor);
+ drbd_debugfs_remove(&device->debugfs_vol_oldest_requests);
+ drbd_debugfs_remove(&device->debugfs_vol_act_log_extents);
+ drbd_debugfs_remove(&device->debugfs_vol_resync_extents);
+ drbd_debugfs_remove(&device->debugfs_vol_data_gen_id);
+ drbd_debugfs_remove(&device->debugfs_vol);
+}
+
+void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device)
+{
+ struct dentry *conn_dir = peer_device->connection->debugfs_conn;
+ struct dentry *dentry;
+ char vnr_buf[8];
+
+ if (!conn_dir)
+ return;
+
+ snprintf(vnr_buf, sizeof(vnr_buf), "%u", peer_device->device->vnr);
+ dentry = debugfs_create_dir(vnr_buf, conn_dir);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ peer_device->debugfs_peer_dev = dentry;
+ return;
+
+fail:
+ drbd_debugfs_peer_device_cleanup(peer_device);
+ drbd_err(peer_device, "failed to create debugfs entries\n");
+}
+
+void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device)
+{
+ drbd_debugfs_remove(&peer_device->debugfs_peer_dev);
+}
+
+static int drbd_version_show(struct seq_file *m, void *ignored)
+{
+ seq_printf(m, "# %s\n", drbd_buildtag());
+ seq_printf(m, "VERSION=%s\n", REL_VERSION);
+ seq_printf(m, "API_VERSION=%u\n", API_VERSION);
+ seq_printf(m, "PRO_VERSION_MIN=%u\n", PRO_VERSION_MIN);
+ seq_printf(m, "PRO_VERSION_MAX=%u\n", PRO_VERSION_MAX);
+ return 0;
+}
+
+static int drbd_version_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, drbd_version_show, NULL);
+}
+
+static struct file_operations drbd_version_fops = {
+ .owner = THIS_MODULE,
+ .open = drbd_version_open,
+ .llseek = seq_lseek,
+ .read = seq_read,
+ .release = single_release,
+};
+
+/* not __exit, may be indirectly called
+ * from the module-load-failure path as well. */
+void drbd_debugfs_cleanup(void)
+{
+ drbd_debugfs_remove(&drbd_debugfs_resources);
+ drbd_debugfs_remove(&drbd_debugfs_minors);
+ drbd_debugfs_remove(&drbd_debugfs_version);
+ drbd_debugfs_remove(&drbd_debugfs_root);
+}
+
+int __init drbd_debugfs_init(void)
+{
+ struct dentry *dentry;
+
+ dentry = debugfs_create_dir("drbd", NULL);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ drbd_debugfs_root = dentry;
+
+ dentry = debugfs_create_file("version", 0444, drbd_debugfs_root, NULL, &drbd_version_fops);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ drbd_debugfs_version = dentry;
+
+ dentry = debugfs_create_dir("resources", drbd_debugfs_root);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ drbd_debugfs_resources = dentry;
+
+ dentry = debugfs_create_dir("minors", drbd_debugfs_root);
+ if (IS_ERR_OR_NULL(dentry))
+ goto fail;
+ drbd_debugfs_minors = dentry;
+ return 0;
+
+fail:
+ drbd_debugfs_cleanup();
+ if (dentry)
+ return PTR_ERR(dentry);
+ else
+ return -EINVAL;
+}
diff --git a/drivers/block/drbd/drbd_debugfs.h b/drivers/block/drbd/drbd_debugfs.h
new file mode 100644
index 000000000..8bee21340
--- /dev/null
+++ b/drivers/block/drbd/drbd_debugfs.h
@@ -0,0 +1,39 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+
+#include "drbd_int.h"
+
+#ifdef CONFIG_DEBUG_FS
+int __init drbd_debugfs_init(void);
+void drbd_debugfs_cleanup(void);
+
+void drbd_debugfs_resource_add(struct drbd_resource *resource);
+void drbd_debugfs_resource_cleanup(struct drbd_resource *resource);
+
+void drbd_debugfs_connection_add(struct drbd_connection *connection);
+void drbd_debugfs_connection_cleanup(struct drbd_connection *connection);
+
+void drbd_debugfs_device_add(struct drbd_device *device);
+void drbd_debugfs_device_cleanup(struct drbd_device *device);
+
+void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device);
+void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device);
+#else
+
+static inline int __init drbd_debugfs_init(void) { return -ENODEV; }
+static inline void drbd_debugfs_cleanup(void) { }
+
+static inline void drbd_debugfs_resource_add(struct drbd_resource *resource) { }
+static inline void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) { }
+
+static inline void drbd_debugfs_connection_add(struct drbd_connection *connection) { }
+static inline void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) { }
+
+static inline void drbd_debugfs_device_add(struct drbd_device *device) { }
+static inline void drbd_debugfs_device_cleanup(struct drbd_device *device) { }
+
+static inline void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) { }
+static inline void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) { }
+
+#endif
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
new file mode 100644
index 000000000..b905e9888
--- /dev/null
+++ b/drivers/block/drbd/drbd_int.h
@@ -0,0 +1,2328 @@
+/*
+ drbd_int.h
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#ifndef _DRBD_INT_H
+#define _DRBD_INT_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/crypto.h>
+#include <linux/ratelimit.h>
+#include <linux/tcp.h>
+#include <linux/mutex.h>
+#include <linux/major.h>
+#include <linux/blkdev.h>
+#include <linux/genhd.h>
+#include <linux/idr.h>
+#include <net/tcp.h>
+#include <linux/lru_cache.h>
+#include <linux/prefetch.h>
+#include <linux/drbd_genl_api.h>
+#include <linux/drbd.h>
+#include "drbd_strings.h"
+#include "drbd_state.h"
+#include "drbd_protocol.h"
+
+#ifdef __CHECKER__
+# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
+# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read")))
+# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write")))
+# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call")))
+#else
+# define __protected_by(x)
+# define __protected_read_by(x)
+# define __protected_write_by(x)
+# define __must_hold(x)
+#endif
+
+/* module parameter, defined in drbd_main.c */
+extern unsigned int minor_count;
+extern bool disable_sendpage;
+extern bool allow_oos;
+void tl_abort_disk_io(struct drbd_device *device);
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+extern int enable_faults;
+extern int fault_rate;
+extern int fault_devs;
+#endif
+
+extern char usermode_helper[];
+
+
+/* I don't remember why XCPU ...
+ * This is used to wake the asender,
+ * and to interrupt sending the sending task
+ * on disconnect.
+ */
+#define DRBD_SIG SIGXCPU
+
+/* This is used to stop/restart our threads.
+ * Cannot use SIGTERM nor SIGKILL, since these
+ * are sent out by init on runlevel changes
+ * I choose SIGHUP for now.
+ */
+#define DRBD_SIGKILL SIGHUP
+
+#define ID_IN_SYNC (4711ULL)
+#define ID_OUT_OF_SYNC (4712ULL)
+#define ID_SYNCER (-1ULL)
+
+#define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL)
+
+struct drbd_device;
+struct drbd_connection;
+
+#define __drbd_printk_device(level, device, fmt, args...) \
+ dev_printk(level, disk_to_dev((device)->vdisk), fmt, ## args)
+#define __drbd_printk_peer_device(level, peer_device, fmt, args...) \
+ dev_printk(level, disk_to_dev((peer_device)->device->vdisk), fmt, ## args)
+#define __drbd_printk_resource(level, resource, fmt, args...) \
+ printk(level "drbd %s: " fmt, (resource)->name, ## args)
+#define __drbd_printk_connection(level, connection, fmt, args...) \
+ printk(level "drbd %s: " fmt, (connection)->resource->name, ## args)
+
+void drbd_printk_with_wrong_object_type(void);
+
+#define __drbd_printk_if_same_type(obj, type, func, level, fmt, args...) \
+ (__builtin_types_compatible_p(typeof(obj), type) || \
+ __builtin_types_compatible_p(typeof(obj), const type)), \
+ func(level, (const type)(obj), fmt, ## args)
+
+#define drbd_printk(level, obj, fmt, args...) \
+ __builtin_choose_expr( \
+ __drbd_printk_if_same_type(obj, struct drbd_device *, \
+ __drbd_printk_device, level, fmt, ## args), \
+ __builtin_choose_expr( \
+ __drbd_printk_if_same_type(obj, struct drbd_resource *, \
+ __drbd_printk_resource, level, fmt, ## args), \
+ __builtin_choose_expr( \
+ __drbd_printk_if_same_type(obj, struct drbd_connection *, \
+ __drbd_printk_connection, level, fmt, ## args), \
+ __builtin_choose_expr( \
+ __drbd_printk_if_same_type(obj, struct drbd_peer_device *, \
+ __drbd_printk_peer_device, level, fmt, ## args), \
+ drbd_printk_with_wrong_object_type()))))
+
+#define drbd_dbg(obj, fmt, args...) \
+ drbd_printk(KERN_DEBUG, obj, fmt, ## args)
+#define drbd_alert(obj, fmt, args...) \
+ drbd_printk(KERN_ALERT, obj, fmt, ## args)
+#define drbd_err(obj, fmt, args...) \
+ drbd_printk(KERN_ERR, obj, fmt, ## args)
+#define drbd_warn(obj, fmt, args...) \
+ drbd_printk(KERN_WARNING, obj, fmt, ## args)
+#define drbd_info(obj, fmt, args...) \
+ drbd_printk(KERN_INFO, obj, fmt, ## args)
+#define drbd_emerg(obj, fmt, args...) \
+ drbd_printk(KERN_EMERG, obj, fmt, ## args)
+
+#define dynamic_drbd_dbg(device, fmt, args...) \
+ dynamic_dev_dbg(disk_to_dev(device->vdisk), fmt, ## args)
+
+#define D_ASSERT(device, exp) do { \
+ if (!(exp)) \
+ drbd_err(device, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__); \
+ } while (0)
+
+/**
+ * expect - Make an assertion
+ *
+ * Unlike the assert macro, this macro returns a boolean result.
+ */
+#define expect(exp) ({ \
+ bool _bool = (exp); \
+ if (!_bool) \
+ drbd_err(device, "ASSERTION %s FAILED in %s\n", \
+ #exp, __func__); \
+ _bool; \
+ })
+
+/* Defines to control fault insertion */
+enum {
+ DRBD_FAULT_MD_WR = 0, /* meta data write */
+ DRBD_FAULT_MD_RD = 1, /* read */
+ DRBD_FAULT_RS_WR = 2, /* resync */
+ DRBD_FAULT_RS_RD = 3,
+ DRBD_FAULT_DT_WR = 4, /* data */
+ DRBD_FAULT_DT_RD = 5,
+ DRBD_FAULT_DT_RA = 6, /* data read ahead */
+ DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */
+ DRBD_FAULT_AL_EE = 8, /* alloc ee */
+ DRBD_FAULT_RECEIVE = 9, /* Changes some bytes upon receiving a [rs]data block */
+
+ DRBD_FAULT_MAX,
+};
+
+extern unsigned int
+_drbd_insert_fault(struct drbd_device *device, unsigned int type);
+
+static inline int
+drbd_insert_fault(struct drbd_device *device, unsigned int type) {
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+ return fault_rate &&
+ (enable_faults & (1<<type)) &&
+ _drbd_insert_fault(device, type);
+#else
+ return 0;
+#endif
+}
+
+/* integer division, round _UP_ to the next integer */
+#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0))
+/* usual integer division */
+#define div_floor(A, B) ((A)/(B))
+
+extern struct ratelimit_state drbd_ratelimit_state;
+extern struct idr drbd_devices; /* RCU, updates: genl_lock() */
+extern struct list_head drbd_resources; /* RCU, updates: genl_lock() */
+
+extern const char *cmdname(enum drbd_packet cmd);
+
+/* for sending/receiving the bitmap,
+ * possibly in some encoding scheme */
+struct bm_xfer_ctx {
+ /* "const"
+ * stores total bits and long words
+ * of the bitmap, so we don't need to
+ * call the accessor functions over and again. */
+ unsigned long bm_bits;
+ unsigned long bm_words;
+ /* during xfer, current position within the bitmap */
+ unsigned long bit_offset;
+ unsigned long word_offset;
+
+ /* statistics; index: (h->command == P_BITMAP) */
+ unsigned packets[2];
+ unsigned bytes[2];
+};
+
+extern void INFO_bm_xfer_stats(struct drbd_device *device,
+ const char *direction, struct bm_xfer_ctx *c);
+
+static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
+{
+ /* word_offset counts "native long words" (32 or 64 bit),
+ * aligned at 64 bit.
+ * Encoded packet may end at an unaligned bit offset.
+ * In case a fallback clear text packet is transmitted in
+ * between, we adjust this offset back to the last 64bit
+ * aligned "native long word", which makes coding and decoding
+ * the plain text bitmap much more convenient. */
+#if BITS_PER_LONG == 64
+ c->word_offset = c->bit_offset >> 6;
+#elif BITS_PER_LONG == 32
+ c->word_offset = c->bit_offset >> 5;
+ c->word_offset &= ~(1UL);
+#else
+# error "unsupported BITS_PER_LONG"
+#endif
+}
+
+extern unsigned int drbd_header_size(struct drbd_connection *connection);
+
+/**********************************************************************/
+enum drbd_thread_state {
+ NONE,
+ RUNNING,
+ EXITING,
+ RESTARTING
+};
+
+struct drbd_thread {
+ spinlock_t t_lock;
+ struct task_struct *task;
+ struct completion stop;
+ enum drbd_thread_state t_state;
+ int (*function) (struct drbd_thread *);
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
+ int reset_cpu_mask;
+ const char *name;
+};
+
+static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
+{
+ /* THINK testing the t_state seems to be uncritical in all cases
+ * (but thread_{start,stop}), so we can read it *without* the lock.
+ * --lge */
+
+ smp_rmb();
+ return thi->t_state;
+}
+
+struct drbd_work {
+ struct list_head list;
+ int (*cb)(struct drbd_work *, int cancel);
+};
+
+struct drbd_device_work {
+ struct drbd_work w;
+ struct drbd_device *device;
+};
+
+#include "drbd_interval.h"
+
+extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *);
+
+struct drbd_request {
+ struct drbd_work w;
+ struct drbd_device *device;
+
+ /* if local IO is not allowed, will be NULL.
+ * if local IO _is_ allowed, holds the locally submitted bio clone,
+ * or, after local IO completion, the ERR_PTR(error).
+ * see drbd_request_endio(). */
+ struct bio *private_bio;
+
+ struct drbd_interval i;
+
+ /* epoch: used to check on "completion" whether this req was in
+ * the current epoch, and we therefore have to close it,
+ * causing a p_barrier packet to be send, starting a new epoch.
+ *
+ * This corresponds to "barrier" in struct p_barrier[_ack],
+ * and to "barrier_nr" in struct drbd_epoch (and various
+ * comments/function parameters/local variable names).
+ */
+ unsigned int epoch;
+
+ struct list_head tl_requests; /* ring list in the transfer log */
+ struct bio *master_bio; /* master bio pointer */
+
+ /* see struct drbd_device */
+ struct list_head req_pending_master_completion;
+ struct list_head req_pending_local;
+
+ /* for generic IO accounting */
+ unsigned long start_jif;
+
+ /* for DRBD internal statistics */
+
+ /* Minimal set of time stamps to determine if we wait for activity log
+ * transactions, local disk or peer. 32 bit "jiffies" are good enough,
+ * we don't expect a DRBD request to be stalled for several month.
+ */
+
+ /* before actual request processing */
+ unsigned long in_actlog_jif;
+
+ /* local disk */
+ unsigned long pre_submit_jif;
+
+ /* per connection */
+ unsigned long pre_send_jif;
+ unsigned long acked_jif;
+ unsigned long net_done_jif;
+
+ /* Possibly even more detail to track each phase:
+ * master_completion_jif
+ * how long did it take to complete the master bio
+ * (application visible latency)
+ * allocated_jif
+ * how long the master bio was blocked until we finally allocated
+ * a tracking struct
+ * in_actlog_jif
+ * how long did we wait for activity log transactions
+ *
+ * net_queued_jif
+ * when did we finally queue it for sending
+ * pre_send_jif
+ * when did we start sending it
+ * post_send_jif
+ * how long did we block in the network stack trying to send it
+ * acked_jif
+ * when did we receive (or fake, in protocol A) a remote ACK
+ * net_done_jif
+ * when did we receive final acknowledgement (P_BARRIER_ACK),
+ * or decide, e.g. on connection loss, that we do no longer expect
+ * anything from this peer for this request.
+ *
+ * pre_submit_jif
+ * post_sub_jif
+ * when did we start submiting to the lower level device,
+ * and how long did we block in that submit function
+ * local_completion_jif
+ * how long did it take the lower level device to complete this request
+ */
+
+
+ /* once it hits 0, we may complete the master_bio */
+ atomic_t completion_ref;
+ /* once it hits 0, we may destroy this drbd_request object */
+ struct kref kref;
+
+ unsigned rq_state; /* see comments above _req_mod() */
+};
+
+struct drbd_epoch {
+ struct drbd_connection *connection;
+ struct list_head list;
+ unsigned int barrier_nr;
+ atomic_t epoch_size; /* increased on every request added. */
+ atomic_t active; /* increased on every req. added, and dec on every finished. */
+ unsigned long flags;
+};
+
+/* Prototype declaration of function defined in drbd_receiver.c */
+int drbdd_init(struct drbd_thread *);
+int drbd_asender(struct drbd_thread *);
+
+/* drbd_epoch flag bits */
+enum {
+ DE_HAVE_BARRIER_NUMBER,
+};
+
+enum epoch_event {
+ EV_PUT,
+ EV_GOT_BARRIER_NR,
+ EV_BECAME_LAST,
+ EV_CLEANUP = 32, /* used as flag */
+};
+
+struct digest_info {
+ int digest_size;
+ void *digest;
+};
+
+struct drbd_peer_request {
+ struct drbd_work w;
+ struct drbd_peer_device *peer_device;
+ struct drbd_epoch *epoch; /* for writes */
+ struct page *pages;
+ atomic_t pending_bios;
+ struct drbd_interval i;
+ /* see comments on ee flag bits below */
+ unsigned long flags;
+ unsigned long submit_jif;
+ union {
+ u64 block_id;
+ struct digest_info *digest;
+ };
+};
+
+/* ee flag bits.
+ * While corresponding bios are in flight, the only modification will be
+ * set_bit WAS_ERROR, which has to be atomic.
+ * If no bios are in flight yet, or all have been completed,
+ * non-atomic modification to ee->flags is ok.
+ */
+enum {
+ __EE_CALL_AL_COMPLETE_IO,
+ __EE_MAY_SET_IN_SYNC,
+
+ /* is this a TRIM aka REQ_DISCARD? */
+ __EE_IS_TRIM,
+ /* our lower level cannot handle trim,
+ * and we want to fall back to zeroout instead */
+ __EE_IS_TRIM_USE_ZEROOUT,
+
+ /* In case a barrier failed,
+ * we need to resubmit without the barrier flag. */
+ __EE_RESUBMITTED,
+
+ /* we may have several bios per peer request.
+ * if any of those fail, we set this flag atomically
+ * from the endio callback */
+ __EE_WAS_ERROR,
+
+ /* This ee has a pointer to a digest instead of a block id */
+ __EE_HAS_DIGEST,
+
+ /* Conflicting local requests need to be restarted after this request */
+ __EE_RESTART_REQUESTS,
+
+ /* The peer wants a write ACK for this (wire proto C) */
+ __EE_SEND_WRITE_ACK,
+
+ /* Is set when net_conf had two_primaries set while creating this peer_req */
+ __EE_IN_INTERVAL_TREE,
+
+ /* for debugfs: */
+ /* has this been submitted, or does it still wait for something else? */
+ __EE_SUBMITTED,
+
+ /* this is/was a write request */
+ __EE_WRITE,
+
+ /* this originates from application on peer
+ * (not some resync or verify or other DRBD internal request) */
+ __EE_APPLICATION,
+};
+#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
+#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
+#define EE_IS_TRIM (1<<__EE_IS_TRIM)
+#define EE_IS_TRIM_USE_ZEROOUT (1<<__EE_IS_TRIM_USE_ZEROOUT)
+#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
+#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
+#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
+#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS)
+#define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK)
+#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE)
+#define EE_SUBMITTED (1<<__EE_SUBMITTED)
+#define EE_WRITE (1<<__EE_WRITE)
+#define EE_APPLICATION (1<<__EE_APPLICATION)
+
+/* flag bits per device */
+enum {
+ UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
+ MD_DIRTY, /* current uuids and flags not yet on disk */
+ USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */
+ CL_ST_CHG_SUCCESS,
+ CL_ST_CHG_FAIL,
+ CRASHED_PRIMARY, /* This node was a crashed primary.
+ * Gets cleared when the state.conn
+ * goes into C_CONNECTED state. */
+ CONSIDER_RESYNC,
+
+ MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */
+
+ SUSPEND_IO, /* suspend application io */
+ BITMAP_IO, /* suspend application io;
+ once no more io in flight, start bitmap io */
+ BITMAP_IO_QUEUED, /* Started bitmap IO */
+ WAS_IO_ERROR, /* Local disk failed, returned IO error */
+ WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */
+ FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */
+ RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
+ RESIZE_PENDING, /* Size change detected locally, waiting for the response from
+ * the peer, if it changed there as well. */
+ NEW_CUR_UUID, /* Create new current UUID when thawing IO */
+ AL_SUSPENDED, /* Activity logging is currently suspended. */
+ AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */
+ B_RS_H_DONE, /* Before resync handler done (already executed) */
+ DISCARD_MY_DATA, /* discard_my_data flag per volume */
+ READ_BALANCE_RR,
+
+ FLUSH_PENDING, /* if set, device->flush_jif is when we submitted that flush
+ * from drbd_flush_after_epoch() */
+
+ /* cleared only after backing device related structures have been destroyed. */
+ GOING_DISKLESS, /* Disk is being detached, because of io-error, or admin request. */
+
+ /* to be used in drbd_device_post_work() */
+ GO_DISKLESS, /* tell worker to schedule cleanup before detach */
+ DESTROY_DISK, /* tell worker to close backing devices and destroy related structures. */
+ MD_SYNC, /* tell worker to call drbd_md_sync() */
+ RS_START, /* tell worker to start resync/OV */
+ RS_PROGRESS, /* tell worker that resync made significant progress */
+ RS_DONE, /* tell worker that resync is done */
+};
+
+struct drbd_bitmap; /* opaque for drbd_device */
+
+/* definition of bits in bm_flags to be used in drbd_bm_lock
+ * and drbd_bitmap_io and friends. */
+enum bm_flag {
+ /* do we need to kfree, or vfree bm_pages? */
+ BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */
+
+ /* currently locked for bulk operation */
+ BM_LOCKED_MASK = 0xf,
+
+ /* in detail, that is: */
+ BM_DONT_CLEAR = 0x1,
+ BM_DONT_SET = 0x2,
+ BM_DONT_TEST = 0x4,
+
+ /* so we can mark it locked for bulk operation,
+ * and still allow all non-bulk operations */
+ BM_IS_LOCKED = 0x8,
+
+ /* (test bit, count bit) allowed (common case) */
+ BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED,
+
+ /* testing bits, as well as setting new bits allowed, but clearing bits
+ * would be unexpected. Used during bitmap receive. Setting new bits
+ * requires sending of "out-of-sync" information, though. */
+ BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED,
+
+ /* for drbd_bm_write_copy_pages, everything is allowed,
+ * only concurrent bulk operations are locked out. */
+ BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED,
+};
+
+struct drbd_work_queue {
+ struct list_head q;
+ spinlock_t q_lock; /* to protect the list. */
+ wait_queue_head_t q_wait;
+};
+
+struct drbd_socket {
+ struct mutex mutex;
+ struct socket *socket;
+ /* this way we get our
+ * send/receive buffers off the stack */
+ void *sbuf;
+ void *rbuf;
+};
+
+struct drbd_md {
+ u64 md_offset; /* sector offset to 'super' block */
+
+ u64 la_size_sect; /* last agreed size, unit sectors */
+ spinlock_t uuid_lock;
+ u64 uuid[UI_SIZE];
+ u64 device_uuid;
+ u32 flags;
+ u32 md_size_sect;
+
+ s32 al_offset; /* signed relative sector offset to activity log */
+ s32 bm_offset; /* signed relative sector offset to bitmap */
+
+ /* cached value of bdev->disk_conf->meta_dev_idx (see below) */
+ s32 meta_dev_idx;
+
+ /* see al_tr_number_to_on_disk_sector() */
+ u32 al_stripes;
+ u32 al_stripe_size_4k;
+ u32 al_size_4k; /* cached product of the above */
+};
+
+struct drbd_backing_dev {
+ struct block_device *backing_bdev;
+ struct block_device *md_bdev;
+ struct drbd_md md;
+ struct disk_conf *disk_conf; /* RCU, for updates: resource->conf_update */
+ sector_t known_size; /* last known size of that backing device */
+};
+
+struct drbd_md_io {
+ struct page *page;
+ unsigned long start_jif; /* last call to drbd_md_get_buffer */
+ unsigned long submit_jif; /* last _drbd_md_sync_page_io() submit */
+ const char *current_use;
+ atomic_t in_use;
+ unsigned int done;
+ int error;
+};
+
+struct bm_io_work {
+ struct drbd_work w;
+ char *why;
+ enum bm_flag flags;
+ int (*io_fn)(struct drbd_device *device);
+ void (*done)(struct drbd_device *device, int rv);
+};
+
+enum write_ordering_e {
+ WO_none,
+ WO_drain_io,
+ WO_bdev_flush,
+};
+
+struct fifo_buffer {
+ unsigned int head_index;
+ unsigned int size;
+ int total; /* sum of all values */
+ int values[0];
+};
+extern struct fifo_buffer *fifo_alloc(int fifo_size);
+
+/* flag bits per connection */
+enum {
+ NET_CONGESTED, /* The data socket is congested */
+ RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */
+ SEND_PING, /* whether asender should send a ping asap */
+ SIGNAL_ASENDER, /* whether asender wants to be interrupted */
+ GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */
+ CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */
+ CONN_WD_ST_CHG_OKAY,
+ CONN_WD_ST_CHG_FAIL,
+ CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
+ CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */
+ STATE_SENT, /* Do not change state/UUIDs while this is set */
+ CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
+ * pending, from drbd worker context.
+ * If set, bdi_write_congested() returns true,
+ * so shrink_page_list() would not recurse into,
+ * and potentially deadlock on, this drbd worker.
+ */
+ DISCONNECT_SENT,
+
+ DEVICE_WORK_PENDING, /* tell worker that some device has pending work */
+};
+
+struct drbd_resource {
+ char *name;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_res;
+ struct dentry *debugfs_res_volumes;
+ struct dentry *debugfs_res_connections;
+ struct dentry *debugfs_res_in_flight_summary;
+#endif
+ struct kref kref;
+ struct idr devices; /* volume number to device mapping */
+ struct list_head connections;
+ struct list_head resources;
+ struct res_opts res_opts;
+ struct mutex conf_update; /* mutex for ready-copy-update of net_conf and disk_conf */
+ struct mutex adm_mutex; /* mutex to serialize administrative requests */
+ spinlock_t req_lock;
+
+ unsigned susp:1; /* IO suspended by user */
+ unsigned susp_nod:1; /* IO suspended because no data */
+ unsigned susp_fen:1; /* IO suspended because fence peer handler runs */
+
+ enum write_ordering_e write_ordering;
+
+ cpumask_var_t cpu_mask;
+};
+
+struct drbd_thread_timing_details
+{
+ unsigned long start_jif;
+ void *cb_addr;
+ const char *caller_fn;
+ unsigned int line;
+ unsigned int cb_nr;
+};
+
+struct drbd_connection {
+ struct list_head connections;
+ struct drbd_resource *resource;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_conn;
+ struct dentry *debugfs_conn_callback_history;
+ struct dentry *debugfs_conn_oldest_requests;
+#endif
+ struct kref kref;
+ struct idr peer_devices; /* volume number to peer device mapping */
+ enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */
+ struct mutex cstate_mutex; /* Protects graceful disconnects */
+ unsigned int connect_cnt; /* Inc each time a connection is established */
+
+ unsigned long flags;
+ struct net_conf *net_conf; /* content protected by rcu */
+ wait_queue_head_t ping_wait; /* Woken upon reception of a ping, and a state change */
+
+ struct sockaddr_storage my_addr;
+ int my_addr_len;
+ struct sockaddr_storage peer_addr;
+ int peer_addr_len;
+
+ struct drbd_socket data; /* data/barrier/cstate/parameter packets */
+ struct drbd_socket meta; /* ping/ack (metadata) packets */
+ int agreed_pro_version; /* actually used protocol version */
+ u32 agreed_features;
+ unsigned long last_received; /* in jiffies, either socket */
+ unsigned int ko_count;
+
+ struct list_head transfer_log; /* all requests not yet fully processed */
+
+ struct crypto_hash *cram_hmac_tfm;
+ struct crypto_hash *integrity_tfm; /* checksums we compute, updates protected by connection->data->mutex */
+ struct crypto_hash *peer_integrity_tfm; /* checksums we verify, only accessed from receiver thread */
+ struct crypto_hash *csums_tfm;
+ struct crypto_hash *verify_tfm;
+ void *int_dig_in;
+ void *int_dig_vv;
+
+ /* receiver side */
+ struct drbd_epoch *current_epoch;
+ spinlock_t epoch_lock;
+ unsigned int epochs;
+ atomic_t current_tle_nr; /* transfer log epoch number */
+ unsigned current_tle_writes; /* writes seen within this tl epoch */
+
+ unsigned long last_reconnect_jif;
+ struct drbd_thread receiver;
+ struct drbd_thread worker;
+ struct drbd_thread asender;
+
+ /* cached pointers,
+ * so we can look up the oldest pending requests more quickly.
+ * protected by resource->req_lock */
+ struct drbd_request *req_next; /* DRBD 9: todo.req_next */
+ struct drbd_request *req_ack_pending;
+ struct drbd_request *req_not_net_done;
+
+ /* sender side */
+ struct drbd_work_queue sender_work;
+
+#define DRBD_THREAD_DETAILS_HIST 16
+ unsigned int w_cb_nr; /* keeps counting up */
+ unsigned int r_cb_nr; /* keeps counting up */
+ struct drbd_thread_timing_details w_timing_details[DRBD_THREAD_DETAILS_HIST];
+ struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST];
+
+ struct {
+ /* whether this sender thread
+ * has processed a single write yet. */
+ bool seen_any_write_yet;
+
+ /* Which barrier number to send with the next P_BARRIER */
+ int current_epoch_nr;
+
+ /* how many write requests have been sent
+ * with req->epoch == current_epoch_nr.
+ * If none, no P_BARRIER will be sent. */
+ unsigned current_epoch_writes;
+ } send;
+};
+
+void __update_timing_details(
+ struct drbd_thread_timing_details *tdp,
+ unsigned int *cb_nr,
+ void *cb,
+ const char *fn, const unsigned int line);
+
+#define update_worker_timing_details(c, cb) \
+ __update_timing_details(c->w_timing_details, &c->w_cb_nr, cb, __func__ , __LINE__ )
+#define update_receiver_timing_details(c, cb) \
+ __update_timing_details(c->r_timing_details, &c->r_cb_nr, cb, __func__ , __LINE__ )
+
+struct submit_worker {
+ struct workqueue_struct *wq;
+ struct work_struct worker;
+
+ /* protected by ..->resource->req_lock */
+ struct list_head writes;
+};
+
+struct drbd_peer_device {
+ struct list_head peer_devices;
+ struct drbd_device *device;
+ struct drbd_connection *connection;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_peer_dev;
+#endif
+};
+
+struct drbd_device {
+ struct drbd_resource *resource;
+ struct list_head peer_devices;
+ struct list_head pending_bitmap_io;
+
+ unsigned long flush_jif;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_minor;
+ struct dentry *debugfs_vol;
+ struct dentry *debugfs_vol_oldest_requests;
+ struct dentry *debugfs_vol_act_log_extents;
+ struct dentry *debugfs_vol_resync_extents;
+ struct dentry *debugfs_vol_data_gen_id;
+#endif
+
+ unsigned int vnr; /* volume number within the connection */
+ unsigned int minor; /* device minor number */
+
+ struct kref kref;
+
+ /* things that are stored as / read from meta data on disk */
+ unsigned long flags;
+
+ /* configured by drbdsetup */
+ struct drbd_backing_dev *ldev __protected_by(local);
+
+ sector_t p_size; /* partner's disk size */
+ struct request_queue *rq_queue;
+ struct block_device *this_bdev;
+ struct gendisk *vdisk;
+
+ unsigned long last_reattach_jif;
+ struct drbd_work resync_work;
+ struct drbd_work unplug_work;
+ struct timer_list resync_timer;
+ struct timer_list md_sync_timer;
+ struct timer_list start_resync_timer;
+ struct timer_list request_timer;
+
+ /* Used after attach while negotiating new disk state. */
+ union drbd_state new_state_tmp;
+
+ union drbd_dev_state state;
+ wait_queue_head_t misc_wait;
+ wait_queue_head_t state_wait; /* upon each state change. */
+ unsigned int send_cnt;
+ unsigned int recv_cnt;
+ unsigned int read_cnt;
+ unsigned int writ_cnt;
+ unsigned int al_writ_cnt;
+ unsigned int bm_writ_cnt;
+ atomic_t ap_bio_cnt; /* Requests we need to complete */
+ atomic_t ap_actlog_cnt; /* Requests waiting for activity log */
+ atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
+ atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
+ atomic_t unacked_cnt; /* Need to send replies for */
+ atomic_t local_cnt; /* Waiting for local completion */
+
+ /* Interval tree of pending local requests */
+ struct rb_root read_requests;
+ struct rb_root write_requests;
+
+ /* for statistics and timeouts */
+ /* [0] read, [1] write */
+ struct list_head pending_master_completion[2];
+ struct list_head pending_completion[2];
+
+ /* use checksums for *this* resync */
+ bool use_csums;
+ /* blocks to resync in this run [unit BM_BLOCK_SIZE] */
+ unsigned long rs_total;
+ /* number of resync blocks that failed in this run */
+ unsigned long rs_failed;
+ /* Syncer's start time [unit jiffies] */
+ unsigned long rs_start;
+ /* cumulated time in PausedSyncX state [unit jiffies] */
+ unsigned long rs_paused;
+ /* skipped because csum was equal [unit BM_BLOCK_SIZE] */
+ unsigned long rs_same_csum;
+#define DRBD_SYNC_MARKS 8
+#define DRBD_SYNC_MARK_STEP (3*HZ)
+ /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
+ unsigned long rs_mark_left[DRBD_SYNC_MARKS];
+ /* marks's time [unit jiffies] */
+ unsigned long rs_mark_time[DRBD_SYNC_MARKS];
+ /* current index into rs_mark_{left,time} */
+ int rs_last_mark;
+ unsigned long rs_last_bcast; /* [unit jiffies] */
+
+ /* where does the admin want us to start? (sector) */
+ sector_t ov_start_sector;
+ sector_t ov_stop_sector;
+ /* where are we now? (sector) */
+ sector_t ov_position;
+ /* Start sector of out of sync range (to merge printk reporting). */
+ sector_t ov_last_oos_start;
+ /* size of out-of-sync range in sectors. */
+ sector_t ov_last_oos_size;
+ unsigned long ov_left; /* in bits */
+
+ struct drbd_bitmap *bitmap;
+ unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */
+
+ /* Used to track operations of resync... */
+ struct lru_cache *resync;
+ /* Number of locked elements in resync LRU */
+ unsigned int resync_locked;
+ /* resync extent number waiting for application requests */
+ unsigned int resync_wenr;
+
+ int open_cnt;
+ u64 *p_uuid;
+
+ struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
+ struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
+ struct list_head done_ee; /* need to send P_WRITE_ACK */
+ struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */
+ struct list_head net_ee; /* zero-copy network send in progress */
+
+ int next_barrier_nr;
+ struct list_head resync_reads;
+ atomic_t pp_in_use; /* allocated from page pool */
+ atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */
+ wait_queue_head_t ee_wait;
+ struct drbd_md_io md_io;
+ spinlock_t al_lock;
+ wait_queue_head_t al_wait;
+ struct lru_cache *act_log; /* activity log */
+ unsigned int al_tr_number;
+ int al_tr_cycle;
+ wait_queue_head_t seq_wait;
+ atomic_t packet_seq;
+ unsigned int peer_seq;
+ spinlock_t peer_seq_lock;
+ unsigned long comm_bm_set; /* communicated number of set bits. */
+ struct bm_io_work bm_io_work;
+ u64 ed_uuid; /* UUID of the exposed data */
+ struct mutex own_state_mutex;
+ struct mutex *state_mutex; /* either own_state_mutex or first_peer_device(device)->connection->cstate_mutex */
+ char congestion_reason; /* Why we where congested... */
+ atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
+ atomic_t rs_sect_ev; /* for submitted resync data rate, both */
+ int rs_last_sect_ev; /* counter to compare with */
+ int rs_last_events; /* counter of read or write "events" (unit sectors)
+ * on the lower level device when we last looked. */
+ int c_sync_rate; /* current resync rate after syncer throttle magic */
+ struct fifo_buffer *rs_plan_s; /* correction values of resync planer (RCU, connection->conn_update) */
+ int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
+ atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
+ unsigned int peer_max_bio_size;
+ unsigned int local_max_bio_size;
+
+ /* any requests that would block in drbd_make_request()
+ * are deferred to this single-threaded work queue */
+ struct submit_worker submit;
+};
+
+struct drbd_bm_aio_ctx {
+ struct drbd_device *device;
+ struct list_head list; /* on device->pending_bitmap_io */;
+ unsigned long start_jif;
+ atomic_t in_flight;
+ unsigned int done;
+ unsigned flags;
+#define BM_AIO_COPY_PAGES 1
+#define BM_AIO_WRITE_HINTED 2
+#define BM_AIO_WRITE_ALL_PAGES 4
+#define BM_AIO_READ 8
+ int error;
+ struct kref kref;
+};
+
+struct drbd_config_context {
+ /* assigned from drbd_genlmsghdr */
+ unsigned int minor;
+ /* assigned from request attributes, if present */
+ unsigned int volume;
+#define VOLUME_UNSPECIFIED (-1U)
+ /* pointer into the request skb,
+ * limited lifetime! */
+ char *resource_name;
+ struct nlattr *my_addr;
+ struct nlattr *peer_addr;
+
+ /* reply buffer */
+ struct sk_buff *reply_skb;
+ /* pointer into reply buffer */
+ struct drbd_genlmsghdr *reply_dh;
+ /* resolved from attributes, if possible */
+ struct drbd_device *device;
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
+};
+
+static inline struct drbd_device *minor_to_device(unsigned int minor)
+{
+ return (struct drbd_device *)idr_find(&drbd_devices, minor);
+}
+
+static inline struct drbd_peer_device *first_peer_device(struct drbd_device *device)
+{
+ return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices);
+}
+
+#define for_each_resource(resource, _resources) \
+ list_for_each_entry(resource, _resources, resources)
+
+#define for_each_resource_rcu(resource, _resources) \
+ list_for_each_entry_rcu(resource, _resources, resources)
+
+#define for_each_resource_safe(resource, tmp, _resources) \
+ list_for_each_entry_safe(resource, tmp, _resources, resources)
+
+#define for_each_connection(connection, resource) \
+ list_for_each_entry(connection, &resource->connections, connections)
+
+#define for_each_connection_rcu(connection, resource) \
+ list_for_each_entry_rcu(connection, &resource->connections, connections)
+
+#define for_each_connection_safe(connection, tmp, resource) \
+ list_for_each_entry_safe(connection, tmp, &resource->connections, connections)
+
+#define for_each_peer_device(peer_device, device) \
+ list_for_each_entry(peer_device, &device->peer_devices, peer_devices)
+
+#define for_each_peer_device_rcu(peer_device, device) \
+ list_for_each_entry_rcu(peer_device, &device->peer_devices, peer_devices)
+
+#define for_each_peer_device_safe(peer_device, tmp, device) \
+ list_for_each_entry_safe(peer_device, tmp, &device->peer_devices, peer_devices)
+
+static inline unsigned int device_to_minor(struct drbd_device *device)
+{
+ return device->minor;
+}
+
+/*
+ * function declarations
+ *************************/
+
+/* drbd_main.c */
+
+enum dds_flags {
+ DDSF_FORCED = 1,
+ DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */
+};
+
+extern void drbd_init_set_defaults(struct drbd_device *device);
+extern int drbd_thread_start(struct drbd_thread *thi);
+extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
+#ifdef CONFIG_SMP
+extern void drbd_thread_current_set_cpu(struct drbd_thread *thi);
+#else
+#define drbd_thread_current_set_cpu(A) ({})
+#endif
+extern void tl_release(struct drbd_connection *, unsigned int barrier_nr,
+ unsigned int set_size);
+extern void tl_clear(struct drbd_connection *);
+extern void drbd_free_sock(struct drbd_connection *connection);
+extern int drbd_send(struct drbd_connection *connection, struct socket *sock,
+ void *buf, size_t size, unsigned msg_flags);
+extern int drbd_send_all(struct drbd_connection *, struct socket *, void *, size_t,
+ unsigned);
+
+extern int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd);
+extern int drbd_send_protocol(struct drbd_connection *connection);
+extern int drbd_send_uuids(struct drbd_peer_device *);
+extern int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *);
+extern void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *);
+extern int drbd_send_sizes(struct drbd_peer_device *, int trigger_reply, enum dds_flags flags);
+extern int drbd_send_state(struct drbd_peer_device *, union drbd_state s);
+extern int drbd_send_current_state(struct drbd_peer_device *);
+extern int drbd_send_sync_param(struct drbd_peer_device *);
+extern void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr,
+ u32 set_size);
+extern int drbd_send_ack(struct drbd_peer_device *, enum drbd_packet,
+ struct drbd_peer_request *);
+extern void drbd_send_ack_rp(struct drbd_peer_device *, enum drbd_packet,
+ struct p_block_req *rp);
+extern void drbd_send_ack_dp(struct drbd_peer_device *, enum drbd_packet,
+ struct p_data *dp, int data_size);
+extern int drbd_send_ack_ex(struct drbd_peer_device *, enum drbd_packet,
+ sector_t sector, int blksize, u64 block_id);
+extern int drbd_send_out_of_sync(struct drbd_peer_device *, struct drbd_request *);
+extern int drbd_send_block(struct drbd_peer_device *, enum drbd_packet,
+ struct drbd_peer_request *);
+extern int drbd_send_dblock(struct drbd_peer_device *, struct drbd_request *req);
+extern int drbd_send_drequest(struct drbd_peer_device *, int cmd,
+ sector_t sector, int size, u64 block_id);
+extern int drbd_send_drequest_csum(struct drbd_peer_device *, sector_t sector,
+ int size, void *digest, int digest_size,
+ enum drbd_packet cmd);
+extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int size);
+
+extern int drbd_send_bitmap(struct drbd_device *device);
+extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
+extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
+extern void drbd_free_ldev(struct drbd_backing_dev *ldev);
+extern void drbd_device_cleanup(struct drbd_device *device);
+void drbd_print_uuids(struct drbd_device *device, const char *text);
+
+extern void conn_md_sync(struct drbd_connection *connection);
+extern void drbd_md_write(struct drbd_device *device, void *buffer);
+extern void drbd_md_sync(struct drbd_device *device);
+extern int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev);
+extern void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local);
+extern void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local);
+extern void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local);
+extern void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void drbd_md_set_flag(struct drbd_device *device, int flags) __must_hold(local);
+extern void drbd_md_clear_flag(struct drbd_device *device, int flags)__must_hold(local);
+extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
+extern void drbd_md_mark_dirty(struct drbd_device *device);
+extern void drbd_queue_bitmap_io(struct drbd_device *device,
+ int (*io_fn)(struct drbd_device *),
+ void (*done)(struct drbd_device *, int),
+ char *why, enum bm_flag flags);
+extern int drbd_bitmap_io(struct drbd_device *device,
+ int (*io_fn)(struct drbd_device *),
+ char *why, enum bm_flag flags);
+extern int drbd_bitmap_io_from_worker(struct drbd_device *device,
+ int (*io_fn)(struct drbd_device *),
+ char *why, enum bm_flag flags);
+extern int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local);
+extern int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local);
+
+/* Meta data layout
+ *
+ * We currently have two possible layouts.
+ * Offsets in (512 byte) sectors.
+ * external:
+ * |----------- md_size_sect ------------------|
+ * [ 4k superblock ][ activity log ][ Bitmap ]
+ * | al_offset == 8 |
+ * | bm_offset = al_offset + X |
+ * ==> bitmap sectors = md_size_sect - bm_offset
+ *
+ * Variants:
+ * old, indexed fixed size meta data:
+ *
+ * internal:
+ * |----------- md_size_sect ------------------|
+ * [data.....][ Bitmap ][ activity log ][ 4k superblock ][padding*]
+ * | al_offset < 0 |
+ * | bm_offset = al_offset - Y |
+ * ==> bitmap sectors = Y = al_offset - bm_offset
+ *
+ * [padding*] are zero or up to 7 unused 512 Byte sectors to the
+ * end of the device, so that the [4k superblock] will be 4k aligned.
+ *
+ * The activity log consists of 4k transaction blocks,
+ * which are written in a ring-buffer, or striped ring-buffer like fashion,
+ * which are writtensize used to be fixed 32kB,
+ * but is about to become configurable.
+ */
+
+/* Our old fixed size meta data layout
+ * allows up to about 3.8TB, so if you want more,
+ * you need to use the "flexible" meta data format. */
+#define MD_128MB_SECT (128LLU << 11) /* 128 MB, unit sectors */
+#define MD_4kB_SECT 8
+#define MD_32kB_SECT 64
+
+/* One activity log extent represents 4M of storage */
+#define AL_EXTENT_SHIFT 22
+#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
+
+/* We could make these currently hardcoded constants configurable
+ * variables at create-md time (or even re-configurable at runtime?).
+ * Which will require some more changes to the DRBD "super block"
+ * and attach code.
+ *
+ * updates per transaction:
+ * This many changes to the active set can be logged with one transaction.
+ * This number is arbitrary.
+ * context per transaction:
+ * This many context extent numbers are logged with each transaction.
+ * This number is resulting from the transaction block size (4k), the layout
+ * of the transaction header, and the number of updates per transaction.
+ * See drbd_actlog.c:struct al_transaction_on_disk
+ * */
+#define AL_UPDATES_PER_TRANSACTION 64 // arbitrary
+#define AL_CONTEXT_PER_TRANSACTION 919 // (4096 - 36 - 6*64)/4
+
+#if BITS_PER_LONG == 32
+#define LN2_BPL 5
+#define cpu_to_lel(A) cpu_to_le32(A)
+#define lel_to_cpu(A) le32_to_cpu(A)
+#elif BITS_PER_LONG == 64
+#define LN2_BPL 6
+#define cpu_to_lel(A) cpu_to_le64(A)
+#define lel_to_cpu(A) le64_to_cpu(A)
+#else
+#error "LN2 of BITS_PER_LONG unknown!"
+#endif
+
+/* resync bitmap */
+/* 16MB sized 'bitmap extent' to track syncer usage */
+struct bm_extent {
+ int rs_left; /* number of bits set (out of sync) in this extent. */
+ int rs_failed; /* number of failed resync requests in this extent. */
+ unsigned long flags;
+ struct lc_element lce;
+};
+
+#define BME_NO_WRITES 0 /* bm_extent.flags: no more requests on this one! */
+#define BME_LOCKED 1 /* bm_extent.flags: syncer active on this one. */
+#define BME_PRIORITY 2 /* finish resync IO on this extent ASAP! App IO waiting! */
+
+/* drbd_bitmap.c */
+/*
+ * We need to store one bit for a block.
+ * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap.
+ * Bit 0 ==> local node thinks this block is binary identical on both nodes
+ * Bit 1 ==> local node thinks this block needs to be synced.
+ */
+
+#define SLEEP_TIME (HZ/10)
+
+/* We do bitmap IO in units of 4k blocks.
+ * We also still have a hardcoded 4k per bit relation. */
+#define BM_BLOCK_SHIFT 12 /* 4k per bit */
+#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
+/* mostly arbitrarily set the represented size of one bitmap extent,
+ * aka resync extent, to 16 MiB (which is also 512 Byte worth of bitmap
+ * at 4k per bit resolution) */
+#define BM_EXT_SHIFT 24 /* 16 MiB per resync extent */
+#define BM_EXT_SIZE (1<<BM_EXT_SHIFT)
+
+#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
+#error "HAVE YOU FIXED drbdmeta AS WELL??"
+#endif
+
+/* thus many _storage_ sectors are described by one bit */
+#define BM_SECT_TO_BIT(x) ((x)>>(BM_BLOCK_SHIFT-9))
+#define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SHIFT-9))
+#define BM_SECT_PER_BIT BM_BIT_TO_SECT(1)
+
+/* bit to represented kilo byte conversion */
+#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10))
+
+/* in which _bitmap_ extent (resp. sector) the bit for a certain
+ * _storage_ sector is located in */
+#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9))
+#define BM_BIT_TO_EXT(x) ((x) >> (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
+
+/* first storage sector a bitmap extent corresponds to */
+#define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9))
+/* how much _storage_ sectors we have per bitmap extent */
+#define BM_SECT_PER_EXT BM_EXT_TO_SECT(1)
+/* how many bits are covered by one bitmap extent (resync extent) */
+#define BM_BITS_PER_EXT (1UL << (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
+
+#define BM_BLOCKS_PER_BM_EXT_MASK (BM_BITS_PER_EXT - 1)
+
+
+/* in one sector of the bitmap, we have this many activity_log extents. */
+#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
+
+/* the extent in "PER_EXTENT" below is an activity log extent
+ * we need that many (long words/bytes) to store the bitmap
+ * of one AL_EXTENT_SIZE chunk of storage.
+ * we can store the bitmap for that many AL_EXTENTS within
+ * one sector of the _on_disk_ bitmap:
+ * bit 0 bit 37 bit 38 bit (512*8)-1
+ * ...|........|........|.. // ..|........|
+ * sect. 0 `296 `304 ^(512*8*8)-1
+ *
+#define BM_WORDS_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG )
+#define BM_BYTES_PER_EXT ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 ) // 128
+#define BM_EXT_PER_SECT ( 512 / BM_BYTES_PER_EXTENT ) // 4
+ */
+
+#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
+/* we have a certain meta data variant that has a fixed on-disk size of 128
+ * MiB, of which 4k are our "superblock", and 32k are the fixed size activity
+ * log, leaving this many sectors for the bitmap.
+ */
+
+#define DRBD_MAX_SECTORS_FIXED_BM \
+ ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9)))
+#if !defined(CONFIG_LBDAF) && BITS_PER_LONG == 32
+#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_32
+#define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32
+#else
+#define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_FIXED_BM
+/* 16 TB in units of sectors */
+#if BITS_PER_LONG == 32
+/* adjust by one page worth of bitmap,
+ * so we won't wrap around in drbd_bm_find_next_bit.
+ * you should use 64bit OS for that much storage, anyways. */
+#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff)
+#else
+/* we allow up to 1 PiB now on 64bit architecture with "flexible" meta data */
+#define DRBD_MAX_SECTORS_FLEX (1UL << 51)
+/* corresponds to (1UL << 38) bits right now. */
+#endif
+#endif
+
+/* BIO_MAX_SIZE is 256 * PAGE_CACHE_SIZE,
+ * so for typical PAGE_CACHE_SIZE of 4k, that is (1<<20) Byte.
+ * Since we may live in a mixed-platform cluster,
+ * we limit us to a platform agnostic constant here for now.
+ * A followup commit may allow even bigger BIO sizes,
+ * once we thought that through. */
+#define DRBD_MAX_BIO_SIZE (1U << 20)
+#if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+#error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+#endif
+#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */
+
+#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
+#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
+
+/* For now, don't allow more than one activity log extent worth of data
+ * to be discarded in one go. We may need to rework drbd_al_begin_io()
+ * to allow for even larger discard ranges */
+#define DRBD_MAX_DISCARD_SIZE AL_EXTENT_SIZE
+#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9)
+
+extern int drbd_bm_init(struct drbd_device *device);
+extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits);
+extern void drbd_bm_cleanup(struct drbd_device *device);
+extern void drbd_bm_set_all(struct drbd_device *device);
+extern void drbd_bm_clear_all(struct drbd_device *device);
+/* set/clear/test only a few bits at a time */
+extern int drbd_bm_set_bits(
+ struct drbd_device *device, unsigned long s, unsigned long e);
+extern int drbd_bm_clear_bits(
+ struct drbd_device *device, unsigned long s, unsigned long e);
+extern int drbd_bm_count_bits(
+ struct drbd_device *device, const unsigned long s, const unsigned long e);
+/* bm_set_bits variant for use while holding drbd_bm_lock,
+ * may process the whole bitmap in one go */
+extern void _drbd_bm_set_bits(struct drbd_device *device,
+ const unsigned long s, const unsigned long e);
+extern int drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr);
+extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr);
+extern int drbd_bm_read(struct drbd_device *device) __must_hold(local);
+extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr);
+extern int drbd_bm_write(struct drbd_device *device) __must_hold(local);
+extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local);
+extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local);
+extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local);
+extern int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local);
+extern size_t drbd_bm_words(struct drbd_device *device);
+extern unsigned long drbd_bm_bits(struct drbd_device *device);
+extern sector_t drbd_bm_capacity(struct drbd_device *device);
+
+#define DRBD_END_OF_BITMAP (~(unsigned long)0)
+extern unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo);
+/* bm_find_next variants for use while you hold drbd_bm_lock() */
+extern unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo);
+extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo);
+extern unsigned long _drbd_bm_total_weight(struct drbd_device *device);
+extern unsigned long drbd_bm_total_weight(struct drbd_device *device);
+/* for receive_bitmap */
+extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset,
+ size_t number, unsigned long *buffer);
+/* for _drbd_send_bitmap */
+extern void drbd_bm_get_lel(struct drbd_device *device, size_t offset,
+ size_t number, unsigned long *buffer);
+
+extern void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags);
+extern void drbd_bm_unlock(struct drbd_device *device);
+/* drbd_main.c */
+
+extern struct kmem_cache *drbd_request_cache;
+extern struct kmem_cache *drbd_ee_cache; /* peer requests */
+extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
+extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
+extern mempool_t *drbd_request_mempool;
+extern mempool_t *drbd_ee_mempool;
+
+/* drbd's page pool, used to buffer data received from the peer,
+ * or data requested by the peer.
+ *
+ * This does not have an emergency reserve.
+ *
+ * When allocating from this pool, it first takes pages from the pool.
+ * Only if the pool is depleted will try to allocate from the system.
+ *
+ * The assumption is that pages taken from this pool will be processed,
+ * and given back, "quickly", and then can be recycled, so we can avoid
+ * frequent calls to alloc_page(), and still will be able to make progress even
+ * under memory pressure.
+ */
+extern struct page *drbd_pp_pool;
+extern spinlock_t drbd_pp_lock;
+extern int drbd_pp_vacant;
+extern wait_queue_head_t drbd_pp_wait;
+
+/* We also need a standard (emergency-reserve backed) page pool
+ * for meta data IO (activity log, bitmap).
+ * We can keep it global, as long as it is used as "N pages at a time".
+ * 128 should be plenty, currently we probably can get away with as few as 1.
+ */
+#define DRBD_MIN_POOL_PAGES 128
+extern mempool_t *drbd_md_io_page_pool;
+
+/* We also need to make sure we get a bio
+ * when we need it for housekeeping purposes */
+extern struct bio_set *drbd_md_io_bio_set;
+/* to allocate from that set */
+extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
+
+extern rwlock_t global_state_lock;
+
+extern int conn_lowest_minor(struct drbd_connection *connection);
+extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor);
+extern void drbd_destroy_device(struct kref *kref);
+extern void drbd_delete_device(struct drbd_device *device);
+
+extern struct drbd_resource *drbd_create_resource(const char *name);
+extern void drbd_free_resource(struct drbd_resource *resource);
+
+extern int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts);
+extern struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts);
+extern void drbd_destroy_connection(struct kref *kref);
+extern struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len,
+ void *peer_addr, int peer_addr_len);
+extern struct drbd_resource *drbd_find_resource(const char *name);
+extern void drbd_destroy_resource(struct kref *kref);
+extern void conn_free_crypto(struct drbd_connection *connection);
+
+extern int proc_details;
+
+/* drbd_req */
+extern void do_submit(struct work_struct *ws);
+extern void __drbd_make_request(struct drbd_device *, struct bio *, unsigned long);
+extern void drbd_make_request(struct request_queue *q, struct bio *bio);
+extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req);
+extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
+extern int is_valid_ar_handle(struct drbd_request *, sector_t);
+
+
+/* drbd_nl.c */
+extern void drbd_suspend_io(struct drbd_device *device);
+extern void drbd_resume_io(struct drbd_device *device);
+extern char *ppsize(char *buf, unsigned long long size);
+extern sector_t drbd_new_dev_size(struct drbd_device *, struct drbd_backing_dev *, sector_t, int);
+enum determine_dev_size {
+ DS_ERROR_SHRINK = -3,
+ DS_ERROR_SPACE_MD = -2,
+ DS_ERROR = -1,
+ DS_UNCHANGED = 0,
+ DS_SHRUNK = 1,
+ DS_GREW = 2,
+ DS_GREW_FROM_ZERO = 3,
+};
+extern enum determine_dev_size
+drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local);
+extern void resync_after_online_grow(struct drbd_device *);
+extern void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev);
+extern enum drbd_state_rv drbd_set_role(struct drbd_device *device,
+ enum drbd_role new_role,
+ int force);
+extern bool conn_try_outdate_peer(struct drbd_connection *connection);
+extern void conn_try_outdate_peer_async(struct drbd_connection *connection);
+extern int drbd_khelper(struct drbd_device *device, char *cmd);
+
+/* drbd_worker.c */
+/* bi_end_io handlers */
+extern void drbd_md_endio(struct bio *bio, int error);
+extern void drbd_peer_request_endio(struct bio *bio, int error);
+extern void drbd_request_endio(struct bio *bio, int error);
+extern int drbd_worker(struct drbd_thread *thi);
+enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor);
+void drbd_resync_after_changed(struct drbd_device *device);
+extern void drbd_start_resync(struct drbd_device *device, enum drbd_conns side);
+extern void resume_next_sg(struct drbd_device *device);
+extern void suspend_other_sg(struct drbd_device *device);
+extern int drbd_resync_finished(struct drbd_device *device);
+/* maybe rather drbd_main.c ? */
+extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent);
+extern void drbd_md_put_buffer(struct drbd_device *device);
+extern int drbd_md_sync_page_io(struct drbd_device *device,
+ struct drbd_backing_dev *bdev, sector_t sector, int rw);
+extern void drbd_ov_out_of_sync_found(struct drbd_device *, sector_t, int);
+extern void wait_until_done_or_force_detached(struct drbd_device *device,
+ struct drbd_backing_dev *bdev, unsigned int *done);
+extern void drbd_rs_controller_reset(struct drbd_device *device);
+
+static inline void ov_out_of_sync_print(struct drbd_device *device)
+{
+ if (device->ov_last_oos_size) {
+ drbd_err(device, "Out of sync: start=%llu, size=%lu (sectors)\n",
+ (unsigned long long)device->ov_last_oos_start,
+ (unsigned long)device->ov_last_oos_size);
+ }
+ device->ov_last_oos_size = 0;
+}
+
+
+extern void drbd_csum_bio(struct crypto_hash *, struct bio *, void *);
+extern void drbd_csum_ee(struct crypto_hash *, struct drbd_peer_request *, void *);
+/* worker callbacks */
+extern int w_e_end_data_req(struct drbd_work *, int);
+extern int w_e_end_rsdata_req(struct drbd_work *, int);
+extern int w_e_end_csum_rs_req(struct drbd_work *, int);
+extern int w_e_end_ov_reply(struct drbd_work *, int);
+extern int w_e_end_ov_req(struct drbd_work *, int);
+extern int w_ov_finished(struct drbd_work *, int);
+extern int w_resync_timer(struct drbd_work *, int);
+extern int w_send_write_hint(struct drbd_work *, int);
+extern int w_send_dblock(struct drbd_work *, int);
+extern int w_send_read_req(struct drbd_work *, int);
+extern int w_e_reissue(struct drbd_work *, int);
+extern int w_restart_disk_io(struct drbd_work *, int);
+extern int w_send_out_of_sync(struct drbd_work *, int);
+extern int w_start_resync(struct drbd_work *, int);
+
+extern void resync_timer_fn(unsigned long data);
+extern void start_resync_timer_fn(unsigned long data);
+
+extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
+
+/* drbd_receiver.c */
+extern int drbd_receiver(struct drbd_thread *thi);
+extern int drbd_asender(struct drbd_thread *thi);
+extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
+extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
+ bool throttle_if_app_is_waiting);
+extern int drbd_submit_peer_request(struct drbd_device *,
+ struct drbd_peer_request *, const unsigned,
+ const int);
+extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
+extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
+ sector_t, unsigned int,
+ bool,
+ gfp_t) __must_hold(local);
+extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
+ int);
+#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0)
+#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1)
+extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool);
+extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled);
+extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed);
+extern int drbd_connected(struct drbd_peer_device *);
+
+static inline void drbd_tcp_cork(struct socket *sock)
+{
+ int val = 1;
+ (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK,
+ (char*)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_uncork(struct socket *sock)
+{
+ int val = 0;
+ (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK,
+ (char*)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_nodelay(struct socket *sock)
+{
+ int val = 1;
+ (void) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
+ (char*)&val, sizeof(val));
+}
+
+static inline void drbd_tcp_quickack(struct socket *sock)
+{
+ int val = 2;
+ (void) kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
+ (char*)&val, sizeof(val));
+}
+
+/* sets the number of 512 byte sectors of our virtual device */
+static inline void drbd_set_my_capacity(struct drbd_device *device,
+ sector_t size)
+{
+ /* set_capacity(device->this_bdev->bd_disk, size); */
+ set_capacity(device->vdisk, size);
+ device->this_bdev->bd_inode->i_size = (loff_t)size << 9;
+}
+
+/*
+ * used to submit our private bio
+ */
+static inline void drbd_generic_make_request(struct drbd_device *device,
+ int fault_type, struct bio *bio)
+{
+ __release(local);
+ if (!bio->bi_bdev) {
+ drbd_err(device, "drbd_generic_make_request: bio->bi_bdev == NULL\n");
+ bio_endio(bio, -ENODEV);
+ return;
+ }
+
+ if (drbd_insert_fault(device, fault_type))
+ bio_endio(bio, -EIO);
+ else
+ generic_make_request(bio);
+}
+
+void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
+ enum write_ordering_e wo);
+
+/* drbd_proc.c */
+extern struct proc_dir_entry *drbd_proc;
+extern const struct file_operations drbd_proc_fops;
+extern const char *drbd_conn_str(enum drbd_conns s);
+extern const char *drbd_role_str(enum drbd_role s);
+
+/* drbd_actlog.c */
+extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
+extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i);
+extern void drbd_al_begin_io_commit(struct drbd_device *device);
+extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i);
+extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i);
+extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i);
+extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector);
+extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector);
+extern int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector);
+extern void drbd_rs_cancel_all(struct drbd_device *device);
+extern int drbd_rs_del_all(struct drbd_device *device);
+extern void drbd_rs_failed_io(struct drbd_device *device,
+ sector_t sector, int size);
+extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go);
+
+enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC };
+extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
+ enum update_sync_bits_mode mode);
+#define drbd_set_in_sync(device, sector, size) \
+ __drbd_change_sync(device, sector, size, SET_IN_SYNC)
+#define drbd_set_out_of_sync(device, sector, size) \
+ __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC)
+#define drbd_rs_failed_io(device, sector, size) \
+ __drbd_change_sync(device, sector, size, RECORD_RS_FAILED)
+extern void drbd_al_shrink(struct drbd_device *device);
+extern int drbd_initialize_al(struct drbd_device *, void *);
+
+/* drbd_nl.c */
+/* state info broadcast */
+struct sib_info {
+ enum drbd_state_info_bcast_reason sib_reason;
+ union {
+ struct {
+ char *helper_name;
+ unsigned helper_exit_code;
+ };
+ struct {
+ union drbd_state os;
+ union drbd_state ns;
+ };
+ };
+};
+void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib);
+
+/*
+ * inline helper functions
+ *************************/
+
+/* see also page_chain_add and friends in drbd_receiver.c */
+static inline struct page *page_chain_next(struct page *page)
+{
+ return (struct page *)page_private(page);
+}
+#define page_chain_for_each(page) \
+ for (; page && ({ prefetch(page_chain_next(page)); 1; }); \
+ page = page_chain_next(page))
+#define page_chain_for_each_safe(page, n) \
+ for (; page && ({ n = page_chain_next(page); 1; }); page = n)
+
+
+static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req)
+{
+ struct page *page = peer_req->pages;
+ page_chain_for_each(page) {
+ if (page_count(page) > 1)
+ return 1;
+ }
+ return 0;
+}
+
+static inline enum drbd_state_rv
+_drbd_set_state(struct drbd_device *device, union drbd_state ns,
+ enum chg_state_flags flags, struct completion *done)
+{
+ enum drbd_state_rv rv;
+
+ read_lock(&global_state_lock);
+ rv = __drbd_set_state(device, ns, flags, done);
+ read_unlock(&global_state_lock);
+
+ return rv;
+}
+
+static inline union drbd_state drbd_read_state(struct drbd_device *device)
+{
+ struct drbd_resource *resource = device->resource;
+ union drbd_state rv;
+
+ rv.i = device->state.i;
+ rv.susp = resource->susp;
+ rv.susp_nod = resource->susp_nod;
+ rv.susp_fen = resource->susp_fen;
+
+ return rv;
+}
+
+enum drbd_force_detach_flags {
+ DRBD_READ_ERROR,
+ DRBD_WRITE_ERROR,
+ DRBD_META_IO_ERROR,
+ DRBD_FORCE_DETACH,
+};
+
+#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
+static inline void __drbd_chk_io_error_(struct drbd_device *device,
+ enum drbd_force_detach_flags df,
+ const char *where)
+{
+ enum drbd_io_error_p ep;
+
+ rcu_read_lock();
+ ep = rcu_dereference(device->ldev->disk_conf)->on_io_error;
+ rcu_read_unlock();
+ switch (ep) {
+ case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */
+ if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) {
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "Local IO failed in %s.\n", where);
+ if (device->state.disk > D_INCONSISTENT)
+ _drbd_set_state(_NS(device, disk, D_INCONSISTENT), CS_HARD, NULL);
+ break;
+ }
+ /* NOTE fall through for DRBD_META_IO_ERROR or DRBD_FORCE_DETACH */
+ case EP_DETACH:
+ case EP_CALL_HELPER:
+ /* Remember whether we saw a READ or WRITE error.
+ *
+ * Recovery of the affected area for WRITE failure is covered
+ * by the activity log.
+ * READ errors may fall outside that area though. Certain READ
+ * errors can be "healed" by writing good data to the affected
+ * blocks, which triggers block re-allocation in lower layers.
+ *
+ * If we can not write the bitmap after a READ error,
+ * we may need to trigger a full sync (see w_go_diskless()).
+ *
+ * Force-detach is not really an IO error, but rather a
+ * desperate measure to try to deal with a completely
+ * unresponsive lower level IO stack.
+ * Still it should be treated as a WRITE error.
+ *
+ * Meta IO error is always WRITE error:
+ * we read meta data only once during attach,
+ * which will fail in case of errors.
+ */
+ set_bit(WAS_IO_ERROR, &device->flags);
+ if (df == DRBD_READ_ERROR)
+ set_bit(WAS_READ_ERROR, &device->flags);
+ if (df == DRBD_FORCE_DETACH)
+ set_bit(FORCE_DETACH, &device->flags);
+ if (device->state.disk > D_FAILED) {
+ _drbd_set_state(_NS(device, disk, D_FAILED), CS_HARD, NULL);
+ drbd_err(device,
+ "Local IO failed in %s. Detaching...\n", where);
+ }
+ break;
+ }
+}
+
+/**
+ * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers
+ * @device: DRBD device.
+ * @error: Error code passed to the IO completion callback
+ * @forcedetach: Force detach. I.e. the error happened while accessing the meta data
+ *
+ * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED)
+ */
+#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__)
+static inline void drbd_chk_io_error_(struct drbd_device *device,
+ int error, enum drbd_force_detach_flags forcedetach, const char *where)
+{
+ if (error) {
+ unsigned long flags;
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ __drbd_chk_io_error_(device, forcedetach, where);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+ }
+}
+
+
+/**
+ * drbd_md_first_sector() - Returns the first sector number of the meta data area
+ * @bdev: Meta data block device.
+ *
+ * BTW, for internal meta data, this happens to be the maximum capacity
+ * we could agree upon with our peer node.
+ */
+static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
+{
+ switch (bdev->md.meta_dev_idx) {
+ case DRBD_MD_INDEX_INTERNAL:
+ case DRBD_MD_INDEX_FLEX_INT:
+ return bdev->md.md_offset + bdev->md.bm_offset;
+ case DRBD_MD_INDEX_FLEX_EXT:
+ default:
+ return bdev->md.md_offset;
+ }
+}
+
+/**
+ * drbd_md_last_sector() - Return the last sector number of the meta data area
+ * @bdev: Meta data block device.
+ */
+static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
+{
+ switch (bdev->md.meta_dev_idx) {
+ case DRBD_MD_INDEX_INTERNAL:
+ case DRBD_MD_INDEX_FLEX_INT:
+ return bdev->md.md_offset + MD_4kB_SECT -1;
+ case DRBD_MD_INDEX_FLEX_EXT:
+ default:
+ return bdev->md.md_offset + bdev->md.md_size_sect -1;
+ }
+}
+
+/* Returns the number of 512 byte sectors of the device */
+static inline sector_t drbd_get_capacity(struct block_device *bdev)
+{
+ /* return bdev ? get_capacity(bdev->bd_disk) : 0; */
+ return bdev ? i_size_read(bdev->bd_inode) >> 9 : 0;
+}
+
+/**
+ * drbd_get_max_capacity() - Returns the capacity we announce to out peer
+ * @bdev: Meta data block device.
+ *
+ * returns the capacity we announce to out peer. we clip ourselves at the
+ * various MAX_SECTORS, because if we don't, current implementation will
+ * oops sooner or later
+ */
+static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
+{
+ sector_t s;
+
+ switch (bdev->md.meta_dev_idx) {
+ case DRBD_MD_INDEX_INTERNAL:
+ case DRBD_MD_INDEX_FLEX_INT:
+ s = drbd_get_capacity(bdev->backing_bdev)
+ ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
+ drbd_md_first_sector(bdev))
+ : 0;
+ break;
+ case DRBD_MD_INDEX_FLEX_EXT:
+ s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
+ drbd_get_capacity(bdev->backing_bdev));
+ /* clip at maximum size the meta device can support */
+ s = min_t(sector_t, s,
+ BM_EXT_TO_SECT(bdev->md.md_size_sect
+ - bdev->md.bm_offset));
+ break;
+ default:
+ s = min_t(sector_t, DRBD_MAX_SECTORS,
+ drbd_get_capacity(bdev->backing_bdev));
+ }
+ return s;
+}
+
+/**
+ * drbd_md_ss() - Return the sector number of our meta data super block
+ * @bdev: Meta data block device.
+ */
+static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev)
+{
+ const int meta_dev_idx = bdev->md.meta_dev_idx;
+
+ if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT)
+ return 0;
+
+ /* Since drbd08, internal meta data is always "flexible".
+ * position: last 4k aligned block of 4k size */
+ if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+ meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)
+ return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8;
+
+ /* external, some index; this is the old fixed size layout */
+ return MD_128MB_SECT * bdev->md.meta_dev_idx;
+}
+
+static inline void
+drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&q->q_lock, flags);
+ list_add_tail(&w->list, &q->q);
+ spin_unlock_irqrestore(&q->q_lock, flags);
+ wake_up(&q->q_wait);
+}
+
+static inline void
+drbd_queue_work_if_unqueued(struct drbd_work_queue *q, struct drbd_work *w)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&q->q_lock, flags);
+ if (list_empty_careful(&w->list))
+ list_add_tail(&w->list, &q->q);
+ spin_unlock_irqrestore(&q->q_lock, flags);
+ wake_up(&q->q_wait);
+}
+
+static inline void
+drbd_device_post_work(struct drbd_device *device, int work_bit)
+{
+ if (!test_and_set_bit(work_bit, &device->flags)) {
+ struct drbd_connection *connection =
+ first_peer_device(device)->connection;
+ struct drbd_work_queue *q = &connection->sender_work;
+ if (!test_and_set_bit(DEVICE_WORK_PENDING, &connection->flags))
+ wake_up(&q->q_wait);
+ }
+}
+
+extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue);
+
+static inline void wake_asender(struct drbd_connection *connection)
+{
+ if (test_bit(SIGNAL_ASENDER, &connection->flags))
+ force_sig(DRBD_SIG, connection->asender.task);
+}
+
+static inline void request_ping(struct drbd_connection *connection)
+{
+ set_bit(SEND_PING, &connection->flags);
+ wake_asender(connection);
+}
+
+extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *);
+extern void *drbd_prepare_command(struct drbd_peer_device *, struct drbd_socket *);
+extern int conn_send_command(struct drbd_connection *, struct drbd_socket *,
+ enum drbd_packet, unsigned int, void *,
+ unsigned int);
+extern int drbd_send_command(struct drbd_peer_device *, struct drbd_socket *,
+ enum drbd_packet, unsigned int, void *,
+ unsigned int);
+
+extern int drbd_send_ping(struct drbd_connection *connection);
+extern int drbd_send_ping_ack(struct drbd_connection *connection);
+extern int drbd_send_state_req(struct drbd_peer_device *, union drbd_state, union drbd_state);
+extern int conn_send_state_req(struct drbd_connection *, union drbd_state, union drbd_state);
+
+static inline void drbd_thread_stop(struct drbd_thread *thi)
+{
+ _drbd_thread_stop(thi, false, true);
+}
+
+static inline void drbd_thread_stop_nowait(struct drbd_thread *thi)
+{
+ _drbd_thread_stop(thi, false, false);
+}
+
+static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
+{
+ _drbd_thread_stop(thi, true, false);
+}
+
+/* counts how many answer packets packets we expect from our peer,
+ * for either explicit application requests,
+ * or implicit barrier packets as necessary.
+ * increased:
+ * w_send_barrier
+ * _req_mod(req, QUEUE_FOR_NET_WRITE or QUEUE_FOR_NET_READ);
+ * it is much easier and equally valid to count what we queue for the
+ * worker, even before it actually was queued or send.
+ * (drbd_make_request_common; recovery path on read io-error)
+ * decreased:
+ * got_BarrierAck (respective tl_clear, tl_clear_barrier)
+ * _req_mod(req, DATA_RECEIVED)
+ * [from receive_DataReply]
+ * _req_mod(req, WRITE_ACKED_BY_PEER or RECV_ACKED_BY_PEER or NEG_ACKED)
+ * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
+ * for some reason it is NOT decreased in got_NegAck,
+ * but in the resulting cleanup code from report_params.
+ * we should try to remember the reason for that...
+ * _req_mod(req, SEND_FAILED or SEND_CANCELED)
+ * _req_mod(req, CONNECTION_LOST_WHILE_PENDING)
+ * [from tl_clear_barrier]
+ */
+static inline void inc_ap_pending(struct drbd_device *device)
+{
+ atomic_inc(&device->ap_pending_cnt);
+}
+
+#define ERR_IF_CNT_IS_NEGATIVE(which, func, line) \
+ if (atomic_read(&device->which) < 0) \
+ drbd_err(device, "in %s:%d: " #which " = %d < 0 !\n", \
+ func, line, \
+ atomic_read(&device->which))
+
+#define dec_ap_pending(device) _dec_ap_pending(device, __func__, __LINE__)
+static inline void _dec_ap_pending(struct drbd_device *device, const char *func, int line)
+{
+ if (atomic_dec_and_test(&device->ap_pending_cnt))
+ wake_up(&device->misc_wait);
+ ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line);
+}
+
+/* counts how many resync-related answers we still expect from the peer
+ * increase decrease
+ * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY)
+ * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK with ID_SYNCER)
+ * (or P_NEG_ACK with ID_SYNCER)
+ */
+static inline void inc_rs_pending(struct drbd_device *device)
+{
+ atomic_inc(&device->rs_pending_cnt);
+}
+
+#define dec_rs_pending(device) _dec_rs_pending(device, __func__, __LINE__)
+static inline void _dec_rs_pending(struct drbd_device *device, const char *func, int line)
+{
+ atomic_dec(&device->rs_pending_cnt);
+ ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line);
+}
+
+/* counts how many answers we still need to send to the peer.
+ * increased on
+ * receive_Data unless protocol A;
+ * we need to send a P_RECV_ACK (proto B)
+ * or P_WRITE_ACK (proto C)
+ * receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK
+ * receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA
+ * receive_Barrier_* we need to send a P_BARRIER_ACK
+ */
+static inline void inc_unacked(struct drbd_device *device)
+{
+ atomic_inc(&device->unacked_cnt);
+}
+
+#define dec_unacked(device) _dec_unacked(device, __func__, __LINE__)
+static inline void _dec_unacked(struct drbd_device *device, const char *func, int line)
+{
+ atomic_dec(&device->unacked_cnt);
+ ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
+}
+
+#define sub_unacked(device, n) _sub_unacked(device, n, __func__, __LINE__)
+static inline void _sub_unacked(struct drbd_device *device, int n, const char *func, int line)
+{
+ atomic_sub(n, &device->unacked_cnt);
+ ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
+}
+
+static inline bool is_sync_state(enum drbd_conns connection_state)
+{
+ return
+ (connection_state == C_SYNC_SOURCE
+ || connection_state == C_SYNC_TARGET
+ || connection_state == C_PAUSED_SYNC_S
+ || connection_state == C_PAUSED_SYNC_T);
+}
+
+/**
+ * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev
+ * @_device: DRBD device.
+ * @_min_state: Minimum device state required for success.
+ *
+ * You have to call put_ldev() when finished working with device->ldev.
+ */
+#define get_ldev_if_state(_device, _min_state) \
+ (_get_ldev_if_state((_device), (_min_state)) ? \
+ ({ __acquire(x); true; }) : false)
+#define get_ldev(_device) get_ldev_if_state(_device, D_INCONSISTENT)
+
+static inline void put_ldev(struct drbd_device *device)
+{
+ enum drbd_disk_state disk_state = device->state.disk;
+ /* We must check the state *before* the atomic_dec becomes visible,
+ * or we have a theoretical race where someone hitting zero,
+ * while state still D_FAILED, will then see D_DISKLESS in the
+ * condition below and calling into destroy, where he must not, yet. */
+ int i = atomic_dec_return(&device->local_cnt);
+
+ /* This may be called from some endio handler,
+ * so we must not sleep here. */
+
+ __release(local);
+ D_ASSERT(device, i >= 0);
+ if (i == 0) {
+ if (disk_state == D_DISKLESS)
+ /* even internal references gone, safe to destroy */
+ drbd_device_post_work(device, DESTROY_DISK);
+ if (disk_state == D_FAILED)
+ /* all application IO references gone. */
+ if (!test_and_set_bit(GOING_DISKLESS, &device->flags))
+ drbd_device_post_work(device, GO_DISKLESS);
+ wake_up(&device->misc_wait);
+ }
+}
+
+#ifndef __CHECKER__
+static inline int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins)
+{
+ int io_allowed;
+
+ /* never get a reference while D_DISKLESS */
+ if (device->state.disk == D_DISKLESS)
+ return 0;
+
+ atomic_inc(&device->local_cnt);
+ io_allowed = (device->state.disk >= mins);
+ if (!io_allowed)
+ put_ldev(device);
+ return io_allowed;
+}
+#else
+extern int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins);
+#endif
+
+/* this throttles on-the-fly application requests
+ * according to max_buffers settings;
+ * maybe re-implement using semaphores? */
+static inline int drbd_get_max_buffers(struct drbd_device *device)
+{
+ struct net_conf *nc;
+ int mxb;
+
+ rcu_read_lock();
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ mxb = nc ? nc->max_buffers : 1000000; /* arbitrary limit on open requests */
+ rcu_read_unlock();
+
+ return mxb;
+}
+
+static inline int drbd_state_is_stable(struct drbd_device *device)
+{
+ union drbd_dev_state s = device->state;
+
+ /* DO NOT add a default clause, we want the compiler to warn us
+ * for any newly introduced state we may have forgotten to add here */
+
+ switch ((enum drbd_conns)s.conn) {
+ /* new io only accepted when there is no connection, ... */
+ case C_STANDALONE:
+ case C_WF_CONNECTION:
+ /* ... or there is a well established connection. */
+ case C_CONNECTED:
+ case C_SYNC_SOURCE:
+ case C_SYNC_TARGET:
+ case C_VERIFY_S:
+ case C_VERIFY_T:
+ case C_PAUSED_SYNC_S:
+ case C_PAUSED_SYNC_T:
+ case C_AHEAD:
+ case C_BEHIND:
+ /* transitional states, IO allowed */
+ case C_DISCONNECTING:
+ case C_UNCONNECTED:
+ case C_TIMEOUT:
+ case C_BROKEN_PIPE:
+ case C_NETWORK_FAILURE:
+ case C_PROTOCOL_ERROR:
+ case C_TEAR_DOWN:
+ case C_WF_REPORT_PARAMS:
+ case C_STARTING_SYNC_S:
+ case C_STARTING_SYNC_T:
+ break;
+
+ /* Allow IO in BM exchange states with new protocols */
+ case C_WF_BITMAP_S:
+ if (first_peer_device(device)->connection->agreed_pro_version < 96)
+ return 0;
+ break;
+
+ /* no new io accepted in these states */
+ case C_WF_BITMAP_T:
+ case C_WF_SYNC_UUID:
+ case C_MASK:
+ /* not "stable" */
+ return 0;
+ }
+
+ switch ((enum drbd_disk_state)s.disk) {
+ case D_DISKLESS:
+ case D_INCONSISTENT:
+ case D_OUTDATED:
+ case D_CONSISTENT:
+ case D_UP_TO_DATE:
+ case D_FAILED:
+ /* disk state is stable as well. */
+ break;
+
+ /* no new io accepted during transitional states */
+ case D_ATTACHING:
+ case D_NEGOTIATING:
+ case D_UNKNOWN:
+ case D_MASK:
+ /* not "stable" */
+ return 0;
+ }
+
+ return 1;
+}
+
+static inline int drbd_suspended(struct drbd_device *device)
+{
+ struct drbd_resource *resource = device->resource;
+
+ return resource->susp || resource->susp_fen || resource->susp_nod;
+}
+
+static inline bool may_inc_ap_bio(struct drbd_device *device)
+{
+ int mxb = drbd_get_max_buffers(device);
+
+ if (drbd_suspended(device))
+ return false;
+ if (test_bit(SUSPEND_IO, &device->flags))
+ return false;
+
+ /* to avoid potential deadlock or bitmap corruption,
+ * in various places, we only allow new application io
+ * to start during "stable" states. */
+
+ /* no new io accepted when attaching or detaching the disk */
+ if (!drbd_state_is_stable(device))
+ return false;
+
+ /* since some older kernels don't have atomic_add_unless,
+ * and we are within the spinlock anyways, we have this workaround. */
+ if (atomic_read(&device->ap_bio_cnt) > mxb)
+ return false;
+ if (test_bit(BITMAP_IO, &device->flags))
+ return false;
+ return true;
+}
+
+static inline bool inc_ap_bio_cond(struct drbd_device *device)
+{
+ bool rv = false;
+
+ spin_lock_irq(&device->resource->req_lock);
+ rv = may_inc_ap_bio(device);
+ if (rv)
+ atomic_inc(&device->ap_bio_cnt);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ return rv;
+}
+
+static inline void inc_ap_bio(struct drbd_device *device)
+{
+ /* we wait here
+ * as long as the device is suspended
+ * until the bitmap is no longer on the fly during connection
+ * handshake as long as we would exceed the max_buffer limit.
+ *
+ * to avoid races with the reconnect code,
+ * we need to atomic_inc within the spinlock. */
+
+ wait_event(device->misc_wait, inc_ap_bio_cond(device));
+}
+
+static inline void dec_ap_bio(struct drbd_device *device)
+{
+ int mxb = drbd_get_max_buffers(device);
+ int ap_bio = atomic_dec_return(&device->ap_bio_cnt);
+
+ D_ASSERT(device, ap_bio >= 0);
+
+ if (ap_bio == 0 && test_bit(BITMAP_IO, &device->flags)) {
+ if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))
+ drbd_queue_work(&first_peer_device(device)->
+ connection->sender_work,
+ &device->bm_io_work.w);
+ }
+
+ /* this currently does wake_up for every dec_ap_bio!
+ * maybe rather introduce some type of hysteresis?
+ * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
+ if (ap_bio < mxb)
+ wake_up(&device->misc_wait);
+}
+
+static inline bool verify_can_do_stop_sector(struct drbd_device *device)
+{
+ return first_peer_device(device)->connection->agreed_pro_version >= 97 &&
+ first_peer_device(device)->connection->agreed_pro_version != 100;
+}
+
+static inline int drbd_set_ed_uuid(struct drbd_device *device, u64 val)
+{
+ int changed = device->ed_uuid != val;
+ device->ed_uuid = val;
+ return changed;
+}
+
+static inline int drbd_queue_order_type(struct drbd_device *device)
+{
+ /* sorry, we currently have no working implementation
+ * of distributed TCQ stuff */
+#ifndef QUEUE_ORDERED_NONE
+#define QUEUE_ORDERED_NONE 0
+#endif
+ return QUEUE_ORDERED_NONE;
+}
+
+static inline struct drbd_connection *first_connection(struct drbd_resource *resource)
+{
+ return list_first_entry_or_null(&resource->connections,
+ struct drbd_connection, connections);
+}
+
+#endif
diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c
new file mode 100644
index 000000000..51b25ad85
--- /dev/null
+++ b/drivers/block/drbd/drbd_interval.c
@@ -0,0 +1,179 @@
+#include <asm/bug.h>
+#include <linux/rbtree_augmented.h>
+#include "drbd_interval.h"
+
+/**
+ * interval_end - return end of @node
+ */
+static inline
+sector_t interval_end(struct rb_node *node)
+{
+ struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb);
+ return this->end;
+}
+
+/**
+ * compute_subtree_last - compute end of @node
+ *
+ * The end of an interval is the highest (start + (size >> 9)) value of this
+ * node and of its children. Called for @node and its parents whenever the end
+ * may have changed.
+ */
+static inline sector_t
+compute_subtree_last(struct drbd_interval *node)
+{
+ sector_t max = node->sector + (node->size >> 9);
+
+ if (node->rb.rb_left) {
+ sector_t left = interval_end(node->rb.rb_left);
+ if (left > max)
+ max = left;
+ }
+ if (node->rb.rb_right) {
+ sector_t right = interval_end(node->rb.rb_right);
+ if (right > max)
+ max = right;
+ }
+ return max;
+}
+
+RB_DECLARE_CALLBACKS(static, augment_callbacks, struct drbd_interval, rb,
+ sector_t, end, compute_subtree_last);
+
+/**
+ * drbd_insert_interval - insert a new interval into a tree
+ */
+bool
+drbd_insert_interval(struct rb_root *root, struct drbd_interval *this)
+{
+ struct rb_node **new = &root->rb_node, *parent = NULL;
+ sector_t this_end = this->sector + (this->size >> 9);
+
+ BUG_ON(!IS_ALIGNED(this->size, 512));
+
+ while (*new) {
+ struct drbd_interval *here =
+ rb_entry(*new, struct drbd_interval, rb);
+
+ parent = *new;
+ if (here->end < this_end)
+ here->end = this_end;
+ if (this->sector < here->sector)
+ new = &(*new)->rb_left;
+ else if (this->sector > here->sector)
+ new = &(*new)->rb_right;
+ else if (this < here)
+ new = &(*new)->rb_left;
+ else if (this > here)
+ new = &(*new)->rb_right;
+ else
+ return false;
+ }
+
+ this->end = this_end;
+ rb_link_node(&this->rb, parent, new);
+ rb_insert_augmented(&this->rb, root, &augment_callbacks);
+ return true;
+}
+
+/**
+ * drbd_contains_interval - check if a tree contains a given interval
+ * @sector: start sector of @interval
+ * @interval: may not be a valid pointer
+ *
+ * Returns if the tree contains the node @interval with start sector @start.
+ * Does not dereference @interval until @interval is known to be a valid object
+ * in @tree. Returns %false if @interval is in the tree but with a different
+ * sector number.
+ */
+bool
+drbd_contains_interval(struct rb_root *root, sector_t sector,
+ struct drbd_interval *interval)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct drbd_interval *here =
+ rb_entry(node, struct drbd_interval, rb);
+
+ if (sector < here->sector)
+ node = node->rb_left;
+ else if (sector > here->sector)
+ node = node->rb_right;
+ else if (interval < here)
+ node = node->rb_left;
+ else if (interval > here)
+ node = node->rb_right;
+ else
+ return true;
+ }
+ return false;
+}
+
+/**
+ * drbd_remove_interval - remove an interval from a tree
+ */
+void
+drbd_remove_interval(struct rb_root *root, struct drbd_interval *this)
+{
+ rb_erase_augmented(&this->rb, root, &augment_callbacks);
+}
+
+/**
+ * drbd_find_overlap - search for an interval overlapping with [sector, sector + size)
+ * @sector: start sector
+ * @size: size, aligned to 512 bytes
+ *
+ * Returns an interval overlapping with [sector, sector + size), or NULL if
+ * there is none. When there is more than one overlapping interval in the
+ * tree, the interval with the lowest start sector is returned, and all other
+ * overlapping intervals will be on the right side of the tree, reachable with
+ * rb_next().
+ */
+struct drbd_interval *
+drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size)
+{
+ struct rb_node *node = root->rb_node;
+ struct drbd_interval *overlap = NULL;
+ sector_t end = sector + (size >> 9);
+
+ BUG_ON(!IS_ALIGNED(size, 512));
+
+ while (node) {
+ struct drbd_interval *here =
+ rb_entry(node, struct drbd_interval, rb);
+
+ if (node->rb_left &&
+ sector < interval_end(node->rb_left)) {
+ /* Overlap if any must be on left side */
+ node = node->rb_left;
+ } else if (here->sector < end &&
+ sector < here->sector + (here->size >> 9)) {
+ overlap = here;
+ break;
+ } else if (sector >= here->sector) {
+ /* Overlap if any must be on right side */
+ node = node->rb_right;
+ } else
+ break;
+ }
+ return overlap;
+}
+
+struct drbd_interval *
+drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size)
+{
+ sector_t end = sector + (size >> 9);
+ struct rb_node *node;
+
+ for (;;) {
+ node = rb_next(&i->rb);
+ if (!node)
+ return NULL;
+ i = rb_entry(node, struct drbd_interval, rb);
+ if (i->sector >= end)
+ return NULL;
+ if (sector < i->sector + (i->size >> 9))
+ return i;
+ }
+}
diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h
new file mode 100644
index 000000000..f210543f0
--- /dev/null
+++ b/drivers/block/drbd/drbd_interval.h
@@ -0,0 +1,42 @@
+#ifndef __DRBD_INTERVAL_H
+#define __DRBD_INTERVAL_H
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+struct drbd_interval {
+ struct rb_node rb;
+ sector_t sector; /* start sector of the interval */
+ unsigned int size; /* size in bytes */
+ sector_t end; /* highest interval end in subtree */
+ int local:1 /* local or remote request? */;
+ int waiting:1; /* someone is waiting for this to complete */
+ int completed:1; /* this has been completed already;
+ * ignore for conflict detection */
+};
+
+static inline void drbd_clear_interval(struct drbd_interval *i)
+{
+ RB_CLEAR_NODE(&i->rb);
+}
+
+static inline bool drbd_interval_empty(struct drbd_interval *i)
+{
+ return RB_EMPTY_NODE(&i->rb);
+}
+
+extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *);
+extern bool drbd_contains_interval(struct rb_root *, sector_t,
+ struct drbd_interval *);
+extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *);
+extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t,
+ unsigned int);
+extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t,
+ unsigned int);
+
+#define drbd_for_each_overlap(i, root, sector, size) \
+ for (i = drbd_find_overlap(root, sector, size); \
+ i; \
+ i = drbd_next_overlap(i, sector, size))
+
+#endif /* __DRBD_INTERVAL_H */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
new file mode 100644
index 000000000..81fde9ef7
--- /dev/null
+++ b/drivers/block/drbd/drbd_main.c
@@ -0,0 +1,3844 @@
+/*
+ drbd.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
+ from Logicworks, Inc. for making SDP replication support possible.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/drbd.h>
+#include <asm/uaccess.h>
+#include <asm/types.h>
+#include <net/sock.h>
+#include <linux/ctype.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/reboot.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+#include <linux/workqueue.h>
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+#include <linux/vmalloc.h>
+
+#include <linux/drbd_limits.h>
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
+#include "drbd_vli.h"
+#include "drbd_debugfs.h"
+
+static DEFINE_MUTEX(drbd_main_mutex);
+static int drbd_open(struct block_device *bdev, fmode_t mode);
+static void drbd_release(struct gendisk *gd, fmode_t mode);
+static void md_sync_timer_fn(unsigned long data);
+static int w_bitmap_io(struct drbd_work *w, int unused);
+
+MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
+ "Lars Ellenberg <lars@linbit.com>");
+MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
+MODULE_VERSION(REL_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices ("
+ __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
+MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
+
+#include <linux/moduleparam.h>
+/* allow_open_on_secondary */
+MODULE_PARM_DESC(allow_oos, "DONT USE!");
+/* thanks to these macros, if compiled into the kernel (not-module),
+ * this becomes the boot parameter drbd.minor_count */
+module_param(minor_count, uint, 0444);
+module_param(disable_sendpage, bool, 0644);
+module_param(allow_oos, bool, 0);
+module_param(proc_details, int, 0644);
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+int enable_faults;
+int fault_rate;
+static int fault_count;
+int fault_devs;
+/* bitmap of enabled faults */
+module_param(enable_faults, int, 0664);
+/* fault rate % value - applies to all enabled faults */
+module_param(fault_rate, int, 0664);
+/* count of faults inserted */
+module_param(fault_count, int, 0664);
+/* bitmap of devices to insert faults on */
+module_param(fault_devs, int, 0644);
+#endif
+
+/* module parameter, defined */
+unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
+bool disable_sendpage;
+bool allow_oos;
+int proc_details; /* Detail level in proc drbd*/
+
+/* Module parameter for setting the user mode helper program
+ * to run. Default is /sbin/drbdadm */
+char usermode_helper[80] = "/sbin/drbdadm";
+
+module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
+
+/* in 2.6.x, our device mapping and config info contains our virtual gendisks
+ * as member "struct gendisk *vdisk;"
+ */
+struct idr drbd_devices;
+struct list_head drbd_resources;
+
+struct kmem_cache *drbd_request_cache;
+struct kmem_cache *drbd_ee_cache; /* peer requests */
+struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
+struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
+mempool_t *drbd_request_mempool;
+mempool_t *drbd_ee_mempool;
+mempool_t *drbd_md_io_page_pool;
+struct bio_set *drbd_md_io_bio_set;
+
+/* I do not use a standard mempool, because:
+ 1) I want to hand out the pre-allocated objects first.
+ 2) I want to be able to interrupt sleeping allocation with a signal.
+ Note: This is a single linked list, the next pointer is the private
+ member of struct page.
+ */
+struct page *drbd_pp_pool;
+spinlock_t drbd_pp_lock;
+int drbd_pp_vacant;
+wait_queue_head_t drbd_pp_wait;
+
+DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
+
+static const struct block_device_operations drbd_ops = {
+ .owner = THIS_MODULE,
+ .open = drbd_open,
+ .release = drbd_release,
+};
+
+struct bio *bio_alloc_drbd(gfp_t gfp_mask)
+{
+ struct bio *bio;
+
+ if (!drbd_md_io_bio_set)
+ return bio_alloc(gfp_mask, 1);
+
+ bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+ if (!bio)
+ return NULL;
+ return bio;
+}
+
+#ifdef __CHECKER__
+/* When checking with sparse, and this is an inline function, sparse will
+ give tons of false positives. When this is a real functions sparse works.
+ */
+int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins)
+{
+ int io_allowed;
+
+ atomic_inc(&device->local_cnt);
+ io_allowed = (device->state.disk >= mins);
+ if (!io_allowed) {
+ if (atomic_dec_and_test(&device->local_cnt))
+ wake_up(&device->misc_wait);
+ }
+ return io_allowed;
+}
+
+#endif
+
+/**
+ * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch
+ * @connection: DRBD connection.
+ * @barrier_nr: Expected identifier of the DRBD write barrier packet.
+ * @set_size: Expected number of requests before that barrier.
+ *
+ * In case the passed barrier_nr or set_size does not match the oldest
+ * epoch of not yet barrier-acked requests, this function will cause a
+ * termination of the connection.
+ */
+void tl_release(struct drbd_connection *connection, unsigned int barrier_nr,
+ unsigned int set_size)
+{
+ struct drbd_request *r;
+ struct drbd_request *req = NULL;
+ int expect_epoch = 0;
+ int expect_size = 0;
+
+ spin_lock_irq(&connection->resource->req_lock);
+
+ /* find oldest not yet barrier-acked write request,
+ * count writes in its epoch. */
+ list_for_each_entry(r, &connection->transfer_log, tl_requests) {
+ const unsigned s = r->rq_state;
+ if (!req) {
+ if (!(s & RQ_WRITE))
+ continue;
+ if (!(s & RQ_NET_MASK))
+ continue;
+ if (s & RQ_NET_DONE)
+ continue;
+ req = r;
+ expect_epoch = req->epoch;
+ expect_size ++;
+ } else {
+ if (r->epoch != expect_epoch)
+ break;
+ if (!(s & RQ_WRITE))
+ continue;
+ /* if (s & RQ_DONE): not expected */
+ /* if (!(s & RQ_NET_MASK)): not expected */
+ expect_size++;
+ }
+ }
+
+ /* first some paranoia code */
+ if (req == NULL) {
+ drbd_err(connection, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
+ barrier_nr);
+ goto bail;
+ }
+ if (expect_epoch != barrier_nr) {
+ drbd_err(connection, "BAD! BarrierAck #%u received, expected #%u!\n",
+ barrier_nr, expect_epoch);
+ goto bail;
+ }
+
+ if (expect_size != set_size) {
+ drbd_err(connection, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
+ barrier_nr, set_size, expect_size);
+ goto bail;
+ }
+
+ /* Clean up list of requests processed during current epoch. */
+ /* this extra list walk restart is paranoia,
+ * to catch requests being barrier-acked "unexpectedly".
+ * It usually should find the same req again, or some READ preceding it. */
+ list_for_each_entry(req, &connection->transfer_log, tl_requests)
+ if (req->epoch == expect_epoch)
+ break;
+ list_for_each_entry_safe_from(req, r, &connection->transfer_log, tl_requests) {
+ if (req->epoch != expect_epoch)
+ break;
+ _req_mod(req, BARRIER_ACKED);
+ }
+ spin_unlock_irq(&connection->resource->req_lock);
+
+ return;
+
+bail:
+ spin_unlock_irq(&connection->resource->req_lock);
+ conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
+}
+
+
+/**
+ * _tl_restart() - Walks the transfer log, and applies an action to all requests
+ * @connection: DRBD connection to operate on.
+ * @what: The action/event to perform with all request objects
+ *
+ * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
+ * RESTART_FROZEN_DISK_IO.
+ */
+/* must hold resource->req_lock */
+void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what)
+{
+ struct drbd_request *req, *r;
+
+ list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests)
+ _req_mod(req, what);
+}
+
+void tl_restart(struct drbd_connection *connection, enum drbd_req_event what)
+{
+ spin_lock_irq(&connection->resource->req_lock);
+ _tl_restart(connection, what);
+ spin_unlock_irq(&connection->resource->req_lock);
+}
+
+/**
+ * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
+ * @device: DRBD device.
+ *
+ * This is called after the connection to the peer was lost. The storage covered
+ * by the requests on the transfer gets marked as our of sync. Called from the
+ * receiver thread and the worker thread.
+ */
+void tl_clear(struct drbd_connection *connection)
+{
+ tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
+}
+
+/**
+ * tl_abort_disk_io() - Abort disk I/O for all requests for a certain device in the TL
+ * @device: DRBD device.
+ */
+void tl_abort_disk_io(struct drbd_device *device)
+{
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+ struct drbd_request *req, *r;
+
+ spin_lock_irq(&connection->resource->req_lock);
+ list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests) {
+ if (!(req->rq_state & RQ_LOCAL_PENDING))
+ continue;
+ if (req->device != device)
+ continue;
+ _req_mod(req, ABORT_DISK_IO);
+ }
+ spin_unlock_irq(&connection->resource->req_lock);
+}
+
+static int drbd_thread_setup(void *arg)
+{
+ struct drbd_thread *thi = (struct drbd_thread *) arg;
+ struct drbd_resource *resource = thi->resource;
+ unsigned long flags;
+ int retval;
+
+ snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s",
+ thi->name[0],
+ resource->name);
+
+restart:
+ retval = thi->function(thi);
+
+ spin_lock_irqsave(&thi->t_lock, flags);
+
+ /* if the receiver has been "EXITING", the last thing it did
+ * was set the conn state to "StandAlone",
+ * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
+ * and receiver thread will be "started".
+ * drbd_thread_start needs to set "RESTARTING" in that case.
+ * t_state check and assignment needs to be within the same spinlock,
+ * so either thread_start sees EXITING, and can remap to RESTARTING,
+ * or thread_start see NONE, and can proceed as normal.
+ */
+
+ if (thi->t_state == RESTARTING) {
+ drbd_info(resource, "Restarting %s thread\n", thi->name);
+ thi->t_state = RUNNING;
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ goto restart;
+ }
+
+ thi->task = NULL;
+ thi->t_state = NONE;
+ smp_mb();
+ complete_all(&thi->stop);
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+
+ drbd_info(resource, "Terminating %s\n", current->comm);
+
+ /* Release mod reference taken when thread was started */
+
+ if (thi->connection)
+ kref_put(&thi->connection->kref, drbd_destroy_connection);
+ kref_put(&resource->kref, drbd_destroy_resource);
+ module_put(THIS_MODULE);
+ return retval;
+}
+
+static void drbd_thread_init(struct drbd_resource *resource, struct drbd_thread *thi,
+ int (*func) (struct drbd_thread *), const char *name)
+{
+ spin_lock_init(&thi->t_lock);
+ thi->task = NULL;
+ thi->t_state = NONE;
+ thi->function = func;
+ thi->resource = resource;
+ thi->connection = NULL;
+ thi->name = name;
+}
+
+int drbd_thread_start(struct drbd_thread *thi)
+{
+ struct drbd_resource *resource = thi->resource;
+ struct task_struct *nt;
+ unsigned long flags;
+
+ /* is used from state engine doing drbd_thread_stop_nowait,
+ * while holding the req lock irqsave */
+ spin_lock_irqsave(&thi->t_lock, flags);
+
+ switch (thi->t_state) {
+ case NONE:
+ drbd_info(resource, "Starting %s thread (from %s [%d])\n",
+ thi->name, current->comm, current->pid);
+
+ /* Get ref on module for thread - this is released when thread exits */
+ if (!try_module_get(THIS_MODULE)) {
+ drbd_err(resource, "Failed to get module reference in drbd_thread_start\n");
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ return false;
+ }
+
+ kref_get(&resource->kref);
+ if (thi->connection)
+ kref_get(&thi->connection->kref);
+
+ init_completion(&thi->stop);
+ thi->reset_cpu_mask = 1;
+ thi->t_state = RUNNING;
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
+
+ nt = kthread_create(drbd_thread_setup, (void *) thi,
+ "drbd_%c_%s", thi->name[0], thi->resource->name);
+
+ if (IS_ERR(nt)) {
+ drbd_err(resource, "Couldn't start thread\n");
+
+ if (thi->connection)
+ kref_put(&thi->connection->kref, drbd_destroy_connection);
+ kref_put(&resource->kref, drbd_destroy_resource);
+ module_put(THIS_MODULE);
+ return false;
+ }
+ spin_lock_irqsave(&thi->t_lock, flags);
+ thi->task = nt;
+ thi->t_state = RUNNING;
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ wake_up_process(nt);
+ break;
+ case EXITING:
+ thi->t_state = RESTARTING;
+ drbd_info(resource, "Restarting %s thread (from %s [%d])\n",
+ thi->name, current->comm, current->pid);
+ /* fall through */
+ case RUNNING:
+ case RESTARTING:
+ default:
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ break;
+ }
+
+ return true;
+}
+
+
+void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
+{
+ unsigned long flags;
+
+ enum drbd_thread_state ns = restart ? RESTARTING : EXITING;
+
+ /* may be called from state engine, holding the req lock irqsave */
+ spin_lock_irqsave(&thi->t_lock, flags);
+
+ if (thi->t_state == NONE) {
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ if (restart)
+ drbd_thread_start(thi);
+ return;
+ }
+
+ if (thi->t_state != ns) {
+ if (thi->task == NULL) {
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+ return;
+ }
+
+ thi->t_state = ns;
+ smp_mb();
+ init_completion(&thi->stop);
+ if (thi->task != current)
+ force_sig(DRBD_SIGKILL, thi->task);
+ }
+
+ spin_unlock_irqrestore(&thi->t_lock, flags);
+
+ if (wait)
+ wait_for_completion(&thi->stop);
+}
+
+int conn_lowest_minor(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ int vnr = 0, minor = -1;
+
+ rcu_read_lock();
+ peer_device = idr_get_next(&connection->peer_devices, &vnr);
+ if (peer_device)
+ minor = device_to_minor(peer_device->device);
+ rcu_read_unlock();
+
+ return minor;
+}
+
+#ifdef CONFIG_SMP
+/**
+ * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
+ *
+ * Forces all threads of a resource onto the same CPU. This is beneficial for
+ * DRBD's performance. May be overwritten by user's configuration.
+ */
+static void drbd_calc_cpu_mask(cpumask_var_t *cpu_mask)
+{
+ unsigned int *resources_per_cpu, min_index = ~0;
+
+ resources_per_cpu = kzalloc(nr_cpu_ids * sizeof(*resources_per_cpu), GFP_KERNEL);
+ if (resources_per_cpu) {
+ struct drbd_resource *resource;
+ unsigned int cpu, min = ~0;
+
+ rcu_read_lock();
+ for_each_resource_rcu(resource, &drbd_resources) {
+ for_each_cpu(cpu, resource->cpu_mask)
+ resources_per_cpu[cpu]++;
+ }
+ rcu_read_unlock();
+ for_each_online_cpu(cpu) {
+ if (resources_per_cpu[cpu] < min) {
+ min = resources_per_cpu[cpu];
+ min_index = cpu;
+ }
+ }
+ kfree(resources_per_cpu);
+ }
+ if (min_index == ~0) {
+ cpumask_setall(*cpu_mask);
+ return;
+ }
+ cpumask_set_cpu(min_index, *cpu_mask);
+}
+
+/**
+ * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
+ * @device: DRBD device.
+ * @thi: drbd_thread object
+ *
+ * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
+ * prematurely.
+ */
+void drbd_thread_current_set_cpu(struct drbd_thread *thi)
+{
+ struct drbd_resource *resource = thi->resource;
+ struct task_struct *p = current;
+
+ if (!thi->reset_cpu_mask)
+ return;
+ thi->reset_cpu_mask = 0;
+ set_cpus_allowed_ptr(p, resource->cpu_mask);
+}
+#else
+#define drbd_calc_cpu_mask(A) ({})
+#endif
+
+/**
+ * drbd_header_size - size of a packet header
+ *
+ * The header size is a multiple of 8, so any payload following the header is
+ * word aligned on 64-bit architectures. (The bitmap send and receive code
+ * relies on this.)
+ */
+unsigned int drbd_header_size(struct drbd_connection *connection)
+{
+ if (connection->agreed_pro_version >= 100) {
+ BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8));
+ return sizeof(struct p_header100);
+ } else {
+ BUILD_BUG_ON(sizeof(struct p_header80) !=
+ sizeof(struct p_header95));
+ BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8));
+ return sizeof(struct p_header80);
+ }
+}
+
+static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size)
+{
+ h->magic = cpu_to_be32(DRBD_MAGIC);
+ h->command = cpu_to_be16(cmd);
+ h->length = cpu_to_be16(size);
+ return sizeof(struct p_header80);
+}
+
+static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size)
+{
+ h->magic = cpu_to_be16(DRBD_MAGIC_BIG);
+ h->command = cpu_to_be16(cmd);
+ h->length = cpu_to_be32(size);
+ return sizeof(struct p_header95);
+}
+
+static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd,
+ int size, int vnr)
+{
+ h->magic = cpu_to_be32(DRBD_MAGIC_100);
+ h->volume = cpu_to_be16(vnr);
+ h->command = cpu_to_be16(cmd);
+ h->length = cpu_to_be32(size);
+ h->pad = 0;
+ return sizeof(struct p_header100);
+}
+
+static unsigned int prepare_header(struct drbd_connection *connection, int vnr,
+ void *buffer, enum drbd_packet cmd, int size)
+{
+ if (connection->agreed_pro_version >= 100)
+ return prepare_header100(buffer, cmd, size, vnr);
+ else if (connection->agreed_pro_version >= 95 &&
+ size > DRBD_MAX_SIZE_H80_PACKET)
+ return prepare_header95(buffer, cmd, size);
+ else
+ return prepare_header80(buffer, cmd, size);
+}
+
+static void *__conn_prepare_command(struct drbd_connection *connection,
+ struct drbd_socket *sock)
+{
+ if (!sock->socket)
+ return NULL;
+ return sock->sbuf + drbd_header_size(connection);
+}
+
+void *conn_prepare_command(struct drbd_connection *connection, struct drbd_socket *sock)
+{
+ void *p;
+
+ mutex_lock(&sock->mutex);
+ p = __conn_prepare_command(connection, sock);
+ if (!p)
+ mutex_unlock(&sock->mutex);
+
+ return p;
+}
+
+void *drbd_prepare_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock)
+{
+ return conn_prepare_command(peer_device->connection, sock);
+}
+
+static int __send_command(struct drbd_connection *connection, int vnr,
+ struct drbd_socket *sock, enum drbd_packet cmd,
+ unsigned int header_size, void *data,
+ unsigned int size)
+{
+ int msg_flags;
+ int err;
+
+ /*
+ * Called with @data == NULL and the size of the data blocks in @size
+ * for commands that send data blocks. For those commands, omit the
+ * MSG_MORE flag: this will increase the likelihood that data blocks
+ * which are page aligned on the sender will end up page aligned on the
+ * receiver.
+ */
+ msg_flags = data ? MSG_MORE : 0;
+
+ header_size += prepare_header(connection, vnr, sock->sbuf, cmd,
+ header_size + size);
+ err = drbd_send_all(connection, sock->socket, sock->sbuf, header_size,
+ msg_flags);
+ if (data && !err)
+ err = drbd_send_all(connection, sock->socket, data, size, 0);
+ /* DRBD protocol "pings" are latency critical.
+ * This is supposed to trigger tcp_push_pending_frames() */
+ if (!err && (cmd == P_PING || cmd == P_PING_ACK))
+ drbd_tcp_nodelay(sock->socket);
+
+ return err;
+}
+
+static int __conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock,
+ enum drbd_packet cmd, unsigned int header_size,
+ void *data, unsigned int size)
+{
+ return __send_command(connection, 0, sock, cmd, header_size, data, size);
+}
+
+int conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock,
+ enum drbd_packet cmd, unsigned int header_size,
+ void *data, unsigned int size)
+{
+ int err;
+
+ err = __conn_send_command(connection, sock, cmd, header_size, data, size);
+ mutex_unlock(&sock->mutex);
+ return err;
+}
+
+int drbd_send_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock,
+ enum drbd_packet cmd, unsigned int header_size,
+ void *data, unsigned int size)
+{
+ int err;
+
+ err = __send_command(peer_device->connection, peer_device->device->vnr,
+ sock, cmd, header_size, data, size);
+ mutex_unlock(&sock->mutex);
+ return err;
+}
+
+int drbd_send_ping(struct drbd_connection *connection)
+{
+ struct drbd_socket *sock;
+
+ sock = &connection->meta;
+ if (!conn_prepare_command(connection, sock))
+ return -EIO;
+ return conn_send_command(connection, sock, P_PING, 0, NULL, 0);
+}
+
+int drbd_send_ping_ack(struct drbd_connection *connection)
+{
+ struct drbd_socket *sock;
+
+ sock = &connection->meta;
+ if (!conn_prepare_command(connection, sock))
+ return -EIO;
+ return conn_send_command(connection, sock, P_PING_ACK, 0, NULL, 0);
+}
+
+int drbd_send_sync_param(struct drbd_peer_device *peer_device)
+{
+ struct drbd_socket *sock;
+ struct p_rs_param_95 *p;
+ int size;
+ const int apv = peer_device->connection->agreed_pro_version;
+ enum drbd_packet cmd;
+ struct net_conf *nc;
+ struct disk_conf *dc;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+
+ rcu_read_lock();
+ nc = rcu_dereference(peer_device->connection->net_conf);
+
+ size = apv <= 87 ? sizeof(struct p_rs_param)
+ : apv == 88 ? sizeof(struct p_rs_param)
+ + strlen(nc->verify_alg) + 1
+ : apv <= 94 ? sizeof(struct p_rs_param_89)
+ : /* apv >= 95 */ sizeof(struct p_rs_param_95);
+
+ cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
+
+ /* initialize verify_alg and csums_alg */
+ memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
+
+ if (get_ldev(peer_device->device)) {
+ dc = rcu_dereference(peer_device->device->ldev->disk_conf);
+ p->resync_rate = cpu_to_be32(dc->resync_rate);
+ p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead);
+ p->c_delay_target = cpu_to_be32(dc->c_delay_target);
+ p->c_fill_target = cpu_to_be32(dc->c_fill_target);
+ p->c_max_rate = cpu_to_be32(dc->c_max_rate);
+ put_ldev(peer_device->device);
+ } else {
+ p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF);
+ p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF);
+ p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF);
+ p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF);
+ p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF);
+ }
+
+ if (apv >= 88)
+ strcpy(p->verify_alg, nc->verify_alg);
+ if (apv >= 89)
+ strcpy(p->csums_alg, nc->csums_alg);
+ rcu_read_unlock();
+
+ return drbd_send_command(peer_device, sock, cmd, size, NULL, 0);
+}
+
+int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd)
+{
+ struct drbd_socket *sock;
+ struct p_protocol *p;
+ struct net_conf *nc;
+ int size, cf;
+
+ sock = &connection->data;
+ p = __conn_prepare_command(connection, sock);
+ if (!p)
+ return -EIO;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+
+ if (nc->tentative && connection->agreed_pro_version < 92) {
+ rcu_read_unlock();
+ mutex_unlock(&sock->mutex);
+ drbd_err(connection, "--dry-run is not supported by peer");
+ return -EOPNOTSUPP;
+ }
+
+ size = sizeof(*p);
+ if (connection->agreed_pro_version >= 87)
+ size += strlen(nc->integrity_alg) + 1;
+
+ p->protocol = cpu_to_be32(nc->wire_protocol);
+ p->after_sb_0p = cpu_to_be32(nc->after_sb_0p);
+ p->after_sb_1p = cpu_to_be32(nc->after_sb_1p);
+ p->after_sb_2p = cpu_to_be32(nc->after_sb_2p);
+ p->two_primaries = cpu_to_be32(nc->two_primaries);
+ cf = 0;
+ if (nc->discard_my_data)
+ cf |= CF_DISCARD_MY_DATA;
+ if (nc->tentative)
+ cf |= CF_DRY_RUN;
+ p->conn_flags = cpu_to_be32(cf);
+
+ if (connection->agreed_pro_version >= 87)
+ strcpy(p->integrity_alg, nc->integrity_alg);
+ rcu_read_unlock();
+
+ return __conn_send_command(connection, sock, cmd, size, NULL, 0);
+}
+
+int drbd_send_protocol(struct drbd_connection *connection)
+{
+ int err;
+
+ mutex_lock(&connection->data.mutex);
+ err = __drbd_send_protocol(connection, P_PROTOCOL);
+ mutex_unlock(&connection->data.mutex);
+
+ return err;
+}
+
+static int _drbd_send_uuids(struct drbd_peer_device *peer_device, u64 uuid_flags)
+{
+ struct drbd_device *device = peer_device->device;
+ struct drbd_socket *sock;
+ struct p_uuids *p;
+ int i;
+
+ if (!get_ldev_if_state(device, D_NEGOTIATING))
+ return 0;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p) {
+ put_ldev(device);
+ return -EIO;
+ }
+ spin_lock_irq(&device->ldev->md.uuid_lock);
+ for (i = UI_CURRENT; i < UI_SIZE; i++)
+ p->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]);
+ spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+ device->comm_bm_set = drbd_bm_total_weight(device);
+ p->uuid[UI_SIZE] = cpu_to_be64(device->comm_bm_set);
+ rcu_read_lock();
+ uuid_flags |= rcu_dereference(peer_device->connection->net_conf)->discard_my_data ? 1 : 0;
+ rcu_read_unlock();
+ uuid_flags |= test_bit(CRASHED_PRIMARY, &device->flags) ? 2 : 0;
+ uuid_flags |= device->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
+ p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
+
+ put_ldev(device);
+ return drbd_send_command(peer_device, sock, P_UUIDS, sizeof(*p), NULL, 0);
+}
+
+int drbd_send_uuids(struct drbd_peer_device *peer_device)
+{
+ return _drbd_send_uuids(peer_device, 0);
+}
+
+int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *peer_device)
+{
+ return _drbd_send_uuids(peer_device, 8);
+}
+
+void drbd_print_uuids(struct drbd_device *device, const char *text)
+{
+ if (get_ldev_if_state(device, D_NEGOTIATING)) {
+ u64 *uuid = device->ldev->md.uuid;
+ drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX\n",
+ text,
+ (unsigned long long)uuid[UI_CURRENT],
+ (unsigned long long)uuid[UI_BITMAP],
+ (unsigned long long)uuid[UI_HISTORY_START],
+ (unsigned long long)uuid[UI_HISTORY_END]);
+ put_ldev(device);
+ } else {
+ drbd_info(device, "%s effective data uuid: %016llX\n",
+ text,
+ (unsigned long long)device->ed_uuid);
+ }
+}
+
+void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device)
+{
+ struct drbd_device *device = peer_device->device;
+ struct drbd_socket *sock;
+ struct p_rs_uuid *p;
+ u64 uuid;
+
+ D_ASSERT(device, device->state.disk == D_UP_TO_DATE);
+
+ uuid = device->ldev->md.uuid[UI_BITMAP];
+ if (uuid && uuid != UUID_JUST_CREATED)
+ uuid = uuid + UUID_NEW_BM_OFFSET;
+ else
+ get_random_bytes(&uuid, sizeof(u64));
+ drbd_uuid_set(device, UI_BITMAP, uuid);
+ drbd_print_uuids(device, "updated sync UUID");
+ drbd_md_sync(device);
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (p) {
+ p->uuid = cpu_to_be64(uuid);
+ drbd_send_command(peer_device, sock, P_SYNC_UUID, sizeof(*p), NULL, 0);
+ }
+}
+
+int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags)
+{
+ struct drbd_device *device = peer_device->device;
+ struct drbd_socket *sock;
+ struct p_sizes *p;
+ sector_t d_size, u_size;
+ int q_order_type;
+ unsigned int max_bio_size;
+
+ if (get_ldev_if_state(device, D_NEGOTIATING)) {
+ D_ASSERT(device, device->ldev->backing_bdev);
+ d_size = drbd_get_max_capacity(device->ldev);
+ rcu_read_lock();
+ u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
+ q_order_type = drbd_queue_order_type(device);
+ max_bio_size = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9;
+ max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
+ put_ldev(device);
+ } else {
+ d_size = 0;
+ u_size = 0;
+ q_order_type = QUEUE_ORDERED_NONE;
+ max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
+ }
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+
+ if (peer_device->connection->agreed_pro_version <= 94)
+ max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+ else if (peer_device->connection->agreed_pro_version < 100)
+ max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95);
+
+ p->d_size = cpu_to_be64(d_size);
+ p->u_size = cpu_to_be64(u_size);
+ p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(device->this_bdev));
+ p->max_bio_size = cpu_to_be32(max_bio_size);
+ p->queue_order_type = cpu_to_be16(q_order_type);
+ p->dds_flags = cpu_to_be16(flags);
+ return drbd_send_command(peer_device, sock, P_SIZES, sizeof(*p), NULL, 0);
+}
+
+/**
+ * drbd_send_current_state() - Sends the drbd state to the peer
+ * @peer_device: DRBD peer device.
+ */
+int drbd_send_current_state(struct drbd_peer_device *peer_device)
+{
+ struct drbd_socket *sock;
+ struct p_state *p;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->state = cpu_to_be32(peer_device->device->state.i); /* Within the send mutex */
+ return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0);
+}
+
+/**
+ * drbd_send_state() - After a state change, sends the new state to the peer
+ * @peer_device: DRBD peer device.
+ * @state: the state to send, not necessarily the current state.
+ *
+ * Each state change queues an "after_state_ch" work, which will eventually
+ * send the resulting new state to the peer. If more state changes happen
+ * between queuing and processing of the after_state_ch work, we still
+ * want to send each intermediary state in the order it occurred.
+ */
+int drbd_send_state(struct drbd_peer_device *peer_device, union drbd_state state)
+{
+ struct drbd_socket *sock;
+ struct p_state *p;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->state = cpu_to_be32(state.i); /* Within the send mutex */
+ return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0);
+}
+
+int drbd_send_state_req(struct drbd_peer_device *peer_device, union drbd_state mask, union drbd_state val)
+{
+ struct drbd_socket *sock;
+ struct p_req_state *p;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->mask = cpu_to_be32(mask.i);
+ p->val = cpu_to_be32(val.i);
+ return drbd_send_command(peer_device, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0);
+}
+
+int conn_send_state_req(struct drbd_connection *connection, union drbd_state mask, union drbd_state val)
+{
+ enum drbd_packet cmd;
+ struct drbd_socket *sock;
+ struct p_req_state *p;
+
+ cmd = connection->agreed_pro_version < 100 ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ;
+ sock = &connection->data;
+ p = conn_prepare_command(connection, sock);
+ if (!p)
+ return -EIO;
+ p->mask = cpu_to_be32(mask.i);
+ p->val = cpu_to_be32(val.i);
+ return conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0);
+}
+
+void drbd_send_sr_reply(struct drbd_peer_device *peer_device, enum drbd_state_rv retcode)
+{
+ struct drbd_socket *sock;
+ struct p_req_state_reply *p;
+
+ sock = &peer_device->connection->meta;
+ p = drbd_prepare_command(peer_device, sock);
+ if (p) {
+ p->retcode = cpu_to_be32(retcode);
+ drbd_send_command(peer_device, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0);
+ }
+}
+
+void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode)
+{
+ struct drbd_socket *sock;
+ struct p_req_state_reply *p;
+ enum drbd_packet cmd = connection->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY;
+
+ sock = &connection->meta;
+ p = conn_prepare_command(connection, sock);
+ if (p) {
+ p->retcode = cpu_to_be32(retcode);
+ conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0);
+ }
+}
+
+static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
+{
+ BUG_ON(code & ~0xf);
+ p->encoding = (p->encoding & ~0xf) | code;
+}
+
+static void dcbp_set_start(struct p_compressed_bm *p, int set)
+{
+ p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
+}
+
+static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n)
+{
+ BUG_ON(n & ~0x7);
+ p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
+}
+
+static int fill_bitmap_rle_bits(struct drbd_device *device,
+ struct p_compressed_bm *p,
+ unsigned int size,
+ struct bm_xfer_ctx *c)
+{
+ struct bitstream bs;
+ unsigned long plain_bits;
+ unsigned long tmp;
+ unsigned long rl;
+ unsigned len;
+ unsigned toggle;
+ int bits, use_rle;
+
+ /* may we use this feature? */
+ rcu_read_lock();
+ use_rle = rcu_dereference(first_peer_device(device)->connection->net_conf)->use_rle;
+ rcu_read_unlock();
+ if (!use_rle || first_peer_device(device)->connection->agreed_pro_version < 90)
+ return 0;
+
+ if (c->bit_offset >= c->bm_bits)
+ return 0; /* nothing to do. */
+
+ /* use at most thus many bytes */
+ bitstream_init(&bs, p->code, size, 0);
+ memset(p->code, 0, size);
+ /* plain bits covered in this code string */
+ plain_bits = 0;
+
+ /* p->encoding & 0x80 stores whether the first run length is set.
+ * bit offset is implicit.
+ * start with toggle == 2 to be able to tell the first iteration */
+ toggle = 2;
+
+ /* see how much plain bits we can stuff into one packet
+ * using RLE and VLI. */
+ do {
+ tmp = (toggle == 0) ? _drbd_bm_find_next_zero(device, c->bit_offset)
+ : _drbd_bm_find_next(device, c->bit_offset);
+ if (tmp == -1UL)
+ tmp = c->bm_bits;
+ rl = tmp - c->bit_offset;
+
+ if (toggle == 2) { /* first iteration */
+ if (rl == 0) {
+ /* the first checked bit was set,
+ * store start value, */
+ dcbp_set_start(p, 1);
+ /* but skip encoding of zero run length */
+ toggle = !toggle;
+ continue;
+ }
+ dcbp_set_start(p, 0);
+ }
+
+ /* paranoia: catch zero runlength.
+ * can only happen if bitmap is modified while we scan it. */
+ if (rl == 0) {
+ drbd_err(device, "unexpected zero runlength while encoding bitmap "
+ "t:%u bo:%lu\n", toggle, c->bit_offset);
+ return -1;
+ }
+
+ bits = vli_encode_bits(&bs, rl);
+ if (bits == -ENOBUFS) /* buffer full */
+ break;
+ if (bits <= 0) {
+ drbd_err(device, "error while encoding bitmap: %d\n", bits);
+ return 0;
+ }
+
+ toggle = !toggle;
+ plain_bits += rl;
+ c->bit_offset = tmp;
+ } while (c->bit_offset < c->bm_bits);
+
+ len = bs.cur.b - p->code + !!bs.cur.bit;
+
+ if (plain_bits < (len << 3)) {
+ /* incompressible with this method.
+ * we need to rewind both word and bit position. */
+ c->bit_offset -= plain_bits;
+ bm_xfer_ctx_bit_to_word_offset(c);
+ c->bit_offset = c->word_offset * BITS_PER_LONG;
+ return 0;
+ }
+
+ /* RLE + VLI was able to compress it just fine.
+ * update c->word_offset. */
+ bm_xfer_ctx_bit_to_word_offset(c);
+
+ /* store pad_bits */
+ dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
+
+ return len;
+}
+
+/**
+ * send_bitmap_rle_or_plain
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c)
+{
+ struct drbd_socket *sock = &first_peer_device(device)->connection->data;
+ unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
+ struct p_compressed_bm *p = sock->sbuf + header_size;
+ int len, err;
+
+ len = fill_bitmap_rle_bits(device, p,
+ DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c);
+ if (len < 0)
+ return -EIO;
+
+ if (len) {
+ dcbp_set_code(p, RLE_VLI_Bits);
+ err = __send_command(first_peer_device(device)->connection, device->vnr, sock,
+ P_COMPRESSED_BITMAP, sizeof(*p) + len,
+ NULL, 0);
+ c->packets[0]++;
+ c->bytes[0] += header_size + sizeof(*p) + len;
+
+ if (c->bit_offset >= c->bm_bits)
+ len = 0; /* DONE */
+ } else {
+ /* was not compressible.
+ * send a buffer full of plain text bits instead. */
+ unsigned int data_size;
+ unsigned long num_words;
+ unsigned long *p = sock->sbuf + header_size;
+
+ data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
+ num_words = min_t(size_t, data_size / sizeof(*p),
+ c->bm_words - c->word_offset);
+ len = num_words * sizeof(*p);
+ if (len)
+ drbd_bm_get_lel(device, c->word_offset, num_words, p);
+ err = __send_command(first_peer_device(device)->connection, device->vnr, sock, P_BITMAP, len, NULL, 0);
+ c->word_offset += num_words;
+ c->bit_offset = c->word_offset * BITS_PER_LONG;
+
+ c->packets[1]++;
+ c->bytes[1] += header_size + len;
+
+ if (c->bit_offset > c->bm_bits)
+ c->bit_offset = c->bm_bits;
+ }
+ if (!err) {
+ if (len == 0) {
+ INFO_bm_xfer_stats(device, "send", c);
+ return 0;
+ } else
+ return 1;
+ }
+ return -EIO;
+}
+
+/* See the comment at receive_bitmap() */
+static int _drbd_send_bitmap(struct drbd_device *device)
+{
+ struct bm_xfer_ctx c;
+ int err;
+
+ if (!expect(device->bitmap))
+ return false;
+
+ if (get_ldev(device)) {
+ if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC)) {
+ drbd_info(device, "Writing the whole bitmap, MDF_FullSync was set.\n");
+ drbd_bm_set_all(device);
+ if (drbd_bm_write(device)) {
+ /* write_bm did fail! Leave full sync flag set in Meta P_DATA
+ * but otherwise process as per normal - need to tell other
+ * side that a full resync is required! */
+ drbd_err(device, "Failed to write bitmap to disk!\n");
+ } else {
+ drbd_md_clear_flag(device, MDF_FULL_SYNC);
+ drbd_md_sync(device);
+ }
+ }
+ put_ldev(device);
+ }
+
+ c = (struct bm_xfer_ctx) {
+ .bm_bits = drbd_bm_bits(device),
+ .bm_words = drbd_bm_words(device),
+ };
+
+ do {
+ err = send_bitmap_rle_or_plain(device, &c);
+ } while (err > 0);
+
+ return err == 0;
+}
+
+int drbd_send_bitmap(struct drbd_device *device)
+{
+ struct drbd_socket *sock = &first_peer_device(device)->connection->data;
+ int err = -1;
+
+ mutex_lock(&sock->mutex);
+ if (sock->socket)
+ err = !_drbd_send_bitmap(device);
+ mutex_unlock(&sock->mutex);
+ return err;
+}
+
+void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr, u32 set_size)
+{
+ struct drbd_socket *sock;
+ struct p_barrier_ack *p;
+
+ if (connection->cstate < C_WF_REPORT_PARAMS)
+ return;
+
+ sock = &connection->meta;
+ p = conn_prepare_command(connection, sock);
+ if (!p)
+ return;
+ p->barrier = barrier_nr;
+ p->set_size = cpu_to_be32(set_size);
+ conn_send_command(connection, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0);
+}
+
+/**
+ * _drbd_send_ack() - Sends an ack packet
+ * @device: DRBD device.
+ * @cmd: Packet command code.
+ * @sector: sector, needs to be in big endian byte order
+ * @blksize: size in byte, needs to be in big endian byte order
+ * @block_id: Id, big endian byte order
+ */
+static int _drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+ u64 sector, u32 blksize, u64 block_id)
+{
+ struct drbd_socket *sock;
+ struct p_block_ack *p;
+
+ if (peer_device->device->state.conn < C_CONNECTED)
+ return -EIO;
+
+ sock = &peer_device->connection->meta;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->sector = sector;
+ p->block_id = block_id;
+ p->blksize = blksize;
+ p->seq_num = cpu_to_be32(atomic_inc_return(&peer_device->device->packet_seq));
+ return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0);
+}
+
+/* dp->sector and dp->block_id already/still in network byte order,
+ * data_size is payload size according to dp->head,
+ * and may need to be corrected for digest size. */
+void drbd_send_ack_dp(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+ struct p_data *dp, int data_size)
+{
+ if (peer_device->connection->peer_integrity_tfm)
+ data_size -= crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
+ _drbd_send_ack(peer_device, cmd, dp->sector, cpu_to_be32(data_size),
+ dp->block_id);
+}
+
+void drbd_send_ack_rp(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+ struct p_block_req *rp)
+{
+ _drbd_send_ack(peer_device, cmd, rp->sector, rp->blksize, rp->block_id);
+}
+
+/**
+ * drbd_send_ack() - Sends an ack packet
+ * @device: DRBD device
+ * @cmd: packet command code
+ * @peer_req: peer request
+ */
+int drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+ struct drbd_peer_request *peer_req)
+{
+ return _drbd_send_ack(peer_device, cmd,
+ cpu_to_be64(peer_req->i.sector),
+ cpu_to_be32(peer_req->i.size),
+ peer_req->block_id);
+}
+
+/* This function misuses the block_id field to signal if the blocks
+ * are is sync or not. */
+int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+ sector_t sector, int blksize, u64 block_id)
+{
+ return _drbd_send_ack(peer_device, cmd,
+ cpu_to_be64(sector),
+ cpu_to_be32(blksize),
+ cpu_to_be64(block_id));
+}
+
+int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd,
+ sector_t sector, int size, u64 block_id)
+{
+ struct drbd_socket *sock;
+ struct p_block_req *p;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(sector);
+ p->block_id = block_id;
+ p->blksize = cpu_to_be32(size);
+ return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0);
+}
+
+int drbd_send_drequest_csum(struct drbd_peer_device *peer_device, sector_t sector, int size,
+ void *digest, int digest_size, enum drbd_packet cmd)
+{
+ struct drbd_socket *sock;
+ struct p_block_req *p;
+
+ /* FIXME: Put the digest into the preallocated socket buffer. */
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(sector);
+ p->block_id = ID_SYNCER /* unused */;
+ p->blksize = cpu_to_be32(size);
+ return drbd_send_command(peer_device, sock, cmd, sizeof(*p), digest, digest_size);
+}
+
+int drbd_send_ov_request(struct drbd_peer_device *peer_device, sector_t sector, int size)
+{
+ struct drbd_socket *sock;
+ struct p_block_req *p;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(sector);
+ p->block_id = ID_SYNCER /* unused */;
+ p->blksize = cpu_to_be32(size);
+ return drbd_send_command(peer_device, sock, P_OV_REQUEST, sizeof(*p), NULL, 0);
+}
+
+/* called on sndtimeo
+ * returns false if we should retry,
+ * true if we think connection is dead
+ */
+static int we_should_drop_the_connection(struct drbd_connection *connection, struct socket *sock)
+{
+ int drop_it;
+ /* long elapsed = (long)(jiffies - device->last_received); */
+
+ drop_it = connection->meta.socket == sock
+ || !connection->asender.task
+ || get_t_state(&connection->asender) != RUNNING
+ || connection->cstate < C_WF_REPORT_PARAMS;
+
+ if (drop_it)
+ return true;
+
+ drop_it = !--connection->ko_count;
+ if (!drop_it) {
+ drbd_err(connection, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
+ current->comm, current->pid, connection->ko_count);
+ request_ping(connection);
+ }
+
+ return drop_it; /* && (device->state == R_PRIMARY) */;
+}
+
+static void drbd_update_congested(struct drbd_connection *connection)
+{
+ struct sock *sk = connection->data.socket->sk;
+ if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
+ set_bit(NET_CONGESTED, &connection->flags);
+}
+
+/* The idea of sendpage seems to be to put some kind of reference
+ * to the page into the skb, and to hand it over to the NIC. In
+ * this process get_page() gets called.
+ *
+ * As soon as the page was really sent over the network put_page()
+ * gets called by some part of the network layer. [ NIC driver? ]
+ *
+ * [ get_page() / put_page() increment/decrement the count. If count
+ * reaches 0 the page will be freed. ]
+ *
+ * This works nicely with pages from FSs.
+ * But this means that in protocol A we might signal IO completion too early!
+ *
+ * In order not to corrupt data during a resync we must make sure
+ * that we do not reuse our own buffer pages (EEs) to early, therefore
+ * we have the net_ee list.
+ *
+ * XFS seems to have problems, still, it submits pages with page_count == 0!
+ * As a workaround, we disable sendpage on pages
+ * with page_count == 0 or PageSlab.
+ */
+static int _drbd_no_send_page(struct drbd_peer_device *peer_device, struct page *page,
+ int offset, size_t size, unsigned msg_flags)
+{
+ struct socket *socket;
+ void *addr;
+ int err;
+
+ socket = peer_device->connection->data.socket;
+ addr = kmap(page) + offset;
+ err = drbd_send_all(peer_device->connection, socket, addr, size, msg_flags);
+ kunmap(page);
+ if (!err)
+ peer_device->device->send_cnt += size >> 9;
+ return err;
+}
+
+static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *page,
+ int offset, size_t size, unsigned msg_flags)
+{
+ struct socket *socket = peer_device->connection->data.socket;
+ mm_segment_t oldfs = get_fs();
+ int len = size;
+ int err = -EIO;
+
+ /* e.g. XFS meta- & log-data is in slab pages, which have a
+ * page_count of 0 and/or have PageSlab() set.
+ * we cannot use send_page for those, as that does get_page();
+ * put_page(); and would cause either a VM_BUG directly, or
+ * __page_cache_release a page that would actually still be referenced
+ * by someone, leading to some obscure delayed Oops somewhere else. */
+ if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
+ return _drbd_no_send_page(peer_device, page, offset, size, msg_flags);
+
+ msg_flags |= MSG_NOSIGNAL;
+ drbd_update_congested(peer_device->connection);
+ set_fs(KERNEL_DS);
+ do {
+ int sent;
+
+ sent = socket->ops->sendpage(socket, page, offset, len, msg_flags);
+ if (sent <= 0) {
+ if (sent == -EAGAIN) {
+ if (we_should_drop_the_connection(peer_device->connection, socket))
+ break;
+ continue;
+ }
+ drbd_warn(peer_device->device, "%s: size=%d len=%d sent=%d\n",
+ __func__, (int)size, len, sent);
+ if (sent < 0)
+ err = sent;
+ break;
+ }
+ len -= sent;
+ offset += sent;
+ } while (len > 0 /* THINK && device->cstate >= C_CONNECTED*/);
+ set_fs(oldfs);
+ clear_bit(NET_CONGESTED, &peer_device->connection->flags);
+
+ if (len == 0) {
+ err = 0;
+ peer_device->device->send_cnt += size >> 9;
+ }
+ return err;
+}
+
+static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio)
+{
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+
+ /* hint all but last page with MSG_MORE */
+ bio_for_each_segment(bvec, bio, iter) {
+ int err;
+
+ err = _drbd_no_send_page(peer_device, bvec.bv_page,
+ bvec.bv_offset, bvec.bv_len,
+ bio_iter_last(bvec, iter)
+ ? 0 : MSG_MORE);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *bio)
+{
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+
+ /* hint all but last page with MSG_MORE */
+ bio_for_each_segment(bvec, bio, iter) {
+ int err;
+
+ err = _drbd_send_page(peer_device, bvec.bv_page,
+ bvec.bv_offset, bvec.bv_len,
+ bio_iter_last(bvec, iter) ? 0 : MSG_MORE);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device,
+ struct drbd_peer_request *peer_req)
+{
+ struct page *page = peer_req->pages;
+ unsigned len = peer_req->i.size;
+ int err;
+
+ /* hint all but last page with MSG_MORE */
+ page_chain_for_each(page) {
+ unsigned l = min_t(unsigned, len, PAGE_SIZE);
+
+ err = _drbd_send_page(peer_device, page, 0, l,
+ page_chain_next(page) ? MSG_MORE : 0);
+ if (err)
+ return err;
+ len -= l;
+ }
+ return 0;
+}
+
+static u32 bio_flags_to_wire(struct drbd_connection *connection, unsigned long bi_rw)
+{
+ if (connection->agreed_pro_version >= 95)
+ return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
+ (bi_rw & REQ_FUA ? DP_FUA : 0) |
+ (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
+ (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
+ else
+ return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
+}
+
+/* Used to send write or TRIM aka REQ_DISCARD requests
+ * R_PRIMARY -> Peer (P_DATA, P_TRIM)
+ */
+int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_device *device = peer_device->device;
+ struct drbd_socket *sock;
+ struct p_data *p;
+ unsigned int dp_flags = 0;
+ int digest_size;
+ int err;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ digest_size = peer_device->connection->integrity_tfm ?
+ crypto_hash_digestsize(peer_device->connection->integrity_tfm) : 0;
+
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(req->i.sector);
+ p->block_id = (unsigned long)req;
+ p->seq_num = cpu_to_be32(atomic_inc_return(&device->packet_seq));
+ dp_flags = bio_flags_to_wire(peer_device->connection, req->master_bio->bi_rw);
+ if (device->state.conn >= C_SYNC_SOURCE &&
+ device->state.conn <= C_PAUSED_SYNC_T)
+ dp_flags |= DP_MAY_SET_IN_SYNC;
+ if (peer_device->connection->agreed_pro_version >= 100) {
+ if (req->rq_state & RQ_EXP_RECEIVE_ACK)
+ dp_flags |= DP_SEND_RECEIVE_ACK;
+ /* During resync, request an explicit write ack,
+ * even in protocol != C */
+ if (req->rq_state & RQ_EXP_WRITE_ACK
+ || (dp_flags & DP_MAY_SET_IN_SYNC))
+ dp_flags |= DP_SEND_WRITE_ACK;
+ }
+ p->dp_flags = cpu_to_be32(dp_flags);
+
+ if (dp_flags & DP_DISCARD) {
+ struct p_trim *t = (struct p_trim*)p;
+ t->size = cpu_to_be32(req->i.size);
+ err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0);
+ goto out;
+ }
+
+ /* our digest is still only over the payload.
+ * TRIM does not carry any payload. */
+ if (digest_size)
+ drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1);
+ err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + digest_size, NULL, req->i.size);
+ if (!err) {
+ /* For protocol A, we have to memcpy the payload into
+ * socket buffers, as we may complete right away
+ * as soon as we handed it over to tcp, at which point the data
+ * pages may become invalid.
+ *
+ * For data-integrity enabled, we copy it as well, so we can be
+ * sure that even if the bio pages may still be modified, it
+ * won't change the data on the wire, thus if the digest checks
+ * out ok after sending on this side, but does not fit on the
+ * receiving side, we sure have detected corruption elsewhere.
+ */
+ if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || digest_size)
+ err = _drbd_send_bio(peer_device, req->master_bio);
+ else
+ err = _drbd_send_zc_bio(peer_device, req->master_bio);
+
+ /* double check digest, sometimes buffers have been modified in flight. */
+ if (digest_size > 0 && digest_size <= 64) {
+ /* 64 byte, 512 bit, is the largest digest size
+ * currently supported in kernel crypto. */
+ unsigned char digest[64];
+ drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest);
+ if (memcmp(p + 1, digest, digest_size)) {
+ drbd_warn(device,
+ "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
+ (unsigned long long)req->i.sector, req->i.size);
+ }
+ } /* else if (digest_size > 64) {
+ ... Be noisy about digest too large ...
+ } */
+ }
+out:
+ mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */
+
+ return err;
+}
+
+/* answer packet, used to send data back for read requests:
+ * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
+ * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
+ */
+int drbd_send_block(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+ struct drbd_peer_request *peer_req)
+{
+ struct drbd_device *device = peer_device->device;
+ struct drbd_socket *sock;
+ struct p_data *p;
+ int err;
+ int digest_size;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+
+ digest_size = peer_device->connection->integrity_tfm ?
+ crypto_hash_digestsize(peer_device->connection->integrity_tfm) : 0;
+
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(peer_req->i.sector);
+ p->block_id = peer_req->block_id;
+ p->seq_num = 0; /* unused */
+ p->dp_flags = 0;
+ if (digest_size)
+ drbd_csum_ee(peer_device->connection->integrity_tfm, peer_req, p + 1);
+ err = __send_command(peer_device->connection, device->vnr, sock, cmd, sizeof(*p) + digest_size, NULL, peer_req->i.size);
+ if (!err)
+ err = _drbd_send_zc_ee(peer_device, peer_req);
+ mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */
+
+ return err;
+}
+
+int drbd_send_out_of_sync(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_socket *sock;
+ struct p_block_desc *p;
+
+ sock = &peer_device->connection->data;
+ p = drbd_prepare_command(peer_device, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(req->i.sector);
+ p->blksize = cpu_to_be32(req->i.size);
+ return drbd_send_command(peer_device, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0);
+}
+
+/*
+ drbd_send distinguishes two cases:
+
+ Packets sent via the data socket "sock"
+ and packets sent via the meta data socket "msock"
+
+ sock msock
+ -----------------+-------------------------+------------------------------
+ timeout conf.timeout / 2 conf.timeout / 2
+ timeout action send a ping via msock Abort communication
+ and close all sockets
+*/
+
+/*
+ * you must have down()ed the appropriate [m]sock_mutex elsewhere!
+ */
+int drbd_send(struct drbd_connection *connection, struct socket *sock,
+ void *buf, size_t size, unsigned msg_flags)
+{
+ struct kvec iov;
+ struct msghdr msg;
+ int rv, sent = 0;
+
+ if (!sock)
+ return -EBADR;
+
+ /* THINK if (signal_pending) return ... ? */
+
+ iov.iov_base = buf;
+ iov.iov_len = size;
+
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = msg_flags | MSG_NOSIGNAL;
+
+ if (sock == connection->data.socket) {
+ rcu_read_lock();
+ connection->ko_count = rcu_dereference(connection->net_conf)->ko_count;
+ rcu_read_unlock();
+ drbd_update_congested(connection);
+ }
+ do {
+ /* STRANGE
+ * tcp_sendmsg does _not_ use its size parameter at all ?
+ *
+ * -EAGAIN on timeout, -EINTR on signal.
+ */
+/* THINK
+ * do we need to block DRBD_SIG if sock == &meta.socket ??
+ * otherwise wake_asender() might interrupt some send_*Ack !
+ */
+ rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
+ if (rv == -EAGAIN) {
+ if (we_should_drop_the_connection(connection, sock))
+ break;
+ else
+ continue;
+ }
+ if (rv == -EINTR) {
+ flush_signals(current);
+ rv = 0;
+ }
+ if (rv < 0)
+ break;
+ sent += rv;
+ iov.iov_base += rv;
+ iov.iov_len -= rv;
+ } while (sent < size);
+
+ if (sock == connection->data.socket)
+ clear_bit(NET_CONGESTED, &connection->flags);
+
+ if (rv <= 0) {
+ if (rv != -EAGAIN) {
+ drbd_err(connection, "%s_sendmsg returned %d\n",
+ sock == connection->meta.socket ? "msock" : "sock",
+ rv);
+ conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
+ } else
+ conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
+ }
+
+ return sent;
+}
+
+/**
+ * drbd_send_all - Send an entire buffer
+ *
+ * Returns 0 upon success and a negative error value otherwise.
+ */
+int drbd_send_all(struct drbd_connection *connection, struct socket *sock, void *buffer,
+ size_t size, unsigned msg_flags)
+{
+ int err;
+
+ err = drbd_send(connection, sock, buffer, size, msg_flags);
+ if (err < 0)
+ return err;
+ if (err != size)
+ return -EIO;
+ return 0;
+}
+
+static int drbd_open(struct block_device *bdev, fmode_t mode)
+{
+ struct drbd_device *device = bdev->bd_disk->private_data;
+ unsigned long flags;
+ int rv = 0;
+
+ mutex_lock(&drbd_main_mutex);
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ /* to have a stable device->state.role
+ * and no race with updating open_cnt */
+
+ if (device->state.role != R_PRIMARY) {
+ if (mode & FMODE_WRITE)
+ rv = -EROFS;
+ else if (!allow_oos)
+ rv = -EMEDIUMTYPE;
+ }
+
+ if (!rv)
+ device->open_cnt++;
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+ mutex_unlock(&drbd_main_mutex);
+
+ return rv;
+}
+
+static void drbd_release(struct gendisk *gd, fmode_t mode)
+{
+ struct drbd_device *device = gd->private_data;
+ mutex_lock(&drbd_main_mutex);
+ device->open_cnt--;
+ mutex_unlock(&drbd_main_mutex);
+}
+
+static void drbd_set_defaults(struct drbd_device *device)
+{
+ /* Beware! The actual layout differs
+ * between big endian and little endian */
+ device->state = (union drbd_dev_state) {
+ { .role = R_SECONDARY,
+ .peer = R_UNKNOWN,
+ .conn = C_STANDALONE,
+ .disk = D_DISKLESS,
+ .pdsk = D_UNKNOWN,
+ } };
+}
+
+void drbd_init_set_defaults(struct drbd_device *device)
+{
+ /* the memset(,0,) did most of this.
+ * note: only assignments, no allocation in here */
+
+ drbd_set_defaults(device);
+
+ atomic_set(&device->ap_bio_cnt, 0);
+ atomic_set(&device->ap_actlog_cnt, 0);
+ atomic_set(&device->ap_pending_cnt, 0);
+ atomic_set(&device->rs_pending_cnt, 0);
+ atomic_set(&device->unacked_cnt, 0);
+ atomic_set(&device->local_cnt, 0);
+ atomic_set(&device->pp_in_use_by_net, 0);
+ atomic_set(&device->rs_sect_in, 0);
+ atomic_set(&device->rs_sect_ev, 0);
+ atomic_set(&device->ap_in_flight, 0);
+ atomic_set(&device->md_io.in_use, 0);
+
+ mutex_init(&device->own_state_mutex);
+ device->state_mutex = &device->own_state_mutex;
+
+ spin_lock_init(&device->al_lock);
+ spin_lock_init(&device->peer_seq_lock);
+
+ INIT_LIST_HEAD(&device->active_ee);
+ INIT_LIST_HEAD(&device->sync_ee);
+ INIT_LIST_HEAD(&device->done_ee);
+ INIT_LIST_HEAD(&device->read_ee);
+ INIT_LIST_HEAD(&device->net_ee);
+ INIT_LIST_HEAD(&device->resync_reads);
+ INIT_LIST_HEAD(&device->resync_work.list);
+ INIT_LIST_HEAD(&device->unplug_work.list);
+ INIT_LIST_HEAD(&device->bm_io_work.w.list);
+ INIT_LIST_HEAD(&device->pending_master_completion[0]);
+ INIT_LIST_HEAD(&device->pending_master_completion[1]);
+ INIT_LIST_HEAD(&device->pending_completion[0]);
+ INIT_LIST_HEAD(&device->pending_completion[1]);
+
+ device->resync_work.cb = w_resync_timer;
+ device->unplug_work.cb = w_send_write_hint;
+ device->bm_io_work.w.cb = w_bitmap_io;
+
+ init_timer(&device->resync_timer);
+ init_timer(&device->md_sync_timer);
+ init_timer(&device->start_resync_timer);
+ init_timer(&device->request_timer);
+ device->resync_timer.function = resync_timer_fn;
+ device->resync_timer.data = (unsigned long) device;
+ device->md_sync_timer.function = md_sync_timer_fn;
+ device->md_sync_timer.data = (unsigned long) device;
+ device->start_resync_timer.function = start_resync_timer_fn;
+ device->start_resync_timer.data = (unsigned long) device;
+ device->request_timer.function = request_timer_fn;
+ device->request_timer.data = (unsigned long) device;
+
+ init_waitqueue_head(&device->misc_wait);
+ init_waitqueue_head(&device->state_wait);
+ init_waitqueue_head(&device->ee_wait);
+ init_waitqueue_head(&device->al_wait);
+ init_waitqueue_head(&device->seq_wait);
+
+ device->resync_wenr = LC_FREE;
+ device->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
+ device->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
+}
+
+void drbd_device_cleanup(struct drbd_device *device)
+{
+ int i;
+ if (first_peer_device(device)->connection->receiver.t_state != NONE)
+ drbd_err(device, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
+ first_peer_device(device)->connection->receiver.t_state);
+
+ device->al_writ_cnt =
+ device->bm_writ_cnt =
+ device->read_cnt =
+ device->recv_cnt =
+ device->send_cnt =
+ device->writ_cnt =
+ device->p_size =
+ device->rs_start =
+ device->rs_total =
+ device->rs_failed = 0;
+ device->rs_last_events = 0;
+ device->rs_last_sect_ev = 0;
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ device->rs_mark_left[i] = 0;
+ device->rs_mark_time[i] = 0;
+ }
+ D_ASSERT(device, first_peer_device(device)->connection->net_conf == NULL);
+
+ drbd_set_my_capacity(device, 0);
+ if (device->bitmap) {
+ /* maybe never allocated. */
+ drbd_bm_resize(device, 0, 1);
+ drbd_bm_cleanup(device);
+ }
+
+ drbd_free_ldev(device->ldev);
+ device->ldev = NULL;
+
+ clear_bit(AL_SUSPENDED, &device->flags);
+
+ D_ASSERT(device, list_empty(&device->active_ee));
+ D_ASSERT(device, list_empty(&device->sync_ee));
+ D_ASSERT(device, list_empty(&device->done_ee));
+ D_ASSERT(device, list_empty(&device->read_ee));
+ D_ASSERT(device, list_empty(&device->net_ee));
+ D_ASSERT(device, list_empty(&device->resync_reads));
+ D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q));
+ D_ASSERT(device, list_empty(&device->resync_work.list));
+ D_ASSERT(device, list_empty(&device->unplug_work.list));
+
+ drbd_set_defaults(device);
+}
+
+
+static void drbd_destroy_mempools(void)
+{
+ struct page *page;
+
+ while (drbd_pp_pool) {
+ page = drbd_pp_pool;
+ drbd_pp_pool = (struct page *)page_private(page);
+ __free_page(page);
+ drbd_pp_vacant--;
+ }
+
+ /* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */
+
+ if (drbd_md_io_bio_set)
+ bioset_free(drbd_md_io_bio_set);
+ if (drbd_md_io_page_pool)
+ mempool_destroy(drbd_md_io_page_pool);
+ if (drbd_ee_mempool)
+ mempool_destroy(drbd_ee_mempool);
+ if (drbd_request_mempool)
+ mempool_destroy(drbd_request_mempool);
+ if (drbd_ee_cache)
+ kmem_cache_destroy(drbd_ee_cache);
+ if (drbd_request_cache)
+ kmem_cache_destroy(drbd_request_cache);
+ if (drbd_bm_ext_cache)
+ kmem_cache_destroy(drbd_bm_ext_cache);
+ if (drbd_al_ext_cache)
+ kmem_cache_destroy(drbd_al_ext_cache);
+
+ drbd_md_io_bio_set = NULL;
+ drbd_md_io_page_pool = NULL;
+ drbd_ee_mempool = NULL;
+ drbd_request_mempool = NULL;
+ drbd_ee_cache = NULL;
+ drbd_request_cache = NULL;
+ drbd_bm_ext_cache = NULL;
+ drbd_al_ext_cache = NULL;
+
+ return;
+}
+
+static int drbd_create_mempools(void)
+{
+ struct page *page;
+ const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
+ int i;
+
+ /* prepare our caches and mempools */
+ drbd_request_mempool = NULL;
+ drbd_ee_cache = NULL;
+ drbd_request_cache = NULL;
+ drbd_bm_ext_cache = NULL;
+ drbd_al_ext_cache = NULL;
+ drbd_pp_pool = NULL;
+ drbd_md_io_page_pool = NULL;
+ drbd_md_io_bio_set = NULL;
+
+ /* caches */
+ drbd_request_cache = kmem_cache_create(
+ "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
+ if (drbd_request_cache == NULL)
+ goto Enomem;
+
+ drbd_ee_cache = kmem_cache_create(
+ "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL);
+ if (drbd_ee_cache == NULL)
+ goto Enomem;
+
+ drbd_bm_ext_cache = kmem_cache_create(
+ "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
+ if (drbd_bm_ext_cache == NULL)
+ goto Enomem;
+
+ drbd_al_ext_cache = kmem_cache_create(
+ "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
+ if (drbd_al_ext_cache == NULL)
+ goto Enomem;
+
+ /* mempools */
+ drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
+ if (drbd_md_io_bio_set == NULL)
+ goto Enomem;
+
+ drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
+ if (drbd_md_io_page_pool == NULL)
+ goto Enomem;
+
+ drbd_request_mempool = mempool_create_slab_pool(number,
+ drbd_request_cache);
+ if (drbd_request_mempool == NULL)
+ goto Enomem;
+
+ drbd_ee_mempool = mempool_create_slab_pool(number, drbd_ee_cache);
+ if (drbd_ee_mempool == NULL)
+ goto Enomem;
+
+ /* drbd's page pool */
+ spin_lock_init(&drbd_pp_lock);
+
+ for (i = 0; i < number; i++) {
+ page = alloc_page(GFP_HIGHUSER);
+ if (!page)
+ goto Enomem;
+ set_page_private(page, (unsigned long)drbd_pp_pool);
+ drbd_pp_pool = page;
+ }
+ drbd_pp_vacant = number;
+
+ return 0;
+
+Enomem:
+ drbd_destroy_mempools(); /* in case we allocated some */
+ return -ENOMEM;
+}
+
+static void drbd_release_all_peer_reqs(struct drbd_device *device)
+{
+ int rr;
+
+ rr = drbd_free_peer_reqs(device, &device->active_ee);
+ if (rr)
+ drbd_err(device, "%d EEs in active list found!\n", rr);
+
+ rr = drbd_free_peer_reqs(device, &device->sync_ee);
+ if (rr)
+ drbd_err(device, "%d EEs in sync list found!\n", rr);
+
+ rr = drbd_free_peer_reqs(device, &device->read_ee);
+ if (rr)
+ drbd_err(device, "%d EEs in read list found!\n", rr);
+
+ rr = drbd_free_peer_reqs(device, &device->done_ee);
+ if (rr)
+ drbd_err(device, "%d EEs in done list found!\n", rr);
+
+ rr = drbd_free_peer_reqs(device, &device->net_ee);
+ if (rr)
+ drbd_err(device, "%d EEs in net list found!\n", rr);
+}
+
+/* caution. no locking. */
+void drbd_destroy_device(struct kref *kref)
+{
+ struct drbd_device *device = container_of(kref, struct drbd_device, kref);
+ struct drbd_resource *resource = device->resource;
+ struct drbd_peer_device *peer_device, *tmp_peer_device;
+
+ del_timer_sync(&device->request_timer);
+
+ /* paranoia asserts */
+ D_ASSERT(device, device->open_cnt == 0);
+ /* end paranoia asserts */
+
+ /* cleanup stuff that may have been allocated during
+ * device (re-)configuration or state changes */
+
+ if (device->this_bdev)
+ bdput(device->this_bdev);
+
+ drbd_free_ldev(device->ldev);
+ device->ldev = NULL;
+
+ drbd_release_all_peer_reqs(device);
+
+ lc_destroy(device->act_log);
+ lc_destroy(device->resync);
+
+ kfree(device->p_uuid);
+ /* device->p_uuid = NULL; */
+
+ if (device->bitmap) /* should no longer be there. */
+ drbd_bm_cleanup(device);
+ __free_page(device->md_io.page);
+ put_disk(device->vdisk);
+ blk_cleanup_queue(device->rq_queue);
+ kfree(device->rs_plan_s);
+
+ /* not for_each_connection(connection, resource):
+ * those may have been cleaned up and disassociated already.
+ */
+ for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
+ kref_put(&peer_device->connection->kref, drbd_destroy_connection);
+ kfree(peer_device);
+ }
+ memset(device, 0xfd, sizeof(*device));
+ kfree(device);
+ kref_put(&resource->kref, drbd_destroy_resource);
+}
+
+/* One global retry thread, if we need to push back some bio and have it
+ * reinserted through our make request function.
+ */
+static struct retry_worker {
+ struct workqueue_struct *wq;
+ struct work_struct worker;
+
+ spinlock_t lock;
+ struct list_head writes;
+} retry;
+
+static void do_retry(struct work_struct *ws)
+{
+ struct retry_worker *retry = container_of(ws, struct retry_worker, worker);
+ LIST_HEAD(writes);
+ struct drbd_request *req, *tmp;
+
+ spin_lock_irq(&retry->lock);
+ list_splice_init(&retry->writes, &writes);
+ spin_unlock_irq(&retry->lock);
+
+ list_for_each_entry_safe(req, tmp, &writes, tl_requests) {
+ struct drbd_device *device = req->device;
+ struct bio *bio = req->master_bio;
+ unsigned long start_jif = req->start_jif;
+ bool expected;
+
+ expected =
+ expect(atomic_read(&req->completion_ref) == 0) &&
+ expect(req->rq_state & RQ_POSTPONED) &&
+ expect((req->rq_state & RQ_LOCAL_PENDING) == 0 ||
+ (req->rq_state & RQ_LOCAL_ABORTED) != 0);
+
+ if (!expected)
+ drbd_err(device, "req=%p completion_ref=%d rq_state=%x\n",
+ req, atomic_read(&req->completion_ref),
+ req->rq_state);
+
+ /* We still need to put one kref associated with the
+ * "completion_ref" going zero in the code path that queued it
+ * here. The request object may still be referenced by a
+ * frozen local req->private_bio, in case we force-detached.
+ */
+ kref_put(&req->kref, drbd_req_destroy);
+
+ /* A single suspended or otherwise blocking device may stall
+ * all others as well. Fortunately, this code path is to
+ * recover from a situation that "should not happen":
+ * concurrent writes in multi-primary setup.
+ * In a "normal" lifecycle, this workqueue is supposed to be
+ * destroyed without ever doing anything.
+ * If it turns out to be an issue anyways, we can do per
+ * resource (replication group) or per device (minor) retry
+ * workqueues instead.
+ */
+
+ /* We are not just doing generic_make_request(),
+ * as we want to keep the start_time information. */
+ inc_ap_bio(device);
+ __drbd_make_request(device, bio, start_jif);
+ }
+}
+
+/* called via drbd_req_put_completion_ref(),
+ * holds resource->req_lock */
+void drbd_restart_request(struct drbd_request *req)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&retry.lock, flags);
+ list_move_tail(&req->tl_requests, &retry.writes);
+ spin_unlock_irqrestore(&retry.lock, flags);
+
+ /* Drop the extra reference that would otherwise
+ * have been dropped by complete_master_bio.
+ * do_retry() needs to grab a new one. */
+ dec_ap_bio(req->device);
+
+ queue_work(retry.wq, &retry.worker);
+}
+
+void drbd_destroy_resource(struct kref *kref)
+{
+ struct drbd_resource *resource =
+ container_of(kref, struct drbd_resource, kref);
+
+ idr_destroy(&resource->devices);
+ free_cpumask_var(resource->cpu_mask);
+ kfree(resource->name);
+ memset(resource, 0xf2, sizeof(*resource));
+ kfree(resource);
+}
+
+void drbd_free_resource(struct drbd_resource *resource)
+{
+ struct drbd_connection *connection, *tmp;
+
+ for_each_connection_safe(connection, tmp, resource) {
+ list_del(&connection->connections);
+ drbd_debugfs_connection_cleanup(connection);
+ kref_put(&connection->kref, drbd_destroy_connection);
+ }
+ drbd_debugfs_resource_cleanup(resource);
+ kref_put(&resource->kref, drbd_destroy_resource);
+}
+
+static void drbd_cleanup(void)
+{
+ unsigned int i;
+ struct drbd_device *device;
+ struct drbd_resource *resource, *tmp;
+
+ /* first remove proc,
+ * drbdsetup uses it's presence to detect
+ * whether DRBD is loaded.
+ * If we would get stuck in proc removal,
+ * but have netlink already deregistered,
+ * some drbdsetup commands may wait forever
+ * for an answer.
+ */
+ if (drbd_proc)
+ remove_proc_entry("drbd", NULL);
+
+ if (retry.wq)
+ destroy_workqueue(retry.wq);
+
+ drbd_genl_unregister();
+ drbd_debugfs_cleanup();
+
+ idr_for_each_entry(&drbd_devices, device, i)
+ drbd_delete_device(device);
+
+ /* not _rcu since, no other updater anymore. Genl already unregistered */
+ for_each_resource_safe(resource, tmp, &drbd_resources) {
+ list_del(&resource->resources);
+ drbd_free_resource(resource);
+ }
+
+ drbd_destroy_mempools();
+ unregister_blkdev(DRBD_MAJOR, "drbd");
+
+ idr_destroy(&drbd_devices);
+
+ pr_info("module cleanup done.\n");
+}
+
+/**
+ * drbd_congested() - Callback for the flusher thread
+ * @congested_data: User data
+ * @bdi_bits: Bits the BDI flusher thread is currently interested in
+ *
+ * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
+ */
+static int drbd_congested(void *congested_data, int bdi_bits)
+{
+ struct drbd_device *device = congested_data;
+ struct request_queue *q;
+ char reason = '-';
+ int r = 0;
+
+ if (!may_inc_ap_bio(device)) {
+ /* DRBD has frozen IO */
+ r = bdi_bits;
+ reason = 'd';
+ goto out;
+ }
+
+ if (test_bit(CALLBACK_PENDING, &first_peer_device(device)->connection->flags)) {
+ r |= (1 << BDI_async_congested);
+ /* Without good local data, we would need to read from remote,
+ * and that would need the worker thread as well, which is
+ * currently blocked waiting for that usermode helper to
+ * finish.
+ */
+ if (!get_ldev_if_state(device, D_UP_TO_DATE))
+ r |= (1 << BDI_sync_congested);
+ else
+ put_ldev(device);
+ r &= bdi_bits;
+ reason = 'c';
+ goto out;
+ }
+
+ if (get_ldev(device)) {
+ q = bdev_get_queue(device->ldev->backing_bdev);
+ r = bdi_congested(&q->backing_dev_info, bdi_bits);
+ put_ldev(device);
+ if (r)
+ reason = 'b';
+ }
+
+ if (bdi_bits & (1 << BDI_async_congested) &&
+ test_bit(NET_CONGESTED, &first_peer_device(device)->connection->flags)) {
+ r |= (1 << BDI_async_congested);
+ reason = reason == 'b' ? 'a' : 'n';
+ }
+
+out:
+ device->congestion_reason = reason;
+ return r;
+}
+
+static void drbd_init_workqueue(struct drbd_work_queue* wq)
+{
+ spin_lock_init(&wq->q_lock);
+ INIT_LIST_HEAD(&wq->q);
+ init_waitqueue_head(&wq->q_wait);
+}
+
+struct completion_work {
+ struct drbd_work w;
+ struct completion done;
+};
+
+static int w_complete(struct drbd_work *w, int cancel)
+{
+ struct completion_work *completion_work =
+ container_of(w, struct completion_work, w);
+
+ complete(&completion_work->done);
+ return 0;
+}
+
+void drbd_flush_workqueue(struct drbd_work_queue *work_queue)
+{
+ struct completion_work completion_work;
+
+ completion_work.w.cb = w_complete;
+ init_completion(&completion_work.done);
+ drbd_queue_work(work_queue, &completion_work.w);
+ wait_for_completion(&completion_work.done);
+}
+
+struct drbd_resource *drbd_find_resource(const char *name)
+{
+ struct drbd_resource *resource;
+
+ if (!name || !name[0])
+ return NULL;
+
+ rcu_read_lock();
+ for_each_resource_rcu(resource, &drbd_resources) {
+ if (!strcmp(resource->name, name)) {
+ kref_get(&resource->kref);
+ goto found;
+ }
+ }
+ resource = NULL;
+found:
+ rcu_read_unlock();
+ return resource;
+}
+
+struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len,
+ void *peer_addr, int peer_addr_len)
+{
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
+
+ rcu_read_lock();
+ for_each_resource_rcu(resource, &drbd_resources) {
+ for_each_connection_rcu(connection, resource) {
+ if (connection->my_addr_len == my_addr_len &&
+ connection->peer_addr_len == peer_addr_len &&
+ !memcmp(&connection->my_addr, my_addr, my_addr_len) &&
+ !memcmp(&connection->peer_addr, peer_addr, peer_addr_len)) {
+ kref_get(&connection->kref);
+ goto found;
+ }
+ }
+ }
+ connection = NULL;
+found:
+ rcu_read_unlock();
+ return connection;
+}
+
+static int drbd_alloc_socket(struct drbd_socket *socket)
+{
+ socket->rbuf = (void *) __get_free_page(GFP_KERNEL);
+ if (!socket->rbuf)
+ return -ENOMEM;
+ socket->sbuf = (void *) __get_free_page(GFP_KERNEL);
+ if (!socket->sbuf)
+ return -ENOMEM;
+ return 0;
+}
+
+static void drbd_free_socket(struct drbd_socket *socket)
+{
+ free_page((unsigned long) socket->sbuf);
+ free_page((unsigned long) socket->rbuf);
+}
+
+void conn_free_crypto(struct drbd_connection *connection)
+{
+ drbd_free_sock(connection);
+
+ crypto_free_hash(connection->csums_tfm);
+ crypto_free_hash(connection->verify_tfm);
+ crypto_free_hash(connection->cram_hmac_tfm);
+ crypto_free_hash(connection->integrity_tfm);
+ crypto_free_hash(connection->peer_integrity_tfm);
+ kfree(connection->int_dig_in);
+ kfree(connection->int_dig_vv);
+
+ connection->csums_tfm = NULL;
+ connection->verify_tfm = NULL;
+ connection->cram_hmac_tfm = NULL;
+ connection->integrity_tfm = NULL;
+ connection->peer_integrity_tfm = NULL;
+ connection->int_dig_in = NULL;
+ connection->int_dig_vv = NULL;
+}
+
+int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts)
+{
+ struct drbd_connection *connection;
+ cpumask_var_t new_cpu_mask;
+ int err;
+
+ if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ /* silently ignore cpu mask on UP kernel */
+ if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) {
+ err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE,
+ cpumask_bits(new_cpu_mask), nr_cpu_ids);
+ if (err == -EOVERFLOW) {
+ /* So what. mask it out. */
+ cpumask_var_t tmp_cpu_mask;
+ if (zalloc_cpumask_var(&tmp_cpu_mask, GFP_KERNEL)) {
+ cpumask_setall(tmp_cpu_mask);
+ cpumask_and(new_cpu_mask, new_cpu_mask, tmp_cpu_mask);
+ drbd_warn(resource, "Overflow in bitmap_parse(%.12s%s), truncating to %u bits\n",
+ res_opts->cpu_mask,
+ strlen(res_opts->cpu_mask) > 12 ? "..." : "",
+ nr_cpu_ids);
+ free_cpumask_var(tmp_cpu_mask);
+ err = 0;
+ }
+ }
+ if (err) {
+ drbd_warn(resource, "bitmap_parse() failed with %d\n", err);
+ /* retcode = ERR_CPU_MASK_PARSE; */
+ goto fail;
+ }
+ }
+ resource->res_opts = *res_opts;
+ if (cpumask_empty(new_cpu_mask))
+ drbd_calc_cpu_mask(&new_cpu_mask);
+ if (!cpumask_equal(resource->cpu_mask, new_cpu_mask)) {
+ cpumask_copy(resource->cpu_mask, new_cpu_mask);
+ for_each_connection_rcu(connection, resource) {
+ connection->receiver.reset_cpu_mask = 1;
+ connection->asender.reset_cpu_mask = 1;
+ connection->worker.reset_cpu_mask = 1;
+ }
+ }
+ err = 0;
+
+fail:
+ free_cpumask_var(new_cpu_mask);
+ return err;
+
+}
+
+struct drbd_resource *drbd_create_resource(const char *name)
+{
+ struct drbd_resource *resource;
+
+ resource = kzalloc(sizeof(struct drbd_resource), GFP_KERNEL);
+ if (!resource)
+ goto fail;
+ resource->name = kstrdup(name, GFP_KERNEL);
+ if (!resource->name)
+ goto fail_free_resource;
+ if (!zalloc_cpumask_var(&resource->cpu_mask, GFP_KERNEL))
+ goto fail_free_name;
+ kref_init(&resource->kref);
+ idr_init(&resource->devices);
+ INIT_LIST_HEAD(&resource->connections);
+ resource->write_ordering = WO_bdev_flush;
+ list_add_tail_rcu(&resource->resources, &drbd_resources);
+ mutex_init(&resource->conf_update);
+ mutex_init(&resource->adm_mutex);
+ spin_lock_init(&resource->req_lock);
+ drbd_debugfs_resource_add(resource);
+ return resource;
+
+fail_free_name:
+ kfree(resource->name);
+fail_free_resource:
+ kfree(resource);
+fail:
+ return NULL;
+}
+
+/* caller must be under adm_mutex */
+struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
+{
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
+
+ connection = kzalloc(sizeof(struct drbd_connection), GFP_KERNEL);
+ if (!connection)
+ return NULL;
+
+ if (drbd_alloc_socket(&connection->data))
+ goto fail;
+ if (drbd_alloc_socket(&connection->meta))
+ goto fail;
+
+ connection->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
+ if (!connection->current_epoch)
+ goto fail;
+
+ INIT_LIST_HEAD(&connection->transfer_log);
+
+ INIT_LIST_HEAD(&connection->current_epoch->list);
+ connection->epochs = 1;
+ spin_lock_init(&connection->epoch_lock);
+
+ connection->send.seen_any_write_yet = false;
+ connection->send.current_epoch_nr = 0;
+ connection->send.current_epoch_writes = 0;
+
+ resource = drbd_create_resource(name);
+ if (!resource)
+ goto fail;
+
+ connection->cstate = C_STANDALONE;
+ mutex_init(&connection->cstate_mutex);
+ init_waitqueue_head(&connection->ping_wait);
+ idr_init(&connection->peer_devices);
+
+ drbd_init_workqueue(&connection->sender_work);
+ mutex_init(&connection->data.mutex);
+ mutex_init(&connection->meta.mutex);
+
+ drbd_thread_init(resource, &connection->receiver, drbd_receiver, "receiver");
+ connection->receiver.connection = connection;
+ drbd_thread_init(resource, &connection->worker, drbd_worker, "worker");
+ connection->worker.connection = connection;
+ drbd_thread_init(resource, &connection->asender, drbd_asender, "asender");
+ connection->asender.connection = connection;
+
+ kref_init(&connection->kref);
+
+ connection->resource = resource;
+
+ if (set_resource_options(resource, res_opts))
+ goto fail_resource;
+
+ kref_get(&resource->kref);
+ list_add_tail_rcu(&connection->connections, &resource->connections);
+ drbd_debugfs_connection_add(connection);
+ return connection;
+
+fail_resource:
+ list_del(&resource->resources);
+ drbd_free_resource(resource);
+fail:
+ kfree(connection->current_epoch);
+ drbd_free_socket(&connection->meta);
+ drbd_free_socket(&connection->data);
+ kfree(connection);
+ return NULL;
+}
+
+void drbd_destroy_connection(struct kref *kref)
+{
+ struct drbd_connection *connection = container_of(kref, struct drbd_connection, kref);
+ struct drbd_resource *resource = connection->resource;
+
+ if (atomic_read(&connection->current_epoch->epoch_size) != 0)
+ drbd_err(connection, "epoch_size:%d\n", atomic_read(&connection->current_epoch->epoch_size));
+ kfree(connection->current_epoch);
+
+ idr_destroy(&connection->peer_devices);
+
+ drbd_free_socket(&connection->meta);
+ drbd_free_socket(&connection->data);
+ kfree(connection->int_dig_in);
+ kfree(connection->int_dig_vv);
+ memset(connection, 0xfc, sizeof(*connection));
+ kfree(connection);
+ kref_put(&resource->kref, drbd_destroy_resource);
+}
+
+static int init_submitter(struct drbd_device *device)
+{
+ /* opencoded create_singlethread_workqueue(),
+ * to be able to say "drbd%d", ..., minor */
+ device->submit.wq = alloc_workqueue("drbd%u_submit",
+ WQ_UNBOUND | WQ_MEM_RECLAIM, 1, device->minor);
+ if (!device->submit.wq)
+ return -ENOMEM;
+
+ INIT_WORK(&device->submit.worker, do_submit);
+ INIT_LIST_HEAD(&device->submit.writes);
+ return 0;
+}
+
+enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor)
+{
+ struct drbd_resource *resource = adm_ctx->resource;
+ struct drbd_connection *connection;
+ struct drbd_device *device;
+ struct drbd_peer_device *peer_device, *tmp_peer_device;
+ struct gendisk *disk;
+ struct request_queue *q;
+ int id;
+ int vnr = adm_ctx->volume;
+ enum drbd_ret_code err = ERR_NOMEM;
+
+ device = minor_to_device(minor);
+ if (device)
+ return ERR_MINOR_OR_VOLUME_EXISTS;
+
+ /* GFP_KERNEL, we are outside of all write-out paths */
+ device = kzalloc(sizeof(struct drbd_device), GFP_KERNEL);
+ if (!device)
+ return ERR_NOMEM;
+ kref_init(&device->kref);
+
+ kref_get(&resource->kref);
+ device->resource = resource;
+ device->minor = minor;
+ device->vnr = vnr;
+
+ drbd_init_set_defaults(device);
+
+ q = blk_alloc_queue(GFP_KERNEL);
+ if (!q)
+ goto out_no_q;
+ device->rq_queue = q;
+ q->queuedata = device;
+
+ disk = alloc_disk(1);
+ if (!disk)
+ goto out_no_disk;
+ device->vdisk = disk;
+
+ set_disk_ro(disk, true);
+
+ disk->queue = q;
+ disk->major = DRBD_MAJOR;
+ disk->first_minor = minor;
+ disk->fops = &drbd_ops;
+ sprintf(disk->disk_name, "drbd%d", minor);
+ disk->private_data = device;
+
+ device->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
+ /* we have no partitions. we contain only ourselves. */
+ device->this_bdev->bd_contains = device->this_bdev;
+
+ q->backing_dev_info.congested_fn = drbd_congested;
+ q->backing_dev_info.congested_data = device;
+
+ blk_queue_make_request(q, drbd_make_request);
+ blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
+ /* Setting the max_hw_sectors to an odd value of 8kibyte here
+ This triggers a max_bio_size message upon first attach or connect */
+ blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
+ blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
+ blk_queue_merge_bvec(q, drbd_merge_bvec);
+ q->queue_lock = &resource->req_lock;
+
+ device->md_io.page = alloc_page(GFP_KERNEL);
+ if (!device->md_io.page)
+ goto out_no_io_page;
+
+ if (drbd_bm_init(device))
+ goto out_no_bitmap;
+ device->read_requests = RB_ROOT;
+ device->write_requests = RB_ROOT;
+
+ id = idr_alloc(&drbd_devices, device, minor, minor + 1, GFP_KERNEL);
+ if (id < 0) {
+ if (id == -ENOSPC)
+ err = ERR_MINOR_OR_VOLUME_EXISTS;
+ goto out_no_minor_idr;
+ }
+ kref_get(&device->kref);
+
+ id = idr_alloc(&resource->devices, device, vnr, vnr + 1, GFP_KERNEL);
+ if (id < 0) {
+ if (id == -ENOSPC)
+ err = ERR_MINOR_OR_VOLUME_EXISTS;
+ goto out_idr_remove_minor;
+ }
+ kref_get(&device->kref);
+
+ INIT_LIST_HEAD(&device->peer_devices);
+ INIT_LIST_HEAD(&device->pending_bitmap_io);
+ for_each_connection(connection, resource) {
+ peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL);
+ if (!peer_device)
+ goto out_idr_remove_from_resource;
+ peer_device->connection = connection;
+ peer_device->device = device;
+
+ list_add(&peer_device->peer_devices, &device->peer_devices);
+ kref_get(&device->kref);
+
+ id = idr_alloc(&connection->peer_devices, peer_device, vnr, vnr + 1, GFP_KERNEL);
+ if (id < 0) {
+ if (id == -ENOSPC)
+ err = ERR_INVALID_REQUEST;
+ goto out_idr_remove_from_resource;
+ }
+ kref_get(&connection->kref);
+ }
+
+ if (init_submitter(device)) {
+ err = ERR_NOMEM;
+ goto out_idr_remove_vol;
+ }
+
+ add_disk(disk);
+
+ /* inherit the connection state */
+ device->state.conn = first_connection(resource)->cstate;
+ if (device->state.conn == C_WF_REPORT_PARAMS) {
+ for_each_peer_device(peer_device, device)
+ drbd_connected(peer_device);
+ }
+ /* move to create_peer_device() */
+ for_each_peer_device(peer_device, device)
+ drbd_debugfs_peer_device_add(peer_device);
+ drbd_debugfs_device_add(device);
+ return NO_ERROR;
+
+out_idr_remove_vol:
+ idr_remove(&connection->peer_devices, vnr);
+out_idr_remove_from_resource:
+ for_each_connection(connection, resource) {
+ peer_device = idr_find(&connection->peer_devices, vnr);
+ if (peer_device) {
+ idr_remove(&connection->peer_devices, vnr);
+ kref_put(&connection->kref, drbd_destroy_connection);
+ }
+ }
+ for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
+ list_del(&peer_device->peer_devices);
+ kfree(peer_device);
+ }
+ idr_remove(&resource->devices, vnr);
+out_idr_remove_minor:
+ idr_remove(&drbd_devices, minor);
+ synchronize_rcu();
+out_no_minor_idr:
+ drbd_bm_cleanup(device);
+out_no_bitmap:
+ __free_page(device->md_io.page);
+out_no_io_page:
+ put_disk(disk);
+out_no_disk:
+ blk_cleanup_queue(q);
+out_no_q:
+ kref_put(&resource->kref, drbd_destroy_resource);
+ kfree(device);
+ return err;
+}
+
+void drbd_delete_device(struct drbd_device *device)
+{
+ struct drbd_resource *resource = device->resource;
+ struct drbd_connection *connection;
+ struct drbd_peer_device *peer_device;
+ int refs = 3;
+
+ /* move to free_peer_device() */
+ for_each_peer_device(peer_device, device)
+ drbd_debugfs_peer_device_cleanup(peer_device);
+ drbd_debugfs_device_cleanup(device);
+ for_each_connection(connection, resource) {
+ idr_remove(&connection->peer_devices, device->vnr);
+ refs++;
+ }
+ idr_remove(&resource->devices, device->vnr);
+ idr_remove(&drbd_devices, device_to_minor(device));
+ del_gendisk(device->vdisk);
+ synchronize_rcu();
+ kref_sub(&device->kref, refs, drbd_destroy_device);
+}
+
+static int __init drbd_init(void)
+{
+ int err;
+
+ if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
+ pr_err("invalid minor_count (%d)\n", minor_count);
+#ifdef MODULE
+ return -EINVAL;
+#else
+ minor_count = DRBD_MINOR_COUNT_DEF;
+#endif
+ }
+
+ err = register_blkdev(DRBD_MAJOR, "drbd");
+ if (err) {
+ pr_err("unable to register block device major %d\n",
+ DRBD_MAJOR);
+ return err;
+ }
+
+ /*
+ * allocate all necessary structs
+ */
+ init_waitqueue_head(&drbd_pp_wait);
+
+ drbd_proc = NULL; /* play safe for drbd_cleanup */
+ idr_init(&drbd_devices);
+
+ rwlock_init(&global_state_lock);
+ INIT_LIST_HEAD(&drbd_resources);
+
+ err = drbd_genl_register();
+ if (err) {
+ pr_err("unable to register generic netlink family\n");
+ goto fail;
+ }
+
+ err = drbd_create_mempools();
+ if (err)
+ goto fail;
+
+ err = -ENOMEM;
+ drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
+ if (!drbd_proc) {
+ pr_err("unable to register proc file\n");
+ goto fail;
+ }
+
+ retry.wq = create_singlethread_workqueue("drbd-reissue");
+ if (!retry.wq) {
+ pr_err("unable to create retry workqueue\n");
+ goto fail;
+ }
+ INIT_WORK(&retry.worker, do_retry);
+ spin_lock_init(&retry.lock);
+ INIT_LIST_HEAD(&retry.writes);
+
+ if (drbd_debugfs_init())
+ pr_notice("failed to initialize debugfs -- will not be available\n");
+
+ pr_info("initialized. "
+ "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
+ API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
+ pr_info("%s\n", drbd_buildtag());
+ pr_info("registered as block device major %d\n", DRBD_MAJOR);
+ return 0; /* Success! */
+
+fail:
+ drbd_cleanup();
+ if (err == -ENOMEM)
+ pr_err("ran out of memory\n");
+ else
+ pr_err("initialization failure\n");
+ return err;
+}
+
+void drbd_free_ldev(struct drbd_backing_dev *ldev)
+{
+ if (ldev == NULL)
+ return;
+
+ blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+
+ kfree(ldev->disk_conf);
+ kfree(ldev);
+}
+
+static void drbd_free_one_sock(struct drbd_socket *ds)
+{
+ struct socket *s;
+ mutex_lock(&ds->mutex);
+ s = ds->socket;
+ ds->socket = NULL;
+ mutex_unlock(&ds->mutex);
+ if (s) {
+ /* so debugfs does not need to mutex_lock() */
+ synchronize_rcu();
+ kernel_sock_shutdown(s, SHUT_RDWR);
+ sock_release(s);
+ }
+}
+
+void drbd_free_sock(struct drbd_connection *connection)
+{
+ if (connection->data.socket)
+ drbd_free_one_sock(&connection->data);
+ if (connection->meta.socket)
+ drbd_free_one_sock(&connection->meta);
+}
+
+/* meta data management */
+
+void conn_md_sync(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+
+ kref_get(&device->kref);
+ rcu_read_unlock();
+ drbd_md_sync(device);
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+}
+
+/* aligned 4kByte */
+struct meta_data_on_disk {
+ u64 la_size_sect; /* last agreed size. */
+ u64 uuid[UI_SIZE]; /* UUIDs. */
+ u64 device_uuid;
+ u64 reserved_u64_1;
+ u32 flags; /* MDF */
+ u32 magic;
+ u32 md_size_sect;
+ u32 al_offset; /* offset to this block */
+ u32 al_nr_extents; /* important for restoring the AL (userspace) */
+ /* `-- act_log->nr_elements <-- ldev->dc.al_extents */
+ u32 bm_offset; /* offset to the bitmap, from here */
+ u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
+ u32 la_peer_max_bio_size; /* last peer max_bio_size */
+
+ /* see al_tr_number_to_on_disk_sector() */
+ u32 al_stripes;
+ u32 al_stripe_size_4k;
+
+ u8 reserved_u8[4096 - (7*8 + 10*4)];
+} __packed;
+
+
+
+void drbd_md_write(struct drbd_device *device, void *b)
+{
+ struct meta_data_on_disk *buffer = b;
+ sector_t sector;
+ int i;
+
+ memset(buffer, 0, sizeof(*buffer));
+
+ buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(device->this_bdev));
+ for (i = UI_CURRENT; i < UI_SIZE; i++)
+ buffer->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]);
+ buffer->flags = cpu_to_be32(device->ldev->md.flags);
+ buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN);
+
+ buffer->md_size_sect = cpu_to_be32(device->ldev->md.md_size_sect);
+ buffer->al_offset = cpu_to_be32(device->ldev->md.al_offset);
+ buffer->al_nr_extents = cpu_to_be32(device->act_log->nr_elements);
+ buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
+ buffer->device_uuid = cpu_to_be64(device->ldev->md.device_uuid);
+
+ buffer->bm_offset = cpu_to_be32(device->ldev->md.bm_offset);
+ buffer->la_peer_max_bio_size = cpu_to_be32(device->peer_max_bio_size);
+
+ buffer->al_stripes = cpu_to_be32(device->ldev->md.al_stripes);
+ buffer->al_stripe_size_4k = cpu_to_be32(device->ldev->md.al_stripe_size_4k);
+
+ D_ASSERT(device, drbd_md_ss(device->ldev) == device->ldev->md.md_offset);
+ sector = device->ldev->md.md_offset;
+
+ if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) {
+ /* this was a try anyways ... */
+ drbd_err(device, "meta data update failed!\n");
+ drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+ }
+}
+
+/**
+ * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
+ * @device: DRBD device.
+ */
+void drbd_md_sync(struct drbd_device *device)
+{
+ struct meta_data_on_disk *buffer;
+
+ /* Don't accidentally change the DRBD meta data layout. */
+ BUILD_BUG_ON(UI_SIZE != 4);
+ BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
+
+ del_timer(&device->md_sync_timer);
+ /* timer may be rearmed by drbd_md_mark_dirty() now. */
+ if (!test_and_clear_bit(MD_DIRTY, &device->flags))
+ return;
+
+ /* We use here D_FAILED and not D_ATTACHING because we try to write
+ * metadata even if we detach due to a disk failure! */
+ if (!get_ldev_if_state(device, D_FAILED))
+ return;
+
+ buffer = drbd_md_get_buffer(device, __func__);
+ if (!buffer)
+ goto out;
+
+ drbd_md_write(device, buffer);
+
+ /* Update device->ldev->md.la_size_sect,
+ * since we updated it on metadata. */
+ device->ldev->md.la_size_sect = drbd_get_capacity(device->this_bdev);
+
+ drbd_md_put_buffer(device);
+out:
+ put_ldev(device);
+}
+
+static int check_activity_log_stripe_size(struct drbd_device *device,
+ struct meta_data_on_disk *on_disk,
+ struct drbd_md *in_core)
+{
+ u32 al_stripes = be32_to_cpu(on_disk->al_stripes);
+ u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k);
+ u64 al_size_4k;
+
+ /* both not set: default to old fixed size activity log */
+ if (al_stripes == 0 && al_stripe_size_4k == 0) {
+ al_stripes = 1;
+ al_stripe_size_4k = MD_32kB_SECT/8;
+ }
+
+ /* some paranoia plausibility checks */
+
+ /* we need both values to be set */
+ if (al_stripes == 0 || al_stripe_size_4k == 0)
+ goto err;
+
+ al_size_4k = (u64)al_stripes * al_stripe_size_4k;
+
+ /* Upper limit of activity log area, to avoid potential overflow
+ * problems in al_tr_number_to_on_disk_sector(). As right now, more
+ * than 72 * 4k blocks total only increases the amount of history,
+ * limiting this arbitrarily to 16 GB is not a real limitation ;-) */
+ if (al_size_4k > (16 * 1024 * 1024/4))
+ goto err;
+
+ /* Lower limit: we need at least 8 transaction slots (32kB)
+ * to not break existing setups */
+ if (al_size_4k < MD_32kB_SECT/8)
+ goto err;
+
+ in_core->al_stripe_size_4k = al_stripe_size_4k;
+ in_core->al_stripes = al_stripes;
+ in_core->al_size_4k = al_size_4k;
+
+ return 0;
+err:
+ drbd_err(device, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n",
+ al_stripes, al_stripe_size_4k);
+ return -EINVAL;
+}
+
+static int check_offsets_and_sizes(struct drbd_device *device, struct drbd_backing_dev *bdev)
+{
+ sector_t capacity = drbd_get_capacity(bdev->md_bdev);
+ struct drbd_md *in_core = &bdev->md;
+ s32 on_disk_al_sect;
+ s32 on_disk_bm_sect;
+
+ /* The on-disk size of the activity log, calculated from offsets, and
+ * the size of the activity log calculated from the stripe settings,
+ * should match.
+ * Though we could relax this a bit: it is ok, if the striped activity log
+ * fits in the available on-disk activity log size.
+ * Right now, that would break how resize is implemented.
+ * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware
+ * of possible unused padding space in the on disk layout. */
+ if (in_core->al_offset < 0) {
+ if (in_core->bm_offset > in_core->al_offset)
+ goto err;
+ on_disk_al_sect = -in_core->al_offset;
+ on_disk_bm_sect = in_core->al_offset - in_core->bm_offset;
+ } else {
+ if (in_core->al_offset != MD_4kB_SECT)
+ goto err;
+ if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT)
+ goto err;
+
+ on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT;
+ on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset;
+ }
+
+ /* old fixed size meta data is exactly that: fixed. */
+ if (in_core->meta_dev_idx >= 0) {
+ if (in_core->md_size_sect != MD_128MB_SECT
+ || in_core->al_offset != MD_4kB_SECT
+ || in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT
+ || in_core->al_stripes != 1
+ || in_core->al_stripe_size_4k != MD_32kB_SECT/8)
+ goto err;
+ }
+
+ if (capacity < in_core->md_size_sect)
+ goto err;
+ if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev))
+ goto err;
+
+ /* should be aligned, and at least 32k */
+ if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT))
+ goto err;
+
+ /* should fit (for now: exactly) into the available on-disk space;
+ * overflow prevention is in check_activity_log_stripe_size() above. */
+ if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT)
+ goto err;
+
+ /* again, should be aligned */
+ if (in_core->bm_offset & 7)
+ goto err;
+
+ /* FIXME check for device grow with flex external meta data? */
+
+ /* can the available bitmap space cover the last agreed device size? */
+ if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512)
+ goto err;
+
+ return 0;
+
+err:
+ drbd_err(device, "meta data offsets don't make sense: idx=%d "
+ "al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, "
+ "md_size_sect=%u, la_size=%llu, md_capacity=%llu\n",
+ in_core->meta_dev_idx,
+ in_core->al_stripes, in_core->al_stripe_size_4k,
+ in_core->al_offset, in_core->bm_offset, in_core->md_size_sect,
+ (unsigned long long)in_core->la_size_sect,
+ (unsigned long long)capacity);
+
+ return -EINVAL;
+}
+
+
+/**
+ * drbd_md_read() - Reads in the meta data super block
+ * @device: DRBD device.
+ * @bdev: Device from which the meta data should be read in.
+ *
+ * Return NO_ERROR on success, and an enum drbd_ret_code in case
+ * something goes wrong.
+ *
+ * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS,
+ * even before @bdev is assigned to @device->ldev.
+ */
+int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
+{
+ struct meta_data_on_disk *buffer;
+ u32 magic, flags;
+ int i, rv = NO_ERROR;
+
+ if (device->state.disk != D_DISKLESS)
+ return ERR_DISK_CONFIGURED;
+
+ buffer = drbd_md_get_buffer(device, __func__);
+ if (!buffer)
+ return ERR_NOMEM;
+
+ /* First, figure out where our meta data superblock is located,
+ * and read it. */
+ bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
+ bdev->md.md_offset = drbd_md_ss(bdev);
+
+ if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) {
+ /* NOTE: can't do normal error processing here as this is
+ called BEFORE disk is attached */
+ drbd_err(device, "Error while reading metadata.\n");
+ rv = ERR_IO_MD_DISK;
+ goto err;
+ }
+
+ magic = be32_to_cpu(buffer->magic);
+ flags = be32_to_cpu(buffer->flags);
+ if (magic == DRBD_MD_MAGIC_84_UNCLEAN ||
+ (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) {
+ /* btw: that's Activity Log clean, not "all" clean. */
+ drbd_err(device, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n");
+ rv = ERR_MD_UNCLEAN;
+ goto err;
+ }
+
+ rv = ERR_MD_INVALID;
+ if (magic != DRBD_MD_MAGIC_08) {
+ if (magic == DRBD_MD_MAGIC_07)
+ drbd_err(device, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n");
+ else
+ drbd_err(device, "Meta data magic not found. Did you \"drbdadm create-md\"?\n");
+ goto err;
+ }
+
+ if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
+ drbd_err(device, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
+ be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
+ goto err;
+ }
+
+
+ /* convert to in_core endian */
+ bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect);
+ for (i = UI_CURRENT; i < UI_SIZE; i++)
+ bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
+ bdev->md.flags = be32_to_cpu(buffer->flags);
+ bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
+
+ bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect);
+ bdev->md.al_offset = be32_to_cpu(buffer->al_offset);
+ bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset);
+
+ if (check_activity_log_stripe_size(device, buffer, &bdev->md))
+ goto err;
+ if (check_offsets_and_sizes(device, bdev))
+ goto err;
+
+ if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
+ drbd_err(device, "unexpected bm_offset: %d (expected %d)\n",
+ be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
+ goto err;
+ }
+ if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
+ drbd_err(device, "unexpected md_size: %u (expected %u)\n",
+ be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
+ goto err;
+ }
+
+ rv = NO_ERROR;
+
+ spin_lock_irq(&device->resource->req_lock);
+ if (device->state.conn < C_CONNECTED) {
+ unsigned int peer;
+ peer = be32_to_cpu(buffer->la_peer_max_bio_size);
+ peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
+ device->peer_max_bio_size = peer;
+ }
+ spin_unlock_irq(&device->resource->req_lock);
+
+ err:
+ drbd_md_put_buffer(device);
+
+ return rv;
+}
+
+/**
+ * drbd_md_mark_dirty() - Mark meta data super block as dirty
+ * @device: DRBD device.
+ *
+ * Call this function if you change anything that should be written to
+ * the meta-data super block. This function sets MD_DIRTY, and starts a
+ * timer that ensures that within five seconds you have to call drbd_md_sync().
+ */
+#ifdef DEBUG
+void drbd_md_mark_dirty_(struct drbd_device *device, unsigned int line, const char *func)
+{
+ if (!test_and_set_bit(MD_DIRTY, &device->flags)) {
+ mod_timer(&device->md_sync_timer, jiffies + HZ);
+ device->last_md_mark_dirty.line = line;
+ device->last_md_mark_dirty.func = func;
+ }
+}
+#else
+void drbd_md_mark_dirty(struct drbd_device *device)
+{
+ if (!test_and_set_bit(MD_DIRTY, &device->flags))
+ mod_timer(&device->md_sync_timer, jiffies + 5*HZ);
+}
+#endif
+
+void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local)
+{
+ int i;
+
+ for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
+ device->ldev->md.uuid[i+1] = device->ldev->md.uuid[i];
+}
+
+void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
+{
+ if (idx == UI_CURRENT) {
+ if (device->state.role == R_PRIMARY)
+ val |= 1;
+ else
+ val &= ~((u64)1);
+
+ drbd_set_ed_uuid(device, val);
+ }
+
+ device->ldev->md.uuid[idx] = val;
+ drbd_md_mark_dirty(device);
+}
+
+void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
+ __drbd_uuid_set(device, idx, val);
+ spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
+}
+
+void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
+ if (device->ldev->md.uuid[idx]) {
+ drbd_uuid_move_history(device);
+ device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[idx];
+ }
+ __drbd_uuid_set(device, idx, val);
+ spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
+}
+
+/**
+ * drbd_uuid_new_current() - Creates a new current UUID
+ * @device: DRBD device.
+ *
+ * Creates a new current UUID, and rotates the old current UUID into
+ * the bitmap slot. Causes an incremental resync upon next connect.
+ */
+void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local)
+{
+ u64 val;
+ unsigned long long bm_uuid;
+
+ get_random_bytes(&val, sizeof(u64));
+
+ spin_lock_irq(&device->ldev->md.uuid_lock);
+ bm_uuid = device->ldev->md.uuid[UI_BITMAP];
+
+ if (bm_uuid)
+ drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid);
+
+ device->ldev->md.uuid[UI_BITMAP] = device->ldev->md.uuid[UI_CURRENT];
+ __drbd_uuid_set(device, UI_CURRENT, val);
+ spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+ drbd_print_uuids(device, "new current UUID");
+ /* get it to stable storage _now_ */
+ drbd_md_sync(device);
+}
+
+void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local)
+{
+ unsigned long flags;
+ if (device->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
+ return;
+
+ spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
+ if (val == 0) {
+ drbd_uuid_move_history(device);
+ device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
+ device->ldev->md.uuid[UI_BITMAP] = 0;
+ } else {
+ unsigned long long bm_uuid = device->ldev->md.uuid[UI_BITMAP];
+ if (bm_uuid)
+ drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid);
+
+ device->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
+ }
+ spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
+
+ drbd_md_mark_dirty(device);
+}
+
+/**
+ * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
+ * @device: DRBD device.
+ *
+ * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
+ */
+int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local)
+{
+ int rv = -EIO;
+
+ drbd_md_set_flag(device, MDF_FULL_SYNC);
+ drbd_md_sync(device);
+ drbd_bm_set_all(device);
+
+ rv = drbd_bm_write(device);
+
+ if (!rv) {
+ drbd_md_clear_flag(device, MDF_FULL_SYNC);
+ drbd_md_sync(device);
+ }
+
+ return rv;
+}
+
+/**
+ * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
+ * @device: DRBD device.
+ *
+ * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
+ */
+int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local)
+{
+ drbd_resume_al(device);
+ drbd_bm_clear_all(device);
+ return drbd_bm_write(device);
+}
+
+static int w_bitmap_io(struct drbd_work *w, int unused)
+{
+ struct drbd_device *device =
+ container_of(w, struct drbd_device, bm_io_work.w);
+ struct bm_io_work *work = &device->bm_io_work;
+ int rv = -EIO;
+
+ D_ASSERT(device, atomic_read(&device->ap_bio_cnt) == 0);
+
+ if (get_ldev(device)) {
+ drbd_bm_lock(device, work->why, work->flags);
+ rv = work->io_fn(device);
+ drbd_bm_unlock(device);
+ put_ldev(device);
+ }
+
+ clear_bit_unlock(BITMAP_IO, &device->flags);
+ wake_up(&device->misc_wait);
+
+ if (work->done)
+ work->done(device, rv);
+
+ clear_bit(BITMAP_IO_QUEUED, &device->flags);
+ work->why = NULL;
+ work->flags = 0;
+
+ return 0;
+}
+
+/**
+ * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
+ * @device: DRBD device.
+ * @io_fn: IO callback to be called when bitmap IO is possible
+ * @done: callback to be called after the bitmap IO was performed
+ * @why: Descriptive text of the reason for doing the IO
+ *
+ * While IO on the bitmap happens we freeze application IO thus we ensure
+ * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
+ * called from worker context. It MUST NOT be used while a previous such
+ * work is still pending!
+ *
+ * Its worker function encloses the call of io_fn() by get_ldev() and
+ * put_ldev().
+ */
+void drbd_queue_bitmap_io(struct drbd_device *device,
+ int (*io_fn)(struct drbd_device *),
+ void (*done)(struct drbd_device *, int),
+ char *why, enum bm_flag flags)
+{
+ D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
+
+ D_ASSERT(device, !test_bit(BITMAP_IO_QUEUED, &device->flags));
+ D_ASSERT(device, !test_bit(BITMAP_IO, &device->flags));
+ D_ASSERT(device, list_empty(&device->bm_io_work.w.list));
+ if (device->bm_io_work.why)
+ drbd_err(device, "FIXME going to queue '%s' but '%s' still pending?\n",
+ why, device->bm_io_work.why);
+
+ device->bm_io_work.io_fn = io_fn;
+ device->bm_io_work.done = done;
+ device->bm_io_work.why = why;
+ device->bm_io_work.flags = flags;
+
+ spin_lock_irq(&device->resource->req_lock);
+ set_bit(BITMAP_IO, &device->flags);
+ if (atomic_read(&device->ap_bio_cnt) == 0) {
+ if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))
+ drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+ &device->bm_io_work.w);
+ }
+ spin_unlock_irq(&device->resource->req_lock);
+}
+
+/**
+ * drbd_bitmap_io() - Does an IO operation on the whole bitmap
+ * @device: DRBD device.
+ * @io_fn: IO callback to be called when bitmap IO is possible
+ * @why: Descriptive text of the reason for doing the IO
+ *
+ * freezes application IO while that the actual IO operations runs. This
+ * functions MAY NOT be called from worker context.
+ */
+int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *),
+ char *why, enum bm_flag flags)
+{
+ int rv;
+
+ D_ASSERT(device, current != first_peer_device(device)->connection->worker.task);
+
+ if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
+ drbd_suspend_io(device);
+
+ drbd_bm_lock(device, why, flags);
+ rv = io_fn(device);
+ drbd_bm_unlock(device);
+
+ if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
+ drbd_resume_io(device);
+
+ return rv;
+}
+
+void drbd_md_set_flag(struct drbd_device *device, int flag) __must_hold(local)
+{
+ if ((device->ldev->md.flags & flag) != flag) {
+ drbd_md_mark_dirty(device);
+ device->ldev->md.flags |= flag;
+ }
+}
+
+void drbd_md_clear_flag(struct drbd_device *device, int flag) __must_hold(local)
+{
+ if ((device->ldev->md.flags & flag) != 0) {
+ drbd_md_mark_dirty(device);
+ device->ldev->md.flags &= ~flag;
+ }
+}
+int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
+{
+ return (bdev->md.flags & flag) != 0;
+}
+
+static void md_sync_timer_fn(unsigned long data)
+{
+ struct drbd_device *device = (struct drbd_device *) data;
+ drbd_device_post_work(device, MD_SYNC);
+}
+
+const char *cmdname(enum drbd_packet cmd)
+{
+ /* THINK may need to become several global tables
+ * when we want to support more than
+ * one PRO_VERSION */
+ static const char *cmdnames[] = {
+ [P_DATA] = "Data",
+ [P_DATA_REPLY] = "DataReply",
+ [P_RS_DATA_REPLY] = "RSDataReply",
+ [P_BARRIER] = "Barrier",
+ [P_BITMAP] = "ReportBitMap",
+ [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
+ [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
+ [P_UNPLUG_REMOTE] = "UnplugRemote",
+ [P_DATA_REQUEST] = "DataRequest",
+ [P_RS_DATA_REQUEST] = "RSDataRequest",
+ [P_SYNC_PARAM] = "SyncParam",
+ [P_SYNC_PARAM89] = "SyncParam89",
+ [P_PROTOCOL] = "ReportProtocol",
+ [P_UUIDS] = "ReportUUIDs",
+ [P_SIZES] = "ReportSizes",
+ [P_STATE] = "ReportState",
+ [P_SYNC_UUID] = "ReportSyncUUID",
+ [P_AUTH_CHALLENGE] = "AuthChallenge",
+ [P_AUTH_RESPONSE] = "AuthResponse",
+ [P_PING] = "Ping",
+ [P_PING_ACK] = "PingAck",
+ [P_RECV_ACK] = "RecvAck",
+ [P_WRITE_ACK] = "WriteAck",
+ [P_RS_WRITE_ACK] = "RSWriteAck",
+ [P_SUPERSEDED] = "Superseded",
+ [P_NEG_ACK] = "NegAck",
+ [P_NEG_DREPLY] = "NegDReply",
+ [P_NEG_RS_DREPLY] = "NegRSDReply",
+ [P_BARRIER_ACK] = "BarrierAck",
+ [P_STATE_CHG_REQ] = "StateChgRequest",
+ [P_STATE_CHG_REPLY] = "StateChgReply",
+ [P_OV_REQUEST] = "OVRequest",
+ [P_OV_REPLY] = "OVReply",
+ [P_OV_RESULT] = "OVResult",
+ [P_CSUM_RS_REQUEST] = "CsumRSRequest",
+ [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
+ [P_COMPRESSED_BITMAP] = "CBitmap",
+ [P_DELAY_PROBE] = "DelayProbe",
+ [P_OUT_OF_SYNC] = "OutOfSync",
+ [P_RETRY_WRITE] = "RetryWrite",
+ [P_RS_CANCEL] = "RSCancel",
+ [P_CONN_ST_CHG_REQ] = "conn_st_chg_req",
+ [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply",
+ [P_RETRY_WRITE] = "retry_write",
+ [P_PROTOCOL_UPDATE] = "protocol_update",
+
+ /* enum drbd_packet, but not commands - obsoleted flags:
+ * P_MAY_IGNORE
+ * P_MAX_OPT_CMD
+ */
+ };
+
+ /* too big for the array: 0xfffX */
+ if (cmd == P_INITIAL_META)
+ return "InitialMeta";
+ if (cmd == P_INITIAL_DATA)
+ return "InitialData";
+ if (cmd == P_CONNECTION_FEATURES)
+ return "ConnectionFeatures";
+ if (cmd >= ARRAY_SIZE(cmdnames))
+ return "Unknown";
+ return cmdnames[cmd];
+}
+
+/**
+ * drbd_wait_misc - wait for a request to make progress
+ * @device: device associated with the request
+ * @i: the struct drbd_interval embedded in struct drbd_request or
+ * struct drbd_peer_request
+ */
+int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i)
+{
+ struct net_conf *nc;
+ DEFINE_WAIT(wait);
+ long timeout;
+
+ rcu_read_lock();
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return -ETIMEDOUT;
+ }
+ timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT;
+ rcu_read_unlock();
+
+ /* Indicate to wake up device->misc_wait on progress. */
+ i->waiting = true;
+ prepare_to_wait(&device->misc_wait, &wait, TASK_INTERRUPTIBLE);
+ spin_unlock_irq(&device->resource->req_lock);
+ timeout = schedule_timeout(timeout);
+ finish_wait(&device->misc_wait, &wait);
+ spin_lock_irq(&device->resource->req_lock);
+ if (!timeout || device->state.conn < C_CONNECTED)
+ return -ETIMEDOUT;
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+ return 0;
+}
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+/* Fault insertion support including random number generator shamelessly
+ * stolen from kernel/rcutorture.c */
+struct fault_random_state {
+ unsigned long state;
+ unsigned long count;
+};
+
+#define FAULT_RANDOM_MULT 39916801 /* prime */
+#define FAULT_RANDOM_ADD 479001701 /* prime */
+#define FAULT_RANDOM_REFRESH 10000
+
+/*
+ * Crude but fast random-number generator. Uses a linear congruential
+ * generator, with occasional help from get_random_bytes().
+ */
+static unsigned long
+_drbd_fault_random(struct fault_random_state *rsp)
+{
+ long refresh;
+
+ if (!rsp->count--) {
+ get_random_bytes(&refresh, sizeof(refresh));
+ rsp->state += refresh;
+ rsp->count = FAULT_RANDOM_REFRESH;
+ }
+ rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
+ return swahw32(rsp->state);
+}
+
+static char *
+_drbd_fault_str(unsigned int type) {
+ static char *_faults[] = {
+ [DRBD_FAULT_MD_WR] = "Meta-data write",
+ [DRBD_FAULT_MD_RD] = "Meta-data read",
+ [DRBD_FAULT_RS_WR] = "Resync write",
+ [DRBD_FAULT_RS_RD] = "Resync read",
+ [DRBD_FAULT_DT_WR] = "Data write",
+ [DRBD_FAULT_DT_RD] = "Data read",
+ [DRBD_FAULT_DT_RA] = "Data read ahead",
+ [DRBD_FAULT_BM_ALLOC] = "BM allocation",
+ [DRBD_FAULT_AL_EE] = "EE allocation",
+ [DRBD_FAULT_RECEIVE] = "receive data corruption",
+ };
+
+ return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
+}
+
+unsigned int
+_drbd_insert_fault(struct drbd_device *device, unsigned int type)
+{
+ static struct fault_random_state rrs = {0, 0};
+
+ unsigned int ret = (
+ (fault_devs == 0 ||
+ ((1 << device_to_minor(device)) & fault_devs) != 0) &&
+ (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));
+
+ if (ret) {
+ fault_count++;
+
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_warn(device, "***Simulating %s failure\n",
+ _drbd_fault_str(type));
+ }
+
+ return ret;
+}
+#endif
+
+const char *drbd_buildtag(void)
+{
+ /* DRBD built from external sources has here a reference to the
+ git hash of the source code. */
+
+ static char buildtag[38] = "\0uilt-in";
+
+ if (buildtag[0] == 0) {
+#ifdef MODULE
+ sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
+#else
+ buildtag[0] = 'b';
+#endif
+ }
+
+ return buildtag;
+}
+
+module_init(drbd_init)
+module_exit(drbd_cleanup)
+
+EXPORT_SYMBOL(drbd_conn_str);
+EXPORT_SYMBOL(drbd_role_str);
+EXPORT_SYMBOL(drbd_disk_str);
+EXPORT_SYMBOL(drbd_set_st_err_str);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
new file mode 100644
index 000000000..74df8cfad
--- /dev/null
+++ b/drivers/block/drbd/drbd_nl.c
@@ -0,0 +1,3674 @@
+/*
+ drbd_nl.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/drbd.h>
+#include <linux/in.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/blkpg.h>
+#include <linux/cpumask.h>
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h"
+#include <asm/unaligned.h>
+#include <linux/drbd_limits.h>
+#include <linux/kthread.h>
+
+#include <net/genetlink.h>
+
+/* .doit */
+// int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info);
+// int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_down(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
+/* .dumpit */
+int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
+
+#include <linux/drbd_genl_api.h>
+#include "drbd_nla.h"
+#include <linux/genl_magic_func.h>
+
+/* used blkdev_get_by_path, to claim our meta data device(s) */
+static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
+
+static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
+{
+ genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
+ if (genlmsg_reply(skb, info))
+ pr_err("error sending genl reply\n");
+}
+
+/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
+ * reason it could fail was no space in skb, and there are 4k available. */
+static int drbd_msg_put_info(struct sk_buff *skb, const char *info)
+{
+ struct nlattr *nla;
+ int err = -EMSGSIZE;
+
+ if (!info || !info[0])
+ return 0;
+
+ nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY);
+ if (!nla)
+ return err;
+
+ err = nla_put_string(skb, T_info_text, info);
+ if (err) {
+ nla_nest_cancel(skb, nla);
+ return err;
+ } else
+ nla_nest_end(skb, nla);
+ return 0;
+}
+
+/* This would be a good candidate for a "pre_doit" hook,
+ * and per-family private info->pointers.
+ * But we need to stay compatible with older kernels.
+ * If it returns successfully, adm_ctx members are valid.
+ *
+ * At this point, we still rely on the global genl_lock().
+ * If we want to avoid that, and allow "genl_family.parallel_ops", we may need
+ * to add additional synchronization against object destruction/modification.
+ */
+#define DRBD_ADM_NEED_MINOR 1
+#define DRBD_ADM_NEED_RESOURCE 2
+#define DRBD_ADM_NEED_CONNECTION 4
+static int drbd_adm_prepare(struct drbd_config_context *adm_ctx,
+ struct sk_buff *skb, struct genl_info *info, unsigned flags)
+{
+ struct drbd_genlmsghdr *d_in = info->userhdr;
+ const u8 cmd = info->genlhdr->cmd;
+ int err;
+
+ memset(adm_ctx, 0, sizeof(*adm_ctx));
+
+ /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */
+ if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ adm_ctx->reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!adm_ctx->reply_skb) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ adm_ctx->reply_dh = genlmsg_put_reply(adm_ctx->reply_skb,
+ info, &drbd_genl_family, 0, cmd);
+ /* put of a few bytes into a fresh skb of >= 4k will always succeed.
+ * but anyways */
+ if (!adm_ctx->reply_dh) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ adm_ctx->reply_dh->minor = d_in->minor;
+ adm_ctx->reply_dh->ret_code = NO_ERROR;
+
+ adm_ctx->volume = VOLUME_UNSPECIFIED;
+ if (info->attrs[DRBD_NLA_CFG_CONTEXT]) {
+ struct nlattr *nla;
+ /* parse and validate only */
+ err = drbd_cfg_context_from_attrs(NULL, info);
+ if (err)
+ goto fail;
+
+ /* It was present, and valid,
+ * copy it over to the reply skb. */
+ err = nla_put_nohdr(adm_ctx->reply_skb,
+ info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len,
+ info->attrs[DRBD_NLA_CFG_CONTEXT]);
+ if (err)
+ goto fail;
+
+ /* and assign stuff to the adm_ctx */
+ nla = nested_attr_tb[__nla_type(T_ctx_volume)];
+ if (nla)
+ adm_ctx->volume = nla_get_u32(nla);
+ nla = nested_attr_tb[__nla_type(T_ctx_resource_name)];
+ if (nla)
+ adm_ctx->resource_name = nla_data(nla);
+ adm_ctx->my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
+ adm_ctx->peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)];
+ if ((adm_ctx->my_addr &&
+ nla_len(adm_ctx->my_addr) > sizeof(adm_ctx->connection->my_addr)) ||
+ (adm_ctx->peer_addr &&
+ nla_len(adm_ctx->peer_addr) > sizeof(adm_ctx->connection->peer_addr))) {
+ err = -EINVAL;
+ goto fail;
+ }
+ }
+
+ adm_ctx->minor = d_in->minor;
+ adm_ctx->device = minor_to_device(d_in->minor);
+
+ /* We are protected by the global genl_lock().
+ * But we may explicitly drop it/retake it in drbd_adm_set_role(),
+ * so make sure this object stays around. */
+ if (adm_ctx->device)
+ kref_get(&adm_ctx->device->kref);
+
+ if (adm_ctx->resource_name) {
+ adm_ctx->resource = drbd_find_resource(adm_ctx->resource_name);
+ }
+
+ if (!adm_ctx->device && (flags & DRBD_ADM_NEED_MINOR)) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "unknown minor");
+ return ERR_MINOR_INVALID;
+ }
+ if (!adm_ctx->resource && (flags & DRBD_ADM_NEED_RESOURCE)) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "unknown resource");
+ if (adm_ctx->resource_name)
+ return ERR_RES_NOT_KNOWN;
+ return ERR_INVALID_REQUEST;
+ }
+
+ if (flags & DRBD_ADM_NEED_CONNECTION) {
+ if (adm_ctx->resource) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "no resource name expected");
+ return ERR_INVALID_REQUEST;
+ }
+ if (adm_ctx->device) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "no minor number expected");
+ return ERR_INVALID_REQUEST;
+ }
+ if (adm_ctx->my_addr && adm_ctx->peer_addr)
+ adm_ctx->connection = conn_get_by_addrs(nla_data(adm_ctx->my_addr),
+ nla_len(adm_ctx->my_addr),
+ nla_data(adm_ctx->peer_addr),
+ nla_len(adm_ctx->peer_addr));
+ if (!adm_ctx->connection) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "unknown connection");
+ return ERR_INVALID_REQUEST;
+ }
+ }
+
+ /* some more paranoia, if the request was over-determined */
+ if (adm_ctx->device && adm_ctx->resource &&
+ adm_ctx->device->resource != adm_ctx->resource) {
+ pr_warning("request: minor=%u, resource=%s; but that minor belongs to resource %s\n",
+ adm_ctx->minor, adm_ctx->resource->name,
+ adm_ctx->device->resource->name);
+ drbd_msg_put_info(adm_ctx->reply_skb, "minor exists in different resource");
+ return ERR_INVALID_REQUEST;
+ }
+ if (adm_ctx->device &&
+ adm_ctx->volume != VOLUME_UNSPECIFIED &&
+ adm_ctx->volume != adm_ctx->device->vnr) {
+ pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n",
+ adm_ctx->minor, adm_ctx->volume,
+ adm_ctx->device->vnr,
+ adm_ctx->device->resource->name);
+ drbd_msg_put_info(adm_ctx->reply_skb, "minor exists as different volume");
+ return ERR_INVALID_REQUEST;
+ }
+
+ /* still, provide adm_ctx->resource always, if possible. */
+ if (!adm_ctx->resource) {
+ adm_ctx->resource = adm_ctx->device ? adm_ctx->device->resource
+ : adm_ctx->connection ? adm_ctx->connection->resource : NULL;
+ if (adm_ctx->resource)
+ kref_get(&adm_ctx->resource->kref);
+ }
+
+ return NO_ERROR;
+
+fail:
+ nlmsg_free(adm_ctx->reply_skb);
+ adm_ctx->reply_skb = NULL;
+ return err;
+}
+
+static int drbd_adm_finish(struct drbd_config_context *adm_ctx,
+ struct genl_info *info, int retcode)
+{
+ if (adm_ctx->device) {
+ kref_put(&adm_ctx->device->kref, drbd_destroy_device);
+ adm_ctx->device = NULL;
+ }
+ if (adm_ctx->connection) {
+ kref_put(&adm_ctx->connection->kref, &drbd_destroy_connection);
+ adm_ctx->connection = NULL;
+ }
+ if (adm_ctx->resource) {
+ kref_put(&adm_ctx->resource->kref, drbd_destroy_resource);
+ adm_ctx->resource = NULL;
+ }
+
+ if (!adm_ctx->reply_skb)
+ return -ENOMEM;
+
+ adm_ctx->reply_dh->ret_code = retcode;
+ drbd_adm_send_reply(adm_ctx->reply_skb, info);
+ return 0;
+}
+
+static void setup_khelper_env(struct drbd_connection *connection, char **envp)
+{
+ char *afs;
+
+ /* FIXME: A future version will not allow this case. */
+ if (connection->my_addr_len == 0 || connection->peer_addr_len == 0)
+ return;
+
+ switch (((struct sockaddr *)&connection->peer_addr)->sa_family) {
+ case AF_INET6:
+ afs = "ipv6";
+ snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6",
+ &((struct sockaddr_in6 *)&connection->peer_addr)->sin6_addr);
+ break;
+ case AF_INET:
+ afs = "ipv4";
+ snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+ &((struct sockaddr_in *)&connection->peer_addr)->sin_addr);
+ break;
+ default:
+ afs = "ssocks";
+ snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+ &((struct sockaddr_in *)&connection->peer_addr)->sin_addr);
+ }
+ snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs);
+}
+
+int drbd_khelper(struct drbd_device *device, char *cmd)
+{
+ char *envp[] = { "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ (char[20]) { }, /* address family */
+ (char[60]) { }, /* address */
+ NULL };
+ char mb[12];
+ char *argv[] = {usermode_helper, cmd, mb, NULL };
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+ struct sib_info sib;
+ int ret;
+
+ if (current == connection->worker.task)
+ set_bit(CALLBACK_PENDING, &connection->flags);
+
+ snprintf(mb, 12, "minor-%d", device_to_minor(device));
+ setup_khelper_env(connection, envp);
+
+ /* The helper may take some time.
+ * write out any unsynced meta data changes now */
+ drbd_md_sync(device);
+
+ drbd_info(device, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
+ sib.sib_reason = SIB_HELPER_PRE;
+ sib.helper_name = cmd;
+ drbd_bcast_event(device, &sib);
+ ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
+ if (ret)
+ drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n",
+ usermode_helper, cmd, mb,
+ (ret >> 8) & 0xff, ret);
+ else
+ drbd_info(device, "helper command: %s %s %s exit code %u (0x%x)\n",
+ usermode_helper, cmd, mb,
+ (ret >> 8) & 0xff, ret);
+ sib.sib_reason = SIB_HELPER_POST;
+ sib.helper_exit_code = ret;
+ drbd_bcast_event(device, &sib);
+
+ if (current == connection->worker.task)
+ clear_bit(CALLBACK_PENDING, &connection->flags);
+
+ if (ret < 0) /* Ignore any ERRNOs we got. */
+ ret = 0;
+
+ return ret;
+}
+
+static int conn_khelper(struct drbd_connection *connection, char *cmd)
+{
+ char *envp[] = { "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ (char[20]) { }, /* address family */
+ (char[60]) { }, /* address */
+ NULL };
+ char *resource_name = connection->resource->name;
+ char *argv[] = {usermode_helper, cmd, resource_name, NULL };
+ int ret;
+
+ setup_khelper_env(connection, envp);
+ conn_md_sync(connection);
+
+ drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name);
+ /* TODO: conn_bcast_event() ?? */
+
+ ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
+ if (ret)
+ drbd_warn(connection, "helper command: %s %s %s exit code %u (0x%x)\n",
+ usermode_helper, cmd, resource_name,
+ (ret >> 8) & 0xff, ret);
+ else
+ drbd_info(connection, "helper command: %s %s %s exit code %u (0x%x)\n",
+ usermode_helper, cmd, resource_name,
+ (ret >> 8) & 0xff, ret);
+ /* TODO: conn_bcast_event() ?? */
+
+ if (ret < 0) /* Ignore any ERRNOs we got. */
+ ret = 0;
+
+ return ret;
+}
+
+static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connection)
+{
+ enum drbd_fencing_p fp = FP_NOT_AVAIL;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ if (get_ldev_if_state(device, D_CONSISTENT)) {
+ struct disk_conf *disk_conf =
+ rcu_dereference(peer_device->device->ldev->disk_conf);
+ fp = max_t(enum drbd_fencing_p, fp, disk_conf->fencing);
+ put_ldev(device);
+ }
+ }
+ rcu_read_unlock();
+
+ if (fp == FP_NOT_AVAIL) {
+ /* IO Suspending works on the whole resource.
+ Do it only for one device. */
+ vnr = 0;
+ peer_device = idr_get_next(&connection->peer_devices, &vnr);
+ drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0));
+ }
+
+ return fp;
+}
+
+bool conn_try_outdate_peer(struct drbd_connection *connection)
+{
+ unsigned int connect_cnt;
+ union drbd_state mask = { };
+ union drbd_state val = { };
+ enum drbd_fencing_p fp;
+ char *ex_to_string;
+ int r;
+
+ spin_lock_irq(&connection->resource->req_lock);
+ if (connection->cstate >= C_WF_REPORT_PARAMS) {
+ drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n");
+ spin_unlock_irq(&connection->resource->req_lock);
+ return false;
+ }
+
+ connect_cnt = connection->connect_cnt;
+ spin_unlock_irq(&connection->resource->req_lock);
+
+ fp = highest_fencing_policy(connection);
+ switch (fp) {
+ case FP_NOT_AVAIL:
+ drbd_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n");
+ goto out;
+ case FP_DONT_CARE:
+ return true;
+ default: ;
+ }
+
+ r = conn_khelper(connection, "fence-peer");
+
+ switch ((r>>8) & 0xff) {
+ case 3: /* peer is inconsistent */
+ ex_to_string = "peer is inconsistent or worse";
+ mask.pdsk = D_MASK;
+ val.pdsk = D_INCONSISTENT;
+ break;
+ case 4: /* peer got outdated, or was already outdated */
+ ex_to_string = "peer was fenced";
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
+ break;
+ case 5: /* peer was down */
+ if (conn_highest_disk(connection) == D_UP_TO_DATE) {
+ /* we will(have) create(d) a new UUID anyways... */
+ ex_to_string = "peer is unreachable, assumed to be dead";
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
+ } else {
+ ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
+ }
+ break;
+ case 6: /* Peer is primary, voluntarily outdate myself.
+ * This is useful when an unconnected R_SECONDARY is asked to
+ * become R_PRIMARY, but finds the other peer being active. */
+ ex_to_string = "peer is active";
+ drbd_warn(connection, "Peer is primary, outdating myself.\n");
+ mask.disk = D_MASK;
+ val.disk = D_OUTDATED;
+ break;
+ case 7:
+ if (fp != FP_STONITH)
+ drbd_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n");
+ ex_to_string = "peer was stonithed";
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
+ break;
+ default:
+ /* The script is broken ... */
+ drbd_err(connection, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
+ return false; /* Eventually leave IO frozen */
+ }
+
+ drbd_info(connection, "fence-peer helper returned %d (%s)\n",
+ (r>>8) & 0xff, ex_to_string);
+
+ out:
+
+ /* Not using
+ conn_request_state(connection, mask, val, CS_VERBOSE);
+ here, because we might were able to re-establish the connection in the
+ meantime. */
+ spin_lock_irq(&connection->resource->req_lock);
+ if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) {
+ if (connection->connect_cnt != connect_cnt)
+ /* In case the connection was established and droped
+ while the fence-peer handler was running, ignore it */
+ drbd_info(connection, "Ignoring fence-peer exit code\n");
+ else
+ _conn_request_state(connection, mask, val, CS_VERBOSE);
+ }
+ spin_unlock_irq(&connection->resource->req_lock);
+
+ return conn_highest_pdsk(connection) <= D_OUTDATED;
+}
+
+static int _try_outdate_peer_async(void *data)
+{
+ struct drbd_connection *connection = (struct drbd_connection *)data;
+
+ conn_try_outdate_peer(connection);
+
+ kref_put(&connection->kref, drbd_destroy_connection);
+ return 0;
+}
+
+void conn_try_outdate_peer_async(struct drbd_connection *connection)
+{
+ struct task_struct *opa;
+
+ kref_get(&connection->kref);
+ /* We may just have force_sig()'ed this thread
+ * to get it out of some blocking network function.
+ * Clear signals; otherwise kthread_run(), which internally uses
+ * wait_on_completion_killable(), will mistake our pending signal
+ * for a new fatal signal and fail. */
+ flush_signals(current);
+ opa = kthread_run(_try_outdate_peer_async, connection, "drbd_async_h");
+ if (IS_ERR(opa)) {
+ drbd_err(connection, "out of mem, failed to invoke fence-peer helper\n");
+ kref_put(&connection->kref, drbd_destroy_connection);
+ }
+}
+
+enum drbd_state_rv
+drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int force)
+{
+ struct drbd_peer_device *const peer_device = first_peer_device(device);
+ struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
+ const int max_tries = 4;
+ enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
+ struct net_conf *nc;
+ int try = 0;
+ int forced = 0;
+ union drbd_state mask, val;
+
+ if (new_role == R_PRIMARY) {
+ struct drbd_connection *connection;
+
+ /* Detect dead peers as soon as possible. */
+
+ rcu_read_lock();
+ for_each_connection(connection, device->resource)
+ request_ping(connection);
+ rcu_read_unlock();
+ }
+
+ mutex_lock(device->state_mutex);
+
+ mask.i = 0; mask.role = R_MASK;
+ val.i = 0; val.role = new_role;
+
+ while (try++ < max_tries) {
+ rv = _drbd_request_state_holding_state_mutex(device, mask, val, CS_WAIT_COMPLETE);
+
+ /* in case we first succeeded to outdate,
+ * but now suddenly could establish a connection */
+ if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
+ val.pdsk = 0;
+ mask.pdsk = 0;
+ continue;
+ }
+
+ if (rv == SS_NO_UP_TO_DATE_DISK && force &&
+ (device->state.disk < D_UP_TO_DATE &&
+ device->state.disk >= D_INCONSISTENT)) {
+ mask.disk = D_MASK;
+ val.disk = D_UP_TO_DATE;
+ forced = 1;
+ continue;
+ }
+
+ if (rv == SS_NO_UP_TO_DATE_DISK &&
+ device->state.disk == D_CONSISTENT && mask.pdsk == 0) {
+ D_ASSERT(device, device->state.pdsk == D_UNKNOWN);
+
+ if (conn_try_outdate_peer(connection)) {
+ val.disk = D_UP_TO_DATE;
+ mask.disk = D_MASK;
+ }
+ continue;
+ }
+
+ if (rv == SS_NOTHING_TO_DO)
+ goto out;
+ if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
+ if (!conn_try_outdate_peer(connection) && force) {
+ drbd_warn(device, "Forced into split brain situation!\n");
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
+
+ }
+ continue;
+ }
+ if (rv == SS_TWO_PRIMARIES) {
+ /* Maybe the peer is detected as dead very soon...
+ retry at most once more in this case. */
+ int timeo;
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
+ rcu_read_unlock();
+ schedule_timeout_interruptible(timeo);
+ if (try < max_tries)
+ try = max_tries - 1;
+ continue;
+ }
+ if (rv < SS_SUCCESS) {
+ rv = _drbd_request_state(device, mask, val,
+ CS_VERBOSE + CS_WAIT_COMPLETE);
+ if (rv < SS_SUCCESS)
+ goto out;
+ }
+ break;
+ }
+
+ if (rv < SS_SUCCESS)
+ goto out;
+
+ if (forced)
+ drbd_warn(device, "Forced to consider local data as UpToDate!\n");
+
+ /* Wait until nothing is on the fly :) */
+ wait_event(device->misc_wait, atomic_read(&device->ap_pending_cnt) == 0);
+
+ /* FIXME also wait for all pending P_BARRIER_ACK? */
+
+ if (new_role == R_SECONDARY) {
+ if (get_ldev(device)) {
+ device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
+ put_ldev(device);
+ }
+ } else {
+ mutex_lock(&device->resource->conf_update);
+ nc = connection->net_conf;
+ if (nc)
+ nc->discard_my_data = 0; /* without copy; single bit op is atomic */
+ mutex_unlock(&device->resource->conf_update);
+
+ if (get_ldev(device)) {
+ if (((device->state.conn < C_CONNECTED ||
+ device->state.pdsk <= D_FAILED)
+ && device->ldev->md.uuid[UI_BITMAP] == 0) || forced)
+ drbd_uuid_new_current(device);
+
+ device->ldev->md.uuid[UI_CURRENT] |= (u64)1;
+ put_ldev(device);
+ }
+ }
+
+ /* writeout of activity log covered areas of the bitmap
+ * to stable storage done in after state change already */
+
+ if (device->state.conn >= C_WF_REPORT_PARAMS) {
+ /* if this was forced, we should consider sync */
+ if (forced)
+ drbd_send_uuids(peer_device);
+ drbd_send_current_state(peer_device);
+ }
+
+ drbd_md_sync(device);
+ set_disk_ro(device->vdisk, new_role == R_SECONDARY);
+ kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
+out:
+ mutex_unlock(device->state_mutex);
+ return rv;
+}
+
+static const char *from_attrs_err_to_txt(int err)
+{
+ return err == -ENOMSG ? "required attribute missing" :
+ err == -EOPNOTSUPP ? "unknown mandatory attribute" :
+ err == -EEXIST ? "can not change invariant setting" :
+ "invalid attribute value";
+}
+
+int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct set_role_parms parms;
+ int err;
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ memset(&parms, 0, sizeof(parms));
+ if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) {
+ err = set_role_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto out;
+ }
+ }
+ genl_unlock();
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+
+ if (info->genlhdr->cmd == DRBD_ADM_PRIMARY)
+ retcode = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate);
+ else
+ retcode = drbd_set_role(adm_ctx.device, R_SECONDARY, 0);
+
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ genl_lock();
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+/* Initializes the md.*_offset members, so we are able to find
+ * the on disk meta data.
+ *
+ * We currently have two possible layouts:
+ * external:
+ * |----------- md_size_sect ------------------|
+ * [ 4k superblock ][ activity log ][ Bitmap ]
+ * | al_offset == 8 |
+ * | bm_offset = al_offset + X |
+ * ==> bitmap sectors = md_size_sect - bm_offset
+ *
+ * internal:
+ * |----------- md_size_sect ------------------|
+ * [data.....][ Bitmap ][ activity log ][ 4k superblock ]
+ * | al_offset < 0 |
+ * | bm_offset = al_offset - Y |
+ * ==> bitmap sectors = Y = al_offset - bm_offset
+ *
+ * Activity log size used to be fixed 32kB,
+ * but is about to become configurable.
+ */
+static void drbd_md_set_sector_offsets(struct drbd_device *device,
+ struct drbd_backing_dev *bdev)
+{
+ sector_t md_size_sect = 0;
+ unsigned int al_size_sect = bdev->md.al_size_4k * 8;
+
+ bdev->md.md_offset = drbd_md_ss(bdev);
+
+ switch (bdev->md.meta_dev_idx) {
+ default:
+ /* v07 style fixed size indexed meta data */
+ bdev->md.md_size_sect = MD_128MB_SECT;
+ bdev->md.al_offset = MD_4kB_SECT;
+ bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
+ break;
+ case DRBD_MD_INDEX_FLEX_EXT:
+ /* just occupy the full device; unit: sectors */
+ bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
+ bdev->md.al_offset = MD_4kB_SECT;
+ bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
+ break;
+ case DRBD_MD_INDEX_INTERNAL:
+ case DRBD_MD_INDEX_FLEX_INT:
+ /* al size is still fixed */
+ bdev->md.al_offset = -al_size_sect;
+ /* we need (slightly less than) ~ this much bitmap sectors: */
+ md_size_sect = drbd_get_capacity(bdev->backing_bdev);
+ md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
+ md_size_sect = BM_SECT_TO_EXT(md_size_sect);
+ md_size_sect = ALIGN(md_size_sect, 8);
+
+ /* plus the "drbd meta data super block",
+ * and the activity log; */
+ md_size_sect += MD_4kB_SECT + al_size_sect;
+
+ bdev->md.md_size_sect = md_size_sect;
+ /* bitmap offset is adjusted by 'super' block size */
+ bdev->md.bm_offset = -md_size_sect + MD_4kB_SECT;
+ break;
+ }
+}
+
+/* input size is expected to be in KB */
+char *ppsize(char *buf, unsigned long long size)
+{
+ /* Needs 9 bytes at max including trailing NUL:
+ * -1ULL ==> "16384 EB" */
+ static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
+ int base = 0;
+ while (size >= 10000 && base < sizeof(units)-1) {
+ /* shift + round */
+ size = (size >> 10) + !!(size & (1<<9));
+ base++;
+ }
+ sprintf(buf, "%u %cB", (unsigned)size, units[base]);
+
+ return buf;
+}
+
+/* there is still a theoretical deadlock when called from receiver
+ * on an D_INCONSISTENT R_PRIMARY:
+ * remote READ does inc_ap_bio, receiver would need to receive answer
+ * packet from remote to dec_ap_bio again.
+ * receiver receive_sizes(), comes here,
+ * waits for ap_bio_cnt == 0. -> deadlock.
+ * but this cannot happen, actually, because:
+ * R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
+ * (not connected, or bad/no disk on peer):
+ * see drbd_fail_request_early, ap_bio_cnt is zero.
+ * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
+ * peer may not initiate a resize.
+ */
+/* Note these are not to be confused with
+ * drbd_adm_suspend_io/drbd_adm_resume_io,
+ * which are (sub) state changes triggered by admin (drbdsetup),
+ * and can be long lived.
+ * This changes an device->flag, is triggered by drbd internals,
+ * and should be short-lived. */
+void drbd_suspend_io(struct drbd_device *device)
+{
+ set_bit(SUSPEND_IO, &device->flags);
+ if (drbd_suspended(device))
+ return;
+ wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt));
+}
+
+void drbd_resume_io(struct drbd_device *device)
+{
+ clear_bit(SUSPEND_IO, &device->flags);
+ wake_up(&device->misc_wait);
+}
+
+/**
+ * drbd_determine_dev_size() - Sets the right device size obeying all constraints
+ * @device: DRBD device.
+ *
+ * Returns 0 on success, negative return values indicate errors.
+ * You should call drbd_md_sync() after calling this function.
+ */
+enum determine_dev_size
+drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
+{
+ sector_t prev_first_sect, prev_size; /* previous meta location */
+ sector_t la_size_sect, u_size;
+ struct drbd_md *md = &device->ldev->md;
+ u32 prev_al_stripe_size_4k;
+ u32 prev_al_stripes;
+ sector_t size;
+ char ppb[10];
+ void *buffer;
+
+ int md_moved, la_size_changed;
+ enum determine_dev_size rv = DS_UNCHANGED;
+
+ /* race:
+ * application request passes inc_ap_bio,
+ * but then cannot get an AL-reference.
+ * this function later may wait on ap_bio_cnt == 0. -> deadlock.
+ *
+ * to avoid that:
+ * Suspend IO right here.
+ * still lock the act_log to not trigger ASSERTs there.
+ */
+ drbd_suspend_io(device);
+ buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */
+ if (!buffer) {
+ drbd_resume_io(device);
+ return DS_ERROR;
+ }
+
+ /* no wait necessary anymore, actually we could assert that */
+ wait_event(device->al_wait, lc_try_lock(device->act_log));
+
+ prev_first_sect = drbd_md_first_sector(device->ldev);
+ prev_size = device->ldev->md.md_size_sect;
+ la_size_sect = device->ldev->md.la_size_sect;
+
+ if (rs) {
+ /* rs is non NULL if we should change the AL layout only */
+
+ prev_al_stripes = md->al_stripes;
+ prev_al_stripe_size_4k = md->al_stripe_size_4k;
+
+ md->al_stripes = rs->al_stripes;
+ md->al_stripe_size_4k = rs->al_stripe_size / 4;
+ md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
+ }
+
+ drbd_md_set_sector_offsets(device, device->ldev);
+
+ rcu_read_lock();
+ u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
+ size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED);
+
+ if (size < la_size_sect) {
+ if (rs && u_size == 0) {
+ /* Remove "rs &&" later. This check should always be active, but
+ right now the receiver expects the permissive behavior */
+ drbd_warn(device, "Implicit shrink not allowed. "
+ "Use --size=%llus for explicit shrink.\n",
+ (unsigned long long)size);
+ rv = DS_ERROR_SHRINK;
+ }
+ if (u_size > size)
+ rv = DS_ERROR_SPACE_MD;
+ if (rv != DS_UNCHANGED)
+ goto err_out;
+ }
+
+ if (drbd_get_capacity(device->this_bdev) != size ||
+ drbd_bm_capacity(device) != size) {
+ int err;
+ err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC));
+ if (unlikely(err)) {
+ /* currently there is only one error: ENOMEM! */
+ size = drbd_bm_capacity(device)>>1;
+ if (size == 0) {
+ drbd_err(device, "OUT OF MEMORY! "
+ "Could not allocate bitmap!\n");
+ } else {
+ drbd_err(device, "BM resizing failed. "
+ "Leaving size unchanged at size = %lu KB\n",
+ (unsigned long)size);
+ }
+ rv = DS_ERROR;
+ }
+ /* racy, see comments above. */
+ drbd_set_my_capacity(device, size);
+ device->ldev->md.la_size_sect = size;
+ drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
+ (unsigned long long)size>>1);
+ }
+ if (rv <= DS_ERROR)
+ goto err_out;
+
+ la_size_changed = (la_size_sect != device->ldev->md.la_size_sect);
+
+ md_moved = prev_first_sect != drbd_md_first_sector(device->ldev)
+ || prev_size != device->ldev->md.md_size_sect;
+
+ if (la_size_changed || md_moved || rs) {
+ u32 prev_flags;
+
+ /* We do some synchronous IO below, which may take some time.
+ * Clear the timer, to avoid scary "timer expired!" messages,
+ * "Superblock" is written out at least twice below, anyways. */
+ del_timer(&device->md_sync_timer);
+ drbd_al_shrink(device); /* All extents inactive. */
+
+ prev_flags = md->flags;
+ md->flags &= ~MDF_PRIMARY_IND;
+ drbd_md_write(device, buffer);
+
+ drbd_info(device, "Writing the whole bitmap, %s\n",
+ la_size_changed && md_moved ? "size changed and md moved" :
+ la_size_changed ? "size changed" : "md moved");
+ /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
+ drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
+ "size changed", BM_LOCKED_MASK);
+ drbd_initialize_al(device, buffer);
+
+ md->flags = prev_flags;
+ drbd_md_write(device, buffer);
+
+ if (rs)
+ drbd_info(device, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n",
+ md->al_stripes, md->al_stripe_size_4k * 4);
+ }
+
+ if (size > la_size_sect)
+ rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO;
+ if (size < la_size_sect)
+ rv = DS_SHRUNK;
+
+ if (0) {
+ err_out:
+ if (rs) {
+ md->al_stripes = prev_al_stripes;
+ md->al_stripe_size_4k = prev_al_stripe_size_4k;
+ md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k;
+
+ drbd_md_set_sector_offsets(device, device->ldev);
+ }
+ }
+ lc_unlock(device->act_log);
+ wake_up(&device->al_wait);
+ drbd_md_put_buffer(device);
+ drbd_resume_io(device);
+
+ return rv;
+}
+
+sector_t
+drbd_new_dev_size(struct drbd_device *device, struct drbd_backing_dev *bdev,
+ sector_t u_size, int assume_peer_has_space)
+{
+ sector_t p_size = device->p_size; /* partner's disk size. */
+ sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */
+ sector_t m_size; /* my size */
+ sector_t size = 0;
+
+ m_size = drbd_get_max_capacity(bdev);
+
+ if (device->state.conn < C_CONNECTED && assume_peer_has_space) {
+ drbd_warn(device, "Resize while not connected was forced by the user!\n");
+ p_size = m_size;
+ }
+
+ if (p_size && m_size) {
+ size = min_t(sector_t, p_size, m_size);
+ } else {
+ if (la_size_sect) {
+ size = la_size_sect;
+ if (m_size && m_size < size)
+ size = m_size;
+ if (p_size && p_size < size)
+ size = p_size;
+ } else {
+ if (m_size)
+ size = m_size;
+ if (p_size)
+ size = p_size;
+ }
+ }
+
+ if (size == 0)
+ drbd_err(device, "Both nodes diskless!\n");
+
+ if (u_size) {
+ if (u_size > size)
+ drbd_err(device, "Requested disk size is too big (%lu > %lu)\n",
+ (unsigned long)u_size>>1, (unsigned long)size>>1);
+ else
+ size = u_size;
+ }
+
+ return size;
+}
+
+/**
+ * drbd_check_al_size() - Ensures that the AL is of the right size
+ * @device: DRBD device.
+ *
+ * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
+ * failed, and 0 on success. You should call drbd_md_sync() after you called
+ * this function.
+ */
+static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
+{
+ struct lru_cache *n, *t;
+ struct lc_element *e;
+ unsigned int in_use;
+ int i;
+
+ if (device->act_log &&
+ device->act_log->nr_elements == dc->al_extents)
+ return 0;
+
+ in_use = 0;
+ t = device->act_log;
+ n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION,
+ dc->al_extents, sizeof(struct lc_element), 0);
+
+ if (n == NULL) {
+ drbd_err(device, "Cannot allocate act_log lru!\n");
+ return -ENOMEM;
+ }
+ spin_lock_irq(&device->al_lock);
+ if (t) {
+ for (i = 0; i < t->nr_elements; i++) {
+ e = lc_element_by_index(t, i);
+ if (e->refcnt)
+ drbd_err(device, "refcnt(%d)==%d\n",
+ e->lc_number, e->refcnt);
+ in_use += e->refcnt;
+ }
+ }
+ if (!in_use)
+ device->act_log = n;
+ spin_unlock_irq(&device->al_lock);
+ if (in_use) {
+ drbd_err(device, "Activity log still in use!\n");
+ lc_destroy(n);
+ return -EBUSY;
+ } else {
+ if (t)
+ lc_destroy(t);
+ }
+ drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */
+ return 0;
+}
+
+static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
+ unsigned int max_bio_size)
+{
+ struct request_queue * const q = device->rq_queue;
+ unsigned int max_hw_sectors = max_bio_size >> 9;
+ unsigned int max_segments = 0;
+ struct request_queue *b = NULL;
+
+ if (bdev) {
+ b = bdev->backing_bdev->bd_disk->queue;
+
+ max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
+ rcu_read_lock();
+ max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs;
+ rcu_read_unlock();
+
+ blk_set_stacking_limits(&q->limits);
+ blk_queue_max_write_same_sectors(q, 0);
+ }
+
+ blk_queue_logical_block_size(q, 512);
+ blk_queue_max_hw_sectors(q, max_hw_sectors);
+ /* This is the workaround for "bio would need to, but cannot, be split" */
+ blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
+ blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1);
+
+ if (b) {
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+
+ if (blk_queue_discard(b) &&
+ (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) {
+ /* For now, don't allow more than one activity log extent worth of data
+ * to be discarded in one go. We may need to rework drbd_al_begin_io()
+ * to allow for even larger discard ranges */
+ q->limits.max_discard_sectors = DRBD_MAX_DISCARD_SECTORS;
+
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+ /* REALLY? Is stacking secdiscard "legal"? */
+ if (blk_queue_secdiscard(b))
+ queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q);
+ } else {
+ q->limits.max_discard_sectors = 0;
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
+ queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q);
+ }
+
+ blk_queue_stack_limits(q, b);
+
+ if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
+ drbd_info(device, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n",
+ q->backing_dev_info.ra_pages,
+ b->backing_dev_info.ra_pages);
+ q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages;
+ }
+ }
+}
+
+void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev)
+{
+ unsigned int now, new, local, peer;
+
+ now = queue_max_hw_sectors(device->rq_queue) << 9;
+ local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */
+ peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */
+
+ if (bdev) {
+ local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9;
+ device->local_max_bio_size = local;
+ }
+ local = min(local, DRBD_MAX_BIO_SIZE);
+
+ /* We may ignore peer limits if the peer is modern enough.
+ Because new from 8.3.8 onwards the peer can use multiple
+ BIOs for a single peer_request */
+ if (device->state.conn >= C_WF_REPORT_PARAMS) {
+ if (first_peer_device(device)->connection->agreed_pro_version < 94)
+ peer = min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+ /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
+ else if (first_peer_device(device)->connection->agreed_pro_version == 94)
+ peer = DRBD_MAX_SIZE_H80_PACKET;
+ else if (first_peer_device(device)->connection->agreed_pro_version < 100)
+ peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */
+ else
+ peer = DRBD_MAX_BIO_SIZE;
+
+ /* We may later detach and re-attach on a disconnected Primary.
+ * Avoid this setting to jump back in that case.
+ * We want to store what we know the peer DRBD can handle,
+ * not what the peer IO backend can handle. */
+ if (peer > device->peer_max_bio_size)
+ device->peer_max_bio_size = peer;
+ }
+ new = min(local, peer);
+
+ if (device->state.role == R_PRIMARY && new < now)
+ drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", new, now);
+
+ if (new != now)
+ drbd_info(device, "max BIO size = %u\n", new);
+
+ drbd_setup_queue_param(device, bdev, new);
+}
+
+/* Starts the worker thread */
+static void conn_reconfig_start(struct drbd_connection *connection)
+{
+ drbd_thread_start(&connection->worker);
+ drbd_flush_workqueue(&connection->sender_work);
+}
+
+/* if still unconfigured, stops worker again. */
+static void conn_reconfig_done(struct drbd_connection *connection)
+{
+ bool stop_threads;
+ spin_lock_irq(&connection->resource->req_lock);
+ stop_threads = conn_all_vols_unconf(connection) &&
+ connection->cstate == C_STANDALONE;
+ spin_unlock_irq(&connection->resource->req_lock);
+ if (stop_threads) {
+ /* asender is implicitly stopped by receiver
+ * in conn_disconnect() */
+ drbd_thread_stop(&connection->receiver);
+ drbd_thread_stop(&connection->worker);
+ }
+}
+
+/* Make sure IO is suspended before calling this function(). */
+static void drbd_suspend_al(struct drbd_device *device)
+{
+ int s = 0;
+
+ if (!lc_try_lock(device->act_log)) {
+ drbd_warn(device, "Failed to lock al in drbd_suspend_al()\n");
+ return;
+ }
+
+ drbd_al_shrink(device);
+ spin_lock_irq(&device->resource->req_lock);
+ if (device->state.conn < C_CONNECTED)
+ s = !test_and_set_bit(AL_SUSPENDED, &device->flags);
+ spin_unlock_irq(&device->resource->req_lock);
+ lc_unlock(device->act_log);
+
+ if (s)
+ drbd_info(device, "Suspended AL updates\n");
+}
+
+
+static bool should_set_defaults(struct genl_info *info)
+{
+ unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags;
+ return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
+}
+
+static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
+{
+ /* This is limited by 16 bit "slot" numbers,
+ * and by available on-disk context storage.
+ *
+ * Also (u16)~0 is special (denotes a "free" extent).
+ *
+ * One transaction occupies one 4kB on-disk block,
+ * we have n such blocks in the on disk ring buffer,
+ * the "current" transaction may fail (n-1),
+ * and there is 919 slot numbers context information per transaction.
+ *
+ * 72 transaction blocks amounts to more than 2**16 context slots,
+ * so cap there first.
+ */
+ const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX;
+ const unsigned int sufficient_on_disk =
+ (max_al_nr + AL_CONTEXT_PER_TRANSACTION -1)
+ /AL_CONTEXT_PER_TRANSACTION;
+
+ unsigned int al_size_4k = bdev->md.al_size_4k;
+
+ if (al_size_4k > sufficient_on_disk)
+ return max_al_nr;
+
+ return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
+}
+
+static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b)
+{
+ return a->disk_barrier != b->disk_barrier ||
+ a->disk_flushes != b->disk_flushes ||
+ a->disk_drain != b->disk_drain;
+}
+
+int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ struct drbd_device *device;
+ struct disk_conf *new_disk_conf, *old_disk_conf;
+ struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
+ int err, fifo_size;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
+
+ device = adm_ctx.device;
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+
+ /* we also need a disk
+ * to change the options on */
+ if (!get_ldev(device)) {
+ retcode = ERR_NO_DISK;
+ goto out;
+ }
+
+ new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+
+ mutex_lock(&device->resource->conf_update);
+ old_disk_conf = device->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+ if (should_set_defaults(info))
+ set_disk_conf_defaults(new_disk_conf);
+
+ err = disk_conf_from_attrs_for_change(new_disk_conf, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto fail_unlock;
+ }
+
+ if (!expect(new_disk_conf->resync_rate >= 1))
+ new_disk_conf->resync_rate = 1;
+
+ if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
+ new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
+ if (new_disk_conf->al_extents > drbd_al_extents_max(device->ldev))
+ new_disk_conf->al_extents = drbd_al_extents_max(device->ldev);
+
+ if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+ new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
+
+ fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+ if (fifo_size != device->rs_plan_s->size) {
+ new_plan = fifo_alloc(fifo_size);
+ if (!new_plan) {
+ drbd_err(device, "kmalloc of fifo_buffer failed");
+ retcode = ERR_NOMEM;
+ goto fail_unlock;
+ }
+ }
+
+ drbd_suspend_io(device);
+ wait_event(device->al_wait, lc_try_lock(device->act_log));
+ drbd_al_shrink(device);
+ err = drbd_check_al_size(device, new_disk_conf);
+ lc_unlock(device->act_log);
+ wake_up(&device->al_wait);
+ drbd_resume_io(device);
+
+ if (err) {
+ retcode = ERR_NOMEM;
+ goto fail_unlock;
+ }
+
+ write_lock_irq(&global_state_lock);
+ retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
+ if (retcode == NO_ERROR) {
+ rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+ drbd_resync_after_changed(device);
+ }
+ write_unlock_irq(&global_state_lock);
+
+ if (retcode != NO_ERROR)
+ goto fail_unlock;
+
+ if (new_plan) {
+ old_plan = device->rs_plan_s;
+ rcu_assign_pointer(device->rs_plan_s, new_plan);
+ }
+
+ mutex_unlock(&device->resource->conf_update);
+
+ if (new_disk_conf->al_updates)
+ device->ldev->md.flags &= ~MDF_AL_DISABLED;
+ else
+ device->ldev->md.flags |= MDF_AL_DISABLED;
+
+ if (new_disk_conf->md_flushes)
+ clear_bit(MD_NO_FUA, &device->flags);
+ else
+ set_bit(MD_NO_FUA, &device->flags);
+
+ if (write_ordering_changed(old_disk_conf, new_disk_conf))
+ drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush);
+
+ drbd_md_sync(device);
+
+ if (device->state.conn >= C_CONNECTED) {
+ struct drbd_peer_device *peer_device;
+
+ for_each_peer_device(peer_device, device)
+ drbd_send_sync_param(peer_device);
+ }
+
+ synchronize_rcu();
+ kfree(old_disk_conf);
+ kfree(old_plan);
+ mod_timer(&device->request_timer, jiffies + HZ);
+ goto success;
+
+fail_unlock:
+ mutex_unlock(&device->resource->conf_update);
+ fail:
+ kfree(new_disk_conf);
+ kfree(new_plan);
+success:
+ put_ldev(device);
+ out:
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_device *device;
+ struct drbd_peer_device *peer_device;
+ struct drbd_connection *connection;
+ int err;
+ enum drbd_ret_code retcode;
+ enum determine_dev_size dd;
+ sector_t max_possible_sectors;
+ sector_t min_md_device_sectors;
+ struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
+ struct disk_conf *new_disk_conf = NULL;
+ struct block_device *bdev;
+ struct lru_cache *resync_lru = NULL;
+ struct fifo_buffer *new_plan = NULL;
+ union drbd_state ns, os;
+ enum drbd_state_rv rv;
+ struct net_conf *nc;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
+
+ device = adm_ctx.device;
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ peer_device = first_peer_device(device);
+ connection = peer_device ? peer_device->connection : NULL;
+ conn_reconfig_start(connection);
+
+ /* if you want to reconfigure, please tear down first */
+ if (device->state.disk > D_DISKLESS) {
+ retcode = ERR_DISK_CONFIGURED;
+ goto fail;
+ }
+ /* It may just now have detached because of IO error. Make sure
+ * drbd_ldev_destroy is done already, we may end up here very fast,
+ * e.g. if someone calls attach from the on-io-error handler,
+ * to realize a "hot spare" feature (not that I'd recommend that) */
+ wait_event(device->misc_wait, !test_bit(GOING_DISKLESS, &device->flags));
+
+ /* make sure there is no leftover from previous force-detach attempts */
+ clear_bit(FORCE_DETACH, &device->flags);
+ clear_bit(WAS_IO_ERROR, &device->flags);
+ clear_bit(WAS_READ_ERROR, &device->flags);
+
+ /* and no leftover from previously aborted resync or verify, either */
+ device->rs_total = 0;
+ device->rs_failed = 0;
+ atomic_set(&device->rs_pending_cnt, 0);
+
+ /* allocation not in the IO path, drbdsetup context */
+ nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
+ if (!nbc) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+ spin_lock_init(&nbc->md.uuid_lock);
+
+ new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+ nbc->disk_conf = new_disk_conf;
+
+ set_disk_conf_defaults(new_disk_conf);
+ err = disk_conf_from_attrs(new_disk_conf, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto fail;
+ }
+
+ if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+ new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
+
+ new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
+ if (!new_plan) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+
+ if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
+ retcode = ERR_MD_IDX_INVALID;
+ goto fail;
+ }
+
+ write_lock_irq(&global_state_lock);
+ retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
+ write_unlock_irq(&global_state_lock);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ if (nc) {
+ if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
+ rcu_read_unlock();
+ retcode = ERR_STONITH_AND_PROT_A;
+ goto fail;
+ }
+ }
+ rcu_read_unlock();
+
+ bdev = blkdev_get_by_path(new_disk_conf->backing_dev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL, device);
+ if (IS_ERR(bdev)) {
+ drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
+ PTR_ERR(bdev));
+ retcode = ERR_OPEN_DISK;
+ goto fail;
+ }
+ nbc->backing_bdev = bdev;
+
+ /*
+ * meta_dev_idx >= 0: external fixed size, possibly multiple
+ * drbd sharing one meta device. TODO in that case, paranoia
+ * check that [md_bdev, meta_dev_idx] is not yet used by some
+ * other drbd minor! (if you use drbd.conf + drbdadm, that
+ * should check it for you already; but if you don't, or
+ * someone fooled it, we need to double check here)
+ */
+ bdev = blkdev_get_by_path(new_disk_conf->meta_dev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL,
+ (new_disk_conf->meta_dev_idx < 0) ?
+ (void *)device : (void *)drbd_m_holder);
+ if (IS_ERR(bdev)) {
+ drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
+ PTR_ERR(bdev));
+ retcode = ERR_OPEN_MD_DISK;
+ goto fail;
+ }
+ nbc->md_bdev = bdev;
+
+ if ((nbc->backing_bdev == nbc->md_bdev) !=
+ (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+ new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
+ retcode = ERR_MD_IDX_INVALID;
+ goto fail;
+ }
+
+ resync_lru = lc_create("resync", drbd_bm_ext_cache,
+ 1, 61, sizeof(struct bm_extent),
+ offsetof(struct bm_extent, lce));
+ if (!resync_lru) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+
+ /* Read our meta data super block early.
+ * This also sets other on-disk offsets. */
+ retcode = drbd_md_read(device, nbc);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
+ new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
+ if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
+ new_disk_conf->al_extents = drbd_al_extents_max(nbc);
+
+ if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
+ drbd_err(device, "max capacity %llu smaller than disk size %llu\n",
+ (unsigned long long) drbd_get_max_capacity(nbc),
+ (unsigned long long) new_disk_conf->disk_size);
+ retcode = ERR_DISK_TOO_SMALL;
+ goto fail;
+ }
+
+ if (new_disk_conf->meta_dev_idx < 0) {
+ max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
+ /* at least one MB, otherwise it does not make sense */
+ min_md_device_sectors = (2<<10);
+ } else {
+ max_possible_sectors = DRBD_MAX_SECTORS;
+ min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
+ }
+
+ if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
+ retcode = ERR_MD_DISK_TOO_SMALL;
+ drbd_warn(device, "refusing attach: md-device too small, "
+ "at least %llu sectors needed for this meta-disk type\n",
+ (unsigned long long) min_md_device_sectors);
+ goto fail;
+ }
+
+ /* Make sure the new disk is big enough
+ * (we may currently be R_PRIMARY with no local disk...) */
+ if (drbd_get_max_capacity(nbc) <
+ drbd_get_capacity(device->this_bdev)) {
+ retcode = ERR_DISK_TOO_SMALL;
+ goto fail;
+ }
+
+ nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
+
+ if (nbc->known_size > max_possible_sectors) {
+ drbd_warn(device, "==> truncating very big lower level device "
+ "to currently maximum possible %llu sectors <==\n",
+ (unsigned long long) max_possible_sectors);
+ if (new_disk_conf->meta_dev_idx >= 0)
+ drbd_warn(device, "==>> using internal or flexible "
+ "meta data may help <<==\n");
+ }
+
+ drbd_suspend_io(device);
+ /* also wait for the last barrier ack. */
+ /* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171
+ * We need a way to either ignore barrier acks for barriers sent before a device
+ * was attached, or a way to wait for all pending barrier acks to come in.
+ * As barriers are counted per resource,
+ * we'd need to suspend io on all devices of a resource.
+ */
+ wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device));
+ /* and for any other previously queued work */
+ drbd_flush_workqueue(&connection->sender_work);
+
+ rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE);
+ retcode = rv; /* FIXME: Type mismatch. */
+ drbd_resume_io(device);
+ if (rv < SS_SUCCESS)
+ goto fail;
+
+ if (!get_ldev_if_state(device, D_ATTACHING))
+ goto force_diskless;
+
+ if (!device->bitmap) {
+ if (drbd_bm_init(device)) {
+ retcode = ERR_NOMEM;
+ goto force_diskless_dec;
+ }
+ }
+
+ if (device->state.conn < C_CONNECTED &&
+ device->state.role == R_PRIMARY && device->ed_uuid &&
+ (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
+ drbd_err(device, "Can only attach to data with current UUID=%016llX\n",
+ (unsigned long long)device->ed_uuid);
+ retcode = ERR_DATA_NOT_CURRENT;
+ goto force_diskless_dec;
+ }
+
+ /* Since we are diskless, fix the activity log first... */
+ if (drbd_check_al_size(device, new_disk_conf)) {
+ retcode = ERR_NOMEM;
+ goto force_diskless_dec;
+ }
+
+ /* Prevent shrinking of consistent devices ! */
+ if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
+ drbd_new_dev_size(device, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) {
+ drbd_warn(device, "refusing to truncate a consistent device\n");
+ retcode = ERR_DISK_TOO_SMALL;
+ goto force_diskless_dec;
+ }
+
+ /* Reset the "barriers don't work" bits here, then force meta data to
+ * be written, to ensure we determine if barriers are supported. */
+ if (new_disk_conf->md_flushes)
+ clear_bit(MD_NO_FUA, &device->flags);
+ else
+ set_bit(MD_NO_FUA, &device->flags);
+
+ /* Point of no return reached.
+ * Devices and memory are no longer released by error cleanup below.
+ * now device takes over responsibility, and the state engine should
+ * clean it up somewhere. */
+ D_ASSERT(device, device->ldev == NULL);
+ device->ldev = nbc;
+ device->resync = resync_lru;
+ device->rs_plan_s = new_plan;
+ nbc = NULL;
+ resync_lru = NULL;
+ new_disk_conf = NULL;
+ new_plan = NULL;
+
+ drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush);
+
+ if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
+ set_bit(CRASHED_PRIMARY, &device->flags);
+ else
+ clear_bit(CRASHED_PRIMARY, &device->flags);
+
+ if (drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
+ !(device->state.role == R_PRIMARY && device->resource->susp_nod))
+ set_bit(CRASHED_PRIMARY, &device->flags);
+
+ device->send_cnt = 0;
+ device->recv_cnt = 0;
+ device->read_cnt = 0;
+ device->writ_cnt = 0;
+
+ drbd_reconsider_max_bio_size(device, device->ldev);
+
+ /* If I am currently not R_PRIMARY,
+ * but meta data primary indicator is set,
+ * I just now recover from a hard crash,
+ * and have been R_PRIMARY before that crash.
+ *
+ * Now, if I had no connection before that crash
+ * (have been degraded R_PRIMARY), chances are that
+ * I won't find my peer now either.
+ *
+ * In that case, and _only_ in that case,
+ * we use the degr-wfc-timeout instead of the default,
+ * so we can automatically recover from a crash of a
+ * degraded but active "cluster" after a certain timeout.
+ */
+ clear_bit(USE_DEGR_WFC_T, &device->flags);
+ if (device->state.role != R_PRIMARY &&
+ drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
+ !drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND))
+ set_bit(USE_DEGR_WFC_T, &device->flags);
+
+ dd = drbd_determine_dev_size(device, 0, NULL);
+ if (dd <= DS_ERROR) {
+ retcode = ERR_NOMEM_BITMAP;
+ goto force_diskless_dec;
+ } else if (dd == DS_GREW)
+ set_bit(RESYNC_AFTER_NEG, &device->flags);
+
+ if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ||
+ (test_bit(CRASHED_PRIMARY, &device->flags) &&
+ drbd_md_test_flag(device->ldev, MDF_AL_DISABLED))) {
+ drbd_info(device, "Assuming that all blocks are out of sync "
+ "(aka FullSync)\n");
+ if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
+ "set_n_write from attaching", BM_LOCKED_MASK)) {
+ retcode = ERR_IO_MD_DISK;
+ goto force_diskless_dec;
+ }
+ } else {
+ if (drbd_bitmap_io(device, &drbd_bm_read,
+ "read from attaching", BM_LOCKED_MASK)) {
+ retcode = ERR_IO_MD_DISK;
+ goto force_diskless_dec;
+ }
+ }
+
+ if (_drbd_bm_total_weight(device) == drbd_bm_bits(device))
+ drbd_suspend_al(device); /* IO is still suspended here... */
+
+ spin_lock_irq(&device->resource->req_lock);
+ os = drbd_read_state(device);
+ ns = os;
+ /* If MDF_CONSISTENT is not set go into inconsistent state,
+ otherwise investigate MDF_WasUpToDate...
+ If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
+ otherwise into D_CONSISTENT state.
+ */
+ if (drbd_md_test_flag(device->ldev, MDF_CONSISTENT)) {
+ if (drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE))
+ ns.disk = D_CONSISTENT;
+ else
+ ns.disk = D_OUTDATED;
+ } else {
+ ns.disk = D_INCONSISTENT;
+ }
+
+ if (drbd_md_test_flag(device->ldev, MDF_PEER_OUT_DATED))
+ ns.pdsk = D_OUTDATED;
+
+ rcu_read_lock();
+ if (ns.disk == D_CONSISTENT &&
+ (ns.pdsk == D_OUTDATED || rcu_dereference(device->ldev->disk_conf)->fencing == FP_DONT_CARE))
+ ns.disk = D_UP_TO_DATE;
+
+ /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
+ MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
+ this point, because drbd_request_state() modifies these
+ flags. */
+
+ if (rcu_dereference(device->ldev->disk_conf)->al_updates)
+ device->ldev->md.flags &= ~MDF_AL_DISABLED;
+ else
+ device->ldev->md.flags |= MDF_AL_DISABLED;
+
+ rcu_read_unlock();
+
+ /* In case we are C_CONNECTED postpone any decision on the new disk
+ state after the negotiation phase. */
+ if (device->state.conn == C_CONNECTED) {
+ device->new_state_tmp.i = ns.i;
+ ns.i = os.i;
+ ns.disk = D_NEGOTIATING;
+
+ /* We expect to receive up-to-date UUIDs soon.
+ To avoid a race in receive_state, free p_uuid while
+ holding req_lock. I.e. atomic with the state change */
+ kfree(device->p_uuid);
+ device->p_uuid = NULL;
+ }
+
+ rv = _drbd_set_state(device, ns, CS_VERBOSE, NULL);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ if (rv < SS_SUCCESS)
+ goto force_diskless_dec;
+
+ mod_timer(&device->request_timer, jiffies + HZ);
+
+ if (device->state.role == R_PRIMARY)
+ device->ldev->md.uuid[UI_CURRENT] |= (u64)1;
+ else
+ device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
+
+ drbd_md_mark_dirty(device);
+ drbd_md_sync(device);
+
+ kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
+ put_ldev(device);
+ conn_reconfig_done(connection);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+
+ force_diskless_dec:
+ put_ldev(device);
+ force_diskless:
+ drbd_force_state(device, NS(disk, D_DISKLESS));
+ drbd_md_sync(device);
+ fail:
+ conn_reconfig_done(connection);
+ if (nbc) {
+ if (nbc->backing_bdev)
+ blkdev_put(nbc->backing_bdev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ if (nbc->md_bdev)
+ blkdev_put(nbc->md_bdev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ kfree(nbc);
+ }
+ kfree(new_disk_conf);
+ lc_destroy(resync_lru);
+ kfree(new_plan);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+static int adm_detach(struct drbd_device *device, int force)
+{
+ enum drbd_state_rv retcode;
+ int ret;
+
+ if (force) {
+ set_bit(FORCE_DETACH, &device->flags);
+ drbd_force_state(device, NS(disk, D_FAILED));
+ retcode = SS_SUCCESS;
+ goto out;
+ }
+
+ drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
+ drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */
+ retcode = drbd_request_state(device, NS(disk, D_FAILED));
+ drbd_md_put_buffer(device);
+ /* D_FAILED will transition to DISKLESS. */
+ ret = wait_event_interruptible(device->misc_wait,
+ device->state.disk != D_FAILED);
+ drbd_resume_io(device);
+ if ((int)retcode == (int)SS_IS_DISKLESS)
+ retcode = SS_NOTHING_TO_DO;
+ if (ret)
+ retcode = ERR_INTR;
+out:
+ return retcode;
+}
+
+/* Detaching the disk is a process in multiple stages. First we need to lock
+ * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
+ * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
+ * internal references as well.
+ * Only then we have finally detached. */
+int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ struct detach_parms parms = { };
+ int err;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ if (info->attrs[DRBD_NLA_DETACH_PARMS]) {
+ err = detach_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto out;
+ }
+ }
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ retcode = adm_detach(adm_ctx.device, parms.force_detach);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+static bool conn_resync_running(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ bool rv = false;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ if (device->state.conn == C_SYNC_SOURCE ||
+ device->state.conn == C_SYNC_TARGET ||
+ device->state.conn == C_PAUSED_SYNC_S ||
+ device->state.conn == C_PAUSED_SYNC_T) {
+ rv = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+static bool conn_ov_running(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ bool rv = false;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ if (device->state.conn == C_VERIFY_S ||
+ device->state.conn == C_VERIFY_T) {
+ rv = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+static enum drbd_ret_code
+_check_net_options(struct drbd_connection *connection, struct net_conf *old_net_conf, struct net_conf *new_net_conf)
+{
+ struct drbd_peer_device *peer_device;
+ int i;
+
+ if (old_net_conf && connection->cstate == C_WF_REPORT_PARAMS && connection->agreed_pro_version < 100) {
+ if (new_net_conf->wire_protocol != old_net_conf->wire_protocol)
+ return ERR_NEED_APV_100;
+
+ if (new_net_conf->two_primaries != old_net_conf->two_primaries)
+ return ERR_NEED_APV_100;
+
+ if (strcmp(new_net_conf->integrity_alg, old_net_conf->integrity_alg))
+ return ERR_NEED_APV_100;
+ }
+
+ if (!new_net_conf->two_primaries &&
+ conn_highest_role(connection) == R_PRIMARY &&
+ conn_highest_peer(connection) == R_PRIMARY)
+ return ERR_NEED_ALLOW_TWO_PRI;
+
+ if (new_net_conf->two_primaries &&
+ (new_net_conf->wire_protocol != DRBD_PROT_C))
+ return ERR_NOT_PROTO_C;
+
+ idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+ struct drbd_device *device = peer_device->device;
+ if (get_ldev(device)) {
+ enum drbd_fencing_p fp = rcu_dereference(device->ldev->disk_conf)->fencing;
+ put_ldev(device);
+ if (new_net_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH)
+ return ERR_STONITH_AND_PROT_A;
+ }
+ if (device->state.role == R_PRIMARY && new_net_conf->discard_my_data)
+ return ERR_DISCARD_IMPOSSIBLE;
+ }
+
+ if (new_net_conf->on_congestion != OC_BLOCK && new_net_conf->wire_protocol != DRBD_PROT_A)
+ return ERR_CONG_NOT_PROTO_A;
+
+ return NO_ERROR;
+}
+
+static enum drbd_ret_code
+check_net_options(struct drbd_connection *connection, struct net_conf *new_net_conf)
+{
+ static enum drbd_ret_code rv;
+ struct drbd_peer_device *peer_device;
+ int i;
+
+ rcu_read_lock();
+ rv = _check_net_options(connection, rcu_dereference(connection->net_conf), new_net_conf);
+ rcu_read_unlock();
+
+ /* connection->peer_devices protected by genl_lock() here */
+ idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+ struct drbd_device *device = peer_device->device;
+ if (!device->bitmap) {
+ if (drbd_bm_init(device))
+ return ERR_NOMEM;
+ }
+ }
+
+ return rv;
+}
+
+struct crypto {
+ struct crypto_hash *verify_tfm;
+ struct crypto_hash *csums_tfm;
+ struct crypto_hash *cram_hmac_tfm;
+ struct crypto_hash *integrity_tfm;
+};
+
+static int
+alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg)
+{
+ if (!tfm_name[0])
+ return NO_ERROR;
+
+ *tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(*tfm)) {
+ *tfm = NULL;
+ return err_alg;
+ }
+
+ return NO_ERROR;
+}
+
+static enum drbd_ret_code
+alloc_crypto(struct crypto *crypto, struct net_conf *new_net_conf)
+{
+ char hmac_name[CRYPTO_MAX_ALG_NAME];
+ enum drbd_ret_code rv;
+
+ rv = alloc_hash(&crypto->csums_tfm, new_net_conf->csums_alg,
+ ERR_CSUMS_ALG);
+ if (rv != NO_ERROR)
+ return rv;
+ rv = alloc_hash(&crypto->verify_tfm, new_net_conf->verify_alg,
+ ERR_VERIFY_ALG);
+ if (rv != NO_ERROR)
+ return rv;
+ rv = alloc_hash(&crypto->integrity_tfm, new_net_conf->integrity_alg,
+ ERR_INTEGRITY_ALG);
+ if (rv != NO_ERROR)
+ return rv;
+ if (new_net_conf->cram_hmac_alg[0] != 0) {
+ snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
+ new_net_conf->cram_hmac_alg);
+
+ rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name,
+ ERR_AUTH_ALG);
+ }
+
+ return rv;
+}
+
+static void free_crypto(struct crypto *crypto)
+{
+ crypto_free_hash(crypto->cram_hmac_tfm);
+ crypto_free_hash(crypto->integrity_tfm);
+ crypto_free_hash(crypto->csums_tfm);
+ crypto_free_hash(crypto->verify_tfm);
+}
+
+int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ struct drbd_connection *connection;
+ struct net_conf *old_net_conf, *new_net_conf = NULL;
+ int err;
+ int ovr; /* online verify running */
+ int rsr; /* re-sync running */
+ struct crypto crypto = { };
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
+
+ connection = adm_ctx.connection;
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+
+ new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
+ if (!new_net_conf) {
+ retcode = ERR_NOMEM;
+ goto out;
+ }
+
+ conn_reconfig_start(connection);
+
+ mutex_lock(&connection->data.mutex);
+ mutex_lock(&connection->resource->conf_update);
+ old_net_conf = connection->net_conf;
+
+ if (!old_net_conf) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "net conf missing, try connect");
+ retcode = ERR_INVALID_REQUEST;
+ goto fail;
+ }
+
+ *new_net_conf = *old_net_conf;
+ if (should_set_defaults(info))
+ set_net_conf_defaults(new_net_conf);
+
+ err = net_conf_from_attrs_for_change(new_net_conf, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto fail;
+ }
+
+ retcode = check_net_options(connection, new_net_conf);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ /* re-sync running */
+ rsr = conn_resync_running(connection);
+ if (rsr && strcmp(new_net_conf->csums_alg, old_net_conf->csums_alg)) {
+ retcode = ERR_CSUMS_RESYNC_RUNNING;
+ goto fail;
+ }
+
+ /* online verify running */
+ ovr = conn_ov_running(connection);
+ if (ovr && strcmp(new_net_conf->verify_alg, old_net_conf->verify_alg)) {
+ retcode = ERR_VERIFY_RUNNING;
+ goto fail;
+ }
+
+ retcode = alloc_crypto(&crypto, new_net_conf);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ rcu_assign_pointer(connection->net_conf, new_net_conf);
+
+ if (!rsr) {
+ crypto_free_hash(connection->csums_tfm);
+ connection->csums_tfm = crypto.csums_tfm;
+ crypto.csums_tfm = NULL;
+ }
+ if (!ovr) {
+ crypto_free_hash(connection->verify_tfm);
+ connection->verify_tfm = crypto.verify_tfm;
+ crypto.verify_tfm = NULL;
+ }
+
+ crypto_free_hash(connection->integrity_tfm);
+ connection->integrity_tfm = crypto.integrity_tfm;
+ if (connection->cstate >= C_WF_REPORT_PARAMS && connection->agreed_pro_version >= 100)
+ /* Do this without trying to take connection->data.mutex again. */
+ __drbd_send_protocol(connection, P_PROTOCOL_UPDATE);
+
+ crypto_free_hash(connection->cram_hmac_tfm);
+ connection->cram_hmac_tfm = crypto.cram_hmac_tfm;
+
+ mutex_unlock(&connection->resource->conf_update);
+ mutex_unlock(&connection->data.mutex);
+ synchronize_rcu();
+ kfree(old_net_conf);
+
+ if (connection->cstate >= C_WF_REPORT_PARAMS) {
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ drbd_send_sync_param(peer_device);
+ }
+
+ goto done;
+
+ fail:
+ mutex_unlock(&connection->resource->conf_update);
+ mutex_unlock(&connection->data.mutex);
+ free_crypto(&crypto);
+ kfree(new_net_conf);
+ done:
+ conn_reconfig_done(connection);
+ out:
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_peer_device *peer_device;
+ struct net_conf *old_net_conf, *new_net_conf = NULL;
+ struct crypto crypto = { };
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
+ enum drbd_ret_code retcode;
+ int i;
+ int err;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+ if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "connection endpoint(s) missing");
+ retcode = ERR_INVALID_REQUEST;
+ goto out;
+ }
+
+ /* No need for _rcu here. All reconfiguration is
+ * strictly serialized on genl_lock(). We are protected against
+ * concurrent reconfiguration/addition/deletion */
+ for_each_resource(resource, &drbd_resources) {
+ for_each_connection(connection, resource) {
+ if (nla_len(adm_ctx.my_addr) == connection->my_addr_len &&
+ !memcmp(nla_data(adm_ctx.my_addr), &connection->my_addr,
+ connection->my_addr_len)) {
+ retcode = ERR_LOCAL_ADDR;
+ goto out;
+ }
+
+ if (nla_len(adm_ctx.peer_addr) == connection->peer_addr_len &&
+ !memcmp(nla_data(adm_ctx.peer_addr), &connection->peer_addr,
+ connection->peer_addr_len)) {
+ retcode = ERR_PEER_ADDR;
+ goto out;
+ }
+ }
+ }
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ connection = first_connection(adm_ctx.resource);
+ conn_reconfig_start(connection);
+
+ if (connection->cstate > C_STANDALONE) {
+ retcode = ERR_NET_CONFIGURED;
+ goto fail;
+ }
+
+ /* allocation not in the IO path, drbdsetup / netlink process context */
+ new_net_conf = kzalloc(sizeof(*new_net_conf), GFP_KERNEL);
+ if (!new_net_conf) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+
+ set_net_conf_defaults(new_net_conf);
+
+ err = net_conf_from_attrs(new_net_conf, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto fail;
+ }
+
+ retcode = check_net_options(connection, new_net_conf);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ retcode = alloc_crypto(&crypto, new_net_conf);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ ((char *)new_net_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
+
+ drbd_flush_workqueue(&connection->sender_work);
+
+ mutex_lock(&adm_ctx.resource->conf_update);
+ old_net_conf = connection->net_conf;
+ if (old_net_conf) {
+ retcode = ERR_NET_CONFIGURED;
+ mutex_unlock(&adm_ctx.resource->conf_update);
+ goto fail;
+ }
+ rcu_assign_pointer(connection->net_conf, new_net_conf);
+
+ conn_free_crypto(connection);
+ connection->cram_hmac_tfm = crypto.cram_hmac_tfm;
+ connection->integrity_tfm = crypto.integrity_tfm;
+ connection->csums_tfm = crypto.csums_tfm;
+ connection->verify_tfm = crypto.verify_tfm;
+
+ connection->my_addr_len = nla_len(adm_ctx.my_addr);
+ memcpy(&connection->my_addr, nla_data(adm_ctx.my_addr), connection->my_addr_len);
+ connection->peer_addr_len = nla_len(adm_ctx.peer_addr);
+ memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len);
+
+ mutex_unlock(&adm_ctx.resource->conf_update);
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+ struct drbd_device *device = peer_device->device;
+ device->send_cnt = 0;
+ device->recv_cnt = 0;
+ }
+ rcu_read_unlock();
+
+ retcode = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
+
+ conn_reconfig_done(connection);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+
+fail:
+ free_crypto(&crypto);
+ kfree(new_net_conf);
+
+ conn_reconfig_done(connection);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection, bool force)
+{
+ enum drbd_state_rv rv;
+
+ rv = conn_request_state(connection, NS(conn, C_DISCONNECTING),
+ force ? CS_HARD : 0);
+
+ switch (rv) {
+ case SS_NOTHING_TO_DO:
+ break;
+ case SS_ALREADY_STANDALONE:
+ return SS_SUCCESS;
+ case SS_PRIMARY_NOP:
+ /* Our state checking code wants to see the peer outdated. */
+ rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0);
+
+ if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */
+ rv = conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_VERBOSE);
+
+ break;
+ case SS_CW_FAILED_BY_PEER:
+ /* The peer probably wants to see us outdated. */
+ rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING,
+ disk, D_OUTDATED), 0);
+ if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) {
+ rv = conn_request_state(connection, NS(conn, C_DISCONNECTING),
+ CS_HARD);
+ }
+ break;
+ default:;
+ /* no special handling necessary */
+ }
+
+ if (rv >= SS_SUCCESS) {
+ enum drbd_state_rv rv2;
+ /* No one else can reconfigure the network while I am here.
+ * The state handling only uses drbd_thread_stop_nowait(),
+ * we want to really wait here until the receiver is no more.
+ */
+ drbd_thread_stop(&connection->receiver);
+
+ /* Race breaker. This additional state change request may be
+ * necessary, if this was a forced disconnect during a receiver
+ * restart. We may have "killed" the receiver thread just
+ * after drbd_receiver() returned. Typically, we should be
+ * C_STANDALONE already, now, and this becomes a no-op.
+ */
+ rv2 = conn_request_state(connection, NS(conn, C_STANDALONE),
+ CS_VERBOSE | CS_HARD);
+ if (rv2 < SS_SUCCESS)
+ drbd_err(connection,
+ "unexpected rv2=%d in conn_try_disconnect()\n",
+ rv2);
+ }
+ return rv;
+}
+
+int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct disconnect_parms parms;
+ struct drbd_connection *connection;
+ enum drbd_state_rv rv;
+ enum drbd_ret_code retcode;
+ int err;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ connection = adm_ctx.connection;
+ memset(&parms, 0, sizeof(parms));
+ if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) {
+ err = disconnect_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto fail;
+ }
+ }
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ rv = conn_try_disconnect(connection, parms.force_disconnect);
+ if (rv < SS_SUCCESS)
+ retcode = rv; /* FIXME: Type mismatch. */
+ else
+ retcode = NO_ERROR;
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ fail:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+void resync_after_online_grow(struct drbd_device *device)
+{
+ int iass; /* I am sync source */
+
+ drbd_info(device, "Resync of new storage after online grow\n");
+ if (device->state.role != device->state.peer)
+ iass = (device->state.role == R_PRIMARY);
+ else
+ iass = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags);
+
+ if (iass)
+ drbd_start_resync(device, C_SYNC_SOURCE);
+ else
+ _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
+}
+
+int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
+ struct resize_parms rs;
+ struct drbd_device *device;
+ enum drbd_ret_code retcode;
+ enum determine_dev_size dd;
+ bool change_al_layout = false;
+ enum dds_flags ddsf;
+ sector_t u_size;
+ int err;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ device = adm_ctx.device;
+ if (!get_ldev(device)) {
+ retcode = ERR_NO_DISK;
+ goto fail;
+ }
+
+ memset(&rs, 0, sizeof(struct resize_parms));
+ rs.al_stripes = device->ldev->md.al_stripes;
+ rs.al_stripe_size = device->ldev->md.al_stripe_size_4k * 4;
+ if (info->attrs[DRBD_NLA_RESIZE_PARMS]) {
+ err = resize_parms_from_attrs(&rs, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto fail_ldev;
+ }
+ }
+
+ if (device->state.conn > C_CONNECTED) {
+ retcode = ERR_RESIZE_RESYNC;
+ goto fail_ldev;
+ }
+
+ if (device->state.role == R_SECONDARY &&
+ device->state.peer == R_SECONDARY) {
+ retcode = ERR_NO_PRIMARY;
+ goto fail_ldev;
+ }
+
+ if (rs.no_resync && first_peer_device(device)->connection->agreed_pro_version < 93) {
+ retcode = ERR_NEED_APV_93;
+ goto fail_ldev;
+ }
+
+ rcu_read_lock();
+ u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
+ if (u_size != (sector_t)rs.resize_size) {
+ new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ retcode = ERR_NOMEM;
+ goto fail_ldev;
+ }
+ }
+
+ if (device->ldev->md.al_stripes != rs.al_stripes ||
+ device->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) {
+ u32 al_size_k = rs.al_stripes * rs.al_stripe_size;
+
+ if (al_size_k > (16 * 1024 * 1024)) {
+ retcode = ERR_MD_LAYOUT_TOO_BIG;
+ goto fail_ldev;
+ }
+
+ if (al_size_k < MD_32kB_SECT/2) {
+ retcode = ERR_MD_LAYOUT_TOO_SMALL;
+ goto fail_ldev;
+ }
+
+ if (device->state.conn != C_CONNECTED && !rs.resize_force) {
+ retcode = ERR_MD_LAYOUT_CONNECTED;
+ goto fail_ldev;
+ }
+
+ change_al_layout = true;
+ }
+
+ if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev))
+ device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
+
+ if (new_disk_conf) {
+ mutex_lock(&device->resource->conf_update);
+ old_disk_conf = device->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+ new_disk_conf->disk_size = (sector_t)rs.resize_size;
+ rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+ mutex_unlock(&device->resource->conf_update);
+ synchronize_rcu();
+ kfree(old_disk_conf);
+ }
+
+ ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
+ dd = drbd_determine_dev_size(device, ddsf, change_al_layout ? &rs : NULL);
+ drbd_md_sync(device);
+ put_ldev(device);
+ if (dd == DS_ERROR) {
+ retcode = ERR_NOMEM_BITMAP;
+ goto fail;
+ } else if (dd == DS_ERROR_SPACE_MD) {
+ retcode = ERR_MD_LAYOUT_NO_FIT;
+ goto fail;
+ } else if (dd == DS_ERROR_SHRINK) {
+ retcode = ERR_IMPLICIT_SHRINK;
+ goto fail;
+ }
+
+ if (device->state.conn == C_CONNECTED) {
+ if (dd == DS_GREW)
+ set_bit(RESIZE_PENDING, &device->flags);
+
+ drbd_send_uuids(first_peer_device(device));
+ drbd_send_sizes(first_peer_device(device), 1, ddsf);
+ }
+
+ fail:
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+
+ fail_ldev:
+ put_ldev(device);
+ goto fail;
+}
+
+int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ struct res_opts res_opts;
+ int err;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ res_opts = adm_ctx.resource->res_opts;
+ if (should_set_defaults(info))
+ set_res_opts_defaults(&res_opts);
+
+ err = res_opts_from_attrs(&res_opts, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto fail;
+ }
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ err = set_resource_options(adm_ctx.resource, &res_opts);
+ if (err) {
+ retcode = ERR_INVALID_REQUEST;
+ if (err == -ENOMEM)
+ retcode = ERR_NOMEM;
+ }
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+
+fail:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_device *device;
+ int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ device = adm_ctx.device;
+ if (!get_ldev(device)) {
+ retcode = ERR_NO_DISK;
+ goto out;
+ }
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+
+ /* If there is still bitmap IO pending, probably because of a previous
+ * resync just being finished, wait for it before requesting a new resync.
+ * Also wait for it's after_state_ch(). */
+ drbd_suspend_io(device);
+ wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+ drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
+
+ /* If we happen to be C_STANDALONE R_SECONDARY, just change to
+ * D_INCONSISTENT, and set all bits in the bitmap. Otherwise,
+ * try to start a resync handshake as sync target for full sync.
+ */
+ if (device->state.conn == C_STANDALONE && device->state.role == R_SECONDARY) {
+ retcode = drbd_request_state(device, NS(disk, D_INCONSISTENT));
+ if (retcode >= SS_SUCCESS) {
+ if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
+ "set_n_write from invalidate", BM_LOCKED_MASK))
+ retcode = ERR_IO_MD_DISK;
+ }
+ } else
+ retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T));
+ drbd_resume_io(device);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ put_ldev(device);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info,
+ union drbd_state mask, union drbd_state val)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ retcode = drbd_request_state(adm_ctx.device, mask, val);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+static int drbd_bmio_set_susp_al(struct drbd_device *device) __must_hold(local)
+{
+ int rv;
+
+ rv = drbd_bmio_set_n_write(device);
+ drbd_suspend_al(device);
+ return rv;
+}
+
+int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ int retcode; /* drbd_ret_code, drbd_state_rv */
+ struct drbd_device *device;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ device = adm_ctx.device;
+ if (!get_ldev(device)) {
+ retcode = ERR_NO_DISK;
+ goto out;
+ }
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+
+ /* If there is still bitmap IO pending, probably because of a previous
+ * resync just being finished, wait for it before requesting a new resync.
+ * Also wait for it's after_state_ch(). */
+ drbd_suspend_io(device);
+ wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+ drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
+
+ /* If we happen to be C_STANDALONE R_PRIMARY, just set all bits
+ * in the bitmap. Otherwise, try to start a resync handshake
+ * as sync source for full sync.
+ */
+ if (device->state.conn == C_STANDALONE && device->state.role == R_PRIMARY) {
+ /* The peer will get a resync upon connect anyways. Just make that
+ into a full resync. */
+ retcode = drbd_request_state(device, NS(pdsk, D_INCONSISTENT));
+ if (retcode >= SS_SUCCESS) {
+ if (drbd_bitmap_io(device, &drbd_bmio_set_susp_al,
+ "set_n_write from invalidate_peer",
+ BM_LOCKED_SET_ALLOWED))
+ retcode = ERR_IO_MD_DISK;
+ }
+ } else
+ retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S));
+ drbd_resume_io(device);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+ put_ldev(device);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
+ retcode = ERR_PAUSE_IS_SET;
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ union drbd_dev_state s;
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
+ s = adm_ctx.device->state;
+ if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) {
+ retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP :
+ s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR;
+ } else {
+ retcode = ERR_PAUSE_IS_CLEAR;
+ }
+ }
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info)
+{
+ return drbd_adm_simple_request_state(skb, info, NS(susp, 1));
+}
+
+int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_device *device;
+ int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ device = adm_ctx.device;
+ if (test_bit(NEW_CUR_UUID, &device->flags)) {
+ drbd_uuid_new_current(device);
+ clear_bit(NEW_CUR_UUID, &device->flags);
+ }
+ drbd_suspend_io(device);
+ retcode = drbd_request_state(device, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
+ if (retcode == SS_SUCCESS) {
+ if (device->state.conn < C_CONNECTED)
+ tl_clear(first_peer_device(device)->connection);
+ if (device->state.disk == D_DISKLESS || device->state.disk == D_FAILED)
+ tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO);
+ }
+ drbd_resume_io(device);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info)
+{
+ return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED));
+}
+
+static int nla_put_drbd_cfg_context(struct sk_buff *skb,
+ struct drbd_resource *resource,
+ struct drbd_connection *connection,
+ struct drbd_device *device)
+{
+ struct nlattr *nla;
+ nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT);
+ if (!nla)
+ goto nla_put_failure;
+ if (device &&
+ nla_put_u32(skb, T_ctx_volume, device->vnr))
+ goto nla_put_failure;
+ if (nla_put_string(skb, T_ctx_resource_name, resource->name))
+ goto nla_put_failure;
+ if (connection) {
+ if (connection->my_addr_len &&
+ nla_put(skb, T_ctx_my_addr, connection->my_addr_len, &connection->my_addr))
+ goto nla_put_failure;
+ if (connection->peer_addr_len &&
+ nla_put(skb, T_ctx_peer_addr, connection->peer_addr_len, &connection->peer_addr))
+ goto nla_put_failure;
+ }
+ nla_nest_end(skb, nla);
+ return 0;
+
+nla_put_failure:
+ if (nla)
+ nla_nest_cancel(skb, nla);
+ return -EMSGSIZE;
+}
+
+/*
+ * Return the connection of @resource if @resource has exactly one connection.
+ */
+static struct drbd_connection *the_only_connection(struct drbd_resource *resource)
+{
+ struct list_head *connections = &resource->connections;
+
+ if (list_empty(connections) || connections->next->next != connections)
+ return NULL;
+ return list_first_entry(&resource->connections, struct drbd_connection, connections);
+}
+
+static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
+ const struct sib_info *sib)
+{
+ struct drbd_resource *resource = device->resource;
+ struct state_info *si = NULL; /* for sizeof(si->member); */
+ struct nlattr *nla;
+ int got_ldev;
+ int err = 0;
+ int exclude_sensitive;
+
+ /* If sib != NULL, this is drbd_bcast_event, which anyone can listen
+ * to. So we better exclude_sensitive information.
+ *
+ * If sib == NULL, this is drbd_adm_get_status, executed synchronously
+ * in the context of the requesting user process. Exclude sensitive
+ * information, unless current has superuser.
+ *
+ * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and
+ * relies on the current implementation of netlink_dump(), which
+ * executes the dump callback successively from netlink_recvmsg(),
+ * always in the context of the receiving process */
+ exclude_sensitive = sib || !capable(CAP_SYS_ADMIN);
+
+ got_ldev = get_ldev(device);
+
+ /* We need to add connection name and volume number information still.
+ * Minor number is in drbd_genlmsghdr. */
+ if (nla_put_drbd_cfg_context(skb, resource, the_only_connection(resource), device))
+ goto nla_put_failure;
+
+ if (res_opts_to_skb(skb, &device->resource->res_opts, exclude_sensitive))
+ goto nla_put_failure;
+
+ rcu_read_lock();
+ if (got_ldev) {
+ struct disk_conf *disk_conf;
+
+ disk_conf = rcu_dereference(device->ldev->disk_conf);
+ err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive);
+ }
+ if (!err) {
+ struct net_conf *nc;
+
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ if (nc)
+ err = net_conf_to_skb(skb, nc, exclude_sensitive);
+ }
+ rcu_read_unlock();
+ if (err)
+ goto nla_put_failure;
+
+ nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO);
+ if (!nla)
+ goto nla_put_failure;
+ if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) ||
+ nla_put_u32(skb, T_current_state, device->state.i) ||
+ nla_put_u64(skb, T_ed_uuid, device->ed_uuid) ||
+ nla_put_u64(skb, T_capacity, drbd_get_capacity(device->this_bdev)) ||
+ nla_put_u64(skb, T_send_cnt, device->send_cnt) ||
+ nla_put_u64(skb, T_recv_cnt, device->recv_cnt) ||
+ nla_put_u64(skb, T_read_cnt, device->read_cnt) ||
+ nla_put_u64(skb, T_writ_cnt, device->writ_cnt) ||
+ nla_put_u64(skb, T_al_writ_cnt, device->al_writ_cnt) ||
+ nla_put_u64(skb, T_bm_writ_cnt, device->bm_writ_cnt) ||
+ nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&device->ap_bio_cnt)) ||
+ nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&device->ap_pending_cnt)) ||
+ nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&device->rs_pending_cnt)))
+ goto nla_put_failure;
+
+ if (got_ldev) {
+ int err;
+
+ spin_lock_irq(&device->ldev->md.uuid_lock);
+ err = nla_put(skb, T_uuids, sizeof(si->uuids), device->ldev->md.uuid);
+ spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+ if (err)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, T_disk_flags, device->ldev->md.flags) ||
+ nla_put_u64(skb, T_bits_total, drbd_bm_bits(device)) ||
+ nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(device)))
+ goto nla_put_failure;
+ if (C_SYNC_SOURCE <= device->state.conn &&
+ C_PAUSED_SYNC_T >= device->state.conn) {
+ if (nla_put_u64(skb, T_bits_rs_total, device->rs_total) ||
+ nla_put_u64(skb, T_bits_rs_failed, device->rs_failed))
+ goto nla_put_failure;
+ }
+ }
+
+ if (sib) {
+ switch(sib->sib_reason) {
+ case SIB_SYNC_PROGRESS:
+ case SIB_GET_STATUS_REPLY:
+ break;
+ case SIB_STATE_CHANGE:
+ if (nla_put_u32(skb, T_prev_state, sib->os.i) ||
+ nla_put_u32(skb, T_new_state, sib->ns.i))
+ goto nla_put_failure;
+ break;
+ case SIB_HELPER_POST:
+ if (nla_put_u32(skb, T_helper_exit_code,
+ sib->helper_exit_code))
+ goto nla_put_failure;
+ /* fall through */
+ case SIB_HELPER_PRE:
+ if (nla_put_string(skb, T_helper, sib->helper_name))
+ goto nla_put_failure;
+ break;
+ }
+ }
+ nla_nest_end(skb, nla);
+
+ if (0)
+nla_put_failure:
+ err = -EMSGSIZE;
+ if (got_ldev)
+ put_ldev(device);
+ return err;
+}
+
+int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ int err;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.device, NULL);
+ if (err) {
+ nlmsg_free(adm_ctx.reply_skb);
+ return err;
+ }
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+static int get_one_status(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct drbd_device *device;
+ struct drbd_genlmsghdr *dh;
+ struct drbd_resource *pos = (struct drbd_resource *)cb->args[0];
+ struct drbd_resource *resource = NULL;
+ struct drbd_resource *tmp;
+ unsigned volume = cb->args[1];
+
+ /* Open coded, deferred, iteration:
+ * for_each_resource_safe(resource, tmp, &drbd_resources) {
+ * connection = "first connection of resource or undefined";
+ * idr_for_each_entry(&resource->devices, device, i) {
+ * ...
+ * }
+ * }
+ * where resource is cb->args[0];
+ * and i is cb->args[1];
+ *
+ * cb->args[2] indicates if we shall loop over all resources,
+ * or just dump all volumes of a single resource.
+ *
+ * This may miss entries inserted after this dump started,
+ * or entries deleted before they are reached.
+ *
+ * We need to make sure the device won't disappear while
+ * we are looking at it, and revalidate our iterators
+ * on each iteration.
+ */
+
+ /* synchronize with conn_create()/drbd_destroy_connection() */
+ rcu_read_lock();
+ /* revalidate iterator position */
+ for_each_resource_rcu(tmp, &drbd_resources) {
+ if (pos == NULL) {
+ /* first iteration */
+ pos = tmp;
+ resource = pos;
+ break;
+ }
+ if (tmp == pos) {
+ resource = pos;
+ break;
+ }
+ }
+ if (resource) {
+next_resource:
+ device = idr_get_next(&resource->devices, &volume);
+ if (!device) {
+ /* No more volumes to dump on this resource.
+ * Advance resource iterator. */
+ pos = list_entry_rcu(resource->resources.next,
+ struct drbd_resource, resources);
+ /* Did we dump any volume of this resource yet? */
+ if (volume != 0) {
+ /* If we reached the end of the list,
+ * or only a single resource dump was requested,
+ * we are done. */
+ if (&pos->resources == &drbd_resources || cb->args[2])
+ goto out;
+ volume = 0;
+ resource = pos;
+ goto next_resource;
+ }
+ }
+
+ dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, &drbd_genl_family,
+ NLM_F_MULTI, DRBD_ADM_GET_STATUS);
+ if (!dh)
+ goto out;
+
+ if (!device) {
+ /* This is a connection without a single volume.
+ * Suprisingly enough, it may have a network
+ * configuration. */
+ struct drbd_connection *connection;
+
+ dh->minor = -1U;
+ dh->ret_code = NO_ERROR;
+ connection = the_only_connection(resource);
+ if (nla_put_drbd_cfg_context(skb, resource, connection, NULL))
+ goto cancel;
+ if (connection) {
+ struct net_conf *nc;
+
+ nc = rcu_dereference(connection->net_conf);
+ if (nc && net_conf_to_skb(skb, nc, 1) != 0)
+ goto cancel;
+ }
+ goto done;
+ }
+
+ D_ASSERT(device, device->vnr == volume);
+ D_ASSERT(device, device->resource == resource);
+
+ dh->minor = device_to_minor(device);
+ dh->ret_code = NO_ERROR;
+
+ if (nla_put_status_info(skb, device, NULL)) {
+cancel:
+ genlmsg_cancel(skb, dh);
+ goto out;
+ }
+done:
+ genlmsg_end(skb, dh);
+ }
+
+out:
+ rcu_read_unlock();
+ /* where to start the next iteration */
+ cb->args[0] = (long)pos;
+ cb->args[1] = (pos == resource) ? volume + 1 : 0;
+
+ /* No more resources/volumes/minors found results in an empty skb.
+ * Which will terminate the dump. */
+ return skb->len;
+}
+
+/*
+ * Request status of all resources, or of all volumes within a single resource.
+ *
+ * This is a dump, as the answer may not fit in a single reply skb otherwise.
+ * Which means we cannot use the family->attrbuf or other such members, because
+ * dump is NOT protected by the genl_lock(). During dump, we only have access
+ * to the incoming skb, and need to opencode "parsing" of the nlattr payload.
+ *
+ * Once things are setup properly, we call into get_one_status().
+ */
+int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
+ struct nlattr *nla;
+ const char *resource_name;
+ struct drbd_resource *resource;
+ int maxtype;
+
+ /* Is this a followup call? */
+ if (cb->args[0]) {
+ /* ... of a single resource dump,
+ * and the resource iterator has been advanced already? */
+ if (cb->args[2] && cb->args[2] != cb->args[0])
+ return 0; /* DONE. */
+ goto dump;
+ }
+
+ /* First call (from netlink_dump_start). We need to figure out
+ * which resource(s) the user wants us to dump. */
+ nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen),
+ nlmsg_attrlen(cb->nlh, hdrlen),
+ DRBD_NLA_CFG_CONTEXT);
+
+ /* No explicit context given. Dump all. */
+ if (!nla)
+ goto dump;
+ maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
+ nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name));
+ if (IS_ERR(nla))
+ return PTR_ERR(nla);
+ /* context given, but no name present? */
+ if (!nla)
+ return -EINVAL;
+ resource_name = nla_data(nla);
+ if (!*resource_name)
+ return -ENODEV;
+ resource = drbd_find_resource(resource_name);
+ if (!resource)
+ return -ENODEV;
+
+ kref_put(&resource->kref, drbd_destroy_resource); /* get_one_status() revalidates the resource */
+
+ /* prime iterators, and set "filter" mode mark:
+ * only dump this connection. */
+ cb->args[0] = (long)resource;
+ /* cb->args[1] = 0; passed in this way. */
+ cb->args[2] = (long)resource;
+
+dump:
+ return get_one_status(skb, cb);
+}
+
+int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ struct timeout_parms tp;
+ int err;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ tp.timeout_type =
+ adm_ctx.device->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
+ test_bit(USE_DEGR_WFC_T, &adm_ctx.device->flags) ? UT_DEGRADED :
+ UT_DEFAULT;
+
+ err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp);
+ if (err) {
+ nlmsg_free(adm_ctx.reply_skb);
+ return err;
+ }
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_device *device;
+ enum drbd_ret_code retcode;
+ struct start_ov_parms parms;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ device = adm_ctx.device;
+
+ /* resume from last known position, if possible */
+ parms.ov_start_sector = device->ov_start_sector;
+ parms.ov_stop_sector = ULLONG_MAX;
+ if (info->attrs[DRBD_NLA_START_OV_PARMS]) {
+ int err = start_ov_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto out;
+ }
+ }
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+
+ /* w_make_ov_request expects position to be aligned */
+ device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1);
+ device->ov_stop_sector = parms.ov_stop_sector;
+
+ /* If there is still bitmap IO pending, e.g. previous resync or verify
+ * just being finished, wait for it before requesting a new resync. */
+ drbd_suspend_io(device);
+ wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+ retcode = drbd_request_state(device, NS(conn, C_VERIFY_S));
+ drbd_resume_io(device);
+
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+
+int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_device *device;
+ enum drbd_ret_code retcode;
+ int skip_initial_sync = 0;
+ int err;
+ struct new_c_uuid_parms args;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out_nolock;
+
+ device = adm_ctx.device;
+ memset(&args, 0, sizeof(args));
+ if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) {
+ err = new_c_uuid_parms_from_attrs(&args, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto out_nolock;
+ }
+ }
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ mutex_lock(device->state_mutex); /* Protects us against serialized state changes. */
+
+ if (!get_ldev(device)) {
+ retcode = ERR_NO_DISK;
+ goto out;
+ }
+
+ /* this is "skip initial sync", assume to be clean */
+ if (device->state.conn == C_CONNECTED &&
+ first_peer_device(device)->connection->agreed_pro_version >= 90 &&
+ device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
+ drbd_info(device, "Preparing to skip initial sync\n");
+ skip_initial_sync = 1;
+ } else if (device->state.conn != C_STANDALONE) {
+ retcode = ERR_CONNECTED;
+ goto out_dec;
+ }
+
+ drbd_uuid_set(device, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
+ drbd_uuid_new_current(device); /* New current, previous to UI_BITMAP */
+
+ if (args.clear_bm) {
+ err = drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
+ "clear_n_write from new_c_uuid", BM_LOCKED_MASK);
+ if (err) {
+ drbd_err(device, "Writing bitmap failed with %d\n", err);
+ retcode = ERR_IO_MD_DISK;
+ }
+ if (skip_initial_sync) {
+ drbd_send_uuids_skip_initial_sync(first_peer_device(device));
+ _drbd_uuid_set(device, UI_BITMAP, 0);
+ drbd_print_uuids(device, "cleared bitmap UUID");
+ spin_lock_irq(&device->resource->req_lock);
+ _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
+ CS_VERBOSE, NULL);
+ spin_unlock_irq(&device->resource->req_lock);
+ }
+ }
+
+ drbd_md_sync(device);
+out_dec:
+ put_ldev(device);
+out:
+ mutex_unlock(device->state_mutex);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out_nolock:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+static enum drbd_ret_code
+drbd_check_resource_name(struct drbd_config_context *adm_ctx)
+{
+ const char *name = adm_ctx->resource_name;
+ if (!name || !name[0]) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "resource name missing");
+ return ERR_MANDATORY_TAG;
+ }
+ /* if we want to use these in sysfs/configfs/debugfs some day,
+ * we must not allow slashes */
+ if (strchr(name, '/')) {
+ drbd_msg_put_info(adm_ctx->reply_skb, "invalid resource name");
+ return ERR_INVALID_REQUEST;
+ }
+ return NO_ERROR;
+}
+
+int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+ struct res_opts res_opts;
+ int err;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, 0);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ set_res_opts_defaults(&res_opts);
+ err = res_opts_from_attrs(&res_opts, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+ goto out;
+ }
+
+ retcode = drbd_check_resource_name(&adm_ctx);
+ if (retcode != NO_ERROR)
+ goto out;
+
+ if (adm_ctx.resource) {
+ if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) {
+ retcode = ERR_INVALID_REQUEST;
+ drbd_msg_put_info(adm_ctx.reply_skb, "resource exists");
+ }
+ /* else: still NO_ERROR */
+ goto out;
+ }
+
+ /* not yet safe for genl_family.parallel_ops */
+ if (!conn_create(adm_ctx.resource_name, &res_opts))
+ retcode = ERR_NOMEM;
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_genlmsghdr *dh = info->userhdr;
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ if (dh->minor > MINORMASK) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "requested minor out of range");
+ retcode = ERR_INVALID_REQUEST;
+ goto out;
+ }
+ if (adm_ctx.volume > DRBD_VOLUME_MAX) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "requested volume id out of range");
+ retcode = ERR_INVALID_REQUEST;
+ goto out;
+ }
+
+ /* drbd_adm_prepare made sure already
+ * that first_peer_device(device)->connection and device->vnr match the request. */
+ if (adm_ctx.device) {
+ if (info->nlhdr->nlmsg_flags & NLM_F_EXCL)
+ retcode = ERR_MINOR_OR_VOLUME_EXISTS;
+ /* else: still NO_ERROR */
+ goto out;
+ }
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ retcode = drbd_create_device(&adm_ctx, dh->minor);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+static enum drbd_ret_code adm_del_minor(struct drbd_device *device)
+{
+ if (device->state.disk == D_DISKLESS &&
+ /* no need to be device->state.conn == C_STANDALONE &&
+ * we may want to delete a minor from a live replication group.
+ */
+ device->state.role == R_SECONDARY) {
+ _drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS),
+ CS_VERBOSE + CS_WAIT_COMPLETE);
+ drbd_delete_device(device);
+ return NO_ERROR;
+ } else
+ return ERR_MINOR_CONFIGURED;
+}
+
+int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mutex_lock(&adm_ctx.resource->adm_mutex);
+ retcode = adm_del_minor(adm_ctx.device);
+ mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+static int adm_del_resource(struct drbd_resource *resource)
+{
+ struct drbd_connection *connection;
+
+ for_each_connection(connection, resource) {
+ if (connection->cstate > C_STANDALONE)
+ return ERR_NET_CONFIGURED;
+ }
+ if (!idr_is_empty(&resource->devices))
+ return ERR_RES_IN_USE;
+
+ list_del_rcu(&resource->resources);
+ /* Make sure all threads have actually stopped: state handling only
+ * does drbd_thread_stop_nowait(). */
+ list_for_each_entry(connection, &resource->connections, connections)
+ drbd_thread_stop(&connection->worker);
+ synchronize_rcu();
+ drbd_free_resource(resource);
+ return NO_ERROR;
+}
+
+int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_resource *resource;
+ struct drbd_connection *connection;
+ struct drbd_device *device;
+ int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+ unsigned i;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
+
+ resource = adm_ctx.resource;
+ mutex_lock(&resource->adm_mutex);
+ /* demote */
+ for_each_connection(connection, resource) {
+ struct drbd_peer_device *peer_device;
+
+ idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+ retcode = drbd_set_role(peer_device->device, R_SECONDARY, 0);
+ if (retcode < SS_SUCCESS) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "failed to demote");
+ goto out;
+ }
+ }
+
+ retcode = conn_try_disconnect(connection, 0);
+ if (retcode < SS_SUCCESS) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "failed to disconnect");
+ goto out;
+ }
+ }
+
+ /* detach */
+ idr_for_each_entry(&resource->devices, device, i) {
+ retcode = adm_detach(device, 0);
+ if (retcode < SS_SUCCESS || retcode > NO_ERROR) {
+ drbd_msg_put_info(adm_ctx.reply_skb, "failed to detach");
+ goto out;
+ }
+ }
+
+ /* delete volumes */
+ idr_for_each_entry(&resource->devices, device, i) {
+ retcode = adm_del_minor(device);
+ if (retcode != NO_ERROR) {
+ /* "can not happen" */
+ drbd_msg_put_info(adm_ctx.reply_skb, "failed to delete volume");
+ goto out;
+ }
+ }
+
+ retcode = adm_del_resource(resource);
+out:
+ mutex_unlock(&resource->adm_mutex);
+finish:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_config_context adm_ctx;
+ struct drbd_resource *resource;
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
+ resource = adm_ctx.resource;
+
+ mutex_lock(&resource->adm_mutex);
+ retcode = adm_del_resource(resource);
+ mutex_unlock(&resource->adm_mutex);
+finish:
+ drbd_adm_finish(&adm_ctx, info, retcode);
+ return 0;
+}
+
+void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
+{
+ static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
+ struct sk_buff *msg;
+ struct drbd_genlmsghdr *d_out;
+ unsigned seq;
+ int err = -ENOMEM;
+
+ seq = atomic_inc_return(&drbd_genl_seq);
+ msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+ if (!msg)
+ goto failed;
+
+ err = -EMSGSIZE;
+ d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT);
+ if (!d_out) /* cannot happen, but anyways. */
+ goto nla_put_failure;
+ d_out->minor = device_to_minor(device);
+ d_out->ret_code = NO_ERROR;
+
+ if (nla_put_status_info(msg, device, sib))
+ goto nla_put_failure;
+ genlmsg_end(msg, d_out);
+ err = drbd_genl_multicast_events(msg, 0);
+ /* msg has been consumed or freed in netlink_broadcast() */
+ if (err && err != -ESRCH)
+ goto failed;
+
+ return;
+
+nla_put_failure:
+ nlmsg_free(msg);
+failed:
+ drbd_err(device, "Error %d while broadcasting event. "
+ "Event seq:%u sib_reason:%u\n",
+ err, seq, sib->sib_reason);
+}
diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c
new file mode 100644
index 000000000..b2d479149
--- /dev/null
+++ b/drivers/block/drbd/drbd_nla.c
@@ -0,0 +1,54 @@
+#include <linux/kernel.h>
+#include <net/netlink.h>
+#include <linux/drbd_genl_api.h>
+#include "drbd_nla.h"
+
+static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla)
+{
+ struct nlattr *head = nla_data(nla);
+ int len = nla_len(nla);
+ int rem;
+
+ /*
+ * validate_nla (called from nla_parse_nested) ignores attributes
+ * beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag.
+ * In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY
+ * flag set also, check and remove that flag before calling
+ * nla_parse_nested.
+ */
+
+ nla_for_each_attr(nla, head, len, rem) {
+ if (nla->nla_type & DRBD_GENLA_F_MANDATORY) {
+ nla->nla_type &= ~DRBD_GENLA_F_MANDATORY;
+ if (nla_type(nla) > maxtype)
+ return -EOPNOTSUPP;
+ }
+ }
+ return 0;
+}
+
+int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+ const struct nla_policy *policy)
+{
+ int err;
+
+ err = drbd_nla_check_mandatory(maxtype, nla);
+ if (!err)
+ err = nla_parse_nested(tb, maxtype, nla, policy);
+
+ return err;
+}
+
+struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype)
+{
+ int err;
+ /*
+ * If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and
+ * we don't know about that attribute, reject all the nested
+ * attributes.
+ */
+ err = drbd_nla_check_mandatory(maxtype, nla);
+ if (err)
+ return ERR_PTR(err);
+ return nla_find_nested(nla, attrtype);
+}
diff --git a/drivers/block/drbd/drbd_nla.h b/drivers/block/drbd/drbd_nla.h
new file mode 100644
index 000000000..679c2d5b4
--- /dev/null
+++ b/drivers/block/drbd/drbd_nla.h
@@ -0,0 +1,8 @@
+#ifndef __DRBD_NLA_H
+#define __DRBD_NLA_H
+
+extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+ const struct nla_policy *policy);
+extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype);
+
+#endif /* __DRBD_NLA_H */
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
new file mode 100644
index 000000000..3b10fa6cb
--- /dev/null
+++ b/drivers/block/drbd/drbd_proc.c
@@ -0,0 +1,368 @@
+/*
+ drbd_proc.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+
+static int drbd_proc_open(struct inode *inode, struct file *file);
+static int drbd_proc_release(struct inode *inode, struct file *file);
+
+
+struct proc_dir_entry *drbd_proc;
+const struct file_operations drbd_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = drbd_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = drbd_proc_release,
+};
+
+static void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
+{
+ /* v is in kB/sec. We don't expect TiByte/sec yet. */
+ if (unlikely(v >= 1000000)) {
+ /* cool: > GiByte/s */
+ seq_printf(seq, "%ld,", v / 1000000);
+ v %= 1000000;
+ seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000);
+ } else if (likely(v >= 1000))
+ seq_printf(seq, "%ld,%03ld", v/1000, v % 1000);
+ else
+ seq_printf(seq, "%ld", v);
+}
+
+static void drbd_get_syncer_progress(struct drbd_device *device,
+ union drbd_dev_state state, unsigned long *rs_total,
+ unsigned long *bits_left, unsigned int *per_mil_done)
+{
+ /* this is to break it at compile time when we change that, in case we
+ * want to support more than (1<<32) bits on a 32bit arch. */
+ typecheck(unsigned long, device->rs_total);
+ *rs_total = device->rs_total;
+
+ /* note: both rs_total and rs_left are in bits, i.e. in
+ * units of BM_BLOCK_SIZE.
+ * for the percentage, we don't care. */
+
+ if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
+ *bits_left = device->ov_left;
+ else
+ *bits_left = drbd_bm_total_weight(device) - device->rs_failed;
+ /* >> 10 to prevent overflow,
+ * +1 to prevent division by zero */
+ if (*bits_left > *rs_total) {
+ /* D'oh. Maybe a logic bug somewhere. More likely just a race
+ * between state change and reset of rs_total.
+ */
+ *bits_left = *rs_total;
+ *per_mil_done = *rs_total ? 0 : 1000;
+ } else {
+ /* Make sure the division happens in long context.
+ * We allow up to one petabyte storage right now,
+ * at a granularity of 4k per bit that is 2**38 bits.
+ * After shift right and multiplication by 1000,
+ * this should still fit easily into a 32bit long,
+ * so we don't need a 64bit division on 32bit arch.
+ * Note: currently we don't support such large bitmaps on 32bit
+ * arch anyways, but no harm done to be prepared for it here.
+ */
+ unsigned int shift = *rs_total > UINT_MAX ? 16 : 10;
+ unsigned long left = *bits_left >> shift;
+ unsigned long total = 1UL + (*rs_total >> shift);
+ unsigned long tmp = 1000UL - left * 1000UL/total;
+ *per_mil_done = tmp;
+ }
+}
+
+
+/*lge
+ * progress bars shamelessly adapted from driver/md/md.c
+ * output looks like
+ * [=====>..............] 33.5% (23456/123456)
+ * finish: 2:20:20 speed: 6,345 (6,456) K/sec
+ */
+static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq,
+ union drbd_dev_state state)
+{
+ unsigned long db, dt, dbdt, rt, rs_total, rs_left;
+ unsigned int res;
+ int i, x, y;
+ int stalled = 0;
+
+ drbd_get_syncer_progress(device, state, &rs_total, &rs_left, &res);
+
+ x = res/50;
+ y = 20-x;
+ seq_printf(seq, "\t[");
+ for (i = 1; i < x; i++)
+ seq_printf(seq, "=");
+ seq_printf(seq, ">");
+ for (i = 0; i < y; i++)
+ seq_printf(seq, ".");
+ seq_printf(seq, "] ");
+
+ if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
+ seq_printf(seq, "verified:");
+ else
+ seq_printf(seq, "sync'ed:");
+ seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
+
+ /* if more than a few GB, display in MB */
+ if (rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
+ seq_printf(seq, "(%lu/%lu)M",
+ (unsigned long) Bit2KB(rs_left >> 10),
+ (unsigned long) Bit2KB(rs_total >> 10));
+ else
+ seq_printf(seq, "(%lu/%lu)K",
+ (unsigned long) Bit2KB(rs_left),
+ (unsigned long) Bit2KB(rs_total));
+
+ seq_printf(seq, "\n\t");
+
+ /* see drivers/md/md.c
+ * We do not want to overflow, so the order of operands and
+ * the * 100 / 100 trick are important. We do a +1 to be
+ * safe against division by zero. We only estimate anyway.
+ *
+ * dt: time from mark until now
+ * db: blocks written from mark until now
+ * rt: remaining time
+ */
+ /* Rolling marks. last_mark+1 may just now be modified. last_mark+2 is
+ * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at
+ * least DRBD_SYNC_MARK_STEP time before it will be modified. */
+ /* ------------------------ ~18s average ------------------------ */
+ i = (device->rs_last_mark + 2) % DRBD_SYNC_MARKS;
+ dt = (jiffies - device->rs_mark_time[i]) / HZ;
+ if (dt > 180)
+ stalled = 1;
+
+ if (!dt)
+ dt++;
+ db = device->rs_mark_left[i] - rs_left;
+ rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
+
+ seq_printf(seq, "finish: %lu:%02lu:%02lu",
+ rt / 3600, (rt % 3600) / 60, rt % 60);
+
+ dbdt = Bit2KB(db/dt);
+ seq_printf(seq, " speed: ");
+ seq_printf_with_thousands_grouping(seq, dbdt);
+ seq_printf(seq, " (");
+ /* ------------------------- ~3s average ------------------------ */
+ if (proc_details >= 1) {
+ /* this is what drbd_rs_should_slow_down() uses */
+ i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
+ dt = (jiffies - device->rs_mark_time[i]) / HZ;
+ if (!dt)
+ dt++;
+ db = device->rs_mark_left[i] - rs_left;
+ dbdt = Bit2KB(db/dt);
+ seq_printf_with_thousands_grouping(seq, dbdt);
+ seq_printf(seq, " -- ");
+ }
+
+ /* --------------------- long term average ---------------------- */
+ /* mean speed since syncer started
+ * we do account for PausedSync periods */
+ dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
+ if (dt == 0)
+ dt = 1;
+ db = rs_total - rs_left;
+ dbdt = Bit2KB(db/dt);
+ seq_printf_with_thousands_grouping(seq, dbdt);
+ seq_printf(seq, ")");
+
+ if (state.conn == C_SYNC_TARGET ||
+ state.conn == C_VERIFY_S) {
+ seq_printf(seq, " want: ");
+ seq_printf_with_thousands_grouping(seq, device->c_sync_rate);
+ }
+ seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
+
+ if (proc_details >= 1) {
+ /* 64 bit:
+ * we convert to sectors in the display below. */
+ unsigned long bm_bits = drbd_bm_bits(device);
+ unsigned long bit_pos;
+ unsigned long long stop_sector = 0;
+ if (state.conn == C_VERIFY_S ||
+ state.conn == C_VERIFY_T) {
+ bit_pos = bm_bits - device->ov_left;
+ if (verify_can_do_stop_sector(device))
+ stop_sector = device->ov_stop_sector;
+ } else
+ bit_pos = device->bm_resync_fo;
+ /* Total sectors may be slightly off for oddly
+ * sized devices. So what. */
+ seq_printf(seq,
+ "\t%3d%% sector pos: %llu/%llu",
+ (int)(bit_pos / (bm_bits/100+1)),
+ (unsigned long long)bit_pos * BM_SECT_PER_BIT,
+ (unsigned long long)bm_bits * BM_SECT_PER_BIT);
+ if (stop_sector != 0 && stop_sector != ULLONG_MAX)
+ seq_printf(seq, " stop sector: %llu", stop_sector);
+ seq_printf(seq, "\n");
+ }
+}
+
+static int drbd_seq_show(struct seq_file *seq, void *v)
+{
+ int i, prev_i = -1;
+ const char *sn;
+ struct drbd_device *device;
+ struct net_conf *nc;
+ union drbd_dev_state state;
+ char wp;
+
+ static char write_ordering_chars[] = {
+ [WO_none] = 'n',
+ [WO_drain_io] = 'd',
+ [WO_bdev_flush] = 'f',
+ };
+
+ seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
+ API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag());
+
+ /*
+ cs .. connection state
+ ro .. node role (local/remote)
+ ds .. disk state (local/remote)
+ protocol
+ various flags
+ ns .. network send
+ nr .. network receive
+ dw .. disk write
+ dr .. disk read
+ al .. activity log write count
+ bm .. bitmap update write count
+ pe .. pending (waiting for ack or data reply)
+ ua .. unack'd (still need to send ack or data reply)
+ ap .. application requests accepted, but not yet completed
+ ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending
+ wo .. write ordering mode currently in use
+ oos .. known out-of-sync kB
+ */
+
+ rcu_read_lock();
+ idr_for_each_entry(&drbd_devices, device, i) {
+ if (prev_i != i - 1)
+ seq_printf(seq, "\n");
+ prev_i = i;
+
+ state = device->state;
+ sn = drbd_conn_str(state.conn);
+
+ if (state.conn == C_STANDALONE &&
+ state.disk == D_DISKLESS &&
+ state.role == R_SECONDARY) {
+ seq_printf(seq, "%2d: cs:Unconfigured\n", i);
+ } else {
+ /* reset device->congestion_reason */
+ bdi_rw_congested(&device->rq_queue->backing_dev_info);
+
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' ';
+ seq_printf(seq,
+ "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
+ " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
+ "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
+ i, sn,
+ drbd_role_str(state.role),
+ drbd_role_str(state.peer),
+ drbd_disk_str(state.disk),
+ drbd_disk_str(state.pdsk),
+ wp,
+ drbd_suspended(device) ? 's' : 'r',
+ state.aftr_isp ? 'a' : '-',
+ state.peer_isp ? 'p' : '-',
+ state.user_isp ? 'u' : '-',
+ device->congestion_reason ?: '-',
+ test_bit(AL_SUSPENDED, &device->flags) ? 's' : '-',
+ device->send_cnt/2,
+ device->recv_cnt/2,
+ device->writ_cnt/2,
+ device->read_cnt/2,
+ device->al_writ_cnt,
+ device->bm_writ_cnt,
+ atomic_read(&device->local_cnt),
+ atomic_read(&device->ap_pending_cnt) +
+ atomic_read(&device->rs_pending_cnt),
+ atomic_read(&device->unacked_cnt),
+ atomic_read(&device->ap_bio_cnt),
+ first_peer_device(device)->connection->epochs,
+ write_ordering_chars[device->resource->write_ordering]
+ );
+ seq_printf(seq, " oos:%llu\n",
+ Bit2KB((unsigned long long)
+ drbd_bm_total_weight(device)));
+ }
+ if (state.conn == C_SYNC_SOURCE ||
+ state.conn == C_SYNC_TARGET ||
+ state.conn == C_VERIFY_S ||
+ state.conn == C_VERIFY_T)
+ drbd_syncer_progress(device, seq, state);
+
+ if (proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) {
+ lc_seq_printf_stats(seq, device->resync);
+ lc_seq_printf_stats(seq, device->act_log);
+ put_ldev(device);
+ }
+
+ if (proc_details >= 2)
+ seq_printf(seq, "\tblocked on activity log: %d\n", atomic_read(&device->ap_actlog_cnt));
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int drbd_proc_open(struct inode *inode, struct file *file)
+{
+ int err;
+
+ if (try_module_get(THIS_MODULE)) {
+ err = single_open(file, drbd_seq_show, NULL);
+ if (err)
+ module_put(THIS_MODULE);
+ return err;
+ }
+ return -ENODEV;
+}
+
+static int drbd_proc_release(struct inode *inode, struct file *file)
+{
+ module_put(THIS_MODULE);
+ return single_release(inode, file);
+}
+
+/* PROC FS stuff end */
diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h
new file mode 100644
index 000000000..2da9104a3
--- /dev/null
+++ b/drivers/block/drbd/drbd_protocol.h
@@ -0,0 +1,307 @@
+#ifndef __DRBD_PROTOCOL_H
+#define __DRBD_PROTOCOL_H
+
+enum drbd_packet {
+ /* receiver (data socket) */
+ P_DATA = 0x00,
+ P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */
+ P_RS_DATA_REPLY = 0x02, /* Response to P_RS_DATA_REQUEST */
+ P_BARRIER = 0x03,
+ P_BITMAP = 0x04,
+ P_BECOME_SYNC_TARGET = 0x05,
+ P_BECOME_SYNC_SOURCE = 0x06,
+ P_UNPLUG_REMOTE = 0x07, /* Used at various times to hint the peer */
+ P_DATA_REQUEST = 0x08, /* Used to ask for a data block */
+ P_RS_DATA_REQUEST = 0x09, /* Used to ask for a data block for resync */
+ P_SYNC_PARAM = 0x0a,
+ P_PROTOCOL = 0x0b,
+ P_UUIDS = 0x0c,
+ P_SIZES = 0x0d,
+ P_STATE = 0x0e,
+ P_SYNC_UUID = 0x0f,
+ P_AUTH_CHALLENGE = 0x10,
+ P_AUTH_RESPONSE = 0x11,
+ P_STATE_CHG_REQ = 0x12,
+
+ /* asender (meta socket */
+ P_PING = 0x13,
+ P_PING_ACK = 0x14,
+ P_RECV_ACK = 0x15, /* Used in protocol B */
+ P_WRITE_ACK = 0x16, /* Used in protocol C */
+ P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
+ P_SUPERSEDED = 0x18, /* Used in proto C, two-primaries conflict detection */
+ P_NEG_ACK = 0x19, /* Sent if local disk is unusable */
+ P_NEG_DREPLY = 0x1a, /* Local disk is broken... */
+ P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */
+ P_BARRIER_ACK = 0x1c,
+ P_STATE_CHG_REPLY = 0x1d,
+
+ /* "new" commands, no longer fitting into the ordering scheme above */
+
+ P_OV_REQUEST = 0x1e, /* data socket */
+ P_OV_REPLY = 0x1f,
+ P_OV_RESULT = 0x20, /* meta socket */
+ P_CSUM_RS_REQUEST = 0x21, /* data socket */
+ P_RS_IS_IN_SYNC = 0x22, /* meta socket */
+ P_SYNC_PARAM89 = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
+ P_COMPRESSED_BITMAP = 0x24, /* compressed or otherwise encoded bitmap transfer */
+ /* P_CKPT_FENCE_REQ = 0x25, * currently reserved for protocol D */
+ /* P_CKPT_DISABLE_REQ = 0x26, * currently reserved for protocol D */
+ P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */
+ P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */
+ P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */
+ P_CONN_ST_CHG_REQ = 0x2a, /* data sock: Connection wide state request */
+ P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */
+ P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */
+ P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */
+ /* 0x2e to 0x30 reserved, used in drbd 9 */
+
+ /* REQ_DISCARD. We used "discard" in different contexts before,
+ * which is why I chose TRIM here, to disambiguate. */
+ P_TRIM = 0x31,
+
+ P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
+ P_MAX_OPT_CMD = 0x101,
+
+ /* special command ids for handshake */
+
+ P_INITIAL_META = 0xfff1, /* First Packet on the MetaSock */
+ P_INITIAL_DATA = 0xfff2, /* First Packet on the Socket */
+
+ P_CONNECTION_FEATURES = 0xfffe /* FIXED for the next century! */
+};
+
+#ifndef __packed
+#define __packed __attribute__((packed))
+#endif
+
+/* This is the layout for a packet on the wire.
+ * The byteorder is the network byte order.
+ * (except block_id and barrier fields.
+ * these are pointers to local structs
+ * and have no relevance for the partner,
+ * which just echoes them as received.)
+ *
+ * NOTE that the payload starts at a long aligned offset,
+ * regardless of 32 or 64 bit arch!
+ */
+struct p_header80 {
+ u32 magic;
+ u16 command;
+ u16 length; /* bytes of data after this header */
+} __packed;
+
+/* Header for big packets, Used for data packets exceeding 64kB */
+struct p_header95 {
+ u16 magic; /* use DRBD_MAGIC_BIG here */
+ u16 command;
+ u32 length;
+} __packed;
+
+struct p_header100 {
+ u32 magic;
+ u16 volume;
+ u16 command;
+ u32 length;
+ u32 pad;
+} __packed;
+
+/* these defines must not be changed without changing the protocol version */
+#define DP_HARDBARRIER 1 /* depricated */
+#define DP_RW_SYNC 2 /* equals REQ_SYNC */
+#define DP_MAY_SET_IN_SYNC 4
+#define DP_UNPLUG 8 /* not used anymore */
+#define DP_FUA 16 /* equals REQ_FUA */
+#define DP_FLUSH 32 /* equals REQ_FLUSH */
+#define DP_DISCARD 64 /* equals REQ_DISCARD */
+#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */
+#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */
+
+struct p_data {
+ u64 sector; /* 64 bits sector number */
+ u64 block_id; /* to identify the request in protocol B&C */
+ u32 seq_num;
+ u32 dp_flags;
+} __packed;
+
+struct p_trim {
+ struct p_data p_data;
+ u32 size; /* == bio->bi_size */
+} __packed;
+
+/*
+ * commands which share a struct:
+ * p_block_ack:
+ * P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
+ * P_SUPERSEDED (proto C, two-primaries conflict detection)
+ * p_block_req:
+ * P_DATA_REQUEST, P_RS_DATA_REQUEST
+ */
+struct p_block_ack {
+ u64 sector;
+ u64 block_id;
+ u32 blksize;
+ u32 seq_num;
+} __packed;
+
+struct p_block_req {
+ u64 sector;
+ u64 block_id;
+ u32 blksize;
+ u32 pad; /* to multiple of 8 Byte */
+} __packed;
+
+/*
+ * commands with their own struct for additional fields:
+ * P_CONNECTION_FEATURES
+ * P_BARRIER
+ * P_BARRIER_ACK
+ * P_SYNC_PARAM
+ * ReportParams
+ */
+
+#define FF_TRIM 1
+
+struct p_connection_features {
+ u32 protocol_min;
+ u32 feature_flags;
+ u32 protocol_max;
+
+ /* should be more than enough for future enhancements
+ * for now, feature_flags and the reserved array shall be zero.
+ */
+
+ u32 _pad;
+ u64 reserved[7];
+} __packed;
+
+struct p_barrier {
+ u32 barrier; /* barrier number _handle_ only */
+ u32 pad; /* to multiple of 8 Byte */
+} __packed;
+
+struct p_barrier_ack {
+ u32 barrier;
+ u32 set_size;
+} __packed;
+
+struct p_rs_param {
+ u32 resync_rate;
+
+ /* Since protocol version 88 and higher. */
+ char verify_alg[0];
+} __packed;
+
+struct p_rs_param_89 {
+ u32 resync_rate;
+ /* protocol version 89: */
+ char verify_alg[SHARED_SECRET_MAX];
+ char csums_alg[SHARED_SECRET_MAX];
+} __packed;
+
+struct p_rs_param_95 {
+ u32 resync_rate;
+ char verify_alg[SHARED_SECRET_MAX];
+ char csums_alg[SHARED_SECRET_MAX];
+ u32 c_plan_ahead;
+ u32 c_delay_target;
+ u32 c_fill_target;
+ u32 c_max_rate;
+} __packed;
+
+enum drbd_conn_flags {
+ CF_DISCARD_MY_DATA = 1,
+ CF_DRY_RUN = 2,
+};
+
+struct p_protocol {
+ u32 protocol;
+ u32 after_sb_0p;
+ u32 after_sb_1p;
+ u32 after_sb_2p;
+ u32 conn_flags;
+ u32 two_primaries;
+
+ /* Since protocol version 87 and higher. */
+ char integrity_alg[0];
+
+} __packed;
+
+struct p_uuids {
+ u64 uuid[UI_EXTENDED_SIZE];
+} __packed;
+
+struct p_rs_uuid {
+ u64 uuid;
+} __packed;
+
+struct p_sizes {
+ u64 d_size; /* size of disk */
+ u64 u_size; /* user requested size */
+ u64 c_size; /* current exported size */
+ u32 max_bio_size; /* Maximal size of a BIO */
+ u16 queue_order_type; /* not yet implemented in DRBD*/
+ u16 dds_flags; /* use enum dds_flags here. */
+} __packed;
+
+struct p_state {
+ u32 state;
+} __packed;
+
+struct p_req_state {
+ u32 mask;
+ u32 val;
+} __packed;
+
+struct p_req_state_reply {
+ u32 retcode;
+} __packed;
+
+struct p_drbd06_param {
+ u64 size;
+ u32 state;
+ u32 blksize;
+ u32 protocol;
+ u32 version;
+ u32 gen_cnt[5];
+ u32 bit_map_gen[5];
+} __packed;
+
+struct p_block_desc {
+ u64 sector;
+ u32 blksize;
+ u32 pad; /* to multiple of 8 Byte */
+} __packed;
+
+/* Valid values for the encoding field.
+ * Bump proto version when changing this. */
+enum drbd_bitmap_code {
+ /* RLE_VLI_Bytes = 0,
+ * and other bit variants had been defined during
+ * algorithm evaluation. */
+ RLE_VLI_Bits = 2,
+};
+
+struct p_compressed_bm {
+ /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
+ * (encoding & 0x80): polarity (set/unset) of first runlength
+ * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
+ * used to pad up to head.length bytes
+ */
+ u8 encoding;
+
+ u8 code[0];
+} __packed;
+
+struct p_delay_probe93 {
+ u32 seq_num; /* sequence number to match the two probe packets */
+ u32 offset; /* usecs the probe got sent after the reference time point */
+} __packed;
+
+/*
+ * Bitmap packets need to fit within a single page on the sender and receiver,
+ * so we are limited to 4 KiB (and not to PAGE_SIZE, which can be bigger).
+ */
+#define DRBD_SOCKET_BUFFER_SIZE 4096
+
+#endif /* __DRBD_PROTOCOL_H */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
new file mode 100644
index 000000000..cee20354a
--- /dev/null
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -0,0 +1,5661 @@
+/*
+ drbd_receiver.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+#include <linux/module.h>
+
+#include <asm/uaccess.h>
+#include <net/sock.h>
+
+#include <linux/drbd.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/pkt_sched.h>
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+#include <linux/vmalloc.h>
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h"
+#include "drbd_vli.h"
+
+#define PRO_FEATURES (FF_TRIM)
+
+struct packet_info {
+ enum drbd_packet cmd;
+ unsigned int size;
+ unsigned int vnr;
+ void *data;
+};
+
+enum finish_epoch {
+ FE_STILL_LIVE,
+ FE_DESTROYED,
+ FE_RECYCLED,
+};
+
+static int drbd_do_features(struct drbd_connection *connection);
+static int drbd_do_auth(struct drbd_connection *connection);
+static int drbd_disconnected(struct drbd_peer_device *);
+static void conn_wait_active_ee_empty(struct drbd_connection *connection);
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
+static int e_end_block(struct drbd_work *, int);
+
+
+#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
+
+/*
+ * some helper functions to deal with single linked page lists,
+ * page->private being our "next" pointer.
+ */
+
+/* If at least n pages are linked at head, get n pages off.
+ * Otherwise, don't modify head, and return NULL.
+ * Locking is the responsibility of the caller.
+ */
+static struct page *page_chain_del(struct page **head, int n)
+{
+ struct page *page;
+ struct page *tmp;
+
+ BUG_ON(!n);
+ BUG_ON(!head);
+
+ page = *head;
+
+ if (!page)
+ return NULL;
+
+ while (page) {
+ tmp = page_chain_next(page);
+ if (--n == 0)
+ break; /* found sufficient pages */
+ if (tmp == NULL)
+ /* insufficient pages, don't use any of them. */
+ return NULL;
+ page = tmp;
+ }
+
+ /* add end of list marker for the returned list */
+ set_page_private(page, 0);
+ /* actual return value, and adjustment of head */
+ page = *head;
+ *head = tmp;
+ return page;
+}
+
+/* may be used outside of locks to find the tail of a (usually short)
+ * "private" page chain, before adding it back to a global chain head
+ * with page_chain_add() under a spinlock. */
+static struct page *page_chain_tail(struct page *page, int *len)
+{
+ struct page *tmp;
+ int i = 1;
+ while ((tmp = page_chain_next(page)))
+ ++i, page = tmp;
+ if (len)
+ *len = i;
+ return page;
+}
+
+static int page_chain_free(struct page *page)
+{
+ struct page *tmp;
+ int i = 0;
+ page_chain_for_each_safe(page, tmp) {
+ put_page(page);
+ ++i;
+ }
+ return i;
+}
+
+static void page_chain_add(struct page **head,
+ struct page *chain_first, struct page *chain_last)
+{
+#if 1
+ struct page *tmp;
+ tmp = page_chain_tail(chain_first, NULL);
+ BUG_ON(tmp != chain_last);
+#endif
+
+ /* add chain to head */
+ set_page_private(chain_last, (unsigned long)*head);
+ *head = chain_first;
+}
+
+static struct page *__drbd_alloc_pages(struct drbd_device *device,
+ unsigned int number)
+{
+ struct page *page = NULL;
+ struct page *tmp = NULL;
+ unsigned int i = 0;
+
+ /* Yes, testing drbd_pp_vacant outside the lock is racy.
+ * So what. It saves a spin_lock. */
+ if (drbd_pp_vacant >= number) {
+ spin_lock(&drbd_pp_lock);
+ page = page_chain_del(&drbd_pp_pool, number);
+ if (page)
+ drbd_pp_vacant -= number;
+ spin_unlock(&drbd_pp_lock);
+ if (page)
+ return page;
+ }
+
+ /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
+ * "criss-cross" setup, that might cause write-out on some other DRBD,
+ * which in turn might block on the other node at this very place. */
+ for (i = 0; i < number; i++) {
+ tmp = alloc_page(GFP_TRY);
+ if (!tmp)
+ break;
+ set_page_private(tmp, (unsigned long)page);
+ page = tmp;
+ }
+
+ if (i == number)
+ return page;
+
+ /* Not enough pages immediately available this time.
+ * No need to jump around here, drbd_alloc_pages will retry this
+ * function "soon". */
+ if (page) {
+ tmp = page_chain_tail(page, NULL);
+ spin_lock(&drbd_pp_lock);
+ page_chain_add(&drbd_pp_pool, page, tmp);
+ drbd_pp_vacant += i;
+ spin_unlock(&drbd_pp_lock);
+ }
+ return NULL;
+}
+
+static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
+ struct list_head *to_be_freed)
+{
+ struct drbd_peer_request *peer_req, *tmp;
+
+ /* The EEs are always appended to the end of the list. Since
+ they are sent in order over the wire, they have to finish
+ in order. As soon as we see the first not finished we can
+ stop to examine the list... */
+
+ list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) {
+ if (drbd_peer_req_has_active_page(peer_req))
+ break;
+ list_move(&peer_req->w.list, to_be_freed);
+ }
+}
+
+static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device)
+{
+ LIST_HEAD(reclaimed);
+ struct drbd_peer_request *peer_req, *t;
+
+ spin_lock_irq(&device->resource->req_lock);
+ reclaim_finished_net_peer_reqs(device, &reclaimed);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
+ drbd_free_net_peer_req(device, peer_req);
+}
+
+/**
+ * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
+ * @device: DRBD device.
+ * @number: number of pages requested
+ * @retry: whether to retry, if not enough pages are available right now
+ *
+ * Tries to allocate number pages, first from our own page pool, then from
+ * the kernel.
+ * Possibly retry until DRBD frees sufficient pages somewhere else.
+ *
+ * If this allocation would exceed the max_buffers setting, we throttle
+ * allocation (schedule_timeout) to give the system some room to breathe.
+ *
+ * We do not use max-buffers as hard limit, because it could lead to
+ * congestion and further to a distributed deadlock during online-verify or
+ * (checksum based) resync, if the max-buffers, socket buffer sizes and
+ * resync-rate settings are mis-configured.
+ *
+ * Returns a page chain linked via page->private.
+ */
+struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
+ bool retry)
+{
+ struct drbd_device *device = peer_device->device;
+ struct page *page = NULL;
+ struct net_conf *nc;
+ DEFINE_WAIT(wait);
+ unsigned int mxb;
+
+ rcu_read_lock();
+ nc = rcu_dereference(peer_device->connection->net_conf);
+ mxb = nc ? nc->max_buffers : 1000000;
+ rcu_read_unlock();
+
+ if (atomic_read(&device->pp_in_use) < mxb)
+ page = __drbd_alloc_pages(device, number);
+
+ while (page == NULL) {
+ prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
+
+ drbd_kick_lo_and_reclaim_net(device);
+
+ if (atomic_read(&device->pp_in_use) < mxb) {
+ page = __drbd_alloc_pages(device, number);
+ if (page)
+ break;
+ }
+
+ if (!retry)
+ break;
+
+ if (signal_pending(current)) {
+ drbd_warn(device, "drbd_alloc_pages interrupted!\n");
+ break;
+ }
+
+ if (schedule_timeout(HZ/10) == 0)
+ mxb = UINT_MAX;
+ }
+ finish_wait(&drbd_pp_wait, &wait);
+
+ if (page)
+ atomic_add(number, &device->pp_in_use);
+ return page;
+}
+
+/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
+ * Is also used from inside an other spin_lock_irq(&resource->req_lock);
+ * Either links the page chain back to the global pool,
+ * or returns all pages to the system. */
+static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net)
+{
+ atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use;
+ int i;
+
+ if (page == NULL)
+ return;
+
+ if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count)
+ i = page_chain_free(page);
+ else {
+ struct page *tmp;
+ tmp = page_chain_tail(page, &i);
+ spin_lock(&drbd_pp_lock);
+ page_chain_add(&drbd_pp_pool, page, tmp);
+ drbd_pp_vacant += i;
+ spin_unlock(&drbd_pp_lock);
+ }
+ i = atomic_sub_return(i, a);
+ if (i < 0)
+ drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n",
+ is_net ? "pp_in_use_by_net" : "pp_in_use", i);
+ wake_up(&drbd_pp_wait);
+}
+
+/*
+You need to hold the req_lock:
+ _drbd_wait_ee_list_empty()
+
+You must not have the req_lock:
+ drbd_free_peer_req()
+ drbd_alloc_peer_req()
+ drbd_free_peer_reqs()
+ drbd_ee_fix_bhs()
+ drbd_finish_peer_reqs()
+ drbd_clear_done_ee()
+ drbd_wait_ee_list_empty()
+*/
+
+struct drbd_peer_request *
+drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
+ unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local)
+{
+ struct drbd_device *device = peer_device->device;
+ struct drbd_peer_request *peer_req;
+ struct page *page = NULL;
+ unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
+
+ if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
+ return NULL;
+
+ peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
+ if (!peer_req) {
+ if (!(gfp_mask & __GFP_NOWARN))
+ drbd_err(device, "%s: allocation failed\n", __func__);
+ return NULL;
+ }
+
+ if (has_payload && data_size) {
+ page = drbd_alloc_pages(peer_device, nr_pages, (gfp_mask & __GFP_WAIT));
+ if (!page)
+ goto fail;
+ }
+
+ memset(peer_req, 0, sizeof(*peer_req));
+ INIT_LIST_HEAD(&peer_req->w.list);
+ drbd_clear_interval(&peer_req->i);
+ peer_req->i.size = data_size;
+ peer_req->i.sector = sector;
+ peer_req->submit_jif = jiffies;
+ peer_req->peer_device = peer_device;
+ peer_req->pages = page;
+ /*
+ * The block_id is opaque to the receiver. It is not endianness
+ * converted, and sent back to the sender unchanged.
+ */
+ peer_req->block_id = id;
+
+ return peer_req;
+
+ fail:
+ mempool_free(peer_req, drbd_ee_mempool);
+ return NULL;
+}
+
+void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
+ int is_net)
+{
+ might_sleep();
+ if (peer_req->flags & EE_HAS_DIGEST)
+ kfree(peer_req->digest);
+ drbd_free_pages(device, peer_req->pages, is_net);
+ D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
+ D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+ if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
+ peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
+ drbd_al_complete_io(device, &peer_req->i);
+ }
+ mempool_free(peer_req, drbd_ee_mempool);
+}
+
+int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
+{
+ LIST_HEAD(work_list);
+ struct drbd_peer_request *peer_req, *t;
+ int count = 0;
+ int is_net = list == &device->net_ee;
+
+ spin_lock_irq(&device->resource->req_lock);
+ list_splice_init(list, &work_list);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
+ __drbd_free_peer_req(device, peer_req, is_net);
+ count++;
+ }
+ return count;
+}
+
+/*
+ * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
+ */
+static int drbd_finish_peer_reqs(struct drbd_device *device)
+{
+ LIST_HEAD(work_list);
+ LIST_HEAD(reclaimed);
+ struct drbd_peer_request *peer_req, *t;
+ int err = 0;
+
+ spin_lock_irq(&device->resource->req_lock);
+ reclaim_finished_net_peer_reqs(device, &reclaimed);
+ list_splice_init(&device->done_ee, &work_list);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
+ drbd_free_net_peer_req(device, peer_req);
+
+ /* possible callbacks here:
+ * e_end_block, and e_end_resync_block, e_send_superseded.
+ * all ignore the last argument.
+ */
+ list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
+ int err2;
+
+ /* list_del not necessary, next/prev members not touched */
+ err2 = peer_req->w.cb(&peer_req->w, !!err);
+ if (!err)
+ err = err2;
+ drbd_free_peer_req(device, peer_req);
+ }
+ wake_up(&device->ee_wait);
+
+ return err;
+}
+
+static void _drbd_wait_ee_list_empty(struct drbd_device *device,
+ struct list_head *head)
+{
+ DEFINE_WAIT(wait);
+
+ /* avoids spin_lock/unlock
+ * and calling prepare_to_wait in the fast path */
+ while (!list_empty(head)) {
+ prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&device->resource->req_lock);
+ io_schedule();
+ finish_wait(&device->ee_wait, &wait);
+ spin_lock_irq(&device->resource->req_lock);
+ }
+}
+
+static void drbd_wait_ee_list_empty(struct drbd_device *device,
+ struct list_head *head)
+{
+ spin_lock_irq(&device->resource->req_lock);
+ _drbd_wait_ee_list_empty(device, head);
+ spin_unlock_irq(&device->resource->req_lock);
+}
+
+static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
+{
+ struct kvec iov = {
+ .iov_base = buf,
+ .iov_len = size,
+ };
+ struct msghdr msg = {
+ .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
+ };
+ return kernel_recvmsg(sock, &msg, &iov, 1, size, msg.msg_flags);
+}
+
+static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size)
+{
+ int rv;
+
+ rv = drbd_recv_short(connection->data.socket, buf, size, 0);
+
+ if (rv < 0) {
+ if (rv == -ECONNRESET)
+ drbd_info(connection, "sock was reset by peer\n");
+ else if (rv != -ERESTARTSYS)
+ drbd_err(connection, "sock_recvmsg returned %d\n", rv);
+ } else if (rv == 0) {
+ if (test_bit(DISCONNECT_SENT, &connection->flags)) {
+ long t;
+ rcu_read_lock();
+ t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
+ rcu_read_unlock();
+
+ t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t);
+
+ if (t)
+ goto out;
+ }
+ drbd_info(connection, "sock was shut down by peer\n");
+ }
+
+ if (rv != size)
+ conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
+
+out:
+ return rv;
+}
+
+static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size)
+{
+ int err;
+
+ err = drbd_recv(connection, buf, size);
+ if (err != size) {
+ if (err >= 0)
+ err = -EIO;
+ } else
+ err = 0;
+ return err;
+}
+
+static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size)
+{
+ int err;
+
+ err = drbd_recv_all(connection, buf, size);
+ if (err && !signal_pending(current))
+ drbd_warn(connection, "short read (expected size %d)\n", (int)size);
+ return err;
+}
+
+/* quoting tcp(7):
+ * On individual connections, the socket buffer size must be set prior to the
+ * listen(2) or connect(2) calls in order to have it take effect.
+ * This is our wrapper to do so.
+ */
+static void drbd_setbufsize(struct socket *sock, unsigned int snd,
+ unsigned int rcv)
+{
+ /* open coded SO_SNDBUF, SO_RCVBUF */
+ if (snd) {
+ sock->sk->sk_sndbuf = snd;
+ sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ }
+ if (rcv) {
+ sock->sk->sk_rcvbuf = rcv;
+ sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ }
+}
+
+static struct socket *drbd_try_connect(struct drbd_connection *connection)
+{
+ const char *what;
+ struct socket *sock;
+ struct sockaddr_in6 src_in6;
+ struct sockaddr_in6 peer_in6;
+ struct net_conf *nc;
+ int err, peer_addr_len, my_addr_len;
+ int sndbuf_size, rcvbuf_size, connect_int;
+ int disconnect_on_error = 1;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ sndbuf_size = nc->sndbuf_size;
+ rcvbuf_size = nc->rcvbuf_size;
+ connect_int = nc->connect_int;
+ rcu_read_unlock();
+
+ my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6));
+ memcpy(&src_in6, &connection->my_addr, my_addr_len);
+
+ if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6)
+ src_in6.sin6_port = 0;
+ else
+ ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
+
+ peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(src_in6));
+ memcpy(&peer_in6, &connection->peer_addr, peer_addr_len);
+
+ what = "sock_create_kern";
+ err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (err < 0) {
+ sock = NULL;
+ goto out;
+ }
+
+ sock->sk->sk_rcvtimeo =
+ sock->sk->sk_sndtimeo = connect_int * HZ;
+ drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);
+
+ /* explicitly bind to the configured IP as source IP
+ * for the outgoing connections.
+ * This is needed for multihomed hosts and to be
+ * able to use lo: interfaces for drbd.
+ * Make sure to use 0 as port number, so linux selects
+ * a free one dynamically.
+ */
+ what = "bind before connect";
+ err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
+ if (err < 0)
+ goto out;
+
+ /* connect may fail, peer not yet available.
+ * stay C_WF_CONNECTION, don't go Disconnecting! */
+ disconnect_on_error = 0;
+ what = "connect";
+ err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
+
+out:
+ if (err < 0) {
+ if (sock) {
+ sock_release(sock);
+ sock = NULL;
+ }
+ switch (-err) {
+ /* timeout, busy, signal pending */
+ case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
+ case EINTR: case ERESTARTSYS:
+ /* peer not (yet) available, network problem */
+ case ECONNREFUSED: case ENETUNREACH:
+ case EHOSTDOWN: case EHOSTUNREACH:
+ disconnect_on_error = 0;
+ break;
+ default:
+ drbd_err(connection, "%s failed, err = %d\n", what, err);
+ }
+ if (disconnect_on_error)
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ }
+
+ return sock;
+}
+
+struct accept_wait_data {
+ struct drbd_connection *connection;
+ struct socket *s_listen;
+ struct completion door_bell;
+ void (*original_sk_state_change)(struct sock *sk);
+
+};
+
+static void drbd_incoming_connection(struct sock *sk)
+{
+ struct accept_wait_data *ad = sk->sk_user_data;
+ void (*state_change)(struct sock *sk);
+
+ state_change = ad->original_sk_state_change;
+ if (sk->sk_state == TCP_ESTABLISHED)
+ complete(&ad->door_bell);
+ state_change(sk);
+}
+
+static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad)
+{
+ int err, sndbuf_size, rcvbuf_size, my_addr_len;
+ struct sockaddr_in6 my_addr;
+ struct socket *s_listen;
+ struct net_conf *nc;
+ const char *what;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return -EIO;
+ }
+ sndbuf_size = nc->sndbuf_size;
+ rcvbuf_size = nc->rcvbuf_size;
+ rcu_read_unlock();
+
+ my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6));
+ memcpy(&my_addr, &connection->my_addr, my_addr_len);
+
+ what = "sock_create_kern";
+ err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family,
+ SOCK_STREAM, IPPROTO_TCP, &s_listen);
+ if (err) {
+ s_listen = NULL;
+ goto out;
+ }
+
+ s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+ drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
+
+ what = "bind before listen";
+ err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
+ if (err < 0)
+ goto out;
+
+ ad->s_listen = s_listen;
+ write_lock_bh(&s_listen->sk->sk_callback_lock);
+ ad->original_sk_state_change = s_listen->sk->sk_state_change;
+ s_listen->sk->sk_state_change = drbd_incoming_connection;
+ s_listen->sk->sk_user_data = ad;
+ write_unlock_bh(&s_listen->sk->sk_callback_lock);
+
+ what = "listen";
+ err = s_listen->ops->listen(s_listen, 5);
+ if (err < 0)
+ goto out;
+
+ return 0;
+out:
+ if (s_listen)
+ sock_release(s_listen);
+ if (err < 0) {
+ if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
+ drbd_err(connection, "%s failed, err = %d\n", what, err);
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ }
+ }
+
+ return -EIO;
+}
+
+static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
+{
+ write_lock_bh(&sk->sk_callback_lock);
+ sk->sk_state_change = ad->original_sk_state_change;
+ sk->sk_user_data = NULL;
+ write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, struct accept_wait_data *ad)
+{
+ int timeo, connect_int, err = 0;
+ struct socket *s_estab = NULL;
+ struct net_conf *nc;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ connect_int = nc->connect_int;
+ rcu_read_unlock();
+
+ timeo = connect_int * HZ;
+ /* 28.5% random jitter */
+ timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7;
+
+ err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
+ if (err <= 0)
+ return NULL;
+
+ err = kernel_accept(ad->s_listen, &s_estab, 0);
+ if (err < 0) {
+ if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
+ drbd_err(connection, "accept failed, err = %d\n", err);
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ }
+ }
+
+ if (s_estab)
+ unregister_state_change(s_estab->sk, ad);
+
+ return s_estab;
+}
+
+static int decode_header(struct drbd_connection *, void *, struct packet_info *);
+
+static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock,
+ enum drbd_packet cmd)
+{
+ if (!conn_prepare_command(connection, sock))
+ return -EIO;
+ return conn_send_command(connection, sock, cmd, 0, NULL, 0);
+}
+
+static int receive_first_packet(struct drbd_connection *connection, struct socket *sock)
+{
+ unsigned int header_size = drbd_header_size(connection);
+ struct packet_info pi;
+ struct net_conf *nc;
+ int err;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return -EIO;
+ }
+ sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10;
+ rcu_read_unlock();
+
+ err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
+ if (err != header_size) {
+ if (err >= 0)
+ err = -EIO;
+ return err;
+ }
+ err = decode_header(connection, connection->data.rbuf, &pi);
+ if (err)
+ return err;
+ return pi.cmd;
+}
+
+/**
+ * drbd_socket_okay() - Free the socket if its connection is not okay
+ * @sock: pointer to the pointer to the socket.
+ */
+static bool drbd_socket_okay(struct socket **sock)
+{
+ int rr;
+ char tb[4];
+
+ if (!*sock)
+ return false;
+
+ rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
+
+ if (rr > 0 || rr == -EAGAIN) {
+ return true;
+ } else {
+ sock_release(*sock);
+ *sock = NULL;
+ return false;
+ }
+}
+
+static bool connection_established(struct drbd_connection *connection,
+ struct socket **sock1,
+ struct socket **sock2)
+{
+ struct net_conf *nc;
+ int timeout;
+ bool ok;
+
+ if (!*sock1 || !*sock2)
+ return false;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10;
+ rcu_read_unlock();
+ schedule_timeout_interruptible(timeout);
+
+ ok = drbd_socket_okay(sock1);
+ ok = drbd_socket_okay(sock2) && ok;
+
+ return ok;
+}
+
+/* Gets called if a connection is established, or if a new minor gets created
+ in a connection */
+int drbd_connected(struct drbd_peer_device *peer_device)
+{
+ struct drbd_device *device = peer_device->device;
+ int err;
+
+ atomic_set(&device->packet_seq, 0);
+ device->peer_seq = 0;
+
+ device->state_mutex = peer_device->connection->agreed_pro_version < 100 ?
+ &peer_device->connection->cstate_mutex :
+ &device->own_state_mutex;
+
+ err = drbd_send_sync_param(peer_device);
+ if (!err)
+ err = drbd_send_sizes(peer_device, 0, 0);
+ if (!err)
+ err = drbd_send_uuids(peer_device);
+ if (!err)
+ err = drbd_send_current_state(peer_device);
+ clear_bit(USE_DEGR_WFC_T, &device->flags);
+ clear_bit(RESIZE_PENDING, &device->flags);
+ atomic_set(&device->ap_in_flight, 0);
+ mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */
+ return err;
+}
+
+/*
+ * return values:
+ * 1 yes, we have a valid connection
+ * 0 oops, did not work out, please try again
+ * -1 peer talks different language,
+ * no point in trying again, please go standalone.
+ * -2 We do not have a network config...
+ */
+static int conn_connect(struct drbd_connection *connection)
+{
+ struct drbd_socket sock, msock;
+ struct drbd_peer_device *peer_device;
+ struct net_conf *nc;
+ int vnr, timeout, h;
+ bool discard_my_data, ok;
+ enum drbd_state_rv rv;
+ struct accept_wait_data ad = {
+ .connection = connection,
+ .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
+ };
+
+ clear_bit(DISCONNECT_SENT, &connection->flags);
+ if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
+ return -2;
+
+ mutex_init(&sock.mutex);
+ sock.sbuf = connection->data.sbuf;
+ sock.rbuf = connection->data.rbuf;
+ sock.socket = NULL;
+ mutex_init(&msock.mutex);
+ msock.sbuf = connection->meta.sbuf;
+ msock.rbuf = connection->meta.rbuf;
+ msock.socket = NULL;
+
+ /* Assume that the peer only understands protocol 80 until we know better. */
+ connection->agreed_pro_version = 80;
+
+ if (prepare_listen_socket(connection, &ad))
+ return 0;
+
+ do {
+ struct socket *s;
+
+ s = drbd_try_connect(connection);
+ if (s) {
+ if (!sock.socket) {
+ sock.socket = s;
+ send_first_packet(connection, &sock, P_INITIAL_DATA);
+ } else if (!msock.socket) {
+ clear_bit(RESOLVE_CONFLICTS, &connection->flags);
+ msock.socket = s;
+ send_first_packet(connection, &msock, P_INITIAL_META);
+ } else {
+ drbd_err(connection, "Logic error in conn_connect()\n");
+ goto out_release_sockets;
+ }
+ }
+
+ if (connection_established(connection, &sock.socket, &msock.socket))
+ break;
+
+retry:
+ s = drbd_wait_for_connect(connection, &ad);
+ if (s) {
+ int fp = receive_first_packet(connection, s);
+ drbd_socket_okay(&sock.socket);
+ drbd_socket_okay(&msock.socket);
+ switch (fp) {
+ case P_INITIAL_DATA:
+ if (sock.socket) {
+ drbd_warn(connection, "initial packet S crossed\n");
+ sock_release(sock.socket);
+ sock.socket = s;
+ goto randomize;
+ }
+ sock.socket = s;
+ break;
+ case P_INITIAL_META:
+ set_bit(RESOLVE_CONFLICTS, &connection->flags);
+ if (msock.socket) {
+ drbd_warn(connection, "initial packet M crossed\n");
+ sock_release(msock.socket);
+ msock.socket = s;
+ goto randomize;
+ }
+ msock.socket = s;
+ break;
+ default:
+ drbd_warn(connection, "Error receiving initial packet\n");
+ sock_release(s);
+randomize:
+ if (prandom_u32() & 1)
+ goto retry;
+ }
+ }
+
+ if (connection->cstate <= C_DISCONNECTING)
+ goto out_release_sockets;
+ if (signal_pending(current)) {
+ flush_signals(current);
+ smp_rmb();
+ if (get_t_state(&connection->receiver) == EXITING)
+ goto out_release_sockets;
+ }
+
+ ok = connection_established(connection, &sock.socket, &msock.socket);
+ } while (!ok);
+
+ if (ad.s_listen)
+ sock_release(ad.s_listen);
+
+ sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+ msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+
+ sock.socket->sk->sk_allocation = GFP_NOIO;
+ msock.socket->sk->sk_allocation = GFP_NOIO;
+
+ sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
+ msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
+
+ /* NOT YET ...
+ * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10;
+ * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+ * first set it to the P_CONNECTION_FEATURES timeout,
+ * which we set to 4x the configured ping_timeout. */
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+
+ sock.socket->sk->sk_sndtimeo =
+ sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
+
+ msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
+ timeout = nc->timeout * HZ / 10;
+ discard_my_data = nc->discard_my_data;
+ rcu_read_unlock();
+
+ msock.socket->sk->sk_sndtimeo = timeout;
+
+ /* we don't want delays.
+ * we use TCP_CORK where appropriate, though */
+ drbd_tcp_nodelay(sock.socket);
+ drbd_tcp_nodelay(msock.socket);
+
+ connection->data.socket = sock.socket;
+ connection->meta.socket = msock.socket;
+ connection->last_received = jiffies;
+
+ h = drbd_do_features(connection);
+ if (h <= 0)
+ return h;
+
+ if (connection->cram_hmac_tfm) {
+ /* drbd_request_state(device, NS(conn, WFAuth)); */
+ switch (drbd_do_auth(connection)) {
+ case -1:
+ drbd_err(connection, "Authentication of peer failed\n");
+ return -1;
+ case 0:
+ drbd_err(connection, "Authentication of peer failed, trying again.\n");
+ return 0;
+ }
+ }
+
+ connection->data.socket->sk->sk_sndtimeo = timeout;
+ connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+
+ if (drbd_send_protocol(connection) == -EOPNOTSUPP)
+ return -1;
+
+ /* Prevent a race between resync-handshake and
+ * being promoted to Primary.
+ *
+ * Grab and release the state mutex, so we know that any current
+ * drbd_set_role() is finished, and any incoming drbd_set_role
+ * will see the STATE_SENT flag, and wait for it to be cleared.
+ */
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ mutex_lock(peer_device->device->state_mutex);
+
+ set_bit(STATE_SENT, &connection->flags);
+
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ mutex_unlock(peer_device->device->state_mutex);
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ kref_get(&device->kref);
+ rcu_read_unlock();
+
+ if (discard_my_data)
+ set_bit(DISCARD_MY_DATA, &device->flags);
+ else
+ clear_bit(DISCARD_MY_DATA, &device->flags);
+
+ drbd_connected(peer_device);
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+ rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
+ if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) {
+ clear_bit(STATE_SENT, &connection->flags);
+ return 0;
+ }
+
+ drbd_thread_start(&connection->asender);
+
+ mutex_lock(&connection->resource->conf_update);
+ /* The discard_my_data flag is a single-shot modifier to the next
+ * connection attempt, the handshake of which is now well underway.
+ * No need for rcu style copying of the whole struct
+ * just to clear a single value. */
+ connection->net_conf->discard_my_data = 0;
+ mutex_unlock(&connection->resource->conf_update);
+
+ return h;
+
+out_release_sockets:
+ if (ad.s_listen)
+ sock_release(ad.s_listen);
+ if (sock.socket)
+ sock_release(sock.socket);
+ if (msock.socket)
+ sock_release(msock.socket);
+ return -1;
+}
+
+static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi)
+{
+ unsigned int header_size = drbd_header_size(connection);
+
+ if (header_size == sizeof(struct p_header100) &&
+ *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
+ struct p_header100 *h = header;
+ if (h->pad != 0) {
+ drbd_err(connection, "Header padding is not zero\n");
+ return -EINVAL;
+ }
+ pi->vnr = be16_to_cpu(h->volume);
+ pi->cmd = be16_to_cpu(h->command);
+ pi->size = be32_to_cpu(h->length);
+ } else if (header_size == sizeof(struct p_header95) &&
+ *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
+ struct p_header95 *h = header;
+ pi->cmd = be16_to_cpu(h->command);
+ pi->size = be32_to_cpu(h->length);
+ pi->vnr = 0;
+ } else if (header_size == sizeof(struct p_header80) &&
+ *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
+ struct p_header80 *h = header;
+ pi->cmd = be16_to_cpu(h->command);
+ pi->size = be16_to_cpu(h->length);
+ pi->vnr = 0;
+ } else {
+ drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n",
+ be32_to_cpu(*(__be32 *)header),
+ connection->agreed_pro_version);
+ return -EINVAL;
+ }
+ pi->data = header + header_size;
+ return 0;
+}
+
+static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi)
+{
+ void *buffer = connection->data.rbuf;
+ int err;
+
+ err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection));
+ if (err)
+ return err;
+
+ err = decode_header(connection, buffer, pi);
+ connection->last_received = jiffies;
+
+ return err;
+}
+
+static void drbd_flush(struct drbd_connection *connection)
+{
+ int rv;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ if (connection->resource->write_ordering >= WO_bdev_flush) {
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+
+ if (!get_ldev(device))
+ continue;
+ kref_get(&device->kref);
+ rcu_read_unlock();
+
+ /* Right now, we have only this one synchronous code path
+ * for flushes between request epochs.
+ * We may want to make those asynchronous,
+ * or at least parallelize the flushes to the volume devices.
+ */
+ device->flush_jif = jiffies;
+ set_bit(FLUSH_PENDING, &device->flags);
+ rv = blkdev_issue_flush(device->ldev->backing_bdev,
+ GFP_NOIO, NULL);
+ clear_bit(FLUSH_PENDING, &device->flags);
+ if (rv) {
+ drbd_info(device, "local disk flush failed with status %d\n", rv);
+ /* would rather check on EOPNOTSUPP, but that is not reliable.
+ * don't try again for ANY return value != 0
+ * if (rv == -EOPNOTSUPP) */
+ drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io);
+ }
+ put_ldev(device);
+ kref_put(&device->kref, drbd_destroy_device);
+
+ rcu_read_lock();
+ if (rv)
+ break;
+ }
+ rcu_read_unlock();
+ }
+}
+
+/**
+ * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
+ * @device: DRBD device.
+ * @epoch: Epoch object.
+ * @ev: Epoch event.
+ */
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection,
+ struct drbd_epoch *epoch,
+ enum epoch_event ev)
+{
+ int epoch_size;
+ struct drbd_epoch *next_epoch;
+ enum finish_epoch rv = FE_STILL_LIVE;
+
+ spin_lock(&connection->epoch_lock);
+ do {
+ next_epoch = NULL;
+
+ epoch_size = atomic_read(&epoch->epoch_size);
+
+ switch (ev & ~EV_CLEANUP) {
+ case EV_PUT:
+ atomic_dec(&epoch->active);
+ break;
+ case EV_GOT_BARRIER_NR:
+ set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
+ break;
+ case EV_BECAME_LAST:
+ /* nothing to do*/
+ break;
+ }
+
+ if (epoch_size != 0 &&
+ atomic_read(&epoch->active) == 0 &&
+ (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
+ if (!(ev & EV_CLEANUP)) {
+ spin_unlock(&connection->epoch_lock);
+ drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size);
+ spin_lock(&connection->epoch_lock);
+ }
+#if 0
+ /* FIXME: dec unacked on connection, once we have
+ * something to count pending connection packets in. */
+ if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
+ dec_unacked(epoch->connection);
+#endif
+
+ if (connection->current_epoch != epoch) {
+ next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
+ list_del(&epoch->list);
+ ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
+ connection->epochs--;
+ kfree(epoch);
+
+ if (rv == FE_STILL_LIVE)
+ rv = FE_DESTROYED;
+ } else {
+ epoch->flags = 0;
+ atomic_set(&epoch->epoch_size, 0);
+ /* atomic_set(&epoch->active, 0); is already zero */
+ if (rv == FE_STILL_LIVE)
+ rv = FE_RECYCLED;
+ }
+ }
+
+ if (!next_epoch)
+ break;
+
+ epoch = next_epoch;
+ } while (1);
+
+ spin_unlock(&connection->epoch_lock);
+
+ return rv;
+}
+
+static enum write_ordering_e
+max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
+{
+ struct disk_conf *dc;
+
+ dc = rcu_dereference(bdev->disk_conf);
+
+ if (wo == WO_bdev_flush && !dc->disk_flushes)
+ wo = WO_drain_io;
+ if (wo == WO_drain_io && !dc->disk_drain)
+ wo = WO_none;
+
+ return wo;
+}
+
+/**
+ * drbd_bump_write_ordering() - Fall back to an other write ordering method
+ * @connection: DRBD connection.
+ * @wo: Write ordering method to try.
+ */
+void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
+ enum write_ordering_e wo)
+{
+ struct drbd_device *device;
+ enum write_ordering_e pwo;
+ int vnr;
+ static char *write_ordering_str[] = {
+ [WO_none] = "none",
+ [WO_drain_io] = "drain",
+ [WO_bdev_flush] = "flush",
+ };
+
+ pwo = resource->write_ordering;
+ if (wo != WO_bdev_flush)
+ wo = min(pwo, wo);
+ rcu_read_lock();
+ idr_for_each_entry(&resource->devices, device, vnr) {
+ if (get_ldev(device)) {
+ wo = max_allowed_wo(device->ldev, wo);
+ if (device->ldev == bdev)
+ bdev = NULL;
+ put_ldev(device);
+ }
+ }
+
+ if (bdev)
+ wo = max_allowed_wo(bdev, wo);
+
+ rcu_read_unlock();
+
+ resource->write_ordering = wo;
+ if (pwo != resource->write_ordering || wo == WO_bdev_flush)
+ drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
+}
+
+/**
+ * drbd_submit_peer_request()
+ * @device: DRBD device.
+ * @peer_req: peer request
+ * @rw: flag field, see bio->bi_rw
+ *
+ * May spread the pages to multiple bios,
+ * depending on bio_add_page restrictions.
+ *
+ * Returns 0 if all bios have been submitted,
+ * -ENOMEM if we could not allocate enough bios,
+ * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
+ * single page to an empty bio (which should never happen and likely indicates
+ * that the lower level IO stack is in some way broken). This has been observed
+ * on certain Xen deployments.
+ */
+/* TODO allocate from our own bio_set. */
+int drbd_submit_peer_request(struct drbd_device *device,
+ struct drbd_peer_request *peer_req,
+ const unsigned rw, const int fault_type)
+{
+ struct bio *bios = NULL;
+ struct bio *bio;
+ struct page *page = peer_req->pages;
+ sector_t sector = peer_req->i.sector;
+ unsigned data_size = peer_req->i.size;
+ unsigned n_bios = 0;
+ unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
+ int err = -ENOMEM;
+
+ if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) {
+ /* wait for all pending IO completions, before we start
+ * zeroing things out. */
+ conn_wait_active_ee_empty(first_peer_device(device)->connection);
+ /* add it to the active list now,
+ * so we can find it to present it in debugfs */
+ peer_req->submit_jif = jiffies;
+ peer_req->flags |= EE_SUBMITTED;
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&peer_req->w.list, &device->active_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+ if (blkdev_issue_zeroout(device->ldev->backing_bdev,
+ sector, data_size >> 9, GFP_NOIO, false))
+ peer_req->flags |= EE_WAS_ERROR;
+ drbd_endio_write_sec_final(peer_req);
+ return 0;
+ }
+
+ /* Discards don't have any payload.
+ * But the scsi layer still expects a bio_vec it can use internally,
+ * see sd_setup_discard_cmnd() and blk_add_request_payload(). */
+ if (peer_req->flags & EE_IS_TRIM)
+ nr_pages = 1;
+
+ /* In most cases, we will only need one bio. But in case the lower
+ * level restrictions happen to be different at this offset on this
+ * side than those of the sending peer, we may need to submit the
+ * request in more than one bio.
+ *
+ * Plain bio_alloc is good enough here, this is no DRBD internally
+ * generated bio, but a bio allocated on behalf of the peer.
+ */
+next_bio:
+ bio = bio_alloc(GFP_NOIO, nr_pages);
+ if (!bio) {
+ drbd_err(device, "submit_ee: Allocation of a bio failed (nr_pages=%u)\n", nr_pages);
+ goto fail;
+ }
+ /* > peer_req->i.sector, unless this is the first bio */
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_bdev = device->ldev->backing_bdev;
+ bio->bi_rw = rw;
+ bio->bi_private = peer_req;
+ bio->bi_end_io = drbd_peer_request_endio;
+
+ bio->bi_next = bios;
+ bios = bio;
+ ++n_bios;
+
+ if (rw & REQ_DISCARD) {
+ bio->bi_iter.bi_size = data_size;
+ goto submit;
+ }
+
+ page_chain_for_each(page) {
+ unsigned len = min_t(unsigned, data_size, PAGE_SIZE);
+ if (!bio_add_page(bio, page, len, 0)) {
+ /* A single page must always be possible!
+ * But in case it fails anyways,
+ * we deal with it, and complain (below). */
+ if (bio->bi_vcnt == 0) {
+ drbd_err(device,
+ "bio_add_page failed for len=%u, "
+ "bi_vcnt=0 (bi_sector=%llu)\n",
+ len, (uint64_t)bio->bi_iter.bi_sector);
+ err = -ENOSPC;
+ goto fail;
+ }
+ goto next_bio;
+ }
+ data_size -= len;
+ sector += len >> 9;
+ --nr_pages;
+ }
+ D_ASSERT(device, data_size == 0);
+submit:
+ D_ASSERT(device, page == NULL);
+
+ atomic_set(&peer_req->pending_bios, n_bios);
+ /* for debugfs: update timestamp, mark as submitted */
+ peer_req->submit_jif = jiffies;
+ peer_req->flags |= EE_SUBMITTED;
+ do {
+ bio = bios;
+ bios = bios->bi_next;
+ bio->bi_next = NULL;
+
+ drbd_generic_make_request(device, fault_type, bio);
+ } while (bios);
+ return 0;
+
+fail:
+ while (bios) {
+ bio = bios;
+ bios = bios->bi_next;
+ bio_put(bio);
+ }
+ return err;
+}
+
+static void drbd_remove_epoch_entry_interval(struct drbd_device *device,
+ struct drbd_peer_request *peer_req)
+{
+ struct drbd_interval *i = &peer_req->i;
+
+ drbd_remove_interval(&device->write_requests, i);
+ drbd_clear_interval(i);
+
+ /* Wake up any processes waiting for this peer request to complete. */
+ if (i->waiting)
+ wake_up(&device->misc_wait);
+}
+
+static void conn_wait_active_ee_empty(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+
+ kref_get(&device->kref);
+ rcu_read_unlock();
+ drbd_wait_ee_list_empty(device, &device->active_ee);
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+}
+
+static struct drbd_peer_device *
+conn_peer_device(struct drbd_connection *connection, int volume_number)
+{
+ return idr_find(&connection->peer_devices, volume_number);
+}
+
+static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
+{
+ int rv;
+ struct p_barrier *p = pi->data;
+ struct drbd_epoch *epoch;
+
+ /* FIXME these are unacked on connection,
+ * not a specific (peer)device.
+ */
+ connection->current_epoch->barrier_nr = p->barrier;
+ connection->current_epoch->connection = connection;
+ rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR);
+
+ /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
+ * the activity log, which means it would not be resynced in case the
+ * R_PRIMARY crashes now.
+ * Therefore we must send the barrier_ack after the barrier request was
+ * completed. */
+ switch (connection->resource->write_ordering) {
+ case WO_none:
+ if (rv == FE_RECYCLED)
+ return 0;
+
+ /* receiver context, in the writeout path of the other node.
+ * avoid potential distributed deadlock */
+ epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+ if (epoch)
+ break;
+ else
+ drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
+ /* Fall through */
+
+ case WO_bdev_flush:
+ case WO_drain_io:
+ conn_wait_active_ee_empty(connection);
+ drbd_flush(connection);
+
+ if (atomic_read(&connection->current_epoch->epoch_size)) {
+ epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+ if (epoch)
+ break;
+ }
+
+ return 0;
+ default:
+ drbd_err(connection, "Strangeness in connection->write_ordering %d\n",
+ connection->resource->write_ordering);
+ return -EIO;
+ }
+
+ epoch->flags = 0;
+ atomic_set(&epoch->epoch_size, 0);
+ atomic_set(&epoch->active, 0);
+
+ spin_lock(&connection->epoch_lock);
+ if (atomic_read(&connection->current_epoch->epoch_size)) {
+ list_add(&epoch->list, &connection->current_epoch->list);
+ connection->current_epoch = epoch;
+ connection->epochs++;
+ } else {
+ /* The current_epoch got recycled while we allocated this one... */
+ kfree(epoch);
+ }
+ spin_unlock(&connection->epoch_lock);
+
+ return 0;
+}
+
+/* used from receive_RSDataReply (recv_resync_read)
+ * and from receive_Data */
+static struct drbd_peer_request *
+read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
+ struct packet_info *pi) __must_hold(local)
+{
+ struct drbd_device *device = peer_device->device;
+ const sector_t capacity = drbd_get_capacity(device->this_bdev);
+ struct drbd_peer_request *peer_req;
+ struct page *page;
+ int digest_size, err;
+ unsigned int data_size = pi->size, ds;
+ void *dig_in = peer_device->connection->int_dig_in;
+ void *dig_vv = peer_device->connection->int_dig_vv;
+ unsigned long *data;
+ struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
+
+ digest_size = 0;
+ if (!trim && peer_device->connection->peer_integrity_tfm) {
+ digest_size = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
+ /*
+ * FIXME: Receive the incoming digest into the receive buffer
+ * here, together with its struct p_data?
+ */
+ err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
+ if (err)
+ return NULL;
+ data_size -= digest_size;
+ }
+
+ if (trim) {
+ D_ASSERT(peer_device, data_size == 0);
+ data_size = be32_to_cpu(trim->size);
+ }
+
+ if (!expect(IS_ALIGNED(data_size, 512)))
+ return NULL;
+ /* prepare for larger trim requests. */
+ if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE))
+ return NULL;
+
+ /* even though we trust out peer,
+ * we sometimes have to double check. */
+ if (sector + (data_size>>9) > capacity) {
+ drbd_err(device, "request from peer beyond end of local disk: "
+ "capacity: %llus < sector: %llus + size: %u\n",
+ (unsigned long long)capacity,
+ (unsigned long long)sector, data_size);
+ return NULL;
+ }
+
+ /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
+ * "criss-cross" setup, that might cause write-out on some other DRBD,
+ * which in turn might block on the other node at this very place. */
+ peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO);
+ if (!peer_req)
+ return NULL;
+
+ peer_req->flags |= EE_WRITE;
+ if (trim)
+ return peer_req;
+
+ ds = data_size;
+ page = peer_req->pages;
+ page_chain_for_each(page) {
+ unsigned len = min_t(int, ds, PAGE_SIZE);
+ data = kmap(page);
+ err = drbd_recv_all_warn(peer_device->connection, data, len);
+ if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
+ drbd_err(device, "Fault injection: Corrupting data on receive\n");
+ data[0] = data[0] ^ (unsigned long)-1;
+ }
+ kunmap(page);
+ if (err) {
+ drbd_free_peer_req(device, peer_req);
+ return NULL;
+ }
+ ds -= len;
+ }
+
+ if (digest_size) {
+ drbd_csum_ee(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv);
+ if (memcmp(dig_in, dig_vv, digest_size)) {
+ drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
+ (unsigned long long)sector, data_size);
+ drbd_free_peer_req(device, peer_req);
+ return NULL;
+ }
+ }
+ device->recv_cnt += data_size >> 9;
+ return peer_req;
+}
+
+/* drbd_drain_block() just takes a data block
+ * out of the socket input buffer, and discards it.
+ */
+static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
+{
+ struct page *page;
+ int err = 0;
+ void *data;
+
+ if (!data_size)
+ return 0;
+
+ page = drbd_alloc_pages(peer_device, 1, 1);
+
+ data = kmap(page);
+ while (data_size) {
+ unsigned int len = min_t(int, data_size, PAGE_SIZE);
+
+ err = drbd_recv_all_warn(peer_device->connection, data, len);
+ if (err)
+ break;
+ data_size -= len;
+ }
+ kunmap(page);
+ drbd_free_pages(peer_device->device, page, 0);
+ return err;
+}
+
+static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req,
+ sector_t sector, int data_size)
+{
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ struct bio *bio;
+ int digest_size, err, expect;
+ void *dig_in = peer_device->connection->int_dig_in;
+ void *dig_vv = peer_device->connection->int_dig_vv;
+
+ digest_size = 0;
+ if (peer_device->connection->peer_integrity_tfm) {
+ digest_size = crypto_hash_digestsize(peer_device->connection->peer_integrity_tfm);
+ err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
+ if (err)
+ return err;
+ data_size -= digest_size;
+ }
+
+ /* optimistically update recv_cnt. if receiving fails below,
+ * we disconnect anyways, and counters will be reset. */
+ peer_device->device->recv_cnt += data_size>>9;
+
+ bio = req->master_bio;
+ D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector);
+
+ bio_for_each_segment(bvec, bio, iter) {
+ void *mapped = kmap(bvec.bv_page) + bvec.bv_offset;
+ expect = min_t(int, data_size, bvec.bv_len);
+ err = drbd_recv_all_warn(peer_device->connection, mapped, expect);
+ kunmap(bvec.bv_page);
+ if (err)
+ return err;
+ data_size -= expect;
+ }
+
+ if (digest_size) {
+ drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv);
+ if (memcmp(dig_in, dig_vv, digest_size)) {
+ drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n");
+ return -EINVAL;
+ }
+ }
+
+ D_ASSERT(peer_device->device, data_size == 0);
+ return 0;
+}
+
+/*
+ * e_end_resync_block() is called in asender context via
+ * drbd_finish_peer_reqs().
+ */
+static int e_end_resync_block(struct drbd_work *w, int unused)
+{
+ struct drbd_peer_request *peer_req =
+ container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ sector_t sector = peer_req->i.sector;
+ int err;
+
+ D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ drbd_set_in_sync(device, sector, peer_req->i.size);
+ err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req);
+ } else {
+ /* Record failure to sync */
+ drbd_rs_failed_io(device, sector, peer_req->i.size);
+
+ err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
+ }
+ dec_unacked(device);
+
+ return err;
+}
+
+static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
+ struct packet_info *pi) __releases(local)
+{
+ struct drbd_device *device = peer_device->device;
+ struct drbd_peer_request *peer_req;
+
+ peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
+ if (!peer_req)
+ goto fail;
+
+ dec_rs_pending(device);
+
+ inc_unacked(device);
+ /* corresponding dec_unacked() in e_end_resync_block()
+ * respective _drbd_clear_done_ee */
+
+ peer_req->w.cb = e_end_resync_block;
+ peer_req->submit_jif = jiffies;
+
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&peer_req->w.list, &device->sync_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ atomic_add(pi->size >> 9, &device->rs_sect_ev);
+ if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
+ return 0;
+
+ /* don't care for the reason here */
+ drbd_err(device, "submit failed, triggering re-connect\n");
+ spin_lock_irq(&device->resource->req_lock);
+ list_del(&peer_req->w.list);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ drbd_free_peer_req(device, peer_req);
+fail:
+ put_ldev(device);
+ return -EIO;
+}
+
+static struct drbd_request *
+find_request(struct drbd_device *device, struct rb_root *root, u64 id,
+ sector_t sector, bool missing_ok, const char *func)
+{
+ struct drbd_request *req;
+
+ /* Request object according to our peer */
+ req = (struct drbd_request *)(unsigned long)id;
+ if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
+ return req;
+ if (!missing_ok) {
+ drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func,
+ (unsigned long)id, (unsigned long long)sector);
+ }
+ return NULL;
+}
+
+static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct drbd_request *req;
+ sector_t sector;
+ int err;
+ struct p_data *p = pi->data;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ sector = be64_to_cpu(p->sector);
+
+ spin_lock_irq(&device->resource->req_lock);
+ req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__);
+ spin_unlock_irq(&device->resource->req_lock);
+ if (unlikely(!req))
+ return -EIO;
+
+ /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
+ * special casing it there for the various failure cases.
+ * still no race with drbd_fail_pending_reads */
+ err = recv_dless_read(peer_device, req, sector, pi->size);
+ if (!err)
+ req_mod(req, DATA_RECEIVED);
+ /* else: nothing. handled from drbd_disconnect...
+ * I don't think we may complete this just yet
+ * in case we are "on-disconnect: freeze" */
+
+ return err;
+}
+
+static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ sector_t sector;
+ int err;
+ struct p_data *p = pi->data;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ sector = be64_to_cpu(p->sector);
+ D_ASSERT(device, p->block_id == ID_SYNCER);
+
+ if (get_ldev(device)) {
+ /* data is submitted to disk within recv_resync_read.
+ * corresponding put_ldev done below on error,
+ * or in drbd_peer_request_endio. */
+ err = recv_resync_read(peer_device, sector, pi);
+ } else {
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "Can not write resync data to local disk.\n");
+
+ err = drbd_drain_block(peer_device, pi->size);
+
+ drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
+ }
+
+ atomic_add(pi->size >> 9, &device->rs_sect_in);
+
+ return err;
+}
+
+static void restart_conflicting_writes(struct drbd_device *device,
+ sector_t sector, int size)
+{
+ struct drbd_interval *i;
+ struct drbd_request *req;
+
+ drbd_for_each_overlap(i, &device->write_requests, sector, size) {
+ if (!i->local)
+ continue;
+ req = container_of(i, struct drbd_request, i);
+ if (req->rq_state & RQ_LOCAL_PENDING ||
+ !(req->rq_state & RQ_POSTPONED))
+ continue;
+ /* as it is RQ_POSTPONED, this will cause it to
+ * be queued on the retry workqueue. */
+ __req_mod(req, CONFLICT_RESOLVED, NULL);
+ }
+}
+
+/*
+ * e_end_block() is called in asender context via drbd_finish_peer_reqs().
+ */
+static int e_end_block(struct drbd_work *w, int cancel)
+{
+ struct drbd_peer_request *peer_req =
+ container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ sector_t sector = peer_req->i.sector;
+ int err = 0, pcmd;
+
+ if (peer_req->flags & EE_SEND_WRITE_ACK) {
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ pcmd = (device->state.conn >= C_SYNC_SOURCE &&
+ device->state.conn <= C_PAUSED_SYNC_T &&
+ peer_req->flags & EE_MAY_SET_IN_SYNC) ?
+ P_RS_WRITE_ACK : P_WRITE_ACK;
+ err = drbd_send_ack(peer_device, pcmd, peer_req);
+ if (pcmd == P_RS_WRITE_ACK)
+ drbd_set_in_sync(device, sector, peer_req->i.size);
+ } else {
+ err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
+ /* we expect it to be marked out of sync anyways...
+ * maybe assert this? */
+ }
+ dec_unacked(device);
+ }
+
+ /* we delete from the conflict detection hash _after_ we sent out the
+ * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
+ if (peer_req->flags & EE_IN_INTERVAL_TREE) {
+ spin_lock_irq(&device->resource->req_lock);
+ D_ASSERT(device, !drbd_interval_empty(&peer_req->i));
+ drbd_remove_epoch_entry_interval(device, peer_req);
+ if (peer_req->flags & EE_RESTART_REQUESTS)
+ restart_conflicting_writes(device, sector, peer_req->i.size);
+ spin_unlock_irq(&device->resource->req_lock);
+ } else
+ D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+
+ drbd_may_finish_epoch(first_peer_device(device)->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+
+ return err;
+}
+
+static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
+{
+ struct drbd_peer_request *peer_req =
+ container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ int err;
+
+ err = drbd_send_ack(peer_device, ack, peer_req);
+ dec_unacked(peer_device->device);
+
+ return err;
+}
+
+static int e_send_superseded(struct drbd_work *w, int unused)
+{
+ return e_send_ack(w, P_SUPERSEDED);
+}
+
+static int e_send_retry_write(struct drbd_work *w, int unused)
+{
+ struct drbd_peer_request *peer_req =
+ container_of(w, struct drbd_peer_request, w);
+ struct drbd_connection *connection = peer_req->peer_device->connection;
+
+ return e_send_ack(w, connection->agreed_pro_version >= 100 ?
+ P_RETRY_WRITE : P_SUPERSEDED);
+}
+
+static bool seq_greater(u32 a, u32 b)
+{
+ /*
+ * We assume 32-bit wrap-around here.
+ * For 24-bit wrap-around, we would have to shift:
+ * a <<= 8; b <<= 8;
+ */
+ return (s32)a - (s32)b > 0;
+}
+
+static u32 seq_max(u32 a, u32 b)
+{
+ return seq_greater(a, b) ? a : b;
+}
+
+static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq)
+{
+ struct drbd_device *device = peer_device->device;
+ unsigned int newest_peer_seq;
+
+ if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) {
+ spin_lock(&device->peer_seq_lock);
+ newest_peer_seq = seq_max(device->peer_seq, peer_seq);
+ device->peer_seq = newest_peer_seq;
+ spin_unlock(&device->peer_seq_lock);
+ /* wake up only if we actually changed device->peer_seq */
+ if (peer_seq == newest_peer_seq)
+ wake_up(&device->seq_wait);
+ }
+}
+
+static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
+{
+ return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
+}
+
+/* maybe change sync_ee into interval trees as well? */
+static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
+{
+ struct drbd_peer_request *rs_req;
+ bool rv = 0;
+
+ spin_lock_irq(&device->resource->req_lock);
+ list_for_each_entry(rs_req, &device->sync_ee, w.list) {
+ if (overlaps(peer_req->i.sector, peer_req->i.size,
+ rs_req->i.sector, rs_req->i.size)) {
+ rv = 1;
+ break;
+ }
+ }
+ spin_unlock_irq(&device->resource->req_lock);
+
+ return rv;
+}
+
+/* Called from receive_Data.
+ * Synchronize packets on sock with packets on msock.
+ *
+ * This is here so even when a P_DATA packet traveling via sock overtook an Ack
+ * packet traveling on msock, they are still processed in the order they have
+ * been sent.
+ *
+ * Note: we don't care for Ack packets overtaking P_DATA packets.
+ *
+ * In case packet_seq is larger than device->peer_seq number, there are
+ * outstanding packets on the msock. We wait for them to arrive.
+ * In case we are the logically next packet, we update device->peer_seq
+ * ourselves. Correctly handles 32bit wrap around.
+ *
+ * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
+ * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
+ * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
+ * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
+ *
+ * returns 0 if we may process the packet,
+ * -ERESTARTSYS if we were interrupted (by disconnect signal). */
+static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq)
+{
+ struct drbd_device *device = peer_device->device;
+ DEFINE_WAIT(wait);
+ long timeout;
+ int ret = 0, tp;
+
+ if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags))
+ return 0;
+
+ spin_lock(&device->peer_seq_lock);
+ for (;;) {
+ if (!seq_greater(peer_seq - 1, device->peer_seq)) {
+ device->peer_seq = seq_max(device->peer_seq, peer_seq);
+ break;
+ }
+
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ rcu_read_lock();
+ tp = rcu_dereference(first_peer_device(device)->connection->net_conf)->two_primaries;
+ rcu_read_unlock();
+
+ if (!tp)
+ break;
+
+ /* Only need to wait if two_primaries is enabled */
+ prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE);
+ spin_unlock(&device->peer_seq_lock);
+ rcu_read_lock();
+ timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10;
+ rcu_read_unlock();
+ timeout = schedule_timeout(timeout);
+ spin_lock(&device->peer_seq_lock);
+ if (!timeout) {
+ ret = -ETIMEDOUT;
+ drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n");
+ break;
+ }
+ }
+ spin_unlock(&device->peer_seq_lock);
+ finish_wait(&device->seq_wait, &wait);
+ return ret;
+}
+
+/* see also bio_flags_to_wire()
+ * DRBD_REQ_*, because we need to semantically map the flags to data packet
+ * flags and back. We may replicate to other kernel versions. */
+static unsigned long wire_flags_to_bio(u32 dpf)
+{
+ return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
+ (dpf & DP_FUA ? REQ_FUA : 0) |
+ (dpf & DP_FLUSH ? REQ_FLUSH : 0) |
+ (dpf & DP_DISCARD ? REQ_DISCARD : 0);
+}
+
+static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
+ unsigned int size)
+{
+ struct drbd_interval *i;
+
+ repeat:
+ drbd_for_each_overlap(i, &device->write_requests, sector, size) {
+ struct drbd_request *req;
+ struct bio_and_error m;
+
+ if (!i->local)
+ continue;
+ req = container_of(i, struct drbd_request, i);
+ if (!(req->rq_state & RQ_POSTPONED))
+ continue;
+ req->rq_state &= ~RQ_POSTPONED;
+ __req_mod(req, NEG_ACKED, &m);
+ spin_unlock_irq(&device->resource->req_lock);
+ if (m.bio)
+ complete_master_bio(device, &m);
+ spin_lock_irq(&device->resource->req_lock);
+ goto repeat;
+ }
+}
+
+static int handle_write_conflicts(struct drbd_device *device,
+ struct drbd_peer_request *peer_req)
+{
+ struct drbd_connection *connection = peer_req->peer_device->connection;
+ bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags);
+ sector_t sector = peer_req->i.sector;
+ const unsigned int size = peer_req->i.size;
+ struct drbd_interval *i;
+ bool equal;
+ int err;
+
+ /*
+ * Inserting the peer request into the write_requests tree will prevent
+ * new conflicting local requests from being added.
+ */
+ drbd_insert_interval(&device->write_requests, &peer_req->i);
+
+ repeat:
+ drbd_for_each_overlap(i, &device->write_requests, sector, size) {
+ if (i == &peer_req->i)
+ continue;
+ if (i->completed)
+ continue;
+
+ if (!i->local) {
+ /*
+ * Our peer has sent a conflicting remote request; this
+ * should not happen in a two-node setup. Wait for the
+ * earlier peer request to complete.
+ */
+ err = drbd_wait_misc(device, i);
+ if (err)
+ goto out;
+ goto repeat;
+ }
+
+ equal = i->sector == sector && i->size == size;
+ if (resolve_conflicts) {
+ /*
+ * If the peer request is fully contained within the
+ * overlapping request, it can be considered overwritten
+ * and thus superseded; otherwise, it will be retried
+ * once all overlapping requests have completed.
+ */
+ bool superseded = i->sector <= sector && i->sector +
+ (i->size >> 9) >= sector + (size >> 9);
+
+ if (!equal)
+ drbd_alert(device, "Concurrent writes detected: "
+ "local=%llus +%u, remote=%llus +%u, "
+ "assuming %s came first\n",
+ (unsigned long long)i->sector, i->size,
+ (unsigned long long)sector, size,
+ superseded ? "local" : "remote");
+
+ peer_req->w.cb = superseded ? e_send_superseded :
+ e_send_retry_write;
+ list_add_tail(&peer_req->w.list, &device->done_ee);
+ wake_asender(connection);
+
+ err = -ENOENT;
+ goto out;
+ } else {
+ struct drbd_request *req =
+ container_of(i, struct drbd_request, i);
+
+ if (!equal)
+ drbd_alert(device, "Concurrent writes detected: "
+ "local=%llus +%u, remote=%llus +%u\n",
+ (unsigned long long)i->sector, i->size,
+ (unsigned long long)sector, size);
+
+ if (req->rq_state & RQ_LOCAL_PENDING ||
+ !(req->rq_state & RQ_POSTPONED)) {
+ /*
+ * Wait for the node with the discard flag to
+ * decide if this request has been superseded
+ * or needs to be retried.
+ * Requests that have been superseded will
+ * disappear from the write_requests tree.
+ *
+ * In addition, wait for the conflicting
+ * request to finish locally before submitting
+ * the conflicting peer request.
+ */
+ err = drbd_wait_misc(device, &req->i);
+ if (err) {
+ _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
+ fail_postponed_requests(device, sector, size);
+ goto out;
+ }
+ goto repeat;
+ }
+ /*
+ * Remember to restart the conflicting requests after
+ * the new peer request has completed.
+ */
+ peer_req->flags |= EE_RESTART_REQUESTS;
+ }
+ }
+ err = 0;
+
+ out:
+ if (err)
+ drbd_remove_epoch_entry_interval(device, peer_req);
+ return err;
+}
+
+/* mirrored write */
+static int receive_Data(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct net_conf *nc;
+ sector_t sector;
+ struct drbd_peer_request *peer_req;
+ struct p_data *p = pi->data;
+ u32 peer_seq = be32_to_cpu(p->seq_num);
+ int rw = WRITE;
+ u32 dp_flags;
+ int err, tp;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ if (!get_ldev(device)) {
+ int err2;
+
+ err = wait_for_and_update_peer_seq(peer_device, peer_seq);
+ drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
+ atomic_inc(&connection->current_epoch->epoch_size);
+ err2 = drbd_drain_block(peer_device, pi->size);
+ if (!err)
+ err = err2;
+ return err;
+ }
+
+ /*
+ * Corresponding put_ldev done either below (on various errors), or in
+ * drbd_peer_request_endio, if we successfully submit the data at the
+ * end of this function.
+ */
+
+ sector = be64_to_cpu(p->sector);
+ peer_req = read_in_block(peer_device, p->block_id, sector, pi);
+ if (!peer_req) {
+ put_ldev(device);
+ return -EIO;
+ }
+
+ peer_req->w.cb = e_end_block;
+ peer_req->submit_jif = jiffies;
+ peer_req->flags |= EE_APPLICATION;
+
+ dp_flags = be32_to_cpu(p->dp_flags);
+ rw |= wire_flags_to_bio(dp_flags);
+ if (pi->cmd == P_TRIM) {
+ struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
+ peer_req->flags |= EE_IS_TRIM;
+ if (!blk_queue_discard(q))
+ peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT;
+ D_ASSERT(peer_device, peer_req->i.size > 0);
+ D_ASSERT(peer_device, rw & REQ_DISCARD);
+ D_ASSERT(peer_device, peer_req->pages == NULL);
+ } else if (peer_req->pages == NULL) {
+ D_ASSERT(device, peer_req->i.size == 0);
+ D_ASSERT(device, dp_flags & DP_FLUSH);
+ }
+
+ if (dp_flags & DP_MAY_SET_IN_SYNC)
+ peer_req->flags |= EE_MAY_SET_IN_SYNC;
+
+ spin_lock(&connection->epoch_lock);
+ peer_req->epoch = connection->current_epoch;
+ atomic_inc(&peer_req->epoch->epoch_size);
+ atomic_inc(&peer_req->epoch->active);
+ spin_unlock(&connection->epoch_lock);
+
+ rcu_read_lock();
+ nc = rcu_dereference(peer_device->connection->net_conf);
+ tp = nc->two_primaries;
+ if (peer_device->connection->agreed_pro_version < 100) {
+ switch (nc->wire_protocol) {
+ case DRBD_PROT_C:
+ dp_flags |= DP_SEND_WRITE_ACK;
+ break;
+ case DRBD_PROT_B:
+ dp_flags |= DP_SEND_RECEIVE_ACK;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (dp_flags & DP_SEND_WRITE_ACK) {
+ peer_req->flags |= EE_SEND_WRITE_ACK;
+ inc_unacked(device);
+ /* corresponding dec_unacked() in e_end_block()
+ * respective _drbd_clear_done_ee */
+ }
+
+ if (dp_flags & DP_SEND_RECEIVE_ACK) {
+ /* I really don't like it that the receiver thread
+ * sends on the msock, but anyways */
+ drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req);
+ }
+
+ if (tp) {
+ /* two primaries implies protocol C */
+ D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
+ peer_req->flags |= EE_IN_INTERVAL_TREE;
+ err = wait_for_and_update_peer_seq(peer_device, peer_seq);
+ if (err)
+ goto out_interrupted;
+ spin_lock_irq(&device->resource->req_lock);
+ err = handle_write_conflicts(device, peer_req);
+ if (err) {
+ spin_unlock_irq(&device->resource->req_lock);
+ if (err == -ENOENT) {
+ put_ldev(device);
+ return 0;
+ }
+ goto out_interrupted;
+ }
+ } else {
+ update_peer_seq(peer_device, peer_seq);
+ spin_lock_irq(&device->resource->req_lock);
+ }
+ /* if we use the zeroout fallback code, we process synchronously
+ * and we wait for all pending requests, respectively wait for
+ * active_ee to become empty in drbd_submit_peer_request();
+ * better not add ourselves here. */
+ if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0)
+ list_add_tail(&peer_req->w.list, &device->active_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ if (device->state.conn == C_SYNC_TARGET)
+ wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));
+
+ if (device->state.pdsk < D_INCONSISTENT) {
+ /* In case we have the only disk of the cluster, */
+ drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
+ peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
+ drbd_al_begin_io(device, &peer_req->i);
+ peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
+ }
+
+ err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR);
+ if (!err)
+ return 0;
+
+ /* don't care for the reason here */
+ drbd_err(device, "submit failed, triggering re-connect\n");
+ spin_lock_irq(&device->resource->req_lock);
+ list_del(&peer_req->w.list);
+ drbd_remove_epoch_entry_interval(device, peer_req);
+ spin_unlock_irq(&device->resource->req_lock);
+ if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) {
+ peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
+ drbd_al_complete_io(device, &peer_req->i);
+ }
+
+out_interrupted:
+ drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP);
+ put_ldev(device);
+ drbd_free_peer_req(device, peer_req);
+ return err;
+}
+
+/* We may throttle resync, if the lower device seems to be busy,
+ * and current sync rate is above c_min_rate.
+ *
+ * To decide whether or not the lower device is busy, we use a scheme similar
+ * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
+ * (more than 64 sectors) of activity we cannot account for with our own resync
+ * activity, it obviously is "busy".
+ *
+ * The current sync rate used here uses only the most recent two step marks,
+ * to have a short time average so we can react faster.
+ */
+bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
+ bool throttle_if_app_is_waiting)
+{
+ struct lc_element *tmp;
+ bool throttle = drbd_rs_c_min_rate_throttle(device);
+
+ if (!throttle || throttle_if_app_is_waiting)
+ return throttle;
+
+ spin_lock_irq(&device->al_lock);
+ tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
+ if (tmp) {
+ struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+ if (test_bit(BME_PRIORITY, &bm_ext->flags))
+ throttle = false;
+ /* Do not slow down if app IO is already waiting for this extent,
+ * and our progress is necessary for application IO to complete. */
+ }
+ spin_unlock_irq(&device->al_lock);
+
+ return throttle;
+}
+
+bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
+{
+ struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
+ unsigned long db, dt, dbdt;
+ unsigned int c_min_rate;
+ int curr_events;
+
+ rcu_read_lock();
+ c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
+ rcu_read_unlock();
+
+ /* feature disabled? */
+ if (c_min_rate == 0)
+ return false;
+
+ curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
+ (int)part_stat_read(&disk->part0, sectors[1]) -
+ atomic_read(&device->rs_sect_ev);
+
+ if (atomic_read(&device->ap_actlog_cnt)
+ || curr_events - device->rs_last_events > 64) {
+ unsigned long rs_left;
+ int i;
+
+ device->rs_last_events = curr_events;
+
+ /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
+ * approx. */
+ i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
+
+ if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+ rs_left = device->ov_left;
+ else
+ rs_left = drbd_bm_total_weight(device) - device->rs_failed;
+
+ dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ;
+ if (!dt)
+ dt++;
+ db = device->rs_mark_left[i] - rs_left;
+ dbdt = Bit2KB(db/dt);
+
+ if (dbdt > c_min_rate)
+ return true;
+ }
+ return false;
+}
+
+static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ sector_t sector;
+ sector_t capacity;
+ struct drbd_peer_request *peer_req;
+ struct digest_info *di = NULL;
+ int size, verb;
+ unsigned int fault_type;
+ struct p_block_req *p = pi->data;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+ capacity = drbd_get_capacity(device->this_bdev);
+
+ sector = be64_to_cpu(p->sector);
+ size = be32_to_cpu(p->blksize);
+
+ if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
+ drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+ (unsigned long long)sector, size);
+ return -EINVAL;
+ }
+ if (sector + (size>>9) > capacity) {
+ drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+ (unsigned long long)sector, size);
+ return -EINVAL;
+ }
+
+ if (!get_ldev_if_state(device, D_UP_TO_DATE)) {
+ verb = 1;
+ switch (pi->cmd) {
+ case P_DATA_REQUEST:
+ drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
+ break;
+ case P_RS_DATA_REQUEST:
+ case P_CSUM_RS_REQUEST:
+ case P_OV_REQUEST:
+ drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY , p);
+ break;
+ case P_OV_REPLY:
+ verb = 0;
+ dec_rs_pending(device);
+ drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC);
+ break;
+ default:
+ BUG();
+ }
+ if (verb && __ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "Can not satisfy peer's read request, "
+ "no local data.\n");
+
+ /* drain possibly payload */
+ return drbd_drain_block(peer_device, pi->size);
+ }
+
+ /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
+ * "criss-cross" setup, that might cause write-out on some other DRBD,
+ * which in turn might block on the other node at this very place. */
+ peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
+ true /* has real payload */, GFP_NOIO);
+ if (!peer_req) {
+ put_ldev(device);
+ return -ENOMEM;
+ }
+
+ switch (pi->cmd) {
+ case P_DATA_REQUEST:
+ peer_req->w.cb = w_e_end_data_req;
+ fault_type = DRBD_FAULT_DT_RD;
+ /* application IO, don't drbd_rs_begin_io */
+ peer_req->flags |= EE_APPLICATION;
+ goto submit;
+
+ case P_RS_DATA_REQUEST:
+ peer_req->w.cb = w_e_end_rsdata_req;
+ fault_type = DRBD_FAULT_RS_RD;
+ /* used in the sector offset progress display */
+ device->bm_resync_fo = BM_SECT_TO_BIT(sector);
+ break;
+
+ case P_OV_REPLY:
+ case P_CSUM_RS_REQUEST:
+ fault_type = DRBD_FAULT_RS_RD;
+ di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
+ if (!di)
+ goto out_free_e;
+
+ di->digest_size = pi->size;
+ di->digest = (((char *)di)+sizeof(struct digest_info));
+
+ peer_req->digest = di;
+ peer_req->flags |= EE_HAS_DIGEST;
+
+ if (drbd_recv_all(peer_device->connection, di->digest, pi->size))
+ goto out_free_e;
+
+ if (pi->cmd == P_CSUM_RS_REQUEST) {
+ D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
+ peer_req->w.cb = w_e_end_csum_rs_req;
+ /* used in the sector offset progress display */
+ device->bm_resync_fo = BM_SECT_TO_BIT(sector);
+ /* remember to report stats in drbd_resync_finished */
+ device->use_csums = true;
+ } else if (pi->cmd == P_OV_REPLY) {
+ /* track progress, we may need to throttle */
+ atomic_add(size >> 9, &device->rs_sect_in);
+ peer_req->w.cb = w_e_end_ov_reply;
+ dec_rs_pending(device);
+ /* drbd_rs_begin_io done when we sent this request,
+ * but accounting still needs to be done. */
+ goto submit_for_resync;
+ }
+ break;
+
+ case P_OV_REQUEST:
+ if (device->ov_start_sector == ~(sector_t)0 &&
+ peer_device->connection->agreed_pro_version >= 90) {
+ unsigned long now = jiffies;
+ int i;
+ device->ov_start_sector = sector;
+ device->ov_position = sector;
+ device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector);
+ device->rs_total = device->ov_left;
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ device->rs_mark_left[i] = device->ov_left;
+ device->rs_mark_time[i] = now;
+ }
+ drbd_info(device, "Online Verify start sector: %llu\n",
+ (unsigned long long)sector);
+ }
+ peer_req->w.cb = w_e_end_ov_req;
+ fault_type = DRBD_FAULT_RS_RD;
+ break;
+
+ default:
+ BUG();
+ }
+
+ /* Throttle, drbd_rs_begin_io and submit should become asynchronous
+ * wrt the receiver, but it is not as straightforward as it may seem.
+ * Various places in the resync start and stop logic assume resync
+ * requests are processed in order, requeuing this on the worker thread
+ * introduces a bunch of new code for synchronization between threads.
+ *
+ * Unlimited throttling before drbd_rs_begin_io may stall the resync
+ * "forever", throttling after drbd_rs_begin_io will lock that extent
+ * for application writes for the same time. For now, just throttle
+ * here, where the rest of the code expects the receiver to sleep for
+ * a while, anyways.
+ */
+
+ /* Throttle before drbd_rs_begin_io, as that locks out application IO;
+ * this defers syncer requests for some time, before letting at least
+ * on request through. The resync controller on the receiving side
+ * will adapt to the incoming rate accordingly.
+ *
+ * We cannot throttle here if remote is Primary/SyncTarget:
+ * we would also throttle its application reads.
+ * In that case, throttling is done on the SyncTarget only.
+ */
+
+ /* Even though this may be a resync request, we do add to "read_ee";
+ * "sync_ee" is only used for resync WRITEs.
+ * Add to list early, so debugfs can find this request
+ * even if we have to sleep below. */
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&peer_req->w.list, &device->read_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ update_receiver_timing_details(connection, drbd_rs_should_slow_down);
+ if (device->state.peer != R_PRIMARY
+ && drbd_rs_should_slow_down(device, sector, false))
+ schedule_timeout_uninterruptible(HZ/10);
+ update_receiver_timing_details(connection, drbd_rs_begin_io);
+ if (drbd_rs_begin_io(device, sector))
+ goto out_free_e;
+
+submit_for_resync:
+ atomic_add(size >> 9, &device->rs_sect_ev);
+
+submit:
+ update_receiver_timing_details(connection, drbd_submit_peer_request);
+ inc_unacked(device);
+ if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0)
+ return 0;
+
+ /* don't care for the reason here */
+ drbd_err(device, "submit failed, triggering re-connect\n");
+
+out_free_e:
+ spin_lock_irq(&device->resource->req_lock);
+ list_del(&peer_req->w.list);
+ spin_unlock_irq(&device->resource->req_lock);
+ /* no drbd_rs_complete_io(), we are dropping the connection anyways */
+
+ put_ldev(device);
+ drbd_free_peer_req(device, peer_req);
+ return -EIO;
+}
+
+/**
+ * drbd_asb_recover_0p - Recover after split-brain with no remaining primaries
+ */
+static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local)
+{
+ struct drbd_device *device = peer_device->device;
+ int self, peer, rv = -100;
+ unsigned long ch_self, ch_peer;
+ enum drbd_after_sb_p after_sb_0p;
+
+ self = device->ldev->md.uuid[UI_BITMAP] & 1;
+ peer = device->p_uuid[UI_BITMAP] & 1;
+
+ ch_peer = device->p_uuid[UI_SIZE];
+ ch_self = device->comm_bm_set;
+
+ rcu_read_lock();
+ after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p;
+ rcu_read_unlock();
+ switch (after_sb_0p) {
+ case ASB_CONSENSUS:
+ case ASB_DISCARD_SECONDARY:
+ case ASB_CALL_HELPER:
+ case ASB_VIOLENTLY:
+ drbd_err(device, "Configuration error.\n");
+ break;
+ case ASB_DISCONNECT:
+ break;
+ case ASB_DISCARD_YOUNGER_PRI:
+ if (self == 0 && peer == 1) {
+ rv = -1;
+ break;
+ }
+ if (self == 1 && peer == 0) {
+ rv = 1;
+ break;
+ }
+ /* Else fall through to one of the other strategies... */
+ case ASB_DISCARD_OLDER_PRI:
+ if (self == 0 && peer == 1) {
+ rv = 1;
+ break;
+ }
+ if (self == 1 && peer == 0) {
+ rv = -1;
+ break;
+ }
+ /* Else fall through to one of the other strategies... */
+ drbd_warn(device, "Discard younger/older primary did not find a decision\n"
+ "Using discard-least-changes instead\n");
+ case ASB_DISCARD_ZERO_CHG:
+ if (ch_peer == 0 && ch_self == 0) {
+ rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
+ ? -1 : 1;
+ break;
+ } else {
+ if (ch_peer == 0) { rv = 1; break; }
+ if (ch_self == 0) { rv = -1; break; }
+ }
+ if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
+ break;
+ case ASB_DISCARD_LEAST_CHG:
+ if (ch_self < ch_peer)
+ rv = -1;
+ else if (ch_self > ch_peer)
+ rv = 1;
+ else /* ( ch_self == ch_peer ) */
+ /* Well, then use something else. */
+ rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
+ ? -1 : 1;
+ break;
+ case ASB_DISCARD_LOCAL:
+ rv = -1;
+ break;
+ case ASB_DISCARD_REMOTE:
+ rv = 1;
+ }
+
+ return rv;
+}
+
+/**
+ * drbd_asb_recover_1p - Recover after split-brain with one remaining primary
+ */
+static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local)
+{
+ struct drbd_device *device = peer_device->device;
+ int hg, rv = -100;
+ enum drbd_after_sb_p after_sb_1p;
+
+ rcu_read_lock();
+ after_sb_1p = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p;
+ rcu_read_unlock();
+ switch (after_sb_1p) {
+ case ASB_DISCARD_YOUNGER_PRI:
+ case ASB_DISCARD_OLDER_PRI:
+ case ASB_DISCARD_LEAST_CHG:
+ case ASB_DISCARD_LOCAL:
+ case ASB_DISCARD_REMOTE:
+ case ASB_DISCARD_ZERO_CHG:
+ drbd_err(device, "Configuration error.\n");
+ break;
+ case ASB_DISCONNECT:
+ break;
+ case ASB_CONSENSUS:
+ hg = drbd_asb_recover_0p(peer_device);
+ if (hg == -1 && device->state.role == R_SECONDARY)
+ rv = hg;
+ if (hg == 1 && device->state.role == R_PRIMARY)
+ rv = hg;
+ break;
+ case ASB_VIOLENTLY:
+ rv = drbd_asb_recover_0p(peer_device);
+ break;
+ case ASB_DISCARD_SECONDARY:
+ return device->state.role == R_PRIMARY ? 1 : -1;
+ case ASB_CALL_HELPER:
+ hg = drbd_asb_recover_0p(peer_device);
+ if (hg == -1 && device->state.role == R_PRIMARY) {
+ enum drbd_state_rv rv2;
+
+ /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
+ * we might be here in C_WF_REPORT_PARAMS which is transient.
+ * we do not need to wait for the after state change work either. */
+ rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
+ if (rv2 != SS_SUCCESS) {
+ drbd_khelper(device, "pri-lost-after-sb");
+ } else {
+ drbd_warn(device, "Successfully gave up primary role.\n");
+ rv = hg;
+ }
+ } else
+ rv = hg;
+ }
+
+ return rv;
+}
+
+/**
+ * drbd_asb_recover_2p - Recover after split-brain with two remaining primaries
+ */
+static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local)
+{
+ struct drbd_device *device = peer_device->device;
+ int hg, rv = -100;
+ enum drbd_after_sb_p after_sb_2p;
+
+ rcu_read_lock();
+ after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p;
+ rcu_read_unlock();
+ switch (after_sb_2p) {
+ case ASB_DISCARD_YOUNGER_PRI:
+ case ASB_DISCARD_OLDER_PRI:
+ case ASB_DISCARD_LEAST_CHG:
+ case ASB_DISCARD_LOCAL:
+ case ASB_DISCARD_REMOTE:
+ case ASB_CONSENSUS:
+ case ASB_DISCARD_SECONDARY:
+ case ASB_DISCARD_ZERO_CHG:
+ drbd_err(device, "Configuration error.\n");
+ break;
+ case ASB_VIOLENTLY:
+ rv = drbd_asb_recover_0p(peer_device);
+ break;
+ case ASB_DISCONNECT:
+ break;
+ case ASB_CALL_HELPER:
+ hg = drbd_asb_recover_0p(peer_device);
+ if (hg == -1) {
+ enum drbd_state_rv rv2;
+
+ /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
+ * we might be here in C_WF_REPORT_PARAMS which is transient.
+ * we do not need to wait for the after state change work either. */
+ rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
+ if (rv2 != SS_SUCCESS) {
+ drbd_khelper(device, "pri-lost-after-sb");
+ } else {
+ drbd_warn(device, "Successfully gave up primary role.\n");
+ rv = hg;
+ }
+ } else
+ rv = hg;
+ }
+
+ return rv;
+}
+
+static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
+ u64 bits, u64 flags)
+{
+ if (!uuid) {
+ drbd_info(device, "%s uuid info vanished while I was looking!\n", text);
+ return;
+ }
+ drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
+ text,
+ (unsigned long long)uuid[UI_CURRENT],
+ (unsigned long long)uuid[UI_BITMAP],
+ (unsigned long long)uuid[UI_HISTORY_START],
+ (unsigned long long)uuid[UI_HISTORY_END],
+ (unsigned long long)bits,
+ (unsigned long long)flags);
+}
+
+/*
+ 100 after split brain try auto recover
+ 2 C_SYNC_SOURCE set BitMap
+ 1 C_SYNC_SOURCE use BitMap
+ 0 no Sync
+ -1 C_SYNC_TARGET use BitMap
+ -2 C_SYNC_TARGET set BitMap
+ -100 after split brain, disconnect
+-1000 unrelated data
+-1091 requires proto 91
+-1096 requires proto 96
+ */
+static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local)
+{
+ struct drbd_peer_device *const peer_device = first_peer_device(device);
+ struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
+ u64 self, peer;
+ int i, j;
+
+ self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
+ peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
+
+ *rule_nr = 10;
+ if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
+ return 0;
+
+ *rule_nr = 20;
+ if ((self == UUID_JUST_CREATED || self == (u64)0) &&
+ peer != UUID_JUST_CREATED)
+ return -2;
+
+ *rule_nr = 30;
+ if (self != UUID_JUST_CREATED &&
+ (peer == UUID_JUST_CREATED || peer == (u64)0))
+ return 2;
+
+ if (self == peer) {
+ int rct, dc; /* roles at crash time */
+
+ if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {
+
+ if (connection->agreed_pro_version < 91)
+ return -1091;
+
+ if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
+ (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
+ drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n");
+ drbd_uuid_move_history(device);
+ device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
+ device->ldev->md.uuid[UI_BITMAP] = 0;
+
+ drbd_uuid_dump(device, "self", device->ldev->md.uuid,
+ device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
+ *rule_nr = 34;
+ } else {
+ drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n");
+ *rule_nr = 36;
+ }
+
+ return 1;
+ }
+
+ if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {
+
+ if (connection->agreed_pro_version < 91)
+ return -1091;
+
+ if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
+ (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
+ drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
+
+ device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START];
+ device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP];
+ device->p_uuid[UI_BITMAP] = 0UL;
+
+ drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
+ *rule_nr = 35;
+ } else {
+ drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n");
+ *rule_nr = 37;
+ }
+
+ return -1;
+ }
+
+ /* Common power [off|failure] */
+ rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) +
+ (device->p_uuid[UI_FLAGS] & 2);
+ /* lowest bit is set when we were primary,
+ * next bit (weight 2) is set when peer was primary */
+ *rule_nr = 40;
+
+ switch (rct) {
+ case 0: /* !self_pri && !peer_pri */ return 0;
+ case 1: /* self_pri && !peer_pri */ return 1;
+ case 2: /* !self_pri && peer_pri */ return -1;
+ case 3: /* self_pri && peer_pri */
+ dc = test_bit(RESOLVE_CONFLICTS, &connection->flags);
+ return dc ? -1 : 1;
+ }
+ }
+
+ *rule_nr = 50;
+ peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
+ if (self == peer)
+ return -1;
+
+ *rule_nr = 51;
+ peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
+ if (self == peer) {
+ if (connection->agreed_pro_version < 96 ?
+ (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
+ (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
+ peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
+ /* The last P_SYNC_UUID did not get though. Undo the last start of
+ resync as sync source modifications of the peer's UUIDs. */
+
+ if (connection->agreed_pro_version < 91)
+ return -1091;
+
+ device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
+ device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1];
+
+ drbd_info(device, "Lost last syncUUID packet, corrected:\n");
+ drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
+
+ return -1;
+ }
+ }
+
+ *rule_nr = 60;
+ self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
+ for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+ peer = device->p_uuid[i] & ~((u64)1);
+ if (self == peer)
+ return -2;
+ }
+
+ *rule_nr = 70;
+ self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
+ peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
+ if (self == peer)
+ return 1;
+
+ *rule_nr = 71;
+ self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
+ if (self == peer) {
+ if (connection->agreed_pro_version < 96 ?
+ (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
+ (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
+ self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
+ /* The last P_SYNC_UUID did not get though. Undo the last start of
+ resync as sync source modifications of our UUIDs. */
+
+ if (connection->agreed_pro_version < 91)
+ return -1091;
+
+ __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
+ __drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]);
+
+ drbd_info(device, "Last syncUUID did not get through, corrected:\n");
+ drbd_uuid_dump(device, "self", device->ldev->md.uuid,
+ device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
+
+ return 1;
+ }
+ }
+
+
+ *rule_nr = 80;
+ peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
+ for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+ self = device->ldev->md.uuid[i] & ~((u64)1);
+ if (self == peer)
+ return 2;
+ }
+
+ *rule_nr = 90;
+ self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
+ peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
+ if (self == peer && self != ((u64)0))
+ return 100;
+
+ *rule_nr = 100;
+ for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+ self = device->ldev->md.uuid[i] & ~((u64)1);
+ for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
+ peer = device->p_uuid[j] & ~((u64)1);
+ if (self == peer)
+ return -100;
+ }
+ }
+
+ return -1000;
+}
+
+/* drbd_sync_handshake() returns the new conn state on success, or
+ CONN_MASK (-1) on failure.
+ */
+static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
+ enum drbd_role peer_role,
+ enum drbd_disk_state peer_disk) __must_hold(local)
+{
+ struct drbd_device *device = peer_device->device;
+ enum drbd_conns rv = C_MASK;
+ enum drbd_disk_state mydisk;
+ struct net_conf *nc;
+ int hg, rule_nr, rr_conflict, tentative;
+
+ mydisk = device->state.disk;
+ if (mydisk == D_NEGOTIATING)
+ mydisk = device->new_state_tmp.disk;
+
+ drbd_info(device, "drbd_sync_handshake:\n");
+
+ spin_lock_irq(&device->ldev->md.uuid_lock);
+ drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0);
+ drbd_uuid_dump(device, "peer", device->p_uuid,
+ device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
+
+ hg = drbd_uuid_compare(device, &rule_nr);
+ spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+ drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
+
+ if (hg == -1000) {
+ drbd_alert(device, "Unrelated data, aborting!\n");
+ return C_MASK;
+ }
+ if (hg < -1000) {
+ drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
+ return C_MASK;
+ }
+
+ if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
+ (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) {
+ int f = (hg == -100) || abs(hg) == 2;
+ hg = mydisk > D_INCONSISTENT ? 1 : -1;
+ if (f)
+ hg = hg*2;
+ drbd_info(device, "Becoming sync %s due to disk states.\n",
+ hg > 0 ? "source" : "target");
+ }
+
+ if (abs(hg) == 100)
+ drbd_khelper(device, "initial-split-brain");
+
+ rcu_read_lock();
+ nc = rcu_dereference(peer_device->connection->net_conf);
+
+ if (hg == 100 || (hg == -100 && nc->always_asbp)) {
+ int pcount = (device->state.role == R_PRIMARY)
+ + (peer_role == R_PRIMARY);
+ int forced = (hg == -100);
+
+ switch (pcount) {
+ case 0:
+ hg = drbd_asb_recover_0p(peer_device);
+ break;
+ case 1:
+ hg = drbd_asb_recover_1p(peer_device);
+ break;
+ case 2:
+ hg = drbd_asb_recover_2p(peer_device);
+ break;
+ }
+ if (abs(hg) < 100) {
+ drbd_warn(device, "Split-Brain detected, %d primaries, "
+ "automatically solved. Sync from %s node\n",
+ pcount, (hg < 0) ? "peer" : "this");
+ if (forced) {
+ drbd_warn(device, "Doing a full sync, since"
+ " UUIDs where ambiguous.\n");
+ hg = hg*2;
+ }
+ }
+ }
+
+ if (hg == -100) {
+ if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1))
+ hg = -1;
+ if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1))
+ hg = 1;
+
+ if (abs(hg) < 100)
+ drbd_warn(device, "Split-Brain detected, manually solved. "
+ "Sync from %s node\n",
+ (hg < 0) ? "peer" : "this");
+ }
+ rr_conflict = nc->rr_conflict;
+ tentative = nc->tentative;
+ rcu_read_unlock();
+
+ if (hg == -100) {
+ /* FIXME this log message is not correct if we end up here
+ * after an attempted attach on a diskless node.
+ * We just refuse to attach -- well, we drop the "connection"
+ * to that disk, in a way... */
+ drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n");
+ drbd_khelper(device, "split-brain");
+ return C_MASK;
+ }
+
+ if (hg > 0 && mydisk <= D_INCONSISTENT) {
+ drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n");
+ return C_MASK;
+ }
+
+ if (hg < 0 && /* by intention we do not use mydisk here. */
+ device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) {
+ switch (rr_conflict) {
+ case ASB_CALL_HELPER:
+ drbd_khelper(device, "pri-lost");
+ /* fall through */
+ case ASB_DISCONNECT:
+ drbd_err(device, "I shall become SyncTarget, but I am primary!\n");
+ return C_MASK;
+ case ASB_VIOLENTLY:
+ drbd_warn(device, "Becoming SyncTarget, violating the stable-data"
+ "assumption\n");
+ }
+ }
+
+ if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) {
+ if (hg == 0)
+ drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n");
+ else
+ drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.",
+ drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
+ abs(hg) >= 2 ? "full" : "bit-map based");
+ return C_MASK;
+ }
+
+ if (abs(hg) >= 2) {
+ drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
+ if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
+ BM_LOCKED_SET_ALLOWED))
+ return C_MASK;
+ }
+
+ if (hg > 0) { /* become sync source. */
+ rv = C_WF_BITMAP_S;
+ } else if (hg < 0) { /* become sync target */
+ rv = C_WF_BITMAP_T;
+ } else {
+ rv = C_CONNECTED;
+ if (drbd_bm_total_weight(device)) {
+ drbd_info(device, "No resync, but %lu bits in bitmap!\n",
+ drbd_bm_total_weight(device));
+ }
+ }
+
+ return rv;
+}
+
+static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
+{
+ /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
+ if (peer == ASB_DISCARD_REMOTE)
+ return ASB_DISCARD_LOCAL;
+
+ /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
+ if (peer == ASB_DISCARD_LOCAL)
+ return ASB_DISCARD_REMOTE;
+
+ /* everything else is valid if they are equal on both sides. */
+ return peer;
+}
+
+static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct p_protocol *p = pi->data;
+ enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
+ int p_proto, p_discard_my_data, p_two_primaries, cf;
+ struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
+ char integrity_alg[SHARED_SECRET_MAX] = "";
+ struct crypto_hash *peer_integrity_tfm = NULL;
+ void *int_dig_in = NULL, *int_dig_vv = NULL;
+
+ p_proto = be32_to_cpu(p->protocol);
+ p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
+ p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
+ p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
+ p_two_primaries = be32_to_cpu(p->two_primaries);
+ cf = be32_to_cpu(p->conn_flags);
+ p_discard_my_data = cf & CF_DISCARD_MY_DATA;
+
+ if (connection->agreed_pro_version >= 87) {
+ int err;
+
+ if (pi->size > sizeof(integrity_alg))
+ return -EIO;
+ err = drbd_recv_all(connection, integrity_alg, pi->size);
+ if (err)
+ return err;
+ integrity_alg[SHARED_SECRET_MAX - 1] = 0;
+ }
+
+ if (pi->cmd != P_PROTOCOL_UPDATE) {
+ clear_bit(CONN_DRY_RUN, &connection->flags);
+
+ if (cf & CF_DRY_RUN)
+ set_bit(CONN_DRY_RUN, &connection->flags);
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+
+ if (p_proto != nc->wire_protocol) {
+ drbd_err(connection, "incompatible %s settings\n", "protocol");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
+ drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
+ drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
+ drbd_err(connection, "incompatible %s settings\n", "after-sb-2pri");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (p_discard_my_data && nc->discard_my_data) {
+ drbd_err(connection, "incompatible %s settings\n", "discard-my-data");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (p_two_primaries != nc->two_primaries) {
+ drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (strcmp(integrity_alg, nc->integrity_alg)) {
+ drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg");
+ goto disconnect_rcu_unlock;
+ }
+
+ rcu_read_unlock();
+ }
+
+ if (integrity_alg[0]) {
+ int hash_size;
+
+ /*
+ * We can only change the peer data integrity algorithm
+ * here. Changing our own data integrity algorithm
+ * requires that we send a P_PROTOCOL_UPDATE packet at
+ * the same time; otherwise, the peer has no way to
+ * tell between which packets the algorithm should
+ * change.
+ */
+
+ peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
+ if (!peer_integrity_tfm) {
+ drbd_err(connection, "peer data-integrity-alg %s not supported\n",
+ integrity_alg);
+ goto disconnect;
+ }
+
+ hash_size = crypto_hash_digestsize(peer_integrity_tfm);
+ int_dig_in = kmalloc(hash_size, GFP_KERNEL);
+ int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
+ if (!(int_dig_in && int_dig_vv)) {
+ drbd_err(connection, "Allocation of buffers for data integrity checking failed\n");
+ goto disconnect;
+ }
+ }
+
+ new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
+ if (!new_net_conf) {
+ drbd_err(connection, "Allocation of new net_conf failed\n");
+ goto disconnect;
+ }
+
+ mutex_lock(&connection->data.mutex);
+ mutex_lock(&connection->resource->conf_update);
+ old_net_conf = connection->net_conf;
+ *new_net_conf = *old_net_conf;
+
+ new_net_conf->wire_protocol = p_proto;
+ new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
+ new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
+ new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
+ new_net_conf->two_primaries = p_two_primaries;
+
+ rcu_assign_pointer(connection->net_conf, new_net_conf);
+ mutex_unlock(&connection->resource->conf_update);
+ mutex_unlock(&connection->data.mutex);
+
+ crypto_free_hash(connection->peer_integrity_tfm);
+ kfree(connection->int_dig_in);
+ kfree(connection->int_dig_vv);
+ connection->peer_integrity_tfm = peer_integrity_tfm;
+ connection->int_dig_in = int_dig_in;
+ connection->int_dig_vv = int_dig_vv;
+
+ if (strcmp(old_net_conf->integrity_alg, integrity_alg))
+ drbd_info(connection, "peer data-integrity-alg: %s\n",
+ integrity_alg[0] ? integrity_alg : "(none)");
+
+ synchronize_rcu();
+ kfree(old_net_conf);
+ return 0;
+
+disconnect_rcu_unlock:
+ rcu_read_unlock();
+disconnect:
+ crypto_free_hash(peer_integrity_tfm);
+ kfree(int_dig_in);
+ kfree(int_dig_vv);
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
+}
+
+/* helper function
+ * input: alg name, feature name
+ * return: NULL (alg name was "")
+ * ERR_PTR(error) if something goes wrong
+ * or the crypto hash ptr, if it worked out ok. */
+static struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_device *device,
+ const char *alg, const char *name)
+{
+ struct crypto_hash *tfm;
+
+ if (!alg[0])
+ return NULL;
+
+ tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm)) {
+ drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n",
+ alg, name, PTR_ERR(tfm));
+ return tfm;
+ }
+ return tfm;
+}
+
+static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi)
+{
+ void *buffer = connection->data.rbuf;
+ int size = pi->size;
+
+ while (size) {
+ int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
+ s = drbd_recv(connection, buffer, s);
+ if (s <= 0) {
+ if (s < 0)
+ return s;
+ break;
+ }
+ size -= s;
+ }
+ if (size)
+ return -EIO;
+ return 0;
+}
+
+/*
+ * config_unknown_volume - device configuration command for unknown volume
+ *
+ * When a device is added to an existing connection, the node on which the
+ * device is added first will send configuration commands to its peer but the
+ * peer will not know about the device yet. It will warn and ignore these
+ * commands. Once the device is added on the second node, the second node will
+ * send the same device configuration commands, but in the other direction.
+ *
+ * (We can also end up here if drbd is misconfigured.)
+ */
+static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi)
+{
+ drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n",
+ cmdname(pi->cmd), pi->vnr);
+ return ignore_remaining_packet(connection, pi);
+}
+
+static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_rs_param_95 *p;
+ unsigned int header_size, data_size, exp_max_sz;
+ struct crypto_hash *verify_tfm = NULL;
+ struct crypto_hash *csums_tfm = NULL;
+ struct net_conf *old_net_conf, *new_net_conf = NULL;
+ struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
+ const int apv = connection->agreed_pro_version;
+ struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
+ int fifo_size = 0;
+ int err;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return config_unknown_volume(connection, pi);
+ device = peer_device->device;
+
+ exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
+ : apv == 88 ? sizeof(struct p_rs_param)
+ + SHARED_SECRET_MAX
+ : apv <= 94 ? sizeof(struct p_rs_param_89)
+ : /* apv >= 95 */ sizeof(struct p_rs_param_95);
+
+ if (pi->size > exp_max_sz) {
+ drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n",
+ pi->size, exp_max_sz);
+ return -EIO;
+ }
+
+ if (apv <= 88) {
+ header_size = sizeof(struct p_rs_param);
+ data_size = pi->size - header_size;
+ } else if (apv <= 94) {
+ header_size = sizeof(struct p_rs_param_89);
+ data_size = pi->size - header_size;
+ D_ASSERT(device, data_size == 0);
+ } else {
+ header_size = sizeof(struct p_rs_param_95);
+ data_size = pi->size - header_size;
+ D_ASSERT(device, data_size == 0);
+ }
+
+ /* initialize verify_alg and csums_alg */
+ p = pi->data;
+ memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
+
+ err = drbd_recv_all(peer_device->connection, p, header_size);
+ if (err)
+ return err;
+
+ mutex_lock(&connection->resource->conf_update);
+ old_net_conf = peer_device->connection->net_conf;
+ if (get_ldev(device)) {
+ new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ put_ldev(device);
+ mutex_unlock(&connection->resource->conf_update);
+ drbd_err(device, "Allocation of new disk_conf failed\n");
+ return -ENOMEM;
+ }
+
+ old_disk_conf = device->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+
+ new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
+ }
+
+ if (apv >= 88) {
+ if (apv == 88) {
+ if (data_size > SHARED_SECRET_MAX || data_size == 0) {
+ drbd_err(device, "verify-alg of wrong size, "
+ "peer wants %u, accepting only up to %u byte\n",
+ data_size, SHARED_SECRET_MAX);
+ err = -EIO;
+ goto reconnect;
+ }
+
+ err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size);
+ if (err)
+ goto reconnect;
+ /* we expect NUL terminated string */
+ /* but just in case someone tries to be evil */
+ D_ASSERT(device, p->verify_alg[data_size-1] == 0);
+ p->verify_alg[data_size-1] = 0;
+
+ } else /* apv >= 89 */ {
+ /* we still expect NUL terminated strings */
+ /* but just in case someone tries to be evil */
+ D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0);
+ D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0);
+ p->verify_alg[SHARED_SECRET_MAX-1] = 0;
+ p->csums_alg[SHARED_SECRET_MAX-1] = 0;
+ }
+
+ if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
+ if (device->state.conn == C_WF_REPORT_PARAMS) {
+ drbd_err(device, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
+ old_net_conf->verify_alg, p->verify_alg);
+ goto disconnect;
+ }
+ verify_tfm = drbd_crypto_alloc_digest_safe(device,
+ p->verify_alg, "verify-alg");
+ if (IS_ERR(verify_tfm)) {
+ verify_tfm = NULL;
+ goto disconnect;
+ }
+ }
+
+ if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
+ if (device->state.conn == C_WF_REPORT_PARAMS) {
+ drbd_err(device, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
+ old_net_conf->csums_alg, p->csums_alg);
+ goto disconnect;
+ }
+ csums_tfm = drbd_crypto_alloc_digest_safe(device,
+ p->csums_alg, "csums-alg");
+ if (IS_ERR(csums_tfm)) {
+ csums_tfm = NULL;
+ goto disconnect;
+ }
+ }
+
+ if (apv > 94 && new_disk_conf) {
+ new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
+ new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
+ new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
+ new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
+
+ fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+ if (fifo_size != device->rs_plan_s->size) {
+ new_plan = fifo_alloc(fifo_size);
+ if (!new_plan) {
+ drbd_err(device, "kmalloc of fifo_buffer failed");
+ put_ldev(device);
+ goto disconnect;
+ }
+ }
+ }
+
+ if (verify_tfm || csums_tfm) {
+ new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
+ if (!new_net_conf) {
+ drbd_err(device, "Allocation of new net_conf failed\n");
+ goto disconnect;
+ }
+
+ *new_net_conf = *old_net_conf;
+
+ if (verify_tfm) {
+ strcpy(new_net_conf->verify_alg, p->verify_alg);
+ new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
+ crypto_free_hash(peer_device->connection->verify_tfm);
+ peer_device->connection->verify_tfm = verify_tfm;
+ drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg);
+ }
+ if (csums_tfm) {
+ strcpy(new_net_conf->csums_alg, p->csums_alg);
+ new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
+ crypto_free_hash(peer_device->connection->csums_tfm);
+ peer_device->connection->csums_tfm = csums_tfm;
+ drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg);
+ }
+ rcu_assign_pointer(connection->net_conf, new_net_conf);
+ }
+ }
+
+ if (new_disk_conf) {
+ rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+ put_ldev(device);
+ }
+
+ if (new_plan) {
+ old_plan = device->rs_plan_s;
+ rcu_assign_pointer(device->rs_plan_s, new_plan);
+ }
+
+ mutex_unlock(&connection->resource->conf_update);
+ synchronize_rcu();
+ if (new_net_conf)
+ kfree(old_net_conf);
+ kfree(old_disk_conf);
+ kfree(old_plan);
+
+ return 0;
+
+reconnect:
+ if (new_disk_conf) {
+ put_ldev(device);
+ kfree(new_disk_conf);
+ }
+ mutex_unlock(&connection->resource->conf_update);
+ return -EIO;
+
+disconnect:
+ kfree(new_plan);
+ if (new_disk_conf) {
+ put_ldev(device);
+ kfree(new_disk_conf);
+ }
+ mutex_unlock(&connection->resource->conf_update);
+ /* just for completeness: actually not needed,
+ * as this is not reached if csums_tfm was ok. */
+ crypto_free_hash(csums_tfm);
+ /* but free the verify_tfm again, if csums_tfm did not work out */
+ crypto_free_hash(verify_tfm);
+ conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
+}
+
+/* warn if the arguments differ by more than 12.5% */
+static void warn_if_differ_considerably(struct drbd_device *device,
+ const char *s, sector_t a, sector_t b)
+{
+ sector_t d;
+ if (a == 0 || b == 0)
+ return;
+ d = (a > b) ? (a - b) : (b - a);
+ if (d > (a>>3) || d > (b>>3))
+ drbd_warn(device, "Considerable difference in %s: %llus vs. %llus\n", s,
+ (unsigned long long)a, (unsigned long long)b);
+}
+
+static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_sizes *p = pi->data;
+ enum determine_dev_size dd = DS_UNCHANGED;
+ sector_t p_size, p_usize, p_csize, my_usize;
+ int ldsc = 0; /* local disk size changed */
+ enum dds_flags ddsf;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return config_unknown_volume(connection, pi);
+ device = peer_device->device;
+
+ p_size = be64_to_cpu(p->d_size);
+ p_usize = be64_to_cpu(p->u_size);
+ p_csize = be64_to_cpu(p->c_size);
+
+ /* just store the peer's disk size for now.
+ * we still need to figure out whether we accept that. */
+ device->p_size = p_size;
+
+ if (get_ldev(device)) {
+ rcu_read_lock();
+ my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
+
+ warn_if_differ_considerably(device, "lower level device sizes",
+ p_size, drbd_get_max_capacity(device->ldev));
+ warn_if_differ_considerably(device, "user requested size",
+ p_usize, my_usize);
+
+ /* if this is the first connect, or an otherwise expected
+ * param exchange, choose the minimum */
+ if (device->state.conn == C_WF_REPORT_PARAMS)
+ p_usize = min_not_zero(my_usize, p_usize);
+
+ /* Never shrink a device with usable data during connect.
+ But allow online shrinking if we are connected. */
+ if (drbd_new_dev_size(device, device->ldev, p_usize, 0) <
+ drbd_get_capacity(device->this_bdev) &&
+ device->state.disk >= D_OUTDATED &&
+ device->state.conn < C_CONNECTED) {
+ drbd_err(device, "The peer's disk size is too small!\n");
+ conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ put_ldev(device);
+ return -EIO;
+ }
+
+ if (my_usize != p_usize) {
+ struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
+
+ new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ drbd_err(device, "Allocation of new disk_conf failed\n");
+ put_ldev(device);
+ return -ENOMEM;
+ }
+
+ mutex_lock(&connection->resource->conf_update);
+ old_disk_conf = device->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+ new_disk_conf->disk_size = p_usize;
+
+ rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+ mutex_unlock(&connection->resource->conf_update);
+ synchronize_rcu();
+ kfree(old_disk_conf);
+
+ drbd_info(device, "Peer sets u_size to %lu sectors\n",
+ (unsigned long)my_usize);
+ }
+
+ put_ldev(device);
+ }
+
+ device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
+ /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size().
+ In case we cleared the QUEUE_FLAG_DISCARD from our queue in
+ drbd_reconsider_max_bio_size(), we can be sure that after
+ drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
+
+ ddsf = be16_to_cpu(p->dds_flags);
+ if (get_ldev(device)) {
+ drbd_reconsider_max_bio_size(device, device->ldev);
+ dd = drbd_determine_dev_size(device, ddsf, NULL);
+ put_ldev(device);
+ if (dd == DS_ERROR)
+ return -EIO;
+ drbd_md_sync(device);
+ } else {
+ /*
+ * I am diskless, need to accept the peer's *current* size.
+ * I must NOT accept the peers backing disk size,
+ * it may have been larger than mine all along...
+ *
+ * At this point, the peer knows more about my disk, or at
+ * least about what we last agreed upon, than myself.
+ * So if his c_size is less than his d_size, the most likely
+ * reason is that *my* d_size was smaller last time we checked.
+ *
+ * However, if he sends a zero current size,
+ * take his (user-capped or) backing disk size anyways.
+ */
+ drbd_reconsider_max_bio_size(device, NULL);
+ drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size);
+ }
+
+ if (get_ldev(device)) {
+ if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
+ device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
+ ldsc = 1;
+ }
+
+ put_ldev(device);
+ }
+
+ if (device->state.conn > C_WF_REPORT_PARAMS) {
+ if (be64_to_cpu(p->c_size) !=
+ drbd_get_capacity(device->this_bdev) || ldsc) {
+ /* we have different sizes, probably peer
+ * needs to know my new size... */
+ drbd_send_sizes(peer_device, 0, ddsf);
+ }
+ if (test_and_clear_bit(RESIZE_PENDING, &device->flags) ||
+ (dd == DS_GREW && device->state.conn == C_CONNECTED)) {
+ if (device->state.pdsk >= D_INCONSISTENT &&
+ device->state.disk >= D_INCONSISTENT) {
+ if (ddsf & DDSF_NO_RESYNC)
+ drbd_info(device, "Resync of new storage suppressed with --assume-clean\n");
+ else
+ resync_after_online_grow(device);
+ } else
+ set_bit(RESYNC_AFTER_NEG, &device->flags);
+ }
+ }
+
+ return 0;
+}
+
+static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_uuids *p = pi->data;
+ u64 *p_uuid;
+ int i, updated_uuids = 0;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return config_unknown_volume(connection, pi);
+ device = peer_device->device;
+
+ p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
+ if (!p_uuid) {
+ drbd_err(device, "kmalloc of p_uuid failed\n");
+ return false;
+ }
+
+ for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
+ p_uuid[i] = be64_to_cpu(p->uuid[i]);
+
+ kfree(device->p_uuid);
+ device->p_uuid = p_uuid;
+
+ if (device->state.conn < C_CONNECTED &&
+ device->state.disk < D_INCONSISTENT &&
+ device->state.role == R_PRIMARY &&
+ (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
+ drbd_err(device, "Can only connect to data with current UUID=%016llX\n",
+ (unsigned long long)device->ed_uuid);
+ conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
+ }
+
+ if (get_ldev(device)) {
+ int skip_initial_sync =
+ device->state.conn == C_CONNECTED &&
+ peer_device->connection->agreed_pro_version >= 90 &&
+ device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
+ (p_uuid[UI_FLAGS] & 8);
+ if (skip_initial_sync) {
+ drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n");
+ drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
+ "clear_n_write from receive_uuids",
+ BM_LOCKED_TEST_ALLOWED);
+ _drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]);
+ _drbd_uuid_set(device, UI_BITMAP, 0);
+ _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
+ CS_VERBOSE, NULL);
+ drbd_md_sync(device);
+ updated_uuids = 1;
+ }
+ put_ldev(device);
+ } else if (device->state.disk < D_INCONSISTENT &&
+ device->state.role == R_PRIMARY) {
+ /* I am a diskless primary, the peer just created a new current UUID
+ for me. */
+ updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
+ }
+
+ /* Before we test for the disk state, we should wait until an eventually
+ ongoing cluster wide state change is finished. That is important if
+ we are primary and are detaching from our disk. We need to see the
+ new disk state... */
+ mutex_lock(device->state_mutex);
+ mutex_unlock(device->state_mutex);
+ if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT)
+ updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
+
+ if (updated_uuids)
+ drbd_print_uuids(device, "receiver updated UUIDs to");
+
+ return 0;
+}
+
+/**
+ * convert_state() - Converts the peer's view of the cluster state to our point of view
+ * @ps: The state as seen by the peer.
+ */
+static union drbd_state convert_state(union drbd_state ps)
+{
+ union drbd_state ms;
+
+ static enum drbd_conns c_tab[] = {
+ [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
+ [C_CONNECTED] = C_CONNECTED,
+
+ [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
+ [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
+ [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
+ [C_VERIFY_S] = C_VERIFY_T,
+ [C_MASK] = C_MASK,
+ };
+
+ ms.i = ps.i;
+
+ ms.conn = c_tab[ps.conn];
+ ms.peer = ps.role;
+ ms.role = ps.peer;
+ ms.pdsk = ps.disk;
+ ms.disk = ps.pdsk;
+ ms.peer_isp = (ps.aftr_isp | ps.user_isp);
+
+ return ms;
+}
+
+static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_req_state *p = pi->data;
+ union drbd_state mask, val;
+ enum drbd_state_rv rv;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ mask.i = be32_to_cpu(p->mask);
+ val.i = be32_to_cpu(p->val);
+
+ if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) &&
+ mutex_is_locked(device->state_mutex)) {
+ drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG);
+ return 0;
+ }
+
+ mask = convert_state(mask);
+ val = convert_state(val);
+
+ rv = drbd_change_state(device, CS_VERBOSE, mask, val);
+ drbd_send_sr_reply(peer_device, rv);
+
+ drbd_md_sync(device);
+
+ return 0;
+}
+
+static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct p_req_state *p = pi->data;
+ union drbd_state mask, val;
+ enum drbd_state_rv rv;
+
+ mask.i = be32_to_cpu(p->mask);
+ val.i = be32_to_cpu(p->val);
+
+ if (test_bit(RESOLVE_CONFLICTS, &connection->flags) &&
+ mutex_is_locked(&connection->cstate_mutex)) {
+ conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG);
+ return 0;
+ }
+
+ mask = convert_state(mask);
+ val = convert_state(val);
+
+ rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
+ conn_send_sr_reply(connection, rv);
+
+ return 0;
+}
+
+static int receive_state(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_state *p = pi->data;
+ union drbd_state os, ns, peer_state;
+ enum drbd_disk_state real_peer_disk;
+ enum chg_state_flags cs_flags;
+ int rv;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return config_unknown_volume(connection, pi);
+ device = peer_device->device;
+
+ peer_state.i = be32_to_cpu(p->state);
+
+ real_peer_disk = peer_state.disk;
+ if (peer_state.disk == D_NEGOTIATING) {
+ real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
+ drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
+ }
+
+ spin_lock_irq(&device->resource->req_lock);
+ retry:
+ os = ns = drbd_read_state(device);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ /* If some other part of the code (asender thread, timeout)
+ * already decided to close the connection again,
+ * we must not "re-establish" it here. */
+ if (os.conn <= C_TEAR_DOWN)
+ return -ECONNRESET;
+
+ /* If this is the "end of sync" confirmation, usually the peer disk
+ * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
+ * set) resync started in PausedSyncT, or if the timing of pause-/
+ * unpause-sync events has been "just right", the peer disk may
+ * transition from D_CONSISTENT to D_UP_TO_DATE as well.
+ */
+ if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
+ real_peer_disk == D_UP_TO_DATE &&
+ os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
+ /* If we are (becoming) SyncSource, but peer is still in sync
+ * preparation, ignore its uptodate-ness to avoid flapping, it
+ * will change to inconsistent once the peer reaches active
+ * syncing states.
+ * It may have changed syncer-paused flags, however, so we
+ * cannot ignore this completely. */
+ if (peer_state.conn > C_CONNECTED &&
+ peer_state.conn < C_SYNC_SOURCE)
+ real_peer_disk = D_INCONSISTENT;
+
+ /* if peer_state changes to connected at the same time,
+ * it explicitly notifies us that it finished resync.
+ * Maybe we should finish it up, too? */
+ else if (os.conn >= C_SYNC_SOURCE &&
+ peer_state.conn == C_CONNECTED) {
+ if (drbd_bm_total_weight(device) <= device->rs_failed)
+ drbd_resync_finished(device);
+ return 0;
+ }
+ }
+
+ /* explicit verify finished notification, stop sector reached. */
+ if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
+ peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
+ ov_out_of_sync_print(device);
+ drbd_resync_finished(device);
+ return 0;
+ }
+
+ /* peer says his disk is inconsistent, while we think it is uptodate,
+ * and this happens while the peer still thinks we have a sync going on,
+ * but we think we are already done with the sync.
+ * We ignore this to avoid flapping pdsk.
+ * This should not happen, if the peer is a recent version of drbd. */
+ if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
+ os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
+ real_peer_disk = D_UP_TO_DATE;
+
+ if (ns.conn == C_WF_REPORT_PARAMS)
+ ns.conn = C_CONNECTED;
+
+ if (peer_state.conn == C_AHEAD)
+ ns.conn = C_BEHIND;
+
+ if (device->p_uuid && peer_state.disk >= D_NEGOTIATING &&
+ get_ldev_if_state(device, D_NEGOTIATING)) {
+ int cr; /* consider resync */
+
+ /* if we established a new connection */
+ cr = (os.conn < C_CONNECTED);
+ /* if we had an established connection
+ * and one of the nodes newly attaches a disk */
+ cr |= (os.conn == C_CONNECTED &&
+ (peer_state.disk == D_NEGOTIATING ||
+ os.disk == D_NEGOTIATING));
+ /* if we have both been inconsistent, and the peer has been
+ * forced to be UpToDate with --overwrite-data */
+ cr |= test_bit(CONSIDER_RESYNC, &device->flags);
+ /* if we had been plain connected, and the admin requested to
+ * start a sync by "invalidate" or "invalidate-remote" */
+ cr |= (os.conn == C_CONNECTED &&
+ (peer_state.conn >= C_STARTING_SYNC_S &&
+ peer_state.conn <= C_WF_BITMAP_T));
+
+ if (cr)
+ ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk);
+
+ put_ldev(device);
+ if (ns.conn == C_MASK) {
+ ns.conn = C_CONNECTED;
+ if (device->state.disk == D_NEGOTIATING) {
+ drbd_force_state(device, NS(disk, D_FAILED));
+ } else if (peer_state.disk == D_NEGOTIATING) {
+ drbd_err(device, "Disk attach process on the peer node was aborted.\n");
+ peer_state.disk = D_DISKLESS;
+ real_peer_disk = D_DISKLESS;
+ } else {
+ if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags))
+ return -EIO;
+ D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS);
+ conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
+ }
+ }
+ }
+
+ spin_lock_irq(&device->resource->req_lock);
+ if (os.i != drbd_read_state(device).i)
+ goto retry;
+ clear_bit(CONSIDER_RESYNC, &device->flags);
+ ns.peer = peer_state.role;
+ ns.pdsk = real_peer_disk;
+ ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
+ if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
+ ns.disk = device->new_state_tmp.disk;
+ cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
+ if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
+ test_bit(NEW_CUR_UUID, &device->flags)) {
+ /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
+ for temporal network outages! */
+ spin_unlock_irq(&device->resource->req_lock);
+ drbd_err(device, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
+ tl_clear(peer_device->connection);
+ drbd_uuid_new_current(device);
+ clear_bit(NEW_CUR_UUID, &device->flags);
+ conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
+ return -EIO;
+ }
+ rv = _drbd_set_state(device, ns, cs_flags, NULL);
+ ns = drbd_read_state(device);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ if (rv < SS_SUCCESS) {
+ conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
+ }
+
+ if (os.conn > C_WF_REPORT_PARAMS) {
+ if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
+ peer_state.disk != D_NEGOTIATING ) {
+ /* we want resync, peer has not yet decided to sync... */
+ /* Nowadays only used when forcing a node into primary role and
+ setting its disk to UpToDate with that */
+ drbd_send_uuids(peer_device);
+ drbd_send_current_state(peer_device);
+ }
+ }
+
+ clear_bit(DISCARD_MY_DATA, &device->flags);
+
+ drbd_md_sync(device); /* update connected indicator, la_size_sect, ... */
+
+ return 0;
+}
+
+static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_rs_uuid *p = pi->data;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ wait_event(device->misc_wait,
+ device->state.conn == C_WF_SYNC_UUID ||
+ device->state.conn == C_BEHIND ||
+ device->state.conn < C_CONNECTED ||
+ device->state.disk < D_NEGOTIATING);
+
+ /* D_ASSERT(device, device->state.conn == C_WF_SYNC_UUID ); */
+
+ /* Here the _drbd_uuid_ functions are right, current should
+ _not_ be rotated into the history */
+ if (get_ldev_if_state(device, D_NEGOTIATING)) {
+ _drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid));
+ _drbd_uuid_set(device, UI_BITMAP, 0UL);
+
+ drbd_print_uuids(device, "updated sync uuid");
+ drbd_start_resync(device, C_SYNC_TARGET);
+
+ put_ldev(device);
+ } else
+ drbd_err(device, "Ignoring SyncUUID packet!\n");
+
+ return 0;
+}
+
+/**
+ * receive_bitmap_plain
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size,
+ unsigned long *p, struct bm_xfer_ctx *c)
+{
+ unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
+ drbd_header_size(peer_device->connection);
+ unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
+ c->bm_words - c->word_offset);
+ unsigned int want = num_words * sizeof(*p);
+ int err;
+
+ if (want != size) {
+ drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size);
+ return -EIO;
+ }
+ if (want == 0)
+ return 0;
+ err = drbd_recv_all(peer_device->connection, p, want);
+ if (err)
+ return err;
+
+ drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p);
+
+ c->word_offset += num_words;
+ c->bit_offset = c->word_offset * BITS_PER_LONG;
+ if (c->bit_offset > c->bm_bits)
+ c->bit_offset = c->bm_bits;
+
+ return 1;
+}
+
+static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
+{
+ return (enum drbd_bitmap_code)(p->encoding & 0x0f);
+}
+
+static int dcbp_get_start(struct p_compressed_bm *p)
+{
+ return (p->encoding & 0x80) != 0;
+}
+
+static int dcbp_get_pad_bits(struct p_compressed_bm *p)
+{
+ return (p->encoding >> 4) & 0x7;
+}
+
+/**
+ * recv_bm_rle_bits
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+recv_bm_rle_bits(struct drbd_peer_device *peer_device,
+ struct p_compressed_bm *p,
+ struct bm_xfer_ctx *c,
+ unsigned int len)
+{
+ struct bitstream bs;
+ u64 look_ahead;
+ u64 rl;
+ u64 tmp;
+ unsigned long s = c->bit_offset;
+ unsigned long e;
+ int toggle = dcbp_get_start(p);
+ int have;
+ int bits;
+
+ bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
+
+ bits = bitstream_get_bits(&bs, &look_ahead, 64);
+ if (bits < 0)
+ return -EIO;
+
+ for (have = bits; have > 0; s += rl, toggle = !toggle) {
+ bits = vli_decode_bits(&rl, look_ahead);
+ if (bits <= 0)
+ return -EIO;
+
+ if (toggle) {
+ e = s + rl -1;
+ if (e >= c->bm_bits) {
+ drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
+ return -EIO;
+ }
+ _drbd_bm_set_bits(peer_device->device, s, e);
+ }
+
+ if (have < bits) {
+ drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
+ have, bits, look_ahead,
+ (unsigned int)(bs.cur.b - p->code),
+ (unsigned int)bs.buf_len);
+ return -EIO;
+ }
+ /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */
+ if (likely(bits < 64))
+ look_ahead >>= bits;
+ else
+ look_ahead = 0;
+ have -= bits;
+
+ bits = bitstream_get_bits(&bs, &tmp, 64 - have);
+ if (bits < 0)
+ return -EIO;
+ look_ahead |= tmp << have;
+ have += bits;
+ }
+
+ c->bit_offset = s;
+ bm_xfer_ctx_bit_to_word_offset(c);
+
+ return (s != c->bm_bits);
+}
+
+/**
+ * decode_bitmap_c
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+decode_bitmap_c(struct drbd_peer_device *peer_device,
+ struct p_compressed_bm *p,
+ struct bm_xfer_ctx *c,
+ unsigned int len)
+{
+ if (dcbp_get_code(p) == RLE_VLI_Bits)
+ return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p));
+
+ /* other variants had been implemented for evaluation,
+ * but have been dropped as this one turned out to be "best"
+ * during all our tests. */
+
+ drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
+ conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
+ return -EIO;
+}
+
+void INFO_bm_xfer_stats(struct drbd_device *device,
+ const char *direction, struct bm_xfer_ctx *c)
+{
+ /* what would it take to transfer it "plaintext" */
+ unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
+ unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
+ unsigned int plain =
+ header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
+ c->bm_words * sizeof(unsigned long);
+ unsigned int total = c->bytes[0] + c->bytes[1];
+ unsigned int r;
+
+ /* total can not be zero. but just in case: */
+ if (total == 0)
+ return;
+
+ /* don't report if not compressed */
+ if (total >= plain)
+ return;
+
+ /* total < plain. check for overflow, still */
+ r = (total > UINT_MAX/1000) ? (total / (plain/1000))
+ : (1000 * total / plain);
+
+ if (r > 1000)
+ r = 1000;
+
+ r = 1000 - r;
+ drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
+ "total %u; compression: %u.%u%%\n",
+ direction,
+ c->bytes[1], c->packets[1],
+ c->bytes[0], c->packets[0],
+ total, r/10, r % 10);
+}
+
+/* Since we are processing the bitfield from lower addresses to higher,
+ it does not matter if the process it in 32 bit chunks or 64 bit
+ chunks as long as it is little endian. (Understand it as byte stream,
+ beginning with the lowest byte...) If we would use big endian
+ we would need to process it from the highest address to the lowest,
+ in order to be agnostic to the 32 vs 64 bits issue.
+
+ returns 0 on failure, 1 if we successfully received it. */
+static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct bm_xfer_ctx c;
+ int err;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED);
+ /* you are supposed to send additional out-of-sync information
+ * if you actually set bits during this phase */
+
+ c = (struct bm_xfer_ctx) {
+ .bm_bits = drbd_bm_bits(device),
+ .bm_words = drbd_bm_words(device),
+ };
+
+ for(;;) {
+ if (pi->cmd == P_BITMAP)
+ err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c);
+ else if (pi->cmd == P_COMPRESSED_BITMAP) {
+ /* MAYBE: sanity check that we speak proto >= 90,
+ * and the feature is enabled! */
+ struct p_compressed_bm *p = pi->data;
+
+ if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) {
+ drbd_err(device, "ReportCBitmap packet too large\n");
+ err = -EIO;
+ goto out;
+ }
+ if (pi->size <= sizeof(*p)) {
+ drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size);
+ err = -EIO;
+ goto out;
+ }
+ err = drbd_recv_all(peer_device->connection, p, pi->size);
+ if (err)
+ goto out;
+ err = decode_bitmap_c(peer_device, p, &c, pi->size);
+ } else {
+ drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
+ err = -EIO;
+ goto out;
+ }
+
+ c.packets[pi->cmd == P_BITMAP]++;
+ c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size;
+
+ if (err <= 0) {
+ if (err < 0)
+ goto out;
+ break;
+ }
+ err = drbd_recv_header(peer_device->connection, pi);
+ if (err)
+ goto out;
+ }
+
+ INFO_bm_xfer_stats(device, "receive", &c);
+
+ if (device->state.conn == C_WF_BITMAP_T) {
+ enum drbd_state_rv rv;
+
+ err = drbd_send_bitmap(device);
+ if (err)
+ goto out;
+ /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
+ rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+ D_ASSERT(device, rv == SS_SUCCESS);
+ } else if (device->state.conn != C_WF_BITMAP_S) {
+ /* admin may have requested C_DISCONNECTING,
+ * other threads may have noticed network errors */
+ drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n",
+ drbd_conn_str(device->state.conn));
+ }
+ err = 0;
+
+ out:
+ drbd_bm_unlock(device);
+ if (!err && device->state.conn == C_WF_BITMAP_S)
+ drbd_start_resync(device, C_SYNC_SOURCE);
+ return err;
+}
+
+static int receive_skip(struct drbd_connection *connection, struct packet_info *pi)
+{
+ drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n",
+ pi->cmd, pi->size);
+
+ return ignore_remaining_packet(connection, pi);
+}
+
+static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi)
+{
+ /* Make sure we've acked all the TCP data associated
+ * with the data requests being unplugged */
+ drbd_tcp_quickack(connection->data.socket);
+
+ return 0;
+}
+
+static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_block_desc *p = pi->data;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ switch (device->state.conn) {
+ case C_WF_SYNC_UUID:
+ case C_WF_BITMAP_T:
+ case C_BEHIND:
+ break;
+ default:
+ drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
+ drbd_conn_str(device->state.conn));
+ }
+
+ drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
+
+ return 0;
+}
+
+struct data_cmd {
+ int expect_payload;
+ size_t pkt_size;
+ int (*fn)(struct drbd_connection *, struct packet_info *);
+};
+
+static struct data_cmd drbd_cmd_handler[] = {
+ [P_DATA] = { 1, sizeof(struct p_data), receive_Data },
+ [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
+ [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
+ [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
+ [P_BITMAP] = { 1, 0, receive_bitmap } ,
+ [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } ,
+ [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote },
+ [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+ [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+ [P_SYNC_PARAM] = { 1, 0, receive_SyncParam },
+ [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam },
+ [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
+ [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
+ [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
+ [P_STATE] = { 0, sizeof(struct p_state), receive_state },
+ [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state },
+ [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
+ [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+ [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
+ [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
+ [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
+ [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
+ [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
+ [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
+ [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data },
+};
+
+static void drbdd(struct drbd_connection *connection)
+{
+ struct packet_info pi;
+ size_t shs; /* sub header size */
+ int err;
+
+ while (get_t_state(&connection->receiver) == RUNNING) {
+ struct data_cmd *cmd;
+
+ drbd_thread_current_set_cpu(&connection->receiver);
+ update_receiver_timing_details(connection, drbd_recv_header);
+ if (drbd_recv_header(connection, &pi))
+ goto err_out;
+
+ cmd = &drbd_cmd_handler[pi.cmd];
+ if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
+ drbd_err(connection, "Unexpected data packet %s (0x%04x)",
+ cmdname(pi.cmd), pi.cmd);
+ goto err_out;
+ }
+
+ shs = cmd->pkt_size;
+ if (pi.size > shs && !cmd->expect_payload) {
+ drbd_err(connection, "No payload expected %s l:%d\n",
+ cmdname(pi.cmd), pi.size);
+ goto err_out;
+ }
+
+ if (shs) {
+ update_receiver_timing_details(connection, drbd_recv_all_warn);
+ err = drbd_recv_all_warn(connection, pi.data, shs);
+ if (err)
+ goto err_out;
+ pi.size -= shs;
+ }
+
+ update_receiver_timing_details(connection, cmd->fn);
+ err = cmd->fn(connection, &pi);
+ if (err) {
+ drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
+ cmdname(pi.cmd), err, pi.size);
+ goto err_out;
+ }
+ }
+ return;
+
+ err_out:
+ conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
+}
+
+static void conn_disconnect(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ enum drbd_conns oc;
+ int vnr;
+
+ if (connection->cstate == C_STANDALONE)
+ return;
+
+ /* We are about to start the cleanup after connection loss.
+ * Make sure drbd_make_request knows about that.
+ * Usually we should be in some network failure state already,
+ * but just in case we are not, we fix it up here.
+ */
+ conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+
+ /* asender does not clean up anything. it must not interfere, either */
+ drbd_thread_stop(&connection->asender);
+ drbd_free_sock(connection);
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ kref_get(&device->kref);
+ rcu_read_unlock();
+ drbd_disconnected(peer_device);
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+ if (!list_empty(&connection->current_epoch->list))
+ drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n");
+ /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
+ atomic_set(&connection->current_epoch->epoch_size, 0);
+ connection->send.seen_any_write_yet = false;
+
+ drbd_info(connection, "Connection closed\n");
+
+ if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN)
+ conn_try_outdate_peer_async(connection);
+
+ spin_lock_irq(&connection->resource->req_lock);
+ oc = connection->cstate;
+ if (oc >= C_UNCONNECTED)
+ _conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
+
+ spin_unlock_irq(&connection->resource->req_lock);
+
+ if (oc == C_DISCONNECTING)
+ conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
+}
+
+static int drbd_disconnected(struct drbd_peer_device *peer_device)
+{
+ struct drbd_device *device = peer_device->device;
+ unsigned int i;
+
+ /* wait for current activity to cease. */
+ spin_lock_irq(&device->resource->req_lock);
+ _drbd_wait_ee_list_empty(device, &device->active_ee);
+ _drbd_wait_ee_list_empty(device, &device->sync_ee);
+ _drbd_wait_ee_list_empty(device, &device->read_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ /* We do not have data structures that would allow us to
+ * get the rs_pending_cnt down to 0 again.
+ * * On C_SYNC_TARGET we do not have any data structures describing
+ * the pending RSDataRequest's we have sent.
+ * * On C_SYNC_SOURCE there is no data structure that tracks
+ * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
+ * And no, it is not the sum of the reference counts in the
+ * resync_LRU. The resync_LRU tracks the whole operation including
+ * the disk-IO, while the rs_pending_cnt only tracks the blocks
+ * on the fly. */
+ drbd_rs_cancel_all(device);
+ device->rs_total = 0;
+ device->rs_failed = 0;
+ atomic_set(&device->rs_pending_cnt, 0);
+ wake_up(&device->misc_wait);
+
+ del_timer_sync(&device->resync_timer);
+ resync_timer_fn((unsigned long)device);
+
+ /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
+ * w_make_resync_request etc. which may still be on the worker queue
+ * to be "canceled" */
+ drbd_flush_workqueue(&peer_device->connection->sender_work);
+
+ drbd_finish_peer_reqs(device);
+
+ /* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
+ might have issued a work again. The one before drbd_finish_peer_reqs() is
+ necessary to reclain net_ee in drbd_finish_peer_reqs(). */
+ drbd_flush_workqueue(&peer_device->connection->sender_work);
+
+ /* need to do it again, drbd_finish_peer_reqs() may have populated it
+ * again via drbd_try_clear_on_disk_bm(). */
+ drbd_rs_cancel_all(device);
+
+ kfree(device->p_uuid);
+ device->p_uuid = NULL;
+
+ if (!drbd_suspended(device))
+ tl_clear(peer_device->connection);
+
+ drbd_md_sync(device);
+
+ /* serialize with bitmap writeout triggered by the state change,
+ * if any. */
+ wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+
+ /* tcp_close and release of sendpage pages can be deferred. I don't
+ * want to use SO_LINGER, because apparently it can be deferred for
+ * more than 20 seconds (longest time I checked).
+ *
+ * Actually we don't care for exactly when the network stack does its
+ * put_page(), but release our reference on these pages right here.
+ */
+ i = drbd_free_peer_reqs(device, &device->net_ee);
+ if (i)
+ drbd_info(device, "net_ee not empty, killed %u entries\n", i);
+ i = atomic_read(&device->pp_in_use_by_net);
+ if (i)
+ drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
+ i = atomic_read(&device->pp_in_use);
+ if (i)
+ drbd_info(device, "pp_in_use = %d, expected 0\n", i);
+
+ D_ASSERT(device, list_empty(&device->read_ee));
+ D_ASSERT(device, list_empty(&device->active_ee));
+ D_ASSERT(device, list_empty(&device->sync_ee));
+ D_ASSERT(device, list_empty(&device->done_ee));
+
+ return 0;
+}
+
+/*
+ * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
+ * we can agree on is stored in agreed_pro_version.
+ *
+ * feature flags and the reserved array should be enough room for future
+ * enhancements of the handshake protocol, and possible plugins...
+ *
+ * for now, they are expected to be zero, but ignored.
+ */
+static int drbd_send_features(struct drbd_connection *connection)
+{
+ struct drbd_socket *sock;
+ struct p_connection_features *p;
+
+ sock = &connection->data;
+ p = conn_prepare_command(connection, sock);
+ if (!p)
+ return -EIO;
+ memset(p, 0, sizeof(*p));
+ p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
+ p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
+ p->feature_flags = cpu_to_be32(PRO_FEATURES);
+ return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
+}
+
+/*
+ * return values:
+ * 1 yes, we have a valid connection
+ * 0 oops, did not work out, please try again
+ * -1 peer talks different language,
+ * no point in trying again, please go standalone.
+ */
+static int drbd_do_features(struct drbd_connection *connection)
+{
+ /* ASSERT current == connection->receiver ... */
+ struct p_connection_features *p;
+ const int expect = sizeof(struct p_connection_features);
+ struct packet_info pi;
+ int err;
+
+ err = drbd_send_features(connection);
+ if (err)
+ return 0;
+
+ err = drbd_recv_header(connection, &pi);
+ if (err)
+ return 0;
+
+ if (pi.cmd != P_CONNECTION_FEATURES) {
+ drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
+ cmdname(pi.cmd), pi.cmd);
+ return -1;
+ }
+
+ if (pi.size != expect) {
+ drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n",
+ expect, pi.size);
+ return -1;
+ }
+
+ p = pi.data;
+ err = drbd_recv_all_warn(connection, p, expect);
+ if (err)
+ return 0;
+
+ p->protocol_min = be32_to_cpu(p->protocol_min);
+ p->protocol_max = be32_to_cpu(p->protocol_max);
+ if (p->protocol_max == 0)
+ p->protocol_max = p->protocol_min;
+
+ if (PRO_VERSION_MAX < p->protocol_min ||
+ PRO_VERSION_MIN > p->protocol_max)
+ goto incompat;
+
+ connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
+ connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags);
+
+ drbd_info(connection, "Handshake successful: "
+ "Agreed network protocol version %d\n", connection->agreed_pro_version);
+
+ drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n",
+ connection->agreed_features & FF_TRIM ? " " : " not ");
+
+ return 1;
+
+ incompat:
+ drbd_err(connection, "incompatible DRBD dialects: "
+ "I support %d-%d, peer supports %d-%d\n",
+ PRO_VERSION_MIN, PRO_VERSION_MAX,
+ p->protocol_min, p->protocol_max);
+ return -1;
+}
+
+#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
+static int drbd_do_auth(struct drbd_connection *connection)
+{
+ drbd_err(connection, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
+ drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
+ return -1;
+}
+#else
+#define CHALLENGE_LEN 64
+
+/* Return value:
+ 1 - auth succeeded,
+ 0 - failed, try again (network error),
+ -1 - auth failed, don't try again.
+*/
+
+static int drbd_do_auth(struct drbd_connection *connection)
+{
+ struct drbd_socket *sock;
+ char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */
+ struct scatterlist sg;
+ char *response = NULL;
+ char *right_response = NULL;
+ char *peers_ch = NULL;
+ unsigned int key_len;
+ char secret[SHARED_SECRET_MAX]; /* 64 byte */
+ unsigned int resp_size;
+ struct hash_desc desc;
+ struct packet_info pi;
+ struct net_conf *nc;
+ int err, rv;
+
+ /* FIXME: Put the challenge/response into the preallocated socket buffer. */
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ key_len = strlen(nc->shared_secret);
+ memcpy(secret, nc->shared_secret, key_len);
+ rcu_read_unlock();
+
+ desc.tfm = connection->cram_hmac_tfm;
+ desc.flags = 0;
+
+ rv = crypto_hash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len);
+ if (rv) {
+ drbd_err(connection, "crypto_hash_setkey() failed with %d\n", rv);
+ rv = -1;
+ goto fail;
+ }
+
+ get_random_bytes(my_challenge, CHALLENGE_LEN);
+
+ sock = &connection->data;
+ if (!conn_prepare_command(connection, sock)) {
+ rv = 0;
+ goto fail;
+ }
+ rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0,
+ my_challenge, CHALLENGE_LEN);
+ if (!rv)
+ goto fail;
+
+ err = drbd_recv_header(connection, &pi);
+ if (err) {
+ rv = 0;
+ goto fail;
+ }
+
+ if (pi.cmd != P_AUTH_CHALLENGE) {
+ drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n",
+ cmdname(pi.cmd), pi.cmd);
+ rv = 0;
+ goto fail;
+ }
+
+ if (pi.size > CHALLENGE_LEN * 2) {
+ drbd_err(connection, "expected AuthChallenge payload too big.\n");
+ rv = -1;
+ goto fail;
+ }
+
+ if (pi.size < CHALLENGE_LEN) {
+ drbd_err(connection, "AuthChallenge payload too small.\n");
+ rv = -1;
+ goto fail;
+ }
+
+ peers_ch = kmalloc(pi.size, GFP_NOIO);
+ if (peers_ch == NULL) {
+ drbd_err(connection, "kmalloc of peers_ch failed\n");
+ rv = -1;
+ goto fail;
+ }
+
+ err = drbd_recv_all_warn(connection, peers_ch, pi.size);
+ if (err) {
+ rv = 0;
+ goto fail;
+ }
+
+ if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) {
+ drbd_err(connection, "Peer presented the same challenge!\n");
+ rv = -1;
+ goto fail;
+ }
+
+ resp_size = crypto_hash_digestsize(connection->cram_hmac_tfm);
+ response = kmalloc(resp_size, GFP_NOIO);
+ if (response == NULL) {
+ drbd_err(connection, "kmalloc of response failed\n");
+ rv = -1;
+ goto fail;
+ }
+
+ sg_init_table(&sg, 1);
+ sg_set_buf(&sg, peers_ch, pi.size);
+
+ rv = crypto_hash_digest(&desc, &sg, sg.length, response);
+ if (rv) {
+ drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
+ rv = -1;
+ goto fail;
+ }
+
+ if (!conn_prepare_command(connection, sock)) {
+ rv = 0;
+ goto fail;
+ }
+ rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0,
+ response, resp_size);
+ if (!rv)
+ goto fail;
+
+ err = drbd_recv_header(connection, &pi);
+ if (err) {
+ rv = 0;
+ goto fail;
+ }
+
+ if (pi.cmd != P_AUTH_RESPONSE) {
+ drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n",
+ cmdname(pi.cmd), pi.cmd);
+ rv = 0;
+ goto fail;
+ }
+
+ if (pi.size != resp_size) {
+ drbd_err(connection, "expected AuthResponse payload of wrong size\n");
+ rv = 0;
+ goto fail;
+ }
+
+ err = drbd_recv_all_warn(connection, response , resp_size);
+ if (err) {
+ rv = 0;
+ goto fail;
+ }
+
+ right_response = kmalloc(resp_size, GFP_NOIO);
+ if (right_response == NULL) {
+ drbd_err(connection, "kmalloc of right_response failed\n");
+ rv = -1;
+ goto fail;
+ }
+
+ sg_set_buf(&sg, my_challenge, CHALLENGE_LEN);
+
+ rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
+ if (rv) {
+ drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
+ rv = -1;
+ goto fail;
+ }
+
+ rv = !memcmp(response, right_response, resp_size);
+
+ if (rv)
+ drbd_info(connection, "Peer authenticated using %d bytes HMAC\n",
+ resp_size);
+ else
+ rv = -1;
+
+ fail:
+ kfree(peers_ch);
+ kfree(response);
+ kfree(right_response);
+
+ return rv;
+}
+#endif
+
+int drbd_receiver(struct drbd_thread *thi)
+{
+ struct drbd_connection *connection = thi->connection;
+ int h;
+
+ drbd_info(connection, "receiver (re)started\n");
+
+ do {
+ h = conn_connect(connection);
+ if (h == 0) {
+ conn_disconnect(connection);
+ schedule_timeout_interruptible(HZ);
+ }
+ if (h == -1) {
+ drbd_warn(connection, "Discarding network configuration.\n");
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ }
+ } while (h == 0);
+
+ if (h > 0)
+ drbdd(connection);
+
+ conn_disconnect(connection);
+
+ drbd_info(connection, "receiver terminated\n");
+ return 0;
+}
+
+/* ********* acknowledge sender ******** */
+
+static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct p_req_state_reply *p = pi->data;
+ int retcode = be32_to_cpu(p->retcode);
+
+ if (retcode >= SS_SUCCESS) {
+ set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags);
+ } else {
+ set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags);
+ drbd_err(connection, "Requested state change failed by peer: %s (%d)\n",
+ drbd_set_st_err_str(retcode), retcode);
+ }
+ wake_up(&connection->ping_wait);
+
+ return 0;
+}
+
+static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_req_state_reply *p = pi->data;
+ int retcode = be32_to_cpu(p->retcode);
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) {
+ D_ASSERT(device, connection->agreed_pro_version < 100);
+ return got_conn_RqSReply(connection, pi);
+ }
+
+ if (retcode >= SS_SUCCESS) {
+ set_bit(CL_ST_CHG_SUCCESS, &device->flags);
+ } else {
+ set_bit(CL_ST_CHG_FAIL, &device->flags);
+ drbd_err(device, "Requested state change failed by peer: %s (%d)\n",
+ drbd_set_st_err_str(retcode), retcode);
+ }
+ wake_up(&device->state_wait);
+
+ return 0;
+}
+
+static int got_Ping(struct drbd_connection *connection, struct packet_info *pi)
+{
+ return drbd_send_ping_ack(connection);
+
+}
+
+static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+ /* restore idle timeout */
+ connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ;
+ if (!test_and_set_bit(GOT_PING_ACK, &connection->flags))
+ wake_up(&connection->ping_wait);
+
+ return 0;
+}
+
+static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_block_ack *p = pi->data;
+ sector_t sector = be64_to_cpu(p->sector);
+ int blksize = be32_to_cpu(p->blksize);
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
+
+ update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+ if (get_ldev(device)) {
+ drbd_rs_complete_io(device, sector);
+ drbd_set_in_sync(device, sector, blksize);
+ /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
+ device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+ put_ldev(device);
+ }
+ dec_rs_pending(device);
+ atomic_add(blksize >> 9, &device->rs_sect_in);
+
+ return 0;
+}
+
+static int
+validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector,
+ struct rb_root *root, const char *func,
+ enum drbd_req_event what, bool missing_ok)
+{
+ struct drbd_request *req;
+ struct bio_and_error m;
+
+ spin_lock_irq(&device->resource->req_lock);
+ req = find_request(device, root, id, sector, missing_ok, func);
+ if (unlikely(!req)) {
+ spin_unlock_irq(&device->resource->req_lock);
+ return -EIO;
+ }
+ __req_mod(req, what, &m);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ if (m.bio)
+ complete_master_bio(device, &m);
+ return 0;
+}
+
+static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_block_ack *p = pi->data;
+ sector_t sector = be64_to_cpu(p->sector);
+ int blksize = be32_to_cpu(p->blksize);
+ enum drbd_req_event what;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+ if (p->block_id == ID_SYNCER) {
+ drbd_set_in_sync(device, sector, blksize);
+ dec_rs_pending(device);
+ return 0;
+ }
+ switch (pi->cmd) {
+ case P_RS_WRITE_ACK:
+ what = WRITE_ACKED_BY_PEER_AND_SIS;
+ break;
+ case P_WRITE_ACK:
+ what = WRITE_ACKED_BY_PEER;
+ break;
+ case P_RECV_ACK:
+ what = RECV_ACKED_BY_PEER;
+ break;
+ case P_SUPERSEDED:
+ what = CONFLICT_RESOLVED;
+ break;
+ case P_RETRY_WRITE:
+ what = POSTPONE_WRITE;
+ break;
+ default:
+ BUG();
+ }
+
+ return validate_req_change_req_state(device, p->block_id, sector,
+ &device->write_requests, __func__,
+ what, false);
+}
+
+static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_block_ack *p = pi->data;
+ sector_t sector = be64_to_cpu(p->sector);
+ int size = be32_to_cpu(p->blksize);
+ int err;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+ if (p->block_id == ID_SYNCER) {
+ dec_rs_pending(device);
+ drbd_rs_failed_io(device, sector, size);
+ return 0;
+ }
+
+ err = validate_req_change_req_state(device, p->block_id, sector,
+ &device->write_requests, __func__,
+ NEG_ACKED, true);
+ if (err) {
+ /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
+ The master bio might already be completed, therefore the
+ request is no longer in the collision hash. */
+ /* In Protocol B we might already have got a P_RECV_ACK
+ but then get a P_NEG_ACK afterwards. */
+ drbd_set_out_of_sync(device, sector, size);
+ }
+ return 0;
+}
+
+static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_block_ack *p = pi->data;
+ sector_t sector = be64_to_cpu(p->sector);
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+ drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n",
+ (unsigned long long)sector, be32_to_cpu(p->blksize));
+
+ return validate_req_change_req_state(device, p->block_id, sector,
+ &device->read_requests, __func__,
+ NEG_ACKED, false);
+}
+
+static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ sector_t sector;
+ int size;
+ struct p_block_ack *p = pi->data;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ sector = be64_to_cpu(p->sector);
+ size = be32_to_cpu(p->blksize);
+
+ update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+ dec_rs_pending(device);
+
+ if (get_ldev_if_state(device, D_FAILED)) {
+ drbd_rs_complete_io(device, sector);
+ switch (pi->cmd) {
+ case P_NEG_RS_DREPLY:
+ drbd_rs_failed_io(device, sector, size);
+ case P_RS_CANCEL:
+ break;
+ default:
+ BUG();
+ }
+ put_ldev(device);
+ }
+
+ return 0;
+}
+
+static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct p_barrier_ack *p = pi->data;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ tl_release(connection, p->barrier, be32_to_cpu(p->set_size));
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+
+ if (device->state.conn == C_AHEAD &&
+ atomic_read(&device->ap_in_flight) == 0 &&
+ !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) {
+ device->start_resync_timer.expires = jiffies + HZ;
+ add_timer(&device->start_resync_timer);
+ }
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi)
+{
+ struct drbd_peer_device *peer_device;
+ struct drbd_device *device;
+ struct p_block_ack *p = pi->data;
+ struct drbd_device_work *dw;
+ sector_t sector;
+ int size;
+
+ peer_device = conn_peer_device(connection, pi->vnr);
+ if (!peer_device)
+ return -EIO;
+ device = peer_device->device;
+
+ sector = be64_to_cpu(p->sector);
+ size = be32_to_cpu(p->blksize);
+
+ update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+ if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
+ drbd_ov_out_of_sync_found(device, sector, size);
+ else
+ ov_out_of_sync_print(device);
+
+ if (!get_ldev(device))
+ return 0;
+
+ drbd_rs_complete_io(device, sector);
+ dec_rs_pending(device);
+
+ --device->ov_left;
+
+ /* let's advance progress step marks only for every other megabyte */
+ if ((device->ov_left & 0x200) == 0x200)
+ drbd_advance_rs_marks(device, device->ov_left);
+
+ if (device->ov_left == 0) {
+ dw = kmalloc(sizeof(*dw), GFP_NOIO);
+ if (dw) {
+ dw->w.cb = w_ov_finished;
+ dw->device = device;
+ drbd_queue_work(&peer_device->connection->sender_work, &dw->w);
+ } else {
+ drbd_err(device, "kmalloc(dw) failed.");
+ ov_out_of_sync_print(device);
+ drbd_resync_finished(device);
+ }
+ }
+ put_ldev(device);
+ return 0;
+}
+
+static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
+{
+ return 0;
+}
+
+static int connection_finish_peer_reqs(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ int vnr, not_empty = 0;
+
+ do {
+ clear_bit(SIGNAL_ASENDER, &connection->flags);
+ flush_signals(current);
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ kref_get(&device->kref);
+ rcu_read_unlock();
+ if (drbd_finish_peer_reqs(device)) {
+ kref_put(&device->kref, drbd_destroy_device);
+ return 1;
+ }
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
+ }
+ set_bit(SIGNAL_ASENDER, &connection->flags);
+
+ spin_lock_irq(&connection->resource->req_lock);
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ not_empty = !list_empty(&device->done_ee);
+ if (not_empty)
+ break;
+ }
+ spin_unlock_irq(&connection->resource->req_lock);
+ rcu_read_unlock();
+ } while (not_empty);
+
+ return 0;
+}
+
+struct asender_cmd {
+ size_t pkt_size;
+ int (*fn)(struct drbd_connection *connection, struct packet_info *);
+};
+
+static struct asender_cmd asender_tbl[] = {
+ [P_PING] = { 0, got_Ping },
+ [P_PING_ACK] = { 0, got_PingAck },
+ [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
+ [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
+ [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
+ [P_SUPERSEDED] = { sizeof(struct p_block_ack), got_BlockAck },
+ [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck },
+ [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply },
+ [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply },
+ [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult },
+ [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
+ [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
+ [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
+ [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
+ [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply },
+ [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply },
+ [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck },
+};
+
+int drbd_asender(struct drbd_thread *thi)
+{
+ struct drbd_connection *connection = thi->connection;
+ struct asender_cmd *cmd = NULL;
+ struct packet_info pi;
+ int rv;
+ void *buf = connection->meta.rbuf;
+ int received = 0;
+ unsigned int header_size = drbd_header_size(connection);
+ int expect = header_size;
+ bool ping_timeout_active = false;
+ struct net_conf *nc;
+ int ping_timeo, tcp_cork, ping_int;
+ struct sched_param param = { .sched_priority = 2 };
+
+ rv = sched_setscheduler(current, SCHED_RR, &param);
+ if (rv < 0)
+ drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv);
+
+ while (get_t_state(thi) == RUNNING) {
+ drbd_thread_current_set_cpu(thi);
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ ping_timeo = nc->ping_timeo;
+ tcp_cork = nc->tcp_cork;
+ ping_int = nc->ping_int;
+ rcu_read_unlock();
+
+ if (test_and_clear_bit(SEND_PING, &connection->flags)) {
+ if (drbd_send_ping(connection)) {
+ drbd_err(connection, "drbd_send_ping has failed\n");
+ goto reconnect;
+ }
+ connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10;
+ ping_timeout_active = true;
+ }
+
+ /* TODO: conditionally cork; it may hurt latency if we cork without
+ much to send */
+ if (tcp_cork)
+ drbd_tcp_cork(connection->meta.socket);
+ if (connection_finish_peer_reqs(connection)) {
+ drbd_err(connection, "connection_finish_peer_reqs() failed\n");
+ goto reconnect;
+ }
+ /* but unconditionally uncork unless disabled */
+ if (tcp_cork)
+ drbd_tcp_uncork(connection->meta.socket);
+
+ /* short circuit, recv_msg would return EINTR anyways. */
+ if (signal_pending(current))
+ continue;
+
+ rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
+ clear_bit(SIGNAL_ASENDER, &connection->flags);
+
+ flush_signals(current);
+
+ /* Note:
+ * -EINTR (on meta) we got a signal
+ * -EAGAIN (on meta) rcvtimeo expired
+ * -ECONNRESET other side closed the connection
+ * -ERESTARTSYS (on data) we got a signal
+ * rv < 0 other than above: unexpected error!
+ * rv == expected: full header or command
+ * rv < expected: "woken" by signal during receive
+ * rv == 0 : "connection shut down by peer"
+ */
+received_more:
+ if (likely(rv > 0)) {
+ received += rv;
+ buf += rv;
+ } else if (rv == 0) {
+ if (test_bit(DISCONNECT_SENT, &connection->flags)) {
+ long t;
+ rcu_read_lock();
+ t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
+ rcu_read_unlock();
+
+ t = wait_event_timeout(connection->ping_wait,
+ connection->cstate < C_WF_REPORT_PARAMS,
+ t);
+ if (t)
+ break;
+ }
+ drbd_err(connection, "meta connection shut down by peer.\n");
+ goto reconnect;
+ } else if (rv == -EAGAIN) {
+ /* If the data socket received something meanwhile,
+ * that is good enough: peer is still alive. */
+ if (time_after(connection->last_received,
+ jiffies - connection->meta.socket->sk->sk_rcvtimeo))
+ continue;
+ if (ping_timeout_active) {
+ drbd_err(connection, "PingAck did not arrive in time.\n");
+ goto reconnect;
+ }
+ set_bit(SEND_PING, &connection->flags);
+ continue;
+ } else if (rv == -EINTR) {
+ continue;
+ } else {
+ drbd_err(connection, "sock_recvmsg returned %d\n", rv);
+ goto reconnect;
+ }
+
+ if (received == expect && cmd == NULL) {
+ if (decode_header(connection, connection->meta.rbuf, &pi))
+ goto reconnect;
+ cmd = &asender_tbl[pi.cmd];
+ if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) {
+ drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
+ cmdname(pi.cmd), pi.cmd);
+ goto disconnect;
+ }
+ expect = header_size + cmd->pkt_size;
+ if (pi.size != expect - header_size) {
+ drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n",
+ pi.cmd, pi.size);
+ goto reconnect;
+ }
+ }
+ if (received == expect) {
+ bool err;
+
+ err = cmd->fn(connection, &pi);
+ if (err) {
+ drbd_err(connection, "%pf failed\n", cmd->fn);
+ goto reconnect;
+ }
+
+ connection->last_received = jiffies;
+
+ if (cmd == &asender_tbl[P_PING_ACK]) {
+ /* restore idle timeout */
+ connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ;
+ ping_timeout_active = false;
+ }
+
+ buf = connection->meta.rbuf;
+ received = 0;
+ expect = header_size;
+ cmd = NULL;
+ }
+ if (test_bit(SEND_PING, &connection->flags))
+ continue;
+ rv = drbd_recv_short(connection->meta.socket, buf, expect-received, MSG_DONTWAIT);
+ if (rv > 0)
+ goto received_more;
+ }
+
+ if (0) {
+reconnect:
+ conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+ conn_md_sync(connection);
+ }
+ if (0) {
+disconnect:
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ }
+ clear_bit(SIGNAL_ASENDER, &connection->flags);
+
+ drbd_info(connection, "asender terminated\n");
+
+ return 0;
+}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
new file mode 100644
index 000000000..3907202fb
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.c
@@ -0,0 +1,1638 @@
+/*
+ drbd_req.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ */
+
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+#include "drbd_req.h"
+
+
+static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size);
+
+/* Update disk stats at start of I/O request */
+static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request *req)
+{
+ generic_start_io_acct(bio_data_dir(req->master_bio), req->i.size >> 9,
+ &device->vdisk->part0);
+}
+
+/* Update disk stats when completing request upwards */
+static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req)
+{
+ generic_end_io_acct(bio_data_dir(req->master_bio),
+ &device->vdisk->part0, req->start_jif);
+}
+
+static struct drbd_request *drbd_req_new(struct drbd_device *device,
+ struct bio *bio_src)
+{
+ struct drbd_request *req;
+
+ req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
+ if (!req)
+ return NULL;
+ memset(req, 0, sizeof(*req));
+
+ drbd_req_make_private_bio(req, bio_src);
+ req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
+ req->device = device;
+ req->master_bio = bio_src;
+ req->epoch = 0;
+
+ drbd_clear_interval(&req->i);
+ req->i.sector = bio_src->bi_iter.bi_sector;
+ req->i.size = bio_src->bi_iter.bi_size;
+ req->i.local = true;
+ req->i.waiting = false;
+
+ INIT_LIST_HEAD(&req->tl_requests);
+ INIT_LIST_HEAD(&req->w.list);
+ INIT_LIST_HEAD(&req->req_pending_master_completion);
+ INIT_LIST_HEAD(&req->req_pending_local);
+
+ /* one reference to be put by __drbd_make_request */
+ atomic_set(&req->completion_ref, 1);
+ /* one kref as long as completion_ref > 0 */
+ kref_init(&req->kref);
+ return req;
+}
+
+static void drbd_remove_request_interval(struct rb_root *root,
+ struct drbd_request *req)
+{
+ struct drbd_device *device = req->device;
+ struct drbd_interval *i = &req->i;
+
+ drbd_remove_interval(root, i);
+
+ /* Wake up any processes waiting for this request to complete. */
+ if (i->waiting)
+ wake_up(&device->misc_wait);
+}
+
+void drbd_req_destroy(struct kref *kref)
+{
+ struct drbd_request *req = container_of(kref, struct drbd_request, kref);
+ struct drbd_device *device = req->device;
+ const unsigned s = req->rq_state;
+
+ if ((req->master_bio && !(s & RQ_POSTPONED)) ||
+ atomic_read(&req->completion_ref) ||
+ (s & RQ_LOCAL_PENDING) ||
+ ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) {
+ drbd_err(device, "drbd_req_destroy: Logic BUG rq_state = 0x%x, completion_ref = %d\n",
+ s, atomic_read(&req->completion_ref));
+ return;
+ }
+
+ /* If called from mod_rq_state (expected normal case) or
+ * drbd_send_and_submit (the less likely normal path), this holds the
+ * req_lock, and req->tl_requests will typicaly be on ->transfer_log,
+ * though it may be still empty (never added to the transfer log).
+ *
+ * If called from do_retry(), we do NOT hold the req_lock, but we are
+ * still allowed to unconditionally list_del(&req->tl_requests),
+ * because it will be on a local on-stack list only. */
+ list_del_init(&req->tl_requests);
+
+ /* finally remove the request from the conflict detection
+ * respective block_id verification interval tree. */
+ if (!drbd_interval_empty(&req->i)) {
+ struct rb_root *root;
+
+ if (s & RQ_WRITE)
+ root = &device->write_requests;
+ else
+ root = &device->read_requests;
+ drbd_remove_request_interval(root, req);
+ } else if (s & (RQ_NET_MASK & ~RQ_NET_DONE) && req->i.size != 0)
+ drbd_err(device, "drbd_req_destroy: Logic BUG: interval empty, but: rq_state=0x%x, sect=%llu, size=%u\n",
+ s, (unsigned long long)req->i.sector, req->i.size);
+
+ /* if it was a write, we may have to set the corresponding
+ * bit(s) out-of-sync first. If it had a local part, we need to
+ * release the reference to the activity log. */
+ if (s & RQ_WRITE) {
+ /* Set out-of-sync unless both OK flags are set
+ * (local only or remote failed).
+ * Other places where we set out-of-sync:
+ * READ with local io-error */
+
+ /* There is a special case:
+ * we may notice late that IO was suspended,
+ * and postpone, or schedule for retry, a write,
+ * before it even was submitted or sent.
+ * In that case we do not want to touch the bitmap at all.
+ */
+ if ((s & (RQ_POSTPONED|RQ_LOCAL_MASK|RQ_NET_MASK)) != RQ_POSTPONED) {
+ if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
+ drbd_set_out_of_sync(device, req->i.sector, req->i.size);
+
+ if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
+ drbd_set_in_sync(device, req->i.sector, req->i.size);
+ }
+
+ /* one might be tempted to move the drbd_al_complete_io
+ * to the local io completion callback drbd_request_endio.
+ * but, if this was a mirror write, we may only
+ * drbd_al_complete_io after this is RQ_NET_DONE,
+ * otherwise the extent could be dropped from the al
+ * before it has actually been written on the peer.
+ * if we crash before our peer knows about the request,
+ * but after the extent has been dropped from the al,
+ * we would forget to resync the corresponding extent.
+ */
+ if (s & RQ_IN_ACT_LOG) {
+ if (get_ldev_if_state(device, D_FAILED)) {
+ drbd_al_complete_io(device, &req->i);
+ put_ldev(device);
+ } else if (__ratelimit(&drbd_ratelimit_state)) {
+ drbd_warn(device, "Should have called drbd_al_complete_io(, %llu, %u), "
+ "but my Disk seems to have failed :(\n",
+ (unsigned long long) req->i.sector, req->i.size);
+ }
+ }
+ }
+
+ mempool_free(req, drbd_request_mempool);
+}
+
+static void wake_all_senders(struct drbd_connection *connection)
+{
+ wake_up(&connection->sender_work.q_wait);
+}
+
+/* must hold resource->req_lock */
+void start_new_tl_epoch(struct drbd_connection *connection)
+{
+ /* no point closing an epoch, if it is empty, anyways. */
+ if (connection->current_tle_writes == 0)
+ return;
+
+ connection->current_tle_writes = 0;
+ atomic_inc(&connection->current_tle_nr);
+ wake_all_senders(connection);
+}
+
+void complete_master_bio(struct drbd_device *device,
+ struct bio_and_error *m)
+{
+ bio_endio(m->bio, m->error);
+ dec_ap_bio(device);
+}
+
+
+/* Helper for __req_mod().
+ * Set m->bio to the master bio, if it is fit to be completed,
+ * or leave it alone (it is initialized to NULL in __req_mod),
+ * if it has already been completed, or cannot be completed yet.
+ * If m->bio is set, the error status to be returned is placed in m->error.
+ */
+static
+void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
+{
+ const unsigned s = req->rq_state;
+ struct drbd_device *device = req->device;
+ int rw;
+ int error, ok;
+
+ /* we must not complete the master bio, while it is
+ * still being processed by _drbd_send_zc_bio (drbd_send_dblock)
+ * not yet acknowledged by the peer
+ * not yet completed by the local io subsystem
+ * these flags may get cleared in any order by
+ * the worker,
+ * the receiver,
+ * the bio_endio completion callbacks.
+ */
+ if ((s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) ||
+ (s & RQ_NET_QUEUED) || (s & RQ_NET_PENDING) ||
+ (s & RQ_COMPLETION_SUSP)) {
+ drbd_err(device, "drbd_req_complete: Logic BUG rq_state = 0x%x\n", s);
+ return;
+ }
+
+ if (!req->master_bio) {
+ drbd_err(device, "drbd_req_complete: Logic BUG, master_bio == NULL!\n");
+ return;
+ }
+
+ rw = bio_rw(req->master_bio);
+
+ /*
+ * figure out whether to report success or failure.
+ *
+ * report success when at least one of the operations succeeded.
+ * or, to put the other way,
+ * only report failure, when both operations failed.
+ *
+ * what to do about the failures is handled elsewhere.
+ * what we need to do here is just: complete the master_bio.
+ *
+ * local completion error, if any, has been stored as ERR_PTR
+ * in private_bio within drbd_request_endio.
+ */
+ ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
+ error = PTR_ERR(req->private_bio);
+
+ /* Before we can signal completion to the upper layers,
+ * we may need to close the current transfer log epoch.
+ * We are within the request lock, so we can simply compare
+ * the request epoch number with the current transfer log
+ * epoch number. If they match, increase the current_tle_nr,
+ * and reset the transfer log epoch write_cnt.
+ */
+ if (rw == WRITE &&
+ req->epoch == atomic_read(&first_peer_device(device)->connection->current_tle_nr))
+ start_new_tl_epoch(first_peer_device(device)->connection);
+
+ /* Update disk stats */
+ _drbd_end_io_acct(device, req);
+
+ /* If READ failed,
+ * have it be pushed back to the retry work queue,
+ * so it will re-enter __drbd_make_request(),
+ * and be re-assigned to a suitable local or remote path,
+ * or failed if we do not have access to good data anymore.
+ *
+ * Unless it was failed early by __drbd_make_request(),
+ * because no path was available, in which case
+ * it was not even added to the transfer_log.
+ *
+ * READA may fail, and will not be retried.
+ *
+ * WRITE should have used all available paths already.
+ */
+ if (!ok && rw == READ && !list_empty(&req->tl_requests))
+ req->rq_state |= RQ_POSTPONED;
+
+ if (!(req->rq_state & RQ_POSTPONED)) {
+ m->error = ok ? 0 : (error ?: -EIO);
+ m->bio = req->master_bio;
+ req->master_bio = NULL;
+ /* We leave it in the tree, to be able to verify later
+ * write-acks in protocol != C during resync.
+ * But we mark it as "complete", so it won't be counted as
+ * conflict in a multi-primary setup. */
+ req->i.completed = true;
+ }
+
+ if (req->i.waiting)
+ wake_up(&device->misc_wait);
+
+ /* Either we are about to complete to upper layers,
+ * or we will restart this request.
+ * In either case, the request object will be destroyed soon,
+ * so better remove it from all lists. */
+ list_del_init(&req->req_pending_master_completion);
+}
+
+/* still holds resource->req_lock */
+static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put)
+{
+ struct drbd_device *device = req->device;
+ D_ASSERT(device, m || (req->rq_state & RQ_POSTPONED));
+
+ if (!atomic_sub_and_test(put, &req->completion_ref))
+ return 0;
+
+ drbd_req_complete(req, m);
+
+ if (req->rq_state & RQ_POSTPONED) {
+ /* don't destroy the req object just yet,
+ * but queue it for retry */
+ drbd_restart_request(req);
+ return 0;
+ }
+
+ return 1;
+}
+
+static void set_if_null_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ if (!connection)
+ return;
+ if (connection->req_next == NULL)
+ connection->req_next = req;
+}
+
+static void advance_conn_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ if (!connection)
+ return;
+ if (connection->req_next != req)
+ return;
+ list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+ const unsigned s = req->rq_state;
+ if (s & RQ_NET_QUEUED)
+ break;
+ }
+ if (&req->tl_requests == &connection->transfer_log)
+ req = NULL;
+ connection->req_next = req;
+}
+
+static void set_if_null_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ if (!connection)
+ return;
+ if (connection->req_ack_pending == NULL)
+ connection->req_ack_pending = req;
+}
+
+static void advance_conn_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ if (!connection)
+ return;
+ if (connection->req_ack_pending != req)
+ return;
+ list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+ const unsigned s = req->rq_state;
+ if ((s & RQ_NET_SENT) && (s & RQ_NET_PENDING))
+ break;
+ }
+ if (&req->tl_requests == &connection->transfer_log)
+ req = NULL;
+ connection->req_ack_pending = req;
+}
+
+static void set_if_null_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ if (!connection)
+ return;
+ if (connection->req_not_net_done == NULL)
+ connection->req_not_net_done = req;
+}
+
+static void advance_conn_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ if (!connection)
+ return;
+ if (connection->req_not_net_done != req)
+ return;
+ list_for_each_entry_continue(req, &connection->transfer_log, tl_requests) {
+ const unsigned s = req->rq_state;
+ if ((s & RQ_NET_SENT) && !(s & RQ_NET_DONE))
+ break;
+ }
+ if (&req->tl_requests == &connection->transfer_log)
+ req = NULL;
+ connection->req_not_net_done = req;
+}
+
+/* I'd like this to be the only place that manipulates
+ * req->completion_ref and req->kref. */
+static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
+ int clear, int set)
+{
+ struct drbd_device *device = req->device;
+ struct drbd_peer_device *peer_device = first_peer_device(device);
+ unsigned s = req->rq_state;
+ int c_put = 0;
+ int k_put = 0;
+
+ if (drbd_suspended(device) && !((s | clear) & RQ_COMPLETION_SUSP))
+ set |= RQ_COMPLETION_SUSP;
+
+ /* apply */
+
+ req->rq_state &= ~clear;
+ req->rq_state |= set;
+
+ /* no change? */
+ if (req->rq_state == s)
+ return;
+
+ /* intent: get references */
+
+ if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING))
+ atomic_inc(&req->completion_ref);
+
+ if (!(s & RQ_NET_PENDING) && (set & RQ_NET_PENDING)) {
+ inc_ap_pending(device);
+ atomic_inc(&req->completion_ref);
+ }
+
+ if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) {
+ atomic_inc(&req->completion_ref);
+ set_if_null_req_next(peer_device, req);
+ }
+
+ if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK))
+ kref_get(&req->kref); /* wait for the DONE */
+
+ if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) {
+ /* potentially already completed in the asender thread */
+ if (!(s & RQ_NET_DONE)) {
+ atomic_add(req->i.size >> 9, &device->ap_in_flight);
+ set_if_null_req_not_net_done(peer_device, req);
+ }
+ if (s & RQ_NET_PENDING)
+ set_if_null_req_ack_pending(peer_device, req);
+ }
+
+ if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP))
+ atomic_inc(&req->completion_ref);
+
+ /* progress: put references */
+
+ if ((s & RQ_COMPLETION_SUSP) && (clear & RQ_COMPLETION_SUSP))
+ ++c_put;
+
+ if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) {
+ D_ASSERT(device, req->rq_state & RQ_LOCAL_PENDING);
+ /* local completion may still come in later,
+ * we need to keep the req object around. */
+ kref_get(&req->kref);
+ ++c_put;
+ }
+
+ if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) {
+ if (req->rq_state & RQ_LOCAL_ABORTED)
+ ++k_put;
+ else
+ ++c_put;
+ list_del_init(&req->req_pending_local);
+ }
+
+ if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) {
+ dec_ap_pending(device);
+ ++c_put;
+ req->acked_jif = jiffies;
+ advance_conn_req_ack_pending(peer_device, req);
+ }
+
+ if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) {
+ ++c_put;
+ advance_conn_req_next(peer_device, req);
+ }
+
+ if (!(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
+ if (s & RQ_NET_SENT)
+ atomic_sub(req->i.size >> 9, &device->ap_in_flight);
+ if (s & RQ_EXP_BARR_ACK)
+ ++k_put;
+ req->net_done_jif = jiffies;
+
+ /* in ahead/behind mode, or just in case,
+ * before we finally destroy this request,
+ * the caching pointers must not reference it anymore */
+ advance_conn_req_next(peer_device, req);
+ advance_conn_req_ack_pending(peer_device, req);
+ advance_conn_req_not_net_done(peer_device, req);
+ }
+
+ /* potentially complete and destroy */
+
+ if (k_put || c_put) {
+ /* Completion does it's own kref_put. If we are going to
+ * kref_sub below, we need req to be still around then. */
+ int at_least = k_put + !!c_put;
+ int refcount = atomic_read(&req->kref.refcount);
+ if (refcount < at_least)
+ drbd_err(device,
+ "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
+ s, req->rq_state, refcount, at_least);
+ }
+
+ /* If we made progress, retry conflicting peer requests, if any. */
+ if (req->i.waiting)
+ wake_up(&device->misc_wait);
+
+ if (c_put)
+ k_put += drbd_req_put_completion_ref(req, m, c_put);
+ if (k_put)
+ kref_sub(&req->kref, k_put, drbd_req_destroy);
+}
+
+static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req)
+{
+ char b[BDEVNAME_SIZE];
+
+ if (!__ratelimit(&drbd_ratelimit_state))
+ return;
+
+ drbd_warn(device, "local %s IO error sector %llu+%u on %s\n",
+ (req->rq_state & RQ_WRITE) ? "WRITE" : "READ",
+ (unsigned long long)req->i.sector,
+ req->i.size >> 9,
+ bdevname(device->ldev->backing_bdev, b));
+}
+
+/* Helper for HANDED_OVER_TO_NETWORK.
+ * Is this a protocol A write (neither WRITE_ACK nor RECEIVE_ACK expected)?
+ * Is it also still "PENDING"?
+ * --> If so, clear PENDING and set NET_OK below.
+ * If it is a protocol A write, but not RQ_PENDING anymore, neg-ack was faster
+ * (and we must not set RQ_NET_OK) */
+static inline bool is_pending_write_protocol_A(struct drbd_request *req)
+{
+ return (req->rq_state &
+ (RQ_WRITE|RQ_NET_PENDING|RQ_EXP_WRITE_ACK|RQ_EXP_RECEIVE_ACK))
+ == (RQ_WRITE|RQ_NET_PENDING);
+}
+
+/* obviously this could be coded as many single functions
+ * instead of one huge switch,
+ * or by putting the code directly in the respective locations
+ * (as it has been before).
+ *
+ * but having it this way
+ * enforces that it is all in this one place, where it is easier to audit,
+ * it makes it obvious that whatever "event" "happens" to a request should
+ * happen "atomically" within the req_lock,
+ * and it enforces that we have to think in a very structured manner
+ * about the "events" that may happen to a request during its life time ...
+ */
+int __req_mod(struct drbd_request *req, enum drbd_req_event what,
+ struct bio_and_error *m)
+{
+ struct drbd_device *const device = req->device;
+ struct drbd_peer_device *const peer_device = first_peer_device(device);
+ struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
+ struct net_conf *nc;
+ int p, rv = 0;
+
+ if (m)
+ m->bio = NULL;
+
+ switch (what) {
+ default:
+ drbd_err(device, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
+ break;
+
+ /* does not happen...
+ * initialization done in drbd_req_new
+ case CREATED:
+ break;
+ */
+
+ case TO_BE_SENT: /* via network */
+ /* reached via __drbd_make_request
+ * and from w_read_retry_remote */
+ D_ASSERT(device, !(req->rq_state & RQ_NET_MASK));
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ p = nc->wire_protocol;
+ rcu_read_unlock();
+ req->rq_state |=
+ p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK :
+ p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0;
+ mod_rq_state(req, m, 0, RQ_NET_PENDING);
+ break;
+
+ case TO_BE_SUBMITTED: /* locally */
+ /* reached via __drbd_make_request */
+ D_ASSERT(device, !(req->rq_state & RQ_LOCAL_MASK));
+ mod_rq_state(req, m, 0, RQ_LOCAL_PENDING);
+ break;
+
+ case COMPLETED_OK:
+ if (req->rq_state & RQ_WRITE)
+ device->writ_cnt += req->i.size >> 9;
+ else
+ device->read_cnt += req->i.size >> 9;
+
+ mod_rq_state(req, m, RQ_LOCAL_PENDING,
+ RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
+ break;
+
+ case ABORT_DISK_IO:
+ mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED);
+ break;
+
+ case WRITE_COMPLETED_WITH_ERROR:
+ drbd_report_io_error(device, req);
+ __drbd_chk_io_error(device, DRBD_WRITE_ERROR);
+ mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
+ break;
+
+ case READ_COMPLETED_WITH_ERROR:
+ drbd_set_out_of_sync(device, req->i.sector, req->i.size);
+ drbd_report_io_error(device, req);
+ __drbd_chk_io_error(device, DRBD_READ_ERROR);
+ /* fall through. */
+ case READ_AHEAD_COMPLETED_WITH_ERROR:
+ /* it is legal to fail READA, no __drbd_chk_io_error in that case. */
+ mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
+ break;
+
+ case DISCARD_COMPLETED_NOTSUPP:
+ case DISCARD_COMPLETED_WITH_ERROR:
+ /* I'd rather not detach from local disk just because it
+ * failed a REQ_DISCARD. */
+ mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
+ break;
+
+ case QUEUE_FOR_NET_READ:
+ /* READ or READA, and
+ * no local disk,
+ * or target area marked as invalid,
+ * or just got an io-error. */
+ /* from __drbd_make_request
+ * or from bio_endio during read io-error recovery */
+
+ /* So we can verify the handle in the answer packet.
+ * Corresponding drbd_remove_request_interval is in
+ * drbd_req_complete() */
+ D_ASSERT(device, drbd_interval_empty(&req->i));
+ drbd_insert_interval(&device->read_requests, &req->i);
+
+ set_bit(UNPLUG_REMOTE, &device->flags);
+
+ D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+ D_ASSERT(device, (req->rq_state & RQ_LOCAL_MASK) == 0);
+ mod_rq_state(req, m, 0, RQ_NET_QUEUED);
+ req->w.cb = w_send_read_req;
+ drbd_queue_work(&connection->sender_work,
+ &req->w);
+ break;
+
+ case QUEUE_FOR_NET_WRITE:
+ /* assert something? */
+ /* from __drbd_make_request only */
+
+ /* Corresponding drbd_remove_request_interval is in
+ * drbd_req_complete() */
+ D_ASSERT(device, drbd_interval_empty(&req->i));
+ drbd_insert_interval(&device->write_requests, &req->i);
+
+ /* NOTE
+ * In case the req ended up on the transfer log before being
+ * queued on the worker, it could lead to this request being
+ * missed during cleanup after connection loss.
+ * So we have to do both operations here,
+ * within the same lock that protects the transfer log.
+ *
+ * _req_add_to_epoch(req); this has to be after the
+ * _maybe_start_new_epoch(req); which happened in
+ * __drbd_make_request, because we now may set the bit
+ * again ourselves to close the current epoch.
+ *
+ * Add req to the (now) current epoch (barrier). */
+
+ /* otherwise we may lose an unplug, which may cause some remote
+ * io-scheduler timeout to expire, increasing maximum latency,
+ * hurting performance. */
+ set_bit(UNPLUG_REMOTE, &device->flags);
+
+ /* queue work item to send data */
+ D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+ mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK);
+ req->w.cb = w_send_dblock;
+ drbd_queue_work(&connection->sender_work,
+ &req->w);
+
+ /* close the epoch, in case it outgrew the limit */
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ p = nc->max_epoch_size;
+ rcu_read_unlock();
+ if (connection->current_tle_writes >= p)
+ start_new_tl_epoch(connection);
+
+ break;
+
+ case QUEUE_FOR_SEND_OOS:
+ mod_rq_state(req, m, 0, RQ_NET_QUEUED);
+ req->w.cb = w_send_out_of_sync;
+ drbd_queue_work(&connection->sender_work,
+ &req->w);
+ break;
+
+ case READ_RETRY_REMOTE_CANCELED:
+ case SEND_CANCELED:
+ case SEND_FAILED:
+ /* real cleanup will be done from tl_clear. just update flags
+ * so it is no longer marked as on the worker queue */
+ mod_rq_state(req, m, RQ_NET_QUEUED, 0);
+ break;
+
+ case HANDED_OVER_TO_NETWORK:
+ /* assert something? */
+ if (is_pending_write_protocol_A(req))
+ /* this is what is dangerous about protocol A:
+ * pretend it was successfully written on the peer. */
+ mod_rq_state(req, m, RQ_NET_QUEUED|RQ_NET_PENDING,
+ RQ_NET_SENT|RQ_NET_OK);
+ else
+ mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
+ /* It is still not yet RQ_NET_DONE until the
+ * corresponding epoch barrier got acked as well,
+ * so we know what to dirty on connection loss. */
+ break;
+
+ case OOS_HANDED_TO_NETWORK:
+ /* Was not set PENDING, no longer QUEUED, so is now DONE
+ * as far as this connection is concerned. */
+ mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_DONE);
+ break;
+
+ case CONNECTION_LOST_WHILE_PENDING:
+ /* transfer log cleanup after connection loss */
+ mod_rq_state(req, m,
+ RQ_NET_OK|RQ_NET_PENDING|RQ_COMPLETION_SUSP,
+ RQ_NET_DONE);
+ break;
+
+ case CONFLICT_RESOLVED:
+ /* for superseded conflicting writes of multiple primaries,
+ * there is no need to keep anything in the tl, potential
+ * node crashes are covered by the activity log.
+ *
+ * If this request had been marked as RQ_POSTPONED before,
+ * it will actually not be completed, but "restarted",
+ * resubmitted from the retry worker context. */
+ D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+ D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK);
+ mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_DONE|RQ_NET_OK);
+ break;
+
+ case WRITE_ACKED_BY_PEER_AND_SIS:
+ req->rq_state |= RQ_NET_SIS;
+ case WRITE_ACKED_BY_PEER:
+ /* Normal operation protocol C: successfully written on peer.
+ * During resync, even in protocol != C,
+ * we requested an explicit write ack anyways.
+ * Which means we cannot even assert anything here.
+ * Nothing more to do here.
+ * We want to keep the tl in place for all protocols, to cater
+ * for volatile write-back caches on lower level devices. */
+ goto ack_common;
+ case RECV_ACKED_BY_PEER:
+ D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK);
+ /* protocol B; pretends to be successfully written on peer.
+ * see also notes above in HANDED_OVER_TO_NETWORK about
+ * protocol != C */
+ ack_common:
+ mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
+ break;
+
+ case POSTPONE_WRITE:
+ D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK);
+ /* If this node has already detected the write conflict, the
+ * worker will be waiting on misc_wait. Wake it up once this
+ * request has completed locally.
+ */
+ D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+ req->rq_state |= RQ_POSTPONED;
+ if (req->i.waiting)
+ wake_up(&device->misc_wait);
+ /* Do not clear RQ_NET_PENDING. This request will make further
+ * progress via restart_conflicting_writes() or
+ * fail_postponed_requests(). Hopefully. */
+ break;
+
+ case NEG_ACKED:
+ mod_rq_state(req, m, RQ_NET_OK|RQ_NET_PENDING, 0);
+ break;
+
+ case FAIL_FROZEN_DISK_IO:
+ if (!(req->rq_state & RQ_LOCAL_COMPLETED))
+ break;
+ mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0);
+ break;
+
+ case RESTART_FROZEN_DISK_IO:
+ if (!(req->rq_state & RQ_LOCAL_COMPLETED))
+ break;
+
+ mod_rq_state(req, m,
+ RQ_COMPLETION_SUSP|RQ_LOCAL_COMPLETED,
+ RQ_LOCAL_PENDING);
+
+ rv = MR_READ;
+ if (bio_data_dir(req->master_bio) == WRITE)
+ rv = MR_WRITE;
+
+ get_ldev(device); /* always succeeds in this call path */
+ req->w.cb = w_restart_disk_io;
+ drbd_queue_work(&connection->sender_work,
+ &req->w);
+ break;
+
+ case RESEND:
+ /* Simply complete (local only) READs. */
+ if (!(req->rq_state & RQ_WRITE) && !req->w.cb) {
+ mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0);
+ break;
+ }
+
+ /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
+ before the connection loss (B&C only); only P_BARRIER_ACK
+ (or the local completion?) was missing when we suspended.
+ Throwing them out of the TL here by pretending we got a BARRIER_ACK.
+ During connection handshake, we ensure that the peer was not rebooted. */
+ if (!(req->rq_state & RQ_NET_OK)) {
+ /* FIXME could this possibly be a req->dw.cb == w_send_out_of_sync?
+ * in that case we must not set RQ_NET_PENDING. */
+
+ mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING);
+ if (req->w.cb) {
+ /* w.cb expected to be w_send_dblock, or w_send_read_req */
+ drbd_queue_work(&connection->sender_work,
+ &req->w);
+ rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
+ } /* else: FIXME can this happen? */
+ break;
+ }
+ /* else, fall through to BARRIER_ACKED */
+
+ case BARRIER_ACKED:
+ /* barrier ack for READ requests does not make sense */
+ if (!(req->rq_state & RQ_WRITE))
+ break;
+
+ if (req->rq_state & RQ_NET_PENDING) {
+ /* barrier came in before all requests were acked.
+ * this is bad, because if the connection is lost now,
+ * we won't be able to clean them up... */
+ drbd_err(device, "FIXME (BARRIER_ACKED but pending)\n");
+ }
+ /* Allowed to complete requests, even while suspended.
+ * As this is called for all requests within a matching epoch,
+ * we need to filter, and only set RQ_NET_DONE for those that
+ * have actually been on the wire. */
+ mod_rq_state(req, m, RQ_COMPLETION_SUSP,
+ (req->rq_state & RQ_NET_MASK) ? RQ_NET_DONE : 0);
+ break;
+
+ case DATA_RECEIVED:
+ D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+ mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE);
+ break;
+
+ case QUEUE_AS_DRBD_BARRIER:
+ start_new_tl_epoch(connection);
+ mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE);
+ break;
+ };
+
+ return rv;
+}
+
+/* we may do a local read if:
+ * - we are consistent (of course),
+ * - or we are generally inconsistent,
+ * BUT we are still/already IN SYNC for this area.
+ * since size may be bigger than BM_BLOCK_SIZE,
+ * we may need to check several bits.
+ */
+static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size)
+{
+ unsigned long sbnr, ebnr;
+ sector_t esector, nr_sectors;
+
+ if (device->state.disk == D_UP_TO_DATE)
+ return true;
+ if (device->state.disk != D_INCONSISTENT)
+ return false;
+ esector = sector + (size >> 9) - 1;
+ nr_sectors = drbd_get_capacity(device->this_bdev);
+ D_ASSERT(device, sector < nr_sectors);
+ D_ASSERT(device, esector < nr_sectors);
+
+ sbnr = BM_SECT_TO_BIT(sector);
+ ebnr = BM_SECT_TO_BIT(esector);
+
+ return drbd_bm_count_bits(device, sbnr, ebnr) == 0;
+}
+
+static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t sector,
+ enum drbd_read_balancing rbm)
+{
+ struct backing_dev_info *bdi;
+ int stripe_shift;
+
+ switch (rbm) {
+ case RB_CONGESTED_REMOTE:
+ bdi = &device->ldev->backing_bdev->bd_disk->queue->backing_dev_info;
+ return bdi_read_congested(bdi);
+ case RB_LEAST_PENDING:
+ return atomic_read(&device->local_cnt) >
+ atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt);
+ case RB_32K_STRIPING: /* stripe_shift = 15 */
+ case RB_64K_STRIPING:
+ case RB_128K_STRIPING:
+ case RB_256K_STRIPING:
+ case RB_512K_STRIPING:
+ case RB_1M_STRIPING: /* stripe_shift = 20 */
+ stripe_shift = (rbm - RB_32K_STRIPING + 15);
+ return (sector >> (stripe_shift - 9)) & 1;
+ case RB_ROUND_ROBIN:
+ return test_and_change_bit(READ_BALANCE_RR, &device->flags);
+ case RB_PREFER_REMOTE:
+ return true;
+ case RB_PREFER_LOCAL:
+ default:
+ return false;
+ }
+}
+
+/*
+ * complete_conflicting_writes - wait for any conflicting write requests
+ *
+ * The write_requests tree contains all active write requests which we
+ * currently know about. Wait for any requests to complete which conflict with
+ * the new one.
+ *
+ * Only way out: remove the conflicting intervals from the tree.
+ */
+static void complete_conflicting_writes(struct drbd_request *req)
+{
+ DEFINE_WAIT(wait);
+ struct drbd_device *device = req->device;
+ struct drbd_interval *i;
+ sector_t sector = req->i.sector;
+ int size = req->i.size;
+
+ i = drbd_find_overlap(&device->write_requests, sector, size);
+ if (!i)
+ return;
+
+ for (;;) {
+ prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
+ i = drbd_find_overlap(&device->write_requests, sector, size);
+ if (!i)
+ break;
+ /* Indicate to wake up device->misc_wait on progress. */
+ i->waiting = true;
+ spin_unlock_irq(&device->resource->req_lock);
+ schedule();
+ spin_lock_irq(&device->resource->req_lock);
+ }
+ finish_wait(&device->misc_wait, &wait);
+}
+
+/* called within req_lock and rcu_read_lock() */
+static void maybe_pull_ahead(struct drbd_device *device)
+{
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+ struct net_conf *nc;
+ bool congested = false;
+ enum drbd_on_congestion on_congestion;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ on_congestion = nc ? nc->on_congestion : OC_BLOCK;
+ rcu_read_unlock();
+ if (on_congestion == OC_BLOCK ||
+ connection->agreed_pro_version < 96)
+ return;
+
+ if (on_congestion == OC_PULL_AHEAD && device->state.conn == C_AHEAD)
+ return; /* nothing to do ... */
+
+ /* If I don't even have good local storage, we can not reasonably try
+ * to pull ahead of the peer. We also need the local reference to make
+ * sure device->act_log is there.
+ */
+ if (!get_ldev_if_state(device, D_UP_TO_DATE))
+ return;
+
+ if (nc->cong_fill &&
+ atomic_read(&device->ap_in_flight) >= nc->cong_fill) {
+ drbd_info(device, "Congestion-fill threshold reached\n");
+ congested = true;
+ }
+
+ if (device->act_log->used >= nc->cong_extents) {
+ drbd_info(device, "Congestion-extents threshold reached\n");
+ congested = true;
+ }
+
+ if (congested) {
+ /* start a new epoch for non-mirrored writes */
+ start_new_tl_epoch(first_peer_device(device)->connection);
+
+ if (on_congestion == OC_PULL_AHEAD)
+ _drbd_set_state(_NS(device, conn, C_AHEAD), 0, NULL);
+ else /*nc->on_congestion == OC_DISCONNECT */
+ _drbd_set_state(_NS(device, conn, C_DISCONNECTING), 0, NULL);
+ }
+ put_ldev(device);
+}
+
+/* If this returns false, and req->private_bio is still set,
+ * this should be submitted locally.
+ *
+ * If it returns false, but req->private_bio is not set,
+ * we do not have access to good data :(
+ *
+ * Otherwise, this destroys req->private_bio, if any,
+ * and returns true.
+ */
+static bool do_remote_read(struct drbd_request *req)
+{
+ struct drbd_device *device = req->device;
+ enum drbd_read_balancing rbm;
+
+ if (req->private_bio) {
+ if (!drbd_may_do_local_read(device,
+ req->i.sector, req->i.size)) {
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ put_ldev(device);
+ }
+ }
+
+ if (device->state.pdsk != D_UP_TO_DATE)
+ return false;
+
+ if (req->private_bio == NULL)
+ return true;
+
+ /* TODO: improve read balancing decisions, take into account drbd
+ * protocol, pending requests etc. */
+
+ rcu_read_lock();
+ rbm = rcu_dereference(device->ldev->disk_conf)->read_balancing;
+ rcu_read_unlock();
+
+ if (rbm == RB_PREFER_LOCAL && req->private_bio)
+ return false; /* submit locally */
+
+ if (remote_due_to_read_balancing(device, req->i.sector, rbm)) {
+ if (req->private_bio) {
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ put_ldev(device);
+ }
+ return true;
+ }
+
+ return false;
+}
+
+/* returns number of connections (== 1, for drbd 8.4)
+ * expected to actually write this data,
+ * which does NOT include those that we are L_AHEAD for. */
+static int drbd_process_write_request(struct drbd_request *req)
+{
+ struct drbd_device *device = req->device;
+ int remote, send_oos;
+
+ remote = drbd_should_do_remote(device->state);
+ send_oos = drbd_should_send_out_of_sync(device->state);
+
+ /* Need to replicate writes. Unless it is an empty flush,
+ * which is better mapped to a DRBD P_BARRIER packet,
+ * also for drbd wire protocol compatibility reasons.
+ * If this was a flush, just start a new epoch.
+ * Unless the current epoch was empty anyways, or we are not currently
+ * replicating, in which case there is no point. */
+ if (unlikely(req->i.size == 0)) {
+ /* The only size==0 bios we expect are empty flushes. */
+ D_ASSERT(device, req->master_bio->bi_rw & REQ_FLUSH);
+ if (remote)
+ _req_mod(req, QUEUE_AS_DRBD_BARRIER);
+ return remote;
+ }
+
+ if (!remote && !send_oos)
+ return 0;
+
+ D_ASSERT(device, !(remote && send_oos));
+
+ if (remote) {
+ _req_mod(req, TO_BE_SENT);
+ _req_mod(req, QUEUE_FOR_NET_WRITE);
+ } else if (drbd_set_out_of_sync(device, req->i.sector, req->i.size))
+ _req_mod(req, QUEUE_FOR_SEND_OOS);
+
+ return remote;
+}
+
+static void
+drbd_submit_req_private_bio(struct drbd_request *req)
+{
+ struct drbd_device *device = req->device;
+ struct bio *bio = req->private_bio;
+ const int rw = bio_rw(bio);
+
+ bio->bi_bdev = device->ldev->backing_bdev;
+
+ /* State may have changed since we grabbed our reference on the
+ * ->ldev member. Double check, and short-circuit to endio.
+ * In case the last activity log transaction failed to get on
+ * stable storage, and this is a WRITE, we may not even submit
+ * this bio. */
+ if (get_ldev(device)) {
+ req->pre_submit_jif = jiffies;
+ if (drbd_insert_fault(device,
+ rw == WRITE ? DRBD_FAULT_DT_WR
+ : rw == READ ? DRBD_FAULT_DT_RD
+ : DRBD_FAULT_DT_RA))
+ bio_endio(bio, -EIO);
+ else
+ generic_make_request(bio);
+ put_ldev(device);
+ } else
+ bio_endio(bio, -EIO);
+}
+
+static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req)
+{
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&req->tl_requests, &device->submit.writes);
+ list_add_tail(&req->req_pending_master_completion,
+ &device->pending_master_completion[1 /* WRITE */]);
+ spin_unlock_irq(&device->resource->req_lock);
+ queue_work(device->submit.wq, &device->submit.worker);
+ /* do_submit() may sleep internally on al_wait, too */
+ wake_up(&device->al_wait);
+}
+
+/* returns the new drbd_request pointer, if the caller is expected to
+ * drbd_send_and_submit() it (to save latency), or NULL if we queued the
+ * request on the submitter thread.
+ * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
+ */
+static struct drbd_request *
+drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
+{
+ const int rw = bio_data_dir(bio);
+ struct drbd_request *req;
+
+ /* allocate outside of all locks; */
+ req = drbd_req_new(device, bio);
+ if (!req) {
+ dec_ap_bio(device);
+ /* only pass the error to the upper layers.
+ * if user cannot handle io errors, that's not our business. */
+ drbd_err(device, "could not kmalloc() req\n");
+ bio_endio(bio, -ENOMEM);
+ return ERR_PTR(-ENOMEM);
+ }
+ req->start_jif = start_jif;
+
+ if (!get_ldev(device)) {
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ }
+
+ /* Update disk stats */
+ _drbd_start_io_acct(device, req);
+
+ if (rw == WRITE && req->private_bio && req->i.size
+ && !test_bit(AL_SUSPENDED, &device->flags)) {
+ if (!drbd_al_begin_io_fastpath(device, &req->i)) {
+ atomic_inc(&device->ap_actlog_cnt);
+ drbd_queue_write(device, req);
+ return NULL;
+ }
+ req->rq_state |= RQ_IN_ACT_LOG;
+ req->in_actlog_jif = jiffies;
+ }
+
+ return req;
+}
+
+static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req)
+{
+ struct drbd_resource *resource = device->resource;
+ const int rw = bio_rw(req->master_bio);
+ struct bio_and_error m = { NULL, };
+ bool no_remote = false;
+ bool submit_private_bio = false;
+
+ spin_lock_irq(&resource->req_lock);
+ if (rw == WRITE) {
+ /* This may temporarily give up the req_lock,
+ * but will re-aquire it before it returns here.
+ * Needs to be before the check on drbd_suspended() */
+ complete_conflicting_writes(req);
+ /* no more giving up req_lock from now on! */
+
+ /* check for congestion, and potentially stop sending
+ * full data updates, but start sending "dirty bits" only. */
+ maybe_pull_ahead(device);
+ }
+
+
+ if (drbd_suspended(device)) {
+ /* push back and retry: */
+ req->rq_state |= RQ_POSTPONED;
+ if (req->private_bio) {
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ put_ldev(device);
+ }
+ goto out;
+ }
+
+ /* We fail READ/READA early, if we can not serve it.
+ * We must do this before req is registered on any lists.
+ * Otherwise, drbd_req_complete() will queue failed READ for retry. */
+ if (rw != WRITE) {
+ if (!do_remote_read(req) && !req->private_bio)
+ goto nodata;
+ }
+
+ /* which transfer log epoch does this belong to? */
+ req->epoch = atomic_read(&first_peer_device(device)->connection->current_tle_nr);
+
+ /* no point in adding empty flushes to the transfer log,
+ * they are mapped to drbd barriers already. */
+ if (likely(req->i.size!=0)) {
+ if (rw == WRITE)
+ first_peer_device(device)->connection->current_tle_writes++;
+
+ list_add_tail(&req->tl_requests, &first_peer_device(device)->connection->transfer_log);
+ }
+
+ if (rw == WRITE) {
+ if (!drbd_process_write_request(req))
+ no_remote = true;
+ } else {
+ /* We either have a private_bio, or we can read from remote.
+ * Otherwise we had done the goto nodata above. */
+ if (req->private_bio == NULL) {
+ _req_mod(req, TO_BE_SENT);
+ _req_mod(req, QUEUE_FOR_NET_READ);
+ } else
+ no_remote = true;
+ }
+
+ /* If it took the fast path in drbd_request_prepare, add it here.
+ * The slow path has added it already. */
+ if (list_empty(&req->req_pending_master_completion))
+ list_add_tail(&req->req_pending_master_completion,
+ &device->pending_master_completion[rw == WRITE]);
+ if (req->private_bio) {
+ /* needs to be marked within the same spinlock */
+ list_add_tail(&req->req_pending_local,
+ &device->pending_completion[rw == WRITE]);
+ _req_mod(req, TO_BE_SUBMITTED);
+ /* but we need to give up the spinlock to submit */
+ submit_private_bio = true;
+ } else if (no_remote) {
+nodata:
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "IO ERROR: neither local nor remote data, sector %llu+%u\n",
+ (unsigned long long)req->i.sector, req->i.size >> 9);
+ /* A write may have been queued for send_oos, however.
+ * So we can not simply free it, we must go through drbd_req_put_completion_ref() */
+ }
+
+out:
+ if (drbd_req_put_completion_ref(req, &m, 1))
+ kref_put(&req->kref, drbd_req_destroy);
+ spin_unlock_irq(&resource->req_lock);
+
+ /* Even though above is a kref_put(), this is safe.
+ * As long as we still need to submit our private bio,
+ * we hold a completion ref, and the request cannot disappear.
+ * If however this request did not even have a private bio to submit
+ * (e.g. remote read), req may already be invalid now.
+ * That's why we cannot check on req->private_bio. */
+ if (submit_private_bio)
+ drbd_submit_req_private_bio(req);
+ if (m.bio)
+ complete_master_bio(device, &m);
+}
+
+void __drbd_make_request(struct drbd_device *device, struct bio *bio, unsigned long start_jif)
+{
+ struct drbd_request *req = drbd_request_prepare(device, bio, start_jif);
+ if (IS_ERR_OR_NULL(req))
+ return;
+ drbd_send_and_submit(device, req);
+}
+
+static void submit_fast_path(struct drbd_device *device, struct list_head *incoming)
+{
+ struct drbd_request *req, *tmp;
+ list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
+ const int rw = bio_data_dir(req->master_bio);
+
+ if (rw == WRITE /* rw != WRITE should not even end up here! */
+ && req->private_bio && req->i.size
+ && !test_bit(AL_SUSPENDED, &device->flags)) {
+ if (!drbd_al_begin_io_fastpath(device, &req->i))
+ continue;
+
+ req->rq_state |= RQ_IN_ACT_LOG;
+ req->in_actlog_jif = jiffies;
+ atomic_dec(&device->ap_actlog_cnt);
+ }
+
+ list_del_init(&req->tl_requests);
+ drbd_send_and_submit(device, req);
+ }
+}
+
+static bool prepare_al_transaction_nonblock(struct drbd_device *device,
+ struct list_head *incoming,
+ struct list_head *pending,
+ struct list_head *later)
+{
+ struct drbd_request *req, *tmp;
+ int wake = 0;
+ int err;
+
+ spin_lock_irq(&device->al_lock);
+ list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
+ err = drbd_al_begin_io_nonblock(device, &req->i);
+ if (err == -ENOBUFS)
+ break;
+ if (err == -EBUSY)
+ wake = 1;
+ if (err)
+ list_move_tail(&req->tl_requests, later);
+ else
+ list_move_tail(&req->tl_requests, pending);
+ }
+ spin_unlock_irq(&device->al_lock);
+ if (wake)
+ wake_up(&device->al_wait);
+ return !list_empty(pending);
+}
+
+void send_and_submit_pending(struct drbd_device *device, struct list_head *pending)
+{
+ struct drbd_request *req, *tmp;
+
+ list_for_each_entry_safe(req, tmp, pending, tl_requests) {
+ req->rq_state |= RQ_IN_ACT_LOG;
+ req->in_actlog_jif = jiffies;
+ atomic_dec(&device->ap_actlog_cnt);
+ list_del_init(&req->tl_requests);
+ drbd_send_and_submit(device, req);
+ }
+}
+
+void do_submit(struct work_struct *ws)
+{
+ struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker);
+ LIST_HEAD(incoming); /* from drbd_make_request() */
+ LIST_HEAD(pending); /* to be submitted after next AL-transaction commit */
+ LIST_HEAD(busy); /* blocked by resync requests */
+
+ /* grab new incoming requests */
+ spin_lock_irq(&device->resource->req_lock);
+ list_splice_tail_init(&device->submit.writes, &incoming);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ for (;;) {
+ DEFINE_WAIT(wait);
+
+ /* move used-to-be-busy back to front of incoming */
+ list_splice_init(&busy, &incoming);
+ submit_fast_path(device, &incoming);
+ if (list_empty(&incoming))
+ break;
+
+ for (;;) {
+ prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE);
+
+ list_splice_init(&busy, &incoming);
+ prepare_al_transaction_nonblock(device, &incoming, &pending, &busy);
+ if (!list_empty(&pending))
+ break;
+
+ schedule();
+
+ /* If all currently "hot" activity log extents are kept busy by
+ * incoming requests, we still must not totally starve new
+ * requests to "cold" extents.
+ * Something left on &incoming means there had not been
+ * enough update slots available, and the activity log
+ * has been marked as "starving".
+ *
+ * Try again now, without looking for new requests,
+ * effectively blocking all new requests until we made
+ * at least _some_ progress with what we currently have.
+ */
+ if (!list_empty(&incoming))
+ continue;
+
+ /* Nothing moved to pending, but nothing left
+ * on incoming: all moved to busy!
+ * Grab new and iterate. */
+ spin_lock_irq(&device->resource->req_lock);
+ list_splice_tail_init(&device->submit.writes, &incoming);
+ spin_unlock_irq(&device->resource->req_lock);
+ }
+ finish_wait(&device->al_wait, &wait);
+
+ /* If the transaction was full, before all incoming requests
+ * had been processed, skip ahead to commit, and iterate
+ * without splicing in more incoming requests from upper layers.
+ *
+ * Else, if all incoming have been processed,
+ * they have become either "pending" (to be submitted after
+ * next transaction commit) or "busy" (blocked by resync).
+ *
+ * Maybe more was queued, while we prepared the transaction?
+ * Try to stuff those into this transaction as well.
+ * Be strictly non-blocking here,
+ * we already have something to commit.
+ *
+ * Commit if we don't make any more progres.
+ */
+
+ while (list_empty(&incoming)) {
+ LIST_HEAD(more_pending);
+ LIST_HEAD(more_incoming);
+ bool made_progress;
+
+ /* It is ok to look outside the lock,
+ * it's only an optimization anyways */
+ if (list_empty(&device->submit.writes))
+ break;
+
+ spin_lock_irq(&device->resource->req_lock);
+ list_splice_tail_init(&device->submit.writes, &more_incoming);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ if (list_empty(&more_incoming))
+ break;
+
+ made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending, &busy);
+
+ list_splice_tail_init(&more_pending, &pending);
+ list_splice_tail_init(&more_incoming, &incoming);
+ if (!made_progress)
+ break;
+ }
+
+ drbd_al_begin_io_commit(device);
+ send_and_submit_pending(device, &pending);
+ }
+}
+
+void drbd_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct drbd_device *device = (struct drbd_device *) q->queuedata;
+ unsigned long start_jif;
+
+ start_jif = jiffies;
+
+ /*
+ * what we "blindly" assume:
+ */
+ D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512));
+
+ inc_ap_bio(device);
+ __drbd_make_request(device, bio, start_jif);
+}
+
+/* This is called by bio_add_page().
+ *
+ * q->max_hw_sectors and other global limits are already enforced there.
+ *
+ * We need to call down to our lower level device,
+ * in case it has special restrictions.
+ *
+ * We also may need to enforce configured max-bio-bvecs limits.
+ *
+ * As long as the BIO is empty we have to allow at least one bvec,
+ * regardless of size and offset, so no need to ask lower levels.
+ */
+int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
+{
+ struct drbd_device *device = (struct drbd_device *) q->queuedata;
+ unsigned int bio_size = bvm->bi_size;
+ int limit = DRBD_MAX_BIO_SIZE;
+ int backing_limit;
+
+ if (bio_size && get_ldev(device)) {
+ unsigned int max_hw_sectors = queue_max_hw_sectors(q);
+ struct request_queue * const b =
+ device->ldev->backing_bdev->bd_disk->queue;
+ if (b->merge_bvec_fn) {
+ bvm->bi_bdev = device->ldev->backing_bdev;
+ backing_limit = b->merge_bvec_fn(b, bvm, bvec);
+ limit = min(limit, backing_limit);
+ }
+ put_ldev(device);
+ if ((limit >> 9) > max_hw_sectors)
+ limit = max_hw_sectors << 9;
+ }
+ return limit;
+}
+
+void request_timer_fn(unsigned long data)
+{
+ struct drbd_device *device = (struct drbd_device *) data;
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+ struct drbd_request *req_read, *req_write, *req_peer; /* oldest request */
+ struct net_conf *nc;
+ unsigned long oldest_submit_jif;
+ unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
+ unsigned long now;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ if (nc && device->state.conn >= C_WF_REPORT_PARAMS)
+ ent = nc->timeout * HZ/10 * nc->ko_count;
+
+ if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */
+ dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10;
+ put_ldev(device);
+ }
+ rcu_read_unlock();
+
+ et = min_not_zero(dt, ent);
+
+ if (!et)
+ return; /* Recurring timer stopped */
+
+ now = jiffies;
+ nt = now + et;
+
+ spin_lock_irq(&device->resource->req_lock);
+ req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local);
+ req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local);
+ req_peer = connection->req_not_net_done;
+ /* maybe the oldest request waiting for the peer is in fact still
+ * blocking in tcp sendmsg */
+ if (!req_peer && connection->req_next && connection->req_next->pre_send_jif)
+ req_peer = connection->req_next;
+
+ /* evaluate the oldest peer request only in one timer! */
+ if (req_peer && req_peer->device != device)
+ req_peer = NULL;
+
+ /* do we have something to evaluate? */
+ if (req_peer == NULL && req_write == NULL && req_read == NULL)
+ goto out;
+
+ oldest_submit_jif =
+ (req_write && req_read)
+ ? ( time_before(req_write->pre_submit_jif, req_read->pre_submit_jif)
+ ? req_write->pre_submit_jif : req_read->pre_submit_jif )
+ : req_write ? req_write->pre_submit_jif
+ : req_read ? req_read->pre_submit_jif : now;
+
+ /* The request is considered timed out, if
+ * - we have some effective timeout from the configuration,
+ * with above state restrictions applied,
+ * - the oldest request is waiting for a response from the network
+ * resp. the local disk,
+ * - the oldest request is in fact older than the effective timeout,
+ * - the connection was established (resp. disk was attached)
+ * for longer than the timeout already.
+ * Note that for 32bit jiffies and very stable connections/disks,
+ * we may have a wrap around, which is catched by
+ * !time_in_range(now, last_..._jif, last_..._jif + timeout).
+ *
+ * Side effect: once per 32bit wrap-around interval, which means every
+ * ~198 days with 250 HZ, we have a window where the timeout would need
+ * to expire twice (worst case) to become effective. Good enough.
+ */
+ if (ent && req_peer &&
+ time_after(now, req_peer->pre_send_jif + ent) &&
+ !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) {
+ drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n");
+ _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD);
+ }
+ if (dt && oldest_submit_jif != now &&
+ time_after(now, oldest_submit_jif + dt) &&
+ !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
+ drbd_warn(device, "Local backing device failed to meet the disk-timeout\n");
+ __drbd_chk_io_error(device, DRBD_FORCE_DETACH);
+ }
+
+ /* Reschedule timer for the nearest not already expired timeout.
+ * Fallback to now + min(effective network timeout, disk timeout). */
+ ent = (ent && req_peer && time_before(now, req_peer->pre_send_jif + ent))
+ ? req_peer->pre_send_jif + ent : now + et;
+ dt = (dt && oldest_submit_jif != now && time_before(now, oldest_submit_jif + dt))
+ ? oldest_submit_jif + dt : now + et;
+ nt = time_before(ent, dt) ? ent : dt;
+out:
+ spin_unlock_irq(&device->resource->req_lock);
+ mod_timer(&device->request_timer, nt);
+}
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
new file mode 100644
index 000000000..9f6a04080
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.h
@@ -0,0 +1,351 @@
+/*
+ drbd_req.h
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2006-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+ Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+
+ DRBD is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ DRBD is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DRBD_REQ_H
+#define _DRBD_REQ_H
+
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+
+/* The request callbacks will be called in irq context by the IDE drivers,
+ and in Softirqs/Tasklets/BH context by the SCSI drivers,
+ and by the receiver and worker in kernel-thread context.
+ Try to get the locking right :) */
+
+/*
+ * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are
+ * associated with IO requests originating from the block layer above us.
+ *
+ * There are quite a few things that may happen to a drbd request
+ * during its lifetime.
+ *
+ * It will be created.
+ * It will be marked with the intention to be
+ * submitted to local disk and/or
+ * send via the network.
+ *
+ * It has to be placed on the transfer log and other housekeeping lists,
+ * In case we have a network connection.
+ *
+ * It may be identified as a concurrent (write) request
+ * and be handled accordingly.
+ *
+ * It may me handed over to the local disk subsystem.
+ * It may be completed by the local disk subsystem,
+ * either successfully or with io-error.
+ * In case it is a READ request, and it failed locally,
+ * it may be retried remotely.
+ *
+ * It may be queued for sending.
+ * It may be handed over to the network stack,
+ * which may fail.
+ * It may be acknowledged by the "peer" according to the wire_protocol in use.
+ * this may be a negative ack.
+ * It may receive a faked ack when the network connection is lost and the
+ * transfer log is cleaned up.
+ * Sending may be canceled due to network connection loss.
+ * When it finally has outlived its time,
+ * corresponding dirty bits in the resync-bitmap may be cleared or set,
+ * it will be destroyed,
+ * and completion will be signalled to the originator,
+ * with or without "success".
+ */
+
+enum drbd_req_event {
+ CREATED,
+ TO_BE_SENT,
+ TO_BE_SUBMITTED,
+
+ /* XXX yes, now I am inconsistent...
+ * these are not "events" but "actions"
+ * oh, well... */
+ QUEUE_FOR_NET_WRITE,
+ QUEUE_FOR_NET_READ,
+ QUEUE_FOR_SEND_OOS,
+
+ /* An empty flush is queued as P_BARRIER,
+ * which will cause it to complete "successfully",
+ * even if the local disk flush failed.
+ *
+ * Just like "real" requests, empty flushes (blkdev_issue_flush()) will
+ * only see an error if neither local nor remote data is reachable. */
+ QUEUE_AS_DRBD_BARRIER,
+
+ SEND_CANCELED,
+ SEND_FAILED,
+ HANDED_OVER_TO_NETWORK,
+ OOS_HANDED_TO_NETWORK,
+ CONNECTION_LOST_WHILE_PENDING,
+ READ_RETRY_REMOTE_CANCELED,
+ RECV_ACKED_BY_PEER,
+ WRITE_ACKED_BY_PEER,
+ WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */
+ CONFLICT_RESOLVED,
+ POSTPONE_WRITE,
+ NEG_ACKED,
+ BARRIER_ACKED, /* in protocol A and B */
+ DATA_RECEIVED, /* (remote read) */
+
+ COMPLETED_OK,
+ READ_COMPLETED_WITH_ERROR,
+ READ_AHEAD_COMPLETED_WITH_ERROR,
+ WRITE_COMPLETED_WITH_ERROR,
+ DISCARD_COMPLETED_NOTSUPP,
+ DISCARD_COMPLETED_WITH_ERROR,
+
+ ABORT_DISK_IO,
+ RESEND,
+ FAIL_FROZEN_DISK_IO,
+ RESTART_FROZEN_DISK_IO,
+ NOTHING,
+};
+
+/* encoding of request states for now. we don't actually need that many bits.
+ * we don't need to do atomic bit operations either, since most of the time we
+ * need to look at the connection state and/or manipulate some lists at the
+ * same time, so we should hold the request lock anyways.
+ */
+enum drbd_req_state_bits {
+ /* 3210
+ * 0000: no local possible
+ * 0001: to be submitted
+ * UNUSED, we could map: 011: submitted, completion still pending
+ * 0110: completed ok
+ * 0010: completed with error
+ * 1001: Aborted (before completion)
+ * 1x10: Aborted and completed -> free
+ */
+ __RQ_LOCAL_PENDING,
+ __RQ_LOCAL_COMPLETED,
+ __RQ_LOCAL_OK,
+ __RQ_LOCAL_ABORTED,
+
+ /* 87654
+ * 00000: no network possible
+ * 00001: to be send
+ * 00011: to be send, on worker queue
+ * 00101: sent, expecting recv_ack (B) or write_ack (C)
+ * 11101: sent,
+ * recv_ack (B) or implicit "ack" (A),
+ * still waiting for the barrier ack.
+ * master_bio may already be completed and invalidated.
+ * 11100: write acked (C),
+ * data received (for remote read, any protocol)
+ * or finally the barrier ack has arrived (B,A)...
+ * request can be freed
+ * 01100: neg-acked (write, protocol C)
+ * or neg-d-acked (read, any protocol)
+ * or killed from the transfer log
+ * during cleanup after connection loss
+ * request can be freed
+ * 01000: canceled or send failed...
+ * request can be freed
+ */
+
+ /* if "SENT" is not set, yet, this can still fail or be canceled.
+ * if "SENT" is set already, we still wait for an Ack packet.
+ * when cleared, the master_bio may be completed.
+ * in (B,A) the request object may still linger on the transaction log
+ * until the corresponding barrier ack comes in */
+ __RQ_NET_PENDING,
+
+ /* If it is QUEUED, and it is a WRITE, it is also registered in the
+ * transfer log. Currently we need this flag to avoid conflicts between
+ * worker canceling the request and tl_clear_barrier killing it from
+ * transfer log. We should restructure the code so this conflict does
+ * no longer occur. */
+ __RQ_NET_QUEUED,
+
+ /* well, actually only "handed over to the network stack".
+ *
+ * TODO can potentially be dropped because of the similar meaning
+ * of RQ_NET_SENT and ~RQ_NET_QUEUED.
+ * however it is not exactly the same. before we drop it
+ * we must ensure that we can tell a request with network part
+ * from a request without, regardless of what happens to it. */
+ __RQ_NET_SENT,
+
+ /* when set, the request may be freed (if RQ_NET_QUEUED is clear).
+ * basically this means the corresponding P_BARRIER_ACK was received */
+ __RQ_NET_DONE,
+
+ /* whether or not we know (C) or pretend (B,A) that the write
+ * was successfully written on the peer.
+ */
+ __RQ_NET_OK,
+
+ /* peer called drbd_set_in_sync() for this write */
+ __RQ_NET_SIS,
+
+ /* keep this last, its for the RQ_NET_MASK */
+ __RQ_NET_MAX,
+
+ /* Set when this is a write, clear for a read */
+ __RQ_WRITE,
+
+ /* Should call drbd_al_complete_io() for this request... */
+ __RQ_IN_ACT_LOG,
+
+ /* The peer has sent a retry ACK */
+ __RQ_POSTPONED,
+
+ /* would have been completed,
+ * but was not, because of drbd_suspended() */
+ __RQ_COMPLETION_SUSP,
+
+ /* We expect a receive ACK (wire proto B) */
+ __RQ_EXP_RECEIVE_ACK,
+
+ /* We expect a write ACK (wite proto C) */
+ __RQ_EXP_WRITE_ACK,
+
+ /* waiting for a barrier ack, did an extra kref_get */
+ __RQ_EXP_BARR_ACK,
+};
+
+#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
+#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
+#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK)
+#define RQ_LOCAL_ABORTED (1UL << __RQ_LOCAL_ABORTED)
+
+#define RQ_LOCAL_MASK ((RQ_LOCAL_ABORTED << 1)-1)
+
+#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING)
+#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED)
+#define RQ_NET_SENT (1UL << __RQ_NET_SENT)
+#define RQ_NET_DONE (1UL << __RQ_NET_DONE)
+#define RQ_NET_OK (1UL << __RQ_NET_OK)
+#define RQ_NET_SIS (1UL << __RQ_NET_SIS)
+
+/* 0x1f8 */
+#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
+
+#define RQ_WRITE (1UL << __RQ_WRITE)
+#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
+#define RQ_POSTPONED (1UL << __RQ_POSTPONED)
+#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP)
+#define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK)
+#define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK)
+#define RQ_EXP_BARR_ACK (1UL << __RQ_EXP_BARR_ACK)
+
+/* For waking up the frozen transfer log mod_req() has to return if the request
+ should be counted in the epoch object*/
+#define MR_WRITE 1
+#define MR_READ 2
+
+static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
+{
+ struct bio *bio;
+ bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
+
+ req->private_bio = bio;
+
+ bio->bi_private = req;
+ bio->bi_end_io = drbd_request_endio;
+ bio->bi_next = NULL;
+}
+
+/* Short lived temporary struct on the stack.
+ * We could squirrel the error to be returned into
+ * bio->bi_iter.bi_size, or similar. But that would be too ugly. */
+struct bio_and_error {
+ struct bio *bio;
+ int error;
+};
+
+extern void start_new_tl_epoch(struct drbd_connection *connection);
+extern void drbd_req_destroy(struct kref *kref);
+extern void _req_may_be_done(struct drbd_request *req,
+ struct bio_and_error *m);
+extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
+ struct bio_and_error *m);
+extern void complete_master_bio(struct drbd_device *device,
+ struct bio_and_error *m);
+extern void request_timer_fn(unsigned long data);
+extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
+extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
+extern void tl_abort_disk_io(struct drbd_device *device);
+
+/* this is in drbd_main.c */
+extern void drbd_restart_request(struct drbd_request *req);
+
+/* use this if you don't want to deal with calling complete_master_bio()
+ * outside the spinlock, e.g. when walking some list on cleanup. */
+static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
+{
+ struct drbd_device *device = req->device;
+ struct bio_and_error m;
+ int rv;
+
+ /* __req_mod possibly frees req, do not touch req after that! */
+ rv = __req_mod(req, what, &m);
+ if (m.bio)
+ complete_master_bio(device, &m);
+
+ return rv;
+}
+
+/* completion of master bio is outside of our spinlock.
+ * We still may or may not be inside some irqs disabled section
+ * of the lower level driver completion callback, so we need to
+ * spin_lock_irqsave here. */
+static inline int req_mod(struct drbd_request *req,
+ enum drbd_req_event what)
+{
+ unsigned long flags;
+ struct drbd_device *device = req->device;
+ struct bio_and_error m;
+ int rv;
+
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ rv = __req_mod(req, what, &m);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ if (m.bio)
+ complete_master_bio(device, &m);
+
+ return rv;
+}
+
+static inline bool drbd_should_do_remote(union drbd_dev_state s)
+{
+ return s.pdsk == D_UP_TO_DATE ||
+ (s.pdsk >= D_INCONSISTENT &&
+ s.conn >= C_WF_BITMAP_T &&
+ s.conn < C_AHEAD);
+ /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
+ That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
+ states. */
+}
+static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s)
+{
+ return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
+ /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
+ since we enter state C_AHEAD only if proto >= 96 */
+}
+
+#endif
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
new file mode 100644
index 000000000..2d7dd269b
--- /dev/null
+++ b/drivers/block/drbd/drbd_state.c
@@ -0,0 +1,1918 @@
+/*
+ drbd_state.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
+ from Logicworks, Inc. for making SDP replication support possible.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/drbd_limits.h>
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h"
+
+struct after_state_chg_work {
+ struct drbd_work w;
+ struct drbd_device *device;
+ union drbd_state os;
+ union drbd_state ns;
+ enum chg_state_flags flags;
+ struct completion *done;
+};
+
+enum sanitize_state_warnings {
+ NO_WARNING,
+ ABORTED_ONLINE_VERIFY,
+ ABORTED_RESYNC,
+ CONNECTION_LOST_NEGOTIATING,
+ IMPLICITLY_UPGRADED_DISK,
+ IMPLICITLY_UPGRADED_PDSK,
+};
+
+static int w_after_state_ch(struct drbd_work *w, int unused);
+static void after_state_ch(struct drbd_device *device, union drbd_state os,
+ union drbd_state ns, enum chg_state_flags flags);
+static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state);
+static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *);
+static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns);
+static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os,
+ union drbd_state ns, enum sanitize_state_warnings *warn);
+
+static inline bool is_susp(union drbd_state s)
+{
+ return s.susp || s.susp_nod || s.susp_fen;
+}
+
+bool conn_all_vols_unconf(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ bool rv = true;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ if (device->state.disk != D_DISKLESS ||
+ device->state.conn != C_STANDALONE ||
+ device->state.role != R_SECONDARY) {
+ rv = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+/* Unfortunately the states where not correctly ordered, when
+ they where defined. therefore can not use max_t() here. */
+static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2)
+{
+ if (role1 == R_PRIMARY || role2 == R_PRIMARY)
+ return R_PRIMARY;
+ if (role1 == R_SECONDARY || role2 == R_SECONDARY)
+ return R_SECONDARY;
+ return R_UNKNOWN;
+}
+static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2)
+{
+ if (role1 == R_UNKNOWN || role2 == R_UNKNOWN)
+ return R_UNKNOWN;
+ if (role1 == R_SECONDARY || role2 == R_SECONDARY)
+ return R_SECONDARY;
+ return R_PRIMARY;
+}
+
+enum drbd_role conn_highest_role(struct drbd_connection *connection)
+{
+ enum drbd_role role = R_UNKNOWN;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ role = max_role(role, device->state.role);
+ }
+ rcu_read_unlock();
+
+ return role;
+}
+
+enum drbd_role conn_highest_peer(struct drbd_connection *connection)
+{
+ enum drbd_role peer = R_UNKNOWN;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ peer = max_role(peer, device->state.peer);
+ }
+ rcu_read_unlock();
+
+ return peer;
+}
+
+enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection)
+{
+ enum drbd_disk_state disk_state = D_DISKLESS;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ disk_state = max_t(enum drbd_disk_state, disk_state, device->state.disk);
+ }
+ rcu_read_unlock();
+
+ return disk_state;
+}
+
+enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection)
+{
+ enum drbd_disk_state disk_state = D_MASK;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ disk_state = min_t(enum drbd_disk_state, disk_state, device->state.disk);
+ }
+ rcu_read_unlock();
+
+ return disk_state;
+}
+
+enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection)
+{
+ enum drbd_disk_state disk_state = D_DISKLESS;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ disk_state = max_t(enum drbd_disk_state, disk_state, device->state.pdsk);
+ }
+ rcu_read_unlock();
+
+ return disk_state;
+}
+
+enum drbd_conns conn_lowest_conn(struct drbd_connection *connection)
+{
+ enum drbd_conns conn = C_MASK;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ conn = min_t(enum drbd_conns, conn, device->state.conn);
+ }
+ rcu_read_unlock();
+
+ return conn;
+}
+
+static bool no_peer_wf_report_params(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ int vnr;
+ bool rv = true;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ if (peer_device->device->state.conn == C_WF_REPORT_PARAMS) {
+ rv = false;
+ break;
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+static void wake_up_all_devices(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ wake_up(&peer_device->device->state_wait);
+ rcu_read_unlock();
+
+}
+
+
+/**
+ * cl_wide_st_chg() - true if the state change is a cluster wide one
+ * @device: DRBD device.
+ * @os: old (current) state.
+ * @ns: new (wanted) state.
+ */
+static int cl_wide_st_chg(struct drbd_device *device,
+ union drbd_state os, union drbd_state ns)
+{
+ return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
+ ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
+ (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+ (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
+ (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
+ (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
+ (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S) ||
+ (os.conn == C_CONNECTED && ns.conn == C_WF_REPORT_PARAMS);
+}
+
+static union drbd_state
+apply_mask_val(union drbd_state os, union drbd_state mask, union drbd_state val)
+{
+ union drbd_state ns;
+ ns.i = (os.i & ~mask.i) | val.i;
+ return ns;
+}
+
+enum drbd_state_rv
+drbd_change_state(struct drbd_device *device, enum chg_state_flags f,
+ union drbd_state mask, union drbd_state val)
+{
+ unsigned long flags;
+ union drbd_state ns;
+ enum drbd_state_rv rv;
+
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ ns = apply_mask_val(drbd_read_state(device), mask, val);
+ rv = _drbd_set_state(device, ns, f, NULL);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ return rv;
+}
+
+/**
+ * drbd_force_state() - Impose a change which happens outside our control on our state
+ * @device: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ */
+void drbd_force_state(struct drbd_device *device,
+ union drbd_state mask, union drbd_state val)
+{
+ drbd_change_state(device, CS_HARD, mask, val);
+}
+
+static enum drbd_state_rv
+_req_st_cond(struct drbd_device *device, union drbd_state mask,
+ union drbd_state val)
+{
+ union drbd_state os, ns;
+ unsigned long flags;
+ enum drbd_state_rv rv;
+
+ if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &device->flags))
+ return SS_CW_SUCCESS;
+
+ if (test_and_clear_bit(CL_ST_CHG_FAIL, &device->flags))
+ return SS_CW_FAILED_BY_PEER;
+
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ os = drbd_read_state(device);
+ ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
+ rv = is_valid_transition(os, ns);
+ if (rv >= SS_SUCCESS)
+ rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
+
+ if (!cl_wide_st_chg(device, os, ns))
+ rv = SS_CW_NO_NEED;
+ if (rv == SS_UNKNOWN_ERROR) {
+ rv = is_valid_state(device, ns);
+ if (rv >= SS_SUCCESS) {
+ rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+ if (rv >= SS_SUCCESS)
+ rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
+ }
+ }
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ return rv;
+}
+
+/**
+ * drbd_req_state() - Perform an eventually cluster wide state change
+ * @device: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ * @f: flags
+ *
+ * Should not be called directly, use drbd_request_state() or
+ * _drbd_request_state().
+ */
+static enum drbd_state_rv
+drbd_req_state(struct drbd_device *device, union drbd_state mask,
+ union drbd_state val, enum chg_state_flags f)
+{
+ struct completion done;
+ unsigned long flags;
+ union drbd_state os, ns;
+ enum drbd_state_rv rv;
+
+ init_completion(&done);
+
+ if (f & CS_SERIALIZE)
+ mutex_lock(device->state_mutex);
+
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ os = drbd_read_state(device);
+ ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
+ rv = is_valid_transition(os, ns);
+ if (rv < SS_SUCCESS) {
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+ goto abort;
+ }
+
+ if (cl_wide_st_chg(device, os, ns)) {
+ rv = is_valid_state(device, ns);
+ if (rv == SS_SUCCESS)
+ rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ if (rv < SS_SUCCESS) {
+ if (f & CS_VERBOSE)
+ print_st_err(device, os, ns, rv);
+ goto abort;
+ }
+
+ if (drbd_send_state_req(first_peer_device(device), mask, val)) {
+ rv = SS_CW_FAILED_BY_PEER;
+ if (f & CS_VERBOSE)
+ print_st_err(device, os, ns, rv);
+ goto abort;
+ }
+
+ wait_event(device->state_wait,
+ (rv = _req_st_cond(device, mask, val)));
+
+ if (rv < SS_SUCCESS) {
+ if (f & CS_VERBOSE)
+ print_st_err(device, os, ns, rv);
+ goto abort;
+ }
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ ns = apply_mask_val(drbd_read_state(device), mask, val);
+ rv = _drbd_set_state(device, ns, f, &done);
+ } else {
+ rv = _drbd_set_state(device, ns, f, &done);
+ }
+
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
+ D_ASSERT(device, current != first_peer_device(device)->connection->worker.task);
+ wait_for_completion(&done);
+ }
+
+abort:
+ if (f & CS_SERIALIZE)
+ mutex_unlock(device->state_mutex);
+
+ return rv;
+}
+
+/**
+ * _drbd_request_state() - Request a state change (with flags)
+ * @device: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ * @f: flags
+ *
+ * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
+ * flag, or when logging of failed state change requests is not desired.
+ */
+enum drbd_state_rv
+_drbd_request_state(struct drbd_device *device, union drbd_state mask,
+ union drbd_state val, enum chg_state_flags f)
+{
+ enum drbd_state_rv rv;
+
+ wait_event(device->state_wait,
+ (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE);
+
+ return rv;
+}
+
+enum drbd_state_rv
+_drbd_request_state_holding_state_mutex(struct drbd_device *device, union drbd_state mask,
+ union drbd_state val, enum chg_state_flags f)
+{
+ enum drbd_state_rv rv;
+
+ BUG_ON(f & CS_SERIALIZE);
+
+ wait_event_cmd(device->state_wait,
+ (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE,
+ mutex_unlock(device->state_mutex),
+ mutex_lock(device->state_mutex));
+
+ return rv;
+}
+
+static void print_st(struct drbd_device *device, const char *name, union drbd_state ns)
+{
+ drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n",
+ name,
+ drbd_conn_str(ns.conn),
+ drbd_role_str(ns.role),
+ drbd_role_str(ns.peer),
+ drbd_disk_str(ns.disk),
+ drbd_disk_str(ns.pdsk),
+ is_susp(ns) ? 's' : 'r',
+ ns.aftr_isp ? 'a' : '-',
+ ns.peer_isp ? 'p' : '-',
+ ns.user_isp ? 'u' : '-',
+ ns.susp_fen ? 'F' : '-',
+ ns.susp_nod ? 'N' : '-'
+ );
+}
+
+void print_st_err(struct drbd_device *device, union drbd_state os,
+ union drbd_state ns, enum drbd_state_rv err)
+{
+ if (err == SS_IN_TRANSIENT_STATE)
+ return;
+ drbd_err(device, "State change failed: %s\n", drbd_set_st_err_str(err));
+ print_st(device, " state", os);
+ print_st(device, "wanted", ns);
+}
+
+static long print_state_change(char *pb, union drbd_state os, union drbd_state ns,
+ enum chg_state_flags flags)
+{
+ char *pbp;
+ pbp = pb;
+ *pbp = 0;
+
+ if (ns.role != os.role && flags & CS_DC_ROLE)
+ pbp += sprintf(pbp, "role( %s -> %s ) ",
+ drbd_role_str(os.role),
+ drbd_role_str(ns.role));
+ if (ns.peer != os.peer && flags & CS_DC_PEER)
+ pbp += sprintf(pbp, "peer( %s -> %s ) ",
+ drbd_role_str(os.peer),
+ drbd_role_str(ns.peer));
+ if (ns.conn != os.conn && flags & CS_DC_CONN)
+ pbp += sprintf(pbp, "conn( %s -> %s ) ",
+ drbd_conn_str(os.conn),
+ drbd_conn_str(ns.conn));
+ if (ns.disk != os.disk && flags & CS_DC_DISK)
+ pbp += sprintf(pbp, "disk( %s -> %s ) ",
+ drbd_disk_str(os.disk),
+ drbd_disk_str(ns.disk));
+ if (ns.pdsk != os.pdsk && flags & CS_DC_PDSK)
+ pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
+ drbd_disk_str(os.pdsk),
+ drbd_disk_str(ns.pdsk));
+
+ return pbp - pb;
+}
+
+static void drbd_pr_state_change(struct drbd_device *device, union drbd_state os, union drbd_state ns,
+ enum chg_state_flags flags)
+{
+ char pb[300];
+ char *pbp = pb;
+
+ pbp += print_state_change(pbp, os, ns, flags ^ CS_DC_MASK);
+
+ if (ns.aftr_isp != os.aftr_isp)
+ pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
+ os.aftr_isp,
+ ns.aftr_isp);
+ if (ns.peer_isp != os.peer_isp)
+ pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
+ os.peer_isp,
+ ns.peer_isp);
+ if (ns.user_isp != os.user_isp)
+ pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
+ os.user_isp,
+ ns.user_isp);
+
+ if (pbp != pb)
+ drbd_info(device, "%s\n", pb);
+}
+
+static void conn_pr_state_change(struct drbd_connection *connection, union drbd_state os, union drbd_state ns,
+ enum chg_state_flags flags)
+{
+ char pb[300];
+ char *pbp = pb;
+
+ pbp += print_state_change(pbp, os, ns, flags);
+
+ if (is_susp(ns) != is_susp(os) && flags & CS_DC_SUSP)
+ pbp += sprintf(pbp, "susp( %d -> %d ) ",
+ is_susp(os),
+ is_susp(ns));
+
+ if (pbp != pb)
+ drbd_info(connection, "%s\n", pb);
+}
+
+
+/**
+ * is_valid_state() - Returns an SS_ error code if ns is not valid
+ * @device: DRBD device.
+ * @ns: State to consider.
+ */
+static enum drbd_state_rv
+is_valid_state(struct drbd_device *device, union drbd_state ns)
+{
+ /* See drbd_state_sw_errors in drbd_strings.c */
+
+ enum drbd_fencing_p fp;
+ enum drbd_state_rv rv = SS_SUCCESS;
+ struct net_conf *nc;
+
+ rcu_read_lock();
+ fp = FP_DONT_CARE;
+ if (get_ldev(device)) {
+ fp = rcu_dereference(device->ldev->disk_conf)->fencing;
+ put_ldev(device);
+ }
+
+ nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+ if (nc) {
+ if (!nc->two_primaries && ns.role == R_PRIMARY) {
+ if (ns.peer == R_PRIMARY)
+ rv = SS_TWO_PRIMARIES;
+ else if (conn_highest_peer(first_peer_device(device)->connection) == R_PRIMARY)
+ rv = SS_O_VOL_PEER_PRI;
+ }
+ }
+
+ if (rv <= 0)
+ /* already found a reason to abort */;
+ else if (ns.role == R_SECONDARY && device->open_cnt)
+ rv = SS_DEVICE_IN_USE;
+
+ else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
+ else if (fp >= FP_RESOURCE &&
+ ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
+ rv = SS_PRIMARY_NOP;
+
+ else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
+ else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
+ rv = SS_NO_LOCAL_DISK;
+
+ else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
+ rv = SS_NO_REMOTE_DISK;
+
+ else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
+ else if ((ns.conn == C_CONNECTED ||
+ ns.conn == C_WF_BITMAP_S ||
+ ns.conn == C_SYNC_SOURCE ||
+ ns.conn == C_PAUSED_SYNC_S) &&
+ ns.disk == D_OUTDATED)
+ rv = SS_CONNECTED_OUTDATES;
+
+ else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+ (nc->verify_alg[0] == 0))
+ rv = SS_NO_VERIFY_ALG;
+
+ else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+ first_peer_device(device)->connection->agreed_pro_version < 88)
+ rv = SS_NOT_SUPPORTED;
+
+ else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
+ else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
+ ns.pdsk == D_UNKNOWN)
+ rv = SS_NEED_CONNECTION;
+
+ else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
+ rv = SS_CONNECTED_OUTDATES;
+
+ rcu_read_unlock();
+
+ return rv;
+}
+
+/**
+ * is_valid_soft_transition() - Returns an SS_ error code if the state transition is not possible
+ * This function limits state transitions that may be declined by DRBD. I.e.
+ * user requests (aka soft transitions).
+ * @device: DRBD device.
+ * @ns: new state.
+ * @os: old state.
+ */
+static enum drbd_state_rv
+is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_connection *connection)
+{
+ enum drbd_state_rv rv = SS_SUCCESS;
+
+ if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
+ os.conn > C_CONNECTED)
+ rv = SS_RESYNC_RUNNING;
+
+ if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
+ rv = SS_ALREADY_STANDALONE;
+
+ if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
+ rv = SS_IS_DISKLESS;
+
+ if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
+ rv = SS_NO_NET_CONFIG;
+
+ if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
+ rv = SS_LOWER_THAN_OUTDATED;
+
+ if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
+ rv = SS_IN_TRANSIENT_STATE;
+
+ /* While establishing a connection only allow cstate to change.
+ Delay/refuse role changes, detach attach etc... (they do not touch cstate) */
+ if (test_bit(STATE_SENT, &connection->flags) &&
+ !((ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION) ||
+ (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS)))
+ rv = SS_IN_TRANSIENT_STATE;
+
+ if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
+ rv = SS_NEED_CONNECTION;
+
+ if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+ ns.conn != os.conn && os.conn > C_CONNECTED)
+ rv = SS_RESYNC_RUNNING;
+
+ if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
+ os.conn < C_CONNECTED)
+ rv = SS_NEED_CONNECTION;
+
+ if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
+ && os.conn < C_WF_REPORT_PARAMS)
+ rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
+
+ if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED &&
+ os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)
+ rv = SS_OUTDATE_WO_CONN;
+
+ return rv;
+}
+
+static enum drbd_state_rv
+is_valid_conn_transition(enum drbd_conns oc, enum drbd_conns nc)
+{
+ /* no change -> nothing to do, at least for the connection part */
+ if (oc == nc)
+ return SS_NOTHING_TO_DO;
+
+ /* disconnect of an unconfigured connection does not make sense */
+ if (oc == C_STANDALONE && nc == C_DISCONNECTING)
+ return SS_ALREADY_STANDALONE;
+
+ /* from C_STANDALONE, we start with C_UNCONNECTED */
+ if (oc == C_STANDALONE && nc != C_UNCONNECTED)
+ return SS_NEED_CONNECTION;
+
+ /* When establishing a connection we need to go through WF_REPORT_PARAMS!
+ Necessary to do the right thing upon invalidate-remote on a disconnected resource */
+ if (oc < C_WF_REPORT_PARAMS && nc >= C_CONNECTED)
+ return SS_NEED_CONNECTION;
+
+ /* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow. */
+ if (oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING)
+ return SS_IN_TRANSIENT_STATE;
+
+ /* After C_DISCONNECTING only C_STANDALONE may follow */
+ if (oc == C_DISCONNECTING && nc != C_STANDALONE)
+ return SS_IN_TRANSIENT_STATE;
+
+ return SS_SUCCESS;
+}
+
+
+/**
+ * is_valid_transition() - Returns an SS_ error code if the state transition is not possible
+ * This limits hard state transitions. Hard state transitions are facts there are
+ * imposed on DRBD by the environment. E.g. disk broke or network broke down.
+ * But those hard state transitions are still not allowed to do everything.
+ * @ns: new state.
+ * @os: old state.
+ */
+static enum drbd_state_rv
+is_valid_transition(union drbd_state os, union drbd_state ns)
+{
+ enum drbd_state_rv rv;
+
+ rv = is_valid_conn_transition(os.conn, ns.conn);
+
+ /* we cannot fail (again) if we already detached */
+ if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
+ rv = SS_IS_DISKLESS;
+
+ return rv;
+}
+
+static void print_sanitize_warnings(struct drbd_device *device, enum sanitize_state_warnings warn)
+{
+ static const char *msg_table[] = {
+ [NO_WARNING] = "",
+ [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
+ [ABORTED_RESYNC] = "Resync aborted.",
+ [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
+ [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
+ [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
+ };
+
+ if (warn != NO_WARNING)
+ drbd_warn(device, "%s\n", msg_table[warn]);
+}
+
+/**
+ * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
+ * @device: DRBD device.
+ * @os: old state.
+ * @ns: new state.
+ * @warn_sync_abort:
+ *
+ * When we loose connection, we have to set the state of the peers disk (pdsk)
+ * to D_UNKNOWN. This rule and many more along those lines are in this function.
+ */
+static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os,
+ union drbd_state ns, enum sanitize_state_warnings *warn)
+{
+ enum drbd_fencing_p fp;
+ enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
+
+ if (warn)
+ *warn = NO_WARNING;
+
+ fp = FP_DONT_CARE;
+ if (get_ldev(device)) {
+ rcu_read_lock();
+ fp = rcu_dereference(device->ldev->disk_conf)->fencing;
+ rcu_read_unlock();
+ put_ldev(device);
+ }
+
+ /* Implications from connection to peer and peer_isp */
+ if (ns.conn < C_CONNECTED) {
+ ns.peer_isp = 0;
+ ns.peer = R_UNKNOWN;
+ if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
+ ns.pdsk = D_UNKNOWN;
+ }
+
+ /* Clear the aftr_isp when becoming unconfigured */
+ if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
+ ns.aftr_isp = 0;
+
+ /* An implication of the disk states onto the connection state */
+ /* Abort resync if a disk fails/detaches */
+ if (ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
+ if (warn)
+ *warn = ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T ?
+ ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
+ ns.conn = C_CONNECTED;
+ }
+
+ /* Connection breaks down before we finished "Negotiating" */
+ if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
+ get_ldev_if_state(device, D_NEGOTIATING)) {
+ if (device->ed_uuid == device->ldev->md.uuid[UI_CURRENT]) {
+ ns.disk = device->new_state_tmp.disk;
+ ns.pdsk = device->new_state_tmp.pdsk;
+ } else {
+ if (warn)
+ *warn = CONNECTION_LOST_NEGOTIATING;
+ ns.disk = D_DISKLESS;
+ ns.pdsk = D_UNKNOWN;
+ }
+ put_ldev(device);
+ }
+
+ /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
+ if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
+ if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
+ ns.disk = D_UP_TO_DATE;
+ if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
+ ns.pdsk = D_UP_TO_DATE;
+ }
+
+ /* Implications of the connection stat on the disk states */
+ disk_min = D_DISKLESS;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_INCONSISTENT;
+ pdsk_max = D_UNKNOWN;
+ switch ((enum drbd_conns)ns.conn) {
+ case C_WF_BITMAP_T:
+ case C_PAUSED_SYNC_T:
+ case C_STARTING_SYNC_T:
+ case C_WF_SYNC_UUID:
+ case C_BEHIND:
+ disk_min = D_INCONSISTENT;
+ disk_max = D_OUTDATED;
+ pdsk_min = D_UP_TO_DATE;
+ pdsk_max = D_UP_TO_DATE;
+ break;
+ case C_VERIFY_S:
+ case C_VERIFY_T:
+ disk_min = D_UP_TO_DATE;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_UP_TO_DATE;
+ pdsk_max = D_UP_TO_DATE;
+ break;
+ case C_CONNECTED:
+ disk_min = D_DISKLESS;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_DISKLESS;
+ pdsk_max = D_UP_TO_DATE;
+ break;
+ case C_WF_BITMAP_S:
+ case C_PAUSED_SYNC_S:
+ case C_STARTING_SYNC_S:
+ case C_AHEAD:
+ disk_min = D_UP_TO_DATE;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_INCONSISTENT;
+ pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
+ break;
+ case C_SYNC_TARGET:
+ disk_min = D_INCONSISTENT;
+ disk_max = D_INCONSISTENT;
+ pdsk_min = D_UP_TO_DATE;
+ pdsk_max = D_UP_TO_DATE;
+ break;
+ case C_SYNC_SOURCE:
+ disk_min = D_UP_TO_DATE;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_INCONSISTENT;
+ pdsk_max = D_INCONSISTENT;
+ break;
+ case C_STANDALONE:
+ case C_DISCONNECTING:
+ case C_UNCONNECTED:
+ case C_TIMEOUT:
+ case C_BROKEN_PIPE:
+ case C_NETWORK_FAILURE:
+ case C_PROTOCOL_ERROR:
+ case C_TEAR_DOWN:
+ case C_WF_CONNECTION:
+ case C_WF_REPORT_PARAMS:
+ case C_MASK:
+ break;
+ }
+ if (ns.disk > disk_max)
+ ns.disk = disk_max;
+
+ if (ns.disk < disk_min) {
+ if (warn)
+ *warn = IMPLICITLY_UPGRADED_DISK;
+ ns.disk = disk_min;
+ }
+ if (ns.pdsk > pdsk_max)
+ ns.pdsk = pdsk_max;
+
+ if (ns.pdsk < pdsk_min) {
+ if (warn)
+ *warn = IMPLICITLY_UPGRADED_PDSK;
+ ns.pdsk = pdsk_min;
+ }
+
+ if (fp == FP_STONITH &&
+ (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
+ !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
+ ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
+
+ if (device->resource->res_opts.on_no_data == OND_SUSPEND_IO &&
+ (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
+ !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
+ ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
+
+ if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
+ if (ns.conn == C_SYNC_SOURCE)
+ ns.conn = C_PAUSED_SYNC_S;
+ if (ns.conn == C_SYNC_TARGET)
+ ns.conn = C_PAUSED_SYNC_T;
+ } else {
+ if (ns.conn == C_PAUSED_SYNC_S)
+ ns.conn = C_SYNC_SOURCE;
+ if (ns.conn == C_PAUSED_SYNC_T)
+ ns.conn = C_SYNC_TARGET;
+ }
+
+ return ns;
+}
+
+void drbd_resume_al(struct drbd_device *device)
+{
+ if (test_and_clear_bit(AL_SUSPENDED, &device->flags))
+ drbd_info(device, "Resumed AL updates\n");
+}
+
+/* helper for __drbd_set_state */
+static void set_ov_position(struct drbd_device *device, enum drbd_conns cs)
+{
+ if (first_peer_device(device)->connection->agreed_pro_version < 90)
+ device->ov_start_sector = 0;
+ device->rs_total = drbd_bm_bits(device);
+ device->ov_position = 0;
+ if (cs == C_VERIFY_T) {
+ /* starting online verify from an arbitrary position
+ * does not fit well into the existing protocol.
+ * on C_VERIFY_T, we initialize ov_left and friends
+ * implicitly in receive_DataRequest once the
+ * first P_OV_REQUEST is received */
+ device->ov_start_sector = ~(sector_t)0;
+ } else {
+ unsigned long bit = BM_SECT_TO_BIT(device->ov_start_sector);
+ if (bit >= device->rs_total) {
+ device->ov_start_sector =
+ BM_BIT_TO_SECT(device->rs_total - 1);
+ device->rs_total = 1;
+ } else
+ device->rs_total -= bit;
+ device->ov_position = device->ov_start_sector;
+ }
+ device->ov_left = device->rs_total;
+}
+
+/**
+ * __drbd_set_state() - Set a new DRBD state
+ * @device: DRBD device.
+ * @ns: new state.
+ * @flags: Flags
+ * @done: Optional completion, that will get completed after the after_state_ch() finished
+ *
+ * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
+ */
+enum drbd_state_rv
+__drbd_set_state(struct drbd_device *device, union drbd_state ns,
+ enum chg_state_flags flags, struct completion *done)
+{
+ struct drbd_peer_device *peer_device = first_peer_device(device);
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ union drbd_state os;
+ enum drbd_state_rv rv = SS_SUCCESS;
+ enum sanitize_state_warnings ssw;
+ struct after_state_chg_work *ascw;
+
+ os = drbd_read_state(device);
+
+ ns = sanitize_state(device, os, ns, &ssw);
+ if (ns.i == os.i)
+ return SS_NOTHING_TO_DO;
+
+ rv = is_valid_transition(os, ns);
+ if (rv < SS_SUCCESS)
+ return rv;
+
+ if (!(flags & CS_HARD)) {
+ /* pre-state-change checks ; only look at ns */
+ /* See drbd_state_sw_errors in drbd_strings.c */
+
+ rv = is_valid_state(device, ns);
+ if (rv < SS_SUCCESS) {
+ /* If the old state was illegal as well, then let
+ this happen...*/
+
+ if (is_valid_state(device, os) == rv)
+ rv = is_valid_soft_transition(os, ns, connection);
+ } else
+ rv = is_valid_soft_transition(os, ns, connection);
+ }
+
+ if (rv < SS_SUCCESS) {
+ if (flags & CS_VERBOSE)
+ print_st_err(device, os, ns, rv);
+ return rv;
+ }
+
+ print_sanitize_warnings(device, ssw);
+
+ drbd_pr_state_change(device, os, ns, flags);
+
+ /* Display changes to the susp* flags that where caused by the call to
+ sanitize_state(). Only display it here if we where not called from
+ _conn_request_state() */
+ if (!(flags & CS_DC_SUSP))
+ conn_pr_state_change(connection, os, ns,
+ (flags & ~CS_DC_MASK) | CS_DC_SUSP);
+
+ /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
+ * on the ldev here, to be sure the transition -> D_DISKLESS resp.
+ * drbd_ldev_destroy() won't happen before our corresponding
+ * after_state_ch works run, where we put_ldev again. */
+ if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
+ (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
+ atomic_inc(&device->local_cnt);
+
+ if (!is_sync_state(os.conn) && is_sync_state(ns.conn))
+ clear_bit(RS_DONE, &device->flags);
+
+ /* changes to local_cnt and device flags should be visible before
+ * changes to state, which again should be visible before anything else
+ * depending on that change happens. */
+ smp_wmb();
+ device->state.i = ns.i;
+ device->resource->susp = ns.susp;
+ device->resource->susp_nod = ns.susp_nod;
+ device->resource->susp_fen = ns.susp_fen;
+ smp_wmb();
+
+ /* put replicated vs not-replicated requests in seperate epochs */
+ if (drbd_should_do_remote((union drbd_dev_state)os.i) !=
+ drbd_should_do_remote((union drbd_dev_state)ns.i))
+ start_new_tl_epoch(connection);
+
+ if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
+ drbd_print_uuids(device, "attached to UUIDs");
+
+ /* Wake up role changes, that were delayed because of connection establishing */
+ if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS &&
+ no_peer_wf_report_params(connection)) {
+ clear_bit(STATE_SENT, &connection->flags);
+ wake_up_all_devices(connection);
+ }
+
+ wake_up(&device->misc_wait);
+ wake_up(&device->state_wait);
+ wake_up(&connection->ping_wait);
+
+ /* Aborted verify run, or we reached the stop sector.
+ * Log the last position, unless end-of-device. */
+ if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
+ ns.conn <= C_CONNECTED) {
+ device->ov_start_sector =
+ BM_BIT_TO_SECT(drbd_bm_bits(device) - device->ov_left);
+ if (device->ov_left)
+ drbd_info(device, "Online Verify reached sector %llu\n",
+ (unsigned long long)device->ov_start_sector);
+ }
+
+ if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
+ (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
+ drbd_info(device, "Syncer continues.\n");
+ device->rs_paused += (long)jiffies
+ -(long)device->rs_mark_time[device->rs_last_mark];
+ if (ns.conn == C_SYNC_TARGET)
+ mod_timer(&device->resync_timer, jiffies);
+ }
+
+ if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
+ (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
+ drbd_info(device, "Resync suspended\n");
+ device->rs_mark_time[device->rs_last_mark] = jiffies;
+ }
+
+ if (os.conn == C_CONNECTED &&
+ (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
+ unsigned long now = jiffies;
+ int i;
+
+ set_ov_position(device, ns.conn);
+ device->rs_start = now;
+ device->rs_last_sect_ev = 0;
+ device->ov_last_oos_size = 0;
+ device->ov_last_oos_start = 0;
+
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ device->rs_mark_left[i] = device->ov_left;
+ device->rs_mark_time[i] = now;
+ }
+
+ drbd_rs_controller_reset(device);
+
+ if (ns.conn == C_VERIFY_S) {
+ drbd_info(device, "Starting Online Verify from sector %llu\n",
+ (unsigned long long)device->ov_position);
+ mod_timer(&device->resync_timer, jiffies);
+ }
+ }
+
+ if (get_ldev(device)) {
+ u32 mdf = device->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
+ MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
+ MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
+
+ mdf &= ~MDF_AL_CLEAN;
+ if (test_bit(CRASHED_PRIMARY, &device->flags))
+ mdf |= MDF_CRASHED_PRIMARY;
+ if (device->state.role == R_PRIMARY ||
+ (device->state.pdsk < D_INCONSISTENT && device->state.peer == R_PRIMARY))
+ mdf |= MDF_PRIMARY_IND;
+ if (device->state.conn > C_WF_REPORT_PARAMS)
+ mdf |= MDF_CONNECTED_IND;
+ if (device->state.disk > D_INCONSISTENT)
+ mdf |= MDF_CONSISTENT;
+ if (device->state.disk > D_OUTDATED)
+ mdf |= MDF_WAS_UP_TO_DATE;
+ if (device->state.pdsk <= D_OUTDATED && device->state.pdsk >= D_INCONSISTENT)
+ mdf |= MDF_PEER_OUT_DATED;
+ if (mdf != device->ldev->md.flags) {
+ device->ldev->md.flags = mdf;
+ drbd_md_mark_dirty(device);
+ }
+ if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
+ drbd_set_ed_uuid(device, device->ldev->md.uuid[UI_CURRENT]);
+ put_ldev(device);
+ }
+
+ /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
+ if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
+ os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
+ set_bit(CONSIDER_RESYNC, &device->flags);
+
+ /* Receiver should clean up itself */
+ if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
+ drbd_thread_stop_nowait(&connection->receiver);
+
+ /* Now the receiver finished cleaning up itself, it should die */
+ if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
+ drbd_thread_stop_nowait(&connection->receiver);
+
+ /* Upon network failure, we need to restart the receiver. */
+ if (os.conn > C_WF_CONNECTION &&
+ ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
+ drbd_thread_restart_nowait(&connection->receiver);
+
+ /* Resume AL writing if we get a connection */
+ if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
+ drbd_resume_al(device);
+ connection->connect_cnt++;
+ }
+
+ /* remember last attach time so request_timer_fn() won't
+ * kill newly established sessions while we are still trying to thaw
+ * previously frozen IO */
+ if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+ ns.disk > D_NEGOTIATING)
+ device->last_reattach_jif = jiffies;
+
+ ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
+ if (ascw) {
+ ascw->os = os;
+ ascw->ns = ns;
+ ascw->flags = flags;
+ ascw->w.cb = w_after_state_ch;
+ ascw->device = device;
+ ascw->done = done;
+ drbd_queue_work(&connection->sender_work,
+ &ascw->w);
+ } else {
+ drbd_err(device, "Could not kmalloc an ascw\n");
+ }
+
+ return rv;
+}
+
+static int w_after_state_ch(struct drbd_work *w, int unused)
+{
+ struct after_state_chg_work *ascw =
+ container_of(w, struct after_state_chg_work, w);
+ struct drbd_device *device = ascw->device;
+
+ after_state_ch(device, ascw->os, ascw->ns, ascw->flags);
+ if (ascw->flags & CS_WAIT_COMPLETE)
+ complete(ascw->done);
+ kfree(ascw);
+
+ return 0;
+}
+
+static void abw_start_sync(struct drbd_device *device, int rv)
+{
+ if (rv) {
+ drbd_err(device, "Writing the bitmap failed not starting resync.\n");
+ _drbd_request_state(device, NS(conn, C_CONNECTED), CS_VERBOSE);
+ return;
+ }
+
+ switch (device->state.conn) {
+ case C_STARTING_SYNC_T:
+ _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+ break;
+ case C_STARTING_SYNC_S:
+ drbd_start_resync(device, C_SYNC_SOURCE);
+ break;
+ }
+}
+
+int drbd_bitmap_io_from_worker(struct drbd_device *device,
+ int (*io_fn)(struct drbd_device *),
+ char *why, enum bm_flag flags)
+{
+ int rv;
+
+ D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
+
+ /* open coded non-blocking drbd_suspend_io(device); */
+ set_bit(SUSPEND_IO, &device->flags);
+
+ drbd_bm_lock(device, why, flags);
+ rv = io_fn(device);
+ drbd_bm_unlock(device);
+
+ drbd_resume_io(device);
+
+ return rv;
+}
+
+/**
+ * after_state_ch() - Perform after state change actions that may sleep
+ * @device: DRBD device.
+ * @os: old state.
+ * @ns: new state.
+ * @flags: Flags
+ */
+static void after_state_ch(struct drbd_device *device, union drbd_state os,
+ union drbd_state ns, enum chg_state_flags flags)
+{
+ struct drbd_resource *resource = device->resource;
+ struct drbd_peer_device *peer_device = first_peer_device(device);
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ struct sib_info sib;
+
+ sib.sib_reason = SIB_STATE_CHANGE;
+ sib.os = os;
+ sib.ns = ns;
+
+ if ((os.disk != D_UP_TO_DATE || os.pdsk != D_UP_TO_DATE)
+ && (ns.disk == D_UP_TO_DATE && ns.pdsk == D_UP_TO_DATE)) {
+ clear_bit(CRASHED_PRIMARY, &device->flags);
+ if (device->p_uuid)
+ device->p_uuid[UI_FLAGS] &= ~((u64)2);
+ }
+
+ /* Inform userspace about the change... */
+ drbd_bcast_event(device, &sib);
+
+ if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
+ (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
+ drbd_khelper(device, "pri-on-incon-degr");
+
+ /* Here we have the actions that are performed after a
+ state change. This function might sleep */
+
+ if (ns.susp_nod) {
+ enum drbd_req_event what = NOTHING;
+
+ spin_lock_irq(&device->resource->req_lock);
+ if (os.conn < C_CONNECTED && conn_lowest_conn(connection) >= C_CONNECTED)
+ what = RESEND;
+
+ if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+ conn_lowest_disk(connection) > D_NEGOTIATING)
+ what = RESTART_FROZEN_DISK_IO;
+
+ if (resource->susp_nod && what != NOTHING) {
+ _tl_restart(connection, what);
+ _conn_request_state(connection,
+ (union drbd_state) { { .susp_nod = 1 } },
+ (union drbd_state) { { .susp_nod = 0 } },
+ CS_VERBOSE);
+ }
+ spin_unlock_irq(&device->resource->req_lock);
+ }
+
+ if (ns.susp_fen) {
+ spin_lock_irq(&device->resource->req_lock);
+ if (resource->susp_fen && conn_lowest_conn(connection) >= C_CONNECTED) {
+ /* case2: The connection was established again: */
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+ clear_bit(NEW_CUR_UUID, &peer_device->device->flags);
+ rcu_read_unlock();
+ _tl_restart(connection, RESEND);
+ _conn_request_state(connection,
+ (union drbd_state) { { .susp_fen = 1 } },
+ (union drbd_state) { { .susp_fen = 0 } },
+ CS_VERBOSE);
+ }
+ spin_unlock_irq(&device->resource->req_lock);
+ }
+
+ /* Became sync source. With protocol >= 96, we still need to send out
+ * the sync uuid now. Need to do that before any drbd_send_state, or
+ * the other side may go "paused sync" before receiving the sync uuids,
+ * which is unexpected. */
+ if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
+ (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
+ connection->agreed_pro_version >= 96 && get_ldev(device)) {
+ drbd_gen_and_send_sync_uuid(peer_device);
+ put_ldev(device);
+ }
+
+ /* Do not change the order of the if above and the two below... */
+ if (os.pdsk == D_DISKLESS &&
+ ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) { /* attach on the peer */
+ /* we probably will start a resync soon.
+ * make sure those things are properly reset. */
+ device->rs_total = 0;
+ device->rs_failed = 0;
+ atomic_set(&device->rs_pending_cnt, 0);
+ drbd_rs_cancel_all(device);
+
+ drbd_send_uuids(peer_device);
+ drbd_send_state(peer_device, ns);
+ }
+ /* No point in queuing send_bitmap if we don't have a connection
+ * anymore, so check also the _current_ state, not only the new state
+ * at the time this work was queued. */
+ if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
+ device->state.conn == C_WF_BITMAP_S)
+ drbd_queue_bitmap_io(device, &drbd_send_bitmap, NULL,
+ "send_bitmap (WFBitMapS)",
+ BM_LOCKED_TEST_ALLOWED);
+
+ /* Lost contact to peer's copy of the data */
+ if ((os.pdsk >= D_INCONSISTENT &&
+ os.pdsk != D_UNKNOWN &&
+ os.pdsk != D_OUTDATED)
+ && (ns.pdsk < D_INCONSISTENT ||
+ ns.pdsk == D_UNKNOWN ||
+ ns.pdsk == D_OUTDATED)) {
+ if (get_ldev(device)) {
+ if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
+ device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+ if (drbd_suspended(device)) {
+ set_bit(NEW_CUR_UUID, &device->flags);
+ } else {
+ drbd_uuid_new_current(device);
+ drbd_send_uuids(peer_device);
+ }
+ }
+ put_ldev(device);
+ }
+ }
+
+ if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) {
+ if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
+ device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+ drbd_uuid_new_current(device);
+ drbd_send_uuids(peer_device);
+ }
+ /* D_DISKLESS Peer becomes secondary */
+ if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
+ /* We may still be Primary ourselves.
+ * No harm done if the bitmap still changes,
+ * redirtied pages will follow later. */
+ drbd_bitmap_io_from_worker(device, &drbd_bm_write,
+ "demote diskless peer", BM_LOCKED_SET_ALLOWED);
+ put_ldev(device);
+ }
+
+ /* Write out all changed bits on demote.
+ * Though, no need to da that just yet
+ * if there is a resync going on still */
+ if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
+ device->state.conn <= C_CONNECTED && get_ldev(device)) {
+ /* No changes to the bitmap expected this time, so assert that,
+ * even though no harm was done if it did change. */
+ drbd_bitmap_io_from_worker(device, &drbd_bm_write,
+ "demote", BM_LOCKED_TEST_ALLOWED);
+ put_ldev(device);
+ }
+
+ /* Last part of the attaching process ... */
+ if (ns.conn >= C_CONNECTED &&
+ os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
+ drbd_send_sizes(peer_device, 0, 0); /* to start sync... */
+ drbd_send_uuids(peer_device);
+ drbd_send_state(peer_device, ns);
+ }
+
+ /* We want to pause/continue resync, tell peer. */
+ if (ns.conn >= C_CONNECTED &&
+ ((os.aftr_isp != ns.aftr_isp) ||
+ (os.user_isp != ns.user_isp)))
+ drbd_send_state(peer_device, ns);
+
+ /* In case one of the isp bits got set, suspend other devices. */
+ if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
+ (ns.aftr_isp || ns.peer_isp || ns.user_isp))
+ suspend_other_sg(device);
+
+ /* Make sure the peer gets informed about eventual state
+ changes (ISP bits) while we were in WFReportParams. */
+ if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
+ drbd_send_state(peer_device, ns);
+
+ if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
+ drbd_send_state(peer_device, ns);
+
+ /* We are in the progress to start a full sync... */
+ if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+ (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
+ /* no other bitmap changes expected during this phase */
+ drbd_queue_bitmap_io(device,
+ &drbd_bmio_set_n_write, &abw_start_sync,
+ "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
+
+ /* first half of local IO error, failure to attach,
+ * or administrative detach */
+ if (os.disk != D_FAILED && ns.disk == D_FAILED) {
+ enum drbd_io_error_p eh = EP_PASS_ON;
+ int was_io_error = 0;
+ /* corresponding get_ldev was in __drbd_set_state, to serialize
+ * our cleanup here with the transition to D_DISKLESS.
+ * But is is still not save to dreference ldev here, since
+ * we might come from an failed Attach before ldev was set. */
+ if (device->ldev) {
+ rcu_read_lock();
+ eh = rcu_dereference(device->ldev->disk_conf)->on_io_error;
+ rcu_read_unlock();
+
+ was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags);
+
+ if (was_io_error && eh == EP_CALL_HELPER)
+ drbd_khelper(device, "local-io-error");
+
+ /* Immediately allow completion of all application IO,
+ * that waits for completion from the local disk,
+ * if this was a force-detach due to disk_timeout
+ * or administrator request (drbdsetup detach --force).
+ * Do NOT abort otherwise.
+ * Aborting local requests may cause serious problems,
+ * if requests are completed to upper layers already,
+ * and then later the already submitted local bio completes.
+ * This can cause DMA into former bio pages that meanwhile
+ * have been re-used for other things.
+ * So aborting local requests may cause crashes,
+ * or even worse, silent data corruption.
+ */
+ if (test_and_clear_bit(FORCE_DETACH, &device->flags))
+ tl_abort_disk_io(device);
+
+ /* current state still has to be D_FAILED,
+ * there is only one way out: to D_DISKLESS,
+ * and that may only happen after our put_ldev below. */
+ if (device->state.disk != D_FAILED)
+ drbd_err(device,
+ "ASSERT FAILED: disk is %s during detach\n",
+ drbd_disk_str(device->state.disk));
+
+ if (ns.conn >= C_CONNECTED)
+ drbd_send_state(peer_device, ns);
+
+ drbd_rs_cancel_all(device);
+
+ /* In case we want to get something to stable storage still,
+ * this may be the last chance.
+ * Following put_ldev may transition to D_DISKLESS. */
+ drbd_md_sync(device);
+ }
+ put_ldev(device);
+ }
+
+ /* second half of local IO error, failure to attach,
+ * or administrative detach,
+ * after local_cnt references have reached zero again */
+ if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
+ /* We must still be diskless,
+ * re-attach has to be serialized with this! */
+ if (device->state.disk != D_DISKLESS)
+ drbd_err(device,
+ "ASSERT FAILED: disk is %s while going diskless\n",
+ drbd_disk_str(device->state.disk));
+
+ if (ns.conn >= C_CONNECTED)
+ drbd_send_state(peer_device, ns);
+ /* corresponding get_ldev in __drbd_set_state
+ * this may finally trigger drbd_ldev_destroy. */
+ put_ldev(device);
+ }
+
+ /* Notify peer that I had a local IO error, and did not detached.. */
+ if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
+ drbd_send_state(peer_device, ns);
+
+ /* Disks got bigger while they were detached */
+ if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
+ test_and_clear_bit(RESYNC_AFTER_NEG, &device->flags)) {
+ if (ns.conn == C_CONNECTED)
+ resync_after_online_grow(device);
+ }
+
+ /* A resync finished or aborted, wake paused devices... */
+ if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
+ (os.peer_isp && !ns.peer_isp) ||
+ (os.user_isp && !ns.user_isp))
+ resume_next_sg(device);
+
+ /* sync target done with resync. Explicitly notify peer, even though
+ * it should (at least for non-empty resyncs) already know itself. */
+ if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
+ drbd_send_state(peer_device, ns);
+
+ /* Verify finished, or reached stop sector. Peer did not know about
+ * the stop sector, and we may even have changed the stop sector during
+ * verify to interrupt/stop early. Send the new state. */
+ if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
+ && verify_can_do_stop_sector(device))
+ drbd_send_state(peer_device, ns);
+
+ /* This triggers bitmap writeout of potentially still unwritten pages
+ * if the resync finished cleanly, or aborted because of peer disk
+ * failure, or because of connection loss.
+ * For resync aborted because of local disk failure, we cannot do
+ * any bitmap writeout anymore.
+ * No harm done if some bits change during this phase.
+ */
+ if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(device)) {
+ drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL,
+ "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
+ put_ldev(device);
+ }
+
+ if (ns.disk == D_DISKLESS &&
+ ns.conn == C_STANDALONE &&
+ ns.role == R_SECONDARY) {
+ if (os.aftr_isp != ns.aftr_isp)
+ resume_next_sg(device);
+ }
+
+ drbd_md_sync(device);
+}
+
+struct after_conn_state_chg_work {
+ struct drbd_work w;
+ enum drbd_conns oc;
+ union drbd_state ns_min;
+ union drbd_state ns_max; /* new, max state, over all devices */
+ enum chg_state_flags flags;
+ struct drbd_connection *connection;
+};
+
+static int w_after_conn_state_ch(struct drbd_work *w, int unused)
+{
+ struct after_conn_state_chg_work *acscw =
+ container_of(w, struct after_conn_state_chg_work, w);
+ struct drbd_connection *connection = acscw->connection;
+ enum drbd_conns oc = acscw->oc;
+ union drbd_state ns_max = acscw->ns_max;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ kfree(acscw);
+
+ /* Upon network configuration, we need to start the receiver */
+ if (oc == C_STANDALONE && ns_max.conn == C_UNCONNECTED)
+ drbd_thread_start(&connection->receiver);
+
+ if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) {
+ struct net_conf *old_conf;
+
+ mutex_lock(&connection->resource->conf_update);
+ old_conf = connection->net_conf;
+ connection->my_addr_len = 0;
+ connection->peer_addr_len = 0;
+ RCU_INIT_POINTER(connection->net_conf, NULL);
+ conn_free_crypto(connection);
+ mutex_unlock(&connection->resource->conf_update);
+
+ synchronize_rcu();
+ kfree(old_conf);
+ }
+
+ if (ns_max.susp_fen) {
+ /* case1: The outdate peer handler is successful: */
+ if (ns_max.pdsk <= D_OUTDATED) {
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ if (test_bit(NEW_CUR_UUID, &device->flags)) {
+ drbd_uuid_new_current(device);
+ clear_bit(NEW_CUR_UUID, &device->flags);
+ }
+ }
+ rcu_read_unlock();
+ spin_lock_irq(&connection->resource->req_lock);
+ _tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
+ _conn_request_state(connection,
+ (union drbd_state) { { .susp_fen = 1 } },
+ (union drbd_state) { { .susp_fen = 0 } },
+ CS_VERBOSE);
+ spin_unlock_irq(&connection->resource->req_lock);
+ }
+ }
+ kref_put(&connection->kref, drbd_destroy_connection);
+
+ conn_md_sync(connection);
+
+ return 0;
+}
+
+static void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf)
+{
+ enum chg_state_flags flags = ~0;
+ struct drbd_peer_device *peer_device;
+ int vnr, first_vol = 1;
+ union drbd_dev_state os, cs = {
+ { .role = R_SECONDARY,
+ .peer = R_UNKNOWN,
+ .conn = connection->cstate,
+ .disk = D_DISKLESS,
+ .pdsk = D_UNKNOWN,
+ } };
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ os = device->state;
+
+ if (first_vol) {
+ cs = os;
+ first_vol = 0;
+ continue;
+ }
+
+ if (cs.role != os.role)
+ flags &= ~CS_DC_ROLE;
+
+ if (cs.peer != os.peer)
+ flags &= ~CS_DC_PEER;
+
+ if (cs.conn != os.conn)
+ flags &= ~CS_DC_CONN;
+
+ if (cs.disk != os.disk)
+ flags &= ~CS_DC_DISK;
+
+ if (cs.pdsk != os.pdsk)
+ flags &= ~CS_DC_PDSK;
+ }
+ rcu_read_unlock();
+
+ *pf |= CS_DC_MASK;
+ *pf &= flags;
+ (*pcs).i = cs.i;
+}
+
+static enum drbd_state_rv
+conn_is_valid_transition(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags)
+{
+ enum drbd_state_rv rv = SS_SUCCESS;
+ union drbd_state ns, os;
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ os = drbd_read_state(device);
+ ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
+
+ if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
+ ns.disk = os.disk;
+
+ if (ns.i == os.i)
+ continue;
+
+ rv = is_valid_transition(os, ns);
+
+ if (rv >= SS_SUCCESS && !(flags & CS_HARD)) {
+ rv = is_valid_state(device, ns);
+ if (rv < SS_SUCCESS) {
+ if (is_valid_state(device, os) == rv)
+ rv = is_valid_soft_transition(os, ns, connection);
+ } else
+ rv = is_valid_soft_transition(os, ns, connection);
+ }
+
+ if (rv < SS_SUCCESS) {
+ if (flags & CS_VERBOSE)
+ print_st_err(device, os, ns, rv);
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+static void
+conn_set_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+ union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags)
+{
+ union drbd_state ns, os, ns_max = { };
+ union drbd_state ns_min = {
+ { .role = R_MASK,
+ .peer = R_MASK,
+ .conn = val.conn,
+ .disk = D_MASK,
+ .pdsk = D_MASK
+ } };
+ struct drbd_peer_device *peer_device;
+ enum drbd_state_rv rv;
+ int vnr, number_of_volumes = 0;
+
+ if (mask.conn == C_MASK) {
+ /* remember last connect time so request_timer_fn() won't
+ * kill newly established sessions while we are still trying to thaw
+ * previously frozen IO */
+ if (connection->cstate != C_WF_REPORT_PARAMS && val.conn == C_WF_REPORT_PARAMS)
+ connection->last_reconnect_jif = jiffies;
+
+ connection->cstate = val.conn;
+ }
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ number_of_volumes++;
+ os = drbd_read_state(device);
+ ns = apply_mask_val(os, mask, val);
+ ns = sanitize_state(device, os, ns, NULL);
+
+ if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
+ ns.disk = os.disk;
+
+ rv = __drbd_set_state(device, ns, flags, NULL);
+ if (rv < SS_SUCCESS)
+ BUG();
+
+ ns.i = device->state.i;
+ ns_max.role = max_role(ns.role, ns_max.role);
+ ns_max.peer = max_role(ns.peer, ns_max.peer);
+ ns_max.conn = max_t(enum drbd_conns, ns.conn, ns_max.conn);
+ ns_max.disk = max_t(enum drbd_disk_state, ns.disk, ns_max.disk);
+ ns_max.pdsk = max_t(enum drbd_disk_state, ns.pdsk, ns_max.pdsk);
+
+ ns_min.role = min_role(ns.role, ns_min.role);
+ ns_min.peer = min_role(ns.peer, ns_min.peer);
+ ns_min.conn = min_t(enum drbd_conns, ns.conn, ns_min.conn);
+ ns_min.disk = min_t(enum drbd_disk_state, ns.disk, ns_min.disk);
+ ns_min.pdsk = min_t(enum drbd_disk_state, ns.pdsk, ns_min.pdsk);
+ }
+ rcu_read_unlock();
+
+ if (number_of_volumes == 0) {
+ ns_min = ns_max = (union drbd_state) { {
+ .role = R_SECONDARY,
+ .peer = R_UNKNOWN,
+ .conn = val.conn,
+ .disk = D_DISKLESS,
+ .pdsk = D_UNKNOWN
+ } };
+ }
+
+ ns_min.susp = ns_max.susp = connection->resource->susp;
+ ns_min.susp_nod = ns_max.susp_nod = connection->resource->susp_nod;
+ ns_min.susp_fen = ns_max.susp_fen = connection->resource->susp_fen;
+
+ *pns_min = ns_min;
+ *pns_max = ns_max;
+}
+
+static enum drbd_state_rv
+_conn_rq_cond(struct drbd_connection *connection, union drbd_state mask, union drbd_state val)
+{
+ enum drbd_state_rv err, rv = SS_UNKNOWN_ERROR; /* continue waiting */;
+
+ if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &connection->flags))
+ rv = SS_CW_SUCCESS;
+
+ if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &connection->flags))
+ rv = SS_CW_FAILED_BY_PEER;
+
+ err = conn_is_valid_transition(connection, mask, val, 0);
+ if (err == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS)
+ return rv;
+
+ return err;
+}
+
+enum drbd_state_rv
+_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags)
+{
+ enum drbd_state_rv rv = SS_SUCCESS;
+ struct after_conn_state_chg_work *acscw;
+ enum drbd_conns oc = connection->cstate;
+ union drbd_state ns_max, ns_min, os;
+ bool have_mutex = false;
+
+ if (mask.conn) {
+ rv = is_valid_conn_transition(oc, val.conn);
+ if (rv < SS_SUCCESS)
+ goto abort;
+ }
+
+ rv = conn_is_valid_transition(connection, mask, val, flags);
+ if (rv < SS_SUCCESS)
+ goto abort;
+
+ if (oc == C_WF_REPORT_PARAMS && val.conn == C_DISCONNECTING &&
+ !(flags & (CS_LOCAL_ONLY | CS_HARD))) {
+
+ /* This will be a cluster-wide state change.
+ * Need to give up the spinlock, grab the mutex,
+ * then send the state change request, ... */
+ spin_unlock_irq(&connection->resource->req_lock);
+ mutex_lock(&connection->cstate_mutex);
+ have_mutex = true;
+
+ set_bit(CONN_WD_ST_CHG_REQ, &connection->flags);
+ if (conn_send_state_req(connection, mask, val)) {
+ /* sending failed. */
+ clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags);
+ rv = SS_CW_FAILED_BY_PEER;
+ /* need to re-aquire the spin lock, though */
+ goto abort_unlocked;
+ }
+
+ if (val.conn == C_DISCONNECTING)
+ set_bit(DISCONNECT_SENT, &connection->flags);
+
+ /* ... and re-aquire the spinlock.
+ * If _conn_rq_cond() returned >= SS_SUCCESS, we must call
+ * conn_set_state() within the same spinlock. */
+ spin_lock_irq(&connection->resource->req_lock);
+ wait_event_lock_irq(connection->ping_wait,
+ (rv = _conn_rq_cond(connection, mask, val)),
+ connection->resource->req_lock);
+ clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags);
+ if (rv < SS_SUCCESS)
+ goto abort;
+ }
+
+ conn_old_common_state(connection, &os, &flags);
+ flags |= CS_DC_SUSP;
+ conn_set_state(connection, mask, val, &ns_min, &ns_max, flags);
+ conn_pr_state_change(connection, os, ns_max, flags);
+
+ acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC);
+ if (acscw) {
+ acscw->oc = os.conn;
+ acscw->ns_min = ns_min;
+ acscw->ns_max = ns_max;
+ acscw->flags = flags;
+ acscw->w.cb = w_after_conn_state_ch;
+ kref_get(&connection->kref);
+ acscw->connection = connection;
+ drbd_queue_work(&connection->sender_work, &acscw->w);
+ } else {
+ drbd_err(connection, "Could not kmalloc an acscw\n");
+ }
+
+ abort:
+ if (have_mutex) {
+ /* mutex_unlock() "... must not be used in interrupt context.",
+ * so give up the spinlock, then re-aquire it */
+ spin_unlock_irq(&connection->resource->req_lock);
+ abort_unlocked:
+ mutex_unlock(&connection->cstate_mutex);
+ spin_lock_irq(&connection->resource->req_lock);
+ }
+ if (rv < SS_SUCCESS && flags & CS_VERBOSE) {
+ drbd_err(connection, "State change failed: %s\n", drbd_set_st_err_str(rv));
+ drbd_err(connection, " mask = 0x%x val = 0x%x\n", mask.i, val.i);
+ drbd_err(connection, " old_conn:%s wanted_conn:%s\n", drbd_conn_str(oc), drbd_conn_str(val.conn));
+ }
+ return rv;
+}
+
+enum drbd_state_rv
+conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags)
+{
+ enum drbd_state_rv rv;
+
+ spin_lock_irq(&connection->resource->req_lock);
+ rv = _conn_request_state(connection, mask, val, flags);
+ spin_unlock_irq(&connection->resource->req_lock);
+
+ return rv;
+}
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h
new file mode 100644
index 000000000..7f53c4082
--- /dev/null
+++ b/drivers/block/drbd/drbd_state.h
@@ -0,0 +1,166 @@
+#ifndef DRBD_STATE_H
+#define DRBD_STATE_H
+
+struct drbd_device;
+struct drbd_connection;
+
+/**
+ * DOC: DRBD State macros
+ *
+ * These macros are used to express state changes in easily readable form.
+ *
+ * The NS macros expand to a mask and a value, that can be bit ored onto the
+ * current state as soon as the spinlock (req_lock) was taken.
+ *
+ * The _NS macros are used for state functions that get called with the
+ * spinlock. These macros expand directly to the new state value.
+ *
+ * Besides the basic forms NS() and _NS() additional _?NS[23] are defined
+ * to express state changes that affect more than one aspect of the state.
+ *
+ * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
+ * Means that the network connection was established and that the peer
+ * is in secondary role.
+ */
+#define role_MASK R_MASK
+#define peer_MASK R_MASK
+#define disk_MASK D_MASK
+#define pdsk_MASK D_MASK
+#define conn_MASK C_MASK
+#define susp_MASK 1
+#define user_isp_MASK 1
+#define aftr_isp_MASK 1
+#define susp_nod_MASK 1
+#define susp_fen_MASK 1
+
+#define NS(T, S) \
+ ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
+ ({ union drbd_state val; val.i = 0; val.T = (S); val; })
+#define NS2(T1, S1, T2, S2) \
+ ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+ mask.T2 = T2##_MASK; mask; }), \
+ ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
+ val.T2 = (S2); val; })
+#define NS3(T1, S1, T2, S2, T3, S3) \
+ ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+ mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
+ ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
+ val.T2 = (S2); val.T3 = (S3); val; })
+
+#define _NS(D, T, S) \
+ D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; })
+#define _NS2(D, T1, S1, T2, S2) \
+ D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
+ __ns.T2 = (S2); __ns; })
+#define _NS3(D, T1, S1, T2, S2, T3, S3) \
+ D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
+ __ns.T2 = (S2); __ns.T3 = (S3); __ns; })
+
+enum chg_state_flags {
+ CS_HARD = 1 << 0,
+ CS_VERBOSE = 1 << 1,
+ CS_WAIT_COMPLETE = 1 << 2,
+ CS_SERIALIZE = 1 << 3,
+ CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE,
+ CS_LOCAL_ONLY = 1 << 4, /* Do not consider a device pair wide state change */
+ CS_DC_ROLE = 1 << 5, /* DC = display as connection state change */
+ CS_DC_PEER = 1 << 6,
+ CS_DC_CONN = 1 << 7,
+ CS_DC_DISK = 1 << 8,
+ CS_DC_PDSK = 1 << 9,
+ CS_DC_SUSP = 1 << 10,
+ CS_DC_MASK = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK,
+ CS_IGN_OUTD_FAIL = 1 << 11,
+};
+
+/* drbd_dev_state and drbd_state are different types. This is to stress the
+ small difference. There is no suspended flag (.susp), and no suspended
+ while fence handler runs flas (susp_fen). */
+union drbd_dev_state {
+ struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ unsigned role:2 ; /* 3/4 primary/secondary/unknown */
+ unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
+ unsigned conn:5 ; /* 17/32 cstates */
+ unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned _unused:1 ;
+ unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
+ unsigned peer_isp:1 ;
+ unsigned user_isp:1 ;
+ unsigned _pad:11; /* 0 unused */
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ unsigned _pad:11;
+ unsigned user_isp:1 ;
+ unsigned peer_isp:1 ;
+ unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
+ unsigned _unused:1 ;
+ unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned conn:5 ; /* 17/32 cstates */
+ unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
+ unsigned role:2 ; /* 3/4 primary/secondary/unknown */
+#else
+# error "this endianess is not supported"
+#endif
+ };
+ unsigned int i;
+};
+
+extern enum drbd_state_rv drbd_change_state(struct drbd_device *device,
+ enum chg_state_flags f,
+ union drbd_state mask,
+ union drbd_state val);
+extern void drbd_force_state(struct drbd_device *, union drbd_state,
+ union drbd_state);
+extern enum drbd_state_rv _drbd_request_state(struct drbd_device *,
+ union drbd_state,
+ union drbd_state,
+ enum chg_state_flags);
+
+extern enum drbd_state_rv
+_drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state,
+ union drbd_state, enum chg_state_flags);
+
+extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state,
+ enum chg_state_flags,
+ struct completion *done);
+extern void print_st_err(struct drbd_device *, union drbd_state,
+ union drbd_state, int);
+
+enum drbd_state_rv
+_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags);
+
+enum drbd_state_rv
+conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags);
+
+extern void drbd_resume_al(struct drbd_device *device);
+extern bool conn_all_vols_unconf(struct drbd_connection *connection);
+
+/**
+ * drbd_request_state() - Reqest a state change
+ * @device: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ *
+ * This is the most graceful way of requesting a state change. It is verbose
+ * quite verbose in case the state change is not possible, and all those
+ * state changes are globally serialized.
+ */
+static inline int drbd_request_state(struct drbd_device *device,
+ union drbd_state mask,
+ union drbd_state val)
+{
+ return _drbd_request_state(device, mask, val, CS_VERBOSE + CS_ORDERED);
+}
+
+enum drbd_role conn_highest_role(struct drbd_connection *connection);
+enum drbd_role conn_highest_peer(struct drbd_connection *connection);
+enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection);
+enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection);
+enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection);
+enum drbd_conns conn_lowest_conn(struct drbd_connection *connection);
+
+#endif
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
new file mode 100644
index 000000000..80b0f63c7
--- /dev/null
+++ b/drivers/block/drbd/drbd_strings.c
@@ -0,0 +1,118 @@
+/*
+ drbd.h
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include <linux/drbd.h>
+#include "drbd_strings.h"
+
+static const char *drbd_conn_s_names[] = {
+ [C_STANDALONE] = "StandAlone",
+ [C_DISCONNECTING] = "Disconnecting",
+ [C_UNCONNECTED] = "Unconnected",
+ [C_TIMEOUT] = "Timeout",
+ [C_BROKEN_PIPE] = "BrokenPipe",
+ [C_NETWORK_FAILURE] = "NetworkFailure",
+ [C_PROTOCOL_ERROR] = "ProtocolError",
+ [C_WF_CONNECTION] = "WFConnection",
+ [C_WF_REPORT_PARAMS] = "WFReportParams",
+ [C_TEAR_DOWN] = "TearDown",
+ [C_CONNECTED] = "Connected",
+ [C_STARTING_SYNC_S] = "StartingSyncS",
+ [C_STARTING_SYNC_T] = "StartingSyncT",
+ [C_WF_BITMAP_S] = "WFBitMapS",
+ [C_WF_BITMAP_T] = "WFBitMapT",
+ [C_WF_SYNC_UUID] = "WFSyncUUID",
+ [C_SYNC_SOURCE] = "SyncSource",
+ [C_SYNC_TARGET] = "SyncTarget",
+ [C_PAUSED_SYNC_S] = "PausedSyncS",
+ [C_PAUSED_SYNC_T] = "PausedSyncT",
+ [C_VERIFY_S] = "VerifyS",
+ [C_VERIFY_T] = "VerifyT",
+ [C_AHEAD] = "Ahead",
+ [C_BEHIND] = "Behind",
+};
+
+static const char *drbd_role_s_names[] = {
+ [R_PRIMARY] = "Primary",
+ [R_SECONDARY] = "Secondary",
+ [R_UNKNOWN] = "Unknown"
+};
+
+static const char *drbd_disk_s_names[] = {
+ [D_DISKLESS] = "Diskless",
+ [D_ATTACHING] = "Attaching",
+ [D_FAILED] = "Failed",
+ [D_NEGOTIATING] = "Negotiating",
+ [D_INCONSISTENT] = "Inconsistent",
+ [D_OUTDATED] = "Outdated",
+ [D_UNKNOWN] = "DUnknown",
+ [D_CONSISTENT] = "Consistent",
+ [D_UP_TO_DATE] = "UpToDate",
+};
+
+static const char *drbd_state_sw_errors[] = {
+ [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
+ [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data",
+ [-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
+ [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
+ [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
+ [-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated",
+ [-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active",
+ [-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device",
+ [-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node",
+ [-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk",
+ [-SS_DEVICE_IN_USE] = "Device is held open by someone",
+ [-SS_NO_NET_CONFIG] = "Have no net/connection configuration",
+ [-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify",
+ [-SS_NEED_CONNECTION] = "Need a connection to start verify or resync",
+ [-SS_NOT_SUPPORTED] = "Peer does not support protocol",
+ [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
+ [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
+ [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
+ [-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer",
+ [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config",
+};
+
+const char *drbd_conn_str(enum drbd_conns s)
+{
+ /* enums are unsigned... */
+ return s > C_BEHIND ? "TOO_LARGE" : drbd_conn_s_names[s];
+}
+
+const char *drbd_role_str(enum drbd_role s)
+{
+ return s > R_SECONDARY ? "TOO_LARGE" : drbd_role_s_names[s];
+}
+
+const char *drbd_disk_str(enum drbd_disk_state s)
+{
+ return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s];
+}
+
+const char *drbd_set_st_err_str(enum drbd_state_rv err)
+{
+ return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" :
+ err > SS_TWO_PRIMARIES ? "TOO_LARGE"
+ : drbd_state_sw_errors[-err];
+}
diff --git a/drivers/block/drbd/drbd_strings.h b/drivers/block/drbd/drbd_strings.h
new file mode 100644
index 000000000..f9923cc88
--- /dev/null
+++ b/drivers/block/drbd/drbd_strings.h
@@ -0,0 +1,9 @@
+#ifndef __DRBD_STRINGS_H
+#define __DRBD_STRINGS_H
+
+extern const char *drbd_conn_str(enum drbd_conns);
+extern const char *drbd_role_str(enum drbd_role);
+extern const char *drbd_disk_str(enum drbd_disk_state);
+extern const char *drbd_set_st_err_str(enum drbd_state_rv);
+
+#endif /* __DRBD_STRINGS_H */
diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h
new file mode 100644
index 000000000..8cb1532a3
--- /dev/null
+++ b/drivers/block/drbd/drbd_vli.h
@@ -0,0 +1,351 @@
+/*
+-*- linux-c -*-
+ drbd_receiver.c
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _DRBD_VLI_H
+#define _DRBD_VLI_H
+
+/*
+ * At a granularity of 4KiB storage represented per bit,
+ * and stroage sizes of several TiB,
+ * and possibly small-bandwidth replication,
+ * the bitmap transfer time can take much too long,
+ * if transmitted in plain text.
+ *
+ * We try to reduce the transferred bitmap information
+ * by encoding runlengths of bit polarity.
+ *
+ * We never actually need to encode a "zero" (runlengths are positive).
+ * But then we have to store the value of the first bit.
+ * The first bit of information thus shall encode if the first runlength
+ * gives the number of set or unset bits.
+ *
+ * We assume that large areas are either completely set or unset,
+ * which gives good compression with any runlength method,
+ * even when encoding the runlength as fixed size 32bit/64bit integers.
+ *
+ * Still, there may be areas where the polarity flips every few bits,
+ * and encoding the runlength sequence of those areas with fix size
+ * integers would be much worse than plaintext.
+ *
+ * We want to encode small runlength values with minimum code length,
+ * while still being able to encode a Huge run of all zeros.
+ *
+ * Thus we need a Variable Length Integer encoding, VLI.
+ *
+ * For some cases, we produce more code bits than plaintext input.
+ * We need to send incompressible chunks as plaintext, skip over them
+ * and then see if the next chunk compresses better.
+ *
+ * We don't care too much about "excellent" compression ratio for large
+ * runlengths (all set/all clear): whether we achieve a factor of 100
+ * or 1000 is not that much of an issue.
+ * We do not want to waste too much on short runlengths in the "noisy"
+ * parts of the bitmap, though.
+ *
+ * There are endless variants of VLI, we experimented with:
+ * * simple byte-based
+ * * various bit based with different code word length.
+ *
+ * To avoid yet an other configuration parameter (choice of bitmap compression
+ * algorithm) which was difficult to explain and tune, we just chose the one
+ * variant that turned out best in all test cases.
+ * Based on real world usage patterns, with device sizes ranging from a few GiB
+ * to several TiB, file server/mailserver/webserver/mysql/postgress,
+ * mostly idle to really busy, the all time winner (though sometimes only
+ * marginally better) is:
+ */
+
+/*
+ * encoding is "visualised" as
+ * __little endian__ bitstream, least significant bit first (left most)
+ *
+ * this particular encoding is chosen so that the prefix code
+ * starts as unary encoding the level, then modified so that
+ * 10 levels can be described in 8bit, with minimal overhead
+ * for the smaller levels.
+ *
+ * Number of data bits follow fibonacci sequence, with the exception of the
+ * last level (+1 data bit, so it makes 64bit total). The only worse code when
+ * encoding bit polarity runlength is 1 plain bits => 2 code bits.
+prefix data bits max val Nº data bits
+0 x 0x2 1
+10 x 0x4 1
+110 xx 0x8 2
+1110 xxx 0x10 3
+11110 xxx xx 0x30 5
+111110 xx xxxxxx 0x130 8
+11111100 xxxxxxxx xxxxx 0x2130 13
+11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21
+11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34
+11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
+ * maximum encodable value: 0x100000400202130 == 2**56 + some */
+
+/* compression "table":
+ transmitted x 0.29
+ as plaintext x ........................
+ x ........................
+ x ........................
+ x 0.59 0.21........................
+ x ........................................................
+ x .. c ...................................................
+ x 0.44.. o ...................................................
+ x .......... d ...................................................
+ x .......... e ...................................................
+ X............. ...................................................
+ x.............. b ...................................................
+2.0x............... i ...................................................
+ #X................ t ...................................................
+ #................. s ........................... plain bits ..........
+-+-----------------------------------------------------------------------
+ 1 16 32 64
+*/
+
+/* LEVEL: (total bits, prefix bits, prefix value),
+ * sorted ascending by number of total bits.
+ * The rest of the code table is calculated at compiletime from this. */
+
+/* fibonacci data 1, 1, ... */
+#define VLI_L_1_1() do { \
+ LEVEL( 2, 1, 0x00); \
+ LEVEL( 3, 2, 0x01); \
+ LEVEL( 5, 3, 0x03); \
+ LEVEL( 7, 4, 0x07); \
+ LEVEL(10, 5, 0x0f); \
+ LEVEL(14, 6, 0x1f); \
+ LEVEL(21, 8, 0x3f); \
+ LEVEL(29, 8, 0x7f); \
+ LEVEL(42, 8, 0xbf); \
+ LEVEL(64, 8, 0xff); \
+ } while (0)
+
+/* finds a suitable level to decode the least significant part of in.
+ * returns number of bits consumed.
+ *
+ * BUG() for bad input, as that would mean a buggy code table. */
+static inline int vli_decode_bits(u64 *out, const u64 in)
+{
+ u64 adj = 1;
+
+#define LEVEL(t,b,v) \
+ do { \
+ if ((in & ((1 << b) -1)) == v) { \
+ *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \
+ return t; \
+ } \
+ adj += 1ULL << (t - b); \
+ } while (0)
+
+ VLI_L_1_1();
+
+ /* NOT REACHED, if VLI_LEVELS code table is defined properly */
+ BUG();
+#undef LEVEL
+}
+
+/* return number of code bits needed,
+ * or negative error number */
+static inline int __vli_encode_bits(u64 *out, const u64 in)
+{
+ u64 max = 0;
+ u64 adj = 1;
+
+ if (in == 0)
+ return -EINVAL;
+
+#define LEVEL(t,b,v) do { \
+ max += 1ULL << (t - b); \
+ if (in <= max) { \
+ if (out) \
+ *out = ((in - adj) << b) | v; \
+ return t; \
+ } \
+ adj = max + 1; \
+ } while (0)
+
+ VLI_L_1_1();
+
+ return -EOVERFLOW;
+#undef LEVEL
+}
+
+#undef VLI_L_1_1
+
+/* code from here down is independend of actually used bit code */
+
+/*
+ * Code length is determined by some unique (e.g. unary) prefix.
+ * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
+ * not a byte stream.
+ */
+
+/* for the bitstream, we need a cursor */
+struct bitstream_cursor {
+ /* the current byte */
+ u8 *b;
+ /* the current bit within *b, nomalized: 0..7 */
+ unsigned int bit;
+};
+
+/* initialize cursor to point to first bit of stream */
+static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
+{
+ cur->b = s;
+ cur->bit = 0;
+}
+
+/* advance cursor by that many bits; maximum expected input value: 64,
+ * but depending on VLI implementation, it may be more. */
+static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
+{
+ bits += cur->bit;
+ cur->b = cur->b + (bits >> 3);
+ cur->bit = bits & 7;
+}
+
+/* the bitstream itself knows its length */
+struct bitstream {
+ struct bitstream_cursor cur;
+ unsigned char *buf;
+ size_t buf_len; /* in bytes */
+
+ /* for input stream:
+ * number of trailing 0 bits for padding
+ * total number of valid bits in stream: buf_len * 8 - pad_bits */
+ unsigned int pad_bits;
+};
+
+static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
+{
+ bs->buf = s;
+ bs->buf_len = len;
+ bs->pad_bits = pad_bits;
+ bitstream_cursor_reset(&bs->cur, bs->buf);
+}
+
+static inline void bitstream_rewind(struct bitstream *bs)
+{
+ bitstream_cursor_reset(&bs->cur, bs->buf);
+ memset(bs->buf, 0, bs->buf_len);
+}
+
+/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
+ * Ignores "pad_bits".
+ * Returns zero if bits == 0 (nothing to do).
+ * Returns number of bits used if successful.
+ *
+ * If there is not enough room left in bitstream,
+ * leaves bitstream unchanged and returns -ENOBUFS.
+ */
+static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
+{
+ unsigned char *b = bs->cur.b;
+ unsigned int tmp;
+
+ if (bits == 0)
+ return 0;
+
+ if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
+ return -ENOBUFS;
+
+ /* paranoia: strip off hi bits; they should not be set anyways. */
+ if (bits < 64)
+ val &= ~0ULL >> (64 - bits);
+
+ *b++ |= (val & 0xff) << bs->cur.bit;
+
+ for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
+ *b++ |= (val >> tmp) & 0xff;
+
+ bitstream_cursor_advance(&bs->cur, bits);
+ return bits;
+}
+
+/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
+ *
+ * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
+ *
+ * If there are less than the requested number of valid bits left in the
+ * bitstream, still fetches all available bits.
+ *
+ * Returns number of actually fetched bits.
+ */
+static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
+{
+ u64 val;
+ unsigned int n;
+
+ if (bits > 64)
+ return -EINVAL;
+
+ if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
+ bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
+ - bs->cur.bit - bs->pad_bits;
+
+ if (bits == 0) {
+ *out = 0;
+ return 0;
+ }
+
+ /* get the high bits */
+ val = 0;
+ n = (bs->cur.bit + bits + 7) >> 3;
+ /* n may be at most 9, if cur.bit + bits > 64 */
+ /* which means this copies at most 8 byte */
+ if (n) {
+ memcpy(&val, bs->cur.b+1, n - 1);
+ val = le64_to_cpu(val) << (8 - bs->cur.bit);
+ }
+
+ /* we still need the low bits */
+ val |= bs->cur.b[0] >> bs->cur.bit;
+
+ /* and mask out bits we don't want */
+ val &= ~0ULL >> (64 - bits);
+
+ bitstream_cursor_advance(&bs->cur, bits);
+ *out = val;
+
+ return bits;
+}
+
+/* encodes @in as vli into @bs;
+
+ * return values
+ * > 0: number of bits successfully stored in bitstream
+ * -ENOBUFS @bs is full
+ * -EINVAL input zero (invalid)
+ * -EOVERFLOW input too large for this vli code (invalid)
+ */
+static inline int vli_encode_bits(struct bitstream *bs, u64 in)
+{
+ u64 code = code;
+ int bits = __vli_encode_bits(&code, in);
+
+ if (bits <= 0)
+ return bits;
+
+ return bitstream_put_bits(bs, code, bits);
+}
+
+#endif
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
new file mode 100644
index 000000000..d0fae55d8
--- /dev/null
+++ b/drivers/block/drbd/drbd_worker.c
@@ -0,0 +1,2156 @@
+/*
+ drbd_worker.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+*/
+
+#include <linux/module.h>
+#include <linux/drbd.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h"
+
+static int make_ov_request(struct drbd_device *, int);
+static int make_resync_request(struct drbd_device *, int);
+
+/* endio handlers:
+ * drbd_md_endio (defined here)
+ * drbd_request_endio (defined here)
+ * drbd_peer_request_endio (defined here)
+ * drbd_bm_endio (defined in drbd_bitmap.c)
+ *
+ * For all these callbacks, note the following:
+ * The callbacks will be called in irq context by the IDE drivers,
+ * and in Softirqs/Tasklets/BH context by the SCSI drivers.
+ * Try to get the locking right :)
+ *
+ */
+
+
+/* About the global_state_lock
+ Each state transition on an device holds a read lock. In case we have
+ to evaluate the resync after dependencies, we grab a write lock, because
+ we need stable states on all devices for that. */
+rwlock_t global_state_lock;
+
+/* used for synchronous meta data and bitmap IO
+ * submitted by drbd_md_sync_page_io()
+ */
+void drbd_md_endio(struct bio *bio, int error)
+{
+ struct drbd_device *device;
+
+ device = bio->bi_private;
+ device->md_io.error = error;
+
+ /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
+ * to timeout on the lower level device, and eventually detach from it.
+ * If this io completion runs after that timeout expired, this
+ * drbd_md_put_buffer() may allow us to finally try and re-attach.
+ * During normal operation, this only puts that extra reference
+ * down to 1 again.
+ * Make sure we first drop the reference, and only then signal
+ * completion, or we may (in drbd_al_read_log()) cycle so fast into the
+ * next drbd_md_sync_page_io(), that we trigger the
+ * ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
+ */
+ drbd_md_put_buffer(device);
+ device->md_io.done = 1;
+ wake_up(&device->misc_wait);
+ bio_put(bio);
+ if (device->ldev) /* special case: drbd_md_read() during drbd_adm_attach() */
+ put_ldev(device);
+}
+
+/* reads on behalf of the partner,
+ * "submitted" by the receiver
+ */
+static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
+{
+ unsigned long flags = 0;
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ device->read_cnt += peer_req->i.size >> 9;
+ list_del(&peer_req->w.list);
+ if (list_empty(&device->read_ee))
+ wake_up(&device->ee_wait);
+ if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
+ __drbd_chk_io_error(device, DRBD_READ_ERROR);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ drbd_queue_work(&peer_device->connection->sender_work, &peer_req->w);
+ put_ldev(device);
+}
+
+/* writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver, final stage. */
+void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
+{
+ unsigned long flags = 0;
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ struct drbd_interval i;
+ int do_wake;
+ u64 block_id;
+ int do_al_complete_io;
+
+ /* after we moved peer_req to done_ee,
+ * we may no longer access it,
+ * it may be freed/reused already!
+ * (as soon as we release the req_lock) */
+ i = peer_req->i;
+ do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
+ block_id = peer_req->block_id;
+ peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
+
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ device->writ_cnt += peer_req->i.size >> 9;
+ list_move_tail(&peer_req->w.list, &device->done_ee);
+
+ /*
+ * Do not remove from the write_requests tree here: we did not send the
+ * Ack yet and did not wake possibly waiting conflicting requests.
+ * Removed from the tree from "drbd_process_done_ee" within the
+ * appropriate dw.cb (e_end_block/e_end_resync_block) or from
+ * _drbd_clear_done_ee.
+ */
+
+ do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee);
+
+ /* FIXME do we want to detach for failed REQ_DISCARD?
+ * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
+ if (peer_req->flags & EE_WAS_ERROR)
+ __drbd_chk_io_error(device, DRBD_WRITE_ERROR);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+ if (block_id == ID_SYNCER)
+ drbd_rs_complete_io(device, i.sector);
+
+ if (do_wake)
+ wake_up(&device->ee_wait);
+
+ if (do_al_complete_io)
+ drbd_al_complete_io(device, &i);
+
+ wake_asender(peer_device->connection);
+ put_ldev(device);
+}
+
+/* writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver.
+ */
+void drbd_peer_request_endio(struct bio *bio, int error)
+{
+ struct drbd_peer_request *peer_req = bio->bi_private;
+ struct drbd_device *device = peer_req->peer_device->device;
+ int uptodate = bio_flagged(bio, BIO_UPTODATE);
+ int is_write = bio_data_dir(bio) == WRITE;
+ int is_discard = !!(bio->bi_rw & REQ_DISCARD);
+
+ if (error && __ratelimit(&drbd_ratelimit_state))
+ drbd_warn(device, "%s: error=%d s=%llus\n",
+ is_write ? (is_discard ? "discard" : "write")
+ : "read", error,
+ (unsigned long long)peer_req->i.sector);
+ if (!error && !uptodate) {
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_warn(device, "%s: setting error to -EIO s=%llus\n",
+ is_write ? "write" : "read",
+ (unsigned long long)peer_req->i.sector);
+ /* strange behavior of some lower level drivers...
+ * fail the request by clearing the uptodate flag,
+ * but do not return any error?! */
+ error = -EIO;
+ }
+
+ if (error)
+ set_bit(__EE_WAS_ERROR, &peer_req->flags);
+
+ bio_put(bio); /* no need for the bio anymore */
+ if (atomic_dec_and_test(&peer_req->pending_bios)) {
+ if (is_write)
+ drbd_endio_write_sec_final(peer_req);
+ else
+ drbd_endio_read_sec_final(peer_req);
+ }
+}
+
+/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
+ */
+void drbd_request_endio(struct bio *bio, int error)
+{
+ unsigned long flags;
+ struct drbd_request *req = bio->bi_private;
+ struct drbd_device *device = req->device;
+ struct bio_and_error m;
+ enum drbd_req_event what;
+ int uptodate = bio_flagged(bio, BIO_UPTODATE);
+
+ if (!error && !uptodate) {
+ drbd_warn(device, "p %s: setting error to -EIO\n",
+ bio_data_dir(bio) == WRITE ? "write" : "read");
+ /* strange behavior of some lower level drivers...
+ * fail the request by clearing the uptodate flag,
+ * but do not return any error?! */
+ error = -EIO;
+ }
+
+
+ /* If this request was aborted locally before,
+ * but now was completed "successfully",
+ * chances are that this caused arbitrary data corruption.
+ *
+ * "aborting" requests, or force-detaching the disk, is intended for
+ * completely blocked/hung local backing devices which do no longer
+ * complete requests at all, not even do error completions. In this
+ * situation, usually a hard-reset and failover is the only way out.
+ *
+ * By "aborting", basically faking a local error-completion,
+ * we allow for a more graceful swichover by cleanly migrating services.
+ * Still the affected node has to be rebooted "soon".
+ *
+ * By completing these requests, we allow the upper layers to re-use
+ * the associated data pages.
+ *
+ * If later the local backing device "recovers", and now DMAs some data
+ * from disk into the original request pages, in the best case it will
+ * just put random data into unused pages; but typically it will corrupt
+ * meanwhile completely unrelated data, causing all sorts of damage.
+ *
+ * Which means delayed successful completion,
+ * especially for READ requests,
+ * is a reason to panic().
+ *
+ * We assume that a delayed *error* completion is OK,
+ * though we still will complain noisily about it.
+ */
+ if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
+
+ if (!error)
+ panic("possible random memory corruption caused by delayed completion of aborted local request\n");
+ }
+
+ /* to avoid recursion in __req_mod */
+ if (unlikely(error)) {
+ if (bio->bi_rw & REQ_DISCARD)
+ what = (error == -EOPNOTSUPP)
+ ? DISCARD_COMPLETED_NOTSUPP
+ : DISCARD_COMPLETED_WITH_ERROR;
+ else
+ what = (bio_data_dir(bio) == WRITE)
+ ? WRITE_COMPLETED_WITH_ERROR
+ : (bio_rw(bio) == READ)
+ ? READ_COMPLETED_WITH_ERROR
+ : READ_AHEAD_COMPLETED_WITH_ERROR;
+ } else
+ what = COMPLETED_OK;
+
+ bio_put(req->private_bio);
+ req->private_bio = ERR_PTR(error);
+
+ /* not req_mod(), we need irqsave here! */
+ spin_lock_irqsave(&device->resource->req_lock, flags);
+ __req_mod(req, what, &m);
+ spin_unlock_irqrestore(&device->resource->req_lock, flags);
+ put_ldev(device);
+
+ if (m.bio)
+ complete_master_bio(device, &m);
+}
+
+void drbd_csum_ee(struct crypto_hash *tfm, struct drbd_peer_request *peer_req, void *digest)
+{
+ struct hash_desc desc;
+ struct scatterlist sg;
+ struct page *page = peer_req->pages;
+ struct page *tmp;
+ unsigned len;
+
+ desc.tfm = tfm;
+ desc.flags = 0;
+
+ sg_init_table(&sg, 1);
+ crypto_hash_init(&desc);
+
+ while ((tmp = page_chain_next(page))) {
+ /* all but the last page will be fully used */
+ sg_set_page(&sg, page, PAGE_SIZE, 0);
+ crypto_hash_update(&desc, &sg, sg.length);
+ page = tmp;
+ }
+ /* and now the last, possibly only partially used page */
+ len = peer_req->i.size & (PAGE_SIZE - 1);
+ sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
+ crypto_hash_update(&desc, &sg, sg.length);
+ crypto_hash_final(&desc, digest);
+}
+
+void drbd_csum_bio(struct crypto_hash *tfm, struct bio *bio, void *digest)
+{
+ struct hash_desc desc;
+ struct scatterlist sg;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+
+ desc.tfm = tfm;
+ desc.flags = 0;
+
+ sg_init_table(&sg, 1);
+ crypto_hash_init(&desc);
+
+ bio_for_each_segment(bvec, bio, iter) {
+ sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
+ crypto_hash_update(&desc, &sg, sg.length);
+ }
+ crypto_hash_final(&desc, digest);
+}
+
+/* MAYBE merge common code with w_e_end_ov_req */
+static int w_e_send_csum(struct drbd_work *w, int cancel)
+{
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ int digest_size;
+ void *digest;
+ int err = 0;
+
+ if (unlikely(cancel))
+ goto out;
+
+ if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
+ goto out;
+
+ digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm);
+ digest = kmalloc(digest_size, GFP_NOIO);
+ if (digest) {
+ sector_t sector = peer_req->i.sector;
+ unsigned int size = peer_req->i.size;
+ drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
+ /* Free peer_req and pages before send.
+ * In case we block on congestion, we could otherwise run into
+ * some distributed deadlock, if the other side blocks on
+ * congestion as well, because our receiver blocks in
+ * drbd_alloc_pages due to pp_in_use > max_buffers. */
+ drbd_free_peer_req(device, peer_req);
+ peer_req = NULL;
+ inc_rs_pending(device);
+ err = drbd_send_drequest_csum(peer_device, sector, size,
+ digest, digest_size,
+ P_CSUM_RS_REQUEST);
+ kfree(digest);
+ } else {
+ drbd_err(device, "kmalloc() of digest failed.\n");
+ err = -ENOMEM;
+ }
+
+out:
+ if (peer_req)
+ drbd_free_peer_req(device, peer_req);
+
+ if (unlikely(err))
+ drbd_err(device, "drbd_send_drequest(..., csum) failed\n");
+ return err;
+}
+
+#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
+
+static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, int size)
+{
+ struct drbd_device *device = peer_device->device;
+ struct drbd_peer_request *peer_req;
+
+ if (!get_ldev(device))
+ return -EIO;
+
+ /* GFP_TRY, because if there is no memory available right now, this may
+ * be rescheduled for later. It is "only" background resync, after all. */
+ peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
+ size, true /* has real payload */, GFP_TRY);
+ if (!peer_req)
+ goto defer;
+
+ peer_req->w.cb = w_e_send_csum;
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&peer_req->w.list, &device->read_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ atomic_add(size >> 9, &device->rs_sect_ev);
+ if (drbd_submit_peer_request(device, peer_req, READ, DRBD_FAULT_RS_RD) == 0)
+ return 0;
+
+ /* If it failed because of ENOMEM, retry should help. If it failed
+ * because bio_add_page failed (probably broken lower level driver),
+ * retry may or may not help.
+ * If it does not, you may need to force disconnect. */
+ spin_lock_irq(&device->resource->req_lock);
+ list_del(&peer_req->w.list);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ drbd_free_peer_req(device, peer_req);
+defer:
+ put_ldev(device);
+ return -EAGAIN;
+}
+
+int w_resync_timer(struct drbd_work *w, int cancel)
+{
+ struct drbd_device *device =
+ container_of(w, struct drbd_device, resync_work);
+
+ switch (device->state.conn) {
+ case C_VERIFY_S:
+ make_ov_request(device, cancel);
+ break;
+ case C_SYNC_TARGET:
+ make_resync_request(device, cancel);
+ break;
+ }
+
+ return 0;
+}
+
+void resync_timer_fn(unsigned long data)
+{
+ struct drbd_device *device = (struct drbd_device *) data;
+
+ drbd_queue_work_if_unqueued(
+ &first_peer_device(device)->connection->sender_work,
+ &device->resync_work);
+}
+
+static void fifo_set(struct fifo_buffer *fb, int value)
+{
+ int i;
+
+ for (i = 0; i < fb->size; i++)
+ fb->values[i] = value;
+}
+
+static int fifo_push(struct fifo_buffer *fb, int value)
+{
+ int ov;
+
+ ov = fb->values[fb->head_index];
+ fb->values[fb->head_index++] = value;
+
+ if (fb->head_index >= fb->size)
+ fb->head_index = 0;
+
+ return ov;
+}
+
+static void fifo_add_val(struct fifo_buffer *fb, int value)
+{
+ int i;
+
+ for (i = 0; i < fb->size; i++)
+ fb->values[i] += value;
+}
+
+struct fifo_buffer *fifo_alloc(int fifo_size)
+{
+ struct fifo_buffer *fb;
+
+ fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO);
+ if (!fb)
+ return NULL;
+
+ fb->head_index = 0;
+ fb->size = fifo_size;
+ fb->total = 0;
+
+ return fb;
+}
+
+static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
+{
+ struct disk_conf *dc;
+ unsigned int want; /* The number of sectors we want in-flight */
+ int req_sect; /* Number of sectors to request in this turn */
+ int correction; /* Number of sectors more we need in-flight */
+ int cps; /* correction per invocation of drbd_rs_controller() */
+ int steps; /* Number of time steps to plan ahead */
+ int curr_corr;
+ int max_sect;
+ struct fifo_buffer *plan;
+
+ dc = rcu_dereference(device->ldev->disk_conf);
+ plan = rcu_dereference(device->rs_plan_s);
+
+ steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
+
+ if (device->rs_in_flight + sect_in == 0) { /* At start of resync */
+ want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
+ } else { /* normal path */
+ want = dc->c_fill_target ? dc->c_fill_target :
+ sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
+ }
+
+ correction = want - device->rs_in_flight - plan->total;
+
+ /* Plan ahead */
+ cps = correction / steps;
+ fifo_add_val(plan, cps);
+ plan->total += cps * steps;
+
+ /* What we do in this step */
+ curr_corr = fifo_push(plan, 0);
+ plan->total -= curr_corr;
+
+ req_sect = sect_in + curr_corr;
+ if (req_sect < 0)
+ req_sect = 0;
+
+ max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
+ if (req_sect > max_sect)
+ req_sect = max_sect;
+
+ /*
+ drbd_warn(device, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
+ sect_in, device->rs_in_flight, want, correction,
+ steps, cps, device->rs_planed, curr_corr, req_sect);
+ */
+
+ return req_sect;
+}
+
+static int drbd_rs_number_requests(struct drbd_device *device)
+{
+ unsigned int sect_in; /* Number of sectors that came in since the last turn */
+ int number, mxb;
+
+ sect_in = atomic_xchg(&device->rs_sect_in, 0);
+ device->rs_in_flight -= sect_in;
+
+ rcu_read_lock();
+ mxb = drbd_get_max_buffers(device) / 2;
+ if (rcu_dereference(device->rs_plan_s)->size) {
+ number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9);
+ device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
+ } else {
+ device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
+ number = SLEEP_TIME * device->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
+ }
+ rcu_read_unlock();
+
+ /* Don't have more than "max-buffers"/2 in-flight.
+ * Otherwise we may cause the remote site to stall on drbd_alloc_pages(),
+ * potentially causing a distributed deadlock on congestion during
+ * online-verify or (checksum-based) resync, if max-buffers,
+ * socket buffer sizes and resync rate settings are mis-configured. */
+
+ /* note that "number" is in units of "BM_BLOCK_SIZE" (which is 4k),
+ * mxb (as used here, and in drbd_alloc_pages on the peer) is
+ * "number of pages" (typically also 4k),
+ * but "rs_in_flight" is in "sectors" (512 Byte). */
+ if (mxb - device->rs_in_flight/8 < number)
+ number = mxb - device->rs_in_flight/8;
+
+ return number;
+}
+
+static int make_resync_request(struct drbd_device *const device, int cancel)
+{
+ struct drbd_peer_device *const peer_device = first_peer_device(device);
+ struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
+ unsigned long bit;
+ sector_t sector;
+ const sector_t capacity = drbd_get_capacity(device->this_bdev);
+ int max_bio_size;
+ int number, rollback_i, size;
+ int align, requeue = 0;
+ int i = 0;
+
+ if (unlikely(cancel))
+ return 0;
+
+ if (device->rs_total == 0) {
+ /* empty resync? */
+ drbd_resync_finished(device);
+ return 0;
+ }
+
+ if (!get_ldev(device)) {
+ /* Since we only need to access device->rsync a
+ get_ldev_if_state(device,D_FAILED) would be sufficient, but
+ to continue resync with a broken disk makes no sense at
+ all */
+ drbd_err(device, "Disk broke down during resync!\n");
+ return 0;
+ }
+
+ max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
+ number = drbd_rs_number_requests(device);
+ if (number <= 0)
+ goto requeue;
+
+ for (i = 0; i < number; i++) {
+ /* Stop generating RS requests when half of the send buffer is filled,
+ * but notify TCP that we'd like to have more space. */
+ mutex_lock(&connection->data.mutex);
+ if (connection->data.socket) {
+ struct sock *sk = connection->data.socket->sk;
+ int queued = sk->sk_wmem_queued;
+ int sndbuf = sk->sk_sndbuf;
+ if (queued > sndbuf / 2) {
+ requeue = 1;
+ if (sk->sk_socket)
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ }
+ } else
+ requeue = 1;
+ mutex_unlock(&connection->data.mutex);
+ if (requeue)
+ goto requeue;
+
+next_sector:
+ size = BM_BLOCK_SIZE;
+ bit = drbd_bm_find_next(device, device->bm_resync_fo);
+
+ if (bit == DRBD_END_OF_BITMAP) {
+ device->bm_resync_fo = drbd_bm_bits(device);
+ put_ldev(device);
+ return 0;
+ }
+
+ sector = BM_BIT_TO_SECT(bit);
+
+ if (drbd_try_rs_begin_io(device, sector)) {
+ device->bm_resync_fo = bit;
+ goto requeue;
+ }
+ device->bm_resync_fo = bit + 1;
+
+ if (unlikely(drbd_bm_test_bit(device, bit) == 0)) {
+ drbd_rs_complete_io(device, sector);
+ goto next_sector;
+ }
+
+#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
+ /* try to find some adjacent bits.
+ * we stop if we have already the maximum req size.
+ *
+ * Additionally always align bigger requests, in order to
+ * be prepared for all stripe sizes of software RAIDs.
+ */
+ align = 1;
+ rollback_i = i;
+ while (i < number) {
+ if (size + BM_BLOCK_SIZE > max_bio_size)
+ break;
+
+ /* Be always aligned */
+ if (sector & ((1<<(align+3))-1))
+ break;
+
+ /* do not cross extent boundaries */
+ if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
+ break;
+ /* now, is it actually dirty, after all?
+ * caution, drbd_bm_test_bit is tri-state for some
+ * obscure reason; ( b == 0 ) would get the out-of-band
+ * only accidentally right because of the "oddly sized"
+ * adjustment below */
+ if (drbd_bm_test_bit(device, bit+1) != 1)
+ break;
+ bit++;
+ size += BM_BLOCK_SIZE;
+ if ((BM_BLOCK_SIZE << align) <= size)
+ align++;
+ i++;
+ }
+ /* if we merged some,
+ * reset the offset to start the next drbd_bm_find_next from */
+ if (size > BM_BLOCK_SIZE)
+ device->bm_resync_fo = bit + 1;
+#endif
+
+ /* adjust very last sectors, in case we are oddly sized */
+ if (sector + (size>>9) > capacity)
+ size = (capacity-sector)<<9;
+
+ if (device->use_csums) {
+ switch (read_for_csum(peer_device, sector, size)) {
+ case -EIO: /* Disk failure */
+ put_ldev(device);
+ return -EIO;
+ case -EAGAIN: /* allocation failed, or ldev busy */
+ drbd_rs_complete_io(device, sector);
+ device->bm_resync_fo = BM_SECT_TO_BIT(sector);
+ i = rollback_i;
+ goto requeue;
+ case 0:
+ /* everything ok */
+ break;
+ default:
+ BUG();
+ }
+ } else {
+ int err;
+
+ inc_rs_pending(device);
+ err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST,
+ sector, size, ID_SYNCER);
+ if (err) {
+ drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
+ dec_rs_pending(device);
+ put_ldev(device);
+ return err;
+ }
+ }
+ }
+
+ if (device->bm_resync_fo >= drbd_bm_bits(device)) {
+ /* last syncer _request_ was sent,
+ * but the P_RS_DATA_REPLY not yet received. sync will end (and
+ * next sync group will resume), as soon as we receive the last
+ * resync data block, and the last bit is cleared.
+ * until then resync "work" is "inactive" ...
+ */
+ put_ldev(device);
+ return 0;
+ }
+
+ requeue:
+ device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
+ mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
+ put_ldev(device);
+ return 0;
+}
+
+static int make_ov_request(struct drbd_device *device, int cancel)
+{
+ int number, i, size;
+ sector_t sector;
+ const sector_t capacity = drbd_get_capacity(device->this_bdev);
+ bool stop_sector_reached = false;
+
+ if (unlikely(cancel))
+ return 1;
+
+ number = drbd_rs_number_requests(device);
+
+ sector = device->ov_position;
+ for (i = 0; i < number; i++) {
+ if (sector >= capacity)
+ return 1;
+
+ /* We check for "finished" only in the reply path:
+ * w_e_end_ov_reply().
+ * We need to send at least one request out. */
+ stop_sector_reached = i > 0
+ && verify_can_do_stop_sector(device)
+ && sector >= device->ov_stop_sector;
+ if (stop_sector_reached)
+ break;
+
+ size = BM_BLOCK_SIZE;
+
+ if (drbd_try_rs_begin_io(device, sector)) {
+ device->ov_position = sector;
+ goto requeue;
+ }
+
+ if (sector + (size>>9) > capacity)
+ size = (capacity-sector)<<9;
+
+ inc_rs_pending(device);
+ if (drbd_send_ov_request(first_peer_device(device), sector, size)) {
+ dec_rs_pending(device);
+ return 0;
+ }
+ sector += BM_SECT_PER_BIT;
+ }
+ device->ov_position = sector;
+
+ requeue:
+ device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
+ if (i == 0 || !stop_sector_reached)
+ mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
+ return 1;
+}
+
+int w_ov_finished(struct drbd_work *w, int cancel)
+{
+ struct drbd_device_work *dw =
+ container_of(w, struct drbd_device_work, w);
+ struct drbd_device *device = dw->device;
+ kfree(dw);
+ ov_out_of_sync_print(device);
+ drbd_resync_finished(device);
+
+ return 0;
+}
+
+static int w_resync_finished(struct drbd_work *w, int cancel)
+{
+ struct drbd_device_work *dw =
+ container_of(w, struct drbd_device_work, w);
+ struct drbd_device *device = dw->device;
+ kfree(dw);
+
+ drbd_resync_finished(device);
+
+ return 0;
+}
+
+static void ping_peer(struct drbd_device *device)
+{
+ struct drbd_connection *connection = first_peer_device(device)->connection;
+
+ clear_bit(GOT_PING_ACK, &connection->flags);
+ request_ping(connection);
+ wait_event(connection->ping_wait,
+ test_bit(GOT_PING_ACK, &connection->flags) || device->state.conn < C_CONNECTED);
+}
+
+int drbd_resync_finished(struct drbd_device *device)
+{
+ unsigned long db, dt, dbdt;
+ unsigned long n_oos;
+ union drbd_state os, ns;
+ struct drbd_device_work *dw;
+ char *khelper_cmd = NULL;
+ int verify_done = 0;
+
+ /* Remove all elements from the resync LRU. Since future actions
+ * might set bits in the (main) bitmap, then the entries in the
+ * resync LRU would be wrong. */
+ if (drbd_rs_del_all(device)) {
+ /* In case this is not possible now, most probably because
+ * there are P_RS_DATA_REPLY Packets lingering on the worker's
+ * queue (or even the read operations for those packets
+ * is not finished by now). Retry in 100ms. */
+
+ schedule_timeout_interruptible(HZ / 10);
+ dw = kmalloc(sizeof(struct drbd_device_work), GFP_ATOMIC);
+ if (dw) {
+ dw->w.cb = w_resync_finished;
+ dw->device = device;
+ drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+ &dw->w);
+ return 1;
+ }
+ drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n");
+ }
+
+ dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
+ if (dt <= 0)
+ dt = 1;
+
+ db = device->rs_total;
+ /* adjust for verify start and stop sectors, respective reached position */
+ if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+ db -= device->ov_left;
+
+ dbdt = Bit2KB(db/dt);
+ device->rs_paused /= HZ;
+
+ if (!get_ldev(device))
+ goto out;
+
+ ping_peer(device);
+
+ spin_lock_irq(&device->resource->req_lock);
+ os = drbd_read_state(device);
+
+ verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);
+
+ /* This protects us against multiple calls (that can happen in the presence
+ of application IO), and against connectivity loss just before we arrive here. */
+ if (os.conn <= C_CONNECTED)
+ goto out_unlock;
+
+ ns = os;
+ ns.conn = C_CONNECTED;
+
+ drbd_info(device, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
+ verify_done ? "Online verify" : "Resync",
+ dt + device->rs_paused, device->rs_paused, dbdt);
+
+ n_oos = drbd_bm_total_weight(device);
+
+ if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
+ if (n_oos) {
+ drbd_alert(device, "Online verify found %lu %dk block out of sync!\n",
+ n_oos, Bit2KB(1));
+ khelper_cmd = "out-of-sync";
+ }
+ } else {
+ D_ASSERT(device, (n_oos - device->rs_failed) == 0);
+
+ if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
+ khelper_cmd = "after-resync-target";
+
+ if (device->use_csums && device->rs_total) {
+ const unsigned long s = device->rs_same_csum;
+ const unsigned long t = device->rs_total;
+ const int ratio =
+ (t == 0) ? 0 :
+ (t < 100000) ? ((s*100)/t) : (s/(t/100));
+ drbd_info(device, "%u %% had equal checksums, eliminated: %luK; "
+ "transferred %luK total %luK\n",
+ ratio,
+ Bit2KB(device->rs_same_csum),
+ Bit2KB(device->rs_total - device->rs_same_csum),
+ Bit2KB(device->rs_total));
+ }
+ }
+
+ if (device->rs_failed) {
+ drbd_info(device, " %lu failed blocks\n", device->rs_failed);
+
+ if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
+ ns.disk = D_INCONSISTENT;
+ ns.pdsk = D_UP_TO_DATE;
+ } else {
+ ns.disk = D_UP_TO_DATE;
+ ns.pdsk = D_INCONSISTENT;
+ }
+ } else {
+ ns.disk = D_UP_TO_DATE;
+ ns.pdsk = D_UP_TO_DATE;
+
+ if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
+ if (device->p_uuid) {
+ int i;
+ for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
+ _drbd_uuid_set(device, i, device->p_uuid[i]);
+ drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_CURRENT]);
+ _drbd_uuid_set(device, UI_CURRENT, device->p_uuid[UI_CURRENT]);
+ } else {
+ drbd_err(device, "device->p_uuid is NULL! BUG\n");
+ }
+ }
+
+ if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
+ /* for verify runs, we don't update uuids here,
+ * so there would be nothing to report. */
+ drbd_uuid_set_bm(device, 0UL);
+ drbd_print_uuids(device, "updated UUIDs");
+ if (device->p_uuid) {
+ /* Now the two UUID sets are equal, update what we
+ * know of the peer. */
+ int i;
+ for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
+ device->p_uuid[i] = device->ldev->md.uuid[i];
+ }
+ }
+ }
+
+ _drbd_set_state(device, ns, CS_VERBOSE, NULL);
+out_unlock:
+ spin_unlock_irq(&device->resource->req_lock);
+ put_ldev(device);
+out:
+ device->rs_total = 0;
+ device->rs_failed = 0;
+ device->rs_paused = 0;
+
+ /* reset start sector, if we reached end of device */
+ if (verify_done && device->ov_left == 0)
+ device->ov_start_sector = 0;
+
+ drbd_md_sync(device);
+
+ if (khelper_cmd)
+ drbd_khelper(device, khelper_cmd);
+
+ return 1;
+}
+
+/* helper */
+static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req)
+{
+ if (drbd_peer_req_has_active_page(peer_req)) {
+ /* This might happen if sendpage() has not finished */
+ int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT;
+ atomic_add(i, &device->pp_in_use_by_net);
+ atomic_sub(i, &device->pp_in_use);
+ spin_lock_irq(&device->resource->req_lock);
+ list_add_tail(&peer_req->w.list, &device->net_ee);
+ spin_unlock_irq(&device->resource->req_lock);
+ wake_up(&drbd_pp_wait);
+ } else
+ drbd_free_peer_req(device, peer_req);
+}
+
+/**
+ * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
+ * @device: DRBD device.
+ * @w: work object.
+ * @cancel: The connection will be closed anyways
+ */
+int w_e_end_data_req(struct drbd_work *w, int cancel)
+{
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ int err;
+
+ if (unlikely(cancel)) {
+ drbd_free_peer_req(device, peer_req);
+ dec_unacked(device);
+ return 0;
+ }
+
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ err = drbd_send_block(peer_device, P_DATA_REPLY, peer_req);
+ } else {
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "Sending NegDReply. sector=%llus.\n",
+ (unsigned long long)peer_req->i.sector);
+
+ err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req);
+ }
+
+ dec_unacked(device);
+
+ move_to_net_ee_or_free(device, peer_req);
+
+ if (unlikely(err))
+ drbd_err(device, "drbd_send_block() failed\n");
+ return err;
+}
+
+/**
+ * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
+ * @w: work object.
+ * @cancel: The connection will be closed anyways
+ */
+int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
+{
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ int err;
+
+ if (unlikely(cancel)) {
+ drbd_free_peer_req(device, peer_req);
+ dec_unacked(device);
+ return 0;
+ }
+
+ if (get_ldev_if_state(device, D_FAILED)) {
+ drbd_rs_complete_io(device, peer_req->i.sector);
+ put_ldev(device);
+ }
+
+ if (device->state.conn == C_AHEAD) {
+ err = drbd_send_ack(peer_device, P_RS_CANCEL, peer_req);
+ } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ if (likely(device->state.pdsk >= D_INCONSISTENT)) {
+ inc_rs_pending(device);
+ err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
+ } else {
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "Not sending RSDataReply, "
+ "partner DISKLESS!\n");
+ err = 0;
+ }
+ } else {
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "Sending NegRSDReply. sector %llus.\n",
+ (unsigned long long)peer_req->i.sector);
+
+ err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
+
+ /* update resync data with failure */
+ drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size);
+ }
+
+ dec_unacked(device);
+
+ move_to_net_ee_or_free(device, peer_req);
+
+ if (unlikely(err))
+ drbd_err(device, "drbd_send_block() failed\n");
+ return err;
+}
+
+int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
+{
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ struct digest_info *di;
+ int digest_size;
+ void *digest = NULL;
+ int err, eq = 0;
+
+ if (unlikely(cancel)) {
+ drbd_free_peer_req(device, peer_req);
+ dec_unacked(device);
+ return 0;
+ }
+
+ if (get_ldev(device)) {
+ drbd_rs_complete_io(device, peer_req->i.sector);
+ put_ldev(device);
+ }
+
+ di = peer_req->digest;
+
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ /* quick hack to try to avoid a race against reconfiguration.
+ * a real fix would be much more involved,
+ * introducing more locking mechanisms */
+ if (peer_device->connection->csums_tfm) {
+ digest_size = crypto_hash_digestsize(peer_device->connection->csums_tfm);
+ D_ASSERT(device, digest_size == di->digest_size);
+ digest = kmalloc(digest_size, GFP_NOIO);
+ }
+ if (digest) {
+ drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
+ eq = !memcmp(digest, di->digest, digest_size);
+ kfree(digest);
+ }
+
+ if (eq) {
+ drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size);
+ /* rs_same_csums unit is BM_BLOCK_SIZE */
+ device->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
+ err = drbd_send_ack(peer_device, P_RS_IS_IN_SYNC, peer_req);
+ } else {
+ inc_rs_pending(device);
+ peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
+ peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
+ kfree(di);
+ err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
+ }
+ } else {
+ err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
+ if (__ratelimit(&drbd_ratelimit_state))
+ drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
+ }
+
+ dec_unacked(device);
+ move_to_net_ee_or_free(device, peer_req);
+
+ if (unlikely(err))
+ drbd_err(device, "drbd_send_block/ack() failed\n");
+ return err;
+}
+
+int w_e_end_ov_req(struct drbd_work *w, int cancel)
+{
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ sector_t sector = peer_req->i.sector;
+ unsigned int size = peer_req->i.size;
+ int digest_size;
+ void *digest;
+ int err = 0;
+
+ if (unlikely(cancel))
+ goto out;
+
+ digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm);
+ digest = kmalloc(digest_size, GFP_NOIO);
+ if (!digest) {
+ err = 1; /* terminate the connection in case the allocation failed */
+ goto out;
+ }
+
+ if (likely(!(peer_req->flags & EE_WAS_ERROR)))
+ drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
+ else
+ memset(digest, 0, digest_size);
+
+ /* Free e and pages before send.
+ * In case we block on congestion, we could otherwise run into
+ * some distributed deadlock, if the other side blocks on
+ * congestion as well, because our receiver blocks in
+ * drbd_alloc_pages due to pp_in_use > max_buffers. */
+ drbd_free_peer_req(device, peer_req);
+ peer_req = NULL;
+ inc_rs_pending(device);
+ err = drbd_send_drequest_csum(peer_device, sector, size, digest, digest_size, P_OV_REPLY);
+ if (err)
+ dec_rs_pending(device);
+ kfree(digest);
+
+out:
+ if (peer_req)
+ drbd_free_peer_req(device, peer_req);
+ dec_unacked(device);
+ return err;
+}
+
+void drbd_ov_out_of_sync_found(struct drbd_device *device, sector_t sector, int size)
+{
+ if (device->ov_last_oos_start + device->ov_last_oos_size == sector) {
+ device->ov_last_oos_size += size>>9;
+ } else {
+ device->ov_last_oos_start = sector;
+ device->ov_last_oos_size = size>>9;
+ }
+ drbd_set_out_of_sync(device, sector, size);
+}
+
+int w_e_end_ov_reply(struct drbd_work *w, int cancel)
+{
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_peer_device *peer_device = peer_req->peer_device;
+ struct drbd_device *device = peer_device->device;
+ struct digest_info *di;
+ void *digest;
+ sector_t sector = peer_req->i.sector;
+ unsigned int size = peer_req->i.size;
+ int digest_size;
+ int err, eq = 0;
+ bool stop_sector_reached = false;
+
+ if (unlikely(cancel)) {
+ drbd_free_peer_req(device, peer_req);
+ dec_unacked(device);
+ return 0;
+ }
+
+ /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
+ * the resync lru has been cleaned up already */
+ if (get_ldev(device)) {
+ drbd_rs_complete_io(device, peer_req->i.sector);
+ put_ldev(device);
+ }
+
+ di = peer_req->digest;
+
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ digest_size = crypto_hash_digestsize(peer_device->connection->verify_tfm);
+ digest = kmalloc(digest_size, GFP_NOIO);
+ if (digest) {
+ drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
+
+ D_ASSERT(device, digest_size == di->digest_size);
+ eq = !memcmp(digest, di->digest, digest_size);
+ kfree(digest);
+ }
+ }
+
+ /* Free peer_req and pages before send.
+ * In case we block on congestion, we could otherwise run into
+ * some distributed deadlock, if the other side blocks on
+ * congestion as well, because our receiver blocks in
+ * drbd_alloc_pages due to pp_in_use > max_buffers. */
+ drbd_free_peer_req(device, peer_req);
+ if (!eq)
+ drbd_ov_out_of_sync_found(device, sector, size);
+ else
+ ov_out_of_sync_print(device);
+
+ err = drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size,
+ eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
+
+ dec_unacked(device);
+
+ --device->ov_left;
+
+ /* let's advance progress step marks only for every other megabyte */
+ if ((device->ov_left & 0x200) == 0x200)
+ drbd_advance_rs_marks(device, device->ov_left);
+
+ stop_sector_reached = verify_can_do_stop_sector(device) &&
+ (sector + (size>>9)) >= device->ov_stop_sector;
+
+ if (device->ov_left == 0 || stop_sector_reached) {
+ ov_out_of_sync_print(device);
+ drbd_resync_finished(device);
+ }
+
+ return err;
+}
+
+/* FIXME
+ * We need to track the number of pending barrier acks,
+ * and to be able to wait for them.
+ * See also comment in drbd_adm_attach before drbd_suspend_io.
+ */
+static int drbd_send_barrier(struct drbd_connection *connection)
+{
+ struct p_barrier *p;
+ struct drbd_socket *sock;
+
+ sock = &connection->data;
+ p = conn_prepare_command(connection, sock);
+ if (!p)
+ return -EIO;
+ p->barrier = connection->send.current_epoch_nr;
+ p->pad = 0;
+ connection->send.current_epoch_writes = 0;
+
+ return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
+}
+
+int w_send_write_hint(struct drbd_work *w, int cancel)
+{
+ struct drbd_device *device =
+ container_of(w, struct drbd_device, unplug_work);
+ struct drbd_socket *sock;
+
+ if (cancel)
+ return 0;
+ sock = &first_peer_device(device)->connection->data;
+ if (!drbd_prepare_command(first_peer_device(device), sock))
+ return -EIO;
+ return drbd_send_command(first_peer_device(device), sock, P_UNPLUG_REMOTE, 0, NULL, 0);
+}
+
+static void re_init_if_first_write(struct drbd_connection *connection, unsigned int epoch)
+{
+ if (!connection->send.seen_any_write_yet) {
+ connection->send.seen_any_write_yet = true;
+ connection->send.current_epoch_nr = epoch;
+ connection->send.current_epoch_writes = 0;
+ }
+}
+
+static void maybe_send_barrier(struct drbd_connection *connection, unsigned int epoch)
+{
+ /* re-init if first write on this connection */
+ if (!connection->send.seen_any_write_yet)
+ return;
+ if (connection->send.current_epoch_nr != epoch) {
+ if (connection->send.current_epoch_writes)
+ drbd_send_barrier(connection);
+ connection->send.current_epoch_nr = epoch;
+ }
+}
+
+int w_send_out_of_sync(struct drbd_work *w, int cancel)
+{
+ struct drbd_request *req = container_of(w, struct drbd_request, w);
+ struct drbd_device *device = req->device;
+ struct drbd_peer_device *const peer_device = first_peer_device(device);
+ struct drbd_connection *const connection = peer_device->connection;
+ int err;
+
+ if (unlikely(cancel)) {
+ req_mod(req, SEND_CANCELED);
+ return 0;
+ }
+ req->pre_send_jif = jiffies;
+
+ /* this time, no connection->send.current_epoch_writes++;
+ * If it was sent, it was the closing barrier for the last
+ * replicated epoch, before we went into AHEAD mode.
+ * No more barriers will be sent, until we leave AHEAD mode again. */
+ maybe_send_barrier(connection, req->epoch);
+
+ err = drbd_send_out_of_sync(peer_device, req);
+ req_mod(req, OOS_HANDED_TO_NETWORK);
+
+ return err;
+}
+
+/**
+ * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
+ * @w: work object.
+ * @cancel: The connection will be closed anyways
+ */
+int w_send_dblock(struct drbd_work *w, int cancel)
+{
+ struct drbd_request *req = container_of(w, struct drbd_request, w);
+ struct drbd_device *device = req->device;
+ struct drbd_peer_device *const peer_device = first_peer_device(device);
+ struct drbd_connection *connection = peer_device->connection;
+ int err;
+
+ if (unlikely(cancel)) {
+ req_mod(req, SEND_CANCELED);
+ return 0;
+ }
+ req->pre_send_jif = jiffies;
+
+ re_init_if_first_write(connection, req->epoch);
+ maybe_send_barrier(connection, req->epoch);
+ connection->send.current_epoch_writes++;
+
+ err = drbd_send_dblock(peer_device, req);
+ req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
+
+ return err;
+}
+
+/**
+ * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
+ * @w: work object.
+ * @cancel: The connection will be closed anyways
+ */
+int w_send_read_req(struct drbd_work *w, int cancel)
+{
+ struct drbd_request *req = container_of(w, struct drbd_request, w);
+ struct drbd_device *device = req->device;
+ struct drbd_peer_device *const peer_device = first_peer_device(device);
+ struct drbd_connection *connection = peer_device->connection;
+ int err;
+
+ if (unlikely(cancel)) {
+ req_mod(req, SEND_CANCELED);
+ return 0;
+ }
+ req->pre_send_jif = jiffies;
+
+ /* Even read requests may close a write epoch,
+ * if there was any yet. */
+ maybe_send_barrier(connection, req->epoch);
+
+ err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size,
+ (unsigned long)req);
+
+ req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
+
+ return err;
+}
+
+int w_restart_disk_io(struct drbd_work *w, int cancel)
+{
+ struct drbd_request *req = container_of(w, struct drbd_request, w);
+ struct drbd_device *device = req->device;
+
+ if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
+ drbd_al_begin_io(device, &req->i);
+
+ drbd_req_make_private_bio(req, req->master_bio);
+ req->private_bio->bi_bdev = device->ldev->backing_bdev;
+ generic_make_request(req->private_bio);
+
+ return 0;
+}
+
+static int _drbd_may_sync_now(struct drbd_device *device)
+{
+ struct drbd_device *odev = device;
+ int resync_after;
+
+ while (1) {
+ if (!odev->ldev || odev->state.disk == D_DISKLESS)
+ return 1;
+ rcu_read_lock();
+ resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
+ rcu_read_unlock();
+ if (resync_after == -1)
+ return 1;
+ odev = minor_to_device(resync_after);
+ if (!odev)
+ return 1;
+ if ((odev->state.conn >= C_SYNC_SOURCE &&
+ odev->state.conn <= C_PAUSED_SYNC_T) ||
+ odev->state.aftr_isp || odev->state.peer_isp ||
+ odev->state.user_isp)
+ return 0;
+ }
+}
+
+/**
+ * _drbd_pause_after() - Pause resync on all devices that may not resync now
+ * @device: DRBD device.
+ *
+ * Called from process context only (admin command and after_state_ch).
+ */
+static int _drbd_pause_after(struct drbd_device *device)
+{
+ struct drbd_device *odev;
+ int i, rv = 0;
+
+ rcu_read_lock();
+ idr_for_each_entry(&drbd_devices, odev, i) {
+ if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
+ continue;
+ if (!_drbd_may_sync_now(odev))
+ rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
+ != SS_NOTHING_TO_DO);
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+/**
+ * _drbd_resume_next() - Resume resync on all devices that may resync now
+ * @device: DRBD device.
+ *
+ * Called from process context only (admin command and worker).
+ */
+static int _drbd_resume_next(struct drbd_device *device)
+{
+ struct drbd_device *odev;
+ int i, rv = 0;
+
+ rcu_read_lock();
+ idr_for_each_entry(&drbd_devices, odev, i) {
+ if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
+ continue;
+ if (odev->state.aftr_isp) {
+ if (_drbd_may_sync_now(odev))
+ rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0),
+ CS_HARD, NULL)
+ != SS_NOTHING_TO_DO) ;
+ }
+ }
+ rcu_read_unlock();
+ return rv;
+}
+
+void resume_next_sg(struct drbd_device *device)
+{
+ write_lock_irq(&global_state_lock);
+ _drbd_resume_next(device);
+ write_unlock_irq(&global_state_lock);
+}
+
+void suspend_other_sg(struct drbd_device *device)
+{
+ write_lock_irq(&global_state_lock);
+ _drbd_pause_after(device);
+ write_unlock_irq(&global_state_lock);
+}
+
+/* caller must hold global_state_lock */
+enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor)
+{
+ struct drbd_device *odev;
+ int resync_after;
+
+ if (o_minor == -1)
+ return NO_ERROR;
+ if (o_minor < -1 || o_minor > MINORMASK)
+ return ERR_RESYNC_AFTER;
+
+ /* check for loops */
+ odev = minor_to_device(o_minor);
+ while (1) {
+ if (odev == device)
+ return ERR_RESYNC_AFTER_CYCLE;
+
+ /* You are free to depend on diskless, non-existing,
+ * or not yet/no longer existing minors.
+ * We only reject dependency loops.
+ * We cannot follow the dependency chain beyond a detached or
+ * missing minor.
+ */
+ if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
+ return NO_ERROR;
+
+ rcu_read_lock();
+ resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
+ rcu_read_unlock();
+ /* dependency chain ends here, no cycles. */
+ if (resync_after == -1)
+ return NO_ERROR;
+
+ /* follow the dependency chain */
+ odev = minor_to_device(resync_after);
+ }
+}
+
+/* caller must hold global_state_lock */
+void drbd_resync_after_changed(struct drbd_device *device)
+{
+ int changes;
+
+ do {
+ changes = _drbd_pause_after(device);
+ changes |= _drbd_resume_next(device);
+ } while (changes);
+}
+
+void drbd_rs_controller_reset(struct drbd_device *device)
+{
+ struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk;
+ struct fifo_buffer *plan;
+
+ atomic_set(&device->rs_sect_in, 0);
+ atomic_set(&device->rs_sect_ev, 0);
+ device->rs_in_flight = 0;
+ device->rs_last_events =
+ (int)part_stat_read(&disk->part0, sectors[0]) +
+ (int)part_stat_read(&disk->part0, sectors[1]);
+
+ /* Updating the RCU protected object in place is necessary since
+ this function gets called from atomic context.
+ It is valid since all other updates also lead to an completely
+ empty fifo */
+ rcu_read_lock();
+ plan = rcu_dereference(device->rs_plan_s);
+ plan->total = 0;
+ fifo_set(plan, 0);
+ rcu_read_unlock();
+}
+
+void start_resync_timer_fn(unsigned long data)
+{
+ struct drbd_device *device = (struct drbd_device *) data;
+ drbd_device_post_work(device, RS_START);
+}
+
+static void do_start_resync(struct drbd_device *device)
+{
+ if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) {
+ drbd_warn(device, "postponing start_resync ...\n");
+ device->start_resync_timer.expires = jiffies + HZ/10;
+ add_timer(&device->start_resync_timer);
+ return;
+ }
+
+ drbd_start_resync(device, C_SYNC_SOURCE);
+ clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
+}
+
+static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device)
+{
+ bool csums_after_crash_only;
+ rcu_read_lock();
+ csums_after_crash_only = rcu_dereference(connection->net_conf)->csums_after_crash_only;
+ rcu_read_unlock();
+ return connection->agreed_pro_version >= 89 && /* supported? */
+ connection->csums_tfm && /* configured? */
+ (csums_after_crash_only == 0 /* use for each resync? */
+ || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */
+}
+
+/**
+ * drbd_start_resync() - Start the resync process
+ * @device: DRBD device.
+ * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET
+ *
+ * This function might bring you directly into one of the
+ * C_PAUSED_SYNC_* states.
+ */
+void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
+{
+ struct drbd_peer_device *peer_device = first_peer_device(device);
+ struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+ union drbd_state ns;
+ int r;
+
+ if (device->state.conn >= C_SYNC_SOURCE && device->state.conn < C_AHEAD) {
+ drbd_err(device, "Resync already running!\n");
+ return;
+ }
+
+ if (!test_bit(B_RS_H_DONE, &device->flags)) {
+ if (side == C_SYNC_TARGET) {
+ /* Since application IO was locked out during C_WF_BITMAP_T and
+ C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
+ we check that we might make the data inconsistent. */
+ r = drbd_khelper(device, "before-resync-target");
+ r = (r >> 8) & 0xff;
+ if (r > 0) {
+ drbd_info(device, "before-resync-target handler returned %d, "
+ "dropping connection.\n", r);
+ conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+ return;
+ }
+ } else /* C_SYNC_SOURCE */ {
+ r = drbd_khelper(device, "before-resync-source");
+ r = (r >> 8) & 0xff;
+ if (r > 0) {
+ if (r == 3) {
+ drbd_info(device, "before-resync-source handler returned %d, "
+ "ignoring. Old userland tools?", r);
+ } else {
+ drbd_info(device, "before-resync-source handler returned %d, "
+ "dropping connection.\n", r);
+ conn_request_state(connection,
+ NS(conn, C_DISCONNECTING), CS_HARD);
+ return;
+ }
+ }
+ }
+ }
+
+ if (current == connection->worker.task) {
+ /* The worker should not sleep waiting for state_mutex,
+ that can take long */
+ if (!mutex_trylock(device->state_mutex)) {
+ set_bit(B_RS_H_DONE, &device->flags);
+ device->start_resync_timer.expires = jiffies + HZ/5;
+ add_timer(&device->start_resync_timer);
+ return;
+ }
+ } else {
+ mutex_lock(device->state_mutex);
+ }
+ clear_bit(B_RS_H_DONE, &device->flags);
+
+ /* req_lock: serialize with drbd_send_and_submit() and others
+ * global_state_lock: for stable sync-after dependencies */
+ spin_lock_irq(&device->resource->req_lock);
+ write_lock(&global_state_lock);
+ /* Did some connection breakage or IO error race with us? */
+ if (device->state.conn < C_CONNECTED
+ || !get_ldev_if_state(device, D_NEGOTIATING)) {
+ write_unlock(&global_state_lock);
+ spin_unlock_irq(&device->resource->req_lock);
+ mutex_unlock(device->state_mutex);
+ return;
+ }
+
+ ns = drbd_read_state(device);
+
+ ns.aftr_isp = !_drbd_may_sync_now(device);
+
+ ns.conn = side;
+
+ if (side == C_SYNC_TARGET)
+ ns.disk = D_INCONSISTENT;
+ else /* side == C_SYNC_SOURCE */
+ ns.pdsk = D_INCONSISTENT;
+
+ r = __drbd_set_state(device, ns, CS_VERBOSE, NULL);
+ ns = drbd_read_state(device);
+
+ if (ns.conn < C_CONNECTED)
+ r = SS_UNKNOWN_ERROR;
+
+ if (r == SS_SUCCESS) {
+ unsigned long tw = drbd_bm_total_weight(device);
+ unsigned long now = jiffies;
+ int i;
+
+ device->rs_failed = 0;
+ device->rs_paused = 0;
+ device->rs_same_csum = 0;
+ device->rs_last_sect_ev = 0;
+ device->rs_total = tw;
+ device->rs_start = now;
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ device->rs_mark_left[i] = tw;
+ device->rs_mark_time[i] = now;
+ }
+ _drbd_pause_after(device);
+ /* Forget potentially stale cached per resync extent bit-counts.
+ * Open coded drbd_rs_cancel_all(device), we already have IRQs
+ * disabled, and know the disk state is ok. */
+ spin_lock(&device->al_lock);
+ lc_reset(device->resync);
+ device->resync_locked = 0;
+ device->resync_wenr = LC_FREE;
+ spin_unlock(&device->al_lock);
+ }
+ write_unlock(&global_state_lock);
+ spin_unlock_irq(&device->resource->req_lock);
+
+ if (r == SS_SUCCESS) {
+ wake_up(&device->al_wait); /* for lc_reset() above */
+ /* reset rs_last_bcast when a resync or verify is started,
+ * to deal with potential jiffies wrap. */
+ device->rs_last_bcast = jiffies - HZ;
+
+ drbd_info(device, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
+ drbd_conn_str(ns.conn),
+ (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
+ (unsigned long) device->rs_total);
+ if (side == C_SYNC_TARGET) {
+ device->bm_resync_fo = 0;
+ device->use_csums = use_checksum_based_resync(connection, device);
+ } else {
+ device->use_csums = 0;
+ }
+
+ /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
+ * with w_send_oos, or the sync target will get confused as to
+ * how much bits to resync. We cannot do that always, because for an
+ * empty resync and protocol < 95, we need to do it here, as we call
+ * drbd_resync_finished from here in that case.
+ * We drbd_gen_and_send_sync_uuid here for protocol < 96,
+ * and from after_state_ch otherwise. */
+ if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96)
+ drbd_gen_and_send_sync_uuid(peer_device);
+
+ if (connection->agreed_pro_version < 95 && device->rs_total == 0) {
+ /* This still has a race (about when exactly the peers
+ * detect connection loss) that can lead to a full sync
+ * on next handshake. In 8.3.9 we fixed this with explicit
+ * resync-finished notifications, but the fix
+ * introduces a protocol change. Sleeping for some
+ * time longer than the ping interval + timeout on the
+ * SyncSource, to give the SyncTarget the chance to
+ * detect connection loss, then waiting for a ping
+ * response (implicit in drbd_resync_finished) reduces
+ * the race considerably, but does not solve it. */
+ if (side == C_SYNC_SOURCE) {
+ struct net_conf *nc;
+ int timeo;
+
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
+ rcu_read_unlock();
+ schedule_timeout_interruptible(timeo);
+ }
+ drbd_resync_finished(device);
+ }
+
+ drbd_rs_controller_reset(device);
+ /* ns.conn may already be != device->state.conn,
+ * we may have been paused in between, or become paused until
+ * the timer triggers.
+ * No matter, that is handled in resync_timer_fn() */
+ if (ns.conn == C_SYNC_TARGET)
+ mod_timer(&device->resync_timer, jiffies);
+
+ drbd_md_sync(device);
+ }
+ put_ldev(device);
+ mutex_unlock(device->state_mutex);
+}
+
+static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done)
+{
+ struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
+ device->rs_last_bcast = jiffies;
+
+ if (!get_ldev(device))
+ return;
+
+ drbd_bm_write_lazy(device, 0);
+ if (resync_done && is_sync_state(device->state.conn))
+ drbd_resync_finished(device);
+
+ drbd_bcast_event(device, &sib);
+ /* update timestamp, in case it took a while to write out stuff */
+ device->rs_last_bcast = jiffies;
+ put_ldev(device);
+}
+
+static void drbd_ldev_destroy(struct drbd_device *device)
+{
+ lc_destroy(device->resync);
+ device->resync = NULL;
+ lc_destroy(device->act_log);
+ device->act_log = NULL;
+
+ __acquire(local);
+ drbd_free_ldev(device->ldev);
+ device->ldev = NULL;
+ __release(local);
+
+ clear_bit(GOING_DISKLESS, &device->flags);
+ wake_up(&device->misc_wait);
+}
+
+static void go_diskless(struct drbd_device *device)
+{
+ D_ASSERT(device, device->state.disk == D_FAILED);
+ /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
+ * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
+ * the protected members anymore, though, so once put_ldev reaches zero
+ * again, it will be safe to free them. */
+
+ /* Try to write changed bitmap pages, read errors may have just
+ * set some bits outside the area covered by the activity log.
+ *
+ * If we have an IO error during the bitmap writeout,
+ * we will want a full sync next time, just in case.
+ * (Do we want a specific meta data flag for this?)
+ *
+ * If that does not make it to stable storage either,
+ * we cannot do anything about that anymore.
+ *
+ * We still need to check if both bitmap and ldev are present, we may
+ * end up here after a failed attach, before ldev was even assigned.
+ */
+ if (device->bitmap && device->ldev) {
+ /* An interrupted resync or similar is allowed to recounts bits
+ * while we detach.
+ * Any modifications would not be expected anymore, though.
+ */
+ if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
+ "detach", BM_LOCKED_TEST_ALLOWED)) {
+ if (test_bit(WAS_READ_ERROR, &device->flags)) {
+ drbd_md_set_flag(device, MDF_FULL_SYNC);
+ drbd_md_sync(device);
+ }
+ }
+ }
+
+ drbd_force_state(device, NS(disk, D_DISKLESS));
+}
+
+static int do_md_sync(struct drbd_device *device)
+{
+ drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
+ drbd_md_sync(device);
+ return 0;
+}
+
+/* only called from drbd_worker thread, no locking */
+void __update_timing_details(
+ struct drbd_thread_timing_details *tdp,
+ unsigned int *cb_nr,
+ void *cb,
+ const char *fn, const unsigned int line)
+{
+ unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST;
+ struct drbd_thread_timing_details *td = tdp + i;
+
+ td->start_jif = jiffies;
+ td->cb_addr = cb;
+ td->caller_fn = fn;
+ td->line = line;
+ td->cb_nr = *cb_nr;
+
+ i = (i+1) % DRBD_THREAD_DETAILS_HIST;
+ td = tdp + i;
+ memset(td, 0, sizeof(*td));
+
+ ++(*cb_nr);
+}
+
+static void do_device_work(struct drbd_device *device, const unsigned long todo)
+{
+ if (test_bit(MD_SYNC, &todo))
+ do_md_sync(device);
+ if (test_bit(RS_DONE, &todo) ||
+ test_bit(RS_PROGRESS, &todo))
+ update_on_disk_bitmap(device, test_bit(RS_DONE, &todo));
+ if (test_bit(GO_DISKLESS, &todo))
+ go_diskless(device);
+ if (test_bit(DESTROY_DISK, &todo))
+ drbd_ldev_destroy(device);
+ if (test_bit(RS_START, &todo))
+ do_start_resync(device);
+}
+
+#define DRBD_DEVICE_WORK_MASK \
+ ((1UL << GO_DISKLESS) \
+ |(1UL << DESTROY_DISK) \
+ |(1UL << MD_SYNC) \
+ |(1UL << RS_START) \
+ |(1UL << RS_PROGRESS) \
+ |(1UL << RS_DONE) \
+ )
+
+static unsigned long get_work_bits(unsigned long *flags)
+{
+ unsigned long old, new;
+ do {
+ old = *flags;
+ new = old & ~DRBD_DEVICE_WORK_MASK;
+ } while (cmpxchg(flags, old, new) != old);
+ return old & DRBD_DEVICE_WORK_MASK;
+}
+
+static void do_unqueued_work(struct drbd_connection *connection)
+{
+ struct drbd_peer_device *peer_device;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ unsigned long todo = get_work_bits(&device->flags);
+ if (!todo)
+ continue;
+
+ kref_get(&device->kref);
+ rcu_read_unlock();
+ do_device_work(device, todo);
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+}
+
+static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
+{
+ spin_lock_irq(&queue->q_lock);
+ list_splice_tail_init(&queue->q, work_list);
+ spin_unlock_irq(&queue->q_lock);
+ return !list_empty(work_list);
+}
+
+static void wait_for_work(struct drbd_connection *connection, struct list_head *work_list)
+{
+ DEFINE_WAIT(wait);
+ struct net_conf *nc;
+ int uncork, cork;
+
+ dequeue_work_batch(&connection->sender_work, work_list);
+ if (!list_empty(work_list))
+ return;
+
+ /* Still nothing to do?
+ * Maybe we still need to close the current epoch,
+ * even if no new requests are queued yet.
+ *
+ * Also, poke TCP, just in case.
+ * Then wait for new work (or signal). */
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ uncork = nc ? nc->tcp_cork : 0;
+ rcu_read_unlock();
+ if (uncork) {
+ mutex_lock(&connection->data.mutex);
+ if (connection->data.socket)
+ drbd_tcp_uncork(connection->data.socket);
+ mutex_unlock(&connection->data.mutex);
+ }
+
+ for (;;) {
+ int send_barrier;
+ prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE);
+ spin_lock_irq(&connection->resource->req_lock);
+ spin_lock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */
+ if (!list_empty(&connection->sender_work.q))
+ list_splice_tail_init(&connection->sender_work.q, work_list);
+ spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */
+ if (!list_empty(work_list) || signal_pending(current)) {
+ spin_unlock_irq(&connection->resource->req_lock);
+ break;
+ }
+
+ /* We found nothing new to do, no to-be-communicated request,
+ * no other work item. We may still need to close the last
+ * epoch. Next incoming request epoch will be connection ->
+ * current transfer log epoch number. If that is different
+ * from the epoch of the last request we communicated, it is
+ * safe to send the epoch separating barrier now.
+ */
+ send_barrier =
+ atomic_read(&connection->current_tle_nr) !=
+ connection->send.current_epoch_nr;
+ spin_unlock_irq(&connection->resource->req_lock);
+
+ if (send_barrier)
+ maybe_send_barrier(connection,
+ connection->send.current_epoch_nr + 1);
+
+ if (test_bit(DEVICE_WORK_PENDING, &connection->flags))
+ break;
+
+ /* drbd_send() may have called flush_signals() */
+ if (get_t_state(&connection->worker) != RUNNING)
+ break;
+
+ schedule();
+ /* may be woken up for other things but new work, too,
+ * e.g. if the current epoch got closed.
+ * In which case we send the barrier above. */
+ }
+ finish_wait(&connection->sender_work.q_wait, &wait);
+
+ /* someone may have changed the config while we have been waiting above. */
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ cork = nc ? nc->tcp_cork : 0;
+ rcu_read_unlock();
+ mutex_lock(&connection->data.mutex);
+ if (connection->data.socket) {
+ if (cork)
+ drbd_tcp_cork(connection->data.socket);
+ else if (!uncork)
+ drbd_tcp_uncork(connection->data.socket);
+ }
+ mutex_unlock(&connection->data.mutex);
+}
+
+int drbd_worker(struct drbd_thread *thi)
+{
+ struct drbd_connection *connection = thi->connection;
+ struct drbd_work *w = NULL;
+ struct drbd_peer_device *peer_device;
+ LIST_HEAD(work_list);
+ int vnr;
+
+ while (get_t_state(thi) == RUNNING) {
+ drbd_thread_current_set_cpu(thi);
+
+ if (list_empty(&work_list)) {
+ update_worker_timing_details(connection, wait_for_work);
+ wait_for_work(connection, &work_list);
+ }
+
+ if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
+ update_worker_timing_details(connection, do_unqueued_work);
+ do_unqueued_work(connection);
+ }
+
+ if (signal_pending(current)) {
+ flush_signals(current);
+ if (get_t_state(thi) == RUNNING) {
+ drbd_warn(connection, "Worker got an unexpected signal\n");
+ continue;
+ }
+ break;
+ }
+
+ if (get_t_state(thi) != RUNNING)
+ break;
+
+ if (!list_empty(&work_list)) {
+ w = list_first_entry(&work_list, struct drbd_work, list);
+ list_del_init(&w->list);
+ update_worker_timing_details(connection, w->cb);
+ if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
+ continue;
+ if (connection->cstate >= C_WF_REPORT_PARAMS)
+ conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+ }
+ }
+
+ do {
+ if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
+ update_worker_timing_details(connection, do_unqueued_work);
+ do_unqueued_work(connection);
+ }
+ if (!list_empty(&work_list)) {
+ w = list_first_entry(&work_list, struct drbd_work, list);
+ list_del_init(&w->list);
+ update_worker_timing_details(connection, w->cb);
+ w->cb(w, 1);
+ } else
+ dequeue_work_batch(&connection->sender_work, &work_list);
+ } while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags));
+
+ rcu_read_lock();
+ idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+ struct drbd_device *device = peer_device->device;
+ D_ASSERT(device, device->state.disk == D_DISKLESS && device->state.conn == C_STANDALONE);
+ kref_get(&device->kref);
+ rcu_read_unlock();
+ drbd_device_cleanup(device);
+ kref_put(&device->kref, drbd_destroy_device);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
new file mode 100644
index 000000000..a08cda955
--- /dev/null
+++ b/drivers/block/floppy.c
@@ -0,0 +1,4645 @@
+/*
+ * linux/drivers/block/floppy.c
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 1993, 1994 Alain Knaff
+ * Copyright (C) 1998 Alan Cox
+ */
+
+/*
+ * 02.12.91 - Changed to static variables to indicate need for reset
+ * and recalibrate. This makes some things easier (output_byte reset
+ * checking etc), and means less interrupt jumping in case of errors,
+ * so the code is hopefully easier to understand.
+ */
+
+/*
+ * This file is certainly a mess. I've tried my best to get it working,
+ * but I don't like programming floppies, and I have only one anyway.
+ * Urgel. I should check for more errors, and do more graceful error
+ * recovery. Seems there are problems with several drives. I've tried to
+ * correct them. No promises.
+ */
+
+/*
+ * As with hd.c, all routines within this file can (and will) be called
+ * by interrupts, so extreme caution is needed. A hardware interrupt
+ * handler may not sleep, or a kernel panic will happen. Thus I cannot
+ * call "floppy-on" directly, but have to set a special timer interrupt
+ * etc.
+ */
+
+/*
+ * 28.02.92 - made track-buffering routines, based on the routines written
+ * by entropy@wintermute.wpi.edu (Lawrence Foard). Linus.
+ */
+
+/*
+ * Automatic floppy-detection and formatting written by Werner Almesberger
+ * (almesber@nessie.cs.id.ethz.ch), who also corrected some problems with
+ * the floppy-change signal detection.
+ */
+
+/*
+ * 1992/7/22 -- Hennus Bergman: Added better error reporting, fixed
+ * FDC data overrun bug, added some preliminary stuff for vertical
+ * recording support.
+ *
+ * 1992/9/17: Added DMA allocation & DMA functions. -- hhb.
+ *
+ * TODO: Errors are still not counted properly.
+ */
+
+/* 1992/9/20
+ * Modifications for ``Sector Shifting'' by Rob Hooft (hooft@chem.ruu.nl)
+ * modeled after the freeware MS-DOS program fdformat/88 V1.8 by
+ * Christoph H. Hochst\"atter.
+ * I have fixed the shift values to the ones I always use. Maybe a new
+ * ioctl() should be created to be able to modify them.
+ * There is a bug in the driver that makes it impossible to format a
+ * floppy as the first thing after bootup.
+ */
+
+/*
+ * 1993/4/29 -- Linus -- cleaned up the timer handling in the kernel, and
+ * this helped the floppy driver as well. Much cleaner, and still seems to
+ * work.
+ */
+
+/* 1994/6/24 --bbroad-- added the floppy table entries and made
+ * minor modifications to allow 2.88 floppies to be run.
+ */
+
+/* 1994/7/13 -- Paul Vojta -- modified the probing code to allow three or more
+ * disk types.
+ */
+
+/*
+ * 1994/8/8 -- Alain Knaff -- Switched to fdpatch driver: Support for bigger
+ * format bug fixes, but unfortunately some new bugs too...
+ */
+
+/* 1994/9/17 -- Koen Holtman -- added logging of physical floppy write
+ * errors to allow safe writing by specialized programs.
+ */
+
+/* 1995/4/24 -- Dan Fandrich -- added support for Commodore 1581 3.5" disks
+ * by defining bit 1 of the "stretch" parameter to mean put sectors on the
+ * opposite side of the disk, leaving the sector IDs alone (i.e. Commodore's
+ * drives are "upside-down").
+ */
+
+/*
+ * 1995/8/26 -- Andreas Busse -- added Mips support.
+ */
+
+/*
+ * 1995/10/18 -- Ralf Baechle -- Portability cleanup; move machine dependent
+ * features to asm/floppy.h.
+ */
+
+/*
+ * 1998/1/21 -- Richard Gooch <rgooch@atnf.csiro.au> -- devfs support
+ */
+
+/*
+ * 1998/05/07 -- Russell King -- More portability cleanups; moved definition of
+ * interrupt and dma channel to asm/floppy.h. Cleaned up some formatting &
+ * use of '0' for NULL.
+ */
+
+/*
+ * 1998/06/07 -- Alan Cox -- Merged the 2.0.34 fixes for resource allocation
+ * failures.
+ */
+
+/*
+ * 1998/09/20 -- David Weinehall -- Added slow-down code for buggy PS/2-drives.
+ */
+
+/*
+ * 1999/08/13 -- Paul Slootman -- floppy stopped working on Alpha after 24
+ * days, 6 hours, 32 minutes and 32 seconds (i.e. MAXINT jiffies; ints were
+ * being used to store jiffies, which are unsigned longs).
+ */
+
+/*
+ * 2000/08/28 -- Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ * - get rid of check_region
+ * - s/suser/capable/
+ */
+
+/*
+ * 2001/08/26 -- Paul Gortmaker - fix insmod oops on machines with no
+ * floppy controller (lingering task on list after module is gone... boom.)
+ */
+
+/*
+ * 2002/02/07 -- Anton Altaparmakov - Fix io ports reservation to correct range
+ * (0x3f2-0x3f5, 0x3f7). This fix is a bit of a hack but the proper fix
+ * requires many non-obvious changes in arch dependent code.
+ */
+
+/* 2003/07/28 -- Daniele Bellucci <bellucda@tiscali.it>.
+ * Better audit of register_blkdev.
+ */
+
+#undef FLOPPY_SILENT_DCL_CLEAR
+
+#define REALLY_SLOW_IO
+
+#define DEBUGT 2
+
+#define DPRINT(format, args...) \
+ pr_info("floppy%d: " format, current_drive, ##args)
+
+#define DCL_DEBUG /* debug disk change line */
+#ifdef DCL_DEBUG
+#define debug_dcl(test, fmt, args...) \
+ do { if ((test) & FD_DEBUG) DPRINT(fmt, ##args); } while (0)
+#else
+#define debug_dcl(test, fmt, args...) \
+ do { if (0) DPRINT(fmt, ##args); } while (0)
+#endif
+
+/* do print messages for unexpected interrupts */
+static int print_unex = 1;
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+#define FDPATCHES
+#include <linux/fdreg.h>
+#include <linux/fd.h>
+#include <linux/hdreg.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/bio.h>
+#include <linux/string.h>
+#include <linux/jiffies.h>
+#include <linux/fcntl.h>
+#include <linux/delay.h>
+#include <linux/mc146818rtc.h> /* CMOS defines */
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/mod_devicetable.h>
+#include <linux/mutex.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
+#include <linux/async.h>
+
+/*
+ * PS/2 floppies have much slower step rates than regular floppies.
+ * It's been recommended that take about 1/4 of the default speed
+ * in some more extreme cases.
+ */
+static DEFINE_MUTEX(floppy_mutex);
+static int slow_floppy;
+
+#include <asm/dma.h>
+#include <asm/irq.h>
+
+static int FLOPPY_IRQ = 6;
+static int FLOPPY_DMA = 2;
+static int can_use_virtual_dma = 2;
+/* =======
+ * can use virtual DMA:
+ * 0 = use of virtual DMA disallowed by config
+ * 1 = use of virtual DMA prescribed by config
+ * 2 = no virtual DMA preference configured. By default try hard DMA,
+ * but fall back on virtual DMA when not enough memory available
+ */
+
+static int use_virtual_dma;
+/* =======
+ * use virtual DMA
+ * 0 using hard DMA
+ * 1 using virtual DMA
+ * This variable is set to virtual when a DMA mem problem arises, and
+ * reset back in floppy_grab_irq_and_dma.
+ * It is not safe to reset it in other circumstances, because the floppy
+ * driver may have several buffers in use at once, and we do currently not
+ * record each buffers capabilities
+ */
+
+static DEFINE_SPINLOCK(floppy_lock);
+
+static unsigned short virtual_dma_port = 0x3f0;
+irqreturn_t floppy_interrupt(int irq, void *dev_id);
+static int set_dor(int fdc, char mask, char data);
+
+#define K_64 0x10000 /* 64KB */
+
+/* the following is the mask of allowed drives. By default units 2 and
+ * 3 of both floppy controllers are disabled, because switching on the
+ * motor of these drives causes system hangs on some PCI computers. drive
+ * 0 is the low bit (0x1), and drive 7 is the high bit (0x80). Bits are on if
+ * a drive is allowed.
+ *
+ * NOTE: This must come before we include the arch floppy header because
+ * some ports reference this variable from there. -DaveM
+ */
+
+static int allowed_drive_mask = 0x33;
+
+#include <asm/floppy.h>
+
+static int irqdma_allocated;
+
+#include <linux/blkdev.h>
+#include <linux/blkpg.h>
+#include <linux/cdrom.h> /* for the compatibility eject ioctl */
+#include <linux/completion.h>
+
+static struct request *current_req;
+static void do_fd_request(struct request_queue *q);
+static int set_next_request(void);
+
+#ifndef fd_get_dma_residue
+#define fd_get_dma_residue() get_dma_residue(FLOPPY_DMA)
+#endif
+
+/* Dma Memory related stuff */
+
+#ifndef fd_dma_mem_free
+#define fd_dma_mem_free(addr, size) free_pages(addr, get_order(size))
+#endif
+
+#ifndef fd_dma_mem_alloc
+#define fd_dma_mem_alloc(size) __get_dma_pages(GFP_KERNEL, get_order(size))
+#endif
+
+static inline void fallback_on_nodma_alloc(char **addr, size_t l)
+{
+#ifdef FLOPPY_CAN_FALLBACK_ON_NODMA
+ if (*addr)
+ return; /* we have the memory */
+ if (can_use_virtual_dma != 2)
+ return; /* no fallback allowed */
+ pr_info("DMA memory shortage. Temporarily falling back on virtual DMA\n");
+ *addr = (char *)nodma_mem_alloc(l);
+#else
+ return;
+#endif
+}
+
+/* End dma memory related stuff */
+
+static unsigned long fake_change;
+static bool initialized;
+
+#define ITYPE(x) (((x) >> 2) & 0x1f)
+#define TOMINOR(x) ((x & 3) | ((x & 4) << 5))
+#define UNIT(x) ((x) & 0x03) /* drive on fdc */
+#define FDC(x) (((x) & 0x04) >> 2) /* fdc of drive */
+ /* reverse mapping from unit and fdc to drive */
+#define REVDRIVE(fdc, unit) ((unit) + ((fdc) << 2))
+
+#define DP (&drive_params[current_drive])
+#define DRS (&drive_state[current_drive])
+#define DRWE (&write_errors[current_drive])
+#define FDCS (&fdc_state[fdc])
+
+#define UDP (&drive_params[drive])
+#define UDRS (&drive_state[drive])
+#define UDRWE (&write_errors[drive])
+#define UFDCS (&fdc_state[FDC(drive)])
+
+#define PH_HEAD(floppy, head) (((((floppy)->stretch & 2) >> 1) ^ head) << 2)
+#define STRETCH(floppy) ((floppy)->stretch & FD_STRETCH)
+
+/* read/write */
+#define COMMAND (raw_cmd->cmd[0])
+#define DR_SELECT (raw_cmd->cmd[1])
+#define TRACK (raw_cmd->cmd[2])
+#define HEAD (raw_cmd->cmd[3])
+#define SECTOR (raw_cmd->cmd[4])
+#define SIZECODE (raw_cmd->cmd[5])
+#define SECT_PER_TRACK (raw_cmd->cmd[6])
+#define GAP (raw_cmd->cmd[7])
+#define SIZECODE2 (raw_cmd->cmd[8])
+#define NR_RW 9
+
+/* format */
+#define F_SIZECODE (raw_cmd->cmd[2])
+#define F_SECT_PER_TRACK (raw_cmd->cmd[3])
+#define F_GAP (raw_cmd->cmd[4])
+#define F_FILL (raw_cmd->cmd[5])
+#define NR_F 6
+
+/*
+ * Maximum disk size (in kilobytes).
+ * This default is used whenever the current disk size is unknown.
+ * [Now it is rather a minimum]
+ */
+#define MAX_DISK_SIZE 4 /* 3984 */
+
+/*
+ * globals used by 'result()'
+ */
+#define MAX_REPLIES 16
+static unsigned char reply_buffer[MAX_REPLIES];
+static int inr; /* size of reply buffer, when called from interrupt */
+#define ST0 (reply_buffer[0])
+#define ST1 (reply_buffer[1])
+#define ST2 (reply_buffer[2])
+#define ST3 (reply_buffer[0]) /* result of GETSTATUS */
+#define R_TRACK (reply_buffer[3])
+#define R_HEAD (reply_buffer[4])
+#define R_SECTOR (reply_buffer[5])
+#define R_SIZECODE (reply_buffer[6])
+
+#define SEL_DLY (2 * HZ / 100)
+
+/*
+ * this struct defines the different floppy drive types.
+ */
+static struct {
+ struct floppy_drive_params params;
+ const char *name; /* name printed while booting */
+} default_drive_params[] = {
+/* NOTE: the time values in jiffies should be in msec!
+ CMOS drive type
+ | Maximum data rate supported by drive type
+ | | Head load time, msec
+ | | | Head unload time, msec (not used)
+ | | | | Step rate interval, usec
+ | | | | | Time needed for spinup time (jiffies)
+ | | | | | | Timeout for spinning down (jiffies)
+ | | | | | | | Spindown offset (where disk stops)
+ | | | | | | | | Select delay
+ | | | | | | | | | RPS
+ | | | | | | | | | | Max number of tracks
+ | | | | | | | | | | | Interrupt timeout
+ | | | | | | | | | | | | Max nonintlv. sectors
+ | | | | | | | | | | | | | -Max Errors- flags */
+{{0, 500, 16, 16, 8000, 1*HZ, 3*HZ, 0, SEL_DLY, 5, 80, 3*HZ, 20, {3,1,2,0,2}, 0,
+ 0, { 7, 4, 8, 2, 1, 5, 3,10}, 3*HZ/2, 0 }, "unknown" },
+
+{{1, 300, 16, 16, 8000, 1*HZ, 3*HZ, 0, SEL_DLY, 5, 40, 3*HZ, 17, {3,1,2,0,2}, 0,
+ 0, { 1, 0, 0, 0, 0, 0, 0, 0}, 3*HZ/2, 1 }, "360K PC" }, /*5 1/4 360 KB PC*/
+
+{{2, 500, 16, 16, 6000, 4*HZ/10, 3*HZ, 14, SEL_DLY, 6, 83, 3*HZ, 17, {3,1,2,0,2}, 0,
+ 0, { 2, 5, 6,23,10,20,12, 0}, 3*HZ/2, 2 }, "1.2M" }, /*5 1/4 HD AT*/
+
+{{3, 250, 16, 16, 3000, 1*HZ, 3*HZ, 0, SEL_DLY, 5, 83, 3*HZ, 20, {3,1,2,0,2}, 0,
+ 0, { 4,22,21,30, 3, 0, 0, 0}, 3*HZ/2, 4 }, "720k" }, /*3 1/2 DD*/
+
+{{4, 500, 16, 16, 4000, 4*HZ/10, 3*HZ, 10, SEL_DLY, 5, 83, 3*HZ, 20, {3,1,2,0,2}, 0,
+ 0, { 7, 4,25,22,31,21,29,11}, 3*HZ/2, 7 }, "1.44M" }, /*3 1/2 HD*/
+
+{{5, 1000, 15, 8, 3000, 4*HZ/10, 3*HZ, 10, SEL_DLY, 5, 83, 3*HZ, 40, {3,1,2,0,2}, 0,
+ 0, { 7, 8, 4,25,28,22,31,21}, 3*HZ/2, 8 }, "2.88M AMI BIOS" }, /*3 1/2 ED*/
+
+{{6, 1000, 15, 8, 3000, 4*HZ/10, 3*HZ, 10, SEL_DLY, 5, 83, 3*HZ, 40, {3,1,2,0,2}, 0,
+ 0, { 7, 8, 4,25,28,22,31,21}, 3*HZ/2, 8 }, "2.88M" } /*3 1/2 ED*/
+/* | --autodetected formats--- | | |
+ * read_track | | Name printed when booting
+ * | Native format
+ * Frequency of disk change checks */
+};
+
+static struct floppy_drive_params drive_params[N_DRIVE];
+static struct floppy_drive_struct drive_state[N_DRIVE];
+static struct floppy_write_errors write_errors[N_DRIVE];
+static struct timer_list motor_off_timer[N_DRIVE];
+static struct gendisk *disks[N_DRIVE];
+static struct block_device *opened_bdev[N_DRIVE];
+static DEFINE_MUTEX(open_lock);
+static struct floppy_raw_cmd *raw_cmd, default_raw_cmd;
+static int fdc_queue;
+
+/*
+ * This struct defines the different floppy types.
+ *
+ * Bit 0 of 'stretch' tells if the tracks need to be doubled for some
+ * types (e.g. 360kB diskette in 1.2MB drive, etc.). Bit 1 of 'stretch'
+ * tells if the disk is in Commodore 1581 format, which means side 0 sectors
+ * are located on side 1 of the disk but with a side 0 ID, and vice-versa.
+ * This is the same as the Sharp MZ-80 5.25" CP/M disk format, except that the
+ * 1581's logical side 0 is on physical side 1, whereas the Sharp's logical
+ * side 0 is on physical side 0 (but with the misnamed sector IDs).
+ * 'stretch' should probably be renamed to something more general, like
+ * 'options'.
+ *
+ * Bits 2 through 9 of 'stretch' tell the number of the first sector.
+ * The LSB (bit 2) is flipped. For most disks, the first sector
+ * is 1 (represented by 0x00<<2). For some CP/M and music sampler
+ * disks (such as Ensoniq EPS 16plus) it is 0 (represented as 0x01<<2).
+ * For Amstrad CPC disks it is 0xC1 (represented as 0xC0<<2).
+ *
+ * Other parameters should be self-explanatory (see also setfdprm(8)).
+ */
+/*
+ Size
+ | Sectors per track
+ | | Head
+ | | | Tracks
+ | | | | Stretch
+ | | | | | Gap 1 size
+ | | | | | | Data rate, | 0x40 for perp
+ | | | | | | | Spec1 (stepping rate, head unload
+ | | | | | | | | /fmt gap (gap2) */
+static struct floppy_struct floppy_type[32] = {
+ { 0, 0,0, 0,0,0x00,0x00,0x00,0x00,NULL }, /* 0 no testing */
+ { 720, 9,2,40,0,0x2A,0x02,0xDF,0x50,"d360" }, /* 1 360KB PC */
+ { 2400,15,2,80,0,0x1B,0x00,0xDF,0x54,"h1200" }, /* 2 1.2MB AT */
+ { 720, 9,1,80,0,0x2A,0x02,0xDF,0x50,"D360" }, /* 3 360KB SS 3.5" */
+ { 1440, 9,2,80,0,0x2A,0x02,0xDF,0x50,"D720" }, /* 4 720KB 3.5" */
+ { 720, 9,2,40,1,0x23,0x01,0xDF,0x50,"h360" }, /* 5 360KB AT */
+ { 1440, 9,2,80,0,0x23,0x01,0xDF,0x50,"h720" }, /* 6 720KB AT */
+ { 2880,18,2,80,0,0x1B,0x00,0xCF,0x6C,"H1440" }, /* 7 1.44MB 3.5" */
+ { 5760,36,2,80,0,0x1B,0x43,0xAF,0x54,"E2880" }, /* 8 2.88MB 3.5" */
+ { 6240,39,2,80,0,0x1B,0x43,0xAF,0x28,"E3120" }, /* 9 3.12MB 3.5" */
+
+ { 2880,18,2,80,0,0x25,0x00,0xDF,0x02,"h1440" }, /* 10 1.44MB 5.25" */
+ { 3360,21,2,80,0,0x1C,0x00,0xCF,0x0C,"H1680" }, /* 11 1.68MB 3.5" */
+ { 820,10,2,41,1,0x25,0x01,0xDF,0x2E,"h410" }, /* 12 410KB 5.25" */
+ { 1640,10,2,82,0,0x25,0x02,0xDF,0x2E,"H820" }, /* 13 820KB 3.5" */
+ { 2952,18,2,82,0,0x25,0x00,0xDF,0x02,"h1476" }, /* 14 1.48MB 5.25" */
+ { 3444,21,2,82,0,0x25,0x00,0xDF,0x0C,"H1722" }, /* 15 1.72MB 3.5" */
+ { 840,10,2,42,1,0x25,0x01,0xDF,0x2E,"h420" }, /* 16 420KB 5.25" */
+ { 1660,10,2,83,0,0x25,0x02,0xDF,0x2E,"H830" }, /* 17 830KB 3.5" */
+ { 2988,18,2,83,0,0x25,0x00,0xDF,0x02,"h1494" }, /* 18 1.49MB 5.25" */
+ { 3486,21,2,83,0,0x25,0x00,0xDF,0x0C,"H1743" }, /* 19 1.74 MB 3.5" */
+
+ { 1760,11,2,80,0,0x1C,0x09,0xCF,0x00,"h880" }, /* 20 880KB 5.25" */
+ { 2080,13,2,80,0,0x1C,0x01,0xCF,0x00,"D1040" }, /* 21 1.04MB 3.5" */
+ { 2240,14,2,80,0,0x1C,0x19,0xCF,0x00,"D1120" }, /* 22 1.12MB 3.5" */
+ { 3200,20,2,80,0,0x1C,0x20,0xCF,0x2C,"h1600" }, /* 23 1.6MB 5.25" */
+ { 3520,22,2,80,0,0x1C,0x08,0xCF,0x2e,"H1760" }, /* 24 1.76MB 3.5" */
+ { 3840,24,2,80,0,0x1C,0x20,0xCF,0x00,"H1920" }, /* 25 1.92MB 3.5" */
+ { 6400,40,2,80,0,0x25,0x5B,0xCF,0x00,"E3200" }, /* 26 3.20MB 3.5" */
+ { 7040,44,2,80,0,0x25,0x5B,0xCF,0x00,"E3520" }, /* 27 3.52MB 3.5" */
+ { 7680,48,2,80,0,0x25,0x63,0xCF,0x00,"E3840" }, /* 28 3.84MB 3.5" */
+ { 3680,23,2,80,0,0x1C,0x10,0xCF,0x00,"H1840" }, /* 29 1.84MB 3.5" */
+
+ { 1600,10,2,80,0,0x25,0x02,0xDF,0x2E,"D800" }, /* 30 800KB 3.5" */
+ { 3200,20,2,80,0,0x1C,0x00,0xCF,0x2C,"H1600" }, /* 31 1.6MB 3.5" */
+};
+
+#define SECTSIZE (_FD_SECTSIZE(*floppy))
+
+/* Auto-detection: Disk type used until the next media change occurs. */
+static struct floppy_struct *current_type[N_DRIVE];
+
+/*
+ * User-provided type information. current_type points to
+ * the respective entry of this array.
+ */
+static struct floppy_struct user_params[N_DRIVE];
+
+static sector_t floppy_sizes[256];
+
+static char floppy_device_name[] = "floppy";
+
+/*
+ * The driver is trying to determine the correct media format
+ * while probing is set. rw_interrupt() clears it after a
+ * successful access.
+ */
+static int probing;
+
+/* Synchronization of FDC access. */
+#define FD_COMMAND_NONE -1
+#define FD_COMMAND_ERROR 2
+#define FD_COMMAND_OKAY 3
+
+static volatile int command_status = FD_COMMAND_NONE;
+static unsigned long fdc_busy;
+static DECLARE_WAIT_QUEUE_HEAD(fdc_wait);
+static DECLARE_WAIT_QUEUE_HEAD(command_done);
+
+/* Errors during formatting are counted here. */
+static int format_errors;
+
+/* Format request descriptor. */
+static struct format_descr format_req;
+
+/*
+ * Rate is 0 for 500kb/s, 1 for 300kbps, 2 for 250kbps
+ * Spec1 is 0xSH, where S is stepping rate (F=1ms, E=2ms, D=3ms etc),
+ * H is head unload time (1=16ms, 2=32ms, etc)
+ */
+
+/*
+ * Track buffer
+ * Because these are written to by the DMA controller, they must
+ * not contain a 64k byte boundary crossing, or data will be
+ * corrupted/lost.
+ */
+static char *floppy_track_buffer;
+static int max_buffer_sectors;
+
+static int *errors;
+typedef void (*done_f)(int);
+static const struct cont_t {
+ void (*interrupt)(void);
+ /* this is called after the interrupt of the
+ * main command */
+ void (*redo)(void); /* this is called to retry the operation */
+ void (*error)(void); /* this is called to tally an error */
+ done_f done; /* this is called to say if the operation has
+ * succeeded/failed */
+} *cont;
+
+static void floppy_ready(void);
+static void floppy_start(void);
+static void process_fd_request(void);
+static void recalibrate_floppy(void);
+static void floppy_shutdown(struct work_struct *);
+
+static int floppy_request_regions(int);
+static void floppy_release_regions(int);
+static int floppy_grab_irq_and_dma(void);
+static void floppy_release_irq_and_dma(void);
+
+/*
+ * The "reset" variable should be tested whenever an interrupt is scheduled,
+ * after the commands have been sent. This is to ensure that the driver doesn't
+ * get wedged when the interrupt doesn't come because of a failed command.
+ * reset doesn't need to be tested before sending commands, because
+ * output_byte is automatically disabled when reset is set.
+ */
+static void reset_fdc(void);
+
+/*
+ * These are global variables, as that's the easiest way to give
+ * information to interrupts. They are the data used for the current
+ * request.
+ */
+#define NO_TRACK -1
+#define NEED_1_RECAL -2
+#define NEED_2_RECAL -3
+
+static atomic_t usage_count = ATOMIC_INIT(0);
+
+/* buffer related variables */
+static int buffer_track = -1;
+static int buffer_drive = -1;
+static int buffer_min = -1;
+static int buffer_max = -1;
+
+/* fdc related variables, should end up in a struct */
+static struct floppy_fdc_state fdc_state[N_FDC];
+static int fdc; /* current fdc */
+
+static struct workqueue_struct *floppy_wq;
+
+static struct floppy_struct *_floppy = floppy_type;
+static unsigned char current_drive;
+static long current_count_sectors;
+static unsigned char fsector_t; /* sector in track */
+static unsigned char in_sector_offset; /* offset within physical sector,
+ * expressed in units of 512 bytes */
+
+static inline bool drive_no_geom(int drive)
+{
+ return !current_type[drive] && !ITYPE(UDRS->fd_device);
+}
+
+#ifndef fd_eject
+static inline int fd_eject(int drive)
+{
+ return -EINVAL;
+}
+#endif
+
+/*
+ * Debugging
+ * =========
+ */
+#ifdef DEBUGT
+static long unsigned debugtimer;
+
+static inline void set_debugt(void)
+{
+ debugtimer = jiffies;
+}
+
+static inline void debugt(const char *func, const char *msg)
+{
+ if (DP->flags & DEBUGT)
+ pr_info("%s:%s dtime=%lu\n", func, msg, jiffies - debugtimer);
+}
+#else
+static inline void set_debugt(void) { }
+static inline void debugt(const char *func, const char *msg) { }
+#endif /* DEBUGT */
+
+
+static DECLARE_DELAYED_WORK(fd_timeout, floppy_shutdown);
+static const char *timeout_message;
+
+static void is_alive(const char *func, const char *message)
+{
+ /* this routine checks whether the floppy driver is "alive" */
+ if (test_bit(0, &fdc_busy) && command_status < 2 &&
+ !delayed_work_pending(&fd_timeout)) {
+ DPRINT("%s: timeout handler died. %s\n", func, message);
+ }
+}
+
+static void (*do_floppy)(void) = NULL;
+
+#define OLOGSIZE 20
+
+static void (*lasthandler)(void);
+static unsigned long interruptjiffies;
+static unsigned long resultjiffies;
+static int resultsize;
+static unsigned long lastredo;
+
+static struct output_log {
+ unsigned char data;
+ unsigned char status;
+ unsigned long jiffies;
+} output_log[OLOGSIZE];
+
+static int output_log_pos;
+
+#define current_reqD -1
+#define MAXTIMEOUT -2
+
+static void __reschedule_timeout(int drive, const char *message)
+{
+ unsigned long delay;
+
+ if (drive == current_reqD)
+ drive = current_drive;
+
+ if (drive < 0 || drive >= N_DRIVE) {
+ delay = 20UL * HZ;
+ drive = 0;
+ } else
+ delay = UDP->timeout;
+
+ mod_delayed_work(floppy_wq, &fd_timeout, delay);
+ if (UDP->flags & FD_DEBUG)
+ DPRINT("reschedule timeout %s\n", message);
+ timeout_message = message;
+}
+
+static void reschedule_timeout(int drive, const char *message)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&floppy_lock, flags);
+ __reschedule_timeout(drive, message);
+ spin_unlock_irqrestore(&floppy_lock, flags);
+}
+
+#define INFBOUND(a, b) (a) = max_t(int, a, b)
+#define SUPBOUND(a, b) (a) = min_t(int, a, b)
+
+/*
+ * Bottom half floppy driver.
+ * ==========================
+ *
+ * This part of the file contains the code talking directly to the hardware,
+ * and also the main service loop (seek-configure-spinup-command)
+ */
+
+/*
+ * disk change.
+ * This routine is responsible for maintaining the FD_DISK_CHANGE flag,
+ * and the last_checked date.
+ *
+ * last_checked is the date of the last check which showed 'no disk change'
+ * FD_DISK_CHANGE is set under two conditions:
+ * 1. The floppy has been changed after some i/o to that floppy already
+ * took place.
+ * 2. No floppy disk is in the drive. This is done in order to ensure that
+ * requests are quickly flushed in case there is no disk in the drive. It
+ * follows that FD_DISK_CHANGE can only be cleared if there is a disk in
+ * the drive.
+ *
+ * For 1., maxblock is observed. Maxblock is 0 if no i/o has taken place yet.
+ * For 2., FD_DISK_NEWCHANGE is watched. FD_DISK_NEWCHANGE is cleared on
+ * each seek. If a disk is present, the disk change line should also be
+ * cleared on each seek. Thus, if FD_DISK_NEWCHANGE is clear, but the disk
+ * change line is set, this means either that no disk is in the drive, or
+ * that it has been removed since the last seek.
+ *
+ * This means that we really have a third possibility too:
+ * The floppy has been changed after the last seek.
+ */
+
+static int disk_change(int drive)
+{
+ int fdc = FDC(drive);
+
+ if (time_before(jiffies, UDRS->select_date + UDP->select_delay))
+ DPRINT("WARNING disk change called early\n");
+ if (!(FDCS->dor & (0x10 << UNIT(drive))) ||
+ (FDCS->dor & 3) != UNIT(drive) || fdc != FDC(drive)) {
+ DPRINT("probing disk change on unselected drive\n");
+ DPRINT("drive=%d fdc=%d dor=%x\n", drive, FDC(drive),
+ (unsigned int)FDCS->dor);
+ }
+
+ debug_dcl(UDP->flags,
+ "checking disk change line for drive %d\n", drive);
+ debug_dcl(UDP->flags, "jiffies=%lu\n", jiffies);
+ debug_dcl(UDP->flags, "disk change line=%x\n", fd_inb(FD_DIR) & 0x80);
+ debug_dcl(UDP->flags, "flags=%lx\n", UDRS->flags);
+
+ if (UDP->flags & FD_BROKEN_DCL)
+ return test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
+ if ((fd_inb(FD_DIR) ^ UDP->flags) & 0x80) {
+ set_bit(FD_VERIFY_BIT, &UDRS->flags);
+ /* verify write protection */
+
+ if (UDRS->maxblock) /* mark it changed */
+ set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
+
+ /* invalidate its geometry */
+ if (UDRS->keep_data >= 0) {
+ if ((UDP->flags & FTD_MSG) &&
+ current_type[drive] != NULL)
+ DPRINT("Disk type is undefined after disk change\n");
+ current_type[drive] = NULL;
+ floppy_sizes[TOMINOR(drive)] = MAX_DISK_SIZE << 1;
+ }
+
+ return 1;
+ } else {
+ UDRS->last_checked = jiffies;
+ clear_bit(FD_DISK_NEWCHANGE_BIT, &UDRS->flags);
+ }
+ return 0;
+}
+
+static inline int is_selected(int dor, int unit)
+{
+ return ((dor & (0x10 << unit)) && (dor & 3) == unit);
+}
+
+static bool is_ready_state(int status)
+{
+ int state = status & (STATUS_READY | STATUS_DIR | STATUS_DMA);
+ return state == STATUS_READY;
+}
+
+static int set_dor(int fdc, char mask, char data)
+{
+ unsigned char unit;
+ unsigned char drive;
+ unsigned char newdor;
+ unsigned char olddor;
+
+ if (FDCS->address == -1)
+ return -1;
+
+ olddor = FDCS->dor;
+ newdor = (olddor & mask) | data;
+ if (newdor != olddor) {
+ unit = olddor & 0x3;
+ if (is_selected(olddor, unit) && !is_selected(newdor, unit)) {
+ drive = REVDRIVE(fdc, unit);
+ debug_dcl(UDP->flags,
+ "calling disk change from set_dor\n");
+ disk_change(drive);
+ }
+ FDCS->dor = newdor;
+ fd_outb(newdor, FD_DOR);
+
+ unit = newdor & 0x3;
+ if (!is_selected(olddor, unit) && is_selected(newdor, unit)) {
+ drive = REVDRIVE(fdc, unit);
+ UDRS->select_date = jiffies;
+ }
+ }
+ return olddor;
+}
+
+static void twaddle(void)
+{
+ if (DP->select_delay)
+ return;
+ fd_outb(FDCS->dor & ~(0x10 << UNIT(current_drive)), FD_DOR);
+ fd_outb(FDCS->dor, FD_DOR);
+ DRS->select_date = jiffies;
+}
+
+/*
+ * Reset all driver information about the current fdc.
+ * This is needed after a reset, and after a raw command.
+ */
+static void reset_fdc_info(int mode)
+{
+ int drive;
+
+ FDCS->spec1 = FDCS->spec2 = -1;
+ FDCS->need_configure = 1;
+ FDCS->perp_mode = 1;
+ FDCS->rawcmd = 0;
+ for (drive = 0; drive < N_DRIVE; drive++)
+ if (FDC(drive) == fdc && (mode || UDRS->track != NEED_1_RECAL))
+ UDRS->track = NEED_2_RECAL;
+}
+
+/* selects the fdc and drive, and enables the fdc's input/dma. */
+static void set_fdc(int drive)
+{
+ if (drive >= 0 && drive < N_DRIVE) {
+ fdc = FDC(drive);
+ current_drive = drive;
+ }
+ if (fdc != 1 && fdc != 0) {
+ pr_info("bad fdc value\n");
+ return;
+ }
+ set_dor(fdc, ~0, 8);
+#if N_FDC > 1
+ set_dor(1 - fdc, ~8, 0);
+#endif
+ if (FDCS->rawcmd == 2)
+ reset_fdc_info(1);
+ if (fd_inb(FD_STATUS) != STATUS_READY)
+ FDCS->reset = 1;
+}
+
+/* locks the driver */
+static int lock_fdc(int drive, bool interruptible)
+{
+ if (WARN(atomic_read(&usage_count) == 0,
+ "Trying to lock fdc while usage count=0\n"))
+ return -1;
+
+ if (wait_event_interruptible(fdc_wait, !test_and_set_bit(0, &fdc_busy)))
+ return -EINTR;
+
+ command_status = FD_COMMAND_NONE;
+
+ reschedule_timeout(drive, "lock fdc");
+ set_fdc(drive);
+ return 0;
+}
+
+/* unlocks the driver */
+static void unlock_fdc(void)
+{
+ if (!test_bit(0, &fdc_busy))
+ DPRINT("FDC access conflict!\n");
+
+ raw_cmd = NULL;
+ command_status = FD_COMMAND_NONE;
+ cancel_delayed_work(&fd_timeout);
+ do_floppy = NULL;
+ cont = NULL;
+ clear_bit(0, &fdc_busy);
+ wake_up(&fdc_wait);
+}
+
+/* switches the motor off after a given timeout */
+static void motor_off_callback(unsigned long nr)
+{
+ unsigned char mask = ~(0x10 << UNIT(nr));
+
+ set_dor(FDC(nr), mask, 0);
+}
+
+/* schedules motor off */
+static void floppy_off(unsigned int drive)
+{
+ unsigned long volatile delta;
+ int fdc = FDC(drive);
+
+ if (!(FDCS->dor & (0x10 << UNIT(drive))))
+ return;
+
+ del_timer(motor_off_timer + drive);
+
+ /* make spindle stop in a position which minimizes spinup time
+ * next time */
+ if (UDP->rps) {
+ delta = jiffies - UDRS->first_read_date + HZ -
+ UDP->spindown_offset;
+ delta = ((delta * UDP->rps) % HZ) / UDP->rps;
+ motor_off_timer[drive].expires =
+ jiffies + UDP->spindown - delta;
+ }
+ add_timer(motor_off_timer + drive);
+}
+
+/*
+ * cycle through all N_DRIVE floppy drives, for disk change testing.
+ * stopping at current drive. This is done before any long operation, to
+ * be sure to have up to date disk change information.
+ */
+static void scandrives(void)
+{
+ int i;
+ int drive;
+ int saved_drive;
+
+ if (DP->select_delay)
+ return;
+
+ saved_drive = current_drive;
+ for (i = 0; i < N_DRIVE; i++) {
+ drive = (saved_drive + i + 1) % N_DRIVE;
+ if (UDRS->fd_ref == 0 || UDP->select_delay != 0)
+ continue; /* skip closed drives */
+ set_fdc(drive);
+ if (!(set_dor(fdc, ~3, UNIT(drive) | (0x10 << UNIT(drive))) &
+ (0x10 << UNIT(drive))))
+ /* switch the motor off again, if it was off to
+ * begin with */
+ set_dor(fdc, ~(0x10 << UNIT(drive)), 0);
+ }
+ set_fdc(saved_drive);
+}
+
+static void empty(void)
+{
+}
+
+static void (*floppy_work_fn)(void);
+
+static void floppy_work_workfn(struct work_struct *work)
+{
+ floppy_work_fn();
+}
+
+static DECLARE_WORK(floppy_work, floppy_work_workfn);
+
+static void schedule_bh(void (*handler)(void))
+{
+ WARN_ON(work_pending(&floppy_work));
+
+ floppy_work_fn = handler;
+ queue_work(floppy_wq, &floppy_work);
+}
+
+static void (*fd_timer_fn)(void) = NULL;
+
+static void fd_timer_workfn(struct work_struct *work)
+{
+ fd_timer_fn();
+}
+
+static DECLARE_DELAYED_WORK(fd_timer, fd_timer_workfn);
+
+static void cancel_activity(void)
+{
+ do_floppy = NULL;
+ cancel_delayed_work_sync(&fd_timer);
+ cancel_work_sync(&floppy_work);
+}
+
+/* this function makes sure that the disk stays in the drive during the
+ * transfer */
+static void fd_watchdog(void)
+{
+ debug_dcl(DP->flags, "calling disk change from watchdog\n");
+
+ if (disk_change(current_drive)) {
+ DPRINT("disk removed during i/o\n");
+ cancel_activity();
+ cont->done(0);
+ reset_fdc();
+ } else {
+ cancel_delayed_work(&fd_timer);
+ fd_timer_fn = fd_watchdog;
+ queue_delayed_work(floppy_wq, &fd_timer, HZ / 10);
+ }
+}
+
+static void main_command_interrupt(void)
+{
+ cancel_delayed_work(&fd_timer);
+ cont->interrupt();
+}
+
+/* waits for a delay (spinup or select) to pass */
+static int fd_wait_for_completion(unsigned long expires,
+ void (*function)(void))
+{
+ if (FDCS->reset) {
+ reset_fdc(); /* do the reset during sleep to win time
+ * if we don't need to sleep, it's a good
+ * occasion anyways */
+ return 1;
+ }
+
+ if (time_before(jiffies, expires)) {
+ cancel_delayed_work(&fd_timer);
+ fd_timer_fn = function;
+ queue_delayed_work(floppy_wq, &fd_timer, expires - jiffies);
+ return 1;
+ }
+ return 0;
+}
+
+static void setup_DMA(void)
+{
+ unsigned long f;
+
+ if (raw_cmd->length == 0) {
+ int i;
+
+ pr_info("zero dma transfer size:");
+ for (i = 0; i < raw_cmd->cmd_count; i++)
+ pr_cont("%x,", raw_cmd->cmd[i]);
+ pr_cont("\n");
+ cont->done(0);
+ FDCS->reset = 1;
+ return;
+ }
+ if (((unsigned long)raw_cmd->kernel_data) % 512) {
+ pr_info("non aligned address: %p\n", raw_cmd->kernel_data);
+ cont->done(0);
+ FDCS->reset = 1;
+ return;
+ }
+ f = claim_dma_lock();
+ fd_disable_dma();
+#ifdef fd_dma_setup
+ if (fd_dma_setup(raw_cmd->kernel_data, raw_cmd->length,
+ (raw_cmd->flags & FD_RAW_READ) ?
+ DMA_MODE_READ : DMA_MODE_WRITE, FDCS->address) < 0) {
+ release_dma_lock(f);
+ cont->done(0);
+ FDCS->reset = 1;
+ return;
+ }
+ release_dma_lock(f);
+#else
+ fd_clear_dma_ff();
+ fd_cacheflush(raw_cmd->kernel_data, raw_cmd->length);
+ fd_set_dma_mode((raw_cmd->flags & FD_RAW_READ) ?
+ DMA_MODE_READ : DMA_MODE_WRITE);
+ fd_set_dma_addr(raw_cmd->kernel_data);
+ fd_set_dma_count(raw_cmd->length);
+ virtual_dma_port = FDCS->address;
+ fd_enable_dma();
+ release_dma_lock(f);
+#endif
+}
+
+static void show_floppy(void);
+
+/* waits until the fdc becomes ready */
+static int wait_til_ready(void)
+{
+ int status;
+ int counter;
+
+ if (FDCS->reset)
+ return -1;
+ for (counter = 0; counter < 10000; counter++) {
+ status = fd_inb(FD_STATUS);
+ if (status & STATUS_READY)
+ return status;
+ }
+ if (initialized) {
+ DPRINT("Getstatus times out (%x) on fdc %d\n", status, fdc);
+ show_floppy();
+ }
+ FDCS->reset = 1;
+ return -1;
+}
+
+/* sends a command byte to the fdc */
+static int output_byte(char byte)
+{
+ int status = wait_til_ready();
+
+ if (status < 0)
+ return -1;
+
+ if (is_ready_state(status)) {
+ fd_outb(byte, FD_DATA);
+ output_log[output_log_pos].data = byte;
+ output_log[output_log_pos].status = status;
+ output_log[output_log_pos].jiffies = jiffies;
+ output_log_pos = (output_log_pos + 1) % OLOGSIZE;
+ return 0;
+ }
+ FDCS->reset = 1;
+ if (initialized) {
+ DPRINT("Unable to send byte %x to FDC. Fdc=%x Status=%x\n",
+ byte, fdc, status);
+ show_floppy();
+ }
+ return -1;
+}
+
+/* gets the response from the fdc */
+static int result(void)
+{
+ int i;
+ int status = 0;
+
+ for (i = 0; i < MAX_REPLIES; i++) {
+ status = wait_til_ready();
+ if (status < 0)
+ break;
+ status &= STATUS_DIR | STATUS_READY | STATUS_BUSY | STATUS_DMA;
+ if ((status & ~STATUS_BUSY) == STATUS_READY) {
+ resultjiffies = jiffies;
+ resultsize = i;
+ return i;
+ }
+ if (status == (STATUS_DIR | STATUS_READY | STATUS_BUSY))
+ reply_buffer[i] = fd_inb(FD_DATA);
+ else
+ break;
+ }
+ if (initialized) {
+ DPRINT("get result error. Fdc=%d Last status=%x Read bytes=%d\n",
+ fdc, status, i);
+ show_floppy();
+ }
+ FDCS->reset = 1;
+ return -1;
+}
+
+#define MORE_OUTPUT -2
+/* does the fdc need more output? */
+static int need_more_output(void)
+{
+ int status = wait_til_ready();
+
+ if (status < 0)
+ return -1;
+
+ if (is_ready_state(status))
+ return MORE_OUTPUT;
+
+ return result();
+}
+
+/* Set perpendicular mode as required, based on data rate, if supported.
+ * 82077 Now tested. 1Mbps data rate only possible with 82077-1.
+ */
+static void perpendicular_mode(void)
+{
+ unsigned char perp_mode;
+
+ if (raw_cmd->rate & 0x40) {
+ switch (raw_cmd->rate & 3) {
+ case 0:
+ perp_mode = 2;
+ break;
+ case 3:
+ perp_mode = 3;
+ break;
+ default:
+ DPRINT("Invalid data rate for perpendicular mode!\n");
+ cont->done(0);
+ FDCS->reset = 1;
+ /*
+ * convenient way to return to
+ * redo without too much hassle
+ * (deep stack et al.)
+ */
+ return;
+ }
+ } else
+ perp_mode = 0;
+
+ if (FDCS->perp_mode == perp_mode)
+ return;
+ if (FDCS->version >= FDC_82077_ORIG) {
+ output_byte(FD_PERPENDICULAR);
+ output_byte(perp_mode);
+ FDCS->perp_mode = perp_mode;
+ } else if (perp_mode) {
+ DPRINT("perpendicular mode not supported by this FDC.\n");
+ }
+} /* perpendicular_mode */
+
+static int fifo_depth = 0xa;
+static int no_fifo;
+
+static int fdc_configure(void)
+{
+ /* Turn on FIFO */
+ output_byte(FD_CONFIGURE);
+ if (need_more_output() != MORE_OUTPUT)
+ return 0;
+ output_byte(0);
+ output_byte(0x10 | (no_fifo & 0x20) | (fifo_depth & 0xf));
+ output_byte(0); /* pre-compensation from track
+ 0 upwards */
+ return 1;
+}
+
+#define NOMINAL_DTR 500
+
+/* Issue a "SPECIFY" command to set the step rate time, head unload time,
+ * head load time, and DMA disable flag to values needed by floppy.
+ *
+ * The value "dtr" is the data transfer rate in Kbps. It is needed
+ * to account for the data rate-based scaling done by the 82072 and 82077
+ * FDC types. This parameter is ignored for other types of FDCs (i.e.
+ * 8272a).
+ *
+ * Note that changing the data transfer rate has a (probably deleterious)
+ * effect on the parameters subject to scaling for 82072/82077 FDCs, so
+ * fdc_specify is called again after each data transfer rate
+ * change.
+ *
+ * srt: 1000 to 16000 in microseconds
+ * hut: 16 to 240 milliseconds
+ * hlt: 2 to 254 milliseconds
+ *
+ * These values are rounded up to the next highest available delay time.
+ */
+static void fdc_specify(void)
+{
+ unsigned char spec1;
+ unsigned char spec2;
+ unsigned long srt;
+ unsigned long hlt;
+ unsigned long hut;
+ unsigned long dtr = NOMINAL_DTR;
+ unsigned long scale_dtr = NOMINAL_DTR;
+ int hlt_max_code = 0x7f;
+ int hut_max_code = 0xf;
+
+ if (FDCS->need_configure && FDCS->version >= FDC_82072A) {
+ fdc_configure();
+ FDCS->need_configure = 0;
+ }
+
+ switch (raw_cmd->rate & 0x03) {
+ case 3:
+ dtr = 1000;
+ break;
+ case 1:
+ dtr = 300;
+ if (FDCS->version >= FDC_82078) {
+ /* chose the default rate table, not the one
+ * where 1 = 2 Mbps */
+ output_byte(FD_DRIVESPEC);
+ if (need_more_output() == MORE_OUTPUT) {
+ output_byte(UNIT(current_drive));
+ output_byte(0xc0);
+ }
+ }
+ break;
+ case 2:
+ dtr = 250;
+ break;
+ }
+
+ if (FDCS->version >= FDC_82072) {
+ scale_dtr = dtr;
+ hlt_max_code = 0x00; /* 0==256msec*dtr0/dtr (not linear!) */
+ hut_max_code = 0x0; /* 0==256msec*dtr0/dtr (not linear!) */
+ }
+
+ /* Convert step rate from microseconds to milliseconds and 4 bits */
+ srt = 16 - DIV_ROUND_UP(DP->srt * scale_dtr / 1000, NOMINAL_DTR);
+ if (slow_floppy)
+ srt = srt / 4;
+
+ SUPBOUND(srt, 0xf);
+ INFBOUND(srt, 0);
+
+ hlt = DIV_ROUND_UP(DP->hlt * scale_dtr / 2, NOMINAL_DTR);
+ if (hlt < 0x01)
+ hlt = 0x01;
+ else if (hlt > 0x7f)
+ hlt = hlt_max_code;
+
+ hut = DIV_ROUND_UP(DP->hut * scale_dtr / 16, NOMINAL_DTR);
+ if (hut < 0x1)
+ hut = 0x1;
+ else if (hut > 0xf)
+ hut = hut_max_code;
+
+ spec1 = (srt << 4) | hut;
+ spec2 = (hlt << 1) | (use_virtual_dma & 1);
+
+ /* If these parameters did not change, just return with success */
+ if (FDCS->spec1 != spec1 || FDCS->spec2 != spec2) {
+ /* Go ahead and set spec1 and spec2 */
+ output_byte(FD_SPECIFY);
+ output_byte(FDCS->spec1 = spec1);
+ output_byte(FDCS->spec2 = spec2);
+ }
+} /* fdc_specify */
+
+/* Set the FDC's data transfer rate on behalf of the specified drive.
+ * NOTE: with 82072/82077 FDCs, changing the data rate requires a reissue
+ * of the specify command (i.e. using the fdc_specify function).
+ */
+static int fdc_dtr(void)
+{
+ /* If data rate not already set to desired value, set it. */
+ if ((raw_cmd->rate & 3) == FDCS->dtr)
+ return 0;
+
+ /* Set dtr */
+ fd_outb(raw_cmd->rate & 3, FD_DCR);
+
+ /* TODO: some FDC/drive combinations (C&T 82C711 with TEAC 1.2MB)
+ * need a stabilization period of several milliseconds to be
+ * enforced after data rate changes before R/W operations.
+ * Pause 5 msec to avoid trouble. (Needs to be 2 jiffies)
+ */
+ FDCS->dtr = raw_cmd->rate & 3;
+ return fd_wait_for_completion(jiffies + 2UL * HZ / 100, floppy_ready);
+} /* fdc_dtr */
+
+static void tell_sector(void)
+{
+ pr_cont(": track %d, head %d, sector %d, size %d",
+ R_TRACK, R_HEAD, R_SECTOR, R_SIZECODE);
+} /* tell_sector */
+
+static void print_errors(void)
+{
+ DPRINT("");
+ if (ST0 & ST0_ECE) {
+ pr_cont("Recalibrate failed!");
+ } else if (ST2 & ST2_CRC) {
+ pr_cont("data CRC error");
+ tell_sector();
+ } else if (ST1 & ST1_CRC) {
+ pr_cont("CRC error");
+ tell_sector();
+ } else if ((ST1 & (ST1_MAM | ST1_ND)) ||
+ (ST2 & ST2_MAM)) {
+ if (!probing) {
+ pr_cont("sector not found");
+ tell_sector();
+ } else
+ pr_cont("probe failed...");
+ } else if (ST2 & ST2_WC) { /* seek error */
+ pr_cont("wrong cylinder");
+ } else if (ST2 & ST2_BC) { /* cylinder marked as bad */
+ pr_cont("bad cylinder");
+ } else {
+ pr_cont("unknown error. ST[0..2] are: 0x%x 0x%x 0x%x",
+ ST0, ST1, ST2);
+ tell_sector();
+ }
+ pr_cont("\n");
+}
+
+/*
+ * OK, this error interpreting routine is called after a
+ * DMA read/write has succeeded
+ * or failed, so we check the results, and copy any buffers.
+ * hhb: Added better error reporting.
+ * ak: Made this into a separate routine.
+ */
+static int interpret_errors(void)
+{
+ char bad;
+
+ if (inr != 7) {
+ DPRINT("-- FDC reply error\n");
+ FDCS->reset = 1;
+ return 1;
+ }
+
+ /* check IC to find cause of interrupt */
+ switch (ST0 & ST0_INTR) {
+ case 0x40: /* error occurred during command execution */
+ if (ST1 & ST1_EOC)
+ return 0; /* occurs with pseudo-DMA */
+ bad = 1;
+ if (ST1 & ST1_WP) {
+ DPRINT("Drive is write protected\n");
+ clear_bit(FD_DISK_WRITABLE_BIT, &DRS->flags);
+ cont->done(0);
+ bad = 2;
+ } else if (ST1 & ST1_ND) {
+ set_bit(FD_NEED_TWADDLE_BIT, &DRS->flags);
+ } else if (ST1 & ST1_OR) {
+ if (DP->flags & FTD_MSG)
+ DPRINT("Over/Underrun - retrying\n");
+ bad = 0;
+ } else if (*errors >= DP->max_errors.reporting) {
+ print_errors();
+ }
+ if (ST2 & ST2_WC || ST2 & ST2_BC)
+ /* wrong cylinder => recal */
+ DRS->track = NEED_2_RECAL;
+ return bad;
+ case 0x80: /* invalid command given */
+ DPRINT("Invalid FDC command given!\n");
+ cont->done(0);
+ return 2;
+ case 0xc0:
+ DPRINT("Abnormal termination caused by polling\n");
+ cont->error();
+ return 2;
+ default: /* (0) Normal command termination */
+ return 0;
+ }
+}
+
+/*
+ * This routine is called when everything should be correctly set up
+ * for the transfer (i.e. floppy motor is on, the correct floppy is
+ * selected, and the head is sitting on the right track).
+ */
+static void setup_rw_floppy(void)
+{
+ int i;
+ int r;
+ int flags;
+ int dflags;
+ unsigned long ready_date;
+ void (*function)(void);
+
+ flags = raw_cmd->flags;
+ if (flags & (FD_RAW_READ | FD_RAW_WRITE))
+ flags |= FD_RAW_INTR;
+
+ if ((flags & FD_RAW_SPIN) && !(flags & FD_RAW_NO_MOTOR)) {
+ ready_date = DRS->spinup_date + DP->spinup;
+ /* If spinup will take a long time, rerun scandrives
+ * again just before spinup completion. Beware that
+ * after scandrives, we must again wait for selection.
+ */
+ if (time_after(ready_date, jiffies + DP->select_delay)) {
+ ready_date -= DP->select_delay;
+ function = floppy_start;
+ } else
+ function = setup_rw_floppy;
+
+ /* wait until the floppy is spinning fast enough */
+ if (fd_wait_for_completion(ready_date, function))
+ return;
+ }
+ dflags = DRS->flags;
+
+ if ((flags & FD_RAW_READ) || (flags & FD_RAW_WRITE))
+ setup_DMA();
+
+ if (flags & FD_RAW_INTR)
+ do_floppy = main_command_interrupt;
+
+ r = 0;
+ for (i = 0; i < raw_cmd->cmd_count; i++)
+ r |= output_byte(raw_cmd->cmd[i]);
+
+ debugt(__func__, "rw_command");
+
+ if (r) {
+ cont->error();
+ reset_fdc();
+ return;
+ }
+
+ if (!(flags & FD_RAW_INTR)) {
+ inr = result();
+ cont->interrupt();
+ } else if (flags & FD_RAW_NEED_DISK)
+ fd_watchdog();
+}
+
+static int blind_seek;
+
+/*
+ * This is the routine called after every seek (or recalibrate) interrupt
+ * from the floppy controller.
+ */
+static void seek_interrupt(void)
+{
+ debugt(__func__, "");
+ if (inr != 2 || (ST0 & 0xF8) != 0x20) {
+ DPRINT("seek failed\n");
+ DRS->track = NEED_2_RECAL;
+ cont->error();
+ cont->redo();
+ return;
+ }
+ if (DRS->track >= 0 && DRS->track != ST1 && !blind_seek) {
+ debug_dcl(DP->flags,
+ "clearing NEWCHANGE flag because of effective seek\n");
+ debug_dcl(DP->flags, "jiffies=%lu\n", jiffies);
+ clear_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags);
+ /* effective seek */
+ DRS->select_date = jiffies;
+ }
+ DRS->track = ST1;
+ floppy_ready();
+}
+
+static void check_wp(void)
+{
+ if (test_bit(FD_VERIFY_BIT, &DRS->flags)) {
+ /* check write protection */
+ output_byte(FD_GETSTATUS);
+ output_byte(UNIT(current_drive));
+ if (result() != 1) {
+ FDCS->reset = 1;
+ return;
+ }
+ clear_bit(FD_VERIFY_BIT, &DRS->flags);
+ clear_bit(FD_NEED_TWADDLE_BIT, &DRS->flags);
+ debug_dcl(DP->flags,
+ "checking whether disk is write protected\n");
+ debug_dcl(DP->flags, "wp=%x\n", ST3 & 0x40);
+ if (!(ST3 & 0x40))
+ set_bit(FD_DISK_WRITABLE_BIT, &DRS->flags);
+ else
+ clear_bit(FD_DISK_WRITABLE_BIT, &DRS->flags);
+ }
+}
+
+static void seek_floppy(void)
+{
+ int track;
+
+ blind_seek = 0;
+
+ debug_dcl(DP->flags, "calling disk change from %s\n", __func__);
+
+ if (!test_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags) &&
+ disk_change(current_drive) && (raw_cmd->flags & FD_RAW_NEED_DISK)) {
+ /* the media changed flag should be cleared after the seek.
+ * If it isn't, this means that there is really no disk in
+ * the drive.
+ */
+ set_bit(FD_DISK_CHANGED_BIT, &DRS->flags);
+ cont->done(0);
+ cont->redo();
+ return;
+ }
+ if (DRS->track <= NEED_1_RECAL) {
+ recalibrate_floppy();
+ return;
+ } else if (test_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags) &&
+ (raw_cmd->flags & FD_RAW_NEED_DISK) &&
+ (DRS->track <= NO_TRACK || DRS->track == raw_cmd->track)) {
+ /* we seek to clear the media-changed condition. Does anybody
+ * know a more elegant way, which works on all drives? */
+ if (raw_cmd->track)
+ track = raw_cmd->track - 1;
+ else {
+ if (DP->flags & FD_SILENT_DCL_CLEAR) {
+ set_dor(fdc, ~(0x10 << UNIT(current_drive)), 0);
+ blind_seek = 1;
+ raw_cmd->flags |= FD_RAW_NEED_SEEK;
+ }
+ track = 1;
+ }
+ } else {
+ check_wp();
+ if (raw_cmd->track != DRS->track &&
+ (raw_cmd->flags & FD_RAW_NEED_SEEK))
+ track = raw_cmd->track;
+ else {
+ setup_rw_floppy();
+ return;
+ }
+ }
+
+ do_floppy = seek_interrupt;
+ output_byte(FD_SEEK);
+ output_byte(UNIT(current_drive));
+ if (output_byte(track) < 0) {
+ reset_fdc();
+ return;
+ }
+ debugt(__func__, "");
+}
+
+static void recal_interrupt(void)
+{
+ debugt(__func__, "");
+ if (inr != 2)
+ FDCS->reset = 1;
+ else if (ST0 & ST0_ECE) {
+ switch (DRS->track) {
+ case NEED_1_RECAL:
+ debugt(__func__, "need 1 recal");
+ /* after a second recalibrate, we still haven't
+ * reached track 0. Probably no drive. Raise an
+ * error, as failing immediately might upset
+ * computers possessed by the Devil :-) */
+ cont->error();
+ cont->redo();
+ return;
+ case NEED_2_RECAL:
+ debugt(__func__, "need 2 recal");
+ /* If we already did a recalibrate,
+ * and we are not at track 0, this
+ * means we have moved. (The only way
+ * not to move at recalibration is to
+ * be already at track 0.) Clear the
+ * new change flag */
+ debug_dcl(DP->flags,
+ "clearing NEWCHANGE flag because of second recalibrate\n");
+
+ clear_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags);
+ DRS->select_date = jiffies;
+ /* fall through */
+ default:
+ debugt(__func__, "default");
+ /* Recalibrate moves the head by at
+ * most 80 steps. If after one
+ * recalibrate we don't have reached
+ * track 0, this might mean that we
+ * started beyond track 80. Try
+ * again. */
+ DRS->track = NEED_1_RECAL;
+ break;
+ }
+ } else
+ DRS->track = ST1;
+ floppy_ready();
+}
+
+static void print_result(char *message, int inr)
+{
+ int i;
+
+ DPRINT("%s ", message);
+ if (inr >= 0)
+ for (i = 0; i < inr; i++)
+ pr_cont("repl[%d]=%x ", i, reply_buffer[i]);
+ pr_cont("\n");
+}
+
+/* interrupt handler. Note that this can be called externally on the Sparc */
+irqreturn_t floppy_interrupt(int irq, void *dev_id)
+{
+ int do_print;
+ unsigned long f;
+ void (*handler)(void) = do_floppy;
+
+ lasthandler = handler;
+ interruptjiffies = jiffies;
+
+ f = claim_dma_lock();
+ fd_disable_dma();
+ release_dma_lock(f);
+
+ do_floppy = NULL;
+ if (fdc >= N_FDC || FDCS->address == -1) {
+ /* we don't even know which FDC is the culprit */
+ pr_info("DOR0=%x\n", fdc_state[0].dor);
+ pr_info("floppy interrupt on bizarre fdc %d\n", fdc);
+ pr_info("handler=%pf\n", handler);
+ is_alive(__func__, "bizarre fdc");
+ return IRQ_NONE;
+ }
+
+ FDCS->reset = 0;
+ /* We have to clear the reset flag here, because apparently on boxes
+ * with level triggered interrupts (PS/2, Sparc, ...), it is needed to
+ * emit SENSEI's to clear the interrupt line. And FDCS->reset blocks the
+ * emission of the SENSEI's.
+ * It is OK to emit floppy commands because we are in an interrupt
+ * handler here, and thus we have to fear no interference of other
+ * activity.
+ */
+
+ do_print = !handler && print_unex && initialized;
+
+ inr = result();
+ if (do_print)
+ print_result("unexpected interrupt", inr);
+ if (inr == 0) {
+ int max_sensei = 4;
+ do {
+ output_byte(FD_SENSEI);
+ inr = result();
+ if (do_print)
+ print_result("sensei", inr);
+ max_sensei--;
+ } while ((ST0 & 0x83) != UNIT(current_drive) &&
+ inr == 2 && max_sensei);
+ }
+ if (!handler) {
+ FDCS->reset = 1;
+ return IRQ_NONE;
+ }
+ schedule_bh(handler);
+ is_alive(__func__, "normal interrupt end");
+
+ /* FIXME! Was it really for us? */
+ return IRQ_HANDLED;
+}
+
+static void recalibrate_floppy(void)
+{
+ debugt(__func__, "");
+ do_floppy = recal_interrupt;
+ output_byte(FD_RECALIBRATE);
+ if (output_byte(UNIT(current_drive)) < 0)
+ reset_fdc();
+}
+
+/*
+ * Must do 4 FD_SENSEIs after reset because of ``drive polling''.
+ */
+static void reset_interrupt(void)
+{
+ debugt(__func__, "");
+ result(); /* get the status ready for set_fdc */
+ if (FDCS->reset) {
+ pr_info("reset set in interrupt, calling %pf\n", cont->error);
+ cont->error(); /* a reset just after a reset. BAD! */
+ }
+ cont->redo();
+}
+
+/*
+ * reset is done by pulling bit 2 of DOR low for a while (old FDCs),
+ * or by setting the self clearing bit 7 of STATUS (newer FDCs)
+ */
+static void reset_fdc(void)
+{
+ unsigned long flags;
+
+ do_floppy = reset_interrupt;
+ FDCS->reset = 0;
+ reset_fdc_info(0);
+
+ /* Pseudo-DMA may intercept 'reset finished' interrupt. */
+ /* Irrelevant for systems with true DMA (i386). */
+
+ flags = claim_dma_lock();
+ fd_disable_dma();
+ release_dma_lock(flags);
+
+ if (FDCS->version >= FDC_82072A)
+ fd_outb(0x80 | (FDCS->dtr & 3), FD_STATUS);
+ else {
+ fd_outb(FDCS->dor & ~0x04, FD_DOR);
+ udelay(FD_RESET_DELAY);
+ fd_outb(FDCS->dor, FD_DOR);
+ }
+}
+
+static void show_floppy(void)
+{
+ int i;
+
+ pr_info("\n");
+ pr_info("floppy driver state\n");
+ pr_info("-------------------\n");
+ pr_info("now=%lu last interrupt=%lu diff=%lu last called handler=%pf\n",
+ jiffies, interruptjiffies, jiffies - interruptjiffies,
+ lasthandler);
+
+ pr_info("timeout_message=%s\n", timeout_message);
+ pr_info("last output bytes:\n");
+ for (i = 0; i < OLOGSIZE; i++)
+ pr_info("%2x %2x %lu\n",
+ output_log[(i + output_log_pos) % OLOGSIZE].data,
+ output_log[(i + output_log_pos) % OLOGSIZE].status,
+ output_log[(i + output_log_pos) % OLOGSIZE].jiffies);
+ pr_info("last result at %lu\n", resultjiffies);
+ pr_info("last redo_fd_request at %lu\n", lastredo);
+ print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 16, 1,
+ reply_buffer, resultsize, true);
+
+ pr_info("status=%x\n", fd_inb(FD_STATUS));
+ pr_info("fdc_busy=%lu\n", fdc_busy);
+ if (do_floppy)
+ pr_info("do_floppy=%pf\n", do_floppy);
+ if (work_pending(&floppy_work))
+ pr_info("floppy_work.func=%pf\n", floppy_work.func);
+ if (delayed_work_pending(&fd_timer))
+ pr_info("delayed work.function=%p expires=%ld\n",
+ fd_timer.work.func,
+ fd_timer.timer.expires - jiffies);
+ if (delayed_work_pending(&fd_timeout))
+ pr_info("timer_function=%p expires=%ld\n",
+ fd_timeout.work.func,
+ fd_timeout.timer.expires - jiffies);
+
+ pr_info("cont=%p\n", cont);
+ pr_info("current_req=%p\n", current_req);
+ pr_info("command_status=%d\n", command_status);
+ pr_info("\n");
+}
+
+static void floppy_shutdown(struct work_struct *arg)
+{
+ unsigned long flags;
+
+ if (initialized)
+ show_floppy();
+ cancel_activity();
+
+ flags = claim_dma_lock();
+ fd_disable_dma();
+ release_dma_lock(flags);
+
+ /* avoid dma going to a random drive after shutdown */
+
+ if (initialized)
+ DPRINT("floppy timeout called\n");
+ FDCS->reset = 1;
+ if (cont) {
+ cont->done(0);
+ cont->redo(); /* this will recall reset when needed */
+ } else {
+ pr_info("no cont in shutdown!\n");
+ process_fd_request();
+ }
+ is_alive(__func__, "");
+}
+
+/* start motor, check media-changed condition and write protection */
+static int start_motor(void (*function)(void))
+{
+ int mask;
+ int data;
+
+ mask = 0xfc;
+ data = UNIT(current_drive);
+ if (!(raw_cmd->flags & FD_RAW_NO_MOTOR)) {
+ if (!(FDCS->dor & (0x10 << UNIT(current_drive)))) {
+ set_debugt();
+ /* no read since this drive is running */
+ DRS->first_read_date = 0;
+ /* note motor start time if motor is not yet running */
+ DRS->spinup_date = jiffies;
+ data |= (0x10 << UNIT(current_drive));
+ }
+ } else if (FDCS->dor & (0x10 << UNIT(current_drive)))
+ mask &= ~(0x10 << UNIT(current_drive));
+
+ /* starts motor and selects floppy */
+ del_timer(motor_off_timer + current_drive);
+ set_dor(fdc, mask, data);
+
+ /* wait_for_completion also schedules reset if needed. */
+ return fd_wait_for_completion(DRS->select_date + DP->select_delay,
+ function);
+}
+
+static void floppy_ready(void)
+{
+ if (FDCS->reset) {
+ reset_fdc();
+ return;
+ }
+ if (start_motor(floppy_ready))
+ return;
+ if (fdc_dtr())
+ return;
+
+ debug_dcl(DP->flags, "calling disk change from floppy_ready\n");
+ if (!(raw_cmd->flags & FD_RAW_NO_MOTOR) &&
+ disk_change(current_drive) && !DP->select_delay)
+ twaddle(); /* this clears the dcl on certain
+ * drive/controller combinations */
+
+#ifdef fd_chose_dma_mode
+ if ((raw_cmd->flags & FD_RAW_READ) || (raw_cmd->flags & FD_RAW_WRITE)) {
+ unsigned long flags = claim_dma_lock();
+ fd_chose_dma_mode(raw_cmd->kernel_data, raw_cmd->length);
+ release_dma_lock(flags);
+ }
+#endif
+
+ if (raw_cmd->flags & (FD_RAW_NEED_SEEK | FD_RAW_NEED_DISK)) {
+ perpendicular_mode();
+ fdc_specify(); /* must be done here because of hut, hlt ... */
+ seek_floppy();
+ } else {
+ if ((raw_cmd->flags & FD_RAW_READ) ||
+ (raw_cmd->flags & FD_RAW_WRITE))
+ fdc_specify();
+ setup_rw_floppy();
+ }
+}
+
+static void floppy_start(void)
+{
+ reschedule_timeout(current_reqD, "floppy start");
+
+ scandrives();
+ debug_dcl(DP->flags, "setting NEWCHANGE in floppy_start\n");
+ set_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags);
+ floppy_ready();
+}
+
+/*
+ * ========================================================================
+ * here ends the bottom half. Exported routines are:
+ * floppy_start, floppy_off, floppy_ready, lock_fdc, unlock_fdc, set_fdc,
+ * start_motor, reset_fdc, reset_fdc_info, interpret_errors.
+ * Initialization also uses output_byte, result, set_dor, floppy_interrupt
+ * and set_dor.
+ * ========================================================================
+ */
+/*
+ * General purpose continuations.
+ * ==============================
+ */
+
+static void do_wakeup(void)
+{
+ reschedule_timeout(MAXTIMEOUT, "do wakeup");
+ cont = NULL;
+ command_status += 2;
+ wake_up(&command_done);
+}
+
+static const struct cont_t wakeup_cont = {
+ .interrupt = empty,
+ .redo = do_wakeup,
+ .error = empty,
+ .done = (done_f)empty
+};
+
+static const struct cont_t intr_cont = {
+ .interrupt = empty,
+ .redo = process_fd_request,
+ .error = empty,
+ .done = (done_f)empty
+};
+
+static int wait_til_done(void (*handler)(void), bool interruptible)
+{
+ int ret;
+
+ schedule_bh(handler);
+
+ if (interruptible)
+ wait_event_interruptible(command_done, command_status >= 2);
+ else
+ wait_event(command_done, command_status >= 2);
+
+ if (command_status < 2) {
+ cancel_activity();
+ cont = &intr_cont;
+ reset_fdc();
+ return -EINTR;
+ }
+
+ if (FDCS->reset)
+ command_status = FD_COMMAND_ERROR;
+ if (command_status == FD_COMMAND_OKAY)
+ ret = 0;
+ else
+ ret = -EIO;
+ command_status = FD_COMMAND_NONE;
+ return ret;
+}
+
+static void generic_done(int result)
+{
+ command_status = result;
+ cont = &wakeup_cont;
+}
+
+static void generic_success(void)
+{
+ cont->done(1);
+}
+
+static void generic_failure(void)
+{
+ cont->done(0);
+}
+
+static void success_and_wakeup(void)
+{
+ generic_success();
+ cont->redo();
+}
+
+/*
+ * formatting and rw support.
+ * ==========================
+ */
+
+static int next_valid_format(void)
+{
+ int probed_format;
+
+ probed_format = DRS->probed_format;
+ while (1) {
+ if (probed_format >= 8 || !DP->autodetect[probed_format]) {
+ DRS->probed_format = 0;
+ return 1;
+ }
+ if (floppy_type[DP->autodetect[probed_format]].sect) {
+ DRS->probed_format = probed_format;
+ return 0;
+ }
+ probed_format++;
+ }
+}
+
+static void bad_flp_intr(void)
+{
+ int err_count;
+
+ if (probing) {
+ DRS->probed_format++;
+ if (!next_valid_format())
+ return;
+ }
+ err_count = ++(*errors);
+ INFBOUND(DRWE->badness, err_count);
+ if (err_count > DP->max_errors.abort)
+ cont->done(0);
+ if (err_count > DP->max_errors.reset)
+ FDCS->reset = 1;
+ else if (err_count > DP->max_errors.recal)
+ DRS->track = NEED_2_RECAL;
+}
+
+static void set_floppy(int drive)
+{
+ int type = ITYPE(UDRS->fd_device);
+
+ if (type)
+ _floppy = floppy_type + type;
+ else
+ _floppy = current_type[drive];
+}
+
+/*
+ * formatting support.
+ * ===================
+ */
+static void format_interrupt(void)
+{
+ switch (interpret_errors()) {
+ case 1:
+ cont->error();
+ case 2:
+ break;
+ case 0:
+ cont->done(1);
+ }
+ cont->redo();
+}
+
+#define FM_MODE(x, y) ((y) & ~(((x)->rate & 0x80) >> 1))
+#define CT(x) ((x) | 0xc0)
+
+static void setup_format_params(int track)
+{
+ int n;
+ int il;
+ int count;
+ int head_shift;
+ int track_shift;
+ struct fparm {
+ unsigned char track, head, sect, size;
+ } *here = (struct fparm *)floppy_track_buffer;
+
+ raw_cmd = &default_raw_cmd;
+ raw_cmd->track = track;
+
+ raw_cmd->flags = (FD_RAW_WRITE | FD_RAW_INTR | FD_RAW_SPIN |
+ FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK);
+ raw_cmd->rate = _floppy->rate & 0x43;
+ raw_cmd->cmd_count = NR_F;
+ COMMAND = FM_MODE(_floppy, FD_FORMAT);
+ DR_SELECT = UNIT(current_drive) + PH_HEAD(_floppy, format_req.head);
+ F_SIZECODE = FD_SIZECODE(_floppy);
+ F_SECT_PER_TRACK = _floppy->sect << 2 >> F_SIZECODE;
+ F_GAP = _floppy->fmt_gap;
+ F_FILL = FD_FILL_BYTE;
+
+ raw_cmd->kernel_data = floppy_track_buffer;
+ raw_cmd->length = 4 * F_SECT_PER_TRACK;
+
+ /* allow for about 30ms for data transport per track */
+ head_shift = (F_SECT_PER_TRACK + 5) / 6;
+
+ /* a ``cylinder'' is two tracks plus a little stepping time */
+ track_shift = 2 * head_shift + 3;
+
+ /* position of logical sector 1 on this track */
+ n = (track_shift * format_req.track + head_shift * format_req.head)
+ % F_SECT_PER_TRACK;
+
+ /* determine interleave */
+ il = 1;
+ if (_floppy->fmt_gap < 0x22)
+ il++;
+
+ /* initialize field */
+ for (count = 0; count < F_SECT_PER_TRACK; ++count) {
+ here[count].track = format_req.track;
+ here[count].head = format_req.head;
+ here[count].sect = 0;
+ here[count].size = F_SIZECODE;
+ }
+ /* place logical sectors */
+ for (count = 1; count <= F_SECT_PER_TRACK; ++count) {
+ here[n].sect = count;
+ n = (n + il) % F_SECT_PER_TRACK;
+ if (here[n].sect) { /* sector busy, find next free sector */
+ ++n;
+ if (n >= F_SECT_PER_TRACK) {
+ n -= F_SECT_PER_TRACK;
+ while (here[n].sect)
+ ++n;
+ }
+ }
+ }
+ if (_floppy->stretch & FD_SECTBASEMASK) {
+ for (count = 0; count < F_SECT_PER_TRACK; count++)
+ here[count].sect += FD_SECTBASE(_floppy) - 1;
+ }
+}
+
+static void redo_format(void)
+{
+ buffer_track = -1;
+ setup_format_params(format_req.track << STRETCH(_floppy));
+ floppy_start();
+ debugt(__func__, "queue format request");
+}
+
+static const struct cont_t format_cont = {
+ .interrupt = format_interrupt,
+ .redo = redo_format,
+ .error = bad_flp_intr,
+ .done = generic_done
+};
+
+static int do_format(int drive, struct format_descr *tmp_format_req)
+{
+ int ret;
+
+ if (lock_fdc(drive, true))
+ return -EINTR;
+
+ set_floppy(drive);
+ if (!_floppy ||
+ _floppy->track > DP->tracks ||
+ tmp_format_req->track >= _floppy->track ||
+ tmp_format_req->head >= _floppy->head ||
+ (_floppy->sect << 2) % (1 << FD_SIZECODE(_floppy)) ||
+ !_floppy->fmt_gap) {
+ process_fd_request();
+ return -EINVAL;
+ }
+ format_req = *tmp_format_req;
+ format_errors = 0;
+ cont = &format_cont;
+ errors = &format_errors;
+ ret = wait_til_done(redo_format, true);
+ if (ret == -EINTR)
+ return -EINTR;
+ process_fd_request();
+ return ret;
+}
+
+/*
+ * Buffer read/write and support
+ * =============================
+ */
+
+static void floppy_end_request(struct request *req, int error)
+{
+ unsigned int nr_sectors = current_count_sectors;
+ unsigned int drive = (unsigned long)req->rq_disk->private_data;
+
+ /* current_count_sectors can be zero if transfer failed */
+ if (error)
+ nr_sectors = blk_rq_cur_sectors(req);
+ if (__blk_end_request(req, error, nr_sectors << 9))
+ return;
+
+ /* We're done with the request */
+ floppy_off(drive);
+ current_req = NULL;
+}
+
+/* new request_done. Can handle physical sectors which are smaller than a
+ * logical buffer */
+static void request_done(int uptodate)
+{
+ struct request *req = current_req;
+ struct request_queue *q;
+ unsigned long flags;
+ int block;
+ char msg[sizeof("request done ") + sizeof(int) * 3];
+
+ probing = 0;
+ snprintf(msg, sizeof(msg), "request done %d", uptodate);
+ reschedule_timeout(MAXTIMEOUT, msg);
+
+ if (!req) {
+ pr_info("floppy.c: no request in request_done\n");
+ return;
+ }
+
+ q = req->q;
+
+ if (uptodate) {
+ /* maintain values for invalidation on geometry
+ * change */
+ block = current_count_sectors + blk_rq_pos(req);
+ INFBOUND(DRS->maxblock, block);
+ if (block > _floppy->sect)
+ DRS->maxtrack = 1;
+
+ /* unlock chained buffers */
+ spin_lock_irqsave(q->queue_lock, flags);
+ floppy_end_request(req, 0);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ } else {
+ if (rq_data_dir(req) == WRITE) {
+ /* record write error information */
+ DRWE->write_errors++;
+ if (DRWE->write_errors == 1) {
+ DRWE->first_error_sector = blk_rq_pos(req);
+ DRWE->first_error_generation = DRS->generation;
+ }
+ DRWE->last_error_sector = blk_rq_pos(req);
+ DRWE->last_error_generation = DRS->generation;
+ }
+ spin_lock_irqsave(q->queue_lock, flags);
+ floppy_end_request(req, -EIO);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ }
+}
+
+/* Interrupt handler evaluating the result of the r/w operation */
+static void rw_interrupt(void)
+{
+ int eoc;
+ int ssize;
+ int heads;
+ int nr_sectors;
+
+ if (R_HEAD >= 2) {
+ /* some Toshiba floppy controllers occasionnally seem to
+ * return bogus interrupts after read/write operations, which
+ * can be recognized by a bad head number (>= 2) */
+ return;
+ }
+
+ if (!DRS->first_read_date)
+ DRS->first_read_date = jiffies;
+
+ nr_sectors = 0;
+ ssize = DIV_ROUND_UP(1 << SIZECODE, 4);
+
+ if (ST1 & ST1_EOC)
+ eoc = 1;
+ else
+ eoc = 0;
+
+ if (COMMAND & 0x80)
+ heads = 2;
+ else
+ heads = 1;
+
+ nr_sectors = (((R_TRACK - TRACK) * heads +
+ R_HEAD - HEAD) * SECT_PER_TRACK +
+ R_SECTOR - SECTOR + eoc) << SIZECODE >> 2;
+
+ if (nr_sectors / ssize >
+ DIV_ROUND_UP(in_sector_offset + current_count_sectors, ssize)) {
+ DPRINT("long rw: %x instead of %lx\n",
+ nr_sectors, current_count_sectors);
+ pr_info("rs=%d s=%d\n", R_SECTOR, SECTOR);
+ pr_info("rh=%d h=%d\n", R_HEAD, HEAD);
+ pr_info("rt=%d t=%d\n", R_TRACK, TRACK);
+ pr_info("heads=%d eoc=%d\n", heads, eoc);
+ pr_info("spt=%d st=%d ss=%d\n",
+ SECT_PER_TRACK, fsector_t, ssize);
+ pr_info("in_sector_offset=%d\n", in_sector_offset);
+ }
+
+ nr_sectors -= in_sector_offset;
+ INFBOUND(nr_sectors, 0);
+ SUPBOUND(current_count_sectors, nr_sectors);
+
+ switch (interpret_errors()) {
+ case 2:
+ cont->redo();
+ return;
+ case 1:
+ if (!current_count_sectors) {
+ cont->error();
+ cont->redo();
+ return;
+ }
+ break;
+ case 0:
+ if (!current_count_sectors) {
+ cont->redo();
+ return;
+ }
+ current_type[current_drive] = _floppy;
+ floppy_sizes[TOMINOR(current_drive)] = _floppy->size;
+ break;
+ }
+
+ if (probing) {
+ if (DP->flags & FTD_MSG)
+ DPRINT("Auto-detected floppy type %s in fd%d\n",
+ _floppy->name, current_drive);
+ current_type[current_drive] = _floppy;
+ floppy_sizes[TOMINOR(current_drive)] = _floppy->size;
+ probing = 0;
+ }
+
+ if (CT(COMMAND) != FD_READ ||
+ raw_cmd->kernel_data == bio_data(current_req->bio)) {
+ /* transfer directly from buffer */
+ cont->done(1);
+ } else if (CT(COMMAND) == FD_READ) {
+ buffer_track = raw_cmd->track;
+ buffer_drive = current_drive;
+ INFBOUND(buffer_max, nr_sectors + fsector_t);
+ }
+ cont->redo();
+}
+
+/* Compute maximal contiguous buffer size. */
+static int buffer_chain_size(void)
+{
+ struct bio_vec bv;
+ int size;
+ struct req_iterator iter;
+ char *base;
+
+ base = bio_data(current_req->bio);
+ size = 0;
+
+ rq_for_each_segment(bv, current_req, iter) {
+ if (page_address(bv.bv_page) + bv.bv_offset != base + size)
+ break;
+
+ size += bv.bv_len;
+ }
+
+ return size >> 9;
+}
+
+/* Compute the maximal transfer size */
+static int transfer_size(int ssize, int max_sector, int max_size)
+{
+ SUPBOUND(max_sector, fsector_t + max_size);
+
+ /* alignment */
+ max_sector -= (max_sector % _floppy->sect) % ssize;
+
+ /* transfer size, beginning not aligned */
+ current_count_sectors = max_sector - fsector_t;
+
+ return max_sector;
+}
+
+/*
+ * Move data from/to the track buffer to/from the buffer cache.
+ */
+static void copy_buffer(int ssize, int max_sector, int max_sector_2)
+{
+ int remaining; /* number of transferred 512-byte sectors */
+ struct bio_vec bv;
+ char *buffer;
+ char *dma_buffer;
+ int size;
+ struct req_iterator iter;
+
+ max_sector = transfer_size(ssize,
+ min(max_sector, max_sector_2),
+ blk_rq_sectors(current_req));
+
+ if (current_count_sectors <= 0 && CT(COMMAND) == FD_WRITE &&
+ buffer_max > fsector_t + blk_rq_sectors(current_req))
+ current_count_sectors = min_t(int, buffer_max - fsector_t,
+ blk_rq_sectors(current_req));
+
+ remaining = current_count_sectors << 9;
+ if (remaining > blk_rq_bytes(current_req) && CT(COMMAND) == FD_WRITE) {
+ DPRINT("in copy buffer\n");
+ pr_info("current_count_sectors=%ld\n", current_count_sectors);
+ pr_info("remaining=%d\n", remaining >> 9);
+ pr_info("current_req->nr_sectors=%u\n",
+ blk_rq_sectors(current_req));
+ pr_info("current_req->current_nr_sectors=%u\n",
+ blk_rq_cur_sectors(current_req));
+ pr_info("max_sector=%d\n", max_sector);
+ pr_info("ssize=%d\n", ssize);
+ }
+
+ buffer_max = max(max_sector, buffer_max);
+
+ dma_buffer = floppy_track_buffer + ((fsector_t - buffer_min) << 9);
+
+ size = blk_rq_cur_bytes(current_req);
+
+ rq_for_each_segment(bv, current_req, iter) {
+ if (!remaining)
+ break;
+
+ size = bv.bv_len;
+ SUPBOUND(size, remaining);
+
+ buffer = page_address(bv.bv_page) + bv.bv_offset;
+ if (dma_buffer + size >
+ floppy_track_buffer + (max_buffer_sectors << 10) ||
+ dma_buffer < floppy_track_buffer) {
+ DPRINT("buffer overrun in copy buffer %d\n",
+ (int)((floppy_track_buffer - dma_buffer) >> 9));
+ pr_info("fsector_t=%d buffer_min=%d\n",
+ fsector_t, buffer_min);
+ pr_info("current_count_sectors=%ld\n",
+ current_count_sectors);
+ if (CT(COMMAND) == FD_READ)
+ pr_info("read\n");
+ if (CT(COMMAND) == FD_WRITE)
+ pr_info("write\n");
+ break;
+ }
+ if (((unsigned long)buffer) % 512)
+ DPRINT("%p buffer not aligned\n", buffer);
+
+ if (CT(COMMAND) == FD_READ)
+ memcpy(buffer, dma_buffer, size);
+ else
+ memcpy(dma_buffer, buffer, size);
+
+ remaining -= size;
+ dma_buffer += size;
+ }
+ if (remaining) {
+ if (remaining > 0)
+ max_sector -= remaining >> 9;
+ DPRINT("weirdness: remaining %d\n", remaining >> 9);
+ }
+}
+
+/* work around a bug in pseudo DMA
+ * (on some FDCs) pseudo DMA does not stop when the CPU stops
+ * sending data. Hence we need a different way to signal the
+ * transfer length: We use SECT_PER_TRACK. Unfortunately, this
+ * does not work with MT, hence we can only transfer one head at
+ * a time
+ */
+static void virtualdmabug_workaround(void)
+{
+ int hard_sectors;
+ int end_sector;
+
+ if (CT(COMMAND) == FD_WRITE) {
+ COMMAND &= ~0x80; /* switch off multiple track mode */
+
+ hard_sectors = raw_cmd->length >> (7 + SIZECODE);
+ end_sector = SECTOR + hard_sectors - 1;
+ if (end_sector > SECT_PER_TRACK) {
+ pr_info("too many sectors %d > %d\n",
+ end_sector, SECT_PER_TRACK);
+ return;
+ }
+ SECT_PER_TRACK = end_sector;
+ /* make sure SECT_PER_TRACK
+ * points to end of transfer */
+ }
+}
+
+/*
+ * Formulate a read/write request.
+ * this routine decides where to load the data (directly to buffer, or to
+ * tmp floppy area), how much data to load (the size of the buffer, the whole
+ * track, or a single sector)
+ * All floppy_track_buffer handling goes in here. If we ever add track buffer
+ * allocation on the fly, it should be done here. No other part should need
+ * modification.
+ */
+
+static int make_raw_rw_request(void)
+{
+ int aligned_sector_t;
+ int max_sector;
+ int max_size;
+ int tracksize;
+ int ssize;
+
+ if (WARN(max_buffer_sectors == 0, "VFS: Block I/O scheduled on unopened device\n"))
+ return 0;
+
+ set_fdc((long)current_req->rq_disk->private_data);
+
+ raw_cmd = &default_raw_cmd;
+ raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK;
+ raw_cmd->cmd_count = NR_RW;
+ if (rq_data_dir(current_req) == READ) {
+ raw_cmd->flags |= FD_RAW_READ;
+ COMMAND = FM_MODE(_floppy, FD_READ);
+ } else if (rq_data_dir(current_req) == WRITE) {
+ raw_cmd->flags |= FD_RAW_WRITE;
+ COMMAND = FM_MODE(_floppy, FD_WRITE);
+ } else {
+ DPRINT("%s: unknown command\n", __func__);
+ return 0;
+ }
+
+ max_sector = _floppy->sect * _floppy->head;
+
+ TRACK = (int)blk_rq_pos(current_req) / max_sector;
+ fsector_t = (int)blk_rq_pos(current_req) % max_sector;
+ if (_floppy->track && TRACK >= _floppy->track) {
+ if (blk_rq_cur_sectors(current_req) & 1) {
+ current_count_sectors = 1;
+ return 1;
+ } else
+ return 0;
+ }
+ HEAD = fsector_t / _floppy->sect;
+
+ if (((_floppy->stretch & (FD_SWAPSIDES | FD_SECTBASEMASK)) ||
+ test_bit(FD_NEED_TWADDLE_BIT, &DRS->flags)) &&
+ fsector_t < _floppy->sect)
+ max_sector = _floppy->sect;
+
+ /* 2M disks have phantom sectors on the first track */
+ if ((_floppy->rate & FD_2M) && (!TRACK) && (!HEAD)) {
+ max_sector = 2 * _floppy->sect / 3;
+ if (fsector_t >= max_sector) {
+ current_count_sectors =
+ min_t(int, _floppy->sect - fsector_t,
+ blk_rq_sectors(current_req));
+ return 1;
+ }
+ SIZECODE = 2;
+ } else
+ SIZECODE = FD_SIZECODE(_floppy);
+ raw_cmd->rate = _floppy->rate & 0x43;
+ if ((_floppy->rate & FD_2M) && (TRACK || HEAD) && raw_cmd->rate == 2)
+ raw_cmd->rate = 1;
+
+ if (SIZECODE)
+ SIZECODE2 = 0xff;
+ else
+ SIZECODE2 = 0x80;
+ raw_cmd->track = TRACK << STRETCH(_floppy);
+ DR_SELECT = UNIT(current_drive) + PH_HEAD(_floppy, HEAD);
+ GAP = _floppy->gap;
+ ssize = DIV_ROUND_UP(1 << SIZECODE, 4);
+ SECT_PER_TRACK = _floppy->sect << 2 >> SIZECODE;
+ SECTOR = ((fsector_t % _floppy->sect) << 2 >> SIZECODE) +
+ FD_SECTBASE(_floppy);
+
+ /* tracksize describes the size which can be filled up with sectors
+ * of size ssize.
+ */
+ tracksize = _floppy->sect - _floppy->sect % ssize;
+ if (tracksize < _floppy->sect) {
+ SECT_PER_TRACK++;
+ if (tracksize <= fsector_t % _floppy->sect)
+ SECTOR--;
+
+ /* if we are beyond tracksize, fill up using smaller sectors */
+ while (tracksize <= fsector_t % _floppy->sect) {
+ while (tracksize + ssize > _floppy->sect) {
+ SIZECODE--;
+ ssize >>= 1;
+ }
+ SECTOR++;
+ SECT_PER_TRACK++;
+ tracksize += ssize;
+ }
+ max_sector = HEAD * _floppy->sect + tracksize;
+ } else if (!TRACK && !HEAD && !(_floppy->rate & FD_2M) && probing) {
+ max_sector = _floppy->sect;
+ } else if (!HEAD && CT(COMMAND) == FD_WRITE) {
+ /* for virtual DMA bug workaround */
+ max_sector = _floppy->sect;
+ }
+
+ in_sector_offset = (fsector_t % _floppy->sect) % ssize;
+ aligned_sector_t = fsector_t - in_sector_offset;
+ max_size = blk_rq_sectors(current_req);
+ if ((raw_cmd->track == buffer_track) &&
+ (current_drive == buffer_drive) &&
+ (fsector_t >= buffer_min) && (fsector_t < buffer_max)) {
+ /* data already in track buffer */
+ if (CT(COMMAND) == FD_READ) {
+ copy_buffer(1, max_sector, buffer_max);
+ return 1;
+ }
+ } else if (in_sector_offset || blk_rq_sectors(current_req) < ssize) {
+ if (CT(COMMAND) == FD_WRITE) {
+ unsigned int sectors;
+
+ sectors = fsector_t + blk_rq_sectors(current_req);
+ if (sectors > ssize && sectors < ssize + ssize)
+ max_size = ssize + ssize;
+ else
+ max_size = ssize;
+ }
+ raw_cmd->flags &= ~FD_RAW_WRITE;
+ raw_cmd->flags |= FD_RAW_READ;
+ COMMAND = FM_MODE(_floppy, FD_READ);
+ } else if ((unsigned long)bio_data(current_req->bio) < MAX_DMA_ADDRESS) {
+ unsigned long dma_limit;
+ int direct, indirect;
+
+ indirect =
+ transfer_size(ssize, max_sector,
+ max_buffer_sectors * 2) - fsector_t;
+
+ /*
+ * Do NOT use minimum() here---MAX_DMA_ADDRESS is 64 bits wide
+ * on a 64 bit machine!
+ */
+ max_size = buffer_chain_size();
+ dma_limit = (MAX_DMA_ADDRESS -
+ ((unsigned long)bio_data(current_req->bio))) >> 9;
+ if ((unsigned long)max_size > dma_limit)
+ max_size = dma_limit;
+ /* 64 kb boundaries */
+ if (CROSS_64KB(bio_data(current_req->bio), max_size << 9))
+ max_size = (K_64 -
+ ((unsigned long)bio_data(current_req->bio)) %
+ K_64) >> 9;
+ direct = transfer_size(ssize, max_sector, max_size) - fsector_t;
+ /*
+ * We try to read tracks, but if we get too many errors, we
+ * go back to reading just one sector at a time.
+ *
+ * This means we should be able to read a sector even if there
+ * are other bad sectors on this track.
+ */
+ if (!direct ||
+ (indirect * 2 > direct * 3 &&
+ *errors < DP->max_errors.read_track &&
+ ((!probing ||
+ (DP->read_track & (1 << DRS->probed_format)))))) {
+ max_size = blk_rq_sectors(current_req);
+ } else {
+ raw_cmd->kernel_data = bio_data(current_req->bio);
+ raw_cmd->length = current_count_sectors << 9;
+ if (raw_cmd->length == 0) {
+ DPRINT("%s: zero dma transfer attempted\n", __func__);
+ DPRINT("indirect=%d direct=%d fsector_t=%d\n",
+ indirect, direct, fsector_t);
+ return 0;
+ }
+ virtualdmabug_workaround();
+ return 2;
+ }
+ }
+
+ if (CT(COMMAND) == FD_READ)
+ max_size = max_sector; /* unbounded */
+
+ /* claim buffer track if needed */
+ if (buffer_track != raw_cmd->track || /* bad track */
+ buffer_drive != current_drive || /* bad drive */
+ fsector_t > buffer_max ||
+ fsector_t < buffer_min ||
+ ((CT(COMMAND) == FD_READ ||
+ (!in_sector_offset && blk_rq_sectors(current_req) >= ssize)) &&
+ max_sector > 2 * max_buffer_sectors + buffer_min &&
+ max_size + fsector_t > 2 * max_buffer_sectors + buffer_min)) {
+ /* not enough space */
+ buffer_track = -1;
+ buffer_drive = current_drive;
+ buffer_max = buffer_min = aligned_sector_t;
+ }
+ raw_cmd->kernel_data = floppy_track_buffer +
+ ((aligned_sector_t - buffer_min) << 9);
+
+ if (CT(COMMAND) == FD_WRITE) {
+ /* copy write buffer to track buffer.
+ * if we get here, we know that the write
+ * is either aligned or the data already in the buffer
+ * (buffer will be overwritten) */
+ if (in_sector_offset && buffer_track == -1)
+ DPRINT("internal error offset !=0 on write\n");
+ buffer_track = raw_cmd->track;
+ buffer_drive = current_drive;
+ copy_buffer(ssize, max_sector,
+ 2 * max_buffer_sectors + buffer_min);
+ } else
+ transfer_size(ssize, max_sector,
+ 2 * max_buffer_sectors + buffer_min -
+ aligned_sector_t);
+
+ /* round up current_count_sectors to get dma xfer size */
+ raw_cmd->length = in_sector_offset + current_count_sectors;
+ raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1;
+ raw_cmd->length <<= 9;
+ if ((raw_cmd->length < current_count_sectors << 9) ||
+ (raw_cmd->kernel_data != bio_data(current_req->bio) &&
+ CT(COMMAND) == FD_WRITE &&
+ (aligned_sector_t + (raw_cmd->length >> 9) > buffer_max ||
+ aligned_sector_t < buffer_min)) ||
+ raw_cmd->length % (128 << SIZECODE) ||
+ raw_cmd->length <= 0 || current_count_sectors <= 0) {
+ DPRINT("fractionary current count b=%lx s=%lx\n",
+ raw_cmd->length, current_count_sectors);
+ if (raw_cmd->kernel_data != bio_data(current_req->bio))
+ pr_info("addr=%d, length=%ld\n",
+ (int)((raw_cmd->kernel_data -
+ floppy_track_buffer) >> 9),
+ current_count_sectors);
+ pr_info("st=%d ast=%d mse=%d msi=%d\n",
+ fsector_t, aligned_sector_t, max_sector, max_size);
+ pr_info("ssize=%x SIZECODE=%d\n", ssize, SIZECODE);
+ pr_info("command=%x SECTOR=%d HEAD=%d, TRACK=%d\n",
+ COMMAND, SECTOR, HEAD, TRACK);
+ pr_info("buffer drive=%d\n", buffer_drive);
+ pr_info("buffer track=%d\n", buffer_track);
+ pr_info("buffer_min=%d\n", buffer_min);
+ pr_info("buffer_max=%d\n", buffer_max);
+ return 0;
+ }
+
+ if (raw_cmd->kernel_data != bio_data(current_req->bio)) {
+ if (raw_cmd->kernel_data < floppy_track_buffer ||
+ current_count_sectors < 0 ||
+ raw_cmd->length < 0 ||
+ raw_cmd->kernel_data + raw_cmd->length >
+ floppy_track_buffer + (max_buffer_sectors << 10)) {
+ DPRINT("buffer overrun in schedule dma\n");
+ pr_info("fsector_t=%d buffer_min=%d current_count=%ld\n",
+ fsector_t, buffer_min, raw_cmd->length >> 9);
+ pr_info("current_count_sectors=%ld\n",
+ current_count_sectors);
+ if (CT(COMMAND) == FD_READ)
+ pr_info("read\n");
+ if (CT(COMMAND) == FD_WRITE)
+ pr_info("write\n");
+ return 0;
+ }
+ } else if (raw_cmd->length > blk_rq_bytes(current_req) ||
+ current_count_sectors > blk_rq_sectors(current_req)) {
+ DPRINT("buffer overrun in direct transfer\n");
+ return 0;
+ } else if (raw_cmd->length < current_count_sectors << 9) {
+ DPRINT("more sectors than bytes\n");
+ pr_info("bytes=%ld\n", raw_cmd->length >> 9);
+ pr_info("sectors=%ld\n", current_count_sectors);
+ }
+ if (raw_cmd->length == 0) {
+ DPRINT("zero dma transfer attempted from make_raw_request\n");
+ return 0;
+ }
+
+ virtualdmabug_workaround();
+ return 2;
+}
+
+/*
+ * Round-robin between our available drives, doing one request from each
+ */
+static int set_next_request(void)
+{
+ struct request_queue *q;
+ int old_pos = fdc_queue;
+
+ do {
+ q = disks[fdc_queue]->queue;
+ if (++fdc_queue == N_DRIVE)
+ fdc_queue = 0;
+ if (q) {
+ current_req = blk_fetch_request(q);
+ if (current_req)
+ break;
+ }
+ } while (fdc_queue != old_pos);
+
+ return current_req != NULL;
+}
+
+static void redo_fd_request(void)
+{
+ int drive;
+ int tmp;
+
+ lastredo = jiffies;
+ if (current_drive < N_DRIVE)
+ floppy_off(current_drive);
+
+do_request:
+ if (!current_req) {
+ int pending;
+
+ spin_lock_irq(&floppy_lock);
+ pending = set_next_request();
+ spin_unlock_irq(&floppy_lock);
+ if (!pending) {
+ do_floppy = NULL;
+ unlock_fdc();
+ return;
+ }
+ }
+ drive = (long)current_req->rq_disk->private_data;
+ set_fdc(drive);
+ reschedule_timeout(current_reqD, "redo fd request");
+
+ set_floppy(drive);
+ raw_cmd = &default_raw_cmd;
+ raw_cmd->flags = 0;
+ if (start_motor(redo_fd_request))
+ return;
+
+ disk_change(current_drive);
+ if (test_bit(current_drive, &fake_change) ||
+ test_bit(FD_DISK_CHANGED_BIT, &DRS->flags)) {
+ DPRINT("disk absent or changed during operation\n");
+ request_done(0);
+ goto do_request;
+ }
+ if (!_floppy) { /* Autodetection */
+ if (!probing) {
+ DRS->probed_format = 0;
+ if (next_valid_format()) {
+ DPRINT("no autodetectable formats\n");
+ _floppy = NULL;
+ request_done(0);
+ goto do_request;
+ }
+ }
+ probing = 1;
+ _floppy = floppy_type + DP->autodetect[DRS->probed_format];
+ } else
+ probing = 0;
+ errors = &(current_req->errors);
+ tmp = make_raw_rw_request();
+ if (tmp < 2) {
+ request_done(tmp);
+ goto do_request;
+ }
+
+ if (test_bit(FD_NEED_TWADDLE_BIT, &DRS->flags))
+ twaddle();
+ schedule_bh(floppy_start);
+ debugt(__func__, "queue fd request");
+ return;
+}
+
+static const struct cont_t rw_cont = {
+ .interrupt = rw_interrupt,
+ .redo = redo_fd_request,
+ .error = bad_flp_intr,
+ .done = request_done
+};
+
+static void process_fd_request(void)
+{
+ cont = &rw_cont;
+ schedule_bh(redo_fd_request);
+}
+
+static void do_fd_request(struct request_queue *q)
+{
+ if (WARN(max_buffer_sectors == 0,
+ "VFS: %s called on non-open device\n", __func__))
+ return;
+
+ if (WARN(atomic_read(&usage_count) == 0,
+ "warning: usage count=0, current_req=%p sect=%ld type=%x flags=%llx\n",
+ current_req, (long)blk_rq_pos(current_req), current_req->cmd_type,
+ (unsigned long long) current_req->cmd_flags))
+ return;
+
+ if (test_and_set_bit(0, &fdc_busy)) {
+ /* fdc busy, this new request will be treated when the
+ current one is done */
+ is_alive(__func__, "old request running");
+ return;
+ }
+ command_status = FD_COMMAND_NONE;
+ __reschedule_timeout(MAXTIMEOUT, "fd_request");
+ set_fdc(0);
+ process_fd_request();
+ is_alive(__func__, "");
+}
+
+static const struct cont_t poll_cont = {
+ .interrupt = success_and_wakeup,
+ .redo = floppy_ready,
+ .error = generic_failure,
+ .done = generic_done
+};
+
+static int poll_drive(bool interruptible, int flag)
+{
+ /* no auto-sense, just clear dcl */
+ raw_cmd = &default_raw_cmd;
+ raw_cmd->flags = flag;
+ raw_cmd->track = 0;
+ raw_cmd->cmd_count = 0;
+ cont = &poll_cont;
+ debug_dcl(DP->flags, "setting NEWCHANGE in poll_drive\n");
+ set_bit(FD_DISK_NEWCHANGE_BIT, &DRS->flags);
+
+ return wait_til_done(floppy_ready, interruptible);
+}
+
+/*
+ * User triggered reset
+ * ====================
+ */
+
+static void reset_intr(void)
+{
+ pr_info("weird, reset interrupt called\n");
+}
+
+static const struct cont_t reset_cont = {
+ .interrupt = reset_intr,
+ .redo = success_and_wakeup,
+ .error = generic_failure,
+ .done = generic_done
+};
+
+static int user_reset_fdc(int drive, int arg, bool interruptible)
+{
+ int ret;
+
+ if (lock_fdc(drive, interruptible))
+ return -EINTR;
+
+ if (arg == FD_RESET_ALWAYS)
+ FDCS->reset = 1;
+ if (FDCS->reset) {
+ cont = &reset_cont;
+ ret = wait_til_done(reset_fdc, interruptible);
+ if (ret == -EINTR)
+ return -EINTR;
+ }
+ process_fd_request();
+ return 0;
+}
+
+/*
+ * Misc Ioctl's and support
+ * ========================
+ */
+static inline int fd_copyout(void __user *param, const void *address,
+ unsigned long size)
+{
+ return copy_to_user(param, address, size) ? -EFAULT : 0;
+}
+
+static inline int fd_copyin(void __user *param, void *address,
+ unsigned long size)
+{
+ return copy_from_user(address, param, size) ? -EFAULT : 0;
+}
+
+static const char *drive_name(int type, int drive)
+{
+ struct floppy_struct *floppy;
+
+ if (type)
+ floppy = floppy_type + type;
+ else {
+ if (UDP->native_format)
+ floppy = floppy_type + UDP->native_format;
+ else
+ return "(null)";
+ }
+ if (floppy->name)
+ return floppy->name;
+ else
+ return "(null)";
+}
+
+/* raw commands */
+static void raw_cmd_done(int flag)
+{
+ int i;
+
+ if (!flag) {
+ raw_cmd->flags |= FD_RAW_FAILURE;
+ raw_cmd->flags |= FD_RAW_HARDFAILURE;
+ } else {
+ raw_cmd->reply_count = inr;
+ if (raw_cmd->reply_count > MAX_REPLIES)
+ raw_cmd->reply_count = 0;
+ for (i = 0; i < raw_cmd->reply_count; i++)
+ raw_cmd->reply[i] = reply_buffer[i];
+
+ if (raw_cmd->flags & (FD_RAW_READ | FD_RAW_WRITE)) {
+ unsigned long flags;
+ flags = claim_dma_lock();
+ raw_cmd->length = fd_get_dma_residue();
+ release_dma_lock(flags);
+ }
+
+ if ((raw_cmd->flags & FD_RAW_SOFTFAILURE) &&
+ (!raw_cmd->reply_count || (raw_cmd->reply[0] & 0xc0)))
+ raw_cmd->flags |= FD_RAW_FAILURE;
+
+ if (disk_change(current_drive))
+ raw_cmd->flags |= FD_RAW_DISK_CHANGE;
+ else
+ raw_cmd->flags &= ~FD_RAW_DISK_CHANGE;
+ if (raw_cmd->flags & FD_RAW_NO_MOTOR_AFTER)
+ motor_off_callback(current_drive);
+
+ if (raw_cmd->next &&
+ (!(raw_cmd->flags & FD_RAW_FAILURE) ||
+ !(raw_cmd->flags & FD_RAW_STOP_IF_FAILURE)) &&
+ ((raw_cmd->flags & FD_RAW_FAILURE) ||
+ !(raw_cmd->flags & FD_RAW_STOP_IF_SUCCESS))) {
+ raw_cmd = raw_cmd->next;
+ return;
+ }
+ }
+ generic_done(flag);
+}
+
+static const struct cont_t raw_cmd_cont = {
+ .interrupt = success_and_wakeup,
+ .redo = floppy_start,
+ .error = generic_failure,
+ .done = raw_cmd_done
+};
+
+static int raw_cmd_copyout(int cmd, void __user *param,
+ struct floppy_raw_cmd *ptr)
+{
+ int ret;
+
+ while (ptr) {
+ struct floppy_raw_cmd cmd = *ptr;
+ cmd.next = NULL;
+ cmd.kernel_data = NULL;
+ ret = copy_to_user(param, &cmd, sizeof(cmd));
+ if (ret)
+ return -EFAULT;
+ param += sizeof(struct floppy_raw_cmd);
+ if ((ptr->flags & FD_RAW_READ) && ptr->buffer_length) {
+ if (ptr->length >= 0 &&
+ ptr->length <= ptr->buffer_length) {
+ long length = ptr->buffer_length - ptr->length;
+ ret = fd_copyout(ptr->data, ptr->kernel_data,
+ length);
+ if (ret)
+ return ret;
+ }
+ }
+ ptr = ptr->next;
+ }
+
+ return 0;
+}
+
+static void raw_cmd_free(struct floppy_raw_cmd **ptr)
+{
+ struct floppy_raw_cmd *next;
+ struct floppy_raw_cmd *this;
+
+ this = *ptr;
+ *ptr = NULL;
+ while (this) {
+ if (this->buffer_length) {
+ fd_dma_mem_free((unsigned long)this->kernel_data,
+ this->buffer_length);
+ this->buffer_length = 0;
+ }
+ next = this->next;
+ kfree(this);
+ this = next;
+ }
+}
+
+static int raw_cmd_copyin(int cmd, void __user *param,
+ struct floppy_raw_cmd **rcmd)
+{
+ struct floppy_raw_cmd *ptr;
+ int ret;
+ int i;
+
+ *rcmd = NULL;
+
+loop:
+ ptr = kmalloc(sizeof(struct floppy_raw_cmd), GFP_USER);
+ if (!ptr)
+ return -ENOMEM;
+ *rcmd = ptr;
+ ret = copy_from_user(ptr, param, sizeof(*ptr));
+ ptr->next = NULL;
+ ptr->buffer_length = 0;
+ ptr->kernel_data = NULL;
+ if (ret)
+ return -EFAULT;
+ param += sizeof(struct floppy_raw_cmd);
+ if (ptr->cmd_count > 33)
+ /* the command may now also take up the space
+ * initially intended for the reply & the
+ * reply count. Needed for long 82078 commands
+ * such as RESTORE, which takes ... 17 command
+ * bytes. Murphy's law #137: When you reserve
+ * 16 bytes for a structure, you'll one day
+ * discover that you really need 17...
+ */
+ return -EINVAL;
+
+ for (i = 0; i < 16; i++)
+ ptr->reply[i] = 0;
+ ptr->resultcode = 0;
+
+ if (ptr->flags & (FD_RAW_READ | FD_RAW_WRITE)) {
+ if (ptr->length <= 0)
+ return -EINVAL;
+ ptr->kernel_data = (char *)fd_dma_mem_alloc(ptr->length);
+ fallback_on_nodma_alloc(&ptr->kernel_data, ptr->length);
+ if (!ptr->kernel_data)
+ return -ENOMEM;
+ ptr->buffer_length = ptr->length;
+ }
+ if (ptr->flags & FD_RAW_WRITE) {
+ ret = fd_copyin(ptr->data, ptr->kernel_data, ptr->length);
+ if (ret)
+ return ret;
+ }
+
+ if (ptr->flags & FD_RAW_MORE) {
+ rcmd = &(ptr->next);
+ ptr->rate &= 0x43;
+ goto loop;
+ }
+
+ return 0;
+}
+
+static int raw_cmd_ioctl(int cmd, void __user *param)
+{
+ struct floppy_raw_cmd *my_raw_cmd;
+ int drive;
+ int ret2;
+ int ret;
+
+ if (FDCS->rawcmd <= 1)
+ FDCS->rawcmd = 1;
+ for (drive = 0; drive < N_DRIVE; drive++) {
+ if (FDC(drive) != fdc)
+ continue;
+ if (drive == current_drive) {
+ if (UDRS->fd_ref > 1) {
+ FDCS->rawcmd = 2;
+ break;
+ }
+ } else if (UDRS->fd_ref) {
+ FDCS->rawcmd = 2;
+ break;
+ }
+ }
+
+ if (FDCS->reset)
+ return -EIO;
+
+ ret = raw_cmd_copyin(cmd, param, &my_raw_cmd);
+ if (ret) {
+ raw_cmd_free(&my_raw_cmd);
+ return ret;
+ }
+
+ raw_cmd = my_raw_cmd;
+ cont = &raw_cmd_cont;
+ ret = wait_til_done(floppy_start, true);
+ debug_dcl(DP->flags, "calling disk change from raw_cmd ioctl\n");
+
+ if (ret != -EINTR && FDCS->reset)
+ ret = -EIO;
+
+ DRS->track = NO_TRACK;
+
+ ret2 = raw_cmd_copyout(cmd, param, my_raw_cmd);
+ if (!ret)
+ ret = ret2;
+ raw_cmd_free(&my_raw_cmd);
+ return ret;
+}
+
+static int invalidate_drive(struct block_device *bdev)
+{
+ /* invalidate the buffer track to force a reread */
+ set_bit((long)bdev->bd_disk->private_data, &fake_change);
+ process_fd_request();
+ check_disk_change(bdev);
+ return 0;
+}
+
+static int set_geometry(unsigned int cmd, struct floppy_struct *g,
+ int drive, int type, struct block_device *bdev)
+{
+ int cnt;
+
+ /* sanity checking for parameters. */
+ if (g->sect <= 0 ||
+ g->head <= 0 ||
+ g->track <= 0 || g->track > UDP->tracks >> STRETCH(g) ||
+ /* check if reserved bits are set */
+ (g->stretch & ~(FD_STRETCH | FD_SWAPSIDES | FD_SECTBASEMASK)) != 0)
+ return -EINVAL;
+ if (type) {
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ mutex_lock(&open_lock);
+ if (lock_fdc(drive, true)) {
+ mutex_unlock(&open_lock);
+ return -EINTR;
+ }
+ floppy_type[type] = *g;
+ floppy_type[type].name = "user format";
+ for (cnt = type << 2; cnt < (type << 2) + 4; cnt++)
+ floppy_sizes[cnt] = floppy_sizes[cnt + 0x80] =
+ floppy_type[type].size + 1;
+ process_fd_request();
+ for (cnt = 0; cnt < N_DRIVE; cnt++) {
+ struct block_device *bdev = opened_bdev[cnt];
+ if (!bdev || ITYPE(drive_state[cnt].fd_device) != type)
+ continue;
+ __invalidate_device(bdev, true);
+ }
+ mutex_unlock(&open_lock);
+ } else {
+ int oldStretch;
+
+ if (lock_fdc(drive, true))
+ return -EINTR;
+ if (cmd != FDDEFPRM) {
+ /* notice a disk change immediately, else
+ * we lose our settings immediately*/
+ if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR)
+ return -EINTR;
+ }
+ oldStretch = g->stretch;
+ user_params[drive] = *g;
+ if (buffer_drive == drive)
+ SUPBOUND(buffer_max, user_params[drive].sect);
+ current_type[drive] = &user_params[drive];
+ floppy_sizes[drive] = user_params[drive].size;
+ if (cmd == FDDEFPRM)
+ DRS->keep_data = -1;
+ else
+ DRS->keep_data = 1;
+ /* invalidation. Invalidate only when needed, i.e.
+ * when there are already sectors in the buffer cache
+ * whose number will change. This is useful, because
+ * mtools often changes the geometry of the disk after
+ * looking at the boot block */
+ if (DRS->maxblock > user_params[drive].sect ||
+ DRS->maxtrack ||
+ ((user_params[drive].sect ^ oldStretch) &
+ (FD_SWAPSIDES | FD_SECTBASEMASK)))
+ invalidate_drive(bdev);
+ else
+ process_fd_request();
+ }
+ return 0;
+}
+
+/* handle obsolete ioctl's */
+static unsigned int ioctl_table[] = {
+ FDCLRPRM,
+ FDSETPRM,
+ FDDEFPRM,
+ FDGETPRM,
+ FDMSGON,
+ FDMSGOFF,
+ FDFMTBEG,
+ FDFMTTRK,
+ FDFMTEND,
+ FDSETEMSGTRESH,
+ FDFLUSH,
+ FDSETMAXERRS,
+ FDGETMAXERRS,
+ FDGETDRVTYP,
+ FDSETDRVPRM,
+ FDGETDRVPRM,
+ FDGETDRVSTAT,
+ FDPOLLDRVSTAT,
+ FDRESET,
+ FDGETFDCSTAT,
+ FDWERRORCLR,
+ FDWERRORGET,
+ FDRAWCMD,
+ FDEJECT,
+ FDTWADDLE
+};
+
+static int normalize_ioctl(unsigned int *cmd, int *size)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ioctl_table); i++) {
+ if ((*cmd & 0xffff) == (ioctl_table[i] & 0xffff)) {
+ *size = _IOC_SIZE(*cmd);
+ *cmd = ioctl_table[i];
+ if (*size > _IOC_SIZE(*cmd)) {
+ pr_info("ioctl not yet supported\n");
+ return -EFAULT;
+ }
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+static int get_floppy_geometry(int drive, int type, struct floppy_struct **g)
+{
+ if (type)
+ *g = &floppy_type[type];
+ else {
+ if (lock_fdc(drive, false))
+ return -EINTR;
+ if (poll_drive(false, 0) == -EINTR)
+ return -EINTR;
+ process_fd_request();
+ *g = current_type[drive];
+ }
+ if (!*g)
+ return -ENODEV;
+ return 0;
+}
+
+static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ int drive = (long)bdev->bd_disk->private_data;
+ int type = ITYPE(drive_state[drive].fd_device);
+ struct floppy_struct *g;
+ int ret;
+
+ ret = get_floppy_geometry(drive, type, &g);
+ if (ret)
+ return ret;
+
+ geo->heads = g->head;
+ geo->sectors = g->sect;
+ geo->cylinders = g->track;
+ return 0;
+}
+
+static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
+ unsigned long param)
+{
+ int drive = (long)bdev->bd_disk->private_data;
+ int type = ITYPE(UDRS->fd_device);
+ int i;
+ int ret;
+ int size;
+ union inparam {
+ struct floppy_struct g; /* geometry */
+ struct format_descr f;
+ struct floppy_max_errors max_errors;
+ struct floppy_drive_params dp;
+ } inparam; /* parameters coming from user space */
+ const void *outparam; /* parameters passed back to user space */
+
+ /* convert compatibility eject ioctls into floppy eject ioctl.
+ * We do this in order to provide a means to eject floppy disks before
+ * installing the new fdutils package */
+ if (cmd == CDROMEJECT || /* CD-ROM eject */
+ cmd == 0x6470) { /* SunOS floppy eject */
+ DPRINT("obsolete eject ioctl\n");
+ DPRINT("please use floppycontrol --eject\n");
+ cmd = FDEJECT;
+ }
+
+ if (!((cmd & 0xff00) == 0x0200))
+ return -EINVAL;
+
+ /* convert the old style command into a new style command */
+ ret = normalize_ioctl(&cmd, &size);
+ if (ret)
+ return ret;
+
+ /* permission checks */
+ if (((cmd & 0x40) && !(mode & (FMODE_WRITE | FMODE_WRITE_IOCTL))) ||
+ ((cmd & 0x80) && !capable(CAP_SYS_ADMIN)))
+ return -EPERM;
+
+ if (WARN_ON(size < 0 || size > sizeof(inparam)))
+ return -EINVAL;
+
+ /* copyin */
+ memset(&inparam, 0, sizeof(inparam));
+ if (_IOC_DIR(cmd) & _IOC_WRITE) {
+ ret = fd_copyin((void __user *)param, &inparam, size);
+ if (ret)
+ return ret;
+ }
+
+ switch (cmd) {
+ case FDEJECT:
+ if (UDRS->fd_ref != 1)
+ /* somebody else has this drive open */
+ return -EBUSY;
+ if (lock_fdc(drive, true))
+ return -EINTR;
+
+ /* do the actual eject. Fails on
+ * non-Sparc architectures */
+ ret = fd_eject(UNIT(drive));
+
+ set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
+ set_bit(FD_VERIFY_BIT, &UDRS->flags);
+ process_fd_request();
+ return ret;
+ case FDCLRPRM:
+ if (lock_fdc(drive, true))
+ return -EINTR;
+ current_type[drive] = NULL;
+ floppy_sizes[drive] = MAX_DISK_SIZE << 1;
+ UDRS->keep_data = 0;
+ return invalidate_drive(bdev);
+ case FDSETPRM:
+ case FDDEFPRM:
+ return set_geometry(cmd, &inparam.g, drive, type, bdev);
+ case FDGETPRM:
+ ret = get_floppy_geometry(drive, type,
+ (struct floppy_struct **)&outparam);
+ if (ret)
+ return ret;
+ break;
+ case FDMSGON:
+ UDP->flags |= FTD_MSG;
+ return 0;
+ case FDMSGOFF:
+ UDP->flags &= ~FTD_MSG;
+ return 0;
+ case FDFMTBEG:
+ if (lock_fdc(drive, true))
+ return -EINTR;
+ if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR)
+ return -EINTR;
+ ret = UDRS->flags;
+ process_fd_request();
+ if (ret & FD_VERIFY)
+ return -ENODEV;
+ if (!(ret & FD_DISK_WRITABLE))
+ return -EROFS;
+ return 0;
+ case FDFMTTRK:
+ if (UDRS->fd_ref != 1)
+ return -EBUSY;
+ return do_format(drive, &inparam.f);
+ case FDFMTEND:
+ case FDFLUSH:
+ if (lock_fdc(drive, true))
+ return -EINTR;
+ return invalidate_drive(bdev);
+ case FDSETEMSGTRESH:
+ UDP->max_errors.reporting = (unsigned short)(param & 0x0f);
+ return 0;
+ case FDGETMAXERRS:
+ outparam = &UDP->max_errors;
+ break;
+ case FDSETMAXERRS:
+ UDP->max_errors = inparam.max_errors;
+ break;
+ case FDGETDRVTYP:
+ outparam = drive_name(type, drive);
+ SUPBOUND(size, strlen((const char *)outparam) + 1);
+ break;
+ case FDSETDRVPRM:
+ *UDP = inparam.dp;
+ break;
+ case FDGETDRVPRM:
+ outparam = UDP;
+ break;
+ case FDPOLLDRVSTAT:
+ if (lock_fdc(drive, true))
+ return -EINTR;
+ if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR)
+ return -EINTR;
+ process_fd_request();
+ /* fall through */
+ case FDGETDRVSTAT:
+ outparam = UDRS;
+ break;
+ case FDRESET:
+ return user_reset_fdc(drive, (int)param, true);
+ case FDGETFDCSTAT:
+ outparam = UFDCS;
+ break;
+ case FDWERRORCLR:
+ memset(UDRWE, 0, sizeof(*UDRWE));
+ return 0;
+ case FDWERRORGET:
+ outparam = UDRWE;
+ break;
+ case FDRAWCMD:
+ if (type)
+ return -EINVAL;
+ if (lock_fdc(drive, true))
+ return -EINTR;
+ set_floppy(drive);
+ i = raw_cmd_ioctl(cmd, (void __user *)param);
+ if (i == -EINTR)
+ return -EINTR;
+ process_fd_request();
+ return i;
+ case FDTWADDLE:
+ if (lock_fdc(drive, true))
+ return -EINTR;
+ twaddle();
+ process_fd_request();
+ return 0;
+ default:
+ return -EINVAL;
+ }
+
+ if (_IOC_DIR(cmd) & _IOC_READ)
+ return fd_copyout((void __user *)param, outparam, size);
+
+ return 0;
+}
+
+static int fd_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long param)
+{
+ int ret;
+
+ mutex_lock(&floppy_mutex);
+ ret = fd_locked_ioctl(bdev, mode, cmd, param);
+ mutex_unlock(&floppy_mutex);
+
+ return ret;
+}
+
+static void __init config_types(void)
+{
+ bool has_drive = false;
+ int drive;
+
+ /* read drive info out of physical CMOS */
+ drive = 0;
+ if (!UDP->cmos)
+ UDP->cmos = FLOPPY0_TYPE;
+ drive = 1;
+ if (!UDP->cmos && FLOPPY1_TYPE)
+ UDP->cmos = FLOPPY1_TYPE;
+
+ /* FIXME: additional physical CMOS drive detection should go here */
+
+ for (drive = 0; drive < N_DRIVE; drive++) {
+ unsigned int type = UDP->cmos;
+ struct floppy_drive_params *params;
+ const char *name = NULL;
+ static char temparea[32];
+
+ if (type < ARRAY_SIZE(default_drive_params)) {
+ params = &default_drive_params[type].params;
+ if (type) {
+ name = default_drive_params[type].name;
+ allowed_drive_mask |= 1 << drive;
+ } else
+ allowed_drive_mask &= ~(1 << drive);
+ } else {
+ params = &default_drive_params[0].params;
+ sprintf(temparea, "unknown type %d (usb?)", type);
+ name = temparea;
+ }
+ if (name) {
+ const char *prepend;
+ if (!has_drive) {
+ prepend = "";
+ has_drive = true;
+ pr_info("Floppy drive(s):");
+ } else {
+ prepend = ",";
+ }
+
+ pr_cont("%s fd%d is %s", prepend, drive, name);
+ }
+ *UDP = *params;
+ }
+
+ if (has_drive)
+ pr_cont("\n");
+}
+
+static void floppy_release(struct gendisk *disk, fmode_t mode)
+{
+ int drive = (long)disk->private_data;
+
+ mutex_lock(&floppy_mutex);
+ mutex_lock(&open_lock);
+ if (!UDRS->fd_ref--) {
+ DPRINT("floppy_release with fd_ref == 0");
+ UDRS->fd_ref = 0;
+ }
+ if (!UDRS->fd_ref)
+ opened_bdev[drive] = NULL;
+ mutex_unlock(&open_lock);
+ mutex_unlock(&floppy_mutex);
+}
+
+/*
+ * floppy_open check for aliasing (/dev/fd0 can be the same as
+ * /dev/PS0 etc), and disallows simultaneous access to the same
+ * drive with different device numbers.
+ */
+static int floppy_open(struct block_device *bdev, fmode_t mode)
+{
+ int drive = (long)bdev->bd_disk->private_data;
+ int old_dev, new_dev;
+ int try;
+ int res = -EBUSY;
+ char *tmp;
+
+ mutex_lock(&floppy_mutex);
+ mutex_lock(&open_lock);
+ old_dev = UDRS->fd_device;
+ if (opened_bdev[drive] && opened_bdev[drive] != bdev)
+ goto out2;
+
+ if (!UDRS->fd_ref && (UDP->flags & FD_BROKEN_DCL)) {
+ set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
+ set_bit(FD_VERIFY_BIT, &UDRS->flags);
+ }
+
+ UDRS->fd_ref++;
+
+ opened_bdev[drive] = bdev;
+
+ res = -ENXIO;
+
+ if (!floppy_track_buffer) {
+ /* if opening an ED drive, reserve a big buffer,
+ * else reserve a small one */
+ if ((UDP->cmos == 6) || (UDP->cmos == 5))
+ try = 64; /* Only 48 actually useful */
+ else
+ try = 32; /* Only 24 actually useful */
+
+ tmp = (char *)fd_dma_mem_alloc(1024 * try);
+ if (!tmp && !floppy_track_buffer) {
+ try >>= 1; /* buffer only one side */
+ INFBOUND(try, 16);
+ tmp = (char *)fd_dma_mem_alloc(1024 * try);
+ }
+ if (!tmp && !floppy_track_buffer)
+ fallback_on_nodma_alloc(&tmp, 2048 * try);
+ if (!tmp && !floppy_track_buffer) {
+ DPRINT("Unable to allocate DMA memory\n");
+ goto out;
+ }
+ if (floppy_track_buffer) {
+ if (tmp)
+ fd_dma_mem_free((unsigned long)tmp, try * 1024);
+ } else {
+ buffer_min = buffer_max = -1;
+ floppy_track_buffer = tmp;
+ max_buffer_sectors = try;
+ }
+ }
+
+ new_dev = MINOR(bdev->bd_dev);
+ UDRS->fd_device = new_dev;
+ set_capacity(disks[drive], floppy_sizes[new_dev]);
+ if (old_dev != -1 && old_dev != new_dev) {
+ if (buffer_drive == drive)
+ buffer_track = -1;
+ }
+
+ if (UFDCS->rawcmd == 1)
+ UFDCS->rawcmd = 2;
+
+ if (!(mode & FMODE_NDELAY)) {
+ if (mode & (FMODE_READ|FMODE_WRITE)) {
+ UDRS->last_checked = 0;
+ clear_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags);
+ check_disk_change(bdev);
+ if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags))
+ goto out;
+ if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags))
+ goto out;
+ }
+ res = -EROFS;
+ if ((mode & FMODE_WRITE) &&
+ !test_bit(FD_DISK_WRITABLE_BIT, &UDRS->flags))
+ goto out;
+ }
+ mutex_unlock(&open_lock);
+ mutex_unlock(&floppy_mutex);
+ return 0;
+out:
+ UDRS->fd_ref--;
+
+ if (!UDRS->fd_ref)
+ opened_bdev[drive] = NULL;
+out2:
+ mutex_unlock(&open_lock);
+ mutex_unlock(&floppy_mutex);
+ return res;
+}
+
+/*
+ * Check if the disk has been changed or if a change has been faked.
+ */
+static unsigned int floppy_check_events(struct gendisk *disk,
+ unsigned int clearing)
+{
+ int drive = (long)disk->private_data;
+
+ if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
+ test_bit(FD_VERIFY_BIT, &UDRS->flags))
+ return DISK_EVENT_MEDIA_CHANGE;
+
+ if (time_after(jiffies, UDRS->last_checked + UDP->checkfreq)) {
+ lock_fdc(drive, false);
+ poll_drive(false, 0);
+ process_fd_request();
+ }
+
+ if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
+ test_bit(FD_VERIFY_BIT, &UDRS->flags) ||
+ test_bit(drive, &fake_change) ||
+ drive_no_geom(drive))
+ return DISK_EVENT_MEDIA_CHANGE;
+ return 0;
+}
+
+/*
+ * This implements "read block 0" for floppy_revalidate().
+ * Needed for format autodetection, checking whether there is
+ * a disk in the drive, and whether that disk is writable.
+ */
+
+struct rb0_cbdata {
+ int drive;
+ struct completion complete;
+};
+
+static void floppy_rb0_cb(struct bio *bio, int err)
+{
+ struct rb0_cbdata *cbdata = (struct rb0_cbdata *)bio->bi_private;
+ int drive = cbdata->drive;
+
+ if (err) {
+ pr_info("floppy: error %d while reading block 0\n", err);
+ set_bit(FD_OPEN_SHOULD_FAIL_BIT, &UDRS->flags);
+ }
+ complete(&cbdata->complete);
+}
+
+static int __floppy_read_block_0(struct block_device *bdev, int drive)
+{
+ struct bio bio;
+ struct bio_vec bio_vec;
+ struct page *page;
+ struct rb0_cbdata cbdata;
+ size_t size;
+
+ page = alloc_page(GFP_NOIO);
+ if (!page) {
+ process_fd_request();
+ return -ENOMEM;
+ }
+
+ size = bdev->bd_block_size;
+ if (!size)
+ size = 1024;
+
+ cbdata.drive = drive;
+
+ bio_init(&bio);
+ bio.bi_io_vec = &bio_vec;
+ bio_vec.bv_page = page;
+ bio_vec.bv_len = size;
+ bio_vec.bv_offset = 0;
+ bio.bi_vcnt = 1;
+ bio.bi_iter.bi_size = size;
+ bio.bi_bdev = bdev;
+ bio.bi_iter.bi_sector = 0;
+ bio.bi_flags |= (1 << BIO_QUIET);
+ bio.bi_private = &cbdata;
+ bio.bi_end_io = floppy_rb0_cb;
+
+ submit_bio(READ, &bio);
+ process_fd_request();
+
+ init_completion(&cbdata.complete);
+ wait_for_completion(&cbdata.complete);
+
+ __free_page(page);
+
+ return 0;
+}
+
+/* revalidate the floppy disk, i.e. trigger format autodetection by reading
+ * the bootblock (block 0). "Autodetection" is also needed to check whether
+ * there is a disk in the drive at all... Thus we also do it for fixed
+ * geometry formats */
+static int floppy_revalidate(struct gendisk *disk)
+{
+ int drive = (long)disk->private_data;
+ int cf;
+ int res = 0;
+
+ if (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
+ test_bit(FD_VERIFY_BIT, &UDRS->flags) ||
+ test_bit(drive, &fake_change) ||
+ drive_no_geom(drive)) {
+ if (WARN(atomic_read(&usage_count) == 0,
+ "VFS: revalidate called on non-open device.\n"))
+ return -EFAULT;
+
+ lock_fdc(drive, false);
+ cf = (test_bit(FD_DISK_CHANGED_BIT, &UDRS->flags) ||
+ test_bit(FD_VERIFY_BIT, &UDRS->flags));
+ if (!(cf || test_bit(drive, &fake_change) || drive_no_geom(drive))) {
+ process_fd_request(); /*already done by another thread */
+ return 0;
+ }
+ UDRS->maxblock = 0;
+ UDRS->maxtrack = 0;
+ if (buffer_drive == drive)
+ buffer_track = -1;
+ clear_bit(drive, &fake_change);
+ clear_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
+ if (cf)
+ UDRS->generation++;
+ if (drive_no_geom(drive)) {
+ /* auto-sensing */
+ res = __floppy_read_block_0(opened_bdev[drive], drive);
+ } else {
+ if (cf)
+ poll_drive(false, FD_RAW_NEED_DISK);
+ process_fd_request();
+ }
+ }
+ set_capacity(disk, floppy_sizes[UDRS->fd_device]);
+ return res;
+}
+
+static const struct block_device_operations floppy_fops = {
+ .owner = THIS_MODULE,
+ .open = floppy_open,
+ .release = floppy_release,
+ .ioctl = fd_ioctl,
+ .getgeo = fd_getgeo,
+ .check_events = floppy_check_events,
+ .revalidate_disk = floppy_revalidate,
+};
+
+/*
+ * Floppy Driver initialization
+ * =============================
+ */
+
+/* Determine the floppy disk controller type */
+/* This routine was written by David C. Niemi */
+static char __init get_fdc_version(void)
+{
+ int r;
+
+ output_byte(FD_DUMPREGS); /* 82072 and better know DUMPREGS */
+ if (FDCS->reset)
+ return FDC_NONE;
+ r = result();
+ if (r <= 0x00)
+ return FDC_NONE; /* No FDC present ??? */
+ if ((r == 1) && (reply_buffer[0] == 0x80)) {
+ pr_info("FDC %d is an 8272A\n", fdc);
+ return FDC_8272A; /* 8272a/765 don't know DUMPREGS */
+ }
+ if (r != 10) {
+ pr_info("FDC %d init: DUMPREGS: unexpected return of %d bytes.\n",
+ fdc, r);
+ return FDC_UNKNOWN;
+ }
+
+ if (!fdc_configure()) {
+ pr_info("FDC %d is an 82072\n", fdc);
+ return FDC_82072; /* 82072 doesn't know CONFIGURE */
+ }
+
+ output_byte(FD_PERPENDICULAR);
+ if (need_more_output() == MORE_OUTPUT) {
+ output_byte(0);
+ } else {
+ pr_info("FDC %d is an 82072A\n", fdc);
+ return FDC_82072A; /* 82072A as found on Sparcs. */
+ }
+
+ output_byte(FD_UNLOCK);
+ r = result();
+ if ((r == 1) && (reply_buffer[0] == 0x80)) {
+ pr_info("FDC %d is a pre-1991 82077\n", fdc);
+ return FDC_82077_ORIG; /* Pre-1991 82077, doesn't know
+ * LOCK/UNLOCK */
+ }
+ if ((r != 1) || (reply_buffer[0] != 0x00)) {
+ pr_info("FDC %d init: UNLOCK: unexpected return of %d bytes.\n",
+ fdc, r);
+ return FDC_UNKNOWN;
+ }
+ output_byte(FD_PARTID);
+ r = result();
+ if (r != 1) {
+ pr_info("FDC %d init: PARTID: unexpected return of %d bytes.\n",
+ fdc, r);
+ return FDC_UNKNOWN;
+ }
+ if (reply_buffer[0] == 0x80) {
+ pr_info("FDC %d is a post-1991 82077\n", fdc);
+ return FDC_82077; /* Revised 82077AA passes all the tests */
+ }
+ switch (reply_buffer[0] >> 5) {
+ case 0x0:
+ /* Either a 82078-1 or a 82078SL running at 5Volt */
+ pr_info("FDC %d is an 82078.\n", fdc);
+ return FDC_82078;
+ case 0x1:
+ pr_info("FDC %d is a 44pin 82078\n", fdc);
+ return FDC_82078;
+ case 0x2:
+ pr_info("FDC %d is a S82078B\n", fdc);
+ return FDC_S82078B;
+ case 0x3:
+ pr_info("FDC %d is a National Semiconductor PC87306\n", fdc);
+ return FDC_87306;
+ default:
+ pr_info("FDC %d init: 82078 variant with unknown PARTID=%d.\n",
+ fdc, reply_buffer[0] >> 5);
+ return FDC_82078_UNKN;
+ }
+} /* get_fdc_version */
+
+/* lilo configuration */
+
+static void __init floppy_set_flags(int *ints, int param, int param2)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(default_drive_params); i++) {
+ if (param)
+ default_drive_params[i].params.flags |= param2;
+ else
+ default_drive_params[i].params.flags &= ~param2;
+ }
+ DPRINT("%s flag 0x%x\n", param2 ? "Setting" : "Clearing", param);
+}
+
+static void __init daring(int *ints, int param, int param2)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(default_drive_params); i++) {
+ if (param) {
+ default_drive_params[i].params.select_delay = 0;
+ default_drive_params[i].params.flags |=
+ FD_SILENT_DCL_CLEAR;
+ } else {
+ default_drive_params[i].params.select_delay =
+ 2 * HZ / 100;
+ default_drive_params[i].params.flags &=
+ ~FD_SILENT_DCL_CLEAR;
+ }
+ }
+ DPRINT("Assuming %s floppy hardware\n", param ? "standard" : "broken");
+}
+
+static void __init set_cmos(int *ints, int dummy, int dummy2)
+{
+ int current_drive = 0;
+
+ if (ints[0] != 2) {
+ DPRINT("wrong number of parameters for CMOS\n");
+ return;
+ }
+ current_drive = ints[1];
+ if (current_drive < 0 || current_drive >= 8) {
+ DPRINT("bad drive for set_cmos\n");
+ return;
+ }
+#if N_FDC > 1
+ if (current_drive >= 4 && !FDC2)
+ FDC2 = 0x370;
+#endif
+ DP->cmos = ints[2];
+ DPRINT("setting CMOS code to %d\n", ints[2]);
+}
+
+static struct param_table {
+ const char *name;
+ void (*fn) (int *ints, int param, int param2);
+ int *var;
+ int def_param;
+ int param2;
+} config_params[] __initdata = {
+ {"allowed_drive_mask", NULL, &allowed_drive_mask, 0xff, 0}, /* obsolete */
+ {"all_drives", NULL, &allowed_drive_mask, 0xff, 0}, /* obsolete */
+ {"asus_pci", NULL, &allowed_drive_mask, 0x33, 0},
+ {"irq", NULL, &FLOPPY_IRQ, 6, 0},
+ {"dma", NULL, &FLOPPY_DMA, 2, 0},
+ {"daring", daring, NULL, 1, 0},
+#if N_FDC > 1
+ {"two_fdc", NULL, &FDC2, 0x370, 0},
+ {"one_fdc", NULL, &FDC2, 0, 0},
+#endif
+ {"thinkpad", floppy_set_flags, NULL, 1, FD_INVERTED_DCL},
+ {"broken_dcl", floppy_set_flags, NULL, 1, FD_BROKEN_DCL},
+ {"messages", floppy_set_flags, NULL, 1, FTD_MSG},
+ {"silent_dcl_clear", floppy_set_flags, NULL, 1, FD_SILENT_DCL_CLEAR},
+ {"debug", floppy_set_flags, NULL, 1, FD_DEBUG},
+ {"nodma", NULL, &can_use_virtual_dma, 1, 0},
+ {"omnibook", NULL, &can_use_virtual_dma, 1, 0},
+ {"yesdma", NULL, &can_use_virtual_dma, 0, 0},
+ {"fifo_depth", NULL, &fifo_depth, 0xa, 0},
+ {"nofifo", NULL, &no_fifo, 0x20, 0},
+ {"usefifo", NULL, &no_fifo, 0, 0},
+ {"cmos", set_cmos, NULL, 0, 0},
+ {"slow", NULL, &slow_floppy, 1, 0},
+ {"unexpected_interrupts", NULL, &print_unex, 1, 0},
+ {"no_unexpected_interrupts", NULL, &print_unex, 0, 0},
+ {"L40SX", NULL, &print_unex, 0, 0}
+
+ EXTRA_FLOPPY_PARAMS
+};
+
+static int __init floppy_setup(char *str)
+{
+ int i;
+ int param;
+ int ints[11];
+
+ str = get_options(str, ARRAY_SIZE(ints), ints);
+ if (str) {
+ for (i = 0; i < ARRAY_SIZE(config_params); i++) {
+ if (strcmp(str, config_params[i].name) == 0) {
+ if (ints[0])
+ param = ints[1];
+ else
+ param = config_params[i].def_param;
+ if (config_params[i].fn)
+ config_params[i].fn(ints, param,
+ config_params[i].
+ param2);
+ if (config_params[i].var) {
+ DPRINT("%s=%d\n", str, param);
+ *config_params[i].var = param;
+ }
+ return 1;
+ }
+ }
+ }
+ if (str) {
+ DPRINT("unknown floppy option [%s]\n", str);
+
+ DPRINT("allowed options are:");
+ for (i = 0; i < ARRAY_SIZE(config_params); i++)
+ pr_cont(" %s", config_params[i].name);
+ pr_cont("\n");
+ } else
+ DPRINT("botched floppy option\n");
+ DPRINT("Read Documentation/blockdev/floppy.txt\n");
+ return 0;
+}
+
+static int have_no_fdc = -ENODEV;
+
+static ssize_t floppy_cmos_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct platform_device *p = to_platform_device(dev);
+ int drive;
+
+ drive = p->id;
+ return sprintf(buf, "%X\n", UDP->cmos);
+}
+
+static DEVICE_ATTR(cmos, S_IRUGO, floppy_cmos_show, NULL);
+
+static struct attribute *floppy_dev_attrs[] = {
+ &dev_attr_cmos.attr,
+ NULL
+};
+
+ATTRIBUTE_GROUPS(floppy_dev);
+
+static void floppy_device_release(struct device *dev)
+{
+}
+
+static int floppy_resume(struct device *dev)
+{
+ int fdc;
+
+ for (fdc = 0; fdc < N_FDC; fdc++)
+ if (FDCS->address != -1)
+ user_reset_fdc(-1, FD_RESET_ALWAYS, false);
+
+ return 0;
+}
+
+static const struct dev_pm_ops floppy_pm_ops = {
+ .resume = floppy_resume,
+ .restore = floppy_resume,
+};
+
+static struct platform_driver floppy_driver = {
+ .driver = {
+ .name = "floppy",
+ .pm = &floppy_pm_ops,
+ },
+};
+
+static struct platform_device floppy_device[N_DRIVE];
+
+static bool floppy_available(int drive)
+{
+ if (!(allowed_drive_mask & (1 << drive)))
+ return false;
+ if (fdc_state[FDC(drive)].version == FDC_NONE)
+ return false;
+ return true;
+}
+
+static struct kobject *floppy_find(dev_t dev, int *part, void *data)
+{
+ int drive = (*part & 3) | ((*part & 0x80) >> 5);
+ if (drive >= N_DRIVE || !floppy_available(drive))
+ return NULL;
+ if (((*part >> 2) & 0x1f) >= ARRAY_SIZE(floppy_type))
+ return NULL;
+ *part = 0;
+ return get_disk(disks[drive]);
+}
+
+static int __init do_floppy_init(void)
+{
+ int i, unit, drive, err;
+
+ set_debugt();
+ interruptjiffies = resultjiffies = jiffies;
+
+#if defined(CONFIG_PPC)
+ if (check_legacy_ioport(FDC1))
+ return -ENODEV;
+#endif
+
+ raw_cmd = NULL;
+
+ floppy_wq = alloc_ordered_workqueue("floppy", 0);
+ if (!floppy_wq)
+ return -ENOMEM;
+
+ for (drive = 0; drive < N_DRIVE; drive++) {
+ disks[drive] = alloc_disk(1);
+ if (!disks[drive]) {
+ err = -ENOMEM;
+ goto out_put_disk;
+ }
+
+ disks[drive]->queue = blk_init_queue(do_fd_request, &floppy_lock);
+ if (!disks[drive]->queue) {
+ err = -ENOMEM;
+ goto out_put_disk;
+ }
+
+ blk_queue_max_hw_sectors(disks[drive]->queue, 64);
+ disks[drive]->major = FLOPPY_MAJOR;
+ disks[drive]->first_minor = TOMINOR(drive);
+ disks[drive]->fops = &floppy_fops;
+ sprintf(disks[drive]->disk_name, "fd%d", drive);
+
+ init_timer(&motor_off_timer[drive]);
+ motor_off_timer[drive].data = drive;
+ motor_off_timer[drive].function = motor_off_callback;
+ }
+
+ err = register_blkdev(FLOPPY_MAJOR, "fd");
+ if (err)
+ goto out_put_disk;
+
+ err = platform_driver_register(&floppy_driver);
+ if (err)
+ goto out_unreg_blkdev;
+
+ blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE,
+ floppy_find, NULL, NULL);
+
+ for (i = 0; i < 256; i++)
+ if (ITYPE(i))
+ floppy_sizes[i] = floppy_type[ITYPE(i)].size;
+ else
+ floppy_sizes[i] = MAX_DISK_SIZE << 1;
+
+ reschedule_timeout(MAXTIMEOUT, "floppy init");
+ config_types();
+
+ for (i = 0; i < N_FDC; i++) {
+ fdc = i;
+ memset(FDCS, 0, sizeof(*FDCS));
+ FDCS->dtr = -1;
+ FDCS->dor = 0x4;
+#if defined(__sparc__) || defined(__mc68000__)
+ /*sparcs/sun3x don't have a DOR reset which we can fall back on to */
+#ifdef __mc68000__
+ if (MACH_IS_SUN3X)
+#endif
+ FDCS->version = FDC_82072A;
+#endif
+ }
+
+ use_virtual_dma = can_use_virtual_dma & 1;
+ fdc_state[0].address = FDC1;
+ if (fdc_state[0].address == -1) {
+ cancel_delayed_work(&fd_timeout);
+ err = -ENODEV;
+ goto out_unreg_region;
+ }
+#if N_FDC > 1
+ fdc_state[1].address = FDC2;
+#endif
+
+ fdc = 0; /* reset fdc in case of unexpected interrupt */
+ err = floppy_grab_irq_and_dma();
+ if (err) {
+ cancel_delayed_work(&fd_timeout);
+ err = -EBUSY;
+ goto out_unreg_region;
+ }
+
+ /* initialise drive state */
+ for (drive = 0; drive < N_DRIVE; drive++) {
+ memset(UDRS, 0, sizeof(*UDRS));
+ memset(UDRWE, 0, sizeof(*UDRWE));
+ set_bit(FD_DISK_NEWCHANGE_BIT, &UDRS->flags);
+ set_bit(FD_DISK_CHANGED_BIT, &UDRS->flags);
+ set_bit(FD_VERIFY_BIT, &UDRS->flags);
+ UDRS->fd_device = -1;
+ floppy_track_buffer = NULL;
+ max_buffer_sectors = 0;
+ }
+ /*
+ * Small 10 msec delay to let through any interrupt that
+ * initialization might have triggered, to not
+ * confuse detection:
+ */
+ msleep(10);
+
+ for (i = 0; i < N_FDC; i++) {
+ fdc = i;
+ FDCS->driver_version = FD_DRIVER_VERSION;
+ for (unit = 0; unit < 4; unit++)
+ FDCS->track[unit] = 0;
+ if (FDCS->address == -1)
+ continue;
+ FDCS->rawcmd = 2;
+ if (user_reset_fdc(-1, FD_RESET_ALWAYS, false)) {
+ /* free ioports reserved by floppy_grab_irq_and_dma() */
+ floppy_release_regions(fdc);
+ FDCS->address = -1;
+ FDCS->version = FDC_NONE;
+ continue;
+ }
+ /* Try to determine the floppy controller type */
+ FDCS->version = get_fdc_version();
+ if (FDCS->version == FDC_NONE) {
+ /* free ioports reserved by floppy_grab_irq_and_dma() */
+ floppy_release_regions(fdc);
+ FDCS->address = -1;
+ continue;
+ }
+ if (can_use_virtual_dma == 2 && FDCS->version < FDC_82072A)
+ can_use_virtual_dma = 0;
+
+ have_no_fdc = 0;
+ /* Not all FDCs seem to be able to handle the version command
+ * properly, so force a reset for the standard FDC clones,
+ * to avoid interrupt garbage.
+ */
+ user_reset_fdc(-1, FD_RESET_ALWAYS, false);
+ }
+ fdc = 0;
+ cancel_delayed_work(&fd_timeout);
+ current_drive = 0;
+ initialized = true;
+ if (have_no_fdc) {
+ DPRINT("no floppy controllers found\n");
+ err = have_no_fdc;
+ goto out_release_dma;
+ }
+
+ for (drive = 0; drive < N_DRIVE; drive++) {
+ if (!floppy_available(drive))
+ continue;
+
+ floppy_device[drive].name = floppy_device_name;
+ floppy_device[drive].id = drive;
+ floppy_device[drive].dev.release = floppy_device_release;
+ floppy_device[drive].dev.groups = floppy_dev_groups;
+
+ err = platform_device_register(&floppy_device[drive]);
+ if (err)
+ goto out_remove_drives;
+
+ /* to be cleaned up... */
+ disks[drive]->private_data = (void *)(long)drive;
+ disks[drive]->flags |= GENHD_FL_REMOVABLE;
+ disks[drive]->driverfs_dev = &floppy_device[drive].dev;
+ add_disk(disks[drive]);
+ }
+
+ return 0;
+
+out_remove_drives:
+ while (drive--) {
+ if (floppy_available(drive)) {
+ del_gendisk(disks[drive]);
+ platform_device_unregister(&floppy_device[drive]);
+ }
+ }
+out_release_dma:
+ if (atomic_read(&usage_count))
+ floppy_release_irq_and_dma();
+out_unreg_region:
+ blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
+ platform_driver_unregister(&floppy_driver);
+out_unreg_blkdev:
+ unregister_blkdev(FLOPPY_MAJOR, "fd");
+out_put_disk:
+ destroy_workqueue(floppy_wq);
+ for (drive = 0; drive < N_DRIVE; drive++) {
+ if (!disks[drive])
+ break;
+ if (disks[drive]->queue) {
+ del_timer_sync(&motor_off_timer[drive]);
+ blk_cleanup_queue(disks[drive]->queue);
+ disks[drive]->queue = NULL;
+ }
+ put_disk(disks[drive]);
+ }
+ return err;
+}
+
+#ifndef MODULE
+static __init void floppy_async_init(void *data, async_cookie_t cookie)
+{
+ do_floppy_init();
+}
+#endif
+
+static int __init floppy_init(void)
+{
+#ifdef MODULE
+ return do_floppy_init();
+#else
+ /* Don't hold up the bootup by the floppy initialization */
+ async_schedule(floppy_async_init, NULL);
+ return 0;
+#endif
+}
+
+static const struct io_region {
+ int offset;
+ int size;
+} io_regions[] = {
+ { 2, 1 },
+ /* address + 3 is sometimes reserved by pnp bios for motherboard */
+ { 4, 2 },
+ /* address + 6 is reserved, and may be taken by IDE.
+ * Unfortunately, Adaptec doesn't know this :-(, */
+ { 7, 1 },
+};
+
+static void floppy_release_allocated_regions(int fdc, const struct io_region *p)
+{
+ while (p != io_regions) {
+ p--;
+ release_region(FDCS->address + p->offset, p->size);
+ }
+}
+
+#define ARRAY_END(X) (&((X)[ARRAY_SIZE(X)]))
+
+static int floppy_request_regions(int fdc)
+{
+ const struct io_region *p;
+
+ for (p = io_regions; p < ARRAY_END(io_regions); p++) {
+ if (!request_region(FDCS->address + p->offset,
+ p->size, "floppy")) {
+ DPRINT("Floppy io-port 0x%04lx in use\n",
+ FDCS->address + p->offset);
+ floppy_release_allocated_regions(fdc, p);
+ return -EBUSY;
+ }
+ }
+ return 0;
+}
+
+static void floppy_release_regions(int fdc)
+{
+ floppy_release_allocated_regions(fdc, ARRAY_END(io_regions));
+}
+
+static int floppy_grab_irq_and_dma(void)
+{
+ if (atomic_inc_return(&usage_count) > 1)
+ return 0;
+
+ /*
+ * We might have scheduled a free_irq(), wait it to
+ * drain first:
+ */
+ flush_workqueue(floppy_wq);
+
+ if (fd_request_irq()) {
+ DPRINT("Unable to grab IRQ%d for the floppy driver\n",
+ FLOPPY_IRQ);
+ atomic_dec(&usage_count);
+ return -1;
+ }
+ if (fd_request_dma()) {
+ DPRINT("Unable to grab DMA%d for the floppy driver\n",
+ FLOPPY_DMA);
+ if (can_use_virtual_dma & 2)
+ use_virtual_dma = can_use_virtual_dma = 1;
+ if (!(can_use_virtual_dma & 1)) {
+ fd_free_irq();
+ atomic_dec(&usage_count);
+ return -1;
+ }
+ }
+
+ for (fdc = 0; fdc < N_FDC; fdc++) {
+ if (FDCS->address != -1) {
+ if (floppy_request_regions(fdc))
+ goto cleanup;
+ }
+ }
+ for (fdc = 0; fdc < N_FDC; fdc++) {
+ if (FDCS->address != -1) {
+ reset_fdc_info(1);
+ fd_outb(FDCS->dor, FD_DOR);
+ }
+ }
+ fdc = 0;
+ set_dor(0, ~0, 8); /* avoid immediate interrupt */
+
+ for (fdc = 0; fdc < N_FDC; fdc++)
+ if (FDCS->address != -1)
+ fd_outb(FDCS->dor, FD_DOR);
+ /*
+ * The driver will try and free resources and relies on us
+ * to know if they were allocated or not.
+ */
+ fdc = 0;
+ irqdma_allocated = 1;
+ return 0;
+cleanup:
+ fd_free_irq();
+ fd_free_dma();
+ while (--fdc >= 0)
+ floppy_release_regions(fdc);
+ atomic_dec(&usage_count);
+ return -1;
+}
+
+static void floppy_release_irq_and_dma(void)
+{
+ int old_fdc;
+#ifndef __sparc__
+ int drive;
+#endif
+ long tmpsize;
+ unsigned long tmpaddr;
+
+ if (!atomic_dec_and_test(&usage_count))
+ return;
+
+ if (irqdma_allocated) {
+ fd_disable_dma();
+ fd_free_dma();
+ fd_free_irq();
+ irqdma_allocated = 0;
+ }
+ set_dor(0, ~0, 8);
+#if N_FDC > 1
+ set_dor(1, ~8, 0);
+#endif
+
+ if (floppy_track_buffer && max_buffer_sectors) {
+ tmpsize = max_buffer_sectors * 1024;
+ tmpaddr = (unsigned long)floppy_track_buffer;
+ floppy_track_buffer = NULL;
+ max_buffer_sectors = 0;
+ buffer_min = buffer_max = -1;
+ fd_dma_mem_free(tmpaddr, tmpsize);
+ }
+#ifndef __sparc__
+ for (drive = 0; drive < N_FDC * 4; drive++)
+ if (timer_pending(motor_off_timer + drive))
+ pr_info("motor off timer %d still active\n", drive);
+#endif
+
+ if (delayed_work_pending(&fd_timeout))
+ pr_info("floppy timer still active:%s\n", timeout_message);
+ if (delayed_work_pending(&fd_timer))
+ pr_info("auxiliary floppy timer still active\n");
+ if (work_pending(&floppy_work))
+ pr_info("work still pending\n");
+ old_fdc = fdc;
+ for (fdc = 0; fdc < N_FDC; fdc++)
+ if (FDCS->address != -1)
+ floppy_release_regions(fdc);
+ fdc = old_fdc;
+}
+
+#ifdef MODULE
+
+static char *floppy;
+
+static void __init parse_floppy_cfg_string(char *cfg)
+{
+ char *ptr;
+
+ while (*cfg) {
+ ptr = cfg;
+ while (*cfg && *cfg != ' ' && *cfg != '\t')
+ cfg++;
+ if (*cfg) {
+ *cfg = '\0';
+ cfg++;
+ }
+ if (*ptr)
+ floppy_setup(ptr);
+ }
+}
+
+static int __init floppy_module_init(void)
+{
+ if (floppy)
+ parse_floppy_cfg_string(floppy);
+ return floppy_init();
+}
+module_init(floppy_module_init);
+
+static void __exit floppy_module_exit(void)
+{
+ int drive;
+
+ blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
+ unregister_blkdev(FLOPPY_MAJOR, "fd");
+ platform_driver_unregister(&floppy_driver);
+
+ destroy_workqueue(floppy_wq);
+
+ for (drive = 0; drive < N_DRIVE; drive++) {
+ del_timer_sync(&motor_off_timer[drive]);
+
+ if (floppy_available(drive)) {
+ del_gendisk(disks[drive]);
+ platform_device_unregister(&floppy_device[drive]);
+ }
+ blk_cleanup_queue(disks[drive]->queue);
+
+ /*
+ * These disks have not called add_disk(). Don't put down
+ * queue reference in put_disk().
+ */
+ if (!(allowed_drive_mask & (1 << drive)) ||
+ fdc_state[FDC(drive)].version == FDC_NONE)
+ disks[drive]->queue = NULL;
+
+ put_disk(disks[drive]);
+ }
+
+ cancel_delayed_work_sync(&fd_timeout);
+ cancel_delayed_work_sync(&fd_timer);
+
+ if (atomic_read(&usage_count))
+ floppy_release_irq_and_dma();
+
+ /* eject disk, if any */
+ fd_eject(0);
+}
+
+module_exit(floppy_module_exit);
+
+module_param(floppy, charp, 0);
+module_param(FLOPPY_IRQ, int, 0);
+module_param(FLOPPY_DMA, int, 0);
+MODULE_AUTHOR("Alain L. Knaff");
+MODULE_SUPPORTED_DEVICE("fd");
+MODULE_LICENSE("GPL");
+
+/* This doesn't actually get used other than for module information */
+static const struct pnp_device_id floppy_pnpids[] = {
+ {"PNP0700", 0},
+ {}
+};
+
+MODULE_DEVICE_TABLE(pnp, floppy_pnpids);
+
+#else
+
+__setup("floppy=", floppy_setup);
+module_init(floppy_init)
+#endif
+
+MODULE_ALIAS_BLOCKDEV_MAJOR(FLOPPY_MAJOR);
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
new file mode 100644
index 000000000..3abb12182
--- /dev/null
+++ b/drivers/block/hd.c
@@ -0,0 +1,804 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ *
+ * This is the low-level hd interrupt support. It traverses the
+ * request-list, using interrupts to jump between functions. As
+ * all the functions are called within interrupts, we may not
+ * sleep. Special care is recommended.
+ *
+ * modified by Drew Eckhardt to check nr of hd's from the CMOS.
+ *
+ * Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug
+ * in the early extended-partition checks and added DM partitions
+ *
+ * IRQ-unmask, drive-id, multiple-mode, support for ">16 heads",
+ * and general streamlining by Mark Lord.
+ *
+ * Removed 99% of above. Use Mark's ide driver for those options.
+ * This is now a lightweight ST-506 driver. (Paul Gortmaker)
+ *
+ * Modified 1995 Russell King for ARM processor.
+ *
+ * Bugfix: max_sectors must be <= 255 or the wheels tend to come
+ * off in a hurry once you queue things up - Paul G. 02/2001
+ */
+
+/* Uncomment the following if you want verbose error reports. */
+/* #define VERBOSE_ERRORS */
+
+#include <linux/blkdev.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/interrupt.h>
+#include <linux/timer.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/genhd.h>
+#include <linux/string.h>
+#include <linux/ioport.h>
+#include <linux/init.h>
+#include <linux/blkpg.h>
+#include <linux/ata.h>
+#include <linux/hdreg.h>
+
+#define HD_IRQ 14
+
+#define REALLY_SLOW_IO
+#include <asm/io.h>
+#include <asm/uaccess.h>
+
+#ifdef __arm__
+#undef HD_IRQ
+#endif
+#include <asm/irq.h>
+#ifdef __arm__
+#define HD_IRQ IRQ_HARDDISK
+#endif
+
+/* Hd controller regster ports */
+
+#define HD_DATA 0x1f0 /* _CTL when writing */
+#define HD_ERROR 0x1f1 /* see err-bits */
+#define HD_NSECTOR 0x1f2 /* nr of sectors to read/write */
+#define HD_SECTOR 0x1f3 /* starting sector */
+#define HD_LCYL 0x1f4 /* starting cylinder */
+#define HD_HCYL 0x1f5 /* high byte of starting cyl */
+#define HD_CURRENT 0x1f6 /* 101dhhhh , d=drive, hhhh=head */
+#define HD_STATUS 0x1f7 /* see status-bits */
+#define HD_FEATURE HD_ERROR /* same io address, read=error, write=feature */
+#define HD_PRECOMP HD_FEATURE /* obsolete use of this port - predates IDE */
+#define HD_COMMAND HD_STATUS /* same io address, read=status, write=cmd */
+
+#define HD_CMD 0x3f6 /* used for resets */
+#define HD_ALTSTATUS 0x3f6 /* same as HD_STATUS but doesn't clear irq */
+
+/* Bits of HD_STATUS */
+#define ERR_STAT 0x01
+#define INDEX_STAT 0x02
+#define ECC_STAT 0x04 /* Corrected error */
+#define DRQ_STAT 0x08
+#define SEEK_STAT 0x10
+#define SERVICE_STAT SEEK_STAT
+#define WRERR_STAT 0x20
+#define READY_STAT 0x40
+#define BUSY_STAT 0x80
+
+/* Bits for HD_ERROR */
+#define MARK_ERR 0x01 /* Bad address mark */
+#define TRK0_ERR 0x02 /* couldn't find track 0 */
+#define ABRT_ERR 0x04 /* Command aborted */
+#define MCR_ERR 0x08 /* media change request */
+#define ID_ERR 0x10 /* ID field not found */
+#define MC_ERR 0x20 /* media changed */
+#define ECC_ERR 0x40 /* Uncorrectable ECC error */
+#define BBD_ERR 0x80 /* pre-EIDE meaning: block marked bad */
+#define ICRC_ERR 0x80 /* new meaning: CRC error during transfer */
+
+static DEFINE_SPINLOCK(hd_lock);
+static struct request_queue *hd_queue;
+static struct request *hd_req;
+
+#define TIMEOUT_VALUE (6*HZ)
+#define HD_DELAY 0
+
+#define MAX_ERRORS 16 /* Max read/write errors/sector */
+#define RESET_FREQ 8 /* Reset controller every 8th retry */
+#define RECAL_FREQ 4 /* Recalibrate every 4th retry */
+#define MAX_HD 2
+
+#define STAT_OK (READY_STAT|SEEK_STAT)
+#define OK_STATUS(s) (((s)&(STAT_OK|(BUSY_STAT|WRERR_STAT|ERR_STAT)))==STAT_OK)
+
+static void recal_intr(void);
+static void bad_rw_intr(void);
+
+static int reset;
+static int hd_error;
+
+/*
+ * This struct defines the HD's and their types.
+ */
+struct hd_i_struct {
+ unsigned int head, sect, cyl, wpcom, lzone, ctl;
+ int unit;
+ int recalibrate;
+ int special_op;
+};
+
+#ifdef HD_TYPE
+static struct hd_i_struct hd_info[] = { HD_TYPE };
+static int NR_HD = ARRAY_SIZE(hd_info);
+#else
+static struct hd_i_struct hd_info[MAX_HD];
+static int NR_HD;
+#endif
+
+static struct gendisk *hd_gendisk[MAX_HD];
+
+static struct timer_list device_timer;
+
+#define TIMEOUT_VALUE (6*HZ)
+
+#define SET_TIMER \
+ do { \
+ mod_timer(&device_timer, jiffies + TIMEOUT_VALUE); \
+ } while (0)
+
+static void (*do_hd)(void) = NULL;
+#define SET_HANDLER(x) \
+if ((do_hd = (x)) != NULL) \
+ SET_TIMER; \
+else \
+ del_timer(&device_timer);
+
+
+#if (HD_DELAY > 0)
+
+#include <linux/i8253.h>
+
+unsigned long last_req;
+
+unsigned long read_timer(void)
+{
+ unsigned long t, flags;
+ int i;
+
+ raw_spin_lock_irqsave(&i8253_lock, flags);
+ t = jiffies * 11932;
+ outb_p(0, 0x43);
+ i = inb_p(0x40);
+ i |= inb(0x40) << 8;
+ raw_spin_unlock_irqrestore(&i8253_lock, flags);
+ return(t - i);
+}
+#endif
+
+static void __init hd_setup(char *str, int *ints)
+{
+ int hdind = 0;
+
+ if (ints[0] != 3)
+ return;
+ if (hd_info[0].head != 0)
+ hdind = 1;
+ hd_info[hdind].head = ints[2];
+ hd_info[hdind].sect = ints[3];
+ hd_info[hdind].cyl = ints[1];
+ hd_info[hdind].wpcom = 0;
+ hd_info[hdind].lzone = ints[1];
+ hd_info[hdind].ctl = (ints[2] > 8 ? 8 : 0);
+ NR_HD = hdind+1;
+}
+
+static bool hd_end_request(int err, unsigned int bytes)
+{
+ if (__blk_end_request(hd_req, err, bytes))
+ return true;
+ hd_req = NULL;
+ return false;
+}
+
+static bool hd_end_request_cur(int err)
+{
+ return hd_end_request(err, blk_rq_cur_bytes(hd_req));
+}
+
+static void dump_status(const char *msg, unsigned int stat)
+{
+ char *name = "hd?";
+ if (hd_req)
+ name = hd_req->rq_disk->disk_name;
+
+#ifdef VERBOSE_ERRORS
+ printk("%s: %s: status=0x%02x { ", name, msg, stat & 0xff);
+ if (stat & BUSY_STAT) printk("Busy ");
+ if (stat & READY_STAT) printk("DriveReady ");
+ if (stat & WRERR_STAT) printk("WriteFault ");
+ if (stat & SEEK_STAT) printk("SeekComplete ");
+ if (stat & DRQ_STAT) printk("DataRequest ");
+ if (stat & ECC_STAT) printk("CorrectedError ");
+ if (stat & INDEX_STAT) printk("Index ");
+ if (stat & ERR_STAT) printk("Error ");
+ printk("}\n");
+ if ((stat & ERR_STAT) == 0) {
+ hd_error = 0;
+ } else {
+ hd_error = inb(HD_ERROR);
+ printk("%s: %s: error=0x%02x { ", name, msg, hd_error & 0xff);
+ if (hd_error & BBD_ERR) printk("BadSector ");
+ if (hd_error & ECC_ERR) printk("UncorrectableError ");
+ if (hd_error & ID_ERR) printk("SectorIdNotFound ");
+ if (hd_error & ABRT_ERR) printk("DriveStatusError ");
+ if (hd_error & TRK0_ERR) printk("TrackZeroNotFound ");
+ if (hd_error & MARK_ERR) printk("AddrMarkNotFound ");
+ printk("}");
+ if (hd_error & (BBD_ERR|ECC_ERR|ID_ERR|MARK_ERR)) {
+ printk(", CHS=%d/%d/%d", (inb(HD_HCYL)<<8) + inb(HD_LCYL),
+ inb(HD_CURRENT) & 0xf, inb(HD_SECTOR));
+ if (hd_req)
+ printk(", sector=%ld", blk_rq_pos(hd_req));
+ }
+ printk("\n");
+ }
+#else
+ printk("%s: %s: status=0x%02x.\n", name, msg, stat & 0xff);
+ if ((stat & ERR_STAT) == 0) {
+ hd_error = 0;
+ } else {
+ hd_error = inb(HD_ERROR);
+ printk("%s: %s: error=0x%02x.\n", name, msg, hd_error & 0xff);
+ }
+#endif
+}
+
+static void check_status(void)
+{
+ int i = inb_p(HD_STATUS);
+
+ if (!OK_STATUS(i)) {
+ dump_status("check_status", i);
+ bad_rw_intr();
+ }
+}
+
+static int controller_busy(void)
+{
+ int retries = 100000;
+ unsigned char status;
+
+ do {
+ status = inb_p(HD_STATUS);
+ } while ((status & BUSY_STAT) && --retries);
+ return status;
+}
+
+static int status_ok(void)
+{
+ unsigned char status = inb_p(HD_STATUS);
+
+ if (status & BUSY_STAT)
+ return 1; /* Ancient, but does it make sense??? */
+ if (status & WRERR_STAT)
+ return 0;
+ if (!(status & READY_STAT))
+ return 0;
+ if (!(status & SEEK_STAT))
+ return 0;
+ return 1;
+}
+
+static int controller_ready(unsigned int drive, unsigned int head)
+{
+ int retry = 100;
+
+ do {
+ if (controller_busy() & BUSY_STAT)
+ return 0;
+ outb_p(0xA0 | (drive<<4) | head, HD_CURRENT);
+ if (status_ok())
+ return 1;
+ } while (--retry);
+ return 0;
+}
+
+static void hd_out(struct hd_i_struct *disk,
+ unsigned int nsect,
+ unsigned int sect,
+ unsigned int head,
+ unsigned int cyl,
+ unsigned int cmd,
+ void (*intr_addr)(void))
+{
+ unsigned short port;
+
+#if (HD_DELAY > 0)
+ while (read_timer() - last_req < HD_DELAY)
+ /* nothing */;
+#endif
+ if (reset)
+ return;
+ if (!controller_ready(disk->unit, head)) {
+ reset = 1;
+ return;
+ }
+ SET_HANDLER(intr_addr);
+ outb_p(disk->ctl, HD_CMD);
+ port = HD_DATA;
+ outb_p(disk->wpcom >> 2, ++port);
+ outb_p(nsect, ++port);
+ outb_p(sect, ++port);
+ outb_p(cyl, ++port);
+ outb_p(cyl >> 8, ++port);
+ outb_p(0xA0 | (disk->unit << 4) | head, ++port);
+ outb_p(cmd, ++port);
+}
+
+static void hd_request (void);
+
+static int drive_busy(void)
+{
+ unsigned int i;
+ unsigned char c;
+
+ for (i = 0; i < 500000 ; i++) {
+ c = inb_p(HD_STATUS);
+ if ((c & (BUSY_STAT | READY_STAT | SEEK_STAT)) == STAT_OK)
+ return 0;
+ }
+ dump_status("reset timed out", c);
+ return 1;
+}
+
+static void reset_controller(void)
+{
+ int i;
+
+ outb_p(4, HD_CMD);
+ for (i = 0; i < 1000; i++) barrier();
+ outb_p(hd_info[0].ctl & 0x0f, HD_CMD);
+ for (i = 0; i < 1000; i++) barrier();
+ if (drive_busy())
+ printk("hd: controller still busy\n");
+ else if ((hd_error = inb(HD_ERROR)) != 1)
+ printk("hd: controller reset failed: %02x\n", hd_error);
+}
+
+static void reset_hd(void)
+{
+ static int i;
+
+repeat:
+ if (reset) {
+ reset = 0;
+ i = -1;
+ reset_controller();
+ } else {
+ check_status();
+ if (reset)
+ goto repeat;
+ }
+ if (++i < NR_HD) {
+ struct hd_i_struct *disk = &hd_info[i];
+ disk->special_op = disk->recalibrate = 1;
+ hd_out(disk, disk->sect, disk->sect, disk->head-1,
+ disk->cyl, ATA_CMD_INIT_DEV_PARAMS, &reset_hd);
+ if (reset)
+ goto repeat;
+ } else
+ hd_request();
+}
+
+/*
+ * Ok, don't know what to do with the unexpected interrupts: on some machines
+ * doing a reset and a retry seems to result in an eternal loop. Right now I
+ * ignore it, and just set the timeout.
+ *
+ * On laptops (and "green" PCs), an unexpected interrupt occurs whenever the
+ * drive enters "idle", "standby", or "sleep" mode, so if the status looks
+ * "good", we just ignore the interrupt completely.
+ */
+static void unexpected_hd_interrupt(void)
+{
+ unsigned int stat = inb_p(HD_STATUS);
+
+ if (stat & (BUSY_STAT|DRQ_STAT|ECC_STAT|ERR_STAT)) {
+ dump_status("unexpected interrupt", stat);
+ SET_TIMER;
+ }
+}
+
+/*
+ * bad_rw_intr() now tries to be a bit smarter and does things
+ * according to the error returned by the controller.
+ * -Mika Liljeberg (liljeber@cs.Helsinki.FI)
+ */
+static void bad_rw_intr(void)
+{
+ struct request *req = hd_req;
+
+ if (req != NULL) {
+ struct hd_i_struct *disk = req->rq_disk->private_data;
+ if (++req->errors >= MAX_ERRORS || (hd_error & BBD_ERR)) {
+ hd_end_request_cur(-EIO);
+ disk->special_op = disk->recalibrate = 1;
+ } else if (req->errors % RESET_FREQ == 0)
+ reset = 1;
+ else if ((hd_error & TRK0_ERR) || req->errors % RECAL_FREQ == 0)
+ disk->special_op = disk->recalibrate = 1;
+ /* Otherwise just retry */
+ }
+}
+
+static inline int wait_DRQ(void)
+{
+ int retries;
+ int stat;
+
+ for (retries = 0; retries < 100000; retries++) {
+ stat = inb_p(HD_STATUS);
+ if (stat & DRQ_STAT)
+ return 0;
+ }
+ dump_status("wait_DRQ", stat);
+ return -1;
+}
+
+static void read_intr(void)
+{
+ struct request *req;
+ int i, retries = 100000;
+
+ do {
+ i = (unsigned) inb_p(HD_STATUS);
+ if (i & BUSY_STAT)
+ continue;
+ if (!OK_STATUS(i))
+ break;
+ if (i & DRQ_STAT)
+ goto ok_to_read;
+ } while (--retries > 0);
+ dump_status("read_intr", i);
+ bad_rw_intr();
+ hd_request();
+ return;
+
+ok_to_read:
+ req = hd_req;
+ insw(HD_DATA, bio_data(req->bio), 256);
+#ifdef DEBUG
+ printk("%s: read: sector %ld, remaining = %u, buffer=%p\n",
+ req->rq_disk->disk_name, blk_rq_pos(req) + 1,
+ blk_rq_sectors(req) - 1, bio_data(req->bio)+512);
+#endif
+ if (hd_end_request(0, 512)) {
+ SET_HANDLER(&read_intr);
+ return;
+ }
+
+ (void) inb_p(HD_STATUS);
+#if (HD_DELAY > 0)
+ last_req = read_timer();
+#endif
+ hd_request();
+}
+
+static void write_intr(void)
+{
+ struct request *req = hd_req;
+ int i;
+ int retries = 100000;
+
+ do {
+ i = (unsigned) inb_p(HD_STATUS);
+ if (i & BUSY_STAT)
+ continue;
+ if (!OK_STATUS(i))
+ break;
+ if ((blk_rq_sectors(req) <= 1) || (i & DRQ_STAT))
+ goto ok_to_write;
+ } while (--retries > 0);
+ dump_status("write_intr", i);
+ bad_rw_intr();
+ hd_request();
+ return;
+
+ok_to_write:
+ if (hd_end_request(0, 512)) {
+ SET_HANDLER(&write_intr);
+ outsw(HD_DATA, bio_data(req->bio), 256);
+ return;
+ }
+
+#if (HD_DELAY > 0)
+ last_req = read_timer();
+#endif
+ hd_request();
+}
+
+static void recal_intr(void)
+{
+ check_status();
+#if (HD_DELAY > 0)
+ last_req = read_timer();
+#endif
+ hd_request();
+}
+
+/*
+ * This is another of the error-routines I don't know what to do with. The
+ * best idea seems to just set reset, and start all over again.
+ */
+static void hd_times_out(unsigned long dummy)
+{
+ char *name;
+
+ do_hd = NULL;
+
+ if (!hd_req)
+ return;
+
+ spin_lock_irq(hd_queue->queue_lock);
+ reset = 1;
+ name = hd_req->rq_disk->disk_name;
+ printk("%s: timeout\n", name);
+ if (++hd_req->errors >= MAX_ERRORS) {
+#ifdef DEBUG
+ printk("%s: too many errors\n", name);
+#endif
+ hd_end_request_cur(-EIO);
+ }
+ hd_request();
+ spin_unlock_irq(hd_queue->queue_lock);
+}
+
+static int do_special_op(struct hd_i_struct *disk, struct request *req)
+{
+ if (disk->recalibrate) {
+ disk->recalibrate = 0;
+ hd_out(disk, disk->sect, 0, 0, 0, ATA_CMD_RESTORE, &recal_intr);
+ return reset;
+ }
+ if (disk->head > 16) {
+ printk("%s: cannot handle device with more than 16 heads - giving up\n", req->rq_disk->disk_name);
+ hd_end_request_cur(-EIO);
+ }
+ disk->special_op = 0;
+ return 1;
+}
+
+/*
+ * The driver enables interrupts as much as possible. In order to do this,
+ * (a) the device-interrupt is disabled before entering hd_request(),
+ * and (b) the timeout-interrupt is disabled before the sti().
+ *
+ * Interrupts are still masked (by default) whenever we are exchanging
+ * data/cmds with a drive, because some drives seem to have very poor
+ * tolerance for latency during I/O. The IDE driver has support to unmask
+ * interrupts for non-broken hardware, so use that driver if required.
+ */
+static void hd_request(void)
+{
+ unsigned int block, nsect, sec, track, head, cyl;
+ struct hd_i_struct *disk;
+ struct request *req;
+
+ if (do_hd)
+ return;
+repeat:
+ del_timer(&device_timer);
+
+ if (!hd_req) {
+ hd_req = blk_fetch_request(hd_queue);
+ if (!hd_req) {
+ do_hd = NULL;
+ return;
+ }
+ }
+ req = hd_req;
+
+ if (reset) {
+ reset_hd();
+ return;
+ }
+ disk = req->rq_disk->private_data;
+ block = blk_rq_pos(req);
+ nsect = blk_rq_sectors(req);
+ if (block >= get_capacity(req->rq_disk) ||
+ ((block+nsect) > get_capacity(req->rq_disk))) {
+ printk("%s: bad access: block=%d, count=%d\n",
+ req->rq_disk->disk_name, block, nsect);
+ hd_end_request_cur(-EIO);
+ goto repeat;
+ }
+
+ if (disk->special_op) {
+ if (do_special_op(disk, req))
+ goto repeat;
+ return;
+ }
+ sec = block % disk->sect + 1;
+ track = block / disk->sect;
+ head = track % disk->head;
+ cyl = track / disk->head;
+#ifdef DEBUG
+ printk("%s: %sing: CHS=%d/%d/%d, sectors=%d, buffer=%p\n",
+ req->rq_disk->disk_name,
+ req_data_dir(req) == READ ? "read" : "writ",
+ cyl, head, sec, nsect, bio_data(req->bio));
+#endif
+ if (req->cmd_type == REQ_TYPE_FS) {
+ switch (rq_data_dir(req)) {
+ case READ:
+ hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_READ,
+ &read_intr);
+ if (reset)
+ goto repeat;
+ break;
+ case WRITE:
+ hd_out(disk, nsect, sec, head, cyl, ATA_CMD_PIO_WRITE,
+ &write_intr);
+ if (reset)
+ goto repeat;
+ if (wait_DRQ()) {
+ bad_rw_intr();
+ goto repeat;
+ }
+ outsw(HD_DATA, bio_data(req->bio), 256);
+ break;
+ default:
+ printk("unknown hd-command\n");
+ hd_end_request_cur(-EIO);
+ break;
+ }
+ }
+}
+
+static void do_hd_request(struct request_queue *q)
+{
+ hd_request();
+}
+
+static int hd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ struct hd_i_struct *disk = bdev->bd_disk->private_data;
+
+ geo->heads = disk->head;
+ geo->sectors = disk->sect;
+ geo->cylinders = disk->cyl;
+ return 0;
+}
+
+/*
+ * Releasing a block device means we sync() it, so that it can safely
+ * be forgotten about...
+ */
+
+static irqreturn_t hd_interrupt(int irq, void *dev_id)
+{
+ void (*handler)(void) = do_hd;
+
+ spin_lock(hd_queue->queue_lock);
+
+ do_hd = NULL;
+ del_timer(&device_timer);
+ if (!handler)
+ handler = unexpected_hd_interrupt;
+ handler();
+
+ spin_unlock(hd_queue->queue_lock);
+
+ return IRQ_HANDLED;
+}
+
+static const struct block_device_operations hd_fops = {
+ .getgeo = hd_getgeo,
+};
+
+static int __init hd_init(void)
+{
+ int drive;
+
+ if (register_blkdev(HD_MAJOR, "hd"))
+ return -1;
+
+ hd_queue = blk_init_queue(do_hd_request, &hd_lock);
+ if (!hd_queue) {
+ unregister_blkdev(HD_MAJOR, "hd");
+ return -ENOMEM;
+ }
+
+ blk_queue_max_hw_sectors(hd_queue, 255);
+ init_timer(&device_timer);
+ device_timer.function = hd_times_out;
+ blk_queue_logical_block_size(hd_queue, 512);
+
+ if (!NR_HD) {
+ /*
+ * We don't know anything about the drive. This means
+ * that you *MUST* specify the drive parameters to the
+ * kernel yourself.
+ *
+ * If we were on an i386, we used to read this info from
+ * the BIOS or CMOS. This doesn't work all that well,
+ * since this assumes that this is a primary or secondary
+ * drive, and if we're using this legacy driver, it's
+ * probably an auxiliary controller added to recover
+ * legacy data off an ST-506 drive. Either way, it's
+ * definitely safest to have the user explicitly specify
+ * the information.
+ */
+ printk("hd: no drives specified - use hd=cyl,head,sectors"
+ " on kernel command line\n");
+ goto out;
+ }
+
+ for (drive = 0 ; drive < NR_HD ; drive++) {
+ struct gendisk *disk = alloc_disk(64);
+ struct hd_i_struct *p = &hd_info[drive];
+ if (!disk)
+ goto Enomem;
+ disk->major = HD_MAJOR;
+ disk->first_minor = drive << 6;
+ disk->fops = &hd_fops;
+ sprintf(disk->disk_name, "hd%c", 'a'+drive);
+ disk->private_data = p;
+ set_capacity(disk, p->head * p->sect * p->cyl);
+ disk->queue = hd_queue;
+ p->unit = drive;
+ hd_gendisk[drive] = disk;
+ printk("%s: %luMB, CHS=%d/%d/%d\n",
+ disk->disk_name, (unsigned long)get_capacity(disk)/2048,
+ p->cyl, p->head, p->sect);
+ }
+
+ if (request_irq(HD_IRQ, hd_interrupt, 0, "hd", NULL)) {
+ printk("hd: unable to get IRQ%d for the hard disk driver\n",
+ HD_IRQ);
+ goto out1;
+ }
+ if (!request_region(HD_DATA, 8, "hd")) {
+ printk(KERN_WARNING "hd: port 0x%x busy\n", HD_DATA);
+ goto out2;
+ }
+ if (!request_region(HD_CMD, 1, "hd(cmd)")) {
+ printk(KERN_WARNING "hd: port 0x%x busy\n", HD_CMD);
+ goto out3;
+ }
+
+ /* Let them fly */
+ for (drive = 0; drive < NR_HD; drive++)
+ add_disk(hd_gendisk[drive]);
+
+ return 0;
+
+out3:
+ release_region(HD_DATA, 8);
+out2:
+ free_irq(HD_IRQ, NULL);
+out1:
+ for (drive = 0; drive < NR_HD; drive++)
+ put_disk(hd_gendisk[drive]);
+ NR_HD = 0;
+out:
+ del_timer(&device_timer);
+ unregister_blkdev(HD_MAJOR, "hd");
+ blk_cleanup_queue(hd_queue);
+ return -1;
+Enomem:
+ while (drive--)
+ put_disk(hd_gendisk[drive]);
+ goto out;
+}
+
+static int __init parse_hd_setup(char *line)
+{
+ int ints[6];
+
+ (void) get_options(line, ARRAY_SIZE(ints), ints);
+ hd_setup(NULL, ints);
+
+ return 1;
+}
+__setup("hd=", parse_hd_setup);
+
+late_initcall(hd_init);
diff --git a/drivers/block/ida_cmd.h b/drivers/block/ida_cmd.h
new file mode 100644
index 000000000..98b5746b3
--- /dev/null
+++ b/drivers/block/ida_cmd.h
@@ -0,0 +1,349 @@
+/*
+ * Disk Array driver for Compaq SMART2 Controllers
+ * Copyright 1998 Compaq Computer Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Questions/Comments/Bugfixes to iss_storagedev@hp.com
+ *
+ */
+#ifndef ARRAYCMD_H
+#define ARRAYCMD_H
+
+#include <asm/types.h>
+#if 0
+#include <linux/blkdev.h>
+#endif
+
+/* for the Smart Array 42XX cards */
+#define S42XX_REQUEST_PORT_OFFSET 0x40
+#define S42XX_REPLY_INTR_MASK_OFFSET 0x34
+#define S42XX_REPLY_PORT_OFFSET 0x44
+#define S42XX_INTR_STATUS 0x30
+
+#define S42XX_INTR_OFF 0x08
+#define S42XX_INTR_PENDING 0x08
+
+#define COMMAND_FIFO 0x04
+#define COMMAND_COMPLETE_FIFO 0x08
+#define INTR_MASK 0x0C
+#define INTR_STATUS 0x10
+#define INTR_PENDING 0x14
+
+#define FIFO_NOT_EMPTY 0x01
+#define FIFO_NOT_FULL 0x02
+
+#define BIG_PROBLEM 0x40
+#define LOG_NOT_CONF 2
+
+#pragma pack(1)
+typedef struct {
+ __u32 size;
+ __u32 addr;
+} sg_t;
+
+#define RCODE_NONFATAL 0x02
+#define RCODE_FATAL 0x04
+#define RCODE_INVREQ 0x10
+typedef struct {
+ __u16 next;
+ __u8 cmd;
+ __u8 rcode;
+ __u32 blk;
+ __u16 blk_cnt;
+ __u8 sg_cnt;
+ __u8 reserved;
+} rhdr_t;
+
+#define SG_MAX 32
+typedef struct {
+ rhdr_t hdr;
+ sg_t sg[SG_MAX];
+ __u32 bp;
+} rblk_t;
+
+typedef struct {
+ __u8 unit;
+ __u8 prio;
+ __u16 size;
+} chdr_t;
+
+#define CMD_RWREQ 0x00
+#define CMD_IOCTL_PEND 0x01
+#define CMD_IOCTL_DONE 0x02
+
+typedef struct cmdlist {
+ chdr_t hdr;
+ rblk_t req;
+ __u32 size;
+ int retry_cnt;
+ __u32 busaddr;
+ int ctlr;
+ struct cmdlist *prev;
+ struct cmdlist *next;
+ struct request *rq;
+ int type;
+} cmdlist_t;
+
+#define ID_CTLR 0x11
+typedef struct {
+ __u8 nr_drvs;
+ __u32 cfg_sig;
+ __u8 firm_rev[4];
+ __u8 rom_rev[4];
+ __u8 hw_rev;
+ __u32 bb_rev;
+ __u32 drv_present_map;
+ __u32 ext_drv_map;
+ __u32 board_id;
+ __u8 cfg_error;
+ __u32 non_disk_bits;
+ __u8 bad_ram_addr;
+ __u8 cpu_rev;
+ __u8 pdpi_rev;
+ __u8 epic_rev;
+ __u8 wcxc_rev;
+ __u8 marketing_rev;
+ __u8 ctlr_flags;
+ __u8 host_flags;
+ __u8 expand_dis;
+ __u8 scsi_chips;
+ __u32 max_req_blocks;
+ __u32 ctlr_clock;
+ __u8 drvs_per_bus;
+ __u16 big_drv_present_map[8];
+ __u16 big_ext_drv_map[8];
+ __u16 big_non_disk_map[8];
+ __u16 task_flags;
+ __u8 icl_bus;
+ __u8 red_modes;
+ __u8 cur_red_mode;
+ __u8 red_ctlr_stat;
+ __u8 red_fail_reason;
+ __u8 reserved[403];
+} id_ctlr_t;
+
+typedef struct {
+ __u16 cyl;
+ __u8 heads;
+ __u8 xsig;
+ __u8 psectors;
+ __u16 wpre;
+ __u8 maxecc;
+ __u8 drv_ctrl;
+ __u16 pcyls;
+ __u8 pheads;
+ __u16 landz;
+ __u8 sect_per_track;
+ __u8 cksum;
+} drv_param_t;
+
+#define ID_LOG_DRV 0x10
+typedef struct {
+ __u16 blk_size;
+ __u32 nr_blks;
+ drv_param_t drv;
+ __u8 fault_tol;
+ __u8 reserved;
+ __u8 bios_disable;
+} id_log_drv_t;
+
+#define ID_LOG_DRV_EXT 0x18
+typedef struct {
+ __u32 log_drv_id;
+ __u8 log_drv_label[64];
+ __u8 reserved[418];
+} id_log_drv_ext_t;
+
+#define SENSE_LOG_DRV_STAT 0x12
+typedef struct {
+ __u8 status;
+ __u32 fail_map;
+ __u16 read_err[32];
+ __u16 write_err[32];
+ __u8 drv_err_data[256];
+ __u8 drq_timeout[32];
+ __u32 blks_to_recover;
+ __u8 drv_recovering;
+ __u16 remap_cnt[32];
+ __u32 replace_drv_map;
+ __u32 act_spare_map;
+ __u8 spare_stat;
+ __u8 spare_repl_map[32];
+ __u32 repl_ok_map;
+ __u8 media_exch;
+ __u8 cache_fail;
+ __u8 expn_fail;
+ __u8 unit_flags;
+ __u16 big_fail_map[8];
+ __u16 big_remap_map[128];
+ __u16 big_repl_map[8];
+ __u16 big_act_spare_map[8];
+ __u8 big_spar_repl_map[128];
+ __u16 big_repl_ok_map[8];
+ __u8 big_drv_rebuild;
+ __u8 reserved[36];
+} sense_log_drv_stat_t;
+
+#define START_RECOVER 0x13
+
+#define ID_PHYS_DRV 0x15
+typedef struct {
+ __u8 scsi_bus;
+ __u8 scsi_id;
+ __u16 blk_size;
+ __u32 nr_blks;
+ __u32 rsvd_blks;
+ __u8 drv_model[40];
+ __u8 drv_sn[40];
+ __u8 drv_fw[8];
+ __u8 scsi_iq_bits;
+ __u8 compaq_drv_stmp;
+ __u8 last_fail;
+ __u8 phys_drv_flags;
+ __u8 phys_drv_flags1;
+ __u8 scsi_lun;
+ __u8 phys_drv_flags2;
+ __u8 reserved;
+ __u32 spi_speed_rules;
+ __u8 phys_connector[2];
+ __u8 phys_box_on_bus;
+ __u8 phys_bay_in_box;
+} id_phys_drv_t;
+
+#define BLINK_DRV_LEDS 0x16
+typedef struct {
+ __u32 blink_duration;
+ __u32 reserved;
+ __u8 blink[256];
+ __u8 reserved1[248];
+} blink_drv_leds_t;
+
+#define SENSE_BLINK_LEDS 0x17
+typedef struct {
+ __u32 blink_duration;
+ __u32 btime_elap;
+ __u8 blink[256];
+ __u8 reserved1[248];
+} sense_blink_leds_t;
+
+#define IDA_READ 0x20
+#define IDA_WRITE 0x30
+#define IDA_WRITE_MEDIA 0x31
+#define RESET_TO_DIAG 0x40
+#define DIAG_PASS_THRU 0x41
+
+#define SENSE_CONFIG 0x50
+#define SET_CONFIG 0x51
+typedef struct {
+ __u32 cfg_sig;
+ __u16 compat_port;
+ __u8 data_dist_mode;
+ __u8 surf_an_ctrl;
+ __u16 ctlr_phys_drv;
+ __u16 log_unit_phys_drv;
+ __u16 fault_tol_mode;
+ __u8 phys_drv_param[16];
+ drv_param_t drv;
+ __u32 drv_asgn_map;
+ __u16 dist_factor;
+ __u32 spare_asgn_map;
+ __u8 reserved[6];
+ __u16 os;
+ __u8 ctlr_order;
+ __u8 extra_info;
+ __u32 data_offs;
+ __u8 parity_backedout_write_drvs;
+ __u8 parity_dist_mode;
+ __u8 parity_shift_fact;
+ __u8 bios_disable_flag;
+ __u32 blks_on_vol;
+ __u32 blks_per_drv;
+ __u8 scratch[16];
+ __u16 big_drv_map[8];
+ __u16 big_spare_map[8];
+ __u8 ss_source_vol;
+ __u8 mix_drv_cap_range;
+ struct {
+ __u16 big_drv_map[8];
+ __u32 blks_per_drv;
+ __u16 fault_tol_mode;
+ __u16 dist_factor;
+ } MDC_range[4];
+ __u8 reserved1[248];
+} config_t;
+
+#define BYPASS_VOL_STATE 0x52
+#define SS_CREATE_VOL 0x53
+#define CHANGE_CONFIG 0x54
+#define SENSE_ORIG_CONF 0x55
+#define REORDER_LOG_DRV 0x56
+typedef struct {
+ __u8 old_units[32];
+} reorder_log_drv_t;
+
+#define LABEL_LOG_DRV 0x57
+typedef struct {
+ __u8 log_drv_label[64];
+} label_log_drv_t;
+
+#define SS_TO_VOL 0x58
+
+#define SET_SURF_DELAY 0x60
+typedef struct {
+ __u16 delay;
+ __u8 reserved[510];
+} surf_delay_t;
+
+#define SET_OVERHEAT_DELAY 0x61
+typedef struct {
+ __u16 delay;
+} overhead_delay_t;
+
+#define SET_MP_DELAY
+typedef struct {
+ __u16 delay;
+ __u8 reserved[510];
+} mp_delay_t;
+
+#define PASSTHRU_A 0x91
+typedef struct {
+ __u8 target;
+ __u8 bus;
+ __u8 lun;
+ __u32 timeout;
+ __u32 flags;
+ __u8 status;
+ __u8 error;
+ __u8 cdb_len;
+ __u8 sense_error;
+ __u8 sense_key;
+ __u32 sense_info;
+ __u8 sense_code;
+ __u8 sense_qual;
+ __u32 residual;
+ __u8 reserved[4];
+ __u8 cdb[12];
+} scsi_param_t;
+
+#define RESUME_BACKGROUND_ACTIVITY 0x99
+#define SENSE_CONTROLLER_PERFORMANCE 0xa8
+#define FLUSH_CACHE 0xc2
+#define COLLECT_BUFFER 0xd2
+#define READ_FLASH_ROM 0xf6
+#define WRITE_FLASH_ROM 0xf7
+#pragma pack()
+
+#endif /* ARRAYCMD_H */
diff --git a/drivers/block/ida_ioctl.h b/drivers/block/ida_ioctl.h
new file mode 100644
index 000000000..888fff9ca
--- /dev/null
+++ b/drivers/block/ida_ioctl.h
@@ -0,0 +1,87 @@
+/*
+ * Disk Array driver for Compaq SMART2 Controllers
+ * Copyright 1998 Compaq Computer Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Questions/Comments/Bugfixes to iss_storagedev@hp.com
+ *
+ */
+#ifndef IDA_IOCTL_H
+#define IDA_IOCTL_H
+
+#include "ida_cmd.h"
+#include "cpqarray.h"
+
+#define IDAGETDRVINFO 0x27272828
+#define IDAPASSTHRU 0x28282929
+#define IDAGETCTLRSIG 0x29293030
+#define IDAREVALIDATEVOLS 0x30303131
+#define IDADRIVERVERSION 0x31313232
+#define IDAGETPCIINFO 0x32323333
+
+typedef struct _ida_pci_info_struct
+{
+ unsigned char bus;
+ unsigned char dev_fn;
+ __u32 board_id;
+} ida_pci_info_struct;
+/*
+ * Normally, the ioctl determines the logical unit for this command by
+ * the major,minor number of the fd passed to ioctl. If you need to send
+ * a command to a different/nonexistant unit (such as during config), you
+ * can override the normal behavior by setting the unit valid bit. (Normally,
+ * it should be zero) The controller the command is sent to is still
+ * determined by the major number of the open device.
+ */
+
+#define UNITVALID 0x80
+typedef struct {
+ __u8 cmd;
+ __u8 rcode;
+ __u8 unit;
+ __u32 blk;
+ __u16 blk_cnt;
+
+/* currently, sg_cnt is assumed to be 1: only the 0th element of sg is used */
+ struct {
+ void __user *addr;
+ size_t size;
+ } sg[SG_MAX];
+ int sg_cnt;
+
+ union ctlr_cmds {
+ drv_info_t drv;
+ unsigned char buf[1024];
+
+ id_ctlr_t id_ctlr;
+ drv_param_t drv_param;
+ id_log_drv_t id_log_drv;
+ id_log_drv_ext_t id_log_drv_ext;
+ sense_log_drv_stat_t sense_log_drv_stat;
+ id_phys_drv_t id_phys_drv;
+ blink_drv_leds_t blink_drv_leds;
+ sense_blink_leds_t sense_blink_leds;
+ config_t config;
+ reorder_log_drv_t reorder_log_drv;
+ label_log_drv_t label_log_drv;
+ surf_delay_t surf_delay;
+ overhead_delay_t overhead_delay;
+ mp_delay_t mp_delay;
+ scsi_param_t scsi_param;
+ } c;
+} ida_ioctl_t;
+
+#endif /* IDA_IOCTL_H */
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
new file mode 100644
index 000000000..a5b343d20
--- /dev/null
+++ b/drivers/block/loop.c
@@ -0,0 +1,1885 @@
+/*
+ * linux/drivers/block/loop.c
+ *
+ * Written by Theodore Ts'o, 3/29/93
+ *
+ * Copyright 1993 by Theodore Ts'o. Redistribution of this file is
+ * permitted under the GNU General Public License.
+ *
+ * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993
+ * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996
+ *
+ * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994
+ * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996
+ *
+ * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997
+ *
+ * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998
+ *
+ * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998
+ *
+ * Loadable modules and other fixes by AK, 1998
+ *
+ * Make real block number available to downstream transfer functions, enables
+ * CBC (and relatives) mode encryption requiring unique IVs per data block.
+ * Reed H. Petty, rhp@draper.net
+ *
+ * Maximum number of loop devices now dynamic via max_loop module parameter.
+ * Russell Kroll <rkroll@exploits.org> 19990701
+ *
+ * Maximum number of loop devices when compiled-in now selectable by passing
+ * max_loop=<1-255> to the kernel on boot.
+ * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999
+ *
+ * Completely rewrite request handling to be make_request_fn style and
+ * non blocking, pushing work to a helper thread. Lots of fixes from
+ * Al Viro too.
+ * Jens Axboe <axboe@suse.de>, Nov 2000
+ *
+ * Support up to 256 loop devices
+ * Heinz Mauelshagen <mge@sistina.com>, Feb 2002
+ *
+ * Support for falling back on the write file operation when the address space
+ * operations write_begin is not available on the backing filesystem.
+ * Anton Altaparmakov, 16 Feb 2005
+ *
+ * Still To Fix:
+ * - Advisory locking is ignored here.
+ * - Should use an own CAP_* category instead of CAP_SYS_ADMIN
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/wait.h>
+#include <linux/blkdev.h>
+#include <linux/blkpg.h>
+#include <linux/init.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/compat.h>
+#include <linux/suspend.h>
+#include <linux/freezer.h>
+#include <linux/mutex.h>
+#include <linux/writeback.h>
+#include <linux/completion.h>
+#include <linux/highmem.h>
+#include <linux/kthread.h>
+#include <linux/splice.h>
+#include <linux/sysfs.h>
+#include <linux/miscdevice.h>
+#include <linux/falloc.h>
+#include <linux/uio.h>
+#include "loop.h"
+
+#include <asm/uaccess.h>
+
+static DEFINE_IDR(loop_index_idr);
+static DEFINE_MUTEX(loop_index_mutex);
+
+static int max_part;
+static int part_shift;
+
+static int transfer_xor(struct loop_device *lo, int cmd,
+ struct page *raw_page, unsigned raw_off,
+ struct page *loop_page, unsigned loop_off,
+ int size, sector_t real_block)
+{
+ char *raw_buf = kmap_atomic(raw_page) + raw_off;
+ char *loop_buf = kmap_atomic(loop_page) + loop_off;
+ char *in, *out, *key;
+ int i, keysize;
+
+ if (cmd == READ) {
+ in = raw_buf;
+ out = loop_buf;
+ } else {
+ in = loop_buf;
+ out = raw_buf;
+ }
+
+ key = lo->lo_encrypt_key;
+ keysize = lo->lo_encrypt_key_size;
+ for (i = 0; i < size; i++)
+ *out++ = *in++ ^ key[(i & 511) % keysize];
+
+ kunmap_atomic(loop_buf);
+ kunmap_atomic(raw_buf);
+ cond_resched();
+ return 0;
+}
+
+static int xor_init(struct loop_device *lo, const struct loop_info64 *info)
+{
+ if (unlikely(info->lo_encrypt_key_size <= 0))
+ return -EINVAL;
+ return 0;
+}
+
+static struct loop_func_table none_funcs = {
+ .number = LO_CRYPT_NONE,
+};
+
+static struct loop_func_table xor_funcs = {
+ .number = LO_CRYPT_XOR,
+ .transfer = transfer_xor,
+ .init = xor_init
+};
+
+/* xfer_funcs[0] is special - its release function is never called */
+static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = {
+ &none_funcs,
+ &xor_funcs
+};
+
+static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file)
+{
+ loff_t loopsize;
+
+ /* Compute loopsize in bytes */
+ loopsize = i_size_read(file->f_mapping->host);
+ if (offset > 0)
+ loopsize -= offset;
+ /* offset is beyond i_size, weird but possible */
+ if (loopsize < 0)
+ return 0;
+
+ if (sizelimit > 0 && sizelimit < loopsize)
+ loopsize = sizelimit;
+ /*
+ * Unfortunately, if we want to do I/O on the device,
+ * the number of 512-byte sectors has to fit into a sector_t.
+ */
+ return loopsize >> 9;
+}
+
+static loff_t get_loop_size(struct loop_device *lo, struct file *file)
+{
+ return get_size(lo->lo_offset, lo->lo_sizelimit, file);
+}
+
+static int
+figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit)
+{
+ loff_t size = get_size(offset, sizelimit, lo->lo_backing_file);
+ sector_t x = (sector_t)size;
+ struct block_device *bdev = lo->lo_device;
+
+ if (unlikely((loff_t)x != size))
+ return -EFBIG;
+ if (lo->lo_offset != offset)
+ lo->lo_offset = offset;
+ if (lo->lo_sizelimit != sizelimit)
+ lo->lo_sizelimit = sizelimit;
+ set_capacity(lo->lo_disk, x);
+ bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9);
+ /* let user-space know about the new size */
+ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
+ return 0;
+}
+
+static inline int
+lo_do_transfer(struct loop_device *lo, int cmd,
+ struct page *rpage, unsigned roffs,
+ struct page *lpage, unsigned loffs,
+ int size, sector_t rblock)
+{
+ int ret;
+
+ ret = lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock);
+ if (likely(!ret))
+ return 0;
+
+ printk_ratelimited(KERN_ERR
+ "loop: Transfer error at byte offset %llu, length %i.\n",
+ (unsigned long long)rblock << 9, size);
+ return ret;
+}
+
+static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos)
+{
+ struct iov_iter i;
+ ssize_t bw;
+
+ iov_iter_bvec(&i, ITER_BVEC, bvec, 1, bvec->bv_len);
+
+ file_start_write(file);
+ bw = vfs_iter_write(file, &i, ppos);
+ file_end_write(file);
+
+ if (likely(bw == bvec->bv_len))
+ return 0;
+
+ printk_ratelimited(KERN_ERR
+ "loop: Write error at byte offset %llu, length %i.\n",
+ (unsigned long long)*ppos, bvec->bv_len);
+ if (bw >= 0)
+ bw = -EIO;
+ return bw;
+}
+
+static int lo_write_simple(struct loop_device *lo, struct request *rq,
+ loff_t pos)
+{
+ struct bio_vec bvec;
+ struct req_iterator iter;
+ int ret = 0;
+
+ rq_for_each_segment(bvec, rq, iter) {
+ ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos);
+ if (ret < 0)
+ break;
+ cond_resched();
+ }
+
+ return ret;
+}
+
+/*
+ * This is the slow, transforming version that needs to double buffer the
+ * data as it cannot do the transformations in place without having direct
+ * access to the destination pages of the backing file.
+ */
+static int lo_write_transfer(struct loop_device *lo, struct request *rq,
+ loff_t pos)
+{
+ struct bio_vec bvec, b;
+ struct req_iterator iter;
+ struct page *page;
+ int ret = 0;
+
+ page = alloc_page(GFP_NOIO);
+ if (unlikely(!page))
+ return -ENOMEM;
+
+ rq_for_each_segment(bvec, rq, iter) {
+ ret = lo_do_transfer(lo, WRITE, page, 0, bvec.bv_page,
+ bvec.bv_offset, bvec.bv_len, pos >> 9);
+ if (unlikely(ret))
+ break;
+
+ b.bv_page = page;
+ b.bv_offset = 0;
+ b.bv_len = bvec.bv_len;
+ ret = lo_write_bvec(lo->lo_backing_file, &b, &pos);
+ if (ret < 0)
+ break;
+ }
+
+ __free_page(page);
+ return ret;
+}
+
+static int lo_read_simple(struct loop_device *lo, struct request *rq,
+ loff_t pos)
+{
+ struct bio_vec bvec;
+ struct req_iterator iter;
+ struct iov_iter i;
+ ssize_t len;
+
+ rq_for_each_segment(bvec, rq, iter) {
+ iov_iter_bvec(&i, ITER_BVEC, &bvec, 1, bvec.bv_len);
+ len = vfs_iter_read(lo->lo_backing_file, &i, &pos);
+ if (len < 0)
+ return len;
+
+ flush_dcache_page(bvec.bv_page);
+
+ if (len != bvec.bv_len) {
+ struct bio *bio;
+
+ __rq_for_each_bio(bio, rq)
+ zero_fill_bio(bio);
+ break;
+ }
+ cond_resched();
+ }
+
+ return 0;
+}
+
+static int lo_read_transfer(struct loop_device *lo, struct request *rq,
+ loff_t pos)
+{
+ struct bio_vec bvec, b;
+ struct req_iterator iter;
+ struct iov_iter i;
+ struct page *page;
+ ssize_t len;
+ int ret = 0;
+
+ page = alloc_page(GFP_NOIO);
+ if (unlikely(!page))
+ return -ENOMEM;
+
+ rq_for_each_segment(bvec, rq, iter) {
+ loff_t offset = pos;
+
+ b.bv_page = page;
+ b.bv_offset = 0;
+ b.bv_len = bvec.bv_len;
+
+ iov_iter_bvec(&i, ITER_BVEC, &b, 1, b.bv_len);
+ len = vfs_iter_read(lo->lo_backing_file, &i, &pos);
+ if (len < 0) {
+ ret = len;
+ goto out_free_page;
+ }
+
+ ret = lo_do_transfer(lo, READ, page, 0, bvec.bv_page,
+ bvec.bv_offset, len, offset >> 9);
+ if (ret)
+ goto out_free_page;
+
+ flush_dcache_page(bvec.bv_page);
+
+ if (len != bvec.bv_len) {
+ struct bio *bio;
+
+ __rq_for_each_bio(bio, rq)
+ zero_fill_bio(bio);
+ break;
+ }
+ }
+
+ ret = 0;
+out_free_page:
+ __free_page(page);
+ return ret;
+}
+
+static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos)
+{
+ /*
+ * We use punch hole to reclaim the free space used by the
+ * image a.k.a. discard. However we do not support discard if
+ * encryption is enabled, because it may give an attacker
+ * useful information.
+ */
+ struct file *file = lo->lo_backing_file;
+ int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
+ int ret;
+
+ if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq));
+ if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP))
+ ret = -EIO;
+ out:
+ return ret;
+}
+
+static int lo_req_flush(struct loop_device *lo, struct request *rq)
+{
+ struct file *file = lo->lo_backing_file;
+ int ret = vfs_fsync(file, 0);
+ if (unlikely(ret && ret != -EINVAL))
+ ret = -EIO;
+
+ return ret;
+}
+
+static int do_req_filebacked(struct loop_device *lo, struct request *rq)
+{
+ loff_t pos;
+ int ret;
+
+ pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
+
+ if (rq->cmd_flags & REQ_WRITE) {
+ if (rq->cmd_flags & REQ_FLUSH)
+ ret = lo_req_flush(lo, rq);
+ else if (rq->cmd_flags & REQ_DISCARD)
+ ret = lo_discard(lo, rq, pos);
+ else if (lo->transfer)
+ ret = lo_write_transfer(lo, rq, pos);
+ else
+ ret = lo_write_simple(lo, rq, pos);
+
+ } else {
+ if (lo->transfer)
+ ret = lo_read_transfer(lo, rq, pos);
+ else
+ ret = lo_read_simple(lo, rq, pos);
+ }
+
+ return ret;
+}
+
+struct switch_request {
+ struct file *file;
+ struct completion wait;
+};
+
+/*
+ * Do the actual switch; called from the BIO completion routine
+ */
+static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
+{
+ struct file *file = p->file;
+ struct file *old_file = lo->lo_backing_file;
+ struct address_space *mapping;
+
+ /* if no new file, only flush of queued bios requested */
+ if (!file)
+ return;
+
+ mapping = file->f_mapping;
+ mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
+ lo->lo_backing_file = file;
+ lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
+ mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
+ lo->old_gfp_mask = mapping_gfp_mask(mapping);
+ mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+}
+
+/*
+ * loop_switch performs the hard work of switching a backing store.
+ * First it needs to flush existing IO, it does this by sending a magic
+ * BIO down the pipe. The completion of this BIO does the actual switch.
+ */
+static int loop_switch(struct loop_device *lo, struct file *file)
+{
+ struct switch_request w;
+
+ w.file = file;
+
+ /* freeze queue and wait for completion of scheduled requests */
+ blk_mq_freeze_queue(lo->lo_queue);
+
+ /* do the switch action */
+ do_loop_switch(lo, &w);
+
+ /* unfreeze */
+ blk_mq_unfreeze_queue(lo->lo_queue);
+
+ return 0;
+}
+
+/*
+ * Helper to flush the IOs in loop, but keeping loop thread running
+ */
+static int loop_flush(struct loop_device *lo)
+{
+ return loop_switch(lo, NULL);
+}
+
+/*
+ * loop_change_fd switched the backing store of a loopback device to
+ * a new file. This is useful for operating system installers to free up
+ * the original file and in High Availability environments to switch to
+ * an alternative location for the content in case of server meltdown.
+ * This can only work if the loop device is used read-only, and if the
+ * new backing store is the same size and type as the old backing store.
+ */
+static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
+ unsigned int arg)
+{
+ struct file *file, *old_file;
+ struct inode *inode;
+ int error;
+
+ error = -ENXIO;
+ if (lo->lo_state != Lo_bound)
+ goto out;
+
+ /* the loop device has to be read-only */
+ error = -EINVAL;
+ if (!(lo->lo_flags & LO_FLAGS_READ_ONLY))
+ goto out;
+
+ error = -EBADF;
+ file = fget(arg);
+ if (!file)
+ goto out;
+
+ inode = file->f_mapping->host;
+ old_file = lo->lo_backing_file;
+
+ error = -EINVAL;
+
+ if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+ goto out_putf;
+
+ /* size of the new backing store needs to be the same */
+ if (get_loop_size(lo, file) != get_loop_size(lo, old_file))
+ goto out_putf;
+
+ /* and ... switch */
+ error = loop_switch(lo, file);
+ if (error)
+ goto out_putf;
+
+ fput(old_file);
+ if (lo->lo_flags & LO_FLAGS_PARTSCAN)
+ ioctl_by_bdev(bdev, BLKRRPART, 0);
+ return 0;
+
+ out_putf:
+ fput(file);
+ out:
+ return error;
+}
+
+static inline int is_loop_device(struct file *file)
+{
+ struct inode *i = file->f_mapping->host;
+
+ return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
+}
+
+/*
+ * for AUFS
+ * no get/put for file.
+ */
+struct file *loop_backing_file(struct super_block *sb)
+{
+ struct file *ret;
+ struct loop_device *l;
+
+ ret = NULL;
+ if (MAJOR(sb->s_dev) == LOOP_MAJOR) {
+ l = sb->s_bdev->bd_disk->private_data;
+ ret = l->lo_backing_file;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(loop_backing_file);
+
+/* loop sysfs attributes */
+
+static ssize_t loop_attr_show(struct device *dev, char *page,
+ ssize_t (*callback)(struct loop_device *, char *))
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct loop_device *lo = disk->private_data;
+
+ return callback(lo, page);
+}
+
+#define LOOP_ATTR_RO(_name) \
+static ssize_t loop_attr_##_name##_show(struct loop_device *, char *); \
+static ssize_t loop_attr_do_show_##_name(struct device *d, \
+ struct device_attribute *attr, char *b) \
+{ \
+ return loop_attr_show(d, b, loop_attr_##_name##_show); \
+} \
+static struct device_attribute loop_attr_##_name = \
+ __ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL);
+
+static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
+{
+ ssize_t ret;
+ char *p = NULL;
+
+ spin_lock_irq(&lo->lo_lock);
+ if (lo->lo_backing_file)
+ p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
+ spin_unlock_irq(&lo->lo_lock);
+
+ if (IS_ERR_OR_NULL(p))
+ ret = PTR_ERR(p);
+ else {
+ ret = strlen(p);
+ memmove(buf, p, ret);
+ buf[ret++] = '\n';
+ buf[ret] = 0;
+ }
+
+ return ret;
+}
+
+static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf)
+{
+ return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset);
+}
+
+static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf)
+{
+ return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit);
+}
+
+static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf)
+{
+ int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR);
+
+ return sprintf(buf, "%s\n", autoclear ? "1" : "0");
+}
+
+static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf)
+{
+ int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN);
+
+ return sprintf(buf, "%s\n", partscan ? "1" : "0");
+}
+
+LOOP_ATTR_RO(backing_file);
+LOOP_ATTR_RO(offset);
+LOOP_ATTR_RO(sizelimit);
+LOOP_ATTR_RO(autoclear);
+LOOP_ATTR_RO(partscan);
+
+static struct attribute *loop_attrs[] = {
+ &loop_attr_backing_file.attr,
+ &loop_attr_offset.attr,
+ &loop_attr_sizelimit.attr,
+ &loop_attr_autoclear.attr,
+ &loop_attr_partscan.attr,
+ NULL,
+};
+
+static struct attribute_group loop_attribute_group = {
+ .name = "loop",
+ .attrs= loop_attrs,
+};
+
+static int loop_sysfs_init(struct loop_device *lo)
+{
+ return sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
+ &loop_attribute_group);
+}
+
+static void loop_sysfs_exit(struct loop_device *lo)
+{
+ sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
+ &loop_attribute_group);
+}
+
+static void loop_config_discard(struct loop_device *lo)
+{
+ struct file *file = lo->lo_backing_file;
+ struct inode *inode = file->f_mapping->host;
+ struct request_queue *q = lo->lo_queue;
+
+ /*
+ * We use punch hole to reclaim the free space used by the
+ * image a.k.a. discard. However we do not support discard if
+ * encryption is enabled, because it may give an attacker
+ * useful information.
+ */
+ if ((!file->f_op->fallocate) ||
+ lo->lo_encrypt_key_size) {
+ q->limits.discard_granularity = 0;
+ q->limits.discard_alignment = 0;
+ q->limits.max_discard_sectors = 0;
+ q->limits.discard_zeroes_data = 0;
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
+ return;
+ }
+
+ q->limits.discard_granularity = inode->i_sb->s_blocksize;
+ q->limits.discard_alignment = 0;
+ q->limits.max_discard_sectors = UINT_MAX >> 9;
+ q->limits.discard_zeroes_data = 1;
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+}
+
+static int loop_set_fd(struct loop_device *lo, fmode_t mode,
+ struct block_device *bdev, unsigned int arg)
+{
+ struct file *file, *f;
+ struct inode *inode;
+ struct address_space *mapping;
+ unsigned lo_blocksize;
+ int lo_flags = 0;
+ int error;
+ loff_t size;
+
+ /* This is safe, since we have a reference from open(). */
+ __module_get(THIS_MODULE);
+
+ error = -EBADF;
+ file = fget(arg);
+ if (!file)
+ goto out;
+
+ error = -EBUSY;
+ if (lo->lo_state != Lo_unbound)
+ goto out_putf;
+
+ /* Avoid recursion */
+ f = file;
+ while (is_loop_device(f)) {
+ struct loop_device *l;
+
+ if (f->f_mapping->host->i_bdev == bdev)
+ goto out_putf;
+
+ l = f->f_mapping->host->i_bdev->bd_disk->private_data;
+ if (l->lo_state == Lo_unbound) {
+ error = -EINVAL;
+ goto out_putf;
+ }
+ f = l->lo_backing_file;
+ }
+
+ mapping = file->f_mapping;
+ inode = mapping->host;
+
+ error = -EINVAL;
+ if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+ goto out_putf;
+
+ if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) ||
+ !file->f_op->write_iter)
+ lo_flags |= LO_FLAGS_READ_ONLY;
+
+ lo_blocksize = S_ISBLK(inode->i_mode) ?
+ inode->i_bdev->bd_block_size : PAGE_SIZE;
+
+ error = -EFBIG;
+ size = get_loop_size(lo, file);
+ if ((loff_t)(sector_t)size != size)
+ goto out_putf;
+ error = -ENOMEM;
+ lo->wq = alloc_workqueue("kloopd%d",
+ WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_UNBOUND, 16,
+ lo->lo_number);
+ if (!lo->wq)
+ goto out_putf;
+
+ error = 0;
+
+ set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
+
+ lo->lo_blocksize = lo_blocksize;
+ lo->lo_device = bdev;
+ lo->lo_flags = lo_flags;
+ lo->lo_backing_file = file;
+ lo->transfer = NULL;
+ lo->ioctl = NULL;
+ lo->lo_sizelimit = 0;
+ lo->old_gfp_mask = mapping_gfp_mask(mapping);
+ mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+
+ if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
+ blk_queue_flush(lo->lo_queue, REQ_FLUSH);
+
+ set_capacity(lo->lo_disk, size);
+ bd_set_size(bdev, size << 9);
+ loop_sysfs_init(lo);
+ /* let user-space know about the new size */
+ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
+
+ set_blocksize(bdev, lo_blocksize);
+
+ lo->lo_state = Lo_bound;
+ if (part_shift)
+ lo->lo_flags |= LO_FLAGS_PARTSCAN;
+ if (lo->lo_flags & LO_FLAGS_PARTSCAN)
+ ioctl_by_bdev(bdev, BLKRRPART, 0);
+
+ /* Grab the block_device to prevent its destruction after we
+ * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev).
+ */
+ bdgrab(bdev);
+ return 0;
+
+ out_putf:
+ fput(file);
+ out:
+ /* This is safe: open() is still holding a reference. */
+ module_put(THIS_MODULE);
+ return error;
+}
+
+static int
+loop_release_xfer(struct loop_device *lo)
+{
+ int err = 0;
+ struct loop_func_table *xfer = lo->lo_encryption;
+
+ if (xfer) {
+ if (xfer->release)
+ err = xfer->release(lo);
+ lo->transfer = NULL;
+ lo->lo_encryption = NULL;
+ module_put(xfer->owner);
+ }
+ return err;
+}
+
+static int
+loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
+ const struct loop_info64 *i)
+{
+ int err = 0;
+
+ if (xfer) {
+ struct module *owner = xfer->owner;
+
+ if (!try_module_get(owner))
+ return -EINVAL;
+ if (xfer->init)
+ err = xfer->init(lo, i);
+ if (err)
+ module_put(owner);
+ else
+ lo->lo_encryption = xfer;
+ }
+ return err;
+}
+
+static int loop_clr_fd(struct loop_device *lo)
+{
+ struct file *filp = lo->lo_backing_file;
+ gfp_t gfp = lo->old_gfp_mask;
+ struct block_device *bdev = lo->lo_device;
+
+ if (lo->lo_state != Lo_bound)
+ return -ENXIO;
+
+ /*
+ * If we've explicitly asked to tear down the loop device,
+ * and it has an elevated reference count, set it for auto-teardown when
+ * the last reference goes away. This stops $!~#$@ udev from
+ * preventing teardown because it decided that it needs to run blkid on
+ * the loopback device whenever they appear. xfstests is notorious for
+ * failing tests because blkid via udev races with a losetup
+ * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
+ * command to fail with EBUSY.
+ */
+ if (lo->lo_refcnt > 1) {
+ lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
+ mutex_unlock(&lo->lo_ctl_mutex);
+ return 0;
+ }
+
+ if (filp == NULL)
+ return -EINVAL;
+
+ spin_lock_irq(&lo->lo_lock);
+ lo->lo_state = Lo_rundown;
+ lo->lo_backing_file = NULL;
+ spin_unlock_irq(&lo->lo_lock);
+
+ loop_release_xfer(lo);
+ lo->transfer = NULL;
+ lo->ioctl = NULL;
+ lo->lo_device = NULL;
+ lo->lo_encryption = NULL;
+ lo->lo_offset = 0;
+ lo->lo_sizelimit = 0;
+ lo->lo_encrypt_key_size = 0;
+ memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
+ memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
+ memset(lo->lo_file_name, 0, LO_NAME_SIZE);
+ if (bdev) {
+ bdput(bdev);
+ invalidate_bdev(bdev);
+ }
+ set_capacity(lo->lo_disk, 0);
+ loop_sysfs_exit(lo);
+ if (bdev) {
+ bd_set_size(bdev, 0);
+ /* let user-space know about this change */
+ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
+ }
+ mapping_set_gfp_mask(filp->f_mapping, gfp);
+ lo->lo_state = Lo_unbound;
+ /* This is safe: open() is still holding a reference. */
+ module_put(THIS_MODULE);
+ if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev)
+ ioctl_by_bdev(bdev, BLKRRPART, 0);
+ lo->lo_flags = 0;
+ if (!part_shift)
+ lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN;
+ destroy_workqueue(lo->wq);
+ lo->wq = NULL;
+ mutex_unlock(&lo->lo_ctl_mutex);
+ /*
+ * Need not hold lo_ctl_mutex to fput backing file.
+ * Calling fput holding lo_ctl_mutex triggers a circular
+ * lock dependency possibility warning as fput can take
+ * bd_mutex which is usually taken before lo_ctl_mutex.
+ */
+ fput(filp);
+ return 0;
+}
+
+static int
+loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
+{
+ int err;
+ struct loop_func_table *xfer;
+ kuid_t uid = current_uid();
+
+ if (lo->lo_encrypt_key_size &&
+ !uid_eq(lo->lo_key_owner, uid) &&
+ !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (lo->lo_state != Lo_bound)
+ return -ENXIO;
+ if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE)
+ return -EINVAL;
+
+ err = loop_release_xfer(lo);
+ if (err)
+ return err;
+
+ if (info->lo_encrypt_type) {
+ unsigned int type = info->lo_encrypt_type;
+
+ if (type >= MAX_LO_CRYPT)
+ return -EINVAL;
+ xfer = xfer_funcs[type];
+ if (xfer == NULL)
+ return -EINVAL;
+ } else
+ xfer = NULL;
+
+ err = loop_init_xfer(lo, xfer, info);
+ if (err)
+ return err;
+
+ if (lo->lo_offset != info->lo_offset ||
+ lo->lo_sizelimit != info->lo_sizelimit)
+ if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit))
+ return -EFBIG;
+
+ loop_config_discard(lo);
+
+ memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
+ memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE);
+ lo->lo_file_name[LO_NAME_SIZE-1] = 0;
+ lo->lo_crypt_name[LO_NAME_SIZE-1] = 0;
+
+ if (!xfer)
+ xfer = &none_funcs;
+ lo->transfer = xfer->transfer;
+ lo->ioctl = xfer->ioctl;
+
+ if ((lo->lo_flags & LO_FLAGS_AUTOCLEAR) !=
+ (info->lo_flags & LO_FLAGS_AUTOCLEAR))
+ lo->lo_flags ^= LO_FLAGS_AUTOCLEAR;
+
+ if ((info->lo_flags & LO_FLAGS_PARTSCAN) &&
+ !(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
+ lo->lo_flags |= LO_FLAGS_PARTSCAN;
+ lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;
+ ioctl_by_bdev(lo->lo_device, BLKRRPART, 0);
+ }
+
+ lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
+ lo->lo_init[0] = info->lo_init[0];
+ lo->lo_init[1] = info->lo_init[1];
+ if (info->lo_encrypt_key_size) {
+ memcpy(lo->lo_encrypt_key, info->lo_encrypt_key,
+ info->lo_encrypt_key_size);
+ lo->lo_key_owner = uid;
+ }
+
+ return 0;
+}
+
+static int
+loop_get_status(struct loop_device *lo, struct loop_info64 *info)
+{
+ struct file *file = lo->lo_backing_file;
+ struct kstat stat;
+ int error;
+
+ if (lo->lo_state != Lo_bound)
+ return -ENXIO;
+ error = vfs_getattr(&file->f_path, &stat);
+ if (error)
+ return error;
+ memset(info, 0, sizeof(*info));
+ info->lo_number = lo->lo_number;
+ info->lo_device = huge_encode_dev(stat.dev);
+ info->lo_inode = stat.ino;
+ info->lo_rdevice = huge_encode_dev(lo->lo_device ? stat.rdev : stat.dev);
+ info->lo_offset = lo->lo_offset;
+ info->lo_sizelimit = lo->lo_sizelimit;
+ info->lo_flags = lo->lo_flags;
+ memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE);
+ memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE);
+ info->lo_encrypt_type =
+ lo->lo_encryption ? lo->lo_encryption->number : 0;
+ if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) {
+ info->lo_encrypt_key_size = lo->lo_encrypt_key_size;
+ memcpy(info->lo_encrypt_key, lo->lo_encrypt_key,
+ lo->lo_encrypt_key_size);
+ }
+ return 0;
+}
+
+static void
+loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64)
+{
+ memset(info64, 0, sizeof(*info64));
+ info64->lo_number = info->lo_number;
+ info64->lo_device = info->lo_device;
+ info64->lo_inode = info->lo_inode;
+ info64->lo_rdevice = info->lo_rdevice;
+ info64->lo_offset = info->lo_offset;
+ info64->lo_sizelimit = 0;
+ info64->lo_encrypt_type = info->lo_encrypt_type;
+ info64->lo_encrypt_key_size = info->lo_encrypt_key_size;
+ info64->lo_flags = info->lo_flags;
+ info64->lo_init[0] = info->lo_init[0];
+ info64->lo_init[1] = info->lo_init[1];
+ if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
+ memcpy(info64->lo_crypt_name, info->lo_name, LO_NAME_SIZE);
+ else
+ memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE);
+ memcpy(info64->lo_encrypt_key, info->lo_encrypt_key, LO_KEY_SIZE);
+}
+
+static int
+loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info)
+{
+ memset(info, 0, sizeof(*info));
+ info->lo_number = info64->lo_number;
+ info->lo_device = info64->lo_device;
+ info->lo_inode = info64->lo_inode;
+ info->lo_rdevice = info64->lo_rdevice;
+ info->lo_offset = info64->lo_offset;
+ info->lo_encrypt_type = info64->lo_encrypt_type;
+ info->lo_encrypt_key_size = info64->lo_encrypt_key_size;
+ info->lo_flags = info64->lo_flags;
+ info->lo_init[0] = info64->lo_init[0];
+ info->lo_init[1] = info64->lo_init[1];
+ if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
+ memcpy(info->lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
+ else
+ memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE);
+ memcpy(info->lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);
+
+ /* error in case values were truncated */
+ if (info->lo_device != info64->lo_device ||
+ info->lo_rdevice != info64->lo_rdevice ||
+ info->lo_inode != info64->lo_inode ||
+ info->lo_offset != info64->lo_offset)
+ return -EOVERFLOW;
+
+ return 0;
+}
+
+static int
+loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg)
+{
+ struct loop_info info;
+ struct loop_info64 info64;
+
+ if (copy_from_user(&info, arg, sizeof (struct loop_info)))
+ return -EFAULT;
+ loop_info64_from_old(&info, &info64);
+ return loop_set_status(lo, &info64);
+}
+
+static int
+loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg)
+{
+ struct loop_info64 info64;
+
+ if (copy_from_user(&info64, arg, sizeof (struct loop_info64)))
+ return -EFAULT;
+ return loop_set_status(lo, &info64);
+}
+
+static int
+loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) {
+ struct loop_info info;
+ struct loop_info64 info64;
+ int err = 0;
+
+ if (!arg)
+ err = -EINVAL;
+ if (!err)
+ err = loop_get_status(lo, &info64);
+ if (!err)
+ err = loop_info64_to_old(&info64, &info);
+ if (!err && copy_to_user(arg, &info, sizeof(info)))
+ err = -EFAULT;
+
+ return err;
+}
+
+static int
+loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) {
+ struct loop_info64 info64;
+ int err = 0;
+
+ if (!arg)
+ err = -EINVAL;
+ if (!err)
+ err = loop_get_status(lo, &info64);
+ if (!err && copy_to_user(arg, &info64, sizeof(info64)))
+ err = -EFAULT;
+
+ return err;
+}
+
+static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev)
+{
+ if (unlikely(lo->lo_state != Lo_bound))
+ return -ENXIO;
+
+ return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit);
+}
+
+static int lo_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct loop_device *lo = bdev->bd_disk->private_data;
+ int err;
+
+ mutex_lock_nested(&lo->lo_ctl_mutex, 1);
+ switch (cmd) {
+ case LOOP_SET_FD:
+ err = loop_set_fd(lo, mode, bdev, arg);
+ break;
+ case LOOP_CHANGE_FD:
+ err = loop_change_fd(lo, bdev, arg);
+ break;
+ case LOOP_CLR_FD:
+ /* loop_clr_fd would have unlocked lo_ctl_mutex on success */
+ err = loop_clr_fd(lo);
+ if (!err)
+ goto out_unlocked;
+ break;
+ case LOOP_SET_STATUS:
+ err = -EPERM;
+ if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+ err = loop_set_status_old(lo,
+ (struct loop_info __user *)arg);
+ break;
+ case LOOP_GET_STATUS:
+ err = loop_get_status_old(lo, (struct loop_info __user *) arg);
+ break;
+ case LOOP_SET_STATUS64:
+ err = -EPERM;
+ if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+ err = loop_set_status64(lo,
+ (struct loop_info64 __user *) arg);
+ break;
+ case LOOP_GET_STATUS64:
+ err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
+ break;
+ case LOOP_SET_CAPACITY:
+ err = -EPERM;
+ if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
+ err = loop_set_capacity(lo, bdev);
+ break;
+ default:
+ err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
+ }
+ mutex_unlock(&lo->lo_ctl_mutex);
+
+out_unlocked:
+ return err;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_loop_info {
+ compat_int_t lo_number; /* ioctl r/o */
+ compat_dev_t lo_device; /* ioctl r/o */
+ compat_ulong_t lo_inode; /* ioctl r/o */
+ compat_dev_t lo_rdevice; /* ioctl r/o */
+ compat_int_t lo_offset;
+ compat_int_t lo_encrypt_type;
+ compat_int_t lo_encrypt_key_size; /* ioctl w/o */
+ compat_int_t lo_flags; /* ioctl r/o */
+ char lo_name[LO_NAME_SIZE];
+ unsigned char lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */
+ compat_ulong_t lo_init[2];
+ char reserved[4];
+};
+
+/*
+ * Transfer 32-bit compatibility structure in userspace to 64-bit loop info
+ * - noinlined to reduce stack space usage in main part of driver
+ */
+static noinline int
+loop_info64_from_compat(const struct compat_loop_info __user *arg,
+ struct loop_info64 *info64)
+{
+ struct compat_loop_info info;
+
+ if (copy_from_user(&info, arg, sizeof(info)))
+ return -EFAULT;
+
+ memset(info64, 0, sizeof(*info64));
+ info64->lo_number = info.lo_number;
+ info64->lo_device = info.lo_device;
+ info64->lo_inode = info.lo_inode;
+ info64->lo_rdevice = info.lo_rdevice;
+ info64->lo_offset = info.lo_offset;
+ info64->lo_sizelimit = 0;
+ info64->lo_encrypt_type = info.lo_encrypt_type;
+ info64->lo_encrypt_key_size = info.lo_encrypt_key_size;
+ info64->lo_flags = info.lo_flags;
+ info64->lo_init[0] = info.lo_init[0];
+ info64->lo_init[1] = info.lo_init[1];
+ if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
+ memcpy(info64->lo_crypt_name, info.lo_name, LO_NAME_SIZE);
+ else
+ memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE);
+ memcpy(info64->lo_encrypt_key, info.lo_encrypt_key, LO_KEY_SIZE);
+ return 0;
+}
+
+/*
+ * Transfer 64-bit loop info to 32-bit compatibility structure in userspace
+ * - noinlined to reduce stack space usage in main part of driver
+ */
+static noinline int
+loop_info64_to_compat(const struct loop_info64 *info64,
+ struct compat_loop_info __user *arg)
+{
+ struct compat_loop_info info;
+
+ memset(&info, 0, sizeof(info));
+ info.lo_number = info64->lo_number;
+ info.lo_device = info64->lo_device;
+ info.lo_inode = info64->lo_inode;
+ info.lo_rdevice = info64->lo_rdevice;
+ info.lo_offset = info64->lo_offset;
+ info.lo_encrypt_type = info64->lo_encrypt_type;
+ info.lo_encrypt_key_size = info64->lo_encrypt_key_size;
+ info.lo_flags = info64->lo_flags;
+ info.lo_init[0] = info64->lo_init[0];
+ info.lo_init[1] = info64->lo_init[1];
+ if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
+ memcpy(info.lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
+ else
+ memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE);
+ memcpy(info.lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);
+
+ /* error in case values were truncated */
+ if (info.lo_device != info64->lo_device ||
+ info.lo_rdevice != info64->lo_rdevice ||
+ info.lo_inode != info64->lo_inode ||
+ info.lo_offset != info64->lo_offset ||
+ info.lo_init[0] != info64->lo_init[0] ||
+ info.lo_init[1] != info64->lo_init[1])
+ return -EOVERFLOW;
+
+ if (copy_to_user(arg, &info, sizeof(info)))
+ return -EFAULT;
+ return 0;
+}
+
+static int
+loop_set_status_compat(struct loop_device *lo,
+ const struct compat_loop_info __user *arg)
+{
+ struct loop_info64 info64;
+ int ret;
+
+ ret = loop_info64_from_compat(arg, &info64);
+ if (ret < 0)
+ return ret;
+ return loop_set_status(lo, &info64);
+}
+
+static int
+loop_get_status_compat(struct loop_device *lo,
+ struct compat_loop_info __user *arg)
+{
+ struct loop_info64 info64;
+ int err = 0;
+
+ if (!arg)
+ err = -EINVAL;
+ if (!err)
+ err = loop_get_status(lo, &info64);
+ if (!err)
+ err = loop_info64_to_compat(&info64, arg);
+ return err;
+}
+
+static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct loop_device *lo = bdev->bd_disk->private_data;
+ int err;
+
+ switch(cmd) {
+ case LOOP_SET_STATUS:
+ mutex_lock(&lo->lo_ctl_mutex);
+ err = loop_set_status_compat(
+ lo, (const struct compat_loop_info __user *) arg);
+ mutex_unlock(&lo->lo_ctl_mutex);
+ break;
+ case LOOP_GET_STATUS:
+ mutex_lock(&lo->lo_ctl_mutex);
+ err = loop_get_status_compat(
+ lo, (struct compat_loop_info __user *) arg);
+ mutex_unlock(&lo->lo_ctl_mutex);
+ break;
+ case LOOP_SET_CAPACITY:
+ case LOOP_CLR_FD:
+ case LOOP_GET_STATUS64:
+ case LOOP_SET_STATUS64:
+ arg = (unsigned long) compat_ptr(arg);
+ case LOOP_SET_FD:
+ case LOOP_CHANGE_FD:
+ err = lo_ioctl(bdev, mode, cmd, arg);
+ break;
+ default:
+ err = -ENOIOCTLCMD;
+ break;
+ }
+ return err;
+}
+#endif
+
+static int lo_open(struct block_device *bdev, fmode_t mode)
+{
+ struct loop_device *lo;
+ int err = 0;
+
+ mutex_lock(&loop_index_mutex);
+ lo = bdev->bd_disk->private_data;
+ if (!lo) {
+ err = -ENXIO;
+ goto out;
+ }
+
+ mutex_lock(&lo->lo_ctl_mutex);
+ lo->lo_refcnt++;
+ mutex_unlock(&lo->lo_ctl_mutex);
+out:
+ mutex_unlock(&loop_index_mutex);
+ return err;
+}
+
+static void lo_release(struct gendisk *disk, fmode_t mode)
+{
+ struct loop_device *lo = disk->private_data;
+ int err;
+
+ mutex_lock(&lo->lo_ctl_mutex);
+
+ if (--lo->lo_refcnt)
+ goto out;
+
+ if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
+ /*
+ * In autoclear mode, stop the loop thread
+ * and remove configuration after last close.
+ */
+ err = loop_clr_fd(lo);
+ if (!err)
+ return;
+ } else {
+ /*
+ * Otherwise keep thread (if running) and config,
+ * but flush possible ongoing bios in thread.
+ */
+ loop_flush(lo);
+ }
+
+out:
+ mutex_unlock(&lo->lo_ctl_mutex);
+}
+
+static const struct block_device_operations lo_fops = {
+ .owner = THIS_MODULE,
+ .open = lo_open,
+ .release = lo_release,
+ .ioctl = lo_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = lo_compat_ioctl,
+#endif
+};
+
+/*
+ * And now the modules code and kernel interface.
+ */
+static int max_loop;
+module_param(max_loop, int, S_IRUGO);
+MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
+module_param(max_part, int, S_IRUGO);
+MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);
+
+int loop_register_transfer(struct loop_func_table *funcs)
+{
+ unsigned int n = funcs->number;
+
+ if (n >= MAX_LO_CRYPT || xfer_funcs[n])
+ return -EINVAL;
+ xfer_funcs[n] = funcs;
+ return 0;
+}
+
+static int unregister_transfer_cb(int id, void *ptr, void *data)
+{
+ struct loop_device *lo = ptr;
+ struct loop_func_table *xfer = data;
+
+ mutex_lock(&lo->lo_ctl_mutex);
+ if (lo->lo_encryption == xfer)
+ loop_release_xfer(lo);
+ mutex_unlock(&lo->lo_ctl_mutex);
+ return 0;
+}
+
+int loop_unregister_transfer(int number)
+{
+ unsigned int n = number;
+ struct loop_func_table *xfer;
+
+ if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL)
+ return -EINVAL;
+
+ xfer_funcs[n] = NULL;
+ idr_for_each(&loop_index_idr, &unregister_transfer_cb, xfer);
+ return 0;
+}
+
+EXPORT_SYMBOL(loop_register_transfer);
+EXPORT_SYMBOL(loop_unregister_transfer);
+
+static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+ struct loop_device *lo = cmd->rq->q->queuedata;
+
+ blk_mq_start_request(bd->rq);
+
+ if (lo->lo_state != Lo_bound)
+ return -EIO;
+
+ if (cmd->rq->cmd_flags & REQ_WRITE) {
+ struct loop_device *lo = cmd->rq->q->queuedata;
+ bool need_sched = true;
+
+ spin_lock_irq(&lo->lo_lock);
+ if (lo->write_started)
+ need_sched = false;
+ else
+ lo->write_started = true;
+ list_add_tail(&cmd->list, &lo->write_cmd_head);
+ spin_unlock_irq(&lo->lo_lock);
+
+ if (need_sched)
+ queue_work(lo->wq, &lo->write_work);
+ } else {
+ queue_work(lo->wq, &cmd->read_work);
+ }
+
+ return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static void loop_handle_cmd(struct loop_cmd *cmd)
+{
+ const bool write = cmd->rq->cmd_flags & REQ_WRITE;
+ struct loop_device *lo = cmd->rq->q->queuedata;
+ int ret = -EIO;
+
+ if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY))
+ goto failed;
+
+ ret = do_req_filebacked(lo, cmd->rq);
+
+ failed:
+ if (ret)
+ cmd->rq->errors = -EIO;
+ blk_mq_complete_request(cmd->rq);
+}
+
+static void loop_queue_write_work(struct work_struct *work)
+{
+ struct loop_device *lo =
+ container_of(work, struct loop_device, write_work);
+ LIST_HEAD(cmd_list);
+
+ spin_lock_irq(&lo->lo_lock);
+ repeat:
+ list_splice_init(&lo->write_cmd_head, &cmd_list);
+ spin_unlock_irq(&lo->lo_lock);
+
+ while (!list_empty(&cmd_list)) {
+ struct loop_cmd *cmd = list_first_entry(&cmd_list,
+ struct loop_cmd, list);
+ list_del_init(&cmd->list);
+ loop_handle_cmd(cmd);
+ }
+
+ spin_lock_irq(&lo->lo_lock);
+ if (!list_empty(&lo->write_cmd_head))
+ goto repeat;
+ lo->write_started = false;
+ spin_unlock_irq(&lo->lo_lock);
+}
+
+static void loop_queue_read_work(struct work_struct *work)
+{
+ struct loop_cmd *cmd =
+ container_of(work, struct loop_cmd, read_work);
+
+ loop_handle_cmd(cmd);
+}
+
+static int loop_init_request(void *data, struct request *rq,
+ unsigned int hctx_idx, unsigned int request_idx,
+ unsigned int numa_node)
+{
+ struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+ cmd->rq = rq;
+ INIT_WORK(&cmd->read_work, loop_queue_read_work);
+
+ return 0;
+}
+
+static struct blk_mq_ops loop_mq_ops = {
+ .queue_rq = loop_queue_rq,
+ .map_queue = blk_mq_map_queue,
+ .init_request = loop_init_request,
+};
+
+static int loop_add(struct loop_device **l, int i)
+{
+ struct loop_device *lo;
+ struct gendisk *disk;
+ int err;
+
+ err = -ENOMEM;
+ lo = kzalloc(sizeof(*lo), GFP_KERNEL);
+ if (!lo)
+ goto out;
+
+ lo->lo_state = Lo_unbound;
+
+ /* allocate id, if @id >= 0, we're requesting that specific id */
+ if (i >= 0) {
+ err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL);
+ if (err == -ENOSPC)
+ err = -EEXIST;
+ } else {
+ err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL);
+ }
+ if (err < 0)
+ goto out_free_dev;
+ i = err;
+
+ err = -ENOMEM;
+ lo->tag_set.ops = &loop_mq_ops;
+ lo->tag_set.nr_hw_queues = 1;
+ lo->tag_set.queue_depth = 128;
+ lo->tag_set.numa_node = NUMA_NO_NODE;
+ lo->tag_set.cmd_size = sizeof(struct loop_cmd);
+ lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ lo->tag_set.driver_data = lo;
+
+ err = blk_mq_alloc_tag_set(&lo->tag_set);
+ if (err)
+ goto out_free_idr;
+
+ lo->lo_queue = blk_mq_init_queue(&lo->tag_set);
+ if (IS_ERR_OR_NULL(lo->lo_queue)) {
+ err = PTR_ERR(lo->lo_queue);
+ goto out_cleanup_tags;
+ }
+ lo->lo_queue->queuedata = lo;
+
+ INIT_LIST_HEAD(&lo->write_cmd_head);
+ INIT_WORK(&lo->write_work, loop_queue_write_work);
+
+ disk = lo->lo_disk = alloc_disk(1 << part_shift);
+ if (!disk)
+ goto out_free_queue;
+
+ /*
+ * Disable partition scanning by default. The in-kernel partition
+ * scanning can be requested individually per-device during its
+ * setup. Userspace can always add and remove partitions from all
+ * devices. The needed partition minors are allocated from the
+ * extended minor space, the main loop device numbers will continue
+ * to match the loop minors, regardless of the number of partitions
+ * used.
+ *
+ * If max_part is given, partition scanning is globally enabled for
+ * all loop devices. The minors for the main loop devices will be
+ * multiples of max_part.
+ *
+ * Note: Global-for-all-devices, set-only-at-init, read-only module
+ * parameteters like 'max_loop' and 'max_part' make things needlessly
+ * complicated, are too static, inflexible and may surprise
+ * userspace tools. Parameters like this in general should be avoided.
+ */
+ if (!part_shift)
+ disk->flags |= GENHD_FL_NO_PART_SCAN;
+ disk->flags |= GENHD_FL_EXT_DEVT;
+ mutex_init(&lo->lo_ctl_mutex);
+ lo->lo_number = i;
+ spin_lock_init(&lo->lo_lock);
+ disk->major = LOOP_MAJOR;
+ disk->first_minor = i << part_shift;
+ disk->fops = &lo_fops;
+ disk->private_data = lo;
+ disk->queue = lo->lo_queue;
+ sprintf(disk->disk_name, "loop%d", i);
+ add_disk(disk);
+ *l = lo;
+ return lo->lo_number;
+
+out_free_queue:
+ blk_cleanup_queue(lo->lo_queue);
+out_cleanup_tags:
+ blk_mq_free_tag_set(&lo->tag_set);
+out_free_idr:
+ idr_remove(&loop_index_idr, i);
+out_free_dev:
+ kfree(lo);
+out:
+ return err;
+}
+
+static void loop_remove(struct loop_device *lo)
+{
+ blk_cleanup_queue(lo->lo_queue);
+ del_gendisk(lo->lo_disk);
+ blk_mq_free_tag_set(&lo->tag_set);
+ put_disk(lo->lo_disk);
+ kfree(lo);
+}
+
+static int find_free_cb(int id, void *ptr, void *data)
+{
+ struct loop_device *lo = ptr;
+ struct loop_device **l = data;
+
+ if (lo->lo_state == Lo_unbound) {
+ *l = lo;
+ return 1;
+ }
+ return 0;
+}
+
+static int loop_lookup(struct loop_device **l, int i)
+{
+ struct loop_device *lo;
+ int ret = -ENODEV;
+
+ if (i < 0) {
+ int err;
+
+ err = idr_for_each(&loop_index_idr, &find_free_cb, &lo);
+ if (err == 1) {
+ *l = lo;
+ ret = lo->lo_number;
+ }
+ goto out;
+ }
+
+ /* lookup and return a specific i */
+ lo = idr_find(&loop_index_idr, i);
+ if (lo) {
+ *l = lo;
+ ret = lo->lo_number;
+ }
+out:
+ return ret;
+}
+
+static struct kobject *loop_probe(dev_t dev, int *part, void *data)
+{
+ struct loop_device *lo;
+ struct kobject *kobj;
+ int err;
+
+ mutex_lock(&loop_index_mutex);
+ err = loop_lookup(&lo, MINOR(dev) >> part_shift);
+ if (err < 0)
+ err = loop_add(&lo, MINOR(dev) >> part_shift);
+ if (err < 0)
+ kobj = NULL;
+ else
+ kobj = get_disk(lo->lo_disk);
+ mutex_unlock(&loop_index_mutex);
+
+ *part = 0;
+ return kobj;
+}
+
+static long loop_control_ioctl(struct file *file, unsigned int cmd,
+ unsigned long parm)
+{
+ struct loop_device *lo;
+ int ret = -ENOSYS;
+
+ mutex_lock(&loop_index_mutex);
+ switch (cmd) {
+ case LOOP_CTL_ADD:
+ ret = loop_lookup(&lo, parm);
+ if (ret >= 0) {
+ ret = -EEXIST;
+ break;
+ }
+ ret = loop_add(&lo, parm);
+ break;
+ case LOOP_CTL_REMOVE:
+ ret = loop_lookup(&lo, parm);
+ if (ret < 0)
+ break;
+ mutex_lock(&lo->lo_ctl_mutex);
+ if (lo->lo_state != Lo_unbound) {
+ ret = -EBUSY;
+ mutex_unlock(&lo->lo_ctl_mutex);
+ break;
+ }
+ if (lo->lo_refcnt > 0) {
+ ret = -EBUSY;
+ mutex_unlock(&lo->lo_ctl_mutex);
+ break;
+ }
+ lo->lo_disk->private_data = NULL;
+ mutex_unlock(&lo->lo_ctl_mutex);
+ idr_remove(&loop_index_idr, lo->lo_number);
+ loop_remove(lo);
+ break;
+ case LOOP_CTL_GET_FREE:
+ ret = loop_lookup(&lo, -1);
+ if (ret >= 0)
+ break;
+ ret = loop_add(&lo, -1);
+ }
+ mutex_unlock(&loop_index_mutex);
+
+ return ret;
+}
+
+static const struct file_operations loop_ctl_fops = {
+ .open = nonseekable_open,
+ .unlocked_ioctl = loop_control_ioctl,
+ .compat_ioctl = loop_control_ioctl,
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+};
+
+static struct miscdevice loop_misc = {
+ .minor = LOOP_CTRL_MINOR,
+ .name = "loop-control",
+ .fops = &loop_ctl_fops,
+};
+
+MODULE_ALIAS_MISCDEV(LOOP_CTRL_MINOR);
+MODULE_ALIAS("devname:loop-control");
+
+static int __init loop_init(void)
+{
+ int i, nr;
+ unsigned long range;
+ struct loop_device *lo;
+ int err;
+
+ err = misc_register(&loop_misc);
+ if (err < 0)
+ return err;
+
+ part_shift = 0;
+ if (max_part > 0) {
+ part_shift = fls(max_part);
+
+ /*
+ * Adjust max_part according to part_shift as it is exported
+ * to user space so that user can decide correct minor number
+ * if [s]he want to create more devices.
+ *
+ * Note that -1 is required because partition 0 is reserved
+ * for the whole disk.
+ */
+ max_part = (1UL << part_shift) - 1;
+ }
+
+ if ((1UL << part_shift) > DISK_MAX_PARTS) {
+ err = -EINVAL;
+ goto misc_out;
+ }
+
+ if (max_loop > 1UL << (MINORBITS - part_shift)) {
+ err = -EINVAL;
+ goto misc_out;
+ }
+
+ /*
+ * If max_loop is specified, create that many devices upfront.
+ * This also becomes a hard limit. If max_loop is not specified,
+ * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
+ * init time. Loop devices can be requested on-demand with the
+ * /dev/loop-control interface, or be instantiated by accessing
+ * a 'dead' device node.
+ */
+ if (max_loop) {
+ nr = max_loop;
+ range = max_loop << part_shift;
+ } else {
+ nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
+ range = 1UL << MINORBITS;
+ }
+
+ if (register_blkdev(LOOP_MAJOR, "loop")) {
+ err = -EIO;
+ goto misc_out;
+ }
+
+ blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
+ THIS_MODULE, loop_probe, NULL, NULL);
+
+ /* pre-create number of devices given by config or max_loop */
+ mutex_lock(&loop_index_mutex);
+ for (i = 0; i < nr; i++)
+ loop_add(&lo, i);
+ mutex_unlock(&loop_index_mutex);
+
+ printk(KERN_INFO "loop: module loaded\n");
+ return 0;
+
+misc_out:
+ misc_deregister(&loop_misc);
+ return err;
+}
+
+static int loop_exit_cb(int id, void *ptr, void *data)
+{
+ struct loop_device *lo = ptr;
+
+ loop_remove(lo);
+ return 0;
+}
+
+static void __exit loop_exit(void)
+{
+ unsigned long range;
+
+ range = max_loop ? max_loop << part_shift : 1UL << MINORBITS;
+
+ idr_for_each(&loop_index_idr, &loop_exit_cb, NULL);
+ idr_destroy(&loop_index_idr);
+
+ blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
+ unregister_blkdev(LOOP_MAJOR, "loop");
+
+ misc_deregister(&loop_misc);
+}
+
+module_init(loop_init);
+module_exit(loop_exit);
+
+#ifndef MODULE
+static int __init max_loop_setup(char *str)
+{
+ max_loop = simple_strtol(str, NULL, 0);
+ return 1;
+}
+
+__setup("max_loop=", max_loop_setup);
+#endif
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
new file mode 100644
index 000000000..49564edf5
--- /dev/null
+++ b/drivers/block/loop.h
@@ -0,0 +1,92 @@
+/*
+ * loop.h
+ *
+ * Written by Theodore Ts'o, 3/29/93.
+ *
+ * Copyright 1993 by Theodore Ts'o. Redistribution of this file is
+ * permitted under the GNU General Public License.
+ */
+#ifndef _LINUX_LOOP_H
+#define _LINUX_LOOP_H
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <uapi/linux/loop.h>
+
+/* Possible states of device */
+enum {
+ Lo_unbound,
+ Lo_bound,
+ Lo_rundown,
+};
+
+struct loop_func_table;
+
+struct loop_device {
+ int lo_number;
+ int lo_refcnt;
+ loff_t lo_offset;
+ loff_t lo_sizelimit;
+ int lo_flags;
+ int (*transfer)(struct loop_device *, int cmd,
+ struct page *raw_page, unsigned raw_off,
+ struct page *loop_page, unsigned loop_off,
+ int size, sector_t real_block);
+ char lo_file_name[LO_NAME_SIZE];
+ char lo_crypt_name[LO_NAME_SIZE];
+ char lo_encrypt_key[LO_KEY_SIZE];
+ int lo_encrypt_key_size;
+ struct loop_func_table *lo_encryption;
+ __u32 lo_init[2];
+ kuid_t lo_key_owner; /* Who set the key */
+ int (*ioctl)(struct loop_device *, int cmd,
+ unsigned long arg);
+
+ struct file * lo_backing_file;
+ struct block_device *lo_device;
+ unsigned lo_blocksize;
+ void *key_data;
+
+ gfp_t old_gfp_mask;
+
+ spinlock_t lo_lock;
+ struct workqueue_struct *wq;
+ struct list_head write_cmd_head;
+ struct work_struct write_work;
+ bool write_started;
+ int lo_state;
+ struct mutex lo_ctl_mutex;
+
+ struct request_queue *lo_queue;
+ struct blk_mq_tag_set tag_set;
+ struct gendisk *lo_disk;
+};
+
+struct loop_cmd {
+ struct work_struct read_work;
+ struct request *rq;
+ struct list_head list;
+};
+
+/* Support for loadable transfer modules */
+struct loop_func_table {
+ int number; /* filter type */
+ int (*transfer)(struct loop_device *lo, int cmd,
+ struct page *raw_page, unsigned raw_off,
+ struct page *loop_page, unsigned loop_off,
+ int size, sector_t real_block);
+ int (*init)(struct loop_device *, const struct loop_info64 *);
+ /* release is called from loop_unregister_transfer or clr_fd */
+ int (*release)(struct loop_device *);
+ int (*ioctl)(struct loop_device *, int cmd, unsigned long arg);
+ struct module *owner;
+};
+
+int loop_register_transfer(struct loop_func_table *funcs);
+int loop_unregister_transfer(int number);
+
+#endif
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
new file mode 100644
index 000000000..145ce2aa2
--- /dev/null
+++ b/drivers/block/mg_disk.c
@@ -0,0 +1,1112 @@
+/*
+ * drivers/block/mg_disk.c
+ *
+ * Support for the mGine m[g]flash IO mode.
+ * Based on legacy hd.c
+ *
+ * (c) 2008 mGine Co.,LTD
+ * (c) 2008 unsik Kim <donari75@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/ata.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/platform_device.h>
+#include <linux/gpio.h>
+#include <linux/mg_disk.h>
+#include <linux/slab.h>
+
+#define MG_RES_SEC (CONFIG_MG_DISK_RES << 1)
+
+/* name for block device */
+#define MG_DISK_NAME "mgd"
+
+#define MG_DISK_MAJ 0
+#define MG_DISK_MAX_PART 16
+#define MG_SECTOR_SIZE 512
+#define MG_MAX_SECTS 256
+
+/* Register offsets */
+#define MG_BUFF_OFFSET 0x8000
+#define MG_REG_OFFSET 0xC000
+#define MG_REG_FEATURE (MG_REG_OFFSET + 2) /* write case */
+#define MG_REG_ERROR (MG_REG_OFFSET + 2) /* read case */
+#define MG_REG_SECT_CNT (MG_REG_OFFSET + 4)
+#define MG_REG_SECT_NUM (MG_REG_OFFSET + 6)
+#define MG_REG_CYL_LOW (MG_REG_OFFSET + 8)
+#define MG_REG_CYL_HIGH (MG_REG_OFFSET + 0xA)
+#define MG_REG_DRV_HEAD (MG_REG_OFFSET + 0xC)
+#define MG_REG_COMMAND (MG_REG_OFFSET + 0xE) /* write case */
+#define MG_REG_STATUS (MG_REG_OFFSET + 0xE) /* read case */
+#define MG_REG_DRV_CTRL (MG_REG_OFFSET + 0x10)
+#define MG_REG_BURST_CTRL (MG_REG_OFFSET + 0x12)
+
+/* handy status */
+#define MG_STAT_READY (ATA_DRDY | ATA_DSC)
+#define MG_READY_OK(s) (((s) & (MG_STAT_READY | (ATA_BUSY | ATA_DF | \
+ ATA_ERR))) == MG_STAT_READY)
+
+/* error code for others */
+#define MG_ERR_NONE 0
+#define MG_ERR_TIMEOUT 0x100
+#define MG_ERR_INIT_STAT 0x101
+#define MG_ERR_TRANSLATION 0x102
+#define MG_ERR_CTRL_RST 0x103
+#define MG_ERR_INV_STAT 0x104
+#define MG_ERR_RSTOUT 0x105
+
+#define MG_MAX_ERRORS 6 /* Max read/write errors */
+
+/* command */
+#define MG_CMD_RD 0x20
+#define MG_CMD_WR 0x30
+#define MG_CMD_SLEEP 0x99
+#define MG_CMD_WAKEUP 0xC3
+#define MG_CMD_ID 0xEC
+#define MG_CMD_WR_CONF 0x3C
+#define MG_CMD_RD_CONF 0x40
+
+/* operation mode */
+#define MG_OP_CASCADE (1 << 0)
+#define MG_OP_CASCADE_SYNC_RD (1 << 1)
+#define MG_OP_CASCADE_SYNC_WR (1 << 2)
+#define MG_OP_INTERLEAVE (1 << 3)
+
+/* synchronous */
+#define MG_BURST_LAT_4 (3 << 4)
+#define MG_BURST_LAT_5 (4 << 4)
+#define MG_BURST_LAT_6 (5 << 4)
+#define MG_BURST_LAT_7 (6 << 4)
+#define MG_BURST_LAT_8 (7 << 4)
+#define MG_BURST_LEN_4 (1 << 1)
+#define MG_BURST_LEN_8 (2 << 1)
+#define MG_BURST_LEN_16 (3 << 1)
+#define MG_BURST_LEN_32 (4 << 1)
+#define MG_BURST_LEN_CONT (0 << 1)
+
+/* timeout value (unit: ms) */
+#define MG_TMAX_CONF_TO_CMD 1
+#define MG_TMAX_WAIT_RD_DRQ 10
+#define MG_TMAX_WAIT_WR_DRQ 500
+#define MG_TMAX_RST_TO_BUSY 10
+#define MG_TMAX_HDRST_TO_RDY 500
+#define MG_TMAX_SWRST_TO_RDY 500
+#define MG_TMAX_RSTOUT 3000
+
+#define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST)
+
+/* main structure for mflash driver */
+struct mg_host {
+ struct device *dev;
+
+ struct request_queue *breq;
+ struct request *req;
+ spinlock_t lock;
+ struct gendisk *gd;
+
+ struct timer_list timer;
+ void (*mg_do_intr) (struct mg_host *);
+
+ u16 id[ATA_ID_WORDS];
+
+ u16 cyls;
+ u16 heads;
+ u16 sectors;
+ u32 n_sectors;
+ u32 nres_sectors;
+
+ void __iomem *dev_base;
+ unsigned int irq;
+ unsigned int rst;
+ unsigned int rstout;
+
+ u32 major;
+ u32 error;
+};
+
+/*
+ * Debugging macro and defines
+ */
+#undef DO_MG_DEBUG
+#ifdef DO_MG_DEBUG
+# define MG_DBG(fmt, args...) \
+ printk(KERN_DEBUG "%s:%d "fmt, __func__, __LINE__, ##args)
+#else /* CONFIG_MG_DEBUG */
+# define MG_DBG(fmt, args...) do { } while (0)
+#endif /* CONFIG_MG_DEBUG */
+
+static void mg_request(struct request_queue *);
+
+static bool mg_end_request(struct mg_host *host, int err, unsigned int nr_bytes)
+{
+ if (__blk_end_request(host->req, err, nr_bytes))
+ return true;
+
+ host->req = NULL;
+ return false;
+}
+
+static bool mg_end_request_cur(struct mg_host *host, int err)
+{
+ return mg_end_request(host, err, blk_rq_cur_bytes(host->req));
+}
+
+static void mg_dump_status(const char *msg, unsigned int stat,
+ struct mg_host *host)
+{
+ char *name = MG_DISK_NAME;
+
+ if (host->req)
+ name = host->req->rq_disk->disk_name;
+
+ printk(KERN_ERR "%s: %s: status=0x%02x { ", name, msg, stat & 0xff);
+ if (stat & ATA_BUSY)
+ printk("Busy ");
+ if (stat & ATA_DRDY)
+ printk("DriveReady ");
+ if (stat & ATA_DF)
+ printk("WriteFault ");
+ if (stat & ATA_DSC)
+ printk("SeekComplete ");
+ if (stat & ATA_DRQ)
+ printk("DataRequest ");
+ if (stat & ATA_CORR)
+ printk("CorrectedError ");
+ if (stat & ATA_ERR)
+ printk("Error ");
+ printk("}\n");
+ if ((stat & ATA_ERR) == 0) {
+ host->error = 0;
+ } else {
+ host->error = inb((unsigned long)host->dev_base + MG_REG_ERROR);
+ printk(KERN_ERR "%s: %s: error=0x%02x { ", name, msg,
+ host->error & 0xff);
+ if (host->error & ATA_BBK)
+ printk("BadSector ");
+ if (host->error & ATA_UNC)
+ printk("UncorrectableError ");
+ if (host->error & ATA_IDNF)
+ printk("SectorIdNotFound ");
+ if (host->error & ATA_ABORTED)
+ printk("DriveStatusError ");
+ if (host->error & ATA_AMNF)
+ printk("AddrMarkNotFound ");
+ printk("}");
+ if (host->error & (ATA_BBK | ATA_UNC | ATA_IDNF | ATA_AMNF)) {
+ if (host->req)
+ printk(", sector=%u",
+ (unsigned int)blk_rq_pos(host->req));
+ }
+ printk("\n");
+ }
+}
+
+static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec)
+{
+ u8 status;
+ unsigned long expire, cur_jiffies;
+ struct mg_drv_data *prv_data = host->dev->platform_data;
+
+ host->error = MG_ERR_NONE;
+ expire = jiffies + msecs_to_jiffies(msec);
+
+ /* These 2 times dummy status read prevents reading invalid
+ * status. A very little time (3 times of mflash operating clk)
+ * is required for busy bit is set. Use dummy read instead of
+ * busy wait, because mflash's PLL is machine dependent.
+ */
+ if (prv_data->use_polling) {
+ status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
+ status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
+ }
+
+ status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
+
+ do {
+ cur_jiffies = jiffies;
+ if (status & ATA_BUSY) {
+ if (expect == ATA_BUSY)
+ break;
+ } else {
+ /* Check the error condition! */
+ if (status & ATA_ERR) {
+ mg_dump_status("mg_wait", status, host);
+ break;
+ }
+
+ if (expect == MG_STAT_READY)
+ if (MG_READY_OK(status))
+ break;
+
+ if (expect == ATA_DRQ)
+ if (status & ATA_DRQ)
+ break;
+ }
+ if (!msec) {
+ mg_dump_status("not ready", status, host);
+ return MG_ERR_INV_STAT;
+ }
+
+ status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
+ } while (time_before(cur_jiffies, expire));
+
+ if (time_after_eq(cur_jiffies, expire) && msec)
+ host->error = MG_ERR_TIMEOUT;
+
+ return host->error;
+}
+
+static unsigned int mg_wait_rstout(u32 rstout, u32 msec)
+{
+ unsigned long expire;
+
+ expire = jiffies + msecs_to_jiffies(msec);
+ while (time_before(jiffies, expire)) {
+ if (gpio_get_value(rstout) == 1)
+ return MG_ERR_NONE;
+ msleep(10);
+ }
+
+ return MG_ERR_RSTOUT;
+}
+
+static void mg_unexpected_intr(struct mg_host *host)
+{
+ u32 status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
+
+ mg_dump_status("mg_unexpected_intr", status, host);
+}
+
+static irqreturn_t mg_irq(int irq, void *dev_id)
+{
+ struct mg_host *host = dev_id;
+ void (*handler)(struct mg_host *) = host->mg_do_intr;
+
+ spin_lock(&host->lock);
+
+ host->mg_do_intr = NULL;
+ del_timer(&host->timer);
+ if (!handler)
+ handler = mg_unexpected_intr;
+ handler(host);
+
+ spin_unlock(&host->lock);
+
+ return IRQ_HANDLED;
+}
+
+/* local copy of ata_id_string() */
+static void mg_id_string(const u16 *id, unsigned char *s,
+ unsigned int ofs, unsigned int len)
+{
+ unsigned int c;
+
+ BUG_ON(len & 1);
+
+ while (len > 0) {
+ c = id[ofs] >> 8;
+ *s = c;
+ s++;
+
+ c = id[ofs] & 0xff;
+ *s = c;
+ s++;
+
+ ofs++;
+ len -= 2;
+ }
+}
+
+/* local copy of ata_id_c_string() */
+static void mg_id_c_string(const u16 *id, unsigned char *s,
+ unsigned int ofs, unsigned int len)
+{
+ unsigned char *p;
+
+ mg_id_string(id, s, ofs, len - 1);
+
+ p = s + strnlen(s, len - 1);
+ while (p > s && p[-1] == ' ')
+ p--;
+ *p = '\0';
+}
+
+static int mg_get_disk_id(struct mg_host *host)
+{
+ u32 i;
+ s32 err;
+ const u16 *id = host->id;
+ struct mg_drv_data *prv_data = host->dev->platform_data;
+ char fwrev[ATA_ID_FW_REV_LEN + 1];
+ char model[ATA_ID_PROD_LEN + 1];
+ char serial[ATA_ID_SERNO_LEN + 1];
+
+ if (!prv_data->use_polling)
+ outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
+
+ outb(MG_CMD_ID, (unsigned long)host->dev_base + MG_REG_COMMAND);
+ err = mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_RD_DRQ);
+ if (err)
+ return err;
+
+ for (i = 0; i < (MG_SECTOR_SIZE >> 1); i++)
+ host->id[i] = le16_to_cpu(inw((unsigned long)host->dev_base +
+ MG_BUFF_OFFSET + i * 2));
+
+ outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
+ err = mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD);
+ if (err)
+ return err;
+
+ if ((id[ATA_ID_FIELD_VALID] & 1) == 0)
+ return MG_ERR_TRANSLATION;
+
+ host->n_sectors = ata_id_u32(id, ATA_ID_LBA_CAPACITY);
+ host->cyls = id[ATA_ID_CYLS];
+ host->heads = id[ATA_ID_HEADS];
+ host->sectors = id[ATA_ID_SECTORS];
+
+ if (MG_RES_SEC && host->heads && host->sectors) {
+ /* modify cyls, n_sectors */
+ host->cyls = (host->n_sectors - MG_RES_SEC) /
+ host->heads / host->sectors;
+ host->nres_sectors = host->n_sectors - host->cyls *
+ host->heads * host->sectors;
+ host->n_sectors -= host->nres_sectors;
+ }
+
+ mg_id_c_string(id, fwrev, ATA_ID_FW_REV, sizeof(fwrev));
+ mg_id_c_string(id, model, ATA_ID_PROD, sizeof(model));
+ mg_id_c_string(id, serial, ATA_ID_SERNO, sizeof(serial));
+ printk(KERN_INFO "mg_disk: model: %s\n", model);
+ printk(KERN_INFO "mg_disk: firm: %.8s\n", fwrev);
+ printk(KERN_INFO "mg_disk: serial: %s\n", serial);
+ printk(KERN_INFO "mg_disk: %d + reserved %d sectors\n",
+ host->n_sectors, host->nres_sectors);
+
+ if (!prv_data->use_polling)
+ outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
+
+ return err;
+}
+
+
+static int mg_disk_init(struct mg_host *host)
+{
+ struct mg_drv_data *prv_data = host->dev->platform_data;
+ s32 err;
+ u8 init_status;
+
+ /* hdd rst low */
+ gpio_set_value(host->rst, 0);
+ err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY);
+ if (err)
+ return err;
+
+ /* hdd rst high */
+ gpio_set_value(host->rst, 1);
+ err = mg_wait(host, MG_STAT_READY, MG_TMAX_HDRST_TO_RDY);
+ if (err)
+ return err;
+
+ /* soft reset on */
+ outb(ATA_SRST | (prv_data->use_polling ? ATA_NIEN : 0),
+ (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
+ err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY);
+ if (err)
+ return err;
+
+ /* soft reset off */
+ outb(prv_data->use_polling ? ATA_NIEN : 0,
+ (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
+ err = mg_wait(host, MG_STAT_READY, MG_TMAX_SWRST_TO_RDY);
+ if (err)
+ return err;
+
+ init_status = inb((unsigned long)host->dev_base + MG_REG_STATUS) & 0xf;
+
+ if (init_status == 0xf)
+ return MG_ERR_INIT_STAT;
+
+ return err;
+}
+
+static void mg_bad_rw_intr(struct mg_host *host)
+{
+ if (host->req)
+ if (++host->req->errors >= MG_MAX_ERRORS ||
+ host->error == MG_ERR_TIMEOUT)
+ mg_end_request_cur(host, -EIO);
+}
+
+static unsigned int mg_out(struct mg_host *host,
+ unsigned int sect_num,
+ unsigned int sect_cnt,
+ unsigned int cmd,
+ void (*intr_addr)(struct mg_host *))
+{
+ struct mg_drv_data *prv_data = host->dev->platform_data;
+
+ if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
+ return host->error;
+
+ if (!prv_data->use_polling) {
+ host->mg_do_intr = intr_addr;
+ mod_timer(&host->timer, jiffies + 3 * HZ);
+ }
+ if (MG_RES_SEC)
+ sect_num += MG_RES_SEC;
+ outb((u8)sect_cnt, (unsigned long)host->dev_base + MG_REG_SECT_CNT);
+ outb((u8)sect_num, (unsigned long)host->dev_base + MG_REG_SECT_NUM);
+ outb((u8)(sect_num >> 8), (unsigned long)host->dev_base +
+ MG_REG_CYL_LOW);
+ outb((u8)(sect_num >> 16), (unsigned long)host->dev_base +
+ MG_REG_CYL_HIGH);
+ outb((u8)((sect_num >> 24) | ATA_LBA | ATA_DEVICE_OBS),
+ (unsigned long)host->dev_base + MG_REG_DRV_HEAD);
+ outb(cmd, (unsigned long)host->dev_base + MG_REG_COMMAND);
+ return MG_ERR_NONE;
+}
+
+static void mg_read_one(struct mg_host *host, struct request *req)
+{
+ u16 *buff = (u16 *)bio_data(req->bio);
+ u32 i;
+
+ for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
+ *buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET +
+ (i << 1));
+}
+
+static void mg_read(struct request *req)
+{
+ struct mg_host *host = req->rq_disk->private_data;
+
+ if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req),
+ MG_CMD_RD, NULL) != MG_ERR_NONE)
+ mg_bad_rw_intr(host);
+
+ MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
+ blk_rq_sectors(req), blk_rq_pos(req), bio_data(req->bio));
+
+ do {
+ if (mg_wait(host, ATA_DRQ,
+ MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) {
+ mg_bad_rw_intr(host);
+ return;
+ }
+
+ mg_read_one(host, req);
+
+ outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base +
+ MG_REG_COMMAND);
+ } while (mg_end_request(host, 0, MG_SECTOR_SIZE));
+}
+
+static void mg_write_one(struct mg_host *host, struct request *req)
+{
+ u16 *buff = (u16 *)bio_data(req->bio);
+ u32 i;
+
+ for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
+ outw(*buff++, (unsigned long)host->dev_base + MG_BUFF_OFFSET +
+ (i << 1));
+}
+
+static void mg_write(struct request *req)
+{
+ struct mg_host *host = req->rq_disk->private_data;
+ unsigned int rem = blk_rq_sectors(req);
+
+ if (mg_out(host, blk_rq_pos(req), rem,
+ MG_CMD_WR, NULL) != MG_ERR_NONE) {
+ mg_bad_rw_intr(host);
+ return;
+ }
+
+ MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
+ rem, blk_rq_pos(req), bio_data(req->bio));
+
+ if (mg_wait(host, ATA_DRQ,
+ MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
+ mg_bad_rw_intr(host);
+ return;
+ }
+
+ do {
+ mg_write_one(host, req);
+
+ outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
+ MG_REG_COMMAND);
+
+ rem--;
+ if (rem > 1 && mg_wait(host, ATA_DRQ,
+ MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
+ mg_bad_rw_intr(host);
+ return;
+ } else if (mg_wait(host, MG_STAT_READY,
+ MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
+ mg_bad_rw_intr(host);
+ return;
+ }
+ } while (mg_end_request(host, 0, MG_SECTOR_SIZE));
+}
+
+static void mg_read_intr(struct mg_host *host)
+{
+ struct request *req = host->req;
+ u32 i;
+
+ /* check status */
+ do {
+ i = inb((unsigned long)host->dev_base + MG_REG_STATUS);
+ if (i & ATA_BUSY)
+ break;
+ if (!MG_READY_OK(i))
+ break;
+ if (i & ATA_DRQ)
+ goto ok_to_read;
+ } while (0);
+ mg_dump_status("mg_read_intr", i, host);
+ mg_bad_rw_intr(host);
+ mg_request(host->breq);
+ return;
+
+ok_to_read:
+ mg_read_one(host, req);
+
+ MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
+ blk_rq_pos(req), blk_rq_sectors(req) - 1, bio_data(req->bio));
+
+ /* send read confirm */
+ outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
+
+ if (mg_end_request(host, 0, MG_SECTOR_SIZE)) {
+ /* set handler if read remains */
+ host->mg_do_intr = mg_read_intr;
+ mod_timer(&host->timer, jiffies + 3 * HZ);
+ } else /* goto next request */
+ mg_request(host->breq);
+}
+
+static void mg_write_intr(struct mg_host *host)
+{
+ struct request *req = host->req;
+ u32 i;
+ bool rem;
+
+ /* check status */
+ do {
+ i = inb((unsigned long)host->dev_base + MG_REG_STATUS);
+ if (i & ATA_BUSY)
+ break;
+ if (!MG_READY_OK(i))
+ break;
+ if ((blk_rq_sectors(req) <= 1) || (i & ATA_DRQ))
+ goto ok_to_write;
+ } while (0);
+ mg_dump_status("mg_write_intr", i, host);
+ mg_bad_rw_intr(host);
+ mg_request(host->breq);
+ return;
+
+ok_to_write:
+ if ((rem = mg_end_request(host, 0, MG_SECTOR_SIZE))) {
+ /* write 1 sector and set handler if remains */
+ mg_write_one(host, req);
+ MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
+ blk_rq_pos(req), blk_rq_sectors(req), bio_data(req->bio));
+ host->mg_do_intr = mg_write_intr;
+ mod_timer(&host->timer, jiffies + 3 * HZ);
+ }
+
+ /* send write confirm */
+ outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
+
+ if (!rem)
+ mg_request(host->breq);
+}
+
+static void mg_times_out(unsigned long data)
+{
+ struct mg_host *host = (struct mg_host *)data;
+ char *name;
+
+ spin_lock_irq(&host->lock);
+
+ if (!host->req)
+ goto out_unlock;
+
+ host->mg_do_intr = NULL;
+
+ name = host->req->rq_disk->disk_name;
+ printk(KERN_DEBUG "%s: timeout\n", name);
+
+ host->error = MG_ERR_TIMEOUT;
+ mg_bad_rw_intr(host);
+
+out_unlock:
+ mg_request(host->breq);
+ spin_unlock_irq(&host->lock);
+}
+
+static void mg_request_poll(struct request_queue *q)
+{
+ struct mg_host *host = q->queuedata;
+
+ while (1) {
+ if (!host->req) {
+ host->req = blk_fetch_request(q);
+ if (!host->req)
+ break;
+ }
+
+ if (unlikely(host->req->cmd_type != REQ_TYPE_FS)) {
+ mg_end_request_cur(host, -EIO);
+ continue;
+ }
+
+ if (rq_data_dir(host->req) == READ)
+ mg_read(host->req);
+ else
+ mg_write(host->req);
+ }
+}
+
+static unsigned int mg_issue_req(struct request *req,
+ struct mg_host *host,
+ unsigned int sect_num,
+ unsigned int sect_cnt)
+{
+ switch (rq_data_dir(req)) {
+ case READ:
+ if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr)
+ != MG_ERR_NONE) {
+ mg_bad_rw_intr(host);
+ return host->error;
+ }
+ break;
+ case WRITE:
+ /* TODO : handler */
+ outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
+ if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr)
+ != MG_ERR_NONE) {
+ mg_bad_rw_intr(host);
+ return host->error;
+ }
+ del_timer(&host->timer);
+ mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ);
+ outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
+ if (host->error) {
+ mg_bad_rw_intr(host);
+ return host->error;
+ }
+ mg_write_one(host, req);
+ mod_timer(&host->timer, jiffies + 3 * HZ);
+ outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
+ MG_REG_COMMAND);
+ break;
+ }
+ return MG_ERR_NONE;
+}
+
+/* This function also called from IRQ context */
+static void mg_request(struct request_queue *q)
+{
+ struct mg_host *host = q->queuedata;
+ struct request *req;
+ u32 sect_num, sect_cnt;
+
+ while (1) {
+ if (!host->req) {
+ host->req = blk_fetch_request(q);
+ if (!host->req)
+ break;
+ }
+ req = host->req;
+
+ /* check unwanted request call */
+ if (host->mg_do_intr)
+ return;
+
+ del_timer(&host->timer);
+
+ sect_num = blk_rq_pos(req);
+ /* deal whole segments */
+ sect_cnt = blk_rq_sectors(req);
+
+ /* sanity check */
+ if (sect_num >= get_capacity(req->rq_disk) ||
+ ((sect_num + sect_cnt) >
+ get_capacity(req->rq_disk))) {
+ printk(KERN_WARNING
+ "%s: bad access: sector=%d, count=%d\n",
+ req->rq_disk->disk_name,
+ sect_num, sect_cnt);
+ mg_end_request_cur(host, -EIO);
+ continue;
+ }
+
+ if (unlikely(req->cmd_type != REQ_TYPE_FS)) {
+ mg_end_request_cur(host, -EIO);
+ continue;
+ }
+
+ if (!mg_issue_req(req, host, sect_num, sect_cnt))
+ return;
+ }
+}
+
+static int mg_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ struct mg_host *host = bdev->bd_disk->private_data;
+
+ geo->cylinders = (unsigned short)host->cyls;
+ geo->heads = (unsigned char)host->heads;
+ geo->sectors = (unsigned char)host->sectors;
+ return 0;
+}
+
+static const struct block_device_operations mg_disk_ops = {
+ .getgeo = mg_getgeo
+};
+
+#ifdef CONFIG_PM_SLEEP
+static int mg_suspend(struct device *dev)
+{
+ struct mg_drv_data *prv_data = dev->platform_data;
+ struct mg_host *host = prv_data->host;
+
+ if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
+ return -EIO;
+
+ if (!prv_data->use_polling)
+ outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
+
+ outb(MG_CMD_SLEEP, (unsigned long)host->dev_base + MG_REG_COMMAND);
+ /* wait until mflash deep sleep */
+ msleep(1);
+
+ if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) {
+ if (!prv_data->use_polling)
+ outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int mg_resume(struct device *dev)
+{
+ struct mg_drv_data *prv_data = dev->platform_data;
+ struct mg_host *host = prv_data->host;
+
+ if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
+ return -EIO;
+
+ outb(MG_CMD_WAKEUP, (unsigned long)host->dev_base + MG_REG_COMMAND);
+ /* wait until mflash wakeup */
+ msleep(1);
+
+ if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD))
+ return -EIO;
+
+ if (!prv_data->use_polling)
+ outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
+
+ return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume);
+
+static int mg_probe(struct platform_device *plat_dev)
+{
+ struct mg_host *host;
+ struct resource *rsc;
+ struct mg_drv_data *prv_data = plat_dev->dev.platform_data;
+ int err = 0;
+
+ if (!prv_data) {
+ printk(KERN_ERR "%s:%d fail (no driver_data)\n",
+ __func__, __LINE__);
+ err = -EINVAL;
+ goto probe_err;
+ }
+
+ /* alloc mg_host */
+ host = kzalloc(sizeof(struct mg_host), GFP_KERNEL);
+ if (!host) {
+ printk(KERN_ERR "%s:%d fail (no memory for mg_host)\n",
+ __func__, __LINE__);
+ err = -ENOMEM;
+ goto probe_err;
+ }
+ host->major = MG_DISK_MAJ;
+
+ /* link each other */
+ prv_data->host = host;
+ host->dev = &plat_dev->dev;
+
+ /* io remap */
+ rsc = platform_get_resource(plat_dev, IORESOURCE_MEM, 0);
+ if (!rsc) {
+ printk(KERN_ERR "%s:%d platform_get_resource fail\n",
+ __func__, __LINE__);
+ err = -EINVAL;
+ goto probe_err_2;
+ }
+ host->dev_base = ioremap(rsc->start, resource_size(rsc));
+ if (!host->dev_base) {
+ printk(KERN_ERR "%s:%d ioremap fail\n",
+ __func__, __LINE__);
+ err = -EIO;
+ goto probe_err_2;
+ }
+ MG_DBG("dev_base = 0x%x\n", (u32)host->dev_base);
+
+ /* get reset pin */
+ rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO,
+ MG_RST_PIN);
+ if (!rsc) {
+ printk(KERN_ERR "%s:%d get reset pin fail\n",
+ __func__, __LINE__);
+ err = -EIO;
+ goto probe_err_3;
+ }
+ host->rst = rsc->start;
+
+ /* init rst pin */
+ err = gpio_request(host->rst, MG_RST_PIN);
+ if (err)
+ goto probe_err_3;
+ gpio_direction_output(host->rst, 1);
+
+ /* reset out pin */
+ if (!(prv_data->dev_attr & MG_DEV_MASK)) {
+ err = -EINVAL;
+ goto probe_err_3a;
+ }
+
+ if (prv_data->dev_attr != MG_BOOT_DEV) {
+ rsc = platform_get_resource_byname(plat_dev, IORESOURCE_IO,
+ MG_RSTOUT_PIN);
+ if (!rsc) {
+ printk(KERN_ERR "%s:%d get reset-out pin fail\n",
+ __func__, __LINE__);
+ err = -EIO;
+ goto probe_err_3a;
+ }
+ host->rstout = rsc->start;
+ err = gpio_request(host->rstout, MG_RSTOUT_PIN);
+ if (err)
+ goto probe_err_3a;
+ gpio_direction_input(host->rstout);
+ }
+
+ /* disk reset */
+ if (prv_data->dev_attr == MG_STORAGE_DEV) {
+ /* If POR seq. not yet finished, wait */
+ err = mg_wait_rstout(host->rstout, MG_TMAX_RSTOUT);
+ if (err)
+ goto probe_err_3b;
+ err = mg_disk_init(host);
+ if (err) {
+ printk(KERN_ERR "%s:%d fail (err code : %d)\n",
+ __func__, __LINE__, err);
+ err = -EIO;
+ goto probe_err_3b;
+ }
+ }
+
+ /* get irq resource */
+ if (!prv_data->use_polling) {
+ host->irq = platform_get_irq(plat_dev, 0);
+ if (host->irq == -ENXIO) {
+ err = host->irq;
+ goto probe_err_3b;
+ }
+ err = request_irq(host->irq, mg_irq,
+ IRQF_TRIGGER_RISING,
+ MG_DEV_NAME, host);
+ if (err) {
+ printk(KERN_ERR "%s:%d fail (request_irq err=%d)\n",
+ __func__, __LINE__, err);
+ goto probe_err_3b;
+ }
+
+ }
+
+ /* get disk id */
+ err = mg_get_disk_id(host);
+ if (err) {
+ printk(KERN_ERR "%s:%d fail (err code : %d)\n",
+ __func__, __LINE__, err);
+ err = -EIO;
+ goto probe_err_4;
+ }
+
+ err = register_blkdev(host->major, MG_DISK_NAME);
+ if (err < 0) {
+ printk(KERN_ERR "%s:%d register_blkdev fail (err code : %d)\n",
+ __func__, __LINE__, err);
+ goto probe_err_4;
+ }
+ if (!host->major)
+ host->major = err;
+
+ spin_lock_init(&host->lock);
+
+ if (prv_data->use_polling)
+ host->breq = blk_init_queue(mg_request_poll, &host->lock);
+ else
+ host->breq = blk_init_queue(mg_request, &host->lock);
+
+ if (!host->breq) {
+ err = -ENOMEM;
+ printk(KERN_ERR "%s:%d (blk_init_queue) fail\n",
+ __func__, __LINE__);
+ goto probe_err_5;
+ }
+ host->breq->queuedata = host;
+
+ /* mflash is random device, thanx for the noop */
+ err = elevator_change(host->breq, "noop");
+ if (err) {
+ printk(KERN_ERR "%s:%d (elevator_init) fail\n",
+ __func__, __LINE__);
+ goto probe_err_6;
+ }
+ blk_queue_max_hw_sectors(host->breq, MG_MAX_SECTS);
+ blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE);
+
+ init_timer(&host->timer);
+ host->timer.function = mg_times_out;
+ host->timer.data = (unsigned long)host;
+
+ host->gd = alloc_disk(MG_DISK_MAX_PART);
+ if (!host->gd) {
+ printk(KERN_ERR "%s:%d (alloc_disk) fail\n",
+ __func__, __LINE__);
+ err = -ENOMEM;
+ goto probe_err_7;
+ }
+ host->gd->major = host->major;
+ host->gd->first_minor = 0;
+ host->gd->fops = &mg_disk_ops;
+ host->gd->queue = host->breq;
+ host->gd->private_data = host;
+ sprintf(host->gd->disk_name, MG_DISK_NAME"a");
+
+ set_capacity(host->gd, host->n_sectors);
+
+ add_disk(host->gd);
+
+ return err;
+
+probe_err_7:
+ del_timer_sync(&host->timer);
+probe_err_6:
+ blk_cleanup_queue(host->breq);
+probe_err_5:
+ unregister_blkdev(MG_DISK_MAJ, MG_DISK_NAME);
+probe_err_4:
+ if (!prv_data->use_polling)
+ free_irq(host->irq, host);
+probe_err_3b:
+ gpio_free(host->rstout);
+probe_err_3a:
+ gpio_free(host->rst);
+probe_err_3:
+ iounmap(host->dev_base);
+probe_err_2:
+ kfree(host);
+probe_err:
+ return err;
+}
+
+static int mg_remove(struct platform_device *plat_dev)
+{
+ struct mg_drv_data *prv_data = plat_dev->dev.platform_data;
+ struct mg_host *host = prv_data->host;
+ int err = 0;
+
+ /* delete timer */
+ del_timer_sync(&host->timer);
+
+ /* remove disk */
+ if (host->gd) {
+ del_gendisk(host->gd);
+ put_disk(host->gd);
+ }
+ /* remove queue */
+ if (host->breq)
+ blk_cleanup_queue(host->breq);
+
+ /* unregister blk device */
+ unregister_blkdev(host->major, MG_DISK_NAME);
+
+ /* free irq */
+ if (!prv_data->use_polling)
+ free_irq(host->irq, host);
+
+ /* free reset-out pin */
+ if (prv_data->dev_attr != MG_BOOT_DEV)
+ gpio_free(host->rstout);
+
+ /* free rst pin */
+ if (host->rst)
+ gpio_free(host->rst);
+
+ /* unmap io */
+ if (host->dev_base)
+ iounmap(host->dev_base);
+
+ /* free mg_host */
+ kfree(host);
+
+ return err;
+}
+
+static struct platform_driver mg_disk_driver = {
+ .probe = mg_probe,
+ .remove = mg_remove,
+ .driver = {
+ .name = MG_DEV_NAME,
+ .pm = &mg_pm,
+ }
+};
+
+/****************************************************************************
+ *
+ * Module stuff
+ *
+ ****************************************************************************/
+
+static int __init mg_init(void)
+{
+ printk(KERN_INFO "mGine mflash driver, (c) 2008 mGine Co.\n");
+ return platform_driver_register(&mg_disk_driver);
+}
+
+static void __exit mg_exit(void)
+{
+ printk(KERN_INFO "mflash driver : bye bye\n");
+ platform_driver_unregister(&mg_disk_driver);
+}
+
+module_init(mg_init);
+module_exit(mg_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("unsik Kim <donari75@gmail.com>");
+MODULE_DESCRIPTION("mGine m[g]flash device driver");
diff --git a/drivers/block/mtip32xx/Kconfig b/drivers/block/mtip32xx/Kconfig
new file mode 100644
index 000000000..0ba837fc6
--- /dev/null
+++ b/drivers/block/mtip32xx/Kconfig
@@ -0,0 +1,9 @@
+#
+# mtip32xx device driver configuration
+#
+
+config BLK_DEV_PCIESSD_MTIP32XX
+ tristate "Block Device Driver for Micron PCIe SSDs"
+ depends on PCI
+ help
+ This enables the block driver for Micron PCIe SSDs.
diff --git a/drivers/block/mtip32xx/Makefile b/drivers/block/mtip32xx/Makefile
new file mode 100644
index 000000000..4fbef8c83
--- /dev/null
+++ b/drivers/block/mtip32xx/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for Block device driver for Micron PCIe SSD
+#
+
+obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx.o
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
new file mode 100644
index 000000000..3bd7ca985
--- /dev/null
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -0,0 +1,4745 @@
+/*
+ * Driver for the Micron P320 SSD
+ * Copyright (C) 2011 Micron Technology, Inc.
+ *
+ * Portions of this code were derived from works subjected to the
+ * following copyright:
+ * Copyright (C) 2009 Integrated Device Technology, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/ata.h>
+#include <linux/delay.h>
+#include <linux/hdreg.h>
+#include <linux/uaccess.h>
+#include <linux/random.h>
+#include <linux/smp.h>
+#include <linux/compat.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/bio.h>
+#include <linux/dma-mapping.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <../drivers/ata/ahci.h>
+#include <linux/export.h>
+#include <linux/debugfs.h>
+#include <linux/prefetch.h>
+#include "mtip32xx.h"
+
+#define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32)
+
+/* DMA region containing RX Fis, Identify, RLE10, and SMART buffers */
+#define AHCI_RX_FIS_SZ 0x100
+#define AHCI_RX_FIS_OFFSET 0x0
+#define AHCI_IDFY_SZ ATA_SECT_SIZE
+#define AHCI_IDFY_OFFSET 0x400
+#define AHCI_SECTBUF_SZ ATA_SECT_SIZE
+#define AHCI_SECTBUF_OFFSET 0x800
+#define AHCI_SMARTBUF_SZ ATA_SECT_SIZE
+#define AHCI_SMARTBUF_OFFSET 0xC00
+/* 0x100 + 0x200 + 0x200 + 0x200 is smaller than 4k but we pad it out */
+#define BLOCK_DMA_ALLOC_SZ 4096
+
+/* DMA region containing command table (should be 8192 bytes) */
+#define AHCI_CMD_SLOT_SZ sizeof(struct mtip_cmd_hdr)
+#define AHCI_CMD_TBL_SZ (MTIP_MAX_COMMAND_SLOTS * AHCI_CMD_SLOT_SZ)
+#define AHCI_CMD_TBL_OFFSET 0x0
+
+/* DMA region per command (contains header and SGL) */
+#define AHCI_CMD_TBL_HDR_SZ 0x80
+#define AHCI_CMD_TBL_HDR_OFFSET 0x0
+#define AHCI_CMD_TBL_SGL_SZ (MTIP_MAX_SG * sizeof(struct mtip_cmd_sg))
+#define AHCI_CMD_TBL_SGL_OFFSET AHCI_CMD_TBL_HDR_SZ
+#define CMD_DMA_ALLOC_SZ (AHCI_CMD_TBL_SGL_SZ + AHCI_CMD_TBL_HDR_SZ)
+
+
+#define HOST_CAP_NZDMA (1 << 19)
+#define HOST_HSORG 0xFC
+#define HSORG_DISABLE_SLOTGRP_INTR (1<<24)
+#define HSORG_DISABLE_SLOTGRP_PXIS (1<<16)
+#define HSORG_HWREV 0xFF00
+#define HSORG_STYLE 0x8
+#define HSORG_SLOTGROUPS 0x7
+
+#define PORT_COMMAND_ISSUE 0x38
+#define PORT_SDBV 0x7C
+
+#define PORT_OFFSET 0x100
+#define PORT_MEM_SIZE 0x80
+
+#define PORT_IRQ_ERR \
+ (PORT_IRQ_HBUS_ERR | PORT_IRQ_IF_ERR | PORT_IRQ_CONNECT | \
+ PORT_IRQ_PHYRDY | PORT_IRQ_UNK_FIS | PORT_IRQ_BAD_PMP | \
+ PORT_IRQ_TF_ERR | PORT_IRQ_HBUS_DATA_ERR | PORT_IRQ_IF_NONFATAL | \
+ PORT_IRQ_OVERFLOW)
+#define PORT_IRQ_LEGACY \
+ (PORT_IRQ_PIOS_FIS | PORT_IRQ_D2H_REG_FIS)
+#define PORT_IRQ_HANDLED \
+ (PORT_IRQ_SDB_FIS | PORT_IRQ_LEGACY | \
+ PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR | \
+ PORT_IRQ_CONNECT | PORT_IRQ_PHYRDY)
+#define DEF_PORT_IRQ \
+ (PORT_IRQ_ERR | PORT_IRQ_LEGACY | PORT_IRQ_SDB_FIS)
+
+/* product numbers */
+#define MTIP_PRODUCT_UNKNOWN 0x00
+#define MTIP_PRODUCT_ASICFPGA 0x11
+
+/* Device instance number, incremented each time a device is probed. */
+static int instance;
+
+struct list_head online_list;
+struct list_head removing_list;
+spinlock_t dev_lock;
+
+/*
+ * Global variable used to hold the major block device number
+ * allocated in mtip_init().
+ */
+static int mtip_major;
+static struct dentry *dfs_parent;
+static struct dentry *dfs_device_status;
+
+static u32 cpu_use[NR_CPUS];
+
+static DEFINE_SPINLOCK(rssd_index_lock);
+static DEFINE_IDA(rssd_index_ida);
+
+static int mtip_block_initialize(struct driver_data *dd);
+
+#ifdef CONFIG_COMPAT
+struct mtip_compat_ide_task_request_s {
+ __u8 io_ports[8];
+ __u8 hob_ports[8];
+ ide_reg_valid_t out_flags;
+ ide_reg_valid_t in_flags;
+ int data_phase;
+ int req_cmd;
+ compat_ulong_t out_size;
+ compat_ulong_t in_size;
+};
+#endif
+
+/*
+ * This function check_for_surprise_removal is called
+ * while card is removed from the system and it will
+ * read the vendor id from the configration space
+ *
+ * @pdev Pointer to the pci_dev structure.
+ *
+ * return value
+ * true if device removed, else false
+ */
+static bool mtip_check_surprise_removal(struct pci_dev *pdev)
+{
+ u16 vendor_id = 0;
+ struct driver_data *dd = pci_get_drvdata(pdev);
+
+ if (dd->sr)
+ return true;
+
+ /* Read the vendorID from the configuration space */
+ pci_read_config_word(pdev, 0x00, &vendor_id);
+ if (vendor_id == 0xFFFF) {
+ dd->sr = true;
+ if (dd->queue)
+ set_bit(QUEUE_FLAG_DEAD, &dd->queue->queue_flags);
+ else
+ dev_warn(&dd->pdev->dev,
+ "%s: dd->queue is NULL\n", __func__);
+ if (dd->port) {
+ set_bit(MTIP_PF_SR_CLEANUP_BIT, &dd->port->flags);
+ wake_up_interruptible(&dd->port->svc_wait);
+ } else
+ dev_warn(&dd->pdev->dev,
+ "%s: dd->port is NULL\n", __func__);
+ return true; /* device removed */
+ }
+
+ return false; /* device present */
+}
+
+static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
+{
+ struct request *rq;
+
+ rq = blk_mq_alloc_request(dd->queue, 0, __GFP_WAIT, true);
+ return blk_mq_rq_to_pdu(rq);
+}
+
+static void mtip_put_int_command(struct driver_data *dd, struct mtip_cmd *cmd)
+{
+ blk_put_request(blk_mq_rq_from_pdu(cmd));
+}
+
+/*
+ * Once we add support for one hctx per mtip group, this will change a bit
+ */
+static struct request *mtip_rq_from_tag(struct driver_data *dd,
+ unsigned int tag)
+{
+ struct blk_mq_hw_ctx *hctx = dd->queue->queue_hw_ctx[0];
+
+ return blk_mq_tag_to_rq(hctx->tags, tag);
+}
+
+static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd,
+ unsigned int tag)
+{
+ struct request *rq = mtip_rq_from_tag(dd, tag);
+
+ return blk_mq_rq_to_pdu(rq);
+}
+
+/*
+ * IO completion function.
+ *
+ * This completion function is called by the driver ISR when a
+ * command that was issued by the kernel completes. It first calls the
+ * asynchronous completion function which normally calls back into the block
+ * layer passing the asynchronous callback data, then unmaps the
+ * scatter list associated with the completed command, and finally
+ * clears the allocated bit associated with the completed command.
+ *
+ * @port Pointer to the port data structure.
+ * @tag Tag of the command.
+ * @data Pointer to driver_data.
+ * @status Completion status.
+ *
+ * return value
+ * None
+ */
+static void mtip_async_complete(struct mtip_port *port,
+ int tag, struct mtip_cmd *cmd, int status)
+{
+ struct driver_data *dd = port->dd;
+ struct request *rq;
+
+ if (unlikely(!dd) || unlikely(!port))
+ return;
+
+ if (unlikely(status == PORT_IRQ_TF_ERR)) {
+ dev_warn(&port->dd->pdev->dev,
+ "Command tag %d failed due to TFE\n", tag);
+ }
+
+ /* Unmap the DMA scatter list entries */
+ dma_unmap_sg(&dd->pdev->dev, cmd->sg, cmd->scatter_ents, cmd->direction);
+
+ rq = mtip_rq_from_tag(dd, tag);
+
+ if (unlikely(cmd->unaligned))
+ up(&port->cmd_slot_unal);
+
+ blk_mq_end_request(rq, status ? -EIO : 0);
+}
+
+/*
+ * Reset the HBA (without sleeping)
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0 The reset was successful.
+ * -1 The HBA Reset bit did not clear.
+ */
+static int mtip_hba_reset(struct driver_data *dd)
+{
+ unsigned long timeout;
+
+ /* Set the reset bit */
+ writel(HOST_RESET, dd->mmio + HOST_CTL);
+
+ /* Flush */
+ readl(dd->mmio + HOST_CTL);
+
+ /* Spin for up to 2 seconds, waiting for reset acknowledgement */
+ timeout = jiffies + msecs_to_jiffies(2000);
+ do {
+ mdelay(10);
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))
+ return -1;
+
+ } while ((readl(dd->mmio + HOST_CTL) & HOST_RESET)
+ && time_before(jiffies, timeout));
+
+ if (readl(dd->mmio + HOST_CTL) & HOST_RESET)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Issue a command to the hardware.
+ *
+ * Set the appropriate bit in the s_active and Command Issue hardware
+ * registers, causing hardware command processing to begin.
+ *
+ * @port Pointer to the port structure.
+ * @tag The tag of the command to be issued.
+ *
+ * return value
+ * None
+ */
+static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag)
+{
+ int group = tag >> 5;
+
+ /* guard SACT and CI registers */
+ spin_lock(&port->cmd_issue_lock[group]);
+ writel((1 << MTIP_TAG_BIT(tag)),
+ port->s_active[MTIP_TAG_INDEX(tag)]);
+ writel((1 << MTIP_TAG_BIT(tag)),
+ port->cmd_issue[MTIP_TAG_INDEX(tag)]);
+ spin_unlock(&port->cmd_issue_lock[group]);
+}
+
+/*
+ * Enable/disable the reception of FIS
+ *
+ * @port Pointer to the port data structure
+ * @enable 1 to enable, 0 to disable
+ *
+ * return value
+ * Previous state: 1 enabled, 0 disabled
+ */
+static int mtip_enable_fis(struct mtip_port *port, int enable)
+{
+ u32 tmp;
+
+ /* enable FIS reception */
+ tmp = readl(port->mmio + PORT_CMD);
+ if (enable)
+ writel(tmp | PORT_CMD_FIS_RX, port->mmio + PORT_CMD);
+ else
+ writel(tmp & ~PORT_CMD_FIS_RX, port->mmio + PORT_CMD);
+
+ /* Flush */
+ readl(port->mmio + PORT_CMD);
+
+ return (((tmp & PORT_CMD_FIS_RX) == PORT_CMD_FIS_RX));
+}
+
+/*
+ * Enable/disable the DMA engine
+ *
+ * @port Pointer to the port data structure
+ * @enable 1 to enable, 0 to disable
+ *
+ * return value
+ * Previous state: 1 enabled, 0 disabled.
+ */
+static int mtip_enable_engine(struct mtip_port *port, int enable)
+{
+ u32 tmp;
+
+ /* enable FIS reception */
+ tmp = readl(port->mmio + PORT_CMD);
+ if (enable)
+ writel(tmp | PORT_CMD_START, port->mmio + PORT_CMD);
+ else
+ writel(tmp & ~PORT_CMD_START, port->mmio + PORT_CMD);
+
+ readl(port->mmio + PORT_CMD);
+ return (((tmp & PORT_CMD_START) == PORT_CMD_START));
+}
+
+/*
+ * Enables the port DMA engine and FIS reception.
+ *
+ * return value
+ * None
+ */
+static inline void mtip_start_port(struct mtip_port *port)
+{
+ /* Enable FIS reception */
+ mtip_enable_fis(port, 1);
+
+ /* Enable the DMA engine */
+ mtip_enable_engine(port, 1);
+}
+
+/*
+ * Deinitialize a port by disabling port interrupts, the DMA engine,
+ * and FIS reception.
+ *
+ * @port Pointer to the port structure
+ *
+ * return value
+ * None
+ */
+static inline void mtip_deinit_port(struct mtip_port *port)
+{
+ /* Disable interrupts on this port */
+ writel(0, port->mmio + PORT_IRQ_MASK);
+
+ /* Disable the DMA engine */
+ mtip_enable_engine(port, 0);
+
+ /* Disable FIS reception */
+ mtip_enable_fis(port, 0);
+}
+
+/*
+ * Initialize a port.
+ *
+ * This function deinitializes the port by calling mtip_deinit_port() and
+ * then initializes it by setting the command header and RX FIS addresses,
+ * clearing the SError register and any pending port interrupts before
+ * re-enabling the default set of port interrupts.
+ *
+ * @port Pointer to the port structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_init_port(struct mtip_port *port)
+{
+ int i;
+ mtip_deinit_port(port);
+
+ /* Program the command list base and FIS base addresses */
+ if (readl(port->dd->mmio + HOST_CAP) & HOST_CAP_64) {
+ writel((port->command_list_dma >> 16) >> 16,
+ port->mmio + PORT_LST_ADDR_HI);
+ writel((port->rxfis_dma >> 16) >> 16,
+ port->mmio + PORT_FIS_ADDR_HI);
+ }
+
+ writel(port->command_list_dma & 0xFFFFFFFF,
+ port->mmio + PORT_LST_ADDR);
+ writel(port->rxfis_dma & 0xFFFFFFFF, port->mmio + PORT_FIS_ADDR);
+
+ /* Clear SError */
+ writel(readl(port->mmio + PORT_SCR_ERR), port->mmio + PORT_SCR_ERR);
+
+ /* reset the completed registers.*/
+ for (i = 0; i < port->dd->slot_groups; i++)
+ writel(0xFFFFFFFF, port->completed[i]);
+
+ /* Clear any pending interrupts for this port */
+ writel(readl(port->mmio + PORT_IRQ_STAT), port->mmio + PORT_IRQ_STAT);
+
+ /* Clear any pending interrupts on the HBA. */
+ writel(readl(port->dd->mmio + HOST_IRQ_STAT),
+ port->dd->mmio + HOST_IRQ_STAT);
+
+ /* Enable port interrupts */
+ writel(DEF_PORT_IRQ, port->mmio + PORT_IRQ_MASK);
+}
+
+/*
+ * Restart a port
+ *
+ * @port Pointer to the port data structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_restart_port(struct mtip_port *port)
+{
+ unsigned long timeout;
+
+ /* Disable the DMA engine */
+ mtip_enable_engine(port, 0);
+
+ /* Chip quirk: wait up to 500ms for PxCMD.CR == 0 */
+ timeout = jiffies + msecs_to_jiffies(500);
+ while ((readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON)
+ && time_before(jiffies, timeout))
+ ;
+
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+ return;
+
+ /*
+ * Chip quirk: escalate to hba reset if
+ * PxCMD.CR not clear after 500 ms
+ */
+ if (readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON) {
+ dev_warn(&port->dd->pdev->dev,
+ "PxCMD.CR not clear, escalating reset\n");
+
+ if (mtip_hba_reset(port->dd))
+ dev_err(&port->dd->pdev->dev,
+ "HBA reset escalation failed.\n");
+
+ /* 30 ms delay before com reset to quiesce chip */
+ mdelay(30);
+ }
+
+ dev_warn(&port->dd->pdev->dev, "Issuing COM reset\n");
+
+ /* Set PxSCTL.DET */
+ writel(readl(port->mmio + PORT_SCR_CTL) |
+ 1, port->mmio + PORT_SCR_CTL);
+ readl(port->mmio + PORT_SCR_CTL);
+
+ /* Wait 1 ms to quiesce chip function */
+ timeout = jiffies + msecs_to_jiffies(1);
+ while (time_before(jiffies, timeout))
+ ;
+
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+ return;
+
+ /* Clear PxSCTL.DET */
+ writel(readl(port->mmio + PORT_SCR_CTL) & ~1,
+ port->mmio + PORT_SCR_CTL);
+ readl(port->mmio + PORT_SCR_CTL);
+
+ /* Wait 500 ms for bit 0 of PORT_SCR_STS to be set */
+ timeout = jiffies + msecs_to_jiffies(500);
+ while (((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0)
+ && time_before(jiffies, timeout))
+ ;
+
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+ return;
+
+ if ((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0)
+ dev_warn(&port->dd->pdev->dev,
+ "COM reset failed\n");
+
+ mtip_init_port(port);
+ mtip_start_port(port);
+
+}
+
+static int mtip_device_reset(struct driver_data *dd)
+{
+ int rv = 0;
+
+ if (mtip_check_surprise_removal(dd->pdev))
+ return 0;
+
+ if (mtip_hba_reset(dd) < 0)
+ rv = -EFAULT;
+
+ mdelay(1);
+ mtip_init_port(dd->port);
+ mtip_start_port(dd->port);
+
+ /* Enable interrupts on the HBA. */
+ writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
+ dd->mmio + HOST_CTL);
+ return rv;
+}
+
+/*
+ * Helper function for tag logging
+ */
+static void print_tags(struct driver_data *dd,
+ char *msg,
+ unsigned long *tagbits,
+ int cnt)
+{
+ unsigned char tagmap[128];
+ int group, tagmap_len = 0;
+
+ memset(tagmap, 0, sizeof(tagmap));
+ for (group = SLOTBITS_IN_LONGS; group > 0; group--)
+ tagmap_len += sprintf(tagmap + tagmap_len, "%016lX ",
+ tagbits[group-1]);
+ dev_warn(&dd->pdev->dev,
+ "%d command(s) %s: tagmap [%s]", cnt, msg, tagmap);
+}
+
+/*
+ * Internal command completion callback function.
+ *
+ * This function is normally called by the driver ISR when an internal
+ * command completed. This function signals the command completion by
+ * calling complete().
+ *
+ * @port Pointer to the port data structure.
+ * @tag Tag of the command that has completed.
+ * @data Pointer to a completion structure.
+ * @status Completion status.
+ *
+ * return value
+ * None
+ */
+static void mtip_completion(struct mtip_port *port,
+ int tag, struct mtip_cmd *command, int status)
+{
+ struct completion *waiting = command->comp_data;
+ if (unlikely(status == PORT_IRQ_TF_ERR))
+ dev_warn(&port->dd->pdev->dev,
+ "Internal command %d completed with TFE\n", tag);
+
+ complete(waiting);
+}
+
+static void mtip_null_completion(struct mtip_port *port,
+ int tag, struct mtip_cmd *command, int status)
+{
+}
+
+static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer,
+ dma_addr_t buffer_dma, unsigned int sectors);
+static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
+ struct smart_attr *attrib);
+/*
+ * Handle an error.
+ *
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_handle_tfe(struct driver_data *dd)
+{
+ int group, tag, bit, reissue, rv;
+ struct mtip_port *port;
+ struct mtip_cmd *cmd;
+ u32 completed;
+ struct host_to_dev_fis *fis;
+ unsigned long tagaccum[SLOTBITS_IN_LONGS];
+ unsigned int cmd_cnt = 0;
+ unsigned char *buf;
+ char *fail_reason = NULL;
+ int fail_all_ncq_write = 0, fail_all_ncq_cmds = 0;
+
+ dev_warn(&dd->pdev->dev, "Taskfile error\n");
+
+ port = dd->port;
+
+ set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+
+ if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) &&
+ test_bit(MTIP_TAG_INTERNAL, port->allocated)) {
+ cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL);
+ dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n");
+
+ if (cmd->comp_data && cmd->comp_func) {
+ cmd->comp_func(port, MTIP_TAG_INTERNAL,
+ cmd, PORT_IRQ_TF_ERR);
+ }
+ goto handle_tfe_exit;
+ }
+
+ /* clear the tag accumulator */
+ memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+
+ /* Loop through all the groups */
+ for (group = 0; group < dd->slot_groups; group++) {
+ completed = readl(port->completed[group]);
+
+ dev_warn(&dd->pdev->dev, "g=%u, comp=%x\n", group, completed);
+
+ /* clear completed status register in the hardware.*/
+ writel(completed, port->completed[group]);
+
+ /* Process successfully completed commands */
+ for (bit = 0; bit < 32 && completed; bit++) {
+ if (!(completed & (1<<bit)))
+ continue;
+ tag = (group << 5) + bit;
+
+ /* Skip the internal command slot */
+ if (tag == MTIP_TAG_INTERNAL)
+ continue;
+
+ cmd = mtip_cmd_from_tag(dd, tag);
+ if (likely(cmd->comp_func)) {
+ set_bit(tag, tagaccum);
+ cmd_cnt++;
+ cmd->comp_func(port, tag, cmd, 0);
+ } else {
+ dev_err(&port->dd->pdev->dev,
+ "Missing completion func for tag %d",
+ tag);
+ if (mtip_check_surprise_removal(dd->pdev)) {
+ /* don't proceed further */
+ return;
+ }
+ }
+ }
+ }
+
+ print_tags(dd, "completed (TFE)", tagaccum, cmd_cnt);
+
+ /* Restart the port */
+ mdelay(20);
+ mtip_restart_port(port);
+
+ /* Trying to determine the cause of the error */
+ rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
+ dd->port->log_buf,
+ dd->port->log_buf_dma, 1);
+ if (rv) {
+ dev_warn(&dd->pdev->dev,
+ "Error in READ LOG EXT (10h) command\n");
+ /* non-critical error, don't fail the load */
+ } else {
+ buf = (unsigned char *)dd->port->log_buf;
+ if (buf[259] & 0x1) {
+ dev_info(&dd->pdev->dev,
+ "Write protect bit is set.\n");
+ set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag);
+ fail_all_ncq_write = 1;
+ fail_reason = "write protect";
+ }
+ if (buf[288] == 0xF7) {
+ dev_info(&dd->pdev->dev,
+ "Exceeded Tmax, drive in thermal shutdown.\n");
+ set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag);
+ fail_all_ncq_cmds = 1;
+ fail_reason = "thermal shutdown";
+ }
+ if (buf[288] == 0xBF) {
+ set_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag);
+ dev_info(&dd->pdev->dev,
+ "Drive indicates rebuild has failed. Secure erase required.\n");
+ fail_all_ncq_cmds = 1;
+ fail_reason = "rebuild failed";
+ }
+ }
+
+ /* clear the tag accumulator */
+ memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+
+ /* Loop through all the groups */
+ for (group = 0; group < dd->slot_groups; group++) {
+ for (bit = 0; bit < 32; bit++) {
+ reissue = 1;
+ tag = (group << 5) + bit;
+ cmd = mtip_cmd_from_tag(dd, tag);
+
+ fis = (struct host_to_dev_fis *)cmd->command;
+
+ /* Should re-issue? */
+ if (tag == MTIP_TAG_INTERNAL ||
+ fis->command == ATA_CMD_SET_FEATURES)
+ reissue = 0;
+ else {
+ if (fail_all_ncq_cmds ||
+ (fail_all_ncq_write &&
+ fis->command == ATA_CMD_FPDMA_WRITE)) {
+ dev_warn(&dd->pdev->dev,
+ " Fail: %s w/tag %d [%s].\n",
+ fis->command == ATA_CMD_FPDMA_WRITE ?
+ "write" : "read",
+ tag,
+ fail_reason != NULL ?
+ fail_reason : "unknown");
+ if (cmd->comp_func) {
+ cmd->comp_func(port, tag,
+ cmd, -ENODATA);
+ }
+ continue;
+ }
+ }
+
+ /*
+ * First check if this command has
+ * exceeded its retries.
+ */
+ if (reissue && (cmd->retries-- > 0)) {
+
+ set_bit(tag, tagaccum);
+
+ /* Re-issue the command. */
+ mtip_issue_ncq_command(port, tag);
+
+ continue;
+ }
+
+ /* Retire a command that will not be reissued */
+ dev_warn(&port->dd->pdev->dev,
+ "retiring tag %d\n", tag);
+
+ if (cmd->comp_func)
+ cmd->comp_func(port, tag, cmd, PORT_IRQ_TF_ERR);
+ else
+ dev_warn(&port->dd->pdev->dev,
+ "Bad completion for tag %d\n",
+ tag);
+ }
+ }
+ print_tags(dd, "reissued (TFE)", tagaccum, cmd_cnt);
+
+handle_tfe_exit:
+ /* clear eh_active */
+ clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+ wake_up_interruptible(&port->svc_wait);
+}
+
+/*
+ * Handle a set device bits interrupt
+ */
+static inline void mtip_workq_sdbfx(struct mtip_port *port, int group,
+ u32 completed)
+{
+ struct driver_data *dd = port->dd;
+ int tag, bit;
+ struct mtip_cmd *command;
+
+ if (!completed) {
+ WARN_ON_ONCE(!completed);
+ return;
+ }
+ /* clear completed status register in the hardware.*/
+ writel(completed, port->completed[group]);
+
+ /* Process completed commands. */
+ for (bit = 0; (bit < 32) && completed; bit++) {
+ if (completed & 0x01) {
+ tag = (group << 5) | bit;
+
+ /* skip internal command slot. */
+ if (unlikely(tag == MTIP_TAG_INTERNAL))
+ continue;
+
+ command = mtip_cmd_from_tag(dd, tag);
+ if (likely(command->comp_func))
+ command->comp_func(port, tag, command, 0);
+ else {
+ dev_dbg(&dd->pdev->dev,
+ "Null completion for tag %d",
+ tag);
+
+ if (mtip_check_surprise_removal(
+ dd->pdev)) {
+ return;
+ }
+ }
+ }
+ completed >>= 1;
+ }
+
+ /* If last, re-enable interrupts */
+ if (atomic_dec_return(&dd->irq_workers_active) == 0)
+ writel(0xffffffff, dd->mmio + HOST_IRQ_STAT);
+}
+
+/*
+ * Process legacy pio and d2h interrupts
+ */
+static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat)
+{
+ struct mtip_port *port = dd->port;
+ struct mtip_cmd *cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL);
+
+ if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) &&
+ (cmd != NULL) && !(readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+ & (1 << MTIP_TAG_INTERNAL))) {
+ if (cmd->comp_func) {
+ cmd->comp_func(port, MTIP_TAG_INTERNAL, cmd, 0);
+ return;
+ }
+ }
+
+ return;
+}
+
+/*
+ * Demux and handle errors
+ */
+static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat)
+{
+
+ if (unlikely(port_stat & PORT_IRQ_CONNECT)) {
+ dev_warn(&dd->pdev->dev,
+ "Clearing PxSERR.DIAG.x\n");
+ writel((1 << 26), dd->port->mmio + PORT_SCR_ERR);
+ }
+
+ if (unlikely(port_stat & PORT_IRQ_PHYRDY)) {
+ dev_warn(&dd->pdev->dev,
+ "Clearing PxSERR.DIAG.n\n");
+ writel((1 << 16), dd->port->mmio + PORT_SCR_ERR);
+ }
+
+ if (unlikely(port_stat & ~PORT_IRQ_HANDLED)) {
+ dev_warn(&dd->pdev->dev,
+ "Port stat errors %x unhandled\n",
+ (port_stat & ~PORT_IRQ_HANDLED));
+ if (mtip_check_surprise_removal(dd->pdev))
+ return;
+ }
+ if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR))) {
+ set_bit(MTIP_PF_EH_ACTIVE_BIT, &dd->port->flags);
+ wake_up_interruptible(&dd->port->svc_wait);
+ }
+}
+
+static inline irqreturn_t mtip_handle_irq(struct driver_data *data)
+{
+ struct driver_data *dd = (struct driver_data *) data;
+ struct mtip_port *port = dd->port;
+ u32 hba_stat, port_stat;
+ int rv = IRQ_NONE;
+ int do_irq_enable = 1, i, workers;
+ struct mtip_work *twork;
+
+ hba_stat = readl(dd->mmio + HOST_IRQ_STAT);
+ if (hba_stat) {
+ rv = IRQ_HANDLED;
+
+ /* Acknowledge the interrupt status on the port.*/
+ port_stat = readl(port->mmio + PORT_IRQ_STAT);
+ writel(port_stat, port->mmio + PORT_IRQ_STAT);
+
+ /* Demux port status */
+ if (likely(port_stat & PORT_IRQ_SDB_FIS)) {
+ do_irq_enable = 0;
+ WARN_ON_ONCE(atomic_read(&dd->irq_workers_active) != 0);
+
+ /* Start at 1: group zero is always local? */
+ for (i = 0, workers = 0; i < MTIP_MAX_SLOT_GROUPS;
+ i++) {
+ twork = &dd->work[i];
+ twork->completed = readl(port->completed[i]);
+ if (twork->completed)
+ workers++;
+ }
+
+ atomic_set(&dd->irq_workers_active, workers);
+ if (workers) {
+ for (i = 1; i < MTIP_MAX_SLOT_GROUPS; i++) {
+ twork = &dd->work[i];
+ if (twork->completed)
+ queue_work_on(
+ twork->cpu_binding,
+ dd->isr_workq,
+ &twork->work);
+ }
+
+ if (likely(dd->work[0].completed))
+ mtip_workq_sdbfx(port, 0,
+ dd->work[0].completed);
+
+ } else {
+ /*
+ * Chip quirk: SDB interrupt but nothing
+ * to complete
+ */
+ do_irq_enable = 1;
+ }
+ }
+
+ if (unlikely(port_stat & PORT_IRQ_ERR)) {
+ if (unlikely(mtip_check_surprise_removal(dd->pdev))) {
+ /* don't proceed further */
+ return IRQ_HANDLED;
+ }
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &dd->dd_flag))
+ return rv;
+
+ mtip_process_errors(dd, port_stat & PORT_IRQ_ERR);
+ }
+
+ if (unlikely(port_stat & PORT_IRQ_LEGACY))
+ mtip_process_legacy(dd, port_stat & PORT_IRQ_LEGACY);
+ }
+
+ /* acknowledge interrupt */
+ if (unlikely(do_irq_enable))
+ writel(hba_stat, dd->mmio + HOST_IRQ_STAT);
+
+ return rv;
+}
+
+/*
+ * HBA interrupt subroutine.
+ *
+ * @irq IRQ number.
+ * @instance Pointer to the driver data structure.
+ *
+ * return value
+ * IRQ_HANDLED A HBA interrupt was pending and handled.
+ * IRQ_NONE This interrupt was not for the HBA.
+ */
+static irqreturn_t mtip_irq_handler(int irq, void *instance)
+{
+ struct driver_data *dd = instance;
+
+ return mtip_handle_irq(dd);
+}
+
+static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag)
+{
+ writel(1 << MTIP_TAG_BIT(tag),
+ port->cmd_issue[MTIP_TAG_INDEX(tag)]);
+}
+
+static bool mtip_pause_ncq(struct mtip_port *port,
+ struct host_to_dev_fis *fis)
+{
+ struct host_to_dev_fis *reply;
+ unsigned long task_file_data;
+
+ reply = port->rxfis + RX_FIS_D2H_REG;
+ task_file_data = readl(port->mmio+PORT_TFDATA);
+
+ if (fis->command == ATA_CMD_SEC_ERASE_UNIT)
+ clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+
+ if ((task_file_data & 1))
+ return false;
+
+ if (fis->command == ATA_CMD_SEC_ERASE_PREP) {
+ set_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
+ set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+ port->ic_pause_timer = jiffies;
+ return true;
+ } else if ((fis->command == ATA_CMD_DOWNLOAD_MICRO) &&
+ (fis->features == 0x03)) {
+ set_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
+ port->ic_pause_timer = jiffies;
+ return true;
+ } else if ((fis->command == ATA_CMD_SEC_ERASE_UNIT) ||
+ ((fis->command == 0xFC) &&
+ (fis->features == 0x27 || fis->features == 0x72 ||
+ fis->features == 0x62 || fis->features == 0x26))) {
+ /* Com reset after secure erase or lowlevel format */
+ mtip_restart_port(port);
+ return false;
+ }
+
+ return false;
+}
+
+/*
+ * Wait for port to quiesce
+ *
+ * @port Pointer to port data structure
+ * @timeout Max duration to wait (ms)
+ *
+ * return value
+ * 0 Success
+ * -EBUSY Commands still active
+ */
+static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
+{
+ unsigned long to;
+ unsigned int n;
+ unsigned int active = 1;
+
+ blk_mq_stop_hw_queues(port->dd->queue);
+
+ to = jiffies + msecs_to_jiffies(timeout);
+ do {
+ if (test_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags) &&
+ test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
+ msleep(20);
+ continue; /* svc thd is actively issuing commands */
+ }
+
+ msleep(100);
+ if (mtip_check_surprise_removal(port->dd->pdev))
+ goto err_fault;
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+ goto err_fault;
+
+ /*
+ * Ignore s_active bit 0 of array element 0.
+ * This bit will always be set
+ */
+ active = readl(port->s_active[0]) & 0xFFFFFFFE;
+ for (n = 1; n < port->dd->slot_groups; n++)
+ active |= readl(port->s_active[n]);
+
+ if (!active)
+ break;
+ } while (time_before(jiffies, to));
+
+ blk_mq_start_stopped_hw_queues(port->dd->queue, true);
+ return active ? -EBUSY : 0;
+err_fault:
+ blk_mq_start_stopped_hw_queues(port->dd->queue, true);
+ return -EFAULT;
+}
+
+/*
+ * Execute an internal command and wait for the completion.
+ *
+ * @port Pointer to the port data structure.
+ * @fis Pointer to the FIS that describes the command.
+ * @fis_len Length in WORDS of the FIS.
+ * @buffer DMA accessible for command data.
+ * @buf_len Length, in bytes, of the data buffer.
+ * @opts Command header options, excluding the FIS length
+ * and the number of PRD entries.
+ * @timeout Time in ms to wait for the command to complete.
+ *
+ * return value
+ * 0 Command completed successfully.
+ * -EFAULT The buffer address is not correctly aligned.
+ * -EBUSY Internal command or other IO in progress.
+ * -EAGAIN Time out waiting for command to complete.
+ */
+static int mtip_exec_internal_command(struct mtip_port *port,
+ struct host_to_dev_fis *fis,
+ int fis_len,
+ dma_addr_t buffer,
+ int buf_len,
+ u32 opts,
+ gfp_t atomic,
+ unsigned long timeout)
+{
+ struct mtip_cmd_sg *command_sg;
+ DECLARE_COMPLETION_ONSTACK(wait);
+ struct mtip_cmd *int_cmd;
+ struct driver_data *dd = port->dd;
+ int rv = 0;
+
+ /* Make sure the buffer is 8 byte aligned. This is asic specific. */
+ if (buffer & 0x00000007) {
+ dev_err(&dd->pdev->dev, "SG buffer is not 8 byte aligned\n");
+ return -EFAULT;
+ }
+
+ int_cmd = mtip_get_int_command(dd);
+
+ set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+ port->ic_pause_timer = 0;
+
+ clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
+ clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
+
+ if (atomic == GFP_KERNEL) {
+ if (fis->command != ATA_CMD_STANDBYNOW1) {
+ /* wait for io to complete if non atomic */
+ if (mtip_quiesce_io(port,
+ MTIP_QUIESCE_IO_TIMEOUT_MS) < 0) {
+ dev_warn(&dd->pdev->dev,
+ "Failed to quiesce IO\n");
+ mtip_put_int_command(dd, int_cmd);
+ clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+ wake_up_interruptible(&port->svc_wait);
+ return -EBUSY;
+ }
+ }
+
+ /* Set the completion function and data for the command. */
+ int_cmd->comp_data = &wait;
+ int_cmd->comp_func = mtip_completion;
+
+ } else {
+ /* Clear completion - we're going to poll */
+ int_cmd->comp_data = NULL;
+ int_cmd->comp_func = mtip_null_completion;
+ }
+
+ /* Copy the command to the command table */
+ memcpy(int_cmd->command, fis, fis_len*4);
+
+ /* Populate the SG list */
+ int_cmd->command_header->opts =
+ __force_bit2int cpu_to_le32(opts | fis_len);
+ if (buf_len) {
+ command_sg = int_cmd->command + AHCI_CMD_TBL_HDR_SZ;
+
+ command_sg->info =
+ __force_bit2int cpu_to_le32((buf_len-1) & 0x3FFFFF);
+ command_sg->dba =
+ __force_bit2int cpu_to_le32(buffer & 0xFFFFFFFF);
+ command_sg->dba_upper =
+ __force_bit2int cpu_to_le32((buffer >> 16) >> 16);
+
+ int_cmd->command_header->opts |=
+ __force_bit2int cpu_to_le32((1 << 16));
+ }
+
+ /* Populate the command header */
+ int_cmd->command_header->byte_count = 0;
+
+ /* Issue the command to the hardware */
+ mtip_issue_non_ncq_command(port, MTIP_TAG_INTERNAL);
+
+ if (atomic == GFP_KERNEL) {
+ /* Wait for the command to complete or timeout. */
+ if ((rv = wait_for_completion_interruptible_timeout(
+ &wait,
+ msecs_to_jiffies(timeout))) <= 0) {
+ if (rv == -ERESTARTSYS) { /* interrupted */
+ dev_err(&dd->pdev->dev,
+ "Internal command [%02X] was interrupted after %lu ms\n",
+ fis->command, timeout);
+ rv = -EINTR;
+ goto exec_ic_exit;
+ } else if (rv == 0) /* timeout */
+ dev_err(&dd->pdev->dev,
+ "Internal command did not complete [%02X] within timeout of %lu ms\n",
+ fis->command, timeout);
+ else
+ dev_err(&dd->pdev->dev,
+ "Internal command [%02X] wait returned code [%d] after %lu ms - unhandled\n",
+ fis->command, rv, timeout);
+
+ if (mtip_check_surprise_removal(dd->pdev) ||
+ test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &dd->dd_flag)) {
+ dev_err(&dd->pdev->dev,
+ "Internal command [%02X] wait returned due to SR\n",
+ fis->command);
+ rv = -ENXIO;
+ goto exec_ic_exit;
+ }
+ mtip_device_reset(dd); /* recover from timeout issue */
+ rv = -EAGAIN;
+ goto exec_ic_exit;
+ }
+ } else {
+ u32 hba_stat, port_stat;
+
+ /* Spin for <timeout> checking if command still outstanding */
+ timeout = jiffies + msecs_to_jiffies(timeout);
+ while ((readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+ & (1 << MTIP_TAG_INTERNAL))
+ && time_before(jiffies, timeout)) {
+ if (mtip_check_surprise_removal(dd->pdev)) {
+ rv = -ENXIO;
+ goto exec_ic_exit;
+ }
+ if ((fis->command != ATA_CMD_STANDBYNOW1) &&
+ test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &dd->dd_flag)) {
+ rv = -ENXIO;
+ goto exec_ic_exit;
+ }
+ port_stat = readl(port->mmio + PORT_IRQ_STAT);
+ if (!port_stat)
+ continue;
+
+ if (port_stat & PORT_IRQ_ERR) {
+ dev_err(&dd->pdev->dev,
+ "Internal command [%02X] failed\n",
+ fis->command);
+ mtip_device_reset(dd);
+ rv = -EIO;
+ goto exec_ic_exit;
+ } else {
+ writel(port_stat, port->mmio + PORT_IRQ_STAT);
+ hba_stat = readl(dd->mmio + HOST_IRQ_STAT);
+ if (hba_stat)
+ writel(hba_stat,
+ dd->mmio + HOST_IRQ_STAT);
+ }
+ break;
+ }
+ }
+
+ if (readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+ & (1 << MTIP_TAG_INTERNAL)) {
+ rv = -ENXIO;
+ if (!test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) {
+ mtip_device_reset(dd);
+ rv = -EAGAIN;
+ }
+ }
+exec_ic_exit:
+ /* Clear the allocated and active bits for the internal command. */
+ mtip_put_int_command(dd, int_cmd);
+ if (rv >= 0 && mtip_pause_ncq(port, fis)) {
+ /* NCQ paused */
+ return rv;
+ }
+ clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+ wake_up_interruptible(&port->svc_wait);
+
+ return rv;
+}
+
+/*
+ * Byte-swap ATA ID strings.
+ *
+ * ATA identify data contains strings in byte-swapped 16-bit words.
+ * They must be swapped (on all architectures) to be usable as C strings.
+ * This function swaps bytes in-place.
+ *
+ * @buf The buffer location of the string
+ * @len The number of bytes to swap
+ *
+ * return value
+ * None
+ */
+static inline void ata_swap_string(u16 *buf, unsigned int len)
+{
+ int i;
+ for (i = 0; i < (len/2); i++)
+ be16_to_cpus(&buf[i]);
+}
+
+static void mtip_set_timeout(struct driver_data *dd,
+ struct host_to_dev_fis *fis,
+ unsigned int *timeout, u8 erasemode)
+{
+ switch (fis->command) {
+ case ATA_CMD_DOWNLOAD_MICRO:
+ *timeout = 120000; /* 2 minutes */
+ break;
+ case ATA_CMD_SEC_ERASE_UNIT:
+ case 0xFC:
+ if (erasemode)
+ *timeout = ((*(dd->port->identify + 90) * 2) * 60000);
+ else
+ *timeout = ((*(dd->port->identify + 89) * 2) * 60000);
+ break;
+ case ATA_CMD_STANDBYNOW1:
+ *timeout = 120000; /* 2 minutes */
+ break;
+ case 0xF7:
+ case 0xFA:
+ *timeout = 60000; /* 60 seconds */
+ break;
+ case ATA_CMD_SMART:
+ *timeout = 15000; /* 15 seconds */
+ break;
+ default:
+ *timeout = MTIP_IOCTL_CMD_TIMEOUT_MS;
+ break;
+ }
+}
+
+/*
+ * Request the device identity information.
+ *
+ * If a user space buffer is not specified, i.e. is NULL, the
+ * identify information is still read from the drive and placed
+ * into the identify data buffer (@e port->identify) in the
+ * port data structure.
+ * When the identify buffer contains valid identify information @e
+ * port->identify_valid is non-zero.
+ *
+ * @port Pointer to the port structure.
+ * @user_buffer A user space buffer where the identify data should be
+ * copied.
+ *
+ * return value
+ * 0 Command completed successfully.
+ * -EFAULT An error occurred while coping data to the user buffer.
+ * -1 Command failed.
+ */
+static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer)
+{
+ int rv = 0;
+ struct host_to_dev_fis fis;
+
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+ return -EFAULT;
+
+ /* Build the FIS. */
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = ATA_CMD_ID_ATA;
+
+ /* Set the identify information as invalid. */
+ port->identify_valid = 0;
+
+ /* Clear the identify information. */
+ memset(port->identify, 0, sizeof(u16) * ATA_ID_WORDS);
+
+ /* Execute the command. */
+ if (mtip_exec_internal_command(port,
+ &fis,
+ 5,
+ port->identify_dma,
+ sizeof(u16) * ATA_ID_WORDS,
+ 0,
+ GFP_KERNEL,
+ MTIP_INT_CMD_TIMEOUT_MS)
+ < 0) {
+ rv = -1;
+ goto out;
+ }
+
+ /*
+ * Perform any necessary byte-swapping. Yes, the kernel does in fact
+ * perform field-sensitive swapping on the string fields.
+ * See the kernel use of ata_id_string() for proof of this.
+ */
+#ifdef __LITTLE_ENDIAN
+ ata_swap_string(port->identify + 27, 40); /* model string*/
+ ata_swap_string(port->identify + 23, 8); /* firmware string*/
+ ata_swap_string(port->identify + 10, 20); /* serial# string*/
+#else
+ {
+ int i;
+ for (i = 0; i < ATA_ID_WORDS; i++)
+ port->identify[i] = le16_to_cpu(port->identify[i]);
+ }
+#endif
+
+ /* Check security locked state */
+ if (port->identify[128] & 0x4)
+ set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+ else
+ clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+
+#ifdef MTIP_TRIM /* Disabling TRIM support temporarily */
+ /* Demux ID.DRAT & ID.RZAT to determine trim support */
+ if (port->identify[69] & (1 << 14) && port->identify[69] & (1 << 5))
+ port->dd->trim_supp = true;
+ else
+#endif
+ port->dd->trim_supp = false;
+
+ /* Set the identify buffer as valid. */
+ port->identify_valid = 1;
+
+ if (user_buffer) {
+ if (copy_to_user(
+ user_buffer,
+ port->identify,
+ ATA_ID_WORDS * sizeof(u16))) {
+ rv = -EFAULT;
+ goto out;
+ }
+ }
+
+out:
+ return rv;
+}
+
+/*
+ * Issue a standby immediate command to the device.
+ *
+ * @port Pointer to the port structure.
+ *
+ * return value
+ * 0 Command was executed successfully.
+ * -1 An error occurred while executing the command.
+ */
+static int mtip_standby_immediate(struct mtip_port *port)
+{
+ int rv;
+ struct host_to_dev_fis fis;
+ unsigned long start;
+ unsigned int timeout;
+
+ /* Build the FIS. */
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = ATA_CMD_STANDBYNOW1;
+
+ mtip_set_timeout(port->dd, &fis, &timeout, 0);
+
+ start = jiffies;
+ rv = mtip_exec_internal_command(port,
+ &fis,
+ 5,
+ 0,
+ 0,
+ 0,
+ GFP_ATOMIC,
+ timeout);
+ dbg_printk(MTIP_DRV_NAME "Time taken to complete standby cmd: %d ms\n",
+ jiffies_to_msecs(jiffies - start));
+ if (rv)
+ dev_warn(&port->dd->pdev->dev,
+ "STANDBY IMMEDIATE command failed.\n");
+
+ return rv;
+}
+
+/*
+ * Issue a READ LOG EXT command to the device.
+ *
+ * @port pointer to the port structure.
+ * @page page number to fetch
+ * @buffer pointer to buffer
+ * @buffer_dma dma address corresponding to @buffer
+ * @sectors page length to fetch, in sectors
+ *
+ * return value
+ * @rv return value from mtip_exec_internal_command()
+ */
+static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer,
+ dma_addr_t buffer_dma, unsigned int sectors)
+{
+ struct host_to_dev_fis fis;
+
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = ATA_CMD_READ_LOG_EXT;
+ fis.sect_count = sectors & 0xFF;
+ fis.sect_cnt_ex = (sectors >> 8) & 0xFF;
+ fis.lba_low = page;
+ fis.lba_mid = 0;
+ fis.device = ATA_DEVICE_OBS;
+
+ memset(buffer, 0, sectors * ATA_SECT_SIZE);
+
+ return mtip_exec_internal_command(port,
+ &fis,
+ 5,
+ buffer_dma,
+ sectors * ATA_SECT_SIZE,
+ 0,
+ GFP_ATOMIC,
+ MTIP_INT_CMD_TIMEOUT_MS);
+}
+
+/*
+ * Issue a SMART READ DATA command to the device.
+ *
+ * @port pointer to the port structure.
+ * @buffer pointer to buffer
+ * @buffer_dma dma address corresponding to @buffer
+ *
+ * return value
+ * @rv return value from mtip_exec_internal_command()
+ */
+static int mtip_get_smart_data(struct mtip_port *port, u8 *buffer,
+ dma_addr_t buffer_dma)
+{
+ struct host_to_dev_fis fis;
+
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = ATA_CMD_SMART;
+ fis.features = 0xD0;
+ fis.sect_count = 1;
+ fis.lba_mid = 0x4F;
+ fis.lba_hi = 0xC2;
+ fis.device = ATA_DEVICE_OBS;
+
+ return mtip_exec_internal_command(port,
+ &fis,
+ 5,
+ buffer_dma,
+ ATA_SECT_SIZE,
+ 0,
+ GFP_ATOMIC,
+ 15000);
+}
+
+/*
+ * Get the value of a smart attribute
+ *
+ * @port pointer to the port structure
+ * @id attribute number
+ * @attrib pointer to return attrib information corresponding to @id
+ *
+ * return value
+ * -EINVAL NULL buffer passed or unsupported attribute @id.
+ * -EPERM Identify data not valid, SMART not supported or not enabled
+ */
+static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
+ struct smart_attr *attrib)
+{
+ int rv, i;
+ struct smart_attr *pattr;
+
+ if (!attrib)
+ return -EINVAL;
+
+ if (!port->identify_valid) {
+ dev_warn(&port->dd->pdev->dev, "IDENTIFY DATA not valid\n");
+ return -EPERM;
+ }
+ if (!(port->identify[82] & 0x1)) {
+ dev_warn(&port->dd->pdev->dev, "SMART not supported\n");
+ return -EPERM;
+ }
+ if (!(port->identify[85] & 0x1)) {
+ dev_warn(&port->dd->pdev->dev, "SMART not enabled\n");
+ return -EPERM;
+ }
+
+ memset(port->smart_buf, 0, ATA_SECT_SIZE);
+ rv = mtip_get_smart_data(port, port->smart_buf, port->smart_buf_dma);
+ if (rv) {
+ dev_warn(&port->dd->pdev->dev, "Failed to ge SMART data\n");
+ return rv;
+ }
+
+ pattr = (struct smart_attr *)(port->smart_buf + 2);
+ for (i = 0; i < 29; i++, pattr++)
+ if (pattr->attr_id == id) {
+ memcpy(attrib, pattr, sizeof(struct smart_attr));
+ break;
+ }
+
+ if (i == 29) {
+ dev_warn(&port->dd->pdev->dev,
+ "Query for invalid SMART attribute ID\n");
+ rv = -EINVAL;
+ }
+
+ return rv;
+}
+
+/*
+ * Trim unused sectors
+ *
+ * @dd pointer to driver_data structure
+ * @lba starting lba
+ * @len # of 512b sectors to trim
+ *
+ * return value
+ * -ENOMEM Out of dma memory
+ * -EINVAL Invalid parameters passed in, trim not supported
+ * -EIO Error submitting trim request to hw
+ */
+static int mtip_send_trim(struct driver_data *dd, unsigned int lba,
+ unsigned int len)
+{
+ int i, rv = 0;
+ u64 tlba, tlen, sect_left;
+ struct mtip_trim_entry *buf;
+ dma_addr_t dma_addr;
+ struct host_to_dev_fis fis;
+
+ if (!len || dd->trim_supp == false)
+ return -EINVAL;
+
+ /* Trim request too big */
+ WARN_ON(len > (MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES));
+
+ /* Trim request not aligned on 4k boundary */
+ WARN_ON(len % 8 != 0);
+
+ /* Warn if vu_trim structure is too big */
+ WARN_ON(sizeof(struct mtip_trim) > ATA_SECT_SIZE);
+
+ /* Allocate a DMA buffer for the trim structure */
+ buf = dmam_alloc_coherent(&dd->pdev->dev, ATA_SECT_SIZE, &dma_addr,
+ GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+ memset(buf, 0, ATA_SECT_SIZE);
+
+ for (i = 0, sect_left = len, tlba = lba;
+ i < MTIP_MAX_TRIM_ENTRIES && sect_left;
+ i++) {
+ tlen = (sect_left >= MTIP_MAX_TRIM_ENTRY_LEN ?
+ MTIP_MAX_TRIM_ENTRY_LEN :
+ sect_left);
+ buf[i].lba = __force_bit2int cpu_to_le32(tlba);
+ buf[i].range = __force_bit2int cpu_to_le16(tlen);
+ tlba += tlen;
+ sect_left -= tlen;
+ }
+ WARN_ON(sect_left != 0);
+
+ /* Build the fis */
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = 0xfb;
+ fis.features = 0x60;
+ fis.sect_count = 1;
+ fis.device = ATA_DEVICE_OBS;
+
+ if (mtip_exec_internal_command(dd->port,
+ &fis,
+ 5,
+ dma_addr,
+ ATA_SECT_SIZE,
+ 0,
+ GFP_KERNEL,
+ MTIP_TRIM_TIMEOUT_MS) < 0)
+ rv = -EIO;
+
+ dmam_free_coherent(&dd->pdev->dev, ATA_SECT_SIZE, buf, dma_addr);
+ return rv;
+}
+
+/*
+ * Get the drive capacity.
+ *
+ * @dd Pointer to the device data structure.
+ * @sectors Pointer to the variable that will receive the sector count.
+ *
+ * return value
+ * 1 Capacity was returned successfully.
+ * 0 The identify information is invalid.
+ */
+static bool mtip_hw_get_capacity(struct driver_data *dd, sector_t *sectors)
+{
+ struct mtip_port *port = dd->port;
+ u64 total, raw0, raw1, raw2, raw3;
+ raw0 = port->identify[100];
+ raw1 = port->identify[101];
+ raw2 = port->identify[102];
+ raw3 = port->identify[103];
+ total = raw0 | raw1<<16 | raw2<<32 | raw3<<48;
+ *sectors = total;
+ return (bool) !!port->identify_valid;
+}
+
+/*
+ * Display the identify command data.
+ *
+ * @port Pointer to the port data structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_dump_identify(struct mtip_port *port)
+{
+ sector_t sectors;
+ unsigned short revid;
+ char cbuf[42];
+
+ if (!port->identify_valid)
+ return;
+
+ strlcpy(cbuf, (char *)(port->identify+10), 21);
+ dev_info(&port->dd->pdev->dev,
+ "Serial No.: %s\n", cbuf);
+
+ strlcpy(cbuf, (char *)(port->identify+23), 9);
+ dev_info(&port->dd->pdev->dev,
+ "Firmware Ver.: %s\n", cbuf);
+
+ strlcpy(cbuf, (char *)(port->identify+27), 41);
+ dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf);
+
+ dev_info(&port->dd->pdev->dev, "Security: %04x %s\n",
+ port->identify[128],
+ port->identify[128] & 0x4 ? "(LOCKED)" : "");
+
+ if (mtip_hw_get_capacity(port->dd, &sectors))
+ dev_info(&port->dd->pdev->dev,
+ "Capacity: %llu sectors (%llu MB)\n",
+ (u64)sectors,
+ ((u64)sectors) * ATA_SECT_SIZE >> 20);
+
+ pci_read_config_word(port->dd->pdev, PCI_REVISION_ID, &revid);
+ switch (revid & 0xFF) {
+ case 0x1:
+ strlcpy(cbuf, "A0", 3);
+ break;
+ case 0x3:
+ strlcpy(cbuf, "A2", 3);
+ break;
+ default:
+ strlcpy(cbuf, "?", 2);
+ break;
+ }
+ dev_info(&port->dd->pdev->dev,
+ "Card Type: %s\n", cbuf);
+}
+
+/*
+ * Map the commands scatter list into the command table.
+ *
+ * @command Pointer to the command.
+ * @nents Number of scatter list entries.
+ *
+ * return value
+ * None
+ */
+static inline void fill_command_sg(struct driver_data *dd,
+ struct mtip_cmd *command,
+ int nents)
+{
+ int n;
+ unsigned int dma_len;
+ struct mtip_cmd_sg *command_sg;
+ struct scatterlist *sg = command->sg;
+
+ command_sg = command->command + AHCI_CMD_TBL_HDR_SZ;
+
+ for (n = 0; n < nents; n++) {
+ dma_len = sg_dma_len(sg);
+ if (dma_len > 0x400000)
+ dev_err(&dd->pdev->dev,
+ "DMA segment length truncated\n");
+ command_sg->info = __force_bit2int
+ cpu_to_le32((dma_len-1) & 0x3FFFFF);
+ command_sg->dba = __force_bit2int
+ cpu_to_le32(sg_dma_address(sg));
+ command_sg->dba_upper = __force_bit2int
+ cpu_to_le32((sg_dma_address(sg) >> 16) >> 16);
+ command_sg++;
+ sg++;
+ }
+}
+
+/*
+ * @brief Execute a drive command.
+ *
+ * return value 0 The command completed successfully.
+ * return value -1 An error occurred while executing the command.
+ */
+static int exec_drive_task(struct mtip_port *port, u8 *command)
+{
+ struct host_to_dev_fis fis;
+ struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG);
+ unsigned int to;
+
+ /* Build the FIS. */
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = command[0];
+ fis.features = command[1];
+ fis.sect_count = command[2];
+ fis.sector = command[3];
+ fis.cyl_low = command[4];
+ fis.cyl_hi = command[5];
+ fis.device = command[6] & ~0x10; /* Clear the dev bit*/
+
+ mtip_set_timeout(port->dd, &fis, &to, 0);
+
+ dbg_printk(MTIP_DRV_NAME " %s: User Command: cmd %x, feat %x, nsect %x, sect %x, lcyl %x, hcyl %x, sel %x\n",
+ __func__,
+ command[0],
+ command[1],
+ command[2],
+ command[3],
+ command[4],
+ command[5],
+ command[6]);
+
+ /* Execute the command. */
+ if (mtip_exec_internal_command(port,
+ &fis,
+ 5,
+ 0,
+ 0,
+ 0,
+ GFP_KERNEL,
+ to) < 0) {
+ return -1;
+ }
+
+ command[0] = reply->command; /* Status*/
+ command[1] = reply->features; /* Error*/
+ command[4] = reply->cyl_low;
+ command[5] = reply->cyl_hi;
+
+ dbg_printk(MTIP_DRV_NAME " %s: Completion Status: stat %x, err %x , cyl_lo %x cyl_hi %x\n",
+ __func__,
+ command[0],
+ command[1],
+ command[4],
+ command[5]);
+
+ return 0;
+}
+
+/*
+ * @brief Execute a drive command.
+ *
+ * @param port Pointer to the port data structure.
+ * @param command Pointer to the user specified command parameters.
+ * @param user_buffer Pointer to the user space buffer where read sector
+ * data should be copied.
+ *
+ * return value 0 The command completed successfully.
+ * return value -EFAULT An error occurred while copying the completion
+ * data to the user space buffer.
+ * return value -1 An error occurred while executing the command.
+ */
+static int exec_drive_command(struct mtip_port *port, u8 *command,
+ void __user *user_buffer)
+{
+ struct host_to_dev_fis fis;
+ struct host_to_dev_fis *reply;
+ u8 *buf = NULL;
+ dma_addr_t dma_addr = 0;
+ int rv = 0, xfer_sz = command[3];
+ unsigned int to;
+
+ if (xfer_sz) {
+ if (!user_buffer)
+ return -EFAULT;
+
+ buf = dmam_alloc_coherent(&port->dd->pdev->dev,
+ ATA_SECT_SIZE * xfer_sz,
+ &dma_addr,
+ GFP_KERNEL);
+ if (!buf) {
+ dev_err(&port->dd->pdev->dev,
+ "Memory allocation failed (%d bytes)\n",
+ ATA_SECT_SIZE * xfer_sz);
+ return -ENOMEM;
+ }
+ memset(buf, 0, ATA_SECT_SIZE * xfer_sz);
+ }
+
+ /* Build the FIS. */
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = command[0];
+ fis.features = command[2];
+ fis.sect_count = command[3];
+ if (fis.command == ATA_CMD_SMART) {
+ fis.sector = command[1];
+ fis.cyl_low = 0x4F;
+ fis.cyl_hi = 0xC2;
+ }
+
+ mtip_set_timeout(port->dd, &fis, &to, 0);
+
+ if (xfer_sz)
+ reply = (port->rxfis + RX_FIS_PIO_SETUP);
+ else
+ reply = (port->rxfis + RX_FIS_D2H_REG);
+
+ dbg_printk(MTIP_DRV_NAME
+ " %s: User Command: cmd %x, sect %x, "
+ "feat %x, sectcnt %x\n",
+ __func__,
+ command[0],
+ command[1],
+ command[2],
+ command[3]);
+
+ /* Execute the command. */
+ if (mtip_exec_internal_command(port,
+ &fis,
+ 5,
+ (xfer_sz ? dma_addr : 0),
+ (xfer_sz ? ATA_SECT_SIZE * xfer_sz : 0),
+ 0,
+ GFP_KERNEL,
+ to)
+ < 0) {
+ rv = -EFAULT;
+ goto exit_drive_command;
+ }
+
+ /* Collect the completion status. */
+ command[0] = reply->command; /* Status*/
+ command[1] = reply->features; /* Error*/
+ command[2] = reply->sect_count;
+
+ dbg_printk(MTIP_DRV_NAME
+ " %s: Completion Status: stat %x, "
+ "err %x, nsect %x\n",
+ __func__,
+ command[0],
+ command[1],
+ command[2]);
+
+ if (xfer_sz) {
+ if (copy_to_user(user_buffer,
+ buf,
+ ATA_SECT_SIZE * command[3])) {
+ rv = -EFAULT;
+ goto exit_drive_command;
+ }
+ }
+exit_drive_command:
+ if (buf)
+ dmam_free_coherent(&port->dd->pdev->dev,
+ ATA_SECT_SIZE * xfer_sz, buf, dma_addr);
+ return rv;
+}
+
+/*
+ * Indicates whether a command has a single sector payload.
+ *
+ * @command passed to the device to perform the certain event.
+ * @features passed to the device to perform the certain event.
+ *
+ * return value
+ * 1 command is one that always has a single sector payload,
+ * regardless of the value in the Sector Count field.
+ * 0 otherwise
+ *
+ */
+static unsigned int implicit_sector(unsigned char command,
+ unsigned char features)
+{
+ unsigned int rv = 0;
+
+ /* list of commands that have an implicit sector count of 1 */
+ switch (command) {
+ case ATA_CMD_SEC_SET_PASS:
+ case ATA_CMD_SEC_UNLOCK:
+ case ATA_CMD_SEC_ERASE_PREP:
+ case ATA_CMD_SEC_ERASE_UNIT:
+ case ATA_CMD_SEC_FREEZE_LOCK:
+ case ATA_CMD_SEC_DISABLE_PASS:
+ case ATA_CMD_PMP_READ:
+ case ATA_CMD_PMP_WRITE:
+ rv = 1;
+ break;
+ case ATA_CMD_SET_MAX:
+ if (features == ATA_SET_MAX_UNLOCK)
+ rv = 1;
+ break;
+ case ATA_CMD_SMART:
+ if ((features == ATA_SMART_READ_VALUES) ||
+ (features == ATA_SMART_READ_THRESHOLDS))
+ rv = 1;
+ break;
+ case ATA_CMD_CONF_OVERLAY:
+ if ((features == ATA_DCO_IDENTIFY) ||
+ (features == ATA_DCO_SET))
+ rv = 1;
+ break;
+ }
+ return rv;
+}
+
+/*
+ * Executes a taskfile
+ * See ide_taskfile_ioctl() for derivation
+ */
+static int exec_drive_taskfile(struct driver_data *dd,
+ void __user *buf,
+ ide_task_request_t *req_task,
+ int outtotal)
+{
+ struct host_to_dev_fis fis;
+ struct host_to_dev_fis *reply;
+ u8 *outbuf = NULL;
+ u8 *inbuf = NULL;
+ dma_addr_t outbuf_dma = 0;
+ dma_addr_t inbuf_dma = 0;
+ dma_addr_t dma_buffer = 0;
+ int err = 0;
+ unsigned int taskin = 0;
+ unsigned int taskout = 0;
+ u8 nsect = 0;
+ unsigned int timeout;
+ unsigned int force_single_sector;
+ unsigned int transfer_size;
+ unsigned long task_file_data;
+ int intotal = outtotal + req_task->out_size;
+ int erasemode = 0;
+
+ taskout = req_task->out_size;
+ taskin = req_task->in_size;
+ /* 130560 = 512 * 0xFF*/
+ if (taskin > 130560 || taskout > 130560) {
+ err = -EINVAL;
+ goto abort;
+ }
+
+ if (taskout) {
+ outbuf = kzalloc(taskout, GFP_KERNEL);
+ if (outbuf == NULL) {
+ err = -ENOMEM;
+ goto abort;
+ }
+ if (copy_from_user(outbuf, buf + outtotal, taskout)) {
+ err = -EFAULT;
+ goto abort;
+ }
+ outbuf_dma = pci_map_single(dd->pdev,
+ outbuf,
+ taskout,
+ DMA_TO_DEVICE);
+ if (outbuf_dma == 0) {
+ err = -ENOMEM;
+ goto abort;
+ }
+ dma_buffer = outbuf_dma;
+ }
+
+ if (taskin) {
+ inbuf = kzalloc(taskin, GFP_KERNEL);
+ if (inbuf == NULL) {
+ err = -ENOMEM;
+ goto abort;
+ }
+
+ if (copy_from_user(inbuf, buf + intotal, taskin)) {
+ err = -EFAULT;
+ goto abort;
+ }
+ inbuf_dma = pci_map_single(dd->pdev,
+ inbuf,
+ taskin, DMA_FROM_DEVICE);
+ if (inbuf_dma == 0) {
+ err = -ENOMEM;
+ goto abort;
+ }
+ dma_buffer = inbuf_dma;
+ }
+
+ /* only supports PIO and non-data commands from this ioctl. */
+ switch (req_task->data_phase) {
+ case TASKFILE_OUT:
+ nsect = taskout / ATA_SECT_SIZE;
+ reply = (dd->port->rxfis + RX_FIS_PIO_SETUP);
+ break;
+ case TASKFILE_IN:
+ reply = (dd->port->rxfis + RX_FIS_PIO_SETUP);
+ break;
+ case TASKFILE_NO_DATA:
+ reply = (dd->port->rxfis + RX_FIS_D2H_REG);
+ break;
+ default:
+ err = -EINVAL;
+ goto abort;
+ }
+
+ /* Build the FIS. */
+ memset(&fis, 0, sizeof(struct host_to_dev_fis));
+
+ fis.type = 0x27;
+ fis.opts = 1 << 7;
+ fis.command = req_task->io_ports[7];
+ fis.features = req_task->io_ports[1];
+ fis.sect_count = req_task->io_ports[2];
+ fis.lba_low = req_task->io_ports[3];
+ fis.lba_mid = req_task->io_ports[4];
+ fis.lba_hi = req_task->io_ports[5];
+ /* Clear the dev bit*/
+ fis.device = req_task->io_ports[6] & ~0x10;
+
+ if ((req_task->in_flags.all == 0) && (req_task->out_flags.all & 1)) {
+ req_task->in_flags.all =
+ IDE_TASKFILE_STD_IN_FLAGS |
+ (IDE_HOB_STD_IN_FLAGS << 8);
+ fis.lba_low_ex = req_task->hob_ports[3];
+ fis.lba_mid_ex = req_task->hob_ports[4];
+ fis.lba_hi_ex = req_task->hob_ports[5];
+ fis.features_ex = req_task->hob_ports[1];
+ fis.sect_cnt_ex = req_task->hob_ports[2];
+
+ } else {
+ req_task->in_flags.all = IDE_TASKFILE_STD_IN_FLAGS;
+ }
+
+ force_single_sector = implicit_sector(fis.command, fis.features);
+
+ if ((taskin || taskout) && (!fis.sect_count)) {
+ if (nsect)
+ fis.sect_count = nsect;
+ else {
+ if (!force_single_sector) {
+ dev_warn(&dd->pdev->dev,
+ "data movement but "
+ "sect_count is 0\n");
+ err = -EINVAL;
+ goto abort;
+ }
+ }
+ }
+
+ dbg_printk(MTIP_DRV_NAME
+ " %s: cmd %x, feat %x, nsect %x,"
+ " sect/lbal %x, lcyl/lbam %x, hcyl/lbah %x,"
+ " head/dev %x\n",
+ __func__,
+ fis.command,
+ fis.features,
+ fis.sect_count,
+ fis.lba_low,
+ fis.lba_mid,
+ fis.lba_hi,
+ fis.device);
+
+ /* check for erase mode support during secure erase.*/
+ if ((fis.command == ATA_CMD_SEC_ERASE_UNIT) && outbuf &&
+ (outbuf[0] & MTIP_SEC_ERASE_MODE)) {
+ erasemode = 1;
+ }
+
+ mtip_set_timeout(dd, &fis, &timeout, erasemode);
+
+ /* Determine the correct transfer size.*/
+ if (force_single_sector)
+ transfer_size = ATA_SECT_SIZE;
+ else
+ transfer_size = ATA_SECT_SIZE * fis.sect_count;
+
+ /* Execute the command.*/
+ if (mtip_exec_internal_command(dd->port,
+ &fis,
+ 5,
+ dma_buffer,
+ transfer_size,
+ 0,
+ GFP_KERNEL,
+ timeout) < 0) {
+ err = -EIO;
+ goto abort;
+ }
+
+ task_file_data = readl(dd->port->mmio+PORT_TFDATA);
+
+ if ((req_task->data_phase == TASKFILE_IN) && !(task_file_data & 1)) {
+ reply = dd->port->rxfis + RX_FIS_PIO_SETUP;
+ req_task->io_ports[7] = reply->control;
+ } else {
+ reply = dd->port->rxfis + RX_FIS_D2H_REG;
+ req_task->io_ports[7] = reply->command;
+ }
+
+ /* reclaim the DMA buffers.*/
+ if (inbuf_dma)
+ pci_unmap_single(dd->pdev, inbuf_dma,
+ taskin, DMA_FROM_DEVICE);
+ if (outbuf_dma)
+ pci_unmap_single(dd->pdev, outbuf_dma,
+ taskout, DMA_TO_DEVICE);
+ inbuf_dma = 0;
+ outbuf_dma = 0;
+
+ /* return the ATA registers to the caller.*/
+ req_task->io_ports[1] = reply->features;
+ req_task->io_ports[2] = reply->sect_count;
+ req_task->io_ports[3] = reply->lba_low;
+ req_task->io_ports[4] = reply->lba_mid;
+ req_task->io_ports[5] = reply->lba_hi;
+ req_task->io_ports[6] = reply->device;
+
+ if (req_task->out_flags.all & 1) {
+
+ req_task->hob_ports[3] = reply->lba_low_ex;
+ req_task->hob_ports[4] = reply->lba_mid_ex;
+ req_task->hob_ports[5] = reply->lba_hi_ex;
+ req_task->hob_ports[1] = reply->features_ex;
+ req_task->hob_ports[2] = reply->sect_cnt_ex;
+ }
+ dbg_printk(MTIP_DRV_NAME
+ " %s: Completion: stat %x,"
+ "err %x, sect_cnt %x, lbalo %x,"
+ "lbamid %x, lbahi %x, dev %x\n",
+ __func__,
+ req_task->io_ports[7],
+ req_task->io_ports[1],
+ req_task->io_ports[2],
+ req_task->io_ports[3],
+ req_task->io_ports[4],
+ req_task->io_ports[5],
+ req_task->io_ports[6]);
+
+ if (taskout) {
+ if (copy_to_user(buf + outtotal, outbuf, taskout)) {
+ err = -EFAULT;
+ goto abort;
+ }
+ }
+ if (taskin) {
+ if (copy_to_user(buf + intotal, inbuf, taskin)) {
+ err = -EFAULT;
+ goto abort;
+ }
+ }
+abort:
+ if (inbuf_dma)
+ pci_unmap_single(dd->pdev, inbuf_dma,
+ taskin, DMA_FROM_DEVICE);
+ if (outbuf_dma)
+ pci_unmap_single(dd->pdev, outbuf_dma,
+ taskout, DMA_TO_DEVICE);
+ kfree(outbuf);
+ kfree(inbuf);
+
+ return err;
+}
+
+/*
+ * Handle IOCTL calls from the Block Layer.
+ *
+ * This function is called by the Block Layer when it receives an IOCTL
+ * command that it does not understand. If the IOCTL command is not supported
+ * this function returns -ENOTTY.
+ *
+ * @dd Pointer to the driver data structure.
+ * @cmd IOCTL command passed from the Block Layer.
+ * @arg IOCTL argument passed from the Block Layer.
+ *
+ * return value
+ * 0 The IOCTL completed successfully.
+ * -ENOTTY The specified command is not supported.
+ * -EFAULT An error occurred copying data to a user space buffer.
+ * -EIO An error occurred while executing the command.
+ */
+static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case HDIO_GET_IDENTITY:
+ {
+ if (copy_to_user((void __user *)arg, dd->port->identify,
+ sizeof(u16) * ATA_ID_WORDS))
+ return -EFAULT;
+ break;
+ }
+ case HDIO_DRIVE_CMD:
+ {
+ u8 drive_command[4];
+
+ /* Copy the user command info to our buffer. */
+ if (copy_from_user(drive_command,
+ (void __user *) arg,
+ sizeof(drive_command)))
+ return -EFAULT;
+
+ /* Execute the drive command. */
+ if (exec_drive_command(dd->port,
+ drive_command,
+ (void __user *) (arg+4)))
+ return -EIO;
+
+ /* Copy the status back to the users buffer. */
+ if (copy_to_user((void __user *) arg,
+ drive_command,
+ sizeof(drive_command)))
+ return -EFAULT;
+
+ break;
+ }
+ case HDIO_DRIVE_TASK:
+ {
+ u8 drive_command[7];
+
+ /* Copy the user command info to our buffer. */
+ if (copy_from_user(drive_command,
+ (void __user *) arg,
+ sizeof(drive_command)))
+ return -EFAULT;
+
+ /* Execute the drive command. */
+ if (exec_drive_task(dd->port, drive_command))
+ return -EIO;
+
+ /* Copy the status back to the users buffer. */
+ if (copy_to_user((void __user *) arg,
+ drive_command,
+ sizeof(drive_command)))
+ return -EFAULT;
+
+ break;
+ }
+ case HDIO_DRIVE_TASKFILE: {
+ ide_task_request_t req_task;
+ int ret, outtotal;
+
+ if (copy_from_user(&req_task, (void __user *) arg,
+ sizeof(req_task)))
+ return -EFAULT;
+
+ outtotal = sizeof(req_task);
+
+ ret = exec_drive_taskfile(dd, (void __user *) arg,
+ &req_task, outtotal);
+
+ if (copy_to_user((void __user *) arg, &req_task,
+ sizeof(req_task)))
+ return -EFAULT;
+
+ return ret;
+ }
+
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/*
+ * Submit an IO to the hw
+ *
+ * This function is called by the block layer to issue an io
+ * to the device. Upon completion, the callback function will
+ * be called with the data parameter passed as the callback data.
+ *
+ * @dd Pointer to the driver data structure.
+ * @start First sector to read.
+ * @nsect Number of sectors to read.
+ * @nents Number of entries in scatter list for the read command.
+ * @tag The tag of this read command.
+ * @callback Pointer to the function that should be called
+ * when the read completes.
+ * @data Callback data passed to the callback function
+ * when the read completes.
+ * @dir Direction (read or write)
+ *
+ * return value
+ * None
+ */
+static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
+ struct mtip_cmd *command, int nents,
+ struct blk_mq_hw_ctx *hctx)
+{
+ struct host_to_dev_fis *fis;
+ struct mtip_port *port = dd->port;
+ int dma_dir = rq_data_dir(rq) == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+ u64 start = blk_rq_pos(rq);
+ unsigned int nsect = blk_rq_sectors(rq);
+
+ /* Map the scatter list for DMA access */
+ nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir);
+
+ prefetch(&port->flags);
+
+ command->scatter_ents = nents;
+
+ /*
+ * The number of retries for this command before it is
+ * reported as a failure to the upper layers.
+ */
+ command->retries = MTIP_MAX_RETRIES;
+
+ /* Fill out fis */
+ fis = command->command;
+ fis->type = 0x27;
+ fis->opts = 1 << 7;
+ if (dma_dir == DMA_FROM_DEVICE)
+ fis->command = ATA_CMD_FPDMA_READ;
+ else
+ fis->command = ATA_CMD_FPDMA_WRITE;
+ fis->lba_low = start & 0xFF;
+ fis->lba_mid = (start >> 8) & 0xFF;
+ fis->lba_hi = (start >> 16) & 0xFF;
+ fis->lba_low_ex = (start >> 24) & 0xFF;
+ fis->lba_mid_ex = (start >> 32) & 0xFF;
+ fis->lba_hi_ex = (start >> 40) & 0xFF;
+ fis->device = 1 << 6;
+ fis->features = nsect & 0xFF;
+ fis->features_ex = (nsect >> 8) & 0xFF;
+ fis->sect_count = ((rq->tag << 3) | (rq->tag >> 5));
+ fis->sect_cnt_ex = 0;
+ fis->control = 0;
+ fis->res2 = 0;
+ fis->res3 = 0;
+ fill_command_sg(dd, command, nents);
+
+ if (unlikely(command->unaligned))
+ fis->device |= 1 << 7;
+
+ /* Populate the command header */
+ command->command_header->opts =
+ __force_bit2int cpu_to_le32(
+ (nents << 16) | 5 | AHCI_CMD_PREFETCH);
+ command->command_header->byte_count = 0;
+
+ /*
+ * Set the completion function and data for the command
+ * within this layer.
+ */
+ command->comp_data = dd;
+ command->comp_func = mtip_async_complete;
+ command->direction = dma_dir;
+
+ /*
+ * To prevent this command from being issued
+ * if an internal command is in progress or error handling is active.
+ */
+ if (unlikely(port->flags & MTIP_PF_PAUSE_IO)) {
+ set_bit(rq->tag, port->cmds_to_issue);
+ set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
+ return;
+ }
+
+ /* Issue the command to the hardware */
+ mtip_issue_ncq_command(port, rq->tag);
+}
+
+/*
+ * Sysfs status dump.
+ *
+ * @dev Pointer to the device structure, passed by the kernrel.
+ * @attr Pointer to the device_attribute structure passed by the kernel.
+ * @buf Pointer to the char buffer that will receive the stats info.
+ *
+ * return value
+ * The size, in bytes, of the data copied into buf.
+ */
+static ssize_t mtip_hw_show_status(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct driver_data *dd = dev_to_disk(dev)->private_data;
+ int size = 0;
+
+ if (test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))
+ size += sprintf(buf, "%s", "thermal_shutdown\n");
+ else if (test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag))
+ size += sprintf(buf, "%s", "write_protect\n");
+ else
+ size += sprintf(buf, "%s", "online\n");
+
+ return size;
+}
+
+static DEVICE_ATTR(status, S_IRUGO, mtip_hw_show_status, NULL);
+
+/* debugsfs entries */
+
+static ssize_t show_device_status(struct device_driver *drv, char *buf)
+{
+ int size = 0;
+ struct driver_data *dd, *tmp;
+ unsigned long flags;
+ char id_buf[42];
+ u16 status = 0;
+
+ spin_lock_irqsave(&dev_lock, flags);
+ size += sprintf(&buf[size], "Devices Present:\n");
+ list_for_each_entry_safe(dd, tmp, &online_list, online_list) {
+ if (dd->pdev) {
+ if (dd->port &&
+ dd->port->identify &&
+ dd->port->identify_valid) {
+ strlcpy(id_buf,
+ (char *) (dd->port->identify + 10), 21);
+ status = *(dd->port->identify + 141);
+ } else {
+ memset(id_buf, 0, 42);
+ status = 0;
+ }
+
+ if (dd->port &&
+ test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags)) {
+ size += sprintf(&buf[size],
+ " device %s %s (ftl rebuild %d %%)\n",
+ dev_name(&dd->pdev->dev),
+ id_buf,
+ status);
+ } else {
+ size += sprintf(&buf[size],
+ " device %s %s\n",
+ dev_name(&dd->pdev->dev),
+ id_buf);
+ }
+ }
+ }
+
+ size += sprintf(&buf[size], "Devices Being Removed:\n");
+ list_for_each_entry_safe(dd, tmp, &removing_list, remove_list) {
+ if (dd->pdev) {
+ if (dd->port &&
+ dd->port->identify &&
+ dd->port->identify_valid) {
+ strlcpy(id_buf,
+ (char *) (dd->port->identify+10), 21);
+ status = *(dd->port->identify + 141);
+ } else {
+ memset(id_buf, 0, 42);
+ status = 0;
+ }
+
+ if (dd->port &&
+ test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags)) {
+ size += sprintf(&buf[size],
+ " device %s %s (ftl rebuild %d %%)\n",
+ dev_name(&dd->pdev->dev),
+ id_buf,
+ status);
+ } else {
+ size += sprintf(&buf[size],
+ " device %s %s\n",
+ dev_name(&dd->pdev->dev),
+ id_buf);
+ }
+ }
+ }
+ spin_unlock_irqrestore(&dev_lock, flags);
+
+ return size;
+}
+
+static ssize_t mtip_hw_read_device_status(struct file *f, char __user *ubuf,
+ size_t len, loff_t *offset)
+{
+ struct driver_data *dd = (struct driver_data *)f->private_data;
+ int size = *offset;
+ char *buf;
+ int rv = 0;
+
+ if (!len || *offset)
+ return 0;
+
+ buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL);
+ if (!buf) {
+ dev_err(&dd->pdev->dev,
+ "Memory allocation: status buffer\n");
+ return -ENOMEM;
+ }
+
+ size += show_device_status(NULL, buf);
+
+ *offset = size <= len ? size : len;
+ size = copy_to_user(ubuf, buf, *offset);
+ if (size)
+ rv = -EFAULT;
+
+ kfree(buf);
+ return rv ? rv : *offset;
+}
+
+static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf,
+ size_t len, loff_t *offset)
+{
+ struct driver_data *dd = (struct driver_data *)f->private_data;
+ char *buf;
+ u32 group_allocated;
+ int size = *offset;
+ int n, rv = 0;
+
+ if (!len || size)
+ return 0;
+
+ buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL);
+ if (!buf) {
+ dev_err(&dd->pdev->dev,
+ "Memory allocation: register buffer\n");
+ return -ENOMEM;
+ }
+
+ size += sprintf(&buf[size], "H/ S ACTive : [ 0x");
+
+ for (n = dd->slot_groups-1; n >= 0; n--)
+ size += sprintf(&buf[size], "%08X ",
+ readl(dd->port->s_active[n]));
+
+ size += sprintf(&buf[size], "]\n");
+ size += sprintf(&buf[size], "H/ Command Issue : [ 0x");
+
+ for (n = dd->slot_groups-1; n >= 0; n--)
+ size += sprintf(&buf[size], "%08X ",
+ readl(dd->port->cmd_issue[n]));
+
+ size += sprintf(&buf[size], "]\n");
+ size += sprintf(&buf[size], "H/ Completed : [ 0x");
+
+ for (n = dd->slot_groups-1; n >= 0; n--)
+ size += sprintf(&buf[size], "%08X ",
+ readl(dd->port->completed[n]));
+
+ size += sprintf(&buf[size], "]\n");
+ size += sprintf(&buf[size], "H/ PORT IRQ STAT : [ 0x%08X ]\n",
+ readl(dd->port->mmio + PORT_IRQ_STAT));
+ size += sprintf(&buf[size], "H/ HOST IRQ STAT : [ 0x%08X ]\n",
+ readl(dd->mmio + HOST_IRQ_STAT));
+ size += sprintf(&buf[size], "\n");
+
+ size += sprintf(&buf[size], "L/ Allocated : [ 0x");
+
+ for (n = dd->slot_groups-1; n >= 0; n--) {
+ if (sizeof(long) > sizeof(u32))
+ group_allocated =
+ dd->port->allocated[n/2] >> (32*(n&1));
+ else
+ group_allocated = dd->port->allocated[n];
+ size += sprintf(&buf[size], "%08X ", group_allocated);
+ }
+ size += sprintf(&buf[size], "]\n");
+
+ size += sprintf(&buf[size], "L/ Commands in Q : [ 0x");
+
+ for (n = dd->slot_groups-1; n >= 0; n--) {
+ if (sizeof(long) > sizeof(u32))
+ group_allocated =
+ dd->port->cmds_to_issue[n/2] >> (32*(n&1));
+ else
+ group_allocated = dd->port->cmds_to_issue[n];
+ size += sprintf(&buf[size], "%08X ", group_allocated);
+ }
+ size += sprintf(&buf[size], "]\n");
+
+ *offset = size <= len ? size : len;
+ size = copy_to_user(ubuf, buf, *offset);
+ if (size)
+ rv = -EFAULT;
+
+ kfree(buf);
+ return rv ? rv : *offset;
+}
+
+static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf,
+ size_t len, loff_t *offset)
+{
+ struct driver_data *dd = (struct driver_data *)f->private_data;
+ char *buf;
+ int size = *offset;
+ int rv = 0;
+
+ if (!len || size)
+ return 0;
+
+ buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL);
+ if (!buf) {
+ dev_err(&dd->pdev->dev,
+ "Memory allocation: flag buffer\n");
+ return -ENOMEM;
+ }
+
+ size += sprintf(&buf[size], "Flag-port : [ %08lX ]\n",
+ dd->port->flags);
+ size += sprintf(&buf[size], "Flag-dd : [ %08lX ]\n",
+ dd->dd_flag);
+
+ *offset = size <= len ? size : len;
+ size = copy_to_user(ubuf, buf, *offset);
+ if (size)
+ rv = -EFAULT;
+
+ kfree(buf);
+ return rv ? rv : *offset;
+}
+
+static const struct file_operations mtip_device_status_fops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+ .read = mtip_hw_read_device_status,
+ .llseek = no_llseek,
+};
+
+static const struct file_operations mtip_regs_fops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+ .read = mtip_hw_read_registers,
+ .llseek = no_llseek,
+};
+
+static const struct file_operations mtip_flags_fops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+ .read = mtip_hw_read_flags,
+ .llseek = no_llseek,
+};
+
+/*
+ * Create the sysfs related attributes.
+ *
+ * @dd Pointer to the driver data structure.
+ * @kobj Pointer to the kobj for the block device.
+ *
+ * return value
+ * 0 Operation completed successfully.
+ * -EINVAL Invalid parameter.
+ */
+static int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj)
+{
+ if (!kobj || !dd)
+ return -EINVAL;
+
+ if (sysfs_create_file(kobj, &dev_attr_status.attr))
+ dev_warn(&dd->pdev->dev,
+ "Error creating 'status' sysfs entry\n");
+ return 0;
+}
+
+/*
+ * Remove the sysfs related attributes.
+ *
+ * @dd Pointer to the driver data structure.
+ * @kobj Pointer to the kobj for the block device.
+ *
+ * return value
+ * 0 Operation completed successfully.
+ * -EINVAL Invalid parameter.
+ */
+static int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj)
+{
+ if (!kobj || !dd)
+ return -EINVAL;
+
+ sysfs_remove_file(kobj, &dev_attr_status.attr);
+
+ return 0;
+}
+
+static int mtip_hw_debugfs_init(struct driver_data *dd)
+{
+ if (!dfs_parent)
+ return -1;
+
+ dd->dfs_node = debugfs_create_dir(dd->disk->disk_name, dfs_parent);
+ if (IS_ERR_OR_NULL(dd->dfs_node)) {
+ dev_warn(&dd->pdev->dev,
+ "Error creating node %s under debugfs\n",
+ dd->disk->disk_name);
+ dd->dfs_node = NULL;
+ return -1;
+ }
+
+ debugfs_create_file("flags", S_IRUGO, dd->dfs_node, dd,
+ &mtip_flags_fops);
+ debugfs_create_file("registers", S_IRUGO, dd->dfs_node, dd,
+ &mtip_regs_fops);
+
+ return 0;
+}
+
+static void mtip_hw_debugfs_exit(struct driver_data *dd)
+{
+ if (dd->dfs_node)
+ debugfs_remove_recursive(dd->dfs_node);
+}
+
+static int mtip_free_orphan(struct driver_data *dd)
+{
+ struct kobject *kobj;
+
+ if (dd->bdev) {
+ if (dd->bdev->bd_holders >= 1)
+ return -2;
+
+ bdput(dd->bdev);
+ dd->bdev = NULL;
+ }
+
+ mtip_hw_debugfs_exit(dd);
+
+ spin_lock(&rssd_index_lock);
+ ida_remove(&rssd_index_ida, dd->index);
+ spin_unlock(&rssd_index_lock);
+
+ if (!test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag) &&
+ test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag)) {
+ put_disk(dd->disk);
+ } else {
+ if (dd->disk) {
+ kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+ if (kobj) {
+ mtip_hw_sysfs_exit(dd, kobj);
+ kobject_put(kobj);
+ }
+ del_gendisk(dd->disk);
+ dd->disk = NULL;
+ }
+ if (dd->queue) {
+ dd->queue->queuedata = NULL;
+ blk_cleanup_queue(dd->queue);
+ blk_mq_free_tag_set(&dd->tags);
+ dd->queue = NULL;
+ }
+ }
+ kfree(dd);
+ return 0;
+}
+
+/*
+ * Perform any init/resume time hardware setup
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * None
+ */
+static inline void hba_setup(struct driver_data *dd)
+{
+ u32 hwdata;
+ hwdata = readl(dd->mmio + HOST_HSORG);
+
+ /* interrupt bug workaround: use only 1 IS bit.*/
+ writel(hwdata |
+ HSORG_DISABLE_SLOTGRP_INTR |
+ HSORG_DISABLE_SLOTGRP_PXIS,
+ dd->mmio + HOST_HSORG);
+}
+
+static int mtip_device_unaligned_constrained(struct driver_data *dd)
+{
+ return (dd->pdev->device == P420M_DEVICE_ID ? 1 : 0);
+}
+
+/*
+ * Detect the details of the product, and store anything needed
+ * into the driver data structure. This includes product type and
+ * version and number of slot groups.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * None
+ */
+static void mtip_detect_product(struct driver_data *dd)
+{
+ u32 hwdata;
+ unsigned int rev, slotgroups;
+
+ /*
+ * HBA base + 0xFC [15:0] - vendor-specific hardware interface
+ * info register:
+ * [15:8] hardware/software interface rev#
+ * [ 3] asic-style interface
+ * [ 2:0] number of slot groups, minus 1 (only valid for asic-style).
+ */
+ hwdata = readl(dd->mmio + HOST_HSORG);
+
+ dd->product_type = MTIP_PRODUCT_UNKNOWN;
+ dd->slot_groups = 1;
+
+ if (hwdata & 0x8) {
+ dd->product_type = MTIP_PRODUCT_ASICFPGA;
+ rev = (hwdata & HSORG_HWREV) >> 8;
+ slotgroups = (hwdata & HSORG_SLOTGROUPS) + 1;
+ dev_info(&dd->pdev->dev,
+ "ASIC-FPGA design, HS rev 0x%x, "
+ "%i slot groups [%i slots]\n",
+ rev,
+ slotgroups,
+ slotgroups * 32);
+
+ if (slotgroups > MTIP_MAX_SLOT_GROUPS) {
+ dev_warn(&dd->pdev->dev,
+ "Warning: driver only supports "
+ "%i slot groups.\n", MTIP_MAX_SLOT_GROUPS);
+ slotgroups = MTIP_MAX_SLOT_GROUPS;
+ }
+ dd->slot_groups = slotgroups;
+ return;
+ }
+
+ dev_warn(&dd->pdev->dev, "Unrecognized product id\n");
+}
+
+/*
+ * Blocking wait for FTL rebuild to complete
+ *
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ * 0 FTL rebuild completed successfully
+ * -EFAULT FTL rebuild error/timeout/interruption
+ */
+static int mtip_ftl_rebuild_poll(struct driver_data *dd)
+{
+ unsigned long timeout, cnt = 0, start;
+
+ dev_warn(&dd->pdev->dev,
+ "FTL rebuild in progress. Polling for completion.\n");
+
+ start = jiffies;
+ timeout = jiffies + msecs_to_jiffies(MTIP_FTL_REBUILD_TIMEOUT_MS);
+
+ do {
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &dd->dd_flag)))
+ return -EFAULT;
+ if (mtip_check_surprise_removal(dd->pdev))
+ return -EFAULT;
+
+ if (mtip_get_identify(dd->port, NULL) < 0)
+ return -EFAULT;
+
+ if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
+ MTIP_FTL_REBUILD_MAGIC) {
+ ssleep(1);
+ /* Print message every 3 minutes */
+ if (cnt++ >= 180) {
+ dev_warn(&dd->pdev->dev,
+ "FTL rebuild in progress (%d secs).\n",
+ jiffies_to_msecs(jiffies - start) / 1000);
+ cnt = 0;
+ }
+ } else {
+ dev_warn(&dd->pdev->dev,
+ "FTL rebuild complete (%d secs).\n",
+ jiffies_to_msecs(jiffies - start) / 1000);
+ mtip_block_initialize(dd);
+ return 0;
+ }
+ ssleep(10);
+ } while (time_before(jiffies, timeout));
+
+ /* Check for timeout */
+ dev_err(&dd->pdev->dev,
+ "Timed out waiting for FTL rebuild to complete (%d secs).\n",
+ jiffies_to_msecs(jiffies - start) / 1000);
+ return -EFAULT;
+}
+
+/*
+ * service thread to issue queued commands
+ *
+ * @data Pointer to the driver data structure.
+ *
+ * return value
+ * 0
+ */
+
+static int mtip_service_thread(void *data)
+{
+ struct driver_data *dd = (struct driver_data *)data;
+ unsigned long slot, slot_start, slot_wrap;
+ unsigned int num_cmd_slots = dd->slot_groups * 32;
+ struct mtip_port *port = dd->port;
+ int ret;
+
+ while (1) {
+ if (kthread_should_stop() ||
+ test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
+ goto st_out;
+ clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
+
+ /*
+ * the condition is to check neither an internal command is
+ * is in progress nor error handling is active
+ */
+ wait_event_interruptible(port->svc_wait, (port->flags) &&
+ !(port->flags & MTIP_PF_PAUSE_IO));
+
+ set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
+
+ if (kthread_should_stop() ||
+ test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
+ goto st_out;
+
+ /* If I am an orphan, start self cleanup */
+ if (test_bit(MTIP_PF_SR_CLEANUP_BIT, &port->flags))
+ break;
+
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &dd->dd_flag)))
+ goto st_out;
+
+restart_eh:
+ /* Demux bits: start with error handling */
+ if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags)) {
+ mtip_handle_tfe(dd);
+ clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+ }
+
+ if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags))
+ goto restart_eh;
+
+ if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
+ slot = 1;
+ /* used to restrict the loop to one iteration */
+ slot_start = num_cmd_slots;
+ slot_wrap = 0;
+ while (1) {
+ slot = find_next_bit(port->cmds_to_issue,
+ num_cmd_slots, slot);
+ if (slot_wrap == 1) {
+ if ((slot_start >= slot) ||
+ (slot >= num_cmd_slots))
+ break;
+ }
+ if (unlikely(slot_start == num_cmd_slots))
+ slot_start = slot;
+
+ if (unlikely(slot == num_cmd_slots)) {
+ slot = 1;
+ slot_wrap = 1;
+ continue;
+ }
+
+ /* Issue the command to the hardware */
+ mtip_issue_ncq_command(port, slot);
+
+ clear_bit(slot, port->cmds_to_issue);
+ }
+
+ clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
+ }
+
+ if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) {
+ if (mtip_ftl_rebuild_poll(dd) < 0)
+ set_bit(MTIP_DDF_REBUILD_FAILED_BIT,
+ &dd->dd_flag);
+ clear_bit(MTIP_PF_REBUILD_BIT, &port->flags);
+ }
+ }
+
+ /* wait for pci remove to exit */
+ while (1) {
+ if (test_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag))
+ break;
+ msleep_interruptible(1000);
+ if (kthread_should_stop())
+ goto st_out;
+ }
+
+ while (1) {
+ ret = mtip_free_orphan(dd);
+ if (!ret) {
+ /* NOTE: All data structures are invalid, do not
+ * access any here */
+ return 0;
+ }
+ msleep_interruptible(1000);
+ if (kthread_should_stop())
+ goto st_out;
+ }
+st_out:
+ return 0;
+}
+
+/*
+ * DMA region teardown
+ *
+ * @dd Pointer to driver_data structure
+ *
+ * return value
+ * None
+ */
+static void mtip_dma_free(struct driver_data *dd)
+{
+ struct mtip_port *port = dd->port;
+
+ if (port->block1)
+ dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+ port->block1, port->block1_dma);
+
+ if (port->command_list) {
+ dmam_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
+ port->command_list, port->command_list_dma);
+ }
+}
+
+/*
+ * DMA region setup
+ *
+ * @dd Pointer to driver_data structure
+ *
+ * return value
+ * -ENOMEM Not enough free DMA region space to initialize driver
+ */
+static int mtip_dma_alloc(struct driver_data *dd)
+{
+ struct mtip_port *port = dd->port;
+
+ /* Allocate dma memory for RX Fis, Identify, and Sector Bufffer */
+ port->block1 =
+ dmam_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+ &port->block1_dma, GFP_KERNEL);
+ if (!port->block1)
+ return -ENOMEM;
+ memset(port->block1, 0, BLOCK_DMA_ALLOC_SZ);
+
+ /* Allocate dma memory for command list */
+ port->command_list =
+ dmam_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
+ &port->command_list_dma, GFP_KERNEL);
+ if (!port->command_list) {
+ dmam_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+ port->block1, port->block1_dma);
+ port->block1 = NULL;
+ port->block1_dma = 0;
+ return -ENOMEM;
+ }
+ memset(port->command_list, 0, AHCI_CMD_TBL_SZ);
+
+ /* Setup all pointers into first DMA region */
+ port->rxfis = port->block1 + AHCI_RX_FIS_OFFSET;
+ port->rxfis_dma = port->block1_dma + AHCI_RX_FIS_OFFSET;
+ port->identify = port->block1 + AHCI_IDFY_OFFSET;
+ port->identify_dma = port->block1_dma + AHCI_IDFY_OFFSET;
+ port->log_buf = port->block1 + AHCI_SECTBUF_OFFSET;
+ port->log_buf_dma = port->block1_dma + AHCI_SECTBUF_OFFSET;
+ port->smart_buf = port->block1 + AHCI_SMARTBUF_OFFSET;
+ port->smart_buf_dma = port->block1_dma + AHCI_SMARTBUF_OFFSET;
+
+ return 0;
+}
+
+static int mtip_hw_get_identify(struct driver_data *dd)
+{
+ struct smart_attr attr242;
+ unsigned char *buf;
+ int rv;
+
+ if (mtip_get_identify(dd->port, NULL) < 0)
+ return -EFAULT;
+
+ if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
+ MTIP_FTL_REBUILD_MAGIC) {
+ set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags);
+ return MTIP_FTL_REBUILD_MAGIC;
+ }
+ mtip_dump_identify(dd->port);
+
+ /* check write protect, over temp and rebuild statuses */
+ rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
+ dd->port->log_buf,
+ dd->port->log_buf_dma, 1);
+ if (rv) {
+ dev_warn(&dd->pdev->dev,
+ "Error in READ LOG EXT (10h) command\n");
+ /* non-critical error, don't fail the load */
+ } else {
+ buf = (unsigned char *)dd->port->log_buf;
+ if (buf[259] & 0x1) {
+ dev_info(&dd->pdev->dev,
+ "Write protect bit is set.\n");
+ set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag);
+ }
+ if (buf[288] == 0xF7) {
+ dev_info(&dd->pdev->dev,
+ "Exceeded Tmax, drive in thermal shutdown.\n");
+ set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag);
+ }
+ if (buf[288] == 0xBF) {
+ dev_info(&dd->pdev->dev,
+ "Drive indicates rebuild has failed.\n");
+ /* TODO */
+ }
+ }
+
+ /* get write protect progess */
+ memset(&attr242, 0, sizeof(struct smart_attr));
+ if (mtip_get_smart_attr(dd->port, 242, &attr242))
+ dev_warn(&dd->pdev->dev,
+ "Unable to check write protect progress\n");
+ else
+ dev_info(&dd->pdev->dev,
+ "Write protect progress: %u%% (%u blocks)\n",
+ attr242.cur, le32_to_cpu(attr242.data));
+
+ return rv;
+}
+
+/*
+ * Called once for each card.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0 on success, else an error code.
+ */
+static int mtip_hw_init(struct driver_data *dd)
+{
+ int i;
+ int rv;
+ unsigned int num_command_slots;
+ unsigned long timeout, timetaken;
+
+ dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR];
+
+ mtip_detect_product(dd);
+ if (dd->product_type == MTIP_PRODUCT_UNKNOWN) {
+ rv = -EIO;
+ goto out1;
+ }
+ num_command_slots = dd->slot_groups * 32;
+
+ hba_setup(dd);
+
+ dd->port = kzalloc_node(sizeof(struct mtip_port), GFP_KERNEL,
+ dd->numa_node);
+ if (!dd->port) {
+ dev_err(&dd->pdev->dev,
+ "Memory allocation: port structure\n");
+ return -ENOMEM;
+ }
+
+ /* Continue workqueue setup */
+ for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
+ dd->work[i].port = dd->port;
+
+ /* Enable unaligned IO constraints for some devices */
+ if (mtip_device_unaligned_constrained(dd))
+ dd->unal_qdepth = MTIP_MAX_UNALIGNED_SLOTS;
+ else
+ dd->unal_qdepth = 0;
+
+ sema_init(&dd->port->cmd_slot_unal, dd->unal_qdepth);
+
+ /* Spinlock to prevent concurrent issue */
+ for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
+ spin_lock_init(&dd->port->cmd_issue_lock[i]);
+
+ /* Set the port mmio base address. */
+ dd->port->mmio = dd->mmio + PORT_OFFSET;
+ dd->port->dd = dd;
+
+ /* DMA allocations */
+ rv = mtip_dma_alloc(dd);
+ if (rv < 0)
+ goto out1;
+
+ /* Setup the pointers to the extended s_active and CI registers. */
+ for (i = 0; i < dd->slot_groups; i++) {
+ dd->port->s_active[i] =
+ dd->port->mmio + i*0x80 + PORT_SCR_ACT;
+ dd->port->cmd_issue[i] =
+ dd->port->mmio + i*0x80 + PORT_COMMAND_ISSUE;
+ dd->port->completed[i] =
+ dd->port->mmio + i*0x80 + PORT_SDBV;
+ }
+
+ timetaken = jiffies;
+ timeout = jiffies + msecs_to_jiffies(30000);
+ while (((readl(dd->port->mmio + PORT_SCR_STAT) & 0x0F) != 0x03) &&
+ time_before(jiffies, timeout)) {
+ mdelay(100);
+ }
+ if (unlikely(mtip_check_surprise_removal(dd->pdev))) {
+ timetaken = jiffies - timetaken;
+ dev_warn(&dd->pdev->dev,
+ "Surprise removal detected at %u ms\n",
+ jiffies_to_msecs(timetaken));
+ rv = -ENODEV;
+ goto out2 ;
+ }
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) {
+ timetaken = jiffies - timetaken;
+ dev_warn(&dd->pdev->dev,
+ "Removal detected at %u ms\n",
+ jiffies_to_msecs(timetaken));
+ rv = -EFAULT;
+ goto out2;
+ }
+
+ /* Conditionally reset the HBA. */
+ if (!(readl(dd->mmio + HOST_CAP) & HOST_CAP_NZDMA)) {
+ if (mtip_hba_reset(dd) < 0) {
+ dev_err(&dd->pdev->dev,
+ "Card did not reset within timeout\n");
+ rv = -EIO;
+ goto out2;
+ }
+ } else {
+ /* Clear any pending interrupts on the HBA */
+ writel(readl(dd->mmio + HOST_IRQ_STAT),
+ dd->mmio + HOST_IRQ_STAT);
+ }
+
+ mtip_init_port(dd->port);
+ mtip_start_port(dd->port);
+
+ /* Setup the ISR and enable interrupts. */
+ rv = devm_request_irq(&dd->pdev->dev,
+ dd->pdev->irq,
+ mtip_irq_handler,
+ IRQF_SHARED,
+ dev_driver_string(&dd->pdev->dev),
+ dd);
+
+ if (rv) {
+ dev_err(&dd->pdev->dev,
+ "Unable to allocate IRQ %d\n", dd->pdev->irq);
+ goto out2;
+ }
+ irq_set_affinity_hint(dd->pdev->irq, get_cpu_mask(dd->isr_binding));
+
+ /* Enable interrupts on the HBA. */
+ writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
+ dd->mmio + HOST_CTL);
+
+ init_waitqueue_head(&dd->port->svc_wait);
+
+ if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) {
+ rv = -EFAULT;
+ goto out3;
+ }
+
+ return rv;
+
+out3:
+ /* Disable interrupts on the HBA. */
+ writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+ dd->mmio + HOST_CTL);
+
+ /* Release the IRQ. */
+ irq_set_affinity_hint(dd->pdev->irq, NULL);
+ devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
+
+out2:
+ mtip_deinit_port(dd->port);
+ mtip_dma_free(dd);
+
+out1:
+ /* Free the memory allocated for the for structure. */
+ kfree(dd->port);
+
+ return rv;
+}
+
+static void mtip_standby_drive(struct driver_data *dd)
+{
+ if (dd->sr)
+ return;
+
+ /*
+ * Send standby immediate (E0h) to the drive so that it
+ * saves its state.
+ */
+ if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) &&
+ !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))
+ if (mtip_standby_immediate(dd->port))
+ dev_warn(&dd->pdev->dev,
+ "STANDBY IMMEDIATE failed\n");
+}
+
+/*
+ * Called to deinitialize an interface.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0
+ */
+static int mtip_hw_exit(struct driver_data *dd)
+{
+ /*
+ * Send standby immediate (E0h) to the drive so that it
+ * saves its state.
+ */
+ if (!dd->sr) {
+ /* de-initialize the port. */
+ mtip_deinit_port(dd->port);
+
+ /* Disable interrupts on the HBA. */
+ writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+ dd->mmio + HOST_CTL);
+ }
+
+ /* Release the IRQ. */
+ irq_set_affinity_hint(dd->pdev->irq, NULL);
+ devm_free_irq(&dd->pdev->dev, dd->pdev->irq, dd);
+
+ /* Free dma regions */
+ mtip_dma_free(dd);
+
+ /* Free the memory allocated for the for structure. */
+ kfree(dd->port);
+ dd->port = NULL;
+
+ return 0;
+}
+
+/*
+ * Issue a Standby Immediate command to the device.
+ *
+ * This function is called by the Block Layer just before the
+ * system powers off during a shutdown.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0
+ */
+static int mtip_hw_shutdown(struct driver_data *dd)
+{
+ /*
+ * Send standby immediate (E0h) to the drive so that it
+ * saves its state.
+ */
+ if (!dd->sr && dd->port)
+ mtip_standby_immediate(dd->port);
+
+ return 0;
+}
+
+/*
+ * Suspend function
+ *
+ * This function is called by the Block Layer just before the
+ * system hibernates.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0 Suspend was successful
+ * -EFAULT Suspend was not successful
+ */
+static int mtip_hw_suspend(struct driver_data *dd)
+{
+ /*
+ * Send standby immediate (E0h) to the drive
+ * so that it saves its state.
+ */
+ if (mtip_standby_immediate(dd->port) != 0) {
+ dev_err(&dd->pdev->dev,
+ "Failed standby-immediate command\n");
+ return -EFAULT;
+ }
+
+ /* Disable interrupts on the HBA.*/
+ writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+ dd->mmio + HOST_CTL);
+ mtip_deinit_port(dd->port);
+
+ return 0;
+}
+
+/*
+ * Resume function
+ *
+ * This function is called by the Block Layer as the
+ * system resumes.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0 Resume was successful
+ * -EFAULT Resume was not successful
+ */
+static int mtip_hw_resume(struct driver_data *dd)
+{
+ /* Perform any needed hardware setup steps */
+ hba_setup(dd);
+
+ /* Reset the HBA */
+ if (mtip_hba_reset(dd) != 0) {
+ dev_err(&dd->pdev->dev,
+ "Unable to reset the HBA\n");
+ return -EFAULT;
+ }
+
+ /*
+ * Enable the port, DMA engine, and FIS reception specific
+ * h/w in controller.
+ */
+ mtip_init_port(dd->port);
+ mtip_start_port(dd->port);
+
+ /* Enable interrupts on the HBA.*/
+ writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
+ dd->mmio + HOST_CTL);
+
+ return 0;
+}
+
+/*
+ * Helper function for reusing disk name
+ * upon hot insertion.
+ */
+static int rssd_disk_name_format(char *prefix,
+ int index,
+ char *buf,
+ int buflen)
+{
+ const int base = 'z' - 'a' + 1;
+ char *begin = buf + strlen(prefix);
+ char *end = buf + buflen;
+ char *p;
+ int unit;
+
+ p = end - 1;
+ *p = '\0';
+ unit = base;
+ do {
+ if (p == begin)
+ return -EINVAL;
+ *--p = 'a' + (index % unit);
+ index = (index / unit) - 1;
+ } while (index >= 0);
+
+ memmove(begin, p, end - p);
+ memcpy(buf, prefix, strlen(prefix));
+
+ return 0;
+}
+
+/*
+ * Block layer IOCTL handler.
+ *
+ * @dev Pointer to the block_device structure.
+ * @mode ignored
+ * @cmd IOCTL command passed from the user application.
+ * @arg Argument passed from the user application.
+ *
+ * return value
+ * 0 IOCTL completed successfully.
+ * -ENOTTY IOCTL not supported or invalid driver data
+ * structure pointer.
+ */
+static int mtip_block_ioctl(struct block_device *dev,
+ fmode_t mode,
+ unsigned cmd,
+ unsigned long arg)
+{
+ struct driver_data *dd = dev->bd_disk->private_data;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (!dd)
+ return -ENOTTY;
+
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)))
+ return -ENOTTY;
+
+ switch (cmd) {
+ case BLKFLSBUF:
+ return -ENOTTY;
+ default:
+ return mtip_hw_ioctl(dd, cmd, arg);
+ }
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Block layer compat IOCTL handler.
+ *
+ * @dev Pointer to the block_device structure.
+ * @mode ignored
+ * @cmd IOCTL command passed from the user application.
+ * @arg Argument passed from the user application.
+ *
+ * return value
+ * 0 IOCTL completed successfully.
+ * -ENOTTY IOCTL not supported or invalid driver data
+ * structure pointer.
+ */
+static int mtip_block_compat_ioctl(struct block_device *dev,
+ fmode_t mode,
+ unsigned cmd,
+ unsigned long arg)
+{
+ struct driver_data *dd = dev->bd_disk->private_data;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ if (!dd)
+ return -ENOTTY;
+
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)))
+ return -ENOTTY;
+
+ switch (cmd) {
+ case BLKFLSBUF:
+ return -ENOTTY;
+ case HDIO_DRIVE_TASKFILE: {
+ struct mtip_compat_ide_task_request_s __user *compat_req_task;
+ ide_task_request_t req_task;
+ int compat_tasksize, outtotal, ret;
+
+ compat_tasksize =
+ sizeof(struct mtip_compat_ide_task_request_s);
+
+ compat_req_task =
+ (struct mtip_compat_ide_task_request_s __user *) arg;
+
+ if (copy_from_user(&req_task, (void __user *) arg,
+ compat_tasksize - (2 * sizeof(compat_long_t))))
+ return -EFAULT;
+
+ if (get_user(req_task.out_size, &compat_req_task->out_size))
+ return -EFAULT;
+
+ if (get_user(req_task.in_size, &compat_req_task->in_size))
+ return -EFAULT;
+
+ outtotal = sizeof(struct mtip_compat_ide_task_request_s);
+
+ ret = exec_drive_taskfile(dd, (void __user *) arg,
+ &req_task, outtotal);
+
+ if (copy_to_user((void __user *) arg, &req_task,
+ compat_tasksize -
+ (2 * sizeof(compat_long_t))))
+ return -EFAULT;
+
+ if (put_user(req_task.out_size, &compat_req_task->out_size))
+ return -EFAULT;
+
+ if (put_user(req_task.in_size, &compat_req_task->in_size))
+ return -EFAULT;
+
+ return ret;
+ }
+ default:
+ return mtip_hw_ioctl(dd, cmd, arg);
+ }
+}
+#endif
+
+/*
+ * Obtain the geometry of the device.
+ *
+ * You may think that this function is obsolete, but some applications,
+ * fdisk for example still used CHS values. This function describes the
+ * device as having 224 heads and 56 sectors per cylinder. These values are
+ * chosen so that each cylinder is aligned on a 4KB boundary. Since a
+ * partition is described in terms of a start and end cylinder this means
+ * that each partition is also 4KB aligned. Non-aligned partitions adversely
+ * affects performance.
+ *
+ * @dev Pointer to the block_device strucutre.
+ * @geo Pointer to a hd_geometry structure.
+ *
+ * return value
+ * 0 Operation completed successfully.
+ * -ENOTTY An error occurred while reading the drive capacity.
+ */
+static int mtip_block_getgeo(struct block_device *dev,
+ struct hd_geometry *geo)
+{
+ struct driver_data *dd = dev->bd_disk->private_data;
+ sector_t capacity;
+
+ if (!dd)
+ return -ENOTTY;
+
+ if (!(mtip_hw_get_capacity(dd, &capacity))) {
+ dev_warn(&dd->pdev->dev,
+ "Could not get drive capacity.\n");
+ return -ENOTTY;
+ }
+
+ geo->heads = 224;
+ geo->sectors = 56;
+ sector_div(capacity, (geo->heads * geo->sectors));
+ geo->cylinders = capacity;
+ return 0;
+}
+
+/*
+ * Block device operation function.
+ *
+ * This structure contains pointers to the functions required by the block
+ * layer.
+ */
+static const struct block_device_operations mtip_block_ops = {
+ .ioctl = mtip_block_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = mtip_block_compat_ioctl,
+#endif
+ .getgeo = mtip_block_getgeo,
+ .owner = THIS_MODULE
+};
+
+/*
+ * Block layer make request function.
+ *
+ * This function is called by the kernel to process a BIO for
+ * the P320 device.
+ *
+ * @queue Pointer to the request queue. Unused other than to obtain
+ * the driver data structure.
+ * @rq Pointer to the request.
+ *
+ */
+static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+ struct driver_data *dd = hctx->queue->queuedata;
+ struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+ unsigned int nents;
+
+ if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) {
+ if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+ &dd->dd_flag))) {
+ return -ENXIO;
+ }
+ if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) {
+ return -ENODATA;
+ }
+ if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT,
+ &dd->dd_flag) &&
+ rq_data_dir(rq))) {
+ return -ENODATA;
+ }
+ if (unlikely(test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)))
+ return -ENODATA;
+ if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag))
+ return -ENXIO;
+ }
+
+ if (rq->cmd_flags & REQ_DISCARD) {
+ int err;
+
+ err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq));
+ blk_mq_end_request(rq, err);
+ return 0;
+ }
+
+ /* Create the scatter list for this request. */
+ nents = blk_rq_map_sg(hctx->queue, rq, cmd->sg);
+
+ /* Issue the read/write. */
+ mtip_hw_submit_io(dd, rq, cmd, nents, hctx);
+ return 0;
+}
+
+static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx,
+ struct request *rq)
+{
+ struct driver_data *dd = hctx->queue->queuedata;
+ struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+ if (rq_data_dir(rq) == READ || !dd->unal_qdepth)
+ return false;
+
+ /*
+ * If unaligned depth must be limited on this controller, mark it
+ * as unaligned if the IO isn't on a 4k boundary (start of length).
+ */
+ if (blk_rq_sectors(rq) <= 64) {
+ if ((blk_rq_pos(rq) & 7) || (blk_rq_sectors(rq) & 7))
+ cmd->unaligned = 1;
+ }
+
+ if (cmd->unaligned && down_trylock(&dd->port->cmd_slot_unal))
+ return true;
+
+ return false;
+}
+
+static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct request *rq = bd->rq;
+ int ret;
+
+ if (unlikely(mtip_check_unal_depth(hctx, rq)))
+ return BLK_MQ_RQ_QUEUE_BUSY;
+
+ blk_mq_start_request(rq);
+
+ ret = mtip_submit_request(hctx, rq);
+ if (likely(!ret))
+ return BLK_MQ_RQ_QUEUE_OK;
+
+ rq->errors = ret;
+ return BLK_MQ_RQ_QUEUE_ERROR;
+}
+
+static void mtip_free_cmd(void *data, struct request *rq,
+ unsigned int hctx_idx, unsigned int request_idx)
+{
+ struct driver_data *dd = data;
+ struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+ if (!cmd->command)
+ return;
+
+ dmam_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
+ cmd->command, cmd->command_dma);
+}
+
+static int mtip_init_cmd(void *data, struct request *rq, unsigned int hctx_idx,
+ unsigned int request_idx, unsigned int numa_node)
+{
+ struct driver_data *dd = data;
+ struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+ u32 host_cap_64 = readl(dd->mmio + HOST_CAP) & HOST_CAP_64;
+
+ cmd->command = dmam_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
+ &cmd->command_dma, GFP_KERNEL);
+ if (!cmd->command)
+ return -ENOMEM;
+
+ memset(cmd->command, 0, CMD_DMA_ALLOC_SZ);
+
+ /* Point the command headers at the command tables. */
+ cmd->command_header = dd->port->command_list +
+ (sizeof(struct mtip_cmd_hdr) * request_idx);
+ cmd->command_header_dma = dd->port->command_list_dma +
+ (sizeof(struct mtip_cmd_hdr) * request_idx);
+
+ if (host_cap_64)
+ cmd->command_header->ctbau = __force_bit2int cpu_to_le32((cmd->command_dma >> 16) >> 16);
+
+ cmd->command_header->ctba = __force_bit2int cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
+
+ sg_init_table(cmd->sg, MTIP_MAX_SG);
+ return 0;
+}
+
+static struct blk_mq_ops mtip_mq_ops = {
+ .queue_rq = mtip_queue_rq,
+ .map_queue = blk_mq_map_queue,
+ .init_request = mtip_init_cmd,
+ .exit_request = mtip_free_cmd,
+};
+
+/*
+ * Block layer initialization function.
+ *
+ * This function is called once by the PCI layer for each P320
+ * device that is connected to the system.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0 on success else an error code.
+ */
+static int mtip_block_initialize(struct driver_data *dd)
+{
+ int rv = 0, wait_for_rebuild = 0;
+ sector_t capacity;
+ unsigned int index = 0;
+ struct kobject *kobj;
+ unsigned char thd_name[16];
+
+ if (dd->disk)
+ goto skip_create_disk; /* hw init done, before rebuild */
+
+ if (mtip_hw_init(dd)) {
+ rv = -EINVAL;
+ goto protocol_init_error;
+ }
+
+ dd->disk = alloc_disk_node(MTIP_MAX_MINORS, dd->numa_node);
+ if (dd->disk == NULL) {
+ dev_err(&dd->pdev->dev,
+ "Unable to allocate gendisk structure\n");
+ rv = -EINVAL;
+ goto alloc_disk_error;
+ }
+
+ /* Generate the disk name, implemented same as in sd.c */
+ do {
+ if (!ida_pre_get(&rssd_index_ida, GFP_KERNEL))
+ goto ida_get_error;
+
+ spin_lock(&rssd_index_lock);
+ rv = ida_get_new(&rssd_index_ida, &index);
+ spin_unlock(&rssd_index_lock);
+ } while (rv == -EAGAIN);
+
+ if (rv)
+ goto ida_get_error;
+
+ rv = rssd_disk_name_format("rssd",
+ index,
+ dd->disk->disk_name,
+ DISK_NAME_LEN);
+ if (rv)
+ goto disk_index_error;
+
+ dd->disk->driverfs_dev = &dd->pdev->dev;
+ dd->disk->major = dd->major;
+ dd->disk->first_minor = dd->instance * MTIP_MAX_MINORS;
+ dd->disk->fops = &mtip_block_ops;
+ dd->disk->private_data = dd;
+ dd->index = index;
+
+ mtip_hw_debugfs_init(dd);
+
+skip_create_disk:
+ memset(&dd->tags, 0, sizeof(dd->tags));
+ dd->tags.ops = &mtip_mq_ops;
+ dd->tags.nr_hw_queues = 1;
+ dd->tags.queue_depth = MTIP_MAX_COMMAND_SLOTS;
+ dd->tags.reserved_tags = 1;
+ dd->tags.cmd_size = sizeof(struct mtip_cmd);
+ dd->tags.numa_node = dd->numa_node;
+ dd->tags.flags = BLK_MQ_F_SHOULD_MERGE;
+ dd->tags.driver_data = dd;
+
+ rv = blk_mq_alloc_tag_set(&dd->tags);
+ if (rv) {
+ dev_err(&dd->pdev->dev,
+ "Unable to allocate request queue\n");
+ goto block_queue_alloc_init_error;
+ }
+
+ /* Allocate the request queue. */
+ dd->queue = blk_mq_init_queue(&dd->tags);
+ if (IS_ERR(dd->queue)) {
+ dev_err(&dd->pdev->dev,
+ "Unable to allocate request queue\n");
+ rv = -ENOMEM;
+ goto block_queue_alloc_init_error;
+ }
+
+ dd->disk->queue = dd->queue;
+ dd->queue->queuedata = dd;
+
+ /* Initialize the protocol layer. */
+ wait_for_rebuild = mtip_hw_get_identify(dd);
+ if (wait_for_rebuild < 0) {
+ dev_err(&dd->pdev->dev,
+ "Protocol layer initialization failed\n");
+ rv = -EINVAL;
+ goto init_hw_cmds_error;
+ }
+
+ /*
+ * if rebuild pending, start the service thread, and delay the block
+ * queue creation and add_disk()
+ */
+ if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
+ goto start_service_thread;
+
+ /* Set device limits. */
+ set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags);
+ clear_bit(QUEUE_FLAG_ADD_RANDOM, &dd->queue->queue_flags);
+ blk_queue_max_segments(dd->queue, MTIP_MAX_SG);
+ blk_queue_physical_block_size(dd->queue, 4096);
+ blk_queue_max_hw_sectors(dd->queue, 0xffff);
+ blk_queue_max_segment_size(dd->queue, 0x400000);
+ blk_queue_io_min(dd->queue, 4096);
+ blk_queue_bounce_limit(dd->queue, dd->pdev->dma_mask);
+
+ /*
+ * write back cache is not supported in the device. FUA depends on
+ * write back cache support, hence setting flush support to zero.
+ */
+ blk_queue_flush(dd->queue, 0);
+
+ /* Signal trim support */
+ if (dd->trim_supp == true) {
+ set_bit(QUEUE_FLAG_DISCARD, &dd->queue->queue_flags);
+ dd->queue->limits.discard_granularity = 4096;
+ blk_queue_max_discard_sectors(dd->queue,
+ MTIP_MAX_TRIM_ENTRY_LEN * MTIP_MAX_TRIM_ENTRIES);
+ dd->queue->limits.discard_zeroes_data = 0;
+ }
+
+ /* Set the capacity of the device in 512 byte sectors. */
+ if (!(mtip_hw_get_capacity(dd, &capacity))) {
+ dev_warn(&dd->pdev->dev,
+ "Could not read drive capacity\n");
+ rv = -EIO;
+ goto read_capacity_error;
+ }
+ set_capacity(dd->disk, capacity);
+
+ /* Enable the block device and add it to /dev */
+ add_disk(dd->disk);
+
+ dd->bdev = bdget_disk(dd->disk, 0);
+ /*
+ * Now that the disk is active, initialize any sysfs attributes
+ * managed by the protocol layer.
+ */
+ kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+ if (kobj) {
+ mtip_hw_sysfs_init(dd, kobj);
+ kobject_put(kobj);
+ }
+
+ if (dd->mtip_svc_handler) {
+ set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
+ return rv; /* service thread created for handling rebuild */
+ }
+
+start_service_thread:
+ sprintf(thd_name, "mtip_svc_thd_%02d", index);
+ dd->mtip_svc_handler = kthread_create_on_node(mtip_service_thread,
+ dd, dd->numa_node, "%s",
+ thd_name);
+
+ if (IS_ERR(dd->mtip_svc_handler)) {
+ dev_err(&dd->pdev->dev, "service thread failed to start\n");
+ dd->mtip_svc_handler = NULL;
+ rv = -EFAULT;
+ goto kthread_run_error;
+ }
+ wake_up_process(dd->mtip_svc_handler);
+ if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
+ rv = wait_for_rebuild;
+
+ return rv;
+
+kthread_run_error:
+ bdput(dd->bdev);
+ dd->bdev = NULL;
+
+ /* Delete our gendisk. This also removes the device from /dev */
+ del_gendisk(dd->disk);
+
+read_capacity_error:
+init_hw_cmds_error:
+ blk_cleanup_queue(dd->queue);
+ blk_mq_free_tag_set(&dd->tags);
+block_queue_alloc_init_error:
+ mtip_hw_debugfs_exit(dd);
+disk_index_error:
+ spin_lock(&rssd_index_lock);
+ ida_remove(&rssd_index_ida, index);
+ spin_unlock(&rssd_index_lock);
+
+ida_get_error:
+ put_disk(dd->disk);
+
+alloc_disk_error:
+ mtip_hw_exit(dd); /* De-initialize the protocol layer. */
+
+protocol_init_error:
+ return rv;
+}
+
+/*
+ * Block layer deinitialization function.
+ *
+ * Called by the PCI layer as each P320 device is removed.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0
+ */
+static int mtip_block_remove(struct driver_data *dd)
+{
+ struct kobject *kobj;
+
+ if (!dd->sr) {
+ mtip_hw_debugfs_exit(dd);
+
+ if (dd->mtip_svc_handler) {
+ set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags);
+ wake_up_interruptible(&dd->port->svc_wait);
+ kthread_stop(dd->mtip_svc_handler);
+ }
+
+ /* Clean up the sysfs attributes, if created */
+ if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) {
+ kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+ if (kobj) {
+ mtip_hw_sysfs_exit(dd, kobj);
+ kobject_put(kobj);
+ }
+ }
+
+ mtip_standby_drive(dd);
+
+ /*
+ * Delete our gendisk structure. This also removes the device
+ * from /dev
+ */
+ if (dd->bdev) {
+ bdput(dd->bdev);
+ dd->bdev = NULL;
+ }
+ if (dd->disk) {
+ if (dd->disk->queue) {
+ del_gendisk(dd->disk);
+ blk_cleanup_queue(dd->queue);
+ blk_mq_free_tag_set(&dd->tags);
+ dd->queue = NULL;
+ } else
+ put_disk(dd->disk);
+ }
+ dd->disk = NULL;
+
+ spin_lock(&rssd_index_lock);
+ ida_remove(&rssd_index_ida, dd->index);
+ spin_unlock(&rssd_index_lock);
+ } else {
+ dev_info(&dd->pdev->dev, "device %s surprise removal\n",
+ dd->disk->disk_name);
+ }
+
+ /* De-initialize the protocol layer. */
+ mtip_hw_exit(dd);
+
+ return 0;
+}
+
+/*
+ * Function called by the PCI layer when just before the
+ * machine shuts down.
+ *
+ * If a protocol layer shutdown function is present it will be called
+ * by this function.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ * 0
+ */
+static int mtip_block_shutdown(struct driver_data *dd)
+{
+ mtip_hw_shutdown(dd);
+
+ /* Delete our gendisk structure, and cleanup the blk queue. */
+ if (dd->disk) {
+ dev_info(&dd->pdev->dev,
+ "Shutting down %s ...\n", dd->disk->disk_name);
+
+ if (dd->disk->queue) {
+ del_gendisk(dd->disk);
+ blk_cleanup_queue(dd->queue);
+ blk_mq_free_tag_set(&dd->tags);
+ } else
+ put_disk(dd->disk);
+ dd->disk = NULL;
+ dd->queue = NULL;
+ }
+
+ spin_lock(&rssd_index_lock);
+ ida_remove(&rssd_index_ida, dd->index);
+ spin_unlock(&rssd_index_lock);
+ return 0;
+}
+
+static int mtip_block_suspend(struct driver_data *dd)
+{
+ dev_info(&dd->pdev->dev,
+ "Suspending %s ...\n", dd->disk->disk_name);
+ mtip_hw_suspend(dd);
+ return 0;
+}
+
+static int mtip_block_resume(struct driver_data *dd)
+{
+ dev_info(&dd->pdev->dev, "Resuming %s ...\n",
+ dd->disk->disk_name);
+ mtip_hw_resume(dd);
+ return 0;
+}
+
+static void drop_cpu(int cpu)
+{
+ cpu_use[cpu]--;
+}
+
+static int get_least_used_cpu_on_node(int node)
+{
+ int cpu, least_used_cpu, least_cnt;
+ const struct cpumask *node_mask;
+
+ node_mask = cpumask_of_node(node);
+ least_used_cpu = cpumask_first(node_mask);
+ least_cnt = cpu_use[least_used_cpu];
+ cpu = least_used_cpu;
+
+ for_each_cpu(cpu, node_mask) {
+ if (cpu_use[cpu] < least_cnt) {
+ least_used_cpu = cpu;
+ least_cnt = cpu_use[cpu];
+ }
+ }
+ cpu_use[least_used_cpu]++;
+ return least_used_cpu;
+}
+
+/* Helper for selecting a node in round robin mode */
+static inline int mtip_get_next_rr_node(void)
+{
+ static int next_node = -1;
+
+ if (next_node == -1) {
+ next_node = first_online_node;
+ return next_node;
+ }
+
+ next_node = next_online_node(next_node);
+ if (next_node == MAX_NUMNODES)
+ next_node = first_online_node;
+ return next_node;
+}
+
+static DEFINE_HANDLER(0);
+static DEFINE_HANDLER(1);
+static DEFINE_HANDLER(2);
+static DEFINE_HANDLER(3);
+static DEFINE_HANDLER(4);
+static DEFINE_HANDLER(5);
+static DEFINE_HANDLER(6);
+static DEFINE_HANDLER(7);
+
+static void mtip_disable_link_opts(struct driver_data *dd, struct pci_dev *pdev)
+{
+ int pos;
+ unsigned short pcie_dev_ctrl;
+
+ pos = pci_find_capability(pdev, PCI_CAP_ID_EXP);
+ if (pos) {
+ pci_read_config_word(pdev,
+ pos + PCI_EXP_DEVCTL,
+ &pcie_dev_ctrl);
+ if (pcie_dev_ctrl & (1 << 11) ||
+ pcie_dev_ctrl & (1 << 4)) {
+ dev_info(&dd->pdev->dev,
+ "Disabling ERO/No-Snoop on bridge device %04x:%04x\n",
+ pdev->vendor, pdev->device);
+ pcie_dev_ctrl &= ~(PCI_EXP_DEVCTL_NOSNOOP_EN |
+ PCI_EXP_DEVCTL_RELAX_EN);
+ pci_write_config_word(pdev,
+ pos + PCI_EXP_DEVCTL,
+ pcie_dev_ctrl);
+ }
+ }
+}
+
+static void mtip_fix_ero_nosnoop(struct driver_data *dd, struct pci_dev *pdev)
+{
+ /*
+ * This workaround is specific to AMD/ATI chipset with a PCI upstream
+ * device with device id 0x5aXX
+ */
+ if (pdev->bus && pdev->bus->self) {
+ if (pdev->bus->self->vendor == PCI_VENDOR_ID_ATI &&
+ ((pdev->bus->self->device & 0xff00) == 0x5a00)) {
+ mtip_disable_link_opts(dd, pdev->bus->self);
+ } else {
+ /* Check further up the topology */
+ struct pci_dev *parent_dev = pdev->bus->self;
+ if (parent_dev->bus &&
+ parent_dev->bus->parent &&
+ parent_dev->bus->parent->self &&
+ parent_dev->bus->parent->self->vendor ==
+ PCI_VENDOR_ID_ATI &&
+ (parent_dev->bus->parent->self->device &
+ 0xff00) == 0x5a00) {
+ mtip_disable_link_opts(dd,
+ parent_dev->bus->parent->self);
+ }
+ }
+ }
+}
+
+/*
+ * Called for each supported PCI device detected.
+ *
+ * This function allocates the private data structure, enables the
+ * PCI device and then calls the block layer initialization function.
+ *
+ * return value
+ * 0 on success else an error code.
+ */
+static int mtip_pci_probe(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ int rv = 0;
+ struct driver_data *dd = NULL;
+ char cpu_list[256];
+ const struct cpumask *node_mask;
+ int cpu, i = 0, j = 0;
+ int my_node = NUMA_NO_NODE;
+ unsigned long flags;
+
+ /* Allocate memory for this devices private data. */
+ my_node = pcibus_to_node(pdev->bus);
+ if (my_node != NUMA_NO_NODE) {
+ if (!node_online(my_node))
+ my_node = mtip_get_next_rr_node();
+ } else {
+ dev_info(&pdev->dev, "Kernel not reporting proximity, choosing a node\n");
+ my_node = mtip_get_next_rr_node();
+ }
+ dev_info(&pdev->dev, "NUMA node %d (closest: %d,%d, probe on %d:%d)\n",
+ my_node, pcibus_to_node(pdev->bus), dev_to_node(&pdev->dev),
+ cpu_to_node(raw_smp_processor_id()), raw_smp_processor_id());
+
+ dd = kzalloc_node(sizeof(struct driver_data), GFP_KERNEL, my_node);
+ if (dd == NULL) {
+ dev_err(&pdev->dev,
+ "Unable to allocate memory for driver data\n");
+ return -ENOMEM;
+ }
+
+ /* Attach the private data to this PCI device. */
+ pci_set_drvdata(pdev, dd);
+
+ rv = pcim_enable_device(pdev);
+ if (rv < 0) {
+ dev_err(&pdev->dev, "Unable to enable device\n");
+ goto iomap_err;
+ }
+
+ /* Map BAR5 to memory. */
+ rv = pcim_iomap_regions(pdev, 1 << MTIP_ABAR, MTIP_DRV_NAME);
+ if (rv < 0) {
+ dev_err(&pdev->dev, "Unable to map regions\n");
+ goto iomap_err;
+ }
+
+ if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64))) {
+ rv = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+
+ if (rv) {
+ rv = pci_set_consistent_dma_mask(pdev,
+ DMA_BIT_MASK(32));
+ if (rv) {
+ dev_warn(&pdev->dev,
+ "64-bit DMA enable failed\n");
+ goto setmask_err;
+ }
+ }
+ }
+
+ /* Copy the info we may need later into the private data structure. */
+ dd->major = mtip_major;
+ dd->instance = instance;
+ dd->pdev = pdev;
+ dd->numa_node = my_node;
+
+ INIT_LIST_HEAD(&dd->online_list);
+ INIT_LIST_HEAD(&dd->remove_list);
+
+ memset(dd->workq_name, 0, 32);
+ snprintf(dd->workq_name, 31, "mtipq%d", dd->instance);
+
+ dd->isr_workq = create_workqueue(dd->workq_name);
+ if (!dd->isr_workq) {
+ dev_warn(&pdev->dev, "Can't create wq %d\n", dd->instance);
+ rv = -ENOMEM;
+ goto block_initialize_err;
+ }
+
+ memset(cpu_list, 0, sizeof(cpu_list));
+
+ node_mask = cpumask_of_node(dd->numa_node);
+ if (!cpumask_empty(node_mask)) {
+ for_each_cpu(cpu, node_mask)
+ {
+ snprintf(&cpu_list[j], 256 - j, "%d ", cpu);
+ j = strlen(cpu_list);
+ }
+
+ dev_info(&pdev->dev, "Node %d on package %d has %d cpu(s): %s\n",
+ dd->numa_node,
+ topology_physical_package_id(cpumask_first(node_mask)),
+ nr_cpus_node(dd->numa_node),
+ cpu_list);
+ } else
+ dev_dbg(&pdev->dev, "mtip32xx: node_mask empty\n");
+
+ dd->isr_binding = get_least_used_cpu_on_node(dd->numa_node);
+ dev_info(&pdev->dev, "Initial IRQ binding node:cpu %d:%d\n",
+ cpu_to_node(dd->isr_binding), dd->isr_binding);
+
+ /* first worker context always runs in ISR */
+ dd->work[0].cpu_binding = dd->isr_binding;
+ dd->work[1].cpu_binding = get_least_used_cpu_on_node(dd->numa_node);
+ dd->work[2].cpu_binding = get_least_used_cpu_on_node(dd->numa_node);
+ dd->work[3].cpu_binding = dd->work[0].cpu_binding;
+ dd->work[4].cpu_binding = dd->work[1].cpu_binding;
+ dd->work[5].cpu_binding = dd->work[2].cpu_binding;
+ dd->work[6].cpu_binding = dd->work[2].cpu_binding;
+ dd->work[7].cpu_binding = dd->work[1].cpu_binding;
+
+ /* Log the bindings */
+ for_each_present_cpu(cpu) {
+ memset(cpu_list, 0, sizeof(cpu_list));
+ for (i = 0, j = 0; i < MTIP_MAX_SLOT_GROUPS; i++) {
+ if (dd->work[i].cpu_binding == cpu) {
+ snprintf(&cpu_list[j], 256 - j, "%d ", i);
+ j = strlen(cpu_list);
+ }
+ }
+ if (j)
+ dev_info(&pdev->dev, "CPU %d: WQs %s\n", cpu, cpu_list);
+ }
+
+ INIT_WORK(&dd->work[0].work, mtip_workq_sdbf0);
+ INIT_WORK(&dd->work[1].work, mtip_workq_sdbf1);
+ INIT_WORK(&dd->work[2].work, mtip_workq_sdbf2);
+ INIT_WORK(&dd->work[3].work, mtip_workq_sdbf3);
+ INIT_WORK(&dd->work[4].work, mtip_workq_sdbf4);
+ INIT_WORK(&dd->work[5].work, mtip_workq_sdbf5);
+ INIT_WORK(&dd->work[6].work, mtip_workq_sdbf6);
+ INIT_WORK(&dd->work[7].work, mtip_workq_sdbf7);
+
+ pci_set_master(pdev);
+ rv = pci_enable_msi(pdev);
+ if (rv) {
+ dev_warn(&pdev->dev,
+ "Unable to enable MSI interrupt.\n");
+ goto msi_initialize_err;
+ }
+
+ mtip_fix_ero_nosnoop(dd, pdev);
+
+ /* Initialize the block layer. */
+ rv = mtip_block_initialize(dd);
+ if (rv < 0) {
+ dev_err(&pdev->dev,
+ "Unable to initialize block layer\n");
+ goto block_initialize_err;
+ }
+
+ /*
+ * Increment the instance count so that each device has a unique
+ * instance number.
+ */
+ instance++;
+ if (rv != MTIP_FTL_REBUILD_MAGIC)
+ set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
+ else
+ rv = 0; /* device in rebuild state, return 0 from probe */
+
+ /* Add to online list even if in ftl rebuild */
+ spin_lock_irqsave(&dev_lock, flags);
+ list_add(&dd->online_list, &online_list);
+ spin_unlock_irqrestore(&dev_lock, flags);
+
+ goto done;
+
+block_initialize_err:
+ pci_disable_msi(pdev);
+
+msi_initialize_err:
+ if (dd->isr_workq) {
+ flush_workqueue(dd->isr_workq);
+ destroy_workqueue(dd->isr_workq);
+ drop_cpu(dd->work[0].cpu_binding);
+ drop_cpu(dd->work[1].cpu_binding);
+ drop_cpu(dd->work[2].cpu_binding);
+ }
+setmask_err:
+ pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
+
+iomap_err:
+ kfree(dd);
+ pci_set_drvdata(pdev, NULL);
+ return rv;
+done:
+ return rv;
+}
+
+/*
+ * Called for each probed device when the device is removed or the
+ * driver is unloaded.
+ *
+ * return value
+ * None
+ */
+static void mtip_pci_remove(struct pci_dev *pdev)
+{
+ struct driver_data *dd = pci_get_drvdata(pdev);
+ unsigned long flags, to;
+
+ set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag);
+
+ spin_lock_irqsave(&dev_lock, flags);
+ list_del_init(&dd->online_list);
+ list_add(&dd->remove_list, &removing_list);
+ spin_unlock_irqrestore(&dev_lock, flags);
+
+ mtip_check_surprise_removal(pdev);
+ synchronize_irq(dd->pdev->irq);
+
+ /* Spin until workers are done */
+ to = jiffies + msecs_to_jiffies(4000);
+ do {
+ msleep(20);
+ } while (atomic_read(&dd->irq_workers_active) != 0 &&
+ time_before(jiffies, to));
+
+ if (atomic_read(&dd->irq_workers_active) != 0) {
+ dev_warn(&dd->pdev->dev,
+ "Completion workers still active!\n");
+ }
+
+ /* Clean up the block layer. */
+ mtip_block_remove(dd);
+
+ if (dd->isr_workq) {
+ flush_workqueue(dd->isr_workq);
+ destroy_workqueue(dd->isr_workq);
+ drop_cpu(dd->work[0].cpu_binding);
+ drop_cpu(dd->work[1].cpu_binding);
+ drop_cpu(dd->work[2].cpu_binding);
+ }
+
+ pci_disable_msi(pdev);
+
+ spin_lock_irqsave(&dev_lock, flags);
+ list_del_init(&dd->remove_list);
+ spin_unlock_irqrestore(&dev_lock, flags);
+
+ if (!dd->sr)
+ kfree(dd);
+ else
+ set_bit(MTIP_DDF_REMOVE_DONE_BIT, &dd->dd_flag);
+
+ pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
+ pci_set_drvdata(pdev, NULL);
+}
+
+/*
+ * Called for each probed device when the device is suspended.
+ *
+ * return value
+ * 0 Success
+ * <0 Error
+ */
+static int mtip_pci_suspend(struct pci_dev *pdev, pm_message_t mesg)
+{
+ int rv = 0;
+ struct driver_data *dd = pci_get_drvdata(pdev);
+
+ if (!dd) {
+ dev_err(&pdev->dev,
+ "Driver private datastructure is NULL\n");
+ return -EFAULT;
+ }
+
+ set_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag);
+
+ /* Disable ports & interrupts then send standby immediate */
+ rv = mtip_block_suspend(dd);
+ if (rv < 0) {
+ dev_err(&pdev->dev,
+ "Failed to suspend controller\n");
+ return rv;
+ }
+
+ /*
+ * Save the pci config space to pdev structure &
+ * disable the device
+ */
+ pci_save_state(pdev);
+ pci_disable_device(pdev);
+
+ /* Move to Low power state*/
+ pci_set_power_state(pdev, PCI_D3hot);
+
+ return rv;
+}
+
+/*
+ * Called for each probed device when the device is resumed.
+ *
+ * return value
+ * 0 Success
+ * <0 Error
+ */
+static int mtip_pci_resume(struct pci_dev *pdev)
+{
+ int rv = 0;
+ struct driver_data *dd;
+
+ dd = pci_get_drvdata(pdev);
+ if (!dd) {
+ dev_err(&pdev->dev,
+ "Driver private datastructure is NULL\n");
+ return -EFAULT;
+ }
+
+ /* Move the device to active State */
+ pci_set_power_state(pdev, PCI_D0);
+
+ /* Restore PCI configuration space */
+ pci_restore_state(pdev);
+
+ /* Enable the PCI device*/
+ rv = pcim_enable_device(pdev);
+ if (rv < 0) {
+ dev_err(&pdev->dev,
+ "Failed to enable card during resume\n");
+ goto err;
+ }
+ pci_set_master(pdev);
+
+ /*
+ * Calls hbaReset, initPort, & startPort function
+ * then enables interrupts
+ */
+ rv = mtip_block_resume(dd);
+ if (rv < 0)
+ dev_err(&pdev->dev, "Unable to resume\n");
+
+err:
+ clear_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag);
+
+ return rv;
+}
+
+/*
+ * Shutdown routine
+ *
+ * return value
+ * None
+ */
+static void mtip_pci_shutdown(struct pci_dev *pdev)
+{
+ struct driver_data *dd = pci_get_drvdata(pdev);
+ if (dd)
+ mtip_block_shutdown(dd);
+}
+
+/* Table of device ids supported by this driver. */
+static const struct pci_device_id mtip_pci_tbl[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320H_DEVICE_ID) },
+ { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320M_DEVICE_ID) },
+ { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320S_DEVICE_ID) },
+ { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P325M_DEVICE_ID) },
+ { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P420H_DEVICE_ID) },
+ { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P420M_DEVICE_ID) },
+ { PCI_DEVICE(PCI_VENDOR_ID_MICRON, P425M_DEVICE_ID) },
+ { 0 }
+};
+
+/* Structure that describes the PCI driver functions. */
+static struct pci_driver mtip_pci_driver = {
+ .name = MTIP_DRV_NAME,
+ .id_table = mtip_pci_tbl,
+ .probe = mtip_pci_probe,
+ .remove = mtip_pci_remove,
+ .suspend = mtip_pci_suspend,
+ .resume = mtip_pci_resume,
+ .shutdown = mtip_pci_shutdown,
+};
+
+MODULE_DEVICE_TABLE(pci, mtip_pci_tbl);
+
+/*
+ * Module initialization function.
+ *
+ * Called once when the module is loaded. This function allocates a major
+ * block device number to the Cyclone devices and registers the PCI layer
+ * of the driver.
+ *
+ * Return value
+ * 0 on success else error code.
+ */
+static int __init mtip_init(void)
+{
+ int error;
+
+ pr_info(MTIP_DRV_NAME " Version " MTIP_DRV_VERSION "\n");
+
+ spin_lock_init(&dev_lock);
+
+ INIT_LIST_HEAD(&online_list);
+ INIT_LIST_HEAD(&removing_list);
+
+ /* Allocate a major block device number to use with this driver. */
+ error = register_blkdev(0, MTIP_DRV_NAME);
+ if (error <= 0) {
+ pr_err("Unable to register block device (%d)\n",
+ error);
+ return -EBUSY;
+ }
+ mtip_major = error;
+
+ dfs_parent = debugfs_create_dir("rssd", NULL);
+ if (IS_ERR_OR_NULL(dfs_parent)) {
+ pr_warn("Error creating debugfs parent\n");
+ dfs_parent = NULL;
+ }
+ if (dfs_parent) {
+ dfs_device_status = debugfs_create_file("device_status",
+ S_IRUGO, dfs_parent, NULL,
+ &mtip_device_status_fops);
+ if (IS_ERR_OR_NULL(dfs_device_status)) {
+ pr_err("Error creating device_status node\n");
+ dfs_device_status = NULL;
+ }
+ }
+
+ /* Register our PCI operations. */
+ error = pci_register_driver(&mtip_pci_driver);
+ if (error) {
+ debugfs_remove(dfs_parent);
+ unregister_blkdev(mtip_major, MTIP_DRV_NAME);
+ }
+
+ return error;
+}
+
+/*
+ * Module de-initialization function.
+ *
+ * Called once when the module is unloaded. This function deallocates
+ * the major block device number allocated by mtip_init() and
+ * unregisters the PCI layer of the driver.
+ *
+ * Return value
+ * none
+ */
+static void __exit mtip_exit(void)
+{
+ /* Release the allocated major block device number. */
+ unregister_blkdev(mtip_major, MTIP_DRV_NAME);
+
+ /* Unregister the PCI driver. */
+ pci_unregister_driver(&mtip_pci_driver);
+
+ debugfs_remove_recursive(dfs_parent);
+}
+
+MODULE_AUTHOR("Micron Technology, Inc");
+MODULE_DESCRIPTION("Micron RealSSD PCIe Block Driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(MTIP_DRV_VERSION);
+
+module_init(mtip_init);
+module_exit(mtip_exit);
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
new file mode 100644
index 000000000..ba1b31ee2
--- /dev/null
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -0,0 +1,511 @@
+/*
+ * mtip32xx.h - Header file for the P320 SSD Block Driver
+ * Copyright (C) 2011 Micron Technology, Inc.
+ *
+ * Portions of this code were derived from works subjected to the
+ * following copyright:
+ * Copyright (C) 2009 Integrated Device Technology, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __MTIP32XX_H__
+#define __MTIP32XX_H__
+
+#include <linux/spinlock.h>
+#include <linux/rwsem.h>
+#include <linux/ata.h>
+#include <linux/interrupt.h>
+#include <linux/genhd.h>
+
+/* Offset of Subsystem Device ID in pci confoguration space */
+#define PCI_SUBSYSTEM_DEVICEID 0x2E
+
+/* offset of Device Control register in PCIe extended capabilites space */
+#define PCIE_CONFIG_EXT_DEVICE_CONTROL_OFFSET 0x48
+
+/* check for erase mode support during secure erase */
+#define MTIP_SEC_ERASE_MODE 0x2
+
+/* # of times to retry timed out/failed IOs */
+#define MTIP_MAX_RETRIES 2
+
+/* Various timeout values in ms */
+#define MTIP_NCQ_CMD_TIMEOUT_MS 15000
+#define MTIP_IOCTL_CMD_TIMEOUT_MS 5000
+#define MTIP_INT_CMD_TIMEOUT_MS 5000
+#define MTIP_QUIESCE_IO_TIMEOUT_MS (MTIP_NCQ_CMD_TIMEOUT_MS * \
+ (MTIP_MAX_RETRIES + 1))
+
+/* check for timeouts every 500ms */
+#define MTIP_TIMEOUT_CHECK_PERIOD 500
+
+/* ftl rebuild */
+#define MTIP_FTL_REBUILD_OFFSET 142
+#define MTIP_FTL_REBUILD_MAGIC 0xED51
+#define MTIP_FTL_REBUILD_TIMEOUT_MS 2400000
+
+/* unaligned IO handling */
+#define MTIP_MAX_UNALIGNED_SLOTS 2
+
+/* Macro to extract the tag bit number from a tag value. */
+#define MTIP_TAG_BIT(tag) (tag & 0x1F)
+
+/*
+ * Macro to extract the tag index from a tag value. The index
+ * is used to access the correct s_active/Command Issue register based
+ * on the tag value.
+ */
+#define MTIP_TAG_INDEX(tag) (tag >> 5)
+
+/*
+ * Maximum number of scatter gather entries
+ * a single command may have.
+ */
+#define MTIP_MAX_SG 504
+
+/*
+ * Maximum number of slot groups (Command Issue & s_active registers)
+ * NOTE: This is the driver maximum; check dd->slot_groups for actual value.
+ */
+#define MTIP_MAX_SLOT_GROUPS 8
+
+/* Internal command tag. */
+#define MTIP_TAG_INTERNAL 0
+
+/* Micron Vendor ID & P320x SSD Device ID */
+#define PCI_VENDOR_ID_MICRON 0x1344
+#define P320H_DEVICE_ID 0x5150
+#define P320M_DEVICE_ID 0x5151
+#define P320S_DEVICE_ID 0x5152
+#define P325M_DEVICE_ID 0x5153
+#define P420H_DEVICE_ID 0x5160
+#define P420M_DEVICE_ID 0x5161
+#define P425M_DEVICE_ID 0x5163
+
+/* Driver name and version strings */
+#define MTIP_DRV_NAME "mtip32xx"
+#define MTIP_DRV_VERSION "1.3.1"
+
+/* Maximum number of minor device numbers per device. */
+#define MTIP_MAX_MINORS 16
+
+/* Maximum number of supported command slots. */
+#define MTIP_MAX_COMMAND_SLOTS (MTIP_MAX_SLOT_GROUPS * 32)
+
+/*
+ * Per-tag bitfield size in longs.
+ * Linux bit manipulation functions
+ * (i.e. test_and_set_bit, find_next_zero_bit)
+ * manipulate memory in longs, so we try to make the math work.
+ * take the slot groups and find the number of longs, rounding up.
+ * Careful! i386 and x86_64 use different size longs!
+ */
+#define U32_PER_LONG (sizeof(long) / sizeof(u32))
+#define SLOTBITS_IN_LONGS ((MTIP_MAX_SLOT_GROUPS + \
+ (U32_PER_LONG-1))/U32_PER_LONG)
+
+/* BAR number used to access the HBA registers. */
+#define MTIP_ABAR 5
+
+#ifdef DEBUG
+ #define dbg_printk(format, arg...) \
+ printk(pr_fmt(format), ##arg);
+#else
+ #define dbg_printk(format, arg...)
+#endif
+
+#define MTIP_DFS_MAX_BUF_SIZE 1024
+
+#define __force_bit2int (unsigned int __force)
+
+enum {
+ /* below are bit numbers in 'flags' defined in mtip_port */
+ MTIP_PF_IC_ACTIVE_BIT = 0, /* pio/ioctl */
+ MTIP_PF_EH_ACTIVE_BIT = 1, /* error handling */
+ MTIP_PF_SE_ACTIVE_BIT = 2, /* secure erase */
+ MTIP_PF_DM_ACTIVE_BIT = 3, /* download microcde */
+ MTIP_PF_PAUSE_IO = ((1 << MTIP_PF_IC_ACTIVE_BIT) |
+ (1 << MTIP_PF_EH_ACTIVE_BIT) |
+ (1 << MTIP_PF_SE_ACTIVE_BIT) |
+ (1 << MTIP_PF_DM_ACTIVE_BIT)),
+
+ MTIP_PF_SVC_THD_ACTIVE_BIT = 4,
+ MTIP_PF_ISSUE_CMDS_BIT = 5,
+ MTIP_PF_REBUILD_BIT = 6,
+ MTIP_PF_SR_CLEANUP_BIT = 7,
+ MTIP_PF_SVC_THD_STOP_BIT = 8,
+
+ /* below are bit numbers in 'dd_flag' defined in driver_data */
+ MTIP_DDF_SEC_LOCK_BIT = 0,
+ MTIP_DDF_REMOVE_PENDING_BIT = 1,
+ MTIP_DDF_OVER_TEMP_BIT = 2,
+ MTIP_DDF_WRITE_PROTECT_BIT = 3,
+ MTIP_DDF_REMOVE_DONE_BIT = 4,
+ MTIP_DDF_CLEANUP_BIT = 5,
+ MTIP_DDF_RESUME_BIT = 6,
+ MTIP_DDF_INIT_DONE_BIT = 7,
+ MTIP_DDF_REBUILD_FAILED_BIT = 8,
+
+ MTIP_DDF_STOP_IO = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) |
+ (1 << MTIP_DDF_SEC_LOCK_BIT) |
+ (1 << MTIP_DDF_OVER_TEMP_BIT) |
+ (1 << MTIP_DDF_WRITE_PROTECT_BIT) |
+ (1 << MTIP_DDF_REBUILD_FAILED_BIT)),
+
+};
+
+struct smart_attr {
+ u8 attr_id;
+ u16 flags;
+ u8 cur;
+ u8 worst;
+ u32 data;
+ u8 res[3];
+} __packed;
+
+struct mtip_work {
+ struct work_struct work;
+ void *port;
+ int cpu_binding;
+ u32 completed;
+} ____cacheline_aligned_in_smp;
+
+#define DEFINE_HANDLER(group) \
+ void mtip_workq_sdbf##group(struct work_struct *work) \
+ { \
+ struct mtip_work *w = (struct mtip_work *) work; \
+ mtip_workq_sdbfx(w->port, group, w->completed); \
+ }
+
+#define MTIP_TRIM_TIMEOUT_MS 240000
+#define MTIP_MAX_TRIM_ENTRIES 8
+#define MTIP_MAX_TRIM_ENTRY_LEN 0xfff8
+
+struct mtip_trim_entry {
+ u32 lba; /* starting lba of region */
+ u16 rsvd; /* unused */
+ u16 range; /* # of 512b blocks to trim */
+} __packed;
+
+struct mtip_trim {
+ /* Array of regions to trim */
+ struct mtip_trim_entry entry[MTIP_MAX_TRIM_ENTRIES];
+} __packed;
+
+/* Register Frame Information Structure (FIS), host to device. */
+struct host_to_dev_fis {
+ /*
+ * FIS type.
+ * - 27h Register FIS, host to device.
+ * - 34h Register FIS, device to host.
+ * - 39h DMA Activate FIS, device to host.
+ * - 41h DMA Setup FIS, bi-directional.
+ * - 46h Data FIS, bi-directional.
+ * - 58h BIST Activate FIS, bi-directional.
+ * - 5Fh PIO Setup FIS, device to host.
+ * - A1h Set Device Bits FIS, device to host.
+ */
+ unsigned char type;
+ unsigned char opts;
+ unsigned char command;
+ unsigned char features;
+
+ union {
+ unsigned char lba_low;
+ unsigned char sector;
+ };
+ union {
+ unsigned char lba_mid;
+ unsigned char cyl_low;
+ };
+ union {
+ unsigned char lba_hi;
+ unsigned char cyl_hi;
+ };
+ union {
+ unsigned char device;
+ unsigned char head;
+ };
+
+ union {
+ unsigned char lba_low_ex;
+ unsigned char sector_ex;
+ };
+ union {
+ unsigned char lba_mid_ex;
+ unsigned char cyl_low_ex;
+ };
+ union {
+ unsigned char lba_hi_ex;
+ unsigned char cyl_hi_ex;
+ };
+ unsigned char features_ex;
+
+ unsigned char sect_count;
+ unsigned char sect_cnt_ex;
+ unsigned char res2;
+ unsigned char control;
+
+ unsigned int res3;
+};
+
+/* Command header structure. */
+struct mtip_cmd_hdr {
+ /*
+ * Command options.
+ * - Bits 31:16 Number of PRD entries.
+ * - Bits 15:8 Unused in this implementation.
+ * - Bit 7 Prefetch bit, informs the drive to prefetch PRD entries.
+ * - Bit 6 Write bit, should be set when writing data to the device.
+ * - Bit 5 Unused in this implementation.
+ * - Bits 4:0 Length of the command FIS in DWords (DWord = 4 bytes).
+ */
+ unsigned int opts;
+ /* This field is unsed when using NCQ. */
+ union {
+ unsigned int byte_count;
+ unsigned int status;
+ };
+ /*
+ * Lower 32 bits of the command table address associated with this
+ * header. The command table addresses must be 128 byte aligned.
+ */
+ unsigned int ctba;
+ /*
+ * If 64 bit addressing is used this field is the upper 32 bits
+ * of the command table address associated with this command.
+ */
+ unsigned int ctbau;
+ /* Reserved and unused. */
+ unsigned int res[4];
+};
+
+/* Command scatter gather structure (PRD). */
+struct mtip_cmd_sg {
+ /*
+ * Low 32 bits of the data buffer address. For P320 this
+ * address must be 8 byte aligned signified by bits 2:0 being
+ * set to 0.
+ */
+ unsigned int dba;
+ /*
+ * When 64 bit addressing is used this field is the upper
+ * 32 bits of the data buffer address.
+ */
+ unsigned int dba_upper;
+ /* Unused. */
+ unsigned int reserved;
+ /*
+ * Bit 31: interrupt when this data block has been transferred.
+ * Bits 30..22: reserved
+ * Bits 21..0: byte count (minus 1). For P320 the byte count must be
+ * 8 byte aligned signified by bits 2:0 being set to 1.
+ */
+ unsigned int info;
+};
+struct mtip_port;
+
+/* Structure used to describe a command. */
+struct mtip_cmd {
+
+ struct mtip_cmd_hdr *command_header; /* ptr to command header entry */
+
+ dma_addr_t command_header_dma; /* corresponding physical address */
+
+ void *command; /* ptr to command table entry */
+
+ dma_addr_t command_dma; /* corresponding physical address */
+
+ void *comp_data; /* data passed to completion function comp_func() */
+ /*
+ * Completion function called by the ISR upon completion of
+ * a command.
+ */
+ void (*comp_func)(struct mtip_port *port,
+ int tag,
+ struct mtip_cmd *cmd,
+ int status);
+
+ int scatter_ents; /* Number of scatter list entries used */
+
+ int unaligned; /* command is unaligned on 4k boundary */
+
+ struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */
+
+ int retries; /* The number of retries left for this command. */
+
+ int direction; /* Data transfer direction */
+};
+
+/* Structure used to describe a port. */
+struct mtip_port {
+ /* Pointer back to the driver data for this port. */
+ struct driver_data *dd;
+ /*
+ * Used to determine if the data pointed to by the
+ * identify field is valid.
+ */
+ unsigned long identify_valid;
+ /* Base address of the memory mapped IO for the port. */
+ void __iomem *mmio;
+ /* Array of pointers to the memory mapped s_active registers. */
+ void __iomem *s_active[MTIP_MAX_SLOT_GROUPS];
+ /* Array of pointers to the memory mapped completed registers. */
+ void __iomem *completed[MTIP_MAX_SLOT_GROUPS];
+ /* Array of pointers to the memory mapped Command Issue registers. */
+ void __iomem *cmd_issue[MTIP_MAX_SLOT_GROUPS];
+ /*
+ * Pointer to the beginning of the command header memory as used
+ * by the driver.
+ */
+ void *command_list;
+ /*
+ * Pointer to the beginning of the command header memory as used
+ * by the DMA.
+ */
+ dma_addr_t command_list_dma;
+ /*
+ * Pointer to the beginning of the RX FIS memory as used
+ * by the driver.
+ */
+ void *rxfis;
+ /*
+ * Pointer to the beginning of the RX FIS memory as used
+ * by the DMA.
+ */
+ dma_addr_t rxfis_dma;
+ /*
+ * Pointer to the DMA region for RX Fis, Identify, RLE10, and SMART
+ */
+ void *block1;
+ /*
+ * DMA address of region for RX Fis, Identify, RLE10, and SMART
+ */
+ dma_addr_t block1_dma;
+ /*
+ * Pointer to the beginning of the identify data memory as used
+ * by the driver.
+ */
+ u16 *identify;
+ /*
+ * Pointer to the beginning of the identify data memory as used
+ * by the DMA.
+ */
+ dma_addr_t identify_dma;
+ /*
+ * Pointer to the beginning of a sector buffer that is used
+ * by the driver when issuing internal commands.
+ */
+ u16 *sector_buffer;
+ /*
+ * Pointer to the beginning of a sector buffer that is used
+ * by the DMA when the driver issues internal commands.
+ */
+ dma_addr_t sector_buffer_dma;
+ /*
+ * Bit significant, used to determine if a command slot has
+ * been allocated. i.e. the slot is in use. Bits are cleared
+ * when the command slot and all associated data structures
+ * are no longer needed.
+ */
+ u16 *log_buf;
+ dma_addr_t log_buf_dma;
+
+ u8 *smart_buf;
+ dma_addr_t smart_buf_dma;
+
+ unsigned long allocated[SLOTBITS_IN_LONGS];
+ /*
+ * used to queue commands when an internal command is in progress
+ * or error handling is active
+ */
+ unsigned long cmds_to_issue[SLOTBITS_IN_LONGS];
+ /* Used by mtip_service_thread to wait for an event */
+ wait_queue_head_t svc_wait;
+ /*
+ * indicates the state of the port. Also, helps the service thread
+ * to determine its action on wake up.
+ */
+ unsigned long flags;
+ /*
+ * Timer used to complete commands that have been active for too long.
+ */
+ unsigned long ic_pause_timer;
+
+ /* Semaphore to control queue depth of unaligned IOs */
+ struct semaphore cmd_slot_unal;
+
+ /* Spinlock for working around command-issue bug. */
+ spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS];
+};
+
+/*
+ * Driver private data structure.
+ *
+ * One structure is allocated per probed device.
+ */
+struct driver_data {
+ void __iomem *mmio; /* Base address of the HBA registers. */
+
+ int major; /* Major device number. */
+
+ int instance; /* Instance number. First device probed is 0, ... */
+
+ struct gendisk *disk; /* Pointer to our gendisk structure. */
+
+ struct pci_dev *pdev; /* Pointer to the PCI device structure. */
+
+ struct request_queue *queue; /* Our request queue. */
+
+ struct blk_mq_tag_set tags; /* blk_mq tags */
+
+ struct mtip_port *port; /* Pointer to the port data structure. */
+
+ unsigned product_type; /* magic value declaring the product type */
+
+ unsigned slot_groups; /* number of slot groups the product supports */
+
+ unsigned long index; /* Index to determine the disk name */
+
+ unsigned long dd_flag; /* NOTE: use atomic bit operations on this */
+
+ struct task_struct *mtip_svc_handler; /* task_struct of svc thd */
+
+ struct dentry *dfs_node;
+
+ bool trim_supp; /* flag indicating trim support */
+
+ bool sr;
+
+ int numa_node; /* NUMA support */
+
+ char workq_name[32];
+
+ struct workqueue_struct *isr_workq;
+
+ atomic_t irq_workers_active;
+
+ struct mtip_work work[MTIP_MAX_SLOT_GROUPS];
+
+ int isr_binding;
+
+ struct block_device *bdev;
+
+ struct list_head online_list; /* linkage for online list */
+
+ struct list_head remove_list; /* linkage for removing list */
+
+ int unal_qdepth; /* qdepth of unaligned IO queue */
+};
+
+#endif
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
new file mode 100644
index 000000000..39e5f7fae
--- /dev/null
+++ b/drivers/block/nbd.c
@@ -0,0 +1,896 @@
+/*
+ * Network block device - make block devices work over TCP
+ *
+ * Note that you can not swap over this thing, yet. Seems to work but
+ * deadlocks sometimes - you can not swap over TCP in general.
+ *
+ * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
+ * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
+ *
+ * This file is released under GPLv2 or later.
+ *
+ * (part of code stolen from loop.c)
+ */
+
+#include <linux/major.h>
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/ioctl.h>
+#include <linux/mutex.h>
+#include <linux/compiler.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+
+#include <asm/uaccess.h>
+#include <asm/types.h>
+
+#include <linux/nbd.h>
+
+struct nbd_device {
+ int flags;
+ int harderror; /* Code of hard error */
+ struct socket * sock; /* If == NULL, device is not ready, yet */
+ int magic;
+
+ spinlock_t queue_lock;
+ struct list_head queue_head; /* Requests waiting result */
+ struct request *active_req;
+ wait_queue_head_t active_wq;
+ struct list_head waiting_queue; /* Requests to be sent */
+ wait_queue_head_t waiting_wq;
+
+ struct mutex tx_lock;
+ struct gendisk *disk;
+ int blksize;
+ loff_t bytesize;
+ pid_t pid; /* pid of nbd-client, if attached */
+ int xmit_timeout;
+ int disconnect; /* a disconnect has been requested by user */
+};
+
+#define NBD_MAGIC 0x68797548
+
+static unsigned int nbds_max = 16;
+static struct nbd_device *nbd_dev;
+static int max_part;
+
+/*
+ * Use just one lock (or at most 1 per NIC). Two arguments for this:
+ * 1. Each NIC is essentially a synchronization point for all servers
+ * accessed through that NIC so there's no need to have more locks
+ * than NICs anyway.
+ * 2. More locks lead to more "Dirty cache line bouncing" which will slow
+ * down each lock to the point where they're actually slower than just
+ * a single lock.
+ * Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this!
+ */
+static DEFINE_SPINLOCK(nbd_lock);
+
+static inline struct device *nbd_to_dev(struct nbd_device *nbd)
+{
+ return disk_to_dev(nbd->disk);
+}
+
+static const char *nbdcmd_to_ascii(int cmd)
+{
+ switch (cmd) {
+ case NBD_CMD_READ: return "read";
+ case NBD_CMD_WRITE: return "write";
+ case NBD_CMD_DISC: return "disconnect";
+ case NBD_CMD_FLUSH: return "flush";
+ case NBD_CMD_TRIM: return "trim/discard";
+ }
+ return "invalid";
+}
+
+static void nbd_end_request(struct nbd_device *nbd, struct request *req)
+{
+ int error = req->errors ? -EIO : 0;
+ struct request_queue *q = req->q;
+ unsigned long flags;
+
+ dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", req,
+ error ? "failed" : "done");
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ __blk_end_request_all(req, error);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+/*
+ * Forcibly shutdown the socket causing all listeners to error
+ */
+static void sock_shutdown(struct nbd_device *nbd, int lock)
+{
+ if (lock)
+ mutex_lock(&nbd->tx_lock);
+ if (nbd->sock) {
+ dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
+ kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
+ nbd->sock = NULL;
+ }
+ if (lock)
+ mutex_unlock(&nbd->tx_lock);
+}
+
+static void nbd_xmit_timeout(unsigned long arg)
+{
+ struct task_struct *task = (struct task_struct *)arg;
+
+ printk(KERN_WARNING "nbd: killing hung xmit (%s, pid: %d)\n",
+ task->comm, task->pid);
+ force_sig(SIGKILL, task);
+}
+
+/*
+ * Send or receive packet.
+ */
+static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
+ int msg_flags)
+{
+ struct socket *sock = nbd->sock;
+ int result;
+ struct msghdr msg;
+ struct kvec iov;
+ sigset_t blocked, oldset;
+ unsigned long pflags = current->flags;
+
+ if (unlikely(!sock)) {
+ dev_err(disk_to_dev(nbd->disk),
+ "Attempted %s on closed socket in sock_xmit\n",
+ (send ? "send" : "recv"));
+ return -EINVAL;
+ }
+
+ /* Allow interception of SIGKILL only
+ * Don't allow other signals to interrupt the transmission */
+ siginitsetinv(&blocked, sigmask(SIGKILL));
+ sigprocmask(SIG_SETMASK, &blocked, &oldset);
+
+ current->flags |= PF_MEMALLOC;
+ do {
+ sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
+ iov.iov_base = buf;
+ iov.iov_len = size;
+ msg.msg_name = NULL;
+ msg.msg_namelen = 0;
+ msg.msg_control = NULL;
+ msg.msg_controllen = 0;
+ msg.msg_flags = msg_flags | MSG_NOSIGNAL;
+
+ if (send) {
+ struct timer_list ti;
+
+ if (nbd->xmit_timeout) {
+ init_timer(&ti);
+ ti.function = nbd_xmit_timeout;
+ ti.data = (unsigned long)current;
+ ti.expires = jiffies + nbd->xmit_timeout;
+ add_timer(&ti);
+ }
+ result = kernel_sendmsg(sock, &msg, &iov, 1, size);
+ if (nbd->xmit_timeout)
+ del_timer_sync(&ti);
+ } else
+ result = kernel_recvmsg(sock, &msg, &iov, 1, size,
+ msg.msg_flags);
+
+ if (signal_pending(current)) {
+ siginfo_t info;
+ printk(KERN_WARNING "nbd (pid %d: %s) got signal %d\n",
+ task_pid_nr(current), current->comm,
+ dequeue_signal_lock(current, &current->blocked, &info));
+ result = -EINTR;
+ sock_shutdown(nbd, !send);
+ break;
+ }
+
+ if (result <= 0) {
+ if (result == 0)
+ result = -EPIPE; /* short read */
+ break;
+ }
+ size -= result;
+ buf += result;
+ } while (size > 0);
+
+ sigprocmask(SIG_SETMASK, &oldset, NULL);
+ tsk_restore_flags(current, pflags, PF_MEMALLOC);
+
+ return result;
+}
+
+static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
+ int flags)
+{
+ int result;
+ void *kaddr = kmap(bvec->bv_page);
+ result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
+ bvec->bv_len, flags);
+ kunmap(bvec->bv_page);
+ return result;
+}
+
+/* always call with the tx_lock held */
+static int nbd_send_req(struct nbd_device *nbd, struct request *req)
+{
+ int result, flags;
+ struct nbd_request request;
+ unsigned long size = blk_rq_bytes(req);
+
+ memset(&request, 0, sizeof(request));
+ request.magic = htonl(NBD_REQUEST_MAGIC);
+ request.type = htonl(nbd_cmd(req));
+
+ if (nbd_cmd(req) != NBD_CMD_FLUSH && nbd_cmd(req) != NBD_CMD_DISC) {
+ request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
+ request.len = htonl(size);
+ }
+ memcpy(request.handle, &req, sizeof(req));
+
+ dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
+ req, nbdcmd_to_ascii(nbd_cmd(req)),
+ (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
+ result = sock_xmit(nbd, 1, &request, sizeof(request),
+ (nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0);
+ if (result <= 0) {
+ dev_err(disk_to_dev(nbd->disk),
+ "Send control failed (result %d)\n", result);
+ return -EIO;
+ }
+
+ if (nbd_cmd(req) == NBD_CMD_WRITE) {
+ struct req_iterator iter;
+ struct bio_vec bvec;
+ /*
+ * we are really probing at internals to determine
+ * whether to set MSG_MORE or not...
+ */
+ rq_for_each_segment(bvec, req, iter) {
+ flags = 0;
+ if (!rq_iter_last(bvec, iter))
+ flags = MSG_MORE;
+ dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
+ req, bvec.bv_len);
+ result = sock_send_bvec(nbd, &bvec, flags);
+ if (result <= 0) {
+ dev_err(disk_to_dev(nbd->disk),
+ "Send data failed (result %d)\n",
+ result);
+ return -EIO;
+ }
+ }
+ }
+ return 0;
+}
+
+static struct request *nbd_find_request(struct nbd_device *nbd,
+ struct request *xreq)
+{
+ struct request *req, *tmp;
+ int err;
+
+ err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq);
+ if (unlikely(err))
+ return ERR_PTR(err);
+
+ spin_lock(&nbd->queue_lock);
+ list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) {
+ if (req != xreq)
+ continue;
+ list_del_init(&req->queuelist);
+ spin_unlock(&nbd->queue_lock);
+ return req;
+ }
+ spin_unlock(&nbd->queue_lock);
+
+ return ERR_PTR(-ENOENT);
+}
+
+static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
+{
+ int result;
+ void *kaddr = kmap(bvec->bv_page);
+ result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
+ MSG_WAITALL);
+ kunmap(bvec->bv_page);
+ return result;
+}
+
+/* NULL returned = something went wrong, inform userspace */
+static struct request *nbd_read_stat(struct nbd_device *nbd)
+{
+ int result;
+ struct nbd_reply reply;
+ struct request *req;
+
+ reply.magic = 0;
+ result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
+ if (result <= 0) {
+ dev_err(disk_to_dev(nbd->disk),
+ "Receive control failed (result %d)\n", result);
+ goto harderror;
+ }
+
+ if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
+ dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
+ (unsigned long)ntohl(reply.magic));
+ result = -EPROTO;
+ goto harderror;
+ }
+
+ req = nbd_find_request(nbd, *(struct request **)reply.handle);
+ if (IS_ERR(req)) {
+ result = PTR_ERR(req);
+ if (result != -ENOENT)
+ goto harderror;
+
+ dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
+ reply.handle);
+ result = -EBADR;
+ goto harderror;
+ }
+
+ if (ntohl(reply.error)) {
+ dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
+ ntohl(reply.error));
+ req->errors++;
+ return req;
+ }
+
+ dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
+ if (nbd_cmd(req) == NBD_CMD_READ) {
+ struct req_iterator iter;
+ struct bio_vec bvec;
+
+ rq_for_each_segment(bvec, req, iter) {
+ result = sock_recv_bvec(nbd, &bvec);
+ if (result <= 0) {
+ dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
+ result);
+ req->errors++;
+ return req;
+ }
+ dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
+ req, bvec.bv_len);
+ }
+ }
+ return req;
+harderror:
+ nbd->harderror = result;
+ return NULL;
+}
+
+static ssize_t pid_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ return sprintf(buf, "%ld\n",
+ (long) ((struct nbd_device *)disk->private_data)->pid);
+}
+
+static struct device_attribute pid_attr = {
+ .attr = { .name = "pid", .mode = S_IRUGO},
+ .show = pid_show,
+};
+
+static int nbd_do_it(struct nbd_device *nbd)
+{
+ struct request *req;
+ int ret;
+
+ BUG_ON(nbd->magic != NBD_MAGIC);
+
+ sk_set_memalloc(nbd->sock->sk);
+ nbd->pid = task_pid_nr(current);
+ ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
+ if (ret) {
+ dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
+ nbd->pid = 0;
+ return ret;
+ }
+
+ while ((req = nbd_read_stat(nbd)) != NULL)
+ nbd_end_request(nbd, req);
+
+ device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
+ nbd->pid = 0;
+ return 0;
+}
+
+static void nbd_clear_que(struct nbd_device *nbd)
+{
+ struct request *req;
+
+ BUG_ON(nbd->magic != NBD_MAGIC);
+
+ /*
+ * Because we have set nbd->sock to NULL under the tx_lock, all
+ * modifications to the list must have completed by now. For
+ * the same reason, the active_req must be NULL.
+ *
+ * As a consequence, we don't need to take the spin lock while
+ * purging the list here.
+ */
+ BUG_ON(nbd->sock);
+ BUG_ON(nbd->active_req);
+
+ while (!list_empty(&nbd->queue_head)) {
+ req = list_entry(nbd->queue_head.next, struct request,
+ queuelist);
+ list_del_init(&req->queuelist);
+ req->errors++;
+ nbd_end_request(nbd, req);
+ }
+
+ while (!list_empty(&nbd->waiting_queue)) {
+ req = list_entry(nbd->waiting_queue.next, struct request,
+ queuelist);
+ list_del_init(&req->queuelist);
+ req->errors++;
+ nbd_end_request(nbd, req);
+ }
+}
+
+
+static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
+{
+ if (req->cmd_type != REQ_TYPE_FS)
+ goto error_out;
+
+ nbd_cmd(req) = NBD_CMD_READ;
+ if (rq_data_dir(req) == WRITE) {
+ if ((req->cmd_flags & REQ_DISCARD)) {
+ WARN_ON(!(nbd->flags & NBD_FLAG_SEND_TRIM));
+ nbd_cmd(req) = NBD_CMD_TRIM;
+ } else
+ nbd_cmd(req) = NBD_CMD_WRITE;
+ if (nbd->flags & NBD_FLAG_READ_ONLY) {
+ dev_err(disk_to_dev(nbd->disk),
+ "Write on read-only\n");
+ goto error_out;
+ }
+ }
+
+ if (req->cmd_flags & REQ_FLUSH) {
+ BUG_ON(unlikely(blk_rq_sectors(req)));
+ nbd_cmd(req) = NBD_CMD_FLUSH;
+ }
+
+ req->errors = 0;
+
+ mutex_lock(&nbd->tx_lock);
+ if (unlikely(!nbd->sock)) {
+ mutex_unlock(&nbd->tx_lock);
+ dev_err(disk_to_dev(nbd->disk),
+ "Attempted send on closed socket\n");
+ goto error_out;
+ }
+
+ nbd->active_req = req;
+
+ if (nbd_send_req(nbd, req) != 0) {
+ dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
+ req->errors++;
+ nbd_end_request(nbd, req);
+ } else {
+ spin_lock(&nbd->queue_lock);
+ list_add_tail(&req->queuelist, &nbd->queue_head);
+ spin_unlock(&nbd->queue_lock);
+ }
+
+ nbd->active_req = NULL;
+ mutex_unlock(&nbd->tx_lock);
+ wake_up_all(&nbd->active_wq);
+
+ return;
+
+error_out:
+ req->errors++;
+ nbd_end_request(nbd, req);
+}
+
+static int nbd_thread(void *data)
+{
+ struct nbd_device *nbd = data;
+ struct request *req;
+
+ set_user_nice(current, MIN_NICE);
+ while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
+ /* wait for something to do */
+ wait_event_interruptible(nbd->waiting_wq,
+ kthread_should_stop() ||
+ !list_empty(&nbd->waiting_queue));
+
+ /* extract request */
+ if (list_empty(&nbd->waiting_queue))
+ continue;
+
+ spin_lock_irq(&nbd->queue_lock);
+ req = list_entry(nbd->waiting_queue.next, struct request,
+ queuelist);
+ list_del_init(&req->queuelist);
+ spin_unlock_irq(&nbd->queue_lock);
+
+ /* handle request */
+ nbd_handle_req(nbd, req);
+ }
+ return 0;
+}
+
+/*
+ * We always wait for result of write, for now. It would be nice to make it optional
+ * in future
+ * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK))
+ * { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
+ */
+
+static void do_nbd_request(struct request_queue *q)
+ __releases(q->queue_lock) __acquires(q->queue_lock)
+{
+ struct request *req;
+
+ while ((req = blk_fetch_request(q)) != NULL) {
+ struct nbd_device *nbd;
+
+ spin_unlock_irq(q->queue_lock);
+
+ nbd = req->rq_disk->private_data;
+
+ BUG_ON(nbd->magic != NBD_MAGIC);
+
+ dev_dbg(nbd_to_dev(nbd), "request %p: dequeued (flags=%x)\n",
+ req, req->cmd_type);
+
+ if (unlikely(!nbd->sock)) {
+ dev_err(disk_to_dev(nbd->disk),
+ "Attempted send on closed socket\n");
+ req->errors++;
+ nbd_end_request(nbd, req);
+ spin_lock_irq(q->queue_lock);
+ continue;
+ }
+
+ spin_lock_irq(&nbd->queue_lock);
+ list_add_tail(&req->queuelist, &nbd->waiting_queue);
+ spin_unlock_irq(&nbd->queue_lock);
+
+ wake_up(&nbd->waiting_wq);
+
+ spin_lock_irq(q->queue_lock);
+ }
+}
+
+/* Must be called with tx_lock held */
+
+static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
+ unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case NBD_DISCONNECT: {
+ struct request sreq;
+
+ dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
+ if (!nbd->sock)
+ return -EINVAL;
+
+ mutex_unlock(&nbd->tx_lock);
+ fsync_bdev(bdev);
+ mutex_lock(&nbd->tx_lock);
+ blk_rq_init(NULL, &sreq);
+ sreq.cmd_type = REQ_TYPE_SPECIAL;
+ nbd_cmd(&sreq) = NBD_CMD_DISC;
+
+ /* Check again after getting mutex back. */
+ if (!nbd->sock)
+ return -EINVAL;
+
+ nbd->disconnect = 1;
+
+ nbd_send_req(nbd, &sreq);
+ return 0;
+ }
+
+ case NBD_CLEAR_SOCK: {
+ struct socket *sock = nbd->sock;
+ nbd->sock = NULL;
+ nbd_clear_que(nbd);
+ BUG_ON(!list_empty(&nbd->queue_head));
+ BUG_ON(!list_empty(&nbd->waiting_queue));
+ kill_bdev(bdev);
+ if (sock)
+ sockfd_put(sock);
+ return 0;
+ }
+
+ case NBD_SET_SOCK: {
+ struct socket *sock;
+ int err;
+ if (nbd->sock)
+ return -EBUSY;
+ sock = sockfd_lookup(arg, &err);
+ if (sock) {
+ nbd->sock = sock;
+ if (max_part > 0)
+ bdev->bd_invalidated = 1;
+ nbd->disconnect = 0; /* we're connected now */
+ return 0;
+ }
+ return -EINVAL;
+ }
+
+ case NBD_SET_BLKSIZE:
+ nbd->blksize = arg;
+ nbd->bytesize &= ~(nbd->blksize-1);
+ bdev->bd_inode->i_size = nbd->bytesize;
+ set_blocksize(bdev, nbd->blksize);
+ set_capacity(nbd->disk, nbd->bytesize >> 9);
+ return 0;
+
+ case NBD_SET_SIZE:
+ nbd->bytesize = arg & ~(nbd->blksize-1);
+ bdev->bd_inode->i_size = nbd->bytesize;
+ set_blocksize(bdev, nbd->blksize);
+ set_capacity(nbd->disk, nbd->bytesize >> 9);
+ return 0;
+
+ case NBD_SET_TIMEOUT:
+ nbd->xmit_timeout = arg * HZ;
+ return 0;
+
+ case NBD_SET_FLAGS:
+ nbd->flags = arg;
+ return 0;
+
+ case NBD_SET_SIZE_BLOCKS:
+ nbd->bytesize = ((u64) arg) * nbd->blksize;
+ bdev->bd_inode->i_size = nbd->bytesize;
+ set_blocksize(bdev, nbd->blksize);
+ set_capacity(nbd->disk, nbd->bytesize >> 9);
+ return 0;
+
+ case NBD_DO_IT: {
+ struct task_struct *thread;
+ struct socket *sock;
+ int error;
+
+ if (nbd->pid)
+ return -EBUSY;
+ if (!nbd->sock)
+ return -EINVAL;
+
+ mutex_unlock(&nbd->tx_lock);
+
+ if (nbd->flags & NBD_FLAG_READ_ONLY)
+ set_device_ro(bdev, true);
+ if (nbd->flags & NBD_FLAG_SEND_TRIM)
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
+ nbd->disk->queue);
+ if (nbd->flags & NBD_FLAG_SEND_FLUSH)
+ blk_queue_flush(nbd->disk->queue, REQ_FLUSH);
+ else
+ blk_queue_flush(nbd->disk->queue, 0);
+
+ thread = kthread_run(nbd_thread, nbd, "%s",
+ nbd->disk->disk_name);
+ if (IS_ERR(thread)) {
+ mutex_lock(&nbd->tx_lock);
+ return PTR_ERR(thread);
+ }
+
+ error = nbd_do_it(nbd);
+ kthread_stop(thread);
+
+ mutex_lock(&nbd->tx_lock);
+ if (error)
+ return error;
+ sock_shutdown(nbd, 0);
+ sock = nbd->sock;
+ nbd->sock = NULL;
+ nbd_clear_que(nbd);
+ dev_warn(disk_to_dev(nbd->disk), "queue cleared\n");
+ kill_bdev(bdev);
+ queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
+ set_device_ro(bdev, false);
+ if (sock)
+ sockfd_put(sock);
+ nbd->flags = 0;
+ nbd->bytesize = 0;
+ bdev->bd_inode->i_size = 0;
+ set_capacity(nbd->disk, 0);
+ if (max_part > 0)
+ ioctl_by_bdev(bdev, BLKRRPART, 0);
+ if (nbd->disconnect) /* user requested, ignore socket errors */
+ return 0;
+ return nbd->harderror;
+ }
+
+ case NBD_CLEAR_QUE:
+ /*
+ * This is for compatibility only. The queue is always cleared
+ * by NBD_DO_IT or NBD_CLEAR_SOCK.
+ */
+ return 0;
+
+ case NBD_PRINT_DEBUG:
+ dev_info(disk_to_dev(nbd->disk),
+ "next = %p, prev = %p, head = %p\n",
+ nbd->queue_head.next, nbd->queue_head.prev,
+ &nbd->queue_head);
+ return 0;
+ }
+ return -ENOTTY;
+}
+
+static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct nbd_device *nbd = bdev->bd_disk->private_data;
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ BUG_ON(nbd->magic != NBD_MAGIC);
+
+ mutex_lock(&nbd->tx_lock);
+ error = __nbd_ioctl(bdev, nbd, cmd, arg);
+ mutex_unlock(&nbd->tx_lock);
+
+ return error;
+}
+
+static const struct block_device_operations nbd_fops =
+{
+ .owner = THIS_MODULE,
+ .ioctl = nbd_ioctl,
+};
+
+/*
+ * And here should be modules and kernel interface
+ * (Just smiley confuses emacs :-)
+ */
+
+static int __init nbd_init(void)
+{
+ int err = -ENOMEM;
+ int i;
+ int part_shift;
+
+ BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
+
+ if (max_part < 0) {
+ printk(KERN_ERR "nbd: max_part must be >= 0\n");
+ return -EINVAL;
+ }
+
+ part_shift = 0;
+ if (max_part > 0) {
+ part_shift = fls(max_part);
+
+ /*
+ * Adjust max_part according to part_shift as it is exported
+ * to user space so that user can know the max number of
+ * partition kernel should be able to manage.
+ *
+ * Note that -1 is required because partition 0 is reserved
+ * for the whole disk.
+ */
+ max_part = (1UL << part_shift) - 1;
+ }
+
+ if ((1UL << part_shift) > DISK_MAX_PARTS)
+ return -EINVAL;
+
+ if (nbds_max > 1UL << (MINORBITS - part_shift))
+ return -EINVAL;
+
+ nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL);
+ if (!nbd_dev)
+ return -ENOMEM;
+
+ for (i = 0; i < nbds_max; i++) {
+ struct gendisk *disk = alloc_disk(1 << part_shift);
+ if (!disk)
+ goto out;
+ nbd_dev[i].disk = disk;
+ /*
+ * The new linux 2.5 block layer implementation requires
+ * every gendisk to have its very own request_queue struct.
+ * These structs are big so we dynamically allocate them.
+ */
+ disk->queue = blk_init_queue(do_nbd_request, &nbd_lock);
+ if (!disk->queue) {
+ put_disk(disk);
+ goto out;
+ }
+ /*
+ * Tell the block layer that we are not a rotational device
+ */
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
+ queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
+ disk->queue->limits.discard_granularity = 512;
+ disk->queue->limits.max_discard_sectors = UINT_MAX;
+ disk->queue->limits.discard_zeroes_data = 0;
+ blk_queue_max_hw_sectors(disk->queue, 65536);
+ disk->queue->limits.max_sectors = 256;
+ }
+
+ if (register_blkdev(NBD_MAJOR, "nbd")) {
+ err = -EIO;
+ goto out;
+ }
+
+ printk(KERN_INFO "nbd: registered device at major %d\n", NBD_MAJOR);
+
+ for (i = 0; i < nbds_max; i++) {
+ struct gendisk *disk = nbd_dev[i].disk;
+ nbd_dev[i].magic = NBD_MAGIC;
+ INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
+ spin_lock_init(&nbd_dev[i].queue_lock);
+ INIT_LIST_HEAD(&nbd_dev[i].queue_head);
+ mutex_init(&nbd_dev[i].tx_lock);
+ init_waitqueue_head(&nbd_dev[i].active_wq);
+ init_waitqueue_head(&nbd_dev[i].waiting_wq);
+ nbd_dev[i].blksize = 1024;
+ nbd_dev[i].bytesize = 0;
+ disk->major = NBD_MAJOR;
+ disk->first_minor = i << part_shift;
+ disk->fops = &nbd_fops;
+ disk->private_data = &nbd_dev[i];
+ sprintf(disk->disk_name, "nbd%d", i);
+ set_capacity(disk, 0);
+ add_disk(disk);
+ }
+
+ return 0;
+out:
+ while (i--) {
+ blk_cleanup_queue(nbd_dev[i].disk->queue);
+ put_disk(nbd_dev[i].disk);
+ }
+ kfree(nbd_dev);
+ return err;
+}
+
+static void __exit nbd_cleanup(void)
+{
+ int i;
+ for (i = 0; i < nbds_max; i++) {
+ struct gendisk *disk = nbd_dev[i].disk;
+ nbd_dev[i].magic = 0;
+ if (disk) {
+ del_gendisk(disk);
+ blk_cleanup_queue(disk->queue);
+ put_disk(disk);
+ }
+ }
+ unregister_blkdev(NBD_MAJOR, "nbd");
+ kfree(nbd_dev);
+ printk(KERN_INFO "nbd: unregistered device at major %d\n", NBD_MAJOR);
+}
+
+module_init(nbd_init);
+module_exit(nbd_cleanup);
+
+MODULE_DESCRIPTION("Network Block Device");
+MODULE_LICENSE("GPL");
+
+module_param(nbds_max, int, 0444);
+MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
+module_param(max_part, int, 0444);
+MODULE_PARM_DESC(max_part, "number of partitions per device (default: 0)");
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
new file mode 100644
index 000000000..65cd61a41
--- /dev/null
+++ b/drivers/block/null_blk.c
@@ -0,0 +1,674 @@
+#include <linux/module.h>
+
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/blk-mq.h>
+#include <linux/hrtimer.h>
+
+struct nullb_cmd {
+ struct list_head list;
+ struct llist_node ll_list;
+ struct call_single_data csd;
+ struct request *rq;
+ struct bio *bio;
+ unsigned int tag;
+ struct nullb_queue *nq;
+};
+
+struct nullb_queue {
+ unsigned long *tag_map;
+ wait_queue_head_t wait;
+ unsigned int queue_depth;
+
+ struct nullb_cmd *cmds;
+};
+
+struct nullb {
+ struct list_head list;
+ unsigned int index;
+ struct request_queue *q;
+ struct gendisk *disk;
+ struct blk_mq_tag_set tag_set;
+ struct hrtimer timer;
+ unsigned int queue_depth;
+ spinlock_t lock;
+
+ struct nullb_queue *queues;
+ unsigned int nr_queues;
+};
+
+static LIST_HEAD(nullb_list);
+static struct mutex lock;
+static int null_major;
+static int nullb_indexes;
+
+struct completion_queue {
+ struct llist_head list;
+ struct hrtimer timer;
+};
+
+/*
+ * These are per-cpu for now, they will need to be configured by the
+ * complete_queues parameter and appropriately mapped.
+ */
+static DEFINE_PER_CPU(struct completion_queue, completion_queues);
+
+enum {
+ NULL_IRQ_NONE = 0,
+ NULL_IRQ_SOFTIRQ = 1,
+ NULL_IRQ_TIMER = 2,
+};
+
+enum {
+ NULL_Q_BIO = 0,
+ NULL_Q_RQ = 1,
+ NULL_Q_MQ = 2,
+};
+
+static int submit_queues;
+module_param(submit_queues, int, S_IRUGO);
+MODULE_PARM_DESC(submit_queues, "Number of submission queues");
+
+static int home_node = NUMA_NO_NODE;
+module_param(home_node, int, S_IRUGO);
+MODULE_PARM_DESC(home_node, "Home node for the device");
+
+static int queue_mode = NULL_Q_MQ;
+
+static int null_param_store_val(const char *str, int *val, int min, int max)
+{
+ int ret, new_val;
+
+ ret = kstrtoint(str, 10, &new_val);
+ if (ret)
+ return -EINVAL;
+
+ if (new_val < min || new_val > max)
+ return -EINVAL;
+
+ *val = new_val;
+ return 0;
+}
+
+static int null_set_queue_mode(const char *str, const struct kernel_param *kp)
+{
+ return null_param_store_val(str, &queue_mode, NULL_Q_BIO, NULL_Q_MQ);
+}
+
+static struct kernel_param_ops null_queue_mode_param_ops = {
+ .set = null_set_queue_mode,
+ .get = param_get_int,
+};
+
+device_param_cb(queue_mode, &null_queue_mode_param_ops, &queue_mode, S_IRUGO);
+MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");
+
+static int gb = 250;
+module_param(gb, int, S_IRUGO);
+MODULE_PARM_DESC(gb, "Size in GB");
+
+static int bs = 512;
+module_param(bs, int, S_IRUGO);
+MODULE_PARM_DESC(bs, "Block size (in bytes)");
+
+static int nr_devices = 2;
+module_param(nr_devices, int, S_IRUGO);
+MODULE_PARM_DESC(nr_devices, "Number of devices to register");
+
+static int irqmode = NULL_IRQ_SOFTIRQ;
+
+static int null_set_irqmode(const char *str, const struct kernel_param *kp)
+{
+ return null_param_store_val(str, &irqmode, NULL_IRQ_NONE,
+ NULL_IRQ_TIMER);
+}
+
+static struct kernel_param_ops null_irqmode_param_ops = {
+ .set = null_set_irqmode,
+ .get = param_get_int,
+};
+
+device_param_cb(irqmode, &null_irqmode_param_ops, &irqmode, S_IRUGO);
+MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");
+
+static int completion_nsec = 10000;
+module_param(completion_nsec, int, S_IRUGO);
+MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");
+
+static int hw_queue_depth = 64;
+module_param(hw_queue_depth, int, S_IRUGO);
+MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");
+
+static bool use_per_node_hctx = false;
+module_param(use_per_node_hctx, bool, S_IRUGO);
+MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
+
+static void put_tag(struct nullb_queue *nq, unsigned int tag)
+{
+ clear_bit_unlock(tag, nq->tag_map);
+
+ if (waitqueue_active(&nq->wait))
+ wake_up(&nq->wait);
+}
+
+static unsigned int get_tag(struct nullb_queue *nq)
+{
+ unsigned int tag;
+
+ do {
+ tag = find_first_zero_bit(nq->tag_map, nq->queue_depth);
+ if (tag >= nq->queue_depth)
+ return -1U;
+ } while (test_and_set_bit_lock(tag, nq->tag_map));
+
+ return tag;
+}
+
+static void free_cmd(struct nullb_cmd *cmd)
+{
+ put_tag(cmd->nq, cmd->tag);
+}
+
+static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq)
+{
+ struct nullb_cmd *cmd;
+ unsigned int tag;
+
+ tag = get_tag(nq);
+ if (tag != -1U) {
+ cmd = &nq->cmds[tag];
+ cmd->tag = tag;
+ cmd->nq = nq;
+ return cmd;
+ }
+
+ return NULL;
+}
+
+static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, int can_wait)
+{
+ struct nullb_cmd *cmd;
+ DEFINE_WAIT(wait);
+
+ cmd = __alloc_cmd(nq);
+ if (cmd || !can_wait)
+ return cmd;
+
+ do {
+ prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE);
+ cmd = __alloc_cmd(nq);
+ if (cmd)
+ break;
+
+ io_schedule();
+ } while (1);
+
+ finish_wait(&nq->wait, &wait);
+ return cmd;
+}
+
+static void end_cmd(struct nullb_cmd *cmd)
+{
+ switch (queue_mode) {
+ case NULL_Q_MQ:
+ blk_mq_end_request(cmd->rq, 0);
+ return;
+ case NULL_Q_RQ:
+ INIT_LIST_HEAD(&cmd->rq->queuelist);
+ blk_end_request_all(cmd->rq, 0);
+ break;
+ case NULL_Q_BIO:
+ bio_endio(cmd->bio, 0);
+ break;
+ }
+
+ free_cmd(cmd);
+}
+
+static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
+{
+ struct completion_queue *cq;
+ struct llist_node *entry;
+ struct nullb_cmd *cmd;
+
+ cq = &per_cpu(completion_queues, smp_processor_id());
+
+ while ((entry = llist_del_all(&cq->list)) != NULL) {
+ entry = llist_reverse_order(entry);
+ do {
+ cmd = container_of(entry, struct nullb_cmd, ll_list);
+ entry = entry->next;
+ end_cmd(cmd);
+ } while (entry);
+ }
+
+ return HRTIMER_NORESTART;
+}
+
+static void null_cmd_end_timer(struct nullb_cmd *cmd)
+{
+ struct completion_queue *cq = &per_cpu(completion_queues, get_cpu());
+
+ cmd->ll_list.next = NULL;
+ if (llist_add(&cmd->ll_list, &cq->list)) {
+ ktime_t kt = ktime_set(0, completion_nsec);
+
+ hrtimer_start(&cq->timer, kt, HRTIMER_MODE_REL);
+ }
+
+ put_cpu();
+}
+
+static void null_softirq_done_fn(struct request *rq)
+{
+ if (queue_mode == NULL_Q_MQ)
+ end_cmd(blk_mq_rq_to_pdu(rq));
+ else
+ end_cmd(rq->special);
+}
+
+static inline void null_handle_cmd(struct nullb_cmd *cmd)
+{
+ /* Complete IO by inline, softirq or timer */
+ switch (irqmode) {
+ case NULL_IRQ_SOFTIRQ:
+ switch (queue_mode) {
+ case NULL_Q_MQ:
+ blk_mq_complete_request(cmd->rq);
+ break;
+ case NULL_Q_RQ:
+ blk_complete_request(cmd->rq);
+ break;
+ case NULL_Q_BIO:
+ /*
+ * XXX: no proper submitting cpu information available.
+ */
+ end_cmd(cmd);
+ break;
+ }
+ break;
+ case NULL_IRQ_NONE:
+ end_cmd(cmd);
+ break;
+ case NULL_IRQ_TIMER:
+ null_cmd_end_timer(cmd);
+ break;
+ }
+}
+
+static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
+{
+ int index = 0;
+
+ if (nullb->nr_queues != 1)
+ index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues);
+
+ return &nullb->queues[index];
+}
+
+static void null_queue_bio(struct request_queue *q, struct bio *bio)
+{
+ struct nullb *nullb = q->queuedata;
+ struct nullb_queue *nq = nullb_to_queue(nullb);
+ struct nullb_cmd *cmd;
+
+ cmd = alloc_cmd(nq, 1);
+ cmd->bio = bio;
+
+ null_handle_cmd(cmd);
+}
+
+static int null_rq_prep_fn(struct request_queue *q, struct request *req)
+{
+ struct nullb *nullb = q->queuedata;
+ struct nullb_queue *nq = nullb_to_queue(nullb);
+ struct nullb_cmd *cmd;
+
+ cmd = alloc_cmd(nq, 0);
+ if (cmd) {
+ cmd->rq = req;
+ req->special = cmd;
+ return BLKPREP_OK;
+ }
+
+ return BLKPREP_DEFER;
+}
+
+static void null_request_fn(struct request_queue *q)
+{
+ struct request *rq;
+
+ while ((rq = blk_fetch_request(q)) != NULL) {
+ struct nullb_cmd *cmd = rq->special;
+
+ spin_unlock_irq(q->queue_lock);
+ null_handle_cmd(cmd);
+ spin_lock_irq(q->queue_lock);
+ }
+}
+
+static int null_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+
+ cmd->rq = bd->rq;
+ cmd->nq = hctx->driver_data;
+
+ blk_mq_start_request(bd->rq);
+
+ null_handle_cmd(cmd);
+ return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
+{
+ BUG_ON(!nullb);
+ BUG_ON(!nq);
+
+ init_waitqueue_head(&nq->wait);
+ nq->queue_depth = nullb->queue_depth;
+}
+
+static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int index)
+{
+ struct nullb *nullb = data;
+ struct nullb_queue *nq = &nullb->queues[index];
+
+ hctx->driver_data = nq;
+ null_init_queue(nullb, nq);
+ nullb->nr_queues++;
+
+ return 0;
+}
+
+static struct blk_mq_ops null_mq_ops = {
+ .queue_rq = null_queue_rq,
+ .map_queue = blk_mq_map_queue,
+ .init_hctx = null_init_hctx,
+ .complete = null_softirq_done_fn,
+};
+
+static void null_del_dev(struct nullb *nullb)
+{
+ list_del_init(&nullb->list);
+
+ del_gendisk(nullb->disk);
+ blk_cleanup_queue(nullb->q);
+ if (queue_mode == NULL_Q_MQ)
+ blk_mq_free_tag_set(&nullb->tag_set);
+ put_disk(nullb->disk);
+ kfree(nullb);
+}
+
+static int null_open(struct block_device *bdev, fmode_t mode)
+{
+ return 0;
+}
+
+static void null_release(struct gendisk *disk, fmode_t mode)
+{
+}
+
+static const struct block_device_operations null_fops = {
+ .owner = THIS_MODULE,
+ .open = null_open,
+ .release = null_release,
+};
+
+static int setup_commands(struct nullb_queue *nq)
+{
+ struct nullb_cmd *cmd;
+ int i, tag_size;
+
+ nq->cmds = kzalloc(nq->queue_depth * sizeof(*cmd), GFP_KERNEL);
+ if (!nq->cmds)
+ return -ENOMEM;
+
+ tag_size = ALIGN(nq->queue_depth, BITS_PER_LONG) / BITS_PER_LONG;
+ nq->tag_map = kzalloc(tag_size * sizeof(unsigned long), GFP_KERNEL);
+ if (!nq->tag_map) {
+ kfree(nq->cmds);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < nq->queue_depth; i++) {
+ cmd = &nq->cmds[i];
+ INIT_LIST_HEAD(&cmd->list);
+ cmd->ll_list.next = NULL;
+ cmd->tag = -1U;
+ }
+
+ return 0;
+}
+
+static void cleanup_queue(struct nullb_queue *nq)
+{
+ kfree(nq->tag_map);
+ kfree(nq->cmds);
+}
+
+static void cleanup_queues(struct nullb *nullb)
+{
+ int i;
+
+ for (i = 0; i < nullb->nr_queues; i++)
+ cleanup_queue(&nullb->queues[i]);
+
+ kfree(nullb->queues);
+}
+
+static int setup_queues(struct nullb *nullb)
+{
+ nullb->queues = kzalloc(submit_queues * sizeof(struct nullb_queue),
+ GFP_KERNEL);
+ if (!nullb->queues)
+ return -ENOMEM;
+
+ nullb->nr_queues = 0;
+ nullb->queue_depth = hw_queue_depth;
+
+ return 0;
+}
+
+static int init_driver_queues(struct nullb *nullb)
+{
+ struct nullb_queue *nq;
+ int i, ret = 0;
+
+ for (i = 0; i < submit_queues; i++) {
+ nq = &nullb->queues[i];
+
+ null_init_queue(nullb, nq);
+
+ ret = setup_commands(nq);
+ if (ret)
+ return ret;
+ nullb->nr_queues++;
+ }
+ return 0;
+}
+
+static int null_add_dev(void)
+{
+ struct gendisk *disk;
+ struct nullb *nullb;
+ sector_t size;
+ int rv;
+
+ nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node);
+ if (!nullb) {
+ rv = -ENOMEM;
+ goto out;
+ }
+
+ spin_lock_init(&nullb->lock);
+
+ if (queue_mode == NULL_Q_MQ && use_per_node_hctx)
+ submit_queues = nr_online_nodes;
+
+ rv = setup_queues(nullb);
+ if (rv)
+ goto out_free_nullb;
+
+ if (queue_mode == NULL_Q_MQ) {
+ nullb->tag_set.ops = &null_mq_ops;
+ nullb->tag_set.nr_hw_queues = submit_queues;
+ nullb->tag_set.queue_depth = hw_queue_depth;
+ nullb->tag_set.numa_node = home_node;
+ nullb->tag_set.cmd_size = sizeof(struct nullb_cmd);
+ nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+ nullb->tag_set.driver_data = nullb;
+
+ rv = blk_mq_alloc_tag_set(&nullb->tag_set);
+ if (rv)
+ goto out_cleanup_queues;
+
+ nullb->q = blk_mq_init_queue(&nullb->tag_set);
+ if (IS_ERR(nullb->q)) {
+ rv = -ENOMEM;
+ goto out_cleanup_tags;
+ }
+ } else if (queue_mode == NULL_Q_BIO) {
+ nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node);
+ if (!nullb->q) {
+ rv = -ENOMEM;
+ goto out_cleanup_queues;
+ }
+ blk_queue_make_request(nullb->q, null_queue_bio);
+ rv = init_driver_queues(nullb);
+ if (rv)
+ goto out_cleanup_blk_queue;
+ } else {
+ nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node);
+ if (!nullb->q) {
+ rv = -ENOMEM;
+ goto out_cleanup_queues;
+ }
+ blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
+ blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
+ rv = init_driver_queues(nullb);
+ if (rv)
+ goto out_cleanup_blk_queue;
+ }
+
+ nullb->q->queuedata = nullb;
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);
+ queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, nullb->q);
+
+ disk = nullb->disk = alloc_disk_node(1, home_node);
+ if (!disk) {
+ rv = -ENOMEM;
+ goto out_cleanup_blk_queue;
+ }
+
+ mutex_lock(&lock);
+ list_add_tail(&nullb->list, &nullb_list);
+ nullb->index = nullb_indexes++;
+ mutex_unlock(&lock);
+
+ blk_queue_logical_block_size(nullb->q, bs);
+ blk_queue_physical_block_size(nullb->q, bs);
+
+ size = gb * 1024 * 1024 * 1024ULL;
+ sector_div(size, bs);
+ set_capacity(disk, size);
+
+ disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
+ disk->major = null_major;
+ disk->first_minor = nullb->index;
+ disk->fops = &null_fops;
+ disk->private_data = nullb;
+ disk->queue = nullb->q;
+ sprintf(disk->disk_name, "nullb%d", nullb->index);
+ add_disk(disk);
+ return 0;
+
+out_cleanup_blk_queue:
+ blk_cleanup_queue(nullb->q);
+out_cleanup_tags:
+ if (queue_mode == NULL_Q_MQ)
+ blk_mq_free_tag_set(&nullb->tag_set);
+out_cleanup_queues:
+ cleanup_queues(nullb);
+out_free_nullb:
+ kfree(nullb);
+out:
+ return rv;
+}
+
+static int __init null_init(void)
+{
+ unsigned int i;
+
+ if (bs > PAGE_SIZE) {
+ pr_warn("null_blk: invalid block size\n");
+ pr_warn("null_blk: defaults block size to %lu\n", PAGE_SIZE);
+ bs = PAGE_SIZE;
+ }
+
+ if (queue_mode == NULL_Q_MQ && use_per_node_hctx) {
+ if (submit_queues < nr_online_nodes) {
+ pr_warn("null_blk: submit_queues param is set to %u.",
+ nr_online_nodes);
+ submit_queues = nr_online_nodes;
+ }
+ } else if (submit_queues > nr_cpu_ids)
+ submit_queues = nr_cpu_ids;
+ else if (!submit_queues)
+ submit_queues = 1;
+
+ mutex_init(&lock);
+
+ /* Initialize a separate list for each CPU for issuing softirqs */
+ for_each_possible_cpu(i) {
+ struct completion_queue *cq = &per_cpu(completion_queues, i);
+
+ init_llist_head(&cq->list);
+
+ if (irqmode != NULL_IRQ_TIMER)
+ continue;
+
+ hrtimer_init(&cq->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ cq->timer.function = null_cmd_timer_expired;
+ }
+
+ null_major = register_blkdev(0, "nullb");
+ if (null_major < 0)
+ return null_major;
+
+ for (i = 0; i < nr_devices; i++) {
+ if (null_add_dev()) {
+ unregister_blkdev(null_major, "nullb");
+ return -EINVAL;
+ }
+ }
+
+ pr_info("null: module loaded\n");
+ return 0;
+}
+
+static void __exit null_exit(void)
+{
+ struct nullb *nullb;
+
+ unregister_blkdev(null_major, "nullb");
+
+ mutex_lock(&lock);
+ while (!list_empty(&nullb_list)) {
+ nullb = list_entry(nullb_list.next, struct nullb, list);
+ null_del_dev(nullb);
+ }
+ mutex_unlock(&lock);
+}
+
+module_init(null_init);
+module_exit(null_exit);
+
+MODULE_AUTHOR("Jens Axboe <jaxboe@fusionio.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
new file mode 100644
index 000000000..683dff272
--- /dev/null
+++ b/drivers/block/nvme-core.c
@@ -0,0 +1,3178 @@
+/*
+ * NVM Express device driver
+ * Copyright (c) 2011-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/nvme.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/hdreg.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kdev_t.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/poison.h>
+#include <linux/ptrace.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/t10-pi.h>
+#include <linux/types.h>
+#include <scsi/sg.h>
+#include <asm-generic/io-64-nonatomic-lo-hi.h>
+
+#define NVME_MINORS (1U << MINORBITS)
+#define NVME_Q_DEPTH 1024
+#define NVME_AQ_DEPTH 256
+#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
+#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
+#define ADMIN_TIMEOUT (admin_timeout * HZ)
+#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ)
+
+static unsigned char admin_timeout = 60;
+module_param(admin_timeout, byte, 0644);
+MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands");
+
+unsigned char nvme_io_timeout = 30;
+module_param_named(io_timeout, nvme_io_timeout, byte, 0644);
+MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O");
+
+static unsigned char shutdown_timeout = 5;
+module_param(shutdown_timeout, byte, 0644);
+MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown");
+
+static int nvme_major;
+module_param(nvme_major, int, 0);
+
+static int nvme_char_major;
+module_param(nvme_char_major, int, 0);
+
+static int use_threaded_interrupts;
+module_param(use_threaded_interrupts, int, 0);
+
+static DEFINE_SPINLOCK(dev_list_lock);
+static LIST_HEAD(dev_list);
+static struct task_struct *nvme_thread;
+static struct workqueue_struct *nvme_workq;
+static wait_queue_head_t nvme_kthread_wait;
+
+static struct class *nvme_class;
+
+static void nvme_reset_failed_dev(struct work_struct *ws);
+static int nvme_process_cq(struct nvme_queue *nvmeq);
+
+struct async_cmd_info {
+ struct kthread_work work;
+ struct kthread_worker *worker;
+ struct request *req;
+ u32 result;
+ int status;
+ void *ctx;
+};
+
+/*
+ * An NVM Express queue. Each device has at least two (one for admin
+ * commands and one for I/O commands).
+ */
+struct nvme_queue {
+ struct device *q_dmadev;
+ struct nvme_dev *dev;
+ char irqname[24]; /* nvme4294967295-65535\0 */
+ spinlock_t q_lock;
+ struct nvme_command *sq_cmds;
+ volatile struct nvme_completion *cqes;
+ dma_addr_t sq_dma_addr;
+ dma_addr_t cq_dma_addr;
+ u32 __iomem *q_db;
+ u16 q_depth;
+ s16 cq_vector;
+ u16 sq_head;
+ u16 sq_tail;
+ u16 cq_head;
+ u16 qid;
+ u8 cq_phase;
+ u8 cqe_seen;
+ struct async_cmd_info cmdinfo;
+ struct blk_mq_hw_ctx *hctx;
+};
+
+/*
+ * Check we didin't inadvertently grow the command struct
+ */
+static inline void _nvme_check_size(void)
+{
+ BUILD_BUG_ON(sizeof(struct nvme_rw_command) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_create_cq) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_features) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_abort_cmd) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_command) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096);
+ BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
+ BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
+ BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
+}
+
+typedef void (*nvme_completion_fn)(struct nvme_queue *, void *,
+ struct nvme_completion *);
+
+struct nvme_cmd_info {
+ nvme_completion_fn fn;
+ void *ctx;
+ int aborted;
+ struct nvme_queue *nvmeq;
+ struct nvme_iod iod[0];
+};
+
+/*
+ * Max size of iod being embedded in the request payload
+ */
+#define NVME_INT_PAGES 2
+#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size)
+#define NVME_INT_MASK 0x01
+
+/*
+ * Will slightly overestimate the number of pages needed. This is OK
+ * as it only leads to a small amount of wasted memory for the lifetime of
+ * the I/O.
+ */
+static int nvme_npages(unsigned size, struct nvme_dev *dev)
+{
+ unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size);
+ return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
+}
+
+static unsigned int nvme_cmd_size(struct nvme_dev *dev)
+{
+ unsigned int ret = sizeof(struct nvme_cmd_info);
+
+ ret += sizeof(struct nvme_iod);
+ ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev);
+ ret += sizeof(struct scatterlist) * NVME_INT_PAGES;
+
+ return ret;
+}
+
+static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int hctx_idx)
+{
+ struct nvme_dev *dev = data;
+ struct nvme_queue *nvmeq = dev->queues[0];
+
+ WARN_ON(nvmeq->hctx);
+ nvmeq->hctx = hctx;
+ hctx->driver_data = nvmeq;
+ return 0;
+}
+
+static int nvme_admin_init_request(void *data, struct request *req,
+ unsigned int hctx_idx, unsigned int rq_idx,
+ unsigned int numa_node)
+{
+ struct nvme_dev *dev = data;
+ struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = dev->queues[0];
+
+ BUG_ON(!nvmeq);
+ cmd->nvmeq = nvmeq;
+ return 0;
+}
+
+static void nvme_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
+ struct nvme_queue *nvmeq = hctx->driver_data;
+
+ nvmeq->hctx = NULL;
+}
+
+static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
+ unsigned int hctx_idx)
+{
+ struct nvme_dev *dev = data;
+ struct nvme_queue *nvmeq = dev->queues[
+ (hctx_idx % dev->queue_count) + 1];
+
+ if (!nvmeq->hctx)
+ nvmeq->hctx = hctx;
+
+ /* nvmeq queues are shared between namespaces. We assume here that
+ * blk-mq map the tags so they match up with the nvme queue tags. */
+ WARN_ON(nvmeq->hctx->tags != hctx->tags);
+
+ hctx->driver_data = nvmeq;
+ return 0;
+}
+
+static int nvme_init_request(void *data, struct request *req,
+ unsigned int hctx_idx, unsigned int rq_idx,
+ unsigned int numa_node)
+{
+ struct nvme_dev *dev = data;
+ struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
+
+ BUG_ON(!nvmeq);
+ cmd->nvmeq = nvmeq;
+ return 0;
+}
+
+static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx,
+ nvme_completion_fn handler)
+{
+ cmd->fn = handler;
+ cmd->ctx = ctx;
+ cmd->aborted = 0;
+ blk_mq_start_request(blk_mq_rq_from_pdu(cmd));
+}
+
+static void *iod_get_private(struct nvme_iod *iod)
+{
+ return (void *) (iod->private & ~0x1UL);
+}
+
+/*
+ * If bit 0 is set, the iod is embedded in the request payload.
+ */
+static bool iod_should_kfree(struct nvme_iod *iod)
+{
+ return (iod->private & NVME_INT_MASK) == 0;
+}
+
+/* Special values must be less than 0x1000 */
+#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA)
+#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE)
+#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE)
+#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE)
+
+static void special_completion(struct nvme_queue *nvmeq, void *ctx,
+ struct nvme_completion *cqe)
+{
+ if (ctx == CMD_CTX_CANCELLED)
+ return;
+ if (ctx == CMD_CTX_COMPLETED) {
+ dev_warn(nvmeq->q_dmadev,
+ "completed id %d twice on queue %d\n",
+ cqe->command_id, le16_to_cpup(&cqe->sq_id));
+ return;
+ }
+ if (ctx == CMD_CTX_INVALID) {
+ dev_warn(nvmeq->q_dmadev,
+ "invalid id %d completed on queue %d\n",
+ cqe->command_id, le16_to_cpup(&cqe->sq_id));
+ return;
+ }
+ dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx);
+}
+
+static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn)
+{
+ void *ctx;
+
+ if (fn)
+ *fn = cmd->fn;
+ ctx = cmd->ctx;
+ cmd->fn = special_completion;
+ cmd->ctx = CMD_CTX_CANCELLED;
+ return ctx;
+}
+
+static void async_req_completion(struct nvme_queue *nvmeq, void *ctx,
+ struct nvme_completion *cqe)
+{
+ u32 result = le32_to_cpup(&cqe->result);
+ u16 status = le16_to_cpup(&cqe->status) >> 1;
+
+ if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ)
+ ++nvmeq->dev->event_limit;
+ if (status == NVME_SC_SUCCESS)
+ dev_warn(nvmeq->q_dmadev,
+ "async event result %08x\n", result);
+}
+
+static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
+ struct nvme_completion *cqe)
+{
+ struct request *req = ctx;
+
+ u16 status = le16_to_cpup(&cqe->status) >> 1;
+ u32 result = le32_to_cpup(&cqe->result);
+
+ blk_mq_free_hctx_request(nvmeq->hctx, req);
+
+ dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
+ ++nvmeq->dev->abort_limit;
+}
+
+static void async_completion(struct nvme_queue *nvmeq, void *ctx,
+ struct nvme_completion *cqe)
+{
+ struct async_cmd_info *cmdinfo = ctx;
+ cmdinfo->result = le32_to_cpup(&cqe->result);
+ cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
+ queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
+ blk_mq_free_hctx_request(nvmeq->hctx, cmdinfo->req);
+}
+
+static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
+ unsigned int tag)
+{
+ struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
+ struct request *req = blk_mq_tag_to_rq(hctx->tags, tag);
+
+ return blk_mq_rq_to_pdu(req);
+}
+
+/*
+ * Called with local interrupts disabled and the q_lock held. May not sleep.
+ */
+static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag,
+ nvme_completion_fn *fn)
+{
+ struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag);
+ void *ctx;
+ if (tag >= nvmeq->q_depth) {
+ *fn = special_completion;
+ return CMD_CTX_INVALID;
+ }
+ if (fn)
+ *fn = cmd->fn;
+ ctx = cmd->ctx;
+ cmd->fn = special_completion;
+ cmd->ctx = CMD_CTX_COMPLETED;
+ return ctx;
+}
+
+/**
+ * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
+ * @nvmeq: The queue to use
+ * @cmd: The command to send
+ *
+ * Safe to use from interrupt context
+ */
+static int __nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
+{
+ u16 tail = nvmeq->sq_tail;
+
+ memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
+ if (++tail == nvmeq->q_depth)
+ tail = 0;
+ writel(tail, nvmeq->q_db);
+ nvmeq->sq_tail = tail;
+
+ return 0;
+}
+
+static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd)
+{
+ unsigned long flags;
+ int ret;
+ spin_lock_irqsave(&nvmeq->q_lock, flags);
+ ret = __nvme_submit_cmd(nvmeq, cmd);
+ spin_unlock_irqrestore(&nvmeq->q_lock, flags);
+ return ret;
+}
+
+static __le64 **iod_list(struct nvme_iod *iod)
+{
+ return ((void *)iod) + iod->offset;
+}
+
+static inline void iod_init(struct nvme_iod *iod, unsigned nbytes,
+ unsigned nseg, unsigned long private)
+{
+ iod->private = private;
+ iod->offset = offsetof(struct nvme_iod, sg[nseg]);
+ iod->npages = -1;
+ iod->length = nbytes;
+ iod->nents = 0;
+}
+
+static struct nvme_iod *
+__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev,
+ unsigned long priv, gfp_t gfp)
+{
+ struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) +
+ sizeof(__le64 *) * nvme_npages(bytes, dev) +
+ sizeof(struct scatterlist) * nseg, gfp);
+
+ if (iod)
+ iod_init(iod, bytes, nseg, priv);
+
+ return iod;
+}
+
+static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev,
+ gfp_t gfp)
+{
+ unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(rq) :
+ sizeof(struct nvme_dsm_range);
+ struct nvme_iod *iod;
+
+ if (rq->nr_phys_segments <= NVME_INT_PAGES &&
+ size <= NVME_INT_BYTES(dev)) {
+ struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq);
+
+ iod = cmd->iod;
+ iod_init(iod, size, rq->nr_phys_segments,
+ (unsigned long) rq | NVME_INT_MASK);
+ return iod;
+ }
+
+ return __nvme_alloc_iod(rq->nr_phys_segments, size, dev,
+ (unsigned long) rq, gfp);
+}
+
+void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod)
+{
+ const int last_prp = dev->page_size / 8 - 1;
+ int i;
+ __le64 **list = iod_list(iod);
+ dma_addr_t prp_dma = iod->first_dma;
+
+ if (iod->npages == 0)
+ dma_pool_free(dev->prp_small_pool, list[0], prp_dma);
+ for (i = 0; i < iod->npages; i++) {
+ __le64 *prp_list = list[i];
+ dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]);
+ dma_pool_free(dev->prp_page_pool, prp_list, prp_dma);
+ prp_dma = next_prp_dma;
+ }
+
+ if (iod_should_kfree(iod))
+ kfree(iod);
+}
+
+static int nvme_error_status(u16 status)
+{
+ switch (status & 0x7ff) {
+ case NVME_SC_SUCCESS:
+ return 0;
+ case NVME_SC_CAP_EXCEEDED:
+ return -ENOSPC;
+ default:
+ return -EIO;
+ }
+}
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
+{
+ if (be32_to_cpu(pi->ref_tag) == v)
+ pi->ref_tag = cpu_to_be32(p);
+}
+
+static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
+{
+ if (be32_to_cpu(pi->ref_tag) == p)
+ pi->ref_tag = cpu_to_be32(v);
+}
+
+/**
+ * nvme_dif_remap - remaps ref tags to bip seed and physical lba
+ *
+ * The virtual start sector is the one that was originally submitted by the
+ * block layer. Due to partitioning, MD/DM cloning, etc. the actual physical
+ * start sector may be different. Remap protection information to match the
+ * physical LBA on writes, and back to the original seed on reads.
+ *
+ * Type 0 and 3 do not have a ref tag, so no remapping required.
+ */
+static void nvme_dif_remap(struct request *req,
+ void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
+{
+ struct nvme_ns *ns = req->rq_disk->private_data;
+ struct bio_integrity_payload *bip;
+ struct t10_pi_tuple *pi;
+ void *p, *pmap;
+ u32 i, nlb, ts, phys, virt;
+
+ if (!ns->pi_type || ns->pi_type == NVME_NS_DPS_PI_TYPE3)
+ return;
+
+ bip = bio_integrity(req->bio);
+ if (!bip)
+ return;
+
+ pmap = kmap_atomic(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset;
+
+ p = pmap;
+ virt = bip_get_seed(bip);
+ phys = nvme_block_nr(ns, blk_rq_pos(req));
+ nlb = (blk_rq_bytes(req) >> ns->lba_shift);
+ ts = ns->disk->integrity->tuple_size;
+
+ for (i = 0; i < nlb; i++, virt++, phys++) {
+ pi = (struct t10_pi_tuple *)p;
+ dif_swap(phys, virt, pi);
+ p += ts;
+ }
+ kunmap_atomic(pmap);
+}
+
+static int nvme_noop_verify(struct blk_integrity_iter *iter)
+{
+ return 0;
+}
+
+static int nvme_noop_generate(struct blk_integrity_iter *iter)
+{
+ return 0;
+}
+
+struct blk_integrity nvme_meta_noop = {
+ .name = "NVME_META_NOOP",
+ .generate_fn = nvme_noop_generate,
+ .verify_fn = nvme_noop_verify,
+};
+
+static void nvme_init_integrity(struct nvme_ns *ns)
+{
+ struct blk_integrity integrity;
+
+ switch (ns->pi_type) {
+ case NVME_NS_DPS_PI_TYPE3:
+ integrity = t10_pi_type3_crc;
+ break;
+ case NVME_NS_DPS_PI_TYPE1:
+ case NVME_NS_DPS_PI_TYPE2:
+ integrity = t10_pi_type1_crc;
+ break;
+ default:
+ integrity = nvme_meta_noop;
+ break;
+ }
+ integrity.tuple_size = ns->ms;
+ blk_integrity_register(ns->disk, &integrity);
+ blk_queue_max_integrity_segments(ns->queue, 1);
+}
+#else /* CONFIG_BLK_DEV_INTEGRITY */
+static void nvme_dif_remap(struct request *req,
+ void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi))
+{
+}
+static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi)
+{
+}
+static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi)
+{
+}
+static void nvme_init_integrity(struct nvme_ns *ns)
+{
+}
+#endif
+
+static void req_completion(struct nvme_queue *nvmeq, void *ctx,
+ struct nvme_completion *cqe)
+{
+ struct nvme_iod *iod = ctx;
+ struct request *req = iod_get_private(iod);
+ struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
+
+ u16 status = le16_to_cpup(&cqe->status) >> 1;
+
+ if (unlikely(status)) {
+ if (!(status & NVME_SC_DNR || blk_noretry_request(req))
+ && (jiffies - req->start_time) < req->timeout) {
+ unsigned long flags;
+
+ blk_mq_requeue_request(req);
+ spin_lock_irqsave(req->q->queue_lock, flags);
+ if (!blk_queue_stopped(req->q))
+ blk_mq_kick_requeue_list(req->q);
+ spin_unlock_irqrestore(req->q->queue_lock, flags);
+ return;
+ }
+ req->errors = nvme_error_status(status);
+ } else
+ req->errors = 0;
+
+ if (cmd_rq->aborted)
+ dev_warn(&nvmeq->dev->pci_dev->dev,
+ "completing aborted command with status:%04x\n",
+ status);
+
+ if (iod->nents) {
+ dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents,
+ rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ if (blk_integrity_rq(req)) {
+ if (!rq_data_dir(req))
+ nvme_dif_remap(req, nvme_dif_complete);
+ dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->meta_sg, 1,
+ rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ }
+ }
+ nvme_free_iod(nvmeq->dev, iod);
+
+ blk_mq_complete_request(req);
+}
+
+/* length is in bytes. gfp flags indicates whether we may sleep. */
+int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len,
+ gfp_t gfp)
+{
+ struct dma_pool *pool;
+ int length = total_len;
+ struct scatterlist *sg = iod->sg;
+ int dma_len = sg_dma_len(sg);
+ u64 dma_addr = sg_dma_address(sg);
+ u32 page_size = dev->page_size;
+ int offset = dma_addr & (page_size - 1);
+ __le64 *prp_list;
+ __le64 **list = iod_list(iod);
+ dma_addr_t prp_dma;
+ int nprps, i;
+
+ length -= (page_size - offset);
+ if (length <= 0)
+ return total_len;
+
+ dma_len -= (page_size - offset);
+ if (dma_len) {
+ dma_addr += (page_size - offset);
+ } else {
+ sg = sg_next(sg);
+ dma_addr = sg_dma_address(sg);
+ dma_len = sg_dma_len(sg);
+ }
+
+ if (length <= page_size) {
+ iod->first_dma = dma_addr;
+ return total_len;
+ }
+
+ nprps = DIV_ROUND_UP(length, page_size);
+ if (nprps <= (256 / 8)) {
+ pool = dev->prp_small_pool;
+ iod->npages = 0;
+ } else {
+ pool = dev->prp_page_pool;
+ iod->npages = 1;
+ }
+
+ prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
+ if (!prp_list) {
+ iod->first_dma = dma_addr;
+ iod->npages = -1;
+ return (total_len - length) + page_size;
+ }
+ list[0] = prp_list;
+ iod->first_dma = prp_dma;
+ i = 0;
+ for (;;) {
+ if (i == page_size >> 3) {
+ __le64 *old_prp_list = prp_list;
+ prp_list = dma_pool_alloc(pool, gfp, &prp_dma);
+ if (!prp_list)
+ return total_len - length;
+ list[iod->npages++] = prp_list;
+ prp_list[0] = old_prp_list[i - 1];
+ old_prp_list[i - 1] = cpu_to_le64(prp_dma);
+ i = 1;
+ }
+ prp_list[i++] = cpu_to_le64(dma_addr);
+ dma_len -= page_size;
+ dma_addr += page_size;
+ length -= page_size;
+ if (length <= 0)
+ break;
+ if (dma_len > 0)
+ continue;
+ BUG_ON(dma_len < 0);
+ sg = sg_next(sg);
+ dma_addr = sg_dma_address(sg);
+ dma_len = sg_dma_len(sg);
+ }
+
+ return total_len;
+}
+
+/*
+ * We reuse the small pool to allocate the 16-byte range here as it is not
+ * worth having a special pool for these or additional cases to handle freeing
+ * the iod.
+ */
+static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+ struct request *req, struct nvme_iod *iod)
+{
+ struct nvme_dsm_range *range =
+ (struct nvme_dsm_range *)iod_list(iod)[0];
+ struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
+
+ range->cattr = cpu_to_le32(0);
+ range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift);
+ range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+
+ memset(cmnd, 0, sizeof(*cmnd));
+ cmnd->dsm.opcode = nvme_cmd_dsm;
+ cmnd->dsm.command_id = req->tag;
+ cmnd->dsm.nsid = cpu_to_le32(ns->ns_id);
+ cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma);
+ cmnd->dsm.nr = 0;
+ cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
+
+ if (++nvmeq->sq_tail == nvmeq->q_depth)
+ nvmeq->sq_tail = 0;
+ writel(nvmeq->sq_tail, nvmeq->q_db);
+}
+
+static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns,
+ int cmdid)
+{
+ struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
+
+ memset(cmnd, 0, sizeof(*cmnd));
+ cmnd->common.opcode = nvme_cmd_flush;
+ cmnd->common.command_id = cmdid;
+ cmnd->common.nsid = cpu_to_le32(ns->ns_id);
+
+ if (++nvmeq->sq_tail == nvmeq->q_depth)
+ nvmeq->sq_tail = 0;
+ writel(nvmeq->sq_tail, nvmeq->q_db);
+}
+
+static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod,
+ struct nvme_ns *ns)
+{
+ struct request *req = iod_get_private(iod);
+ struct nvme_command *cmnd;
+ u16 control = 0;
+ u32 dsmgmt = 0;
+
+ if (req->cmd_flags & REQ_FUA)
+ control |= NVME_RW_FUA;
+ if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD))
+ control |= NVME_RW_LR;
+
+ if (req->cmd_flags & REQ_RAHEAD)
+ dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
+
+ cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail];
+ memset(cmnd, 0, sizeof(*cmnd));
+
+ cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read);
+ cmnd->rw.command_id = req->tag;
+ cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
+ cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+ cmnd->rw.prp2 = cpu_to_le64(iod->first_dma);
+ cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req)));
+ cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+
+ if (blk_integrity_rq(req)) {
+ cmnd->rw.metadata = cpu_to_le64(sg_dma_address(iod->meta_sg));
+ switch (ns->pi_type) {
+ case NVME_NS_DPS_PI_TYPE3:
+ control |= NVME_RW_PRINFO_PRCHK_GUARD;
+ break;
+ case NVME_NS_DPS_PI_TYPE1:
+ case NVME_NS_DPS_PI_TYPE2:
+ control |= NVME_RW_PRINFO_PRCHK_GUARD |
+ NVME_RW_PRINFO_PRCHK_REF;
+ cmnd->rw.reftag = cpu_to_le32(
+ nvme_block_nr(ns, blk_rq_pos(req)));
+ break;
+ }
+ } else if (ns->ms)
+ control |= NVME_RW_PRINFO_PRACT;
+
+ cmnd->rw.control = cpu_to_le16(control);
+ cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt);
+
+ if (++nvmeq->sq_tail == nvmeq->q_depth)
+ nvmeq->sq_tail = 0;
+ writel(nvmeq->sq_tail, nvmeq->q_db);
+
+ return 0;
+}
+
+static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct nvme_ns *ns = hctx->queue->queuedata;
+ struct nvme_queue *nvmeq = hctx->driver_data;
+ struct request *req = bd->rq;
+ struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+ struct nvme_iod *iod;
+ enum dma_data_direction dma_dir;
+
+ /*
+ * If formated with metadata, require the block layer provide a buffer
+ * unless this namespace is formated such that the metadata can be
+ * stripped/generated by the controller with PRACT=1.
+ */
+ if (ns->ms && !blk_integrity_rq(req)) {
+ if (!(ns->pi_type && ns->ms == 8)) {
+ req->errors = -EFAULT;
+ blk_mq_complete_request(req);
+ return BLK_MQ_RQ_QUEUE_OK;
+ }
+ }
+
+ iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC);
+ if (!iod)
+ return BLK_MQ_RQ_QUEUE_BUSY;
+
+ if (req->cmd_flags & REQ_DISCARD) {
+ void *range;
+ /*
+ * We reuse the small pool to allocate the 16-byte range here
+ * as it is not worth having a special pool for these or
+ * additional cases to handle freeing the iod.
+ */
+ range = dma_pool_alloc(nvmeq->dev->prp_small_pool,
+ GFP_ATOMIC,
+ &iod->first_dma);
+ if (!range)
+ goto retry_cmd;
+ iod_list(iod)[0] = (__le64 *)range;
+ iod->npages = 0;
+ } else if (req->nr_phys_segments) {
+ dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE;
+
+ sg_init_table(iod->sg, req->nr_phys_segments);
+ iod->nents = blk_rq_map_sg(req->q, req, iod->sg);
+ if (!iod->nents)
+ goto error_cmd;
+
+ if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir))
+ goto retry_cmd;
+
+ if (blk_rq_bytes(req) !=
+ nvme_setup_prps(nvmeq->dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) {
+ dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg,
+ iod->nents, dma_dir);
+ goto retry_cmd;
+ }
+ if (blk_integrity_rq(req)) {
+ if (blk_rq_count_integrity_sg(req->q, req->bio) != 1)
+ goto error_cmd;
+
+ sg_init_table(iod->meta_sg, 1);
+ if (blk_rq_map_integrity_sg(
+ req->q, req->bio, iod->meta_sg) != 1)
+ goto error_cmd;
+
+ if (rq_data_dir(req))
+ nvme_dif_remap(req, nvme_dif_prep);
+
+ if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir))
+ goto error_cmd;
+ }
+ }
+
+ nvme_set_info(cmd, iod, req_completion);
+ spin_lock_irq(&nvmeq->q_lock);
+ if (req->cmd_flags & REQ_DISCARD)
+ nvme_submit_discard(nvmeq, ns, req, iod);
+ else if (req->cmd_flags & REQ_FLUSH)
+ nvme_submit_flush(nvmeq, ns, req->tag);
+ else
+ nvme_submit_iod(nvmeq, iod, ns);
+
+ nvme_process_cq(nvmeq);
+ spin_unlock_irq(&nvmeq->q_lock);
+ return BLK_MQ_RQ_QUEUE_OK;
+
+ error_cmd:
+ nvme_free_iod(nvmeq->dev, iod);
+ return BLK_MQ_RQ_QUEUE_ERROR;
+ retry_cmd:
+ nvme_free_iod(nvmeq->dev, iod);
+ return BLK_MQ_RQ_QUEUE_BUSY;
+}
+
+static int nvme_process_cq(struct nvme_queue *nvmeq)
+{
+ u16 head, phase;
+
+ head = nvmeq->cq_head;
+ phase = nvmeq->cq_phase;
+
+ for (;;) {
+ void *ctx;
+ nvme_completion_fn fn;
+ struct nvme_completion cqe = nvmeq->cqes[head];
+ if ((le16_to_cpu(cqe.status) & 1) != phase)
+ break;
+ nvmeq->sq_head = le16_to_cpu(cqe.sq_head);
+ if (++head == nvmeq->q_depth) {
+ head = 0;
+ phase = !phase;
+ }
+ ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn);
+ fn(nvmeq, ctx, &cqe);
+ }
+
+ /* If the controller ignores the cq head doorbell and continuously
+ * writes to the queue, it is theoretically possible to wrap around
+ * the queue twice and mistakenly return IRQ_NONE. Linux only
+ * requires that 0.1% of your interrupts are handled, so this isn't
+ * a big problem.
+ */
+ if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
+ return 0;
+
+ writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
+ nvmeq->cq_head = head;
+ nvmeq->cq_phase = phase;
+
+ nvmeq->cqe_seen = 1;
+ return 1;
+}
+
+/* Admin queue isn't initialized as a request queue. If at some point this
+ * happens anyway, make sure to notify the user */
+static int nvme_admin_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ WARN_ON_ONCE(1);
+ return BLK_MQ_RQ_QUEUE_ERROR;
+}
+
+static irqreturn_t nvme_irq(int irq, void *data)
+{
+ irqreturn_t result;
+ struct nvme_queue *nvmeq = data;
+ spin_lock(&nvmeq->q_lock);
+ nvme_process_cq(nvmeq);
+ result = nvmeq->cqe_seen ? IRQ_HANDLED : IRQ_NONE;
+ nvmeq->cqe_seen = 0;
+ spin_unlock(&nvmeq->q_lock);
+ return result;
+}
+
+static irqreturn_t nvme_irq_check(int irq, void *data)
+{
+ struct nvme_queue *nvmeq = data;
+ struct nvme_completion cqe = nvmeq->cqes[nvmeq->cq_head];
+ if ((le16_to_cpu(cqe.status) & 1) != nvmeq->cq_phase)
+ return IRQ_NONE;
+ return IRQ_WAKE_THREAD;
+}
+
+struct sync_cmd_info {
+ struct task_struct *task;
+ u32 result;
+ int status;
+};
+
+static void sync_completion(struct nvme_queue *nvmeq, void *ctx,
+ struct nvme_completion *cqe)
+{
+ struct sync_cmd_info *cmdinfo = ctx;
+ cmdinfo->result = le32_to_cpup(&cqe->result);
+ cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
+ wake_up_process(cmdinfo->task);
+}
+
+/*
+ * Returns 0 on success. If the result is negative, it's a Linux error code;
+ * if the result is positive, it's an NVM Express status code
+ */
+static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd,
+ u32 *result, unsigned timeout)
+{
+ struct sync_cmd_info cmdinfo;
+ struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = cmd_rq->nvmeq;
+
+ cmdinfo.task = current;
+ cmdinfo.status = -EINTR;
+
+ cmd->common.command_id = req->tag;
+
+ nvme_set_info(cmd_rq, &cmdinfo, sync_completion);
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ nvme_submit_cmd(nvmeq, cmd);
+ schedule();
+
+ if (result)
+ *result = cmdinfo.result;
+ return cmdinfo.status;
+}
+
+static int nvme_submit_async_admin_req(struct nvme_dev *dev)
+{
+ struct nvme_queue *nvmeq = dev->queues[0];
+ struct nvme_command c;
+ struct nvme_cmd_info *cmd_info;
+ struct request *req;
+
+ req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, true);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ req->cmd_flags |= REQ_NO_TIMEOUT;
+ cmd_info = blk_mq_rq_to_pdu(req);
+ nvme_set_info(cmd_info, NULL, async_req_completion);
+
+ memset(&c, 0, sizeof(c));
+ c.common.opcode = nvme_admin_async_event;
+ c.common.command_id = req->tag;
+
+ blk_mq_free_hctx_request(nvmeq->hctx, req);
+ return __nvme_submit_cmd(nvmeq, &c);
+}
+
+static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
+ struct nvme_command *cmd,
+ struct async_cmd_info *cmdinfo, unsigned timeout)
+{
+ struct nvme_queue *nvmeq = dev->queues[0];
+ struct request *req;
+ struct nvme_cmd_info *cmd_rq;
+
+ req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ req->timeout = timeout;
+ cmd_rq = blk_mq_rq_to_pdu(req);
+ cmdinfo->req = req;
+ nvme_set_info(cmd_rq, cmdinfo, async_completion);
+ cmdinfo->status = -EINTR;
+
+ cmd->common.command_id = req->tag;
+
+ return nvme_submit_cmd(nvmeq, cmd);
+}
+
+static int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
+ u32 *result, unsigned timeout)
+{
+ int res;
+ struct request *req;
+
+ req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ res = nvme_submit_sync_cmd(req, cmd, result, timeout);
+ blk_mq_free_request(req);
+ return res;
+}
+
+int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd,
+ u32 *result)
+{
+ return __nvme_submit_admin_cmd(dev, cmd, result, ADMIN_TIMEOUT);
+}
+
+int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
+ struct nvme_command *cmd, u32 *result)
+{
+ int res;
+ struct request *req;
+
+ req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT),
+ false);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT);
+ blk_mq_free_request(req);
+ return res;
+}
+
+static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+ c.delete_queue.opcode = opcode;
+ c.delete_queue.qid = cpu_to_le16(id);
+
+ return nvme_submit_admin_cmd(dev, &c, NULL);
+}
+
+static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
+ struct nvme_queue *nvmeq)
+{
+ struct nvme_command c;
+ int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
+
+ memset(&c, 0, sizeof(c));
+ c.create_cq.opcode = nvme_admin_create_cq;
+ c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
+ c.create_cq.cqid = cpu_to_le16(qid);
+ c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
+ c.create_cq.cq_flags = cpu_to_le16(flags);
+ c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);
+
+ return nvme_submit_admin_cmd(dev, &c, NULL);
+}
+
+static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
+ struct nvme_queue *nvmeq)
+{
+ struct nvme_command c;
+ int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
+
+ memset(&c, 0, sizeof(c));
+ c.create_sq.opcode = nvme_admin_create_sq;
+ c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
+ c.create_sq.sqid = cpu_to_le16(qid);
+ c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
+ c.create_sq.sq_flags = cpu_to_le16(flags);
+ c.create_sq.cqid = cpu_to_le16(qid);
+
+ return nvme_submit_admin_cmd(dev, &c, NULL);
+}
+
+static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
+{
+ return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
+}
+
+static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
+{
+ return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
+}
+
+int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns,
+ dma_addr_t dma_addr)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+ c.identify.opcode = nvme_admin_identify;
+ c.identify.nsid = cpu_to_le32(nsid);
+ c.identify.prp1 = cpu_to_le64(dma_addr);
+ c.identify.cns = cpu_to_le32(cns);
+
+ return nvme_submit_admin_cmd(dev, &c, NULL);
+}
+
+int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
+ dma_addr_t dma_addr, u32 *result)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+ c.features.opcode = nvme_admin_get_features;
+ c.features.nsid = cpu_to_le32(nsid);
+ c.features.prp1 = cpu_to_le64(dma_addr);
+ c.features.fid = cpu_to_le32(fid);
+
+ return nvme_submit_admin_cmd(dev, &c, result);
+}
+
+int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
+ dma_addr_t dma_addr, u32 *result)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+ c.features.opcode = nvme_admin_set_features;
+ c.features.prp1 = cpu_to_le64(dma_addr);
+ c.features.fid = cpu_to_le32(fid);
+ c.features.dword11 = cpu_to_le32(dword11);
+
+ return nvme_submit_admin_cmd(dev, &c, result);
+}
+
+/**
+ * nvme_abort_req - Attempt aborting a request
+ *
+ * Schedule controller reset if the command was already aborted once before and
+ * still hasn't been returned to the driver, or if this is the admin queue.
+ */
+static void nvme_abort_req(struct request *req)
+{
+ struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = cmd_rq->nvmeq;
+ struct nvme_dev *dev = nvmeq->dev;
+ struct request *abort_req;
+ struct nvme_cmd_info *abort_cmd;
+ struct nvme_command cmd;
+
+ if (!nvmeq->qid || cmd_rq->aborted) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev_list_lock, flags);
+ if (work_busy(&dev->reset_work))
+ goto out;
+ list_del_init(&dev->node);
+ dev_warn(&dev->pci_dev->dev,
+ "I/O %d QID %d timeout, reset controller\n",
+ req->tag, nvmeq->qid);
+ dev->reset_workfn = nvme_reset_failed_dev;
+ queue_work(nvme_workq, &dev->reset_work);
+ out:
+ spin_unlock_irqrestore(&dev_list_lock, flags);
+ return;
+ }
+
+ if (!dev->abort_limit)
+ return;
+
+ abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC,
+ false);
+ if (IS_ERR(abort_req))
+ return;
+
+ abort_cmd = blk_mq_rq_to_pdu(abort_req);
+ nvme_set_info(abort_cmd, abort_req, abort_completion);
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.abort.opcode = nvme_admin_abort_cmd;
+ cmd.abort.cid = req->tag;
+ cmd.abort.sqid = cpu_to_le16(nvmeq->qid);
+ cmd.abort.command_id = abort_req->tag;
+
+ --dev->abort_limit;
+ cmd_rq->aborted = 1;
+
+ dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag,
+ nvmeq->qid);
+ if (nvme_submit_cmd(dev->queues[0], &cmd) < 0) {
+ dev_warn(nvmeq->q_dmadev,
+ "Could not abort I/O %d QID %d",
+ req->tag, nvmeq->qid);
+ blk_mq_free_request(abort_req);
+ }
+}
+
+static void nvme_cancel_queue_ios(struct blk_mq_hw_ctx *hctx,
+ struct request *req, void *data, bool reserved)
+{
+ struct nvme_queue *nvmeq = data;
+ void *ctx;
+ nvme_completion_fn fn;
+ struct nvme_cmd_info *cmd;
+ struct nvme_completion cqe;
+
+ if (!blk_mq_request_started(req))
+ return;
+
+ cmd = blk_mq_rq_to_pdu(req);
+
+ if (cmd->ctx == CMD_CTX_CANCELLED)
+ return;
+
+ if (blk_queue_dying(req->q))
+ cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1);
+ else
+ cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1);
+
+
+ dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n",
+ req->tag, nvmeq->qid);
+ ctx = cancel_cmd_info(cmd, &fn);
+ fn(nvmeq, ctx, &cqe);
+}
+
+static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
+{
+ struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req);
+ struct nvme_queue *nvmeq = cmd->nvmeq;
+
+ dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag,
+ nvmeq->qid);
+ spin_lock_irq(&nvmeq->q_lock);
+ nvme_abort_req(req);
+ spin_unlock_irq(&nvmeq->q_lock);
+
+ /*
+ * The aborted req will be completed on receiving the abort req.
+ * We enable the timer again. If hit twice, it'll cause a device reset,
+ * as the device then is in a faulty state.
+ */
+ return BLK_EH_RESET_TIMER;
+}
+
+static void nvme_free_queue(struct nvme_queue *nvmeq)
+{
+ dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
+ (void *)nvmeq->cqes, nvmeq->cq_dma_addr);
+ dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
+ nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+ kfree(nvmeq);
+}
+
+static void nvme_free_queues(struct nvme_dev *dev, int lowest)
+{
+ int i;
+
+ for (i = dev->queue_count - 1; i >= lowest; i--) {
+ struct nvme_queue *nvmeq = dev->queues[i];
+ dev->queue_count--;
+ dev->queues[i] = NULL;
+ nvme_free_queue(nvmeq);
+ }
+}
+
+/**
+ * nvme_suspend_queue - put queue into suspended state
+ * @nvmeq - queue to suspend
+ */
+static int nvme_suspend_queue(struct nvme_queue *nvmeq)
+{
+ int vector;
+
+ spin_lock_irq(&nvmeq->q_lock);
+ if (nvmeq->cq_vector == -1) {
+ spin_unlock_irq(&nvmeq->q_lock);
+ return 1;
+ }
+ vector = nvmeq->dev->entry[nvmeq->cq_vector].vector;
+ nvmeq->dev->online_queues--;
+ nvmeq->cq_vector = -1;
+ spin_unlock_irq(&nvmeq->q_lock);
+
+ if (!nvmeq->qid && nvmeq->dev->admin_q)
+ blk_mq_freeze_queue_start(nvmeq->dev->admin_q);
+
+ irq_set_affinity_hint(vector, NULL);
+ free_irq(vector, nvmeq);
+
+ return 0;
+}
+
+static void nvme_clear_queue(struct nvme_queue *nvmeq)
+{
+ struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
+
+ spin_lock_irq(&nvmeq->q_lock);
+ if (hctx && hctx->tags)
+ blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq);
+ spin_unlock_irq(&nvmeq->q_lock);
+}
+
+static void nvme_disable_queue(struct nvme_dev *dev, int qid)
+{
+ struct nvme_queue *nvmeq = dev->queues[qid];
+
+ if (!nvmeq)
+ return;
+ if (nvme_suspend_queue(nvmeq))
+ return;
+
+ /* Don't tell the adapter to delete the admin queue.
+ * Don't tell a removed adapter to delete IO queues. */
+ if (qid && readl(&dev->bar->csts) != -1) {
+ adapter_delete_sq(dev, qid);
+ adapter_delete_cq(dev, qid);
+ }
+
+ spin_lock_irq(&nvmeq->q_lock);
+ nvme_process_cq(nvmeq);
+ spin_unlock_irq(&nvmeq->q_lock);
+}
+
+static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
+ int depth)
+{
+ struct device *dmadev = &dev->pci_dev->dev;
+ struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
+ if (!nvmeq)
+ return NULL;
+
+ nvmeq->cqes = dma_zalloc_coherent(dmadev, CQ_SIZE(depth),
+ &nvmeq->cq_dma_addr, GFP_KERNEL);
+ if (!nvmeq->cqes)
+ goto free_nvmeq;
+
+ nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth),
+ &nvmeq->sq_dma_addr, GFP_KERNEL);
+ if (!nvmeq->sq_cmds)
+ goto free_cqdma;
+
+ nvmeq->q_dmadev = dmadev;
+ nvmeq->dev = dev;
+ snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d",
+ dev->instance, qid);
+ spin_lock_init(&nvmeq->q_lock);
+ nvmeq->cq_head = 0;
+ nvmeq->cq_phase = 1;
+ nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
+ nvmeq->q_depth = depth;
+ nvmeq->qid = qid;
+ dev->queue_count++;
+ dev->queues[qid] = nvmeq;
+
+ return nvmeq;
+
+ free_cqdma:
+ dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes,
+ nvmeq->cq_dma_addr);
+ free_nvmeq:
+ kfree(nvmeq);
+ return NULL;
+}
+
+static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq,
+ const char *name)
+{
+ if (use_threaded_interrupts)
+ return request_threaded_irq(dev->entry[nvmeq->cq_vector].vector,
+ nvme_irq_check, nvme_irq, IRQF_SHARED,
+ name, nvmeq);
+ return request_irq(dev->entry[nvmeq->cq_vector].vector, nvme_irq,
+ IRQF_SHARED, name, nvmeq);
+}
+
+static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
+{
+ struct nvme_dev *dev = nvmeq->dev;
+
+ spin_lock_irq(&nvmeq->q_lock);
+ nvmeq->sq_tail = 0;
+ nvmeq->cq_head = 0;
+ nvmeq->cq_phase = 1;
+ nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
+ memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
+ dev->online_queues++;
+ spin_unlock_irq(&nvmeq->q_lock);
+}
+
+static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
+{
+ struct nvme_dev *dev = nvmeq->dev;
+ int result;
+
+ nvmeq->cq_vector = qid - 1;
+ result = adapter_alloc_cq(dev, qid, nvmeq);
+ if (result < 0)
+ return result;
+
+ result = adapter_alloc_sq(dev, qid, nvmeq);
+ if (result < 0)
+ goto release_cq;
+
+ result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
+ if (result < 0)
+ goto release_sq;
+
+ nvme_init_queue(nvmeq, qid);
+ return result;
+
+ release_sq:
+ adapter_delete_sq(dev, qid);
+ release_cq:
+ adapter_delete_cq(dev, qid);
+ return result;
+}
+
+static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled)
+{
+ unsigned long timeout;
+ u32 bit = enabled ? NVME_CSTS_RDY : 0;
+
+ timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
+
+ while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) {
+ msleep(100);
+ if (fatal_signal_pending(current))
+ return -EINTR;
+ if (time_after(jiffies, timeout)) {
+ dev_err(&dev->pci_dev->dev,
+ "Device not ready; aborting %s\n", enabled ?
+ "initialisation" : "reset");
+ return -ENODEV;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * If the device has been passed off to us in an enabled state, just clear
+ * the enabled bit. The spec says we should set the 'shutdown notification
+ * bits', but doing so may cause the device to complete commands to the
+ * admin queue ... and we don't know what memory that might be pointing at!
+ */
+static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap)
+{
+ dev->ctrl_config &= ~NVME_CC_SHN_MASK;
+ dev->ctrl_config &= ~NVME_CC_ENABLE;
+ writel(dev->ctrl_config, &dev->bar->cc);
+
+ return nvme_wait_ready(dev, cap, false);
+}
+
+static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap)
+{
+ dev->ctrl_config &= ~NVME_CC_SHN_MASK;
+ dev->ctrl_config |= NVME_CC_ENABLE;
+ writel(dev->ctrl_config, &dev->bar->cc);
+
+ return nvme_wait_ready(dev, cap, true);
+}
+
+static int nvme_shutdown_ctrl(struct nvme_dev *dev)
+{
+ unsigned long timeout;
+
+ dev->ctrl_config &= ~NVME_CC_SHN_MASK;
+ dev->ctrl_config |= NVME_CC_SHN_NORMAL;
+
+ writel(dev->ctrl_config, &dev->bar->cc);
+
+ timeout = SHUTDOWN_TIMEOUT + jiffies;
+ while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) !=
+ NVME_CSTS_SHST_CMPLT) {
+ msleep(100);
+ if (fatal_signal_pending(current))
+ return -EINTR;
+ if (time_after(jiffies, timeout)) {
+ dev_err(&dev->pci_dev->dev,
+ "Device shutdown incomplete; abort shutdown\n");
+ return -ENODEV;
+ }
+ }
+
+ return 0;
+}
+
+static struct blk_mq_ops nvme_mq_admin_ops = {
+ .queue_rq = nvme_admin_queue_rq,
+ .map_queue = blk_mq_map_queue,
+ .init_hctx = nvme_admin_init_hctx,
+ .exit_hctx = nvme_exit_hctx,
+ .init_request = nvme_admin_init_request,
+ .timeout = nvme_timeout,
+};
+
+static struct blk_mq_ops nvme_mq_ops = {
+ .queue_rq = nvme_queue_rq,
+ .map_queue = blk_mq_map_queue,
+ .init_hctx = nvme_init_hctx,
+ .exit_hctx = nvme_exit_hctx,
+ .init_request = nvme_init_request,
+ .timeout = nvme_timeout,
+};
+
+static void nvme_dev_remove_admin(struct nvme_dev *dev)
+{
+ if (dev->admin_q && !blk_queue_dying(dev->admin_q)) {
+ blk_cleanup_queue(dev->admin_q);
+ blk_mq_free_tag_set(&dev->admin_tagset);
+ }
+}
+
+static int nvme_alloc_admin_tags(struct nvme_dev *dev)
+{
+ if (!dev->admin_q) {
+ dev->admin_tagset.ops = &nvme_mq_admin_ops;
+ dev->admin_tagset.nr_hw_queues = 1;
+ dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1;
+ dev->admin_tagset.reserved_tags = 1;
+ dev->admin_tagset.timeout = ADMIN_TIMEOUT;
+ dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
+ dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
+ dev->admin_tagset.driver_data = dev;
+
+ if (blk_mq_alloc_tag_set(&dev->admin_tagset))
+ return -ENOMEM;
+
+ dev->admin_q = blk_mq_init_queue(&dev->admin_tagset);
+ if (IS_ERR(dev->admin_q)) {
+ blk_mq_free_tag_set(&dev->admin_tagset);
+ return -ENOMEM;
+ }
+ if (!blk_get_queue(dev->admin_q)) {
+ nvme_dev_remove_admin(dev);
+ return -ENODEV;
+ }
+ } else
+ blk_mq_unfreeze_queue(dev->admin_q);
+
+ return 0;
+}
+
+static int nvme_configure_admin_queue(struct nvme_dev *dev)
+{
+ int result;
+ u32 aqa;
+ u64 cap = readq(&dev->bar->cap);
+ struct nvme_queue *nvmeq;
+ unsigned page_shift = PAGE_SHIFT;
+ unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12;
+ unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12;
+
+ if (page_shift < dev_page_min) {
+ dev_err(&dev->pci_dev->dev,
+ "Minimum device page size (%u) too large for "
+ "host (%u)\n", 1 << dev_page_min,
+ 1 << page_shift);
+ return -ENODEV;
+ }
+ if (page_shift > dev_page_max) {
+ dev_info(&dev->pci_dev->dev,
+ "Device maximum page size (%u) smaller than "
+ "host (%u); enabling work-around\n",
+ 1 << dev_page_max, 1 << page_shift);
+ page_shift = dev_page_max;
+ }
+
+ result = nvme_disable_ctrl(dev, cap);
+ if (result < 0)
+ return result;
+
+ nvmeq = dev->queues[0];
+ if (!nvmeq) {
+ nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
+ if (!nvmeq)
+ return -ENOMEM;
+ }
+
+ aqa = nvmeq->q_depth - 1;
+ aqa |= aqa << 16;
+
+ dev->page_size = 1 << page_shift;
+
+ dev->ctrl_config = NVME_CC_CSS_NVM;
+ dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT;
+ dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE;
+ dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
+
+ writel(aqa, &dev->bar->aqa);
+ writeq(nvmeq->sq_dma_addr, &dev->bar->asq);
+ writeq(nvmeq->cq_dma_addr, &dev->bar->acq);
+
+ result = nvme_enable_ctrl(dev, cap);
+ if (result)
+ goto free_nvmeq;
+
+ nvmeq->cq_vector = 0;
+ result = queue_request_irq(dev, nvmeq, nvmeq->irqname);
+ if (result)
+ goto free_nvmeq;
+
+ return result;
+
+ free_nvmeq:
+ nvme_free_queues(dev, 0);
+ return result;
+}
+
+struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
+ unsigned long addr, unsigned length)
+{
+ int i, err, count, nents, offset;
+ struct scatterlist *sg;
+ struct page **pages;
+ struct nvme_iod *iod;
+
+ if (addr & 3)
+ return ERR_PTR(-EINVAL);
+ if (!length || length > INT_MAX - PAGE_SIZE)
+ return ERR_PTR(-EINVAL);
+
+ offset = offset_in_page(addr);
+ count = DIV_ROUND_UP(offset + length, PAGE_SIZE);
+ pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ err = get_user_pages_fast(addr, count, 1, pages);
+ if (err < count) {
+ count = err;
+ err = -EFAULT;
+ goto put_pages;
+ }
+
+ err = -ENOMEM;
+ iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL);
+ if (!iod)
+ goto put_pages;
+
+ sg = iod->sg;
+ sg_init_table(sg, count);
+ for (i = 0; i < count; i++) {
+ sg_set_page(&sg[i], pages[i],
+ min_t(unsigned, length, PAGE_SIZE - offset),
+ offset);
+ length -= (PAGE_SIZE - offset);
+ offset = 0;
+ }
+ sg_mark_end(&sg[i - 1]);
+ iod->nents = count;
+
+ nents = dma_map_sg(&dev->pci_dev->dev, sg, count,
+ write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ if (!nents)
+ goto free_iod;
+
+ kfree(pages);
+ return iod;
+
+ free_iod:
+ kfree(iod);
+ put_pages:
+ for (i = 0; i < count; i++)
+ put_page(pages[i]);
+ kfree(pages);
+ return ERR_PTR(err);
+}
+
+void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
+ struct nvme_iod *iod)
+{
+ int i;
+
+ dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents,
+ write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+
+ for (i = 0; i < iod->nents; i++)
+ put_page(sg_page(&iod->sg[i]));
+}
+
+static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
+{
+ struct nvme_dev *dev = ns->dev;
+ struct nvme_user_io io;
+ struct nvme_command c;
+ unsigned length, meta_len, prp_len;
+ int status, write;
+ struct nvme_iod *iod;
+ dma_addr_t meta_dma = 0;
+ void *meta = NULL;
+ void __user *metadata;
+
+ if (copy_from_user(&io, uio, sizeof(io)))
+ return -EFAULT;
+ length = (io.nblocks + 1) << ns->lba_shift;
+ meta_len = (io.nblocks + 1) * ns->ms;
+
+ if (meta_len && ((io.metadata & 3) || !io.metadata) && !ns->ext)
+ return -EINVAL;
+ else if (meta_len && ns->ext) {
+ length += meta_len;
+ meta_len = 0;
+ }
+
+ metadata = (void __user *)(unsigned long)io.metadata;
+
+ write = io.opcode & 1;
+
+ switch (io.opcode) {
+ case nvme_cmd_write:
+ case nvme_cmd_read:
+ case nvme_cmd_compare:
+ iod = nvme_map_user_pages(dev, write, io.addr, length);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (IS_ERR(iod))
+ return PTR_ERR(iod);
+
+ prp_len = nvme_setup_prps(dev, iod, length, GFP_KERNEL);
+ if (length != prp_len) {
+ status = -ENOMEM;
+ goto unmap;
+ }
+ if (meta_len) {
+ meta = dma_alloc_coherent(&dev->pci_dev->dev, meta_len,
+ &meta_dma, GFP_KERNEL);
+
+ if (!meta) {
+ status = -ENOMEM;
+ goto unmap;
+ }
+ if (write) {
+ if (copy_from_user(meta, metadata, meta_len)) {
+ status = -EFAULT;
+ goto unmap;
+ }
+ }
+ }
+
+ memset(&c, 0, sizeof(c));
+ c.rw.opcode = io.opcode;
+ c.rw.flags = io.flags;
+ c.rw.nsid = cpu_to_le32(ns->ns_id);
+ c.rw.slba = cpu_to_le64(io.slba);
+ c.rw.length = cpu_to_le16(io.nblocks);
+ c.rw.control = cpu_to_le16(io.control);
+ c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
+ c.rw.reftag = cpu_to_le32(io.reftag);
+ c.rw.apptag = cpu_to_le16(io.apptag);
+ c.rw.appmask = cpu_to_le16(io.appmask);
+ c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+ c.rw.prp2 = cpu_to_le64(iod->first_dma);
+ c.rw.metadata = cpu_to_le64(meta_dma);
+ status = nvme_submit_io_cmd(dev, ns, &c, NULL);
+ unmap:
+ nvme_unmap_user_pages(dev, write, iod);
+ nvme_free_iod(dev, iod);
+ if (meta) {
+ if (status == NVME_SC_SUCCESS && !write) {
+ if (copy_to_user(metadata, meta, meta_len))
+ status = -EFAULT;
+ }
+ dma_free_coherent(&dev->pci_dev->dev, meta_len, meta, meta_dma);
+ }
+ return status;
+}
+
+static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns,
+ struct nvme_passthru_cmd __user *ucmd)
+{
+ struct nvme_passthru_cmd cmd;
+ struct nvme_command c;
+ int status, length;
+ struct nvme_iod *uninitialized_var(iod);
+ unsigned timeout;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
+ return -EFAULT;
+
+ memset(&c, 0, sizeof(c));
+ c.common.opcode = cmd.opcode;
+ c.common.flags = cmd.flags;
+ c.common.nsid = cpu_to_le32(cmd.nsid);
+ c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
+ c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
+ c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
+ c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
+ c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
+ c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
+ c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
+ c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
+
+ length = cmd.data_len;
+ if (cmd.data_len) {
+ iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr,
+ length);
+ if (IS_ERR(iod))
+ return PTR_ERR(iod);
+ length = nvme_setup_prps(dev, iod, length, GFP_KERNEL);
+ c.common.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+ c.common.prp2 = cpu_to_le64(iod->first_dma);
+ }
+
+ timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) :
+ ADMIN_TIMEOUT;
+
+ if (length != cmd.data_len)
+ status = -ENOMEM;
+ else if (ns) {
+ struct request *req;
+
+ req = blk_mq_alloc_request(ns->queue, WRITE,
+ (GFP_KERNEL|__GFP_WAIT), false);
+ if (IS_ERR(req))
+ status = PTR_ERR(req);
+ else {
+ status = nvme_submit_sync_cmd(req, &c, &cmd.result,
+ timeout);
+ blk_mq_free_request(req);
+ }
+ } else
+ status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout);
+
+ if (cmd.data_len) {
+ nvme_unmap_user_pages(dev, cmd.opcode & 1, iod);
+ nvme_free_iod(dev, iod);
+ }
+
+ if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result,
+ sizeof(cmd.result)))
+ status = -EFAULT;
+
+ return status;
+}
+
+static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
+ unsigned long arg)
+{
+ struct nvme_ns *ns = bdev->bd_disk->private_data;
+
+ switch (cmd) {
+ case NVME_IOCTL_ID:
+ force_successful_syscall_return();
+ return ns->ns_id;
+ case NVME_IOCTL_ADMIN_CMD:
+ return nvme_user_cmd(ns->dev, NULL, (void __user *)arg);
+ case NVME_IOCTL_IO_CMD:
+ return nvme_user_cmd(ns->dev, ns, (void __user *)arg);
+ case NVME_IOCTL_SUBMIT_IO:
+ return nvme_submit_io(ns, (void __user *)arg);
+ case SG_GET_VERSION_NUM:
+ return nvme_sg_get_version_num((void __user *)arg);
+ case SG_IO:
+ return nvme_sg_io(ns, (void __user *)arg);
+ default:
+ return -ENOTTY;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case SG_IO:
+ return -ENOIOCTLCMD;
+ }
+ return nvme_ioctl(bdev, mode, cmd, arg);
+}
+#else
+#define nvme_compat_ioctl NULL
+#endif
+
+static int nvme_open(struct block_device *bdev, fmode_t mode)
+{
+ int ret = 0;
+ struct nvme_ns *ns;
+
+ spin_lock(&dev_list_lock);
+ ns = bdev->bd_disk->private_data;
+ if (!ns)
+ ret = -ENXIO;
+ else if (!kref_get_unless_zero(&ns->dev->kref))
+ ret = -ENXIO;
+ spin_unlock(&dev_list_lock);
+
+ return ret;
+}
+
+static void nvme_free_dev(struct kref *kref);
+
+static void nvme_release(struct gendisk *disk, fmode_t mode)
+{
+ struct nvme_ns *ns = disk->private_data;
+ struct nvme_dev *dev = ns->dev;
+
+ kref_put(&dev->kref, nvme_free_dev);
+}
+
+static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo)
+{
+ /* some standard values */
+ geo->heads = 1 << 6;
+ geo->sectors = 1 << 5;
+ geo->cylinders = get_capacity(bd->bd_disk) >> 11;
+ return 0;
+}
+
+static void nvme_config_discard(struct nvme_ns *ns)
+{
+ u32 logical_block_size = queue_logical_block_size(ns->queue);
+ ns->queue->limits.discard_zeroes_data = 0;
+ ns->queue->limits.discard_alignment = logical_block_size;
+ ns->queue->limits.discard_granularity = logical_block_size;
+ ns->queue->limits.max_discard_sectors = 0xffffffff;
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
+}
+
+static int nvme_revalidate_disk(struct gendisk *disk)
+{
+ struct nvme_ns *ns = disk->private_data;
+ struct nvme_dev *dev = ns->dev;
+ struct nvme_id_ns *id;
+ dma_addr_t dma_addr;
+ u8 lbaf, pi_type;
+ u16 old_ms;
+ unsigned short bs;
+
+ id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr,
+ GFP_KERNEL);
+ if (!id) {
+ dev_warn(&dev->pci_dev->dev, "%s: Memory alocation failure\n",
+ __func__);
+ return 0;
+ }
+ if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) {
+ dev_warn(&dev->pci_dev->dev,
+ "identify failed ns:%d, setting capacity to 0\n",
+ ns->ns_id);
+ memset(id, 0, sizeof(*id));
+ }
+
+ old_ms = ns->ms;
+ lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
+ ns->lba_shift = id->lbaf[lbaf].ds;
+ ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+ ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
+
+ /*
+ * If identify namespace failed, use default 512 byte block size so
+ * block layer can use before failing read/write for 0 capacity.
+ */
+ if (ns->lba_shift == 0)
+ ns->lba_shift = 9;
+ bs = 1 << ns->lba_shift;
+
+ /* XXX: PI implementation requires metadata equal t10 pi tuple size */
+ pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
+ id->dps & NVME_NS_DPS_PI_MASK : 0;
+
+ if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
+ ns->ms != old_ms ||
+ bs != queue_logical_block_size(disk->queue) ||
+ (ns->ms && ns->ext)))
+ blk_integrity_unregister(disk);
+
+ ns->pi_type = pi_type;
+ blk_queue_logical_block_size(ns->queue, bs);
+
+ if (ns->ms && !blk_get_integrity(disk) && (disk->flags & GENHD_FL_UP) &&
+ !ns->ext)
+ nvme_init_integrity(ns);
+
+ if (id->ncap == 0 || (ns->ms && !blk_get_integrity(disk)))
+ set_capacity(disk, 0);
+ else
+ set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
+
+ if (dev->oncs & NVME_CTRL_ONCS_DSM)
+ nvme_config_discard(ns);
+
+ dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr);
+ return 0;
+}
+
+static const struct block_device_operations nvme_fops = {
+ .owner = THIS_MODULE,
+ .ioctl = nvme_ioctl,
+ .compat_ioctl = nvme_compat_ioctl,
+ .open = nvme_open,
+ .release = nvme_release,
+ .getgeo = nvme_getgeo,
+ .revalidate_disk= nvme_revalidate_disk,
+};
+
+static int nvme_kthread(void *data)
+{
+ struct nvme_dev *dev, *next;
+
+ while (!kthread_should_stop()) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ spin_lock(&dev_list_lock);
+ list_for_each_entry_safe(dev, next, &dev_list, node) {
+ int i;
+ if (readl(&dev->bar->csts) & NVME_CSTS_CFS) {
+ if (work_busy(&dev->reset_work))
+ continue;
+ list_del_init(&dev->node);
+ dev_warn(&dev->pci_dev->dev,
+ "Failed status: %x, reset controller\n",
+ readl(&dev->bar->csts));
+ dev->reset_workfn = nvme_reset_failed_dev;
+ queue_work(nvme_workq, &dev->reset_work);
+ continue;
+ }
+ for (i = 0; i < dev->queue_count; i++) {
+ struct nvme_queue *nvmeq = dev->queues[i];
+ if (!nvmeq)
+ continue;
+ spin_lock_irq(&nvmeq->q_lock);
+ nvme_process_cq(nvmeq);
+
+ while ((i == 0) && (dev->event_limit > 0)) {
+ if (nvme_submit_async_admin_req(dev))
+ break;
+ dev->event_limit--;
+ }
+ spin_unlock_irq(&nvmeq->q_lock);
+ }
+ }
+ spin_unlock(&dev_list_lock);
+ schedule_timeout(round_jiffies_relative(HZ));
+ }
+ return 0;
+}
+
+static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid)
+{
+ struct nvme_ns *ns;
+ struct gendisk *disk;
+ int node = dev_to_node(&dev->pci_dev->dev);
+
+ ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
+ if (!ns)
+ return;
+
+ ns->queue = blk_mq_init_queue(&dev->tagset);
+ if (IS_ERR(ns->queue))
+ goto out_free_ns;
+ queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
+ queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, ns->queue);
+ ns->dev = dev;
+ ns->queue->queuedata = ns;
+
+ disk = alloc_disk_node(0, node);
+ if (!disk)
+ goto out_free_queue;
+
+ ns->ns_id = nsid;
+ ns->disk = disk;
+ ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
+ list_add_tail(&ns->list, &dev->namespaces);
+
+ blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
+ if (dev->max_hw_sectors)
+ blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
+ if (dev->stripe_size)
+ blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9);
+ if (dev->vwc & NVME_CTRL_VWC_PRESENT)
+ blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
+
+ disk->major = nvme_major;
+ disk->first_minor = 0;
+ disk->fops = &nvme_fops;
+ disk->private_data = ns;
+ disk->queue = ns->queue;
+ disk->driverfs_dev = dev->device;
+ disk->flags = GENHD_FL_EXT_DEVT;
+ sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid);
+
+ /*
+ * Initialize capacity to 0 until we establish the namespace format and
+ * setup integrity extentions if necessary. The revalidate_disk after
+ * add_disk allows the driver to register with integrity if the format
+ * requires it.
+ */
+ set_capacity(disk, 0);
+ nvme_revalidate_disk(ns->disk);
+ add_disk(ns->disk);
+ if (ns->ms)
+ revalidate_disk(ns->disk);
+ return;
+ out_free_queue:
+ blk_cleanup_queue(ns->queue);
+ out_free_ns:
+ kfree(ns);
+}
+
+static void nvme_create_io_queues(struct nvme_dev *dev)
+{
+ unsigned i;
+
+ for (i = dev->queue_count; i <= dev->max_qid; i++)
+ if (!nvme_alloc_queue(dev, i, dev->q_depth))
+ break;
+
+ for (i = dev->online_queues; i <= dev->queue_count - 1; i++)
+ if (nvme_create_queue(dev->queues[i], i))
+ break;
+}
+
+static int set_queue_count(struct nvme_dev *dev, int count)
+{
+ int status;
+ u32 result;
+ u32 q_count = (count - 1) | ((count - 1) << 16);
+
+ status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0,
+ &result);
+ if (status < 0)
+ return status;
+ if (status > 0) {
+ dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n",
+ status);
+ return 0;
+ }
+ return min(result & 0xffff, result >> 16) + 1;
+}
+
+static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
+{
+ return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
+}
+
+static int nvme_setup_io_queues(struct nvme_dev *dev)
+{
+ struct nvme_queue *adminq = dev->queues[0];
+ struct pci_dev *pdev = dev->pci_dev;
+ int result, i, vecs, nr_io_queues, size;
+
+ nr_io_queues = num_possible_cpus();
+ result = set_queue_count(dev, nr_io_queues);
+ if (result <= 0)
+ return result;
+ if (result < nr_io_queues)
+ nr_io_queues = result;
+
+ size = db_bar_size(dev, nr_io_queues);
+ if (size > 8192) {
+ iounmap(dev->bar);
+ do {
+ dev->bar = ioremap(pci_resource_start(pdev, 0), size);
+ if (dev->bar)
+ break;
+ if (!--nr_io_queues)
+ return -ENOMEM;
+ size = db_bar_size(dev, nr_io_queues);
+ } while (1);
+ dev->dbs = ((void __iomem *)dev->bar) + 4096;
+ adminq->q_db = dev->dbs;
+ }
+
+ /* Deregister the admin queue's interrupt */
+ free_irq(dev->entry[0].vector, adminq);
+
+ /*
+ * If we enable msix early due to not intx, disable it again before
+ * setting up the full range we need.
+ */
+ if (!pdev->irq)
+ pci_disable_msix(pdev);
+
+ for (i = 0; i < nr_io_queues; i++)
+ dev->entry[i].entry = i;
+ vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues);
+ if (vecs < 0) {
+ vecs = pci_enable_msi_range(pdev, 1, min(nr_io_queues, 32));
+ if (vecs < 0) {
+ vecs = 1;
+ } else {
+ for (i = 0; i < vecs; i++)
+ dev->entry[i].vector = i + pdev->irq;
+ }
+ }
+
+ /*
+ * Should investigate if there's a performance win from allocating
+ * more queues than interrupt vectors; it might allow the submission
+ * path to scale better, even if the receive path is limited by the
+ * number of interrupts.
+ */
+ nr_io_queues = vecs;
+ dev->max_qid = nr_io_queues;
+
+ result = queue_request_irq(dev, adminq, adminq->irqname);
+ if (result)
+ goto free_queues;
+
+ /* Free previously allocated queues that are no longer usable */
+ nvme_free_queues(dev, nr_io_queues + 1);
+ nvme_create_io_queues(dev);
+
+ return 0;
+
+ free_queues:
+ nvme_free_queues(dev, 1);
+ return result;
+}
+
+/*
+ * Return: error value if an error occurred setting up the queues or calling
+ * Identify Device. 0 if these succeeded, even if adding some of the
+ * namespaces failed. At the moment, these failures are silent. TBD which
+ * failures should be reported.
+ */
+static int nvme_dev_add(struct nvme_dev *dev)
+{
+ struct pci_dev *pdev = dev->pci_dev;
+ int res;
+ unsigned nn, i;
+ struct nvme_id_ctrl *ctrl;
+ void *mem;
+ dma_addr_t dma_addr;
+ int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12;
+
+ mem = dma_alloc_coherent(&pdev->dev, 4096, &dma_addr, GFP_KERNEL);
+ if (!mem)
+ return -ENOMEM;
+
+ res = nvme_identify(dev, 0, 1, dma_addr);
+ if (res) {
+ dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res);
+ dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr);
+ return -EIO;
+ }
+
+ ctrl = mem;
+ nn = le32_to_cpup(&ctrl->nn);
+ dev->oncs = le16_to_cpup(&ctrl->oncs);
+ dev->abort_limit = ctrl->acl + 1;
+ dev->vwc = ctrl->vwc;
+ memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn));
+ memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn));
+ memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr));
+ if (ctrl->mdts)
+ dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9);
+ if ((pdev->vendor == PCI_VENDOR_ID_INTEL) &&
+ (pdev->device == 0x0953) && ctrl->vs[3]) {
+ unsigned int max_hw_sectors;
+
+ dev->stripe_size = 1 << (ctrl->vs[3] + shift);
+ max_hw_sectors = dev->stripe_size >> (shift - 9);
+ if (dev->max_hw_sectors) {
+ dev->max_hw_sectors = min(max_hw_sectors,
+ dev->max_hw_sectors);
+ } else
+ dev->max_hw_sectors = max_hw_sectors;
+ }
+ dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr);
+
+ dev->tagset.ops = &nvme_mq_ops;
+ dev->tagset.nr_hw_queues = dev->online_queues - 1;
+ dev->tagset.timeout = NVME_IO_TIMEOUT;
+ dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev);
+ dev->tagset.queue_depth =
+ min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
+ dev->tagset.cmd_size = nvme_cmd_size(dev);
+ dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
+ dev->tagset.driver_data = dev;
+
+ if (blk_mq_alloc_tag_set(&dev->tagset))
+ return 0;
+
+ for (i = 1; i <= nn; i++)
+ nvme_alloc_ns(dev, i);
+
+ return 0;
+}
+
+static int nvme_dev_map(struct nvme_dev *dev)
+{
+ u64 cap;
+ int bars, result = -ENOMEM;
+ struct pci_dev *pdev = dev->pci_dev;
+
+ if (pci_enable_device_mem(pdev))
+ return result;
+
+ dev->entry[0].vector = pdev->irq;
+ pci_set_master(pdev);
+ bars = pci_select_bars(pdev, IORESOURCE_MEM);
+ if (!bars)
+ goto disable_pci;
+
+ if (pci_request_selected_regions(pdev, bars, "nvme"))
+ goto disable_pci;
+
+ if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) &&
+ dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)))
+ goto disable;
+
+ dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
+ if (!dev->bar)
+ goto disable;
+
+ if (readl(&dev->bar->csts) == -1) {
+ result = -ENODEV;
+ goto unmap;
+ }
+
+ /*
+ * Some devices don't advertse INTx interrupts, pre-enable a single
+ * MSIX vec for setup. We'll adjust this later.
+ */
+ if (!pdev->irq) {
+ result = pci_enable_msix(pdev, dev->entry, 1);
+ if (result < 0)
+ goto unmap;
+ }
+
+ cap = readq(&dev->bar->cap);
+ dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
+ dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
+ dev->dbs = ((void __iomem *)dev->bar) + 4096;
+
+ return 0;
+
+ unmap:
+ iounmap(dev->bar);
+ dev->bar = NULL;
+ disable:
+ pci_release_regions(pdev);
+ disable_pci:
+ pci_disable_device(pdev);
+ return result;
+}
+
+static void nvme_dev_unmap(struct nvme_dev *dev)
+{
+ if (dev->pci_dev->msi_enabled)
+ pci_disable_msi(dev->pci_dev);
+ else if (dev->pci_dev->msix_enabled)
+ pci_disable_msix(dev->pci_dev);
+
+ if (dev->bar) {
+ iounmap(dev->bar);
+ dev->bar = NULL;
+ pci_release_regions(dev->pci_dev);
+ }
+
+ if (pci_is_enabled(dev->pci_dev))
+ pci_disable_device(dev->pci_dev);
+}
+
+struct nvme_delq_ctx {
+ struct task_struct *waiter;
+ struct kthread_worker *worker;
+ atomic_t refcount;
+};
+
+static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev)
+{
+ dq->waiter = current;
+ mb();
+
+ for (;;) {
+ set_current_state(TASK_KILLABLE);
+ if (!atomic_read(&dq->refcount))
+ break;
+ if (!schedule_timeout(ADMIN_TIMEOUT) ||
+ fatal_signal_pending(current)) {
+ /*
+ * Disable the controller first since we can't trust it
+ * at this point, but leave the admin queue enabled
+ * until all queue deletion requests are flushed.
+ * FIXME: This may take a while if there are more h/w
+ * queues than admin tags.
+ */
+ set_current_state(TASK_RUNNING);
+ nvme_disable_ctrl(dev, readq(&dev->bar->cap));
+ nvme_clear_queue(dev->queues[0]);
+ flush_kthread_worker(dq->worker);
+ nvme_disable_queue(dev, 0);
+ return;
+ }
+ }
+ set_current_state(TASK_RUNNING);
+}
+
+static void nvme_put_dq(struct nvme_delq_ctx *dq)
+{
+ atomic_dec(&dq->refcount);
+ if (dq->waiter)
+ wake_up_process(dq->waiter);
+}
+
+static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq)
+{
+ atomic_inc(&dq->refcount);
+ return dq;
+}
+
+static void nvme_del_queue_end(struct nvme_queue *nvmeq)
+{
+ struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx;
+ nvme_put_dq(dq);
+}
+
+static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode,
+ kthread_work_func_t fn)
+{
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+ c.delete_queue.opcode = opcode;
+ c.delete_queue.qid = cpu_to_le16(nvmeq->qid);
+
+ init_kthread_work(&nvmeq->cmdinfo.work, fn);
+ return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo,
+ ADMIN_TIMEOUT);
+}
+
+static void nvme_del_cq_work_handler(struct kthread_work *work)
+{
+ struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
+ cmdinfo.work);
+ nvme_del_queue_end(nvmeq);
+}
+
+static int nvme_delete_cq(struct nvme_queue *nvmeq)
+{
+ return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq,
+ nvme_del_cq_work_handler);
+}
+
+static void nvme_del_sq_work_handler(struct kthread_work *work)
+{
+ struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
+ cmdinfo.work);
+ int status = nvmeq->cmdinfo.status;
+
+ if (!status)
+ status = nvme_delete_cq(nvmeq);
+ if (status)
+ nvme_del_queue_end(nvmeq);
+}
+
+static int nvme_delete_sq(struct nvme_queue *nvmeq)
+{
+ return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq,
+ nvme_del_sq_work_handler);
+}
+
+static void nvme_del_queue_start(struct kthread_work *work)
+{
+ struct nvme_queue *nvmeq = container_of(work, struct nvme_queue,
+ cmdinfo.work);
+ if (nvme_delete_sq(nvmeq))
+ nvme_del_queue_end(nvmeq);
+}
+
+static void nvme_disable_io_queues(struct nvme_dev *dev)
+{
+ int i;
+ DEFINE_KTHREAD_WORKER_ONSTACK(worker);
+ struct nvme_delq_ctx dq;
+ struct task_struct *kworker_task = kthread_run(kthread_worker_fn,
+ &worker, "nvme%d", dev->instance);
+
+ if (IS_ERR(kworker_task)) {
+ dev_err(&dev->pci_dev->dev,
+ "Failed to create queue del task\n");
+ for (i = dev->queue_count - 1; i > 0; i--)
+ nvme_disable_queue(dev, i);
+ return;
+ }
+
+ dq.waiter = NULL;
+ atomic_set(&dq.refcount, 0);
+ dq.worker = &worker;
+ for (i = dev->queue_count - 1; i > 0; i--) {
+ struct nvme_queue *nvmeq = dev->queues[i];
+
+ if (nvme_suspend_queue(nvmeq))
+ continue;
+ nvmeq->cmdinfo.ctx = nvme_get_dq(&dq);
+ nvmeq->cmdinfo.worker = dq.worker;
+ init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start);
+ queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work);
+ }
+ nvme_wait_dq(&dq, dev);
+ kthread_stop(kworker_task);
+}
+
+/*
+* Remove the node from the device list and check
+* for whether or not we need to stop the nvme_thread.
+*/
+static void nvme_dev_list_remove(struct nvme_dev *dev)
+{
+ struct task_struct *tmp = NULL;
+
+ spin_lock(&dev_list_lock);
+ list_del_init(&dev->node);
+ if (list_empty(&dev_list) && !IS_ERR_OR_NULL(nvme_thread)) {
+ tmp = nvme_thread;
+ nvme_thread = NULL;
+ }
+ spin_unlock(&dev_list_lock);
+
+ if (tmp)
+ kthread_stop(tmp);
+}
+
+static void nvme_freeze_queues(struct nvme_dev *dev)
+{
+ struct nvme_ns *ns;
+
+ list_for_each_entry(ns, &dev->namespaces, list) {
+ blk_mq_freeze_queue_start(ns->queue);
+
+ spin_lock(ns->queue->queue_lock);
+ queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue);
+ spin_unlock(ns->queue->queue_lock);
+
+ blk_mq_cancel_requeue_work(ns->queue);
+ blk_mq_stop_hw_queues(ns->queue);
+ }
+}
+
+static void nvme_unfreeze_queues(struct nvme_dev *dev)
+{
+ struct nvme_ns *ns;
+
+ list_for_each_entry(ns, &dev->namespaces, list) {
+ queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue);
+ blk_mq_unfreeze_queue(ns->queue);
+ blk_mq_start_stopped_hw_queues(ns->queue, true);
+ blk_mq_kick_requeue_list(ns->queue);
+ }
+}
+
+static void nvme_dev_shutdown(struct nvme_dev *dev)
+{
+ int i;
+ u32 csts = -1;
+
+ nvme_dev_list_remove(dev);
+
+ if (dev->bar) {
+ nvme_freeze_queues(dev);
+ csts = readl(&dev->bar->csts);
+ }
+ if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
+ for (i = dev->queue_count - 1; i >= 0; i--) {
+ struct nvme_queue *nvmeq = dev->queues[i];
+ nvme_suspend_queue(nvmeq);
+ }
+ } else {
+ nvme_disable_io_queues(dev);
+ nvme_shutdown_ctrl(dev);
+ nvme_disable_queue(dev, 0);
+ }
+ nvme_dev_unmap(dev);
+
+ for (i = dev->queue_count - 1; i >= 0; i--)
+ nvme_clear_queue(dev->queues[i]);
+}
+
+static void nvme_dev_remove(struct nvme_dev *dev)
+{
+ struct nvme_ns *ns;
+
+ list_for_each_entry(ns, &dev->namespaces, list) {
+ if (ns->disk->flags & GENHD_FL_UP) {
+ if (blk_get_integrity(ns->disk))
+ blk_integrity_unregister(ns->disk);
+ del_gendisk(ns->disk);
+ }
+ if (!blk_queue_dying(ns->queue)) {
+ blk_mq_abort_requeue_list(ns->queue);
+ blk_cleanup_queue(ns->queue);
+ }
+ }
+}
+
+static int nvme_setup_prp_pools(struct nvme_dev *dev)
+{
+ struct device *dmadev = &dev->pci_dev->dev;
+ dev->prp_page_pool = dma_pool_create("prp list page", dmadev,
+ PAGE_SIZE, PAGE_SIZE, 0);
+ if (!dev->prp_page_pool)
+ return -ENOMEM;
+
+ /* Optimisation for I/Os between 4k and 128k */
+ dev->prp_small_pool = dma_pool_create("prp list 256", dmadev,
+ 256, 256, 0);
+ if (!dev->prp_small_pool) {
+ dma_pool_destroy(dev->prp_page_pool);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static void nvme_release_prp_pools(struct nvme_dev *dev)
+{
+ dma_pool_destroy(dev->prp_page_pool);
+ dma_pool_destroy(dev->prp_small_pool);
+}
+
+static DEFINE_IDA(nvme_instance_ida);
+
+static int nvme_set_instance(struct nvme_dev *dev)
+{
+ int instance, error;
+
+ do {
+ if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
+ return -ENODEV;
+
+ spin_lock(&dev_list_lock);
+ error = ida_get_new(&nvme_instance_ida, &instance);
+ spin_unlock(&dev_list_lock);
+ } while (error == -EAGAIN);
+
+ if (error)
+ return -ENODEV;
+
+ dev->instance = instance;
+ return 0;
+}
+
+static void nvme_release_instance(struct nvme_dev *dev)
+{
+ spin_lock(&dev_list_lock);
+ ida_remove(&nvme_instance_ida, dev->instance);
+ spin_unlock(&dev_list_lock);
+}
+
+static void nvme_free_namespaces(struct nvme_dev *dev)
+{
+ struct nvme_ns *ns, *next;
+
+ list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
+ list_del(&ns->list);
+
+ spin_lock(&dev_list_lock);
+ ns->disk->private_data = NULL;
+ spin_unlock(&dev_list_lock);
+
+ put_disk(ns->disk);
+ kfree(ns);
+ }
+}
+
+static void nvme_free_dev(struct kref *kref)
+{
+ struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref);
+
+ pci_dev_put(dev->pci_dev);
+ put_device(dev->device);
+ nvme_free_namespaces(dev);
+ nvme_release_instance(dev);
+ blk_mq_free_tag_set(&dev->tagset);
+ blk_put_queue(dev->admin_q);
+ kfree(dev->queues);
+ kfree(dev->entry);
+ kfree(dev);
+}
+
+static int nvme_dev_open(struct inode *inode, struct file *f)
+{
+ struct nvme_dev *dev;
+ int instance = iminor(inode);
+ int ret = -ENODEV;
+
+ spin_lock(&dev_list_lock);
+ list_for_each_entry(dev, &dev_list, node) {
+ if (dev->instance == instance) {
+ if (!dev->admin_q) {
+ ret = -EWOULDBLOCK;
+ break;
+ }
+ if (!kref_get_unless_zero(&dev->kref))
+ break;
+ f->private_data = dev;
+ ret = 0;
+ break;
+ }
+ }
+ spin_unlock(&dev_list_lock);
+
+ return ret;
+}
+
+static int nvme_dev_release(struct inode *inode, struct file *f)
+{
+ struct nvme_dev *dev = f->private_data;
+ kref_put(&dev->kref, nvme_free_dev);
+ return 0;
+}
+
+static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg)
+{
+ struct nvme_dev *dev = f->private_data;
+ struct nvme_ns *ns;
+
+ switch (cmd) {
+ case NVME_IOCTL_ADMIN_CMD:
+ return nvme_user_cmd(dev, NULL, (void __user *)arg);
+ case NVME_IOCTL_IO_CMD:
+ if (list_empty(&dev->namespaces))
+ return -ENOTTY;
+ ns = list_first_entry(&dev->namespaces, struct nvme_ns, list);
+ return nvme_user_cmd(dev, ns, (void __user *)arg);
+ default:
+ return -ENOTTY;
+ }
+}
+
+static const struct file_operations nvme_dev_fops = {
+ .owner = THIS_MODULE,
+ .open = nvme_dev_open,
+ .release = nvme_dev_release,
+ .unlocked_ioctl = nvme_dev_ioctl,
+ .compat_ioctl = nvme_dev_ioctl,
+};
+
+static void nvme_set_irq_hints(struct nvme_dev *dev)
+{
+ struct nvme_queue *nvmeq;
+ int i;
+
+ for (i = 0; i < dev->online_queues; i++) {
+ nvmeq = dev->queues[i];
+
+ if (!nvmeq->hctx)
+ continue;
+
+ irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
+ nvmeq->hctx->cpumask);
+ }
+}
+
+static int nvme_dev_start(struct nvme_dev *dev)
+{
+ int result;
+ bool start_thread = false;
+
+ result = nvme_dev_map(dev);
+ if (result)
+ return result;
+
+ result = nvme_configure_admin_queue(dev);
+ if (result)
+ goto unmap;
+
+ spin_lock(&dev_list_lock);
+ if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) {
+ start_thread = true;
+ nvme_thread = NULL;
+ }
+ list_add(&dev->node, &dev_list);
+ spin_unlock(&dev_list_lock);
+
+ if (start_thread) {
+ nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
+ wake_up_all(&nvme_kthread_wait);
+ } else
+ wait_event_killable(nvme_kthread_wait, nvme_thread);
+
+ if (IS_ERR_OR_NULL(nvme_thread)) {
+ result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR;
+ goto disable;
+ }
+
+ nvme_init_queue(dev->queues[0], 0);
+ result = nvme_alloc_admin_tags(dev);
+ if (result)
+ goto disable;
+
+ result = nvme_setup_io_queues(dev);
+ if (result)
+ goto free_tags;
+
+ nvme_set_irq_hints(dev);
+
+ dev->event_limit = 1;
+ return result;
+
+ free_tags:
+ nvme_dev_remove_admin(dev);
+ disable:
+ nvme_disable_queue(dev, 0);
+ nvme_dev_list_remove(dev);
+ unmap:
+ nvme_dev_unmap(dev);
+ return result;
+}
+
+static int nvme_remove_dead_ctrl(void *arg)
+{
+ struct nvme_dev *dev = (struct nvme_dev *)arg;
+ struct pci_dev *pdev = dev->pci_dev;
+
+ if (pci_get_drvdata(pdev))
+ pci_stop_and_remove_bus_device_locked(pdev);
+ kref_put(&dev->kref, nvme_free_dev);
+ return 0;
+}
+
+static void nvme_remove_disks(struct work_struct *ws)
+{
+ struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
+
+ nvme_free_queues(dev, 1);
+ nvme_dev_remove(dev);
+}
+
+static int nvme_dev_resume(struct nvme_dev *dev)
+{
+ int ret;
+
+ ret = nvme_dev_start(dev);
+ if (ret)
+ return ret;
+ if (dev->online_queues < 2) {
+ spin_lock(&dev_list_lock);
+ dev->reset_workfn = nvme_remove_disks;
+ queue_work(nvme_workq, &dev->reset_work);
+ spin_unlock(&dev_list_lock);
+ } else {
+ nvme_unfreeze_queues(dev);
+ nvme_set_irq_hints(dev);
+ }
+ return 0;
+}
+
+static void nvme_dev_reset(struct nvme_dev *dev)
+{
+ nvme_dev_shutdown(dev);
+ if (nvme_dev_resume(dev)) {
+ dev_warn(&dev->pci_dev->dev, "Device failed to resume\n");
+ kref_get(&dev->kref);
+ if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d",
+ dev->instance))) {
+ dev_err(&dev->pci_dev->dev,
+ "Failed to start controller remove task\n");
+ kref_put(&dev->kref, nvme_free_dev);
+ }
+ }
+}
+
+static void nvme_reset_failed_dev(struct work_struct *ws)
+{
+ struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work);
+ nvme_dev_reset(dev);
+}
+
+static void nvme_reset_workfn(struct work_struct *work)
+{
+ struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
+ dev->reset_workfn(work);
+}
+
+static void nvme_async_probe(struct work_struct *work);
+static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ int node, result = -ENOMEM;
+ struct nvme_dev *dev;
+
+ node = dev_to_node(&pdev->dev);
+ if (node == NUMA_NO_NODE)
+ set_dev_node(&pdev->dev, 0);
+
+ dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
+ if (!dev)
+ return -ENOMEM;
+ dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry),
+ GFP_KERNEL, node);
+ if (!dev->entry)
+ goto free;
+ dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
+ GFP_KERNEL, node);
+ if (!dev->queues)
+ goto free;
+
+ INIT_LIST_HEAD(&dev->namespaces);
+ dev->reset_workfn = nvme_reset_failed_dev;
+ INIT_WORK(&dev->reset_work, nvme_reset_workfn);
+ dev->pci_dev = pci_dev_get(pdev);
+ pci_set_drvdata(pdev, dev);
+ result = nvme_set_instance(dev);
+ if (result)
+ goto put_pci;
+
+ result = nvme_setup_prp_pools(dev);
+ if (result)
+ goto release;
+
+ kref_init(&dev->kref);
+ dev->device = device_create(nvme_class, &pdev->dev,
+ MKDEV(nvme_char_major, dev->instance),
+ dev, "nvme%d", dev->instance);
+ if (IS_ERR(dev->device)) {
+ result = PTR_ERR(dev->device);
+ goto release_pools;
+ }
+ get_device(dev->device);
+
+ INIT_LIST_HEAD(&dev->node);
+ INIT_WORK(&dev->probe_work, nvme_async_probe);
+ schedule_work(&dev->probe_work);
+ return 0;
+
+ release_pools:
+ nvme_release_prp_pools(dev);
+ release:
+ nvme_release_instance(dev);
+ put_pci:
+ pci_dev_put(dev->pci_dev);
+ free:
+ kfree(dev->queues);
+ kfree(dev->entry);
+ kfree(dev);
+ return result;
+}
+
+static void nvme_async_probe(struct work_struct *work)
+{
+ struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work);
+ int result;
+
+ result = nvme_dev_start(dev);
+ if (result)
+ goto reset;
+
+ if (dev->online_queues > 1)
+ result = nvme_dev_add(dev);
+ if (result)
+ goto reset;
+
+ nvme_set_irq_hints(dev);
+ return;
+ reset:
+ if (!work_busy(&dev->reset_work)) {
+ dev->reset_workfn = nvme_reset_failed_dev;
+ queue_work(nvme_workq, &dev->reset_work);
+ }
+}
+
+static void nvme_reset_notify(struct pci_dev *pdev, bool prepare)
+{
+ struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+ if (prepare)
+ nvme_dev_shutdown(dev);
+ else
+ nvme_dev_resume(dev);
+}
+
+static void nvme_shutdown(struct pci_dev *pdev)
+{
+ struct nvme_dev *dev = pci_get_drvdata(pdev);
+ nvme_dev_shutdown(dev);
+}
+
+static void nvme_remove(struct pci_dev *pdev)
+{
+ struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+ spin_lock(&dev_list_lock);
+ list_del_init(&dev->node);
+ spin_unlock(&dev_list_lock);
+
+ pci_set_drvdata(pdev, NULL);
+ flush_work(&dev->probe_work);
+ flush_work(&dev->reset_work);
+ nvme_dev_shutdown(dev);
+ nvme_dev_remove(dev);
+ nvme_dev_remove_admin(dev);
+ device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance));
+ nvme_free_queues(dev, 0);
+ nvme_release_prp_pools(dev);
+ kref_put(&dev->kref, nvme_free_dev);
+}
+
+/* These functions are yet to be implemented */
+#define nvme_error_detected NULL
+#define nvme_dump_registers NULL
+#define nvme_link_reset NULL
+#define nvme_slot_reset NULL
+#define nvme_error_resume NULL
+
+#ifdef CONFIG_PM_SLEEP
+static int nvme_suspend(struct device *dev)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct nvme_dev *ndev = pci_get_drvdata(pdev);
+
+ nvme_dev_shutdown(ndev);
+ return 0;
+}
+
+static int nvme_resume(struct device *dev)
+{
+ struct pci_dev *pdev = to_pci_dev(dev);
+ struct nvme_dev *ndev = pci_get_drvdata(pdev);
+
+ if (nvme_dev_resume(ndev) && !work_busy(&ndev->reset_work)) {
+ ndev->reset_workfn = nvme_reset_failed_dev;
+ queue_work(nvme_workq, &ndev->reset_work);
+ }
+ return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);
+
+static const struct pci_error_handlers nvme_err_handler = {
+ .error_detected = nvme_error_detected,
+ .mmio_enabled = nvme_dump_registers,
+ .link_reset = nvme_link_reset,
+ .slot_reset = nvme_slot_reset,
+ .resume = nvme_error_resume,
+ .reset_notify = nvme_reset_notify,
+};
+
+/* Move to pci_ids.h later */
+#define PCI_CLASS_STORAGE_EXPRESS 0x010802
+
+static const struct pci_device_id nvme_id_table[] = {
+ { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
+ { 0, }
+};
+MODULE_DEVICE_TABLE(pci, nvme_id_table);
+
+static struct pci_driver nvme_driver = {
+ .name = "nvme",
+ .id_table = nvme_id_table,
+ .probe = nvme_probe,
+ .remove = nvme_remove,
+ .shutdown = nvme_shutdown,
+ .driver = {
+ .pm = &nvme_dev_pm_ops,
+ },
+ .err_handler = &nvme_err_handler,
+};
+
+static int __init nvme_init(void)
+{
+ int result;
+
+ init_waitqueue_head(&nvme_kthread_wait);
+
+ nvme_workq = create_singlethread_workqueue("nvme");
+ if (!nvme_workq)
+ return -ENOMEM;
+
+ result = register_blkdev(nvme_major, "nvme");
+ if (result < 0)
+ goto kill_workq;
+ else if (result > 0)
+ nvme_major = result;
+
+ result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
+ &nvme_dev_fops);
+ if (result < 0)
+ goto unregister_blkdev;
+ else if (result > 0)
+ nvme_char_major = result;
+
+ nvme_class = class_create(THIS_MODULE, "nvme");
+ if (IS_ERR(nvme_class)) {
+ result = PTR_ERR(nvme_class);
+ goto unregister_chrdev;
+ }
+
+ result = pci_register_driver(&nvme_driver);
+ if (result)
+ goto destroy_class;
+ return 0;
+
+ destroy_class:
+ class_destroy(nvme_class);
+ unregister_chrdev:
+ __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
+ unregister_blkdev:
+ unregister_blkdev(nvme_major, "nvme");
+ kill_workq:
+ destroy_workqueue(nvme_workq);
+ return result;
+}
+
+static void __exit nvme_exit(void)
+{
+ pci_unregister_driver(&nvme_driver);
+ unregister_blkdev(nvme_major, "nvme");
+ destroy_workqueue(nvme_workq);
+ class_destroy(nvme_class);
+ __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
+ BUG_ON(nvme_thread && !IS_ERR(nvme_thread));
+ _nvme_check_size();
+}
+
+MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1.0");
+module_init(nvme_init);
+module_exit(nvme_exit);
diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c
new file mode 100644
index 000000000..44f2514fb
--- /dev/null
+++ b/drivers/block/nvme-scsi.c
@@ -0,0 +1,3070 @@
+/*
+ * NVM Express device driver
+ * Copyright (c) 2011-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+/*
+ * Refer to the SCSI-NVMe Translation spec for details on how
+ * each command is translated.
+ */
+
+#include <linux/nvme.h>
+#include <linux/bio.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/compat.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kdev_t.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/poison.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <scsi/sg.h>
+#include <scsi/scsi.h>
+
+
+static int sg_version_num = 30534; /* 2 digits for each component */
+
+#define SNTI_TRANSLATION_SUCCESS 0
+#define SNTI_INTERNAL_ERROR 1
+
+/* VPD Page Codes */
+#define VPD_SUPPORTED_PAGES 0x00
+#define VPD_SERIAL_NUMBER 0x80
+#define VPD_DEVICE_IDENTIFIERS 0x83
+#define VPD_EXTENDED_INQUIRY 0x86
+#define VPD_BLOCK_LIMITS 0xB0
+#define VPD_BLOCK_DEV_CHARACTERISTICS 0xB1
+
+/* CDB offsets */
+#define REPORT_LUNS_CDB_ALLOC_LENGTH_OFFSET 6
+#define REPORT_LUNS_SR_OFFSET 2
+#define READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET 10
+#define REQUEST_SENSE_CDB_ALLOC_LENGTH_OFFSET 4
+#define REQUEST_SENSE_DESC_OFFSET 1
+#define REQUEST_SENSE_DESC_MASK 0x01
+#define DESCRIPTOR_FORMAT_SENSE_DATA_TYPE 1
+#define INQUIRY_EVPD_BYTE_OFFSET 1
+#define INQUIRY_PAGE_CODE_BYTE_OFFSET 2
+#define INQUIRY_EVPD_BIT_MASK 1
+#define INQUIRY_CDB_ALLOCATION_LENGTH_OFFSET 3
+#define START_STOP_UNIT_CDB_IMMED_OFFSET 1
+#define START_STOP_UNIT_CDB_IMMED_MASK 0x1
+#define START_STOP_UNIT_CDB_POWER_COND_MOD_OFFSET 3
+#define START_STOP_UNIT_CDB_POWER_COND_MOD_MASK 0xF
+#define START_STOP_UNIT_CDB_POWER_COND_OFFSET 4
+#define START_STOP_UNIT_CDB_POWER_COND_MASK 0xF0
+#define START_STOP_UNIT_CDB_NO_FLUSH_OFFSET 4
+#define START_STOP_UNIT_CDB_NO_FLUSH_MASK 0x4
+#define START_STOP_UNIT_CDB_START_OFFSET 4
+#define START_STOP_UNIT_CDB_START_MASK 0x1
+#define WRITE_BUFFER_CDB_MODE_OFFSET 1
+#define WRITE_BUFFER_CDB_MODE_MASK 0x1F
+#define WRITE_BUFFER_CDB_BUFFER_ID_OFFSET 2
+#define WRITE_BUFFER_CDB_BUFFER_OFFSET_OFFSET 3
+#define WRITE_BUFFER_CDB_PARM_LIST_LENGTH_OFFSET 6
+#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_OFFSET 1
+#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_MASK 0xC0
+#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_SHIFT 6
+#define FORMAT_UNIT_CDB_LONG_LIST_OFFSET 1
+#define FORMAT_UNIT_CDB_LONG_LIST_MASK 0x20
+#define FORMAT_UNIT_CDB_FORMAT_DATA_OFFSET 1
+#define FORMAT_UNIT_CDB_FORMAT_DATA_MASK 0x10
+#define FORMAT_UNIT_SHORT_PARM_LIST_LEN 4
+#define FORMAT_UNIT_LONG_PARM_LIST_LEN 8
+#define FORMAT_UNIT_PROT_INT_OFFSET 3
+#define FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET 0
+#define FORMAT_UNIT_PROT_FIELD_USAGE_MASK 0x07
+#define UNMAP_CDB_PARAM_LIST_LENGTH_OFFSET 7
+
+/* Misc. defines */
+#define NIBBLE_SHIFT 4
+#define FIXED_SENSE_DATA 0x70
+#define DESC_FORMAT_SENSE_DATA 0x72
+#define FIXED_SENSE_DATA_ADD_LENGTH 10
+#define LUN_ENTRY_SIZE 8
+#define LUN_DATA_HEADER_SIZE 8
+#define ALL_LUNS_RETURNED 0x02
+#define ALL_WELL_KNOWN_LUNS_RETURNED 0x01
+#define RESTRICTED_LUNS_RETURNED 0x00
+#define NVME_POWER_STATE_START_VALID 0x00
+#define NVME_POWER_STATE_ACTIVE 0x01
+#define NVME_POWER_STATE_IDLE 0x02
+#define NVME_POWER_STATE_STANDBY 0x03
+#define NVME_POWER_STATE_LU_CONTROL 0x07
+#define POWER_STATE_0 0
+#define POWER_STATE_1 1
+#define POWER_STATE_2 2
+#define POWER_STATE_3 3
+#define DOWNLOAD_SAVE_ACTIVATE 0x05
+#define DOWNLOAD_SAVE_DEFER_ACTIVATE 0x0E
+#define ACTIVATE_DEFERRED_MICROCODE 0x0F
+#define FORMAT_UNIT_IMMED_MASK 0x2
+#define FORMAT_UNIT_IMMED_OFFSET 1
+#define KELVIN_TEMP_FACTOR 273
+#define FIXED_FMT_SENSE_DATA_SIZE 18
+#define DESC_FMT_SENSE_DATA_SIZE 8
+
+/* SCSI/NVMe defines and bit masks */
+#define INQ_STANDARD_INQUIRY_PAGE 0x00
+#define INQ_SUPPORTED_VPD_PAGES_PAGE 0x00
+#define INQ_UNIT_SERIAL_NUMBER_PAGE 0x80
+#define INQ_DEVICE_IDENTIFICATION_PAGE 0x83
+#define INQ_EXTENDED_INQUIRY_DATA_PAGE 0x86
+#define INQ_BDEV_LIMITS_PAGE 0xB0
+#define INQ_BDEV_CHARACTERISTICS_PAGE 0xB1
+#define INQ_SERIAL_NUMBER_LENGTH 0x14
+#define INQ_NUM_SUPPORTED_VPD_PAGES 6
+#define VERSION_SPC_4 0x06
+#define ACA_UNSUPPORTED 0
+#define STANDARD_INQUIRY_LENGTH 36
+#define ADDITIONAL_STD_INQ_LENGTH 31
+#define EXTENDED_INQUIRY_DATA_PAGE_LENGTH 0x3C
+#define RESERVED_FIELD 0
+
+/* SCSI READ/WRITE Defines */
+#define IO_CDB_WP_MASK 0xE0
+#define IO_CDB_WP_SHIFT 5
+#define IO_CDB_FUA_MASK 0x8
+#define IO_6_CDB_LBA_OFFSET 0
+#define IO_6_CDB_LBA_MASK 0x001FFFFF
+#define IO_6_CDB_TX_LEN_OFFSET 4
+#define IO_6_DEFAULT_TX_LEN 256
+#define IO_10_CDB_LBA_OFFSET 2
+#define IO_10_CDB_TX_LEN_OFFSET 7
+#define IO_10_CDB_WP_OFFSET 1
+#define IO_10_CDB_FUA_OFFSET 1
+#define IO_12_CDB_LBA_OFFSET 2
+#define IO_12_CDB_TX_LEN_OFFSET 6
+#define IO_12_CDB_WP_OFFSET 1
+#define IO_12_CDB_FUA_OFFSET 1
+#define IO_16_CDB_FUA_OFFSET 1
+#define IO_16_CDB_WP_OFFSET 1
+#define IO_16_CDB_LBA_OFFSET 2
+#define IO_16_CDB_TX_LEN_OFFSET 10
+
+/* Mode Sense/Select defines */
+#define MODE_PAGE_INFO_EXCEP 0x1C
+#define MODE_PAGE_CACHING 0x08
+#define MODE_PAGE_CONTROL 0x0A
+#define MODE_PAGE_POWER_CONDITION 0x1A
+#define MODE_PAGE_RETURN_ALL 0x3F
+#define MODE_PAGE_BLK_DES_LEN 0x08
+#define MODE_PAGE_LLBAA_BLK_DES_LEN 0x10
+#define MODE_PAGE_CACHING_LEN 0x14
+#define MODE_PAGE_CONTROL_LEN 0x0C
+#define MODE_PAGE_POW_CND_LEN 0x28
+#define MODE_PAGE_INF_EXC_LEN 0x0C
+#define MODE_PAGE_ALL_LEN 0x54
+#define MODE_SENSE6_MPH_SIZE 4
+#define MODE_SENSE6_ALLOC_LEN_OFFSET 4
+#define MODE_SENSE_PAGE_CONTROL_OFFSET 2
+#define MODE_SENSE_PAGE_CONTROL_MASK 0xC0
+#define MODE_SENSE_PAGE_CODE_OFFSET 2
+#define MODE_SENSE_PAGE_CODE_MASK 0x3F
+#define MODE_SENSE_LLBAA_OFFSET 1
+#define MODE_SENSE_LLBAA_MASK 0x10
+#define MODE_SENSE_LLBAA_SHIFT 4
+#define MODE_SENSE_DBD_OFFSET 1
+#define MODE_SENSE_DBD_MASK 8
+#define MODE_SENSE_DBD_SHIFT 3
+#define MODE_SENSE10_MPH_SIZE 8
+#define MODE_SENSE10_ALLOC_LEN_OFFSET 7
+#define MODE_SELECT_CDB_PAGE_FORMAT_OFFSET 1
+#define MODE_SELECT_CDB_SAVE_PAGES_OFFSET 1
+#define MODE_SELECT_6_CDB_PARAM_LIST_LENGTH_OFFSET 4
+#define MODE_SELECT_10_CDB_PARAM_LIST_LENGTH_OFFSET 7
+#define MODE_SELECT_CDB_PAGE_FORMAT_MASK 0x10
+#define MODE_SELECT_CDB_SAVE_PAGES_MASK 0x1
+#define MODE_SELECT_6_BD_OFFSET 3
+#define MODE_SELECT_10_BD_OFFSET 6
+#define MODE_SELECT_10_LLBAA_OFFSET 4
+#define MODE_SELECT_10_LLBAA_MASK 1
+#define MODE_SELECT_6_MPH_SIZE 4
+#define MODE_SELECT_10_MPH_SIZE 8
+#define CACHING_MODE_PAGE_WCE_MASK 0x04
+#define MODE_SENSE_BLK_DESC_ENABLED 0
+#define MODE_SENSE_BLK_DESC_COUNT 1
+#define MODE_SELECT_PAGE_CODE_MASK 0x3F
+#define SHORT_DESC_BLOCK 8
+#define LONG_DESC_BLOCK 16
+#define MODE_PAGE_POW_CND_LEN_FIELD 0x26
+#define MODE_PAGE_INF_EXC_LEN_FIELD 0x0A
+#define MODE_PAGE_CACHING_LEN_FIELD 0x12
+#define MODE_PAGE_CONTROL_LEN_FIELD 0x0A
+#define MODE_SENSE_PC_CURRENT_VALUES 0
+
+/* Log Sense defines */
+#define LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE 0x00
+#define LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH 0x07
+#define LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE 0x2F
+#define LOG_PAGE_TEMPERATURE_PAGE 0x0D
+#define LOG_SENSE_CDB_SP_OFFSET 1
+#define LOG_SENSE_CDB_SP_NOT_ENABLED 0
+#define LOG_SENSE_CDB_PC_OFFSET 2
+#define LOG_SENSE_CDB_PC_MASK 0xC0
+#define LOG_SENSE_CDB_PC_SHIFT 6
+#define LOG_SENSE_CDB_PC_CUMULATIVE_VALUES 1
+#define LOG_SENSE_CDB_PAGE_CODE_MASK 0x3F
+#define LOG_SENSE_CDB_ALLOC_LENGTH_OFFSET 7
+#define REMAINING_INFO_EXCP_PAGE_LENGTH 0x8
+#define LOG_INFO_EXCP_PAGE_LENGTH 0xC
+#define REMAINING_TEMP_PAGE_LENGTH 0xC
+#define LOG_TEMP_PAGE_LENGTH 0x10
+#define LOG_TEMP_UNKNOWN 0xFF
+#define SUPPORTED_LOG_PAGES_PAGE_LENGTH 0x3
+
+/* Read Capacity defines */
+#define READ_CAP_10_RESP_SIZE 8
+#define READ_CAP_16_RESP_SIZE 32
+
+/* NVMe Namespace and Command Defines */
+#define BYTES_TO_DWORDS 4
+#define NVME_MAX_FIRMWARE_SLOT 7
+
+/* Report LUNs defines */
+#define REPORT_LUNS_FIRST_LUN_OFFSET 8
+
+/* SCSI ADDITIONAL SENSE Codes */
+
+#define SCSI_ASC_NO_SENSE 0x00
+#define SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT 0x03
+#define SCSI_ASC_LUN_NOT_READY 0x04
+#define SCSI_ASC_WARNING 0x0B
+#define SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED 0x10
+#define SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED 0x10
+#define SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED 0x10
+#define SCSI_ASC_UNRECOVERED_READ_ERROR 0x11
+#define SCSI_ASC_MISCOMPARE_DURING_VERIFY 0x1D
+#define SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID 0x20
+#define SCSI_ASC_ILLEGAL_COMMAND 0x20
+#define SCSI_ASC_ILLEGAL_BLOCK 0x21
+#define SCSI_ASC_INVALID_CDB 0x24
+#define SCSI_ASC_INVALID_LUN 0x25
+#define SCSI_ASC_INVALID_PARAMETER 0x26
+#define SCSI_ASC_FORMAT_COMMAND_FAILED 0x31
+#define SCSI_ASC_INTERNAL_TARGET_FAILURE 0x44
+
+/* SCSI ADDITIONAL SENSE Code Qualifiers */
+
+#define SCSI_ASCQ_CAUSE_NOT_REPORTABLE 0x00
+#define SCSI_ASCQ_FORMAT_COMMAND_FAILED 0x01
+#define SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED 0x01
+#define SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED 0x02
+#define SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED 0x03
+#define SCSI_ASCQ_FORMAT_IN_PROGRESS 0x04
+#define SCSI_ASCQ_POWER_LOSS_EXPECTED 0x08
+#define SCSI_ASCQ_INVALID_LUN_ID 0x09
+
+/**
+ * DEVICE_SPECIFIC_PARAMETER in mode parameter header (see sbc2r16) to
+ * enable DPOFUA support type 0x10 value.
+ */
+#define DEVICE_SPECIFIC_PARAMETER 0
+#define VPD_ID_DESCRIPTOR_LENGTH sizeof(VPD_IDENTIFICATION_DESCRIPTOR)
+
+/* MACROs to extract information from CDBs */
+
+#define GET_OPCODE(cdb) cdb[0]
+
+#define GET_U8_FROM_CDB(cdb, index) (cdb[index] << 0)
+
+#define GET_U16_FROM_CDB(cdb, index) ((cdb[index] << 8) | (cdb[index + 1] << 0))
+
+#define GET_U24_FROM_CDB(cdb, index) ((cdb[index] << 16) | \
+(cdb[index + 1] << 8) | \
+(cdb[index + 2] << 0))
+
+#define GET_U32_FROM_CDB(cdb, index) ((cdb[index] << 24) | \
+(cdb[index + 1] << 16) | \
+(cdb[index + 2] << 8) | \
+(cdb[index + 3] << 0))
+
+#define GET_U64_FROM_CDB(cdb, index) ((((u64)cdb[index]) << 56) | \
+(((u64)cdb[index + 1]) << 48) | \
+(((u64)cdb[index + 2]) << 40) | \
+(((u64)cdb[index + 3]) << 32) | \
+(((u64)cdb[index + 4]) << 24) | \
+(((u64)cdb[index + 5]) << 16) | \
+(((u64)cdb[index + 6]) << 8) | \
+(((u64)cdb[index + 7]) << 0))
+
+/* Inquiry Helper Macros */
+#define GET_INQ_EVPD_BIT(cdb) \
+((GET_U8_FROM_CDB(cdb, INQUIRY_EVPD_BYTE_OFFSET) & \
+INQUIRY_EVPD_BIT_MASK) ? 1 : 0)
+
+#define GET_INQ_PAGE_CODE(cdb) \
+(GET_U8_FROM_CDB(cdb, INQUIRY_PAGE_CODE_BYTE_OFFSET))
+
+#define GET_INQ_ALLOC_LENGTH(cdb) \
+(GET_U16_FROM_CDB(cdb, INQUIRY_CDB_ALLOCATION_LENGTH_OFFSET))
+
+/* Report LUNs Helper Macros */
+#define GET_REPORT_LUNS_ALLOC_LENGTH(cdb) \
+(GET_U32_FROM_CDB(cdb, REPORT_LUNS_CDB_ALLOC_LENGTH_OFFSET))
+
+/* Read Capacity Helper Macros */
+#define GET_READ_CAP_16_ALLOC_LENGTH(cdb) \
+(GET_U32_FROM_CDB(cdb, READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET))
+
+#define IS_READ_CAP_16(cdb) \
+((cdb[0] == SERVICE_ACTION_IN_16 && cdb[1] == SAI_READ_CAPACITY_16) ? 1 : 0)
+
+/* Request Sense Helper Macros */
+#define GET_REQUEST_SENSE_ALLOC_LENGTH(cdb) \
+(GET_U8_FROM_CDB(cdb, REQUEST_SENSE_CDB_ALLOC_LENGTH_OFFSET))
+
+/* Mode Sense Helper Macros */
+#define GET_MODE_SENSE_DBD(cdb) \
+((GET_U8_FROM_CDB(cdb, MODE_SENSE_DBD_OFFSET) & MODE_SENSE_DBD_MASK) >> \
+MODE_SENSE_DBD_SHIFT)
+
+#define GET_MODE_SENSE_LLBAA(cdb) \
+((GET_U8_FROM_CDB(cdb, MODE_SENSE_LLBAA_OFFSET) & \
+MODE_SENSE_LLBAA_MASK) >> MODE_SENSE_LLBAA_SHIFT)
+
+#define GET_MODE_SENSE_MPH_SIZE(cdb10) \
+(cdb10 ? MODE_SENSE10_MPH_SIZE : MODE_SENSE6_MPH_SIZE)
+
+
+/* Struct to gather data that needs to be extracted from a SCSI CDB.
+ Not conforming to any particular CDB variant, but compatible with all. */
+
+struct nvme_trans_io_cdb {
+ u8 fua;
+ u8 prot_info;
+ u64 lba;
+ u32 xfer_len;
+};
+
+
+/* Internal Helper Functions */
+
+
+/* Copy data to userspace memory */
+
+static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from,
+ unsigned long n)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ unsigned long not_copied;
+ int i;
+ void *index = from;
+ size_t remaining = n;
+ size_t xfer_len;
+
+ if (hdr->iovec_count > 0) {
+ struct sg_iovec sgl;
+
+ for (i = 0; i < hdr->iovec_count; i++) {
+ not_copied = copy_from_user(&sgl, hdr->dxferp +
+ i * sizeof(struct sg_iovec),
+ sizeof(struct sg_iovec));
+ if (not_copied)
+ return -EFAULT;
+ xfer_len = min(remaining, sgl.iov_len);
+ not_copied = copy_to_user(sgl.iov_base, index,
+ xfer_len);
+ if (not_copied) {
+ res = -EFAULT;
+ break;
+ }
+ index += xfer_len;
+ remaining -= xfer_len;
+ if (remaining == 0)
+ break;
+ }
+ return res;
+ }
+ not_copied = copy_to_user(hdr->dxferp, from, n);
+ if (not_copied)
+ res = -EFAULT;
+ return res;
+}
+
+/* Copy data from userspace memory */
+
+static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to,
+ unsigned long n)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ unsigned long not_copied;
+ int i;
+ void *index = to;
+ size_t remaining = n;
+ size_t xfer_len;
+
+ if (hdr->iovec_count > 0) {
+ struct sg_iovec sgl;
+
+ for (i = 0; i < hdr->iovec_count; i++) {
+ not_copied = copy_from_user(&sgl, hdr->dxferp +
+ i * sizeof(struct sg_iovec),
+ sizeof(struct sg_iovec));
+ if (not_copied)
+ return -EFAULT;
+ xfer_len = min(remaining, sgl.iov_len);
+ not_copied = copy_from_user(index, sgl.iov_base,
+ xfer_len);
+ if (not_copied) {
+ res = -EFAULT;
+ break;
+ }
+ index += xfer_len;
+ remaining -= xfer_len;
+ if (remaining == 0)
+ break;
+ }
+ return res;
+ }
+
+ not_copied = copy_from_user(to, hdr->dxferp, n);
+ if (not_copied)
+ res = -EFAULT;
+ return res;
+}
+
+/* Status/Sense Buffer Writeback */
+
+static int nvme_trans_completion(struct sg_io_hdr *hdr, u8 status, u8 sense_key,
+ u8 asc, u8 ascq)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ u8 xfer_len;
+ u8 resp[DESC_FMT_SENSE_DATA_SIZE];
+
+ if (scsi_status_is_good(status)) {
+ hdr->status = SAM_STAT_GOOD;
+ hdr->masked_status = GOOD;
+ hdr->host_status = DID_OK;
+ hdr->driver_status = DRIVER_OK;
+ hdr->sb_len_wr = 0;
+ } else {
+ hdr->status = status;
+ hdr->masked_status = status >> 1;
+ hdr->host_status = DID_OK;
+ hdr->driver_status = DRIVER_OK;
+
+ memset(resp, 0, DESC_FMT_SENSE_DATA_SIZE);
+ resp[0] = DESC_FORMAT_SENSE_DATA;
+ resp[1] = sense_key;
+ resp[2] = asc;
+ resp[3] = ascq;
+
+ xfer_len = min_t(u8, hdr->mx_sb_len, DESC_FMT_SENSE_DATA_SIZE);
+ hdr->sb_len_wr = xfer_len;
+ if (copy_to_user(hdr->sbp, resp, xfer_len) > 0)
+ res = -EFAULT;
+ }
+
+ return res;
+}
+
+static int nvme_trans_status_code(struct sg_io_hdr *hdr, int nvme_sc)
+{
+ u8 status, sense_key, asc, ascq;
+ int res = SNTI_TRANSLATION_SUCCESS;
+
+ /* For non-nvme (Linux) errors, simply return the error code */
+ if (nvme_sc < 0)
+ return nvme_sc;
+
+ /* Mask DNR, More, and reserved fields */
+ nvme_sc &= 0x7FF;
+
+ switch (nvme_sc) {
+ /* Generic Command Status */
+ case NVME_SC_SUCCESS:
+ status = SAM_STAT_GOOD;
+ sense_key = NO_SENSE;
+ asc = SCSI_ASC_NO_SENSE;
+ ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case NVME_SC_INVALID_OPCODE:
+ status = SAM_STAT_CHECK_CONDITION;
+ sense_key = ILLEGAL_REQUEST;
+ asc = SCSI_ASC_ILLEGAL_COMMAND;
+ ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case NVME_SC_INVALID_FIELD:
+ status = SAM_STAT_CHECK_CONDITION;
+ sense_key = ILLEGAL_REQUEST;
+ asc = SCSI_ASC_INVALID_CDB;
+ ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case NVME_SC_DATA_XFER_ERROR:
+ status = SAM_STAT_CHECK_CONDITION;
+ sense_key = MEDIUM_ERROR;
+ asc = SCSI_ASC_NO_SENSE;
+ ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case NVME_SC_POWER_LOSS:
+ status = SAM_STAT_TASK_ABORTED;
+ sense_key = ABORTED_COMMAND;
+ asc = SCSI_ASC_WARNING;
+ ascq = SCSI_ASCQ_POWER_LOSS_EXPECTED;
+ break;
+ case NVME_SC_INTERNAL:
+ status = SAM_STAT_CHECK_CONDITION;
+ sense_key = HARDWARE_ERROR;
+ asc = SCSI_ASC_INTERNAL_TARGET_FAILURE;
+ ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case NVME_SC_ABORT_REQ:
+ status = SAM_STAT_TASK_ABORTED;
+ sense_key = ABORTED_COMMAND;
+ asc = SCSI_ASC_NO_SENSE;
+ ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case NVME_SC_ABORT_QUEUE:
+ status = SAM_STAT_TASK_ABORTED;
+ sense_key = ABORTED_COMMAND;
+ asc = SCSI_ASC_NO_SENSE;
+ ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case NVME_SC_FUSED_FAIL:
+ status = SAM_STAT_TASK_ABORTED;
+ sense_key = ABORTED_COMMAND;
+ asc = SCSI_ASC_NO_SENSE;
+ ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case NVME_SC_FUSED_MISSING:
+ status = SAM_STAT_TASK_ABORTED;
+ sense_key = ABORTED_COMMAND;
+ asc = SCSI_ASC_NO_SENSE;
+ ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case NVME_SC_INVALID_NS:
+ status = SAM_STAT_CHECK_CONDITION;
+ sense_key = ILLEGAL_REQUEST;
+ asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID;
+ ascq = SCSI_ASCQ_INVALID_LUN_ID;
+ break;
+ case NVME_SC_LBA_RANGE:
+ status = SAM_STAT_CHECK_CONDITION;
+ sense_key = ILLEGAL_REQUEST;
+ asc = SCSI_ASC_ILLEGAL_BLOCK;
+ ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case NVME_SC_CAP_EXCEEDED:
+ status = SAM_STAT_CHECK_CONDITION;
+ sense_key = MEDIUM_ERROR;
+ asc = SCSI_ASC_NO_SENSE;
+ ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case NVME_SC_NS_NOT_READY:
+ status = SAM_STAT_CHECK_CONDITION;
+ sense_key = NOT_READY;
+ asc = SCSI_ASC_LUN_NOT_READY;
+ ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+
+ /* Command Specific Status */
+ case NVME_SC_INVALID_FORMAT:
+ status = SAM_STAT_CHECK_CONDITION;
+ sense_key = ILLEGAL_REQUEST;
+ asc = SCSI_ASC_FORMAT_COMMAND_FAILED;
+ ascq = SCSI_ASCQ_FORMAT_COMMAND_FAILED;
+ break;
+ case NVME_SC_BAD_ATTRIBUTES:
+ status = SAM_STAT_CHECK_CONDITION;
+ sense_key = ILLEGAL_REQUEST;
+ asc = SCSI_ASC_INVALID_CDB;
+ ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+
+ /* Media Errors */
+ case NVME_SC_WRITE_FAULT:
+ status = SAM_STAT_CHECK_CONDITION;
+ sense_key = MEDIUM_ERROR;
+ asc = SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT;
+ ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case NVME_SC_READ_ERROR:
+ status = SAM_STAT_CHECK_CONDITION;
+ sense_key = MEDIUM_ERROR;
+ asc = SCSI_ASC_UNRECOVERED_READ_ERROR;
+ ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case NVME_SC_GUARD_CHECK:
+ status = SAM_STAT_CHECK_CONDITION;
+ sense_key = MEDIUM_ERROR;
+ asc = SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED;
+ ascq = SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED;
+ break;
+ case NVME_SC_APPTAG_CHECK:
+ status = SAM_STAT_CHECK_CONDITION;
+ sense_key = MEDIUM_ERROR;
+ asc = SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED;
+ ascq = SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED;
+ break;
+ case NVME_SC_REFTAG_CHECK:
+ status = SAM_STAT_CHECK_CONDITION;
+ sense_key = MEDIUM_ERROR;
+ asc = SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED;
+ ascq = SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED;
+ break;
+ case NVME_SC_COMPARE_FAILED:
+ status = SAM_STAT_CHECK_CONDITION;
+ sense_key = MISCOMPARE;
+ asc = SCSI_ASC_MISCOMPARE_DURING_VERIFY;
+ ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ case NVME_SC_ACCESS_DENIED:
+ status = SAM_STAT_CHECK_CONDITION;
+ sense_key = ILLEGAL_REQUEST;
+ asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID;
+ ascq = SCSI_ASCQ_INVALID_LUN_ID;
+ break;
+
+ /* Unspecified/Default */
+ case NVME_SC_CMDID_CONFLICT:
+ case NVME_SC_CMD_SEQ_ERROR:
+ case NVME_SC_CQ_INVALID:
+ case NVME_SC_QID_INVALID:
+ case NVME_SC_QUEUE_SIZE:
+ case NVME_SC_ABORT_LIMIT:
+ case NVME_SC_ABORT_MISSING:
+ case NVME_SC_ASYNC_LIMIT:
+ case NVME_SC_FIRMWARE_SLOT:
+ case NVME_SC_FIRMWARE_IMAGE:
+ case NVME_SC_INVALID_VECTOR:
+ case NVME_SC_INVALID_LOG_PAGE:
+ default:
+ status = SAM_STAT_CHECK_CONDITION;
+ sense_key = ILLEGAL_REQUEST;
+ asc = SCSI_ASC_NO_SENSE;
+ ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ break;
+ }
+
+ res = nvme_trans_completion(hdr, status, sense_key, asc, ascq);
+
+ return res;
+}
+
+/* INQUIRY Helper Functions */
+
+static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns,
+ struct sg_io_hdr *hdr, u8 *inq_response,
+ int alloc_len)
+{
+ struct nvme_dev *dev = ns->dev;
+ dma_addr_t dma_addr;
+ void *mem;
+ struct nvme_id_ns *id_ns;
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int nvme_sc;
+ int xfer_len;
+ u8 resp_data_format = 0x02;
+ u8 protect;
+ u8 cmdque = 0x01 << 1;
+ u8 fw_offset = sizeof(dev->firmware_rev);
+
+ mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
+ &dma_addr, GFP_KERNEL);
+ if (mem == NULL) {
+ res = -ENOMEM;
+ goto out_dma;
+ }
+
+ /* nvme ns identify - use DPS value for PROTECT field */
+ nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ /*
+ * If nvme_sc was -ve, res will be -ve here.
+ * If nvme_sc was +ve, the status would bace been translated, and res
+ * can only be 0 or -ve.
+ * - If 0 && nvme_sc > 0, then go into next if where res gets nvme_sc
+ * - If -ve, return because its a Linux error.
+ */
+ if (res)
+ goto out_free;
+ if (nvme_sc) {
+ res = nvme_sc;
+ goto out_free;
+ }
+ id_ns = mem;
+ (id_ns->dps) ? (protect = 0x01) : (protect = 0);
+
+ memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
+ inq_response[2] = VERSION_SPC_4;
+ inq_response[3] = resp_data_format; /*normaca=0 | hisup=0 */
+ inq_response[4] = ADDITIONAL_STD_INQ_LENGTH;
+ inq_response[5] = protect; /* sccs=0 | acc=0 | tpgs=0 | pc3=0 */
+ inq_response[7] = cmdque; /* wbus16=0 | sync=0 | vs=0 */
+ strncpy(&inq_response[8], "NVMe ", 8);
+ strncpy(&inq_response[16], dev->model, 16);
+
+ while (dev->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4)
+ fw_offset--;
+ fw_offset -= 4;
+ strncpy(&inq_response[32], dev->firmware_rev + fw_offset, 4);
+
+ xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
+ res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
+
+ out_free:
+ dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
+ dma_addr);
+ out_dma:
+ return res;
+}
+
+static int nvme_trans_supported_vpd_pages(struct nvme_ns *ns,
+ struct sg_io_hdr *hdr, u8 *inq_response,
+ int alloc_len)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int xfer_len;
+
+ memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
+ inq_response[1] = INQ_SUPPORTED_VPD_PAGES_PAGE; /* Page Code */
+ inq_response[3] = INQ_NUM_SUPPORTED_VPD_PAGES; /* Page Length */
+ inq_response[4] = INQ_SUPPORTED_VPD_PAGES_PAGE;
+ inq_response[5] = INQ_UNIT_SERIAL_NUMBER_PAGE;
+ inq_response[6] = INQ_DEVICE_IDENTIFICATION_PAGE;
+ inq_response[7] = INQ_EXTENDED_INQUIRY_DATA_PAGE;
+ inq_response[8] = INQ_BDEV_CHARACTERISTICS_PAGE;
+ inq_response[9] = INQ_BDEV_LIMITS_PAGE;
+
+ xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
+ res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
+
+ return res;
+}
+
+static int nvme_trans_unit_serial_page(struct nvme_ns *ns,
+ struct sg_io_hdr *hdr, u8 *inq_response,
+ int alloc_len)
+{
+ struct nvme_dev *dev = ns->dev;
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int xfer_len;
+
+ memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
+ inq_response[1] = INQ_UNIT_SERIAL_NUMBER_PAGE; /* Page Code */
+ inq_response[3] = INQ_SERIAL_NUMBER_LENGTH; /* Page Length */
+ strncpy(&inq_response[4], dev->serial, INQ_SERIAL_NUMBER_LENGTH);
+
+ xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH);
+ res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
+
+ return res;
+}
+
+static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 *inq_response, int alloc_len)
+{
+ struct nvme_dev *dev = ns->dev;
+ dma_addr_t dma_addr;
+ void *mem;
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int nvme_sc;
+ int xfer_len;
+ __be32 tmp_id = cpu_to_be32(ns->ns_id);
+
+ mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
+ &dma_addr, GFP_KERNEL);
+ if (mem == NULL) {
+ res = -ENOMEM;
+ goto out_dma;
+ }
+
+ memset(inq_response, 0, alloc_len);
+ inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; /* Page Code */
+ if (readl(&dev->bar->vs) >= NVME_VS(1, 1)) {
+ struct nvme_id_ns *id_ns = mem;
+ void *eui = id_ns->eui64;
+ int len = sizeof(id_ns->eui64);
+
+ nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ if (res)
+ goto out_free;
+ if (nvme_sc) {
+ res = nvme_sc;
+ goto out_free;
+ }
+
+ if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) {
+ if (bitmap_empty(eui, len * 8)) {
+ eui = id_ns->nguid;
+ len = sizeof(id_ns->nguid);
+ }
+ }
+ if (bitmap_empty(eui, len * 8))
+ goto scsi_string;
+
+ inq_response[3] = 4 + len; /* Page Length */
+ /* Designation Descriptor start */
+ inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */
+ inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */
+ inq_response[6] = 0x00; /* Rsvd */
+ inq_response[7] = len; /* Designator Length */
+ memcpy(&inq_response[8], eui, len);
+ } else {
+ scsi_string:
+ if (alloc_len < 72) {
+ res = nvme_trans_completion(hdr,
+ SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ goto out_free;
+ }
+ inq_response[3] = 0x48; /* Page Length */
+ /* Designation Descriptor start */
+ inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */
+ inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */
+ inq_response[6] = 0x00; /* Rsvd */
+ inq_response[7] = 0x44; /* Designator Length */
+
+ sprintf(&inq_response[8], "%04x", dev->pci_dev->vendor);
+ memcpy(&inq_response[12], dev->model, sizeof(dev->model));
+ sprintf(&inq_response[52], "%04x", tmp_id);
+ memcpy(&inq_response[56], dev->serial, sizeof(dev->serial));
+ }
+ xfer_len = alloc_len;
+ res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
+
+ out_free:
+ dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
+ dma_addr);
+ out_dma:
+ return res;
+}
+
+static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ int alloc_len)
+{
+ u8 *inq_response;
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int nvme_sc;
+ struct nvme_dev *dev = ns->dev;
+ dma_addr_t dma_addr;
+ void *mem;
+ struct nvme_id_ctrl *id_ctrl;
+ struct nvme_id_ns *id_ns;
+ int xfer_len;
+ u8 microcode = 0x80;
+ u8 spt;
+ u8 spt_lut[8] = {0, 0, 2, 1, 4, 6, 5, 7};
+ u8 grd_chk, app_chk, ref_chk, protect;
+ u8 uask_sup = 0x20;
+ u8 v_sup;
+ u8 luiclr = 0x01;
+
+ inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL);
+ if (inq_response == NULL) {
+ res = -ENOMEM;
+ goto out_mem;
+ }
+
+ mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
+ &dma_addr, GFP_KERNEL);
+ if (mem == NULL) {
+ res = -ENOMEM;
+ goto out_dma;
+ }
+
+ /* nvme ns identify */
+ nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ if (res)
+ goto out_free;
+ if (nvme_sc) {
+ res = nvme_sc;
+ goto out_free;
+ }
+ id_ns = mem;
+ spt = spt_lut[(id_ns->dpc) & 0x07] << 3;
+ (id_ns->dps) ? (protect = 0x01) : (protect = 0);
+ grd_chk = protect << 2;
+ app_chk = protect << 1;
+ ref_chk = protect;
+
+ /* nvme controller identify */
+ nvme_sc = nvme_identify(dev, 0, 1, dma_addr);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ if (res)
+ goto out_free;
+ if (nvme_sc) {
+ res = nvme_sc;
+ goto out_free;
+ }
+ id_ctrl = mem;
+ v_sup = id_ctrl->vwc;
+
+ memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH);
+ inq_response[1] = INQ_EXTENDED_INQUIRY_DATA_PAGE; /* Page Code */
+ inq_response[2] = 0x00; /* Page Length MSB */
+ inq_response[3] = 0x3C; /* Page Length LSB */
+ inq_response[4] = microcode | spt | grd_chk | app_chk | ref_chk;
+ inq_response[5] = uask_sup;
+ inq_response[6] = v_sup;
+ inq_response[7] = luiclr;
+ inq_response[8] = 0;
+ inq_response[9] = 0;
+
+ xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH);
+ res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
+
+ out_free:
+ dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
+ dma_addr);
+ out_dma:
+ kfree(inq_response);
+ out_mem:
+ return res;
+}
+
+static int nvme_trans_bdev_limits_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 *inq_response, int alloc_len)
+{
+ __be32 max_sectors = cpu_to_be32(
+ nvme_block_nr(ns, queue_max_hw_sectors(ns->queue)));
+ __be32 max_discard = cpu_to_be32(ns->queue->limits.max_discard_sectors);
+ __be32 discard_desc_count = cpu_to_be32(0x100);
+
+ memset(inq_response, 0, STANDARD_INQUIRY_LENGTH);
+ inq_response[1] = VPD_BLOCK_LIMITS;
+ inq_response[3] = 0x3c; /* Page Length */
+ memcpy(&inq_response[8], &max_sectors, sizeof(u32));
+ memcpy(&inq_response[20], &max_discard, sizeof(u32));
+
+ if (max_discard)
+ memcpy(&inq_response[24], &discard_desc_count, sizeof(u32));
+
+ return nvme_trans_copy_to_user(hdr, inq_response, 0x3c);
+}
+
+static int nvme_trans_bdev_char_page(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ int alloc_len)
+{
+ u8 *inq_response;
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int xfer_len;
+
+ inq_response = kzalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL);
+ if (inq_response == NULL) {
+ res = -ENOMEM;
+ goto out_mem;
+ }
+
+ inq_response[1] = INQ_BDEV_CHARACTERISTICS_PAGE; /* Page Code */
+ inq_response[2] = 0x00; /* Page Length MSB */
+ inq_response[3] = 0x3C; /* Page Length LSB */
+ inq_response[4] = 0x00; /* Medium Rotation Rate MSB */
+ inq_response[5] = 0x01; /* Medium Rotation Rate LSB */
+ inq_response[6] = 0x00; /* Form Factor */
+
+ xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH);
+ res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len);
+
+ kfree(inq_response);
+ out_mem:
+ return res;
+}
+
+/* LOG SENSE Helper Functions */
+
+static int nvme_trans_log_supp_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ int alloc_len)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int xfer_len;
+ u8 *log_response;
+
+ log_response = kzalloc(LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH, GFP_KERNEL);
+ if (log_response == NULL) {
+ res = -ENOMEM;
+ goto out_mem;
+ }
+
+ log_response[0] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE;
+ /* Subpage=0x00, Page Length MSB=0 */
+ log_response[3] = SUPPORTED_LOG_PAGES_PAGE_LENGTH;
+ log_response[4] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE;
+ log_response[5] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE;
+ log_response[6] = LOG_PAGE_TEMPERATURE_PAGE;
+
+ xfer_len = min(alloc_len, LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH);
+ res = nvme_trans_copy_to_user(hdr, log_response, xfer_len);
+
+ kfree(log_response);
+ out_mem:
+ return res;
+}
+
+static int nvme_trans_log_info_exceptions(struct nvme_ns *ns,
+ struct sg_io_hdr *hdr, int alloc_len)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int xfer_len;
+ u8 *log_response;
+ struct nvme_command c;
+ struct nvme_dev *dev = ns->dev;
+ struct nvme_smart_log *smart_log;
+ dma_addr_t dma_addr;
+ void *mem;
+ u8 temp_c;
+ u16 temp_k;
+
+ log_response = kzalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL);
+ if (log_response == NULL) {
+ res = -ENOMEM;
+ goto out_mem;
+ }
+
+ mem = dma_alloc_coherent(&dev->pci_dev->dev,
+ sizeof(struct nvme_smart_log),
+ &dma_addr, GFP_KERNEL);
+ if (mem == NULL) {
+ res = -ENOMEM;
+ goto out_dma;
+ }
+
+ /* Get SMART Log Page */
+ memset(&c, 0, sizeof(c));
+ c.common.opcode = nvme_admin_get_log_page;
+ c.common.nsid = cpu_to_le32(0xFFFFFFFF);
+ c.common.prp1 = cpu_to_le64(dma_addr);
+ c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) /
+ BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART);
+ res = nvme_submit_admin_cmd(dev, &c, NULL);
+ if (res != NVME_SC_SUCCESS) {
+ temp_c = LOG_TEMP_UNKNOWN;
+ } else {
+ smart_log = mem;
+ temp_k = (smart_log->temperature[1] << 8) +
+ (smart_log->temperature[0]);
+ temp_c = temp_k - KELVIN_TEMP_FACTOR;
+ }
+
+ log_response[0] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE;
+ /* Subpage=0x00, Page Length MSB=0 */
+ log_response[3] = REMAINING_INFO_EXCP_PAGE_LENGTH;
+ /* Informational Exceptions Log Parameter 1 Start */
+ /* Parameter Code=0x0000 bytes 4,5 */
+ log_response[6] = 0x23; /* DU=0, TSD=1, ETC=0, TMC=0, FMT_AND_LNK=11b */
+ log_response[7] = 0x04; /* PARAMETER LENGTH */
+ /* Add sense Code and qualifier = 0x00 each */
+ /* Use Temperature from NVMe Get Log Page, convert to C from K */
+ log_response[10] = temp_c;
+
+ xfer_len = min(alloc_len, LOG_INFO_EXCP_PAGE_LENGTH);
+ res = nvme_trans_copy_to_user(hdr, log_response, xfer_len);
+
+ dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log),
+ mem, dma_addr);
+ out_dma:
+ kfree(log_response);
+ out_mem:
+ return res;
+}
+
+static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ int alloc_len)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int xfer_len;
+ u8 *log_response;
+ struct nvme_command c;
+ struct nvme_dev *dev = ns->dev;
+ struct nvme_smart_log *smart_log;
+ dma_addr_t dma_addr;
+ void *mem;
+ u32 feature_resp;
+ u8 temp_c_cur, temp_c_thresh;
+ u16 temp_k;
+
+ log_response = kzalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL);
+ if (log_response == NULL) {
+ res = -ENOMEM;
+ goto out_mem;
+ }
+
+ mem = dma_alloc_coherent(&dev->pci_dev->dev,
+ sizeof(struct nvme_smart_log),
+ &dma_addr, GFP_KERNEL);
+ if (mem == NULL) {
+ res = -ENOMEM;
+ goto out_dma;
+ }
+
+ /* Get SMART Log Page */
+ memset(&c, 0, sizeof(c));
+ c.common.opcode = nvme_admin_get_log_page;
+ c.common.nsid = cpu_to_le32(0xFFFFFFFF);
+ c.common.prp1 = cpu_to_le64(dma_addr);
+ c.common.cdw10[0] = cpu_to_le32((((sizeof(struct nvme_smart_log) /
+ BYTES_TO_DWORDS) - 1) << 16) | NVME_LOG_SMART);
+ res = nvme_submit_admin_cmd(dev, &c, NULL);
+ if (res != NVME_SC_SUCCESS) {
+ temp_c_cur = LOG_TEMP_UNKNOWN;
+ } else {
+ smart_log = mem;
+ temp_k = (smart_log->temperature[1] << 8) +
+ (smart_log->temperature[0]);
+ temp_c_cur = temp_k - KELVIN_TEMP_FACTOR;
+ }
+
+ /* Get Features for Temp Threshold */
+ res = nvme_get_features(dev, NVME_FEAT_TEMP_THRESH, 0, 0,
+ &feature_resp);
+ if (res != NVME_SC_SUCCESS)
+ temp_c_thresh = LOG_TEMP_UNKNOWN;
+ else
+ temp_c_thresh = (feature_resp & 0xFFFF) - KELVIN_TEMP_FACTOR;
+
+ log_response[0] = LOG_PAGE_TEMPERATURE_PAGE;
+ /* Subpage=0x00, Page Length MSB=0 */
+ log_response[3] = REMAINING_TEMP_PAGE_LENGTH;
+ /* Temperature Log Parameter 1 (Temperature) Start */
+ /* Parameter Code = 0x0000 */
+ log_response[6] = 0x01; /* Format and Linking = 01b */
+ log_response[7] = 0x02; /* Parameter Length */
+ /* Use Temperature from NVMe Get Log Page, convert to C from K */
+ log_response[9] = temp_c_cur;
+ /* Temperature Log Parameter 2 (Reference Temperature) Start */
+ log_response[11] = 0x01; /* Parameter Code = 0x0001 */
+ log_response[12] = 0x01; /* Format and Linking = 01b */
+ log_response[13] = 0x02; /* Parameter Length */
+ /* Use Temperature Thresh from NVMe Get Log Page, convert to C from K */
+ log_response[15] = temp_c_thresh;
+
+ xfer_len = min(alloc_len, LOG_TEMP_PAGE_LENGTH);
+ res = nvme_trans_copy_to_user(hdr, log_response, xfer_len);
+
+ dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log),
+ mem, dma_addr);
+ out_dma:
+ kfree(log_response);
+ out_mem:
+ return res;
+}
+
+/* MODE SENSE Helper Functions */
+
+static int nvme_trans_fill_mode_parm_hdr(u8 *resp, int len, u8 cdb10, u8 llbaa,
+ u16 mode_data_length, u16 blk_desc_len)
+{
+ /* Quick check to make sure I don't stomp on my own memory... */
+ if ((cdb10 && len < 8) || (!cdb10 && len < 4))
+ return SNTI_INTERNAL_ERROR;
+
+ if (cdb10) {
+ resp[0] = (mode_data_length & 0xFF00) >> 8;
+ resp[1] = (mode_data_length & 0x00FF);
+ /* resp[2] and [3] are zero */
+ resp[4] = llbaa;
+ resp[5] = RESERVED_FIELD;
+ resp[6] = (blk_desc_len & 0xFF00) >> 8;
+ resp[7] = (blk_desc_len & 0x00FF);
+ } else {
+ resp[0] = (mode_data_length & 0x00FF);
+ /* resp[1] and [2] are zero */
+ resp[3] = (blk_desc_len & 0x00FF);
+ }
+
+ return SNTI_TRANSLATION_SUCCESS;
+}
+
+static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 *resp, int len, u8 llbaa)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int nvme_sc;
+ struct nvme_dev *dev = ns->dev;
+ dma_addr_t dma_addr;
+ void *mem;
+ struct nvme_id_ns *id_ns;
+ u8 flbas;
+ u32 lba_length;
+
+ if (llbaa == 0 && len < MODE_PAGE_BLK_DES_LEN)
+ return SNTI_INTERNAL_ERROR;
+ else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN)
+ return SNTI_INTERNAL_ERROR;
+
+ mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
+ &dma_addr, GFP_KERNEL);
+ if (mem == NULL) {
+ res = -ENOMEM;
+ goto out;
+ }
+
+ /* nvme ns identify */
+ nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ if (res)
+ goto out_dma;
+ if (nvme_sc) {
+ res = nvme_sc;
+ goto out_dma;
+ }
+ id_ns = mem;
+ flbas = (id_ns->flbas) & 0x0F;
+ lba_length = (1 << (id_ns->lbaf[flbas].ds));
+
+ if (llbaa == 0) {
+ __be32 tmp_cap = cpu_to_be32(le64_to_cpu(id_ns->ncap));
+ /* Byte 4 is reserved */
+ __be32 tmp_len = cpu_to_be32(lba_length & 0x00FFFFFF);
+
+ memcpy(resp, &tmp_cap, sizeof(u32));
+ memcpy(&resp[4], &tmp_len, sizeof(u32));
+ } else {
+ __be64 tmp_cap = cpu_to_be64(le64_to_cpu(id_ns->ncap));
+ __be32 tmp_len = cpu_to_be32(lba_length);
+
+ memcpy(resp, &tmp_cap, sizeof(u64));
+ /* Bytes 8, 9, 10, 11 are reserved */
+ memcpy(&resp[12], &tmp_len, sizeof(u32));
+ }
+
+ out_dma:
+ dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
+ dma_addr);
+ out:
+ return res;
+}
+
+static int nvme_trans_fill_control_page(struct nvme_ns *ns,
+ struct sg_io_hdr *hdr, u8 *resp,
+ int len)
+{
+ if (len < MODE_PAGE_CONTROL_LEN)
+ return SNTI_INTERNAL_ERROR;
+
+ resp[0] = MODE_PAGE_CONTROL;
+ resp[1] = MODE_PAGE_CONTROL_LEN_FIELD;
+ resp[2] = 0x0E; /* TST=000b, TMF_ONLY=0, DPICZ=1,
+ * D_SENSE=1, GLTSD=1, RLEC=0 */
+ resp[3] = 0x12; /* Q_ALGO_MODIFIER=1h, NUAR=0, QERR=01b */
+ /* Byte 4: VS=0, RAC=0, UA_INT=0, SWP=0 */
+ resp[5] = 0x40; /* ATO=0, TAS=1, ATMPE=0, RWWP=0, AUTOLOAD=0 */
+ /* resp[6] and [7] are obsolete, thus zero */
+ resp[8] = 0xFF; /* Busy timeout period = 0xffff */
+ resp[9] = 0xFF;
+ /* Bytes 10,11: Extended selftest completion time = 0x0000 */
+
+ return SNTI_TRANSLATION_SUCCESS;
+}
+
+static int nvme_trans_fill_caching_page(struct nvme_ns *ns,
+ struct sg_io_hdr *hdr,
+ u8 *resp, int len)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int nvme_sc;
+ struct nvme_dev *dev = ns->dev;
+ u32 feature_resp;
+ u8 vwc;
+
+ if (len < MODE_PAGE_CACHING_LEN)
+ return SNTI_INTERNAL_ERROR;
+
+ nvme_sc = nvme_get_features(dev, NVME_FEAT_VOLATILE_WC, 0, 0,
+ &feature_resp);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ if (res)
+ goto out;
+ if (nvme_sc) {
+ res = nvme_sc;
+ goto out;
+ }
+ vwc = feature_resp & 0x00000001;
+
+ resp[0] = MODE_PAGE_CACHING;
+ resp[1] = MODE_PAGE_CACHING_LEN_FIELD;
+ resp[2] = vwc << 2;
+
+ out:
+ return res;
+}
+
+static int nvme_trans_fill_pow_cnd_page(struct nvme_ns *ns,
+ struct sg_io_hdr *hdr, u8 *resp,
+ int len)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+
+ if (len < MODE_PAGE_POW_CND_LEN)
+ return SNTI_INTERNAL_ERROR;
+
+ resp[0] = MODE_PAGE_POWER_CONDITION;
+ resp[1] = MODE_PAGE_POW_CND_LEN_FIELD;
+ /* All other bytes are zero */
+
+ return res;
+}
+
+static int nvme_trans_fill_inf_exc_page(struct nvme_ns *ns,
+ struct sg_io_hdr *hdr, u8 *resp,
+ int len)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+
+ if (len < MODE_PAGE_INF_EXC_LEN)
+ return SNTI_INTERNAL_ERROR;
+
+ resp[0] = MODE_PAGE_INFO_EXCEP;
+ resp[1] = MODE_PAGE_INF_EXC_LEN_FIELD;
+ resp[2] = 0x88;
+ /* All other bytes are zero */
+
+ return res;
+}
+
+static int nvme_trans_fill_all_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 *resp, int len)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ u16 mode_pages_offset_1 = 0;
+ u16 mode_pages_offset_2, mode_pages_offset_3, mode_pages_offset_4;
+
+ mode_pages_offset_2 = mode_pages_offset_1 + MODE_PAGE_CACHING_LEN;
+ mode_pages_offset_3 = mode_pages_offset_2 + MODE_PAGE_CONTROL_LEN;
+ mode_pages_offset_4 = mode_pages_offset_3 + MODE_PAGE_POW_CND_LEN;
+
+ res = nvme_trans_fill_caching_page(ns, hdr, &resp[mode_pages_offset_1],
+ MODE_PAGE_CACHING_LEN);
+ if (res != SNTI_TRANSLATION_SUCCESS)
+ goto out;
+ res = nvme_trans_fill_control_page(ns, hdr, &resp[mode_pages_offset_2],
+ MODE_PAGE_CONTROL_LEN);
+ if (res != SNTI_TRANSLATION_SUCCESS)
+ goto out;
+ res = nvme_trans_fill_pow_cnd_page(ns, hdr, &resp[mode_pages_offset_3],
+ MODE_PAGE_POW_CND_LEN);
+ if (res != SNTI_TRANSLATION_SUCCESS)
+ goto out;
+ res = nvme_trans_fill_inf_exc_page(ns, hdr, &resp[mode_pages_offset_4],
+ MODE_PAGE_INF_EXC_LEN);
+ if (res != SNTI_TRANSLATION_SUCCESS)
+ goto out;
+
+ out:
+ return res;
+}
+
+static inline int nvme_trans_get_blk_desc_len(u8 dbd, u8 llbaa)
+{
+ if (dbd == MODE_SENSE_BLK_DESC_ENABLED) {
+ /* SPC-4: len = 8 x Num_of_descriptors if llbaa = 0, 16x if 1 */
+ return 8 * (llbaa + 1) * MODE_SENSE_BLK_DESC_COUNT;
+ } else {
+ return 0;
+ }
+}
+
+static int nvme_trans_mode_page_create(struct nvme_ns *ns,
+ struct sg_io_hdr *hdr, u8 *cmd,
+ u16 alloc_len, u8 cdb10,
+ int (*mode_page_fill_func)
+ (struct nvme_ns *,
+ struct sg_io_hdr *hdr, u8 *, int),
+ u16 mode_pages_tot_len)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int xfer_len;
+ u8 *response;
+ u8 dbd, llbaa;
+ u16 resp_size;
+ int mph_size;
+ u16 mode_pages_offset_1;
+ u16 blk_desc_len, blk_desc_offset, mode_data_length;
+
+ dbd = GET_MODE_SENSE_DBD(cmd);
+ llbaa = GET_MODE_SENSE_LLBAA(cmd);
+ mph_size = GET_MODE_SENSE_MPH_SIZE(cdb10);
+ blk_desc_len = nvme_trans_get_blk_desc_len(dbd, llbaa);
+
+ resp_size = mph_size + blk_desc_len + mode_pages_tot_len;
+ /* Refer spc4r34 Table 440 for calculation of Mode data Length field */
+ mode_data_length = 3 + (3 * cdb10) + blk_desc_len + mode_pages_tot_len;
+
+ blk_desc_offset = mph_size;
+ mode_pages_offset_1 = blk_desc_offset + blk_desc_len;
+
+ response = kzalloc(resp_size, GFP_KERNEL);
+ if (response == NULL) {
+ res = -ENOMEM;
+ goto out_mem;
+ }
+
+ res = nvme_trans_fill_mode_parm_hdr(&response[0], mph_size, cdb10,
+ llbaa, mode_data_length, blk_desc_len);
+ if (res != SNTI_TRANSLATION_SUCCESS)
+ goto out_free;
+ if (blk_desc_len > 0) {
+ res = nvme_trans_fill_blk_desc(ns, hdr,
+ &response[blk_desc_offset],
+ blk_desc_len, llbaa);
+ if (res != SNTI_TRANSLATION_SUCCESS)
+ goto out_free;
+ }
+ res = mode_page_fill_func(ns, hdr, &response[mode_pages_offset_1],
+ mode_pages_tot_len);
+ if (res != SNTI_TRANSLATION_SUCCESS)
+ goto out_free;
+
+ xfer_len = min(alloc_len, resp_size);
+ res = nvme_trans_copy_to_user(hdr, response, xfer_len);
+
+ out_free:
+ kfree(response);
+ out_mem:
+ return res;
+}
+
+/* Read Capacity Helper Functions */
+
+static void nvme_trans_fill_read_cap(u8 *response, struct nvme_id_ns *id_ns,
+ u8 cdb16)
+{
+ u8 flbas;
+ u32 lba_length;
+ u64 rlba;
+ u8 prot_en;
+ u8 p_type_lut[4] = {0, 0, 1, 2};
+ __be64 tmp_rlba;
+ __be32 tmp_rlba_32;
+ __be32 tmp_len;
+
+ flbas = (id_ns->flbas) & 0x0F;
+ lba_length = (1 << (id_ns->lbaf[flbas].ds));
+ rlba = le64_to_cpup(&id_ns->nsze) - 1;
+ (id_ns->dps) ? (prot_en = 0x01) : (prot_en = 0);
+
+ if (!cdb16) {
+ if (rlba > 0xFFFFFFFF)
+ rlba = 0xFFFFFFFF;
+ tmp_rlba_32 = cpu_to_be32(rlba);
+ tmp_len = cpu_to_be32(lba_length);
+ memcpy(response, &tmp_rlba_32, sizeof(u32));
+ memcpy(&response[4], &tmp_len, sizeof(u32));
+ } else {
+ tmp_rlba = cpu_to_be64(rlba);
+ tmp_len = cpu_to_be32(lba_length);
+ memcpy(response, &tmp_rlba, sizeof(u64));
+ memcpy(&response[8], &tmp_len, sizeof(u32));
+ response[12] = (p_type_lut[id_ns->dps & 0x3] << 1) | prot_en;
+ /* P_I_Exponent = 0x0 | LBPPBE = 0x0 */
+ /* LBPME = 0 | LBPRZ = 0 | LALBA = 0x00 */
+ /* Bytes 16-31 - Reserved */
+ }
+}
+
+/* Start Stop Unit Helper Functions */
+
+static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 pc, u8 pcmod, u8 start)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int nvme_sc;
+ struct nvme_dev *dev = ns->dev;
+ dma_addr_t dma_addr;
+ void *mem;
+ struct nvme_id_ctrl *id_ctrl;
+ int lowest_pow_st; /* max npss = lowest power consumption */
+ unsigned ps_desired = 0;
+
+ /* NVMe Controller Identify */
+ mem = dma_alloc_coherent(&dev->pci_dev->dev,
+ sizeof(struct nvme_id_ctrl),
+ &dma_addr, GFP_KERNEL);
+ if (mem == NULL) {
+ res = -ENOMEM;
+ goto out;
+ }
+ nvme_sc = nvme_identify(dev, 0, 1, dma_addr);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ if (res)
+ goto out_dma;
+ if (nvme_sc) {
+ res = nvme_sc;
+ goto out_dma;
+ }
+ id_ctrl = mem;
+ lowest_pow_st = max(POWER_STATE_0, (int)(id_ctrl->npss - 1));
+
+ switch (pc) {
+ case NVME_POWER_STATE_START_VALID:
+ /* Action unspecified if POWER CONDITION MODIFIER != 0 */
+ if (pcmod == 0 && start == 0x1)
+ ps_desired = POWER_STATE_0;
+ if (pcmod == 0 && start == 0x0)
+ ps_desired = lowest_pow_st;
+ break;
+ case NVME_POWER_STATE_ACTIVE:
+ /* Action unspecified if POWER CONDITION MODIFIER != 0 */
+ if (pcmod == 0)
+ ps_desired = POWER_STATE_0;
+ break;
+ case NVME_POWER_STATE_IDLE:
+ /* Action unspecified if POWER CONDITION MODIFIER != [0,1,2] */
+ if (pcmod == 0x0)
+ ps_desired = POWER_STATE_1;
+ else if (pcmod == 0x1)
+ ps_desired = POWER_STATE_2;
+ else if (pcmod == 0x2)
+ ps_desired = POWER_STATE_3;
+ break;
+ case NVME_POWER_STATE_STANDBY:
+ /* Action unspecified if POWER CONDITION MODIFIER != [0,1] */
+ if (pcmod == 0x0)
+ ps_desired = max(POWER_STATE_0, (lowest_pow_st - 2));
+ else if (pcmod == 0x1)
+ ps_desired = max(POWER_STATE_0, (lowest_pow_st - 1));
+ break;
+ case NVME_POWER_STATE_LU_CONTROL:
+ default:
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ break;
+ }
+ nvme_sc = nvme_set_features(dev, NVME_FEAT_POWER_MGMT, ps_desired, 0,
+ NULL);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ if (res)
+ goto out_dma;
+ if (nvme_sc)
+ res = nvme_sc;
+ out_dma:
+ dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem,
+ dma_addr);
+ out:
+ return res;
+}
+
+/* Write Buffer Helper Functions */
+/* Also using this for Format Unit with hdr passed as NULL, and buffer_id, 0 */
+
+static int nvme_trans_send_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 opcode, u32 tot_len, u32 offset,
+ u8 buffer_id)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int nvme_sc;
+ struct nvme_dev *dev = ns->dev;
+ struct nvme_command c;
+ struct nvme_iod *iod = NULL;
+ unsigned length;
+
+ memset(&c, 0, sizeof(c));
+ c.common.opcode = opcode;
+ if (opcode == nvme_admin_download_fw) {
+ if (hdr->iovec_count > 0) {
+ /* Assuming SGL is not allowed for this command */
+ res = nvme_trans_completion(hdr,
+ SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST,
+ SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ goto out;
+ }
+ iod = nvme_map_user_pages(dev, DMA_TO_DEVICE,
+ (unsigned long)hdr->dxferp, tot_len);
+ if (IS_ERR(iod)) {
+ res = PTR_ERR(iod);
+ goto out;
+ }
+ length = nvme_setup_prps(dev, iod, tot_len, GFP_KERNEL);
+ if (length != tot_len) {
+ res = -ENOMEM;
+ goto out_unmap;
+ }
+
+ c.dlfw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+ c.dlfw.prp2 = cpu_to_le64(iod->first_dma);
+ c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1);
+ c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS);
+ } else if (opcode == nvme_admin_activate_fw) {
+ u32 cdw10 = buffer_id | NVME_FWACT_REPL_ACTV;
+ c.common.cdw10[0] = cpu_to_le32(cdw10);
+ }
+
+ nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ if (res)
+ goto out_unmap;
+ if (nvme_sc)
+ res = nvme_sc;
+
+ out_unmap:
+ if (opcode == nvme_admin_download_fw) {
+ nvme_unmap_user_pages(dev, DMA_TO_DEVICE, iod);
+ nvme_free_iod(dev, iod);
+ }
+ out:
+ return res;
+}
+
+/* Mode Select Helper Functions */
+
+static inline void nvme_trans_modesel_get_bd_len(u8 *parm_list, u8 cdb10,
+ u16 *bd_len, u8 *llbaa)
+{
+ if (cdb10) {
+ /* 10 Byte CDB */
+ *bd_len = (parm_list[MODE_SELECT_10_BD_OFFSET] << 8) +
+ parm_list[MODE_SELECT_10_BD_OFFSET + 1];
+ *llbaa = parm_list[MODE_SELECT_10_LLBAA_OFFSET] &
+ MODE_SELECT_10_LLBAA_MASK;
+ } else {
+ /* 6 Byte CDB */
+ *bd_len = parm_list[MODE_SELECT_6_BD_OFFSET];
+ }
+}
+
+static void nvme_trans_modesel_save_bd(struct nvme_ns *ns, u8 *parm_list,
+ u16 idx, u16 bd_len, u8 llbaa)
+{
+ u16 bd_num;
+
+ bd_num = bd_len / ((llbaa == 0) ?
+ SHORT_DESC_BLOCK : LONG_DESC_BLOCK);
+ /* Store block descriptor info if a FORMAT UNIT comes later */
+ /* TODO Saving 1st BD info; what to do if multiple BD received? */
+ if (llbaa == 0) {
+ /* Standard Block Descriptor - spc4r34 7.5.5.1 */
+ ns->mode_select_num_blocks =
+ (parm_list[idx + 1] << 16) +
+ (parm_list[idx + 2] << 8) +
+ (parm_list[idx + 3]);
+
+ ns->mode_select_block_len =
+ (parm_list[idx + 5] << 16) +
+ (parm_list[idx + 6] << 8) +
+ (parm_list[idx + 7]);
+ } else {
+ /* Long LBA Block Descriptor - sbc3r27 6.4.2.3 */
+ ns->mode_select_num_blocks =
+ (((u64)parm_list[idx + 0]) << 56) +
+ (((u64)parm_list[idx + 1]) << 48) +
+ (((u64)parm_list[idx + 2]) << 40) +
+ (((u64)parm_list[idx + 3]) << 32) +
+ (((u64)parm_list[idx + 4]) << 24) +
+ (((u64)parm_list[idx + 5]) << 16) +
+ (((u64)parm_list[idx + 6]) << 8) +
+ ((u64)parm_list[idx + 7]);
+
+ ns->mode_select_block_len =
+ (parm_list[idx + 12] << 24) +
+ (parm_list[idx + 13] << 16) +
+ (parm_list[idx + 14] << 8) +
+ (parm_list[idx + 15]);
+ }
+}
+
+static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 *mode_page, u8 page_code)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int nvme_sc;
+ struct nvme_dev *dev = ns->dev;
+ unsigned dword11;
+
+ switch (page_code) {
+ case MODE_PAGE_CACHING:
+ dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0);
+ nvme_sc = nvme_set_features(dev, NVME_FEAT_VOLATILE_WC, dword11,
+ 0, NULL);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ if (res)
+ break;
+ if (nvme_sc) {
+ res = nvme_sc;
+ break;
+ }
+ break;
+ case MODE_PAGE_CONTROL:
+ break;
+ case MODE_PAGE_POWER_CONDITION:
+ /* Verify the OS is not trying to set timers */
+ if ((mode_page[2] & 0x01) != 0 || (mode_page[3] & 0x0F) != 0) {
+ res = nvme_trans_completion(hdr,
+ SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST,
+ SCSI_ASC_INVALID_PARAMETER,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ if (!res)
+ res = SNTI_INTERNAL_ERROR;
+ break;
+ }
+ break;
+ default:
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ if (!res)
+ res = SNTI_INTERNAL_ERROR;
+ break;
+ }
+
+ return res;
+}
+
+static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 *cmd, u16 parm_list_len, u8 pf,
+ u8 sp, u8 cdb10)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ u8 *parm_list;
+ u16 bd_len;
+ u8 llbaa = 0;
+ u16 index, saved_index;
+ u8 page_code;
+ u16 mp_size;
+
+ /* Get parm list from data-in/out buffer */
+ parm_list = kmalloc(parm_list_len, GFP_KERNEL);
+ if (parm_list == NULL) {
+ res = -ENOMEM;
+ goto out;
+ }
+
+ res = nvme_trans_copy_from_user(hdr, parm_list, parm_list_len);
+ if (res != SNTI_TRANSLATION_SUCCESS)
+ goto out_mem;
+
+ nvme_trans_modesel_get_bd_len(parm_list, cdb10, &bd_len, &llbaa);
+ index = (cdb10) ? (MODE_SELECT_10_MPH_SIZE) : (MODE_SELECT_6_MPH_SIZE);
+
+ if (bd_len != 0) {
+ /* Block Descriptors present, parse */
+ nvme_trans_modesel_save_bd(ns, parm_list, index, bd_len, llbaa);
+ index += bd_len;
+ }
+ saved_index = index;
+
+ /* Multiple mode pages may be present; iterate through all */
+ /* In 1st Iteration, don't do NVME Command, only check for CDB errors */
+ do {
+ page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK;
+ mp_size = parm_list[index + 1] + 2;
+ if ((page_code != MODE_PAGE_CACHING) &&
+ (page_code != MODE_PAGE_CONTROL) &&
+ (page_code != MODE_PAGE_POWER_CONDITION)) {
+ res = nvme_trans_completion(hdr,
+ SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST,
+ SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ goto out_mem;
+ }
+ index += mp_size;
+ } while (index < parm_list_len);
+
+ /* In 2nd Iteration, do the NVME Commands */
+ index = saved_index;
+ do {
+ page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK;
+ mp_size = parm_list[index + 1] + 2;
+ res = nvme_trans_modesel_get_mp(ns, hdr, &parm_list[index],
+ page_code);
+ if (res != SNTI_TRANSLATION_SUCCESS)
+ break;
+ index += mp_size;
+ } while (index < parm_list_len);
+
+ out_mem:
+ kfree(parm_list);
+ out:
+ return res;
+}
+
+/* Format Unit Helper Functions */
+
+static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns,
+ struct sg_io_hdr *hdr)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int nvme_sc;
+ struct nvme_dev *dev = ns->dev;
+ dma_addr_t dma_addr;
+ void *mem;
+ struct nvme_id_ns *id_ns;
+ u8 flbas;
+
+ /*
+ * SCSI Expects a MODE SELECT would have been issued prior to
+ * a FORMAT UNIT, and the block size and number would be used
+ * from the block descriptor in it. If a MODE SELECT had not
+ * been issued, FORMAT shall use the current values for both.
+ */
+
+ if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) {
+ mem = dma_alloc_coherent(&dev->pci_dev->dev,
+ sizeof(struct nvme_id_ns), &dma_addr, GFP_KERNEL);
+ if (mem == NULL) {
+ res = -ENOMEM;
+ goto out;
+ }
+ /* nvme ns identify */
+ nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ if (res)
+ goto out_dma;
+ if (nvme_sc) {
+ res = nvme_sc;
+ goto out_dma;
+ }
+ id_ns = mem;
+
+ if (ns->mode_select_num_blocks == 0)
+ ns->mode_select_num_blocks = le64_to_cpu(id_ns->ncap);
+ if (ns->mode_select_block_len == 0) {
+ flbas = (id_ns->flbas) & 0x0F;
+ ns->mode_select_block_len =
+ (1 << (id_ns->lbaf[flbas].ds));
+ }
+ out_dma:
+ dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
+ mem, dma_addr);
+ }
+ out:
+ return res;
+}
+
+static int nvme_trans_fmt_get_parm_header(struct sg_io_hdr *hdr, u8 len,
+ u8 format_prot_info, u8 *nvme_pf_code)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ u8 *parm_list;
+ u8 pf_usage, pf_code;
+
+ parm_list = kmalloc(len, GFP_KERNEL);
+ if (parm_list == NULL) {
+ res = -ENOMEM;
+ goto out;
+ }
+ res = nvme_trans_copy_from_user(hdr, parm_list, len);
+ if (res != SNTI_TRANSLATION_SUCCESS)
+ goto out_mem;
+
+ if ((parm_list[FORMAT_UNIT_IMMED_OFFSET] &
+ FORMAT_UNIT_IMMED_MASK) != 0) {
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ goto out_mem;
+ }
+
+ if (len == FORMAT_UNIT_LONG_PARM_LIST_LEN &&
+ (parm_list[FORMAT_UNIT_PROT_INT_OFFSET] & 0x0F) != 0) {
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ goto out_mem;
+ }
+ pf_usage = parm_list[FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET] &
+ FORMAT_UNIT_PROT_FIELD_USAGE_MASK;
+ pf_code = (pf_usage << 2) | format_prot_info;
+ switch (pf_code) {
+ case 0:
+ *nvme_pf_code = 0;
+ break;
+ case 2:
+ *nvme_pf_code = 1;
+ break;
+ case 3:
+ *nvme_pf_code = 2;
+ break;
+ case 7:
+ *nvme_pf_code = 3;
+ break;
+ default:
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ break;
+ }
+
+ out_mem:
+ kfree(parm_list);
+ out:
+ return res;
+}
+
+static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 prot_info)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int nvme_sc;
+ struct nvme_dev *dev = ns->dev;
+ dma_addr_t dma_addr;
+ void *mem;
+ struct nvme_id_ns *id_ns;
+ u8 i;
+ u8 flbas, nlbaf;
+ u8 selected_lbaf = 0xFF;
+ u32 cdw10 = 0;
+ struct nvme_command c;
+
+ /* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */
+ mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
+ &dma_addr, GFP_KERNEL);
+ if (mem == NULL) {
+ res = -ENOMEM;
+ goto out;
+ }
+ /* nvme ns identify */
+ nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ if (res)
+ goto out_dma;
+ if (nvme_sc) {
+ res = nvme_sc;
+ goto out_dma;
+ }
+ id_ns = mem;
+ flbas = (id_ns->flbas) & 0x0F;
+ nlbaf = id_ns->nlbaf;
+
+ for (i = 0; i < nlbaf; i++) {
+ if (ns->mode_select_block_len == (1 << (id_ns->lbaf[i].ds))) {
+ selected_lbaf = i;
+ break;
+ }
+ }
+ if (selected_lbaf > 0x0F) {
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ }
+ if (ns->mode_select_num_blocks != le64_to_cpu(id_ns->ncap)) {
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ }
+
+ cdw10 |= prot_info << 5;
+ cdw10 |= selected_lbaf & 0x0F;
+ memset(&c, 0, sizeof(c));
+ c.format.opcode = nvme_admin_format_nvm;
+ c.format.nsid = cpu_to_le32(ns->ns_id);
+ c.format.cdw10 = cpu_to_le32(cdw10);
+
+ nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ if (res)
+ goto out_dma;
+ if (nvme_sc)
+ res = nvme_sc;
+
+ out_dma:
+ dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
+ dma_addr);
+ out:
+ return res;
+}
+
+/* Read/Write Helper Functions */
+
+static inline void nvme_trans_get_io_cdb6(u8 *cmd,
+ struct nvme_trans_io_cdb *cdb_info)
+{
+ cdb_info->fua = 0;
+ cdb_info->prot_info = 0;
+ cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_6_CDB_LBA_OFFSET) &
+ IO_6_CDB_LBA_MASK;
+ cdb_info->xfer_len = GET_U8_FROM_CDB(cmd, IO_6_CDB_TX_LEN_OFFSET);
+
+ /* sbc3r27 sec 5.32 - TRANSFER LEN of 0 implies a 256 Block transfer */
+ if (cdb_info->xfer_len == 0)
+ cdb_info->xfer_len = IO_6_DEFAULT_TX_LEN;
+}
+
+static inline void nvme_trans_get_io_cdb10(u8 *cmd,
+ struct nvme_trans_io_cdb *cdb_info)
+{
+ cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_10_CDB_FUA_OFFSET) &
+ IO_CDB_FUA_MASK;
+ cdb_info->prot_info = GET_U8_FROM_CDB(cmd, IO_10_CDB_WP_OFFSET) &
+ IO_CDB_WP_MASK >> IO_CDB_WP_SHIFT;
+ cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_10_CDB_LBA_OFFSET);
+ cdb_info->xfer_len = GET_U16_FROM_CDB(cmd, IO_10_CDB_TX_LEN_OFFSET);
+}
+
+static inline void nvme_trans_get_io_cdb12(u8 *cmd,
+ struct nvme_trans_io_cdb *cdb_info)
+{
+ cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_12_CDB_FUA_OFFSET) &
+ IO_CDB_FUA_MASK;
+ cdb_info->prot_info = GET_U8_FROM_CDB(cmd, IO_12_CDB_WP_OFFSET) &
+ IO_CDB_WP_MASK >> IO_CDB_WP_SHIFT;
+ cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_12_CDB_LBA_OFFSET);
+ cdb_info->xfer_len = GET_U32_FROM_CDB(cmd, IO_12_CDB_TX_LEN_OFFSET);
+}
+
+static inline void nvme_trans_get_io_cdb16(u8 *cmd,
+ struct nvme_trans_io_cdb *cdb_info)
+{
+ cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_16_CDB_FUA_OFFSET) &
+ IO_CDB_FUA_MASK;
+ cdb_info->prot_info = GET_U8_FROM_CDB(cmd, IO_16_CDB_WP_OFFSET) &
+ IO_CDB_WP_MASK >> IO_CDB_WP_SHIFT;
+ cdb_info->lba = GET_U64_FROM_CDB(cmd, IO_16_CDB_LBA_OFFSET);
+ cdb_info->xfer_len = GET_U32_FROM_CDB(cmd, IO_16_CDB_TX_LEN_OFFSET);
+}
+
+static inline u32 nvme_trans_io_get_num_cmds(struct sg_io_hdr *hdr,
+ struct nvme_trans_io_cdb *cdb_info,
+ u32 max_blocks)
+{
+ /* If using iovecs, send one nvme command per vector */
+ if (hdr->iovec_count > 0)
+ return hdr->iovec_count;
+ else if (cdb_info->xfer_len > max_blocks)
+ return ((cdb_info->xfer_len - 1) / max_blocks) + 1;
+ else
+ return 1;
+}
+
+static u16 nvme_trans_io_get_control(struct nvme_ns *ns,
+ struct nvme_trans_io_cdb *cdb_info)
+{
+ u16 control = 0;
+
+ /* When Protection information support is added, implement here */
+
+ if (cdb_info->fua > 0)
+ control |= NVME_RW_FUA;
+
+ return control;
+}
+
+static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ struct nvme_trans_io_cdb *cdb_info, u8 is_write)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int nvme_sc;
+ struct nvme_dev *dev = ns->dev;
+ u32 num_cmds;
+ struct nvme_iod *iod;
+ u64 unit_len;
+ u64 unit_num_blocks; /* Number of blocks to xfer in each nvme cmd */
+ u32 retcode;
+ u32 i = 0;
+ u64 nvme_offset = 0;
+ void __user *next_mapping_addr;
+ struct nvme_command c;
+ u8 opcode = (is_write ? nvme_cmd_write : nvme_cmd_read);
+ u16 control;
+ u32 max_blocks = queue_max_hw_sectors(ns->queue);
+
+ num_cmds = nvme_trans_io_get_num_cmds(hdr, cdb_info, max_blocks);
+
+ /*
+ * This loop handles two cases.
+ * First, when an SGL is used in the form of an iovec list:
+ * - Use iov_base as the next mapping address for the nvme command_id
+ * - Use iov_len as the data transfer length for the command.
+ * Second, when we have a single buffer
+ * - If larger than max_blocks, split into chunks, offset
+ * each nvme command accordingly.
+ */
+ for (i = 0; i < num_cmds; i++) {
+ memset(&c, 0, sizeof(c));
+ if (hdr->iovec_count > 0) {
+ struct sg_iovec sgl;
+
+ retcode = copy_from_user(&sgl, hdr->dxferp +
+ i * sizeof(struct sg_iovec),
+ sizeof(struct sg_iovec));
+ if (retcode)
+ return -EFAULT;
+ unit_len = sgl.iov_len;
+ unit_num_blocks = unit_len >> ns->lba_shift;
+ next_mapping_addr = sgl.iov_base;
+ } else {
+ unit_num_blocks = min((u64)max_blocks,
+ (cdb_info->xfer_len - nvme_offset));
+ unit_len = unit_num_blocks << ns->lba_shift;
+ next_mapping_addr = hdr->dxferp +
+ ((1 << ns->lba_shift) * nvme_offset);
+ }
+
+ c.rw.opcode = opcode;
+ c.rw.nsid = cpu_to_le32(ns->ns_id);
+ c.rw.slba = cpu_to_le64(cdb_info->lba + nvme_offset);
+ c.rw.length = cpu_to_le16(unit_num_blocks - 1);
+ control = nvme_trans_io_get_control(ns, cdb_info);
+ c.rw.control = cpu_to_le16(control);
+
+ iod = nvme_map_user_pages(dev,
+ (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
+ (unsigned long)next_mapping_addr, unit_len);
+ if (IS_ERR(iod)) {
+ res = PTR_ERR(iod);
+ goto out;
+ }
+ retcode = nvme_setup_prps(dev, iod, unit_len, GFP_KERNEL);
+ if (retcode != unit_len) {
+ nvme_unmap_user_pages(dev,
+ (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
+ iod);
+ nvme_free_iod(dev, iod);
+ res = -ENOMEM;
+ goto out;
+ }
+ c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg));
+ c.rw.prp2 = cpu_to_le64(iod->first_dma);
+
+ nvme_offset += unit_num_blocks;
+
+ nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL);
+ if (nvme_sc != NVME_SC_SUCCESS) {
+ nvme_unmap_user_pages(dev,
+ (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
+ iod);
+ nvme_free_iod(dev, iod);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ goto out;
+ }
+ nvme_unmap_user_pages(dev,
+ (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE,
+ iod);
+ nvme_free_iod(dev, iod);
+ }
+ res = nvme_trans_status_code(hdr, NVME_SC_SUCCESS);
+
+ out:
+ return res;
+}
+
+
+/* SCSI Command Translation Functions */
+
+static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write,
+ u8 *cmd)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ struct nvme_trans_io_cdb cdb_info;
+ u8 opcode = cmd[0];
+ u64 xfer_bytes;
+ u64 sum_iov_len = 0;
+ struct sg_iovec sgl;
+ int i;
+ size_t not_copied;
+
+ /* Extract Fields from CDB */
+ switch (opcode) {
+ case WRITE_6:
+ case READ_6:
+ nvme_trans_get_io_cdb6(cmd, &cdb_info);
+ break;
+ case WRITE_10:
+ case READ_10:
+ nvme_trans_get_io_cdb10(cmd, &cdb_info);
+ break;
+ case WRITE_12:
+ case READ_12:
+ nvme_trans_get_io_cdb12(cmd, &cdb_info);
+ break;
+ case WRITE_16:
+ case READ_16:
+ nvme_trans_get_io_cdb16(cmd, &cdb_info);
+ break;
+ default:
+ /* Will never really reach here */
+ res = SNTI_INTERNAL_ERROR;
+ goto out;
+ }
+
+ /* Calculate total length of transfer (in bytes) */
+ if (hdr->iovec_count > 0) {
+ for (i = 0; i < hdr->iovec_count; i++) {
+ not_copied = copy_from_user(&sgl, hdr->dxferp +
+ i * sizeof(struct sg_iovec),
+ sizeof(struct sg_iovec));
+ if (not_copied)
+ return -EFAULT;
+ sum_iov_len += sgl.iov_len;
+ /* IO vector sizes should be multiples of block size */
+ if (sgl.iov_len % (1 << ns->lba_shift) != 0) {
+ res = nvme_trans_completion(hdr,
+ SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST,
+ SCSI_ASC_INVALID_PARAMETER,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ goto out;
+ }
+ }
+ } else {
+ sum_iov_len = hdr->dxfer_len;
+ }
+
+ /* As Per sg ioctl howto, if the lengths differ, use the lower one */
+ xfer_bytes = min(((u64)hdr->dxfer_len), sum_iov_len);
+
+ /* If block count and actual data buffer size dont match, error out */
+ if (xfer_bytes != (cdb_info.xfer_len << ns->lba_shift)) {
+ res = -EINVAL;
+ goto out;
+ }
+
+ /* Check for 0 length transfer - it is not illegal */
+ if (cdb_info.xfer_len == 0)
+ goto out;
+
+ /* Send NVMe IO Command(s) */
+ res = nvme_trans_do_nvme_io(ns, hdr, &cdb_info, is_write);
+ if (res != SNTI_TRANSLATION_SUCCESS)
+ goto out;
+
+ out:
+ return res;
+}
+
+static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 *cmd)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ u8 evpd;
+ u8 page_code;
+ int alloc_len;
+ u8 *inq_response;
+
+ evpd = GET_INQ_EVPD_BIT(cmd);
+ page_code = GET_INQ_PAGE_CODE(cmd);
+ alloc_len = GET_INQ_ALLOC_LENGTH(cmd);
+
+ inq_response = kmalloc(max(alloc_len, STANDARD_INQUIRY_LENGTH),
+ GFP_KERNEL);
+ if (inq_response == NULL) {
+ res = -ENOMEM;
+ goto out_mem;
+ }
+
+ if (evpd == 0) {
+ if (page_code == INQ_STANDARD_INQUIRY_PAGE) {
+ res = nvme_trans_standard_inquiry_page(ns, hdr,
+ inq_response, alloc_len);
+ } else {
+ res = nvme_trans_completion(hdr,
+ SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST,
+ SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ }
+ } else {
+ switch (page_code) {
+ case VPD_SUPPORTED_PAGES:
+ res = nvme_trans_supported_vpd_pages(ns, hdr,
+ inq_response, alloc_len);
+ break;
+ case VPD_SERIAL_NUMBER:
+ res = nvme_trans_unit_serial_page(ns, hdr, inq_response,
+ alloc_len);
+ break;
+ case VPD_DEVICE_IDENTIFIERS:
+ res = nvme_trans_device_id_page(ns, hdr, inq_response,
+ alloc_len);
+ break;
+ case VPD_EXTENDED_INQUIRY:
+ res = nvme_trans_ext_inq_page(ns, hdr, alloc_len);
+ break;
+ case VPD_BLOCK_LIMITS:
+ res = nvme_trans_bdev_limits_page(ns, hdr, inq_response,
+ alloc_len);
+ break;
+ case VPD_BLOCK_DEV_CHARACTERISTICS:
+ res = nvme_trans_bdev_char_page(ns, hdr, alloc_len);
+ break;
+ default:
+ res = nvme_trans_completion(hdr,
+ SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST,
+ SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ break;
+ }
+ }
+ kfree(inq_response);
+ out_mem:
+ return res;
+}
+
+static int nvme_trans_log_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 *cmd)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ u16 alloc_len;
+ u8 sp;
+ u8 pc;
+ u8 page_code;
+
+ sp = GET_U8_FROM_CDB(cmd, LOG_SENSE_CDB_SP_OFFSET);
+ if (sp != LOG_SENSE_CDB_SP_NOT_ENABLED) {
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ goto out;
+ }
+ pc = GET_U8_FROM_CDB(cmd, LOG_SENSE_CDB_PC_OFFSET);
+ page_code = pc & LOG_SENSE_CDB_PAGE_CODE_MASK;
+ pc = (pc & LOG_SENSE_CDB_PC_MASK) >> LOG_SENSE_CDB_PC_SHIFT;
+ if (pc != LOG_SENSE_CDB_PC_CUMULATIVE_VALUES) {
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ goto out;
+ }
+ alloc_len = GET_U16_FROM_CDB(cmd, LOG_SENSE_CDB_ALLOC_LENGTH_OFFSET);
+ switch (page_code) {
+ case LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE:
+ res = nvme_trans_log_supp_pages(ns, hdr, alloc_len);
+ break;
+ case LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE:
+ res = nvme_trans_log_info_exceptions(ns, hdr, alloc_len);
+ break;
+ case LOG_PAGE_TEMPERATURE_PAGE:
+ res = nvme_trans_log_temperature(ns, hdr, alloc_len);
+ break;
+ default:
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ break;
+ }
+
+ out:
+ return res;
+}
+
+static int nvme_trans_mode_select(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 *cmd)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ u8 cdb10 = 0;
+ u16 parm_list_len;
+ u8 page_format;
+ u8 save_pages;
+
+ page_format = GET_U8_FROM_CDB(cmd, MODE_SELECT_CDB_PAGE_FORMAT_OFFSET);
+ page_format &= MODE_SELECT_CDB_PAGE_FORMAT_MASK;
+
+ save_pages = GET_U8_FROM_CDB(cmd, MODE_SELECT_CDB_SAVE_PAGES_OFFSET);
+ save_pages &= MODE_SELECT_CDB_SAVE_PAGES_MASK;
+
+ if (GET_OPCODE(cmd) == MODE_SELECT) {
+ parm_list_len = GET_U8_FROM_CDB(cmd,
+ MODE_SELECT_6_CDB_PARAM_LIST_LENGTH_OFFSET);
+ } else {
+ parm_list_len = GET_U16_FROM_CDB(cmd,
+ MODE_SELECT_10_CDB_PARAM_LIST_LENGTH_OFFSET);
+ cdb10 = 1;
+ }
+
+ if (parm_list_len != 0) {
+ /*
+ * According to SPC-4 r24, a paramter list length field of 0
+ * shall not be considered an error
+ */
+ res = nvme_trans_modesel_data(ns, hdr, cmd, parm_list_len,
+ page_format, save_pages, cdb10);
+ }
+
+ return res;
+}
+
+static int nvme_trans_mode_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 *cmd)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ u16 alloc_len;
+ u8 cdb10 = 0;
+ u8 page_code;
+ u8 pc;
+
+ if (GET_OPCODE(cmd) == MODE_SENSE) {
+ alloc_len = GET_U8_FROM_CDB(cmd, MODE_SENSE6_ALLOC_LEN_OFFSET);
+ } else {
+ alloc_len = GET_U16_FROM_CDB(cmd,
+ MODE_SENSE10_ALLOC_LEN_OFFSET);
+ cdb10 = 1;
+ }
+
+ pc = GET_U8_FROM_CDB(cmd, MODE_SENSE_PAGE_CONTROL_OFFSET) &
+ MODE_SENSE_PAGE_CONTROL_MASK;
+ if (pc != MODE_SENSE_PC_CURRENT_VALUES) {
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ goto out;
+ }
+
+ page_code = GET_U8_FROM_CDB(cmd, MODE_SENSE_PAGE_CODE_OFFSET) &
+ MODE_SENSE_PAGE_CODE_MASK;
+ switch (page_code) {
+ case MODE_PAGE_CACHING:
+ res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len,
+ cdb10,
+ &nvme_trans_fill_caching_page,
+ MODE_PAGE_CACHING_LEN);
+ break;
+ case MODE_PAGE_CONTROL:
+ res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len,
+ cdb10,
+ &nvme_trans_fill_control_page,
+ MODE_PAGE_CONTROL_LEN);
+ break;
+ case MODE_PAGE_POWER_CONDITION:
+ res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len,
+ cdb10,
+ &nvme_trans_fill_pow_cnd_page,
+ MODE_PAGE_POW_CND_LEN);
+ break;
+ case MODE_PAGE_INFO_EXCEP:
+ res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len,
+ cdb10,
+ &nvme_trans_fill_inf_exc_page,
+ MODE_PAGE_INF_EXC_LEN);
+ break;
+ case MODE_PAGE_RETURN_ALL:
+ res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len,
+ cdb10,
+ &nvme_trans_fill_all_pages,
+ MODE_PAGE_ALL_LEN);
+ break;
+ default:
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ break;
+ }
+
+ out:
+ return res;
+}
+
+static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 *cmd)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int nvme_sc;
+ u32 alloc_len = READ_CAP_10_RESP_SIZE;
+ u32 resp_size = READ_CAP_10_RESP_SIZE;
+ u32 xfer_len;
+ u8 cdb16;
+ struct nvme_dev *dev = ns->dev;
+ dma_addr_t dma_addr;
+ void *mem;
+ struct nvme_id_ns *id_ns;
+ u8 *response;
+
+ cdb16 = IS_READ_CAP_16(cmd);
+ if (cdb16) {
+ alloc_len = GET_READ_CAP_16_ALLOC_LENGTH(cmd);
+ resp_size = READ_CAP_16_RESP_SIZE;
+ }
+
+ mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns),
+ &dma_addr, GFP_KERNEL);
+ if (mem == NULL) {
+ res = -ENOMEM;
+ goto out;
+ }
+ /* nvme ns identify */
+ nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ if (res)
+ goto out_dma;
+ if (nvme_sc) {
+ res = nvme_sc;
+ goto out_dma;
+ }
+ id_ns = mem;
+
+ response = kzalloc(resp_size, GFP_KERNEL);
+ if (response == NULL) {
+ res = -ENOMEM;
+ goto out_dma;
+ }
+ nvme_trans_fill_read_cap(response, id_ns, cdb16);
+
+ xfer_len = min(alloc_len, resp_size);
+ res = nvme_trans_copy_to_user(hdr, response, xfer_len);
+
+ kfree(response);
+ out_dma:
+ dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
+ dma_addr);
+ out:
+ return res;
+}
+
+static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 *cmd)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int nvme_sc;
+ u32 alloc_len, xfer_len, resp_size;
+ u8 select_report;
+ u8 *response;
+ struct nvme_dev *dev = ns->dev;
+ dma_addr_t dma_addr;
+ void *mem;
+ struct nvme_id_ctrl *id_ctrl;
+ u32 ll_length, lun_id;
+ u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET;
+ __be32 tmp_len;
+
+ alloc_len = GET_REPORT_LUNS_ALLOC_LENGTH(cmd);
+ select_report = GET_U8_FROM_CDB(cmd, REPORT_LUNS_SR_OFFSET);
+
+ if ((select_report != ALL_LUNS_RETURNED) &&
+ (select_report != ALL_WELL_KNOWN_LUNS_RETURNED) &&
+ (select_report != RESTRICTED_LUNS_RETURNED)) {
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ goto out;
+ } else {
+ /* NVMe Controller Identify */
+ mem = dma_alloc_coherent(&dev->pci_dev->dev,
+ sizeof(struct nvme_id_ctrl),
+ &dma_addr, GFP_KERNEL);
+ if (mem == NULL) {
+ res = -ENOMEM;
+ goto out;
+ }
+ nvme_sc = nvme_identify(dev, 0, 1, dma_addr);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ if (res)
+ goto out_dma;
+ if (nvme_sc) {
+ res = nvme_sc;
+ goto out_dma;
+ }
+ id_ctrl = mem;
+ ll_length = le32_to_cpu(id_ctrl->nn) * LUN_ENTRY_SIZE;
+ resp_size = ll_length + LUN_DATA_HEADER_SIZE;
+
+ if (alloc_len < resp_size) {
+ res = nvme_trans_completion(hdr,
+ SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ goto out_dma;
+ }
+
+ response = kzalloc(resp_size, GFP_KERNEL);
+ if (response == NULL) {
+ res = -ENOMEM;
+ goto out_dma;
+ }
+
+ /* The first LUN ID will always be 0 per the SAM spec */
+ for (lun_id = 0; lun_id < le32_to_cpu(id_ctrl->nn); lun_id++) {
+ /*
+ * Set the LUN Id and then increment to the next LUN
+ * location in the parameter data.
+ */
+ __be64 tmp_id = cpu_to_be64(lun_id);
+ memcpy(&response[lun_id_offset], &tmp_id, sizeof(u64));
+ lun_id_offset += LUN_ENTRY_SIZE;
+ }
+ tmp_len = cpu_to_be32(ll_length);
+ memcpy(response, &tmp_len, sizeof(u32));
+ }
+
+ xfer_len = min(alloc_len, resp_size);
+ res = nvme_trans_copy_to_user(hdr, response, xfer_len);
+
+ kfree(response);
+ out_dma:
+ dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem,
+ dma_addr);
+ out:
+ return res;
+}
+
+static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 *cmd)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ u8 alloc_len, xfer_len, resp_size;
+ u8 desc_format;
+ u8 *response;
+
+ alloc_len = GET_REQUEST_SENSE_ALLOC_LENGTH(cmd);
+ desc_format = GET_U8_FROM_CDB(cmd, REQUEST_SENSE_DESC_OFFSET);
+ desc_format &= REQUEST_SENSE_DESC_MASK;
+
+ resp_size = ((desc_format) ? (DESC_FMT_SENSE_DATA_SIZE) :
+ (FIXED_FMT_SENSE_DATA_SIZE));
+ response = kzalloc(resp_size, GFP_KERNEL);
+ if (response == NULL) {
+ res = -ENOMEM;
+ goto out;
+ }
+
+ if (desc_format == DESCRIPTOR_FORMAT_SENSE_DATA_TYPE) {
+ /* Descriptor Format Sense Data */
+ response[0] = DESC_FORMAT_SENSE_DATA;
+ response[1] = NO_SENSE;
+ /* TODO How is LOW POWER CONDITION ON handled? (byte 2) */
+ response[2] = SCSI_ASC_NO_SENSE;
+ response[3] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ /* SDAT_OVFL = 0 | Additional Sense Length = 0 */
+ } else {
+ /* Fixed Format Sense Data */
+ response[0] = FIXED_SENSE_DATA;
+ /* Byte 1 = Obsolete */
+ response[2] = NO_SENSE; /* FM, EOM, ILI, SDAT_OVFL = 0 */
+ /* Bytes 3-6 - Information - set to zero */
+ response[7] = FIXED_SENSE_DATA_ADD_LENGTH;
+ /* Bytes 8-11 - Cmd Specific Information - set to zero */
+ response[12] = SCSI_ASC_NO_SENSE;
+ response[13] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE;
+ /* Byte 14 = Field Replaceable Unit Code = 0 */
+ /* Bytes 15-17 - SKSV=0; Sense Key Specific = 0 */
+ }
+
+ xfer_len = min(alloc_len, resp_size);
+ res = nvme_trans_copy_to_user(hdr, response, xfer_len);
+
+ kfree(response);
+ out:
+ return res;
+}
+
+static int nvme_trans_security_protocol(struct nvme_ns *ns,
+ struct sg_io_hdr *hdr,
+ u8 *cmd)
+{
+ return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+}
+
+static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 *cmd)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int nvme_sc;
+ struct nvme_command c;
+ u8 immed, pcmod, pc, no_flush, start;
+
+ immed = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_IMMED_OFFSET);
+ pcmod = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_POWER_COND_MOD_OFFSET);
+ pc = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_POWER_COND_OFFSET);
+ no_flush = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_NO_FLUSH_OFFSET);
+ start = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_START_OFFSET);
+
+ immed &= START_STOP_UNIT_CDB_IMMED_MASK;
+ pcmod &= START_STOP_UNIT_CDB_POWER_COND_MOD_MASK;
+ pc = (pc & START_STOP_UNIT_CDB_POWER_COND_MASK) >> NIBBLE_SHIFT;
+ no_flush &= START_STOP_UNIT_CDB_NO_FLUSH_MASK;
+ start &= START_STOP_UNIT_CDB_START_MASK;
+
+ if (immed != 0) {
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ } else {
+ if (no_flush == 0) {
+ /* Issue NVME FLUSH command prior to START STOP UNIT */
+ memset(&c, 0, sizeof(c));
+ c.common.opcode = nvme_cmd_flush;
+ c.common.nsid = cpu_to_le32(ns->ns_id);
+
+ nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ if (res)
+ goto out;
+ if (nvme_sc) {
+ res = nvme_sc;
+ goto out;
+ }
+ }
+ /* Setup the expected power state transition */
+ res = nvme_trans_power_state(ns, hdr, pc, pcmod, start);
+ }
+
+ out:
+ return res;
+}
+
+static int nvme_trans_synchronize_cache(struct nvme_ns *ns,
+ struct sg_io_hdr *hdr, u8 *cmd)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ int nvme_sc;
+ struct nvme_command c;
+
+ memset(&c, 0, sizeof(c));
+ c.common.opcode = nvme_cmd_flush;
+ c.common.nsid = cpu_to_le32(ns->ns_id);
+
+ nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL);
+
+ res = nvme_trans_status_code(hdr, nvme_sc);
+ if (res)
+ goto out;
+ if (nvme_sc)
+ res = nvme_sc;
+
+ out:
+ return res;
+}
+
+static int nvme_trans_format_unit(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 *cmd)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ u8 parm_hdr_len = 0;
+ u8 nvme_pf_code = 0;
+ u8 format_prot_info, long_list, format_data;
+
+ format_prot_info = GET_U8_FROM_CDB(cmd,
+ FORMAT_UNIT_CDB_FORMAT_PROT_INFO_OFFSET);
+ long_list = GET_U8_FROM_CDB(cmd, FORMAT_UNIT_CDB_LONG_LIST_OFFSET);
+ format_data = GET_U8_FROM_CDB(cmd, FORMAT_UNIT_CDB_FORMAT_DATA_OFFSET);
+
+ format_prot_info = (format_prot_info &
+ FORMAT_UNIT_CDB_FORMAT_PROT_INFO_MASK) >>
+ FORMAT_UNIT_CDB_FORMAT_PROT_INFO_SHIFT;
+ long_list &= FORMAT_UNIT_CDB_LONG_LIST_MASK;
+ format_data &= FORMAT_UNIT_CDB_FORMAT_DATA_MASK;
+
+ if (format_data != 0) {
+ if (format_prot_info != 0) {
+ if (long_list == 0)
+ parm_hdr_len = FORMAT_UNIT_SHORT_PARM_LIST_LEN;
+ else
+ parm_hdr_len = FORMAT_UNIT_LONG_PARM_LIST_LEN;
+ }
+ } else if (format_data == 0 && format_prot_info != 0) {
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ goto out;
+ }
+
+ /* Get parm header from data-in/out buffer */
+ /*
+ * According to the translation spec, the only fields in the parameter
+ * list we are concerned with are in the header. So allocate only that.
+ */
+ if (parm_hdr_len > 0) {
+ res = nvme_trans_fmt_get_parm_header(hdr, parm_hdr_len,
+ format_prot_info, &nvme_pf_code);
+ if (res != SNTI_TRANSLATION_SUCCESS)
+ goto out;
+ }
+
+ /* Attempt to activate any previously downloaded firmware image */
+ res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, 0, 0, 0);
+
+ /* Determine Block size and count and send format command */
+ res = nvme_trans_fmt_set_blk_size_count(ns, hdr);
+ if (res != SNTI_TRANSLATION_SUCCESS)
+ goto out;
+
+ res = nvme_trans_fmt_send_cmd(ns, hdr, nvme_pf_code);
+
+ out:
+ return res;
+}
+
+static int nvme_trans_test_unit_ready(struct nvme_ns *ns,
+ struct sg_io_hdr *hdr,
+ u8 *cmd)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ struct nvme_dev *dev = ns->dev;
+
+ if (!(readl(&dev->bar->csts) & NVME_CSTS_RDY))
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ NOT_READY, SCSI_ASC_LUN_NOT_READY,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ else
+ res = nvme_trans_completion(hdr, SAM_STAT_GOOD, NO_SENSE, 0, 0);
+
+ return res;
+}
+
+static int nvme_trans_write_buffer(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 *cmd)
+{
+ int res = SNTI_TRANSLATION_SUCCESS;
+ u32 buffer_offset, parm_list_length;
+ u8 buffer_id, mode;
+
+ parm_list_length =
+ GET_U24_FROM_CDB(cmd, WRITE_BUFFER_CDB_PARM_LIST_LENGTH_OFFSET);
+ if (parm_list_length % BYTES_TO_DWORDS != 0) {
+ /* NVMe expects Firmware file to be a whole number of DWORDS */
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ goto out;
+ }
+ buffer_id = GET_U8_FROM_CDB(cmd, WRITE_BUFFER_CDB_BUFFER_ID_OFFSET);
+ if (buffer_id > NVME_MAX_FIRMWARE_SLOT) {
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ goto out;
+ }
+ mode = GET_U8_FROM_CDB(cmd, WRITE_BUFFER_CDB_MODE_OFFSET) &
+ WRITE_BUFFER_CDB_MODE_MASK;
+ buffer_offset =
+ GET_U24_FROM_CDB(cmd, WRITE_BUFFER_CDB_BUFFER_OFFSET_OFFSET);
+
+ switch (mode) {
+ case DOWNLOAD_SAVE_ACTIVATE:
+ res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_download_fw,
+ parm_list_length, buffer_offset,
+ buffer_id);
+ if (res != SNTI_TRANSLATION_SUCCESS)
+ goto out;
+ res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw,
+ parm_list_length, buffer_offset,
+ buffer_id);
+ break;
+ case DOWNLOAD_SAVE_DEFER_ACTIVATE:
+ res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_download_fw,
+ parm_list_length, buffer_offset,
+ buffer_id);
+ break;
+ case ACTIVATE_DEFERRED_MICROCODE:
+ res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw,
+ parm_list_length, buffer_offset,
+ buffer_id);
+ break;
+ default:
+ res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ break;
+ }
+
+ out:
+ return res;
+}
+
+struct scsi_unmap_blk_desc {
+ __be64 slba;
+ __be32 nlb;
+ u32 resv;
+};
+
+struct scsi_unmap_parm_list {
+ __be16 unmap_data_len;
+ __be16 unmap_blk_desc_data_len;
+ u32 resv;
+ struct scsi_unmap_blk_desc desc[0];
+};
+
+static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+ u8 *cmd)
+{
+ struct nvme_dev *dev = ns->dev;
+ struct scsi_unmap_parm_list *plist;
+ struct nvme_dsm_range *range;
+ struct nvme_command c;
+ int i, nvme_sc, res = -ENOMEM;
+ u16 ndesc, list_len;
+ dma_addr_t dma_addr;
+
+ list_len = GET_U16_FROM_CDB(cmd, UNMAP_CDB_PARAM_LIST_LENGTH_OFFSET);
+ if (!list_len)
+ return -EINVAL;
+
+ plist = kmalloc(list_len, GFP_KERNEL);
+ if (!plist)
+ return -ENOMEM;
+
+ res = nvme_trans_copy_from_user(hdr, plist, list_len);
+ if (res != SNTI_TRANSLATION_SUCCESS)
+ goto out;
+
+ ndesc = be16_to_cpu(plist->unmap_blk_desc_data_len) >> 4;
+ if (!ndesc || ndesc > 256) {
+ res = -EINVAL;
+ goto out;
+ }
+
+ range = dma_alloc_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range),
+ &dma_addr, GFP_KERNEL);
+ if (!range)
+ goto out;
+
+ for (i = 0; i < ndesc; i++) {
+ range[i].nlb = cpu_to_le32(be32_to_cpu(plist->desc[i].nlb));
+ range[i].slba = cpu_to_le64(be64_to_cpu(plist->desc[i].slba));
+ range[i].cattr = 0;
+ }
+
+ memset(&c, 0, sizeof(c));
+ c.dsm.opcode = nvme_cmd_dsm;
+ c.dsm.nsid = cpu_to_le32(ns->ns_id);
+ c.dsm.prp1 = cpu_to_le64(dma_addr);
+ c.dsm.nr = cpu_to_le32(ndesc - 1);
+ c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD);
+
+ nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL);
+ res = nvme_trans_status_code(hdr, nvme_sc);
+
+ dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range),
+ range, dma_addr);
+ out:
+ kfree(plist);
+ return res;
+}
+
+static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr)
+{
+ u8 cmd[BLK_MAX_CDB];
+ int retcode;
+ unsigned int opcode;
+
+ if (hdr->cmdp == NULL)
+ return -EMSGSIZE;
+ if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len))
+ return -EFAULT;
+
+ /*
+ * Prime the hdr with good status for scsi commands that don't require
+ * an nvme command for translation.
+ */
+ retcode = nvme_trans_status_code(hdr, NVME_SC_SUCCESS);
+ if (retcode)
+ return retcode;
+
+ opcode = cmd[0];
+
+ switch (opcode) {
+ case READ_6:
+ case READ_10:
+ case READ_12:
+ case READ_16:
+ retcode = nvme_trans_io(ns, hdr, 0, cmd);
+ break;
+ case WRITE_6:
+ case WRITE_10:
+ case WRITE_12:
+ case WRITE_16:
+ retcode = nvme_trans_io(ns, hdr, 1, cmd);
+ break;
+ case INQUIRY:
+ retcode = nvme_trans_inquiry(ns, hdr, cmd);
+ break;
+ case LOG_SENSE:
+ retcode = nvme_trans_log_sense(ns, hdr, cmd);
+ break;
+ case MODE_SELECT:
+ case MODE_SELECT_10:
+ retcode = nvme_trans_mode_select(ns, hdr, cmd);
+ break;
+ case MODE_SENSE:
+ case MODE_SENSE_10:
+ retcode = nvme_trans_mode_sense(ns, hdr, cmd);
+ break;
+ case READ_CAPACITY:
+ retcode = nvme_trans_read_capacity(ns, hdr, cmd);
+ break;
+ case SERVICE_ACTION_IN_16:
+ if (IS_READ_CAP_16(cmd))
+ retcode = nvme_trans_read_capacity(ns, hdr, cmd);
+ else
+ goto out;
+ break;
+ case REPORT_LUNS:
+ retcode = nvme_trans_report_luns(ns, hdr, cmd);
+ break;
+ case REQUEST_SENSE:
+ retcode = nvme_trans_request_sense(ns, hdr, cmd);
+ break;
+ case SECURITY_PROTOCOL_IN:
+ case SECURITY_PROTOCOL_OUT:
+ retcode = nvme_trans_security_protocol(ns, hdr, cmd);
+ break;
+ case START_STOP:
+ retcode = nvme_trans_start_stop(ns, hdr, cmd);
+ break;
+ case SYNCHRONIZE_CACHE:
+ retcode = nvme_trans_synchronize_cache(ns, hdr, cmd);
+ break;
+ case FORMAT_UNIT:
+ retcode = nvme_trans_format_unit(ns, hdr, cmd);
+ break;
+ case TEST_UNIT_READY:
+ retcode = nvme_trans_test_unit_ready(ns, hdr, cmd);
+ break;
+ case WRITE_BUFFER:
+ retcode = nvme_trans_write_buffer(ns, hdr, cmd);
+ break;
+ case UNMAP:
+ retcode = nvme_trans_unmap(ns, hdr, cmd);
+ break;
+ default:
+ out:
+ retcode = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+ ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND,
+ SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+ break;
+ }
+ return retcode;
+}
+
+int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr)
+{
+ struct sg_io_hdr hdr;
+ int retcode;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ if (copy_from_user(&hdr, u_hdr, sizeof(hdr)))
+ return -EFAULT;
+ if (hdr.interface_id != 'S')
+ return -EINVAL;
+ if (hdr.cmd_len > BLK_MAX_CDB)
+ return -EINVAL;
+
+ retcode = nvme_scsi_translate(ns, &hdr);
+ if (retcode < 0)
+ return retcode;
+ if (retcode > 0)
+ retcode = SNTI_TRANSLATION_SUCCESS;
+ if (copy_to_user(u_hdr, &hdr, sizeof(sg_io_hdr_t)) > 0)
+ return -EFAULT;
+
+ return retcode;
+}
+
+int nvme_sg_get_version_num(int __user *ip)
+{
+ return put_user(sg_version_num, ip);
+}
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
new file mode 100644
index 000000000..e22942596
--- /dev/null
+++ b/drivers/block/osdblk.c
@@ -0,0 +1,699 @@
+
+/*
+ osdblk.c -- Export a single SCSI OSD object as a Linux block device
+
+
+ Copyright 2009 Red Hat, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+ Instructions for use
+ --------------------
+
+ 1) Map a Linux block device to an existing OSD object.
+
+ In this example, we will use partition id 1234, object id 5678,
+ OSD device /dev/osd1.
+
+ $ echo "1234 5678 /dev/osd1" > /sys/class/osdblk/add
+
+
+ 2) List all active blkdev<->object mappings.
+
+ In this example, we have performed step #1 twice, creating two blkdevs,
+ mapped to two separate OSD objects.
+
+ $ cat /sys/class/osdblk/list
+ 0 174 1234 5678 /dev/osd1
+ 1 179 1994 897123 /dev/osd0
+
+ The columns, in order, are:
+ - blkdev unique id
+ - blkdev assigned major
+ - OSD object partition id
+ - OSD object id
+ - OSD device
+
+
+ 3) Remove an active blkdev<->object mapping.
+
+ In this example, we remove the mapping with blkdev unique id 1.
+
+ $ echo 1 > /sys/class/osdblk/remove
+
+
+ NOTE: The actual creation and deletion of OSD objects is outside the scope
+ of this driver.
+
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <scsi/osd_initiator.h>
+#include <scsi/osd_attributes.h>
+#include <scsi/osd_sec.h>
+#include <scsi/scsi_device.h>
+
+#define DRV_NAME "osdblk"
+#define PFX DRV_NAME ": "
+
+/* #define _OSDBLK_DEBUG */
+#ifdef _OSDBLK_DEBUG
+#define OSDBLK_DEBUG(fmt, a...) \
+ printk(KERN_NOTICE "osdblk @%s:%d: " fmt, __func__, __LINE__, ##a)
+#else
+#define OSDBLK_DEBUG(fmt, a...) \
+ do { if (0) printk(fmt, ##a); } while (0)
+#endif
+
+MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
+MODULE_DESCRIPTION("block device inside an OSD object osdblk.ko");
+MODULE_LICENSE("GPL");
+
+struct osdblk_device;
+
+enum {
+ OSDBLK_MINORS_PER_MAJOR = 256, /* max minors per blkdev */
+ OSDBLK_MAX_REQ = 32, /* max parallel requests */
+ OSDBLK_OP_TIMEOUT = 4 * 60, /* sync OSD req timeout */
+};
+
+struct osdblk_request {
+ struct request *rq; /* blk layer request */
+ struct bio *bio; /* cloned bio */
+ struct osdblk_device *osdev; /* associated blkdev */
+};
+
+struct osdblk_device {
+ int id; /* blkdev unique id */
+
+ int major; /* blkdev assigned major */
+ struct gendisk *disk; /* blkdev's gendisk and rq */
+ struct request_queue *q;
+
+ struct osd_dev *osd; /* associated OSD */
+
+ char name[32]; /* blkdev name, e.g. osdblk34 */
+
+ spinlock_t lock; /* queue lock */
+
+ struct osd_obj_id obj; /* OSD partition, obj id */
+ uint8_t obj_cred[OSD_CAP_LEN]; /* OSD cred */
+
+ struct osdblk_request req[OSDBLK_MAX_REQ]; /* request table */
+
+ struct list_head node;
+
+ char osd_path[0]; /* OSD device path */
+};
+
+static struct class *class_osdblk; /* /sys/class/osdblk */
+static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
+static LIST_HEAD(osdblkdev_list);
+
+static const struct block_device_operations osdblk_bd_ops = {
+ .owner = THIS_MODULE,
+};
+
+static const struct osd_attr g_attr_logical_length = ATTR_DEF(
+ OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
+
+static void osdblk_make_credential(u8 cred_a[OSD_CAP_LEN],
+ const struct osd_obj_id *obj)
+{
+ osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
+}
+
+/* copied from exofs; move to libosd? */
+/*
+ * Perform a synchronous OSD operation. copied from exofs; move to libosd?
+ */
+static int osd_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
+{
+ int ret;
+
+ or->timeout = timeout;
+ ret = osd_finalize_request(or, 0, credential, NULL);
+ if (ret)
+ return ret;
+
+ ret = osd_execute_request(or);
+
+ /* osd_req_decode_sense(or, ret); */
+ return ret;
+}
+
+/*
+ * Perform an asynchronous OSD operation. copied from exofs; move to libosd?
+ */
+static int osd_async_op(struct osd_request *or, osd_req_done_fn *async_done,
+ void *caller_context, u8 *cred)
+{
+ int ret;
+
+ ret = osd_finalize_request(or, 0, cred, NULL);
+ if (ret)
+ return ret;
+
+ ret = osd_execute_request_async(or, async_done, caller_context);
+
+ return ret;
+}
+
+/* copied from exofs; move to libosd? */
+static int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
+{
+ struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
+ void *iter = NULL;
+ int nelem;
+
+ do {
+ nelem = 1;
+ osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
+ if ((cur_attr.attr_page == attr->attr_page) &&
+ (cur_attr.attr_id == attr->attr_id)) {
+ attr->len = cur_attr.len;
+ attr->val_ptr = cur_attr.val_ptr;
+ return 0;
+ }
+ } while (iter);
+
+ return -EIO;
+}
+
+static int osdblk_get_obj_size(struct osdblk_device *osdev, u64 *size_out)
+{
+ struct osd_request *or;
+ struct osd_attr attr;
+ int ret;
+
+ /* start request */
+ or = osd_start_request(osdev->osd, GFP_KERNEL);
+ if (!or)
+ return -ENOMEM;
+
+ /* create a get-attributes(length) request */
+ osd_req_get_attributes(or, &osdev->obj);
+
+ osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
+
+ /* execute op synchronously */
+ ret = osd_sync_op(or, OSDBLK_OP_TIMEOUT, osdev->obj_cred);
+ if (ret)
+ goto out;
+
+ /* extract length from returned attribute info */
+ attr = g_attr_logical_length;
+ ret = extract_attr_from_req(or, &attr);
+ if (ret)
+ goto out;
+
+ *size_out = get_unaligned_be64(attr.val_ptr);
+
+out:
+ osd_end_request(or);
+ return ret;
+
+}
+
+static void osdblk_osd_complete(struct osd_request *or, void *private)
+{
+ struct osdblk_request *orq = private;
+ struct osd_sense_info osi;
+ int ret = osd_req_decode_sense(or, &osi);
+
+ if (ret) {
+ ret = -EIO;
+ OSDBLK_DEBUG("osdblk_osd_complete with err=%d\n", ret);
+ }
+
+ /* complete OSD request */
+ osd_end_request(or);
+
+ /* complete request passed to osdblk by block layer */
+ __blk_end_request_all(orq->rq, ret);
+}
+
+static void bio_chain_put(struct bio *chain)
+{
+ struct bio *tmp;
+
+ while (chain) {
+ tmp = chain;
+ chain = chain->bi_next;
+
+ bio_put(tmp);
+ }
+}
+
+static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask)
+{
+ struct bio *tmp, *new_chain = NULL, *tail = NULL;
+
+ while (old_chain) {
+ tmp = bio_clone_kmalloc(old_chain, gfpmask);
+ if (!tmp)
+ goto err_out;
+
+ tmp->bi_bdev = NULL;
+ gfpmask &= ~__GFP_WAIT;
+ tmp->bi_next = NULL;
+
+ if (!new_chain)
+ new_chain = tail = tmp;
+ else {
+ tail->bi_next = tmp;
+ tail = tmp;
+ }
+
+ old_chain = old_chain->bi_next;
+ }
+
+ return new_chain;
+
+err_out:
+ OSDBLK_DEBUG("bio_chain_clone with err\n");
+ bio_chain_put(new_chain);
+ return NULL;
+}
+
+static void osdblk_rq_fn(struct request_queue *q)
+{
+ struct osdblk_device *osdev = q->queuedata;
+
+ while (1) {
+ struct request *rq;
+ struct osdblk_request *orq;
+ struct osd_request *or;
+ struct bio *bio;
+ bool do_write, do_flush;
+
+ /* peek at request from block layer */
+ rq = blk_fetch_request(q);
+ if (!rq)
+ break;
+
+ /* filter out block requests we don't understand */
+ if (rq->cmd_type != REQ_TYPE_FS) {
+ blk_end_request_all(rq, 0);
+ continue;
+ }
+
+ /* deduce our operation (read, write, flush) */
+ /* I wish the block layer simplified cmd_type/cmd_flags/cmd[]
+ * into a clearly defined set of RPC commands:
+ * read, write, flush, scsi command, power mgmt req,
+ * driver-specific, etc.
+ */
+
+ do_flush = rq->cmd_flags & REQ_FLUSH;
+ do_write = (rq_data_dir(rq) == WRITE);
+
+ if (!do_flush) { /* osd_flush does not use a bio */
+ /* a bio clone to be passed down to OSD request */
+ bio = bio_chain_clone(rq->bio, GFP_ATOMIC);
+ if (!bio)
+ break;
+ } else
+ bio = NULL;
+
+ /* alloc internal OSD request, for OSD command execution */
+ or = osd_start_request(osdev->osd, GFP_ATOMIC);
+ if (!or) {
+ bio_chain_put(bio);
+ OSDBLK_DEBUG("osd_start_request with err\n");
+ break;
+ }
+
+ orq = &osdev->req[rq->tag];
+ orq->rq = rq;
+ orq->bio = bio;
+ orq->osdev = osdev;
+
+ /* init OSD command: flush, write or read */
+ if (do_flush)
+ osd_req_flush_object(or, &osdev->obj,
+ OSD_CDB_FLUSH_ALL, 0, 0);
+ else if (do_write)
+ osd_req_write(or, &osdev->obj, blk_rq_pos(rq) * 512ULL,
+ bio, blk_rq_bytes(rq));
+ else
+ osd_req_read(or, &osdev->obj, blk_rq_pos(rq) * 512ULL,
+ bio, blk_rq_bytes(rq));
+
+ OSDBLK_DEBUG("%s 0x%x bytes at 0x%llx\n",
+ do_flush ? "flush" : do_write ?
+ "write" : "read", blk_rq_bytes(rq),
+ blk_rq_pos(rq) * 512ULL);
+
+ /* begin OSD command execution */
+ if (osd_async_op(or, osdblk_osd_complete, orq,
+ osdev->obj_cred)) {
+ osd_end_request(or);
+ blk_requeue_request(q, rq);
+ bio_chain_put(bio);
+ OSDBLK_DEBUG("osd_execute_request_async with err\n");
+ break;
+ }
+
+ /* remove the special 'flush' marker, now that the command
+ * is executing
+ */
+ rq->special = NULL;
+ }
+}
+
+static void osdblk_free_disk(struct osdblk_device *osdev)
+{
+ struct gendisk *disk = osdev->disk;
+
+ if (!disk)
+ return;
+
+ if (disk->flags & GENHD_FL_UP)
+ del_gendisk(disk);
+ if (disk->queue)
+ blk_cleanup_queue(disk->queue);
+ put_disk(disk);
+}
+
+static int osdblk_init_disk(struct osdblk_device *osdev)
+{
+ struct gendisk *disk;
+ struct request_queue *q;
+ int rc;
+ u64 obj_size = 0;
+
+ /* contact OSD, request size info about the object being mapped */
+ rc = osdblk_get_obj_size(osdev, &obj_size);
+ if (rc)
+ return rc;
+
+ /* create gendisk info */
+ disk = alloc_disk(OSDBLK_MINORS_PER_MAJOR);
+ if (!disk)
+ return -ENOMEM;
+
+ sprintf(disk->disk_name, DRV_NAME "%d", osdev->id);
+ disk->major = osdev->major;
+ disk->first_minor = 0;
+ disk->fops = &osdblk_bd_ops;
+ disk->private_data = osdev;
+
+ /* init rq */
+ q = blk_init_queue(osdblk_rq_fn, &osdev->lock);
+ if (!q) {
+ put_disk(disk);
+ return -ENOMEM;
+ }
+
+ /* switch queue to TCQ mode; allocate tag map */
+ rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL, BLK_TAG_ALLOC_FIFO);
+ if (rc) {
+ blk_cleanup_queue(q);
+ put_disk(disk);
+ return rc;
+ }
+
+ /* Set our limits to the lower device limits, because osdblk cannot
+ * sleep when allocating a lower-request and therefore cannot be
+ * bouncing.
+ */
+ blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
+
+ blk_queue_prep_rq(q, blk_queue_start_tag);
+ blk_queue_flush(q, REQ_FLUSH);
+
+ disk->queue = q;
+
+ q->queuedata = osdev;
+
+ osdev->disk = disk;
+ osdev->q = q;
+
+ /* finally, announce the disk to the world */
+ set_capacity(disk, obj_size / 512ULL);
+ add_disk(disk);
+
+ printk(KERN_INFO "%s: Added of size 0x%llx\n",
+ disk->disk_name, (unsigned long long)obj_size);
+
+ return 0;
+}
+
+/********************************************************************
+ * /sys/class/osdblk/
+ * add map OSD object to blkdev
+ * remove unmap OSD object
+ * list show mappings
+ *******************************************************************/
+
+static void class_osdblk_release(struct class *cls)
+{
+ kfree(cls);
+}
+
+static ssize_t class_osdblk_list(struct class *c,
+ struct class_attribute *attr,
+ char *data)
+{
+ int n = 0;
+ struct list_head *tmp;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ list_for_each(tmp, &osdblkdev_list) {
+ struct osdblk_device *osdev;
+
+ osdev = list_entry(tmp, struct osdblk_device, node);
+
+ n += sprintf(data+n, "%d %d %llu %llu %s\n",
+ osdev->id,
+ osdev->major,
+ osdev->obj.partition,
+ osdev->obj.id,
+ osdev->osd_path);
+ }
+
+ mutex_unlock(&ctl_mutex);
+ return n;
+}
+
+static ssize_t class_osdblk_add(struct class *c,
+ struct class_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct osdblk_device *osdev;
+ ssize_t rc;
+ int irc, new_id = 0;
+ struct list_head *tmp;
+
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
+ /* new osdblk_device object */
+ osdev = kzalloc(sizeof(*osdev) + strlen(buf) + 1, GFP_KERNEL);
+ if (!osdev) {
+ rc = -ENOMEM;
+ goto err_out_mod;
+ }
+
+ /* static osdblk_device initialization */
+ spin_lock_init(&osdev->lock);
+ INIT_LIST_HEAD(&osdev->node);
+
+ /* generate unique id: find highest unique id, add one */
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ list_for_each(tmp, &osdblkdev_list) {
+ struct osdblk_device *osdev;
+
+ osdev = list_entry(tmp, struct osdblk_device, node);
+ if (osdev->id > new_id)
+ new_id = osdev->id + 1;
+ }
+
+ osdev->id = new_id;
+
+ /* add to global list */
+ list_add_tail(&osdev->node, &osdblkdev_list);
+
+ mutex_unlock(&ctl_mutex);
+
+ /* parse add command */
+ if (sscanf(buf, "%llu %llu %s", &osdev->obj.partition, &osdev->obj.id,
+ osdev->osd_path) != 3) {
+ rc = -EINVAL;
+ goto err_out_slot;
+ }
+
+ /* initialize rest of new object */
+ sprintf(osdev->name, DRV_NAME "%d", osdev->id);
+
+ /* contact requested OSD */
+ osdev->osd = osduld_path_lookup(osdev->osd_path);
+ if (IS_ERR(osdev->osd)) {
+ rc = PTR_ERR(osdev->osd);
+ goto err_out_slot;
+ }
+
+ /* build OSD credential */
+ osdblk_make_credential(osdev->obj_cred, &osdev->obj);
+
+ /* register our block device */
+ irc = register_blkdev(0, osdev->name);
+ if (irc < 0) {
+ rc = irc;
+ goto err_out_osd;
+ }
+
+ osdev->major = irc;
+
+ /* set up and announce blkdev mapping */
+ rc = osdblk_init_disk(osdev);
+ if (rc)
+ goto err_out_blkdev;
+
+ return count;
+
+err_out_blkdev:
+ unregister_blkdev(osdev->major, osdev->name);
+err_out_osd:
+ osduld_put_device(osdev->osd);
+err_out_slot:
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+ list_del_init(&osdev->node);
+ mutex_unlock(&ctl_mutex);
+
+ kfree(osdev);
+err_out_mod:
+ OSDBLK_DEBUG("Error adding device %s\n", buf);
+ module_put(THIS_MODULE);
+ return rc;
+}
+
+static ssize_t class_osdblk_remove(struct class *c,
+ struct class_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct osdblk_device *osdev = NULL;
+ int target_id, rc;
+ unsigned long ul;
+ struct list_head *tmp;
+
+ rc = kstrtoul(buf, 10, &ul);
+ if (rc)
+ return rc;
+
+ /* convert to int; abort if we lost anything in the conversion */
+ target_id = (int) ul;
+ if (target_id != ul)
+ return -EINVAL;
+
+ /* remove object from list immediately */
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ list_for_each(tmp, &osdblkdev_list) {
+ osdev = list_entry(tmp, struct osdblk_device, node);
+ if (osdev->id == target_id) {
+ list_del_init(&osdev->node);
+ break;
+ }
+ osdev = NULL;
+ }
+
+ mutex_unlock(&ctl_mutex);
+
+ if (!osdev)
+ return -ENOENT;
+
+ /* clean up and free blkdev and associated OSD connection */
+ osdblk_free_disk(osdev);
+ unregister_blkdev(osdev->major, osdev->name);
+ osduld_put_device(osdev->osd);
+ kfree(osdev);
+
+ /* release module ref */
+ module_put(THIS_MODULE);
+
+ return count;
+}
+
+static struct class_attribute class_osdblk_attrs[] = {
+ __ATTR(add, 0200, NULL, class_osdblk_add),
+ __ATTR(remove, 0200, NULL, class_osdblk_remove),
+ __ATTR(list, 0444, class_osdblk_list, NULL),
+ __ATTR_NULL
+};
+
+static int osdblk_sysfs_init(void)
+{
+ int ret = 0;
+
+ /*
+ * create control files in sysfs
+ * /sys/class/osdblk/...
+ */
+ class_osdblk = kzalloc(sizeof(*class_osdblk), GFP_KERNEL);
+ if (!class_osdblk)
+ return -ENOMEM;
+
+ class_osdblk->name = DRV_NAME;
+ class_osdblk->owner = THIS_MODULE;
+ class_osdblk->class_release = class_osdblk_release;
+ class_osdblk->class_attrs = class_osdblk_attrs;
+
+ ret = class_register(class_osdblk);
+ if (ret) {
+ kfree(class_osdblk);
+ class_osdblk = NULL;
+ printk(PFX "failed to create class osdblk\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+static void osdblk_sysfs_cleanup(void)
+{
+ if (class_osdblk)
+ class_destroy(class_osdblk);
+ class_osdblk = NULL;
+}
+
+static int __init osdblk_init(void)
+{
+ int rc;
+
+ rc = osdblk_sysfs_init();
+ if (rc)
+ return rc;
+
+ return 0;
+}
+
+static void __exit osdblk_exit(void)
+{
+ osdblk_sysfs_cleanup();
+}
+
+module_init(osdblk_init);
+module_exit(osdblk_exit);
+
diff --git a/drivers/block/paride/Kconfig b/drivers/block/paride/Kconfig
new file mode 100644
index 000000000..efefb5ac3
--- /dev/null
+++ b/drivers/block/paride/Kconfig
@@ -0,0 +1,300 @@
+#
+# PARIDE configuration
+#
+# PARIDE doesn't need PARPORT, but if PARPORT is configured as a module,
+# PARIDE must also be a module.
+# PARIDE only supports PC style parports. Tough for USB or other parports...
+
+comment "Parallel IDE high-level drivers"
+ depends on PARIDE
+
+config PARIDE_PD
+ tristate "Parallel port IDE disks"
+ depends on PARIDE
+ help
+ This option enables the high-level driver for IDE-type disk devices
+ connected through a parallel port. If you chose to build PARIDE
+ support into your kernel, you may answer Y here to build in the
+ parallel port IDE driver, otherwise you should answer M to build
+ it as a loadable module. The module will be called pd. You
+ must also have at least one parallel port protocol driver in your
+ system. Among the devices supported by this driver are the SyQuest
+ EZ-135, EZ-230 and SparQ drives, the Avatar Shark and the backpack
+ hard drives from MicroSolutions.
+
+config PARIDE_PCD
+ tristate "Parallel port ATAPI CD-ROMs"
+ depends on PARIDE
+ ---help---
+ This option enables the high-level driver for ATAPI CD-ROM devices
+ connected through a parallel port. If you chose to build PARIDE
+ support into your kernel, you may answer Y here to build in the
+ parallel port ATAPI CD-ROM driver, otherwise you should answer M to
+ build it as a loadable module. The module will be called pcd. You
+ must also have at least one parallel port protocol driver in your
+ system. Among the devices supported by this driver are the
+ MicroSolutions backpack CD-ROM drives and the Freecom Power CD. If
+ you have such a CD-ROM drive, you should also say Y or M to "ISO
+ 9660 CD-ROM file system support" below, because that's the file
+ system used on CD-ROMs.
+
+config PARIDE_PF
+ tristate "Parallel port ATAPI disks"
+ depends on PARIDE
+ help
+ This option enables the high-level driver for ATAPI disk devices
+ connected through a parallel port. If you chose to build PARIDE
+ support into your kernel, you may answer Y here to build in the
+ parallel port ATAPI disk driver, otherwise you should answer M
+ to build it as a loadable module. The module will be called pf.
+ You must also have at least one parallel port protocol driver in
+ your system. Among the devices supported by this driver are the
+ MicroSolutions backpack PD/CD drive and the Imation Superdisk
+ LS-120 drive.
+
+config PARIDE_PT
+ tristate "Parallel port ATAPI tapes"
+ depends on PARIDE
+ help
+ This option enables the high-level driver for ATAPI tape devices
+ connected through a parallel port. If you chose to build PARIDE
+ support into your kernel, you may answer Y here to build in the
+ parallel port ATAPI disk driver, otherwise you should answer M
+ to build it as a loadable module. The module will be called pt.
+ You must also have at least one parallel port protocol driver in
+ your system. Among the devices supported by this driver is the
+ parallel port version of the HP 5GB drive.
+
+config PARIDE_PG
+ tristate "Parallel port generic ATAPI devices"
+ depends on PARIDE
+ ---help---
+ This option enables a special high-level driver for generic ATAPI
+ devices connected through a parallel port. The driver allows user
+ programs, such as cdrtools, to send ATAPI commands directly to a
+ device.
+
+ If you chose to build PARIDE support into your kernel, you may
+ answer Y here to build in the parallel port generic ATAPI driver,
+ otherwise you should answer M to build it as a loadable module. The
+ module will be called pg.
+
+ You must also have at least one parallel port protocol driver in
+ your system.
+
+ This driver implements an API loosely related to the generic SCSI
+ driver. See <file:include/linux/pg.h>. for details.
+
+ You can obtain the most recent version of cdrtools from
+ <ftp://ftp.berlios.de/pub/cdrecord/>. Versions 1.6.1a3 and
+ later fully support this driver.
+
+comment "Parallel IDE protocol modules"
+ depends on PARIDE
+
+config PARIDE_ATEN
+ tristate "ATEN EH-100 protocol"
+ depends on PARIDE
+ help
+ This option enables support for the ATEN EH-100 parallel port IDE
+ protocol. This protocol is used in some inexpensive low performance
+ parallel port kits made in Hong Kong. If you chose to build PARIDE
+ support into your kernel, you may answer Y here to build in the
+ protocol driver, otherwise you should answer M to build it as a
+ loadable module. The module will be called aten. You must also
+ have a high-level driver for the type of device that you want to
+ support.
+
+config PARIDE_BPCK
+ tristate "MicroSolutions backpack (Series 5) protocol"
+ depends on PARIDE
+ ---help---
+ This option enables support for the Micro Solutions BACKPACK
+ parallel port Series 5 IDE protocol. (Most BACKPACK drives made
+ before 1999 were Series 5) Series 5 drives will NOT always have the
+ Series noted on the bottom of the drive. Series 6 drivers will.
+
+ In other words, if your BACKPACK drive doesn't say "Series 6" on the
+ bottom, enable this option.
+
+ If you chose to build PARIDE support into your kernel, you may
+ answer Y here to build in the protocol driver, otherwise you should
+ answer M to build it as a loadable module. The module will be
+ called bpck. You must also have a high-level driver for the type
+ of device that you want to support.
+
+config PARIDE_BPCK6
+ tristate "MicroSolutions backpack (Series 6) protocol"
+ depends on PARIDE && !64BIT
+ ---help---
+ This option enables support for the Micro Solutions BACKPACK
+ parallel port Series 6 IDE protocol. (Most BACKPACK drives made
+ after 1999 were Series 6) Series 6 drives will have the Series noted
+ on the bottom of the drive. Series 5 drivers don't always have it
+ noted.
+
+ In other words, if your BACKPACK drive says "Series 6" on the
+ bottom, enable this option.
+
+ If you chose to build PARIDE support into your kernel, you may
+ answer Y here to build in the protocol driver, otherwise you should
+ answer M to build it as a loadable module. The module will be
+ called bpck6. You must also have a high-level driver for the type
+ of device that you want to support.
+
+config PARIDE_COMM
+ tristate "DataStor Commuter protocol"
+ depends on PARIDE
+ help
+ This option enables support for the Commuter parallel port IDE
+ protocol from DataStor. If you chose to build PARIDE support
+ into your kernel, you may answer Y here to build in the protocol
+ driver, otherwise you should answer M to build it as a loadable
+ module. The module will be called comm. You must also have
+ a high-level driver for the type of device that you want to support.
+
+config PARIDE_DSTR
+ tristate "DataStor EP-2000 protocol"
+ depends on PARIDE
+ help
+ This option enables support for the EP-2000 parallel port IDE
+ protocol from DataStor. If you chose to build PARIDE support
+ into your kernel, you may answer Y here to build in the protocol
+ driver, otherwise you should answer M to build it as a loadable
+ module. The module will be called dstr. You must also have
+ a high-level driver for the type of device that you want to support.
+
+config PARIDE_FIT2
+ tristate "FIT TD-2000 protocol"
+ depends on PARIDE
+ help
+ This option enables support for the TD-2000 parallel port IDE
+ protocol from Fidelity International Technology. This is a simple
+ (low speed) adapter that is used in some portable hard drives. If
+ you chose to build PARIDE support into your kernel, you may answer Y
+ here to build in the protocol driver, otherwise you should answer M
+ to build it as a loadable module. The module will be called ktti.
+ You must also have a high-level driver for the type of device that
+ you want to support.
+
+config PARIDE_FIT3
+ tristate "FIT TD-3000 protocol"
+ depends on PARIDE
+ help
+ This option enables support for the TD-3000 parallel port IDE
+ protocol from Fidelity International Technology. This protocol is
+ used in newer models of their portable disk, CD-ROM and PD/CD
+ devices. If you chose to build PARIDE support into your kernel, you
+ may answer Y here to build in the protocol driver, otherwise you
+ should answer M to build it as a loadable module. The module will be
+ called fit3. You must also have a high-level driver for the type
+ of device that you want to support.
+
+config PARIDE_EPAT
+ tristate "Shuttle EPAT/EPEZ protocol"
+ depends on PARIDE
+ help
+ This option enables support for the EPAT parallel port IDE protocol.
+ EPAT is a parallel port IDE adapter manufactured by Shuttle
+ Technology and widely used in devices from major vendors such as
+ Hewlett-Packard, SyQuest, Imation and Avatar. If you chose to build
+ PARIDE support into your kernel, you may answer Y here to build in
+ the protocol driver, otherwise you should answer M to build it as a
+ loadable module. The module will be called epat. You must also
+ have a high-level driver for the type of device that you want to
+ support.
+
+config PARIDE_EPATC8
+ bool "Support c7/c8 chips"
+ depends on PARIDE_EPAT
+ help
+ This option enables support for the newer Shuttle EP1284 (aka c7 and
+ c8) chip. You need this if you are using any recent Imation SuperDisk
+ (LS-120) drive.
+
+config PARIDE_EPIA
+ tristate "Shuttle EPIA protocol"
+ depends on PARIDE
+ help
+ This option enables support for the (obsolete) EPIA parallel port
+ IDE protocol from Shuttle Technology. This adapter can still be
+ found in some no-name kits. If you chose to build PARIDE support
+ into your kernel, you may answer Y here to build in the protocol
+ driver, otherwise you should answer M to build it as a loadable
+ module. The module will be called epia. You must also have a
+ high-level driver for the type of device that you want to support.
+
+config PARIDE_FRIQ
+ tristate "Freecom IQ ASIC-2 protocol"
+ depends on PARIDE
+ help
+ This option enables support for version 2 of the Freecom IQ parallel
+ port IDE adapter. This adapter is used by the Maxell Superdisk
+ drive. If you chose to build PARIDE support into your kernel, you
+ may answer Y here to build in the protocol driver, otherwise you
+ should answer M to build it as a loadable module. The module will be
+ called friq. You must also have a high-level driver for the type
+ of device that you want to support.
+
+config PARIDE_FRPW
+ tristate "FreeCom power protocol"
+ depends on PARIDE
+ help
+ This option enables support for the Freecom power parallel port IDE
+ protocol. If you chose to build PARIDE support into your kernel, you
+ may answer Y here to build in the protocol driver, otherwise you
+ should answer M to build it as a loadable module. The module will be
+ called frpw. You must also have a high-level driver for the type
+ of device that you want to support.
+
+config PARIDE_KBIC
+ tristate "KingByte KBIC-951A/971A protocols"
+ depends on PARIDE
+ help
+ This option enables support for the KBIC-951A and KBIC-971A parallel
+ port IDE protocols from KingByte Information Corp. KingByte's
+ adapters appear in many no-name portable disk and CD-ROM products,
+ especially in Europe. If you chose to build PARIDE support into your
+ kernel, you may answer Y here to build in the protocol driver,
+ otherwise you should answer M to build it as a loadable module. The
+ module will be called kbic. You must also have a high-level driver
+ for the type of device that you want to support.
+
+config PARIDE_KTTI
+ tristate "KT PHd protocol"
+ depends on PARIDE
+ help
+ This option enables support for the "PHd" parallel port IDE protocol
+ from KT Technology. This is a simple (low speed) adapter that is
+ used in some 2.5" portable hard drives. If you chose to build PARIDE
+ support into your kernel, you may answer Y here to build in the
+ protocol driver, otherwise you should answer M to build it as a
+ loadable module. The module will be called ktti. You must also
+ have a high-level driver for the type of device that you want to
+ support.
+
+config PARIDE_ON20
+ tristate "OnSpec 90c20 protocol"
+ depends on PARIDE
+ help
+ This option enables support for the (obsolete) 90c20 parallel port
+ IDE protocol from OnSpec (often marketed under the ValuStore brand
+ name). If you chose to build PARIDE support into your kernel, you
+ may answer Y here to build in the protocol driver, otherwise you
+ should answer M to build it as a loadable module. The module will
+ be called on20. You must also have a high-level driver for the
+ type of device that you want to support.
+
+config PARIDE_ON26
+ tristate "OnSpec 90c26 protocol"
+ depends on PARIDE
+ help
+ This option enables support for the 90c26 parallel port IDE protocol
+ from OnSpec Electronics (often marketed under the ValuStore brand
+ name). If you chose to build PARIDE support into your kernel, you
+ may answer Y here to build in the protocol driver, otherwise you
+ should answer M to build it as a loadable module. The module will be
+ called on26. You must also have a high-level driver for the type
+ of device that you want to support.
+
+#
diff --git a/drivers/block/paride/Makefile b/drivers/block/paride/Makefile
new file mode 100644
index 000000000..a539e004b
--- /dev/null
+++ b/drivers/block/paride/Makefile
@@ -0,0 +1,28 @@
+#
+# Makefile for Parallel port IDE device drivers.
+#
+# 7 October 2000, Bartlomiej Zolnierkiewicz <bkz@linux-ide.org>
+# Rewritten to use lists instead of if-statements.
+#
+
+obj-$(CONFIG_PARIDE) += paride.o
+obj-$(CONFIG_PARIDE_ATEN) += aten.o
+obj-$(CONFIG_PARIDE_BPCK) += bpck.o
+obj-$(CONFIG_PARIDE_COMM) += comm.o
+obj-$(CONFIG_PARIDE_DSTR) += dstr.o
+obj-$(CONFIG_PARIDE_KBIC) += kbic.o
+obj-$(CONFIG_PARIDE_EPAT) += epat.o
+obj-$(CONFIG_PARIDE_EPIA) += epia.o
+obj-$(CONFIG_PARIDE_FRPW) += frpw.o
+obj-$(CONFIG_PARIDE_FRIQ) += friq.o
+obj-$(CONFIG_PARIDE_FIT2) += fit2.o
+obj-$(CONFIG_PARIDE_FIT3) += fit3.o
+obj-$(CONFIG_PARIDE_ON20) += on20.o
+obj-$(CONFIG_PARIDE_ON26) += on26.o
+obj-$(CONFIG_PARIDE_KTTI) += ktti.o
+obj-$(CONFIG_PARIDE_BPCK6) += bpck6.o
+obj-$(CONFIG_PARIDE_PD) += pd.o
+obj-$(CONFIG_PARIDE_PCD) += pcd.o
+obj-$(CONFIG_PARIDE_PF) += pf.o
+obj-$(CONFIG_PARIDE_PT) += pt.o
+obj-$(CONFIG_PARIDE_PG) += pg.o
diff --git a/drivers/block/paride/Transition-notes b/drivers/block/paride/Transition-notes
new file mode 100644
index 000000000..70374907c
--- /dev/null
+++ b/drivers/block/paride/Transition-notes
@@ -0,0 +1,128 @@
+Lemma 1:
+ If ps_tq is scheduled, ps_tq_active is 1. ps_tq_int() can be called
+ only when ps_tq_active is 1.
+Proof: All assignments to ps_tq_active and all scheduling of ps_tq happen
+ under ps_spinlock. There are three places where that can happen:
+ one in ps_set_intr() (A) and two in ps_tq_int() (B and C).
+ Consider the sequnce of these events. A can not be preceded by
+ anything except B, since it is under if (!ps_tq_active) under
+ ps_spinlock. C is always preceded by B, since we can't reach it
+ other than through B and we don't drop ps_spinlock between them.
+ IOW, the sequence is A?(BA|BC|B)*. OTOH, number of B can not exceed
+ the sum of numbers of A and C, since each call of ps_tq_int() is
+ the result of ps_tq execution. Therefore, the sequence starts with
+ A and each B is preceded by either A or C. Moments when we enter
+ ps_tq_int() are sandwiched between {A,C} and B in that sequence,
+ since at any time number of B can not exceed the number of these
+ moments which, in turn, can not exceed the number of A and C.
+ In other words, the sequence of events is (A or C set ps_tq_active to
+ 1 and schedule ps_tq, ps_tq is executed, ps_tq_int() is entered,
+ B resets ps_tq_active)*.
+
+
+consider the following area:
+ * in do_pd_request1(): to calls of pi_do_claimed() and return in
+ case when pd_req is NULL.
+ * in next_request(): to call of do_pd_request1()
+ * in do_pd_read(): to call of ps_set_intr()
+ * in do_pd_read_start(): to calls of pi_do_claimed(), next_request()
+and ps_set_intr()
+ * in do_pd_read_drq(): to calls of pi_do_claimed() and next_request()
+ * in do_pd_write(): to call of ps_set_intr()
+ * in do_pd_write_start(): to calls of pi_do_claimed(), next_request()
+and ps_set_intr()
+ * in do_pd_write_done(): to calls of pi_do_claimed() and next_request()
+ * in ps_set_intr(): to check for ps_tq_active and to scheduling
+ ps_tq if ps_tq_active was 0.
+ * in ps_tq_int(): from the moment when we get ps_spinlock() to the
+ return, call of con() or scheduling ps_tq.
+ * in pi_schedule_claimed() when called from pi_do_claimed() called from
+ pd.c, everything until returning 1 or setting or setting ->claim_cont
+ on the path that returns 0
+ * in pi_do_claimed() when called from pd.c, everything until the call
+ of pi_do_claimed() plus the everything until the call of cont() if
+ pi_do_claimed() has returned 1.
+ * in pi_wake_up() called for PIA that belongs to pd.c, everything from
+ the moment when pi_spinlock has been acquired.
+
+Lemma 2:
+ 1) at any time at most one thread of execution can be in that area or
+ be preempted there.
+ 2) When there is such a thread, pd_busy is set or pd_lock is held by
+ that thread.
+ 3) When there is such a thread, ps_tq_active is 0 or ps_spinlock is
+ held by that thread.
+ 4) When there is such a thread, all PIA belonging to pd.c have NULL
+ ->claim_cont or pi_spinlock is held by thread in question.
+
+Proof: consider the first moment when the above is not true.
+
+(1) can become not true if some thread enters that area while another is there.
+ a) do_pd_request1() can be called from next_request() or do_pd_request()
+ In the first case the thread was already in the area. In the second,
+ the thread was holding pd_lock and found pd_busy not set, which would
+ mean that (2) was already not true.
+ b) ps_set_intr() and pi_schedule_claimed() can be called only from the
+ area.
+ c) pi_do_claimed() is called by pd.c only from the area.
+ d) ps_tq_int() can enter the area only when the thread is holding
+ ps_spinlock and ps_tq_active is 1 (due to Lemma 1). It means that
+ (3) was already not true.
+ e) do_pd_{read,write}* could be called only from the area. The only
+ case that needs consideration is call from pi_wake_up() and there
+ we would have to be called for the PIA that got ->claimed_cont
+ from pd.c. That could happen only if pi_do_claimed() had been
+ called from pd.c for that PIA, which happens only for PIA belonging
+ to pd.c.
+ f) pi_wake_up() can enter the area only when the thread is holding
+ pi_spinlock and ->claimed_cont is non-NULL for PIA belonging to
+ pd.c. It means that (4) was already not true.
+
+(2) can become not true only when pd_lock is released by the thread in question.
+ Indeed, pd_busy is reset only in the area and thread that resets
+ it is holding pd_lock. The only place within the area where we
+ release pd_lock is in pd_next_buf() (called from within the area).
+ But that code does not reset pd_busy, so pd_busy would have to be
+ 0 when pd_next_buf() had acquired pd_lock. If it become 0 while
+ we were acquiring the lock, (1) would be already false, since
+ the thread that had reset it would be in the area simulateously.
+ If it was 0 before we tried to acquire pd_lock, (2) would be
+ already false.
+
+For similar reasons, (3) can become not true only when ps_spinlock is released
+by the thread in question. However, all such places within the area are right
+after resetting ps_tq_active to 0.
+
+(4) is done the same way - all places where we release pi_spinlock within
+the area are either after resetting ->claimed_cont to NULL while holding
+pi_spinlock, or after not tocuhing ->claimed_cont since acquiring pi_spinlock
+also in the area. The only place where ->claimed_cont is made non-NULL is
+in the area, under pi_spinlock and we do not release it until after leaving
+the area.
+
+QED.
+
+
+Corollary 1: ps_tq_active can be killed. Indeed, the only place where we
+check its value is in ps_set_intr() and if it had been non-zero at that
+point, we would have violated either (2.1) (if it was set while ps_set_intr()
+was acquiring ps_spinlock) or (2.3) (if it was set when we started to
+acquire ps_spinlock).
+
+Corollary 2: ps_spinlock can be killed. Indeed, Lemma 1 and Lemma 2 show
+that the only possible contention is between scheduling ps_tq followed by
+immediate release of spinlock and beginning of execution of ps_tq on
+another CPU.
+
+Corollary 3: assignment to pd_busy in do_pd_read_start() and do_pd_write_start()
+can be killed. Indeed, we are not holding pd_lock and thus pd_busy is already
+1 here.
+
+Corollary 4: in ps_tq_int() uses of con can be replaced with uses of
+ps_continuation, since the latter is changed only from the area.
+We don't need to reset it to NULL, since we are guaranteed that there
+will be a call of ps_set_intr() before we look at ps_continuation again.
+We can remove the check for ps_continuation being NULL for the same
+reason - the value is guaranteed to be set by the last ps_set_intr() and
+we never pass it NULL. Assignements in the beginning of ps_set_intr()
+can be taken to callers as long as they remain within the area.
diff --git a/drivers/block/paride/aten.c b/drivers/block/paride/aten.c
new file mode 100644
index 000000000..269546556
--- /dev/null
+++ b/drivers/block/paride/aten.c
@@ -0,0 +1,162 @@
+/*
+ aten.c (c) 1997-8 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ aten.c is a low-level protocol driver for the ATEN EH-100
+ parallel port adapter. The EH-100 supports 4-bit and 8-bit
+ modes only. There is also an EH-132 which supports EPP mode
+ transfers. The EH-132 is not yet supported.
+
+*/
+
+/* Changes:
+
+ 1.01 GRG 1998.05.05 init_proto, release_proto
+
+*/
+
+#define ATEN_VERSION "1.01"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/wait.h>
+#include <linux/types.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#define j44(a,b) ((((a>>4)&0x0f)|(b&0xf0))^0x88)
+
+/* cont = 0 - access the IDE register file
+ cont = 1 - access the IDE command set
+*/
+
+static int cont_map[2] = { 0x08, 0x20 };
+
+static void aten_write_regr( PIA *pi, int cont, int regr, int val)
+
+{ int r;
+
+ r = regr + cont_map[cont] + 0x80;
+
+ w0(r); w2(0xe); w2(6); w0(val); w2(7); w2(6); w2(0xc);
+}
+
+static int aten_read_regr( PIA *pi, int cont, int regr )
+
+{ int a, b, r;
+
+ r = regr + cont_map[cont] + 0x40;
+
+ switch (pi->mode) {
+
+ case 0: w0(r); w2(0xe); w2(6);
+ w2(7); w2(6); w2(0);
+ a = r1(); w0(0x10); b = r1(); w2(0xc);
+ return j44(a,b);
+
+ case 1: r |= 0x10;
+ w0(r); w2(0xe); w2(6); w0(0xff);
+ w2(0x27); w2(0x26); w2(0x20);
+ a = r0();
+ w2(0x26); w2(0xc);
+ return a;
+ }
+ return -1;
+}
+
+static void aten_read_block( PIA *pi, char * buf, int count )
+
+{ int k, a, b, c, d;
+
+ switch (pi->mode) {
+
+ case 0: w0(0x48); w2(0xe); w2(6);
+ for (k=0;k<count/2;k++) {
+ w2(7); w2(6); w2(2);
+ a = r1(); w0(0x58); b = r1();
+ w2(0); d = r1(); w0(0x48); c = r1();
+ buf[2*k] = j44(c,d);
+ buf[2*k+1] = j44(a,b);
+ }
+ w2(0xc);
+ break;
+
+ case 1: w0(0x58); w2(0xe); w2(6);
+ for (k=0;k<count/2;k++) {
+ w2(0x27); w2(0x26); w2(0x22);
+ a = r0(); w2(0x20); b = r0();
+ buf[2*k] = b; buf[2*k+1] = a;
+ }
+ w2(0x26); w2(0xc);
+ break;
+ }
+}
+
+static void aten_write_block( PIA *pi, char * buf, int count )
+
+{ int k;
+
+ w0(0x88); w2(0xe); w2(6);
+ for (k=0;k<count/2;k++) {
+ w0(buf[2*k+1]); w2(0xe); w2(6);
+ w0(buf[2*k]); w2(7); w2(6);
+ }
+ w2(0xc);
+}
+
+static void aten_connect ( PIA *pi )
+
+{ pi->saved_r0 = r0();
+ pi->saved_r2 = r2();
+ w2(0xc);
+}
+
+static void aten_disconnect ( PIA *pi )
+
+{ w0(pi->saved_r0);
+ w2(pi->saved_r2);
+}
+
+static void aten_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{ char *mode_string[2] = {"4-bit","8-bit"};
+
+ printk("%s: aten %s, ATEN EH-100 at 0x%x, ",
+ pi->device,ATEN_VERSION,pi->port);
+ printk("mode %d (%s), delay %d\n",pi->mode,
+ mode_string[pi->mode],pi->delay);
+
+}
+
+static struct pi_protocol aten = {
+ .owner = THIS_MODULE,
+ .name = "aten",
+ .max_mode = 2,
+ .epp_first = 2,
+ .default_delay = 1,
+ .max_units = 1,
+ .write_regr = aten_write_regr,
+ .read_regr = aten_read_regr,
+ .write_block = aten_write_block,
+ .read_block = aten_read_block,
+ .connect = aten_connect,
+ .disconnect = aten_disconnect,
+ .log_adapter = aten_log_adapter,
+};
+
+static int __init aten_init(void)
+{
+ return paride_register(&aten);
+}
+
+static void __exit aten_exit(void)
+{
+ paride_unregister( &aten );
+}
+
+MODULE_LICENSE("GPL");
+module_init(aten_init)
+module_exit(aten_exit)
diff --git a/drivers/block/paride/bpck.c b/drivers/block/paride/bpck.c
new file mode 100644
index 000000000..4f27e7392
--- /dev/null
+++ b/drivers/block/paride/bpck.c
@@ -0,0 +1,477 @@
+/*
+ bpck.c (c) 1996-8 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ bpck.c is a low-level protocol driver for the MicroSolutions
+ "backpack" parallel port IDE adapter.
+
+*/
+
+/* Changes:
+
+ 1.01 GRG 1998.05.05 init_proto, release_proto, pi->delay
+ 1.02 GRG 1998.08.15 default pi->delay returned to 4
+
+*/
+
+#define BPCK_VERSION "1.02"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#undef r2
+#undef w2
+
+#define PC pi->private
+#define r2() (PC=(in_p(2) & 0xff))
+#define w2(byte) {out_p(2,byte); PC = byte;}
+#define t2(pat) {PC ^= pat; out_p(2,PC);}
+#define e2() {PC &= 0xfe; out_p(2,PC);}
+#define o2() {PC |= 1; out_p(2,PC);}
+
+#define j44(l,h) (((l>>3)&0x7)|((l>>4)&0x8)|((h<<1)&0x70)|(h&0x80))
+
+/* cont = 0 - access the IDE register file
+ cont = 1 - access the IDE command set
+ cont = 2 - use internal bpck register addressing
+*/
+
+static int cont_map[3] = { 0x40, 0x48, 0 };
+
+static int bpck_read_regr( PIA *pi, int cont, int regr )
+
+{ int r, l, h;
+
+ r = regr + cont_map[cont];
+
+ switch (pi->mode) {
+
+ case 0: w0(r & 0xf); w0(r); t2(2); t2(4);
+ l = r1();
+ t2(4);
+ h = r1();
+ return j44(l,h);
+
+ case 1: w0(r & 0xf); w0(r); t2(2);
+ e2(); t2(0x20);
+ t2(4); h = r0();
+ t2(1); t2(0x20);
+ return h;
+
+ case 2:
+ case 3:
+ case 4: w0(r); w2(9); w2(0); w2(0x20);
+ h = r4();
+ w2(0);
+ return h;
+
+ }
+ return -1;
+}
+
+static void bpck_write_regr( PIA *pi, int cont, int regr, int val )
+
+{ int r;
+
+ r = regr + cont_map[cont];
+
+ switch (pi->mode) {
+
+ case 0:
+ case 1: w0(r);
+ t2(2);
+ w0(val);
+ o2(); t2(4); t2(1);
+ break;
+
+ case 2:
+ case 3:
+ case 4: w0(r); w2(9); w2(0);
+ w0(val); w2(1); w2(3); w2(0);
+ break;
+
+ }
+}
+
+/* These macros access the bpck registers in native addressing */
+
+#define WR(r,v) bpck_write_regr(pi,2,r,v)
+#define RR(r) (bpck_read_regr(pi,2,r))
+
+static void bpck_write_block( PIA *pi, char * buf, int count )
+
+{ int i;
+
+ switch (pi->mode) {
+
+ case 0: WR(4,0x40);
+ w0(0x40); t2(2); t2(1);
+ for (i=0;i<count;i++) { w0(buf[i]); t2(4); }
+ WR(4,0);
+ break;
+
+ case 1: WR(4,0x50);
+ w0(0x40); t2(2); t2(1);
+ for (i=0;i<count;i++) { w0(buf[i]); t2(4); }
+ WR(4,0x10);
+ break;
+
+ case 2: WR(4,0x48);
+ w0(0x40); w2(9); w2(0); w2(1);
+ for (i=0;i<count;i++) w4(buf[i]);
+ w2(0);
+ WR(4,8);
+ break;
+
+ case 3: WR(4,0x48);
+ w0(0x40); w2(9); w2(0); w2(1);
+ for (i=0;i<count/2;i++) w4w(((u16 *)buf)[i]);
+ w2(0);
+ WR(4,8);
+ break;
+
+ case 4: WR(4,0x48);
+ w0(0x40); w2(9); w2(0); w2(1);
+ for (i=0;i<count/4;i++) w4l(((u32 *)buf)[i]);
+ w2(0);
+ WR(4,8);
+ break;
+ }
+}
+
+static void bpck_read_block( PIA *pi, char * buf, int count )
+
+{ int i, l, h;
+
+ switch (pi->mode) {
+
+ case 0: WR(4,0x40);
+ w0(0x40); t2(2);
+ for (i=0;i<count;i++) {
+ t2(4); l = r1();
+ t2(4); h = r1();
+ buf[i] = j44(l,h);
+ }
+ WR(4,0);
+ break;
+
+ case 1: WR(4,0x50);
+ w0(0x40); t2(2); t2(0x20);
+ for(i=0;i<count;i++) { t2(4); buf[i] = r0(); }
+ t2(1); t2(0x20);
+ WR(4,0x10);
+ break;
+
+ case 2: WR(4,0x48);
+ w0(0x40); w2(9); w2(0); w2(0x20);
+ for (i=0;i<count;i++) buf[i] = r4();
+ w2(0);
+ WR(4,8);
+ break;
+
+ case 3: WR(4,0x48);
+ w0(0x40); w2(9); w2(0); w2(0x20);
+ for (i=0;i<count/2;i++) ((u16 *)buf)[i] = r4w();
+ w2(0);
+ WR(4,8);
+ break;
+
+ case 4: WR(4,0x48);
+ w0(0x40); w2(9); w2(0); w2(0x20);
+ for (i=0;i<count/4;i++) ((u32 *)buf)[i] = r4l();
+ w2(0);
+ WR(4,8);
+ break;
+
+ }
+}
+
+static int bpck_probe_unit ( PIA *pi )
+
+{ int o1, o0, f7, id;
+ int t, s;
+
+ id = pi->unit;
+ s = 0;
+ w2(4); w2(0xe); r2(); t2(2);
+ o1 = r1()&0xf8;
+ o0 = r0();
+ w0(255-id); w2(4); w0(id);
+ t2(8); t2(8); t2(8);
+ t2(2); t = r1()&0xf8;
+ f7 = ((id % 8) == 7);
+ if ((f7) || (t != o1)) { t2(2); s = r1()&0xf8; }
+ if ((t == o1) && ((!f7) || (s == o1))) {
+ w2(0x4c); w0(o0);
+ return 0;
+ }
+ t2(8); w0(0); t2(2); w2(0x4c); w0(o0);
+ return 1;
+}
+
+static void bpck_connect ( PIA *pi )
+
+{ pi->saved_r0 = r0();
+ w0(0xff-pi->unit); w2(4); w0(pi->unit);
+ t2(8); t2(8); t2(8);
+ t2(2); t2(2);
+
+ switch (pi->mode) {
+
+ case 0: t2(8); WR(4,0);
+ break;
+
+ case 1: t2(8); WR(4,0x10);
+ break;
+
+ case 2:
+ case 3:
+ case 4: w2(0); WR(4,8);
+ break;
+
+ }
+
+ WR(5,8);
+
+ if (pi->devtype == PI_PCD) {
+ WR(0x46,0x10); /* fiddle with ESS logic ??? */
+ WR(0x4c,0x38);
+ WR(0x4d,0x88);
+ WR(0x46,0xa0);
+ WR(0x41,0);
+ WR(0x4e,8);
+ }
+}
+
+static void bpck_disconnect ( PIA *pi )
+
+{ w0(0);
+ if (pi->mode >= 2) { w2(9); w2(0); } else t2(2);
+ w2(0x4c); w0(pi->saved_r0);
+}
+
+static void bpck_force_spp ( PIA *pi )
+
+/* This fakes the EPP protocol to turn off EPP ... */
+
+{ pi->saved_r0 = r0();
+ w0(0xff-pi->unit); w2(4); w0(pi->unit);
+ t2(8); t2(8); t2(8);
+ t2(2); t2(2);
+
+ w2(0);
+ w0(4); w2(9); w2(0);
+ w0(0); w2(1); w2(3); w2(0);
+ w0(0); w2(9); w2(0);
+ w2(0x4c); w0(pi->saved_r0);
+}
+
+#define TEST_LEN 16
+
+static int bpck_test_proto( PIA *pi, char * scratch, int verbose )
+
+{ int i, e, l, h, om;
+ char buf[TEST_LEN];
+
+ bpck_force_spp(pi);
+
+ switch (pi->mode) {
+
+ case 0: bpck_connect(pi);
+ WR(0x13,0x7f);
+ w0(0x13); t2(2);
+ for(i=0;i<TEST_LEN;i++) {
+ t2(4); l = r1();
+ t2(4); h = r1();
+ buf[i] = j44(l,h);
+ }
+ bpck_disconnect(pi);
+ break;
+
+ case 1: bpck_connect(pi);
+ WR(0x13,0x7f);
+ w0(0x13); t2(2); t2(0x20);
+ for(i=0;i<TEST_LEN;i++) { t2(4); buf[i] = r0(); }
+ t2(1); t2(0x20);
+ bpck_disconnect(pi);
+ break;
+
+ case 2:
+ case 3:
+ case 4: om = pi->mode;
+ pi->mode = 0;
+ bpck_connect(pi);
+ WR(7,3);
+ WR(4,8);
+ bpck_disconnect(pi);
+
+ pi->mode = om;
+ bpck_connect(pi);
+ w0(0x13); w2(9); w2(1); w0(0); w2(3); w2(0); w2(0xe0);
+
+ switch (pi->mode) {
+ case 2: for (i=0;i<TEST_LEN;i++) buf[i] = r4();
+ break;
+ case 3: for (i=0;i<TEST_LEN/2;i++) ((u16 *)buf)[i] = r4w();
+ break;
+ case 4: for (i=0;i<TEST_LEN/4;i++) ((u32 *)buf)[i] = r4l();
+ break;
+ }
+
+ w2(0);
+ WR(7,0);
+ bpck_disconnect(pi);
+
+ break;
+
+ }
+
+ if (verbose) {
+ printk("%s: bpck: 0x%x unit %d mode %d: ",
+ pi->device,pi->port,pi->unit,pi->mode);
+ for (i=0;i<TEST_LEN;i++) printk("%3d",buf[i]);
+ printk("\n");
+ }
+
+ e = 0;
+ for (i=0;i<TEST_LEN;i++) if (buf[i] != (i+1)) e++;
+ return e;
+}
+
+static void bpck_read_eeprom ( PIA *pi, char * buf )
+
+{ int i,j,k,n,p,v,f, om, od;
+
+ bpck_force_spp(pi);
+
+ om = pi->mode; od = pi->delay;
+ pi->mode = 0; pi->delay = 6;
+
+ bpck_connect(pi);
+
+ n = 0;
+ WR(4,0);
+ for (i=0;i<64;i++) {
+ WR(6,8);
+ WR(6,0xc);
+ p = 0x100;
+ for (k=0;k<9;k++) {
+ f = (((i + 0x180) & p) != 0) * 2;
+ WR(6,f+0xc);
+ WR(6,f+0xd);
+ WR(6,f+0xc);
+ p = (p >> 1);
+ }
+ for (j=0;j<2;j++) {
+ v = 0;
+ for (k=0;k<8;k++) {
+ WR(6,0xc);
+ WR(6,0xd);
+ WR(6,0xc);
+ f = RR(0);
+ v = 2*v + (f == 0x84);
+ }
+ buf[2*i+1-j] = v;
+ }
+ }
+ WR(6,8);
+ WR(6,0);
+ WR(5,8);
+
+ bpck_disconnect(pi);
+
+ if (om >= 2) {
+ bpck_connect(pi);
+ WR(7,3);
+ WR(4,8);
+ bpck_disconnect(pi);
+ }
+
+ pi->mode = om; pi->delay = od;
+}
+
+static int bpck_test_port ( PIA *pi ) /* check for 8-bit port */
+
+{ int i, r, m;
+
+ w2(0x2c); i = r0(); w0(255-i); r = r0(); w0(i);
+ m = -1;
+ if (r == i) m = 2;
+ if (r == (255-i)) m = 0;
+
+ w2(0xc); i = r0(); w0(255-i); r = r0(); w0(i);
+ if (r != (255-i)) m = -1;
+
+ if (m == 0) { w2(6); w2(0xc); r = r0(); w0(0xaa); w0(r); w0(0xaa); }
+ if (m == 2) { w2(0x26); w2(0xc); }
+
+ if (m == -1) return 0;
+ return 5;
+}
+
+static void bpck_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{ char *mode_string[5] = { "4-bit","8-bit","EPP-8",
+ "EPP-16","EPP-32" };
+
+#ifdef DUMP_EEPROM
+ int i;
+#endif
+
+ bpck_read_eeprom(pi,scratch);
+
+#ifdef DUMP_EEPROM
+ if (verbose) {
+ for(i=0;i<128;i++)
+ if ((scratch[i] < ' ') || (scratch[i] > '~'))
+ scratch[i] = '.';
+ printk("%s: bpck EEPROM: %64.64s\n",pi->device,scratch);
+ printk("%s: %64.64s\n",pi->device,&scratch[64]);
+ }
+#endif
+
+ printk("%s: bpck %s, backpack %8.8s unit %d",
+ pi->device,BPCK_VERSION,&scratch[110],pi->unit);
+ printk(" at 0x%x, mode %d (%s), delay %d\n",pi->port,
+ pi->mode,mode_string[pi->mode],pi->delay);
+}
+
+static struct pi_protocol bpck = {
+ .owner = THIS_MODULE,
+ .name = "bpck",
+ .max_mode = 5,
+ .epp_first = 2,
+ .default_delay = 4,
+ .max_units = 255,
+ .write_regr = bpck_write_regr,
+ .read_regr = bpck_read_regr,
+ .write_block = bpck_write_block,
+ .read_block = bpck_read_block,
+ .connect = bpck_connect,
+ .disconnect = bpck_disconnect,
+ .test_port = bpck_test_port,
+ .probe_unit = bpck_probe_unit,
+ .test_proto = bpck_test_proto,
+ .log_adapter = bpck_log_adapter,
+};
+
+static int __init bpck_init(void)
+{
+ return paride_register(&bpck);
+}
+
+static void __exit bpck_exit(void)
+{
+ paride_unregister(&bpck);
+}
+
+MODULE_LICENSE("GPL");
+module_init(bpck_init)
+module_exit(bpck_exit)
diff --git a/drivers/block/paride/bpck6.c b/drivers/block/paride/bpck6.c
new file mode 100644
index 000000000..ec64e7f5d
--- /dev/null
+++ b/drivers/block/paride/bpck6.c
@@ -0,0 +1,267 @@
+/*
+ backpack.c (c) 2001 Micro Solutions Inc.
+ Released under the terms of the GNU General Public license
+
+ backpack.c is a low-level protocol driver for the Micro Solutions
+ "BACKPACK" parallel port IDE adapter
+ (Works on Series 6 drives)
+
+ Written by: Ken Hahn (linux-dev@micro-solutions.com)
+ Clive Turvey (linux-dev@micro-solutions.com)
+
+*/
+
+/*
+ This is Ken's linux wrapper for the PPC library
+ Version 1.0.0 is the backpack driver for which source is not available
+ Version 2.0.0 is the first to have source released
+ Version 2.0.1 is the "Cox-ified" source code
+ Version 2.0.2 - fixed version string usage, and made ppc functions static
+*/
+
+
+#define BACKPACK_VERSION "2.0.2"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <asm/io.h>
+#include <linux/parport.h>
+
+#include "ppc6lnx.c"
+#include "paride.h"
+
+/* PARAMETERS */
+static bool verbose; /* set this to 1 to see debugging messages and whatnot */
+
+
+#define PPCSTRUCT(pi) ((Interface *)(pi->private))
+
+/****************************************************************/
+/*
+ ATAPI CDROM DRIVE REGISTERS
+*/
+#define ATAPI_DATA 0 /* data port */
+#define ATAPI_ERROR 1 /* error register (read) */
+#define ATAPI_FEATURES 1 /* feature register (write) */
+#define ATAPI_INT_REASON 2 /* interrupt reason register */
+#define ATAPI_COUNT_LOW 4 /* byte count register (low) */
+#define ATAPI_COUNT_HIGH 5 /* byte count register (high) */
+#define ATAPI_DRIVE_SEL 6 /* drive select register */
+#define ATAPI_STATUS 7 /* status port (read) */
+#define ATAPI_COMMAND 7 /* command port (write) */
+#define ATAPI_ALT_STATUS 0x0e /* alternate status reg (read) */
+#define ATAPI_DEVICE_CONTROL 0x0e /* device control (write) */
+/****************************************************************/
+
+static int bpck6_read_regr(PIA *pi, int cont, int reg)
+{
+ unsigned int out;
+
+ /* check for bad settings */
+ if (reg<0 || reg>7 || cont<0 || cont>2)
+ {
+ return(-1);
+ }
+ out=ppc6_rd_port(PPCSTRUCT(pi),cont?reg|8:reg);
+ return(out);
+}
+
+static void bpck6_write_regr(PIA *pi, int cont, int reg, int val)
+{
+ /* check for bad settings */
+ if (reg>=0 && reg<=7 && cont>=0 && cont<=1)
+ {
+ ppc6_wr_port(PPCSTRUCT(pi),cont?reg|8:reg,(u8)val);
+ }
+}
+
+static void bpck6_write_block( PIA *pi, char * buf, int len )
+{
+ ppc6_wr_port16_blk(PPCSTRUCT(pi),ATAPI_DATA,buf,(u32)len>>1);
+}
+
+static void bpck6_read_block( PIA *pi, char * buf, int len )
+{
+ ppc6_rd_port16_blk(PPCSTRUCT(pi),ATAPI_DATA,buf,(u32)len>>1);
+}
+
+static void bpck6_connect ( PIA *pi )
+{
+ if(verbose)
+ {
+ printk(KERN_DEBUG "connect\n");
+ }
+
+ if(pi->mode >=2)
+ {
+ PPCSTRUCT(pi)->mode=4+pi->mode-2;
+ }
+ else if(pi->mode==1)
+ {
+ PPCSTRUCT(pi)->mode=3;
+ }
+ else
+ {
+ PPCSTRUCT(pi)->mode=1;
+ }
+
+ ppc6_open(PPCSTRUCT(pi));
+ ppc6_wr_extout(PPCSTRUCT(pi),0x3);
+}
+
+static void bpck6_disconnect ( PIA *pi )
+{
+ if(verbose)
+ {
+ printk("disconnect\n");
+ }
+ ppc6_wr_extout(PPCSTRUCT(pi),0x0);
+ ppc6_close(PPCSTRUCT(pi));
+}
+
+static int bpck6_test_port ( PIA *pi ) /* check for 8-bit port */
+{
+ if(verbose)
+ {
+ printk(KERN_DEBUG "PARPORT indicates modes=%x for lp=0x%lx\n",
+ ((struct pardevice*)(pi->pardev))->port->modes,
+ ((struct pardevice *)(pi->pardev))->port->base);
+ }
+
+ /*copy over duplicate stuff.. initialize state info*/
+ PPCSTRUCT(pi)->ppc_id=pi->unit;
+ PPCSTRUCT(pi)->lpt_addr=pi->port;
+
+ /* look at the parport device to see if what modes we can use */
+ if(((struct pardevice *)(pi->pardev))->port->modes &
+ (PARPORT_MODE_EPP)
+ )
+ {
+ return 5; /* Can do EPP*/
+ }
+ else if(((struct pardevice *)(pi->pardev))->port->modes &
+ (PARPORT_MODE_TRISTATE)
+ )
+ {
+ return 2;
+ }
+ else /*Just flat SPP*/
+ {
+ return 1;
+ }
+}
+
+static int bpck6_probe_unit ( PIA *pi )
+{
+ int out;
+
+ if(verbose)
+ {
+ printk(KERN_DEBUG "PROBE UNIT %x on port:%x\n",pi->unit,pi->port);
+ }
+
+ /*SET PPC UNIT NUMBER*/
+ PPCSTRUCT(pi)->ppc_id=pi->unit;
+
+ /*LOWER DOWN TO UNIDIRECTIONAL*/
+ PPCSTRUCT(pi)->mode=1;
+
+ out=ppc6_open(PPCSTRUCT(pi));
+
+ if(verbose)
+ {
+ printk(KERN_DEBUG "ppc_open returned %2x\n",out);
+ }
+
+ if(out)
+ {
+ ppc6_close(PPCSTRUCT(pi));
+ if(verbose)
+ {
+ printk(KERN_DEBUG "leaving probe\n");
+ }
+ return(1);
+ }
+ else
+ {
+ if(verbose)
+ {
+ printk(KERN_DEBUG "Failed open\n");
+ }
+ return(0);
+ }
+}
+
+static void bpck6_log_adapter( PIA *pi, char * scratch, int verbose )
+{
+ char *mode_string[5]=
+ {"4-bit","8-bit","EPP-8","EPP-16","EPP-32"};
+
+ printk("%s: BACKPACK Protocol Driver V"BACKPACK_VERSION"\n",pi->device);
+ printk("%s: Copyright 2001 by Micro Solutions, Inc., DeKalb IL.\n",pi->device);
+ printk("%s: BACKPACK %s, Micro Solutions BACKPACK Drive at 0x%x\n",
+ pi->device,BACKPACK_VERSION,pi->port);
+ printk("%s: Unit: %d Mode:%d (%s) Delay %d\n",pi->device,
+ pi->unit,pi->mode,mode_string[pi->mode],pi->delay);
+}
+
+static int bpck6_init_proto(PIA *pi)
+{
+ Interface *p = kzalloc(sizeof(Interface), GFP_KERNEL);
+
+ if (p) {
+ pi->private = (unsigned long)p;
+ return 0;
+ }
+
+ printk(KERN_ERR "%s: ERROR COULDN'T ALLOCATE MEMORY\n", pi->device);
+ return -1;
+}
+
+static void bpck6_release_proto(PIA *pi)
+{
+ kfree((void *)(pi->private));
+}
+
+static struct pi_protocol bpck6 = {
+ .owner = THIS_MODULE,
+ .name = "bpck6",
+ .max_mode = 5,
+ .epp_first = 2, /* 2-5 use epp (need 8 ports) */
+ .max_units = 255,
+ .write_regr = bpck6_write_regr,
+ .read_regr = bpck6_read_regr,
+ .write_block = bpck6_write_block,
+ .read_block = bpck6_read_block,
+ .connect = bpck6_connect,
+ .disconnect = bpck6_disconnect,
+ .test_port = bpck6_test_port,
+ .probe_unit = bpck6_probe_unit,
+ .log_adapter = bpck6_log_adapter,
+ .init_proto = bpck6_init_proto,
+ .release_proto = bpck6_release_proto,
+};
+
+static int __init bpck6_init(void)
+{
+ printk(KERN_INFO "bpck6: BACKPACK Protocol Driver V"BACKPACK_VERSION"\n");
+ printk(KERN_INFO "bpck6: Copyright 2001 by Micro Solutions, Inc., DeKalb IL. USA\n");
+ if(verbose)
+ printk(KERN_DEBUG "bpck6: verbose debug enabled.\n");
+ return paride_register(&bpck6);
+}
+
+static void __exit bpck6_exit(void)
+{
+ paride_unregister(&bpck6);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Micro Solutions Inc.");
+MODULE_DESCRIPTION("BACKPACK Protocol module, compatible with PARIDE");
+module_param(verbose, bool, 0644);
+module_init(bpck6_init)
+module_exit(bpck6_exit)
diff --git a/drivers/block/paride/comm.c b/drivers/block/paride/comm.c
new file mode 100644
index 000000000..9bcd35495
--- /dev/null
+++ b/drivers/block/paride/comm.c
@@ -0,0 +1,218 @@
+/*
+ comm.c (c) 1997-8 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ comm.c is a low-level protocol driver for some older models
+ of the DataStor "Commuter" parallel to IDE adapter. Some of
+ the parallel port devices marketed by Arista currently
+ use this adapter.
+*/
+
+/* Changes:
+
+ 1.01 GRG 1998.05.05 init_proto, release_proto
+
+*/
+
+#define COMM_VERSION "1.01"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+/* mode codes: 0 nybble reads, 8-bit writes
+ 1 8-bit reads and writes
+ 2 8-bit EPP mode
+*/
+
+#define j44(a,b) (((a>>3)&0x0f)|((b<<1)&0xf0))
+
+#define P1 w2(5);w2(0xd);w2(0xd);w2(5);w2(4);
+#define P2 w2(5);w2(7);w2(7);w2(5);w2(4);
+
+/* cont = 0 - access the IDE register file
+ cont = 1 - access the IDE command set
+*/
+
+static int cont_map[2] = { 0x08, 0x10 };
+
+static int comm_read_regr( PIA *pi, int cont, int regr )
+
+{ int l, h, r;
+
+ r = regr + cont_map[cont];
+
+ switch (pi->mode) {
+
+ case 0: w0(r); P1; w0(0);
+ w2(6); l = r1(); w0(0x80); h = r1(); w2(4);
+ return j44(l,h);
+
+ case 1: w0(r+0x20); P1;
+ w0(0); w2(0x26); h = r0(); w2(4);
+ return h;
+
+ case 2:
+ case 3:
+ case 4: w3(r+0x20); (void)r1();
+ w2(0x24); h = r4(); w2(4);
+ return h;
+
+ }
+ return -1;
+}
+
+static void comm_write_regr( PIA *pi, int cont, int regr, int val )
+
+{ int r;
+
+ r = regr + cont_map[cont];
+
+ switch (pi->mode) {
+
+ case 0:
+ case 1: w0(r); P1; w0(val); P2;
+ break;
+
+ case 2:
+ case 3:
+ case 4: w3(r); (void)r1(); w4(val);
+ break;
+ }
+}
+
+static void comm_connect ( PIA *pi )
+
+{ pi->saved_r0 = r0();
+ pi->saved_r2 = r2();
+ w2(4); w0(0xff); w2(6);
+ w2(4); w0(0xaa); w2(6);
+ w2(4); w0(0x00); w2(6);
+ w2(4); w0(0x87); w2(6);
+ w2(4); w0(0xe0); w2(0xc); w2(0xc); w2(4);
+}
+
+static void comm_disconnect ( PIA *pi )
+
+{ w2(0); w2(0); w2(0); w2(4);
+ w0(pi->saved_r0);
+ w2(pi->saved_r2);
+}
+
+static void comm_read_block( PIA *pi, char * buf, int count )
+
+{ int i, l, h;
+
+ switch (pi->mode) {
+
+ case 0: w0(0x48); P1;
+ for(i=0;i<count;i++) {
+ w0(0); w2(6); l = r1();
+ w0(0x80); h = r1(); w2(4);
+ buf[i] = j44(l,h);
+ }
+ break;
+
+ case 1: w0(0x68); P1; w0(0);
+ for(i=0;i<count;i++) {
+ w2(0x26); buf[i] = r0(); w2(0x24);
+ }
+ w2(4);
+ break;
+
+ case 2: w3(0x68); (void)r1(); w2(0x24);
+ for (i=0;i<count;i++) buf[i] = r4();
+ w2(4);
+ break;
+
+ case 3: w3(0x68); (void)r1(); w2(0x24);
+ for (i=0;i<count/2;i++) ((u16 *)buf)[i] = r4w();
+ w2(4);
+ break;
+
+ case 4: w3(0x68); (void)r1(); w2(0x24);
+ for (i=0;i<count/4;i++) ((u32 *)buf)[i] = r4l();
+ w2(4);
+ break;
+
+ }
+}
+
+/* NB: Watch out for the byte swapped writes ! */
+
+static void comm_write_block( PIA *pi, char * buf, int count )
+
+{ int k;
+
+ switch (pi->mode) {
+
+ case 0:
+ case 1: w0(0x68); P1;
+ for (k=0;k<count;k++) {
+ w2(5); w0(buf[k^1]); w2(7);
+ }
+ w2(5); w2(4);
+ break;
+
+ case 2: w3(0x48); (void)r1();
+ for (k=0;k<count;k++) w4(buf[k^1]);
+ break;
+
+ case 3: w3(0x48); (void)r1();
+ for (k=0;k<count/2;k++) w4w(pi_swab16(buf,k));
+ break;
+
+ case 4: w3(0x48); (void)r1();
+ for (k=0;k<count/4;k++) w4l(pi_swab32(buf,k));
+ break;
+
+
+ }
+}
+
+static void comm_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{ char *mode_string[5] = {"4-bit","8-bit","EPP-8","EPP-16","EPP-32"};
+
+ printk("%s: comm %s, DataStor Commuter at 0x%x, ",
+ pi->device,COMM_VERSION,pi->port);
+ printk("mode %d (%s), delay %d\n",pi->mode,
+ mode_string[pi->mode],pi->delay);
+
+}
+
+static struct pi_protocol comm = {
+ .owner = THIS_MODULE,
+ .name = "comm",
+ .max_mode = 5,
+ .epp_first = 2,
+ .default_delay = 1,
+ .max_units = 1,
+ .write_regr = comm_write_regr,
+ .read_regr = comm_read_regr,
+ .write_block = comm_write_block,
+ .read_block = comm_read_block,
+ .connect = comm_connect,
+ .disconnect = comm_disconnect,
+ .log_adapter = comm_log_adapter,
+};
+
+static int __init comm_init(void)
+{
+ return paride_register(&comm);
+}
+
+static void __exit comm_exit(void)
+{
+ paride_unregister(&comm);
+}
+
+MODULE_LICENSE("GPL");
+module_init(comm_init)
+module_exit(comm_exit)
diff --git a/drivers/block/paride/dstr.c b/drivers/block/paride/dstr.c
new file mode 100644
index 000000000..accc5c777
--- /dev/null
+++ b/drivers/block/paride/dstr.c
@@ -0,0 +1,233 @@
+/*
+ dstr.c (c) 1997-8 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ dstr.c is a low-level protocol driver for the
+ DataStor EP2000 parallel to IDE adapter chip.
+
+*/
+
+/* Changes:
+
+ 1.01 GRG 1998.05.06 init_proto, release_proto
+
+*/
+
+#define DSTR_VERSION "1.01"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+/* mode codes: 0 nybble reads, 8-bit writes
+ 1 8-bit reads and writes
+ 2 8-bit EPP mode
+ 3 EPP-16
+ 4 EPP-32
+*/
+
+#define j44(a,b) (((a>>3)&0x07)|((~a>>4)&0x08)|((b<<1)&0x70)|((~b)&0x80))
+
+#define P1 w2(5);w2(0xd);w2(5);w2(4);
+#define P2 w2(5);w2(7);w2(5);w2(4);
+#define P3 w2(6);w2(4);w2(6);w2(4);
+
+/* cont = 0 - access the IDE register file
+ cont = 1 - access the IDE command set
+*/
+
+static int cont_map[2] = { 0x20, 0x40 };
+
+static int dstr_read_regr( PIA *pi, int cont, int regr )
+
+{ int a, b, r;
+
+ r = regr + cont_map[cont];
+
+ w0(0x81); P1;
+ if (pi->mode) { w0(0x11); } else { w0(1); }
+ P2; w0(r); P1;
+
+ switch (pi->mode) {
+
+ case 0: w2(6); a = r1(); w2(4); w2(6); b = r1(); w2(4);
+ return j44(a,b);
+
+ case 1: w0(0); w2(0x26); a = r0(); w2(4);
+ return a;
+
+ case 2:
+ case 3:
+ case 4: w2(0x24); a = r4(); w2(4);
+ return a;
+
+ }
+ return -1;
+}
+
+static void dstr_write_regr( PIA *pi, int cont, int regr, int val )
+
+{ int r;
+
+ r = regr + cont_map[cont];
+
+ w0(0x81); P1;
+ if (pi->mode >= 2) { w0(0x11); } else { w0(1); }
+ P2; w0(r); P1;
+
+ switch (pi->mode) {
+
+ case 0:
+ case 1: w0(val); w2(5); w2(7); w2(5); w2(4);
+ break;
+
+ case 2:
+ case 3:
+ case 4: w4(val);
+ break;
+ }
+}
+
+#define CCP(x) w0(0xff);w2(0xc);w2(4);\
+ w0(0xaa);w0(0x55);w0(0);w0(0xff);w0(0x87);w0(0x78);\
+ w0(x);w2(5);w2(4);
+
+static void dstr_connect ( PIA *pi )
+
+{ pi->saved_r0 = r0();
+ pi->saved_r2 = r2();
+ w2(4); CCP(0xe0); w0(0xff);
+}
+
+static void dstr_disconnect ( PIA *pi )
+
+{ CCP(0x30);
+ w0(pi->saved_r0);
+ w2(pi->saved_r2);
+}
+
+static void dstr_read_block( PIA *pi, char * buf, int count )
+
+{ int k, a, b;
+
+ w0(0x81); P1;
+ if (pi->mode) { w0(0x19); } else { w0(9); }
+ P2; w0(0x82); P1; P3; w0(0x20); P1;
+
+ switch (pi->mode) {
+
+ case 0: for (k=0;k<count;k++) {
+ w2(6); a = r1(); w2(4);
+ w2(6); b = r1(); w2(4);
+ buf[k] = j44(a,b);
+ }
+ break;
+
+ case 1: w0(0);
+ for (k=0;k<count;k++) {
+ w2(0x26); buf[k] = r0(); w2(0x24);
+ }
+ w2(4);
+ break;
+
+ case 2: w2(0x24);
+ for (k=0;k<count;k++) buf[k] = r4();
+ w2(4);
+ break;
+
+ case 3: w2(0x24);
+ for (k=0;k<count/2;k++) ((u16 *)buf)[k] = r4w();
+ w2(4);
+ break;
+
+ case 4: w2(0x24);
+ for (k=0;k<count/4;k++) ((u32 *)buf)[k] = r4l();
+ w2(4);
+ break;
+
+ }
+}
+
+static void dstr_write_block( PIA *pi, char * buf, int count )
+
+{ int k;
+
+ w0(0x81); P1;
+ if (pi->mode) { w0(0x19); } else { w0(9); }
+ P2; w0(0x82); P1; P3; w0(0x20); P1;
+
+ switch (pi->mode) {
+
+ case 0:
+ case 1: for (k=0;k<count;k++) {
+ w2(5); w0(buf[k]); w2(7);
+ }
+ w2(5); w2(4);
+ break;
+
+ case 2: w2(0xc5);
+ for (k=0;k<count;k++) w4(buf[k]);
+ w2(0xc4);
+ break;
+
+ case 3: w2(0xc5);
+ for (k=0;k<count/2;k++) w4w(((u16 *)buf)[k]);
+ w2(0xc4);
+ break;
+
+ case 4: w2(0xc5);
+ for (k=0;k<count/4;k++) w4l(((u32 *)buf)[k]);
+ w2(0xc4);
+ break;
+
+ }
+}
+
+
+static void dstr_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{ char *mode_string[5] = {"4-bit","8-bit","EPP-8",
+ "EPP-16","EPP-32"};
+
+ printk("%s: dstr %s, DataStor EP2000 at 0x%x, ",
+ pi->device,DSTR_VERSION,pi->port);
+ printk("mode %d (%s), delay %d\n",pi->mode,
+ mode_string[pi->mode],pi->delay);
+
+}
+
+static struct pi_protocol dstr = {
+ .owner = THIS_MODULE,
+ .name = "dstr",
+ .max_mode = 5,
+ .epp_first = 2,
+ .default_delay = 1,
+ .max_units = 1,
+ .write_regr = dstr_write_regr,
+ .read_regr = dstr_read_regr,
+ .write_block = dstr_write_block,
+ .read_block = dstr_read_block,
+ .connect = dstr_connect,
+ .disconnect = dstr_disconnect,
+ .log_adapter = dstr_log_adapter,
+};
+
+static int __init dstr_init(void)
+{
+ return paride_register(&dstr);
+}
+
+static void __exit dstr_exit(void)
+{
+ paride_unregister(&dstr);
+}
+
+MODULE_LICENSE("GPL");
+module_init(dstr_init)
+module_exit(dstr_exit)
diff --git a/drivers/block/paride/epat.c b/drivers/block/paride/epat.c
new file mode 100644
index 000000000..1bcdff773
--- /dev/null
+++ b/drivers/block/paride/epat.c
@@ -0,0 +1,340 @@
+/*
+ epat.c (c) 1997-8 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ This is the low level protocol driver for the EPAT parallel
+ to IDE adapter from Shuttle Technologies. This adapter is
+ used in many popular parallel port disk products such as the
+ SyQuest EZ drives, the Avatar Shark and the Imation SuperDisk.
+
+*/
+
+/* Changes:
+
+ 1.01 GRG 1998.05.06 init_proto, release_proto
+ 1.02 Joshua b. Jore CPP(renamed), epat_connect, epat_disconnect
+
+*/
+
+#define EPAT_VERSION "1.02"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#define j44(a,b) (((a>>4)&0x0f)+(b&0xf0))
+#define j53(a,b) (((a>>3)&0x1f)+((b<<4)&0xe0))
+
+static int epatc8;
+
+module_param(epatc8, int, 0);
+MODULE_PARM_DESC(epatc8, "support for the Shuttle EP1284 chip, "
+ "used in any recent Imation SuperDisk (LS-120) drive.");
+
+/* cont = 0 IDE register file
+ cont = 1 IDE control registers
+ cont = 2 internal EPAT registers
+*/
+
+static int cont_map[3] = { 0x18, 0x10, 0 };
+
+static void epat_write_regr( PIA *pi, int cont, int regr, int val)
+
+{ int r;
+
+ r = regr + cont_map[cont];
+
+ switch (pi->mode) {
+
+ case 0:
+ case 1:
+ case 2: w0(0x60+r); w2(1); w0(val); w2(4);
+ break;
+
+ case 3:
+ case 4:
+ case 5: w3(0x40+r); w4(val);
+ break;
+
+ }
+}
+
+static int epat_read_regr( PIA *pi, int cont, int regr )
+
+{ int a, b, r;
+
+ r = regr + cont_map[cont];
+
+ switch (pi->mode) {
+
+ case 0: w0(r); w2(1); w2(3);
+ a = r1(); w2(4); b = r1();
+ return j44(a,b);
+
+ case 1: w0(0x40+r); w2(1); w2(4);
+ a = r1(); b = r2(); w0(0xff);
+ return j53(a,b);
+
+ case 2: w0(0x20+r); w2(1); w2(0x25);
+ a = r0(); w2(4);
+ return a;
+
+ case 3:
+ case 4:
+ case 5: w3(r); w2(0x24); a = r4(); w2(4);
+ return a;
+
+ }
+ return -1; /* never gets here */
+}
+
+static void epat_read_block( PIA *pi, char * buf, int count )
+
+{ int k, ph, a, b;
+
+ switch (pi->mode) {
+
+ case 0: w0(7); w2(1); w2(3); w0(0xff);
+ ph = 0;
+ for(k=0;k<count;k++) {
+ if (k == count-1) w0(0xfd);
+ w2(6+ph); a = r1();
+ if (a & 8) b = a;
+ else { w2(4+ph); b = r1(); }
+ buf[k] = j44(a,b);
+ ph = 1 - ph;
+ }
+ w0(0); w2(4);
+ break;
+
+ case 1: w0(0x47); w2(1); w2(5); w0(0xff);
+ ph = 0;
+ for(k=0;k<count;k++) {
+ if (k == count-1) w0(0xfd);
+ w2(4+ph);
+ a = r1(); b = r2();
+ buf[k] = j53(a,b);
+ ph = 1 - ph;
+ }
+ w0(0); w2(4);
+ break;
+
+ case 2: w0(0x27); w2(1); w2(0x25); w0(0);
+ ph = 0;
+ for(k=0;k<count-1;k++) {
+ w2(0x24+ph);
+ buf[k] = r0();
+ ph = 1 - ph;
+ }
+ w2(0x26); w2(0x27); buf[count-1] = r0();
+ w2(0x25); w2(4);
+ break;
+
+ case 3: w3(0x80); w2(0x24);
+ for(k=0;k<count-1;k++) buf[k] = r4();
+ w2(4); w3(0xa0); w2(0x24); buf[count-1] = r4();
+ w2(4);
+ break;
+
+ case 4: w3(0x80); w2(0x24);
+ for(k=0;k<(count/2)-1;k++) ((u16 *)buf)[k] = r4w();
+ buf[count-2] = r4();
+ w2(4); w3(0xa0); w2(0x24); buf[count-1] = r4();
+ w2(4);
+ break;
+
+ case 5: w3(0x80); w2(0x24);
+ for(k=0;k<(count/4)-1;k++) ((u32 *)buf)[k] = r4l();
+ for(k=count-4;k<count-1;k++) buf[k] = r4();
+ w2(4); w3(0xa0); w2(0x24); buf[count-1] = r4();
+ w2(4);
+ break;
+
+ }
+}
+
+static void epat_write_block( PIA *pi, char * buf, int count )
+
+{ int ph, k;
+
+ switch (pi->mode) {
+
+ case 0:
+ case 1:
+ case 2: w0(0x67); w2(1); w2(5);
+ ph = 0;
+ for(k=0;k<count;k++) {
+ w0(buf[k]);
+ w2(4+ph);
+ ph = 1 - ph;
+ }
+ w2(7); w2(4);
+ break;
+
+ case 3: w3(0xc0);
+ for(k=0;k<count;k++) w4(buf[k]);
+ w2(4);
+ break;
+
+ case 4: w3(0xc0);
+ for(k=0;k<(count/2);k++) w4w(((u16 *)buf)[k]);
+ w2(4);
+ break;
+
+ case 5: w3(0xc0);
+ for(k=0;k<(count/4);k++) w4l(((u32 *)buf)[k]);
+ w2(4);
+ break;
+
+ }
+}
+
+/* these macros access the EPAT registers in native addressing */
+
+#define WR(r,v) epat_write_regr(pi,2,r,v)
+#define RR(r) (epat_read_regr(pi,2,r))
+
+/* and these access the IDE task file */
+
+#define WRi(r,v) epat_write_regr(pi,0,r,v)
+#define RRi(r) (epat_read_regr(pi,0,r))
+
+/* FIXME: the CPP stuff should be fixed to handle multiple EPATs on a chain */
+
+#define CPP(x) w2(4);w0(0x22);w0(0xaa);w0(0x55);w0(0);w0(0xff);\
+ w0(0x87);w0(0x78);w0(x);w2(4);w2(5);w2(4);w0(0xff);
+
+static void epat_connect ( PIA *pi )
+
+{ pi->saved_r0 = r0();
+ pi->saved_r2 = r2();
+
+ /* Initialize the chip */
+ CPP(0);
+
+ if (epatc8) {
+ CPP(0x40);CPP(0xe0);
+ w0(0);w2(1);w2(4);
+ WR(0x8,0x12);WR(0xc,0x14);WR(0x12,0x10);
+ WR(0xe,0xf);WR(0xf,4);
+ /* WR(0xe,0xa);WR(0xf,4); */
+ WR(0xe,0xd);WR(0xf,0);
+ /* CPP(0x30); */
+ }
+
+ /* Connect to the chip */
+ CPP(0xe0);
+ w0(0);w2(1);w2(4); /* Idle into SPP */
+ if (pi->mode >= 3) {
+ w0(0);w2(1);w2(4);w2(0xc);
+ /* Request EPP */
+ w0(0x40);w2(6);w2(7);w2(4);w2(0xc);w2(4);
+ }
+
+ if (!epatc8) {
+ WR(8,0x10); WR(0xc,0x14); WR(0xa,0x38); WR(0x12,0x10);
+ }
+}
+
+static void epat_disconnect (PIA *pi)
+{ CPP(0x30);
+ w0(pi->saved_r0);
+ w2(pi->saved_r2);
+}
+
+static int epat_test_proto( PIA *pi, char * scratch, int verbose )
+
+{ int k, j, f, cc;
+ int e[2] = {0,0};
+
+ epat_connect(pi);
+ cc = RR(0xd);
+ epat_disconnect(pi);
+
+ epat_connect(pi);
+ for (j=0;j<2;j++) {
+ WRi(6,0xa0+j*0x10);
+ for (k=0;k<256;k++) {
+ WRi(2,k^0xaa);
+ WRi(3,k^0x55);
+ if (RRi(2) != (k^0xaa)) e[j]++;
+ }
+ }
+ epat_disconnect(pi);
+
+ f = 0;
+ epat_connect(pi);
+ WR(0x13,1); WR(0x13,0); WR(0xa,0x11);
+ epat_read_block(pi,scratch,512);
+
+ for (k=0;k<256;k++) {
+ if ((scratch[2*k] & 0xff) != k) f++;
+ if ((scratch[2*k+1] & 0xff) != (0xff-k)) f++;
+ }
+ epat_disconnect(pi);
+
+ if (verbose) {
+ printk("%s: epat: port 0x%x, mode %d, ccr %x, test=(%d,%d,%d)\n",
+ pi->device,pi->port,pi->mode,cc,e[0],e[1],f);
+ }
+
+ return (e[0] && e[1]) || f;
+}
+
+static void epat_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{ int ver;
+ char *mode_string[6] =
+ {"4-bit","5/3","8-bit","EPP-8","EPP-16","EPP-32"};
+
+ epat_connect(pi);
+ WR(0xa,0x38); /* read the version code */
+ ver = RR(0xb);
+ epat_disconnect(pi);
+
+ printk("%s: epat %s, Shuttle EPAT chip %x at 0x%x, ",
+ pi->device,EPAT_VERSION,ver,pi->port);
+ printk("mode %d (%s), delay %d\n",pi->mode,
+ mode_string[pi->mode],pi->delay);
+
+}
+
+static struct pi_protocol epat = {
+ .owner = THIS_MODULE,
+ .name = "epat",
+ .max_mode = 6,
+ .epp_first = 3,
+ .default_delay = 1,
+ .max_units = 1,
+ .write_regr = epat_write_regr,
+ .read_regr = epat_read_regr,
+ .write_block = epat_write_block,
+ .read_block = epat_read_block,
+ .connect = epat_connect,
+ .disconnect = epat_disconnect,
+ .test_proto = epat_test_proto,
+ .log_adapter = epat_log_adapter,
+};
+
+static int __init epat_init(void)
+{
+#ifdef CONFIG_PARIDE_EPATC8
+ epatc8 = 1;
+#endif
+ return paride_register(&epat);
+}
+
+static void __exit epat_exit(void)
+{
+ paride_unregister(&epat);
+}
+
+MODULE_LICENSE("GPL");
+module_init(epat_init)
+module_exit(epat_exit)
diff --git a/drivers/block/paride/epia.c b/drivers/block/paride/epia.c
new file mode 100644
index 000000000..fb0e782d0
--- /dev/null
+++ b/drivers/block/paride/epia.c
@@ -0,0 +1,316 @@
+/*
+ epia.c (c) 1997-8 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ epia.c is a low-level protocol driver for Shuttle Technologies
+ EPIA parallel to IDE adapter chip. This device is now obsolete
+ and has been replaced with the EPAT chip, which is supported
+ by epat.c, however, some devices based on EPIA are still
+ available.
+
+*/
+
+/* Changes:
+
+ 1.01 GRG 1998.05.06 init_proto, release_proto
+ 1.02 GRG 1998.06.17 support older versions of EPIA
+
+*/
+
+#define EPIA_VERSION "1.02"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+/* mode codes: 0 nybble reads on port 1, 8-bit writes
+ 1 5/3 reads on ports 1 & 2, 8-bit writes
+ 2 8-bit reads and writes
+ 3 8-bit EPP mode
+ 4 16-bit EPP
+ 5 32-bit EPP
+*/
+
+#define j44(a,b) (((a>>4)&0x0f)+(b&0xf0))
+#define j53(a,b) (((a>>3)&0x1f)+((b<<4)&0xe0))
+
+/* cont = 0 IDE register file
+ cont = 1 IDE control registers
+*/
+
+static int cont_map[2] = { 0, 0x80 };
+
+static int epia_read_regr( PIA *pi, int cont, int regr )
+
+{ int a, b, r;
+
+ regr += cont_map[cont];
+
+ switch (pi->mode) {
+
+ case 0: r = regr^0x39;
+ w0(r); w2(1); w2(3); w0(r);
+ a = r1(); w2(1); b = r1(); w2(4);
+ return j44(a,b);
+
+ case 1: r = regr^0x31;
+ w0(r); w2(1); w0(r&0x37);
+ w2(3); w2(5); w0(r|0xf0);
+ a = r1(); b = r2(); w2(4);
+ return j53(a,b);
+
+ case 2: r = regr^0x29;
+ w0(r); w2(1); w2(0X21); w2(0x23);
+ a = r0(); w2(4);
+ return a;
+
+ case 3:
+ case 4:
+ case 5: w3(regr); w2(0x24); a = r4(); w2(4);
+ return a;
+
+ }
+ return -1;
+}
+
+static void epia_write_regr( PIA *pi, int cont, int regr, int val)
+
+{ int r;
+
+ regr += cont_map[cont];
+
+ switch (pi->mode) {
+
+ case 0:
+ case 1:
+ case 2: r = regr^0x19;
+ w0(r); w2(1); w0(val); w2(3); w2(4);
+ break;
+
+ case 3:
+ case 4:
+ case 5: r = regr^0x40;
+ w3(r); w4(val); w2(4);
+ break;
+ }
+}
+
+#define WR(r,v) epia_write_regr(pi,0,r,v)
+#define RR(r) (epia_read_regr(pi,0,r))
+
+/* The use of register 0x84 is entirely unclear - it seems to control
+ some EPP counters ... currently we know about 3 different block
+ sizes: the standard 512 byte reads and writes, 12 byte writes and
+ 2048 byte reads (the last two being used in the CDrom drivers.
+*/
+
+static void epia_connect ( PIA *pi )
+
+{ pi->saved_r0 = r0();
+ pi->saved_r2 = r2();
+
+ w2(4); w0(0xa0); w0(0x50); w0(0xc0); w0(0x30); w0(0xa0); w0(0);
+ w2(1); w2(4);
+ if (pi->mode >= 3) {
+ w0(0xa); w2(1); w2(4); w0(0x82); w2(4); w2(0xc); w2(4);
+ w2(0x24); w2(0x26); w2(4);
+ }
+ WR(0x86,8);
+}
+
+static void epia_disconnect ( PIA *pi )
+
+{ /* WR(0x84,0x10); */
+ w0(pi->saved_r0);
+ w2(1); w2(4);
+ w0(pi->saved_r0);
+ w2(pi->saved_r2);
+}
+
+static void epia_read_block( PIA *pi, char * buf, int count )
+
+{ int k, ph, a, b;
+
+ switch (pi->mode) {
+
+ case 0: w0(0x81); w2(1); w2(3); w0(0xc1);
+ ph = 1;
+ for (k=0;k<count;k++) {
+ w2(2+ph); a = r1();
+ w2(4+ph); b = r1();
+ buf[k] = j44(a,b);
+ ph = 1 - ph;
+ }
+ w0(0); w2(4);
+ break;
+
+ case 1: w0(0x91); w2(1); w0(0x10); w2(3);
+ w0(0x51); w2(5); w0(0xd1);
+ ph = 1;
+ for (k=0;k<count;k++) {
+ w2(4+ph);
+ a = r1(); b = r2();
+ buf[k] = j53(a,b);
+ ph = 1 - ph;
+ }
+ w0(0); w2(4);
+ break;
+
+ case 2: w0(0x89); w2(1); w2(0x23); w2(0x21);
+ ph = 1;
+ for (k=0;k<count;k++) {
+ w2(0x24+ph);
+ buf[k] = r0();
+ ph = 1 - ph;
+ }
+ w2(6); w2(4);
+ break;
+
+ case 3: if (count > 512) WR(0x84,3);
+ w3(0); w2(0x24);
+ for (k=0;k<count;k++) buf[k] = r4();
+ w2(4); WR(0x84,0);
+ break;
+
+ case 4: if (count > 512) WR(0x84,3);
+ w3(0); w2(0x24);
+ for (k=0;k<count/2;k++) ((u16 *)buf)[k] = r4w();
+ w2(4); WR(0x84,0);
+ break;
+
+ case 5: if (count > 512) WR(0x84,3);
+ w3(0); w2(0x24);
+ for (k=0;k<count/4;k++) ((u32 *)buf)[k] = r4l();
+ w2(4); WR(0x84,0);
+ break;
+
+ }
+}
+
+static void epia_write_block( PIA *pi, char * buf, int count )
+
+{ int ph, k, last, d;
+
+ switch (pi->mode) {
+
+ case 0:
+ case 1:
+ case 2: w0(0xa1); w2(1); w2(3); w2(1); w2(5);
+ ph = 0; last = 0x8000;
+ for (k=0;k<count;k++) {
+ d = buf[k];
+ if (d != last) { last = d; w0(d); }
+ w2(4+ph);
+ ph = 1 - ph;
+ }
+ w2(7); w2(4);
+ break;
+
+ case 3: if (count < 512) WR(0x84,1);
+ w3(0x40);
+ for (k=0;k<count;k++) w4(buf[k]);
+ if (count < 512) WR(0x84,0);
+ break;
+
+ case 4: if (count < 512) WR(0x84,1);
+ w3(0x40);
+ for (k=0;k<count/2;k++) w4w(((u16 *)buf)[k]);
+ if (count < 512) WR(0x84,0);
+ break;
+
+ case 5: if (count < 512) WR(0x84,1);
+ w3(0x40);
+ for (k=0;k<count/4;k++) w4l(((u32 *)buf)[k]);
+ if (count < 512) WR(0x84,0);
+ break;
+
+ }
+
+}
+
+static int epia_test_proto( PIA *pi, char * scratch, int verbose )
+
+{ int j, k, f;
+ int e[2] = {0,0};
+
+ epia_connect(pi);
+ for (j=0;j<2;j++) {
+ WR(6,0xa0+j*0x10);
+ for (k=0;k<256;k++) {
+ WR(2,k^0xaa);
+ WR(3,k^0x55);
+ if (RR(2) != (k^0xaa)) e[j]++;
+ }
+ WR(2,1); WR(3,1);
+ }
+ epia_disconnect(pi);
+
+ f = 0;
+ epia_connect(pi);
+ WR(0x84,8);
+ epia_read_block(pi,scratch,512);
+ for (k=0;k<256;k++) {
+ if ((scratch[2*k] & 0xff) != ((k+1) & 0xff)) f++;
+ if ((scratch[2*k+1] & 0xff) != ((-2-k) & 0xff)) f++;
+ }
+ WR(0x84,0);
+ epia_disconnect(pi);
+
+ if (verbose) {
+ printk("%s: epia: port 0x%x, mode %d, test=(%d,%d,%d)\n",
+ pi->device,pi->port,pi->mode,e[0],e[1],f);
+ }
+
+ return (e[0] && e[1]) || f;
+
+}
+
+
+static void epia_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{ char *mode_string[6] = {"4-bit","5/3","8-bit",
+ "EPP-8","EPP-16","EPP-32"};
+
+ printk("%s: epia %s, Shuttle EPIA at 0x%x, ",
+ pi->device,EPIA_VERSION,pi->port);
+ printk("mode %d (%s), delay %d\n",pi->mode,
+ mode_string[pi->mode],pi->delay);
+
+}
+
+static struct pi_protocol epia = {
+ .owner = THIS_MODULE,
+ .name = "epia",
+ .max_mode = 6,
+ .epp_first = 3,
+ .default_delay = 1,
+ .max_units = 1,
+ .write_regr = epia_write_regr,
+ .read_regr = epia_read_regr,
+ .write_block = epia_write_block,
+ .read_block = epia_read_block,
+ .connect = epia_connect,
+ .disconnect = epia_disconnect,
+ .test_proto = epia_test_proto,
+ .log_adapter = epia_log_adapter,
+};
+
+static int __init epia_init(void)
+{
+ return paride_register(&epia);
+}
+
+static void __exit epia_exit(void)
+{
+ paride_unregister(&epia);
+}
+
+MODULE_LICENSE("GPL");
+module_init(epia_init)
+module_exit(epia_exit)
diff --git a/drivers/block/paride/fit2.c b/drivers/block/paride/fit2.c
new file mode 100644
index 000000000..381283753
--- /dev/null
+++ b/drivers/block/paride/fit2.c
@@ -0,0 +1,151 @@
+/*
+ fit2.c (c) 1998 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ fit2.c is a low-level protocol driver for the older version
+ of the Fidelity International Technology parallel port adapter.
+ This adapter is used in their TransDisk 2000 and older TransDisk
+ 3000 portable hard-drives. As far as I can tell, this device
+ supports 4-bit mode _only_.
+
+ Newer models of the FIT products use an enhanced protocol.
+ The "fit3" protocol module should support current drives.
+
+*/
+
+#define FIT2_VERSION "1.0"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#define j44(a,b) (((a>>4)&0x0f)|(b&0xf0))
+
+/* cont = 0 - access the IDE register file
+ cont = 1 - access the IDE command set
+
+NB: The FIT adapter does not appear to use the control registers.
+So, we map ALT_STATUS to STATUS and NO-OP writes to the device
+control register - this means that IDE reset will not work on these
+devices.
+
+*/
+
+static void fit2_write_regr( PIA *pi, int cont, int regr, int val)
+
+{ if (cont == 1) return;
+ w2(0xc); w0(regr); w2(4); w0(val); w2(5); w0(0); w2(4);
+}
+
+static int fit2_read_regr( PIA *pi, int cont, int regr )
+
+{ int a, b, r;
+
+ if (cont) {
+ if (regr != 6) return 0xff;
+ r = 7;
+ } else r = regr + 0x10;
+
+ w2(0xc); w0(r); w2(4); w2(5);
+ w0(0); a = r1();
+ w0(1); b = r1();
+ w2(4);
+
+ return j44(a,b);
+
+}
+
+static void fit2_read_block( PIA *pi, char * buf, int count )
+
+{ int k, a, b, c, d;
+
+ w2(0xc); w0(0x10);
+
+ for (k=0;k<count/4;k++) {
+
+ w2(4); w2(5);
+ w0(0); a = r1(); w0(1); b = r1();
+ w0(3); c = r1(); w0(2); d = r1();
+ buf[4*k+0] = j44(a,b);
+ buf[4*k+1] = j44(d,c);
+
+ w2(4); w2(5);
+ a = r1(); w0(3); b = r1();
+ w0(1); c = r1(); w0(0); d = r1();
+ buf[4*k+2] = j44(d,c);
+ buf[4*k+3] = j44(a,b);
+
+ }
+
+ w2(4);
+
+}
+
+static void fit2_write_block( PIA *pi, char * buf, int count )
+
+{ int k;
+
+
+ w2(0xc); w0(0);
+ for (k=0;k<count/2;k++) {
+ w2(4); w0(buf[2*k]);
+ w2(5); w0(buf[2*k+1]);
+ }
+ w2(4);
+}
+
+static void fit2_connect ( PIA *pi )
+
+{ pi->saved_r0 = r0();
+ pi->saved_r2 = r2();
+ w2(0xcc);
+}
+
+static void fit2_disconnect ( PIA *pi )
+
+{ w0(pi->saved_r0);
+ w2(pi->saved_r2);
+}
+
+static void fit2_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{ printk("%s: fit2 %s, FIT 2000 adapter at 0x%x, delay %d\n",
+ pi->device,FIT2_VERSION,pi->port,pi->delay);
+
+}
+
+static struct pi_protocol fit2 = {
+ .owner = THIS_MODULE,
+ .name = "fit2",
+ .max_mode = 1,
+ .epp_first = 2,
+ .default_delay = 1,
+ .max_units = 1,
+ .write_regr = fit2_write_regr,
+ .read_regr = fit2_read_regr,
+ .write_block = fit2_write_block,
+ .read_block = fit2_read_block,
+ .connect = fit2_connect,
+ .disconnect = fit2_disconnect,
+ .log_adapter = fit2_log_adapter,
+};
+
+static int __init fit2_init(void)
+{
+ return paride_register(&fit2);
+}
+
+static void __exit fit2_exit(void)
+{
+ paride_unregister(&fit2);
+}
+
+MODULE_LICENSE("GPL");
+module_init(fit2_init)
+module_exit(fit2_exit)
diff --git a/drivers/block/paride/fit3.c b/drivers/block/paride/fit3.c
new file mode 100644
index 000000000..275d26945
--- /dev/null
+++ b/drivers/block/paride/fit3.c
@@ -0,0 +1,211 @@
+/*
+ fit3.c (c) 1998 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ fit3.c is a low-level protocol driver for newer models
+ of the Fidelity International Technology parallel port adapter.
+ This adapter is used in their TransDisk 3000 portable
+ hard-drives, as well as CD-ROM, PD-CD and other devices.
+
+ The TD-2000 and certain older devices use a different protocol.
+ Try the fit2 protocol module with them.
+
+ NB: The FIT adapters do not appear to support the control
+ registers. So, we map ALT_STATUS to STATUS and NO-OP writes
+ to the device control register - this means that IDE reset
+ will not work on these devices.
+
+*/
+
+#define FIT3_VERSION "1.0"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#define j44(a,b) (((a>>3)&0x0f)|((b<<1)&0xf0))
+
+#define w7(byte) {out_p(7,byte);}
+#define r7() (in_p(7) & 0xff)
+
+/* cont = 0 - access the IDE register file
+ cont = 1 - access the IDE command set
+
+*/
+
+static void fit3_write_regr( PIA *pi, int cont, int regr, int val)
+
+{ if (cont == 1) return;
+
+ switch (pi->mode) {
+
+ case 0:
+ case 1: w2(0xc); w0(regr); w2(0x8); w2(0xc);
+ w0(val); w2(0xd);
+ w0(0); w2(0xc);
+ break;
+
+ case 2: w2(0xc); w0(regr); w2(0x8); w2(0xc);
+ w4(val); w4(0);
+ w2(0xc);
+ break;
+
+ }
+}
+
+static int fit3_read_regr( PIA *pi, int cont, int regr )
+
+{ int a, b;
+
+ if (cont) {
+ if (regr != 6) return 0xff;
+ regr = 7;
+ }
+
+ switch (pi->mode) {
+
+ case 0: w2(0xc); w0(regr + 0x10); w2(0x8); w2(0xc);
+ w2(0xd); a = r1();
+ w2(0xf); b = r1();
+ w2(0xc);
+ return j44(a,b);
+
+ case 1: w2(0xc); w0(regr + 0x90); w2(0x8); w2(0xc);
+ w2(0xec); w2(0xee); w2(0xef); a = r0();
+ w2(0xc);
+ return a;
+
+ case 2: w2(0xc); w0(regr + 0x90); w2(0x8); w2(0xc);
+ w2(0xec);
+ a = r4(); b = r4();
+ w2(0xc);
+ return a;
+
+ }
+ return -1;
+
+}
+
+static void fit3_read_block( PIA *pi, char * buf, int count )
+
+{ int k, a, b, c, d;
+
+ switch (pi->mode) {
+
+ case 0: w2(0xc); w0(0x10); w2(0x8); w2(0xc);
+ for (k=0;k<count/2;k++) {
+ w2(0xd); a = r1();
+ w2(0xf); b = r1();
+ w2(0xc); c = r1();
+ w2(0xe); d = r1();
+ buf[2*k ] = j44(a,b);
+ buf[2*k+1] = j44(c,d);
+ }
+ w2(0xc);
+ break;
+
+ case 1: w2(0xc); w0(0x90); w2(0x8); w2(0xc);
+ w2(0xec); w2(0xee);
+ for (k=0;k<count/2;k++) {
+ w2(0xef); a = r0();
+ w2(0xee); b = r0();
+ buf[2*k ] = a;
+ buf[2*k+1] = b;
+ }
+ w2(0xec);
+ w2(0xc);
+ break;
+
+ case 2: w2(0xc); w0(0x90); w2(0x8); w2(0xc);
+ w2(0xec);
+ for (k=0;k<count;k++) buf[k] = r4();
+ w2(0xc);
+ break;
+
+ }
+}
+
+static void fit3_write_block( PIA *pi, char * buf, int count )
+
+{ int k;
+
+ switch (pi->mode) {
+
+ case 0:
+ case 1: w2(0xc); w0(0); w2(0x8); w2(0xc);
+ for (k=0;k<count/2;k++) {
+ w0(buf[2*k ]); w2(0xd);
+ w0(buf[2*k+1]); w2(0xc);
+ }
+ break;
+
+ case 2: w2(0xc); w0(0); w2(0x8); w2(0xc);
+ for (k=0;k<count;k++) w4(buf[k]);
+ w2(0xc);
+ break;
+ }
+}
+
+static void fit3_connect ( PIA *pi )
+
+{ pi->saved_r0 = r0();
+ pi->saved_r2 = r2();
+ w2(0xc); w0(0); w2(0xa);
+ if (pi->mode == 2) {
+ w2(0xc); w0(0x9); w2(0x8); w2(0xc);
+ }
+}
+
+static void fit3_disconnect ( PIA *pi )
+
+{ w2(0xc); w0(0xa); w2(0x8); w2(0xc);
+ w0(pi->saved_r0);
+ w2(pi->saved_r2);
+}
+
+static void fit3_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{ char *mode_string[3] = {"4-bit","8-bit","EPP"};
+
+ printk("%s: fit3 %s, FIT 3000 adapter at 0x%x, "
+ "mode %d (%s), delay %d\n",
+ pi->device,FIT3_VERSION,pi->port,
+ pi->mode,mode_string[pi->mode],pi->delay);
+
+}
+
+static struct pi_protocol fit3 = {
+ .owner = THIS_MODULE,
+ .name = "fit3",
+ .max_mode = 3,
+ .epp_first = 2,
+ .default_delay = 1,
+ .max_units = 1,
+ .write_regr = fit3_write_regr,
+ .read_regr = fit3_read_regr,
+ .write_block = fit3_write_block,
+ .read_block = fit3_read_block,
+ .connect = fit3_connect,
+ .disconnect = fit3_disconnect,
+ .log_adapter = fit3_log_adapter,
+};
+
+static int __init fit3_init(void)
+{
+ return paride_register(&fit3);
+}
+
+static void __exit fit3_exit(void)
+{
+ paride_unregister(&fit3);
+}
+
+MODULE_LICENSE("GPL");
+module_init(fit3_init)
+module_exit(fit3_exit)
diff --git a/drivers/block/paride/friq.c b/drivers/block/paride/friq.c
new file mode 100644
index 000000000..4f2ba2446
--- /dev/null
+++ b/drivers/block/paride/friq.c
@@ -0,0 +1,276 @@
+/*
+ friq.c (c) 1998 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License
+
+ friq.c is a low-level protocol driver for the Freecom "IQ"
+ parallel port IDE adapter. Early versions of this adapter
+ use the 'frpw' protocol.
+
+ Freecom uses this adapter in a battery powered external
+ CD-ROM drive. It is also used in LS-120 drives by
+ Maxell and Panasonic, and other devices.
+
+ The battery powered drive requires software support to
+ control the power to the drive. This module enables the
+ drive power when the high level driver (pcd) is loaded
+ and disables it when the module is unloaded. Note, if
+ the friq module is built in to the kernel, the power
+ will never be switched off, so other means should be
+ used to conserve battery power.
+
+*/
+
+/* Changes:
+
+ 1.01 GRG 1998.12.20 Added support for soft power switch
+*/
+
+#define FRIQ_VERSION "1.01"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#define CMD(x) w2(4);w0(0xff);w0(0xff);w0(0x73);w0(0x73);\
+ w0(0xc9);w0(0xc9);w0(0x26);w0(0x26);w0(x);w0(x);
+
+#define j44(l,h) (((l>>4)&0x0f)|(h&0xf0))
+
+/* cont = 0 - access the IDE register file
+ cont = 1 - access the IDE command set
+*/
+
+static int cont_map[2] = { 0x08, 0x10 };
+
+static int friq_read_regr( PIA *pi, int cont, int regr )
+
+{ int h,l,r;
+
+ r = regr + cont_map[cont];
+
+ CMD(r);
+ w2(6); l = r1();
+ w2(4); h = r1();
+ w2(4);
+
+ return j44(l,h);
+
+}
+
+static void friq_write_regr( PIA *pi, int cont, int regr, int val)
+
+{ int r;
+
+ r = regr + cont_map[cont];
+
+ CMD(r);
+ w0(val);
+ w2(5);w2(7);w2(5);w2(4);
+}
+
+static void friq_read_block_int( PIA *pi, char * buf, int count, int regr )
+
+{ int h, l, k, ph;
+
+ switch(pi->mode) {
+
+ case 0: CMD(regr);
+ for (k=0;k<count;k++) {
+ w2(6); l = r1();
+ w2(4); h = r1();
+ buf[k] = j44(l,h);
+ }
+ w2(4);
+ break;
+
+ case 1: ph = 2;
+ CMD(regr+0xc0);
+ w0(0xff);
+ for (k=0;k<count;k++) {
+ w2(0xa4 + ph);
+ buf[k] = r0();
+ ph = 2 - ph;
+ }
+ w2(0xac); w2(0xa4); w2(4);
+ break;
+
+ case 2: CMD(regr+0x80);
+ for (k=0;k<count-2;k++) buf[k] = r4();
+ w2(0xac); w2(0xa4);
+ buf[count-2] = r4();
+ buf[count-1] = r4();
+ w2(4);
+ break;
+
+ case 3: CMD(regr+0x80);
+ for (k=0;k<(count/2)-1;k++) ((u16 *)buf)[k] = r4w();
+ w2(0xac); w2(0xa4);
+ buf[count-2] = r4();
+ buf[count-1] = r4();
+ w2(4);
+ break;
+
+ case 4: CMD(regr+0x80);
+ for (k=0;k<(count/4)-1;k++) ((u32 *)buf)[k] = r4l();
+ buf[count-4] = r4();
+ buf[count-3] = r4();
+ w2(0xac); w2(0xa4);
+ buf[count-2] = r4();
+ buf[count-1] = r4();
+ w2(4);
+ break;
+
+ }
+}
+
+static void friq_read_block( PIA *pi, char * buf, int count)
+
+{ friq_read_block_int(pi,buf,count,0x08);
+}
+
+static void friq_write_block( PIA *pi, char * buf, int count )
+
+{ int k;
+
+ switch(pi->mode) {
+
+ case 0:
+ case 1: CMD(8); w2(5);
+ for (k=0;k<count;k++) {
+ w0(buf[k]);
+ w2(7);w2(5);
+ }
+ w2(4);
+ break;
+
+ case 2: CMD(0xc8); w2(5);
+ for (k=0;k<count;k++) w4(buf[k]);
+ w2(4);
+ break;
+
+ case 3: CMD(0xc8); w2(5);
+ for (k=0;k<count/2;k++) w4w(((u16 *)buf)[k]);
+ w2(4);
+ break;
+
+ case 4: CMD(0xc8); w2(5);
+ for (k=0;k<count/4;k++) w4l(((u32 *)buf)[k]);
+ w2(4);
+ break;
+ }
+}
+
+static void friq_connect ( PIA *pi )
+
+{ pi->saved_r0 = r0();
+ pi->saved_r2 = r2();
+ w2(4);
+}
+
+static void friq_disconnect ( PIA *pi )
+
+{ CMD(0x20);
+ w0(pi->saved_r0);
+ w2(pi->saved_r2);
+}
+
+static int friq_test_proto( PIA *pi, char * scratch, int verbose )
+
+{ int j, k, r;
+ int e[2] = {0,0};
+
+ pi->saved_r0 = r0();
+ w0(0xff); udelay(20); CMD(0x3d); /* turn the power on */
+ udelay(500);
+ w0(pi->saved_r0);
+
+ friq_connect(pi);
+ for (j=0;j<2;j++) {
+ friq_write_regr(pi,0,6,0xa0+j*0x10);
+ for (k=0;k<256;k++) {
+ friq_write_regr(pi,0,2,k^0xaa);
+ friq_write_regr(pi,0,3,k^0x55);
+ if (friq_read_regr(pi,0,2) != (k^0xaa)) e[j]++;
+ }
+ }
+ friq_disconnect(pi);
+
+ friq_connect(pi);
+ friq_read_block_int(pi,scratch,512,0x10);
+ r = 0;
+ for (k=0;k<128;k++) if (scratch[k] != k) r++;
+ friq_disconnect(pi);
+
+ if (verbose) {
+ printk("%s: friq: port 0x%x, mode %d, test=(%d,%d,%d)\n",
+ pi->device,pi->port,pi->mode,e[0],e[1],r);
+ }
+
+ return (r || (e[0] && e[1]));
+}
+
+
+static void friq_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{ char *mode_string[6] = {"4-bit","8-bit",
+ "EPP-8","EPP-16","EPP-32"};
+
+ printk("%s: friq %s, Freecom IQ ASIC-2 adapter at 0x%x, ", pi->device,
+ FRIQ_VERSION,pi->port);
+ printk("mode %d (%s), delay %d\n",pi->mode,
+ mode_string[pi->mode],pi->delay);
+
+ pi->private = 1;
+ friq_connect(pi);
+ CMD(0x9e); /* disable sleep timer */
+ friq_disconnect(pi);
+
+}
+
+static void friq_release_proto( PIA *pi)
+{
+ if (pi->private) { /* turn off the power */
+ friq_connect(pi);
+ CMD(0x1d); CMD(0x1e);
+ friq_disconnect(pi);
+ pi->private = 0;
+ }
+}
+
+static struct pi_protocol friq = {
+ .owner = THIS_MODULE,
+ .name = "friq",
+ .max_mode = 5,
+ .epp_first = 2,
+ .default_delay = 1,
+ .max_units = 1,
+ .write_regr = friq_write_regr,
+ .read_regr = friq_read_regr,
+ .write_block = friq_write_block,
+ .read_block = friq_read_block,
+ .connect = friq_connect,
+ .disconnect = friq_disconnect,
+ .test_proto = friq_test_proto,
+ .log_adapter = friq_log_adapter,
+ .release_proto = friq_release_proto,
+};
+
+static int __init friq_init(void)
+{
+ return paride_register(&friq);
+}
+
+static void __exit friq_exit(void)
+{
+ paride_unregister(&friq);
+}
+
+MODULE_LICENSE("GPL");
+module_init(friq_init)
+module_exit(friq_exit)
diff --git a/drivers/block/paride/frpw.c b/drivers/block/paride/frpw.c
new file mode 100644
index 000000000..c3cde3646
--- /dev/null
+++ b/drivers/block/paride/frpw.c
@@ -0,0 +1,313 @@
+/*
+ frpw.c (c) 1996-8 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License
+
+ frpw.c is a low-level protocol driver for the Freecom "Power"
+ parallel port IDE adapter.
+
+ Some applications of this adapter may require a "printer" reset
+ prior to loading the driver. This can be done by loading and
+ unloading the "lp" driver, or it can be done by this driver
+ if you define FRPW_HARD_RESET. The latter is not recommended
+ as it may upset devices on other ports.
+
+*/
+
+/* Changes:
+
+ 1.01 GRG 1998.05.06 init_proto, release_proto
+ fix chip detect
+ added EPP-16 and EPP-32
+ 1.02 GRG 1998.09.23 added hard reset to initialisation process
+ 1.03 GRG 1998.12.14 made hard reset conditional
+
+*/
+
+#define FRPW_VERSION "1.03"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#define cec4 w2(0xc);w2(0xe);w2(0xe);w2(0xc);w2(4);w2(4);w2(4);
+#define j44(l,h) (((l>>4)&0x0f)|(h&0xf0))
+
+/* cont = 0 - access the IDE register file
+ cont = 1 - access the IDE command set
+*/
+
+static int cont_map[2] = { 0x08, 0x10 };
+
+static int frpw_read_regr( PIA *pi, int cont, int regr )
+
+{ int h,l,r;
+
+ r = regr + cont_map[cont];
+
+ w2(4);
+ w0(r); cec4;
+ w2(6); l = r1();
+ w2(4); h = r1();
+ w2(4);
+
+ return j44(l,h);
+
+}
+
+static void frpw_write_regr( PIA *pi, int cont, int regr, int val)
+
+{ int r;
+
+ r = regr + cont_map[cont];
+
+ w2(4); w0(r); cec4;
+ w0(val);
+ w2(5);w2(7);w2(5);w2(4);
+}
+
+static void frpw_read_block_int( PIA *pi, char * buf, int count, int regr )
+
+{ int h, l, k, ph;
+
+ switch(pi->mode) {
+
+ case 0: w2(4); w0(regr); cec4;
+ for (k=0;k<count;k++) {
+ w2(6); l = r1();
+ w2(4); h = r1();
+ buf[k] = j44(l,h);
+ }
+ w2(4);
+ break;
+
+ case 1: ph = 2;
+ w2(4); w0(regr + 0xc0); cec4;
+ w0(0xff);
+ for (k=0;k<count;k++) {
+ w2(0xa4 + ph);
+ buf[k] = r0();
+ ph = 2 - ph;
+ }
+ w2(0xac); w2(0xa4); w2(4);
+ break;
+
+ case 2: w2(4); w0(regr + 0x80); cec4;
+ for (k=0;k<count;k++) buf[k] = r4();
+ w2(0xac); w2(0xa4);
+ w2(4);
+ break;
+
+ case 3: w2(4); w0(regr + 0x80); cec4;
+ for (k=0;k<count-2;k++) buf[k] = r4();
+ w2(0xac); w2(0xa4);
+ buf[count-2] = r4();
+ buf[count-1] = r4();
+ w2(4);
+ break;
+
+ case 4: w2(4); w0(regr + 0x80); cec4;
+ for (k=0;k<(count/2)-1;k++) ((u16 *)buf)[k] = r4w();
+ w2(0xac); w2(0xa4);
+ buf[count-2] = r4();
+ buf[count-1] = r4();
+ w2(4);
+ break;
+
+ case 5: w2(4); w0(regr + 0x80); cec4;
+ for (k=0;k<(count/4)-1;k++) ((u32 *)buf)[k] = r4l();
+ buf[count-4] = r4();
+ buf[count-3] = r4();
+ w2(0xac); w2(0xa4);
+ buf[count-2] = r4();
+ buf[count-1] = r4();
+ w2(4);
+ break;
+
+ }
+}
+
+static void frpw_read_block( PIA *pi, char * buf, int count)
+
+{ frpw_read_block_int(pi,buf,count,0x08);
+}
+
+static void frpw_write_block( PIA *pi, char * buf, int count )
+
+{ int k;
+
+ switch(pi->mode) {
+
+ case 0:
+ case 1:
+ case 2: w2(4); w0(8); cec4; w2(5);
+ for (k=0;k<count;k++) {
+ w0(buf[k]);
+ w2(7);w2(5);
+ }
+ w2(4);
+ break;
+
+ case 3: w2(4); w0(0xc8); cec4; w2(5);
+ for (k=0;k<count;k++) w4(buf[k]);
+ w2(4);
+ break;
+
+ case 4: w2(4); w0(0xc8); cec4; w2(5);
+ for (k=0;k<count/2;k++) w4w(((u16 *)buf)[k]);
+ w2(4);
+ break;
+
+ case 5: w2(4); w0(0xc8); cec4; w2(5);
+ for (k=0;k<count/4;k++) w4l(((u32 *)buf)[k]);
+ w2(4);
+ break;
+ }
+}
+
+static void frpw_connect ( PIA *pi )
+
+{ pi->saved_r0 = r0();
+ pi->saved_r2 = r2();
+ w2(4);
+}
+
+static void frpw_disconnect ( PIA *pi )
+
+{ w2(4); w0(0x20); cec4;
+ w0(pi->saved_r0);
+ w2(pi->saved_r2);
+}
+
+/* Stub logic to see if PNP string is available - used to distinguish
+ between the Xilinx and ASIC implementations of the Freecom adapter.
+*/
+
+static int frpw_test_pnp ( PIA *pi )
+
+/* returns chip_type: 0 = Xilinx, 1 = ASIC */
+
+{ int olddelay, a, b;
+
+#ifdef FRPW_HARD_RESET
+ w0(0); w2(8); udelay(50); w2(0xc); /* parallel bus reset */
+ mdelay(1500);
+#endif
+
+ olddelay = pi->delay;
+ pi->delay = 10;
+
+ pi->saved_r0 = r0();
+ pi->saved_r2 = r2();
+
+ w2(4); w0(4); w2(6); w2(7);
+ a = r1() & 0xff; w2(4); b = r1() & 0xff;
+ w2(0xc); w2(0xe); w2(4);
+
+ pi->delay = olddelay;
+ w0(pi->saved_r0);
+ w2(pi->saved_r2);
+
+ return ((~a&0x40) && (b&0x40));
+}
+
+/* We use the pi->private to remember the result of the PNP test.
+ To make this work, private = port*2 + chip. Yes, I know it's
+ a hack :-(
+*/
+
+static int frpw_test_proto( PIA *pi, char * scratch, int verbose )
+
+{ int j, k, r;
+ int e[2] = {0,0};
+
+ if ((pi->private>>1) != pi->port)
+ pi->private = frpw_test_pnp(pi) + 2*pi->port;
+
+ if (((pi->private%2) == 0) && (pi->mode > 2)) {
+ if (verbose)
+ printk("%s: frpw: Xilinx does not support mode %d\n",
+ pi->device, pi->mode);
+ return 1;
+ }
+
+ if (((pi->private%2) == 1) && (pi->mode == 2)) {
+ if (verbose)
+ printk("%s: frpw: ASIC does not support mode 2\n",
+ pi->device);
+ return 1;
+ }
+
+ frpw_connect(pi);
+ for (j=0;j<2;j++) {
+ frpw_write_regr(pi,0,6,0xa0+j*0x10);
+ for (k=0;k<256;k++) {
+ frpw_write_regr(pi,0,2,k^0xaa);
+ frpw_write_regr(pi,0,3,k^0x55);
+ if (frpw_read_regr(pi,0,2) != (k^0xaa)) e[j]++;
+ }
+ }
+ frpw_disconnect(pi);
+
+ frpw_connect(pi);
+ frpw_read_block_int(pi,scratch,512,0x10);
+ r = 0;
+ for (k=0;k<128;k++) if (scratch[k] != k) r++;
+ frpw_disconnect(pi);
+
+ if (verbose) {
+ printk("%s: frpw: port 0x%x, chip %ld, mode %d, test=(%d,%d,%d)\n",
+ pi->device,pi->port,(pi->private%2),pi->mode,e[0],e[1],r);
+ }
+
+ return (r || (e[0] && e[1]));
+}
+
+
+static void frpw_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{ char *mode_string[6] = {"4-bit","8-bit","EPP",
+ "EPP-8","EPP-16","EPP-32"};
+
+ printk("%s: frpw %s, Freecom (%s) adapter at 0x%x, ", pi->device,
+ FRPW_VERSION,((pi->private%2) == 0)?"Xilinx":"ASIC",pi->port);
+ printk("mode %d (%s), delay %d\n",pi->mode,
+ mode_string[pi->mode],pi->delay);
+
+}
+
+static struct pi_protocol frpw = {
+ .owner = THIS_MODULE,
+ .name = "frpw",
+ .max_mode = 6,
+ .epp_first = 2,
+ .default_delay = 2,
+ .max_units = 1,
+ .write_regr = frpw_write_regr,
+ .read_regr = frpw_read_regr,
+ .write_block = frpw_write_block,
+ .read_block = frpw_read_block,
+ .connect = frpw_connect,
+ .disconnect = frpw_disconnect,
+ .test_proto = frpw_test_proto,
+ .log_adapter = frpw_log_adapter,
+};
+
+static int __init frpw_init(void)
+{
+ return paride_register(&frpw);
+}
+
+static void __exit frpw_exit(void)
+{
+ paride_unregister(&frpw);
+}
+
+MODULE_LICENSE("GPL");
+module_init(frpw_init)
+module_exit(frpw_exit)
diff --git a/drivers/block/paride/kbic.c b/drivers/block/paride/kbic.c
new file mode 100644
index 000000000..35999c415
--- /dev/null
+++ b/drivers/block/paride/kbic.c
@@ -0,0 +1,305 @@
+/*
+ kbic.c (c) 1997-8 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ This is a low-level driver for the KBIC-951A and KBIC-971A
+ parallel to IDE adapter chips from KingByte Information Systems.
+
+ The chips are almost identical, however, the wakeup code
+ required for the 971A interferes with the correct operation of
+ the 951A, so this driver registers itself twice, once for
+ each chip.
+
+*/
+
+/* Changes:
+
+ 1.01 GRG 1998.05.06 init_proto, release_proto
+
+*/
+
+#define KBIC_VERSION "1.01"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#define r12w() (delay_p,inw(pi->port+1)&0xffff)
+
+#define j44(a,b) ((((a>>4)&0x0f)|(b&0xf0))^0x88)
+#define j53(w) (((w>>3)&0x1f)|((w>>4)&0xe0))
+
+
+/* cont = 0 - access the IDE register file
+ cont = 1 - access the IDE command set
+*/
+
+static int cont_map[2] = { 0x80, 0x40 };
+
+static int kbic_read_regr( PIA *pi, int cont, int regr )
+
+{ int a, b, s;
+
+ s = cont_map[cont];
+
+ switch (pi->mode) {
+
+ case 0: w0(regr|0x18|s); w2(4); w2(6); w2(4); w2(1); w0(8);
+ a = r1(); w0(0x28); b = r1(); w2(4);
+ return j44(a,b);
+
+ case 1: w0(regr|0x38|s); w2(4); w2(6); w2(4); w2(5); w0(8);
+ a = r12w(); w2(4);
+ return j53(a);
+
+ case 2: w0(regr|0x08|s); w2(4); w2(6); w2(4); w2(0xa5); w2(0xa1);
+ a = r0(); w2(4);
+ return a;
+
+ case 3:
+ case 4:
+ case 5: w0(0x20|s); w2(4); w2(6); w2(4); w3(regr);
+ a = r4(); b = r4(); w2(4); w2(0); w2(4);
+ return a;
+
+ }
+ return -1;
+}
+
+static void kbic_write_regr( PIA *pi, int cont, int regr, int val)
+
+{ int s;
+
+ s = cont_map[cont];
+
+ switch (pi->mode) {
+
+ case 0:
+ case 1:
+ case 2: w0(regr|0x10|s); w2(4); w2(6); w2(4);
+ w0(val); w2(5); w2(4);
+ break;
+
+ case 3:
+ case 4:
+ case 5: w0(0x20|s); w2(4); w2(6); w2(4); w3(regr);
+ w4(val); w4(val);
+ w2(4); w2(0); w2(4);
+ break;
+
+ }
+}
+
+static void k951_connect ( PIA *pi )
+
+{ pi->saved_r0 = r0();
+ pi->saved_r2 = r2();
+ w2(4);
+}
+
+static void k951_disconnect ( PIA *pi )
+
+{ w0(pi->saved_r0);
+ w2(pi->saved_r2);
+}
+
+#define CCP(x) w2(0xc4);w0(0xaa);w0(0x55);w0(0);w0(0xff);w0(0x87);\
+ w0(0x78);w0(x);w2(0xc5);w2(0xc4);w0(0xff);
+
+static void k971_connect ( PIA *pi )
+
+{ pi->saved_r0 = r0();
+ pi->saved_r2 = r2();
+ CCP(0x20);
+ w2(4);
+}
+
+static void k971_disconnect ( PIA *pi )
+
+{ CCP(0x30);
+ w0(pi->saved_r0);
+ w2(pi->saved_r2);
+}
+
+/* counts must be congruent to 0 MOD 4, but all known applications
+ have this property.
+*/
+
+static void kbic_read_block( PIA *pi, char * buf, int count )
+
+{ int k, a, b;
+
+ switch (pi->mode) {
+
+ case 0: w0(0x98); w2(4); w2(6); w2(4);
+ for (k=0;k<count/2;k++) {
+ w2(1); w0(8); a = r1();
+ w0(0x28); b = r1();
+ buf[2*k] = j44(a,b);
+ w2(5); b = r1();
+ w0(8); a = r1();
+ buf[2*k+1] = j44(a,b);
+ w2(4);
+ }
+ break;
+
+ case 1: w0(0xb8); w2(4); w2(6); w2(4);
+ for (k=0;k<count/4;k++) {
+ w0(0xb8);
+ w2(4); w2(5);
+ w0(8); buf[4*k] = j53(r12w());
+ w0(0xb8); buf[4*k+1] = j53(r12w());
+ w2(4); w2(5);
+ buf[4*k+3] = j53(r12w());
+ w0(8); buf[4*k+2] = j53(r12w());
+ }
+ w2(4);
+ break;
+
+ case 2: w0(0x88); w2(4); w2(6); w2(4);
+ for (k=0;k<count/2;k++) {
+ w2(0xa0); w2(0xa1); buf[2*k] = r0();
+ w2(0xa5); buf[2*k+1] = r0();
+ }
+ w2(4);
+ break;
+
+ case 3: w0(0xa0); w2(4); w2(6); w2(4); w3(0);
+ for (k=0;k<count;k++) buf[k] = r4();
+ w2(4); w2(0); w2(4);
+ break;
+
+ case 4: w0(0xa0); w2(4); w2(6); w2(4); w3(0);
+ for (k=0;k<count/2;k++) ((u16 *)buf)[k] = r4w();
+ w2(4); w2(0); w2(4);
+ break;
+
+ case 5: w0(0xa0); w2(4); w2(6); w2(4); w3(0);
+ for (k=0;k<count/4;k++) ((u32 *)buf)[k] = r4l();
+ w2(4); w2(0); w2(4);
+ break;
+
+
+ }
+}
+
+static void kbic_write_block( PIA *pi, char * buf, int count )
+
+{ int k;
+
+ switch (pi->mode) {
+
+ case 0:
+ case 1:
+ case 2: w0(0x90); w2(4); w2(6); w2(4);
+ for(k=0;k<count/2;k++) {
+ w0(buf[2*k+1]); w2(0); w2(4);
+ w0(buf[2*k]); w2(5); w2(4);
+ }
+ break;
+
+ case 3: w0(0xa0); w2(4); w2(6); w2(4); w3(0);
+ for(k=0;k<count/2;k++) {
+ w4(buf[2*k+1]);
+ w4(buf[2*k]);
+ }
+ w2(4); w2(0); w2(4);
+ break;
+
+ case 4: w0(0xa0); w2(4); w2(6); w2(4); w3(0);
+ for(k=0;k<count/2;k++) w4w(pi_swab16(buf,k));
+ w2(4); w2(0); w2(4);
+ break;
+
+ case 5: w0(0xa0); w2(4); w2(6); w2(4); w3(0);
+ for(k=0;k<count/4;k++) w4l(pi_swab32(buf,k));
+ w2(4); w2(0); w2(4);
+ break;
+
+ }
+
+}
+
+static void kbic_log_adapter( PIA *pi, char * scratch,
+ int verbose, char * chip )
+
+{ char *mode_string[6] = {"4-bit","5/3","8-bit",
+ "EPP-8","EPP_16","EPP-32"};
+
+ printk("%s: kbic %s, KingByte %s at 0x%x, ",
+ pi->device,KBIC_VERSION,chip,pi->port);
+ printk("mode %d (%s), delay %d\n",pi->mode,
+ mode_string[pi->mode],pi->delay);
+
+}
+
+static void k951_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{ kbic_log_adapter(pi,scratch,verbose,"KBIC-951A");
+}
+
+static void k971_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{ kbic_log_adapter(pi,scratch,verbose,"KBIC-971A");
+}
+
+static struct pi_protocol k951 = {
+ .owner = THIS_MODULE,
+ .name = "k951",
+ .max_mode = 6,
+ .epp_first = 3,
+ .default_delay = 1,
+ .max_units = 1,
+ .write_regr = kbic_write_regr,
+ .read_regr = kbic_read_regr,
+ .write_block = kbic_write_block,
+ .read_block = kbic_read_block,
+ .connect = k951_connect,
+ .disconnect = k951_disconnect,
+ .log_adapter = k951_log_adapter,
+};
+
+static struct pi_protocol k971 = {
+ .owner = THIS_MODULE,
+ .name = "k971",
+ .max_mode = 6,
+ .epp_first = 3,
+ .default_delay = 1,
+ .max_units = 1,
+ .write_regr = kbic_write_regr,
+ .read_regr = kbic_read_regr,
+ .write_block = kbic_write_block,
+ .read_block = kbic_read_block,
+ .connect = k971_connect,
+ .disconnect = k971_disconnect,
+ .log_adapter = k971_log_adapter,
+};
+
+static int __init kbic_init(void)
+{
+ int rv;
+
+ rv = paride_register(&k951);
+ if (rv < 0)
+ return rv;
+ rv = paride_register(&k971);
+ if (rv < 0)
+ paride_unregister(&k951);
+ return rv;
+}
+
+static void __exit kbic_exit(void)
+{
+ paride_unregister(&k951);
+ paride_unregister(&k971);
+}
+
+MODULE_LICENSE("GPL");
+module_init(kbic_init)
+module_exit(kbic_exit)
diff --git a/drivers/block/paride/ktti.c b/drivers/block/paride/ktti.c
new file mode 100644
index 000000000..117ab0e8c
--- /dev/null
+++ b/drivers/block/paride/ktti.c
@@ -0,0 +1,128 @@
+/*
+ ktti.c (c) 1998 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ ktti.c is a low-level protocol driver for the KT Technology
+ parallel port adapter. This adapter is used in the "PHd"
+ portable hard-drives. As far as I can tell, this device
+ supports 4-bit mode _only_.
+
+*/
+
+#define KTTI_VERSION "1.0"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#define j44(a,b) (((a>>4)&0x0f)|(b&0xf0))
+
+/* cont = 0 - access the IDE register file
+ cont = 1 - access the IDE command set
+*/
+
+static int cont_map[2] = { 0x10, 0x08 };
+
+static void ktti_write_regr( PIA *pi, int cont, int regr, int val)
+
+{ int r;
+
+ r = regr + cont_map[cont];
+
+ w0(r); w2(0xb); w2(0xa); w2(3); w2(6);
+ w0(val); w2(3); w0(0); w2(6); w2(0xb);
+}
+
+static int ktti_read_regr( PIA *pi, int cont, int regr )
+
+{ int a, b, r;
+
+ r = regr + cont_map[cont];
+
+ w0(r); w2(0xb); w2(0xa); w2(9); w2(0xc); w2(9);
+ a = r1(); w2(0xc); b = r1(); w2(9); w2(0xc); w2(9);
+ return j44(a,b);
+
+}
+
+static void ktti_read_block( PIA *pi, char * buf, int count )
+
+{ int k, a, b;
+
+ for (k=0;k<count/2;k++) {
+ w0(0x10); w2(0xb); w2(0xa); w2(9); w2(0xc); w2(9);
+ a = r1(); w2(0xc); b = r1(); w2(9);
+ buf[2*k] = j44(a,b);
+ a = r1(); w2(0xc); b = r1(); w2(9);
+ buf[2*k+1] = j44(a,b);
+ }
+}
+
+static void ktti_write_block( PIA *pi, char * buf, int count )
+
+{ int k;
+
+ for (k=0;k<count/2;k++) {
+ w0(0x10); w2(0xb); w2(0xa); w2(3); w2(6);
+ w0(buf[2*k]); w2(3);
+ w0(buf[2*k+1]); w2(6);
+ w2(0xb);
+ }
+}
+
+static void ktti_connect ( PIA *pi )
+
+{ pi->saved_r0 = r0();
+ pi->saved_r2 = r2();
+ w2(0xb); w2(0xa); w0(0); w2(3); w2(6);
+}
+
+static void ktti_disconnect ( PIA *pi )
+
+{ w2(0xb); w2(0xa); w0(0xa0); w2(3); w2(4);
+ w0(pi->saved_r0);
+ w2(pi->saved_r2);
+}
+
+static void ktti_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{ printk("%s: ktti %s, KT adapter at 0x%x, delay %d\n",
+ pi->device,KTTI_VERSION,pi->port,pi->delay);
+
+}
+
+static struct pi_protocol ktti = {
+ .owner = THIS_MODULE,
+ .name = "ktti",
+ .max_mode = 1,
+ .epp_first = 2,
+ .default_delay = 1,
+ .max_units = 1,
+ .write_regr = ktti_write_regr,
+ .read_regr = ktti_read_regr,
+ .write_block = ktti_write_block,
+ .read_block = ktti_read_block,
+ .connect = ktti_connect,
+ .disconnect = ktti_disconnect,
+ .log_adapter = ktti_log_adapter,
+};
+
+static int __init ktti_init(void)
+{
+ return paride_register(&ktti);
+}
+
+static void __exit ktti_exit(void)
+{
+ paride_unregister(&ktti);
+}
+
+MODULE_LICENSE("GPL");
+module_init(ktti_init)
+module_exit(ktti_exit)
diff --git a/drivers/block/paride/mkd b/drivers/block/paride/mkd
new file mode 100644
index 000000000..971f099b4
--- /dev/null
+++ b/drivers/block/paride/mkd
@@ -0,0 +1,30 @@
+#!/bin/bash
+#
+# mkd -- a script to create the device special files for the PARIDE subsystem
+#
+# block devices: pd (45), pcd (46), pf (47)
+# character devices: pt (96), pg (97)
+#
+function mkdev {
+ mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1
+}
+#
+function pd {
+ D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) )
+ mkdev pd$D b 45 $[ $1 * 16 ]
+ for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ do mkdev pd$D$P b 45 $[ $1 * 16 + $P ]
+ done
+}
+#
+cd /dev
+#
+for u in 0 1 2 3 ; do pd $u ; done
+for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done
+for u in 0 1 2 3 ; do mkdev pf$u b 47 $u ; done
+for u in 0 1 2 3 ; do mkdev pt$u c 96 $u ; done
+for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done
+for u in 0 1 2 3 ; do mkdev pg$u c 97 $u ; done
+#
+# end of mkd
+
diff --git a/drivers/block/paride/on20.c b/drivers/block/paride/on20.c
new file mode 100644
index 000000000..0173697a1
--- /dev/null
+++ b/drivers/block/paride/on20.c
@@ -0,0 +1,153 @@
+/*
+ on20.c (c) 1996-8 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ on20.c is a low-level protocol driver for the
+ Onspec 90c20 parallel to IDE adapter.
+*/
+
+/* Changes:
+
+ 1.01 GRG 1998.05.06 init_proto, release_proto
+
+*/
+
+#define ON20_VERSION "1.01"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#define op(f) w2(4);w0(f);w2(5);w2(0xd);w2(5);w2(0xd);w2(5);w2(4);
+#define vl(v) w2(4);w0(v);w2(5);w2(7);w2(5);w2(4);
+
+#define j44(a,b) (((a>>4)&0x0f)|(b&0xf0))
+
+/* cont = 0 - access the IDE register file
+ cont = 1 - access the IDE command set
+*/
+
+static int on20_read_regr( PIA *pi, int cont, int regr )
+
+{ int h,l, r ;
+
+ r = (regr<<2) + 1 + cont;
+
+ op(1); vl(r); op(0);
+
+ switch (pi->mode) {
+
+ case 0: w2(4); w2(6); l = r1();
+ w2(4); w2(6); h = r1();
+ w2(4); w2(6); w2(4); w2(6); w2(4);
+ return j44(l,h);
+
+ case 1: w2(4); w2(0x26); r = r0();
+ w2(4); w2(0x26); w2(4);
+ return r;
+
+ }
+ return -1;
+}
+
+static void on20_write_regr( PIA *pi, int cont, int regr, int val )
+
+{ int r;
+
+ r = (regr<<2) + 1 + cont;
+
+ op(1); vl(r);
+ op(0); vl(val);
+ op(0); vl(val);
+}
+
+static void on20_connect ( PIA *pi)
+
+{ pi->saved_r0 = r0();
+ pi->saved_r2 = r2();
+
+ w2(4);w0(0);w2(0xc);w2(4);w2(6);w2(4);w2(6);w2(4);
+ if (pi->mode) { op(2); vl(8); op(2); vl(9); }
+ else { op(2); vl(0); op(2); vl(8); }
+}
+
+static void on20_disconnect ( PIA *pi )
+
+{ w2(4);w0(7);w2(4);w2(0xc);w2(4);
+ w0(pi->saved_r0);
+ w2(pi->saved_r2);
+}
+
+static void on20_read_block( PIA *pi, char * buf, int count )
+
+{ int k, l, h;
+
+ op(1); vl(1); op(0);
+
+ for (k=0;k<count;k++)
+ if (pi->mode) {
+ w2(4); w2(0x26); buf[k] = r0();
+ } else {
+ w2(6); l = r1(); w2(4);
+ w2(6); h = r1(); w2(4);
+ buf[k] = j44(l,h);
+ }
+ w2(4);
+}
+
+static void on20_write_block( PIA *pi, char * buf, int count )
+
+{ int k;
+
+ op(1); vl(1); op(0);
+
+ for (k=0;k<count;k++) { w2(5); w0(buf[k]); w2(7); }
+ w2(4);
+}
+
+static void on20_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{ char *mode_string[2] = {"4-bit","8-bit"};
+
+ printk("%s: on20 %s, OnSpec 90c20 at 0x%x, ",
+ pi->device,ON20_VERSION,pi->port);
+ printk("mode %d (%s), delay %d\n",pi->mode,
+ mode_string[pi->mode],pi->delay);
+
+}
+
+static struct pi_protocol on20 = {
+ .owner = THIS_MODULE,
+ .name = "on20",
+ .max_mode = 2,
+ .epp_first = 2,
+ .default_delay = 1,
+ .max_units = 1,
+ .write_regr = on20_write_regr,
+ .read_regr = on20_read_regr,
+ .write_block = on20_write_block,
+ .read_block = on20_read_block,
+ .connect = on20_connect,
+ .disconnect = on20_disconnect,
+ .log_adapter = on20_log_adapter,
+};
+
+static int __init on20_init(void)
+{
+ return paride_register(&on20);
+}
+
+static void __exit on20_exit(void)
+{
+ paride_unregister(&on20);
+}
+
+MODULE_LICENSE("GPL");
+module_init(on20_init)
+module_exit(on20_exit)
diff --git a/drivers/block/paride/on26.c b/drivers/block/paride/on26.c
new file mode 100644
index 000000000..95ba25692
--- /dev/null
+++ b/drivers/block/paride/on26.c
@@ -0,0 +1,319 @@
+/*
+ on26.c (c) 1997-8 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ on26.c is a low-level protocol driver for the
+ OnSpec 90c26 parallel to IDE adapter chip.
+
+*/
+
+/* Changes:
+
+ 1.01 GRG 1998.05.06 init_proto, release_proto
+ 1.02 GRG 1998.09.23 updates for the -E rev chip
+ 1.03 GRG 1998.12.14 fix for slave drives
+ 1.04 GRG 1998.12.20 yet another bug fix
+
+*/
+
+#define ON26_VERSION "1.04"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+/* mode codes: 0 nybble reads, 8-bit writes
+ 1 8-bit reads and writes
+ 2 8-bit EPP mode
+ 3 EPP-16
+ 4 EPP-32
+*/
+
+#define j44(a,b) (((a>>4)&0x0f)|(b&0xf0))
+
+#define P1 w2(5);w2(0xd);w2(5);w2(0xd);w2(5);w2(4);
+#define P2 w2(5);w2(7);w2(5);w2(4);
+
+/* cont = 0 - access the IDE register file
+ cont = 1 - access the IDE command set
+*/
+
+static int on26_read_regr( PIA *pi, int cont, int regr )
+
+{ int a, b, r;
+
+ r = (regr<<2) + 1 + cont;
+
+ switch (pi->mode) {
+
+ case 0: w0(1); P1; w0(r); P2; w0(0); P1;
+ w2(6); a = r1(); w2(4);
+ w2(6); b = r1(); w2(4);
+ w2(6); w2(4); w2(6); w2(4);
+ return j44(a,b);
+
+ case 1: w0(1); P1; w0(r); P2; w0(0); P1;
+ w2(0x26); a = r0(); w2(4); w2(0x26); w2(4);
+ return a;
+
+ case 2:
+ case 3:
+ case 4: w3(1); w3(1); w2(5); w4(r); w2(4);
+ w3(0); w3(0); w2(0x24); a = r4(); w2(4);
+ w2(0x24); (void)r4(); w2(4);
+ return a;
+
+ }
+ return -1;
+}
+
+static void on26_write_regr( PIA *pi, int cont, int regr, int val )
+
+{ int r;
+
+ r = (regr<<2) + 1 + cont;
+
+ switch (pi->mode) {
+
+ case 0:
+ case 1: w0(1); P1; w0(r); P2; w0(0); P1;
+ w0(val); P2; w0(val); P2;
+ break;
+
+ case 2:
+ case 3:
+ case 4: w3(1); w3(1); w2(5); w4(r); w2(4);
+ w3(0); w3(0);
+ w2(5); w4(val); w2(4);
+ w2(5); w4(val); w2(4);
+ break;
+ }
+}
+
+#define CCP(x) w0(0xfe);w0(0xaa);w0(0x55);w0(0);w0(0xff);\
+ w0(0x87);w0(0x78);w0(x);w2(4);w2(5);w2(4);w0(0xff);
+
+static void on26_connect ( PIA *pi )
+
+{ int x;
+
+ pi->saved_r0 = r0();
+ pi->saved_r2 = r2();
+
+ CCP(0x20);
+ x = 8; if (pi->mode) x = 9;
+
+ w0(2); P1; w0(8); P2;
+ w0(2); P1; w0(x); P2;
+}
+
+static void on26_disconnect ( PIA *pi )
+
+{ if (pi->mode >= 2) { w3(4); w3(4); w3(4); w3(4); }
+ else { w0(4); P1; w0(4); P1; }
+ CCP(0x30);
+ w0(pi->saved_r0);
+ w2(pi->saved_r2);
+}
+
+#define RESET_WAIT 200
+
+static int on26_test_port( PIA *pi) /* hard reset */
+
+{ int i, m, d, x=0, y=0;
+
+ pi->saved_r0 = r0();
+ pi->saved_r2 = r2();
+
+ d = pi->delay;
+ m = pi->mode;
+ pi->delay = 5;
+ pi->mode = 0;
+
+ w2(0xc);
+
+ CCP(0x30); CCP(0);
+
+ w0(0xfe);w0(0xaa);w0(0x55);w0(0);w0(0xff);
+ i = ((r1() & 0xf0) << 4); w0(0x87);
+ i |= (r1() & 0xf0); w0(0x78);
+ w0(0x20);w2(4);w2(5);
+ i |= ((r1() & 0xf0) >> 4);
+ w2(4);w0(0xff);
+
+ if (i == 0xb5f) {
+
+ w0(2); P1; w0(0); P2;
+ w0(3); P1; w0(0); P2;
+ w0(2); P1; w0(8); P2; udelay(100);
+ w0(2); P1; w0(0xa); P2; udelay(100);
+ w0(2); P1; w0(8); P2; udelay(1000);
+
+ on26_write_regr(pi,0,6,0xa0);
+
+ for (i=0;i<RESET_WAIT;i++) {
+ on26_write_regr(pi,0,6,0xa0);
+ x = on26_read_regr(pi,0,7);
+ on26_write_regr(pi,0,6,0xb0);
+ y = on26_read_regr(pi,0,7);
+ if (!((x&0x80)||(y&0x80))) break;
+ mdelay(100);
+ }
+
+ if (i == RESET_WAIT)
+ printk("on26: Device reset failed (%x,%x)\n",x,y);
+
+ w0(4); P1; w0(4); P1;
+ }
+
+ CCP(0x30);
+
+ pi->delay = d;
+ pi->mode = m;
+ w0(pi->saved_r0);
+ w2(pi->saved_r2);
+
+ return 5;
+}
+
+
+static void on26_read_block( PIA *pi, char * buf, int count )
+
+{ int k, a, b;
+
+ switch (pi->mode) {
+
+ case 0: w0(1); P1; w0(1); P2; w0(2); P1; w0(0x18); P2; w0(0); P1;
+ udelay(10);
+ for (k=0;k<count;k++) {
+ w2(6); a = r1();
+ w2(4); b = r1();
+ buf[k] = j44(a,b);
+ }
+ w0(2); P1; w0(8); P2;
+ break;
+
+ case 1: w0(1); P1; w0(1); P2; w0(2); P1; w0(0x19); P2; w0(0); P1;
+ udelay(10);
+ for (k=0;k<count/2;k++) {
+ w2(0x26); buf[2*k] = r0();
+ w2(0x24); buf[2*k+1] = r0();
+ }
+ w0(2); P1; w0(9); P2;
+ break;
+
+ case 2: w3(1); w3(1); w2(5); w4(1); w2(4);
+ w3(0); w3(0); w2(0x24);
+ udelay(10);
+ for (k=0;k<count;k++) buf[k] = r4();
+ w2(4);
+ break;
+
+ case 3: w3(1); w3(1); w2(5); w4(1); w2(4);
+ w3(0); w3(0); w2(0x24);
+ udelay(10);
+ for (k=0;k<count/2;k++) ((u16 *)buf)[k] = r4w();
+ w2(4);
+ break;
+
+ case 4: w3(1); w3(1); w2(5); w4(1); w2(4);
+ w3(0); w3(0); w2(0x24);
+ udelay(10);
+ for (k=0;k<count/4;k++) ((u32 *)buf)[k] = r4l();
+ w2(4);
+ break;
+
+ }
+}
+
+static void on26_write_block( PIA *pi, char * buf, int count )
+
+{ int k;
+
+ switch (pi->mode) {
+
+ case 0:
+ case 1: w0(1); P1; w0(1); P2;
+ w0(2); P1; w0(0x18+pi->mode); P2; w0(0); P1;
+ udelay(10);
+ for (k=0;k<count/2;k++) {
+ w2(5); w0(buf[2*k]);
+ w2(7); w0(buf[2*k+1]);
+ }
+ w2(5); w2(4);
+ w0(2); P1; w0(8+pi->mode); P2;
+ break;
+
+ case 2: w3(1); w3(1); w2(5); w4(1); w2(4);
+ w3(0); w3(0); w2(0xc5);
+ udelay(10);
+ for (k=0;k<count;k++) w4(buf[k]);
+ w2(0xc4);
+ break;
+
+ case 3: w3(1); w3(1); w2(5); w4(1); w2(4);
+ w3(0); w3(0); w2(0xc5);
+ udelay(10);
+ for (k=0;k<count/2;k++) w4w(((u16 *)buf)[k]);
+ w2(0xc4);
+ break;
+
+ case 4: w3(1); w3(1); w2(5); w4(1); w2(4);
+ w3(0); w3(0); w2(0xc5);
+ udelay(10);
+ for (k=0;k<count/4;k++) w4l(((u32 *)buf)[k]);
+ w2(0xc4);
+ break;
+
+ }
+
+}
+
+static void on26_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{ char *mode_string[5] = {"4-bit","8-bit","EPP-8",
+ "EPP-16","EPP-32"};
+
+ printk("%s: on26 %s, OnSpec 90c26 at 0x%x, ",
+ pi->device,ON26_VERSION,pi->port);
+ printk("mode %d (%s), delay %d\n",pi->mode,
+ mode_string[pi->mode],pi->delay);
+
+}
+
+static struct pi_protocol on26 = {
+ .owner = THIS_MODULE,
+ .name = "on26",
+ .max_mode = 5,
+ .epp_first = 2,
+ .default_delay = 1,
+ .max_units = 1,
+ .write_regr = on26_write_regr,
+ .read_regr = on26_read_regr,
+ .write_block = on26_write_block,
+ .read_block = on26_read_block,
+ .connect = on26_connect,
+ .disconnect = on26_disconnect,
+ .test_port = on26_test_port,
+ .log_adapter = on26_log_adapter,
+};
+
+static int __init on26_init(void)
+{
+ return paride_register(&on26);
+}
+
+static void __exit on26_exit(void)
+{
+ paride_unregister(&on26);
+}
+
+MODULE_LICENSE("GPL");
+module_init(on26_init)
+module_exit(on26_exit)
diff --git a/drivers/block/paride/paride.c b/drivers/block/paride/paride.c
new file mode 100644
index 000000000..48c50f11f
--- /dev/null
+++ b/drivers/block/paride/paride.c
@@ -0,0 +1,434 @@
+/*
+ paride.c (c) 1997-8 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ This is the base module for the family of device drivers
+ that support parallel port IDE devices.
+
+*/
+
+/* Changes:
+
+ 1.01 GRG 1998.05.03 Use spinlocks
+ 1.02 GRG 1998.05.05 init_proto, release_proto, ktti
+ 1.03 GRG 1998.08.15 eliminate compiler warning
+ 1.04 GRG 1998.11.28 added support for FRIQ
+ 1.05 TMW 2000.06.06 use parport_find_number instead of
+ parport_enumerate
+ 1.06 TMW 2001.03.26 more sane parport-or-not resource management
+*/
+
+#define PI_VERSION "1.06"
+
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/ioport.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <linux/sched.h> /* TASK_* */
+#include <linux/parport.h>
+
+#include "paride.h"
+
+MODULE_LICENSE("GPL");
+
+#define MAX_PROTOS 32
+
+static struct pi_protocol *protocols[MAX_PROTOS];
+
+static DEFINE_SPINLOCK(pi_spinlock);
+
+void pi_write_regr(PIA * pi, int cont, int regr, int val)
+{
+ pi->proto->write_regr(pi, cont, regr, val);
+}
+
+EXPORT_SYMBOL(pi_write_regr);
+
+int pi_read_regr(PIA * pi, int cont, int regr)
+{
+ return pi->proto->read_regr(pi, cont, regr);
+}
+
+EXPORT_SYMBOL(pi_read_regr);
+
+void pi_write_block(PIA * pi, char *buf, int count)
+{
+ pi->proto->write_block(pi, buf, count);
+}
+
+EXPORT_SYMBOL(pi_write_block);
+
+void pi_read_block(PIA * pi, char *buf, int count)
+{
+ pi->proto->read_block(pi, buf, count);
+}
+
+EXPORT_SYMBOL(pi_read_block);
+
+static void pi_wake_up(void *p)
+{
+ PIA *pi = (PIA *) p;
+ unsigned long flags;
+ void (*cont) (void) = NULL;
+
+ spin_lock_irqsave(&pi_spinlock, flags);
+
+ if (pi->claim_cont && !parport_claim(pi->pardev)) {
+ cont = pi->claim_cont;
+ pi->claim_cont = NULL;
+ pi->claimed = 1;
+ }
+
+ spin_unlock_irqrestore(&pi_spinlock, flags);
+
+ wake_up(&(pi->parq));
+
+ if (cont)
+ cont();
+}
+
+int pi_schedule_claimed(PIA * pi, void (*cont) (void))
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&pi_spinlock, flags);
+ if (pi->pardev && parport_claim(pi->pardev)) {
+ pi->claim_cont = cont;
+ spin_unlock_irqrestore(&pi_spinlock, flags);
+ return 0;
+ }
+ pi->claimed = 1;
+ spin_unlock_irqrestore(&pi_spinlock, flags);
+ return 1;
+}
+EXPORT_SYMBOL(pi_schedule_claimed);
+
+void pi_do_claimed(PIA * pi, void (*cont) (void))
+{
+ if (pi_schedule_claimed(pi, cont))
+ cont();
+}
+
+EXPORT_SYMBOL(pi_do_claimed);
+
+static void pi_claim(PIA * pi)
+{
+ if (pi->claimed)
+ return;
+ pi->claimed = 1;
+ if (pi->pardev)
+ wait_event(pi->parq,
+ !parport_claim((struct pardevice *) pi->pardev));
+}
+
+static void pi_unclaim(PIA * pi)
+{
+ pi->claimed = 0;
+ if (pi->pardev)
+ parport_release((struct pardevice *) (pi->pardev));
+}
+
+void pi_connect(PIA * pi)
+{
+ pi_claim(pi);
+ pi->proto->connect(pi);
+}
+
+EXPORT_SYMBOL(pi_connect);
+
+void pi_disconnect(PIA * pi)
+{
+ pi->proto->disconnect(pi);
+ pi_unclaim(pi);
+}
+
+EXPORT_SYMBOL(pi_disconnect);
+
+static void pi_unregister_parport(PIA * pi)
+{
+ if (pi->pardev) {
+ parport_unregister_device((struct pardevice *) (pi->pardev));
+ pi->pardev = NULL;
+ }
+}
+
+void pi_release(PIA * pi)
+{
+ pi_unregister_parport(pi);
+ if (pi->proto->release_proto)
+ pi->proto->release_proto(pi);
+ module_put(pi->proto->owner);
+}
+
+EXPORT_SYMBOL(pi_release);
+
+static int default_test_proto(PIA * pi, char *scratch, int verbose)
+{
+ int j, k;
+ int e[2] = { 0, 0 };
+
+ pi->proto->connect(pi);
+
+ for (j = 0; j < 2; j++) {
+ pi_write_regr(pi, 0, 6, 0xa0 + j * 0x10);
+ for (k = 0; k < 256; k++) {
+ pi_write_regr(pi, 0, 2, k ^ 0xaa);
+ pi_write_regr(pi, 0, 3, k ^ 0x55);
+ if (pi_read_regr(pi, 0, 2) != (k ^ 0xaa))
+ e[j]++;
+ }
+ }
+ pi->proto->disconnect(pi);
+
+ if (verbose)
+ printk("%s: %s: port 0x%x, mode %d, test=(%d,%d)\n",
+ pi->device, pi->proto->name, pi->port,
+ pi->mode, e[0], e[1]);
+
+ return (e[0] && e[1]); /* not here if both > 0 */
+}
+
+static int pi_test_proto(PIA * pi, char *scratch, int verbose)
+{
+ int res;
+
+ pi_claim(pi);
+ if (pi->proto->test_proto)
+ res = pi->proto->test_proto(pi, scratch, verbose);
+ else
+ res = default_test_proto(pi, scratch, verbose);
+ pi_unclaim(pi);
+
+ return res;
+}
+
+int paride_register(PIP * pr)
+{
+ int k;
+
+ for (k = 0; k < MAX_PROTOS; k++)
+ if (protocols[k] && !strcmp(pr->name, protocols[k]->name)) {
+ printk("paride: %s protocol already registered\n",
+ pr->name);
+ return -1;
+ }
+ k = 0;
+ while ((k < MAX_PROTOS) && (protocols[k]))
+ k++;
+ if (k == MAX_PROTOS) {
+ printk("paride: protocol table full\n");
+ return -1;
+ }
+ protocols[k] = pr;
+ pr->index = k;
+ printk("paride: %s registered as protocol %d\n", pr->name, k);
+ return 0;
+}
+
+EXPORT_SYMBOL(paride_register);
+
+void paride_unregister(PIP * pr)
+{
+ if (!pr)
+ return;
+ if (protocols[pr->index] != pr) {
+ printk("paride: %s not registered\n", pr->name);
+ return;
+ }
+ protocols[pr->index] = NULL;
+}
+
+EXPORT_SYMBOL(paride_unregister);
+
+static int pi_register_parport(PIA * pi, int verbose)
+{
+ struct parport *port;
+
+ port = parport_find_base(pi->port);
+ if (!port)
+ return 0;
+
+ pi->pardev = parport_register_device(port,
+ pi->device, NULL,
+ pi_wake_up, NULL, 0, (void *) pi);
+ parport_put_port(port);
+ if (!pi->pardev)
+ return 0;
+
+ init_waitqueue_head(&pi->parq);
+
+ if (verbose)
+ printk("%s: 0x%x is %s\n", pi->device, pi->port, port->name);
+
+ pi->parname = (char *) port->name;
+
+ return 1;
+}
+
+static int pi_probe_mode(PIA * pi, int max, char *scratch, int verbose)
+{
+ int best, range;
+
+ if (pi->mode != -1) {
+ if (pi->mode >= max)
+ return 0;
+ range = 3;
+ if (pi->mode >= pi->proto->epp_first)
+ range = 8;
+ if ((range == 8) && (pi->port % 8))
+ return 0;
+ pi->reserved = range;
+ return (!pi_test_proto(pi, scratch, verbose));
+ }
+ best = -1;
+ for (pi->mode = 0; pi->mode < max; pi->mode++) {
+ range = 3;
+ if (pi->mode >= pi->proto->epp_first)
+ range = 8;
+ if ((range == 8) && (pi->port % 8))
+ break;
+ pi->reserved = range;
+ if (!pi_test_proto(pi, scratch, verbose))
+ best = pi->mode;
+ }
+ pi->mode = best;
+ return (best > -1);
+}
+
+static int pi_probe_unit(PIA * pi, int unit, char *scratch, int verbose)
+{
+ int max, s, e;
+
+ s = unit;
+ e = s + 1;
+
+ if (s == -1) {
+ s = 0;
+ e = pi->proto->max_units;
+ }
+
+ if (!pi_register_parport(pi, verbose))
+ return 0;
+
+ if (pi->proto->test_port) {
+ pi_claim(pi);
+ max = pi->proto->test_port(pi);
+ pi_unclaim(pi);
+ } else
+ max = pi->proto->max_mode;
+
+ if (pi->proto->probe_unit) {
+ pi_claim(pi);
+ for (pi->unit = s; pi->unit < e; pi->unit++)
+ if (pi->proto->probe_unit(pi)) {
+ pi_unclaim(pi);
+ if (pi_probe_mode(pi, max, scratch, verbose))
+ return 1;
+ pi_unregister_parport(pi);
+ return 0;
+ }
+ pi_unclaim(pi);
+ pi_unregister_parport(pi);
+ return 0;
+ }
+
+ if (!pi_probe_mode(pi, max, scratch, verbose)) {
+ pi_unregister_parport(pi);
+ return 0;
+ }
+ return 1;
+
+}
+
+int pi_init(PIA * pi, int autoprobe, int port, int mode,
+ int unit, int protocol, int delay, char *scratch,
+ int devtype, int verbose, char *device)
+{
+ int p, k, s, e;
+ int lpts[7] = { 0x3bc, 0x378, 0x278, 0x268, 0x27c, 0x26c, 0 };
+
+ s = protocol;
+ e = s + 1;
+
+ if (!protocols[0])
+ request_module("paride_protocol");
+
+ if (autoprobe) {
+ s = 0;
+ e = MAX_PROTOS;
+ } else if ((s < 0) || (s >= MAX_PROTOS) || (port <= 0) ||
+ (!protocols[s]) || (unit < 0) ||
+ (unit >= protocols[s]->max_units)) {
+ printk("%s: Invalid parameters\n", device);
+ return 0;
+ }
+
+ for (p = s; p < e; p++) {
+ struct pi_protocol *proto = protocols[p];
+ if (!proto)
+ continue;
+ /* still racy */
+ if (!try_module_get(proto->owner))
+ continue;
+ pi->proto = proto;
+ pi->private = 0;
+ if (proto->init_proto && proto->init_proto(pi) < 0) {
+ pi->proto = NULL;
+ module_put(proto->owner);
+ continue;
+ }
+ if (delay == -1)
+ pi->delay = pi->proto->default_delay;
+ else
+ pi->delay = delay;
+ pi->devtype = devtype;
+ pi->device = device;
+
+ pi->parname = NULL;
+ pi->pardev = NULL;
+ init_waitqueue_head(&pi->parq);
+ pi->claimed = 0;
+ pi->claim_cont = NULL;
+
+ pi->mode = mode;
+ if (port != -1) {
+ pi->port = port;
+ if (pi_probe_unit(pi, unit, scratch, verbose))
+ break;
+ pi->port = 0;
+ } else {
+ k = 0;
+ while ((pi->port = lpts[k++]))
+ if (pi_probe_unit
+ (pi, unit, scratch, verbose))
+ break;
+ if (pi->port)
+ break;
+ }
+ if (pi->proto->release_proto)
+ pi->proto->release_proto(pi);
+ module_put(proto->owner);
+ }
+
+ if (!pi->port) {
+ if (autoprobe)
+ printk("%s: Autoprobe failed\n", device);
+ else
+ printk("%s: Adapter not found\n", device);
+ return 0;
+ }
+
+ if (pi->parname)
+ printk("%s: Sharing %s at 0x%x\n", pi->device,
+ pi->parname, pi->port);
+
+ pi->proto->log_adapter(pi, scratch, verbose);
+
+ return 1;
+}
+
+EXPORT_SYMBOL(pi_init);
diff --git a/drivers/block/paride/paride.h b/drivers/block/paride/paride.h
new file mode 100644
index 000000000..2bddbf455
--- /dev/null
+++ b/drivers/block/paride/paride.h
@@ -0,0 +1,170 @@
+#ifndef __DRIVERS_PARIDE_H__
+#define __DRIVERS_PARIDE_H__
+
+/*
+ paride.h (c) 1997-8 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GPL.
+
+ This file defines the interface between the high-level parallel
+ IDE device drivers (pd, pf, pcd, pt) and the adapter chips.
+
+*/
+
+/* Changes:
+
+ 1.01 GRG 1998.05.05 init_proto, release_proto
+*/
+
+#define PARIDE_H_VERSION "1.01"
+
+/* Some adapters need to know what kind of device they are in
+
+ Values for devtype:
+*/
+
+#define PI_PD 0 /* IDE disk */
+#define PI_PCD 1 /* ATAPI CDrom */
+#define PI_PF 2 /* ATAPI disk */
+#define PI_PT 3 /* ATAPI tape */
+#define PI_PG 4 /* ATAPI generic */
+
+/* The paride module contains no state, instead the drivers allocate
+ a pi_adapter data structure and pass it to paride in every operation.
+
+*/
+
+struct pi_adapter {
+
+ struct pi_protocol *proto; /* adapter protocol */
+ int port; /* base address of parallel port */
+ int mode; /* transfer mode in use */
+ int delay; /* adapter delay setting */
+ int devtype; /* device type: PI_PD etc. */
+ char *device; /* name of driver */
+ int unit; /* unit number for chained adapters */
+ int saved_r0; /* saved port state */
+ int saved_r2; /* saved port state */
+ int reserved; /* number of ports reserved */
+ unsigned long private; /* for protocol module */
+
+ wait_queue_head_t parq; /* semaphore for parport sharing */
+ void *pardev; /* pointer to pardevice */
+ char *parname; /* parport name */
+ int claimed; /* parport has already been claimed */
+ void (*claim_cont)(void); /* continuation for parport wait */
+};
+
+typedef struct pi_adapter PIA;
+
+/* functions exported by paride to the high level drivers */
+
+extern int pi_init(PIA *pi,
+ int autoprobe, /* 1 to autoprobe */
+ int port, /* base port address */
+ int mode, /* -1 for autoprobe */
+ int unit, /* unit number, if supported */
+ int protocol, /* protocol to use */
+ int delay, /* -1 to use adapter specific default */
+ char * scratch, /* address of 512 byte buffer */
+ int devtype, /* device type: PI_PD, PI_PCD, etc ... */
+ int verbose, /* log verbose data while probing */
+ char *device /* name of the driver */
+ ); /* returns 0 on failure, 1 on success */
+
+extern void pi_release(PIA *pi);
+
+/* registers are addressed as (cont,regr)
+
+ cont: 0 for command register file, 1 for control register(s)
+ regr: 0-7 for register number.
+
+*/
+
+extern void pi_write_regr(PIA *pi, int cont, int regr, int val);
+
+extern int pi_read_regr(PIA *pi, int cont, int regr);
+
+extern void pi_write_block(PIA *pi, char * buf, int count);
+
+extern void pi_read_block(PIA *pi, char * buf, int count);
+
+extern void pi_connect(PIA *pi);
+
+extern void pi_disconnect(PIA *pi);
+
+extern void pi_do_claimed(PIA *pi, void (*cont)(void));
+extern int pi_schedule_claimed(PIA *pi, void (*cont)(void));
+
+/* macros and functions exported to the protocol modules */
+
+#define delay_p (pi->delay?udelay(pi->delay):(void)0)
+#define out_p(offs,byte) outb(byte,pi->port+offs); delay_p;
+#define in_p(offs) (delay_p,inb(pi->port+offs))
+
+#define w0(byte) {out_p(0,byte);}
+#define r0() (in_p(0) & 0xff)
+#define w1(byte) {out_p(1,byte);}
+#define r1() (in_p(1) & 0xff)
+#define w2(byte) {out_p(2,byte);}
+#define r2() (in_p(2) & 0xff)
+#define w3(byte) {out_p(3,byte);}
+#define w4(byte) {out_p(4,byte);}
+#define r4() (in_p(4) & 0xff)
+#define w4w(data) {outw(data,pi->port+4); delay_p;}
+#define w4l(data) {outl(data,pi->port+4); delay_p;}
+#define r4w() (delay_p,inw(pi->port+4)&0xffff)
+#define r4l() (delay_p,inl(pi->port+4)&0xffffffff)
+
+static inline u16 pi_swab16( char *b, int k)
+
+{ union { u16 u; char t[2]; } r;
+
+ r.t[0]=b[2*k+1]; r.t[1]=b[2*k];
+ return r.u;
+}
+
+static inline u32 pi_swab32( char *b, int k)
+
+{ union { u32 u; char f[4]; } r;
+
+ r.f[0]=b[4*k+1]; r.f[1]=b[4*k];
+ r.f[2]=b[4*k+3]; r.f[3]=b[4*k+2];
+ return r.u;
+}
+
+struct pi_protocol {
+
+ char name[8]; /* name for this protocol */
+ int index; /* index into protocol table */
+
+ int max_mode; /* max mode number */
+ int epp_first; /* modes >= this use 8 ports */
+
+ int default_delay; /* delay parameter if not specified */
+ int max_units; /* max chained units probed for */
+
+ void (*write_regr)(PIA *,int,int,int);
+ int (*read_regr)(PIA *,int,int);
+ void (*write_block)(PIA *,char *,int);
+ void (*read_block)(PIA *,char *,int);
+
+ void (*connect)(PIA *);
+ void (*disconnect)(PIA *);
+
+ int (*test_port)(PIA *);
+ int (*probe_unit)(PIA *);
+ int (*test_proto)(PIA *,char *,int);
+ void (*log_adapter)(PIA *,char *,int);
+
+ int (*init_proto)(PIA *);
+ void (*release_proto)(PIA *);
+ struct module *owner;
+};
+
+typedef struct pi_protocol PIP;
+
+extern int paride_register( PIP * );
+extern void paride_unregister ( PIP * );
+
+#endif /* __DRIVERS_PARIDE_H__ */
+/* end of paride.h */
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
new file mode 100644
index 000000000..3b7c9f1be
--- /dev/null
+++ b/drivers/block/paride/pcd.c
@@ -0,0 +1,991 @@
+/*
+ pcd.c (c) 1997-8 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ This is a high-level driver for parallel port ATAPI CD-ROM
+ drives based on chips supported by the paride module.
+
+ By default, the driver will autoprobe for a single parallel
+ port ATAPI CD-ROM drive, but if their individual parameters are
+ specified, the driver can handle up to 4 drives.
+
+ The behaviour of the pcd driver can be altered by setting
+ some parameters from the insmod command line. The following
+ parameters are adjustable:
+
+ drive0 These four arguments can be arrays of
+ drive1 1-6 integers as follows:
+ drive2
+ drive3 <prt>,<pro>,<uni>,<mod>,<slv>,<dly>
+
+ Where,
+
+ <prt> is the base of the parallel port address for
+ the corresponding drive. (required)
+
+ <pro> is the protocol number for the adapter that
+ supports this drive. These numbers are
+ logged by 'paride' when the protocol modules
+ are initialised. (0 if not given)
+
+ <uni> for those adapters that support chained
+ devices, this is the unit selector for the
+ chain of devices on the given port. It should
+ be zero for devices that don't support chaining.
+ (0 if not given)
+
+ <mod> this can be -1 to choose the best mode, or one
+ of the mode numbers supported by the adapter.
+ (-1 if not given)
+
+ <slv> ATAPI CD-ROMs can be jumpered to master or slave.
+ Set this to 0 to choose the master drive, 1 to
+ choose the slave, -1 (the default) to choose the
+ first drive found.
+
+ <dly> some parallel ports require the driver to
+ go more slowly. -1 sets a default value that
+ should work with the chosen protocol. Otherwise,
+ set this to a small integer, the larger it is
+ the slower the port i/o. In some cases, setting
+ this to zero will speed up the device. (default -1)
+
+ major You may use this parameter to overide the
+ default major number (46) that this driver
+ will use. Be sure to change the device
+ name as well.
+
+ name This parameter is a character string that
+ contains the name the kernel will use for this
+ device (in /proc output, for instance).
+ (default "pcd")
+
+ verbose This parameter controls the amount of logging
+ that the driver will do. Set it to 0 for
+ normal operation, 1 to see autoprobe progress
+ messages, or 2 to see additional debugging
+ output. (default 0)
+
+ nice This parameter controls the driver's use of
+ idle CPU time, at the expense of some speed.
+
+ If this driver is built into the kernel, you can use the
+ following kernel command line parameters, with the same values
+ as the corresponding module parameters listed above:
+
+ pcd.drive0
+ pcd.drive1
+ pcd.drive2
+ pcd.drive3
+ pcd.nice
+
+ In addition, you can use the parameter pcd.disable to disable
+ the driver entirely.
+
+*/
+
+/* Changes:
+
+ 1.01 GRG 1998.01.24 Added test unit ready support
+ 1.02 GRG 1998.05.06 Changes to pcd_completion, ready_wait,
+ and loosen interpretation of ATAPI
+ standard for clearing error status.
+ Use spinlocks. Eliminate sti().
+ 1.03 GRG 1998.06.16 Eliminated an Ugh
+ 1.04 GRG 1998.08.15 Added extra debugging, improvements to
+ pcd_completion, use HZ in loop timing
+ 1.05 GRG 1998.08.16 Conformed to "Uniform CD-ROM" standard
+ 1.06 GRG 1998.08.19 Added audio ioctl support
+ 1.07 GRG 1998.09.24 Increased reset timeout, added jumbo support
+
+*/
+
+#define PCD_VERSION "1.07"
+#define PCD_MAJOR 46
+#define PCD_NAME "pcd"
+#define PCD_UNITS 4
+
+/* Here are things one can override from the insmod command.
+ Most are autoprobed by paride unless set here. Verbose is off
+ by default.
+
+*/
+
+static int verbose = 0;
+static int major = PCD_MAJOR;
+static char *name = PCD_NAME;
+static int nice = 0;
+static int disable = 0;
+
+static int drive0[6] = { 0, 0, 0, -1, -1, -1 };
+static int drive1[6] = { 0, 0, 0, -1, -1, -1 };
+static int drive2[6] = { 0, 0, 0, -1, -1, -1 };
+static int drive3[6] = { 0, 0, 0, -1, -1, -1 };
+
+static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3};
+static int pcd_drive_count;
+
+enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
+
+/* end of parameters */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/cdrom.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include <asm/uaccess.h>
+
+static DEFINE_MUTEX(pcd_mutex);
+static DEFINE_SPINLOCK(pcd_lock);
+
+module_param(verbose, int, 0644);
+module_param(major, int, 0);
+module_param(name, charp, 0);
+module_param(nice, int, 0);
+module_param_array(drive0, int, NULL, 0);
+module_param_array(drive1, int, NULL, 0);
+module_param_array(drive2, int, NULL, 0);
+module_param_array(drive3, int, NULL, 0);
+
+#include "paride.h"
+#include "pseudo.h"
+
+#define PCD_RETRIES 5
+#define PCD_TMO 800 /* timeout in jiffies */
+#define PCD_DELAY 50 /* spin delay in uS */
+#define PCD_READY_TMO 20 /* in seconds */
+#define PCD_RESET_TMO 100 /* in tenths of a second */
+
+#define PCD_SPIN (1000000*PCD_TMO)/(HZ*PCD_DELAY)
+
+#define IDE_ERR 0x01
+#define IDE_DRQ 0x08
+#define IDE_READY 0x40
+#define IDE_BUSY 0x80
+
+static int pcd_open(struct cdrom_device_info *cdi, int purpose);
+static void pcd_release(struct cdrom_device_info *cdi);
+static int pcd_drive_status(struct cdrom_device_info *cdi, int slot_nr);
+static unsigned int pcd_check_events(struct cdrom_device_info *cdi,
+ unsigned int clearing, int slot_nr);
+static int pcd_tray_move(struct cdrom_device_info *cdi, int position);
+static int pcd_lock_door(struct cdrom_device_info *cdi, int lock);
+static int pcd_drive_reset(struct cdrom_device_info *cdi);
+static int pcd_get_mcn(struct cdrom_device_info *cdi, struct cdrom_mcn *mcn);
+static int pcd_audio_ioctl(struct cdrom_device_info *cdi,
+ unsigned int cmd, void *arg);
+static int pcd_packet(struct cdrom_device_info *cdi,
+ struct packet_command *cgc);
+
+static int pcd_detect(void);
+static void pcd_probe_capabilities(void);
+static void do_pcd_read_drq(void);
+static void do_pcd_request(struct request_queue * q);
+static void do_pcd_read(void);
+
+struct pcd_unit {
+ struct pi_adapter pia; /* interface to paride layer */
+ struct pi_adapter *pi;
+ int drive; /* master/slave */
+ int last_sense; /* result of last request sense */
+ int changed; /* media change seen */
+ int present; /* does this unit exist ? */
+ char *name; /* pcd0, pcd1, etc */
+ struct cdrom_device_info info; /* uniform cdrom interface */
+ struct gendisk *disk;
+};
+
+static struct pcd_unit pcd[PCD_UNITS];
+
+static char pcd_scratch[64];
+static char pcd_buffer[2048]; /* raw block buffer */
+static int pcd_bufblk = -1; /* block in buffer, in CD units,
+ -1 for nothing there. See also
+ pd_unit.
+ */
+
+/* the variables below are used mainly in the I/O request engine, which
+ processes only one request at a time.
+*/
+
+static struct pcd_unit *pcd_current; /* current request's drive */
+static struct request *pcd_req;
+static int pcd_retries; /* retries on current request */
+static int pcd_busy; /* request being processed ? */
+static int pcd_sector; /* address of next requested sector */
+static int pcd_count; /* number of blocks still to do */
+static char *pcd_buf; /* buffer for request in progress */
+
+/* kernel glue structures */
+
+static int pcd_block_open(struct block_device *bdev, fmode_t mode)
+{
+ struct pcd_unit *cd = bdev->bd_disk->private_data;
+ int ret;
+
+ mutex_lock(&pcd_mutex);
+ ret = cdrom_open(&cd->info, bdev, mode);
+ mutex_unlock(&pcd_mutex);
+
+ return ret;
+}
+
+static void pcd_block_release(struct gendisk *disk, fmode_t mode)
+{
+ struct pcd_unit *cd = disk->private_data;
+ mutex_lock(&pcd_mutex);
+ cdrom_release(&cd->info, mode);
+ mutex_unlock(&pcd_mutex);
+}
+
+static int pcd_block_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned cmd, unsigned long arg)
+{
+ struct pcd_unit *cd = bdev->bd_disk->private_data;
+ int ret;
+
+ mutex_lock(&pcd_mutex);
+ ret = cdrom_ioctl(&cd->info, bdev, mode, cmd, arg);
+ mutex_unlock(&pcd_mutex);
+
+ return ret;
+}
+
+static unsigned int pcd_block_check_events(struct gendisk *disk,
+ unsigned int clearing)
+{
+ struct pcd_unit *cd = disk->private_data;
+ return cdrom_check_events(&cd->info, clearing);
+}
+
+static const struct block_device_operations pcd_bdops = {
+ .owner = THIS_MODULE,
+ .open = pcd_block_open,
+ .release = pcd_block_release,
+ .ioctl = pcd_block_ioctl,
+ .check_events = pcd_block_check_events,
+};
+
+static struct cdrom_device_ops pcd_dops = {
+ .open = pcd_open,
+ .release = pcd_release,
+ .drive_status = pcd_drive_status,
+ .check_events = pcd_check_events,
+ .tray_move = pcd_tray_move,
+ .lock_door = pcd_lock_door,
+ .get_mcn = pcd_get_mcn,
+ .reset = pcd_drive_reset,
+ .audio_ioctl = pcd_audio_ioctl,
+ .generic_packet = pcd_packet,
+ .capability = CDC_CLOSE_TRAY | CDC_OPEN_TRAY | CDC_LOCK |
+ CDC_MCN | CDC_MEDIA_CHANGED | CDC_RESET |
+ CDC_PLAY_AUDIO | CDC_GENERIC_PACKET | CDC_CD_R |
+ CDC_CD_RW,
+};
+
+static void pcd_init_units(void)
+{
+ struct pcd_unit *cd;
+ int unit;
+
+ pcd_drive_count = 0;
+ for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
+ struct gendisk *disk = alloc_disk(1);
+ if (!disk)
+ continue;
+ cd->disk = disk;
+ cd->pi = &cd->pia;
+ cd->present = 0;
+ cd->last_sense = 0;
+ cd->changed = 1;
+ cd->drive = (*drives[unit])[D_SLV];
+ if ((*drives[unit])[D_PRT])
+ pcd_drive_count++;
+
+ cd->name = &cd->info.name[0];
+ snprintf(cd->name, sizeof(cd->info.name), "%s%d", name, unit);
+ cd->info.ops = &pcd_dops;
+ cd->info.handle = cd;
+ cd->info.speed = 0;
+ cd->info.capacity = 1;
+ cd->info.mask = 0;
+ disk->major = major;
+ disk->first_minor = unit;
+ strcpy(disk->disk_name, cd->name); /* umm... */
+ disk->fops = &pcd_bdops;
+ disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE;
+ }
+}
+
+static int pcd_open(struct cdrom_device_info *cdi, int purpose)
+{
+ struct pcd_unit *cd = cdi->handle;
+ if (!cd->present)
+ return -ENODEV;
+ return 0;
+}
+
+static void pcd_release(struct cdrom_device_info *cdi)
+{
+}
+
+static inline int status_reg(struct pcd_unit *cd)
+{
+ return pi_read_regr(cd->pi, 1, 6);
+}
+
+static inline int read_reg(struct pcd_unit *cd, int reg)
+{
+ return pi_read_regr(cd->pi, 0, reg);
+}
+
+static inline void write_reg(struct pcd_unit *cd, int reg, int val)
+{
+ pi_write_regr(cd->pi, 0, reg, val);
+}
+
+static int pcd_wait(struct pcd_unit *cd, int go, int stop, char *fun, char *msg)
+{
+ int j, r, e, s, p;
+
+ j = 0;
+ while ((((r = status_reg(cd)) & go) || (stop && (!(r & stop))))
+ && (j++ < PCD_SPIN))
+ udelay(PCD_DELAY);
+
+ if ((r & (IDE_ERR & stop)) || (j > PCD_SPIN)) {
+ s = read_reg(cd, 7);
+ e = read_reg(cd, 1);
+ p = read_reg(cd, 2);
+ if (j > PCD_SPIN)
+ e |= 0x100;
+ if (fun)
+ printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x"
+ " loop=%d phase=%d\n",
+ cd->name, fun, msg, r, s, e, j, p);
+ return (s << 8) + r;
+ }
+ return 0;
+}
+
+static int pcd_command(struct pcd_unit *cd, char *cmd, int dlen, char *fun)
+{
+ pi_connect(cd->pi);
+
+ write_reg(cd, 6, 0xa0 + 0x10 * cd->drive);
+
+ if (pcd_wait(cd, IDE_BUSY | IDE_DRQ, 0, fun, "before command")) {
+ pi_disconnect(cd->pi);
+ return -1;
+ }
+
+ write_reg(cd, 4, dlen % 256);
+ write_reg(cd, 5, dlen / 256);
+ write_reg(cd, 7, 0xa0); /* ATAPI packet command */
+
+ if (pcd_wait(cd, IDE_BUSY, IDE_DRQ, fun, "command DRQ")) {
+ pi_disconnect(cd->pi);
+ return -1;
+ }
+
+ if (read_reg(cd, 2) != 1) {
+ printk("%s: %s: command phase error\n", cd->name, fun);
+ pi_disconnect(cd->pi);
+ return -1;
+ }
+
+ pi_write_block(cd->pi, cmd, 12);
+
+ return 0;
+}
+
+static int pcd_completion(struct pcd_unit *cd, char *buf, char *fun)
+{
+ int r, d, p, n, k, j;
+
+ r = -1;
+ k = 0;
+ j = 0;
+
+ if (!pcd_wait(cd, IDE_BUSY, IDE_DRQ | IDE_READY | IDE_ERR,
+ fun, "completion")) {
+ r = 0;
+ while (read_reg(cd, 7) & IDE_DRQ) {
+ d = read_reg(cd, 4) + 256 * read_reg(cd, 5);
+ n = (d + 3) & 0xfffc;
+ p = read_reg(cd, 2) & 3;
+
+ if ((p == 2) && (n > 0) && (j == 0)) {
+ pi_read_block(cd->pi, buf, n);
+ if (verbose > 1)
+ printk("%s: %s: Read %d bytes\n",
+ cd->name, fun, n);
+ r = 0;
+ j++;
+ } else {
+ if (verbose > 1)
+ printk
+ ("%s: %s: Unexpected phase %d, d=%d, k=%d\n",
+ cd->name, fun, p, d, k);
+ if (verbose < 2)
+ printk_once(
+ "%s: WARNING: ATAPI phase errors\n",
+ cd->name);
+ mdelay(1);
+ }
+ if (k++ > PCD_TMO) {
+ printk("%s: Stuck DRQ\n", cd->name);
+ break;
+ }
+ if (pcd_wait
+ (cd, IDE_BUSY, IDE_DRQ | IDE_READY | IDE_ERR, fun,
+ "completion")) {
+ r = -1;
+ break;
+ }
+ }
+ }
+
+ pi_disconnect(cd->pi);
+
+ return r;
+}
+
+static void pcd_req_sense(struct pcd_unit *cd, char *fun)
+{
+ char rs_cmd[12] = { 0x03, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0 };
+ char buf[16];
+ int r, c;
+
+ r = pcd_command(cd, rs_cmd, 16, "Request sense");
+ mdelay(1);
+ if (!r)
+ pcd_completion(cd, buf, "Request sense");
+
+ cd->last_sense = -1;
+ c = 2;
+ if (!r) {
+ if (fun)
+ printk("%s: %s: Sense key: %x, ASC: %x, ASQ: %x\n",
+ cd->name, fun, buf[2] & 0xf, buf[12], buf[13]);
+ c = buf[2] & 0xf;
+ cd->last_sense =
+ c | ((buf[12] & 0xff) << 8) | ((buf[13] & 0xff) << 16);
+ }
+ if ((c == 2) || (c == 6))
+ cd->changed = 1;
+}
+
+static int pcd_atapi(struct pcd_unit *cd, char *cmd, int dlen, char *buf, char *fun)
+{
+ int r;
+
+ r = pcd_command(cd, cmd, dlen, fun);
+ mdelay(1);
+ if (!r)
+ r = pcd_completion(cd, buf, fun);
+ if (r)
+ pcd_req_sense(cd, fun);
+
+ return r;
+}
+
+static int pcd_packet(struct cdrom_device_info *cdi, struct packet_command *cgc)
+{
+ return pcd_atapi(cdi->handle, cgc->cmd, cgc->buflen, cgc->buffer,
+ "generic packet");
+}
+
+#define DBMSG(msg) ((verbose>1)?(msg):NULL)
+
+static unsigned int pcd_check_events(struct cdrom_device_info *cdi,
+ unsigned int clearing, int slot_nr)
+{
+ struct pcd_unit *cd = cdi->handle;
+ int res = cd->changed;
+ if (res)
+ cd->changed = 0;
+ return res ? DISK_EVENT_MEDIA_CHANGE : 0;
+}
+
+static int pcd_lock_door(struct cdrom_device_info *cdi, int lock)
+{
+ char un_cmd[12] = { 0x1e, 0, 0, 0, lock, 0, 0, 0, 0, 0, 0, 0 };
+
+ return pcd_atapi(cdi->handle, un_cmd, 0, pcd_scratch,
+ lock ? "lock door" : "unlock door");
+}
+
+static int pcd_tray_move(struct cdrom_device_info *cdi, int position)
+{
+ char ej_cmd[12] = { 0x1b, 0, 0, 0, 3 - position, 0, 0, 0, 0, 0, 0, 0 };
+
+ return pcd_atapi(cdi->handle, ej_cmd, 0, pcd_scratch,
+ position ? "eject" : "close tray");
+}
+
+static void pcd_sleep(int cs)
+{
+ schedule_timeout_interruptible(cs);
+}
+
+static int pcd_reset(struct pcd_unit *cd)
+{
+ int i, k, flg;
+ int expect[5] = { 1, 1, 1, 0x14, 0xeb };
+
+ pi_connect(cd->pi);
+ write_reg(cd, 6, 0xa0 + 0x10 * cd->drive);
+ write_reg(cd, 7, 8);
+
+ pcd_sleep(20 * HZ / 1000); /* delay a bit */
+
+ k = 0;
+ while ((k++ < PCD_RESET_TMO) && (status_reg(cd) & IDE_BUSY))
+ pcd_sleep(HZ / 10);
+
+ flg = 1;
+ for (i = 0; i < 5; i++)
+ flg &= (read_reg(cd, i + 1) == expect[i]);
+
+ if (verbose) {
+ printk("%s: Reset (%d) signature = ", cd->name, k);
+ for (i = 0; i < 5; i++)
+ printk("%3x", read_reg(cd, i + 1));
+ if (!flg)
+ printk(" (incorrect)");
+ printk("\n");
+ }
+
+ pi_disconnect(cd->pi);
+ return flg - 1;
+}
+
+static int pcd_drive_reset(struct cdrom_device_info *cdi)
+{
+ return pcd_reset(cdi->handle);
+}
+
+static int pcd_ready_wait(struct pcd_unit *cd, int tmo)
+{
+ char tr_cmd[12] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ int k, p;
+
+ k = 0;
+ while (k < tmo) {
+ cd->last_sense = 0;
+ pcd_atapi(cd, tr_cmd, 0, NULL, DBMSG("test unit ready"));
+ p = cd->last_sense;
+ if (!p)
+ return 0;
+ if (!(((p & 0xffff) == 0x0402) || ((p & 0xff) == 6)))
+ return p;
+ k++;
+ pcd_sleep(HZ);
+ }
+ return 0x000020; /* timeout */
+}
+
+static int pcd_drive_status(struct cdrom_device_info *cdi, int slot_nr)
+{
+ char rc_cmd[12] = { 0x25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ struct pcd_unit *cd = cdi->handle;
+
+ if (pcd_ready_wait(cd, PCD_READY_TMO))
+ return CDS_DRIVE_NOT_READY;
+ if (pcd_atapi(cd, rc_cmd, 8, pcd_scratch, DBMSG("check media")))
+ return CDS_NO_DISC;
+ return CDS_DISC_OK;
+}
+
+static int pcd_identify(struct pcd_unit *cd, char *id)
+{
+ int k, s;
+ char id_cmd[12] = { 0x12, 0, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0 };
+
+ pcd_bufblk = -1;
+
+ s = pcd_atapi(cd, id_cmd, 36, pcd_buffer, "identify");
+
+ if (s)
+ return -1;
+ if ((pcd_buffer[0] & 0x1f) != 5) {
+ if (verbose)
+ printk("%s: %s is not a CD-ROM\n",
+ cd->name, cd->drive ? "Slave" : "Master");
+ return -1;
+ }
+ memcpy(id, pcd_buffer + 16, 16);
+ id[16] = 0;
+ k = 16;
+ while ((k >= 0) && (id[k] <= 0x20)) {
+ id[k] = 0;
+ k--;
+ }
+
+ printk("%s: %s: %s\n", cd->name, cd->drive ? "Slave" : "Master", id);
+
+ return 0;
+}
+
+/*
+ * returns 0, with id set if drive is detected
+ * -1, if drive detection failed
+ */
+static int pcd_probe(struct pcd_unit *cd, int ms, char *id)
+{
+ if (ms == -1) {
+ for (cd->drive = 0; cd->drive <= 1; cd->drive++)
+ if (!pcd_reset(cd) && !pcd_identify(cd, id))
+ return 0;
+ } else {
+ cd->drive = ms;
+ if (!pcd_reset(cd) && !pcd_identify(cd, id))
+ return 0;
+ }
+ return -1;
+}
+
+static void pcd_probe_capabilities(void)
+{
+ int unit, r;
+ char buffer[32];
+ char cmd[12] = { 0x5a, 1 << 3, 0x2a, 0, 0, 0, 0, 18, 0, 0, 0, 0 };
+ struct pcd_unit *cd;
+
+ for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
+ if (!cd->present)
+ continue;
+ r = pcd_atapi(cd, cmd, 18, buffer, "mode sense capabilities");
+ if (r)
+ continue;
+ /* we should now have the cap page */
+ if ((buffer[11] & 1) == 0)
+ cd->info.mask |= CDC_CD_R;
+ if ((buffer[11] & 2) == 0)
+ cd->info.mask |= CDC_CD_RW;
+ if ((buffer[12] & 1) == 0)
+ cd->info.mask |= CDC_PLAY_AUDIO;
+ if ((buffer[14] & 1) == 0)
+ cd->info.mask |= CDC_LOCK;
+ if ((buffer[14] & 8) == 0)
+ cd->info.mask |= CDC_OPEN_TRAY;
+ if ((buffer[14] >> 6) == 0)
+ cd->info.mask |= CDC_CLOSE_TRAY;
+ }
+}
+
+static int pcd_detect(void)
+{
+ char id[18];
+ int k, unit;
+ struct pcd_unit *cd;
+
+ printk("%s: %s version %s, major %d, nice %d\n",
+ name, name, PCD_VERSION, major, nice);
+
+ k = 0;
+ if (pcd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */
+ cd = pcd;
+ if (pi_init(cd->pi, 1, -1, -1, -1, -1, -1, pcd_buffer,
+ PI_PCD, verbose, cd->name)) {
+ if (!pcd_probe(cd, -1, id) && cd->disk) {
+ cd->present = 1;
+ k++;
+ } else
+ pi_release(cd->pi);
+ }
+ } else {
+ for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
+ int *conf = *drives[unit];
+ if (!conf[D_PRT])
+ continue;
+ if (!pi_init(cd->pi, 0, conf[D_PRT], conf[D_MOD],
+ conf[D_UNI], conf[D_PRO], conf[D_DLY],
+ pcd_buffer, PI_PCD, verbose, cd->name))
+ continue;
+ if (!pcd_probe(cd, conf[D_SLV], id) && cd->disk) {
+ cd->present = 1;
+ k++;
+ } else
+ pi_release(cd->pi);
+ }
+ }
+ if (k)
+ return 0;
+
+ printk("%s: No CD-ROM drive found\n", name);
+ for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++)
+ put_disk(cd->disk);
+ return -1;
+}
+
+/* I/O request processing */
+static struct request_queue *pcd_queue;
+
+static void do_pcd_request(struct request_queue * q)
+{
+ if (pcd_busy)
+ return;
+ while (1) {
+ if (!pcd_req) {
+ pcd_req = blk_fetch_request(q);
+ if (!pcd_req)
+ return;
+ }
+
+ if (rq_data_dir(pcd_req) == READ) {
+ struct pcd_unit *cd = pcd_req->rq_disk->private_data;
+ if (cd != pcd_current)
+ pcd_bufblk = -1;
+ pcd_current = cd;
+ pcd_sector = blk_rq_pos(pcd_req);
+ pcd_count = blk_rq_cur_sectors(pcd_req);
+ pcd_buf = bio_data(pcd_req->bio);
+ pcd_busy = 1;
+ ps_set_intr(do_pcd_read, NULL, 0, nice);
+ return;
+ } else {
+ __blk_end_request_all(pcd_req, -EIO);
+ pcd_req = NULL;
+ }
+ }
+}
+
+static inline void next_request(int err)
+{
+ unsigned long saved_flags;
+
+ spin_lock_irqsave(&pcd_lock, saved_flags);
+ if (!__blk_end_request_cur(pcd_req, err))
+ pcd_req = NULL;
+ pcd_busy = 0;
+ do_pcd_request(pcd_queue);
+ spin_unlock_irqrestore(&pcd_lock, saved_flags);
+}
+
+static int pcd_ready(void)
+{
+ return (((status_reg(pcd_current) & (IDE_BUSY | IDE_DRQ)) == IDE_DRQ));
+}
+
+static void pcd_transfer(void)
+{
+
+ while (pcd_count && (pcd_sector / 4 == pcd_bufblk)) {
+ int o = (pcd_sector % 4) * 512;
+ memcpy(pcd_buf, pcd_buffer + o, 512);
+ pcd_count--;
+ pcd_buf += 512;
+ pcd_sector++;
+ }
+}
+
+static void pcd_start(void)
+{
+ int b, i;
+ char rd_cmd[12] = { 0xa8, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 };
+
+ pcd_bufblk = pcd_sector / 4;
+ b = pcd_bufblk;
+ for (i = 0; i < 4; i++) {
+ rd_cmd[5 - i] = b & 0xff;
+ b = b >> 8;
+ }
+
+ if (pcd_command(pcd_current, rd_cmd, 2048, "read block")) {
+ pcd_bufblk = -1;
+ next_request(-EIO);
+ return;
+ }
+
+ mdelay(1);
+
+ ps_set_intr(do_pcd_read_drq, pcd_ready, PCD_TMO, nice);
+}
+
+static void do_pcd_read(void)
+{
+ pcd_busy = 1;
+ pcd_retries = 0;
+ pcd_transfer();
+ if (!pcd_count) {
+ next_request(0);
+ return;
+ }
+
+ pi_do_claimed(pcd_current->pi, pcd_start);
+}
+
+static void do_pcd_read_drq(void)
+{
+ unsigned long saved_flags;
+
+ if (pcd_completion(pcd_current, pcd_buffer, "read block")) {
+ if (pcd_retries < PCD_RETRIES) {
+ mdelay(1);
+ pcd_retries++;
+ pi_do_claimed(pcd_current->pi, pcd_start);
+ return;
+ }
+ pcd_bufblk = -1;
+ next_request(-EIO);
+ return;
+ }
+
+ do_pcd_read();
+ spin_lock_irqsave(&pcd_lock, saved_flags);
+ do_pcd_request(pcd_queue);
+ spin_unlock_irqrestore(&pcd_lock, saved_flags);
+}
+
+/* the audio_ioctl stuff is adapted from sr_ioctl.c */
+
+static int pcd_audio_ioctl(struct cdrom_device_info *cdi, unsigned int cmd, void *arg)
+{
+ struct pcd_unit *cd = cdi->handle;
+
+ switch (cmd) {
+
+ case CDROMREADTOCHDR:
+
+ {
+ char cmd[12] =
+ { GPCMD_READ_TOC_PMA_ATIP, 0, 0, 0, 0, 0, 0, 0, 12,
+ 0, 0, 0 };
+ struct cdrom_tochdr *tochdr =
+ (struct cdrom_tochdr *) arg;
+ char buffer[32];
+ int r;
+
+ r = pcd_atapi(cd, cmd, 12, buffer, "read toc header");
+
+ tochdr->cdth_trk0 = buffer[2];
+ tochdr->cdth_trk1 = buffer[3];
+
+ return r ? -EIO : 0;
+ }
+
+ case CDROMREADTOCENTRY:
+
+ {
+ char cmd[12] =
+ { GPCMD_READ_TOC_PMA_ATIP, 0, 0, 0, 0, 0, 0, 0, 12,
+ 0, 0, 0 };
+
+ struct cdrom_tocentry *tocentry =
+ (struct cdrom_tocentry *) arg;
+ unsigned char buffer[32];
+ int r;
+
+ cmd[1] =
+ (tocentry->cdte_format == CDROM_MSF ? 0x02 : 0);
+ cmd[6] = tocentry->cdte_track;
+
+ r = pcd_atapi(cd, cmd, 12, buffer, "read toc entry");
+
+ tocentry->cdte_ctrl = buffer[5] & 0xf;
+ tocentry->cdte_adr = buffer[5] >> 4;
+ tocentry->cdte_datamode =
+ (tocentry->cdte_ctrl & 0x04) ? 1 : 0;
+ if (tocentry->cdte_format == CDROM_MSF) {
+ tocentry->cdte_addr.msf.minute = buffer[9];
+ tocentry->cdte_addr.msf.second = buffer[10];
+ tocentry->cdte_addr.msf.frame = buffer[11];
+ } else
+ tocentry->cdte_addr.lba =
+ (((((buffer[8] << 8) + buffer[9]) << 8)
+ + buffer[10]) << 8) + buffer[11];
+
+ return r ? -EIO : 0;
+ }
+
+ default:
+
+ return -ENOSYS;
+ }
+}
+
+static int pcd_get_mcn(struct cdrom_device_info *cdi, struct cdrom_mcn *mcn)
+{
+ char cmd[12] =
+ { GPCMD_READ_SUBCHANNEL, 0, 0x40, 2, 0, 0, 0, 0, 24, 0, 0, 0 };
+ char buffer[32];
+
+ if (pcd_atapi(cdi->handle, cmd, 24, buffer, "get mcn"))
+ return -EIO;
+
+ memcpy(mcn->medium_catalog_number, buffer + 9, 13);
+ mcn->medium_catalog_number[13] = 0;
+
+ return 0;
+}
+
+static int __init pcd_init(void)
+{
+ struct pcd_unit *cd;
+ int unit;
+
+ if (disable)
+ return -EINVAL;
+
+ pcd_init_units();
+
+ if (pcd_detect())
+ return -ENODEV;
+
+ /* get the atapi capabilities page */
+ pcd_probe_capabilities();
+
+ if (register_blkdev(major, name)) {
+ for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++)
+ put_disk(cd->disk);
+ return -EBUSY;
+ }
+
+ pcd_queue = blk_init_queue(do_pcd_request, &pcd_lock);
+ if (!pcd_queue) {
+ unregister_blkdev(major, name);
+ for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++)
+ put_disk(cd->disk);
+ return -ENOMEM;
+ }
+
+ for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
+ if (cd->present) {
+ register_cdrom(&cd->info);
+ cd->disk->private_data = cd;
+ cd->disk->queue = pcd_queue;
+ add_disk(cd->disk);
+ }
+ }
+
+ return 0;
+}
+
+static void __exit pcd_exit(void)
+{
+ struct pcd_unit *cd;
+ int unit;
+
+ for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
+ if (cd->present) {
+ del_gendisk(cd->disk);
+ pi_release(cd->pi);
+ unregister_cdrom(&cd->info);
+ }
+ put_disk(cd->disk);
+ }
+ blk_cleanup_queue(pcd_queue);
+ unregister_blkdev(major, name);
+}
+
+MODULE_LICENSE("GPL");
+module_init(pcd_init)
+module_exit(pcd_exit)
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
new file mode 100644
index 000000000..d48715b28
--- /dev/null
+++ b/drivers/block/paride/pd.c
@@ -0,0 +1,958 @@
+/*
+ pd.c (c) 1997-8 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ This is the high-level driver for parallel port IDE hard
+ drives based on chips supported by the paride module.
+
+ By default, the driver will autoprobe for a single parallel
+ port IDE drive, but if their individual parameters are
+ specified, the driver can handle up to 4 drives.
+
+ The behaviour of the pd driver can be altered by setting
+ some parameters from the insmod command line. The following
+ parameters are adjustable:
+
+ drive0 These four arguments can be arrays of
+ drive1 1-8 integers as follows:
+ drive2
+ drive3 <prt>,<pro>,<uni>,<mod>,<geo>,<sby>,<dly>,<slv>
+
+ Where,
+
+ <prt> is the base of the parallel port address for
+ the corresponding drive. (required)
+
+ <pro> is the protocol number for the adapter that
+ supports this drive. These numbers are
+ logged by 'paride' when the protocol modules
+ are initialised. (0 if not given)
+
+ <uni> for those adapters that support chained
+ devices, this is the unit selector for the
+ chain of devices on the given port. It should
+ be zero for devices that don't support chaining.
+ (0 if not given)
+
+ <mod> this can be -1 to choose the best mode, or one
+ of the mode numbers supported by the adapter.
+ (-1 if not given)
+
+ <geo> this defaults to 0 to indicate that the driver
+ should use the CHS geometry provided by the drive
+ itself. If set to 1, the driver will provide
+ a logical geometry with 64 heads and 32 sectors
+ per track, to be consistent with most SCSI
+ drivers. (0 if not given)
+
+ <sby> set this to zero to disable the power saving
+ standby mode, if needed. (1 if not given)
+
+ <dly> some parallel ports require the driver to
+ go more slowly. -1 sets a default value that
+ should work with the chosen protocol. Otherwise,
+ set this to a small integer, the larger it is
+ the slower the port i/o. In some cases, setting
+ this to zero will speed up the device. (default -1)
+
+ <slv> IDE disks can be jumpered to master or slave.
+ Set this to 0 to choose the master drive, 1 to
+ choose the slave, -1 (the default) to choose the
+ first drive found.
+
+
+ major You may use this parameter to overide the
+ default major number (45) that this driver
+ will use. Be sure to change the device
+ name as well.
+
+ name This parameter is a character string that
+ contains the name the kernel will use for this
+ device (in /proc output, for instance).
+ (default "pd")
+
+ cluster The driver will attempt to aggregate requests
+ for adjacent blocks into larger multi-block
+ clusters. The maximum cluster size (in 512
+ byte sectors) is set with this parameter.
+ (default 64)
+
+ verbose This parameter controls the amount of logging
+ that the driver will do. Set it to 0 for
+ normal operation, 1 to see autoprobe progress
+ messages, or 2 to see additional debugging
+ output. (default 0)
+
+ nice This parameter controls the driver's use of
+ idle CPU time, at the expense of some speed.
+
+ If this driver is built into the kernel, you can use kernel
+ the following command line parameters, with the same values
+ as the corresponding module parameters listed above:
+
+ pd.drive0
+ pd.drive1
+ pd.drive2
+ pd.drive3
+ pd.cluster
+ pd.nice
+
+ In addition, you can use the parameter pd.disable to disable
+ the driver entirely.
+
+*/
+
+/* Changes:
+
+ 1.01 GRG 1997.01.24 Restored pd_reset()
+ Added eject ioctl
+ 1.02 GRG 1998.05.06 SMP spinlock changes,
+ Added slave support
+ 1.03 GRG 1998.06.16 Eliminate an Ugh.
+ 1.04 GRG 1998.08.15 Extra debugging, use HZ in loop timing
+ 1.05 GRG 1998.09.24 Added jumbo support
+
+*/
+
+#define PD_VERSION "1.05"
+#define PD_MAJOR 45
+#define PD_NAME "pd"
+#define PD_UNITS 4
+
+/* Here are things one can override from the insmod command.
+ Most are autoprobed by paride unless set here. Verbose is off
+ by default.
+
+*/
+#include <linux/types.h>
+
+static bool verbose = 0;
+static int major = PD_MAJOR;
+static char *name = PD_NAME;
+static int cluster = 64;
+static int nice = 0;
+static int disable = 0;
+
+static int drive0[8] = { 0, 0, 0, -1, 0, 1, -1, -1 };
+static int drive1[8] = { 0, 0, 0, -1, 0, 1, -1, -1 };
+static int drive2[8] = { 0, 0, 0, -1, 0, 1, -1, -1 };
+static int drive3[8] = { 0, 0, 0, -1, 0, 1, -1, -1 };
+
+static int (*drives[4])[8] = {&drive0, &drive1, &drive2, &drive3};
+
+enum {D_PRT, D_PRO, D_UNI, D_MOD, D_GEO, D_SBY, D_DLY, D_SLV};
+
+/* end of parameters */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/delay.h>
+#include <linux/hdreg.h>
+#include <linux/cdrom.h> /* for the eject ioctl */
+#include <linux/blkdev.h>
+#include <linux/blkpg.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <asm/uaccess.h>
+#include <linux/workqueue.h>
+
+static DEFINE_MUTEX(pd_mutex);
+static DEFINE_SPINLOCK(pd_lock);
+
+module_param(verbose, bool, 0);
+module_param(major, int, 0);
+module_param(name, charp, 0);
+module_param(cluster, int, 0);
+module_param(nice, int, 0);
+module_param_array(drive0, int, NULL, 0);
+module_param_array(drive1, int, NULL, 0);
+module_param_array(drive2, int, NULL, 0);
+module_param_array(drive3, int, NULL, 0);
+
+#include "paride.h"
+
+#define PD_BITS 4
+
+/* numbers for "SCSI" geometry */
+
+#define PD_LOG_HEADS 64
+#define PD_LOG_SECTS 32
+
+#define PD_ID_OFF 54
+#define PD_ID_LEN 14
+
+#define PD_MAX_RETRIES 5
+#define PD_TMO 800 /* interrupt timeout in jiffies */
+#define PD_SPIN_DEL 50 /* spin delay in micro-seconds */
+
+#define PD_SPIN (1000000*PD_TMO)/(HZ*PD_SPIN_DEL)
+
+#define STAT_ERR 0x00001
+#define STAT_INDEX 0x00002
+#define STAT_ECC 0x00004
+#define STAT_DRQ 0x00008
+#define STAT_SEEK 0x00010
+#define STAT_WRERR 0x00020
+#define STAT_READY 0x00040
+#define STAT_BUSY 0x00080
+
+#define ERR_AMNF 0x00100
+#define ERR_TK0NF 0x00200
+#define ERR_ABRT 0x00400
+#define ERR_MCR 0x00800
+#define ERR_IDNF 0x01000
+#define ERR_MC 0x02000
+#define ERR_UNC 0x04000
+#define ERR_TMO 0x10000
+
+#define IDE_READ 0x20
+#define IDE_WRITE 0x30
+#define IDE_READ_VRFY 0x40
+#define IDE_INIT_DEV_PARMS 0x91
+#define IDE_STANDBY 0x96
+#define IDE_ACKCHANGE 0xdb
+#define IDE_DOORLOCK 0xde
+#define IDE_DOORUNLOCK 0xdf
+#define IDE_IDENTIFY 0xec
+#define IDE_EJECT 0xed
+
+#define PD_NAMELEN 8
+
+struct pd_unit {
+ struct pi_adapter pia; /* interface to paride layer */
+ struct pi_adapter *pi;
+ int access; /* count of active opens ... */
+ int capacity; /* Size of this volume in sectors */
+ int heads; /* physical geometry */
+ int sectors;
+ int cylinders;
+ int can_lba;
+ int drive; /* master=0 slave=1 */
+ int changed; /* Have we seen a disk change ? */
+ int removable; /* removable media device ? */
+ int standby;
+ int alt_geom;
+ char name[PD_NAMELEN]; /* pda, pdb, etc ... */
+ struct gendisk *gd;
+};
+
+static struct pd_unit pd[PD_UNITS];
+
+static char pd_scratch[512]; /* scratch block buffer */
+
+static char *pd_errs[17] = { "ERR", "INDEX", "ECC", "DRQ", "SEEK", "WRERR",
+ "READY", "BUSY", "AMNF", "TK0NF", "ABRT", "MCR",
+ "IDNF", "MC", "UNC", "???", "TMO"
+};
+
+static inline int status_reg(struct pd_unit *disk)
+{
+ return pi_read_regr(disk->pi, 1, 6);
+}
+
+static inline int read_reg(struct pd_unit *disk, int reg)
+{
+ return pi_read_regr(disk->pi, 0, reg);
+}
+
+static inline void write_status(struct pd_unit *disk, int val)
+{
+ pi_write_regr(disk->pi, 1, 6, val);
+}
+
+static inline void write_reg(struct pd_unit *disk, int reg, int val)
+{
+ pi_write_regr(disk->pi, 0, reg, val);
+}
+
+static inline u8 DRIVE(struct pd_unit *disk)
+{
+ return 0xa0+0x10*disk->drive;
+}
+
+/* ide command interface */
+
+static void pd_print_error(struct pd_unit *disk, char *msg, int status)
+{
+ int i;
+
+ printk("%s: %s: status = 0x%x =", disk->name, msg, status);
+ for (i = 0; i < ARRAY_SIZE(pd_errs); i++)
+ if (status & (1 << i))
+ printk(" %s", pd_errs[i]);
+ printk("\n");
+}
+
+static void pd_reset(struct pd_unit *disk)
+{ /* called only for MASTER drive */
+ write_status(disk, 4);
+ udelay(50);
+ write_status(disk, 0);
+ udelay(250);
+}
+
+#define DBMSG(msg) ((verbose>1)?(msg):NULL)
+
+static int pd_wait_for(struct pd_unit *disk, int w, char *msg)
+{ /* polled wait */
+ int k, r, e;
+
+ k = 0;
+ while (k < PD_SPIN) {
+ r = status_reg(disk);
+ k++;
+ if (((r & w) == w) && !(r & STAT_BUSY))
+ break;
+ udelay(PD_SPIN_DEL);
+ }
+ e = (read_reg(disk, 1) << 8) + read_reg(disk, 7);
+ if (k >= PD_SPIN)
+ e |= ERR_TMO;
+ if ((e & (STAT_ERR | ERR_TMO)) && (msg != NULL))
+ pd_print_error(disk, msg, e);
+ return e;
+}
+
+static void pd_send_command(struct pd_unit *disk, int n, int s, int h, int c0, int c1, int func)
+{
+ write_reg(disk, 6, DRIVE(disk) + h);
+ write_reg(disk, 1, 0); /* the IDE task file */
+ write_reg(disk, 2, n);
+ write_reg(disk, 3, s);
+ write_reg(disk, 4, c0);
+ write_reg(disk, 5, c1);
+ write_reg(disk, 7, func);
+
+ udelay(1);
+}
+
+static void pd_ide_command(struct pd_unit *disk, int func, int block, int count)
+{
+ int c1, c0, h, s;
+
+ if (disk->can_lba) {
+ s = block & 255;
+ c0 = (block >>= 8) & 255;
+ c1 = (block >>= 8) & 255;
+ h = ((block >>= 8) & 15) + 0x40;
+ } else {
+ s = (block % disk->sectors) + 1;
+ h = (block /= disk->sectors) % disk->heads;
+ c0 = (block /= disk->heads) % 256;
+ c1 = (block >>= 8);
+ }
+ pd_send_command(disk, count, s, h, c0, c1, func);
+}
+
+/* The i/o request engine */
+
+enum action {Fail = 0, Ok = 1, Hold, Wait};
+
+static struct request *pd_req; /* current request */
+static enum action (*phase)(void);
+
+static void run_fsm(void);
+
+static void ps_tq_int(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(fsm_tq, ps_tq_int);
+
+static void schedule_fsm(void)
+{
+ if (!nice)
+ schedule_delayed_work(&fsm_tq, 0);
+ else
+ schedule_delayed_work(&fsm_tq, nice-1);
+}
+
+static void ps_tq_int(struct work_struct *work)
+{
+ run_fsm();
+}
+
+static enum action do_pd_io_start(void);
+static enum action pd_special(void);
+static enum action do_pd_read_start(void);
+static enum action do_pd_write_start(void);
+static enum action do_pd_read_drq(void);
+static enum action do_pd_write_done(void);
+
+static struct request_queue *pd_queue;
+static int pd_claimed;
+
+static struct pd_unit *pd_current; /* current request's drive */
+static PIA *pi_current; /* current request's PIA */
+
+static void run_fsm(void)
+{
+ while (1) {
+ enum action res;
+ unsigned long saved_flags;
+ int stop = 0;
+
+ if (!phase) {
+ pd_current = pd_req->rq_disk->private_data;
+ pi_current = pd_current->pi;
+ phase = do_pd_io_start;
+ }
+
+ switch (pd_claimed) {
+ case 0:
+ pd_claimed = 1;
+ if (!pi_schedule_claimed(pi_current, run_fsm))
+ return;
+ case 1:
+ pd_claimed = 2;
+ pi_current->proto->connect(pi_current);
+ }
+
+ switch(res = phase()) {
+ case Ok: case Fail:
+ pi_disconnect(pi_current);
+ pd_claimed = 0;
+ phase = NULL;
+ spin_lock_irqsave(&pd_lock, saved_flags);
+ if (!__blk_end_request_cur(pd_req,
+ res == Ok ? 0 : -EIO)) {
+ pd_req = blk_fetch_request(pd_queue);
+ if (!pd_req)
+ stop = 1;
+ }
+ spin_unlock_irqrestore(&pd_lock, saved_flags);
+ if (stop)
+ return;
+ case Hold:
+ schedule_fsm();
+ return;
+ case Wait:
+ pi_disconnect(pi_current);
+ pd_claimed = 0;
+ }
+ }
+}
+
+static int pd_retries = 0; /* i/o error retry count */
+static int pd_block; /* address of next requested block */
+static int pd_count; /* number of blocks still to do */
+static int pd_run; /* sectors in current cluster */
+static int pd_cmd; /* current command READ/WRITE */
+static char *pd_buf; /* buffer for request in progress */
+
+static enum action do_pd_io_start(void)
+{
+ if (pd_req->cmd_type == REQ_TYPE_SPECIAL) {
+ phase = pd_special;
+ return pd_special();
+ }
+
+ pd_cmd = rq_data_dir(pd_req);
+ if (pd_cmd == READ || pd_cmd == WRITE) {
+ pd_block = blk_rq_pos(pd_req);
+ pd_count = blk_rq_cur_sectors(pd_req);
+ if (pd_block + pd_count > get_capacity(pd_req->rq_disk))
+ return Fail;
+ pd_run = blk_rq_sectors(pd_req);
+ pd_buf = bio_data(pd_req->bio);
+ pd_retries = 0;
+ if (pd_cmd == READ)
+ return do_pd_read_start();
+ else
+ return do_pd_write_start();
+ }
+ return Fail;
+}
+
+static enum action pd_special(void)
+{
+ enum action (*func)(struct pd_unit *) = pd_req->special;
+ return func(pd_current);
+}
+
+static int pd_next_buf(void)
+{
+ unsigned long saved_flags;
+
+ pd_count--;
+ pd_run--;
+ pd_buf += 512;
+ pd_block++;
+ if (!pd_run)
+ return 1;
+ if (pd_count)
+ return 0;
+ spin_lock_irqsave(&pd_lock, saved_flags);
+ __blk_end_request_cur(pd_req, 0);
+ pd_count = blk_rq_cur_sectors(pd_req);
+ pd_buf = bio_data(pd_req->bio);
+ spin_unlock_irqrestore(&pd_lock, saved_flags);
+ return 0;
+}
+
+static unsigned long pd_timeout;
+
+static enum action do_pd_read_start(void)
+{
+ if (pd_wait_for(pd_current, STAT_READY, "do_pd_read") & STAT_ERR) {
+ if (pd_retries < PD_MAX_RETRIES) {
+ pd_retries++;
+ return Wait;
+ }
+ return Fail;
+ }
+ pd_ide_command(pd_current, IDE_READ, pd_block, pd_run);
+ phase = do_pd_read_drq;
+ pd_timeout = jiffies + PD_TMO;
+ return Hold;
+}
+
+static enum action do_pd_write_start(void)
+{
+ if (pd_wait_for(pd_current, STAT_READY, "do_pd_write") & STAT_ERR) {
+ if (pd_retries < PD_MAX_RETRIES) {
+ pd_retries++;
+ return Wait;
+ }
+ return Fail;
+ }
+ pd_ide_command(pd_current, IDE_WRITE, pd_block, pd_run);
+ while (1) {
+ if (pd_wait_for(pd_current, STAT_DRQ, "do_pd_write_drq") & STAT_ERR) {
+ if (pd_retries < PD_MAX_RETRIES) {
+ pd_retries++;
+ return Wait;
+ }
+ return Fail;
+ }
+ pi_write_block(pd_current->pi, pd_buf, 512);
+ if (pd_next_buf())
+ break;
+ }
+ phase = do_pd_write_done;
+ pd_timeout = jiffies + PD_TMO;
+ return Hold;
+}
+
+static inline int pd_ready(void)
+{
+ return !(status_reg(pd_current) & STAT_BUSY);
+}
+
+static enum action do_pd_read_drq(void)
+{
+ if (!pd_ready() && !time_after_eq(jiffies, pd_timeout))
+ return Hold;
+
+ while (1) {
+ if (pd_wait_for(pd_current, STAT_DRQ, "do_pd_read_drq") & STAT_ERR) {
+ if (pd_retries < PD_MAX_RETRIES) {
+ pd_retries++;
+ phase = do_pd_read_start;
+ return Wait;
+ }
+ return Fail;
+ }
+ pi_read_block(pd_current->pi, pd_buf, 512);
+ if (pd_next_buf())
+ break;
+ }
+ return Ok;
+}
+
+static enum action do_pd_write_done(void)
+{
+ if (!pd_ready() && !time_after_eq(jiffies, pd_timeout))
+ return Hold;
+
+ if (pd_wait_for(pd_current, STAT_READY, "do_pd_write_done") & STAT_ERR) {
+ if (pd_retries < PD_MAX_RETRIES) {
+ pd_retries++;
+ phase = do_pd_write_start;
+ return Wait;
+ }
+ return Fail;
+ }
+ return Ok;
+}
+
+/* special io requests */
+
+/* According to the ATA standard, the default CHS geometry should be
+ available following a reset. Some Western Digital drives come up
+ in a mode where only LBA addresses are accepted until the device
+ parameters are initialised.
+*/
+
+static void pd_init_dev_parms(struct pd_unit *disk)
+{
+ pd_wait_for(disk, 0, DBMSG("before init_dev_parms"));
+ pd_send_command(disk, disk->sectors, 0, disk->heads - 1, 0, 0,
+ IDE_INIT_DEV_PARMS);
+ udelay(300);
+ pd_wait_for(disk, 0, "Initialise device parameters");
+}
+
+static enum action pd_door_lock(struct pd_unit *disk)
+{
+ if (!(pd_wait_for(disk, STAT_READY, "Lock") & STAT_ERR)) {
+ pd_send_command(disk, 1, 0, 0, 0, 0, IDE_DOORLOCK);
+ pd_wait_for(disk, STAT_READY, "Lock done");
+ }
+ return Ok;
+}
+
+static enum action pd_door_unlock(struct pd_unit *disk)
+{
+ if (!(pd_wait_for(disk, STAT_READY, "Lock") & STAT_ERR)) {
+ pd_send_command(disk, 1, 0, 0, 0, 0, IDE_DOORUNLOCK);
+ pd_wait_for(disk, STAT_READY, "Lock done");
+ }
+ return Ok;
+}
+
+static enum action pd_eject(struct pd_unit *disk)
+{
+ pd_wait_for(disk, 0, DBMSG("before unlock on eject"));
+ pd_send_command(disk, 1, 0, 0, 0, 0, IDE_DOORUNLOCK);
+ pd_wait_for(disk, 0, DBMSG("after unlock on eject"));
+ pd_wait_for(disk, 0, DBMSG("before eject"));
+ pd_send_command(disk, 0, 0, 0, 0, 0, IDE_EJECT);
+ pd_wait_for(disk, 0, DBMSG("after eject"));
+ return Ok;
+}
+
+static enum action pd_media_check(struct pd_unit *disk)
+{
+ int r = pd_wait_for(disk, STAT_READY, DBMSG("before media_check"));
+ if (!(r & STAT_ERR)) {
+ pd_send_command(disk, 1, 1, 0, 0, 0, IDE_READ_VRFY);
+ r = pd_wait_for(disk, STAT_READY, DBMSG("RDY after READ_VRFY"));
+ } else
+ disk->changed = 1; /* say changed if other error */
+ if (r & ERR_MC) {
+ disk->changed = 1;
+ pd_send_command(disk, 1, 0, 0, 0, 0, IDE_ACKCHANGE);
+ pd_wait_for(disk, STAT_READY, DBMSG("RDY after ACKCHANGE"));
+ pd_send_command(disk, 1, 1, 0, 0, 0, IDE_READ_VRFY);
+ r = pd_wait_for(disk, STAT_READY, DBMSG("RDY after VRFY"));
+ }
+ return Ok;
+}
+
+static void pd_standby_off(struct pd_unit *disk)
+{
+ pd_wait_for(disk, 0, DBMSG("before STANDBY"));
+ pd_send_command(disk, 0, 0, 0, 0, 0, IDE_STANDBY);
+ pd_wait_for(disk, 0, DBMSG("after STANDBY"));
+}
+
+static enum action pd_identify(struct pd_unit *disk)
+{
+ int j;
+ char id[PD_ID_LEN + 1];
+
+/* WARNING: here there may be dragons. reset() applies to both drives,
+ but we call it only on probing the MASTER. This should allow most
+ common configurations to work, but be warned that a reset can clear
+ settings on the SLAVE drive.
+*/
+
+ if (disk->drive == 0)
+ pd_reset(disk);
+
+ write_reg(disk, 6, DRIVE(disk));
+ pd_wait_for(disk, 0, DBMSG("before IDENT"));
+ pd_send_command(disk, 1, 0, 0, 0, 0, IDE_IDENTIFY);
+
+ if (pd_wait_for(disk, STAT_DRQ, DBMSG("IDENT DRQ")) & STAT_ERR)
+ return Fail;
+ pi_read_block(disk->pi, pd_scratch, 512);
+ disk->can_lba = pd_scratch[99] & 2;
+ disk->sectors = le16_to_cpu(*(__le16 *) (pd_scratch + 12));
+ disk->heads = le16_to_cpu(*(__le16 *) (pd_scratch + 6));
+ disk->cylinders = le16_to_cpu(*(__le16 *) (pd_scratch + 2));
+ if (disk->can_lba)
+ disk->capacity = le32_to_cpu(*(__le32 *) (pd_scratch + 120));
+ else
+ disk->capacity = disk->sectors * disk->heads * disk->cylinders;
+
+ for (j = 0; j < PD_ID_LEN; j++)
+ id[j ^ 1] = pd_scratch[j + PD_ID_OFF];
+ j = PD_ID_LEN - 1;
+ while ((j >= 0) && (id[j] <= 0x20))
+ j--;
+ j++;
+ id[j] = 0;
+
+ disk->removable = pd_scratch[0] & 0x80;
+
+ printk("%s: %s, %s, %d blocks [%dM], (%d/%d/%d), %s media\n",
+ disk->name, id,
+ disk->drive ? "slave" : "master",
+ disk->capacity, disk->capacity / 2048,
+ disk->cylinders, disk->heads, disk->sectors,
+ disk->removable ? "removable" : "fixed");
+
+ if (disk->capacity)
+ pd_init_dev_parms(disk);
+ if (!disk->standby)
+ pd_standby_off(disk);
+
+ return Ok;
+}
+
+/* end of io request engine */
+
+static void do_pd_request(struct request_queue * q)
+{
+ if (pd_req)
+ return;
+ pd_req = blk_fetch_request(q);
+ if (!pd_req)
+ return;
+
+ schedule_fsm();
+}
+
+static int pd_special_command(struct pd_unit *disk,
+ enum action (*func)(struct pd_unit *disk))
+{
+ struct request *rq;
+ int err = 0;
+
+ rq = blk_get_request(disk->gd->queue, READ, __GFP_WAIT);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+
+ rq->cmd_type = REQ_TYPE_SPECIAL;
+ rq->special = func;
+
+ err = blk_execute_rq(disk->gd->queue, disk->gd, rq, 0);
+
+ blk_put_request(rq);
+ return err;
+}
+
+/* kernel glue structures */
+
+static int pd_open(struct block_device *bdev, fmode_t mode)
+{
+ struct pd_unit *disk = bdev->bd_disk->private_data;
+
+ mutex_lock(&pd_mutex);
+ disk->access++;
+
+ if (disk->removable) {
+ pd_special_command(disk, pd_media_check);
+ pd_special_command(disk, pd_door_lock);
+ }
+ mutex_unlock(&pd_mutex);
+ return 0;
+}
+
+static int pd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ struct pd_unit *disk = bdev->bd_disk->private_data;
+
+ if (disk->alt_geom) {
+ geo->heads = PD_LOG_HEADS;
+ geo->sectors = PD_LOG_SECTS;
+ geo->cylinders = disk->capacity / (geo->heads * geo->sectors);
+ } else {
+ geo->heads = disk->heads;
+ geo->sectors = disk->sectors;
+ geo->cylinders = disk->cylinders;
+ }
+
+ return 0;
+}
+
+static int pd_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct pd_unit *disk = bdev->bd_disk->private_data;
+
+ switch (cmd) {
+ case CDROMEJECT:
+ mutex_lock(&pd_mutex);
+ if (disk->access == 1)
+ pd_special_command(disk, pd_eject);
+ mutex_unlock(&pd_mutex);
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+static void pd_release(struct gendisk *p, fmode_t mode)
+{
+ struct pd_unit *disk = p->private_data;
+
+ mutex_lock(&pd_mutex);
+ if (!--disk->access && disk->removable)
+ pd_special_command(disk, pd_door_unlock);
+ mutex_unlock(&pd_mutex);
+}
+
+static unsigned int pd_check_events(struct gendisk *p, unsigned int clearing)
+{
+ struct pd_unit *disk = p->private_data;
+ int r;
+ if (!disk->removable)
+ return 0;
+ pd_special_command(disk, pd_media_check);
+ r = disk->changed;
+ disk->changed = 0;
+ return r ? DISK_EVENT_MEDIA_CHANGE : 0;
+}
+
+static int pd_revalidate(struct gendisk *p)
+{
+ struct pd_unit *disk = p->private_data;
+ if (pd_special_command(disk, pd_identify) == 0)
+ set_capacity(p, disk->capacity);
+ else
+ set_capacity(p, 0);
+ return 0;
+}
+
+static const struct block_device_operations pd_fops = {
+ .owner = THIS_MODULE,
+ .open = pd_open,
+ .release = pd_release,
+ .ioctl = pd_ioctl,
+ .getgeo = pd_getgeo,
+ .check_events = pd_check_events,
+ .revalidate_disk= pd_revalidate
+};
+
+/* probing */
+
+static void pd_probe_drive(struct pd_unit *disk)
+{
+ struct gendisk *p = alloc_disk(1 << PD_BITS);
+ if (!p)
+ return;
+ strcpy(p->disk_name, disk->name);
+ p->fops = &pd_fops;
+ p->major = major;
+ p->first_minor = (disk - pd) << PD_BITS;
+ disk->gd = p;
+ p->private_data = disk;
+ p->queue = pd_queue;
+
+ if (disk->drive == -1) {
+ for (disk->drive = 0; disk->drive <= 1; disk->drive++)
+ if (pd_special_command(disk, pd_identify) == 0)
+ return;
+ } else if (pd_special_command(disk, pd_identify) == 0)
+ return;
+ disk->gd = NULL;
+ put_disk(p);
+}
+
+static int pd_detect(void)
+{
+ int found = 0, unit, pd_drive_count = 0;
+ struct pd_unit *disk;
+
+ for (unit = 0; unit < PD_UNITS; unit++) {
+ int *parm = *drives[unit];
+ struct pd_unit *disk = pd + unit;
+ disk->pi = &disk->pia;
+ disk->access = 0;
+ disk->changed = 1;
+ disk->capacity = 0;
+ disk->drive = parm[D_SLV];
+ snprintf(disk->name, PD_NAMELEN, "%s%c", name, 'a'+unit);
+ disk->alt_geom = parm[D_GEO];
+ disk->standby = parm[D_SBY];
+ if (parm[D_PRT])
+ pd_drive_count++;
+ }
+
+ if (pd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */
+ disk = pd;
+ if (pi_init(disk->pi, 1, -1, -1, -1, -1, -1, pd_scratch,
+ PI_PD, verbose, disk->name)) {
+ pd_probe_drive(disk);
+ if (!disk->gd)
+ pi_release(disk->pi);
+ }
+
+ } else {
+ for (unit = 0, disk = pd; unit < PD_UNITS; unit++, disk++) {
+ int *parm = *drives[unit];
+ if (!parm[D_PRT])
+ continue;
+ if (pi_init(disk->pi, 0, parm[D_PRT], parm[D_MOD],
+ parm[D_UNI], parm[D_PRO], parm[D_DLY],
+ pd_scratch, PI_PD, verbose, disk->name)) {
+ pd_probe_drive(disk);
+ if (!disk->gd)
+ pi_release(disk->pi);
+ }
+ }
+ }
+ for (unit = 0, disk = pd; unit < PD_UNITS; unit++, disk++) {
+ if (disk->gd) {
+ set_capacity(disk->gd, disk->capacity);
+ add_disk(disk->gd);
+ found = 1;
+ }
+ }
+ if (!found)
+ printk("%s: no valid drive found\n", name);
+ return found;
+}
+
+static int __init pd_init(void)
+{
+ if (disable)
+ goto out1;
+
+ pd_queue = blk_init_queue(do_pd_request, &pd_lock);
+ if (!pd_queue)
+ goto out1;
+
+ blk_queue_max_hw_sectors(pd_queue, cluster);
+
+ if (register_blkdev(major, name))
+ goto out2;
+
+ printk("%s: %s version %s, major %d, cluster %d, nice %d\n",
+ name, name, PD_VERSION, major, cluster, nice);
+ if (!pd_detect())
+ goto out3;
+
+ return 0;
+
+out3:
+ unregister_blkdev(major, name);
+out2:
+ blk_cleanup_queue(pd_queue);
+out1:
+ return -ENODEV;
+}
+
+static void __exit pd_exit(void)
+{
+ struct pd_unit *disk;
+ int unit;
+ unregister_blkdev(major, name);
+ for (unit = 0, disk = pd; unit < PD_UNITS; unit++, disk++) {
+ struct gendisk *p = disk->gd;
+ if (p) {
+ disk->gd = NULL;
+ del_gendisk(p);
+ put_disk(p);
+ pi_release(disk->pi);
+ }
+ }
+ blk_cleanup_queue(pd_queue);
+}
+
+MODULE_LICENSE("GPL");
+module_init(pd_init)
+module_exit(pd_exit)
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
new file mode 100644
index 000000000..9a15fd3c9
--- /dev/null
+++ b/drivers/block/paride/pf.c
@@ -0,0 +1,1007 @@
+/*
+ pf.c (c) 1997-8 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ This is the high-level driver for parallel port ATAPI disk
+ drives based on chips supported by the paride module.
+
+ By default, the driver will autoprobe for a single parallel
+ port ATAPI disk drive, but if their individual parameters are
+ specified, the driver can handle up to 4 drives.
+
+ The behaviour of the pf driver can be altered by setting
+ some parameters from the insmod command line. The following
+ parameters are adjustable:
+
+ drive0 These four arguments can be arrays of
+ drive1 1-7 integers as follows:
+ drive2
+ drive3 <prt>,<pro>,<uni>,<mod>,<slv>,<lun>,<dly>
+
+ Where,
+
+ <prt> is the base of the parallel port address for
+ the corresponding drive. (required)
+
+ <pro> is the protocol number for the adapter that
+ supports this drive. These numbers are
+ logged by 'paride' when the protocol modules
+ are initialised. (0 if not given)
+
+ <uni> for those adapters that support chained
+ devices, this is the unit selector for the
+ chain of devices on the given port. It should
+ be zero for devices that don't support chaining.
+ (0 if not given)
+
+ <mod> this can be -1 to choose the best mode, or one
+ of the mode numbers supported by the adapter.
+ (-1 if not given)
+
+ <slv> ATAPI CDroms can be jumpered to master or slave.
+ Set this to 0 to choose the master drive, 1 to
+ choose the slave, -1 (the default) to choose the
+ first drive found.
+
+ <lun> Some ATAPI devices support multiple LUNs.
+ One example is the ATAPI PD/CD drive from
+ Matshita/Panasonic. This device has a
+ CD drive on LUN 0 and a PD drive on LUN 1.
+ By default, the driver will search for the
+ first LUN with a supported device. Set
+ this parameter to force it to use a specific
+ LUN. (default -1)
+
+ <dly> some parallel ports require the driver to
+ go more slowly. -1 sets a default value that
+ should work with the chosen protocol. Otherwise,
+ set this to a small integer, the larger it is
+ the slower the port i/o. In some cases, setting
+ this to zero will speed up the device. (default -1)
+
+ major You may use this parameter to overide the
+ default major number (47) that this driver
+ will use. Be sure to change the device
+ name as well.
+
+ name This parameter is a character string that
+ contains the name the kernel will use for this
+ device (in /proc output, for instance).
+ (default "pf").
+
+ cluster The driver will attempt to aggregate requests
+ for adjacent blocks into larger multi-block
+ clusters. The maximum cluster size (in 512
+ byte sectors) is set with this parameter.
+ (default 64)
+
+ verbose This parameter controls the amount of logging
+ that the driver will do. Set it to 0 for
+ normal operation, 1 to see autoprobe progress
+ messages, or 2 to see additional debugging
+ output. (default 0)
+
+ nice This parameter controls the driver's use of
+ idle CPU time, at the expense of some speed.
+
+ If this driver is built into the kernel, you can use the
+ following command line parameters, with the same values
+ as the corresponding module parameters listed above:
+
+ pf.drive0
+ pf.drive1
+ pf.drive2
+ pf.drive3
+ pf.cluster
+ pf.nice
+
+ In addition, you can use the parameter pf.disable to disable
+ the driver entirely.
+
+*/
+
+/* Changes:
+
+ 1.01 GRG 1998.05.03 Changes for SMP. Eliminate sti().
+ Fix for drives that don't clear STAT_ERR
+ until after next CDB delivered.
+ Small change in pf_completion to round
+ up transfer size.
+ 1.02 GRG 1998.06.16 Eliminated an Ugh
+ 1.03 GRG 1998.08.16 Use HZ in loop timings, extra debugging
+ 1.04 GRG 1998.09.24 Added jumbo support
+
+*/
+
+#define PF_VERSION "1.04"
+#define PF_MAJOR 47
+#define PF_NAME "pf"
+#define PF_UNITS 4
+
+#include <linux/types.h>
+
+/* Here are things one can override from the insmod command.
+ Most are autoprobed by paride unless set here. Verbose is off
+ by default.
+
+*/
+
+static bool verbose = 0;
+static int major = PF_MAJOR;
+static char *name = PF_NAME;
+static int cluster = 64;
+static int nice = 0;
+static int disable = 0;
+
+static int drive0[7] = { 0, 0, 0, -1, -1, -1, -1 };
+static int drive1[7] = { 0, 0, 0, -1, -1, -1, -1 };
+static int drive2[7] = { 0, 0, 0, -1, -1, -1, -1 };
+static int drive3[7] = { 0, 0, 0, -1, -1, -1, -1 };
+
+static int (*drives[4])[7] = {&drive0, &drive1, &drive2, &drive3};
+static int pf_drive_count;
+
+enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_LUN, D_DLY};
+
+/* end of parameters */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/delay.h>
+#include <linux/hdreg.h>
+#include <linux/cdrom.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/blkpg.h>
+#include <linux/mutex.h>
+#include <asm/uaccess.h>
+
+static DEFINE_MUTEX(pf_mutex);
+static DEFINE_SPINLOCK(pf_spin_lock);
+
+module_param(verbose, bool, 0644);
+module_param(major, int, 0);
+module_param(name, charp, 0);
+module_param(cluster, int, 0);
+module_param(nice, int, 0);
+module_param_array(drive0, int, NULL, 0);
+module_param_array(drive1, int, NULL, 0);
+module_param_array(drive2, int, NULL, 0);
+module_param_array(drive3, int, NULL, 0);
+
+#include "paride.h"
+#include "pseudo.h"
+
+/* constants for faking geometry numbers */
+
+#define PF_FD_MAX 8192 /* use FD geometry under this size */
+#define PF_FD_HDS 2
+#define PF_FD_SPT 18
+#define PF_HD_HDS 64
+#define PF_HD_SPT 32
+
+#define PF_MAX_RETRIES 5
+#define PF_TMO 800 /* interrupt timeout in jiffies */
+#define PF_SPIN_DEL 50 /* spin delay in micro-seconds */
+
+#define PF_SPIN (1000000*PF_TMO)/(HZ*PF_SPIN_DEL)
+
+#define STAT_ERR 0x00001
+#define STAT_INDEX 0x00002
+#define STAT_ECC 0x00004
+#define STAT_DRQ 0x00008
+#define STAT_SEEK 0x00010
+#define STAT_WRERR 0x00020
+#define STAT_READY 0x00040
+#define STAT_BUSY 0x00080
+
+#define ATAPI_REQ_SENSE 0x03
+#define ATAPI_LOCK 0x1e
+#define ATAPI_DOOR 0x1b
+#define ATAPI_MODE_SENSE 0x5a
+#define ATAPI_CAPACITY 0x25
+#define ATAPI_IDENTIFY 0x12
+#define ATAPI_READ_10 0x28
+#define ATAPI_WRITE_10 0x2a
+
+static int pf_open(struct block_device *bdev, fmode_t mode);
+static void do_pf_request(struct request_queue * q);
+static int pf_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg);
+static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo);
+
+static void pf_release(struct gendisk *disk, fmode_t mode);
+
+static int pf_detect(void);
+static void do_pf_read(void);
+static void do_pf_read_start(void);
+static void do_pf_write(void);
+static void do_pf_write_start(void);
+static void do_pf_read_drq(void);
+static void do_pf_write_done(void);
+
+#define PF_NM 0
+#define PF_RO 1
+#define PF_RW 2
+
+#define PF_NAMELEN 8
+
+struct pf_unit {
+ struct pi_adapter pia; /* interface to paride layer */
+ struct pi_adapter *pi;
+ int removable; /* removable media device ? */
+ int media_status; /* media present ? WP ? */
+ int drive; /* drive */
+ int lun;
+ int access; /* count of active opens ... */
+ int present; /* device present ? */
+ char name[PF_NAMELEN]; /* pf0, pf1, ... */
+ struct gendisk *disk;
+};
+
+static struct pf_unit units[PF_UNITS];
+
+static int pf_identify(struct pf_unit *pf);
+static void pf_lock(struct pf_unit *pf, int func);
+static void pf_eject(struct pf_unit *pf);
+static unsigned int pf_check_events(struct gendisk *disk,
+ unsigned int clearing);
+
+static char pf_scratch[512]; /* scratch block buffer */
+
+/* the variables below are used mainly in the I/O request engine, which
+ processes only one request at a time.
+*/
+
+static int pf_retries = 0; /* i/o error retry count */
+static int pf_busy = 0; /* request being processed ? */
+static struct request *pf_req; /* current request */
+static int pf_block; /* address of next requested block */
+static int pf_count; /* number of blocks still to do */
+static int pf_run; /* sectors in current cluster */
+static int pf_cmd; /* current command READ/WRITE */
+static struct pf_unit *pf_current;/* unit of current request */
+static int pf_mask; /* stopper for pseudo-int */
+static char *pf_buf; /* buffer for request in progress */
+
+/* kernel glue structures */
+
+static const struct block_device_operations pf_fops = {
+ .owner = THIS_MODULE,
+ .open = pf_open,
+ .release = pf_release,
+ .ioctl = pf_ioctl,
+ .getgeo = pf_getgeo,
+ .check_events = pf_check_events,
+};
+
+static void __init pf_init_units(void)
+{
+ struct pf_unit *pf;
+ int unit;
+
+ pf_drive_count = 0;
+ for (unit = 0, pf = units; unit < PF_UNITS; unit++, pf++) {
+ struct gendisk *disk = alloc_disk(1);
+ if (!disk)
+ continue;
+ pf->disk = disk;
+ pf->pi = &pf->pia;
+ pf->media_status = PF_NM;
+ pf->drive = (*drives[unit])[D_SLV];
+ pf->lun = (*drives[unit])[D_LUN];
+ snprintf(pf->name, PF_NAMELEN, "%s%d", name, unit);
+ disk->major = major;
+ disk->first_minor = unit;
+ strcpy(disk->disk_name, pf->name);
+ disk->fops = &pf_fops;
+ if (!(*drives[unit])[D_PRT])
+ pf_drive_count++;
+ }
+}
+
+static int pf_open(struct block_device *bdev, fmode_t mode)
+{
+ struct pf_unit *pf = bdev->bd_disk->private_data;
+ int ret;
+
+ mutex_lock(&pf_mutex);
+ pf_identify(pf);
+
+ ret = -ENODEV;
+ if (pf->media_status == PF_NM)
+ goto out;
+
+ ret = -EROFS;
+ if ((pf->media_status == PF_RO) && (mode & FMODE_WRITE))
+ goto out;
+
+ ret = 0;
+ pf->access++;
+ if (pf->removable)
+ pf_lock(pf, 1);
+out:
+ mutex_unlock(&pf_mutex);
+ return ret;
+}
+
+static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ struct pf_unit *pf = bdev->bd_disk->private_data;
+ sector_t capacity = get_capacity(pf->disk);
+
+ if (capacity < PF_FD_MAX) {
+ geo->cylinders = sector_div(capacity, PF_FD_HDS * PF_FD_SPT);
+ geo->heads = PF_FD_HDS;
+ geo->sectors = PF_FD_SPT;
+ } else {
+ geo->cylinders = sector_div(capacity, PF_HD_HDS * PF_HD_SPT);
+ geo->heads = PF_HD_HDS;
+ geo->sectors = PF_HD_SPT;
+ }
+
+ return 0;
+}
+
+static int pf_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
+{
+ struct pf_unit *pf = bdev->bd_disk->private_data;
+
+ if (cmd != CDROMEJECT)
+ return -EINVAL;
+
+ if (pf->access != 1)
+ return -EBUSY;
+ mutex_lock(&pf_mutex);
+ pf_eject(pf);
+ mutex_unlock(&pf_mutex);
+
+ return 0;
+}
+
+static void pf_release(struct gendisk *disk, fmode_t mode)
+{
+ struct pf_unit *pf = disk->private_data;
+
+ mutex_lock(&pf_mutex);
+ if (pf->access <= 0) {
+ mutex_unlock(&pf_mutex);
+ WARN_ON(1);
+ return;
+ }
+
+ pf->access--;
+
+ if (!pf->access && pf->removable)
+ pf_lock(pf, 0);
+
+ mutex_unlock(&pf_mutex);
+}
+
+static unsigned int pf_check_events(struct gendisk *disk, unsigned int clearing)
+{
+ return DISK_EVENT_MEDIA_CHANGE;
+}
+
+static inline int status_reg(struct pf_unit *pf)
+{
+ return pi_read_regr(pf->pi, 1, 6);
+}
+
+static inline int read_reg(struct pf_unit *pf, int reg)
+{
+ return pi_read_regr(pf->pi, 0, reg);
+}
+
+static inline void write_reg(struct pf_unit *pf, int reg, int val)
+{
+ pi_write_regr(pf->pi, 0, reg, val);
+}
+
+static int pf_wait(struct pf_unit *pf, int go, int stop, char *fun, char *msg)
+{
+ int j, r, e, s, p;
+
+ j = 0;
+ while ((((r = status_reg(pf)) & go) || (stop && (!(r & stop))))
+ && (j++ < PF_SPIN))
+ udelay(PF_SPIN_DEL);
+
+ if ((r & (STAT_ERR & stop)) || (j > PF_SPIN)) {
+ s = read_reg(pf, 7);
+ e = read_reg(pf, 1);
+ p = read_reg(pf, 2);
+ if (j > PF_SPIN)
+ e |= 0x100;
+ if (fun)
+ printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x"
+ " loop=%d phase=%d\n",
+ pf->name, fun, msg, r, s, e, j, p);
+ return (e << 8) + s;
+ }
+ return 0;
+}
+
+static int pf_command(struct pf_unit *pf, char *cmd, int dlen, char *fun)
+{
+ pi_connect(pf->pi);
+
+ write_reg(pf, 6, 0xa0+0x10*pf->drive);
+
+ if (pf_wait(pf, STAT_BUSY | STAT_DRQ, 0, fun, "before command")) {
+ pi_disconnect(pf->pi);
+ return -1;
+ }
+
+ write_reg(pf, 4, dlen % 256);
+ write_reg(pf, 5, dlen / 256);
+ write_reg(pf, 7, 0xa0); /* ATAPI packet command */
+
+ if (pf_wait(pf, STAT_BUSY, STAT_DRQ, fun, "command DRQ")) {
+ pi_disconnect(pf->pi);
+ return -1;
+ }
+
+ if (read_reg(pf, 2) != 1) {
+ printk("%s: %s: command phase error\n", pf->name, fun);
+ pi_disconnect(pf->pi);
+ return -1;
+ }
+
+ pi_write_block(pf->pi, cmd, 12);
+
+ return 0;
+}
+
+static int pf_completion(struct pf_unit *pf, char *buf, char *fun)
+{
+ int r, s, n;
+
+ r = pf_wait(pf, STAT_BUSY, STAT_DRQ | STAT_READY | STAT_ERR,
+ fun, "completion");
+
+ if ((read_reg(pf, 2) & 2) && (read_reg(pf, 7) & STAT_DRQ)) {
+ n = (((read_reg(pf, 4) + 256 * read_reg(pf, 5)) +
+ 3) & 0xfffc);
+ pi_read_block(pf->pi, buf, n);
+ }
+
+ s = pf_wait(pf, STAT_BUSY, STAT_READY | STAT_ERR, fun, "data done");
+
+ pi_disconnect(pf->pi);
+
+ return (r ? r : s);
+}
+
+static void pf_req_sense(struct pf_unit *pf, int quiet)
+{
+ char rs_cmd[12] =
+ { ATAPI_REQ_SENSE, pf->lun << 5, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0 };
+ char buf[16];
+ int r;
+
+ r = pf_command(pf, rs_cmd, 16, "Request sense");
+ mdelay(1);
+ if (!r)
+ pf_completion(pf, buf, "Request sense");
+
+ if ((!r) && (!quiet))
+ printk("%s: Sense key: %x, ASC: %x, ASQ: %x\n",
+ pf->name, buf[2] & 0xf, buf[12], buf[13]);
+}
+
+static int pf_atapi(struct pf_unit *pf, char *cmd, int dlen, char *buf, char *fun)
+{
+ int r;
+
+ r = pf_command(pf, cmd, dlen, fun);
+ mdelay(1);
+ if (!r)
+ r = pf_completion(pf, buf, fun);
+ if (r)
+ pf_req_sense(pf, !fun);
+
+ return r;
+}
+
+static void pf_lock(struct pf_unit *pf, int func)
+{
+ char lo_cmd[12] = { ATAPI_LOCK, pf->lun << 5, 0, 0, func, 0, 0, 0, 0, 0, 0, 0 };
+
+ pf_atapi(pf, lo_cmd, 0, pf_scratch, func ? "lock" : "unlock");
+}
+
+static void pf_eject(struct pf_unit *pf)
+{
+ char ej_cmd[12] = { ATAPI_DOOR, pf->lun << 5, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 };
+
+ pf_lock(pf, 0);
+ pf_atapi(pf, ej_cmd, 0, pf_scratch, "eject");
+}
+
+#define PF_RESET_TMO 30 /* in tenths of a second */
+
+static void pf_sleep(int cs)
+{
+ schedule_timeout_interruptible(cs);
+}
+
+/* the ATAPI standard actually specifies the contents of all 7 registers
+ after a reset, but the specification is ambiguous concerning the last
+ two bytes, and different drives interpret the standard differently.
+ */
+
+static int pf_reset(struct pf_unit *pf)
+{
+ int i, k, flg;
+ int expect[5] = { 1, 1, 1, 0x14, 0xeb };
+
+ pi_connect(pf->pi);
+ write_reg(pf, 6, 0xa0+0x10*pf->drive);
+ write_reg(pf, 7, 8);
+
+ pf_sleep(20 * HZ / 1000);
+
+ k = 0;
+ while ((k++ < PF_RESET_TMO) && (status_reg(pf) & STAT_BUSY))
+ pf_sleep(HZ / 10);
+
+ flg = 1;
+ for (i = 0; i < 5; i++)
+ flg &= (read_reg(pf, i + 1) == expect[i]);
+
+ if (verbose) {
+ printk("%s: Reset (%d) signature = ", pf->name, k);
+ for (i = 0; i < 5; i++)
+ printk("%3x", read_reg(pf, i + 1));
+ if (!flg)
+ printk(" (incorrect)");
+ printk("\n");
+ }
+
+ pi_disconnect(pf->pi);
+ return flg - 1;
+}
+
+static void pf_mode_sense(struct pf_unit *pf)
+{
+ char ms_cmd[12] =
+ { ATAPI_MODE_SENSE, pf->lun << 5, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0 };
+ char buf[8];
+
+ pf_atapi(pf, ms_cmd, 8, buf, "mode sense");
+ pf->media_status = PF_RW;
+ if (buf[3] & 0x80)
+ pf->media_status = PF_RO;
+}
+
+static void xs(char *buf, char *targ, int offs, int len)
+{
+ int j, k, l;
+
+ j = 0;
+ l = 0;
+ for (k = 0; k < len; k++)
+ if ((buf[k + offs] != 0x20) || (buf[k + offs] != l))
+ l = targ[j++] = buf[k + offs];
+ if (l == 0x20)
+ j--;
+ targ[j] = 0;
+}
+
+static int xl(char *buf, int offs)
+{
+ int v, k;
+
+ v = 0;
+ for (k = 0; k < 4; k++)
+ v = v * 256 + (buf[k + offs] & 0xff);
+ return v;
+}
+
+static void pf_get_capacity(struct pf_unit *pf)
+{
+ char rc_cmd[12] = { ATAPI_CAPACITY, pf->lun << 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ char buf[8];
+ int bs;
+
+ if (pf_atapi(pf, rc_cmd, 8, buf, "get capacity")) {
+ pf->media_status = PF_NM;
+ return;
+ }
+ set_capacity(pf->disk, xl(buf, 0) + 1);
+ bs = xl(buf, 4);
+ if (bs != 512) {
+ set_capacity(pf->disk, 0);
+ if (verbose)
+ printk("%s: Drive %d, LUN %d,"
+ " unsupported block size %d\n",
+ pf->name, pf->drive, pf->lun, bs);
+ }
+}
+
+static int pf_identify(struct pf_unit *pf)
+{
+ int dt, s;
+ char *ms[2] = { "master", "slave" };
+ char mf[10], id[18];
+ char id_cmd[12] =
+ { ATAPI_IDENTIFY, pf->lun << 5, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0 };
+ char buf[36];
+
+ s = pf_atapi(pf, id_cmd, 36, buf, "identify");
+ if (s)
+ return -1;
+
+ dt = buf[0] & 0x1f;
+ if ((dt != 0) && (dt != 7)) {
+ if (verbose)
+ printk("%s: Drive %d, LUN %d, unsupported type %d\n",
+ pf->name, pf->drive, pf->lun, dt);
+ return -1;
+ }
+
+ xs(buf, mf, 8, 8);
+ xs(buf, id, 16, 16);
+
+ pf->removable = (buf[1] & 0x80);
+
+ pf_mode_sense(pf);
+ pf_mode_sense(pf);
+ pf_mode_sense(pf);
+
+ pf_get_capacity(pf);
+
+ printk("%s: %s %s, %s LUN %d, type %d",
+ pf->name, mf, id, ms[pf->drive], pf->lun, dt);
+ if (pf->removable)
+ printk(", removable");
+ if (pf->media_status == PF_NM)
+ printk(", no media\n");
+ else {
+ if (pf->media_status == PF_RO)
+ printk(", RO");
+ printk(", %llu blocks\n",
+ (unsigned long long)get_capacity(pf->disk));
+ }
+ return 0;
+}
+
+/* returns 0, with id set if drive is detected
+ -1, if drive detection failed
+*/
+static int pf_probe(struct pf_unit *pf)
+{
+ if (pf->drive == -1) {
+ for (pf->drive = 0; pf->drive <= 1; pf->drive++)
+ if (!pf_reset(pf)) {
+ if (pf->lun != -1)
+ return pf_identify(pf);
+ else
+ for (pf->lun = 0; pf->lun < 8; pf->lun++)
+ if (!pf_identify(pf))
+ return 0;
+ }
+ } else {
+ if (pf_reset(pf))
+ return -1;
+ if (pf->lun != -1)
+ return pf_identify(pf);
+ for (pf->lun = 0; pf->lun < 8; pf->lun++)
+ if (!pf_identify(pf))
+ return 0;
+ }
+ return -1;
+}
+
+static int pf_detect(void)
+{
+ struct pf_unit *pf = units;
+ int k, unit;
+
+ printk("%s: %s version %s, major %d, cluster %d, nice %d\n",
+ name, name, PF_VERSION, major, cluster, nice);
+
+ k = 0;
+ if (pf_drive_count == 0) {
+ if (pi_init(pf->pi, 1, -1, -1, -1, -1, -1, pf_scratch, PI_PF,
+ verbose, pf->name)) {
+ if (!pf_probe(pf) && pf->disk) {
+ pf->present = 1;
+ k++;
+ } else
+ pi_release(pf->pi);
+ }
+
+ } else
+ for (unit = 0; unit < PF_UNITS; unit++, pf++) {
+ int *conf = *drives[unit];
+ if (!conf[D_PRT])
+ continue;
+ if (pi_init(pf->pi, 0, conf[D_PRT], conf[D_MOD],
+ conf[D_UNI], conf[D_PRO], conf[D_DLY],
+ pf_scratch, PI_PF, verbose, pf->name)) {
+ if (pf->disk && !pf_probe(pf)) {
+ pf->present = 1;
+ k++;
+ } else
+ pi_release(pf->pi);
+ }
+ }
+ if (k)
+ return 0;
+
+ printk("%s: No ATAPI disk detected\n", name);
+ for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++)
+ put_disk(pf->disk);
+ return -1;
+}
+
+/* The i/o request engine */
+
+static int pf_start(struct pf_unit *pf, int cmd, int b, int c)
+{
+ int i;
+ char io_cmd[12] = { cmd, pf->lun << 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+ for (i = 0; i < 4; i++) {
+ io_cmd[5 - i] = b & 0xff;
+ b = b >> 8;
+ }
+
+ io_cmd[8] = c & 0xff;
+ io_cmd[7] = (c >> 8) & 0xff;
+
+ i = pf_command(pf, io_cmd, c * 512, "start i/o");
+
+ mdelay(1);
+
+ return i;
+}
+
+static int pf_ready(void)
+{
+ return (((status_reg(pf_current) & (STAT_BUSY | pf_mask)) == pf_mask));
+}
+
+static struct request_queue *pf_queue;
+
+static void pf_end_request(int err)
+{
+ if (pf_req && !__blk_end_request_cur(pf_req, err))
+ pf_req = NULL;
+}
+
+static void do_pf_request(struct request_queue * q)
+{
+ if (pf_busy)
+ return;
+repeat:
+ if (!pf_req) {
+ pf_req = blk_fetch_request(q);
+ if (!pf_req)
+ return;
+ }
+
+ pf_current = pf_req->rq_disk->private_data;
+ pf_block = blk_rq_pos(pf_req);
+ pf_run = blk_rq_sectors(pf_req);
+ pf_count = blk_rq_cur_sectors(pf_req);
+
+ if (pf_block + pf_count > get_capacity(pf_req->rq_disk)) {
+ pf_end_request(-EIO);
+ goto repeat;
+ }
+
+ pf_cmd = rq_data_dir(pf_req);
+ pf_buf = bio_data(pf_req->bio);
+ pf_retries = 0;
+
+ pf_busy = 1;
+ if (pf_cmd == READ)
+ pi_do_claimed(pf_current->pi, do_pf_read);
+ else if (pf_cmd == WRITE)
+ pi_do_claimed(pf_current->pi, do_pf_write);
+ else {
+ pf_busy = 0;
+ pf_end_request(-EIO);
+ goto repeat;
+ }
+}
+
+static int pf_next_buf(void)
+{
+ unsigned long saved_flags;
+
+ pf_count--;
+ pf_run--;
+ pf_buf += 512;
+ pf_block++;
+ if (!pf_run)
+ return 1;
+ if (!pf_count) {
+ spin_lock_irqsave(&pf_spin_lock, saved_flags);
+ pf_end_request(0);
+ spin_unlock_irqrestore(&pf_spin_lock, saved_flags);
+ if (!pf_req)
+ return 1;
+ pf_count = blk_rq_cur_sectors(pf_req);
+ pf_buf = bio_data(pf_req->bio);
+ }
+ return 0;
+}
+
+static inline void next_request(int err)
+{
+ unsigned long saved_flags;
+
+ spin_lock_irqsave(&pf_spin_lock, saved_flags);
+ pf_end_request(err);
+ pf_busy = 0;
+ do_pf_request(pf_queue);
+ spin_unlock_irqrestore(&pf_spin_lock, saved_flags);
+}
+
+/* detach from the calling context - in case the spinlock is held */
+static void do_pf_read(void)
+{
+ ps_set_intr(do_pf_read_start, NULL, 0, nice);
+}
+
+static void do_pf_read_start(void)
+{
+ pf_busy = 1;
+
+ if (pf_start(pf_current, ATAPI_READ_10, pf_block, pf_run)) {
+ pi_disconnect(pf_current->pi);
+ if (pf_retries < PF_MAX_RETRIES) {
+ pf_retries++;
+ pi_do_claimed(pf_current->pi, do_pf_read_start);
+ return;
+ }
+ next_request(-EIO);
+ return;
+ }
+ pf_mask = STAT_DRQ;
+ ps_set_intr(do_pf_read_drq, pf_ready, PF_TMO, nice);
+}
+
+static void do_pf_read_drq(void)
+{
+ while (1) {
+ if (pf_wait(pf_current, STAT_BUSY, STAT_DRQ | STAT_ERR,
+ "read block", "completion") & STAT_ERR) {
+ pi_disconnect(pf_current->pi);
+ if (pf_retries < PF_MAX_RETRIES) {
+ pf_req_sense(pf_current, 0);
+ pf_retries++;
+ pi_do_claimed(pf_current->pi, do_pf_read_start);
+ return;
+ }
+ next_request(-EIO);
+ return;
+ }
+ pi_read_block(pf_current->pi, pf_buf, 512);
+ if (pf_next_buf())
+ break;
+ }
+ pi_disconnect(pf_current->pi);
+ next_request(0);
+}
+
+static void do_pf_write(void)
+{
+ ps_set_intr(do_pf_write_start, NULL, 0, nice);
+}
+
+static void do_pf_write_start(void)
+{
+ pf_busy = 1;
+
+ if (pf_start(pf_current, ATAPI_WRITE_10, pf_block, pf_run)) {
+ pi_disconnect(pf_current->pi);
+ if (pf_retries < PF_MAX_RETRIES) {
+ pf_retries++;
+ pi_do_claimed(pf_current->pi, do_pf_write_start);
+ return;
+ }
+ next_request(-EIO);
+ return;
+ }
+
+ while (1) {
+ if (pf_wait(pf_current, STAT_BUSY, STAT_DRQ | STAT_ERR,
+ "write block", "data wait") & STAT_ERR) {
+ pi_disconnect(pf_current->pi);
+ if (pf_retries < PF_MAX_RETRIES) {
+ pf_retries++;
+ pi_do_claimed(pf_current->pi, do_pf_write_start);
+ return;
+ }
+ next_request(-EIO);
+ return;
+ }
+ pi_write_block(pf_current->pi, pf_buf, 512);
+ if (pf_next_buf())
+ break;
+ }
+ pf_mask = 0;
+ ps_set_intr(do_pf_write_done, pf_ready, PF_TMO, nice);
+}
+
+static void do_pf_write_done(void)
+{
+ if (pf_wait(pf_current, STAT_BUSY, 0, "write block", "done") & STAT_ERR) {
+ pi_disconnect(pf_current->pi);
+ if (pf_retries < PF_MAX_RETRIES) {
+ pf_retries++;
+ pi_do_claimed(pf_current->pi, do_pf_write_start);
+ return;
+ }
+ next_request(-EIO);
+ return;
+ }
+ pi_disconnect(pf_current->pi);
+ next_request(0);
+}
+
+static int __init pf_init(void)
+{ /* preliminary initialisation */
+ struct pf_unit *pf;
+ int unit;
+
+ if (disable)
+ return -EINVAL;
+
+ pf_init_units();
+
+ if (pf_detect())
+ return -ENODEV;
+ pf_busy = 0;
+
+ if (register_blkdev(major, name)) {
+ for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++)
+ put_disk(pf->disk);
+ return -EBUSY;
+ }
+ pf_queue = blk_init_queue(do_pf_request, &pf_spin_lock);
+ if (!pf_queue) {
+ unregister_blkdev(major, name);
+ for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++)
+ put_disk(pf->disk);
+ return -ENOMEM;
+ }
+
+ blk_queue_max_segments(pf_queue, cluster);
+
+ for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
+ struct gendisk *disk = pf->disk;
+
+ if (!pf->present)
+ continue;
+ disk->private_data = pf;
+ disk->queue = pf_queue;
+ add_disk(disk);
+ }
+ return 0;
+}
+
+static void __exit pf_exit(void)
+{
+ struct pf_unit *pf;
+ int unit;
+ unregister_blkdev(major, name);
+ for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
+ if (!pf->present)
+ continue;
+ del_gendisk(pf->disk);
+ put_disk(pf->disk);
+ pi_release(pf->pi);
+ }
+ blk_cleanup_queue(pf_queue);
+}
+
+MODULE_LICENSE("GPL");
+module_init(pf_init)
+module_exit(pf_exit)
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
new file mode 100644
index 000000000..876d0c3ea
--- /dev/null
+++ b/drivers/block/paride/pg.c
@@ -0,0 +1,726 @@
+/*
+ pg.c (c) 1998 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ The pg driver provides a simple character device interface for
+ sending ATAPI commands to a device. With the exception of the
+ ATAPI reset operation, all operations are performed by a pair
+ of read and write operations to the appropriate /dev/pgN device.
+ A write operation delivers a command and any outbound data in
+ a single buffer. Normally, the write will succeed unless the
+ device is offline or malfunctioning, or there is already another
+ command pending. If the write succeeds, it should be followed
+ immediately by a read operation, to obtain any returned data and
+ status information. A read will fail if there is no operation
+ in progress.
+
+ As a special case, the device can be reset with a write operation,
+ and in this case, no following read is expected, or permitted.
+
+ There are no ioctl() operations. Any single operation
+ may transfer at most PG_MAX_DATA bytes. Note that the driver must
+ copy the data through an internal buffer. In keeping with all
+ current ATAPI devices, command packets are assumed to be exactly
+ 12 bytes in length.
+
+ To permit future changes to this interface, the headers in the
+ read and write buffers contain a single character "magic" flag.
+ Currently this flag must be the character "P".
+
+ By default, the driver will autoprobe for a single parallel
+ port ATAPI device, but if their individual parameters are
+ specified, the driver can handle up to 4 devices.
+
+ To use this device, you must have the following device
+ special files defined:
+
+ /dev/pg0 c 97 0
+ /dev/pg1 c 97 1
+ /dev/pg2 c 97 2
+ /dev/pg3 c 97 3
+
+ (You'll need to change the 97 to something else if you use
+ the 'major' parameter to install the driver on a different
+ major number.)
+
+ The behaviour of the pg driver can be altered by setting
+ some parameters from the insmod command line. The following
+ parameters are adjustable:
+
+ drive0 These four arguments can be arrays of
+ drive1 1-6 integers as follows:
+ drive2
+ drive3 <prt>,<pro>,<uni>,<mod>,<slv>,<dly>
+
+ Where,
+
+ <prt> is the base of the parallel port address for
+ the corresponding drive. (required)
+
+ <pro> is the protocol number for the adapter that
+ supports this drive. These numbers are
+ logged by 'paride' when the protocol modules
+ are initialised. (0 if not given)
+
+ <uni> for those adapters that support chained
+ devices, this is the unit selector for the
+ chain of devices on the given port. It should
+ be zero for devices that don't support chaining.
+ (0 if not given)
+
+ <mod> this can be -1 to choose the best mode, or one
+ of the mode numbers supported by the adapter.
+ (-1 if not given)
+
+ <slv> ATAPI devices can be jumpered to master or slave.
+ Set this to 0 to choose the master drive, 1 to
+ choose the slave, -1 (the default) to choose the
+ first drive found.
+
+ <dly> some parallel ports require the driver to
+ go more slowly. -1 sets a default value that
+ should work with the chosen protocol. Otherwise,
+ set this to a small integer, the larger it is
+ the slower the port i/o. In some cases, setting
+ this to zero will speed up the device. (default -1)
+
+ major You may use this parameter to overide the
+ default major number (97) that this driver
+ will use. Be sure to change the device
+ name as well.
+
+ name This parameter is a character string that
+ contains the name the kernel will use for this
+ device (in /proc output, for instance).
+ (default "pg").
+
+ verbose This parameter controls the amount of logging
+ that is done by the driver. Set it to 0 for
+ quiet operation, to 1 to enable progress
+ messages while the driver probes for devices,
+ or to 2 for full debug logging. (default 0)
+
+ If this driver is built into the kernel, you can use
+ the following command line parameters, with the same values
+ as the corresponding module parameters listed above:
+
+ pg.drive0
+ pg.drive1
+ pg.drive2
+ pg.drive3
+
+ In addition, you can use the parameter pg.disable to disable
+ the driver entirely.
+
+*/
+
+/* Changes:
+
+ 1.01 GRG 1998.06.16 Bug fixes
+ 1.02 GRG 1998.09.24 Added jumbo support
+
+*/
+
+#define PG_VERSION "1.02"
+#define PG_MAJOR 97
+#define PG_NAME "pg"
+#define PG_UNITS 4
+
+#ifndef PI_PG
+#define PI_PG 4
+#endif
+
+#include <linux/types.h>
+/* Here are things one can override from the insmod command.
+ Most are autoprobed by paride unless set here. Verbose is 0
+ by default.
+
+*/
+
+static int verbose;
+static int major = PG_MAJOR;
+static char *name = PG_NAME;
+static int disable = 0;
+
+static int drive0[6] = { 0, 0, 0, -1, -1, -1 };
+static int drive1[6] = { 0, 0, 0, -1, -1, -1 };
+static int drive2[6] = { 0, 0, 0, -1, -1, -1 };
+static int drive3[6] = { 0, 0, 0, -1, -1, -1 };
+
+static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3};
+static int pg_drive_count;
+
+enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
+
+/* end of parameters */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/mtio.h>
+#include <linux/pg.h>
+#include <linux/device.h>
+#include <linux/sched.h> /* current, TASK_* */
+#include <linux/mutex.h>
+#include <linux/jiffies.h>
+
+#include <asm/uaccess.h>
+
+module_param(verbose, int, 0644);
+module_param(major, int, 0);
+module_param(name, charp, 0);
+module_param_array(drive0, int, NULL, 0);
+module_param_array(drive1, int, NULL, 0);
+module_param_array(drive2, int, NULL, 0);
+module_param_array(drive3, int, NULL, 0);
+
+#include "paride.h"
+
+#define PG_SPIN_DEL 50 /* spin delay in micro-seconds */
+#define PG_SPIN 200
+#define PG_TMO HZ
+#define PG_RESET_TMO 10*HZ
+
+#define STAT_ERR 0x01
+#define STAT_INDEX 0x02
+#define STAT_ECC 0x04
+#define STAT_DRQ 0x08
+#define STAT_SEEK 0x10
+#define STAT_WRERR 0x20
+#define STAT_READY 0x40
+#define STAT_BUSY 0x80
+
+#define ATAPI_IDENTIFY 0x12
+
+static DEFINE_MUTEX(pg_mutex);
+static int pg_open(struct inode *inode, struct file *file);
+static int pg_release(struct inode *inode, struct file *file);
+static ssize_t pg_read(struct file *filp, char __user *buf,
+ size_t count, loff_t * ppos);
+static ssize_t pg_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t * ppos);
+static int pg_detect(void);
+
+#define PG_NAMELEN 8
+
+struct pg {
+ struct pi_adapter pia; /* interface to paride layer */
+ struct pi_adapter *pi;
+ int busy; /* write done, read expected */
+ int start; /* jiffies at command start */
+ int dlen; /* transfer size requested */
+ unsigned long timeout; /* timeout requested */
+ int status; /* last sense key */
+ int drive; /* drive */
+ unsigned long access; /* count of active opens ... */
+ int present; /* device present ? */
+ char *bufptr;
+ char name[PG_NAMELEN]; /* pg0, pg1, ... */
+};
+
+static struct pg devices[PG_UNITS];
+
+static int pg_identify(struct pg *dev, int log);
+
+static char pg_scratch[512]; /* scratch block buffer */
+
+static struct class *pg_class;
+
+/* kernel glue structures */
+
+static const struct file_operations pg_fops = {
+ .owner = THIS_MODULE,
+ .read = pg_read,
+ .write = pg_write,
+ .open = pg_open,
+ .release = pg_release,
+ .llseek = noop_llseek,
+};
+
+static void pg_init_units(void)
+{
+ int unit;
+
+ pg_drive_count = 0;
+ for (unit = 0; unit < PG_UNITS; unit++) {
+ int *parm = *drives[unit];
+ struct pg *dev = &devices[unit];
+ dev->pi = &dev->pia;
+ clear_bit(0, &dev->access);
+ dev->busy = 0;
+ dev->present = 0;
+ dev->bufptr = NULL;
+ dev->drive = parm[D_SLV];
+ snprintf(dev->name, PG_NAMELEN, "%s%c", name, 'a'+unit);
+ if (parm[D_PRT])
+ pg_drive_count++;
+ }
+}
+
+static inline int status_reg(struct pg *dev)
+{
+ return pi_read_regr(dev->pi, 1, 6);
+}
+
+static inline int read_reg(struct pg *dev, int reg)
+{
+ return pi_read_regr(dev->pi, 0, reg);
+}
+
+static inline void write_reg(struct pg *dev, int reg, int val)
+{
+ pi_write_regr(dev->pi, 0, reg, val);
+}
+
+static inline u8 DRIVE(struct pg *dev)
+{
+ return 0xa0+0x10*dev->drive;
+}
+
+static void pg_sleep(int cs)
+{
+ schedule_timeout_interruptible(cs);
+}
+
+static int pg_wait(struct pg *dev, int go, int stop, unsigned long tmo, char *msg)
+{
+ int j, r, e, s, p, to;
+
+ dev->status = 0;
+
+ j = 0;
+ while ((((r = status_reg(dev)) & go) || (stop && (!(r & stop))))
+ && time_before(jiffies, tmo)) {
+ if (j++ < PG_SPIN)
+ udelay(PG_SPIN_DEL);
+ else
+ pg_sleep(1);
+ }
+
+ to = time_after_eq(jiffies, tmo);
+
+ if ((r & (STAT_ERR & stop)) || to) {
+ s = read_reg(dev, 7);
+ e = read_reg(dev, 1);
+ p = read_reg(dev, 2);
+ if (verbose > 1)
+ printk("%s: %s: stat=0x%x err=0x%x phase=%d%s\n",
+ dev->name, msg, s, e, p, to ? " timeout" : "");
+ if (to)
+ e |= 0x100;
+ dev->status = (e >> 4) & 0xff;
+ return -1;
+ }
+ return 0;
+}
+
+static int pg_command(struct pg *dev, char *cmd, int dlen, unsigned long tmo)
+{
+ int k;
+
+ pi_connect(dev->pi);
+
+ write_reg(dev, 6, DRIVE(dev));
+
+ if (pg_wait(dev, STAT_BUSY | STAT_DRQ, 0, tmo, "before command"))
+ goto fail;
+
+ write_reg(dev, 4, dlen % 256);
+ write_reg(dev, 5, dlen / 256);
+ write_reg(dev, 7, 0xa0); /* ATAPI packet command */
+
+ if (pg_wait(dev, STAT_BUSY, STAT_DRQ, tmo, "command DRQ"))
+ goto fail;
+
+ if (read_reg(dev, 2) != 1) {
+ printk("%s: command phase error\n", dev->name);
+ goto fail;
+ }
+
+ pi_write_block(dev->pi, cmd, 12);
+
+ if (verbose > 1) {
+ printk("%s: Command sent, dlen=%d packet= ", dev->name, dlen);
+ for (k = 0; k < 12; k++)
+ printk("%02x ", cmd[k] & 0xff);
+ printk("\n");
+ }
+ return 0;
+fail:
+ pi_disconnect(dev->pi);
+ return -1;
+}
+
+static int pg_completion(struct pg *dev, char *buf, unsigned long tmo)
+{
+ int r, d, n, p;
+
+ r = pg_wait(dev, STAT_BUSY, STAT_DRQ | STAT_READY | STAT_ERR,
+ tmo, "completion");
+
+ dev->dlen = 0;
+
+ while (read_reg(dev, 7) & STAT_DRQ) {
+ d = (read_reg(dev, 4) + 256 * read_reg(dev, 5));
+ n = ((d + 3) & 0xfffc);
+ p = read_reg(dev, 2) & 3;
+ if (p == 0)
+ pi_write_block(dev->pi, buf, n);
+ if (p == 2)
+ pi_read_block(dev->pi, buf, n);
+ if (verbose > 1)
+ printk("%s: %s %d bytes\n", dev->name,
+ p ? "Read" : "Write", n);
+ dev->dlen += (1 - p) * d;
+ buf += d;
+ r = pg_wait(dev, STAT_BUSY, STAT_DRQ | STAT_READY | STAT_ERR,
+ tmo, "completion");
+ }
+
+ pi_disconnect(dev->pi);
+
+ return r;
+}
+
+static int pg_reset(struct pg *dev)
+{
+ int i, k, err;
+ int expect[5] = { 1, 1, 1, 0x14, 0xeb };
+ int got[5];
+
+ pi_connect(dev->pi);
+ write_reg(dev, 6, DRIVE(dev));
+ write_reg(dev, 7, 8);
+
+ pg_sleep(20 * HZ / 1000);
+
+ k = 0;
+ while ((k++ < PG_RESET_TMO) && (status_reg(dev) & STAT_BUSY))
+ pg_sleep(1);
+
+ for (i = 0; i < 5; i++)
+ got[i] = read_reg(dev, i + 1);
+
+ err = memcmp(expect, got, sizeof(got)) ? -1 : 0;
+
+ if (verbose) {
+ printk("%s: Reset (%d) signature = ", dev->name, k);
+ for (i = 0; i < 5; i++)
+ printk("%3x", got[i]);
+ if (err)
+ printk(" (incorrect)");
+ printk("\n");
+ }
+
+ pi_disconnect(dev->pi);
+ return err;
+}
+
+static void xs(char *buf, char *targ, int len)
+{
+ char l = '\0';
+ int k;
+
+ for (k = 0; k < len; k++) {
+ char c = *buf++;
+ if (c != ' ' && c != l)
+ l = *targ++ = c;
+ }
+ if (l == ' ')
+ targ--;
+ *targ = '\0';
+}
+
+static int pg_identify(struct pg *dev, int log)
+{
+ int s;
+ char *ms[2] = { "master", "slave" };
+ char mf[10], id[18];
+ char id_cmd[12] = { ATAPI_IDENTIFY, 0, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0 };
+ char buf[36];
+
+ s = pg_command(dev, id_cmd, 36, jiffies + PG_TMO);
+ if (s)
+ return -1;
+ s = pg_completion(dev, buf, jiffies + PG_TMO);
+ if (s)
+ return -1;
+
+ if (log) {
+ xs(buf + 8, mf, 8);
+ xs(buf + 16, id, 16);
+ printk("%s: %s %s, %s\n", dev->name, mf, id, ms[dev->drive]);
+ }
+
+ return 0;
+}
+
+/*
+ * returns 0, with id set if drive is detected
+ * -1, if drive detection failed
+ */
+static int pg_probe(struct pg *dev)
+{
+ if (dev->drive == -1) {
+ for (dev->drive = 0; dev->drive <= 1; dev->drive++)
+ if (!pg_reset(dev))
+ return pg_identify(dev, 1);
+ } else {
+ if (!pg_reset(dev))
+ return pg_identify(dev, 1);
+ }
+ return -1;
+}
+
+static int pg_detect(void)
+{
+ struct pg *dev = &devices[0];
+ int k, unit;
+
+ printk("%s: %s version %s, major %d\n", name, name, PG_VERSION, major);
+
+ k = 0;
+ if (pg_drive_count == 0) {
+ if (pi_init(dev->pi, 1, -1, -1, -1, -1, -1, pg_scratch,
+ PI_PG, verbose, dev->name)) {
+ if (!pg_probe(dev)) {
+ dev->present = 1;
+ k++;
+ } else
+ pi_release(dev->pi);
+ }
+
+ } else
+ for (unit = 0; unit < PG_UNITS; unit++, dev++) {
+ int *parm = *drives[unit];
+ if (!parm[D_PRT])
+ continue;
+ if (pi_init(dev->pi, 0, parm[D_PRT], parm[D_MOD],
+ parm[D_UNI], parm[D_PRO], parm[D_DLY],
+ pg_scratch, PI_PG, verbose, dev->name)) {
+ if (!pg_probe(dev)) {
+ dev->present = 1;
+ k++;
+ } else
+ pi_release(dev->pi);
+ }
+ }
+
+ if (k)
+ return 0;
+
+ printk("%s: No ATAPI device detected\n", name);
+ return -1;
+}
+
+static int pg_open(struct inode *inode, struct file *file)
+{
+ int unit = iminor(inode) & 0x7f;
+ struct pg *dev = &devices[unit];
+ int ret = 0;
+
+ mutex_lock(&pg_mutex);
+ if ((unit >= PG_UNITS) || (!dev->present)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (test_and_set_bit(0, &dev->access)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ if (dev->busy) {
+ pg_reset(dev);
+ dev->busy = 0;
+ }
+
+ pg_identify(dev, (verbose > 1));
+
+ dev->bufptr = kmalloc(PG_MAX_DATA, GFP_KERNEL);
+ if (dev->bufptr == NULL) {
+ clear_bit(0, &dev->access);
+ printk("%s: buffer allocation failed\n", dev->name);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ file->private_data = dev;
+
+out:
+ mutex_unlock(&pg_mutex);
+ return ret;
+}
+
+static int pg_release(struct inode *inode, struct file *file)
+{
+ struct pg *dev = file->private_data;
+
+ kfree(dev->bufptr);
+ dev->bufptr = NULL;
+ clear_bit(0, &dev->access);
+
+ return 0;
+}
+
+static ssize_t pg_write(struct file *filp, const char __user *buf, size_t count, loff_t *ppos)
+{
+ struct pg *dev = filp->private_data;
+ struct pg_write_hdr hdr;
+ int hs = sizeof (hdr);
+
+ if (dev->busy)
+ return -EBUSY;
+ if (count < hs)
+ return -EINVAL;
+
+ if (copy_from_user(&hdr, buf, hs))
+ return -EFAULT;
+
+ if (hdr.magic != PG_MAGIC)
+ return -EINVAL;
+ if (hdr.dlen < 0 || hdr.dlen > PG_MAX_DATA)
+ return -EINVAL;
+ if ((count - hs) > PG_MAX_DATA)
+ return -EINVAL;
+
+ if (hdr.func == PG_RESET) {
+ if (count != hs)
+ return -EINVAL;
+ if (pg_reset(dev))
+ return -EIO;
+ return count;
+ }
+
+ if (hdr.func != PG_COMMAND)
+ return -EINVAL;
+
+ dev->start = jiffies;
+ dev->timeout = hdr.timeout * HZ + HZ / 2 + jiffies;
+
+ if (pg_command(dev, hdr.packet, hdr.dlen, jiffies + PG_TMO)) {
+ if (dev->status & 0x10)
+ return -ETIME;
+ return -EIO;
+ }
+
+ dev->busy = 1;
+
+ if (copy_from_user(dev->bufptr, buf + hs, count - hs))
+ return -EFAULT;
+ return count;
+}
+
+static ssize_t pg_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
+{
+ struct pg *dev = filp->private_data;
+ struct pg_read_hdr hdr;
+ int hs = sizeof (hdr);
+ int copy;
+
+ if (!dev->busy)
+ return -EINVAL;
+ if (count < hs)
+ return -EINVAL;
+
+ dev->busy = 0;
+
+ if (pg_completion(dev, dev->bufptr, dev->timeout))
+ if (dev->status & 0x10)
+ return -ETIME;
+
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.magic = PG_MAGIC;
+ hdr.dlen = dev->dlen;
+ copy = 0;
+
+ if (hdr.dlen < 0) {
+ hdr.dlen = -1 * hdr.dlen;
+ copy = hdr.dlen;
+ if (copy > (count - hs))
+ copy = count - hs;
+ }
+
+ hdr.duration = (jiffies - dev->start + HZ / 2) / HZ;
+ hdr.scsi = dev->status & 0x0f;
+
+ if (copy_to_user(buf, &hdr, hs))
+ return -EFAULT;
+ if (copy > 0)
+ if (copy_to_user(buf + hs, dev->bufptr, copy))
+ return -EFAULT;
+ return copy + hs;
+}
+
+static int __init pg_init(void)
+{
+ int unit;
+ int err;
+
+ if (disable){
+ err = -EINVAL;
+ goto out;
+ }
+
+ pg_init_units();
+
+ if (pg_detect()) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ err = register_chrdev(major, name, &pg_fops);
+ if (err < 0) {
+ printk("pg_init: unable to get major number %d\n", major);
+ for (unit = 0; unit < PG_UNITS; unit++) {
+ struct pg *dev = &devices[unit];
+ if (dev->present)
+ pi_release(dev->pi);
+ }
+ goto out;
+ }
+ major = err; /* In case the user specified `major=0' (dynamic) */
+ pg_class = class_create(THIS_MODULE, "pg");
+ if (IS_ERR(pg_class)) {
+ err = PTR_ERR(pg_class);
+ goto out_chrdev;
+ }
+ for (unit = 0; unit < PG_UNITS; unit++) {
+ struct pg *dev = &devices[unit];
+ if (dev->present)
+ device_create(pg_class, NULL, MKDEV(major, unit), NULL,
+ "pg%u", unit);
+ }
+ err = 0;
+ goto out;
+
+out_chrdev:
+ unregister_chrdev(major, "pg");
+out:
+ return err;
+}
+
+static void __exit pg_exit(void)
+{
+ int unit;
+
+ for (unit = 0; unit < PG_UNITS; unit++) {
+ struct pg *dev = &devices[unit];
+ if (dev->present)
+ device_destroy(pg_class, MKDEV(major, unit));
+ }
+ class_destroy(pg_class);
+ unregister_chrdev(major, name);
+
+ for (unit = 0; unit < PG_UNITS; unit++) {
+ struct pg *dev = &devices[unit];
+ if (dev->present)
+ pi_release(dev->pi);
+ }
+}
+
+MODULE_LICENSE("GPL");
+module_init(pg_init)
+module_exit(pg_exit)
diff --git a/drivers/block/paride/ppc6lnx.c b/drivers/block/paride/ppc6lnx.c
new file mode 100644
index 000000000..5e5521d3b
--- /dev/null
+++ b/drivers/block/paride/ppc6lnx.c
@@ -0,0 +1,726 @@
+/*
+ ppc6lnx.c (c) 2001 Micro Solutions Inc.
+ Released under the terms of the GNU General Public license
+
+ ppc6lnx.c is a par of the protocol driver for the Micro Solutions
+ "BACKPACK" parallel port IDE adapter
+ (Works on Series 6 drives)
+
+*/
+
+//***************************************************************************
+
+// PPC 6 Code in C sanitized for LINUX
+// Original x86 ASM by Ron, Converted to C by Clive
+
+//***************************************************************************
+
+
+#define port_stb 1
+#define port_afd 2
+#define cmd_stb port_afd
+#define port_init 4
+#define data_stb port_init
+#define port_sel 8
+#define port_int 16
+#define port_dir 0x20
+
+#define ECR_EPP 0x80
+#define ECR_BI 0x20
+
+//***************************************************************************
+
+// 60772 Commands
+
+#define ACCESS_REG 0x00
+#define ACCESS_PORT 0x40
+
+#define ACCESS_READ 0x00
+#define ACCESS_WRITE 0x20
+
+// 60772 Command Prefix
+
+#define CMD_PREFIX_SET 0xe0 // Special command that modifies the next command's operation
+#define CMD_PREFIX_RESET 0xc0 // Resets current cmd modifier reg bits
+ #define PREFIX_IO16 0x01 // perform 16-bit wide I/O
+ #define PREFIX_FASTWR 0x04 // enable PPC mode fast-write
+ #define PREFIX_BLK 0x08 // enable block transfer mode
+
+// 60772 Registers
+
+#define REG_STATUS 0x00 // status register
+ #define STATUS_IRQA 0x01 // Peripheral IRQA line
+ #define STATUS_EEPROM_DO 0x40 // Serial EEPROM data bit
+#define REG_VERSION 0x01 // PPC version register (read)
+#define REG_HWCFG 0x02 // Hardware Config register
+#define REG_RAMSIZE 0x03 // Size of RAM Buffer
+ #define RAMSIZE_128K 0x02
+#define REG_EEPROM 0x06 // EEPROM control register
+ #define EEPROM_SK 0x01 // eeprom SK bit
+ #define EEPROM_DI 0x02 // eeprom DI bit
+ #define EEPROM_CS 0x04 // eeprom CS bit
+ #define EEPROM_EN 0x08 // eeprom output enable
+#define REG_BLKSIZE 0x08 // Block transfer len (24 bit)
+
+//***************************************************************************
+
+typedef struct ppc_storage {
+ u16 lpt_addr; // LPT base address
+ u8 ppc_id;
+ u8 mode; // operating mode
+ // 0 = PPC Uni SW
+ // 1 = PPC Uni FW
+ // 2 = PPC Bi SW
+ // 3 = PPC Bi FW
+ // 4 = EPP Byte
+ // 5 = EPP Word
+ // 6 = EPP Dword
+ u8 ppc_flags;
+ u8 org_data; // original LPT data port contents
+ u8 org_ctrl; // original LPT control port contents
+ u8 cur_ctrl; // current control port contents
+} Interface;
+
+//***************************************************************************
+
+// ppc_flags
+
+#define fifo_wait 0x10
+
+//***************************************************************************
+
+// DONT CHANGE THESE LEST YOU BREAK EVERYTHING - BIT FIELD DEPENDENCIES
+
+#define PPCMODE_UNI_SW 0
+#define PPCMODE_UNI_FW 1
+#define PPCMODE_BI_SW 2
+#define PPCMODE_BI_FW 3
+#define PPCMODE_EPP_BYTE 4
+#define PPCMODE_EPP_WORD 5
+#define PPCMODE_EPP_DWORD 6
+
+//***************************************************************************
+
+static int ppc6_select(Interface *ppc);
+static void ppc6_deselect(Interface *ppc);
+static void ppc6_send_cmd(Interface *ppc, u8 cmd);
+static void ppc6_wr_data_byte(Interface *ppc, u8 data);
+static u8 ppc6_rd_data_byte(Interface *ppc);
+static u8 ppc6_rd_port(Interface *ppc, u8 port);
+static void ppc6_wr_port(Interface *ppc, u8 port, u8 data);
+static void ppc6_rd_data_blk(Interface *ppc, u8 *data, long count);
+static void ppc6_wait_for_fifo(Interface *ppc);
+static void ppc6_wr_data_blk(Interface *ppc, u8 *data, long count);
+static void ppc6_rd_port16_blk(Interface *ppc, u8 port, u8 *data, long length);
+static void ppc6_wr_port16_blk(Interface *ppc, u8 port, u8 *data, long length);
+static void ppc6_wr_extout(Interface *ppc, u8 regdata);
+static int ppc6_open(Interface *ppc);
+static void ppc6_close(Interface *ppc);
+
+//***************************************************************************
+
+static int ppc6_select(Interface *ppc)
+{
+ u8 i, j, k;
+
+ i = inb(ppc->lpt_addr + 1);
+
+ if (i & 1)
+ outb(i, ppc->lpt_addr + 1);
+
+ ppc->org_data = inb(ppc->lpt_addr);
+
+ ppc->org_ctrl = inb(ppc->lpt_addr + 2) & 0x5F; // readback ctrl
+
+ ppc->cur_ctrl = ppc->org_ctrl;
+
+ ppc->cur_ctrl |= port_sel;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ if (ppc->org_data == 'b')
+ outb('x', ppc->lpt_addr);
+
+ outb('b', ppc->lpt_addr);
+ outb('p', ppc->lpt_addr);
+ outb(ppc->ppc_id, ppc->lpt_addr);
+ outb(~ppc->ppc_id,ppc->lpt_addr);
+
+ ppc->cur_ctrl &= ~port_sel;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ ppc->cur_ctrl = (ppc->cur_ctrl & port_int) | port_init;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ i = ppc->mode & 0x0C;
+
+ if (i == 0)
+ i = (ppc->mode & 2) | 1;
+
+ outb(i, ppc->lpt_addr);
+
+ ppc->cur_ctrl |= port_sel;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ // DELAY
+
+ ppc->cur_ctrl |= port_afd;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ j = ((i & 0x08) << 4) | ((i & 0x07) << 3);
+
+ k = inb(ppc->lpt_addr + 1) & 0xB8;
+
+ if (j == k)
+ {
+ ppc->cur_ctrl &= ~port_afd;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ k = (inb(ppc->lpt_addr + 1) & 0xB8) ^ 0xB8;
+
+ if (j == k)
+ {
+ if (i & 4) // EPP
+ ppc->cur_ctrl &= ~(port_sel | port_init);
+ else // PPC/ECP
+ ppc->cur_ctrl &= ~port_sel;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ return(1);
+ }
+ }
+
+ outb(ppc->org_ctrl, ppc->lpt_addr + 2);
+
+ outb(ppc->org_data, ppc->lpt_addr);
+
+ return(0); // FAIL
+}
+
+//***************************************************************************
+
+static void ppc6_deselect(Interface *ppc)
+{
+ if (ppc->mode & 4) // EPP
+ ppc->cur_ctrl |= port_init;
+ else // PPC/ECP
+ ppc->cur_ctrl |= port_sel;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ outb(ppc->org_data, ppc->lpt_addr);
+
+ outb((ppc->org_ctrl | port_sel), ppc->lpt_addr + 2);
+
+ outb(ppc->org_ctrl, ppc->lpt_addr + 2);
+}
+
+//***************************************************************************
+
+static void ppc6_send_cmd(Interface *ppc, u8 cmd)
+{
+ switch(ppc->mode)
+ {
+ case PPCMODE_UNI_SW :
+ case PPCMODE_UNI_FW :
+ case PPCMODE_BI_SW :
+ case PPCMODE_BI_FW :
+ {
+ outb(cmd, ppc->lpt_addr);
+
+ ppc->cur_ctrl ^= cmd_stb;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ break;
+ }
+
+ case PPCMODE_EPP_BYTE :
+ case PPCMODE_EPP_WORD :
+ case PPCMODE_EPP_DWORD :
+ {
+ outb(cmd, ppc->lpt_addr + 3);
+
+ break;
+ }
+ }
+}
+
+//***************************************************************************
+
+static void ppc6_wr_data_byte(Interface *ppc, u8 data)
+{
+ switch(ppc->mode)
+ {
+ case PPCMODE_UNI_SW :
+ case PPCMODE_UNI_FW :
+ case PPCMODE_BI_SW :
+ case PPCMODE_BI_FW :
+ {
+ outb(data, ppc->lpt_addr);
+
+ ppc->cur_ctrl ^= data_stb;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ break;
+ }
+
+ case PPCMODE_EPP_BYTE :
+ case PPCMODE_EPP_WORD :
+ case PPCMODE_EPP_DWORD :
+ {
+ outb(data, ppc->lpt_addr + 4);
+
+ break;
+ }
+ }
+}
+
+//***************************************************************************
+
+static u8 ppc6_rd_data_byte(Interface *ppc)
+{
+ u8 data = 0;
+
+ switch(ppc->mode)
+ {
+ case PPCMODE_UNI_SW :
+ case PPCMODE_UNI_FW :
+ {
+ ppc->cur_ctrl = (ppc->cur_ctrl & ~port_stb) ^ data_stb;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ // DELAY
+
+ data = inb(ppc->lpt_addr + 1);
+
+ data = ((data & 0x80) >> 1) | ((data & 0x38) >> 3);
+
+ ppc->cur_ctrl |= port_stb;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ // DELAY
+
+ data |= inb(ppc->lpt_addr + 1) & 0xB8;
+
+ break;
+ }
+
+ case PPCMODE_BI_SW :
+ case PPCMODE_BI_FW :
+ {
+ ppc->cur_ctrl |= port_dir;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ ppc->cur_ctrl = (ppc->cur_ctrl | port_stb) ^ data_stb;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ data = inb(ppc->lpt_addr);
+
+ ppc->cur_ctrl &= ~port_stb;
+
+ outb(ppc->cur_ctrl,ppc->lpt_addr + 2);
+
+ ppc->cur_ctrl &= ~port_dir;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ break;
+ }
+
+ case PPCMODE_EPP_BYTE :
+ case PPCMODE_EPP_WORD :
+ case PPCMODE_EPP_DWORD :
+ {
+ outb((ppc->cur_ctrl | port_dir),ppc->lpt_addr + 2);
+
+ data = inb(ppc->lpt_addr + 4);
+
+ outb(ppc->cur_ctrl,ppc->lpt_addr + 2);
+
+ break;
+ }
+ }
+
+ return(data);
+}
+
+//***************************************************************************
+
+static u8 ppc6_rd_port(Interface *ppc, u8 port)
+{
+ ppc6_send_cmd(ppc,(u8)(port | ACCESS_PORT | ACCESS_READ));
+
+ return(ppc6_rd_data_byte(ppc));
+}
+
+//***************************************************************************
+
+static void ppc6_wr_port(Interface *ppc, u8 port, u8 data)
+{
+ ppc6_send_cmd(ppc,(u8)(port | ACCESS_PORT | ACCESS_WRITE));
+
+ ppc6_wr_data_byte(ppc, data);
+}
+
+//***************************************************************************
+
+static void ppc6_rd_data_blk(Interface *ppc, u8 *data, long count)
+{
+ switch(ppc->mode)
+ {
+ case PPCMODE_UNI_SW :
+ case PPCMODE_UNI_FW :
+ {
+ while(count)
+ {
+ u8 d;
+
+ ppc->cur_ctrl = (ppc->cur_ctrl & ~port_stb) ^ data_stb;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ // DELAY
+
+ d = inb(ppc->lpt_addr + 1);
+
+ d = ((d & 0x80) >> 1) | ((d & 0x38) >> 3);
+
+ ppc->cur_ctrl |= port_stb;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ // DELAY
+
+ d |= inb(ppc->lpt_addr + 1) & 0xB8;
+
+ *data++ = d;
+ count--;
+ }
+
+ break;
+ }
+
+ case PPCMODE_BI_SW :
+ case PPCMODE_BI_FW :
+ {
+ ppc->cur_ctrl |= port_dir;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ ppc->cur_ctrl |= port_stb;
+
+ while(count)
+ {
+ ppc->cur_ctrl ^= data_stb;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ *data++ = inb(ppc->lpt_addr);
+ count--;
+ }
+
+ ppc->cur_ctrl &= ~port_stb;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ ppc->cur_ctrl &= ~port_dir;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ break;
+ }
+
+ case PPCMODE_EPP_BYTE :
+ {
+ outb((ppc->cur_ctrl | port_dir), ppc->lpt_addr + 2);
+
+ // DELAY
+
+ while(count)
+ {
+ *data++ = inb(ppc->lpt_addr + 4);
+ count--;
+ }
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ break;
+ }
+
+ case PPCMODE_EPP_WORD :
+ {
+ outb((ppc->cur_ctrl | port_dir), ppc->lpt_addr + 2);
+
+ // DELAY
+
+ while(count > 1)
+ {
+ *((u16 *)data) = inw(ppc->lpt_addr + 4);
+ data += 2;
+ count -= 2;
+ }
+
+ while(count)
+ {
+ *data++ = inb(ppc->lpt_addr + 4);
+ count--;
+ }
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ break;
+ }
+
+ case PPCMODE_EPP_DWORD :
+ {
+ outb((ppc->cur_ctrl | port_dir),ppc->lpt_addr + 2);
+
+ // DELAY
+
+ while(count > 3)
+ {
+ *((u32 *)data) = inl(ppc->lpt_addr + 4);
+ data += 4;
+ count -= 4;
+ }
+
+ while(count)
+ {
+ *data++ = inb(ppc->lpt_addr + 4);
+ count--;
+ }
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ break;
+ }
+ }
+
+}
+
+//***************************************************************************
+
+static void ppc6_wait_for_fifo(Interface *ppc)
+{
+ int i;
+
+ if (ppc->ppc_flags & fifo_wait)
+ {
+ for(i=0; i<20; i++)
+ inb(ppc->lpt_addr + 1);
+ }
+}
+
+//***************************************************************************
+
+static void ppc6_wr_data_blk(Interface *ppc, u8 *data, long count)
+{
+ switch(ppc->mode)
+ {
+ case PPCMODE_UNI_SW :
+ case PPCMODE_BI_SW :
+ {
+ while(count--)
+ {
+ outb(*data++, ppc->lpt_addr);
+
+ ppc->cur_ctrl ^= data_stb;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+ }
+
+ break;
+ }
+
+ case PPCMODE_UNI_FW :
+ case PPCMODE_BI_FW :
+ {
+ u8 this, last;
+
+ ppc6_send_cmd(ppc,(CMD_PREFIX_SET | PREFIX_FASTWR));
+
+ ppc->cur_ctrl |= port_stb;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ last = *data;
+
+ outb(last, ppc->lpt_addr);
+
+ while(count)
+ {
+ this = *data++;
+ count--;
+
+ if (this == last)
+ {
+ ppc->cur_ctrl ^= data_stb;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+ }
+ else
+ {
+ outb(this, ppc->lpt_addr);
+
+ last = this;
+ }
+ }
+
+ ppc->cur_ctrl &= ~port_stb;
+
+ outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+ ppc6_send_cmd(ppc,(CMD_PREFIX_RESET | PREFIX_FASTWR));
+
+ break;
+ }
+
+ case PPCMODE_EPP_BYTE :
+ {
+ while(count)
+ {
+ outb(*data++,ppc->lpt_addr + 4);
+ count--;
+ }
+
+ ppc6_wait_for_fifo(ppc);
+
+ break;
+ }
+
+ case PPCMODE_EPP_WORD :
+ {
+ while(count > 1)
+ {
+ outw(*((u16 *)data),ppc->lpt_addr + 4);
+ data += 2;
+ count -= 2;
+ }
+
+ while(count)
+ {
+ outb(*data++,ppc->lpt_addr + 4);
+ count--;
+ }
+
+ ppc6_wait_for_fifo(ppc);
+
+ break;
+ }
+
+ case PPCMODE_EPP_DWORD :
+ {
+ while(count > 3)
+ {
+ outl(*((u32 *)data),ppc->lpt_addr + 4);
+ data += 4;
+ count -= 4;
+ }
+
+ while(count)
+ {
+ outb(*data++,ppc->lpt_addr + 4);
+ count--;
+ }
+
+ ppc6_wait_for_fifo(ppc);
+
+ break;
+ }
+ }
+}
+
+//***************************************************************************
+
+static void ppc6_rd_port16_blk(Interface *ppc, u8 port, u8 *data, long length)
+{
+ length = length << 1;
+
+ ppc6_send_cmd(ppc, (REG_BLKSIZE | ACCESS_REG | ACCESS_WRITE));
+ ppc6_wr_data_byte(ppc,(u8)length);
+ ppc6_wr_data_byte(ppc,(u8)(length >> 8));
+ ppc6_wr_data_byte(ppc,0);
+
+ ppc6_send_cmd(ppc, (CMD_PREFIX_SET | PREFIX_IO16 | PREFIX_BLK));
+
+ ppc6_send_cmd(ppc, (u8)(port | ACCESS_PORT | ACCESS_READ));
+
+ ppc6_rd_data_blk(ppc, data, length);
+
+ ppc6_send_cmd(ppc, (CMD_PREFIX_RESET | PREFIX_IO16 | PREFIX_BLK));
+}
+
+//***************************************************************************
+
+static void ppc6_wr_port16_blk(Interface *ppc, u8 port, u8 *data, long length)
+{
+ length = length << 1;
+
+ ppc6_send_cmd(ppc, (REG_BLKSIZE | ACCESS_REG | ACCESS_WRITE));
+ ppc6_wr_data_byte(ppc,(u8)length);
+ ppc6_wr_data_byte(ppc,(u8)(length >> 8));
+ ppc6_wr_data_byte(ppc,0);
+
+ ppc6_send_cmd(ppc, (CMD_PREFIX_SET | PREFIX_IO16 | PREFIX_BLK));
+
+ ppc6_send_cmd(ppc, (u8)(port | ACCESS_PORT | ACCESS_WRITE));
+
+ ppc6_wr_data_blk(ppc, data, length);
+
+ ppc6_send_cmd(ppc, (CMD_PREFIX_RESET | PREFIX_IO16 | PREFIX_BLK));
+}
+
+//***************************************************************************
+
+static void ppc6_wr_extout(Interface *ppc, u8 regdata)
+{
+ ppc6_send_cmd(ppc,(REG_VERSION | ACCESS_REG | ACCESS_WRITE));
+
+ ppc6_wr_data_byte(ppc, (u8)((regdata & 0x03) << 6));
+}
+
+//***************************************************************************
+
+static int ppc6_open(Interface *ppc)
+{
+ int ret;
+
+ ret = ppc6_select(ppc);
+
+ if (ret == 0)
+ return(ret);
+
+ ppc->ppc_flags &= ~fifo_wait;
+
+ ppc6_send_cmd(ppc, (ACCESS_REG | ACCESS_WRITE | REG_RAMSIZE));
+ ppc6_wr_data_byte(ppc, RAMSIZE_128K);
+
+ ppc6_send_cmd(ppc, (ACCESS_REG | ACCESS_READ | REG_VERSION));
+
+ if ((ppc6_rd_data_byte(ppc) & 0x3F) == 0x0C)
+ ppc->ppc_flags |= fifo_wait;
+
+ return(ret);
+}
+
+//***************************************************************************
+
+static void ppc6_close(Interface *ppc)
+{
+ ppc6_deselect(ppc);
+}
+
+//***************************************************************************
+
diff --git a/drivers/block/paride/pseudo.h b/drivers/block/paride/pseudo.h
new file mode 100644
index 000000000..bc3703294
--- /dev/null
+++ b/drivers/block/paride/pseudo.h
@@ -0,0 +1,102 @@
+/*
+ pseudo.h (c) 1997-8 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ This is the "pseudo-interrupt" logic for parallel port drivers.
+
+ This module is #included into each driver. It makes one
+ function available:
+
+ ps_set_intr( void (*continuation)(void),
+ int (*ready)(void),
+ int timeout,
+ int nice )
+
+ Which will arrange for ready() to be evaluated frequently and
+ when either it returns true, or timeout jiffies have passed,
+ continuation() will be invoked.
+
+ If nice is 1, the test will done approximately once a
+ jiffy. If nice is 0, the test will also be done whenever
+ the scheduler runs (by adding it to a task queue). If
+ nice is greater than 1, the test will be done once every
+ (nice-1) jiffies.
+
+*/
+
+/* Changes:
+
+ 1.01 1998.05.03 Switched from cli()/sti() to spinlocks
+ 1.02 1998.12.14 Added support for nice > 1
+*/
+
+#define PS_VERSION "1.02"
+
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+
+static void ps_tq_int(struct work_struct *work);
+
+static void (* ps_continuation)(void);
+static int (* ps_ready)(void);
+static unsigned long ps_timeout;
+static int ps_tq_active = 0;
+static int ps_nice = 0;
+
+static DEFINE_SPINLOCK(ps_spinlock __attribute__((unused)));
+
+static DECLARE_DELAYED_WORK(ps_tq, ps_tq_int);
+
+static void ps_set_intr(void (*continuation)(void),
+ int (*ready)(void),
+ int timeout, int nice)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ps_spinlock,flags);
+
+ ps_continuation = continuation;
+ ps_ready = ready;
+ ps_timeout = jiffies + timeout;
+ ps_nice = nice;
+
+ if (!ps_tq_active) {
+ ps_tq_active = 1;
+ if (!ps_nice)
+ schedule_delayed_work(&ps_tq, 0);
+ else
+ schedule_delayed_work(&ps_tq, ps_nice-1);
+ }
+ spin_unlock_irqrestore(&ps_spinlock,flags);
+}
+
+static void ps_tq_int(struct work_struct *work)
+{
+ void (*con)(void);
+ unsigned long flags;
+
+ spin_lock_irqsave(&ps_spinlock,flags);
+
+ con = ps_continuation;
+ ps_tq_active = 0;
+
+ if (!con) {
+ spin_unlock_irqrestore(&ps_spinlock,flags);
+ return;
+ }
+ if (!ps_ready || ps_ready() || time_after_eq(jiffies, ps_timeout)) {
+ ps_continuation = NULL;
+ spin_unlock_irqrestore(&ps_spinlock,flags);
+ con();
+ return;
+ }
+ ps_tq_active = 1;
+ if (!ps_nice)
+ schedule_delayed_work(&ps_tq, 0);
+ else
+ schedule_delayed_work(&ps_tq, ps_nice-1);
+ spin_unlock_irqrestore(&ps_spinlock,flags);
+}
+
+/* end of pseudo.h */
+
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
new file mode 100644
index 000000000..2596042eb
--- /dev/null
+++ b/drivers/block/paride/pt.c
@@ -0,0 +1,1016 @@
+/*
+ pt.c (c) 1998 Grant R. Guenther <grant@torque.net>
+ Under the terms of the GNU General Public License.
+
+ This is the high-level driver for parallel port ATAPI tape
+ drives based on chips supported by the paride module.
+
+ The driver implements both rewinding and non-rewinding
+ devices, filemarks, and the rewind ioctl. It allocates
+ a small internal "bounce buffer" for each open device, but
+ otherwise expects buffering and blocking to be done at the
+ user level. As with most block-structured tapes, short
+ writes are padded to full tape blocks, so reading back a file
+ may return more data than was actually written.
+
+ By default, the driver will autoprobe for a single parallel
+ port ATAPI tape drive, but if their individual parameters are
+ specified, the driver can handle up to 4 drives.
+
+ The rewinding devices are named /dev/pt0, /dev/pt1, ...
+ while the non-rewinding devices are /dev/npt0, /dev/npt1, etc.
+
+ The behaviour of the pt driver can be altered by setting
+ some parameters from the insmod command line. The following
+ parameters are adjustable:
+
+ drive0 These four arguments can be arrays of
+ drive1 1-6 integers as follows:
+ drive2
+ drive3 <prt>,<pro>,<uni>,<mod>,<slv>,<dly>
+
+ Where,
+
+ <prt> is the base of the parallel port address for
+ the corresponding drive. (required)
+
+ <pro> is the protocol number for the adapter that
+ supports this drive. These numbers are
+ logged by 'paride' when the protocol modules
+ are initialised. (0 if not given)
+
+ <uni> for those adapters that support chained
+ devices, this is the unit selector for the
+ chain of devices on the given port. It should
+ be zero for devices that don't support chaining.
+ (0 if not given)
+
+ <mod> this can be -1 to choose the best mode, or one
+ of the mode numbers supported by the adapter.
+ (-1 if not given)
+
+ <slv> ATAPI devices can be jumpered to master or slave.
+ Set this to 0 to choose the master drive, 1 to
+ choose the slave, -1 (the default) to choose the
+ first drive found.
+
+ <dly> some parallel ports require the driver to
+ go more slowly. -1 sets a default value that
+ should work with the chosen protocol. Otherwise,
+ set this to a small integer, the larger it is
+ the slower the port i/o. In some cases, setting
+ this to zero will speed up the device. (default -1)
+
+ major You may use this parameter to overide the
+ default major number (96) that this driver
+ will use. Be sure to change the device
+ name as well.
+
+ name This parameter is a character string that
+ contains the name the kernel will use for this
+ device (in /proc output, for instance).
+ (default "pt").
+
+ verbose This parameter controls the amount of logging
+ that the driver will do. Set it to 0 for
+ normal operation, 1 to see autoprobe progress
+ messages, or 2 to see additional debugging
+ output. (default 0)
+
+ If this driver is built into the kernel, you can use
+ the following command line parameters, with the same values
+ as the corresponding module parameters listed above:
+
+ pt.drive0
+ pt.drive1
+ pt.drive2
+ pt.drive3
+
+ In addition, you can use the parameter pt.disable to disable
+ the driver entirely.
+
+*/
+
+/* Changes:
+
+ 1.01 GRG 1998.05.06 Round up transfer size, fix ready_wait,
+ loosed interpretation of ATAPI standard
+ for clearing error status.
+ Eliminate sti();
+ 1.02 GRG 1998.06.16 Eliminate an Ugh.
+ 1.03 GRG 1998.08.15 Adjusted PT_TMO, use HZ in loop timing,
+ extra debugging
+ 1.04 GRG 1998.09.24 Repair minor coding error, added jumbo support
+
+*/
+
+#define PT_VERSION "1.04"
+#define PT_MAJOR 96
+#define PT_NAME "pt"
+#define PT_UNITS 4
+
+#include <linux/types.h>
+
+/* Here are things one can override from the insmod command.
+ Most are autoprobed by paride unless set here. Verbose is on
+ by default.
+
+*/
+
+static bool verbose = 0;
+static int major = PT_MAJOR;
+static char *name = PT_NAME;
+static int disable = 0;
+
+static int drive0[6] = { 0, 0, 0, -1, -1, -1 };
+static int drive1[6] = { 0, 0, 0, -1, -1, -1 };
+static int drive2[6] = { 0, 0, 0, -1, -1, -1 };
+static int drive3[6] = { 0, 0, 0, -1, -1, -1 };
+
+static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3};
+
+#define D_PRT 0
+#define D_PRO 1
+#define D_UNI 2
+#define D_MOD 3
+#define D_SLV 4
+#define D_DLY 5
+
+#define DU (*drives[unit])
+
+/* end of parameters */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/mtio.h>
+#include <linux/device.h>
+#include <linux/sched.h> /* current, TASK_*, schedule_timeout() */
+#include <linux/mutex.h>
+
+#include <asm/uaccess.h>
+
+module_param(verbose, bool, 0);
+module_param(major, int, 0);
+module_param(name, charp, 0);
+module_param_array(drive0, int, NULL, 0);
+module_param_array(drive1, int, NULL, 0);
+module_param_array(drive2, int, NULL, 0);
+module_param_array(drive3, int, NULL, 0);
+
+#include "paride.h"
+
+#define PT_MAX_RETRIES 5
+#define PT_TMO 3000 /* interrupt timeout in jiffies */
+#define PT_SPIN_DEL 50 /* spin delay in micro-seconds */
+#define PT_RESET_TMO 30 /* 30 seconds */
+#define PT_READY_TMO 60 /* 60 seconds */
+#define PT_REWIND_TMO 1200 /* 20 minutes */
+
+#define PT_SPIN ((1000000/(HZ*PT_SPIN_DEL))*PT_TMO)
+
+#define STAT_ERR 0x00001
+#define STAT_INDEX 0x00002
+#define STAT_ECC 0x00004
+#define STAT_DRQ 0x00008
+#define STAT_SEEK 0x00010
+#define STAT_WRERR 0x00020
+#define STAT_READY 0x00040
+#define STAT_BUSY 0x00080
+#define STAT_SENSE 0x1f000
+
+#define ATAPI_TEST_READY 0x00
+#define ATAPI_REWIND 0x01
+#define ATAPI_REQ_SENSE 0x03
+#define ATAPI_READ_6 0x08
+#define ATAPI_WRITE_6 0x0a
+#define ATAPI_WFM 0x10
+#define ATAPI_IDENTIFY 0x12
+#define ATAPI_MODE_SENSE 0x1a
+#define ATAPI_LOG_SENSE 0x4d
+
+static DEFINE_MUTEX(pt_mutex);
+static int pt_open(struct inode *inode, struct file *file);
+static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+static int pt_release(struct inode *inode, struct file *file);
+static ssize_t pt_read(struct file *filp, char __user *buf,
+ size_t count, loff_t * ppos);
+static ssize_t pt_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t * ppos);
+static int pt_detect(void);
+
+/* bits in tape->flags */
+
+#define PT_MEDIA 1
+#define PT_WRITE_OK 2
+#define PT_REWIND 4
+#define PT_WRITING 8
+#define PT_READING 16
+#define PT_EOF 32
+
+#define PT_NAMELEN 8
+#define PT_BUFSIZE 16384
+
+struct pt_unit {
+ struct pi_adapter pia; /* interface to paride layer */
+ struct pi_adapter *pi;
+ int flags; /* various state flags */
+ int last_sense; /* result of last request sense */
+ int drive; /* drive */
+ atomic_t available; /* 1 if access is available 0 otherwise */
+ int bs; /* block size */
+ int capacity; /* Size of tape in KB */
+ int present; /* device present ? */
+ char *bufptr;
+ char name[PT_NAMELEN]; /* pf0, pf1, ... */
+};
+
+static int pt_identify(struct pt_unit *tape);
+
+static struct pt_unit pt[PT_UNITS];
+
+static char pt_scratch[512]; /* scratch block buffer */
+
+/* kernel glue structures */
+
+static const struct file_operations pt_fops = {
+ .owner = THIS_MODULE,
+ .read = pt_read,
+ .write = pt_write,
+ .unlocked_ioctl = pt_ioctl,
+ .open = pt_open,
+ .release = pt_release,
+ .llseek = noop_llseek,
+};
+
+/* sysfs class support */
+static struct class *pt_class;
+
+static inline int status_reg(struct pi_adapter *pi)
+{
+ return pi_read_regr(pi, 1, 6);
+}
+
+static inline int read_reg(struct pi_adapter *pi, int reg)
+{
+ return pi_read_regr(pi, 0, reg);
+}
+
+static inline void write_reg(struct pi_adapter *pi, int reg, int val)
+{
+ pi_write_regr(pi, 0, reg, val);
+}
+
+static inline u8 DRIVE(struct pt_unit *tape)
+{
+ return 0xa0+0x10*tape->drive;
+}
+
+static int pt_wait(struct pt_unit *tape, int go, int stop, char *fun, char *msg)
+{
+ int j, r, e, s, p;
+ struct pi_adapter *pi = tape->pi;
+
+ j = 0;
+ while ((((r = status_reg(pi)) & go) || (stop && (!(r & stop))))
+ && (j++ < PT_SPIN))
+ udelay(PT_SPIN_DEL);
+
+ if ((r & (STAT_ERR & stop)) || (j > PT_SPIN)) {
+ s = read_reg(pi, 7);
+ e = read_reg(pi, 1);
+ p = read_reg(pi, 2);
+ if (j > PT_SPIN)
+ e |= 0x100;
+ if (fun)
+ printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x"
+ " loop=%d phase=%d\n",
+ tape->name, fun, msg, r, s, e, j, p);
+ return (e << 8) + s;
+ }
+ return 0;
+}
+
+static int pt_command(struct pt_unit *tape, char *cmd, int dlen, char *fun)
+{
+ struct pi_adapter *pi = tape->pi;
+ pi_connect(pi);
+
+ write_reg(pi, 6, DRIVE(tape));
+
+ if (pt_wait(tape, STAT_BUSY | STAT_DRQ, 0, fun, "before command")) {
+ pi_disconnect(pi);
+ return -1;
+ }
+
+ write_reg(pi, 4, dlen % 256);
+ write_reg(pi, 5, dlen / 256);
+ write_reg(pi, 7, 0xa0); /* ATAPI packet command */
+
+ if (pt_wait(tape, STAT_BUSY, STAT_DRQ, fun, "command DRQ")) {
+ pi_disconnect(pi);
+ return -1;
+ }
+
+ if (read_reg(pi, 2) != 1) {
+ printk("%s: %s: command phase error\n", tape->name, fun);
+ pi_disconnect(pi);
+ return -1;
+ }
+
+ pi_write_block(pi, cmd, 12);
+
+ return 0;
+}
+
+static int pt_completion(struct pt_unit *tape, char *buf, char *fun)
+{
+ struct pi_adapter *pi = tape->pi;
+ int r, s, n, p;
+
+ r = pt_wait(tape, STAT_BUSY, STAT_DRQ | STAT_READY | STAT_ERR,
+ fun, "completion");
+
+ if (read_reg(pi, 7) & STAT_DRQ) {
+ n = (((read_reg(pi, 4) + 256 * read_reg(pi, 5)) +
+ 3) & 0xfffc);
+ p = read_reg(pi, 2) & 3;
+ if (p == 0)
+ pi_write_block(pi, buf, n);
+ if (p == 2)
+ pi_read_block(pi, buf, n);
+ }
+
+ s = pt_wait(tape, STAT_BUSY, STAT_READY | STAT_ERR, fun, "data done");
+
+ pi_disconnect(pi);
+
+ return (r ? r : s);
+}
+
+static void pt_req_sense(struct pt_unit *tape, int quiet)
+{
+ char rs_cmd[12] = { ATAPI_REQ_SENSE, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0 };
+ char buf[16];
+ int r;
+
+ r = pt_command(tape, rs_cmd, 16, "Request sense");
+ mdelay(1);
+ if (!r)
+ pt_completion(tape, buf, "Request sense");
+
+ tape->last_sense = -1;
+ if (!r) {
+ if (!quiet)
+ printk("%s: Sense key: %x, ASC: %x, ASQ: %x\n",
+ tape->name, buf[2] & 0xf, buf[12], buf[13]);
+ tape->last_sense = (buf[2] & 0xf) | ((buf[12] & 0xff) << 8)
+ | ((buf[13] & 0xff) << 16);
+ }
+}
+
+static int pt_atapi(struct pt_unit *tape, char *cmd, int dlen, char *buf, char *fun)
+{
+ int r;
+
+ r = pt_command(tape, cmd, dlen, fun);
+ mdelay(1);
+ if (!r)
+ r = pt_completion(tape, buf, fun);
+ if (r)
+ pt_req_sense(tape, !fun);
+
+ return r;
+}
+
+static void pt_sleep(int cs)
+{
+ schedule_timeout_interruptible(cs);
+}
+
+static int pt_poll_dsc(struct pt_unit *tape, int pause, int tmo, char *msg)
+{
+ struct pi_adapter *pi = tape->pi;
+ int k, e, s;
+
+ k = 0;
+ e = 0;
+ s = 0;
+ while (k < tmo) {
+ pt_sleep(pause);
+ k++;
+ pi_connect(pi);
+ write_reg(pi, 6, DRIVE(tape));
+ s = read_reg(pi, 7);
+ e = read_reg(pi, 1);
+ pi_disconnect(pi);
+ if (s & (STAT_ERR | STAT_SEEK))
+ break;
+ }
+ if ((k >= tmo) || (s & STAT_ERR)) {
+ if (k >= tmo)
+ printk("%s: %s DSC timeout\n", tape->name, msg);
+ else
+ printk("%s: %s stat=0x%x err=0x%x\n", tape->name, msg, s,
+ e);
+ pt_req_sense(tape, 0);
+ return 0;
+ }
+ return 1;
+}
+
+static void pt_media_access_cmd(struct pt_unit *tape, int tmo, char *cmd, char *fun)
+{
+ if (pt_command(tape, cmd, 0, fun)) {
+ pt_req_sense(tape, 0);
+ return;
+ }
+ pi_disconnect(tape->pi);
+ pt_poll_dsc(tape, HZ, tmo, fun);
+}
+
+static void pt_rewind(struct pt_unit *tape)
+{
+ char rw_cmd[12] = { ATAPI_REWIND, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+ pt_media_access_cmd(tape, PT_REWIND_TMO, rw_cmd, "rewind");
+}
+
+static void pt_write_fm(struct pt_unit *tape)
+{
+ char wm_cmd[12] = { ATAPI_WFM, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 };
+
+ pt_media_access_cmd(tape, PT_TMO, wm_cmd, "write filemark");
+}
+
+#define DBMSG(msg) ((verbose>1)?(msg):NULL)
+
+static int pt_reset(struct pt_unit *tape)
+{
+ struct pi_adapter *pi = tape->pi;
+ int i, k, flg;
+ int expect[5] = { 1, 1, 1, 0x14, 0xeb };
+
+ pi_connect(pi);
+ write_reg(pi, 6, DRIVE(tape));
+ write_reg(pi, 7, 8);
+
+ pt_sleep(20 * HZ / 1000);
+
+ k = 0;
+ while ((k++ < PT_RESET_TMO) && (status_reg(pi) & STAT_BUSY))
+ pt_sleep(HZ / 10);
+
+ flg = 1;
+ for (i = 0; i < 5; i++)
+ flg &= (read_reg(pi, i + 1) == expect[i]);
+
+ if (verbose) {
+ printk("%s: Reset (%d) signature = ", tape->name, k);
+ for (i = 0; i < 5; i++)
+ printk("%3x", read_reg(pi, i + 1));
+ if (!flg)
+ printk(" (incorrect)");
+ printk("\n");
+ }
+
+ pi_disconnect(pi);
+ return flg - 1;
+}
+
+static int pt_ready_wait(struct pt_unit *tape, int tmo)
+{
+ char tr_cmd[12] = { ATAPI_TEST_READY, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ int k, p;
+
+ k = 0;
+ while (k < tmo) {
+ tape->last_sense = 0;
+ pt_atapi(tape, tr_cmd, 0, NULL, DBMSG("test unit ready"));
+ p = tape->last_sense;
+ if (!p)
+ return 0;
+ if (!(((p & 0xffff) == 0x0402) || ((p & 0xff) == 6)))
+ return p;
+ k++;
+ pt_sleep(HZ);
+ }
+ return 0x000020; /* timeout */
+}
+
+static void xs(char *buf, char *targ, int offs, int len)
+{
+ int j, k, l;
+
+ j = 0;
+ l = 0;
+ for (k = 0; k < len; k++)
+ if ((buf[k + offs] != 0x20) || (buf[k + offs] != l))
+ l = targ[j++] = buf[k + offs];
+ if (l == 0x20)
+ j--;
+ targ[j] = 0;
+}
+
+static int xn(char *buf, int offs, int size)
+{
+ int v, k;
+
+ v = 0;
+ for (k = 0; k < size; k++)
+ v = v * 256 + (buf[k + offs] & 0xff);
+ return v;
+}
+
+static int pt_identify(struct pt_unit *tape)
+{
+ int dt, s;
+ char *ms[2] = { "master", "slave" };
+ char mf[10], id[18];
+ char id_cmd[12] = { ATAPI_IDENTIFY, 0, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0 };
+ char ms_cmd[12] =
+ { ATAPI_MODE_SENSE, 0, 0x2a, 0, 36, 0, 0, 0, 0, 0, 0, 0 };
+ char ls_cmd[12] =
+ { ATAPI_LOG_SENSE, 0, 0x71, 0, 0, 0, 0, 0, 36, 0, 0, 0 };
+ char buf[36];
+
+ s = pt_atapi(tape, id_cmd, 36, buf, "identify");
+ if (s)
+ return -1;
+
+ dt = buf[0] & 0x1f;
+ if (dt != 1) {
+ if (verbose)
+ printk("%s: Drive %d, unsupported type %d\n",
+ tape->name, tape->drive, dt);
+ return -1;
+ }
+
+ xs(buf, mf, 8, 8);
+ xs(buf, id, 16, 16);
+
+ tape->flags = 0;
+ tape->capacity = 0;
+ tape->bs = 0;
+
+ if (!pt_ready_wait(tape, PT_READY_TMO))
+ tape->flags |= PT_MEDIA;
+
+ if (!pt_atapi(tape, ms_cmd, 36, buf, "mode sense")) {
+ if (!(buf[2] & 0x80))
+ tape->flags |= PT_WRITE_OK;
+ tape->bs = xn(buf, 10, 2);
+ }
+
+ if (!pt_atapi(tape, ls_cmd, 36, buf, "log sense"))
+ tape->capacity = xn(buf, 24, 4);
+
+ printk("%s: %s %s, %s", tape->name, mf, id, ms[tape->drive]);
+ if (!(tape->flags & PT_MEDIA))
+ printk(", no media\n");
+ else {
+ if (!(tape->flags & PT_WRITE_OK))
+ printk(", RO");
+ printk(", blocksize %d, %d MB\n", tape->bs, tape->capacity / 1024);
+ }
+
+ return 0;
+}
+
+
+/*
+ * returns 0, with id set if drive is detected
+ * -1, if drive detection failed
+ */
+static int pt_probe(struct pt_unit *tape)
+{
+ if (tape->drive == -1) {
+ for (tape->drive = 0; tape->drive <= 1; tape->drive++)
+ if (!pt_reset(tape))
+ return pt_identify(tape);
+ } else {
+ if (!pt_reset(tape))
+ return pt_identify(tape);
+ }
+ return -1;
+}
+
+static int pt_detect(void)
+{
+ struct pt_unit *tape;
+ int specified = 0, found = 0;
+ int unit;
+
+ printk("%s: %s version %s, major %d\n", name, name, PT_VERSION, major);
+
+ specified = 0;
+ for (unit = 0; unit < PT_UNITS; unit++) {
+ struct pt_unit *tape = &pt[unit];
+ tape->pi = &tape->pia;
+ atomic_set(&tape->available, 1);
+ tape->flags = 0;
+ tape->last_sense = 0;
+ tape->present = 0;
+ tape->bufptr = NULL;
+ tape->drive = DU[D_SLV];
+ snprintf(tape->name, PT_NAMELEN, "%s%d", name, unit);
+ if (!DU[D_PRT])
+ continue;
+ specified++;
+ if (pi_init(tape->pi, 0, DU[D_PRT], DU[D_MOD], DU[D_UNI],
+ DU[D_PRO], DU[D_DLY], pt_scratch, PI_PT,
+ verbose, tape->name)) {
+ if (!pt_probe(tape)) {
+ tape->present = 1;
+ found++;
+ } else
+ pi_release(tape->pi);
+ }
+ }
+ if (specified == 0) {
+ tape = pt;
+ if (pi_init(tape->pi, 1, -1, -1, -1, -1, -1, pt_scratch,
+ PI_PT, verbose, tape->name)) {
+ if (!pt_probe(tape)) {
+ tape->present = 1;
+ found++;
+ } else
+ pi_release(tape->pi);
+ }
+
+ }
+ if (found)
+ return 0;
+
+ printk("%s: No ATAPI tape drive detected\n", name);
+ return -1;
+}
+
+static int pt_open(struct inode *inode, struct file *file)
+{
+ int unit = iminor(inode) & 0x7F;
+ struct pt_unit *tape = pt + unit;
+ int err;
+
+ mutex_lock(&pt_mutex);
+ if (unit >= PT_UNITS || (!tape->present)) {
+ mutex_unlock(&pt_mutex);
+ return -ENODEV;
+ }
+
+ err = -EBUSY;
+ if (!atomic_dec_and_test(&tape->available))
+ goto out;
+
+ pt_identify(tape);
+
+ err = -ENODEV;
+ if (!(tape->flags & PT_MEDIA))
+ goto out;
+
+ err = -EROFS;
+ if ((!(tape->flags & PT_WRITE_OK)) && (file->f_mode & FMODE_WRITE))
+ goto out;
+
+ if (!(iminor(inode) & 128))
+ tape->flags |= PT_REWIND;
+
+ err = -ENOMEM;
+ tape->bufptr = kmalloc(PT_BUFSIZE, GFP_KERNEL);
+ if (tape->bufptr == NULL) {
+ printk("%s: buffer allocation failed\n", tape->name);
+ goto out;
+ }
+
+ file->private_data = tape;
+ mutex_unlock(&pt_mutex);
+ return 0;
+
+out:
+ atomic_inc(&tape->available);
+ mutex_unlock(&pt_mutex);
+ return err;
+}
+
+static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ struct pt_unit *tape = file->private_data;
+ struct mtop __user *p = (void __user *)arg;
+ struct mtop mtop;
+
+ switch (cmd) {
+ case MTIOCTOP:
+ if (copy_from_user(&mtop, p, sizeof(struct mtop)))
+ return -EFAULT;
+
+ switch (mtop.mt_op) {
+
+ case MTREW:
+ mutex_lock(&pt_mutex);
+ pt_rewind(tape);
+ mutex_unlock(&pt_mutex);
+ return 0;
+
+ case MTWEOF:
+ mutex_lock(&pt_mutex);
+ pt_write_fm(tape);
+ mutex_unlock(&pt_mutex);
+ return 0;
+
+ default:
+ /* FIXME: rate limit ?? */
+ printk(KERN_DEBUG "%s: Unimplemented mt_op %d\n", tape->name,
+ mtop.mt_op);
+ return -EINVAL;
+ }
+
+ default:
+ return -ENOTTY;
+ }
+}
+
+static int
+pt_release(struct inode *inode, struct file *file)
+{
+ struct pt_unit *tape = file->private_data;
+
+ if (atomic_read(&tape->available) > 1)
+ return -EINVAL;
+
+ if (tape->flags & PT_WRITING)
+ pt_write_fm(tape);
+
+ if (tape->flags & PT_REWIND)
+ pt_rewind(tape);
+
+ kfree(tape->bufptr);
+ tape->bufptr = NULL;
+
+ atomic_inc(&tape->available);
+
+ return 0;
+
+}
+
+static ssize_t pt_read(struct file *filp, char __user *buf, size_t count, loff_t * ppos)
+{
+ struct pt_unit *tape = filp->private_data;
+ struct pi_adapter *pi = tape->pi;
+ char rd_cmd[12] = { ATAPI_READ_6, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ int k, n, r, p, s, t, b;
+
+ if (!(tape->flags & (PT_READING | PT_WRITING))) {
+ tape->flags |= PT_READING;
+ if (pt_atapi(tape, rd_cmd, 0, NULL, "start read-ahead"))
+ return -EIO;
+ } else if (tape->flags & PT_WRITING)
+ return -EIO;
+
+ if (tape->flags & PT_EOF)
+ return 0;
+
+ t = 0;
+
+ while (count > 0) {
+
+ if (!pt_poll_dsc(tape, HZ / 100, PT_TMO, "read"))
+ return -EIO;
+
+ n = count;
+ if (n > 32768)
+ n = 32768; /* max per command */
+ b = (n - 1 + tape->bs) / tape->bs;
+ n = b * tape->bs; /* rounded up to even block */
+
+ rd_cmd[4] = b;
+
+ r = pt_command(tape, rd_cmd, n, "read");
+
+ mdelay(1);
+
+ if (r) {
+ pt_req_sense(tape, 0);
+ return -EIO;
+ }
+
+ while (1) {
+
+ r = pt_wait(tape, STAT_BUSY,
+ STAT_DRQ | STAT_ERR | STAT_READY,
+ DBMSG("read DRQ"), "");
+
+ if (r & STAT_SENSE) {
+ pi_disconnect(pi);
+ pt_req_sense(tape, 0);
+ return -EIO;
+ }
+
+ if (r)
+ tape->flags |= PT_EOF;
+
+ s = read_reg(pi, 7);
+
+ if (!(s & STAT_DRQ))
+ break;
+
+ n = (read_reg(pi, 4) + 256 * read_reg(pi, 5));
+ p = (read_reg(pi, 2) & 3);
+ if (p != 2) {
+ pi_disconnect(pi);
+ printk("%s: Phase error on read: %d\n", tape->name,
+ p);
+ return -EIO;
+ }
+
+ while (n > 0) {
+ k = n;
+ if (k > PT_BUFSIZE)
+ k = PT_BUFSIZE;
+ pi_read_block(pi, tape->bufptr, k);
+ n -= k;
+ b = k;
+ if (b > count)
+ b = count;
+ if (copy_to_user(buf + t, tape->bufptr, b)) {
+ pi_disconnect(pi);
+ return -EFAULT;
+ }
+ t += b;
+ count -= b;
+ }
+
+ }
+ pi_disconnect(pi);
+ if (tape->flags & PT_EOF)
+ break;
+ }
+
+ return t;
+
+}
+
+static ssize_t pt_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos)
+{
+ struct pt_unit *tape = filp->private_data;
+ struct pi_adapter *pi = tape->pi;
+ char wr_cmd[12] = { ATAPI_WRITE_6, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ int k, n, r, p, s, t, b;
+
+ if (!(tape->flags & PT_WRITE_OK))
+ return -EROFS;
+
+ if (!(tape->flags & (PT_READING | PT_WRITING))) {
+ tape->flags |= PT_WRITING;
+ if (pt_atapi
+ (tape, wr_cmd, 0, NULL, "start buffer-available mode"))
+ return -EIO;
+ } else if (tape->flags & PT_READING)
+ return -EIO;
+
+ if (tape->flags & PT_EOF)
+ return -ENOSPC;
+
+ t = 0;
+
+ while (count > 0) {
+
+ if (!pt_poll_dsc(tape, HZ / 100, PT_TMO, "write"))
+ return -EIO;
+
+ n = count;
+ if (n > 32768)
+ n = 32768; /* max per command */
+ b = (n - 1 + tape->bs) / tape->bs;
+ n = b * tape->bs; /* rounded up to even block */
+
+ wr_cmd[4] = b;
+
+ r = pt_command(tape, wr_cmd, n, "write");
+
+ mdelay(1);
+
+ if (r) { /* error delivering command only */
+ pt_req_sense(tape, 0);
+ return -EIO;
+ }
+
+ while (1) {
+
+ r = pt_wait(tape, STAT_BUSY,
+ STAT_DRQ | STAT_ERR | STAT_READY,
+ DBMSG("write DRQ"), NULL);
+
+ if (r & STAT_SENSE) {
+ pi_disconnect(pi);
+ pt_req_sense(tape, 0);
+ return -EIO;
+ }
+
+ if (r)
+ tape->flags |= PT_EOF;
+
+ s = read_reg(pi, 7);
+
+ if (!(s & STAT_DRQ))
+ break;
+
+ n = (read_reg(pi, 4) + 256 * read_reg(pi, 5));
+ p = (read_reg(pi, 2) & 3);
+ if (p != 0) {
+ pi_disconnect(pi);
+ printk("%s: Phase error on write: %d \n",
+ tape->name, p);
+ return -EIO;
+ }
+
+ while (n > 0) {
+ k = n;
+ if (k > PT_BUFSIZE)
+ k = PT_BUFSIZE;
+ b = k;
+ if (b > count)
+ b = count;
+ if (copy_from_user(tape->bufptr, buf + t, b)) {
+ pi_disconnect(pi);
+ return -EFAULT;
+ }
+ pi_write_block(pi, tape->bufptr, k);
+ t += b;
+ count -= b;
+ n -= k;
+ }
+
+ }
+ pi_disconnect(pi);
+ if (tape->flags & PT_EOF)
+ break;
+ }
+
+ return t;
+}
+
+static int __init pt_init(void)
+{
+ int unit;
+ int err;
+
+ if (disable) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (pt_detect()) {
+ err = -ENODEV;
+ goto out;
+ }
+
+ err = register_chrdev(major, name, &pt_fops);
+ if (err < 0) {
+ printk("pt_init: unable to get major number %d\n", major);
+ for (unit = 0; unit < PT_UNITS; unit++)
+ if (pt[unit].present)
+ pi_release(pt[unit].pi);
+ goto out;
+ }
+ major = err;
+ pt_class = class_create(THIS_MODULE, "pt");
+ if (IS_ERR(pt_class)) {
+ err = PTR_ERR(pt_class);
+ goto out_chrdev;
+ }
+
+ for (unit = 0; unit < PT_UNITS; unit++)
+ if (pt[unit].present) {
+ device_create(pt_class, NULL, MKDEV(major, unit), NULL,
+ "pt%d", unit);
+ device_create(pt_class, NULL, MKDEV(major, unit + 128),
+ NULL, "pt%dn", unit);
+ }
+ goto out;
+
+out_chrdev:
+ unregister_chrdev(major, "pt");
+out:
+ return err;
+}
+
+static void __exit pt_exit(void)
+{
+ int unit;
+ for (unit = 0; unit < PT_UNITS; unit++)
+ if (pt[unit].present) {
+ device_destroy(pt_class, MKDEV(major, unit));
+ device_destroy(pt_class, MKDEV(major, unit + 128));
+ }
+ class_destroy(pt_class);
+ unregister_chrdev(major, name);
+ for (unit = 0; unit < PT_UNITS; unit++)
+ if (pt[unit].present)
+ pi_release(pt[unit].pi);
+}
+
+MODULE_LICENSE("GPL");
+module_init(pt_init)
+module_exit(pt_exit)
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
new file mode 100644
index 000000000..09e628daf
--- /dev/null
+++ b/drivers/block/pktcdvd.c
@@ -0,0 +1,3029 @@
+/*
+ * Copyright (C) 2000 Jens Axboe <axboe@suse.de>
+ * Copyright (C) 2001-2004 Peter Osterlund <petero2@telia.com>
+ * Copyright (C) 2006 Thomas Maier <balagi@justmail.de>
+ *
+ * May be copied or modified under the terms of the GNU General Public
+ * License. See linux/COPYING for more information.
+ *
+ * Packet writing layer for ATAPI and SCSI CD-RW, DVD+RW, DVD-RW and
+ * DVD-RAM devices.
+ *
+ * Theory of operation:
+ *
+ * At the lowest level, there is the standard driver for the CD/DVD device,
+ * typically ide-cd.c or sr.c. This driver can handle read and write requests,
+ * but it doesn't know anything about the special restrictions that apply to
+ * packet writing. One restriction is that write requests must be aligned to
+ * packet boundaries on the physical media, and the size of a write request
+ * must be equal to the packet size. Another restriction is that a
+ * GPCMD_FLUSH_CACHE command has to be issued to the drive before a read
+ * command, if the previous command was a write.
+ *
+ * The purpose of the packet writing driver is to hide these restrictions from
+ * higher layers, such as file systems, and present a block device that can be
+ * randomly read and written using 2kB-sized blocks.
+ *
+ * The lowest layer in the packet writing driver is the packet I/O scheduler.
+ * Its data is defined by the struct packet_iosched and includes two bio
+ * queues with pending read and write requests. These queues are processed
+ * by the pkt_iosched_process_queue() function. The write requests in this
+ * queue are already properly aligned and sized. This layer is responsible for
+ * issuing the flush cache commands and scheduling the I/O in a good order.
+ *
+ * The next layer transforms unaligned write requests to aligned writes. This
+ * transformation requires reading missing pieces of data from the underlying
+ * block device, assembling the pieces to full packets and queuing them to the
+ * packet I/O scheduler.
+ *
+ * At the top layer there is a custom make_request_fn function that forwards
+ * read requests directly to the iosched queue and puts write requests in the
+ * unaligned write queue. A kernel thread performs the necessary read
+ * gathering to convert the unaligned writes to aligned writes and then feeds
+ * them to the packet I/O scheduler.
+ *
+ *************************************************************************/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/pktcdvd.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/compat.h>
+#include <linux/kthread.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/miscdevice.h>
+#include <linux/freezer.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_ioctl.h>
+#include <scsi/scsi.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+
+#include <asm/uaccess.h>
+
+#define DRIVER_NAME "pktcdvd"
+
+#define pkt_err(pd, fmt, ...) \
+ pr_err("%s: " fmt, pd->name, ##__VA_ARGS__)
+#define pkt_notice(pd, fmt, ...) \
+ pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__)
+#define pkt_info(pd, fmt, ...) \
+ pr_info("%s: " fmt, pd->name, ##__VA_ARGS__)
+
+#define pkt_dbg(level, pd, fmt, ...) \
+do { \
+ if (level == 2 && PACKET_DEBUG >= 2) \
+ pr_notice("%s: %s():" fmt, \
+ pd->name, __func__, ##__VA_ARGS__); \
+ else if (level == 1 && PACKET_DEBUG >= 1) \
+ pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__); \
+} while (0)
+
+#define MAX_SPEED 0xffff
+
+static DEFINE_MUTEX(pktcdvd_mutex);
+static struct pktcdvd_device *pkt_devs[MAX_WRITERS];
+static struct proc_dir_entry *pkt_proc;
+static int pktdev_major;
+static int write_congestion_on = PKT_WRITE_CONGESTION_ON;
+static int write_congestion_off = PKT_WRITE_CONGESTION_OFF;
+static struct mutex ctl_mutex; /* Serialize open/close/setup/teardown */
+static mempool_t *psd_pool;
+
+static struct class *class_pktcdvd = NULL; /* /sys/class/pktcdvd */
+static struct dentry *pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */
+
+/* forward declaration */
+static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev);
+static int pkt_remove_dev(dev_t pkt_dev);
+static int pkt_seq_show(struct seq_file *m, void *p);
+
+static sector_t get_zone(sector_t sector, struct pktcdvd_device *pd)
+{
+ return (sector + pd->offset) & ~(sector_t)(pd->settings.size - 1);
+}
+
+/*
+ * create and register a pktcdvd kernel object.
+ */
+static struct pktcdvd_kobj* pkt_kobj_create(struct pktcdvd_device *pd,
+ const char* name,
+ struct kobject* parent,
+ struct kobj_type* ktype)
+{
+ struct pktcdvd_kobj *p;
+ int error;
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return NULL;
+ p->pd = pd;
+ error = kobject_init_and_add(&p->kobj, ktype, parent, "%s", name);
+ if (error) {
+ kobject_put(&p->kobj);
+ return NULL;
+ }
+ kobject_uevent(&p->kobj, KOBJ_ADD);
+ return p;
+}
+/*
+ * remove a pktcdvd kernel object.
+ */
+static void pkt_kobj_remove(struct pktcdvd_kobj *p)
+{
+ if (p)
+ kobject_put(&p->kobj);
+}
+/*
+ * default release function for pktcdvd kernel objects.
+ */
+static void pkt_kobj_release(struct kobject *kobj)
+{
+ kfree(to_pktcdvdkobj(kobj));
+}
+
+
+/**********************************************************
+ *
+ * sysfs interface for pktcdvd
+ * by (C) 2006 Thomas Maier <balagi@justmail.de>
+ *
+ **********************************************************/
+
+#define DEF_ATTR(_obj,_name,_mode) \
+ static struct attribute _obj = { .name = _name, .mode = _mode }
+
+/**********************************************************
+ /sys/class/pktcdvd/pktcdvd[0-7]/
+ stat/reset
+ stat/packets_started
+ stat/packets_finished
+ stat/kb_written
+ stat/kb_read
+ stat/kb_read_gather
+ write_queue/size
+ write_queue/congestion_off
+ write_queue/congestion_on
+ **********************************************************/
+
+DEF_ATTR(kobj_pkt_attr_st1, "reset", 0200);
+DEF_ATTR(kobj_pkt_attr_st2, "packets_started", 0444);
+DEF_ATTR(kobj_pkt_attr_st3, "packets_finished", 0444);
+DEF_ATTR(kobj_pkt_attr_st4, "kb_written", 0444);
+DEF_ATTR(kobj_pkt_attr_st5, "kb_read", 0444);
+DEF_ATTR(kobj_pkt_attr_st6, "kb_read_gather", 0444);
+
+static struct attribute *kobj_pkt_attrs_stat[] = {
+ &kobj_pkt_attr_st1,
+ &kobj_pkt_attr_st2,
+ &kobj_pkt_attr_st3,
+ &kobj_pkt_attr_st4,
+ &kobj_pkt_attr_st5,
+ &kobj_pkt_attr_st6,
+ NULL
+};
+
+DEF_ATTR(kobj_pkt_attr_wq1, "size", 0444);
+DEF_ATTR(kobj_pkt_attr_wq2, "congestion_off", 0644);
+DEF_ATTR(kobj_pkt_attr_wq3, "congestion_on", 0644);
+
+static struct attribute *kobj_pkt_attrs_wqueue[] = {
+ &kobj_pkt_attr_wq1,
+ &kobj_pkt_attr_wq2,
+ &kobj_pkt_attr_wq3,
+ NULL
+};
+
+static ssize_t kobj_pkt_show(struct kobject *kobj,
+ struct attribute *attr, char *data)
+{
+ struct pktcdvd_device *pd = to_pktcdvdkobj(kobj)->pd;
+ int n = 0;
+ int v;
+ if (strcmp(attr->name, "packets_started") == 0) {
+ n = sprintf(data, "%lu\n", pd->stats.pkt_started);
+
+ } else if (strcmp(attr->name, "packets_finished") == 0) {
+ n = sprintf(data, "%lu\n", pd->stats.pkt_ended);
+
+ } else if (strcmp(attr->name, "kb_written") == 0) {
+ n = sprintf(data, "%lu\n", pd->stats.secs_w >> 1);
+
+ } else if (strcmp(attr->name, "kb_read") == 0) {
+ n = sprintf(data, "%lu\n", pd->stats.secs_r >> 1);
+
+ } else if (strcmp(attr->name, "kb_read_gather") == 0) {
+ n = sprintf(data, "%lu\n", pd->stats.secs_rg >> 1);
+
+ } else if (strcmp(attr->name, "size") == 0) {
+ spin_lock(&pd->lock);
+ v = pd->bio_queue_size;
+ spin_unlock(&pd->lock);
+ n = sprintf(data, "%d\n", v);
+
+ } else if (strcmp(attr->name, "congestion_off") == 0) {
+ spin_lock(&pd->lock);
+ v = pd->write_congestion_off;
+ spin_unlock(&pd->lock);
+ n = sprintf(data, "%d\n", v);
+
+ } else if (strcmp(attr->name, "congestion_on") == 0) {
+ spin_lock(&pd->lock);
+ v = pd->write_congestion_on;
+ spin_unlock(&pd->lock);
+ n = sprintf(data, "%d\n", v);
+ }
+ return n;
+}
+
+static void init_write_congestion_marks(int* lo, int* hi)
+{
+ if (*hi > 0) {
+ *hi = max(*hi, 500);
+ *hi = min(*hi, 1000000);
+ if (*lo <= 0)
+ *lo = *hi - 100;
+ else {
+ *lo = min(*lo, *hi - 100);
+ *lo = max(*lo, 100);
+ }
+ } else {
+ *hi = -1;
+ *lo = -1;
+ }
+}
+
+static ssize_t kobj_pkt_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *data, size_t len)
+{
+ struct pktcdvd_device *pd = to_pktcdvdkobj(kobj)->pd;
+ int val;
+
+ if (strcmp(attr->name, "reset") == 0 && len > 0) {
+ pd->stats.pkt_started = 0;
+ pd->stats.pkt_ended = 0;
+ pd->stats.secs_w = 0;
+ pd->stats.secs_rg = 0;
+ pd->stats.secs_r = 0;
+
+ } else if (strcmp(attr->name, "congestion_off") == 0
+ && sscanf(data, "%d", &val) == 1) {
+ spin_lock(&pd->lock);
+ pd->write_congestion_off = val;
+ init_write_congestion_marks(&pd->write_congestion_off,
+ &pd->write_congestion_on);
+ spin_unlock(&pd->lock);
+
+ } else if (strcmp(attr->name, "congestion_on") == 0
+ && sscanf(data, "%d", &val) == 1) {
+ spin_lock(&pd->lock);
+ pd->write_congestion_on = val;
+ init_write_congestion_marks(&pd->write_congestion_off,
+ &pd->write_congestion_on);
+ spin_unlock(&pd->lock);
+ }
+ return len;
+}
+
+static const struct sysfs_ops kobj_pkt_ops = {
+ .show = kobj_pkt_show,
+ .store = kobj_pkt_store
+};
+static struct kobj_type kobj_pkt_type_stat = {
+ .release = pkt_kobj_release,
+ .sysfs_ops = &kobj_pkt_ops,
+ .default_attrs = kobj_pkt_attrs_stat
+};
+static struct kobj_type kobj_pkt_type_wqueue = {
+ .release = pkt_kobj_release,
+ .sysfs_ops = &kobj_pkt_ops,
+ .default_attrs = kobj_pkt_attrs_wqueue
+};
+
+static void pkt_sysfs_dev_new(struct pktcdvd_device *pd)
+{
+ if (class_pktcdvd) {
+ pd->dev = device_create(class_pktcdvd, NULL, MKDEV(0, 0), NULL,
+ "%s", pd->name);
+ if (IS_ERR(pd->dev))
+ pd->dev = NULL;
+ }
+ if (pd->dev) {
+ pd->kobj_stat = pkt_kobj_create(pd, "stat",
+ &pd->dev->kobj,
+ &kobj_pkt_type_stat);
+ pd->kobj_wqueue = pkt_kobj_create(pd, "write_queue",
+ &pd->dev->kobj,
+ &kobj_pkt_type_wqueue);
+ }
+}
+
+static void pkt_sysfs_dev_remove(struct pktcdvd_device *pd)
+{
+ pkt_kobj_remove(pd->kobj_stat);
+ pkt_kobj_remove(pd->kobj_wqueue);
+ if (class_pktcdvd)
+ device_unregister(pd->dev);
+}
+
+
+/********************************************************************
+ /sys/class/pktcdvd/
+ add map block device
+ remove unmap packet dev
+ device_map show mappings
+ *******************************************************************/
+
+static void class_pktcdvd_release(struct class *cls)
+{
+ kfree(cls);
+}
+static ssize_t class_pktcdvd_show_map(struct class *c,
+ struct class_attribute *attr,
+ char *data)
+{
+ int n = 0;
+ int idx;
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+ for (idx = 0; idx < MAX_WRITERS; idx++) {
+ struct pktcdvd_device *pd = pkt_devs[idx];
+ if (!pd)
+ continue;
+ n += sprintf(data+n, "%s %u:%u %u:%u\n",
+ pd->name,
+ MAJOR(pd->pkt_dev), MINOR(pd->pkt_dev),
+ MAJOR(pd->bdev->bd_dev),
+ MINOR(pd->bdev->bd_dev));
+ }
+ mutex_unlock(&ctl_mutex);
+ return n;
+}
+
+static ssize_t class_pktcdvd_store_add(struct class *c,
+ struct class_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ unsigned int major, minor;
+
+ if (sscanf(buf, "%u:%u", &major, &minor) == 2) {
+ /* pkt_setup_dev() expects caller to hold reference to self */
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
+ pkt_setup_dev(MKDEV(major, minor), NULL);
+
+ module_put(THIS_MODULE);
+
+ return count;
+ }
+
+ return -EINVAL;
+}
+
+static ssize_t class_pktcdvd_store_remove(struct class *c,
+ struct class_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ unsigned int major, minor;
+ if (sscanf(buf, "%u:%u", &major, &minor) == 2) {
+ pkt_remove_dev(MKDEV(major, minor));
+ return count;
+ }
+ return -EINVAL;
+}
+
+static struct class_attribute class_pktcdvd_attrs[] = {
+ __ATTR(add, 0200, NULL, class_pktcdvd_store_add),
+ __ATTR(remove, 0200, NULL, class_pktcdvd_store_remove),
+ __ATTR(device_map, 0444, class_pktcdvd_show_map, NULL),
+ __ATTR_NULL
+};
+
+
+static int pkt_sysfs_init(void)
+{
+ int ret = 0;
+
+ /*
+ * create control files in sysfs
+ * /sys/class/pktcdvd/...
+ */
+ class_pktcdvd = kzalloc(sizeof(*class_pktcdvd), GFP_KERNEL);
+ if (!class_pktcdvd)
+ return -ENOMEM;
+ class_pktcdvd->name = DRIVER_NAME;
+ class_pktcdvd->owner = THIS_MODULE;
+ class_pktcdvd->class_release = class_pktcdvd_release;
+ class_pktcdvd->class_attrs = class_pktcdvd_attrs;
+ ret = class_register(class_pktcdvd);
+ if (ret) {
+ kfree(class_pktcdvd);
+ class_pktcdvd = NULL;
+ pr_err("failed to create class pktcdvd\n");
+ return ret;
+ }
+ return 0;
+}
+
+static void pkt_sysfs_cleanup(void)
+{
+ if (class_pktcdvd)
+ class_destroy(class_pktcdvd);
+ class_pktcdvd = NULL;
+}
+
+/********************************************************************
+ entries in debugfs
+
+ /sys/kernel/debug/pktcdvd[0-7]/
+ info
+
+ *******************************************************************/
+
+static int pkt_debugfs_seq_show(struct seq_file *m, void *p)
+{
+ return pkt_seq_show(m, p);
+}
+
+static int pkt_debugfs_fops_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pkt_debugfs_seq_show, inode->i_private);
+}
+
+static const struct file_operations debug_fops = {
+ .open = pkt_debugfs_fops_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .owner = THIS_MODULE,
+};
+
+static void pkt_debugfs_dev_new(struct pktcdvd_device *pd)
+{
+ if (!pkt_debugfs_root)
+ return;
+ pd->dfs_d_root = debugfs_create_dir(pd->name, pkt_debugfs_root);
+ if (!pd->dfs_d_root)
+ return;
+
+ pd->dfs_f_info = debugfs_create_file("info", S_IRUGO,
+ pd->dfs_d_root, pd, &debug_fops);
+}
+
+static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd)
+{
+ if (!pkt_debugfs_root)
+ return;
+ debugfs_remove(pd->dfs_f_info);
+ debugfs_remove(pd->dfs_d_root);
+ pd->dfs_f_info = NULL;
+ pd->dfs_d_root = NULL;
+}
+
+static void pkt_debugfs_init(void)
+{
+ pkt_debugfs_root = debugfs_create_dir(DRIVER_NAME, NULL);
+}
+
+static void pkt_debugfs_cleanup(void)
+{
+ debugfs_remove(pkt_debugfs_root);
+ pkt_debugfs_root = NULL;
+}
+
+/* ----------------------------------------------------------*/
+
+
+static void pkt_bio_finished(struct pktcdvd_device *pd)
+{
+ BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0);
+ if (atomic_dec_and_test(&pd->cdrw.pending_bios)) {
+ pkt_dbg(2, pd, "queue empty\n");
+ atomic_set(&pd->iosched.attention, 1);
+ wake_up(&pd->wqueue);
+ }
+}
+
+/*
+ * Allocate a packet_data struct
+ */
+static struct packet_data *pkt_alloc_packet_data(int frames)
+{
+ int i;
+ struct packet_data *pkt;
+
+ pkt = kzalloc(sizeof(struct packet_data), GFP_KERNEL);
+ if (!pkt)
+ goto no_pkt;
+
+ pkt->frames = frames;
+ pkt->w_bio = bio_kmalloc(GFP_KERNEL, frames);
+ if (!pkt->w_bio)
+ goto no_bio;
+
+ for (i = 0; i < frames / FRAMES_PER_PAGE; i++) {
+ pkt->pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
+ if (!pkt->pages[i])
+ goto no_page;
+ }
+
+ spin_lock_init(&pkt->lock);
+ bio_list_init(&pkt->orig_bios);
+
+ for (i = 0; i < frames; i++) {
+ struct bio *bio = bio_kmalloc(GFP_KERNEL, 1);
+ if (!bio)
+ goto no_rd_bio;
+
+ pkt->r_bios[i] = bio;
+ }
+
+ return pkt;
+
+no_rd_bio:
+ for (i = 0; i < frames; i++) {
+ struct bio *bio = pkt->r_bios[i];
+ if (bio)
+ bio_put(bio);
+ }
+
+no_page:
+ for (i = 0; i < frames / FRAMES_PER_PAGE; i++)
+ if (pkt->pages[i])
+ __free_page(pkt->pages[i]);
+ bio_put(pkt->w_bio);
+no_bio:
+ kfree(pkt);
+no_pkt:
+ return NULL;
+}
+
+/*
+ * Free a packet_data struct
+ */
+static void pkt_free_packet_data(struct packet_data *pkt)
+{
+ int i;
+
+ for (i = 0; i < pkt->frames; i++) {
+ struct bio *bio = pkt->r_bios[i];
+ if (bio)
+ bio_put(bio);
+ }
+ for (i = 0; i < pkt->frames / FRAMES_PER_PAGE; i++)
+ __free_page(pkt->pages[i]);
+ bio_put(pkt->w_bio);
+ kfree(pkt);
+}
+
+static void pkt_shrink_pktlist(struct pktcdvd_device *pd)
+{
+ struct packet_data *pkt, *next;
+
+ BUG_ON(!list_empty(&pd->cdrw.pkt_active_list));
+
+ list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_free_list, list) {
+ pkt_free_packet_data(pkt);
+ }
+ INIT_LIST_HEAD(&pd->cdrw.pkt_free_list);
+}
+
+static int pkt_grow_pktlist(struct pktcdvd_device *pd, int nr_packets)
+{
+ struct packet_data *pkt;
+
+ BUG_ON(!list_empty(&pd->cdrw.pkt_free_list));
+
+ while (nr_packets > 0) {
+ pkt = pkt_alloc_packet_data(pd->settings.size >> 2);
+ if (!pkt) {
+ pkt_shrink_pktlist(pd);
+ return 0;
+ }
+ pkt->id = nr_packets;
+ pkt->pd = pd;
+ list_add(&pkt->list, &pd->cdrw.pkt_free_list);
+ nr_packets--;
+ }
+ return 1;
+}
+
+static inline struct pkt_rb_node *pkt_rbtree_next(struct pkt_rb_node *node)
+{
+ struct rb_node *n = rb_next(&node->rb_node);
+ if (!n)
+ return NULL;
+ return rb_entry(n, struct pkt_rb_node, rb_node);
+}
+
+static void pkt_rbtree_erase(struct pktcdvd_device *pd, struct pkt_rb_node *node)
+{
+ rb_erase(&node->rb_node, &pd->bio_queue);
+ mempool_free(node, pd->rb_pool);
+ pd->bio_queue_size--;
+ BUG_ON(pd->bio_queue_size < 0);
+}
+
+/*
+ * Find the first node in the pd->bio_queue rb tree with a starting sector >= s.
+ */
+static struct pkt_rb_node *pkt_rbtree_find(struct pktcdvd_device *pd, sector_t s)
+{
+ struct rb_node *n = pd->bio_queue.rb_node;
+ struct rb_node *next;
+ struct pkt_rb_node *tmp;
+
+ if (!n) {
+ BUG_ON(pd->bio_queue_size > 0);
+ return NULL;
+ }
+
+ for (;;) {
+ tmp = rb_entry(n, struct pkt_rb_node, rb_node);
+ if (s <= tmp->bio->bi_iter.bi_sector)
+ next = n->rb_left;
+ else
+ next = n->rb_right;
+ if (!next)
+ break;
+ n = next;
+ }
+
+ if (s > tmp->bio->bi_iter.bi_sector) {
+ tmp = pkt_rbtree_next(tmp);
+ if (!tmp)
+ return NULL;
+ }
+ BUG_ON(s > tmp->bio->bi_iter.bi_sector);
+ return tmp;
+}
+
+/*
+ * Insert a node into the pd->bio_queue rb tree.
+ */
+static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *node)
+{
+ struct rb_node **p = &pd->bio_queue.rb_node;
+ struct rb_node *parent = NULL;
+ sector_t s = node->bio->bi_iter.bi_sector;
+ struct pkt_rb_node *tmp;
+
+ while (*p) {
+ parent = *p;
+ tmp = rb_entry(parent, struct pkt_rb_node, rb_node);
+ if (s < tmp->bio->bi_iter.bi_sector)
+ p = &(*p)->rb_left;
+ else
+ p = &(*p)->rb_right;
+ }
+ rb_link_node(&node->rb_node, parent, p);
+ rb_insert_color(&node->rb_node, &pd->bio_queue);
+ pd->bio_queue_size++;
+}
+
+/*
+ * Send a packet_command to the underlying block device and
+ * wait for completion.
+ */
+static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc)
+{
+ struct request_queue *q = bdev_get_queue(pd->bdev);
+ struct request *rq;
+ int ret = 0;
+
+ rq = blk_get_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
+ WRITE : READ, __GFP_WAIT);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+ blk_rq_set_block_pc(rq);
+
+ if (cgc->buflen) {
+ ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
+ __GFP_WAIT);
+ if (ret)
+ goto out;
+ }
+
+ rq->cmd_len = COMMAND_SIZE(cgc->cmd[0]);
+ memcpy(rq->cmd, cgc->cmd, CDROM_PACKET_SIZE);
+
+ rq->timeout = 60*HZ;
+ if (cgc->quiet)
+ rq->cmd_flags |= REQ_QUIET;
+
+ blk_execute_rq(rq->q, pd->bdev->bd_disk, rq, 0);
+ if (rq->errors)
+ ret = -EIO;
+out:
+ blk_put_request(rq);
+ return ret;
+}
+
+static const char *sense_key_string(__u8 index)
+{
+ static const char * const info[] = {
+ "No sense", "Recovered error", "Not ready",
+ "Medium error", "Hardware error", "Illegal request",
+ "Unit attention", "Data protect", "Blank check",
+ };
+
+ return index < ARRAY_SIZE(info) ? info[index] : "INVALID";
+}
+
+/*
+ * A generic sense dump / resolve mechanism should be implemented across
+ * all ATAPI + SCSI devices.
+ */
+static void pkt_dump_sense(struct pktcdvd_device *pd,
+ struct packet_command *cgc)
+{
+ struct request_sense *sense = cgc->sense;
+
+ if (sense)
+ pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n",
+ CDROM_PACKET_SIZE, cgc->cmd,
+ sense->sense_key, sense->asc, sense->ascq,
+ sense_key_string(sense->sense_key));
+ else
+ pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd);
+}
+
+/*
+ * flush the drive cache to media
+ */
+static int pkt_flush_cache(struct pktcdvd_device *pd)
+{
+ struct packet_command cgc;
+
+ init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
+ cgc.cmd[0] = GPCMD_FLUSH_CACHE;
+ cgc.quiet = 1;
+
+ /*
+ * the IMMED bit -- we default to not setting it, although that
+ * would allow a much faster close, this is safer
+ */
+#if 0
+ cgc.cmd[1] = 1 << 1;
+#endif
+ return pkt_generic_packet(pd, &cgc);
+}
+
+/*
+ * speed is given as the normal factor, e.g. 4 for 4x
+ */
+static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd,
+ unsigned write_speed, unsigned read_speed)
+{
+ struct packet_command cgc;
+ struct request_sense sense;
+ int ret;
+
+ init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
+ cgc.sense = &sense;
+ cgc.cmd[0] = GPCMD_SET_SPEED;
+ cgc.cmd[2] = (read_speed >> 8) & 0xff;
+ cgc.cmd[3] = read_speed & 0xff;
+ cgc.cmd[4] = (write_speed >> 8) & 0xff;
+ cgc.cmd[5] = write_speed & 0xff;
+
+ if ((ret = pkt_generic_packet(pd, &cgc)))
+ pkt_dump_sense(pd, &cgc);
+
+ return ret;
+}
+
+/*
+ * Queue a bio for processing by the low-level CD device. Must be called
+ * from process context.
+ */
+static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio)
+{
+ spin_lock(&pd->iosched.lock);
+ if (bio_data_dir(bio) == READ)
+ bio_list_add(&pd->iosched.read_queue, bio);
+ else
+ bio_list_add(&pd->iosched.write_queue, bio);
+ spin_unlock(&pd->iosched.lock);
+
+ atomic_set(&pd->iosched.attention, 1);
+ wake_up(&pd->wqueue);
+}
+
+/*
+ * Process the queued read/write requests. This function handles special
+ * requirements for CDRW drives:
+ * - A cache flush command must be inserted before a read request if the
+ * previous request was a write.
+ * - Switching between reading and writing is slow, so don't do it more often
+ * than necessary.
+ * - Optimize for throughput at the expense of latency. This means that streaming
+ * writes will never be interrupted by a read, but if the drive has to seek
+ * before the next write, switch to reading instead if there are any pending
+ * read requests.
+ * - Set the read speed according to current usage pattern. When only reading
+ * from the device, it's best to use the highest possible read speed, but
+ * when switching often between reading and writing, it's better to have the
+ * same read and write speeds.
+ */
+static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
+{
+
+ if (atomic_read(&pd->iosched.attention) == 0)
+ return;
+ atomic_set(&pd->iosched.attention, 0);
+
+ for (;;) {
+ struct bio *bio;
+ int reads_queued, writes_queued;
+
+ spin_lock(&pd->iosched.lock);
+ reads_queued = !bio_list_empty(&pd->iosched.read_queue);
+ writes_queued = !bio_list_empty(&pd->iosched.write_queue);
+ spin_unlock(&pd->iosched.lock);
+
+ if (!reads_queued && !writes_queued)
+ break;
+
+ if (pd->iosched.writing) {
+ int need_write_seek = 1;
+ spin_lock(&pd->iosched.lock);
+ bio = bio_list_peek(&pd->iosched.write_queue);
+ spin_unlock(&pd->iosched.lock);
+ if (bio && (bio->bi_iter.bi_sector ==
+ pd->iosched.last_write))
+ need_write_seek = 0;
+ if (need_write_seek && reads_queued) {
+ if (atomic_read(&pd->cdrw.pending_bios) > 0) {
+ pkt_dbg(2, pd, "write, waiting\n");
+ break;
+ }
+ pkt_flush_cache(pd);
+ pd->iosched.writing = 0;
+ }
+ } else {
+ if (!reads_queued && writes_queued) {
+ if (atomic_read(&pd->cdrw.pending_bios) > 0) {
+ pkt_dbg(2, pd, "read, waiting\n");
+ break;
+ }
+ pd->iosched.writing = 1;
+ }
+ }
+
+ spin_lock(&pd->iosched.lock);
+ if (pd->iosched.writing)
+ bio = bio_list_pop(&pd->iosched.write_queue);
+ else
+ bio = bio_list_pop(&pd->iosched.read_queue);
+ spin_unlock(&pd->iosched.lock);
+
+ if (!bio)
+ continue;
+
+ if (bio_data_dir(bio) == READ)
+ pd->iosched.successive_reads +=
+ bio->bi_iter.bi_size >> 10;
+ else {
+ pd->iosched.successive_reads = 0;
+ pd->iosched.last_write = bio_end_sector(bio);
+ }
+ if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) {
+ if (pd->read_speed == pd->write_speed) {
+ pd->read_speed = MAX_SPEED;
+ pkt_set_speed(pd, pd->write_speed, pd->read_speed);
+ }
+ } else {
+ if (pd->read_speed != pd->write_speed) {
+ pd->read_speed = pd->write_speed;
+ pkt_set_speed(pd, pd->write_speed, pd->read_speed);
+ }
+ }
+
+ atomic_inc(&pd->cdrw.pending_bios);
+ generic_make_request(bio);
+ }
+}
+
+/*
+ * Special care is needed if the underlying block device has a small
+ * max_phys_segments value.
+ */
+static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q)
+{
+ if ((pd->settings.size << 9) / CD_FRAMESIZE
+ <= queue_max_segments(q)) {
+ /*
+ * The cdrom device can handle one segment/frame
+ */
+ clear_bit(PACKET_MERGE_SEGS, &pd->flags);
+ return 0;
+ } else if ((pd->settings.size << 9) / PAGE_SIZE
+ <= queue_max_segments(q)) {
+ /*
+ * We can handle this case at the expense of some extra memory
+ * copies during write operations
+ */
+ set_bit(PACKET_MERGE_SEGS, &pd->flags);
+ return 0;
+ } else {
+ pkt_err(pd, "cdrom max_phys_segments too small\n");
+ return -EIO;
+ }
+}
+
+/*
+ * Copy all data for this packet to pkt->pages[], so that
+ * a) The number of required segments for the write bio is minimized, which
+ * is necessary for some scsi controllers.
+ * b) The data can be used as cache to avoid read requests if we receive a
+ * new write request for the same zone.
+ */
+static void pkt_make_local_copy(struct packet_data *pkt, struct bio_vec *bvec)
+{
+ int f, p, offs;
+
+ /* Copy all data to pkt->pages[] */
+ p = 0;
+ offs = 0;
+ for (f = 0; f < pkt->frames; f++) {
+ if (bvec[f].bv_page != pkt->pages[p]) {
+ void *vfrom = kmap_atomic(bvec[f].bv_page) + bvec[f].bv_offset;
+ void *vto = page_address(pkt->pages[p]) + offs;
+ memcpy(vto, vfrom, CD_FRAMESIZE);
+ kunmap_atomic(vfrom);
+ bvec[f].bv_page = pkt->pages[p];
+ bvec[f].bv_offset = offs;
+ } else {
+ BUG_ON(bvec[f].bv_offset != offs);
+ }
+ offs += CD_FRAMESIZE;
+ if (offs >= PAGE_SIZE) {
+ offs = 0;
+ p++;
+ }
+ }
+}
+
+static void pkt_end_io_read(struct bio *bio, int err)
+{
+ struct packet_data *pkt = bio->bi_private;
+ struct pktcdvd_device *pd = pkt->pd;
+ BUG_ON(!pd);
+
+ pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n",
+ bio, (unsigned long long)pkt->sector,
+ (unsigned long long)bio->bi_iter.bi_sector, err);
+
+ if (err)
+ atomic_inc(&pkt->io_errors);
+ if (atomic_dec_and_test(&pkt->io_wait)) {
+ atomic_inc(&pkt->run_sm);
+ wake_up(&pd->wqueue);
+ }
+ pkt_bio_finished(pd);
+}
+
+static void pkt_end_io_packet_write(struct bio *bio, int err)
+{
+ struct packet_data *pkt = bio->bi_private;
+ struct pktcdvd_device *pd = pkt->pd;
+ BUG_ON(!pd);
+
+ pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, err);
+
+ pd->stats.pkt_ended++;
+
+ pkt_bio_finished(pd);
+ atomic_dec(&pkt->io_wait);
+ atomic_inc(&pkt->run_sm);
+ wake_up(&pd->wqueue);
+}
+
+/*
+ * Schedule reads for the holes in a packet
+ */
+static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
+{
+ int frames_read = 0;
+ struct bio *bio;
+ int f;
+ char written[PACKET_MAX_SIZE];
+
+ BUG_ON(bio_list_empty(&pkt->orig_bios));
+
+ atomic_set(&pkt->io_wait, 0);
+ atomic_set(&pkt->io_errors, 0);
+
+ /*
+ * Figure out which frames we need to read before we can write.
+ */
+ memset(written, 0, sizeof(written));
+ spin_lock(&pkt->lock);
+ bio_list_for_each(bio, &pkt->orig_bios) {
+ int first_frame = (bio->bi_iter.bi_sector - pkt->sector) /
+ (CD_FRAMESIZE >> 9);
+ int num_frames = bio->bi_iter.bi_size / CD_FRAMESIZE;
+ pd->stats.secs_w += num_frames * (CD_FRAMESIZE >> 9);
+ BUG_ON(first_frame < 0);
+ BUG_ON(first_frame + num_frames > pkt->frames);
+ for (f = first_frame; f < first_frame + num_frames; f++)
+ written[f] = 1;
+ }
+ spin_unlock(&pkt->lock);
+
+ if (pkt->cache_valid) {
+ pkt_dbg(2, pd, "zone %llx cached\n",
+ (unsigned long long)pkt->sector);
+ goto out_account;
+ }
+
+ /*
+ * Schedule reads for missing parts of the packet.
+ */
+ for (f = 0; f < pkt->frames; f++) {
+ int p, offset;
+
+ if (written[f])
+ continue;
+
+ bio = pkt->r_bios[f];
+ bio_reset(bio);
+ bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9);
+ bio->bi_bdev = pd->bdev;
+ bio->bi_end_io = pkt_end_io_read;
+ bio->bi_private = pkt;
+
+ p = (f * CD_FRAMESIZE) / PAGE_SIZE;
+ offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
+ pkt_dbg(2, pd, "Adding frame %d, page:%p offs:%d\n",
+ f, pkt->pages[p], offset);
+ if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset))
+ BUG();
+
+ atomic_inc(&pkt->io_wait);
+ bio->bi_rw = READ;
+ pkt_queue_bio(pd, bio);
+ frames_read++;
+ }
+
+out_account:
+ pkt_dbg(2, pd, "need %d frames for zone %llx\n",
+ frames_read, (unsigned long long)pkt->sector);
+ pd->stats.pkt_started++;
+ pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9);
+}
+
+/*
+ * Find a packet matching zone, or the least recently used packet if
+ * there is no match.
+ */
+static struct packet_data *pkt_get_packet_data(struct pktcdvd_device *pd, int zone)
+{
+ struct packet_data *pkt;
+
+ list_for_each_entry(pkt, &pd->cdrw.pkt_free_list, list) {
+ if (pkt->sector == zone || pkt->list.next == &pd->cdrw.pkt_free_list) {
+ list_del_init(&pkt->list);
+ if (pkt->sector != zone)
+ pkt->cache_valid = 0;
+ return pkt;
+ }
+ }
+ BUG();
+ return NULL;
+}
+
+static void pkt_put_packet_data(struct pktcdvd_device *pd, struct packet_data *pkt)
+{
+ if (pkt->cache_valid) {
+ list_add(&pkt->list, &pd->cdrw.pkt_free_list);
+ } else {
+ list_add_tail(&pkt->list, &pd->cdrw.pkt_free_list);
+ }
+}
+
+/*
+ * recover a failed write, query for relocation if possible
+ *
+ * returns 1 if recovery is possible, or 0 if not
+ *
+ */
+static int pkt_start_recovery(struct packet_data *pkt)
+{
+ /*
+ * FIXME. We need help from the file system to implement
+ * recovery handling.
+ */
+ return 0;
+#if 0
+ struct request *rq = pkt->rq;
+ struct pktcdvd_device *pd = rq->rq_disk->private_data;
+ struct block_device *pkt_bdev;
+ struct super_block *sb = NULL;
+ unsigned long old_block, new_block;
+ sector_t new_sector;
+
+ pkt_bdev = bdget(kdev_t_to_nr(pd->pkt_dev));
+ if (pkt_bdev) {
+ sb = get_super(pkt_bdev);
+ bdput(pkt_bdev);
+ }
+
+ if (!sb)
+ return 0;
+
+ if (!sb->s_op->relocate_blocks)
+ goto out;
+
+ old_block = pkt->sector / (CD_FRAMESIZE >> 9);
+ if (sb->s_op->relocate_blocks(sb, old_block, &new_block))
+ goto out;
+
+ new_sector = new_block * (CD_FRAMESIZE >> 9);
+ pkt->sector = new_sector;
+
+ bio_reset(pkt->bio);
+ pkt->bio->bi_bdev = pd->bdev;
+ pkt->bio->bi_rw = REQ_WRITE;
+ pkt->bio->bi_iter.bi_sector = new_sector;
+ pkt->bio->bi_iter.bi_size = pkt->frames * CD_FRAMESIZE;
+ pkt->bio->bi_vcnt = pkt->frames;
+
+ pkt->bio->bi_end_io = pkt_end_io_packet_write;
+ pkt->bio->bi_private = pkt;
+
+ drop_super(sb);
+ return 1;
+
+out:
+ drop_super(sb);
+ return 0;
+#endif
+}
+
+static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state state)
+{
+#if PACKET_DEBUG > 1
+ static const char *state_name[] = {
+ "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED"
+ };
+ enum packet_data_state old_state = pkt->state;
+ pkt_dbg(2, pd, "pkt %2d : s=%6llx %s -> %s\n",
+ pkt->id, (unsigned long long)pkt->sector,
+ state_name[old_state], state_name[state]);
+#endif
+ pkt->state = state;
+}
+
+/*
+ * Scan the work queue to see if we can start a new packet.
+ * returns non-zero if any work was done.
+ */
+static int pkt_handle_queue(struct pktcdvd_device *pd)
+{
+ struct packet_data *pkt, *p;
+ struct bio *bio = NULL;
+ sector_t zone = 0; /* Suppress gcc warning */
+ struct pkt_rb_node *node, *first_node;
+ struct rb_node *n;
+ int wakeup;
+
+ atomic_set(&pd->scan_queue, 0);
+
+ if (list_empty(&pd->cdrw.pkt_free_list)) {
+ pkt_dbg(2, pd, "no pkt\n");
+ return 0;
+ }
+
+ /*
+ * Try to find a zone we are not already working on.
+ */
+ spin_lock(&pd->lock);
+ first_node = pkt_rbtree_find(pd, pd->current_sector);
+ if (!first_node) {
+ n = rb_first(&pd->bio_queue);
+ if (n)
+ first_node = rb_entry(n, struct pkt_rb_node, rb_node);
+ }
+ node = first_node;
+ while (node) {
+ bio = node->bio;
+ zone = get_zone(bio->bi_iter.bi_sector, pd);
+ list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) {
+ if (p->sector == zone) {
+ bio = NULL;
+ goto try_next_bio;
+ }
+ }
+ break;
+try_next_bio:
+ node = pkt_rbtree_next(node);
+ if (!node) {
+ n = rb_first(&pd->bio_queue);
+ if (n)
+ node = rb_entry(n, struct pkt_rb_node, rb_node);
+ }
+ if (node == first_node)
+ node = NULL;
+ }
+ spin_unlock(&pd->lock);
+ if (!bio) {
+ pkt_dbg(2, pd, "no bio\n");
+ return 0;
+ }
+
+ pkt = pkt_get_packet_data(pd, zone);
+
+ pd->current_sector = zone + pd->settings.size;
+ pkt->sector = zone;
+ BUG_ON(pkt->frames != pd->settings.size >> 2);
+ pkt->write_size = 0;
+
+ /*
+ * Scan work queue for bios in the same zone and link them
+ * to this packet.
+ */
+ spin_lock(&pd->lock);
+ pkt_dbg(2, pd, "looking for zone %llx\n", (unsigned long long)zone);
+ while ((node = pkt_rbtree_find(pd, zone)) != NULL) {
+ bio = node->bio;
+ pkt_dbg(2, pd, "found zone=%llx\n", (unsigned long long)
+ get_zone(bio->bi_iter.bi_sector, pd));
+ if (get_zone(bio->bi_iter.bi_sector, pd) != zone)
+ break;
+ pkt_rbtree_erase(pd, node);
+ spin_lock(&pkt->lock);
+ bio_list_add(&pkt->orig_bios, bio);
+ pkt->write_size += bio->bi_iter.bi_size / CD_FRAMESIZE;
+ spin_unlock(&pkt->lock);
+ }
+ /* check write congestion marks, and if bio_queue_size is
+ below, wake up any waiters */
+ wakeup = (pd->write_congestion_on > 0
+ && pd->bio_queue_size <= pd->write_congestion_off);
+ spin_unlock(&pd->lock);
+ if (wakeup) {
+ clear_bdi_congested(&pd->disk->queue->backing_dev_info,
+ BLK_RW_ASYNC);
+ }
+
+ pkt->sleep_time = max(PACKET_WAIT_TIME, 1);
+ pkt_set_state(pkt, PACKET_WAITING_STATE);
+ atomic_set(&pkt->run_sm, 1);
+
+ spin_lock(&pd->cdrw.active_list_lock);
+ list_add(&pkt->list, &pd->cdrw.pkt_active_list);
+ spin_unlock(&pd->cdrw.active_list_lock);
+
+ return 1;
+}
+
+/*
+ * Assemble a bio to write one packet and queue the bio for processing
+ * by the underlying block device.
+ */
+static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
+{
+ int f;
+ struct bio_vec *bvec = pkt->w_bio->bi_io_vec;
+
+ bio_reset(pkt->w_bio);
+ pkt->w_bio->bi_iter.bi_sector = pkt->sector;
+ pkt->w_bio->bi_bdev = pd->bdev;
+ pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
+ pkt->w_bio->bi_private = pkt;
+
+ /* XXX: locking? */
+ for (f = 0; f < pkt->frames; f++) {
+ bvec[f].bv_page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE];
+ bvec[f].bv_offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
+ if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
+ BUG();
+ }
+ pkt_dbg(2, pd, "vcnt=%d\n", pkt->w_bio->bi_vcnt);
+
+ /*
+ * Fill-in bvec with data from orig_bios.
+ */
+ spin_lock(&pkt->lock);
+ bio_copy_data(pkt->w_bio, pkt->orig_bios.head);
+
+ pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE);
+ spin_unlock(&pkt->lock);
+
+ pkt_dbg(2, pd, "Writing %d frames for zone %llx\n",
+ pkt->write_size, (unsigned long long)pkt->sector);
+
+ if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) {
+ pkt_make_local_copy(pkt, bvec);
+ pkt->cache_valid = 1;
+ } else {
+ pkt->cache_valid = 0;
+ }
+
+ /* Start the write request */
+ atomic_set(&pkt->io_wait, 1);
+ pkt->w_bio->bi_rw = WRITE;
+ pkt_queue_bio(pd, pkt->w_bio);
+}
+
+static void pkt_finish_packet(struct packet_data *pkt, int uptodate)
+{
+ struct bio *bio;
+
+ if (!uptodate)
+ pkt->cache_valid = 0;
+
+ /* Finish all bios corresponding to this packet */
+ while ((bio = bio_list_pop(&pkt->orig_bios)))
+ bio_endio(bio, uptodate ? 0 : -EIO);
+}
+
+static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt)
+{
+ int uptodate;
+
+ pkt_dbg(2, pd, "pkt %d\n", pkt->id);
+
+ for (;;) {
+ switch (pkt->state) {
+ case PACKET_WAITING_STATE:
+ if ((pkt->write_size < pkt->frames) && (pkt->sleep_time > 0))
+ return;
+
+ pkt->sleep_time = 0;
+ pkt_gather_data(pd, pkt);
+ pkt_set_state(pkt, PACKET_READ_WAIT_STATE);
+ break;
+
+ case PACKET_READ_WAIT_STATE:
+ if (atomic_read(&pkt->io_wait) > 0)
+ return;
+
+ if (atomic_read(&pkt->io_errors) > 0) {
+ pkt_set_state(pkt, PACKET_RECOVERY_STATE);
+ } else {
+ pkt_start_write(pd, pkt);
+ }
+ break;
+
+ case PACKET_WRITE_WAIT_STATE:
+ if (atomic_read(&pkt->io_wait) > 0)
+ return;
+
+ if (test_bit(BIO_UPTODATE, &pkt->w_bio->bi_flags)) {
+ pkt_set_state(pkt, PACKET_FINISHED_STATE);
+ } else {
+ pkt_set_state(pkt, PACKET_RECOVERY_STATE);
+ }
+ break;
+
+ case PACKET_RECOVERY_STATE:
+ if (pkt_start_recovery(pkt)) {
+ pkt_start_write(pd, pkt);
+ } else {
+ pkt_dbg(2, pd, "No recovery possible\n");
+ pkt_set_state(pkt, PACKET_FINISHED_STATE);
+ }
+ break;
+
+ case PACKET_FINISHED_STATE:
+ uptodate = test_bit(BIO_UPTODATE, &pkt->w_bio->bi_flags);
+ pkt_finish_packet(pkt, uptodate);
+ return;
+
+ default:
+ BUG();
+ break;
+ }
+ }
+}
+
+static void pkt_handle_packets(struct pktcdvd_device *pd)
+{
+ struct packet_data *pkt, *next;
+
+ /*
+ * Run state machine for active packets
+ */
+ list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
+ if (atomic_read(&pkt->run_sm) > 0) {
+ atomic_set(&pkt->run_sm, 0);
+ pkt_run_state_machine(pd, pkt);
+ }
+ }
+
+ /*
+ * Move no longer active packets to the free list
+ */
+ spin_lock(&pd->cdrw.active_list_lock);
+ list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_active_list, list) {
+ if (pkt->state == PACKET_FINISHED_STATE) {
+ list_del(&pkt->list);
+ pkt_put_packet_data(pd, pkt);
+ pkt_set_state(pkt, PACKET_IDLE_STATE);
+ atomic_set(&pd->scan_queue, 1);
+ }
+ }
+ spin_unlock(&pd->cdrw.active_list_lock);
+}
+
+static void pkt_count_states(struct pktcdvd_device *pd, int *states)
+{
+ struct packet_data *pkt;
+ int i;
+
+ for (i = 0; i < PACKET_NUM_STATES; i++)
+ states[i] = 0;
+
+ spin_lock(&pd->cdrw.active_list_lock);
+ list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
+ states[pkt->state]++;
+ }
+ spin_unlock(&pd->cdrw.active_list_lock);
+}
+
+/*
+ * kcdrwd is woken up when writes have been queued for one of our
+ * registered devices
+ */
+static int kcdrwd(void *foobar)
+{
+ struct pktcdvd_device *pd = foobar;
+ struct packet_data *pkt;
+ long min_sleep_time, residue;
+
+ set_user_nice(current, MIN_NICE);
+ set_freezable();
+
+ for (;;) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ /*
+ * Wait until there is something to do
+ */
+ add_wait_queue(&pd->wqueue, &wait);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ /* Check if we need to run pkt_handle_queue */
+ if (atomic_read(&pd->scan_queue) > 0)
+ goto work_to_do;
+
+ /* Check if we need to run the state machine for some packet */
+ list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
+ if (atomic_read(&pkt->run_sm) > 0)
+ goto work_to_do;
+ }
+
+ /* Check if we need to process the iosched queues */
+ if (atomic_read(&pd->iosched.attention) != 0)
+ goto work_to_do;
+
+ /* Otherwise, go to sleep */
+ if (PACKET_DEBUG > 1) {
+ int states[PACKET_NUM_STATES];
+ pkt_count_states(pd, states);
+ pkt_dbg(2, pd, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n",
+ states[0], states[1], states[2],
+ states[3], states[4], states[5]);
+ }
+
+ min_sleep_time = MAX_SCHEDULE_TIMEOUT;
+ list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
+ if (pkt->sleep_time && pkt->sleep_time < min_sleep_time)
+ min_sleep_time = pkt->sleep_time;
+ }
+
+ pkt_dbg(2, pd, "sleeping\n");
+ residue = schedule_timeout(min_sleep_time);
+ pkt_dbg(2, pd, "wake up\n");
+
+ /* make swsusp happy with our thread */
+ try_to_freeze();
+
+ list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
+ if (!pkt->sleep_time)
+ continue;
+ pkt->sleep_time -= min_sleep_time - residue;
+ if (pkt->sleep_time <= 0) {
+ pkt->sleep_time = 0;
+ atomic_inc(&pkt->run_sm);
+ }
+ }
+
+ if (kthread_should_stop())
+ break;
+ }
+work_to_do:
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&pd->wqueue, &wait);
+
+ if (kthread_should_stop())
+ break;
+
+ /*
+ * if pkt_handle_queue returns true, we can queue
+ * another request.
+ */
+ while (pkt_handle_queue(pd))
+ ;
+
+ /*
+ * Handle packet state machine
+ */
+ pkt_handle_packets(pd);
+
+ /*
+ * Handle iosched queues
+ */
+ pkt_iosched_process_queue(pd);
+ }
+
+ return 0;
+}
+
+static void pkt_print_settings(struct pktcdvd_device *pd)
+{
+ pkt_info(pd, "%s packets, %u blocks, Mode-%c disc\n",
+ pd->settings.fp ? "Fixed" : "Variable",
+ pd->settings.size >> 2,
+ pd->settings.block_mode == 8 ? '1' : '2');
+}
+
+static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, int page_code, int page_control)
+{
+ memset(cgc->cmd, 0, sizeof(cgc->cmd));
+
+ cgc->cmd[0] = GPCMD_MODE_SENSE_10;
+ cgc->cmd[2] = page_code | (page_control << 6);
+ cgc->cmd[7] = cgc->buflen >> 8;
+ cgc->cmd[8] = cgc->buflen & 0xff;
+ cgc->data_direction = CGC_DATA_READ;
+ return pkt_generic_packet(pd, cgc);
+}
+
+static int pkt_mode_select(struct pktcdvd_device *pd, struct packet_command *cgc)
+{
+ memset(cgc->cmd, 0, sizeof(cgc->cmd));
+ memset(cgc->buffer, 0, 2);
+ cgc->cmd[0] = GPCMD_MODE_SELECT_10;
+ cgc->cmd[1] = 0x10; /* PF */
+ cgc->cmd[7] = cgc->buflen >> 8;
+ cgc->cmd[8] = cgc->buflen & 0xff;
+ cgc->data_direction = CGC_DATA_WRITE;
+ return pkt_generic_packet(pd, cgc);
+}
+
+static int pkt_get_disc_info(struct pktcdvd_device *pd, disc_information *di)
+{
+ struct packet_command cgc;
+ int ret;
+
+ /* set up command and get the disc info */
+ init_cdrom_command(&cgc, di, sizeof(*di), CGC_DATA_READ);
+ cgc.cmd[0] = GPCMD_READ_DISC_INFO;
+ cgc.cmd[8] = cgc.buflen = 2;
+ cgc.quiet = 1;
+
+ if ((ret = pkt_generic_packet(pd, &cgc)))
+ return ret;
+
+ /* not all drives have the same disc_info length, so requeue
+ * packet with the length the drive tells us it can supply
+ */
+ cgc.buflen = be16_to_cpu(di->disc_information_length) +
+ sizeof(di->disc_information_length);
+
+ if (cgc.buflen > sizeof(disc_information))
+ cgc.buflen = sizeof(disc_information);
+
+ cgc.cmd[8] = cgc.buflen;
+ return pkt_generic_packet(pd, &cgc);
+}
+
+static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type, track_information *ti)
+{
+ struct packet_command cgc;
+ int ret;
+
+ init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ);
+ cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO;
+ cgc.cmd[1] = type & 3;
+ cgc.cmd[4] = (track & 0xff00) >> 8;
+ cgc.cmd[5] = track & 0xff;
+ cgc.cmd[8] = 8;
+ cgc.quiet = 1;
+
+ if ((ret = pkt_generic_packet(pd, &cgc)))
+ return ret;
+
+ cgc.buflen = be16_to_cpu(ti->track_information_length) +
+ sizeof(ti->track_information_length);
+
+ if (cgc.buflen > sizeof(track_information))
+ cgc.buflen = sizeof(track_information);
+
+ cgc.cmd[8] = cgc.buflen;
+ return pkt_generic_packet(pd, &cgc);
+}
+
+static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd,
+ long *last_written)
+{
+ disc_information di;
+ track_information ti;
+ __u32 last_track;
+ int ret = -1;
+
+ if ((ret = pkt_get_disc_info(pd, &di)))
+ return ret;
+
+ last_track = (di.last_track_msb << 8) | di.last_track_lsb;
+ if ((ret = pkt_get_track_info(pd, last_track, 1, &ti)))
+ return ret;
+
+ /* if this track is blank, try the previous. */
+ if (ti.blank) {
+ last_track--;
+ if ((ret = pkt_get_track_info(pd, last_track, 1, &ti)))
+ return ret;
+ }
+
+ /* if last recorded field is valid, return it. */
+ if (ti.lra_v) {
+ *last_written = be32_to_cpu(ti.last_rec_address);
+ } else {
+ /* make it up instead */
+ *last_written = be32_to_cpu(ti.track_start) +
+ be32_to_cpu(ti.track_size);
+ if (ti.free_blocks)
+ *last_written -= (be32_to_cpu(ti.free_blocks) + 7);
+ }
+ return 0;
+}
+
+/*
+ * write mode select package based on pd->settings
+ */
+static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
+{
+ struct packet_command cgc;
+ struct request_sense sense;
+ write_param_page *wp;
+ char buffer[128];
+ int ret, size;
+
+ /* doesn't apply to DVD+RW or DVD-RAM */
+ if ((pd->mmc3_profile == 0x1a) || (pd->mmc3_profile == 0x12))
+ return 0;
+
+ memset(buffer, 0, sizeof(buffer));
+ init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ);
+ cgc.sense = &sense;
+ if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) {
+ pkt_dump_sense(pd, &cgc);
+ return ret;
+ }
+
+ size = 2 + ((buffer[0] << 8) | (buffer[1] & 0xff));
+ pd->mode_offset = (buffer[6] << 8) | (buffer[7] & 0xff);
+ if (size > sizeof(buffer))
+ size = sizeof(buffer);
+
+ /*
+ * now get it all
+ */
+ init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ);
+ cgc.sense = &sense;
+ if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) {
+ pkt_dump_sense(pd, &cgc);
+ return ret;
+ }
+
+ /*
+ * write page is offset header + block descriptor length
+ */
+ wp = (write_param_page *) &buffer[sizeof(struct mode_page_header) + pd->mode_offset];
+
+ wp->fp = pd->settings.fp;
+ wp->track_mode = pd->settings.track_mode;
+ wp->write_type = pd->settings.write_type;
+ wp->data_block_type = pd->settings.block_mode;
+
+ wp->multi_session = 0;
+
+#ifdef PACKET_USE_LS
+ wp->link_size = 7;
+ wp->ls_v = 1;
+#endif
+
+ if (wp->data_block_type == PACKET_BLOCK_MODE1) {
+ wp->session_format = 0;
+ wp->subhdr2 = 0x20;
+ } else if (wp->data_block_type == PACKET_BLOCK_MODE2) {
+ wp->session_format = 0x20;
+ wp->subhdr2 = 8;
+#if 0
+ wp->mcn[0] = 0x80;
+ memcpy(&wp->mcn[1], PACKET_MCN, sizeof(wp->mcn) - 1);
+#endif
+ } else {
+ /*
+ * paranoia
+ */
+ pkt_err(pd, "write mode wrong %d\n", wp->data_block_type);
+ return 1;
+ }
+ wp->packet_size = cpu_to_be32(pd->settings.size >> 2);
+
+ cgc.buflen = cgc.cmd[8] = size;
+ if ((ret = pkt_mode_select(pd, &cgc))) {
+ pkt_dump_sense(pd, &cgc);
+ return ret;
+ }
+
+ pkt_print_settings(pd);
+ return 0;
+}
+
+/*
+ * 1 -- we can write to this track, 0 -- we can't
+ */
+static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti)
+{
+ switch (pd->mmc3_profile) {
+ case 0x1a: /* DVD+RW */
+ case 0x12: /* DVD-RAM */
+ /* The track is always writable on DVD+RW/DVD-RAM */
+ return 1;
+ default:
+ break;
+ }
+
+ if (!ti->packet || !ti->fp)
+ return 0;
+
+ /*
+ * "good" settings as per Mt Fuji.
+ */
+ if (ti->rt == 0 && ti->blank == 0)
+ return 1;
+
+ if (ti->rt == 0 && ti->blank == 1)
+ return 1;
+
+ if (ti->rt == 1 && ti->blank == 0)
+ return 1;
+
+ pkt_err(pd, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet);
+ return 0;
+}
+
+/*
+ * 1 -- we can write to this disc, 0 -- we can't
+ */
+static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di)
+{
+ switch (pd->mmc3_profile) {
+ case 0x0a: /* CD-RW */
+ case 0xffff: /* MMC3 not supported */
+ break;
+ case 0x1a: /* DVD+RW */
+ case 0x13: /* DVD-RW */
+ case 0x12: /* DVD-RAM */
+ return 1;
+ default:
+ pkt_dbg(2, pd, "Wrong disc profile (%x)\n",
+ pd->mmc3_profile);
+ return 0;
+ }
+
+ /*
+ * for disc type 0xff we should probably reserve a new track.
+ * but i'm not sure, should we leave this to user apps? probably.
+ */
+ if (di->disc_type == 0xff) {
+ pkt_notice(pd, "unknown disc - no track?\n");
+ return 0;
+ }
+
+ if (di->disc_type != 0x20 && di->disc_type != 0) {
+ pkt_err(pd, "wrong disc type (%x)\n", di->disc_type);
+ return 0;
+ }
+
+ if (di->erasable == 0) {
+ pkt_notice(pd, "disc not erasable\n");
+ return 0;
+ }
+
+ if (di->border_status == PACKET_SESSION_RESERVED) {
+ pkt_err(pd, "can't write to last track (reserved)\n");
+ return 0;
+ }
+
+ return 1;
+}
+
+static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
+{
+ struct packet_command cgc;
+ unsigned char buf[12];
+ disc_information di;
+ track_information ti;
+ int ret, track;
+
+ init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ);
+ cgc.cmd[0] = GPCMD_GET_CONFIGURATION;
+ cgc.cmd[8] = 8;
+ ret = pkt_generic_packet(pd, &cgc);
+ pd->mmc3_profile = ret ? 0xffff : buf[6] << 8 | buf[7];
+
+ memset(&di, 0, sizeof(disc_information));
+ memset(&ti, 0, sizeof(track_information));
+
+ if ((ret = pkt_get_disc_info(pd, &di))) {
+ pkt_err(pd, "failed get_disc\n");
+ return ret;
+ }
+
+ if (!pkt_writable_disc(pd, &di))
+ return -EROFS;
+
+ pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR;
+
+ track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */
+ if ((ret = pkt_get_track_info(pd, track, 1, &ti))) {
+ pkt_err(pd, "failed get_track\n");
+ return ret;
+ }
+
+ if (!pkt_writable_track(pd, &ti)) {
+ pkt_err(pd, "can't write to this track\n");
+ return -EROFS;
+ }
+
+ /*
+ * we keep packet size in 512 byte units, makes it easier to
+ * deal with request calculations.
+ */
+ pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2;
+ if (pd->settings.size == 0) {
+ pkt_notice(pd, "detected zero packet size!\n");
+ return -ENXIO;
+ }
+ if (pd->settings.size > PACKET_MAX_SECTORS) {
+ pkt_err(pd, "packet size is too big\n");
+ return -EROFS;
+ }
+ pd->settings.fp = ti.fp;
+ pd->offset = (be32_to_cpu(ti.track_start) << 2) & (pd->settings.size - 1);
+
+ if (ti.nwa_v) {
+ pd->nwa = be32_to_cpu(ti.next_writable);
+ set_bit(PACKET_NWA_VALID, &pd->flags);
+ }
+
+ /*
+ * in theory we could use lra on -RW media as well and just zero
+ * blocks that haven't been written yet, but in practice that
+ * is just a no-go. we'll use that for -R, naturally.
+ */
+ if (ti.lra_v) {
+ pd->lra = be32_to_cpu(ti.last_rec_address);
+ set_bit(PACKET_LRA_VALID, &pd->flags);
+ } else {
+ pd->lra = 0xffffffff;
+ set_bit(PACKET_LRA_VALID, &pd->flags);
+ }
+
+ /*
+ * fine for now
+ */
+ pd->settings.link_loss = 7;
+ pd->settings.write_type = 0; /* packet */
+ pd->settings.track_mode = ti.track_mode;
+
+ /*
+ * mode1 or mode2 disc
+ */
+ switch (ti.data_mode) {
+ case PACKET_MODE1:
+ pd->settings.block_mode = PACKET_BLOCK_MODE1;
+ break;
+ case PACKET_MODE2:
+ pd->settings.block_mode = PACKET_BLOCK_MODE2;
+ break;
+ default:
+ pkt_err(pd, "unknown data mode\n");
+ return -EROFS;
+ }
+ return 0;
+}
+
+/*
+ * enable/disable write caching on drive
+ */
+static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd,
+ int set)
+{
+ struct packet_command cgc;
+ struct request_sense sense;
+ unsigned char buf[64];
+ int ret;
+
+ init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ);
+ cgc.sense = &sense;
+ cgc.buflen = pd->mode_offset + 12;
+
+ /*
+ * caching mode page might not be there, so quiet this command
+ */
+ cgc.quiet = 1;
+
+ if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0)))
+ return ret;
+
+ buf[pd->mode_offset + 10] |= (!!set << 2);
+
+ cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff));
+ ret = pkt_mode_select(pd, &cgc);
+ if (ret) {
+ pkt_err(pd, "write caching control failed\n");
+ pkt_dump_sense(pd, &cgc);
+ } else if (!ret && set)
+ pkt_notice(pd, "enabled write caching\n");
+ return ret;
+}
+
+static int pkt_lock_door(struct pktcdvd_device *pd, int lockflag)
+{
+ struct packet_command cgc;
+
+ init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
+ cgc.cmd[0] = GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL;
+ cgc.cmd[4] = lockflag ? 1 : 0;
+ return pkt_generic_packet(pd, &cgc);
+}
+
+/*
+ * Returns drive maximum write speed
+ */
+static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd,
+ unsigned *write_speed)
+{
+ struct packet_command cgc;
+ struct request_sense sense;
+ unsigned char buf[256+18];
+ unsigned char *cap_buf;
+ int ret, offset;
+
+ cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset];
+ init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN);
+ cgc.sense = &sense;
+
+ ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0);
+ if (ret) {
+ cgc.buflen = pd->mode_offset + cap_buf[1] + 2 +
+ sizeof(struct mode_page_header);
+ ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0);
+ if (ret) {
+ pkt_dump_sense(pd, &cgc);
+ return ret;
+ }
+ }
+
+ offset = 20; /* Obsoleted field, used by older drives */
+ if (cap_buf[1] >= 28)
+ offset = 28; /* Current write speed selected */
+ if (cap_buf[1] >= 30) {
+ /* If the drive reports at least one "Logical Unit Write
+ * Speed Performance Descriptor Block", use the information
+ * in the first block. (contains the highest speed)
+ */
+ int num_spdb = (cap_buf[30] << 8) + cap_buf[31];
+ if (num_spdb > 0)
+ offset = 34;
+ }
+
+ *write_speed = (cap_buf[offset] << 8) | cap_buf[offset + 1];
+ return 0;
+}
+
+/* These tables from cdrecord - I don't have orange book */
+/* standard speed CD-RW (1-4x) */
+static char clv_to_speed[16] = {
+ /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */
+ 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+/* high speed CD-RW (-10x) */
+static char hs_clv_to_speed[16] = {
+ /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */
+ 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+/* ultra high speed CD-RW */
+static char us_clv_to_speed[16] = {
+ /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */
+ 0, 2, 4, 8, 0, 0,16, 0,24,32,40,48, 0, 0, 0, 0
+};
+
+/*
+ * reads the maximum media speed from ATIP
+ */
+static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
+ unsigned *speed)
+{
+ struct packet_command cgc;
+ struct request_sense sense;
+ unsigned char buf[64];
+ unsigned int size, st, sp;
+ int ret;
+
+ init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ);
+ cgc.sense = &sense;
+ cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
+ cgc.cmd[1] = 2;
+ cgc.cmd[2] = 4; /* READ ATIP */
+ cgc.cmd[8] = 2;
+ ret = pkt_generic_packet(pd, &cgc);
+ if (ret) {
+ pkt_dump_sense(pd, &cgc);
+ return ret;
+ }
+ size = ((unsigned int) buf[0]<<8) + buf[1] + 2;
+ if (size > sizeof(buf))
+ size = sizeof(buf);
+
+ init_cdrom_command(&cgc, buf, size, CGC_DATA_READ);
+ cgc.sense = &sense;
+ cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
+ cgc.cmd[1] = 2;
+ cgc.cmd[2] = 4;
+ cgc.cmd[8] = size;
+ ret = pkt_generic_packet(pd, &cgc);
+ if (ret) {
+ pkt_dump_sense(pd, &cgc);
+ return ret;
+ }
+
+ if (!(buf[6] & 0x40)) {
+ pkt_notice(pd, "disc type is not CD-RW\n");
+ return 1;
+ }
+ if (!(buf[6] & 0x4)) {
+ pkt_notice(pd, "A1 values on media are not valid, maybe not CDRW?\n");
+ return 1;
+ }
+
+ st = (buf[6] >> 3) & 0x7; /* disc sub-type */
+
+ sp = buf[16] & 0xf; /* max speed from ATIP A1 field */
+
+ /* Info from cdrecord */
+ switch (st) {
+ case 0: /* standard speed */
+ *speed = clv_to_speed[sp];
+ break;
+ case 1: /* high speed */
+ *speed = hs_clv_to_speed[sp];
+ break;
+ case 2: /* ultra high speed */
+ *speed = us_clv_to_speed[sp];
+ break;
+ default:
+ pkt_notice(pd, "unknown disc sub-type %d\n", st);
+ return 1;
+ }
+ if (*speed) {
+ pkt_info(pd, "maximum media speed: %d\n", *speed);
+ return 0;
+ } else {
+ pkt_notice(pd, "unknown speed %d for sub-type %d\n", sp, st);
+ return 1;
+ }
+}
+
+static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd)
+{
+ struct packet_command cgc;
+ struct request_sense sense;
+ int ret;
+
+ pkt_dbg(2, pd, "Performing OPC\n");
+
+ init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
+ cgc.sense = &sense;
+ cgc.timeout = 60*HZ;
+ cgc.cmd[0] = GPCMD_SEND_OPC;
+ cgc.cmd[1] = 1;
+ if ((ret = pkt_generic_packet(pd, &cgc)))
+ pkt_dump_sense(pd, &cgc);
+ return ret;
+}
+
+static int pkt_open_write(struct pktcdvd_device *pd)
+{
+ int ret;
+ unsigned int write_speed, media_write_speed, read_speed;
+
+ if ((ret = pkt_probe_settings(pd))) {
+ pkt_dbg(2, pd, "failed probe\n");
+ return ret;
+ }
+
+ if ((ret = pkt_set_write_settings(pd))) {
+ pkt_dbg(1, pd, "failed saving write settings\n");
+ return -EIO;
+ }
+
+ pkt_write_caching(pd, USE_WCACHING);
+
+ if ((ret = pkt_get_max_speed(pd, &write_speed)))
+ write_speed = 16 * 177;
+ switch (pd->mmc3_profile) {
+ case 0x13: /* DVD-RW */
+ case 0x1a: /* DVD+RW */
+ case 0x12: /* DVD-RAM */
+ pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed);
+ break;
+ default:
+ if ((ret = pkt_media_speed(pd, &media_write_speed)))
+ media_write_speed = 16;
+ write_speed = min(write_speed, media_write_speed * 177);
+ pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176);
+ break;
+ }
+ read_speed = write_speed;
+
+ if ((ret = pkt_set_speed(pd, write_speed, read_speed))) {
+ pkt_dbg(1, pd, "couldn't set write speed\n");
+ return -EIO;
+ }
+ pd->write_speed = write_speed;
+ pd->read_speed = read_speed;
+
+ if ((ret = pkt_perform_opc(pd))) {
+ pkt_dbg(1, pd, "Optimum Power Calibration failed\n");
+ }
+
+ return 0;
+}
+
+/*
+ * called at open time.
+ */
+static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
+{
+ int ret;
+ long lba;
+ struct request_queue *q;
+
+ /*
+ * We need to re-open the cdrom device without O_NONBLOCK to be able
+ * to read/write from/to it. It is already opened in O_NONBLOCK mode
+ * so bdget() can't fail.
+ */
+ bdget(pd->bdev->bd_dev);
+ if ((ret = blkdev_get(pd->bdev, FMODE_READ | FMODE_EXCL, pd)))
+ goto out;
+
+ if ((ret = pkt_get_last_written(pd, &lba))) {
+ pkt_err(pd, "pkt_get_last_written failed\n");
+ goto out_putdev;
+ }
+
+ set_capacity(pd->disk, lba << 2);
+ set_capacity(pd->bdev->bd_disk, lba << 2);
+ bd_set_size(pd->bdev, (loff_t)lba << 11);
+
+ q = bdev_get_queue(pd->bdev);
+ if (write) {
+ if ((ret = pkt_open_write(pd)))
+ goto out_putdev;
+ /*
+ * Some CDRW drives can not handle writes larger than one packet,
+ * even if the size is a multiple of the packet size.
+ */
+ spin_lock_irq(q->queue_lock);
+ blk_queue_max_hw_sectors(q, pd->settings.size);
+ spin_unlock_irq(q->queue_lock);
+ set_bit(PACKET_WRITABLE, &pd->flags);
+ } else {
+ pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
+ clear_bit(PACKET_WRITABLE, &pd->flags);
+ }
+
+ if ((ret = pkt_set_segment_merging(pd, q)))
+ goto out_putdev;
+
+ if (write) {
+ if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) {
+ pkt_err(pd, "not enough memory for buffers\n");
+ ret = -ENOMEM;
+ goto out_putdev;
+ }
+ pkt_info(pd, "%lukB available on disc\n", lba << 1);
+ }
+
+ return 0;
+
+out_putdev:
+ blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
+out:
+ return ret;
+}
+
+/*
+ * called when the device is closed. makes sure that the device flushes
+ * the internal cache before we close.
+ */
+static void pkt_release_dev(struct pktcdvd_device *pd, int flush)
+{
+ if (flush && pkt_flush_cache(pd))
+ pkt_dbg(1, pd, "not flushing cache\n");
+
+ pkt_lock_door(pd, 0);
+
+ pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
+ blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
+
+ pkt_shrink_pktlist(pd);
+}
+
+static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor)
+{
+ if (dev_minor >= MAX_WRITERS)
+ return NULL;
+ return pkt_devs[dev_minor];
+}
+
+static int pkt_open(struct block_device *bdev, fmode_t mode)
+{
+ struct pktcdvd_device *pd = NULL;
+ int ret;
+
+ mutex_lock(&pktcdvd_mutex);
+ mutex_lock(&ctl_mutex);
+ pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev));
+ if (!pd) {
+ ret = -ENODEV;
+ goto out;
+ }
+ BUG_ON(pd->refcnt < 0);
+
+ pd->refcnt++;
+ if (pd->refcnt > 1) {
+ if ((mode & FMODE_WRITE) &&
+ !test_bit(PACKET_WRITABLE, &pd->flags)) {
+ ret = -EBUSY;
+ goto out_dec;
+ }
+ } else {
+ ret = pkt_open_dev(pd, mode & FMODE_WRITE);
+ if (ret)
+ goto out_dec;
+ /*
+ * needed here as well, since ext2 (among others) may change
+ * the blocksize at mount time
+ */
+ set_blocksize(bdev, CD_FRAMESIZE);
+ }
+
+ mutex_unlock(&ctl_mutex);
+ mutex_unlock(&pktcdvd_mutex);
+ return 0;
+
+out_dec:
+ pd->refcnt--;
+out:
+ mutex_unlock(&ctl_mutex);
+ mutex_unlock(&pktcdvd_mutex);
+ return ret;
+}
+
+static void pkt_close(struct gendisk *disk, fmode_t mode)
+{
+ struct pktcdvd_device *pd = disk->private_data;
+
+ mutex_lock(&pktcdvd_mutex);
+ mutex_lock(&ctl_mutex);
+ pd->refcnt--;
+ BUG_ON(pd->refcnt < 0);
+ if (pd->refcnt == 0) {
+ int flush = test_bit(PACKET_WRITABLE, &pd->flags);
+ pkt_release_dev(pd, flush);
+ }
+ mutex_unlock(&ctl_mutex);
+ mutex_unlock(&pktcdvd_mutex);
+}
+
+
+static void pkt_end_io_read_cloned(struct bio *bio, int err)
+{
+ struct packet_stacked_data *psd = bio->bi_private;
+ struct pktcdvd_device *pd = psd->pd;
+
+ bio_put(bio);
+ bio_endio(psd->bio, err);
+ mempool_free(psd, psd_pool);
+ pkt_bio_finished(pd);
+}
+
+static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio)
+{
+ struct bio *cloned_bio = bio_clone(bio, GFP_NOIO);
+ struct packet_stacked_data *psd = mempool_alloc(psd_pool, GFP_NOIO);
+
+ psd->pd = pd;
+ psd->bio = bio;
+ cloned_bio->bi_bdev = pd->bdev;
+ cloned_bio->bi_private = psd;
+ cloned_bio->bi_end_io = pkt_end_io_read_cloned;
+ pd->stats.secs_r += bio_sectors(bio);
+ pkt_queue_bio(pd, cloned_bio);
+}
+
+static void pkt_make_request_write(struct request_queue *q, struct bio *bio)
+{
+ struct pktcdvd_device *pd = q->queuedata;
+ sector_t zone;
+ struct packet_data *pkt;
+ int was_empty, blocked_bio;
+ struct pkt_rb_node *node;
+
+ zone = get_zone(bio->bi_iter.bi_sector, pd);
+
+ /*
+ * If we find a matching packet in state WAITING or READ_WAIT, we can
+ * just append this bio to that packet.
+ */
+ spin_lock(&pd->cdrw.active_list_lock);
+ blocked_bio = 0;
+ list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
+ if (pkt->sector == zone) {
+ spin_lock(&pkt->lock);
+ if ((pkt->state == PACKET_WAITING_STATE) ||
+ (pkt->state == PACKET_READ_WAIT_STATE)) {
+ bio_list_add(&pkt->orig_bios, bio);
+ pkt->write_size +=
+ bio->bi_iter.bi_size / CD_FRAMESIZE;
+ if ((pkt->write_size >= pkt->frames) &&
+ (pkt->state == PACKET_WAITING_STATE)) {
+ atomic_inc(&pkt->run_sm);
+ wake_up(&pd->wqueue);
+ }
+ spin_unlock(&pkt->lock);
+ spin_unlock(&pd->cdrw.active_list_lock);
+ return;
+ } else {
+ blocked_bio = 1;
+ }
+ spin_unlock(&pkt->lock);
+ }
+ }
+ spin_unlock(&pd->cdrw.active_list_lock);
+
+ /*
+ * Test if there is enough room left in the bio work queue
+ * (queue size >= congestion on mark).
+ * If not, wait till the work queue size is below the congestion off mark.
+ */
+ spin_lock(&pd->lock);
+ if (pd->write_congestion_on > 0
+ && pd->bio_queue_size >= pd->write_congestion_on) {
+ set_bdi_congested(&q->backing_dev_info, BLK_RW_ASYNC);
+ do {
+ spin_unlock(&pd->lock);
+ congestion_wait(BLK_RW_ASYNC, HZ);
+ spin_lock(&pd->lock);
+ } while(pd->bio_queue_size > pd->write_congestion_off);
+ }
+ spin_unlock(&pd->lock);
+
+ /*
+ * No matching packet found. Store the bio in the work queue.
+ */
+ node = mempool_alloc(pd->rb_pool, GFP_NOIO);
+ node->bio = bio;
+ spin_lock(&pd->lock);
+ BUG_ON(pd->bio_queue_size < 0);
+ was_empty = (pd->bio_queue_size == 0);
+ pkt_rbtree_insert(pd, node);
+ spin_unlock(&pd->lock);
+
+ /*
+ * Wake up the worker thread.
+ */
+ atomic_set(&pd->scan_queue, 1);
+ if (was_empty) {
+ /* This wake_up is required for correct operation */
+ wake_up(&pd->wqueue);
+ } else if (!list_empty(&pd->cdrw.pkt_free_list) && !blocked_bio) {
+ /*
+ * This wake up is not required for correct operation,
+ * but improves performance in some cases.
+ */
+ wake_up(&pd->wqueue);
+ }
+}
+
+static void pkt_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct pktcdvd_device *pd;
+ char b[BDEVNAME_SIZE];
+ struct bio *split;
+
+ pd = q->queuedata;
+ if (!pd) {
+ pr_err("%s incorrect request queue\n",
+ bdevname(bio->bi_bdev, b));
+ goto end_io;
+ }
+
+ pkt_dbg(2, pd, "start = %6llx stop = %6llx\n",
+ (unsigned long long)bio->bi_iter.bi_sector,
+ (unsigned long long)bio_end_sector(bio));
+
+ /*
+ * Clone READ bios so we can have our own bi_end_io callback.
+ */
+ if (bio_data_dir(bio) == READ) {
+ pkt_make_request_read(pd, bio);
+ return;
+ }
+
+ if (!test_bit(PACKET_WRITABLE, &pd->flags)) {
+ pkt_notice(pd, "WRITE for ro device (%llu)\n",
+ (unsigned long long)bio->bi_iter.bi_sector);
+ goto end_io;
+ }
+
+ if (!bio->bi_iter.bi_size || (bio->bi_iter.bi_size % CD_FRAMESIZE)) {
+ pkt_err(pd, "wrong bio size\n");
+ goto end_io;
+ }
+
+ blk_queue_bounce(q, &bio);
+
+ do {
+ sector_t zone = get_zone(bio->bi_iter.bi_sector, pd);
+ sector_t last_zone = get_zone(bio_end_sector(bio) - 1, pd);
+
+ if (last_zone != zone) {
+ BUG_ON(last_zone != zone + pd->settings.size);
+
+ split = bio_split(bio, last_zone -
+ bio->bi_iter.bi_sector,
+ GFP_NOIO, fs_bio_set);
+ bio_chain(split, bio);
+ } else {
+ split = bio;
+ }
+
+ pkt_make_request_write(q, split);
+ } while (split != bio);
+
+ return;
+end_io:
+ bio_io_error(bio);
+}
+
+
+
+static int pkt_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
+ struct bio_vec *bvec)
+{
+ struct pktcdvd_device *pd = q->queuedata;
+ sector_t zone = get_zone(bmd->bi_sector, pd);
+ int used = ((bmd->bi_sector - zone) << 9) + bmd->bi_size;
+ int remaining = (pd->settings.size << 9) - used;
+ int remaining2;
+
+ /*
+ * A bio <= PAGE_SIZE must be allowed. If it crosses a packet
+ * boundary, pkt_make_request() will split the bio.
+ */
+ remaining2 = PAGE_SIZE - bmd->bi_size;
+ remaining = max(remaining, remaining2);
+
+ BUG_ON(remaining < 0);
+ return remaining;
+}
+
+static void pkt_init_queue(struct pktcdvd_device *pd)
+{
+ struct request_queue *q = pd->disk->queue;
+
+ blk_queue_make_request(q, pkt_make_request);
+ blk_queue_logical_block_size(q, CD_FRAMESIZE);
+ blk_queue_max_hw_sectors(q, PACKET_MAX_SECTORS);
+ blk_queue_merge_bvec(q, pkt_merge_bvec);
+ q->queuedata = pd;
+}
+
+static int pkt_seq_show(struct seq_file *m, void *p)
+{
+ struct pktcdvd_device *pd = m->private;
+ char *msg;
+ char bdev_buf[BDEVNAME_SIZE];
+ int states[PACKET_NUM_STATES];
+
+ seq_printf(m, "Writer %s mapped to %s:\n", pd->name,
+ bdevname(pd->bdev, bdev_buf));
+
+ seq_printf(m, "\nSettings:\n");
+ seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2);
+
+ if (pd->settings.write_type == 0)
+ msg = "Packet";
+ else
+ msg = "Unknown";
+ seq_printf(m, "\twrite type:\t\t%s\n", msg);
+
+ seq_printf(m, "\tpacket type:\t\t%s\n", pd->settings.fp ? "Fixed" : "Variable");
+ seq_printf(m, "\tlink loss:\t\t%d\n", pd->settings.link_loss);
+
+ seq_printf(m, "\ttrack mode:\t\t%d\n", pd->settings.track_mode);
+
+ if (pd->settings.block_mode == PACKET_BLOCK_MODE1)
+ msg = "Mode 1";
+ else if (pd->settings.block_mode == PACKET_BLOCK_MODE2)
+ msg = "Mode 2";
+ else
+ msg = "Unknown";
+ seq_printf(m, "\tblock mode:\t\t%s\n", msg);
+
+ seq_printf(m, "\nStatistics:\n");
+ seq_printf(m, "\tpackets started:\t%lu\n", pd->stats.pkt_started);
+ seq_printf(m, "\tpackets ended:\t\t%lu\n", pd->stats.pkt_ended);
+ seq_printf(m, "\twritten:\t\t%lukB\n", pd->stats.secs_w >> 1);
+ seq_printf(m, "\tread gather:\t\t%lukB\n", pd->stats.secs_rg >> 1);
+ seq_printf(m, "\tread:\t\t\t%lukB\n", pd->stats.secs_r >> 1);
+
+ seq_printf(m, "\nMisc:\n");
+ seq_printf(m, "\treference count:\t%d\n", pd->refcnt);
+ seq_printf(m, "\tflags:\t\t\t0x%lx\n", pd->flags);
+ seq_printf(m, "\tread speed:\t\t%ukB/s\n", pd->read_speed);
+ seq_printf(m, "\twrite speed:\t\t%ukB/s\n", pd->write_speed);
+ seq_printf(m, "\tstart offset:\t\t%lu\n", pd->offset);
+ seq_printf(m, "\tmode page offset:\t%u\n", pd->mode_offset);
+
+ seq_printf(m, "\nQueue state:\n");
+ seq_printf(m, "\tbios queued:\t\t%d\n", pd->bio_queue_size);
+ seq_printf(m, "\tbios pending:\t\t%d\n", atomic_read(&pd->cdrw.pending_bios));
+ seq_printf(m, "\tcurrent sector:\t\t0x%llx\n", (unsigned long long)pd->current_sector);
+
+ pkt_count_states(pd, states);
+ seq_printf(m, "\tstate:\t\t\ti:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n",
+ states[0], states[1], states[2], states[3], states[4], states[5]);
+
+ seq_printf(m, "\twrite congestion marks:\toff=%d on=%d\n",
+ pd->write_congestion_off,
+ pd->write_congestion_on);
+ return 0;
+}
+
+static int pkt_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, pkt_seq_show, PDE_DATA(inode));
+}
+
+static const struct file_operations pkt_proc_fops = {
+ .open = pkt_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release
+};
+
+static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
+{
+ int i;
+ int ret = 0;
+ char b[BDEVNAME_SIZE];
+ struct block_device *bdev;
+
+ if (pd->pkt_dev == dev) {
+ pkt_err(pd, "recursive setup not allowed\n");
+ return -EBUSY;
+ }
+ for (i = 0; i < MAX_WRITERS; i++) {
+ struct pktcdvd_device *pd2 = pkt_devs[i];
+ if (!pd2)
+ continue;
+ if (pd2->bdev->bd_dev == dev) {
+ pkt_err(pd, "%s already setup\n",
+ bdevname(pd2->bdev, b));
+ return -EBUSY;
+ }
+ if (pd2->pkt_dev == dev) {
+ pkt_err(pd, "can't chain pktcdvd devices\n");
+ return -EBUSY;
+ }
+ }
+
+ bdev = bdget(dev);
+ if (!bdev)
+ return -ENOMEM;
+ ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
+ if (ret)
+ return ret;
+
+ /* This is safe, since we have a reference from open(). */
+ __module_get(THIS_MODULE);
+
+ pd->bdev = bdev;
+ set_blocksize(bdev, CD_FRAMESIZE);
+
+ pkt_init_queue(pd);
+
+ atomic_set(&pd->cdrw.pending_bios, 0);
+ pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name);
+ if (IS_ERR(pd->cdrw.thread)) {
+ pkt_err(pd, "can't start kernel thread\n");
+ ret = -ENOMEM;
+ goto out_mem;
+ }
+
+ proc_create_data(pd->name, 0, pkt_proc, &pkt_proc_fops, pd);
+ pkt_dbg(1, pd, "writer mapped to %s\n", bdevname(bdev, b));
+ return 0;
+
+out_mem:
+ blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
+ /* This is safe: open() is still holding a reference. */
+ module_put(THIS_MODULE);
+ return ret;
+}
+
+static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
+{
+ struct pktcdvd_device *pd = bdev->bd_disk->private_data;
+ int ret;
+
+ pkt_dbg(2, pd, "cmd %x, dev %d:%d\n",
+ cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
+
+ mutex_lock(&pktcdvd_mutex);
+ switch (cmd) {
+ case CDROMEJECT:
+ /*
+ * The door gets locked when the device is opened, so we
+ * have to unlock it or else the eject command fails.
+ */
+ if (pd->refcnt == 1)
+ pkt_lock_door(pd, 0);
+ /* fallthru */
+ /*
+ * forward selected CDROM ioctls to CD-ROM, for UDF
+ */
+ case CDROMMULTISESSION:
+ case CDROMREADTOCENTRY:
+ case CDROM_LAST_WRITTEN:
+ case CDROM_SEND_PACKET:
+ case SCSI_IOCTL_SEND_COMMAND:
+ ret = __blkdev_driver_ioctl(pd->bdev, mode, cmd, arg);
+ break;
+
+ default:
+ pkt_dbg(2, pd, "Unknown ioctl (%x)\n", cmd);
+ ret = -ENOTTY;
+ }
+ mutex_unlock(&pktcdvd_mutex);
+
+ return ret;
+}
+
+static unsigned int pkt_check_events(struct gendisk *disk,
+ unsigned int clearing)
+{
+ struct pktcdvd_device *pd = disk->private_data;
+ struct gendisk *attached_disk;
+
+ if (!pd)
+ return 0;
+ if (!pd->bdev)
+ return 0;
+ attached_disk = pd->bdev->bd_disk;
+ if (!attached_disk || !attached_disk->fops->check_events)
+ return 0;
+ return attached_disk->fops->check_events(attached_disk, clearing);
+}
+
+static const struct block_device_operations pktcdvd_ops = {
+ .owner = THIS_MODULE,
+ .open = pkt_open,
+ .release = pkt_close,
+ .ioctl = pkt_ioctl,
+ .check_events = pkt_check_events,
+};
+
+static char *pktcdvd_devnode(struct gendisk *gd, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, "pktcdvd/%s", gd->disk_name);
+}
+
+/*
+ * Set up mapping from pktcdvd device to CD-ROM device.
+ */
+static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
+{
+ int idx;
+ int ret = -ENOMEM;
+ struct pktcdvd_device *pd;
+ struct gendisk *disk;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ for (idx = 0; idx < MAX_WRITERS; idx++)
+ if (!pkt_devs[idx])
+ break;
+ if (idx == MAX_WRITERS) {
+ pr_err("max %d writers supported\n", MAX_WRITERS);
+ ret = -EBUSY;
+ goto out_mutex;
+ }
+
+ pd = kzalloc(sizeof(struct pktcdvd_device), GFP_KERNEL);
+ if (!pd)
+ goto out_mutex;
+
+ pd->rb_pool = mempool_create_kmalloc_pool(PKT_RB_POOL_SIZE,
+ sizeof(struct pkt_rb_node));
+ if (!pd->rb_pool)
+ goto out_mem;
+
+ INIT_LIST_HEAD(&pd->cdrw.pkt_free_list);
+ INIT_LIST_HEAD(&pd->cdrw.pkt_active_list);
+ spin_lock_init(&pd->cdrw.active_list_lock);
+
+ spin_lock_init(&pd->lock);
+ spin_lock_init(&pd->iosched.lock);
+ bio_list_init(&pd->iosched.read_queue);
+ bio_list_init(&pd->iosched.write_queue);
+ sprintf(pd->name, DRIVER_NAME"%d", idx);
+ init_waitqueue_head(&pd->wqueue);
+ pd->bio_queue = RB_ROOT;
+
+ pd->write_congestion_on = write_congestion_on;
+ pd->write_congestion_off = write_congestion_off;
+
+ disk = alloc_disk(1);
+ if (!disk)
+ goto out_mem;
+ pd->disk = disk;
+ disk->major = pktdev_major;
+ disk->first_minor = idx;
+ disk->fops = &pktcdvd_ops;
+ disk->flags = GENHD_FL_REMOVABLE;
+ strcpy(disk->disk_name, pd->name);
+ disk->devnode = pktcdvd_devnode;
+ disk->private_data = pd;
+ disk->queue = blk_alloc_queue(GFP_KERNEL);
+ if (!disk->queue)
+ goto out_mem2;
+
+ pd->pkt_dev = MKDEV(pktdev_major, idx);
+ ret = pkt_new_dev(pd, dev);
+ if (ret)
+ goto out_new_dev;
+
+ /* inherit events of the host device */
+ disk->events = pd->bdev->bd_disk->events;
+ disk->async_events = pd->bdev->bd_disk->async_events;
+
+ add_disk(disk);
+
+ pkt_sysfs_dev_new(pd);
+ pkt_debugfs_dev_new(pd);
+
+ pkt_devs[idx] = pd;
+ if (pkt_dev)
+ *pkt_dev = pd->pkt_dev;
+
+ mutex_unlock(&ctl_mutex);
+ return 0;
+
+out_new_dev:
+ blk_cleanup_queue(disk->queue);
+out_mem2:
+ put_disk(disk);
+out_mem:
+ if (pd->rb_pool)
+ mempool_destroy(pd->rb_pool);
+ kfree(pd);
+out_mutex:
+ mutex_unlock(&ctl_mutex);
+ pr_err("setup of pktcdvd device failed\n");
+ return ret;
+}
+
+/*
+ * Tear down mapping from pktcdvd device to CD-ROM device.
+ */
+static int pkt_remove_dev(dev_t pkt_dev)
+{
+ struct pktcdvd_device *pd;
+ int idx;
+ int ret = 0;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ for (idx = 0; idx < MAX_WRITERS; idx++) {
+ pd = pkt_devs[idx];
+ if (pd && (pd->pkt_dev == pkt_dev))
+ break;
+ }
+ if (idx == MAX_WRITERS) {
+ pr_debug("dev not setup\n");
+ ret = -ENXIO;
+ goto out;
+ }
+
+ if (pd->refcnt > 0) {
+ ret = -EBUSY;
+ goto out;
+ }
+ if (!IS_ERR(pd->cdrw.thread))
+ kthread_stop(pd->cdrw.thread);
+
+ pkt_devs[idx] = NULL;
+
+ pkt_debugfs_dev_remove(pd);
+ pkt_sysfs_dev_remove(pd);
+
+ blkdev_put(pd->bdev, FMODE_READ | FMODE_NDELAY);
+
+ remove_proc_entry(pd->name, pkt_proc);
+ pkt_dbg(1, pd, "writer unmapped\n");
+
+ del_gendisk(pd->disk);
+ blk_cleanup_queue(pd->disk->queue);
+ put_disk(pd->disk);
+
+ mempool_destroy(pd->rb_pool);
+ kfree(pd);
+
+ /* This is safe: open() is still holding a reference. */
+ module_put(THIS_MODULE);
+
+out:
+ mutex_unlock(&ctl_mutex);
+ return ret;
+}
+
+static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd)
+{
+ struct pktcdvd_device *pd;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ pd = pkt_find_dev_from_minor(ctrl_cmd->dev_index);
+ if (pd) {
+ ctrl_cmd->dev = new_encode_dev(pd->bdev->bd_dev);
+ ctrl_cmd->pkt_dev = new_encode_dev(pd->pkt_dev);
+ } else {
+ ctrl_cmd->dev = 0;
+ ctrl_cmd->pkt_dev = 0;
+ }
+ ctrl_cmd->num_devices = MAX_WRITERS;
+
+ mutex_unlock(&ctl_mutex);
+}
+
+static long pkt_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+ struct pkt_ctrl_command ctrl_cmd;
+ int ret = 0;
+ dev_t pkt_dev = 0;
+
+ if (cmd != PACKET_CTRL_CMD)
+ return -ENOTTY;
+
+ if (copy_from_user(&ctrl_cmd, argp, sizeof(struct pkt_ctrl_command)))
+ return -EFAULT;
+
+ switch (ctrl_cmd.command) {
+ case PKT_CTRL_CMD_SETUP:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ ret = pkt_setup_dev(new_decode_dev(ctrl_cmd.dev), &pkt_dev);
+ ctrl_cmd.pkt_dev = new_encode_dev(pkt_dev);
+ break;
+ case PKT_CTRL_CMD_TEARDOWN:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ ret = pkt_remove_dev(new_decode_dev(ctrl_cmd.pkt_dev));
+ break;
+ case PKT_CTRL_CMD_STATUS:
+ pkt_get_status(&ctrl_cmd);
+ break;
+ default:
+ return -ENOTTY;
+ }
+
+ if (copy_to_user(argp, &ctrl_cmd, sizeof(struct pkt_ctrl_command)))
+ return -EFAULT;
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static long pkt_ctl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ return pkt_ctl_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+static const struct file_operations pkt_ctl_fops = {
+ .open = nonseekable_open,
+ .unlocked_ioctl = pkt_ctl_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = pkt_ctl_compat_ioctl,
+#endif
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+};
+
+static struct miscdevice pkt_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = DRIVER_NAME,
+ .nodename = "pktcdvd/control",
+ .fops = &pkt_ctl_fops
+};
+
+static int __init pkt_init(void)
+{
+ int ret;
+
+ mutex_init(&ctl_mutex);
+
+ psd_pool = mempool_create_kmalloc_pool(PSD_POOL_SIZE,
+ sizeof(struct packet_stacked_data));
+ if (!psd_pool)
+ return -ENOMEM;
+
+ ret = register_blkdev(pktdev_major, DRIVER_NAME);
+ if (ret < 0) {
+ pr_err("unable to register block device\n");
+ goto out2;
+ }
+ if (!pktdev_major)
+ pktdev_major = ret;
+
+ ret = pkt_sysfs_init();
+ if (ret)
+ goto out;
+
+ pkt_debugfs_init();
+
+ ret = misc_register(&pkt_misc);
+ if (ret) {
+ pr_err("unable to register misc device\n");
+ goto out_misc;
+ }
+
+ pkt_proc = proc_mkdir("driver/"DRIVER_NAME, NULL);
+
+ return 0;
+
+out_misc:
+ pkt_debugfs_cleanup();
+ pkt_sysfs_cleanup();
+out:
+ unregister_blkdev(pktdev_major, DRIVER_NAME);
+out2:
+ mempool_destroy(psd_pool);
+ return ret;
+}
+
+static void __exit pkt_exit(void)
+{
+ remove_proc_entry("driver/"DRIVER_NAME, NULL);
+ misc_deregister(&pkt_misc);
+
+ pkt_debugfs_cleanup();
+ pkt_sysfs_cleanup();
+
+ unregister_blkdev(pktdev_major, DRIVER_NAME);
+ mempool_destroy(psd_pool);
+}
+
+MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives");
+MODULE_AUTHOR("Jens Axboe <axboe@suse.de>");
+MODULE_LICENSE("GPL");
+
+module_init(pkt_init);
+module_exit(pkt_exit);
diff --git a/drivers/block/pmem.c b/drivers/block/pmem.c
new file mode 100644
index 000000000..eabf4a8d0
--- /dev/null
+++ b/drivers/block/pmem.c
@@ -0,0 +1,262 @@
+/*
+ * Persistent Memory Driver
+ *
+ * Copyright (c) 2014, Intel Corporation.
+ * Copyright (c) 2015, Christoph Hellwig <hch@lst.de>.
+ * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <asm/cacheflush.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+
+#define PMEM_MINORS 16
+
+struct pmem_device {
+ struct request_queue *pmem_queue;
+ struct gendisk *pmem_disk;
+
+ /* One contiguous memory region per device */
+ phys_addr_t phys_addr;
+ void *virt_addr;
+ size_t size;
+};
+
+static int pmem_major;
+static atomic_t pmem_index;
+
+static void pmem_do_bvec(struct pmem_device *pmem, struct page *page,
+ unsigned int len, unsigned int off, int rw,
+ sector_t sector)
+{
+ void *mem = kmap_atomic(page);
+ size_t pmem_off = sector << 9;
+
+ if (rw == READ) {
+ memcpy(mem + off, pmem->virt_addr + pmem_off, len);
+ flush_dcache_page(page);
+ } else {
+ flush_dcache_page(page);
+ memcpy(pmem->virt_addr + pmem_off, mem + off, len);
+ }
+
+ kunmap_atomic(mem);
+}
+
+static void pmem_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct block_device *bdev = bio->bi_bdev;
+ struct pmem_device *pmem = bdev->bd_disk->private_data;
+ int rw;
+ struct bio_vec bvec;
+ sector_t sector;
+ struct bvec_iter iter;
+ int err = 0;
+
+ if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) {
+ err = -EIO;
+ goto out;
+ }
+
+ BUG_ON(bio->bi_rw & REQ_DISCARD);
+
+ rw = bio_data_dir(bio);
+ sector = bio->bi_iter.bi_sector;
+ bio_for_each_segment(bvec, bio, iter) {
+ pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, bvec.bv_offset,
+ rw, sector);
+ sector += bvec.bv_len >> 9;
+ }
+
+out:
+ bio_endio(bio, err);
+}
+
+static int pmem_rw_page(struct block_device *bdev, sector_t sector,
+ struct page *page, int rw)
+{
+ struct pmem_device *pmem = bdev->bd_disk->private_data;
+
+ pmem_do_bvec(pmem, page, PAGE_CACHE_SIZE, 0, rw, sector);
+ page_endio(page, rw & WRITE, 0);
+
+ return 0;
+}
+
+static long pmem_direct_access(struct block_device *bdev, sector_t sector,
+ void **kaddr, unsigned long *pfn, long size)
+{
+ struct pmem_device *pmem = bdev->bd_disk->private_data;
+ size_t offset = sector << 9;
+
+ if (!pmem)
+ return -ENODEV;
+
+ *kaddr = pmem->virt_addr + offset;
+ *pfn = (pmem->phys_addr + offset) >> PAGE_SHIFT;
+
+ return pmem->size - offset;
+}
+
+static const struct block_device_operations pmem_fops = {
+ .owner = THIS_MODULE,
+ .rw_page = pmem_rw_page,
+ .direct_access = pmem_direct_access,
+};
+
+static struct pmem_device *pmem_alloc(struct device *dev, struct resource *res)
+{
+ struct pmem_device *pmem;
+ struct gendisk *disk;
+ int idx, err;
+
+ err = -ENOMEM;
+ pmem = kzalloc(sizeof(*pmem), GFP_KERNEL);
+ if (!pmem)
+ goto out;
+
+ pmem->phys_addr = res->start;
+ pmem->size = resource_size(res);
+
+ err = -EINVAL;
+ if (!request_mem_region(pmem->phys_addr, pmem->size, "pmem")) {
+ dev_warn(dev, "could not reserve region [0x%pa:0x%zx]\n", &pmem->phys_addr, pmem->size);
+ goto out_free_dev;
+ }
+
+ /*
+ * Map the memory as non-cachable, as we can't write back the contents
+ * of the CPU caches in case of a crash.
+ */
+ err = -ENOMEM;
+ pmem->virt_addr = ioremap_nocache(pmem->phys_addr, pmem->size);
+ if (!pmem->virt_addr)
+ goto out_release_region;
+
+ pmem->pmem_queue = blk_alloc_queue(GFP_KERNEL);
+ if (!pmem->pmem_queue)
+ goto out_unmap;
+
+ blk_queue_make_request(pmem->pmem_queue, pmem_make_request);
+ blk_queue_max_hw_sectors(pmem->pmem_queue, 1024);
+ blk_queue_bounce_limit(pmem->pmem_queue, BLK_BOUNCE_ANY);
+
+ disk = alloc_disk(PMEM_MINORS);
+ if (!disk)
+ goto out_free_queue;
+
+ idx = atomic_inc_return(&pmem_index) - 1;
+
+ disk->major = pmem_major;
+ disk->first_minor = PMEM_MINORS * idx;
+ disk->fops = &pmem_fops;
+ disk->private_data = pmem;
+ disk->queue = pmem->pmem_queue;
+ disk->flags = GENHD_FL_EXT_DEVT;
+ sprintf(disk->disk_name, "pmem%d", idx);
+ disk->driverfs_dev = dev;
+ set_capacity(disk, pmem->size >> 9);
+ pmem->pmem_disk = disk;
+
+ add_disk(disk);
+
+ return pmem;
+
+out_free_queue:
+ blk_cleanup_queue(pmem->pmem_queue);
+out_unmap:
+ iounmap(pmem->virt_addr);
+out_release_region:
+ release_mem_region(pmem->phys_addr, pmem->size);
+out_free_dev:
+ kfree(pmem);
+out:
+ return ERR_PTR(err);
+}
+
+static void pmem_free(struct pmem_device *pmem)
+{
+ del_gendisk(pmem->pmem_disk);
+ put_disk(pmem->pmem_disk);
+ blk_cleanup_queue(pmem->pmem_queue);
+ iounmap(pmem->virt_addr);
+ release_mem_region(pmem->phys_addr, pmem->size);
+ kfree(pmem);
+}
+
+static int pmem_probe(struct platform_device *pdev)
+{
+ struct pmem_device *pmem;
+ struct resource *res;
+
+ if (WARN_ON(pdev->num_resources > 1))
+ return -ENXIO;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!res)
+ return -ENXIO;
+
+ pmem = pmem_alloc(&pdev->dev, res);
+ if (IS_ERR(pmem))
+ return PTR_ERR(pmem);
+
+ platform_set_drvdata(pdev, pmem);
+
+ return 0;
+}
+
+static int pmem_remove(struct platform_device *pdev)
+{
+ struct pmem_device *pmem = platform_get_drvdata(pdev);
+
+ pmem_free(pmem);
+ return 0;
+}
+
+static struct platform_driver pmem_driver = {
+ .probe = pmem_probe,
+ .remove = pmem_remove,
+ .driver = {
+ .owner = THIS_MODULE,
+ .name = "pmem",
+ },
+};
+
+static int __init pmem_init(void)
+{
+ int error;
+
+ pmem_major = register_blkdev(0, "pmem");
+ if (pmem_major < 0)
+ return pmem_major;
+
+ error = platform_driver_register(&pmem_driver);
+ if (error)
+ unregister_blkdev(pmem_major, "pmem");
+ return error;
+}
+module_init(pmem_init);
+
+static void pmem_exit(void)
+{
+ platform_driver_unregister(&pmem_driver);
+ unregister_blkdev(pmem_major, "pmem");
+}
+module_exit(pmem_exit);
+
+MODULE_AUTHOR("Ross Zwisler <ross.zwisler@linux.intel.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
new file mode 100644
index 000000000..c120d70d3
--- /dev/null
+++ b/drivers/block/ps3disk.c
@@ -0,0 +1,589 @@
+/*
+ * PS3 Disk Storage Driver
+ *
+ * Copyright (C) 2007 Sony Computer Entertainment Inc.
+ * Copyright 2007 Sony Corp.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/ata.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#include <asm/lv1call.h>
+#include <asm/ps3stor.h>
+#include <asm/firmware.h>
+
+
+#define DEVICE_NAME "ps3disk"
+
+#define BOUNCE_SIZE (64*1024)
+
+#define PS3DISK_MAX_DISKS 16
+#define PS3DISK_MINORS 16
+
+
+#define PS3DISK_NAME "ps3d%c"
+
+
+struct ps3disk_private {
+ spinlock_t lock; /* Request queue spinlock */
+ struct request_queue *queue;
+ struct gendisk *gendisk;
+ unsigned int blocking_factor;
+ struct request *req;
+ u64 raw_capacity;
+ unsigned char model[ATA_ID_PROD_LEN+1];
+};
+
+
+#define LV1_STORAGE_SEND_ATA_COMMAND (2)
+#define LV1_STORAGE_ATA_HDDOUT (0x23)
+
+struct lv1_ata_cmnd_block {
+ u16 features;
+ u16 sector_count;
+ u16 LBA_low;
+ u16 LBA_mid;
+ u16 LBA_high;
+ u8 device;
+ u8 command;
+ u32 is_ext;
+ u32 proto;
+ u32 in_out;
+ u32 size;
+ u64 buffer;
+ u32 arglen;
+};
+
+enum lv1_ata_proto {
+ NON_DATA_PROTO = 0,
+ PIO_DATA_IN_PROTO = 1,
+ PIO_DATA_OUT_PROTO = 2,
+ DMA_PROTO = 3
+};
+
+enum lv1_ata_in_out {
+ DIR_WRITE = 0, /* memory -> device */
+ DIR_READ = 1 /* device -> memory */
+};
+
+static int ps3disk_major;
+
+
+static const struct block_device_operations ps3disk_fops = {
+ .owner = THIS_MODULE,
+};
+
+
+static void ps3disk_scatter_gather(struct ps3_storage_device *dev,
+ struct request *req, int gather)
+{
+ unsigned int offset = 0;
+ struct req_iterator iter;
+ struct bio_vec bvec;
+ unsigned int i = 0;
+ size_t size;
+ void *buf;
+
+ rq_for_each_segment(bvec, req, iter) {
+ unsigned long flags;
+ dev_dbg(&dev->sbd.core, "%s:%u: bio %u: %u sectors from %lu\n",
+ __func__, __LINE__, i, bio_sectors(iter.bio),
+ iter.bio->bi_iter.bi_sector);
+
+ size = bvec.bv_len;
+ buf = bvec_kmap_irq(&bvec, &flags);
+ if (gather)
+ memcpy(dev->bounce_buf+offset, buf, size);
+ else
+ memcpy(buf, dev->bounce_buf+offset, size);
+ offset += size;
+ flush_kernel_dcache_page(bvec.bv_page);
+ bvec_kunmap_irq(buf, &flags);
+ i++;
+ }
+}
+
+static int ps3disk_submit_request_sg(struct ps3_storage_device *dev,
+ struct request *req)
+{
+ struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
+ int write = rq_data_dir(req), res;
+ const char *op = write ? "write" : "read";
+ u64 start_sector, sectors;
+ unsigned int region_id = dev->regions[dev->region_idx].id;
+
+#ifdef DEBUG
+ unsigned int n = 0;
+ struct bio_vec bv;
+ struct req_iterator iter;
+
+ rq_for_each_segment(bv, req, iter)
+ n++;
+ dev_dbg(&dev->sbd.core,
+ "%s:%u: %s req has %u bvecs for %u sectors\n",
+ __func__, __LINE__, op, n, blk_rq_sectors(req));
+#endif
+
+ start_sector = blk_rq_pos(req) * priv->blocking_factor;
+ sectors = blk_rq_sectors(req) * priv->blocking_factor;
+ dev_dbg(&dev->sbd.core, "%s:%u: %s %llu sectors starting at %llu\n",
+ __func__, __LINE__, op, sectors, start_sector);
+
+ if (write) {
+ ps3disk_scatter_gather(dev, req, 1);
+
+ res = lv1_storage_write(dev->sbd.dev_id, region_id,
+ start_sector, sectors, 0,
+ dev->bounce_lpar, &dev->tag);
+ } else {
+ res = lv1_storage_read(dev->sbd.dev_id, region_id,
+ start_sector, sectors, 0,
+ dev->bounce_lpar, &dev->tag);
+ }
+ if (res) {
+ dev_err(&dev->sbd.core, "%s:%u: %s failed %d\n", __func__,
+ __LINE__, op, res);
+ __blk_end_request_all(req, -EIO);
+ return 0;
+ }
+
+ priv->req = req;
+ return 1;
+}
+
+static int ps3disk_submit_flush_request(struct ps3_storage_device *dev,
+ struct request *req)
+{
+ struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
+ u64 res;
+
+ dev_dbg(&dev->sbd.core, "%s:%u: flush request\n", __func__, __LINE__);
+
+ res = lv1_storage_send_device_command(dev->sbd.dev_id,
+ LV1_STORAGE_ATA_HDDOUT, 0, 0, 0,
+ 0, &dev->tag);
+ if (res) {
+ dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n",
+ __func__, __LINE__, res);
+ __blk_end_request_all(req, -EIO);
+ return 0;
+ }
+
+ priv->req = req;
+ return 1;
+}
+
+static void ps3disk_do_request(struct ps3_storage_device *dev,
+ struct request_queue *q)
+{
+ struct request *req;
+
+ dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__);
+
+ while ((req = blk_fetch_request(q))) {
+ if (req->cmd_flags & REQ_FLUSH) {
+ if (ps3disk_submit_flush_request(dev, req))
+ break;
+ } else if (req->cmd_type == REQ_TYPE_FS) {
+ if (ps3disk_submit_request_sg(dev, req))
+ break;
+ } else {
+ blk_dump_rq_flags(req, DEVICE_NAME " bad request");
+ __blk_end_request_all(req, -EIO);
+ continue;
+ }
+ }
+}
+
+static void ps3disk_request(struct request_queue *q)
+{
+ struct ps3_storage_device *dev = q->queuedata;
+ struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
+
+ if (priv->req) {
+ dev_dbg(&dev->sbd.core, "%s:%u busy\n", __func__, __LINE__);
+ return;
+ }
+
+ ps3disk_do_request(dev, q);
+}
+
+static irqreturn_t ps3disk_interrupt(int irq, void *data)
+{
+ struct ps3_storage_device *dev = data;
+ struct ps3disk_private *priv;
+ struct request *req;
+ int res, read, error;
+ u64 tag, status;
+ const char *op;
+
+ res = lv1_storage_get_async_status(dev->sbd.dev_id, &tag, &status);
+
+ if (tag != dev->tag)
+ dev_err(&dev->sbd.core,
+ "%s:%u: tag mismatch, got %llx, expected %llx\n",
+ __func__, __LINE__, tag, dev->tag);
+
+ if (res) {
+ dev_err(&dev->sbd.core, "%s:%u: res=%d status=0x%llx\n",
+ __func__, __LINE__, res, status);
+ return IRQ_HANDLED;
+ }
+
+ priv = ps3_system_bus_get_drvdata(&dev->sbd);
+ req = priv->req;
+ if (!req) {
+ dev_dbg(&dev->sbd.core,
+ "%s:%u non-block layer request completed\n", __func__,
+ __LINE__);
+ dev->lv1_status = status;
+ complete(&dev->done);
+ return IRQ_HANDLED;
+ }
+
+ if (req->cmd_flags & REQ_FLUSH) {
+ read = 0;
+ op = "flush";
+ } else {
+ read = !rq_data_dir(req);
+ op = read ? "read" : "write";
+ }
+ if (status) {
+ dev_dbg(&dev->sbd.core, "%s:%u: %s failed 0x%llx\n", __func__,
+ __LINE__, op, status);
+ error = -EIO;
+ } else {
+ dev_dbg(&dev->sbd.core, "%s:%u: %s completed\n", __func__,
+ __LINE__, op);
+ error = 0;
+ if (read)
+ ps3disk_scatter_gather(dev, req, 0);
+ }
+
+ spin_lock(&priv->lock);
+ __blk_end_request_all(req, error);
+ priv->req = NULL;
+ ps3disk_do_request(dev, priv->queue);
+ spin_unlock(&priv->lock);
+
+ return IRQ_HANDLED;
+}
+
+static int ps3disk_sync_cache(struct ps3_storage_device *dev)
+{
+ u64 res;
+
+ dev_dbg(&dev->sbd.core, "%s:%u: sync cache\n", __func__, __LINE__);
+
+ res = ps3stor_send_command(dev, LV1_STORAGE_ATA_HDDOUT, 0, 0, 0, 0);
+ if (res) {
+ dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n",
+ __func__, __LINE__, res);
+ return -EIO;
+ }
+ return 0;
+}
+
+
+/* ATA helpers copied from drivers/ata/libata-core.c */
+
+static void swap_buf_le16(u16 *buf, unsigned int buf_words)
+{
+#ifdef __BIG_ENDIAN
+ unsigned int i;
+
+ for (i = 0; i < buf_words; i++)
+ buf[i] = le16_to_cpu(buf[i]);
+#endif /* __BIG_ENDIAN */
+}
+
+static u64 ata_id_n_sectors(const u16 *id)
+{
+ if (ata_id_has_lba(id)) {
+ if (ata_id_has_lba48(id))
+ return ata_id_u64(id, 100);
+ else
+ return ata_id_u32(id, 60);
+ } else {
+ if (ata_id_current_chs_valid(id))
+ return ata_id_u32(id, 57);
+ else
+ return id[1] * id[3] * id[6];
+ }
+}
+
+static void ata_id_string(const u16 *id, unsigned char *s, unsigned int ofs,
+ unsigned int len)
+{
+ unsigned int c;
+
+ while (len > 0) {
+ c = id[ofs] >> 8;
+ *s = c;
+ s++;
+
+ c = id[ofs] & 0xff;
+ *s = c;
+ s++;
+
+ ofs++;
+ len -= 2;
+ }
+}
+
+static void ata_id_c_string(const u16 *id, unsigned char *s, unsigned int ofs,
+ unsigned int len)
+{
+ unsigned char *p;
+
+ WARN_ON(!(len & 1));
+
+ ata_id_string(id, s, ofs, len - 1);
+
+ p = s + strnlen(s, len - 1);
+ while (p > s && p[-1] == ' ')
+ p--;
+ *p = '\0';
+}
+
+static int ps3disk_identify(struct ps3_storage_device *dev)
+{
+ struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
+ struct lv1_ata_cmnd_block ata_cmnd;
+ u16 *id = dev->bounce_buf;
+ u64 res;
+
+ dev_dbg(&dev->sbd.core, "%s:%u: identify disk\n", __func__, __LINE__);
+
+ memset(&ata_cmnd, 0, sizeof(struct lv1_ata_cmnd_block));
+ ata_cmnd.command = ATA_CMD_ID_ATA;
+ ata_cmnd.sector_count = 1;
+ ata_cmnd.size = ata_cmnd.arglen = ATA_ID_WORDS * 2;
+ ata_cmnd.buffer = dev->bounce_lpar;
+ ata_cmnd.proto = PIO_DATA_IN_PROTO;
+ ata_cmnd.in_out = DIR_READ;
+
+ res = ps3stor_send_command(dev, LV1_STORAGE_SEND_ATA_COMMAND,
+ ps3_mm_phys_to_lpar(__pa(&ata_cmnd)),
+ sizeof(ata_cmnd), ata_cmnd.buffer,
+ ata_cmnd.arglen);
+ if (res) {
+ dev_err(&dev->sbd.core, "%s:%u: identify disk failed 0x%llx\n",
+ __func__, __LINE__, res);
+ return -EIO;
+ }
+
+ swap_buf_le16(id, ATA_ID_WORDS);
+
+ /* All we're interested in are raw capacity and model name */
+ priv->raw_capacity = ata_id_n_sectors(id);
+ ata_id_c_string(id, priv->model, ATA_ID_PROD, sizeof(priv->model));
+ return 0;
+}
+
+static unsigned long ps3disk_mask;
+
+static DEFINE_MUTEX(ps3disk_mask_mutex);
+
+static int ps3disk_probe(struct ps3_system_bus_device *_dev)
+{
+ struct ps3_storage_device *dev = to_ps3_storage_device(&_dev->core);
+ struct ps3disk_private *priv;
+ int error;
+ unsigned int devidx;
+ struct request_queue *queue;
+ struct gendisk *gendisk;
+
+ if (dev->blk_size < 512) {
+ dev_err(&dev->sbd.core,
+ "%s:%u: cannot handle block size %llu\n", __func__,
+ __LINE__, dev->blk_size);
+ return -EINVAL;
+ }
+
+ BUILD_BUG_ON(PS3DISK_MAX_DISKS > BITS_PER_LONG);
+ mutex_lock(&ps3disk_mask_mutex);
+ devidx = find_first_zero_bit(&ps3disk_mask, PS3DISK_MAX_DISKS);
+ if (devidx >= PS3DISK_MAX_DISKS) {
+ dev_err(&dev->sbd.core, "%s:%u: Too many disks\n", __func__,
+ __LINE__);
+ mutex_unlock(&ps3disk_mask_mutex);
+ return -ENOSPC;
+ }
+ __set_bit(devidx, &ps3disk_mask);
+ mutex_unlock(&ps3disk_mask_mutex);
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv) {
+ error = -ENOMEM;
+ goto fail;
+ }
+
+ ps3_system_bus_set_drvdata(_dev, priv);
+ spin_lock_init(&priv->lock);
+
+ dev->bounce_size = BOUNCE_SIZE;
+ dev->bounce_buf = kmalloc(BOUNCE_SIZE, GFP_DMA);
+ if (!dev->bounce_buf) {
+ error = -ENOMEM;
+ goto fail_free_priv;
+ }
+
+ error = ps3stor_setup(dev, ps3disk_interrupt);
+ if (error)
+ goto fail_free_bounce;
+
+ ps3disk_identify(dev);
+
+ queue = blk_init_queue(ps3disk_request, &priv->lock);
+ if (!queue) {
+ dev_err(&dev->sbd.core, "%s:%u: blk_init_queue failed\n",
+ __func__, __LINE__);
+ error = -ENOMEM;
+ goto fail_teardown;
+ }
+
+ priv->queue = queue;
+ queue->queuedata = dev;
+
+ blk_queue_bounce_limit(queue, BLK_BOUNCE_HIGH);
+
+ blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9);
+ blk_queue_segment_boundary(queue, -1UL);
+ blk_queue_dma_alignment(queue, dev->blk_size-1);
+ blk_queue_logical_block_size(queue, dev->blk_size);
+
+ blk_queue_flush(queue, REQ_FLUSH);
+
+ blk_queue_max_segments(queue, -1);
+ blk_queue_max_segment_size(queue, dev->bounce_size);
+
+ gendisk = alloc_disk(PS3DISK_MINORS);
+ if (!gendisk) {
+ dev_err(&dev->sbd.core, "%s:%u: alloc_disk failed\n", __func__,
+ __LINE__);
+ error = -ENOMEM;
+ goto fail_cleanup_queue;
+ }
+
+ priv->gendisk = gendisk;
+ gendisk->major = ps3disk_major;
+ gendisk->first_minor = devidx * PS3DISK_MINORS;
+ gendisk->fops = &ps3disk_fops;
+ gendisk->queue = queue;
+ gendisk->private_data = dev;
+ gendisk->driverfs_dev = &dev->sbd.core;
+ snprintf(gendisk->disk_name, sizeof(gendisk->disk_name), PS3DISK_NAME,
+ devidx+'a');
+ priv->blocking_factor = dev->blk_size >> 9;
+ set_capacity(gendisk,
+ dev->regions[dev->region_idx].size*priv->blocking_factor);
+
+ dev_info(&dev->sbd.core,
+ "%s is a %s (%llu MiB total, %lu MiB for OtherOS)\n",
+ gendisk->disk_name, priv->model, priv->raw_capacity >> 11,
+ get_capacity(gendisk) >> 11);
+
+ add_disk(gendisk);
+ return 0;
+
+fail_cleanup_queue:
+ blk_cleanup_queue(queue);
+fail_teardown:
+ ps3stor_teardown(dev);
+fail_free_bounce:
+ kfree(dev->bounce_buf);
+fail_free_priv:
+ kfree(priv);
+ ps3_system_bus_set_drvdata(_dev, NULL);
+fail:
+ mutex_lock(&ps3disk_mask_mutex);
+ __clear_bit(devidx, &ps3disk_mask);
+ mutex_unlock(&ps3disk_mask_mutex);
+ return error;
+}
+
+static int ps3disk_remove(struct ps3_system_bus_device *_dev)
+{
+ struct ps3_storage_device *dev = to_ps3_storage_device(&_dev->core);
+ struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
+
+ mutex_lock(&ps3disk_mask_mutex);
+ __clear_bit(MINOR(disk_devt(priv->gendisk)) / PS3DISK_MINORS,
+ &ps3disk_mask);
+ mutex_unlock(&ps3disk_mask_mutex);
+ del_gendisk(priv->gendisk);
+ blk_cleanup_queue(priv->queue);
+ put_disk(priv->gendisk);
+ dev_notice(&dev->sbd.core, "Synchronizing disk cache\n");
+ ps3disk_sync_cache(dev);
+ ps3stor_teardown(dev);
+ kfree(dev->bounce_buf);
+ kfree(priv);
+ ps3_system_bus_set_drvdata(_dev, NULL);
+ return 0;
+}
+
+static struct ps3_system_bus_driver ps3disk = {
+ .match_id = PS3_MATCH_ID_STOR_DISK,
+ .core.name = DEVICE_NAME,
+ .core.owner = THIS_MODULE,
+ .probe = ps3disk_probe,
+ .remove = ps3disk_remove,
+ .shutdown = ps3disk_remove,
+};
+
+
+static int __init ps3disk_init(void)
+{
+ int error;
+
+ if (!firmware_has_feature(FW_FEATURE_PS3_LV1))
+ return -ENODEV;
+
+ error = register_blkdev(0, DEVICE_NAME);
+ if (error <= 0) {
+ printk(KERN_ERR "%s:%u: register_blkdev failed %d\n", __func__,
+ __LINE__, error);
+ return error;
+ }
+ ps3disk_major = error;
+
+ pr_info("%s:%u: registered block device major %d\n", __func__,
+ __LINE__, ps3disk_major);
+
+ error = ps3_system_bus_driver_register(&ps3disk);
+ if (error)
+ unregister_blkdev(ps3disk_major, DEVICE_NAME);
+
+ return error;
+}
+
+static void __exit ps3disk_exit(void)
+{
+ ps3_system_bus_driver_unregister(&ps3disk);
+ unregister_blkdev(ps3disk_major, DEVICE_NAME);
+}
+
+module_init(ps3disk_init);
+module_exit(ps3disk_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("PS3 Disk Storage Driver");
+MODULE_AUTHOR("Sony Corporation");
+MODULE_ALIAS(PS3_MODULE_ALIAS_STOR_DISK);
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
new file mode 100644
index 000000000..ef45cfb98
--- /dev/null
+++ b/drivers/block/ps3vram.c
@@ -0,0 +1,878 @@
+/*
+ * ps3vram - Use extra PS3 video ram as MTD block device.
+ *
+ * Copyright 2009 Sony Corporation
+ *
+ * Based on the MTD ps3vram driver, which is
+ * Copyright (c) 2007-2008 Jim Paris <jim@jtan.com>
+ * Added support RSX DMA Vivien Chappelier <vivien.chappelier@free.fr>
+ */
+
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+#include <asm/cell-regs.h>
+#include <asm/firmware.h>
+#include <asm/lv1call.h>
+#include <asm/ps3.h>
+#include <asm/ps3gpu.h>
+
+
+#define DEVICE_NAME "ps3vram"
+
+
+#define XDR_BUF_SIZE (2 * 1024 * 1024) /* XDR buffer (must be 1MiB aligned) */
+#define XDR_IOIF 0x0c000000
+
+#define FIFO_BASE XDR_IOIF
+#define FIFO_SIZE (64 * 1024)
+
+#define DMA_PAGE_SIZE (4 * 1024)
+
+#define CACHE_PAGE_SIZE (256 * 1024)
+#define CACHE_PAGE_COUNT ((XDR_BUF_SIZE - FIFO_SIZE) / CACHE_PAGE_SIZE)
+
+#define CACHE_OFFSET CACHE_PAGE_SIZE
+#define FIFO_OFFSET 0
+
+#define CTRL_PUT 0x10
+#define CTRL_GET 0x11
+#define CTRL_TOP 0x15
+
+#define UPLOAD_SUBCH 1
+#define DOWNLOAD_SUBCH 2
+
+#define NV_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN 0x0000030c
+#define NV_MEMORY_TO_MEMORY_FORMAT_NOTIFY 0x00000104
+
+#define CACHE_PAGE_PRESENT 1
+#define CACHE_PAGE_DIRTY 2
+
+struct ps3vram_tag {
+ unsigned int address;
+ unsigned int flags;
+};
+
+struct ps3vram_cache {
+ unsigned int page_count;
+ unsigned int page_size;
+ struct ps3vram_tag *tags;
+ unsigned int hit;
+ unsigned int miss;
+};
+
+struct ps3vram_priv {
+ struct request_queue *queue;
+ struct gendisk *gendisk;
+
+ u64 size;
+
+ u64 memory_handle;
+ u64 context_handle;
+ u32 *ctrl;
+ void *reports;
+ u8 *xdr_buf;
+
+ u32 *fifo_base;
+ u32 *fifo_ptr;
+
+ struct ps3vram_cache cache;
+
+ spinlock_t lock; /* protecting list of bios */
+ struct bio_list list;
+};
+
+
+static int ps3vram_major;
+
+
+static const struct block_device_operations ps3vram_fops = {
+ .owner = THIS_MODULE,
+};
+
+
+#define DMA_NOTIFIER_HANDLE_BASE 0x66604200 /* first DMA notifier handle */
+#define DMA_NOTIFIER_OFFSET_BASE 0x1000 /* first DMA notifier offset */
+#define DMA_NOTIFIER_SIZE 0x40
+#define NOTIFIER 7 /* notifier used for completion report */
+
+static char *size = "256M";
+module_param(size, charp, 0);
+MODULE_PARM_DESC(size, "memory size");
+
+static u32 *ps3vram_get_notifier(void *reports, int notifier)
+{
+ return reports + DMA_NOTIFIER_OFFSET_BASE +
+ DMA_NOTIFIER_SIZE * notifier;
+}
+
+static void ps3vram_notifier_reset(struct ps3_system_bus_device *dev)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+ u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER);
+ int i;
+
+ for (i = 0; i < 4; i++)
+ notify[i] = 0xffffffff;
+}
+
+static int ps3vram_notifier_wait(struct ps3_system_bus_device *dev,
+ unsigned int timeout_ms)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+ u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER);
+ unsigned long timeout;
+
+ for (timeout = 20; timeout; timeout--) {
+ if (!notify[3])
+ return 0;
+ udelay(10);
+ }
+
+ timeout = jiffies + msecs_to_jiffies(timeout_ms);
+
+ do {
+ if (!notify[3])
+ return 0;
+ msleep(1);
+ } while (time_before(jiffies, timeout));
+
+ return -ETIMEDOUT;
+}
+
+static void ps3vram_init_ring(struct ps3_system_bus_device *dev)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+
+ priv->ctrl[CTRL_PUT] = FIFO_BASE + FIFO_OFFSET;
+ priv->ctrl[CTRL_GET] = FIFO_BASE + FIFO_OFFSET;
+}
+
+static int ps3vram_wait_ring(struct ps3_system_bus_device *dev,
+ unsigned int timeout_ms)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+ unsigned long timeout = jiffies + msecs_to_jiffies(timeout_ms);
+
+ do {
+ if (priv->ctrl[CTRL_PUT] == priv->ctrl[CTRL_GET])
+ return 0;
+ msleep(1);
+ } while (time_before(jiffies, timeout));
+
+ dev_warn(&dev->core, "FIFO timeout (%08x/%08x/%08x)\n",
+ priv->ctrl[CTRL_PUT], priv->ctrl[CTRL_GET],
+ priv->ctrl[CTRL_TOP]);
+
+ return -ETIMEDOUT;
+}
+
+static void ps3vram_out_ring(struct ps3vram_priv *priv, u32 data)
+{
+ *(priv->fifo_ptr)++ = data;
+}
+
+static void ps3vram_begin_ring(struct ps3vram_priv *priv, u32 chan, u32 tag,
+ u32 size)
+{
+ ps3vram_out_ring(priv, (size << 18) | (chan << 13) | tag);
+}
+
+static void ps3vram_rewind_ring(struct ps3_system_bus_device *dev)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+ int status;
+
+ ps3vram_out_ring(priv, 0x20000000 | (FIFO_BASE + FIFO_OFFSET));
+
+ priv->ctrl[CTRL_PUT] = FIFO_BASE + FIFO_OFFSET;
+
+ /* asking the HV for a blit will kick the FIFO */
+ status = lv1_gpu_fb_blit(priv->context_handle, 0, 0, 0, 0);
+ if (status)
+ dev_err(&dev->core, "%s: lv1_gpu_fb_blit failed %d\n",
+ __func__, status);
+
+ priv->fifo_ptr = priv->fifo_base;
+}
+
+static void ps3vram_fire_ring(struct ps3_system_bus_device *dev)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+ int status;
+
+ mutex_lock(&ps3_gpu_mutex);
+
+ priv->ctrl[CTRL_PUT] = FIFO_BASE + FIFO_OFFSET +
+ (priv->fifo_ptr - priv->fifo_base) * sizeof(u32);
+
+ /* asking the HV for a blit will kick the FIFO */
+ status = lv1_gpu_fb_blit(priv->context_handle, 0, 0, 0, 0);
+ if (status)
+ dev_err(&dev->core, "%s: lv1_gpu_fb_blit failed %d\n",
+ __func__, status);
+
+ if ((priv->fifo_ptr - priv->fifo_base) * sizeof(u32) >
+ FIFO_SIZE - 1024) {
+ dev_dbg(&dev->core, "FIFO full, rewinding\n");
+ ps3vram_wait_ring(dev, 200);
+ ps3vram_rewind_ring(dev);
+ }
+
+ mutex_unlock(&ps3_gpu_mutex);
+}
+
+static void ps3vram_bind(struct ps3_system_bus_device *dev)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+
+ ps3vram_begin_ring(priv, UPLOAD_SUBCH, 0, 1);
+ ps3vram_out_ring(priv, 0x31337303);
+ ps3vram_begin_ring(priv, UPLOAD_SUBCH, 0x180, 3);
+ ps3vram_out_ring(priv, DMA_NOTIFIER_HANDLE_BASE + NOTIFIER);
+ ps3vram_out_ring(priv, 0xfeed0001); /* DMA system RAM instance */
+ ps3vram_out_ring(priv, 0xfeed0000); /* DMA video RAM instance */
+
+ ps3vram_begin_ring(priv, DOWNLOAD_SUBCH, 0, 1);
+ ps3vram_out_ring(priv, 0x3137c0de);
+ ps3vram_begin_ring(priv, DOWNLOAD_SUBCH, 0x180, 3);
+ ps3vram_out_ring(priv, DMA_NOTIFIER_HANDLE_BASE + NOTIFIER);
+ ps3vram_out_ring(priv, 0xfeed0000); /* DMA video RAM instance */
+ ps3vram_out_ring(priv, 0xfeed0001); /* DMA system RAM instance */
+
+ ps3vram_fire_ring(dev);
+}
+
+static int ps3vram_upload(struct ps3_system_bus_device *dev,
+ unsigned int src_offset, unsigned int dst_offset,
+ int len, int count)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+
+ ps3vram_begin_ring(priv, UPLOAD_SUBCH,
+ NV_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8);
+ ps3vram_out_ring(priv, XDR_IOIF + src_offset);
+ ps3vram_out_ring(priv, dst_offset);
+ ps3vram_out_ring(priv, len);
+ ps3vram_out_ring(priv, len);
+ ps3vram_out_ring(priv, len);
+ ps3vram_out_ring(priv, count);
+ ps3vram_out_ring(priv, (1 << 8) | 1);
+ ps3vram_out_ring(priv, 0);
+
+ ps3vram_notifier_reset(dev);
+ ps3vram_begin_ring(priv, UPLOAD_SUBCH,
+ NV_MEMORY_TO_MEMORY_FORMAT_NOTIFY, 1);
+ ps3vram_out_ring(priv, 0);
+ ps3vram_begin_ring(priv, UPLOAD_SUBCH, 0x100, 1);
+ ps3vram_out_ring(priv, 0);
+ ps3vram_fire_ring(dev);
+ if (ps3vram_notifier_wait(dev, 200) < 0) {
+ dev_warn(&dev->core, "%s: Notifier timeout\n", __func__);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int ps3vram_download(struct ps3_system_bus_device *dev,
+ unsigned int src_offset, unsigned int dst_offset,
+ int len, int count)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+
+ ps3vram_begin_ring(priv, DOWNLOAD_SUBCH,
+ NV_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8);
+ ps3vram_out_ring(priv, src_offset);
+ ps3vram_out_ring(priv, XDR_IOIF + dst_offset);
+ ps3vram_out_ring(priv, len);
+ ps3vram_out_ring(priv, len);
+ ps3vram_out_ring(priv, len);
+ ps3vram_out_ring(priv, count);
+ ps3vram_out_ring(priv, (1 << 8) | 1);
+ ps3vram_out_ring(priv, 0);
+
+ ps3vram_notifier_reset(dev);
+ ps3vram_begin_ring(priv, DOWNLOAD_SUBCH,
+ NV_MEMORY_TO_MEMORY_FORMAT_NOTIFY, 1);
+ ps3vram_out_ring(priv, 0);
+ ps3vram_begin_ring(priv, DOWNLOAD_SUBCH, 0x100, 1);
+ ps3vram_out_ring(priv, 0);
+ ps3vram_fire_ring(dev);
+ if (ps3vram_notifier_wait(dev, 200) < 0) {
+ dev_warn(&dev->core, "%s: Notifier timeout\n", __func__);
+ return -1;
+ }
+
+ return 0;
+}
+
+static void ps3vram_cache_evict(struct ps3_system_bus_device *dev, int entry)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+ struct ps3vram_cache *cache = &priv->cache;
+
+ if (!(cache->tags[entry].flags & CACHE_PAGE_DIRTY))
+ return;
+
+ dev_dbg(&dev->core, "Flushing %d: 0x%08x\n", entry,
+ cache->tags[entry].address);
+ if (ps3vram_upload(dev, CACHE_OFFSET + entry * cache->page_size,
+ cache->tags[entry].address, DMA_PAGE_SIZE,
+ cache->page_size / DMA_PAGE_SIZE) < 0) {
+ dev_err(&dev->core,
+ "Failed to upload from 0x%x to " "0x%x size 0x%x\n",
+ entry * cache->page_size, cache->tags[entry].address,
+ cache->page_size);
+ }
+ cache->tags[entry].flags &= ~CACHE_PAGE_DIRTY;
+}
+
+static void ps3vram_cache_load(struct ps3_system_bus_device *dev, int entry,
+ unsigned int address)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+ struct ps3vram_cache *cache = &priv->cache;
+
+ dev_dbg(&dev->core, "Fetching %d: 0x%08x\n", entry, address);
+ if (ps3vram_download(dev, address,
+ CACHE_OFFSET + entry * cache->page_size,
+ DMA_PAGE_SIZE,
+ cache->page_size / DMA_PAGE_SIZE) < 0) {
+ dev_err(&dev->core,
+ "Failed to download from 0x%x to 0x%x size 0x%x\n",
+ address, entry * cache->page_size, cache->page_size);
+ }
+
+ cache->tags[entry].address = address;
+ cache->tags[entry].flags |= CACHE_PAGE_PRESENT;
+}
+
+
+static void ps3vram_cache_flush(struct ps3_system_bus_device *dev)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+ struct ps3vram_cache *cache = &priv->cache;
+ int i;
+
+ dev_dbg(&dev->core, "FLUSH\n");
+ for (i = 0; i < cache->page_count; i++) {
+ ps3vram_cache_evict(dev, i);
+ cache->tags[i].flags = 0;
+ }
+}
+
+static unsigned int ps3vram_cache_match(struct ps3_system_bus_device *dev,
+ loff_t address)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+ struct ps3vram_cache *cache = &priv->cache;
+ unsigned int base;
+ unsigned int offset;
+ int i;
+ static int counter;
+
+ offset = (unsigned int) (address & (cache->page_size - 1));
+ base = (unsigned int) (address - offset);
+
+ /* fully associative check */
+ for (i = 0; i < cache->page_count; i++) {
+ if ((cache->tags[i].flags & CACHE_PAGE_PRESENT) &&
+ cache->tags[i].address == base) {
+ cache->hit++;
+ dev_dbg(&dev->core, "Found entry %d: 0x%08x\n", i,
+ cache->tags[i].address);
+ return i;
+ }
+ }
+
+ /* choose a random entry */
+ i = (jiffies + (counter++)) % cache->page_count;
+ dev_dbg(&dev->core, "Using entry %d\n", i);
+
+ ps3vram_cache_evict(dev, i);
+ ps3vram_cache_load(dev, i, base);
+
+ cache->miss++;
+ return i;
+}
+
+static int ps3vram_cache_init(struct ps3_system_bus_device *dev)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+
+ priv->cache.page_count = CACHE_PAGE_COUNT;
+ priv->cache.page_size = CACHE_PAGE_SIZE;
+ priv->cache.tags = kzalloc(sizeof(struct ps3vram_tag) *
+ CACHE_PAGE_COUNT, GFP_KERNEL);
+ if (priv->cache.tags == NULL) {
+ dev_err(&dev->core, "Could not allocate cache tags\n");
+ return -ENOMEM;
+ }
+
+ dev_info(&dev->core, "Created ram cache: %d entries, %d KiB each\n",
+ CACHE_PAGE_COUNT, CACHE_PAGE_SIZE / 1024);
+
+ return 0;
+}
+
+static void ps3vram_cache_cleanup(struct ps3_system_bus_device *dev)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+
+ ps3vram_cache_flush(dev);
+ kfree(priv->cache.tags);
+}
+
+static int ps3vram_read(struct ps3_system_bus_device *dev, loff_t from,
+ size_t len, size_t *retlen, u_char *buf)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+ unsigned int cached, count;
+
+ dev_dbg(&dev->core, "%s: from=0x%08x len=0x%zx\n", __func__,
+ (unsigned int)from, len);
+
+ if (from >= priv->size)
+ return -EIO;
+
+ if (len > priv->size - from)
+ len = priv->size - from;
+
+ /* Copy from vram to buf */
+ count = len;
+ while (count) {
+ unsigned int offset, avail;
+ unsigned int entry;
+
+ offset = (unsigned int) (from & (priv->cache.page_size - 1));
+ avail = priv->cache.page_size - offset;
+
+ entry = ps3vram_cache_match(dev, from);
+ cached = CACHE_OFFSET + entry * priv->cache.page_size + offset;
+
+ dev_dbg(&dev->core, "%s: from=%08x cached=%08x offset=%08x "
+ "avail=%08x count=%08x\n", __func__,
+ (unsigned int)from, cached, offset, avail, count);
+
+ if (avail > count)
+ avail = count;
+ memcpy(buf, priv->xdr_buf + cached, avail);
+
+ buf += avail;
+ count -= avail;
+ from += avail;
+ }
+
+ *retlen = len;
+ return 0;
+}
+
+static int ps3vram_write(struct ps3_system_bus_device *dev, loff_t to,
+ size_t len, size_t *retlen, const u_char *buf)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+ unsigned int cached, count;
+
+ if (to >= priv->size)
+ return -EIO;
+
+ if (len > priv->size - to)
+ len = priv->size - to;
+
+ /* Copy from buf to vram */
+ count = len;
+ while (count) {
+ unsigned int offset, avail;
+ unsigned int entry;
+
+ offset = (unsigned int) (to & (priv->cache.page_size - 1));
+ avail = priv->cache.page_size - offset;
+
+ entry = ps3vram_cache_match(dev, to);
+ cached = CACHE_OFFSET + entry * priv->cache.page_size + offset;
+
+ dev_dbg(&dev->core, "%s: to=%08x cached=%08x offset=%08x "
+ "avail=%08x count=%08x\n", __func__, (unsigned int)to,
+ cached, offset, avail, count);
+
+ if (avail > count)
+ avail = count;
+ memcpy(priv->xdr_buf + cached, buf, avail);
+
+ priv->cache.tags[entry].flags |= CACHE_PAGE_DIRTY;
+
+ buf += avail;
+ count -= avail;
+ to += avail;
+ }
+
+ *retlen = len;
+ return 0;
+}
+
+static int ps3vram_proc_show(struct seq_file *m, void *v)
+{
+ struct ps3vram_priv *priv = m->private;
+
+ seq_printf(m, "hit:%u\nmiss:%u\n", priv->cache.hit, priv->cache.miss);
+ return 0;
+}
+
+static int ps3vram_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, ps3vram_proc_show, PDE_DATA(inode));
+}
+
+static const struct file_operations ps3vram_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = ps3vram_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void ps3vram_proc_init(struct ps3_system_bus_device *dev)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+ struct proc_dir_entry *pde;
+
+ pde = proc_create_data(DEVICE_NAME, 0444, NULL, &ps3vram_proc_fops,
+ priv);
+ if (!pde)
+ dev_warn(&dev->core, "failed to create /proc entry\n");
+}
+
+static struct bio *ps3vram_do_bio(struct ps3_system_bus_device *dev,
+ struct bio *bio)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+ int write = bio_data_dir(bio) == WRITE;
+ const char *op = write ? "write" : "read";
+ loff_t offset = bio->bi_iter.bi_sector << 9;
+ int error = 0;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ struct bio *next;
+
+ bio_for_each_segment(bvec, bio, iter) {
+ /* PS3 is ppc64, so we don't handle highmem */
+ char *ptr = page_address(bvec.bv_page) + bvec.bv_offset;
+ size_t len = bvec.bv_len, retlen;
+
+ dev_dbg(&dev->core, " %s %zu bytes at offset %llu\n", op,
+ len, offset);
+ if (write)
+ error = ps3vram_write(dev, offset, len, &retlen, ptr);
+ else
+ error = ps3vram_read(dev, offset, len, &retlen, ptr);
+
+ if (error) {
+ dev_err(&dev->core, "%s failed\n", op);
+ goto out;
+ }
+
+ if (retlen != len) {
+ dev_err(&dev->core, "Short %s\n", op);
+ error = -EIO;
+ goto out;
+ }
+
+ offset += len;
+ }
+
+ dev_dbg(&dev->core, "%s completed\n", op);
+
+out:
+ spin_lock_irq(&priv->lock);
+ bio_list_pop(&priv->list);
+ next = bio_list_peek(&priv->list);
+ spin_unlock_irq(&priv->lock);
+
+ bio_endio(bio, error);
+ return next;
+}
+
+static void ps3vram_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct ps3_system_bus_device *dev = q->queuedata;
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+ int busy;
+
+ dev_dbg(&dev->core, "%s\n", __func__);
+
+ spin_lock_irq(&priv->lock);
+ busy = !bio_list_empty(&priv->list);
+ bio_list_add(&priv->list, bio);
+ spin_unlock_irq(&priv->lock);
+
+ if (busy)
+ return;
+
+ do {
+ bio = ps3vram_do_bio(dev, bio);
+ } while (bio);
+}
+
+static int ps3vram_probe(struct ps3_system_bus_device *dev)
+{
+ struct ps3vram_priv *priv;
+ int error, status;
+ struct request_queue *queue;
+ struct gendisk *gendisk;
+ u64 ddr_size, ddr_lpar, ctrl_lpar, info_lpar, reports_lpar,
+ reports_size, xdr_lpar;
+ char *rest;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv) {
+ error = -ENOMEM;
+ goto fail;
+ }
+
+ spin_lock_init(&priv->lock);
+ bio_list_init(&priv->list);
+ ps3_system_bus_set_drvdata(dev, priv);
+
+ /* Allocate XDR buffer (1MiB aligned) */
+ priv->xdr_buf = (void *)__get_free_pages(GFP_KERNEL,
+ get_order(XDR_BUF_SIZE));
+ if (priv->xdr_buf == NULL) {
+ dev_err(&dev->core, "Could not allocate XDR buffer\n");
+ error = -ENOMEM;
+ goto fail_free_priv;
+ }
+
+ /* Put FIFO at begginning of XDR buffer */
+ priv->fifo_base = (u32 *) (priv->xdr_buf + FIFO_OFFSET);
+ priv->fifo_ptr = priv->fifo_base;
+
+ /* XXX: Need to open GPU, in case ps3fb or snd_ps3 aren't loaded */
+ if (ps3_open_hv_device(dev)) {
+ dev_err(&dev->core, "ps3_open_hv_device failed\n");
+ error = -EAGAIN;
+ goto out_free_xdr_buf;
+ }
+
+ /* Request memory */
+ status = -1;
+ ddr_size = ALIGN(memparse(size, &rest), 1024*1024);
+ if (!ddr_size) {
+ dev_err(&dev->core, "Specified size is too small\n");
+ error = -EINVAL;
+ goto out_close_gpu;
+ }
+
+ while (ddr_size > 0) {
+ status = lv1_gpu_memory_allocate(ddr_size, 0, 0, 0, 0,
+ &priv->memory_handle,
+ &ddr_lpar);
+ if (!status)
+ break;
+ ddr_size -= 1024*1024;
+ }
+ if (status) {
+ dev_err(&dev->core, "lv1_gpu_memory_allocate failed %d\n",
+ status);
+ error = -ENOMEM;
+ goto out_close_gpu;
+ }
+
+ /* Request context */
+ status = lv1_gpu_context_allocate(priv->memory_handle, 0,
+ &priv->context_handle, &ctrl_lpar,
+ &info_lpar, &reports_lpar,
+ &reports_size);
+ if (status) {
+ dev_err(&dev->core, "lv1_gpu_context_allocate failed %d\n",
+ status);
+ error = -ENOMEM;
+ goto out_free_memory;
+ }
+
+ /* Map XDR buffer to RSX */
+ xdr_lpar = ps3_mm_phys_to_lpar(__pa(priv->xdr_buf));
+ status = lv1_gpu_context_iomap(priv->context_handle, XDR_IOIF,
+ xdr_lpar, XDR_BUF_SIZE,
+ CBE_IOPTE_PP_W | CBE_IOPTE_PP_R |
+ CBE_IOPTE_M);
+ if (status) {
+ dev_err(&dev->core, "lv1_gpu_context_iomap failed %d\n",
+ status);
+ error = -ENOMEM;
+ goto out_free_context;
+ }
+
+ priv->ctrl = ioremap(ctrl_lpar, 64 * 1024);
+ if (!priv->ctrl) {
+ dev_err(&dev->core, "ioremap CTRL failed\n");
+ error = -ENOMEM;
+ goto out_unmap_context;
+ }
+
+ priv->reports = ioremap(reports_lpar, reports_size);
+ if (!priv->reports) {
+ dev_err(&dev->core, "ioremap REPORTS failed\n");
+ error = -ENOMEM;
+ goto out_unmap_ctrl;
+ }
+
+ mutex_lock(&ps3_gpu_mutex);
+ ps3vram_init_ring(dev);
+ mutex_unlock(&ps3_gpu_mutex);
+
+ priv->size = ddr_size;
+
+ ps3vram_bind(dev);
+
+ mutex_lock(&ps3_gpu_mutex);
+ error = ps3vram_wait_ring(dev, 100);
+ mutex_unlock(&ps3_gpu_mutex);
+ if (error < 0) {
+ dev_err(&dev->core, "Failed to initialize channels\n");
+ error = -ETIMEDOUT;
+ goto out_unmap_reports;
+ }
+
+ ps3vram_cache_init(dev);
+ ps3vram_proc_init(dev);
+
+ queue = blk_alloc_queue(GFP_KERNEL);
+ if (!queue) {
+ dev_err(&dev->core, "blk_alloc_queue failed\n");
+ error = -ENOMEM;
+ goto out_cache_cleanup;
+ }
+
+ priv->queue = queue;
+ queue->queuedata = dev;
+ blk_queue_make_request(queue, ps3vram_make_request);
+ blk_queue_max_segments(queue, BLK_MAX_SEGMENTS);
+ blk_queue_max_segment_size(queue, BLK_MAX_SEGMENT_SIZE);
+ blk_queue_max_hw_sectors(queue, BLK_SAFE_MAX_SECTORS);
+
+ gendisk = alloc_disk(1);
+ if (!gendisk) {
+ dev_err(&dev->core, "alloc_disk failed\n");
+ error = -ENOMEM;
+ goto fail_cleanup_queue;
+ }
+
+ priv->gendisk = gendisk;
+ gendisk->major = ps3vram_major;
+ gendisk->first_minor = 0;
+ gendisk->fops = &ps3vram_fops;
+ gendisk->queue = queue;
+ gendisk->private_data = dev;
+ gendisk->driverfs_dev = &dev->core;
+ strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name));
+ set_capacity(gendisk, priv->size >> 9);
+
+ dev_info(&dev->core, "%s: Using %lu MiB of GPU memory\n",
+ gendisk->disk_name, get_capacity(gendisk) >> 11);
+
+ add_disk(gendisk);
+ return 0;
+
+fail_cleanup_queue:
+ blk_cleanup_queue(queue);
+out_cache_cleanup:
+ remove_proc_entry(DEVICE_NAME, NULL);
+ ps3vram_cache_cleanup(dev);
+out_unmap_reports:
+ iounmap(priv->reports);
+out_unmap_ctrl:
+ iounmap(priv->ctrl);
+out_unmap_context:
+ lv1_gpu_context_iomap(priv->context_handle, XDR_IOIF, xdr_lpar,
+ XDR_BUF_SIZE, CBE_IOPTE_M);
+out_free_context:
+ lv1_gpu_context_free(priv->context_handle);
+out_free_memory:
+ lv1_gpu_memory_free(priv->memory_handle);
+out_close_gpu:
+ ps3_close_hv_device(dev);
+out_free_xdr_buf:
+ free_pages((unsigned long) priv->xdr_buf, get_order(XDR_BUF_SIZE));
+fail_free_priv:
+ kfree(priv);
+ ps3_system_bus_set_drvdata(dev, NULL);
+fail:
+ return error;
+}
+
+static int ps3vram_remove(struct ps3_system_bus_device *dev)
+{
+ struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+
+ del_gendisk(priv->gendisk);
+ put_disk(priv->gendisk);
+ blk_cleanup_queue(priv->queue);
+ remove_proc_entry(DEVICE_NAME, NULL);
+ ps3vram_cache_cleanup(dev);
+ iounmap(priv->reports);
+ iounmap(priv->ctrl);
+ lv1_gpu_context_iomap(priv->context_handle, XDR_IOIF,
+ ps3_mm_phys_to_lpar(__pa(priv->xdr_buf)),
+ XDR_BUF_SIZE, CBE_IOPTE_M);
+ lv1_gpu_context_free(priv->context_handle);
+ lv1_gpu_memory_free(priv->memory_handle);
+ ps3_close_hv_device(dev);
+ free_pages((unsigned long) priv->xdr_buf, get_order(XDR_BUF_SIZE));
+ kfree(priv);
+ ps3_system_bus_set_drvdata(dev, NULL);
+ return 0;
+}
+
+static struct ps3_system_bus_driver ps3vram = {
+ .match_id = PS3_MATCH_ID_GPU,
+ .match_sub_id = PS3_MATCH_SUB_ID_GPU_RAMDISK,
+ .core.name = DEVICE_NAME,
+ .core.owner = THIS_MODULE,
+ .probe = ps3vram_probe,
+ .remove = ps3vram_remove,
+ .shutdown = ps3vram_remove,
+};
+
+
+static int __init ps3vram_init(void)
+{
+ int error;
+
+ if (!firmware_has_feature(FW_FEATURE_PS3_LV1))
+ return -ENODEV;
+
+ error = register_blkdev(0, DEVICE_NAME);
+ if (error <= 0) {
+ pr_err("%s: register_blkdev failed %d\n", DEVICE_NAME, error);
+ return error;
+ }
+ ps3vram_major = error;
+
+ pr_info("%s: registered block device major %d\n", DEVICE_NAME,
+ ps3vram_major);
+
+ error = ps3_system_bus_driver_register(&ps3vram);
+ if (error)
+ unregister_blkdev(ps3vram_major, DEVICE_NAME);
+
+ return error;
+}
+
+static void __exit ps3vram_exit(void)
+{
+ ps3_system_bus_driver_unregister(&ps3vram);
+ unregister_blkdev(ps3vram_major, DEVICE_NAME);
+}
+
+module_init(ps3vram_init);
+module_exit(ps3vram_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("PS3 Video RAM Storage Driver");
+MODULE_AUTHOR("Sony Corporation");
+MODULE_ALIAS(PS3_MODULE_ALIAS_GPU_RAMDISK);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
new file mode 100644
index 000000000..53f253574
--- /dev/null
+++ b/drivers/block/rbd.c
@@ -0,0 +1,5752 @@
+
+/*
+ rbd.c -- Export ceph rados objects as a Linux block device
+
+
+ based on drivers/block/osdblk.c:
+
+ Copyright 2009 Red Hat, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+
+ For usage instructions, please refer to:
+
+ Documentation/ABI/testing/sysfs-bus-rbd
+
+ */
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/decode.h>
+#include <linux/parser.h>
+#include <linux/bsearch.h>
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/blk-mq.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/workqueue.h>
+
+#include "rbd_types.h"
+
+#define RBD_DEBUG /* Activate rbd_assert() calls */
+
+/*
+ * The basic unit of block I/O is a sector. It is interpreted in a
+ * number of contexts in Linux (blk, bio, genhd), but the default is
+ * universally 512 bytes. These symbols are just slightly more
+ * meaningful than the bare numbers they represent.
+ */
+#define SECTOR_SHIFT 9
+#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
+
+/*
+ * Increment the given counter and return its updated value.
+ * If the counter is already 0 it will not be incremented.
+ * If the counter is already at its maximum value returns
+ * -EINVAL without updating it.
+ */
+static int atomic_inc_return_safe(atomic_t *v)
+{
+ unsigned int counter;
+
+ counter = (unsigned int)__atomic_add_unless(v, 1, 0);
+ if (counter <= (unsigned int)INT_MAX)
+ return (int)counter;
+
+ atomic_dec(v);
+
+ return -EINVAL;
+}
+
+/* Decrement the counter. Return the resulting value, or -EINVAL */
+static int atomic_dec_return_safe(atomic_t *v)
+{
+ int counter;
+
+ counter = atomic_dec_return(v);
+ if (counter >= 0)
+ return counter;
+
+ atomic_inc(v);
+
+ return -EINVAL;
+}
+
+#define RBD_DRV_NAME "rbd"
+
+#define RBD_MINORS_PER_MAJOR 256
+#define RBD_SINGLE_MAJOR_PART_SHIFT 4
+
+#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
+#define RBD_MAX_SNAP_NAME_LEN \
+ (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
+
+#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
+
+#define RBD_SNAP_HEAD_NAME "-"
+
+#define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */
+
+/* This allows a single page to hold an image name sent by OSD */
+#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
+#define RBD_IMAGE_ID_LEN_MAX 64
+
+#define RBD_OBJ_PREFIX_LEN_MAX 64
+
+/* Feature bits */
+
+#define RBD_FEATURE_LAYERING (1<<0)
+#define RBD_FEATURE_STRIPINGV2 (1<<1)
+#define RBD_FEATURES_ALL \
+ (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
+
+/* Features supported by this (client software) implementation. */
+
+#define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL)
+
+/*
+ * An RBD device name will be "rbd#", where the "rbd" comes from
+ * RBD_DRV_NAME above, and # is a unique integer identifier.
+ * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
+ * enough to hold all possible device names.
+ */
+#define DEV_NAME_LEN 32
+#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
+
+/*
+ * block device image metadata (in-memory version)
+ */
+struct rbd_image_header {
+ /* These six fields never change for a given rbd image */
+ char *object_prefix;
+ __u8 obj_order;
+ __u8 crypt_type;
+ __u8 comp_type;
+ u64 stripe_unit;
+ u64 stripe_count;
+ u64 features; /* Might be changeable someday? */
+
+ /* The remaining fields need to be updated occasionally */
+ u64 image_size;
+ struct ceph_snap_context *snapc;
+ char *snap_names; /* format 1 only */
+ u64 *snap_sizes; /* format 1 only */
+};
+
+/*
+ * An rbd image specification.
+ *
+ * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
+ * identify an image. Each rbd_dev structure includes a pointer to
+ * an rbd_spec structure that encapsulates this identity.
+ *
+ * Each of the id's in an rbd_spec has an associated name. For a
+ * user-mapped image, the names are supplied and the id's associated
+ * with them are looked up. For a layered image, a parent image is
+ * defined by the tuple, and the names are looked up.
+ *
+ * An rbd_dev structure contains a parent_spec pointer which is
+ * non-null if the image it represents is a child in a layered
+ * image. This pointer will refer to the rbd_spec structure used
+ * by the parent rbd_dev for its own identity (i.e., the structure
+ * is shared between the parent and child).
+ *
+ * Since these structures are populated once, during the discovery
+ * phase of image construction, they are effectively immutable so
+ * we make no effort to synchronize access to them.
+ *
+ * Note that code herein does not assume the image name is known (it
+ * could be a null pointer).
+ */
+struct rbd_spec {
+ u64 pool_id;
+ const char *pool_name;
+
+ const char *image_id;
+ const char *image_name;
+
+ u64 snap_id;
+ const char *snap_name;
+
+ struct kref kref;
+};
+
+/*
+ * an instance of the client. multiple devices may share an rbd client.
+ */
+struct rbd_client {
+ struct ceph_client *client;
+ struct kref kref;
+ struct list_head node;
+};
+
+struct rbd_img_request;
+typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
+
+#define BAD_WHICH U32_MAX /* Good which or bad which, which? */
+
+struct rbd_obj_request;
+typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
+
+enum obj_request_type {
+ OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
+};
+
+enum obj_operation_type {
+ OBJ_OP_WRITE,
+ OBJ_OP_READ,
+ OBJ_OP_DISCARD,
+};
+
+enum obj_req_flags {
+ OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */
+ OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */
+ OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */
+ OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */
+};
+
+struct rbd_obj_request {
+ const char *object_name;
+ u64 offset; /* object start byte */
+ u64 length; /* bytes from offset */
+ unsigned long flags;
+
+ /*
+ * An object request associated with an image will have its
+ * img_data flag set; a standalone object request will not.
+ *
+ * A standalone object request will have which == BAD_WHICH
+ * and a null obj_request pointer.
+ *
+ * An object request initiated in support of a layered image
+ * object (to check for its existence before a write) will
+ * have which == BAD_WHICH and a non-null obj_request pointer.
+ *
+ * Finally, an object request for rbd image data will have
+ * which != BAD_WHICH, and will have a non-null img_request
+ * pointer. The value of which will be in the range
+ * 0..(img_request->obj_request_count-1).
+ */
+ union {
+ struct rbd_obj_request *obj_request; /* STAT op */
+ struct {
+ struct rbd_img_request *img_request;
+ u64 img_offset;
+ /* links for img_request->obj_requests list */
+ struct list_head links;
+ };
+ };
+ u32 which; /* posn image request list */
+
+ enum obj_request_type type;
+ union {
+ struct bio *bio_list;
+ struct {
+ struct page **pages;
+ u32 page_count;
+ };
+ };
+ struct page **copyup_pages;
+ u32 copyup_page_count;
+
+ struct ceph_osd_request *osd_req;
+
+ u64 xferred; /* bytes transferred */
+ int result;
+
+ rbd_obj_callback_t callback;
+ struct completion completion;
+
+ struct kref kref;
+};
+
+enum img_req_flags {
+ IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */
+ IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */
+ IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */
+ IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */
+};
+
+struct rbd_img_request {
+ struct rbd_device *rbd_dev;
+ u64 offset; /* starting image byte offset */
+ u64 length; /* byte count from offset */
+ unsigned long flags;
+ union {
+ u64 snap_id; /* for reads */
+ struct ceph_snap_context *snapc; /* for writes */
+ };
+ union {
+ struct request *rq; /* block request */
+ struct rbd_obj_request *obj_request; /* obj req initiator */
+ };
+ struct page **copyup_pages;
+ u32 copyup_page_count;
+ spinlock_t completion_lock;/* protects next_completion */
+ u32 next_completion;
+ rbd_img_callback_t callback;
+ u64 xferred;/* aggregate bytes transferred */
+ int result; /* first nonzero obj_request result */
+
+ u32 obj_request_count;
+ struct list_head obj_requests; /* rbd_obj_request structs */
+
+ struct kref kref;
+};
+
+#define for_each_obj_request(ireq, oreq) \
+ list_for_each_entry(oreq, &(ireq)->obj_requests, links)
+#define for_each_obj_request_from(ireq, oreq) \
+ list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
+#define for_each_obj_request_safe(ireq, oreq, n) \
+ list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
+
+struct rbd_mapping {
+ u64 size;
+ u64 features;
+ bool read_only;
+};
+
+/*
+ * a single device
+ */
+struct rbd_device {
+ int dev_id; /* blkdev unique id */
+
+ int major; /* blkdev assigned major */
+ int minor;
+ struct gendisk *disk; /* blkdev's gendisk and rq */
+
+ u32 image_format; /* Either 1 or 2 */
+ struct rbd_client *rbd_client;
+
+ char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
+
+ spinlock_t lock; /* queue, flags, open_count */
+
+ struct rbd_image_header header;
+ unsigned long flags; /* possibly lock protected */
+ struct rbd_spec *spec;
+
+ char *header_name;
+
+ struct ceph_file_layout layout;
+
+ struct ceph_osd_event *watch_event;
+ struct rbd_obj_request *watch_request;
+
+ struct rbd_spec *parent_spec;
+ u64 parent_overlap;
+ atomic_t parent_ref;
+ struct rbd_device *parent;
+
+ /* Block layer tags. */
+ struct blk_mq_tag_set tag_set;
+
+ /* protects updating the header */
+ struct rw_semaphore header_rwsem;
+
+ struct rbd_mapping mapping;
+
+ struct list_head node;
+
+ /* sysfs related */
+ struct device dev;
+ unsigned long open_count; /* protected by lock */
+};
+
+/*
+ * Flag bits for rbd_dev->flags. If atomicity is required,
+ * rbd_dev->lock is used to protect access.
+ *
+ * Currently, only the "removing" flag (which is coupled with the
+ * "open_count" field) requires atomic access.
+ */
+enum rbd_dev_flags {
+ RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */
+ RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */
+};
+
+static DEFINE_MUTEX(client_mutex); /* Serialize client creation */
+
+static LIST_HEAD(rbd_dev_list); /* devices */
+static DEFINE_SPINLOCK(rbd_dev_list_lock);
+
+static LIST_HEAD(rbd_client_list); /* clients */
+static DEFINE_SPINLOCK(rbd_client_list_lock);
+
+/* Slab caches for frequently-allocated structures */
+
+static struct kmem_cache *rbd_img_request_cache;
+static struct kmem_cache *rbd_obj_request_cache;
+static struct kmem_cache *rbd_segment_name_cache;
+
+static int rbd_major;
+static DEFINE_IDA(rbd_dev_id_ida);
+
+static struct workqueue_struct *rbd_wq;
+
+/*
+ * Default to false for now, as single-major requires >= 0.75 version of
+ * userspace rbd utility.
+ */
+static bool single_major = false;
+module_param(single_major, bool, S_IRUGO);
+MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
+
+static int rbd_img_request_submit(struct rbd_img_request *img_request);
+
+static void rbd_dev_device_release(struct device *dev);
+
+static ssize_t rbd_add(struct bus_type *bus, const char *buf,
+ size_t count);
+static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
+ size_t count);
+static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
+ size_t count);
+static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
+ size_t count);
+static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping);
+static void rbd_spec_put(struct rbd_spec *spec);
+
+static int rbd_dev_id_to_minor(int dev_id)
+{
+ return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
+}
+
+static int minor_to_rbd_dev_id(int minor)
+{
+ return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
+}
+
+static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
+static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
+static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
+static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
+
+static struct attribute *rbd_bus_attrs[] = {
+ &bus_attr_add.attr,
+ &bus_attr_remove.attr,
+ &bus_attr_add_single_major.attr,
+ &bus_attr_remove_single_major.attr,
+ NULL,
+};
+
+static umode_t rbd_bus_is_visible(struct kobject *kobj,
+ struct attribute *attr, int index)
+{
+ if (!single_major &&
+ (attr == &bus_attr_add_single_major.attr ||
+ attr == &bus_attr_remove_single_major.attr))
+ return 0;
+
+ return attr->mode;
+}
+
+static const struct attribute_group rbd_bus_group = {
+ .attrs = rbd_bus_attrs,
+ .is_visible = rbd_bus_is_visible,
+};
+__ATTRIBUTE_GROUPS(rbd_bus);
+
+static struct bus_type rbd_bus_type = {
+ .name = "rbd",
+ .bus_groups = rbd_bus_groups,
+};
+
+static void rbd_root_dev_release(struct device *dev)
+{
+}
+
+static struct device rbd_root_dev = {
+ .init_name = "rbd",
+ .release = rbd_root_dev_release,
+};
+
+static __printf(2, 3)
+void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ if (!rbd_dev)
+ printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
+ else if (rbd_dev->disk)
+ printk(KERN_WARNING "%s: %s: %pV\n",
+ RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
+ else if (rbd_dev->spec && rbd_dev->spec->image_name)
+ printk(KERN_WARNING "%s: image %s: %pV\n",
+ RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
+ else if (rbd_dev->spec && rbd_dev->spec->image_id)
+ printk(KERN_WARNING "%s: id %s: %pV\n",
+ RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
+ else /* punt */
+ printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
+ RBD_DRV_NAME, rbd_dev, &vaf);
+ va_end(args);
+}
+
+#ifdef RBD_DEBUG
+#define rbd_assert(expr) \
+ if (unlikely(!(expr))) { \
+ printk(KERN_ERR "\nAssertion failure in %s() " \
+ "at line %d:\n\n" \
+ "\trbd_assert(%s);\n\n", \
+ __func__, __LINE__, #expr); \
+ BUG(); \
+ }
+#else /* !RBD_DEBUG */
+# define rbd_assert(expr) ((void) 0)
+#endif /* !RBD_DEBUG */
+
+static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
+static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
+static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
+
+static int rbd_dev_refresh(struct rbd_device *rbd_dev);
+static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
+static int rbd_dev_header_info(struct rbd_device *rbd_dev);
+static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
+static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
+ u64 snap_id);
+static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
+ u8 *order, u64 *snap_size);
+static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
+ u64 *snap_features);
+static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
+
+static int rbd_open(struct block_device *bdev, fmode_t mode)
+{
+ struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
+ bool removing = false;
+
+ if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
+ return -EROFS;
+
+ spin_lock_irq(&rbd_dev->lock);
+ if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
+ removing = true;
+ else
+ rbd_dev->open_count++;
+ spin_unlock_irq(&rbd_dev->lock);
+ if (removing)
+ return -ENOENT;
+
+ (void) get_device(&rbd_dev->dev);
+
+ return 0;
+}
+
+static void rbd_release(struct gendisk *disk, fmode_t mode)
+{
+ struct rbd_device *rbd_dev = disk->private_data;
+ unsigned long open_count_before;
+
+ spin_lock_irq(&rbd_dev->lock);
+ open_count_before = rbd_dev->open_count--;
+ spin_unlock_irq(&rbd_dev->lock);
+ rbd_assert(open_count_before > 0);
+
+ put_device(&rbd_dev->dev);
+}
+
+static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
+{
+ int ret = 0;
+ int val;
+ bool ro;
+ bool ro_changed = false;
+
+ /* get_user() may sleep, so call it before taking rbd_dev->lock */
+ if (get_user(val, (int __user *)(arg)))
+ return -EFAULT;
+
+ ro = val ? true : false;
+ /* Snapshot doesn't allow to write*/
+ if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
+ return -EROFS;
+
+ spin_lock_irq(&rbd_dev->lock);
+ /* prevent others open this device */
+ if (rbd_dev->open_count > 1) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ if (rbd_dev->mapping.read_only != ro) {
+ rbd_dev->mapping.read_only = ro;
+ ro_changed = true;
+ }
+
+out:
+ spin_unlock_irq(&rbd_dev->lock);
+ /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */
+ if (ret == 0 && ro_changed)
+ set_disk_ro(rbd_dev->disk, ro ? 1 : 0);
+
+ return ret;
+}
+
+static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
+ int ret = 0;
+
+ switch (cmd) {
+ case BLKROSET:
+ ret = rbd_ioctl_set_ro(rbd_dev, arg);
+ break;
+ default:
+ ret = -ENOTTY;
+ }
+
+ return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ return rbd_ioctl(bdev, mode, cmd, arg);
+}
+#endif /* CONFIG_COMPAT */
+
+static const struct block_device_operations rbd_bd_ops = {
+ .owner = THIS_MODULE,
+ .open = rbd_open,
+ .release = rbd_release,
+ .ioctl = rbd_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = rbd_compat_ioctl,
+#endif
+};
+
+/*
+ * Initialize an rbd client instance. Success or not, this function
+ * consumes ceph_opts. Caller holds client_mutex.
+ */
+static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
+{
+ struct rbd_client *rbdc;
+ int ret = -ENOMEM;
+
+ dout("%s:\n", __func__);
+ rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
+ if (!rbdc)
+ goto out_opt;
+
+ kref_init(&rbdc->kref);
+ INIT_LIST_HEAD(&rbdc->node);
+
+ rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
+ if (IS_ERR(rbdc->client))
+ goto out_rbdc;
+ ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
+
+ ret = ceph_open_session(rbdc->client);
+ if (ret < 0)
+ goto out_client;
+
+ spin_lock(&rbd_client_list_lock);
+ list_add_tail(&rbdc->node, &rbd_client_list);
+ spin_unlock(&rbd_client_list_lock);
+
+ dout("%s: rbdc %p\n", __func__, rbdc);
+
+ return rbdc;
+out_client:
+ ceph_destroy_client(rbdc->client);
+out_rbdc:
+ kfree(rbdc);
+out_opt:
+ if (ceph_opts)
+ ceph_destroy_options(ceph_opts);
+ dout("%s: error %d\n", __func__, ret);
+
+ return ERR_PTR(ret);
+}
+
+static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
+{
+ kref_get(&rbdc->kref);
+
+ return rbdc;
+}
+
+/*
+ * Find a ceph client with specific addr and configuration. If
+ * found, bump its reference count.
+ */
+static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
+{
+ struct rbd_client *client_node;
+ bool found = false;
+
+ if (ceph_opts->flags & CEPH_OPT_NOSHARE)
+ return NULL;
+
+ spin_lock(&rbd_client_list_lock);
+ list_for_each_entry(client_node, &rbd_client_list, node) {
+ if (!ceph_compare_options(ceph_opts, client_node->client)) {
+ __rbd_get_client(client_node);
+
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&rbd_client_list_lock);
+
+ return found ? client_node : NULL;
+}
+
+/*
+ * mount options
+ */
+enum {
+ Opt_last_int,
+ /* int args above */
+ Opt_last_string,
+ /* string args above */
+ Opt_read_only,
+ Opt_read_write,
+ /* Boolean args above */
+ Opt_last_bool,
+};
+
+static match_table_t rbd_opts_tokens = {
+ /* int args above */
+ /* string args above */
+ {Opt_read_only, "read_only"},
+ {Opt_read_only, "ro"}, /* Alternate spelling */
+ {Opt_read_write, "read_write"},
+ {Opt_read_write, "rw"}, /* Alternate spelling */
+ /* Boolean args above */
+ {-1, NULL}
+};
+
+struct rbd_options {
+ bool read_only;
+};
+
+#define RBD_READ_ONLY_DEFAULT false
+
+static int parse_rbd_opts_token(char *c, void *private)
+{
+ struct rbd_options *rbd_opts = private;
+ substring_t argstr[MAX_OPT_ARGS];
+ int token, intval, ret;
+
+ token = match_token(c, rbd_opts_tokens, argstr);
+ if (token < 0)
+ return -EINVAL;
+
+ if (token < Opt_last_int) {
+ ret = match_int(&argstr[0], &intval);
+ if (ret < 0) {
+ pr_err("bad mount option arg (not int) "
+ "at '%s'\n", c);
+ return ret;
+ }
+ dout("got int token %d val %d\n", token, intval);
+ } else if (token > Opt_last_int && token < Opt_last_string) {
+ dout("got string token %d val %s\n", token,
+ argstr[0].from);
+ } else if (token > Opt_last_string && token < Opt_last_bool) {
+ dout("got Boolean token %d\n", token);
+ } else {
+ dout("got token %d\n", token);
+ }
+
+ switch (token) {
+ case Opt_read_only:
+ rbd_opts->read_only = true;
+ break;
+ case Opt_read_write:
+ rbd_opts->read_only = false;
+ break;
+ default:
+ rbd_assert(false);
+ break;
+ }
+ return 0;
+}
+
+static char* obj_op_name(enum obj_operation_type op_type)
+{
+ switch (op_type) {
+ case OBJ_OP_READ:
+ return "read";
+ case OBJ_OP_WRITE:
+ return "write";
+ case OBJ_OP_DISCARD:
+ return "discard";
+ default:
+ return "???";
+ }
+}
+
+/*
+ * Get a ceph client with specific addr and configuration, if one does
+ * not exist create it. Either way, ceph_opts is consumed by this
+ * function.
+ */
+static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
+{
+ struct rbd_client *rbdc;
+
+ mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
+ rbdc = rbd_client_find(ceph_opts);
+ if (rbdc) /* using an existing client */
+ ceph_destroy_options(ceph_opts);
+ else
+ rbdc = rbd_client_create(ceph_opts);
+ mutex_unlock(&client_mutex);
+
+ return rbdc;
+}
+
+/*
+ * Destroy ceph client
+ *
+ * Caller must hold rbd_client_list_lock.
+ */
+static void rbd_client_release(struct kref *kref)
+{
+ struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
+
+ dout("%s: rbdc %p\n", __func__, rbdc);
+ spin_lock(&rbd_client_list_lock);
+ list_del(&rbdc->node);
+ spin_unlock(&rbd_client_list_lock);
+
+ ceph_destroy_client(rbdc->client);
+ kfree(rbdc);
+}
+
+/*
+ * Drop reference to ceph client node. If it's not referenced anymore, release
+ * it.
+ */
+static void rbd_put_client(struct rbd_client *rbdc)
+{
+ if (rbdc)
+ kref_put(&rbdc->kref, rbd_client_release);
+}
+
+static bool rbd_image_format_valid(u32 image_format)
+{
+ return image_format == 1 || image_format == 2;
+}
+
+static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
+{
+ size_t size;
+ u32 snap_count;
+
+ /* The header has to start with the magic rbd header text */
+ if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
+ return false;
+
+ /* The bio layer requires at least sector-sized I/O */
+
+ if (ondisk->options.order < SECTOR_SHIFT)
+ return false;
+
+ /* If we use u64 in a few spots we may be able to loosen this */
+
+ if (ondisk->options.order > 8 * sizeof (int) - 1)
+ return false;
+
+ /*
+ * The size of a snapshot header has to fit in a size_t, and
+ * that limits the number of snapshots.
+ */
+ snap_count = le32_to_cpu(ondisk->snap_count);
+ size = SIZE_MAX - sizeof (struct ceph_snap_context);
+ if (snap_count > size / sizeof (__le64))
+ return false;
+
+ /*
+ * Not only that, but the size of the entire the snapshot
+ * header must also be representable in a size_t.
+ */
+ size -= snap_count * sizeof (__le64);
+ if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
+ return false;
+
+ return true;
+}
+
+/*
+ * Fill an rbd image header with information from the given format 1
+ * on-disk header.
+ */
+static int rbd_header_from_disk(struct rbd_device *rbd_dev,
+ struct rbd_image_header_ondisk *ondisk)
+{
+ struct rbd_image_header *header = &rbd_dev->header;
+ bool first_time = header->object_prefix == NULL;
+ struct ceph_snap_context *snapc;
+ char *object_prefix = NULL;
+ char *snap_names = NULL;
+ u64 *snap_sizes = NULL;
+ u32 snap_count;
+ size_t size;
+ int ret = -ENOMEM;
+ u32 i;
+
+ /* Allocate this now to avoid having to handle failure below */
+
+ if (first_time) {
+ size_t len;
+
+ len = strnlen(ondisk->object_prefix,
+ sizeof (ondisk->object_prefix));
+ object_prefix = kmalloc(len + 1, GFP_KERNEL);
+ if (!object_prefix)
+ return -ENOMEM;
+ memcpy(object_prefix, ondisk->object_prefix, len);
+ object_prefix[len] = '\0';
+ }
+
+ /* Allocate the snapshot context and fill it in */
+
+ snap_count = le32_to_cpu(ondisk->snap_count);
+ snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
+ if (!snapc)
+ goto out_err;
+ snapc->seq = le64_to_cpu(ondisk->snap_seq);
+ if (snap_count) {
+ struct rbd_image_snap_ondisk *snaps;
+ u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
+
+ /* We'll keep a copy of the snapshot names... */
+
+ if (snap_names_len > (u64)SIZE_MAX)
+ goto out_2big;
+ snap_names = kmalloc(snap_names_len, GFP_KERNEL);
+ if (!snap_names)
+ goto out_err;
+
+ /* ...as well as the array of their sizes. */
+
+ size = snap_count * sizeof (*header->snap_sizes);
+ snap_sizes = kmalloc(size, GFP_KERNEL);
+ if (!snap_sizes)
+ goto out_err;
+
+ /*
+ * Copy the names, and fill in each snapshot's id
+ * and size.
+ *
+ * Note that rbd_dev_v1_header_info() guarantees the
+ * ondisk buffer we're working with has
+ * snap_names_len bytes beyond the end of the
+ * snapshot id array, this memcpy() is safe.
+ */
+ memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
+ snaps = ondisk->snaps;
+ for (i = 0; i < snap_count; i++) {
+ snapc->snaps[i] = le64_to_cpu(snaps[i].id);
+ snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
+ }
+ }
+
+ /* We won't fail any more, fill in the header */
+
+ if (first_time) {
+ header->object_prefix = object_prefix;
+ header->obj_order = ondisk->options.order;
+ header->crypt_type = ondisk->options.crypt_type;
+ header->comp_type = ondisk->options.comp_type;
+ /* The rest aren't used for format 1 images */
+ header->stripe_unit = 0;
+ header->stripe_count = 0;
+ header->features = 0;
+ } else {
+ ceph_put_snap_context(header->snapc);
+ kfree(header->snap_names);
+ kfree(header->snap_sizes);
+ }
+
+ /* The remaining fields always get updated (when we refresh) */
+
+ header->image_size = le64_to_cpu(ondisk->image_size);
+ header->snapc = snapc;
+ header->snap_names = snap_names;
+ header->snap_sizes = snap_sizes;
+
+ return 0;
+out_2big:
+ ret = -EIO;
+out_err:
+ kfree(snap_sizes);
+ kfree(snap_names);
+ ceph_put_snap_context(snapc);
+ kfree(object_prefix);
+
+ return ret;
+}
+
+static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
+{
+ const char *snap_name;
+
+ rbd_assert(which < rbd_dev->header.snapc->num_snaps);
+
+ /* Skip over names until we find the one we are looking for */
+
+ snap_name = rbd_dev->header.snap_names;
+ while (which--)
+ snap_name += strlen(snap_name) + 1;
+
+ return kstrdup(snap_name, GFP_KERNEL);
+}
+
+/*
+ * Snapshot id comparison function for use with qsort()/bsearch().
+ * Note that result is for snapshots in *descending* order.
+ */
+static int snapid_compare_reverse(const void *s1, const void *s2)
+{
+ u64 snap_id1 = *(u64 *)s1;
+ u64 snap_id2 = *(u64 *)s2;
+
+ if (snap_id1 < snap_id2)
+ return 1;
+ return snap_id1 == snap_id2 ? 0 : -1;
+}
+
+/*
+ * Search a snapshot context to see if the given snapshot id is
+ * present.
+ *
+ * Returns the position of the snapshot id in the array if it's found,
+ * or BAD_SNAP_INDEX otherwise.
+ *
+ * Note: The snapshot array is in kept sorted (by the osd) in
+ * reverse order, highest snapshot id first.
+ */
+static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
+{
+ struct ceph_snap_context *snapc = rbd_dev->header.snapc;
+ u64 *found;
+
+ found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
+ sizeof (snap_id), snapid_compare_reverse);
+
+ return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
+}
+
+static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
+ u64 snap_id)
+{
+ u32 which;
+ const char *snap_name;
+
+ which = rbd_dev_snap_index(rbd_dev, snap_id);
+ if (which == BAD_SNAP_INDEX)
+ return ERR_PTR(-ENOENT);
+
+ snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
+ return snap_name ? snap_name : ERR_PTR(-ENOMEM);
+}
+
+static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
+{
+ if (snap_id == CEPH_NOSNAP)
+ return RBD_SNAP_HEAD_NAME;
+
+ rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+ if (rbd_dev->image_format == 1)
+ return rbd_dev_v1_snap_name(rbd_dev, snap_id);
+
+ return rbd_dev_v2_snap_name(rbd_dev, snap_id);
+}
+
+static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
+ u64 *snap_size)
+{
+ rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+ if (snap_id == CEPH_NOSNAP) {
+ *snap_size = rbd_dev->header.image_size;
+ } else if (rbd_dev->image_format == 1) {
+ u32 which;
+
+ which = rbd_dev_snap_index(rbd_dev, snap_id);
+ if (which == BAD_SNAP_INDEX)
+ return -ENOENT;
+
+ *snap_size = rbd_dev->header.snap_sizes[which];
+ } else {
+ u64 size = 0;
+ int ret;
+
+ ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
+ if (ret)
+ return ret;
+
+ *snap_size = size;
+ }
+ return 0;
+}
+
+static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
+ u64 *snap_features)
+{
+ rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+ if (snap_id == CEPH_NOSNAP) {
+ *snap_features = rbd_dev->header.features;
+ } else if (rbd_dev->image_format == 1) {
+ *snap_features = 0; /* No features for format 1 */
+ } else {
+ u64 features = 0;
+ int ret;
+
+ ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
+ if (ret)
+ return ret;
+
+ *snap_features = features;
+ }
+ return 0;
+}
+
+static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
+{
+ u64 snap_id = rbd_dev->spec->snap_id;
+ u64 size = 0;
+ u64 features = 0;
+ int ret;
+
+ ret = rbd_snap_size(rbd_dev, snap_id, &size);
+ if (ret)
+ return ret;
+ ret = rbd_snap_features(rbd_dev, snap_id, &features);
+ if (ret)
+ return ret;
+
+ rbd_dev->mapping.size = size;
+ rbd_dev->mapping.features = features;
+
+ return 0;
+}
+
+static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
+{
+ rbd_dev->mapping.size = 0;
+ rbd_dev->mapping.features = 0;
+}
+
+static void rbd_segment_name_free(const char *name)
+{
+ /* The explicit cast here is needed to drop the const qualifier */
+
+ kmem_cache_free(rbd_segment_name_cache, (void *)name);
+}
+
+static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
+{
+ char *name;
+ u64 segment;
+ int ret;
+ char *name_format;
+
+ name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
+ if (!name)
+ return NULL;
+ segment = offset >> rbd_dev->header.obj_order;
+ name_format = "%s.%012llx";
+ if (rbd_dev->image_format == 2)
+ name_format = "%s.%016llx";
+ ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
+ rbd_dev->header.object_prefix, segment);
+ if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
+ pr_err("error formatting segment name for #%llu (%d)\n",
+ segment, ret);
+ rbd_segment_name_free(name);
+ name = NULL;
+ }
+
+ return name;
+}
+
+static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
+{
+ u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
+
+ return offset & (segment_size - 1);
+}
+
+static u64 rbd_segment_length(struct rbd_device *rbd_dev,
+ u64 offset, u64 length)
+{
+ u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
+
+ offset &= segment_size - 1;
+
+ rbd_assert(length <= U64_MAX - offset);
+ if (offset + length > segment_size)
+ length = segment_size - offset;
+
+ return length;
+}
+
+/*
+ * returns the size of an object in the image
+ */
+static u64 rbd_obj_bytes(struct rbd_image_header *header)
+{
+ return 1 << header->obj_order;
+}
+
+/*
+ * bio helpers
+ */
+
+static void bio_chain_put(struct bio *chain)
+{
+ struct bio *tmp;
+
+ while (chain) {
+ tmp = chain;
+ chain = chain->bi_next;
+ bio_put(tmp);
+ }
+}
+
+/*
+ * zeros a bio chain, starting at specific offset
+ */
+static void zero_bio_chain(struct bio *chain, int start_ofs)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ unsigned long flags;
+ void *buf;
+ int pos = 0;
+
+ while (chain) {
+ bio_for_each_segment(bv, chain, iter) {
+ if (pos + bv.bv_len > start_ofs) {
+ int remainder = max(start_ofs - pos, 0);
+ buf = bvec_kmap_irq(&bv, &flags);
+ memset(buf + remainder, 0,
+ bv.bv_len - remainder);
+ flush_dcache_page(bv.bv_page);
+ bvec_kunmap_irq(buf, &flags);
+ }
+ pos += bv.bv_len;
+ }
+
+ chain = chain->bi_next;
+ }
+}
+
+/*
+ * similar to zero_bio_chain(), zeros data defined by a page array,
+ * starting at the given byte offset from the start of the array and
+ * continuing up to the given end offset. The pages array is
+ * assumed to be big enough to hold all bytes up to the end.
+ */
+static void zero_pages(struct page **pages, u64 offset, u64 end)
+{
+ struct page **page = &pages[offset >> PAGE_SHIFT];
+
+ rbd_assert(end > offset);
+ rbd_assert(end - offset <= (u64)SIZE_MAX);
+ while (offset < end) {
+ size_t page_offset;
+ size_t length;
+ unsigned long flags;
+ void *kaddr;
+
+ page_offset = offset & ~PAGE_MASK;
+ length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
+ local_irq_save(flags);
+ kaddr = kmap_atomic(*page);
+ memset(kaddr + page_offset, 0, length);
+ flush_dcache_page(*page);
+ kunmap_atomic(kaddr);
+ local_irq_restore(flags);
+
+ offset += length;
+ page++;
+ }
+}
+
+/*
+ * Clone a portion of a bio, starting at the given byte offset
+ * and continuing for the number of bytes indicated.
+ */
+static struct bio *bio_clone_range(struct bio *bio_src,
+ unsigned int offset,
+ unsigned int len,
+ gfp_t gfpmask)
+{
+ struct bio *bio;
+
+ bio = bio_clone(bio_src, gfpmask);
+ if (!bio)
+ return NULL; /* ENOMEM */
+
+ bio_advance(bio, offset);
+ bio->bi_iter.bi_size = len;
+
+ return bio;
+}
+
+/*
+ * Clone a portion of a bio chain, starting at the given byte offset
+ * into the first bio in the source chain and continuing for the
+ * number of bytes indicated. The result is another bio chain of
+ * exactly the given length, or a null pointer on error.
+ *
+ * The bio_src and offset parameters are both in-out. On entry they
+ * refer to the first source bio and the offset into that bio where
+ * the start of data to be cloned is located.
+ *
+ * On return, bio_src is updated to refer to the bio in the source
+ * chain that contains first un-cloned byte, and *offset will
+ * contain the offset of that byte within that bio.
+ */
+static struct bio *bio_chain_clone_range(struct bio **bio_src,
+ unsigned int *offset,
+ unsigned int len,
+ gfp_t gfpmask)
+{
+ struct bio *bi = *bio_src;
+ unsigned int off = *offset;
+ struct bio *chain = NULL;
+ struct bio **end;
+
+ /* Build up a chain of clone bios up to the limit */
+
+ if (!bi || off >= bi->bi_iter.bi_size || !len)
+ return NULL; /* Nothing to clone */
+
+ end = &chain;
+ while (len) {
+ unsigned int bi_size;
+ struct bio *bio;
+
+ if (!bi) {
+ rbd_warn(NULL, "bio_chain exhausted with %u left", len);
+ goto out_err; /* EINVAL; ran out of bio's */
+ }
+ bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
+ bio = bio_clone_range(bi, off, bi_size, gfpmask);
+ if (!bio)
+ goto out_err; /* ENOMEM */
+
+ *end = bio;
+ end = &bio->bi_next;
+
+ off += bi_size;
+ if (off == bi->bi_iter.bi_size) {
+ bi = bi->bi_next;
+ off = 0;
+ }
+ len -= bi_size;
+ }
+ *bio_src = bi;
+ *offset = off;
+
+ return chain;
+out_err:
+ bio_chain_put(chain);
+
+ return NULL;
+}
+
+/*
+ * The default/initial value for all object request flags is 0. For
+ * each flag, once its value is set to 1 it is never reset to 0
+ * again.
+ */
+static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
+{
+ if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
+ struct rbd_device *rbd_dev;
+
+ rbd_dev = obj_request->img_request->rbd_dev;
+ rbd_warn(rbd_dev, "obj_request %p already marked img_data",
+ obj_request);
+ }
+}
+
+static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
+{
+ smp_mb();
+ return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
+}
+
+static void obj_request_done_set(struct rbd_obj_request *obj_request)
+{
+ if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
+ struct rbd_device *rbd_dev = NULL;
+
+ if (obj_request_img_data_test(obj_request))
+ rbd_dev = obj_request->img_request->rbd_dev;
+ rbd_warn(rbd_dev, "obj_request %p already marked done",
+ obj_request);
+ }
+}
+
+static bool obj_request_done_test(struct rbd_obj_request *obj_request)
+{
+ smp_mb();
+ return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
+}
+
+/*
+ * This sets the KNOWN flag after (possibly) setting the EXISTS
+ * flag. The latter is set based on the "exists" value provided.
+ *
+ * Note that for our purposes once an object exists it never goes
+ * away again. It's possible that the response from two existence
+ * checks are separated by the creation of the target object, and
+ * the first ("doesn't exist") response arrives *after* the second
+ * ("does exist"). In that case we ignore the second one.
+ */
+static void obj_request_existence_set(struct rbd_obj_request *obj_request,
+ bool exists)
+{
+ if (exists)
+ set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
+ set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
+ smp_mb();
+}
+
+static bool obj_request_known_test(struct rbd_obj_request *obj_request)
+{
+ smp_mb();
+ return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
+}
+
+static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
+{
+ smp_mb();
+ return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
+}
+
+static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
+{
+ struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
+
+ return obj_request->img_offset <
+ round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
+}
+
+static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
+{
+ dout("%s: obj %p (was %d)\n", __func__, obj_request,
+ atomic_read(&obj_request->kref.refcount));
+ kref_get(&obj_request->kref);
+}
+
+static void rbd_obj_request_destroy(struct kref *kref);
+static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
+{
+ rbd_assert(obj_request != NULL);
+ dout("%s: obj %p (was %d)\n", __func__, obj_request,
+ atomic_read(&obj_request->kref.refcount));
+ kref_put(&obj_request->kref, rbd_obj_request_destroy);
+}
+
+static void rbd_img_request_get(struct rbd_img_request *img_request)
+{
+ dout("%s: img %p (was %d)\n", __func__, img_request,
+ atomic_read(&img_request->kref.refcount));
+ kref_get(&img_request->kref);
+}
+
+static bool img_request_child_test(struct rbd_img_request *img_request);
+static void rbd_parent_request_destroy(struct kref *kref);
+static void rbd_img_request_destroy(struct kref *kref);
+static void rbd_img_request_put(struct rbd_img_request *img_request)
+{
+ rbd_assert(img_request != NULL);
+ dout("%s: img %p (was %d)\n", __func__, img_request,
+ atomic_read(&img_request->kref.refcount));
+ if (img_request_child_test(img_request))
+ kref_put(&img_request->kref, rbd_parent_request_destroy);
+ else
+ kref_put(&img_request->kref, rbd_img_request_destroy);
+}
+
+static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
+ struct rbd_obj_request *obj_request)
+{
+ rbd_assert(obj_request->img_request == NULL);
+
+ /* Image request now owns object's original reference */
+ obj_request->img_request = img_request;
+ obj_request->which = img_request->obj_request_count;
+ rbd_assert(!obj_request_img_data_test(obj_request));
+ obj_request_img_data_set(obj_request);
+ rbd_assert(obj_request->which != BAD_WHICH);
+ img_request->obj_request_count++;
+ list_add_tail(&obj_request->links, &img_request->obj_requests);
+ dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
+ obj_request->which);
+}
+
+static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
+ struct rbd_obj_request *obj_request)
+{
+ rbd_assert(obj_request->which != BAD_WHICH);
+
+ dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
+ obj_request->which);
+ list_del(&obj_request->links);
+ rbd_assert(img_request->obj_request_count > 0);
+ img_request->obj_request_count--;
+ rbd_assert(obj_request->which == img_request->obj_request_count);
+ obj_request->which = BAD_WHICH;
+ rbd_assert(obj_request_img_data_test(obj_request));
+ rbd_assert(obj_request->img_request == img_request);
+ obj_request->img_request = NULL;
+ obj_request->callback = NULL;
+ rbd_obj_request_put(obj_request);
+}
+
+static bool obj_request_type_valid(enum obj_request_type type)
+{
+ switch (type) {
+ case OBJ_REQUEST_NODATA:
+ case OBJ_REQUEST_BIO:
+ case OBJ_REQUEST_PAGES:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
+ struct rbd_obj_request *obj_request)
+{
+ dout("%s %p\n", __func__, obj_request);
+ return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
+}
+
+static void rbd_obj_request_end(struct rbd_obj_request *obj_request)
+{
+ dout("%s %p\n", __func__, obj_request);
+ ceph_osdc_cancel_request(obj_request->osd_req);
+}
+
+/*
+ * Wait for an object request to complete. If interrupted, cancel the
+ * underlying osd request.
+ */
+static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
+{
+ int ret;
+
+ dout("%s %p\n", __func__, obj_request);
+
+ ret = wait_for_completion_interruptible(&obj_request->completion);
+ if (ret < 0) {
+ dout("%s %p interrupted\n", __func__, obj_request);
+ rbd_obj_request_end(obj_request);
+ return ret;
+ }
+
+ dout("%s %p done\n", __func__, obj_request);
+ return 0;
+}
+
+static void rbd_img_request_complete(struct rbd_img_request *img_request)
+{
+
+ dout("%s: img %p\n", __func__, img_request);
+
+ /*
+ * If no error occurred, compute the aggregate transfer
+ * count for the image request. We could instead use
+ * atomic64_cmpxchg() to update it as each object request
+ * completes; not clear which way is better off hand.
+ */
+ if (!img_request->result) {
+ struct rbd_obj_request *obj_request;
+ u64 xferred = 0;
+
+ for_each_obj_request(img_request, obj_request)
+ xferred += obj_request->xferred;
+ img_request->xferred = xferred;
+ }
+
+ if (img_request->callback)
+ img_request->callback(img_request);
+ else
+ rbd_img_request_put(img_request);
+}
+
+/*
+ * The default/initial value for all image request flags is 0. Each
+ * is conditionally set to 1 at image request initialization time
+ * and currently never change thereafter.
+ */
+static void img_request_write_set(struct rbd_img_request *img_request)
+{
+ set_bit(IMG_REQ_WRITE, &img_request->flags);
+ smp_mb();
+}
+
+static bool img_request_write_test(struct rbd_img_request *img_request)
+{
+ smp_mb();
+ return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
+}
+
+/*
+ * Set the discard flag when the img_request is an discard request
+ */
+static void img_request_discard_set(struct rbd_img_request *img_request)
+{
+ set_bit(IMG_REQ_DISCARD, &img_request->flags);
+ smp_mb();
+}
+
+static bool img_request_discard_test(struct rbd_img_request *img_request)
+{
+ smp_mb();
+ return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0;
+}
+
+static void img_request_child_set(struct rbd_img_request *img_request)
+{
+ set_bit(IMG_REQ_CHILD, &img_request->flags);
+ smp_mb();
+}
+
+static void img_request_child_clear(struct rbd_img_request *img_request)
+{
+ clear_bit(IMG_REQ_CHILD, &img_request->flags);
+ smp_mb();
+}
+
+static bool img_request_child_test(struct rbd_img_request *img_request)
+{
+ smp_mb();
+ return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
+}
+
+static void img_request_layered_set(struct rbd_img_request *img_request)
+{
+ set_bit(IMG_REQ_LAYERED, &img_request->flags);
+ smp_mb();
+}
+
+static void img_request_layered_clear(struct rbd_img_request *img_request)
+{
+ clear_bit(IMG_REQ_LAYERED, &img_request->flags);
+ smp_mb();
+}
+
+static bool img_request_layered_test(struct rbd_img_request *img_request)
+{
+ smp_mb();
+ return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
+}
+
+static enum obj_operation_type
+rbd_img_request_op_type(struct rbd_img_request *img_request)
+{
+ if (img_request_write_test(img_request))
+ return OBJ_OP_WRITE;
+ else if (img_request_discard_test(img_request))
+ return OBJ_OP_DISCARD;
+ else
+ return OBJ_OP_READ;
+}
+
+static void
+rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
+{
+ u64 xferred = obj_request->xferred;
+ u64 length = obj_request->length;
+
+ dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
+ obj_request, obj_request->img_request, obj_request->result,
+ xferred, length);
+ /*
+ * ENOENT means a hole in the image. We zero-fill the entire
+ * length of the request. A short read also implies zero-fill
+ * to the end of the request. An error requires the whole
+ * length of the request to be reported finished with an error
+ * to the block layer. In each case we update the xferred
+ * count to indicate the whole request was satisfied.
+ */
+ rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
+ if (obj_request->result == -ENOENT) {
+ if (obj_request->type == OBJ_REQUEST_BIO)
+ zero_bio_chain(obj_request->bio_list, 0);
+ else
+ zero_pages(obj_request->pages, 0, length);
+ obj_request->result = 0;
+ } else if (xferred < length && !obj_request->result) {
+ if (obj_request->type == OBJ_REQUEST_BIO)
+ zero_bio_chain(obj_request->bio_list, xferred);
+ else
+ zero_pages(obj_request->pages, xferred, length);
+ }
+ obj_request->xferred = length;
+ obj_request_done_set(obj_request);
+}
+
+static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
+{
+ dout("%s: obj %p cb %p\n", __func__, obj_request,
+ obj_request->callback);
+ if (obj_request->callback)
+ obj_request->callback(obj_request);
+ else
+ complete_all(&obj_request->completion);
+}
+
+static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
+{
+ dout("%s: obj %p\n", __func__, obj_request);
+ obj_request_done_set(obj_request);
+}
+
+static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
+{
+ struct rbd_img_request *img_request = NULL;
+ struct rbd_device *rbd_dev = NULL;
+ bool layered = false;
+
+ if (obj_request_img_data_test(obj_request)) {
+ img_request = obj_request->img_request;
+ layered = img_request && img_request_layered_test(img_request);
+ rbd_dev = img_request->rbd_dev;
+ }
+
+ dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
+ obj_request, img_request, obj_request->result,
+ obj_request->xferred, obj_request->length);
+ if (layered && obj_request->result == -ENOENT &&
+ obj_request->img_offset < rbd_dev->parent_overlap)
+ rbd_img_parent_read(obj_request);
+ else if (img_request)
+ rbd_img_obj_request_read_callback(obj_request);
+ else
+ obj_request_done_set(obj_request);
+}
+
+static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
+{
+ dout("%s: obj %p result %d %llu\n", __func__, obj_request,
+ obj_request->result, obj_request->length);
+ /*
+ * There is no such thing as a successful short write. Set
+ * it to our originally-requested length.
+ */
+ obj_request->xferred = obj_request->length;
+ obj_request_done_set(obj_request);
+}
+
+static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
+{
+ dout("%s: obj %p result %d %llu\n", __func__, obj_request,
+ obj_request->result, obj_request->length);
+ /*
+ * There is no such thing as a successful short discard. Set
+ * it to our originally-requested length.
+ */
+ obj_request->xferred = obj_request->length;
+ /* discarding a non-existent object is not a problem */
+ if (obj_request->result == -ENOENT)
+ obj_request->result = 0;
+ obj_request_done_set(obj_request);
+}
+
+/*
+ * For a simple stat call there's nothing to do. We'll do more if
+ * this is part of a write sequence for a layered image.
+ */
+static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
+{
+ dout("%s: obj %p\n", __func__, obj_request);
+ obj_request_done_set(obj_request);
+}
+
+static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
+ struct ceph_msg *msg)
+{
+ struct rbd_obj_request *obj_request = osd_req->r_priv;
+ u16 opcode;
+
+ dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
+ rbd_assert(osd_req == obj_request->osd_req);
+ if (obj_request_img_data_test(obj_request)) {
+ rbd_assert(obj_request->img_request);
+ rbd_assert(obj_request->which != BAD_WHICH);
+ } else {
+ rbd_assert(obj_request->which == BAD_WHICH);
+ }
+
+ if (osd_req->r_result < 0)
+ obj_request->result = osd_req->r_result;
+
+ rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);
+
+ /*
+ * We support a 64-bit length, but ultimately it has to be
+ * passed to the block layer, which just supports a 32-bit
+ * length field.
+ */
+ obj_request->xferred = osd_req->r_reply_op_len[0];
+ rbd_assert(obj_request->xferred < (u64)UINT_MAX);
+
+ opcode = osd_req->r_ops[0].op;
+ switch (opcode) {
+ case CEPH_OSD_OP_READ:
+ rbd_osd_read_callback(obj_request);
+ break;
+ case CEPH_OSD_OP_SETALLOCHINT:
+ rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE);
+ /* fall through */
+ case CEPH_OSD_OP_WRITE:
+ rbd_osd_write_callback(obj_request);
+ break;
+ case CEPH_OSD_OP_STAT:
+ rbd_osd_stat_callback(obj_request);
+ break;
+ case CEPH_OSD_OP_DELETE:
+ case CEPH_OSD_OP_TRUNCATE:
+ case CEPH_OSD_OP_ZERO:
+ rbd_osd_discard_callback(obj_request);
+ break;
+ case CEPH_OSD_OP_CALL:
+ case CEPH_OSD_OP_NOTIFY_ACK:
+ case CEPH_OSD_OP_WATCH:
+ rbd_osd_trivial_callback(obj_request);
+ break;
+ default:
+ rbd_warn(NULL, "%s: unsupported op %hu",
+ obj_request->object_name, (unsigned short) opcode);
+ break;
+ }
+
+ if (obj_request_done_test(obj_request))
+ rbd_obj_request_complete(obj_request);
+}
+
+static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
+{
+ struct rbd_img_request *img_request = obj_request->img_request;
+ struct ceph_osd_request *osd_req = obj_request->osd_req;
+ u64 snap_id;
+
+ rbd_assert(osd_req != NULL);
+
+ snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
+ ceph_osdc_build_request(osd_req, obj_request->offset,
+ NULL, snap_id, NULL);
+}
+
+static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
+{
+ struct rbd_img_request *img_request = obj_request->img_request;
+ struct ceph_osd_request *osd_req = obj_request->osd_req;
+ struct ceph_snap_context *snapc;
+ struct timespec mtime = CURRENT_TIME;
+
+ rbd_assert(osd_req != NULL);
+
+ snapc = img_request ? img_request->snapc : NULL;
+ ceph_osdc_build_request(osd_req, obj_request->offset,
+ snapc, CEPH_NOSNAP, &mtime);
+}
+
+/*
+ * Create an osd request. A read request has one osd op (read).
+ * A write request has either one (watch) or two (hint+write) osd ops.
+ * (All rbd data writes are prefixed with an allocation hint op, but
+ * technically osd watch is a write request, hence this distinction.)
+ */
+static struct ceph_osd_request *rbd_osd_req_create(
+ struct rbd_device *rbd_dev,
+ enum obj_operation_type op_type,
+ unsigned int num_ops,
+ struct rbd_obj_request *obj_request)
+{
+ struct ceph_snap_context *snapc = NULL;
+ struct ceph_osd_client *osdc;
+ struct ceph_osd_request *osd_req;
+
+ if (obj_request_img_data_test(obj_request) &&
+ (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
+ struct rbd_img_request *img_request = obj_request->img_request;
+ if (op_type == OBJ_OP_WRITE) {
+ rbd_assert(img_request_write_test(img_request));
+ } else {
+ rbd_assert(img_request_discard_test(img_request));
+ }
+ snapc = img_request->snapc;
+ }
+
+ rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));
+
+ /* Allocate and initialize the request, for the num_ops ops */
+
+ osdc = &rbd_dev->rbd_client->client->osdc;
+ osd_req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false,
+ GFP_ATOMIC);
+ if (!osd_req)
+ return NULL; /* ENOMEM */
+
+ if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
+ osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+ else
+ osd_req->r_flags = CEPH_OSD_FLAG_READ;
+
+ osd_req->r_callback = rbd_osd_req_callback;
+ osd_req->r_priv = obj_request;
+
+ osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+ ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
+
+ return osd_req;
+}
+
+/*
+ * Create a copyup osd request based on the information in the object
+ * request supplied. A copyup request has two or three osd ops, a
+ * copyup method call, potentially a hint op, and a write or truncate
+ * or zero op.
+ */
+static struct ceph_osd_request *
+rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
+{
+ struct rbd_img_request *img_request;
+ struct ceph_snap_context *snapc;
+ struct rbd_device *rbd_dev;
+ struct ceph_osd_client *osdc;
+ struct ceph_osd_request *osd_req;
+ int num_osd_ops = 3;
+
+ rbd_assert(obj_request_img_data_test(obj_request));
+ img_request = obj_request->img_request;
+ rbd_assert(img_request);
+ rbd_assert(img_request_write_test(img_request) ||
+ img_request_discard_test(img_request));
+
+ if (img_request_discard_test(img_request))
+ num_osd_ops = 2;
+
+ /* Allocate and initialize the request, for all the ops */
+
+ snapc = img_request->snapc;
+ rbd_dev = img_request->rbd_dev;
+ osdc = &rbd_dev->rbd_client->client->osdc;
+ osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
+ false, GFP_ATOMIC);
+ if (!osd_req)
+ return NULL; /* ENOMEM */
+
+ osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
+ osd_req->r_callback = rbd_osd_req_callback;
+ osd_req->r_priv = obj_request;
+
+ osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+ ceph_oid_set_name(&osd_req->r_base_oid, obj_request->object_name);
+
+ return osd_req;
+}
+
+
+static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
+{
+ ceph_osdc_put_request(osd_req);
+}
+
+/* object_name is assumed to be a non-null pointer and NUL-terminated */
+
+static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
+ u64 offset, u64 length,
+ enum obj_request_type type)
+{
+ struct rbd_obj_request *obj_request;
+ size_t size;
+ char *name;
+
+ rbd_assert(obj_request_type_valid(type));
+
+ size = strlen(object_name) + 1;
+ name = kmalloc(size, GFP_NOIO);
+ if (!name)
+ return NULL;
+
+ obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
+ if (!obj_request) {
+ kfree(name);
+ return NULL;
+ }
+
+ obj_request->object_name = memcpy(name, object_name, size);
+ obj_request->offset = offset;
+ obj_request->length = length;
+ obj_request->flags = 0;
+ obj_request->which = BAD_WHICH;
+ obj_request->type = type;
+ INIT_LIST_HEAD(&obj_request->links);
+ init_completion(&obj_request->completion);
+ kref_init(&obj_request->kref);
+
+ dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
+ offset, length, (int)type, obj_request);
+
+ return obj_request;
+}
+
+static void rbd_obj_request_destroy(struct kref *kref)
+{
+ struct rbd_obj_request *obj_request;
+
+ obj_request = container_of(kref, struct rbd_obj_request, kref);
+
+ dout("%s: obj %p\n", __func__, obj_request);
+
+ rbd_assert(obj_request->img_request == NULL);
+ rbd_assert(obj_request->which == BAD_WHICH);
+
+ if (obj_request->osd_req)
+ rbd_osd_req_destroy(obj_request->osd_req);
+
+ rbd_assert(obj_request_type_valid(obj_request->type));
+ switch (obj_request->type) {
+ case OBJ_REQUEST_NODATA:
+ break; /* Nothing to do */
+ case OBJ_REQUEST_BIO:
+ if (obj_request->bio_list)
+ bio_chain_put(obj_request->bio_list);
+ break;
+ case OBJ_REQUEST_PAGES:
+ if (obj_request->pages)
+ ceph_release_page_vector(obj_request->pages,
+ obj_request->page_count);
+ break;
+ }
+
+ kfree(obj_request->object_name);
+ obj_request->object_name = NULL;
+ kmem_cache_free(rbd_obj_request_cache, obj_request);
+}
+
+/* It's OK to call this for a device with no parent */
+
+static void rbd_spec_put(struct rbd_spec *spec);
+static void rbd_dev_unparent(struct rbd_device *rbd_dev)
+{
+ rbd_dev_remove_parent(rbd_dev);
+ rbd_spec_put(rbd_dev->parent_spec);
+ rbd_dev->parent_spec = NULL;
+ rbd_dev->parent_overlap = 0;
+}
+
+/*
+ * Parent image reference counting is used to determine when an
+ * image's parent fields can be safely torn down--after there are no
+ * more in-flight requests to the parent image. When the last
+ * reference is dropped, cleaning them up is safe.
+ */
+static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
+{
+ int counter;
+
+ if (!rbd_dev->parent_spec)
+ return;
+
+ counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
+ if (counter > 0)
+ return;
+
+ /* Last reference; clean up parent data structures */
+
+ if (!counter)
+ rbd_dev_unparent(rbd_dev);
+ else
+ rbd_warn(rbd_dev, "parent reference underflow");
+}
+
+/*
+ * If an image has a non-zero parent overlap, get a reference to its
+ * parent.
+ *
+ * Returns true if the rbd device has a parent with a non-zero
+ * overlap and a reference for it was successfully taken, or
+ * false otherwise.
+ */
+static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
+{
+ int counter = 0;
+
+ if (!rbd_dev->parent_spec)
+ return false;
+
+ down_read(&rbd_dev->header_rwsem);
+ if (rbd_dev->parent_overlap)
+ counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
+ up_read(&rbd_dev->header_rwsem);
+
+ if (counter < 0)
+ rbd_warn(rbd_dev, "parent reference overflow");
+
+ return counter > 0;
+}
+
+/*
+ * Caller is responsible for filling in the list of object requests
+ * that comprises the image request, and the Linux request pointer
+ * (if there is one).
+ */
+static struct rbd_img_request *rbd_img_request_create(
+ struct rbd_device *rbd_dev,
+ u64 offset, u64 length,
+ enum obj_operation_type op_type,
+ struct ceph_snap_context *snapc)
+{
+ struct rbd_img_request *img_request;
+
+ img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
+ if (!img_request)
+ return NULL;
+
+ img_request->rq = NULL;
+ img_request->rbd_dev = rbd_dev;
+ img_request->offset = offset;
+ img_request->length = length;
+ img_request->flags = 0;
+ if (op_type == OBJ_OP_DISCARD) {
+ img_request_discard_set(img_request);
+ img_request->snapc = snapc;
+ } else if (op_type == OBJ_OP_WRITE) {
+ img_request_write_set(img_request);
+ img_request->snapc = snapc;
+ } else {
+ img_request->snap_id = rbd_dev->spec->snap_id;
+ }
+ if (rbd_dev_parent_get(rbd_dev))
+ img_request_layered_set(img_request);
+ spin_lock_init(&img_request->completion_lock);
+ img_request->next_completion = 0;
+ img_request->callback = NULL;
+ img_request->result = 0;
+ img_request->obj_request_count = 0;
+ INIT_LIST_HEAD(&img_request->obj_requests);
+ kref_init(&img_request->kref);
+
+ dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
+ obj_op_name(op_type), offset, length, img_request);
+
+ return img_request;
+}
+
+static void rbd_img_request_destroy(struct kref *kref)
+{
+ struct rbd_img_request *img_request;
+ struct rbd_obj_request *obj_request;
+ struct rbd_obj_request *next_obj_request;
+
+ img_request = container_of(kref, struct rbd_img_request, kref);
+
+ dout("%s: img %p\n", __func__, img_request);
+
+ for_each_obj_request_safe(img_request, obj_request, next_obj_request)
+ rbd_img_obj_request_del(img_request, obj_request);
+ rbd_assert(img_request->obj_request_count == 0);
+
+ if (img_request_layered_test(img_request)) {
+ img_request_layered_clear(img_request);
+ rbd_dev_parent_put(img_request->rbd_dev);
+ }
+
+ if (img_request_write_test(img_request) ||
+ img_request_discard_test(img_request))
+ ceph_put_snap_context(img_request->snapc);
+
+ kmem_cache_free(rbd_img_request_cache, img_request);
+}
+
+static struct rbd_img_request *rbd_parent_request_create(
+ struct rbd_obj_request *obj_request,
+ u64 img_offset, u64 length)
+{
+ struct rbd_img_request *parent_request;
+ struct rbd_device *rbd_dev;
+
+ rbd_assert(obj_request->img_request);
+ rbd_dev = obj_request->img_request->rbd_dev;
+
+ parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
+ length, OBJ_OP_READ, NULL);
+ if (!parent_request)
+ return NULL;
+
+ img_request_child_set(parent_request);
+ rbd_obj_request_get(obj_request);
+ parent_request->obj_request = obj_request;
+
+ return parent_request;
+}
+
+static void rbd_parent_request_destroy(struct kref *kref)
+{
+ struct rbd_img_request *parent_request;
+ struct rbd_obj_request *orig_request;
+
+ parent_request = container_of(kref, struct rbd_img_request, kref);
+ orig_request = parent_request->obj_request;
+
+ parent_request->obj_request = NULL;
+ rbd_obj_request_put(orig_request);
+ img_request_child_clear(parent_request);
+
+ rbd_img_request_destroy(kref);
+}
+
+static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
+{
+ struct rbd_img_request *img_request;
+ unsigned int xferred;
+ int result;
+ bool more;
+
+ rbd_assert(obj_request_img_data_test(obj_request));
+ img_request = obj_request->img_request;
+
+ rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
+ xferred = (unsigned int)obj_request->xferred;
+ result = obj_request->result;
+ if (result) {
+ struct rbd_device *rbd_dev = img_request->rbd_dev;
+ enum obj_operation_type op_type;
+
+ if (img_request_discard_test(img_request))
+ op_type = OBJ_OP_DISCARD;
+ else if (img_request_write_test(img_request))
+ op_type = OBJ_OP_WRITE;
+ else
+ op_type = OBJ_OP_READ;
+
+ rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
+ obj_op_name(op_type), obj_request->length,
+ obj_request->img_offset, obj_request->offset);
+ rbd_warn(rbd_dev, " result %d xferred %x",
+ result, xferred);
+ if (!img_request->result)
+ img_request->result = result;
+ /*
+ * Need to end I/O on the entire obj_request worth of
+ * bytes in case of error.
+ */
+ xferred = obj_request->length;
+ }
+
+ /* Image object requests don't own their page array */
+
+ if (obj_request->type == OBJ_REQUEST_PAGES) {
+ obj_request->pages = NULL;
+ obj_request->page_count = 0;
+ }
+
+ if (img_request_child_test(img_request)) {
+ rbd_assert(img_request->obj_request != NULL);
+ more = obj_request->which < img_request->obj_request_count - 1;
+ } else {
+ rbd_assert(img_request->rq != NULL);
+
+ more = blk_update_request(img_request->rq, result, xferred);
+ if (!more)
+ __blk_mq_end_request(img_request->rq, result);
+ }
+
+ return more;
+}
+
+static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
+{
+ struct rbd_img_request *img_request;
+ u32 which = obj_request->which;
+ bool more = true;
+
+ rbd_assert(obj_request_img_data_test(obj_request));
+ img_request = obj_request->img_request;
+
+ dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
+ rbd_assert(img_request != NULL);
+ rbd_assert(img_request->obj_request_count > 0);
+ rbd_assert(which != BAD_WHICH);
+ rbd_assert(which < img_request->obj_request_count);
+
+ spin_lock_irq(&img_request->completion_lock);
+ if (which != img_request->next_completion)
+ goto out;
+
+ for_each_obj_request_from(img_request, obj_request) {
+ rbd_assert(more);
+ rbd_assert(which < img_request->obj_request_count);
+
+ if (!obj_request_done_test(obj_request))
+ break;
+ more = rbd_img_obj_end_request(obj_request);
+ which++;
+ }
+
+ rbd_assert(more ^ (which == img_request->obj_request_count));
+ img_request->next_completion = which;
+out:
+ spin_unlock_irq(&img_request->completion_lock);
+ rbd_img_request_put(img_request);
+
+ if (!more)
+ rbd_img_request_complete(img_request);
+}
+
+/*
+ * Add individual osd ops to the given ceph_osd_request and prepare
+ * them for submission. num_ops is the current number of
+ * osd operations already to the object request.
+ */
+static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
+ struct ceph_osd_request *osd_request,
+ enum obj_operation_type op_type,
+ unsigned int num_ops)
+{
+ struct rbd_img_request *img_request = obj_request->img_request;
+ struct rbd_device *rbd_dev = img_request->rbd_dev;
+ u64 object_size = rbd_obj_bytes(&rbd_dev->header);
+ u64 offset = obj_request->offset;
+ u64 length = obj_request->length;
+ u64 img_end;
+ u16 opcode;
+
+ if (op_type == OBJ_OP_DISCARD) {
+ if (!offset && length == object_size &&
+ (!img_request_layered_test(img_request) ||
+ !obj_request_overlaps_parent(obj_request))) {
+ opcode = CEPH_OSD_OP_DELETE;
+ } else if ((offset + length == object_size)) {
+ opcode = CEPH_OSD_OP_TRUNCATE;
+ } else {
+ down_read(&rbd_dev->header_rwsem);
+ img_end = rbd_dev->header.image_size;
+ up_read(&rbd_dev->header_rwsem);
+
+ if (obj_request->img_offset + length == img_end)
+ opcode = CEPH_OSD_OP_TRUNCATE;
+ else
+ opcode = CEPH_OSD_OP_ZERO;
+ }
+ } else if (op_type == OBJ_OP_WRITE) {
+ opcode = CEPH_OSD_OP_WRITE;
+ osd_req_op_alloc_hint_init(osd_request, num_ops,
+ object_size, object_size);
+ num_ops++;
+ } else {
+ opcode = CEPH_OSD_OP_READ;
+ }
+
+ if (opcode == CEPH_OSD_OP_DELETE)
+ osd_req_op_init(osd_request, num_ops, opcode);
+ else
+ osd_req_op_extent_init(osd_request, num_ops, opcode,
+ offset, length, 0, 0);
+
+ if (obj_request->type == OBJ_REQUEST_BIO)
+ osd_req_op_extent_osd_data_bio(osd_request, num_ops,
+ obj_request->bio_list, length);
+ else if (obj_request->type == OBJ_REQUEST_PAGES)
+ osd_req_op_extent_osd_data_pages(osd_request, num_ops,
+ obj_request->pages, length,
+ offset & ~PAGE_MASK, false, false);
+
+ /* Discards are also writes */
+ if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
+ rbd_osd_req_format_write(obj_request);
+ else
+ rbd_osd_req_format_read(obj_request);
+}
+
+/*
+ * Split up an image request into one or more object requests, each
+ * to a different object. The "type" parameter indicates whether
+ * "data_desc" is the pointer to the head of a list of bio
+ * structures, or the base of a page array. In either case this
+ * function assumes data_desc describes memory sufficient to hold
+ * all data described by the image request.
+ */
+static int rbd_img_request_fill(struct rbd_img_request *img_request,
+ enum obj_request_type type,
+ void *data_desc)
+{
+ struct rbd_device *rbd_dev = img_request->rbd_dev;
+ struct rbd_obj_request *obj_request = NULL;
+ struct rbd_obj_request *next_obj_request;
+ struct bio *bio_list = NULL;
+ unsigned int bio_offset = 0;
+ struct page **pages = NULL;
+ enum obj_operation_type op_type;
+ u64 img_offset;
+ u64 resid;
+
+ dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
+ (int)type, data_desc);
+
+ img_offset = img_request->offset;
+ resid = img_request->length;
+ rbd_assert(resid > 0);
+ op_type = rbd_img_request_op_type(img_request);
+
+ if (type == OBJ_REQUEST_BIO) {
+ bio_list = data_desc;
+ rbd_assert(img_offset ==
+ bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
+ } else if (type == OBJ_REQUEST_PAGES) {
+ pages = data_desc;
+ }
+
+ while (resid) {
+ struct ceph_osd_request *osd_req;
+ const char *object_name;
+ u64 offset;
+ u64 length;
+
+ object_name = rbd_segment_name(rbd_dev, img_offset);
+ if (!object_name)
+ goto out_unwind;
+ offset = rbd_segment_offset(rbd_dev, img_offset);
+ length = rbd_segment_length(rbd_dev, img_offset, resid);
+ obj_request = rbd_obj_request_create(object_name,
+ offset, length, type);
+ /* object request has its own copy of the object name */
+ rbd_segment_name_free(object_name);
+ if (!obj_request)
+ goto out_unwind;
+
+ /*
+ * set obj_request->img_request before creating the
+ * osd_request so that it gets the right snapc
+ */
+ rbd_img_obj_request_add(img_request, obj_request);
+
+ if (type == OBJ_REQUEST_BIO) {
+ unsigned int clone_size;
+
+ rbd_assert(length <= (u64)UINT_MAX);
+ clone_size = (unsigned int)length;
+ obj_request->bio_list =
+ bio_chain_clone_range(&bio_list,
+ &bio_offset,
+ clone_size,
+ GFP_ATOMIC);
+ if (!obj_request->bio_list)
+ goto out_unwind;
+ } else if (type == OBJ_REQUEST_PAGES) {
+ unsigned int page_count;
+
+ obj_request->pages = pages;
+ page_count = (u32)calc_pages_for(offset, length);
+ obj_request->page_count = page_count;
+ if ((offset + length) & ~PAGE_MASK)
+ page_count--; /* more on last page */
+ pages += page_count;
+ }
+
+ osd_req = rbd_osd_req_create(rbd_dev, op_type,
+ (op_type == OBJ_OP_WRITE) ? 2 : 1,
+ obj_request);
+ if (!osd_req)
+ goto out_unwind;
+
+ obj_request->osd_req = osd_req;
+ obj_request->callback = rbd_img_obj_callback;
+ obj_request->img_offset = img_offset;
+
+ rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0);
+
+ rbd_img_request_get(img_request);
+
+ img_offset += length;
+ resid -= length;
+ }
+
+ return 0;
+
+out_unwind:
+ for_each_obj_request_safe(img_request, obj_request, next_obj_request)
+ rbd_img_obj_request_del(img_request, obj_request);
+
+ return -ENOMEM;
+}
+
+static void
+rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
+{
+ struct rbd_img_request *img_request;
+ struct rbd_device *rbd_dev;
+ struct page **pages;
+ u32 page_count;
+
+ rbd_assert(obj_request->type == OBJ_REQUEST_BIO ||
+ obj_request->type == OBJ_REQUEST_NODATA);
+ rbd_assert(obj_request_img_data_test(obj_request));
+ img_request = obj_request->img_request;
+ rbd_assert(img_request);
+
+ rbd_dev = img_request->rbd_dev;
+ rbd_assert(rbd_dev);
+
+ pages = obj_request->copyup_pages;
+ rbd_assert(pages != NULL);
+ obj_request->copyup_pages = NULL;
+ page_count = obj_request->copyup_page_count;
+ rbd_assert(page_count);
+ obj_request->copyup_page_count = 0;
+ ceph_release_page_vector(pages, page_count);
+
+ /*
+ * We want the transfer count to reflect the size of the
+ * original write request. There is no such thing as a
+ * successful short write, so if the request was successful
+ * we can just set it to the originally-requested length.
+ */
+ if (!obj_request->result)
+ obj_request->xferred = obj_request->length;
+
+ /* Finish up with the normal image object callback */
+
+ rbd_img_obj_callback(obj_request);
+}
+
+static void
+rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
+{
+ struct rbd_obj_request *orig_request;
+ struct ceph_osd_request *osd_req;
+ struct ceph_osd_client *osdc;
+ struct rbd_device *rbd_dev;
+ struct page **pages;
+ enum obj_operation_type op_type;
+ u32 page_count;
+ int img_result;
+ u64 parent_length;
+
+ rbd_assert(img_request_child_test(img_request));
+
+ /* First get what we need from the image request */
+
+ pages = img_request->copyup_pages;
+ rbd_assert(pages != NULL);
+ img_request->copyup_pages = NULL;
+ page_count = img_request->copyup_page_count;
+ rbd_assert(page_count);
+ img_request->copyup_page_count = 0;
+
+ orig_request = img_request->obj_request;
+ rbd_assert(orig_request != NULL);
+ rbd_assert(obj_request_type_valid(orig_request->type));
+ img_result = img_request->result;
+ parent_length = img_request->length;
+ rbd_assert(parent_length == img_request->xferred);
+ rbd_img_request_put(img_request);
+
+ rbd_assert(orig_request->img_request);
+ rbd_dev = orig_request->img_request->rbd_dev;
+ rbd_assert(rbd_dev);
+
+ /*
+ * If the overlap has become 0 (most likely because the
+ * image has been flattened) we need to free the pages
+ * and re-submit the original write request.
+ */
+ if (!rbd_dev->parent_overlap) {
+ struct ceph_osd_client *osdc;
+
+ ceph_release_page_vector(pages, page_count);
+ osdc = &rbd_dev->rbd_client->client->osdc;
+ img_result = rbd_obj_request_submit(osdc, orig_request);
+ if (!img_result)
+ return;
+ }
+
+ if (img_result)
+ goto out_err;
+
+ /*
+ * The original osd request is of no use to use any more.
+ * We need a new one that can hold the three ops in a copyup
+ * request. Allocate the new copyup osd request for the
+ * original request, and release the old one.
+ */
+ img_result = -ENOMEM;
+ osd_req = rbd_osd_req_create_copyup(orig_request);
+ if (!osd_req)
+ goto out_err;
+ rbd_osd_req_destroy(orig_request->osd_req);
+ orig_request->osd_req = osd_req;
+ orig_request->copyup_pages = pages;
+ orig_request->copyup_page_count = page_count;
+
+ /* Initialize the copyup op */
+
+ osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
+ osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
+ false, false);
+
+ /* Add the other op(s) */
+
+ op_type = rbd_img_request_op_type(orig_request->img_request);
+ rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1);
+
+ /* All set, send it off. */
+
+ orig_request->callback = rbd_img_obj_copyup_callback;
+ osdc = &rbd_dev->rbd_client->client->osdc;
+ img_result = rbd_obj_request_submit(osdc, orig_request);
+ if (!img_result)
+ return;
+out_err:
+ /* Record the error code and complete the request */
+
+ orig_request->result = img_result;
+ orig_request->xferred = 0;
+ obj_request_done_set(orig_request);
+ rbd_obj_request_complete(orig_request);
+}
+
+/*
+ * Read from the parent image the range of data that covers the
+ * entire target of the given object request. This is used for
+ * satisfying a layered image write request when the target of an
+ * object request from the image request does not exist.
+ *
+ * A page array big enough to hold the returned data is allocated
+ * and supplied to rbd_img_request_fill() as the "data descriptor."
+ * When the read completes, this page array will be transferred to
+ * the original object request for the copyup operation.
+ *
+ * If an error occurs, record it as the result of the original
+ * object request and mark it done so it gets completed.
+ */
+static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
+{
+ struct rbd_img_request *img_request = NULL;
+ struct rbd_img_request *parent_request = NULL;
+ struct rbd_device *rbd_dev;
+ u64 img_offset;
+ u64 length;
+ struct page **pages = NULL;
+ u32 page_count;
+ int result;
+
+ rbd_assert(obj_request_img_data_test(obj_request));
+ rbd_assert(obj_request_type_valid(obj_request->type));
+
+ img_request = obj_request->img_request;
+ rbd_assert(img_request != NULL);
+ rbd_dev = img_request->rbd_dev;
+ rbd_assert(rbd_dev->parent != NULL);
+
+ /*
+ * Determine the byte range covered by the object in the
+ * child image to which the original request was to be sent.
+ */
+ img_offset = obj_request->img_offset - obj_request->offset;
+ length = (u64)1 << rbd_dev->header.obj_order;
+
+ /*
+ * There is no defined parent data beyond the parent
+ * overlap, so limit what we read at that boundary if
+ * necessary.
+ */
+ if (img_offset + length > rbd_dev->parent_overlap) {
+ rbd_assert(img_offset < rbd_dev->parent_overlap);
+ length = rbd_dev->parent_overlap - img_offset;
+ }
+
+ /*
+ * Allocate a page array big enough to receive the data read
+ * from the parent.
+ */
+ page_count = (u32)calc_pages_for(0, length);
+ pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
+ if (IS_ERR(pages)) {
+ result = PTR_ERR(pages);
+ pages = NULL;
+ goto out_err;
+ }
+
+ result = -ENOMEM;
+ parent_request = rbd_parent_request_create(obj_request,
+ img_offset, length);
+ if (!parent_request)
+ goto out_err;
+
+ result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
+ if (result)
+ goto out_err;
+ parent_request->copyup_pages = pages;
+ parent_request->copyup_page_count = page_count;
+
+ parent_request->callback = rbd_img_obj_parent_read_full_callback;
+ result = rbd_img_request_submit(parent_request);
+ if (!result)
+ return 0;
+
+ parent_request->copyup_pages = NULL;
+ parent_request->copyup_page_count = 0;
+ parent_request->obj_request = NULL;
+ rbd_obj_request_put(obj_request);
+out_err:
+ if (pages)
+ ceph_release_page_vector(pages, page_count);
+ if (parent_request)
+ rbd_img_request_put(parent_request);
+ obj_request->result = result;
+ obj_request->xferred = 0;
+ obj_request_done_set(obj_request);
+
+ return result;
+}
+
+static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
+{
+ struct rbd_obj_request *orig_request;
+ struct rbd_device *rbd_dev;
+ int result;
+
+ rbd_assert(!obj_request_img_data_test(obj_request));
+
+ /*
+ * All we need from the object request is the original
+ * request and the result of the STAT op. Grab those, then
+ * we're done with the request.
+ */
+ orig_request = obj_request->obj_request;
+ obj_request->obj_request = NULL;
+ rbd_obj_request_put(orig_request);
+ rbd_assert(orig_request);
+ rbd_assert(orig_request->img_request);
+
+ result = obj_request->result;
+ obj_request->result = 0;
+
+ dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
+ obj_request, orig_request, result,
+ obj_request->xferred, obj_request->length);
+ rbd_obj_request_put(obj_request);
+
+ /*
+ * If the overlap has become 0 (most likely because the
+ * image has been flattened) we need to free the pages
+ * and re-submit the original write request.
+ */
+ rbd_dev = orig_request->img_request->rbd_dev;
+ if (!rbd_dev->parent_overlap) {
+ struct ceph_osd_client *osdc;
+
+ osdc = &rbd_dev->rbd_client->client->osdc;
+ result = rbd_obj_request_submit(osdc, orig_request);
+ if (!result)
+ return;
+ }
+
+ /*
+ * Our only purpose here is to determine whether the object
+ * exists, and we don't want to treat the non-existence as
+ * an error. If something else comes back, transfer the
+ * error to the original request and complete it now.
+ */
+ if (!result) {
+ obj_request_existence_set(orig_request, true);
+ } else if (result == -ENOENT) {
+ obj_request_existence_set(orig_request, false);
+ } else if (result) {
+ orig_request->result = result;
+ goto out;
+ }
+
+ /*
+ * Resubmit the original request now that we have recorded
+ * whether the target object exists.
+ */
+ orig_request->result = rbd_img_obj_request_submit(orig_request);
+out:
+ if (orig_request->result)
+ rbd_obj_request_complete(orig_request);
+}
+
+static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
+{
+ struct rbd_obj_request *stat_request;
+ struct rbd_device *rbd_dev;
+ struct ceph_osd_client *osdc;
+ struct page **pages = NULL;
+ u32 page_count;
+ size_t size;
+ int ret;
+
+ /*
+ * The response data for a STAT call consists of:
+ * le64 length;
+ * struct {
+ * le32 tv_sec;
+ * le32 tv_nsec;
+ * } mtime;
+ */
+ size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
+ page_count = (u32)calc_pages_for(0, size);
+ pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ ret = -ENOMEM;
+ stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
+ OBJ_REQUEST_PAGES);
+ if (!stat_request)
+ goto out;
+
+ rbd_obj_request_get(obj_request);
+ stat_request->obj_request = obj_request;
+ stat_request->pages = pages;
+ stat_request->page_count = page_count;
+
+ rbd_assert(obj_request->img_request);
+ rbd_dev = obj_request->img_request->rbd_dev;
+ stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
+ stat_request);
+ if (!stat_request->osd_req)
+ goto out;
+ stat_request->callback = rbd_img_obj_exists_callback;
+
+ osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
+ osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
+ false, false);
+ rbd_osd_req_format_read(stat_request);
+
+ osdc = &rbd_dev->rbd_client->client->osdc;
+ ret = rbd_obj_request_submit(osdc, stat_request);
+out:
+ if (ret)
+ rbd_obj_request_put(obj_request);
+
+ return ret;
+}
+
+static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
+{
+ struct rbd_img_request *img_request;
+ struct rbd_device *rbd_dev;
+
+ rbd_assert(obj_request_img_data_test(obj_request));
+
+ img_request = obj_request->img_request;
+ rbd_assert(img_request);
+ rbd_dev = img_request->rbd_dev;
+
+ /* Reads */
+ if (!img_request_write_test(img_request) &&
+ !img_request_discard_test(img_request))
+ return true;
+
+ /* Non-layered writes */
+ if (!img_request_layered_test(img_request))
+ return true;
+
+ /*
+ * Layered writes outside of the parent overlap range don't
+ * share any data with the parent.
+ */
+ if (!obj_request_overlaps_parent(obj_request))
+ return true;
+
+ /*
+ * Entire-object layered writes - we will overwrite whatever
+ * parent data there is anyway.
+ */
+ if (!obj_request->offset &&
+ obj_request->length == rbd_obj_bytes(&rbd_dev->header))
+ return true;
+
+ /*
+ * If the object is known to already exist, its parent data has
+ * already been copied.
+ */
+ if (obj_request_known_test(obj_request) &&
+ obj_request_exists_test(obj_request))
+ return true;
+
+ return false;
+}
+
+static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
+{
+ if (img_obj_request_simple(obj_request)) {
+ struct rbd_device *rbd_dev;
+ struct ceph_osd_client *osdc;
+
+ rbd_dev = obj_request->img_request->rbd_dev;
+ osdc = &rbd_dev->rbd_client->client->osdc;
+
+ return rbd_obj_request_submit(osdc, obj_request);
+ }
+
+ /*
+ * It's a layered write. The target object might exist but
+ * we may not know that yet. If we know it doesn't exist,
+ * start by reading the data for the full target object from
+ * the parent so we can use it for a copyup to the target.
+ */
+ if (obj_request_known_test(obj_request))
+ return rbd_img_obj_parent_read_full(obj_request);
+
+ /* We don't know whether the target exists. Go find out. */
+
+ return rbd_img_obj_exists_submit(obj_request);
+}
+
+static int rbd_img_request_submit(struct rbd_img_request *img_request)
+{
+ struct rbd_obj_request *obj_request;
+ struct rbd_obj_request *next_obj_request;
+
+ dout("%s: img %p\n", __func__, img_request);
+ for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
+ int ret;
+
+ ret = rbd_img_obj_request_submit(obj_request);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
+{
+ struct rbd_obj_request *obj_request;
+ struct rbd_device *rbd_dev;
+ u64 obj_end;
+ u64 img_xferred;
+ int img_result;
+
+ rbd_assert(img_request_child_test(img_request));
+
+ /* First get what we need from the image request and release it */
+
+ obj_request = img_request->obj_request;
+ img_xferred = img_request->xferred;
+ img_result = img_request->result;
+ rbd_img_request_put(img_request);
+
+ /*
+ * If the overlap has become 0 (most likely because the
+ * image has been flattened) we need to re-submit the
+ * original request.
+ */
+ rbd_assert(obj_request);
+ rbd_assert(obj_request->img_request);
+ rbd_dev = obj_request->img_request->rbd_dev;
+ if (!rbd_dev->parent_overlap) {
+ struct ceph_osd_client *osdc;
+
+ osdc = &rbd_dev->rbd_client->client->osdc;
+ img_result = rbd_obj_request_submit(osdc, obj_request);
+ if (!img_result)
+ return;
+ }
+
+ obj_request->result = img_result;
+ if (obj_request->result)
+ goto out;
+
+ /*
+ * We need to zero anything beyond the parent overlap
+ * boundary. Since rbd_img_obj_request_read_callback()
+ * will zero anything beyond the end of a short read, an
+ * easy way to do this is to pretend the data from the
+ * parent came up short--ending at the overlap boundary.
+ */
+ rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
+ obj_end = obj_request->img_offset + obj_request->length;
+ if (obj_end > rbd_dev->parent_overlap) {
+ u64 xferred = 0;
+
+ if (obj_request->img_offset < rbd_dev->parent_overlap)
+ xferred = rbd_dev->parent_overlap -
+ obj_request->img_offset;
+
+ obj_request->xferred = min(img_xferred, xferred);
+ } else {
+ obj_request->xferred = img_xferred;
+ }
+out:
+ rbd_img_obj_request_read_callback(obj_request);
+ rbd_obj_request_complete(obj_request);
+}
+
+static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
+{
+ struct rbd_img_request *img_request;
+ int result;
+
+ rbd_assert(obj_request_img_data_test(obj_request));
+ rbd_assert(obj_request->img_request != NULL);
+ rbd_assert(obj_request->result == (s32) -ENOENT);
+ rbd_assert(obj_request_type_valid(obj_request->type));
+
+ /* rbd_read_finish(obj_request, obj_request->length); */
+ img_request = rbd_parent_request_create(obj_request,
+ obj_request->img_offset,
+ obj_request->length);
+ result = -ENOMEM;
+ if (!img_request)
+ goto out_err;
+
+ if (obj_request->type == OBJ_REQUEST_BIO)
+ result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
+ obj_request->bio_list);
+ else
+ result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
+ obj_request->pages);
+ if (result)
+ goto out_err;
+
+ img_request->callback = rbd_img_parent_read_callback;
+ result = rbd_img_request_submit(img_request);
+ if (result)
+ goto out_err;
+
+ return;
+out_err:
+ if (img_request)
+ rbd_img_request_put(img_request);
+ obj_request->result = result;
+ obj_request->xferred = 0;
+ obj_request_done_set(obj_request);
+}
+
+static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
+{
+ struct rbd_obj_request *obj_request;
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ int ret;
+
+ obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
+ OBJ_REQUEST_NODATA);
+ if (!obj_request)
+ return -ENOMEM;
+
+ ret = -ENOMEM;
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
+ obj_request);
+ if (!obj_request->osd_req)
+ goto out;
+
+ osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
+ notify_id, 0, 0);
+ rbd_osd_req_format_read(obj_request);
+
+ ret = rbd_obj_request_submit(osdc, obj_request);
+ if (ret)
+ goto out;
+ ret = rbd_obj_request_wait(obj_request);
+out:
+ rbd_obj_request_put(obj_request);
+
+ return ret;
+}
+
+static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
+{
+ struct rbd_device *rbd_dev = (struct rbd_device *)data;
+ int ret;
+
+ if (!rbd_dev)
+ return;
+
+ dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
+ rbd_dev->header_name, (unsigned long long)notify_id,
+ (unsigned int)opcode);
+
+ /*
+ * Until adequate refresh error handling is in place, there is
+ * not much we can do here, except warn.
+ *
+ * See http://tracker.ceph.com/issues/5040
+ */
+ ret = rbd_dev_refresh(rbd_dev);
+ if (ret)
+ rbd_warn(rbd_dev, "refresh failed: %d", ret);
+
+ ret = rbd_obj_notify_ack_sync(rbd_dev, notify_id);
+ if (ret)
+ rbd_warn(rbd_dev, "notify_ack ret %d", ret);
+}
+
+/*
+ * Send a (un)watch request and wait for the ack. Return a request
+ * with a ref held on success or error.
+ */
+static struct rbd_obj_request *rbd_obj_watch_request_helper(
+ struct rbd_device *rbd_dev,
+ bool watch)
+{
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ struct rbd_obj_request *obj_request;
+ int ret;
+
+ obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
+ OBJ_REQUEST_NODATA);
+ if (!obj_request)
+ return ERR_PTR(-ENOMEM);
+
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1,
+ obj_request);
+ if (!obj_request->osd_req) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
+ rbd_dev->watch_event->cookie, 0, watch);
+ rbd_osd_req_format_write(obj_request);
+
+ if (watch)
+ ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
+
+ ret = rbd_obj_request_submit(osdc, obj_request);
+ if (ret)
+ goto out;
+
+ ret = rbd_obj_request_wait(obj_request);
+ if (ret)
+ goto out;
+
+ ret = obj_request->result;
+ if (ret) {
+ if (watch)
+ rbd_obj_request_end(obj_request);
+ goto out;
+ }
+
+ return obj_request;
+
+out:
+ rbd_obj_request_put(obj_request);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Initiate a watch request, synchronously.
+ */
+static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev)
+{
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ struct rbd_obj_request *obj_request;
+ int ret;
+
+ rbd_assert(!rbd_dev->watch_event);
+ rbd_assert(!rbd_dev->watch_request);
+
+ ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
+ &rbd_dev->watch_event);
+ if (ret < 0)
+ return ret;
+
+ obj_request = rbd_obj_watch_request_helper(rbd_dev, true);
+ if (IS_ERR(obj_request)) {
+ ceph_osdc_cancel_event(rbd_dev->watch_event);
+ rbd_dev->watch_event = NULL;
+ return PTR_ERR(obj_request);
+ }
+
+ /*
+ * A watch request is set to linger, so the underlying osd
+ * request won't go away until we unregister it. We retain
+ * a pointer to the object request during that time (in
+ * rbd_dev->watch_request), so we'll keep a reference to it.
+ * We'll drop that reference after we've unregistered it in
+ * rbd_dev_header_unwatch_sync().
+ */
+ rbd_dev->watch_request = obj_request;
+
+ return 0;
+}
+
+/*
+ * Tear down a watch request, synchronously.
+ */
+static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev)
+{
+ struct rbd_obj_request *obj_request;
+
+ rbd_assert(rbd_dev->watch_event);
+ rbd_assert(rbd_dev->watch_request);
+
+ rbd_obj_request_end(rbd_dev->watch_request);
+ rbd_obj_request_put(rbd_dev->watch_request);
+ rbd_dev->watch_request = NULL;
+
+ obj_request = rbd_obj_watch_request_helper(rbd_dev, false);
+ if (!IS_ERR(obj_request))
+ rbd_obj_request_put(obj_request);
+ else
+ rbd_warn(rbd_dev, "unable to tear down watch request (%ld)",
+ PTR_ERR(obj_request));
+
+ ceph_osdc_cancel_event(rbd_dev->watch_event);
+ rbd_dev->watch_event = NULL;
+}
+
+/*
+ * Synchronous osd object method call. Returns the number of bytes
+ * returned in the outbound buffer, or a negative error code.
+ */
+static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
+ const char *object_name,
+ const char *class_name,
+ const char *method_name,
+ const void *outbound,
+ size_t outbound_size,
+ void *inbound,
+ size_t inbound_size)
+{
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ struct rbd_obj_request *obj_request;
+ struct page **pages;
+ u32 page_count;
+ int ret;
+
+ /*
+ * Method calls are ultimately read operations. The result
+ * should placed into the inbound buffer provided. They
+ * also supply outbound data--parameters for the object
+ * method. Currently if this is present it will be a
+ * snapshot id.
+ */
+ page_count = (u32)calc_pages_for(0, inbound_size);
+ pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ ret = -ENOMEM;
+ obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
+ OBJ_REQUEST_PAGES);
+ if (!obj_request)
+ goto out;
+
+ obj_request->pages = pages;
+ obj_request->page_count = page_count;
+
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
+ obj_request);
+ if (!obj_request->osd_req)
+ goto out;
+
+ osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
+ class_name, method_name);
+ if (outbound_size) {
+ struct ceph_pagelist *pagelist;
+
+ pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
+ if (!pagelist)
+ goto out;
+
+ ceph_pagelist_init(pagelist);
+ ceph_pagelist_append(pagelist, outbound, outbound_size);
+ osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
+ pagelist);
+ }
+ osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
+ obj_request->pages, inbound_size,
+ 0, false, false);
+ rbd_osd_req_format_read(obj_request);
+
+ ret = rbd_obj_request_submit(osdc, obj_request);
+ if (ret)
+ goto out;
+ ret = rbd_obj_request_wait(obj_request);
+ if (ret)
+ goto out;
+
+ ret = obj_request->result;
+ if (ret < 0)
+ goto out;
+
+ rbd_assert(obj_request->xferred < (u64)INT_MAX);
+ ret = (int)obj_request->xferred;
+ ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
+out:
+ if (obj_request)
+ rbd_obj_request_put(obj_request);
+ else
+ ceph_release_page_vector(pages, page_count);
+
+ return ret;
+}
+
+static void rbd_queue_workfn(struct work_struct *work)
+{
+ struct request *rq = blk_mq_rq_from_pdu(work);
+ struct rbd_device *rbd_dev = rq->q->queuedata;
+ struct rbd_img_request *img_request;
+ struct ceph_snap_context *snapc = NULL;
+ u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
+ u64 length = blk_rq_bytes(rq);
+ enum obj_operation_type op_type;
+ u64 mapping_size;
+ int result;
+
+ if (rq->cmd_type != REQ_TYPE_FS) {
+ dout("%s: non-fs request type %d\n", __func__,
+ (int) rq->cmd_type);
+ result = -EIO;
+ goto err;
+ }
+
+ if (rq->cmd_flags & REQ_DISCARD)
+ op_type = OBJ_OP_DISCARD;
+ else if (rq->cmd_flags & REQ_WRITE)
+ op_type = OBJ_OP_WRITE;
+ else
+ op_type = OBJ_OP_READ;
+
+ /* Ignore/skip any zero-length requests */
+
+ if (!length) {
+ dout("%s: zero-length request\n", __func__);
+ result = 0;
+ goto err_rq;
+ }
+
+ /* Only reads are allowed to a read-only device */
+
+ if (op_type != OBJ_OP_READ) {
+ if (rbd_dev->mapping.read_only) {
+ result = -EROFS;
+ goto err_rq;
+ }
+ rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
+ }
+
+ /*
+ * Quit early if the mapped snapshot no longer exists. It's
+ * still possible the snapshot will have disappeared by the
+ * time our request arrives at the osd, but there's no sense in
+ * sending it if we already know.
+ */
+ if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
+ dout("request for non-existent snapshot");
+ rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
+ result = -ENXIO;
+ goto err_rq;
+ }
+
+ if (offset && length > U64_MAX - offset + 1) {
+ rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
+ length);
+ result = -EINVAL;
+ goto err_rq; /* Shouldn't happen */
+ }
+
+ blk_mq_start_request(rq);
+
+ down_read(&rbd_dev->header_rwsem);
+ mapping_size = rbd_dev->mapping.size;
+ if (op_type != OBJ_OP_READ) {
+ snapc = rbd_dev->header.snapc;
+ ceph_get_snap_context(snapc);
+ }
+ up_read(&rbd_dev->header_rwsem);
+
+ if (offset + length > mapping_size) {
+ rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
+ length, mapping_size);
+ result = -EIO;
+ goto err_rq;
+ }
+
+ img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
+ snapc);
+ if (!img_request) {
+ result = -ENOMEM;
+ goto err_rq;
+ }
+ img_request->rq = rq;
+
+ if (op_type == OBJ_OP_DISCARD)
+ result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
+ NULL);
+ else
+ result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
+ rq->bio);
+ if (result)
+ goto err_img_request;
+
+ result = rbd_img_request_submit(img_request);
+ if (result)
+ goto err_img_request;
+
+ return;
+
+err_img_request:
+ rbd_img_request_put(img_request);
+err_rq:
+ if (result)
+ rbd_warn(rbd_dev, "%s %llx at %llx result %d",
+ obj_op_name(op_type), length, offset, result);
+ ceph_put_snap_context(snapc);
+err:
+ blk_mq_end_request(rq, result);
+}
+
+static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct request *rq = bd->rq;
+ struct work_struct *work = blk_mq_rq_to_pdu(rq);
+
+ queue_work(rbd_wq, work);
+ return BLK_MQ_RQ_QUEUE_OK;
+}
+
+/*
+ * a queue callback. Makes sure that we don't create a bio that spans across
+ * multiple osd objects. One exception would be with a single page bios,
+ * which we handle later at bio_chain_clone_range()
+ */
+static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
+ struct bio_vec *bvec)
+{
+ struct rbd_device *rbd_dev = q->queuedata;
+ sector_t sector_offset;
+ sector_t sectors_per_obj;
+ sector_t obj_sector_offset;
+ int ret;
+
+ /*
+ * Find how far into its rbd object the partition-relative
+ * bio start sector is to offset relative to the enclosing
+ * device.
+ */
+ sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
+ sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
+ obj_sector_offset = sector_offset & (sectors_per_obj - 1);
+
+ /*
+ * Compute the number of bytes from that offset to the end
+ * of the object. Account for what's already used by the bio.
+ */
+ ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
+ if (ret > bmd->bi_size)
+ ret -= bmd->bi_size;
+ else
+ ret = 0;
+
+ /*
+ * Don't send back more than was asked for. And if the bio
+ * was empty, let the whole thing through because: "Note
+ * that a block device *must* allow a single page to be
+ * added to an empty bio."
+ */
+ rbd_assert(bvec->bv_len <= PAGE_SIZE);
+ if (ret > (int) bvec->bv_len || !bmd->bi_size)
+ ret = (int) bvec->bv_len;
+
+ return ret;
+}
+
+static void rbd_free_disk(struct rbd_device *rbd_dev)
+{
+ struct gendisk *disk = rbd_dev->disk;
+
+ if (!disk)
+ return;
+
+ rbd_dev->disk = NULL;
+ if (disk->flags & GENHD_FL_UP) {
+ del_gendisk(disk);
+ if (disk->queue)
+ blk_cleanup_queue(disk->queue);
+ blk_mq_free_tag_set(&rbd_dev->tag_set);
+ }
+ put_disk(disk);
+}
+
+static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
+ const char *object_name,
+ u64 offset, u64 length, void *buf)
+
+{
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ struct rbd_obj_request *obj_request;
+ struct page **pages = NULL;
+ u32 page_count;
+ size_t size;
+ int ret;
+
+ page_count = (u32) calc_pages_for(offset, length);
+ pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ ret = -ENOMEM;
+ obj_request = rbd_obj_request_create(object_name, offset, length,
+ OBJ_REQUEST_PAGES);
+ if (!obj_request)
+ goto out;
+
+ obj_request->pages = pages;
+ obj_request->page_count = page_count;
+
+ obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
+ obj_request);
+ if (!obj_request->osd_req)
+ goto out;
+
+ osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
+ offset, length, 0, 0);
+ osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
+ obj_request->pages,
+ obj_request->length,
+ obj_request->offset & ~PAGE_MASK,
+ false, false);
+ rbd_osd_req_format_read(obj_request);
+
+ ret = rbd_obj_request_submit(osdc, obj_request);
+ if (ret)
+ goto out;
+ ret = rbd_obj_request_wait(obj_request);
+ if (ret)
+ goto out;
+
+ ret = obj_request->result;
+ if (ret < 0)
+ goto out;
+
+ rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
+ size = (size_t) obj_request->xferred;
+ ceph_copy_from_page_vector(pages, buf, 0, size);
+ rbd_assert(size <= (size_t)INT_MAX);
+ ret = (int)size;
+out:
+ if (obj_request)
+ rbd_obj_request_put(obj_request);
+ else
+ ceph_release_page_vector(pages, page_count);
+
+ return ret;
+}
+
+/*
+ * Read the complete header for the given rbd device. On successful
+ * return, the rbd_dev->header field will contain up-to-date
+ * information about the image.
+ */
+static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
+{
+ struct rbd_image_header_ondisk *ondisk = NULL;
+ u32 snap_count = 0;
+ u64 names_size = 0;
+ u32 want_count;
+ int ret;
+
+ /*
+ * The complete header will include an array of its 64-bit
+ * snapshot ids, followed by the names of those snapshots as
+ * a contiguous block of NUL-terminated strings. Note that
+ * the number of snapshots could change by the time we read
+ * it in, in which case we re-read it.
+ */
+ do {
+ size_t size;
+
+ kfree(ondisk);
+
+ size = sizeof (*ondisk);
+ size += snap_count * sizeof (struct rbd_image_snap_ondisk);
+ size += names_size;
+ ondisk = kmalloc(size, GFP_KERNEL);
+ if (!ondisk)
+ return -ENOMEM;
+
+ ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
+ 0, size, ondisk);
+ if (ret < 0)
+ goto out;
+ if ((size_t)ret < size) {
+ ret = -ENXIO;
+ rbd_warn(rbd_dev, "short header read (want %zd got %d)",
+ size, ret);
+ goto out;
+ }
+ if (!rbd_dev_ondisk_valid(ondisk)) {
+ ret = -ENXIO;
+ rbd_warn(rbd_dev, "invalid header");
+ goto out;
+ }
+
+ names_size = le64_to_cpu(ondisk->snap_names_len);
+ want_count = snap_count;
+ snap_count = le32_to_cpu(ondisk->snap_count);
+ } while (snap_count != want_count);
+
+ ret = rbd_header_from_disk(rbd_dev, ondisk);
+out:
+ kfree(ondisk);
+
+ return ret;
+}
+
+/*
+ * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
+ * has disappeared from the (just updated) snapshot context.
+ */
+static void rbd_exists_validate(struct rbd_device *rbd_dev)
+{
+ u64 snap_id;
+
+ if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
+ return;
+
+ snap_id = rbd_dev->spec->snap_id;
+ if (snap_id == CEPH_NOSNAP)
+ return;
+
+ if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
+ clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
+}
+
+static void rbd_dev_update_size(struct rbd_device *rbd_dev)
+{
+ sector_t size;
+ bool removing;
+
+ /*
+ * Don't hold the lock while doing disk operations,
+ * or lock ordering will conflict with the bdev mutex via:
+ * rbd_add() -> blkdev_get() -> rbd_open()
+ */
+ spin_lock_irq(&rbd_dev->lock);
+ removing = test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
+ spin_unlock_irq(&rbd_dev->lock);
+ /*
+ * If the device is being removed, rbd_dev->disk has
+ * been destroyed, so don't try to update its size
+ */
+ if (!removing) {
+ size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
+ dout("setting size to %llu sectors", (unsigned long long)size);
+ set_capacity(rbd_dev->disk, size);
+ revalidate_disk(rbd_dev->disk);
+ }
+}
+
+static int rbd_dev_refresh(struct rbd_device *rbd_dev)
+{
+ u64 mapping_size;
+ int ret;
+
+ down_write(&rbd_dev->header_rwsem);
+ mapping_size = rbd_dev->mapping.size;
+
+ ret = rbd_dev_header_info(rbd_dev);
+ if (ret)
+ goto out;
+
+ /*
+ * If there is a parent, see if it has disappeared due to the
+ * mapped image getting flattened.
+ */
+ if (rbd_dev->parent) {
+ ret = rbd_dev_v2_parent_info(rbd_dev);
+ if (ret)
+ goto out;
+ }
+
+ if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
+ rbd_dev->mapping.size = rbd_dev->header.image_size;
+ } else {
+ /* validate mapped snapshot's EXISTS flag */
+ rbd_exists_validate(rbd_dev);
+ }
+
+out:
+ up_write(&rbd_dev->header_rwsem);
+ if (!ret && mapping_size != rbd_dev->mapping.size)
+ rbd_dev_update_size(rbd_dev);
+
+ return ret;
+}
+
+static int rbd_init_request(void *data, struct request *rq,
+ unsigned int hctx_idx, unsigned int request_idx,
+ unsigned int numa_node)
+{
+ struct work_struct *work = blk_mq_rq_to_pdu(rq);
+
+ INIT_WORK(work, rbd_queue_workfn);
+ return 0;
+}
+
+static struct blk_mq_ops rbd_mq_ops = {
+ .queue_rq = rbd_queue_rq,
+ .map_queue = blk_mq_map_queue,
+ .init_request = rbd_init_request,
+};
+
+static int rbd_init_disk(struct rbd_device *rbd_dev)
+{
+ struct gendisk *disk;
+ struct request_queue *q;
+ u64 segment_size;
+ int err;
+
+ /* create gendisk info */
+ disk = alloc_disk(single_major ?
+ (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
+ RBD_MINORS_PER_MAJOR);
+ if (!disk)
+ return -ENOMEM;
+
+ snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
+ rbd_dev->dev_id);
+ disk->major = rbd_dev->major;
+ disk->first_minor = rbd_dev->minor;
+ if (single_major)
+ disk->flags |= GENHD_FL_EXT_DEVT;
+ disk->fops = &rbd_bd_ops;
+ disk->private_data = rbd_dev;
+
+ memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
+ rbd_dev->tag_set.ops = &rbd_mq_ops;
+ rbd_dev->tag_set.queue_depth = BLKDEV_MAX_RQ;
+ rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
+ rbd_dev->tag_set.flags =
+ BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ rbd_dev->tag_set.nr_hw_queues = 1;
+ rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
+
+ err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
+ if (err)
+ goto out_disk;
+
+ q = blk_mq_init_queue(&rbd_dev->tag_set);
+ if (IS_ERR(q)) {
+ err = PTR_ERR(q);
+ goto out_tag_set;
+ }
+
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+ /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
+
+ /* set io sizes to object size */
+ segment_size = rbd_obj_bytes(&rbd_dev->header);
+ blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
+ blk_queue_max_segment_size(q, segment_size);
+ blk_queue_io_min(q, segment_size);
+ blk_queue_io_opt(q, segment_size);
+
+ /* enable the discard support */
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+ q->limits.discard_granularity = segment_size;
+ q->limits.discard_alignment = segment_size;
+ q->limits.max_discard_sectors = segment_size / SECTOR_SIZE;
+ q->limits.discard_zeroes_data = 1;
+
+ blk_queue_merge_bvec(q, rbd_merge_bvec);
+ disk->queue = q;
+
+ q->queuedata = rbd_dev;
+
+ rbd_dev->disk = disk;
+
+ return 0;
+out_tag_set:
+ blk_mq_free_tag_set(&rbd_dev->tag_set);
+out_disk:
+ put_disk(disk);
+ return err;
+}
+
+/*
+ sysfs
+*/
+
+static struct rbd_device *dev_to_rbd_dev(struct device *dev)
+{
+ return container_of(dev, struct rbd_device, dev);
+}
+
+static ssize_t rbd_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+ return sprintf(buf, "%llu\n",
+ (unsigned long long)rbd_dev->mapping.size);
+}
+
+/*
+ * Note this shows the features for whatever's mapped, which is not
+ * necessarily the base image.
+ */
+static ssize_t rbd_features_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+ return sprintf(buf, "0x%016llx\n",
+ (unsigned long long)rbd_dev->mapping.features);
+}
+
+static ssize_t rbd_major_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+ if (rbd_dev->major)
+ return sprintf(buf, "%d\n", rbd_dev->major);
+
+ return sprintf(buf, "(none)\n");
+}
+
+static ssize_t rbd_minor_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+ return sprintf(buf, "%d\n", rbd_dev->minor);
+}
+
+static ssize_t rbd_client_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+ return sprintf(buf, "client%lld\n",
+ ceph_client_id(rbd_dev->rbd_client->client));
+}
+
+static ssize_t rbd_pool_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+ return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
+}
+
+static ssize_t rbd_pool_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+ return sprintf(buf, "%llu\n",
+ (unsigned long long) rbd_dev->spec->pool_id);
+}
+
+static ssize_t rbd_name_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+ if (rbd_dev->spec->image_name)
+ return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
+
+ return sprintf(buf, "(unknown)\n");
+}
+
+static ssize_t rbd_image_id_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+ return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
+}
+
+/*
+ * Shows the name of the currently-mapped snapshot (or
+ * RBD_SNAP_HEAD_NAME for the base image).
+ */
+static ssize_t rbd_snap_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+ return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
+}
+
+/*
+ * For a v2 image, shows the chain of parent images, separated by empty
+ * lines. For v1 images or if there is no parent, shows "(no parent
+ * image)".
+ */
+static ssize_t rbd_parent_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+ ssize_t count = 0;
+
+ if (!rbd_dev->parent)
+ return sprintf(buf, "(no parent image)\n");
+
+ for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
+ struct rbd_spec *spec = rbd_dev->parent_spec;
+
+ count += sprintf(&buf[count], "%s"
+ "pool_id %llu\npool_name %s\n"
+ "image_id %s\nimage_name %s\n"
+ "snap_id %llu\nsnap_name %s\n"
+ "overlap %llu\n",
+ !count ? "" : "\n", /* first? */
+ spec->pool_id, spec->pool_name,
+ spec->image_id, spec->image_name ?: "(unknown)",
+ spec->snap_id, spec->snap_name,
+ rbd_dev->parent_overlap);
+ }
+
+ return count;
+}
+
+static ssize_t rbd_image_refresh(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t size)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+ int ret;
+
+ ret = rbd_dev_refresh(rbd_dev);
+ if (ret)
+ return ret;
+
+ return size;
+}
+
+static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
+static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
+static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
+static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
+static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
+static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
+static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
+static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
+static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
+static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
+static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
+static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
+
+static struct attribute *rbd_attrs[] = {
+ &dev_attr_size.attr,
+ &dev_attr_features.attr,
+ &dev_attr_major.attr,
+ &dev_attr_minor.attr,
+ &dev_attr_client_id.attr,
+ &dev_attr_pool.attr,
+ &dev_attr_pool_id.attr,
+ &dev_attr_name.attr,
+ &dev_attr_image_id.attr,
+ &dev_attr_current_snap.attr,
+ &dev_attr_parent.attr,
+ &dev_attr_refresh.attr,
+ NULL
+};
+
+static struct attribute_group rbd_attr_group = {
+ .attrs = rbd_attrs,
+};
+
+static const struct attribute_group *rbd_attr_groups[] = {
+ &rbd_attr_group,
+ NULL
+};
+
+static void rbd_sysfs_dev_release(struct device *dev)
+{
+}
+
+static struct device_type rbd_device_type = {
+ .name = "rbd",
+ .groups = rbd_attr_groups,
+ .release = rbd_sysfs_dev_release,
+};
+
+static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
+{
+ kref_get(&spec->kref);
+
+ return spec;
+}
+
+static void rbd_spec_free(struct kref *kref);
+static void rbd_spec_put(struct rbd_spec *spec)
+{
+ if (spec)
+ kref_put(&spec->kref, rbd_spec_free);
+}
+
+static struct rbd_spec *rbd_spec_alloc(void)
+{
+ struct rbd_spec *spec;
+
+ spec = kzalloc(sizeof (*spec), GFP_KERNEL);
+ if (!spec)
+ return NULL;
+
+ spec->pool_id = CEPH_NOPOOL;
+ spec->snap_id = CEPH_NOSNAP;
+ kref_init(&spec->kref);
+
+ return spec;
+}
+
+static void rbd_spec_free(struct kref *kref)
+{
+ struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
+
+ kfree(spec->pool_name);
+ kfree(spec->image_id);
+ kfree(spec->image_name);
+ kfree(spec->snap_name);
+ kfree(spec);
+}
+
+static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
+ struct rbd_spec *spec)
+{
+ struct rbd_device *rbd_dev;
+
+ rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
+ if (!rbd_dev)
+ return NULL;
+
+ spin_lock_init(&rbd_dev->lock);
+ rbd_dev->flags = 0;
+ atomic_set(&rbd_dev->parent_ref, 0);
+ INIT_LIST_HEAD(&rbd_dev->node);
+ init_rwsem(&rbd_dev->header_rwsem);
+
+ rbd_dev->spec = spec;
+ rbd_dev->rbd_client = rbdc;
+
+ /* Initialize the layout used for all rbd requests */
+
+ rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+ rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
+ rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+ rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
+
+ return rbd_dev;
+}
+
+static void rbd_dev_destroy(struct rbd_device *rbd_dev)
+{
+ rbd_put_client(rbd_dev->rbd_client);
+ rbd_spec_put(rbd_dev->spec);
+ kfree(rbd_dev);
+}
+
+/*
+ * Get the size and object order for an image snapshot, or if
+ * snap_id is CEPH_NOSNAP, gets this information for the base
+ * image.
+ */
+static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
+ u8 *order, u64 *snap_size)
+{
+ __le64 snapid = cpu_to_le64(snap_id);
+ int ret;
+ struct {
+ u8 order;
+ __le64 size;
+ } __attribute__ ((packed)) size_buf = { 0 };
+
+ ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+ "rbd", "get_size",
+ &snapid, sizeof (snapid),
+ &size_buf, sizeof (size_buf));
+ dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+ if (ret < 0)
+ return ret;
+ if (ret < sizeof (size_buf))
+ return -ERANGE;
+
+ if (order) {
+ *order = size_buf.order;
+ dout(" order %u", (unsigned int)*order);
+ }
+ *snap_size = le64_to_cpu(size_buf.size);
+
+ dout(" snap_id 0x%016llx snap_size = %llu\n",
+ (unsigned long long)snap_id,
+ (unsigned long long)*snap_size);
+
+ return 0;
+}
+
+static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
+{
+ return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
+ &rbd_dev->header.obj_order,
+ &rbd_dev->header.image_size);
+}
+
+static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
+{
+ void *reply_buf;
+ int ret;
+ void *p;
+
+ reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
+ if (!reply_buf)
+ return -ENOMEM;
+
+ ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+ "rbd", "get_object_prefix", NULL, 0,
+ reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
+ dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+ if (ret < 0)
+ goto out;
+
+ p = reply_buf;
+ rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
+ p + ret, NULL, GFP_NOIO);
+ ret = 0;
+
+ if (IS_ERR(rbd_dev->header.object_prefix)) {
+ ret = PTR_ERR(rbd_dev->header.object_prefix);
+ rbd_dev->header.object_prefix = NULL;
+ } else {
+ dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
+ }
+out:
+ kfree(reply_buf);
+
+ return ret;
+}
+
+static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
+ u64 *snap_features)
+{
+ __le64 snapid = cpu_to_le64(snap_id);
+ struct {
+ __le64 features;
+ __le64 incompat;
+ } __attribute__ ((packed)) features_buf = { 0 };
+ u64 incompat;
+ int ret;
+
+ ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+ "rbd", "get_features",
+ &snapid, sizeof (snapid),
+ &features_buf, sizeof (features_buf));
+ dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+ if (ret < 0)
+ return ret;
+ if (ret < sizeof (features_buf))
+ return -ERANGE;
+
+ incompat = le64_to_cpu(features_buf.incompat);
+ if (incompat & ~RBD_FEATURES_SUPPORTED)
+ return -ENXIO;
+
+ *snap_features = le64_to_cpu(features_buf.features);
+
+ dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
+ (unsigned long long)snap_id,
+ (unsigned long long)*snap_features,
+ (unsigned long long)le64_to_cpu(features_buf.incompat));
+
+ return 0;
+}
+
+static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
+{
+ return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
+ &rbd_dev->header.features);
+}
+
+static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
+{
+ struct rbd_spec *parent_spec;
+ size_t size;
+ void *reply_buf = NULL;
+ __le64 snapid;
+ void *p;
+ void *end;
+ u64 pool_id;
+ char *image_id;
+ u64 snap_id;
+ u64 overlap;
+ int ret;
+
+ parent_spec = rbd_spec_alloc();
+ if (!parent_spec)
+ return -ENOMEM;
+
+ size = sizeof (__le64) + /* pool_id */
+ sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
+ sizeof (__le64) + /* snap_id */
+ sizeof (__le64); /* overlap */
+ reply_buf = kmalloc(size, GFP_KERNEL);
+ if (!reply_buf) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ snapid = cpu_to_le64(rbd_dev->spec->snap_id);
+ ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+ "rbd", "get_parent",
+ &snapid, sizeof (snapid),
+ reply_buf, size);
+ dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+ if (ret < 0)
+ goto out_err;
+
+ p = reply_buf;
+ end = reply_buf + ret;
+ ret = -ERANGE;
+ ceph_decode_64_safe(&p, end, pool_id, out_err);
+ if (pool_id == CEPH_NOPOOL) {
+ /*
+ * Either the parent never existed, or we have
+ * record of it but the image got flattened so it no
+ * longer has a parent. When the parent of a
+ * layered image disappears we immediately set the
+ * overlap to 0. The effect of this is that all new
+ * requests will be treated as if the image had no
+ * parent.
+ */
+ if (rbd_dev->parent_overlap) {
+ rbd_dev->parent_overlap = 0;
+ rbd_dev_parent_put(rbd_dev);
+ pr_info("%s: clone image has been flattened\n",
+ rbd_dev->disk->disk_name);
+ }
+
+ goto out; /* No parent? No problem. */
+ }
+
+ /* The ceph file layout needs to fit pool id in 32 bits */
+
+ ret = -EIO;
+ if (pool_id > (u64)U32_MAX) {
+ rbd_warn(NULL, "parent pool id too large (%llu > %u)",
+ (unsigned long long)pool_id, U32_MAX);
+ goto out_err;
+ }
+
+ image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
+ if (IS_ERR(image_id)) {
+ ret = PTR_ERR(image_id);
+ goto out_err;
+ }
+ ceph_decode_64_safe(&p, end, snap_id, out_err);
+ ceph_decode_64_safe(&p, end, overlap, out_err);
+
+ /*
+ * The parent won't change (except when the clone is
+ * flattened, already handled that). So we only need to
+ * record the parent spec we have not already done so.
+ */
+ if (!rbd_dev->parent_spec) {
+ parent_spec->pool_id = pool_id;
+ parent_spec->image_id = image_id;
+ parent_spec->snap_id = snap_id;
+ rbd_dev->parent_spec = parent_spec;
+ parent_spec = NULL; /* rbd_dev now owns this */
+ } else {
+ kfree(image_id);
+ }
+
+ /*
+ * We always update the parent overlap. If it's zero we issue
+ * a warning, as we will proceed as if there was no parent.
+ */
+ if (!overlap) {
+ if (parent_spec) {
+ /* refresh, careful to warn just once */
+ if (rbd_dev->parent_overlap)
+ rbd_warn(rbd_dev,
+ "clone now standalone (overlap became 0)");
+ } else {
+ /* initial probe */
+ rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
+ }
+ }
+ rbd_dev->parent_overlap = overlap;
+
+out:
+ ret = 0;
+out_err:
+ kfree(reply_buf);
+ rbd_spec_put(parent_spec);
+
+ return ret;
+}
+
+static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
+{
+ struct {
+ __le64 stripe_unit;
+ __le64 stripe_count;
+ } __attribute__ ((packed)) striping_info_buf = { 0 };
+ size_t size = sizeof (striping_info_buf);
+ void *p;
+ u64 obj_size;
+ u64 stripe_unit;
+ u64 stripe_count;
+ int ret;
+
+ ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+ "rbd", "get_stripe_unit_count", NULL, 0,
+ (char *)&striping_info_buf, size);
+ dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+ if (ret < 0)
+ return ret;
+ if (ret < size)
+ return -ERANGE;
+
+ /*
+ * We don't actually support the "fancy striping" feature
+ * (STRIPINGV2) yet, but if the striping sizes are the
+ * defaults the behavior is the same as before. So find
+ * out, and only fail if the image has non-default values.
+ */
+ ret = -EINVAL;
+ obj_size = (u64)1 << rbd_dev->header.obj_order;
+ p = &striping_info_buf;
+ stripe_unit = ceph_decode_64(&p);
+ if (stripe_unit != obj_size) {
+ rbd_warn(rbd_dev, "unsupported stripe unit "
+ "(got %llu want %llu)",
+ stripe_unit, obj_size);
+ return -EINVAL;
+ }
+ stripe_count = ceph_decode_64(&p);
+ if (stripe_count != 1) {
+ rbd_warn(rbd_dev, "unsupported stripe count "
+ "(got %llu want 1)", stripe_count);
+ return -EINVAL;
+ }
+ rbd_dev->header.stripe_unit = stripe_unit;
+ rbd_dev->header.stripe_count = stripe_count;
+
+ return 0;
+}
+
+static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
+{
+ size_t image_id_size;
+ char *image_id;
+ void *p;
+ void *end;
+ size_t size;
+ void *reply_buf = NULL;
+ size_t len = 0;
+ char *image_name = NULL;
+ int ret;
+
+ rbd_assert(!rbd_dev->spec->image_name);
+
+ len = strlen(rbd_dev->spec->image_id);
+ image_id_size = sizeof (__le32) + len;
+ image_id = kmalloc(image_id_size, GFP_KERNEL);
+ if (!image_id)
+ return NULL;
+
+ p = image_id;
+ end = image_id + image_id_size;
+ ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
+
+ size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
+ reply_buf = kmalloc(size, GFP_KERNEL);
+ if (!reply_buf)
+ goto out;
+
+ ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
+ "rbd", "dir_get_name",
+ image_id, image_id_size,
+ reply_buf, size);
+ if (ret < 0)
+ goto out;
+ p = reply_buf;
+ end = reply_buf + ret;
+
+ image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
+ if (IS_ERR(image_name))
+ image_name = NULL;
+ else
+ dout("%s: name is %s len is %zd\n", __func__, image_name, len);
+out:
+ kfree(reply_buf);
+ kfree(image_id);
+
+ return image_name;
+}
+
+static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
+{
+ struct ceph_snap_context *snapc = rbd_dev->header.snapc;
+ const char *snap_name;
+ u32 which = 0;
+
+ /* Skip over names until we find the one we are looking for */
+
+ snap_name = rbd_dev->header.snap_names;
+ while (which < snapc->num_snaps) {
+ if (!strcmp(name, snap_name))
+ return snapc->snaps[which];
+ snap_name += strlen(snap_name) + 1;
+ which++;
+ }
+ return CEPH_NOSNAP;
+}
+
+static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
+{
+ struct ceph_snap_context *snapc = rbd_dev->header.snapc;
+ u32 which;
+ bool found = false;
+ u64 snap_id;
+
+ for (which = 0; !found && which < snapc->num_snaps; which++) {
+ const char *snap_name;
+
+ snap_id = snapc->snaps[which];
+ snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
+ if (IS_ERR(snap_name)) {
+ /* ignore no-longer existing snapshots */
+ if (PTR_ERR(snap_name) == -ENOENT)
+ continue;
+ else
+ break;
+ }
+ found = !strcmp(name, snap_name);
+ kfree(snap_name);
+ }
+ return found ? snap_id : CEPH_NOSNAP;
+}
+
+/*
+ * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
+ * no snapshot by that name is found, or if an error occurs.
+ */
+static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
+{
+ if (rbd_dev->image_format == 1)
+ return rbd_v1_snap_id_by_name(rbd_dev, name);
+
+ return rbd_v2_snap_id_by_name(rbd_dev, name);
+}
+
+/*
+ * An image being mapped will have everything but the snap id.
+ */
+static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
+{
+ struct rbd_spec *spec = rbd_dev->spec;
+
+ rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
+ rbd_assert(spec->image_id && spec->image_name);
+ rbd_assert(spec->snap_name);
+
+ if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
+ u64 snap_id;
+
+ snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
+ if (snap_id == CEPH_NOSNAP)
+ return -ENOENT;
+
+ spec->snap_id = snap_id;
+ } else {
+ spec->snap_id = CEPH_NOSNAP;
+ }
+
+ return 0;
+}
+
+/*
+ * A parent image will have all ids but none of the names.
+ *
+ * All names in an rbd spec are dynamically allocated. It's OK if we
+ * can't figure out the name for an image id.
+ */
+static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
+{
+ struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+ struct rbd_spec *spec = rbd_dev->spec;
+ const char *pool_name;
+ const char *image_name;
+ const char *snap_name;
+ int ret;
+
+ rbd_assert(spec->pool_id != CEPH_NOPOOL);
+ rbd_assert(spec->image_id);
+ rbd_assert(spec->snap_id != CEPH_NOSNAP);
+
+ /* Get the pool name; we have to make our own copy of this */
+
+ pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
+ if (!pool_name) {
+ rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
+ return -EIO;
+ }
+ pool_name = kstrdup(pool_name, GFP_KERNEL);
+ if (!pool_name)
+ return -ENOMEM;
+
+ /* Fetch the image name; tolerate failure here */
+
+ image_name = rbd_dev_image_name(rbd_dev);
+ if (!image_name)
+ rbd_warn(rbd_dev, "unable to get image name");
+
+ /* Fetch the snapshot name */
+
+ snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
+ if (IS_ERR(snap_name)) {
+ ret = PTR_ERR(snap_name);
+ goto out_err;
+ }
+
+ spec->pool_name = pool_name;
+ spec->image_name = image_name;
+ spec->snap_name = snap_name;
+
+ return 0;
+
+out_err:
+ kfree(image_name);
+ kfree(pool_name);
+ return ret;
+}
+
+static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
+{
+ size_t size;
+ int ret;
+ void *reply_buf;
+ void *p;
+ void *end;
+ u64 seq;
+ u32 snap_count;
+ struct ceph_snap_context *snapc;
+ u32 i;
+
+ /*
+ * We'll need room for the seq value (maximum snapshot id),
+ * snapshot count, and array of that many snapshot ids.
+ * For now we have a fixed upper limit on the number we're
+ * prepared to receive.
+ */
+ size = sizeof (__le64) + sizeof (__le32) +
+ RBD_MAX_SNAP_COUNT * sizeof (__le64);
+ reply_buf = kzalloc(size, GFP_KERNEL);
+ if (!reply_buf)
+ return -ENOMEM;
+
+ ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+ "rbd", "get_snapcontext", NULL, 0,
+ reply_buf, size);
+ dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+ if (ret < 0)
+ goto out;
+
+ p = reply_buf;
+ end = reply_buf + ret;
+ ret = -ERANGE;
+ ceph_decode_64_safe(&p, end, seq, out);
+ ceph_decode_32_safe(&p, end, snap_count, out);
+
+ /*
+ * Make sure the reported number of snapshot ids wouldn't go
+ * beyond the end of our buffer. But before checking that,
+ * make sure the computed size of the snapshot context we
+ * allocate is representable in a size_t.
+ */
+ if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
+ / sizeof (u64)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
+ goto out;
+ ret = 0;
+
+ snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
+ if (!snapc) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ snapc->seq = seq;
+ for (i = 0; i < snap_count; i++)
+ snapc->snaps[i] = ceph_decode_64(&p);
+
+ ceph_put_snap_context(rbd_dev->header.snapc);
+ rbd_dev->header.snapc = snapc;
+
+ dout(" snap context seq = %llu, snap_count = %u\n",
+ (unsigned long long)seq, (unsigned int)snap_count);
+out:
+ kfree(reply_buf);
+
+ return ret;
+}
+
+static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
+ u64 snap_id)
+{
+ size_t size;
+ void *reply_buf;
+ __le64 snapid;
+ int ret;
+ void *p;
+ void *end;
+ char *snap_name;
+
+ size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
+ reply_buf = kmalloc(size, GFP_KERNEL);
+ if (!reply_buf)
+ return ERR_PTR(-ENOMEM);
+
+ snapid = cpu_to_le64(snap_id);
+ ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
+ "rbd", "get_snapshot_name",
+ &snapid, sizeof (snapid),
+ reply_buf, size);
+ dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+ if (ret < 0) {
+ snap_name = ERR_PTR(ret);
+ goto out;
+ }
+
+ p = reply_buf;
+ end = reply_buf + ret;
+ snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
+ if (IS_ERR(snap_name))
+ goto out;
+
+ dout(" snap_id 0x%016llx snap_name = %s\n",
+ (unsigned long long)snap_id, snap_name);
+out:
+ kfree(reply_buf);
+
+ return snap_name;
+}
+
+static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
+{
+ bool first_time = rbd_dev->header.object_prefix == NULL;
+ int ret;
+
+ ret = rbd_dev_v2_image_size(rbd_dev);
+ if (ret)
+ return ret;
+
+ if (first_time) {
+ ret = rbd_dev_v2_header_onetime(rbd_dev);
+ if (ret)
+ return ret;
+ }
+
+ ret = rbd_dev_v2_snap_context(rbd_dev);
+ dout("rbd_dev_v2_snap_context returned %d\n", ret);
+
+ return ret;
+}
+
+static int rbd_dev_header_info(struct rbd_device *rbd_dev)
+{
+ rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+
+ if (rbd_dev->image_format == 1)
+ return rbd_dev_v1_header_info(rbd_dev);
+
+ return rbd_dev_v2_header_info(rbd_dev);
+}
+
+static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
+{
+ struct device *dev;
+ int ret;
+
+ dev = &rbd_dev->dev;
+ dev->bus = &rbd_bus_type;
+ dev->type = &rbd_device_type;
+ dev->parent = &rbd_root_dev;
+ dev->release = rbd_dev_device_release;
+ dev_set_name(dev, "%d", rbd_dev->dev_id);
+ ret = device_register(dev);
+
+ return ret;
+}
+
+static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
+{
+ device_unregister(&rbd_dev->dev);
+}
+
+/*
+ * Get a unique rbd identifier for the given new rbd_dev, and add
+ * the rbd_dev to the global list.
+ */
+static int rbd_dev_id_get(struct rbd_device *rbd_dev)
+{
+ int new_dev_id;
+
+ new_dev_id = ida_simple_get(&rbd_dev_id_ida,
+ 0, minor_to_rbd_dev_id(1 << MINORBITS),
+ GFP_KERNEL);
+ if (new_dev_id < 0)
+ return new_dev_id;
+
+ rbd_dev->dev_id = new_dev_id;
+
+ spin_lock(&rbd_dev_list_lock);
+ list_add_tail(&rbd_dev->node, &rbd_dev_list);
+ spin_unlock(&rbd_dev_list_lock);
+
+ dout("rbd_dev %p given dev id %d\n", rbd_dev, rbd_dev->dev_id);
+
+ return 0;
+}
+
+/*
+ * Remove an rbd_dev from the global list, and record that its
+ * identifier is no longer in use.
+ */
+static void rbd_dev_id_put(struct rbd_device *rbd_dev)
+{
+ spin_lock(&rbd_dev_list_lock);
+ list_del_init(&rbd_dev->node);
+ spin_unlock(&rbd_dev_list_lock);
+
+ ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
+
+ dout("rbd_dev %p released dev id %d\n", rbd_dev, rbd_dev->dev_id);
+}
+
+/*
+ * Skips over white space at *buf, and updates *buf to point to the
+ * first found non-space character (if any). Returns the length of
+ * the token (string of non-white space characters) found. Note
+ * that *buf must be terminated with '\0'.
+ */
+static inline size_t next_token(const char **buf)
+{
+ /*
+ * These are the characters that produce nonzero for
+ * isspace() in the "C" and "POSIX" locales.
+ */
+ const char *spaces = " \f\n\r\t\v";
+
+ *buf += strspn(*buf, spaces); /* Find start of token */
+
+ return strcspn(*buf, spaces); /* Return token length */
+}
+
+/*
+ * Finds the next token in *buf, dynamically allocates a buffer big
+ * enough to hold a copy of it, and copies the token into the new
+ * buffer. The copy is guaranteed to be terminated with '\0'. Note
+ * that a duplicate buffer is created even for a zero-length token.
+ *
+ * Returns a pointer to the newly-allocated duplicate, or a null
+ * pointer if memory for the duplicate was not available. If
+ * the lenp argument is a non-null pointer, the length of the token
+ * (not including the '\0') is returned in *lenp.
+ *
+ * If successful, the *buf pointer will be updated to point beyond
+ * the end of the found token.
+ *
+ * Note: uses GFP_KERNEL for allocation.
+ */
+static inline char *dup_token(const char **buf, size_t *lenp)
+{
+ char *dup;
+ size_t len;
+
+ len = next_token(buf);
+ dup = kmemdup(*buf, len + 1, GFP_KERNEL);
+ if (!dup)
+ return NULL;
+ *(dup + len) = '\0';
+ *buf += len;
+
+ if (lenp)
+ *lenp = len;
+
+ return dup;
+}
+
+/*
+ * Parse the options provided for an "rbd add" (i.e., rbd image
+ * mapping) request. These arrive via a write to /sys/bus/rbd/add,
+ * and the data written is passed here via a NUL-terminated buffer.
+ * Returns 0 if successful or an error code otherwise.
+ *
+ * The information extracted from these options is recorded in
+ * the other parameters which return dynamically-allocated
+ * structures:
+ * ceph_opts
+ * The address of a pointer that will refer to a ceph options
+ * structure. Caller must release the returned pointer using
+ * ceph_destroy_options() when it is no longer needed.
+ * rbd_opts
+ * Address of an rbd options pointer. Fully initialized by
+ * this function; caller must release with kfree().
+ * spec
+ * Address of an rbd image specification pointer. Fully
+ * initialized by this function based on parsed options.
+ * Caller must release with rbd_spec_put().
+ *
+ * The options passed take this form:
+ * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
+ * where:
+ * <mon_addrs>
+ * A comma-separated list of one or more monitor addresses.
+ * A monitor address is an ip address, optionally followed
+ * by a port number (separated by a colon).
+ * I.e.: ip1[:port1][,ip2[:port2]...]
+ * <options>
+ * A comma-separated list of ceph and/or rbd options.
+ * <pool_name>
+ * The name of the rados pool containing the rbd image.
+ * <image_name>
+ * The name of the image in that pool to map.
+ * <snap_id>
+ * An optional snapshot id. If provided, the mapping will
+ * present data from the image at the time that snapshot was
+ * created. The image head is used if no snapshot id is
+ * provided. Snapshot mappings are always read-only.
+ */
+static int rbd_add_parse_args(const char *buf,
+ struct ceph_options **ceph_opts,
+ struct rbd_options **opts,
+ struct rbd_spec **rbd_spec)
+{
+ size_t len;
+ char *options;
+ const char *mon_addrs;
+ char *snap_name;
+ size_t mon_addrs_size;
+ struct rbd_spec *spec = NULL;
+ struct rbd_options *rbd_opts = NULL;
+ struct ceph_options *copts;
+ int ret;
+
+ /* The first four tokens are required */
+
+ len = next_token(&buf);
+ if (!len) {
+ rbd_warn(NULL, "no monitor address(es) provided");
+ return -EINVAL;
+ }
+ mon_addrs = buf;
+ mon_addrs_size = len + 1;
+ buf += len;
+
+ ret = -EINVAL;
+ options = dup_token(&buf, NULL);
+ if (!options)
+ return -ENOMEM;
+ if (!*options) {
+ rbd_warn(NULL, "no options provided");
+ goto out_err;
+ }
+
+ spec = rbd_spec_alloc();
+ if (!spec)
+ goto out_mem;
+
+ spec->pool_name = dup_token(&buf, NULL);
+ if (!spec->pool_name)
+ goto out_mem;
+ if (!*spec->pool_name) {
+ rbd_warn(NULL, "no pool name provided");
+ goto out_err;
+ }
+
+ spec->image_name = dup_token(&buf, NULL);
+ if (!spec->image_name)
+ goto out_mem;
+ if (!*spec->image_name) {
+ rbd_warn(NULL, "no image name provided");
+ goto out_err;
+ }
+
+ /*
+ * Snapshot name is optional; default is to use "-"
+ * (indicating the head/no snapshot).
+ */
+ len = next_token(&buf);
+ if (!len) {
+ buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
+ len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
+ } else if (len > RBD_MAX_SNAP_NAME_LEN) {
+ ret = -ENAMETOOLONG;
+ goto out_err;
+ }
+ snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
+ if (!snap_name)
+ goto out_mem;
+ *(snap_name + len) = '\0';
+ spec->snap_name = snap_name;
+
+ /* Initialize all rbd options to the defaults */
+
+ rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
+ if (!rbd_opts)
+ goto out_mem;
+
+ rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
+
+ copts = ceph_parse_options(options, mon_addrs,
+ mon_addrs + mon_addrs_size - 1,
+ parse_rbd_opts_token, rbd_opts);
+ if (IS_ERR(copts)) {
+ ret = PTR_ERR(copts);
+ goto out_err;
+ }
+ kfree(options);
+
+ *ceph_opts = copts;
+ *opts = rbd_opts;
+ *rbd_spec = spec;
+
+ return 0;
+out_mem:
+ ret = -ENOMEM;
+out_err:
+ kfree(rbd_opts);
+ rbd_spec_put(spec);
+ kfree(options);
+
+ return ret;
+}
+
+/*
+ * Return pool id (>= 0) or a negative error code.
+ */
+static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
+{
+ u64 newest_epoch;
+ unsigned long timeout = rbdc->client->options->mount_timeout * HZ;
+ int tries = 0;
+ int ret;
+
+again:
+ ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
+ if (ret == -ENOENT && tries++ < 1) {
+ ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap",
+ &newest_epoch);
+ if (ret < 0)
+ return ret;
+
+ if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
+ ceph_monc_request_next_osdmap(&rbdc->client->monc);
+ (void) ceph_monc_wait_osdmap(&rbdc->client->monc,
+ newest_epoch, timeout);
+ goto again;
+ } else {
+ /* the osdmap we have is new enough */
+ return -ENOENT;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * An rbd format 2 image has a unique identifier, distinct from the
+ * name given to it by the user. Internally, that identifier is
+ * what's used to specify the names of objects related to the image.
+ *
+ * A special "rbd id" object is used to map an rbd image name to its
+ * id. If that object doesn't exist, then there is no v2 rbd image
+ * with the supplied name.
+ *
+ * This function will record the given rbd_dev's image_id field if
+ * it can be determined, and in that case will return 0. If any
+ * errors occur a negative errno will be returned and the rbd_dev's
+ * image_id field will be unchanged (and should be NULL).
+ */
+static int rbd_dev_image_id(struct rbd_device *rbd_dev)
+{
+ int ret;
+ size_t size;
+ char *object_name;
+ void *response;
+ char *image_id;
+
+ /*
+ * When probing a parent image, the image id is already
+ * known (and the image name likely is not). There's no
+ * need to fetch the image id again in this case. We
+ * do still need to set the image format though.
+ */
+ if (rbd_dev->spec->image_id) {
+ rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
+
+ return 0;
+ }
+
+ /*
+ * First, see if the format 2 image id file exists, and if
+ * so, get the image's persistent id from it.
+ */
+ size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
+ object_name = kmalloc(size, GFP_NOIO);
+ if (!object_name)
+ return -ENOMEM;
+ sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
+ dout("rbd id object name is %s\n", object_name);
+
+ /* Response will be an encoded string, which includes a length */
+
+ size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
+ response = kzalloc(size, GFP_NOIO);
+ if (!response) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* If it doesn't exist we'll assume it's a format 1 image */
+
+ ret = rbd_obj_method_sync(rbd_dev, object_name,
+ "rbd", "get_id", NULL, 0,
+ response, RBD_IMAGE_ID_LEN_MAX);
+ dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+ if (ret == -ENOENT) {
+ image_id = kstrdup("", GFP_KERNEL);
+ ret = image_id ? 0 : -ENOMEM;
+ if (!ret)
+ rbd_dev->image_format = 1;
+ } else if (ret >= 0) {
+ void *p = response;
+
+ image_id = ceph_extract_encoded_string(&p, p + ret,
+ NULL, GFP_NOIO);
+ ret = PTR_ERR_OR_ZERO(image_id);
+ if (!ret)
+ rbd_dev->image_format = 2;
+ }
+
+ if (!ret) {
+ rbd_dev->spec->image_id = image_id;
+ dout("image_id is %s\n", image_id);
+ }
+out:
+ kfree(response);
+ kfree(object_name);
+
+ return ret;
+}
+
+/*
+ * Undo whatever state changes are made by v1 or v2 header info
+ * call.
+ */
+static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
+{
+ struct rbd_image_header *header;
+
+ rbd_dev_parent_put(rbd_dev);
+
+ /* Free dynamic fields from the header, then zero it out */
+
+ header = &rbd_dev->header;
+ ceph_put_snap_context(header->snapc);
+ kfree(header->snap_sizes);
+ kfree(header->snap_names);
+ kfree(header->object_prefix);
+ memset(header, 0, sizeof (*header));
+}
+
+static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
+{
+ int ret;
+
+ ret = rbd_dev_v2_object_prefix(rbd_dev);
+ if (ret)
+ goto out_err;
+
+ /*
+ * Get the and check features for the image. Currently the
+ * features are assumed to never change.
+ */
+ ret = rbd_dev_v2_features(rbd_dev);
+ if (ret)
+ goto out_err;
+
+ /* If the image supports fancy striping, get its parameters */
+
+ if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
+ ret = rbd_dev_v2_striping_info(rbd_dev);
+ if (ret < 0)
+ goto out_err;
+ }
+ /* No support for crypto and compression type format 2 images */
+
+ return 0;
+out_err:
+ rbd_dev->header.features = 0;
+ kfree(rbd_dev->header.object_prefix);
+ rbd_dev->header.object_prefix = NULL;
+
+ return ret;
+}
+
+static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
+{
+ struct rbd_device *parent = NULL;
+ struct rbd_spec *parent_spec;
+ struct rbd_client *rbdc;
+ int ret;
+
+ if (!rbd_dev->parent_spec)
+ return 0;
+ /*
+ * We need to pass a reference to the client and the parent
+ * spec when creating the parent rbd_dev. Images related by
+ * parent/child relationships always share both.
+ */
+ parent_spec = rbd_spec_get(rbd_dev->parent_spec);
+ rbdc = __rbd_get_client(rbd_dev->rbd_client);
+
+ ret = -ENOMEM;
+ parent = rbd_dev_create(rbdc, parent_spec);
+ if (!parent)
+ goto out_err;
+
+ ret = rbd_dev_image_probe(parent, false);
+ if (ret < 0)
+ goto out_err;
+ rbd_dev->parent = parent;
+ atomic_set(&rbd_dev->parent_ref, 1);
+
+ return 0;
+out_err:
+ if (parent) {
+ rbd_dev_unparent(rbd_dev);
+ kfree(rbd_dev->header_name);
+ rbd_dev_destroy(parent);
+ } else {
+ rbd_put_client(rbdc);
+ rbd_spec_put(parent_spec);
+ }
+
+ return ret;
+}
+
+static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
+{
+ int ret;
+
+ /* Get an id and fill in device name. */
+
+ ret = rbd_dev_id_get(rbd_dev);
+ if (ret)
+ return ret;
+
+ BUILD_BUG_ON(DEV_NAME_LEN
+ < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
+ sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
+
+ /* Record our major and minor device numbers. */
+
+ if (!single_major) {
+ ret = register_blkdev(0, rbd_dev->name);
+ if (ret < 0)
+ goto err_out_id;
+
+ rbd_dev->major = ret;
+ rbd_dev->minor = 0;
+ } else {
+ rbd_dev->major = rbd_major;
+ rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
+ }
+
+ /* Set up the blkdev mapping. */
+
+ ret = rbd_init_disk(rbd_dev);
+ if (ret)
+ goto err_out_blkdev;
+
+ ret = rbd_dev_mapping_set(rbd_dev);
+ if (ret)
+ goto err_out_disk;
+
+ set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
+ set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
+
+ ret = rbd_bus_add_dev(rbd_dev);
+ if (ret)
+ goto err_out_mapping;
+
+ /* Everything's ready. Announce the disk to the world. */
+
+ set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
+ add_disk(rbd_dev->disk);
+
+ pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
+ (unsigned long long) rbd_dev->mapping.size);
+
+ return ret;
+
+err_out_mapping:
+ rbd_dev_mapping_clear(rbd_dev);
+err_out_disk:
+ rbd_free_disk(rbd_dev);
+err_out_blkdev:
+ if (!single_major)
+ unregister_blkdev(rbd_dev->major, rbd_dev->name);
+err_out_id:
+ rbd_dev_id_put(rbd_dev);
+ rbd_dev_mapping_clear(rbd_dev);
+
+ return ret;
+}
+
+static int rbd_dev_header_name(struct rbd_device *rbd_dev)
+{
+ struct rbd_spec *spec = rbd_dev->spec;
+ size_t size;
+
+ /* Record the header object name for this rbd image. */
+
+ rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+
+ if (rbd_dev->image_format == 1)
+ size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
+ else
+ size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
+
+ rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
+ if (!rbd_dev->header_name)
+ return -ENOMEM;
+
+ if (rbd_dev->image_format == 1)
+ sprintf(rbd_dev->header_name, "%s%s",
+ spec->image_name, RBD_SUFFIX);
+ else
+ sprintf(rbd_dev->header_name, "%s%s",
+ RBD_HEADER_PREFIX, spec->image_id);
+ return 0;
+}
+
+static void rbd_dev_image_release(struct rbd_device *rbd_dev)
+{
+ rbd_dev_unprobe(rbd_dev);
+ kfree(rbd_dev->header_name);
+ rbd_dev->header_name = NULL;
+ rbd_dev->image_format = 0;
+ kfree(rbd_dev->spec->image_id);
+ rbd_dev->spec->image_id = NULL;
+
+ rbd_dev_destroy(rbd_dev);
+}
+
+/*
+ * Probe for the existence of the header object for the given rbd
+ * device. If this image is the one being mapped (i.e., not a
+ * parent), initiate a watch on its header object before using that
+ * object to get detailed information about the rbd image.
+ */
+static int rbd_dev_image_probe(struct rbd_device *rbd_dev, bool mapping)
+{
+ int ret;
+
+ /*
+ * Get the id from the image id object. Unless there's an
+ * error, rbd_dev->spec->image_id will be filled in with
+ * a dynamically-allocated string, and rbd_dev->image_format
+ * will be set to either 1 or 2.
+ */
+ ret = rbd_dev_image_id(rbd_dev);
+ if (ret)
+ return ret;
+
+ ret = rbd_dev_header_name(rbd_dev);
+ if (ret)
+ goto err_out_format;
+
+ if (mapping) {
+ ret = rbd_dev_header_watch_sync(rbd_dev);
+ if (ret) {
+ if (ret == -ENOENT)
+ pr_info("image %s/%s does not exist\n",
+ rbd_dev->spec->pool_name,
+ rbd_dev->spec->image_name);
+ goto out_header_name;
+ }
+ }
+
+ ret = rbd_dev_header_info(rbd_dev);
+ if (ret)
+ goto err_out_watch;
+
+ /*
+ * If this image is the one being mapped, we have pool name and
+ * id, image name and id, and snap name - need to fill snap id.
+ * Otherwise this is a parent image, identified by pool, image
+ * and snap ids - need to fill in names for those ids.
+ */
+ if (mapping)
+ ret = rbd_spec_fill_snap_id(rbd_dev);
+ else
+ ret = rbd_spec_fill_names(rbd_dev);
+ if (ret) {
+ if (ret == -ENOENT)
+ pr_info("snap %s/%s@%s does not exist\n",
+ rbd_dev->spec->pool_name,
+ rbd_dev->spec->image_name,
+ rbd_dev->spec->snap_name);
+ goto err_out_probe;
+ }
+
+ if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
+ ret = rbd_dev_v2_parent_info(rbd_dev);
+ if (ret)
+ goto err_out_probe;
+
+ /*
+ * Need to warn users if this image is the one being
+ * mapped and has a parent.
+ */
+ if (mapping && rbd_dev->parent_spec)
+ rbd_warn(rbd_dev,
+ "WARNING: kernel layering is EXPERIMENTAL!");
+ }
+
+ ret = rbd_dev_probe_parent(rbd_dev);
+ if (ret)
+ goto err_out_probe;
+
+ dout("discovered format %u image, header name is %s\n",
+ rbd_dev->image_format, rbd_dev->header_name);
+ return 0;
+
+err_out_probe:
+ rbd_dev_unprobe(rbd_dev);
+err_out_watch:
+ if (mapping)
+ rbd_dev_header_unwatch_sync(rbd_dev);
+out_header_name:
+ kfree(rbd_dev->header_name);
+ rbd_dev->header_name = NULL;
+err_out_format:
+ rbd_dev->image_format = 0;
+ kfree(rbd_dev->spec->image_id);
+ rbd_dev->spec->image_id = NULL;
+ return ret;
+}
+
+static ssize_t do_rbd_add(struct bus_type *bus,
+ const char *buf,
+ size_t count)
+{
+ struct rbd_device *rbd_dev = NULL;
+ struct ceph_options *ceph_opts = NULL;
+ struct rbd_options *rbd_opts = NULL;
+ struct rbd_spec *spec = NULL;
+ struct rbd_client *rbdc;
+ bool read_only;
+ int rc = -ENOMEM;
+
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
+ /* parse add command */
+ rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
+ if (rc < 0)
+ goto err_out_module;
+ read_only = rbd_opts->read_only;
+ kfree(rbd_opts);
+ rbd_opts = NULL; /* done with this */
+
+ rbdc = rbd_get_client(ceph_opts);
+ if (IS_ERR(rbdc)) {
+ rc = PTR_ERR(rbdc);
+ goto err_out_args;
+ }
+
+ /* pick the pool */
+ rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
+ if (rc < 0) {
+ if (rc == -ENOENT)
+ pr_info("pool %s does not exist\n", spec->pool_name);
+ goto err_out_client;
+ }
+ spec->pool_id = (u64)rc;
+
+ /* The ceph file layout needs to fit pool id in 32 bits */
+
+ if (spec->pool_id > (u64)U32_MAX) {
+ rbd_warn(NULL, "pool id too large (%llu > %u)",
+ (unsigned long long)spec->pool_id, U32_MAX);
+ rc = -EIO;
+ goto err_out_client;
+ }
+
+ rbd_dev = rbd_dev_create(rbdc, spec);
+ if (!rbd_dev)
+ goto err_out_client;
+ rbdc = NULL; /* rbd_dev now owns this */
+ spec = NULL; /* rbd_dev now owns this */
+
+ rc = rbd_dev_image_probe(rbd_dev, true);
+ if (rc < 0)
+ goto err_out_rbd_dev;
+
+ /* If we are mapping a snapshot it must be marked read-only */
+
+ if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
+ read_only = true;
+ rbd_dev->mapping.read_only = read_only;
+
+ rc = rbd_dev_device_setup(rbd_dev);
+ if (rc) {
+ /*
+ * rbd_dev_header_unwatch_sync() can't be moved into
+ * rbd_dev_image_release() without refactoring, see
+ * commit 1f3ef78861ac.
+ */
+ rbd_dev_header_unwatch_sync(rbd_dev);
+ rbd_dev_image_release(rbd_dev);
+ goto err_out_module;
+ }
+
+ return count;
+
+err_out_rbd_dev:
+ rbd_dev_destroy(rbd_dev);
+err_out_client:
+ rbd_put_client(rbdc);
+err_out_args:
+ rbd_spec_put(spec);
+err_out_module:
+ module_put(THIS_MODULE);
+
+ dout("Error adding device %s\n", buf);
+
+ return (ssize_t)rc;
+}
+
+static ssize_t rbd_add(struct bus_type *bus,
+ const char *buf,
+ size_t count)
+{
+ if (single_major)
+ return -EINVAL;
+
+ return do_rbd_add(bus, buf, count);
+}
+
+static ssize_t rbd_add_single_major(struct bus_type *bus,
+ const char *buf,
+ size_t count)
+{
+ return do_rbd_add(bus, buf, count);
+}
+
+static void rbd_dev_device_release(struct device *dev)
+{
+ struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+ rbd_free_disk(rbd_dev);
+ clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
+ rbd_dev_mapping_clear(rbd_dev);
+ if (!single_major)
+ unregister_blkdev(rbd_dev->major, rbd_dev->name);
+ rbd_dev_id_put(rbd_dev);
+ rbd_dev_mapping_clear(rbd_dev);
+}
+
+static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
+{
+ while (rbd_dev->parent) {
+ struct rbd_device *first = rbd_dev;
+ struct rbd_device *second = first->parent;
+ struct rbd_device *third;
+
+ /*
+ * Follow to the parent with no grandparent and
+ * remove it.
+ */
+ while (second && (third = second->parent)) {
+ first = second;
+ second = third;
+ }
+ rbd_assert(second);
+ rbd_dev_image_release(second);
+ first->parent = NULL;
+ first->parent_overlap = 0;
+
+ rbd_assert(first->parent_spec);
+ rbd_spec_put(first->parent_spec);
+ first->parent_spec = NULL;
+ }
+}
+
+static ssize_t do_rbd_remove(struct bus_type *bus,
+ const char *buf,
+ size_t count)
+{
+ struct rbd_device *rbd_dev = NULL;
+ struct list_head *tmp;
+ int dev_id;
+ unsigned long ul;
+ bool already = false;
+ int ret;
+
+ ret = kstrtoul(buf, 10, &ul);
+ if (ret)
+ return ret;
+
+ /* convert to int; abort if we lost anything in the conversion */
+ dev_id = (int)ul;
+ if (dev_id != ul)
+ return -EINVAL;
+
+ ret = -ENOENT;
+ spin_lock(&rbd_dev_list_lock);
+ list_for_each(tmp, &rbd_dev_list) {
+ rbd_dev = list_entry(tmp, struct rbd_device, node);
+ if (rbd_dev->dev_id == dev_id) {
+ ret = 0;
+ break;
+ }
+ }
+ if (!ret) {
+ spin_lock_irq(&rbd_dev->lock);
+ if (rbd_dev->open_count)
+ ret = -EBUSY;
+ else
+ already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
+ &rbd_dev->flags);
+ spin_unlock_irq(&rbd_dev->lock);
+ }
+ spin_unlock(&rbd_dev_list_lock);
+ if (ret < 0 || already)
+ return ret;
+
+ rbd_dev_header_unwatch_sync(rbd_dev);
+ /*
+ * flush remaining watch callbacks - these must be complete
+ * before the osd_client is shutdown
+ */
+ dout("%s: flushing notifies", __func__);
+ ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
+
+ /*
+ * Don't free anything from rbd_dev->disk until after all
+ * notifies are completely processed. Otherwise
+ * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
+ * in a potential use after free of rbd_dev->disk or rbd_dev.
+ */
+ rbd_bus_del_dev(rbd_dev);
+ rbd_dev_image_release(rbd_dev);
+ module_put(THIS_MODULE);
+
+ return count;
+}
+
+static ssize_t rbd_remove(struct bus_type *bus,
+ const char *buf,
+ size_t count)
+{
+ if (single_major)
+ return -EINVAL;
+
+ return do_rbd_remove(bus, buf, count);
+}
+
+static ssize_t rbd_remove_single_major(struct bus_type *bus,
+ const char *buf,
+ size_t count)
+{
+ return do_rbd_remove(bus, buf, count);
+}
+
+/*
+ * create control files in sysfs
+ * /sys/bus/rbd/...
+ */
+static int rbd_sysfs_init(void)
+{
+ int ret;
+
+ ret = device_register(&rbd_root_dev);
+ if (ret < 0)
+ return ret;
+
+ ret = bus_register(&rbd_bus_type);
+ if (ret < 0)
+ device_unregister(&rbd_root_dev);
+
+ return ret;
+}
+
+static void rbd_sysfs_cleanup(void)
+{
+ bus_unregister(&rbd_bus_type);
+ device_unregister(&rbd_root_dev);
+}
+
+static int rbd_slab_init(void)
+{
+ rbd_assert(!rbd_img_request_cache);
+ rbd_img_request_cache = kmem_cache_create("rbd_img_request",
+ sizeof (struct rbd_img_request),
+ __alignof__(struct rbd_img_request),
+ 0, NULL);
+ if (!rbd_img_request_cache)
+ return -ENOMEM;
+
+ rbd_assert(!rbd_obj_request_cache);
+ rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
+ sizeof (struct rbd_obj_request),
+ __alignof__(struct rbd_obj_request),
+ 0, NULL);
+ if (!rbd_obj_request_cache)
+ goto out_err;
+
+ rbd_assert(!rbd_segment_name_cache);
+ rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
+ CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
+ if (rbd_segment_name_cache)
+ return 0;
+out_err:
+ if (rbd_obj_request_cache) {
+ kmem_cache_destroy(rbd_obj_request_cache);
+ rbd_obj_request_cache = NULL;
+ }
+
+ kmem_cache_destroy(rbd_img_request_cache);
+ rbd_img_request_cache = NULL;
+
+ return -ENOMEM;
+}
+
+static void rbd_slab_exit(void)
+{
+ rbd_assert(rbd_segment_name_cache);
+ kmem_cache_destroy(rbd_segment_name_cache);
+ rbd_segment_name_cache = NULL;
+
+ rbd_assert(rbd_obj_request_cache);
+ kmem_cache_destroy(rbd_obj_request_cache);
+ rbd_obj_request_cache = NULL;
+
+ rbd_assert(rbd_img_request_cache);
+ kmem_cache_destroy(rbd_img_request_cache);
+ rbd_img_request_cache = NULL;
+}
+
+static int __init rbd_init(void)
+{
+ int rc;
+
+ if (!libceph_compatible(NULL)) {
+ rbd_warn(NULL, "libceph incompatibility (quitting)");
+ return -EINVAL;
+ }
+
+ rc = rbd_slab_init();
+ if (rc)
+ return rc;
+
+ /*
+ * The number of active work items is limited by the number of
+ * rbd devices * queue depth, so leave @max_active at default.
+ */
+ rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
+ if (!rbd_wq) {
+ rc = -ENOMEM;
+ goto err_out_slab;
+ }
+
+ if (single_major) {
+ rbd_major = register_blkdev(0, RBD_DRV_NAME);
+ if (rbd_major < 0) {
+ rc = rbd_major;
+ goto err_out_wq;
+ }
+ }
+
+ rc = rbd_sysfs_init();
+ if (rc)
+ goto err_out_blkdev;
+
+ if (single_major)
+ pr_info("loaded (major %d)\n", rbd_major);
+ else
+ pr_info("loaded\n");
+
+ return 0;
+
+err_out_blkdev:
+ if (single_major)
+ unregister_blkdev(rbd_major, RBD_DRV_NAME);
+err_out_wq:
+ destroy_workqueue(rbd_wq);
+err_out_slab:
+ rbd_slab_exit();
+ return rc;
+}
+
+static void __exit rbd_exit(void)
+{
+ ida_destroy(&rbd_dev_id_ida);
+ rbd_sysfs_cleanup();
+ if (single_major)
+ unregister_blkdev(rbd_major, RBD_DRV_NAME);
+ destroy_workqueue(rbd_wq);
+ rbd_slab_exit();
+}
+
+module_init(rbd_init);
+module_exit(rbd_exit);
+
+MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+/* following authorship retained from original osdblk.c */
+MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
+
+MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
new file mode 100644
index 000000000..49d77cbcf
--- /dev/null
+++ b/drivers/block/rbd_types.h
@@ -0,0 +1,81 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RBD_TYPES_H
+#define CEPH_RBD_TYPES_H
+
+#include <linux/types.h>
+
+/* For format version 2, rbd image 'foo' consists of objects
+ * rbd_id.foo - id of image
+ * rbd_header.<id> - image metadata
+ * rbd_data.<id>.0000000000000000
+ * rbd_data.<id>.0000000000000001
+ * ... - data
+ * Clients do not access header data directly in rbd format 2.
+ */
+
+#define RBD_HEADER_PREFIX "rbd_header."
+#define RBD_DATA_PREFIX "rbd_data."
+#define RBD_ID_PREFIX "rbd_id."
+
+/*
+ * For format version 1, rbd image 'foo' consists of objects
+ * foo.rbd - image metadata
+ * rb.<idhi>.<idlo>.00000000
+ * rb.<idhi>.<idlo>.00000001
+ * ... - data
+ * There is no notion of a persistent image id in rbd format 1.
+ */
+
+#define RBD_SUFFIX ".rbd"
+
+#define RBD_DIRECTORY "rbd_directory"
+#define RBD_INFO "rbd_info"
+
+#define RBD_DEFAULT_OBJ_ORDER 22 /* 4MB */
+#define RBD_MIN_OBJ_ORDER 16
+#define RBD_MAX_OBJ_ORDER 30
+
+#define RBD_COMP_NONE 0
+#define RBD_CRYPT_NONE 0
+
+#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n"
+#define RBD_HEADER_SIGNATURE "RBD"
+#define RBD_HEADER_VERSION "001.005"
+
+struct rbd_image_snap_ondisk {
+ __le64 id;
+ __le64 image_size;
+} __attribute__((packed));
+
+struct rbd_image_header_ondisk {
+ char text[40];
+ char object_prefix[24];
+ char signature[4];
+ char version[8];
+ struct {
+ __u8 order;
+ __u8 crypt_type;
+ __u8 comp_type;
+ __u8 unused;
+ } __attribute__((packed)) options;
+ __le64 image_size;
+ __le64 snap_seq;
+ __le32 snap_count;
+ __le32 reserved;
+ __le64 snap_names_len;
+ struct rbd_image_snap_ondisk snaps[0];
+} __attribute__((packed));
+
+
+#endif
diff --git a/drivers/block/rsxx/Makefile b/drivers/block/rsxx/Makefile
new file mode 100644
index 000000000..b1c53c0aa
--- /dev/null
+++ b/drivers/block/rsxx/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_BLK_DEV_RSXX) += rsxx.o
+rsxx-objs := config.o core.o cregs.o dev.o dma.o
diff --git a/drivers/block/rsxx/config.c b/drivers/block/rsxx/config.c
new file mode 100644
index 000000000..10cd530d3
--- /dev/null
+++ b/drivers/block/rsxx/config.c
@@ -0,0 +1,211 @@
+/*
+* Filename: config.c
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+* Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <linux/types.h>
+#include <linux/crc32.h>
+#include <linux/swab.h>
+
+#include "rsxx_priv.h"
+#include "rsxx_cfg.h"
+
+static void initialize_config(struct rsxx_card_cfg *cfg)
+{
+ cfg->hdr.version = RSXX_CFG_VERSION;
+
+ cfg->data.block_size = RSXX_HW_BLK_SIZE;
+ cfg->data.stripe_size = RSXX_HW_BLK_SIZE;
+ cfg->data.vendor_id = RSXX_VENDOR_ID_IBM;
+ cfg->data.cache_order = (-1);
+ cfg->data.intr_coal.mode = RSXX_INTR_COAL_DISABLED;
+ cfg->data.intr_coal.count = 0;
+ cfg->data.intr_coal.latency = 0;
+}
+
+static u32 config_data_crc32(struct rsxx_card_cfg *cfg)
+{
+ /*
+ * Return the compliment of the CRC to ensure compatibility
+ * (i.e. this is how early rsxx drivers did it.)
+ */
+
+ return ~crc32(~0, &cfg->data, sizeof(cfg->data));
+}
+
+
+/*----------------- Config Byte Swap Functions -------------------*/
+static void config_hdr_be_to_cpu(struct card_cfg_hdr *hdr)
+{
+ hdr->version = be32_to_cpu((__force __be32) hdr->version);
+ hdr->crc = be32_to_cpu((__force __be32) hdr->crc);
+}
+
+static void config_hdr_cpu_to_be(struct card_cfg_hdr *hdr)
+{
+ hdr->version = (__force u32) cpu_to_be32(hdr->version);
+ hdr->crc = (__force u32) cpu_to_be32(hdr->crc);
+}
+
+static void config_data_swab(struct rsxx_card_cfg *cfg)
+{
+ u32 *data = (u32 *) &cfg->data;
+ int i;
+
+ for (i = 0; i < (sizeof(cfg->data) / 4); i++)
+ data[i] = swab32(data[i]);
+}
+
+static void config_data_le_to_cpu(struct rsxx_card_cfg *cfg)
+{
+ u32 *data = (u32 *) &cfg->data;
+ int i;
+
+ for (i = 0; i < (sizeof(cfg->data) / 4); i++)
+ data[i] = le32_to_cpu((__force __le32) data[i]);
+}
+
+static void config_data_cpu_to_le(struct rsxx_card_cfg *cfg)
+{
+ u32 *data = (u32 *) &cfg->data;
+ int i;
+
+ for (i = 0; i < (sizeof(cfg->data) / 4); i++)
+ data[i] = (__force u32) cpu_to_le32(data[i]);
+}
+
+
+/*----------------- Config Operations ------------------*/
+static int rsxx_save_config(struct rsxx_cardinfo *card)
+{
+ struct rsxx_card_cfg cfg;
+ int st;
+
+ memcpy(&cfg, &card->config, sizeof(cfg));
+
+ if (unlikely(cfg.hdr.version != RSXX_CFG_VERSION)) {
+ dev_err(CARD_TO_DEV(card),
+ "Cannot save config with invalid version %d\n",
+ cfg.hdr.version);
+ return -EINVAL;
+ }
+
+ /* Convert data to little endian for the CRC calculation. */
+ config_data_cpu_to_le(&cfg);
+
+ cfg.hdr.crc = config_data_crc32(&cfg);
+
+ /*
+ * Swap the data from little endian to big endian so it can be
+ * stored.
+ */
+ config_data_swab(&cfg);
+ config_hdr_cpu_to_be(&cfg.hdr);
+
+ st = rsxx_creg_write(card, CREG_ADD_CONFIG, sizeof(cfg), &cfg, 1);
+ if (st)
+ return st;
+
+ return 0;
+}
+
+int rsxx_load_config(struct rsxx_cardinfo *card)
+{
+ int st;
+ u32 crc;
+
+ st = rsxx_creg_read(card, CREG_ADD_CONFIG, sizeof(card->config),
+ &card->config, 1);
+ if (st) {
+ dev_err(CARD_TO_DEV(card),
+ "Failed reading card config.\n");
+ return st;
+ }
+
+ config_hdr_be_to_cpu(&card->config.hdr);
+
+ if (card->config.hdr.version == RSXX_CFG_VERSION) {
+ /*
+ * We calculate the CRC with the data in little endian, because
+ * early drivers did not take big endian CPUs into account.
+ * The data is always stored in big endian, so we need to byte
+ * swap it before calculating the CRC.
+ */
+
+ config_data_swab(&card->config);
+
+ /* Check the CRC */
+ crc = config_data_crc32(&card->config);
+ if (crc != card->config.hdr.crc) {
+ dev_err(CARD_TO_DEV(card),
+ "Config corruption detected!\n");
+ dev_info(CARD_TO_DEV(card),
+ "CRC (sb x%08x is x%08x)\n",
+ card->config.hdr.crc, crc);
+ return -EIO;
+ }
+
+ /* Convert the data to CPU byteorder */
+ config_data_le_to_cpu(&card->config);
+
+ } else if (card->config.hdr.version != 0) {
+ dev_err(CARD_TO_DEV(card),
+ "Invalid config version %d.\n",
+ card->config.hdr.version);
+ /*
+ * Config version changes require special handling from the
+ * user
+ */
+ return -EINVAL;
+ } else {
+ dev_info(CARD_TO_DEV(card),
+ "Initializing card configuration.\n");
+ initialize_config(&card->config);
+ st = rsxx_save_config(card);
+ if (st)
+ return st;
+ }
+
+ card->config_valid = 1;
+
+ dev_dbg(CARD_TO_DEV(card), "version: x%08x\n",
+ card->config.hdr.version);
+ dev_dbg(CARD_TO_DEV(card), "crc: x%08x\n",
+ card->config.hdr.crc);
+ dev_dbg(CARD_TO_DEV(card), "block_size: x%08x\n",
+ card->config.data.block_size);
+ dev_dbg(CARD_TO_DEV(card), "stripe_size: x%08x\n",
+ card->config.data.stripe_size);
+ dev_dbg(CARD_TO_DEV(card), "vendor_id: x%08x\n",
+ card->config.data.vendor_id);
+ dev_dbg(CARD_TO_DEV(card), "cache_order: x%08x\n",
+ card->config.data.cache_order);
+ dev_dbg(CARD_TO_DEV(card), "mode: x%08x\n",
+ card->config.data.intr_coal.mode);
+ dev_dbg(CARD_TO_DEV(card), "count: x%08x\n",
+ card->config.data.intr_coal.count);
+ dev_dbg(CARD_TO_DEV(card), "latency: x%08x\n",
+ card->config.data.intr_coal.latency);
+
+ return 0;
+}
+
diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c
new file mode 100644
index 000000000..d8b2488aa
--- /dev/null
+++ b/drivers/block/rsxx/core.c
@@ -0,0 +1,1144 @@
+/*
+* Filename: core.c
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+* Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/reboot.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/delay.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include <linux/genhd.h>
+#include <linux/idr.h>
+
+#include "rsxx_priv.h"
+#include "rsxx_cfg.h"
+
+#define NO_LEGACY 0
+#define SYNC_START_TIMEOUT (10 * 60) /* 10 minutes */
+
+MODULE_DESCRIPTION("IBM Flash Adapter 900GB Full Height Device Driver");
+MODULE_AUTHOR("Joshua Morris/Philip Kelleher, IBM");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRIVER_VERSION);
+
+static unsigned int force_legacy = NO_LEGACY;
+module_param(force_legacy, uint, 0444);
+MODULE_PARM_DESC(force_legacy, "Force the use of legacy type PCI interrupts");
+
+static unsigned int sync_start = 1;
+module_param(sync_start, uint, 0444);
+MODULE_PARM_DESC(sync_start, "On by Default: Driver load will not complete "
+ "until the card startup has completed.");
+
+static DEFINE_IDA(rsxx_disk_ida);
+static DEFINE_SPINLOCK(rsxx_ida_lock);
+
+/* --------------------Debugfs Setup ------------------- */
+
+static int rsxx_attr_pci_regs_show(struct seq_file *m, void *p)
+{
+ struct rsxx_cardinfo *card = m->private;
+
+ seq_printf(m, "HWID 0x%08x\n",
+ ioread32(card->regmap + HWID));
+ seq_printf(m, "SCRATCH 0x%08x\n",
+ ioread32(card->regmap + SCRATCH));
+ seq_printf(m, "IER 0x%08x\n",
+ ioread32(card->regmap + IER));
+ seq_printf(m, "IPR 0x%08x\n",
+ ioread32(card->regmap + IPR));
+ seq_printf(m, "CREG_CMD 0x%08x\n",
+ ioread32(card->regmap + CREG_CMD));
+ seq_printf(m, "CREG_ADD 0x%08x\n",
+ ioread32(card->regmap + CREG_ADD));
+ seq_printf(m, "CREG_CNT 0x%08x\n",
+ ioread32(card->regmap + CREG_CNT));
+ seq_printf(m, "CREG_STAT 0x%08x\n",
+ ioread32(card->regmap + CREG_STAT));
+ seq_printf(m, "CREG_DATA0 0x%08x\n",
+ ioread32(card->regmap + CREG_DATA0));
+ seq_printf(m, "CREG_DATA1 0x%08x\n",
+ ioread32(card->regmap + CREG_DATA1));
+ seq_printf(m, "CREG_DATA2 0x%08x\n",
+ ioread32(card->regmap + CREG_DATA2));
+ seq_printf(m, "CREG_DATA3 0x%08x\n",
+ ioread32(card->regmap + CREG_DATA3));
+ seq_printf(m, "CREG_DATA4 0x%08x\n",
+ ioread32(card->regmap + CREG_DATA4));
+ seq_printf(m, "CREG_DATA5 0x%08x\n",
+ ioread32(card->regmap + CREG_DATA5));
+ seq_printf(m, "CREG_DATA6 0x%08x\n",
+ ioread32(card->regmap + CREG_DATA6));
+ seq_printf(m, "CREG_DATA7 0x%08x\n",
+ ioread32(card->regmap + CREG_DATA7));
+ seq_printf(m, "INTR_COAL 0x%08x\n",
+ ioread32(card->regmap + INTR_COAL));
+ seq_printf(m, "HW_ERROR 0x%08x\n",
+ ioread32(card->regmap + HW_ERROR));
+ seq_printf(m, "DEBUG0 0x%08x\n",
+ ioread32(card->regmap + PCI_DEBUG0));
+ seq_printf(m, "DEBUG1 0x%08x\n",
+ ioread32(card->regmap + PCI_DEBUG1));
+ seq_printf(m, "DEBUG2 0x%08x\n",
+ ioread32(card->regmap + PCI_DEBUG2));
+ seq_printf(m, "DEBUG3 0x%08x\n",
+ ioread32(card->regmap + PCI_DEBUG3));
+ seq_printf(m, "DEBUG4 0x%08x\n",
+ ioread32(card->regmap + PCI_DEBUG4));
+ seq_printf(m, "DEBUG5 0x%08x\n",
+ ioread32(card->regmap + PCI_DEBUG5));
+ seq_printf(m, "DEBUG6 0x%08x\n",
+ ioread32(card->regmap + PCI_DEBUG6));
+ seq_printf(m, "DEBUG7 0x%08x\n",
+ ioread32(card->regmap + PCI_DEBUG7));
+ seq_printf(m, "RECONFIG 0x%08x\n",
+ ioread32(card->regmap + PCI_RECONFIG));
+
+ return 0;
+}
+
+static int rsxx_attr_stats_show(struct seq_file *m, void *p)
+{
+ struct rsxx_cardinfo *card = m->private;
+ int i;
+
+ for (i = 0; i < card->n_targets; i++) {
+ seq_printf(m, "Ctrl %d CRC Errors = %d\n",
+ i, card->ctrl[i].stats.crc_errors);
+ seq_printf(m, "Ctrl %d Hard Errors = %d\n",
+ i, card->ctrl[i].stats.hard_errors);
+ seq_printf(m, "Ctrl %d Soft Errors = %d\n",
+ i, card->ctrl[i].stats.soft_errors);
+ seq_printf(m, "Ctrl %d Writes Issued = %d\n",
+ i, card->ctrl[i].stats.writes_issued);
+ seq_printf(m, "Ctrl %d Writes Failed = %d\n",
+ i, card->ctrl[i].stats.writes_failed);
+ seq_printf(m, "Ctrl %d Reads Issued = %d\n",
+ i, card->ctrl[i].stats.reads_issued);
+ seq_printf(m, "Ctrl %d Reads Failed = %d\n",
+ i, card->ctrl[i].stats.reads_failed);
+ seq_printf(m, "Ctrl %d Reads Retried = %d\n",
+ i, card->ctrl[i].stats.reads_retried);
+ seq_printf(m, "Ctrl %d Discards Issued = %d\n",
+ i, card->ctrl[i].stats.discards_issued);
+ seq_printf(m, "Ctrl %d Discards Failed = %d\n",
+ i, card->ctrl[i].stats.discards_failed);
+ seq_printf(m, "Ctrl %d DMA SW Errors = %d\n",
+ i, card->ctrl[i].stats.dma_sw_err);
+ seq_printf(m, "Ctrl %d DMA HW Faults = %d\n",
+ i, card->ctrl[i].stats.dma_hw_fault);
+ seq_printf(m, "Ctrl %d DMAs Cancelled = %d\n",
+ i, card->ctrl[i].stats.dma_cancelled);
+ seq_printf(m, "Ctrl %d SW Queue Depth = %d\n",
+ i, card->ctrl[i].stats.sw_q_depth);
+ seq_printf(m, "Ctrl %d HW Queue Depth = %d\n",
+ i, atomic_read(&card->ctrl[i].stats.hw_q_depth));
+ }
+
+ return 0;
+}
+
+static int rsxx_attr_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rsxx_attr_stats_show, inode->i_private);
+}
+
+static int rsxx_attr_pci_regs_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rsxx_attr_pci_regs_show, inode->i_private);
+}
+
+static ssize_t rsxx_cram_read(struct file *fp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct rsxx_cardinfo *card = file_inode(fp)->i_private;
+ char *buf;
+ ssize_t st;
+
+ buf = kzalloc(cnt, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ st = rsxx_creg_read(card, CREG_ADD_CRAM + (u32)*ppos, cnt, buf, 1);
+ if (!st)
+ st = copy_to_user(ubuf, buf, cnt);
+ kfree(buf);
+ if (st)
+ return st;
+ *ppos += cnt;
+ return cnt;
+}
+
+static ssize_t rsxx_cram_write(struct file *fp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct rsxx_cardinfo *card = file_inode(fp)->i_private;
+ char *buf;
+ ssize_t st;
+
+ buf = kzalloc(cnt, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ st = copy_from_user(buf, ubuf, cnt);
+ if (!st)
+ st = rsxx_creg_write(card, CREG_ADD_CRAM + (u32)*ppos, cnt,
+ buf, 1);
+ kfree(buf);
+ if (st)
+ return st;
+ *ppos += cnt;
+ return cnt;
+}
+
+static const struct file_operations debugfs_cram_fops = {
+ .owner = THIS_MODULE,
+ .read = rsxx_cram_read,
+ .write = rsxx_cram_write,
+};
+
+static const struct file_operations debugfs_stats_fops = {
+ .owner = THIS_MODULE,
+ .open = rsxx_attr_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static const struct file_operations debugfs_pci_regs_fops = {
+ .owner = THIS_MODULE,
+ .open = rsxx_attr_pci_regs_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void rsxx_debugfs_dev_new(struct rsxx_cardinfo *card)
+{
+ struct dentry *debugfs_stats;
+ struct dentry *debugfs_pci_regs;
+ struct dentry *debugfs_cram;
+
+ card->debugfs_dir = debugfs_create_dir(card->gendisk->disk_name, NULL);
+ if (IS_ERR_OR_NULL(card->debugfs_dir))
+ goto failed_debugfs_dir;
+
+ debugfs_stats = debugfs_create_file("stats", S_IRUGO,
+ card->debugfs_dir, card,
+ &debugfs_stats_fops);
+ if (IS_ERR_OR_NULL(debugfs_stats))
+ goto failed_debugfs_stats;
+
+ debugfs_pci_regs = debugfs_create_file("pci_regs", S_IRUGO,
+ card->debugfs_dir, card,
+ &debugfs_pci_regs_fops);
+ if (IS_ERR_OR_NULL(debugfs_pci_regs))
+ goto failed_debugfs_pci_regs;
+
+ debugfs_cram = debugfs_create_file("cram", S_IRUGO | S_IWUSR,
+ card->debugfs_dir, card,
+ &debugfs_cram_fops);
+ if (IS_ERR_OR_NULL(debugfs_cram))
+ goto failed_debugfs_cram;
+
+ return;
+failed_debugfs_cram:
+ debugfs_remove(debugfs_pci_regs);
+failed_debugfs_pci_regs:
+ debugfs_remove(debugfs_stats);
+failed_debugfs_stats:
+ debugfs_remove(card->debugfs_dir);
+failed_debugfs_dir:
+ card->debugfs_dir = NULL;
+}
+
+/*----------------- Interrupt Control & Handling -------------------*/
+
+static void rsxx_mask_interrupts(struct rsxx_cardinfo *card)
+{
+ card->isr_mask = 0;
+ card->ier_mask = 0;
+}
+
+static void __enable_intr(unsigned int *mask, unsigned int intr)
+{
+ *mask |= intr;
+}
+
+static void __disable_intr(unsigned int *mask, unsigned int intr)
+{
+ *mask &= ~intr;
+}
+
+/*
+ * NOTE: Disabling the IER will disable the hardware interrupt.
+ * Disabling the ISR will disable the software handling of the ISR bit.
+ *
+ * Enable/Disable interrupt functions assume the card->irq_lock
+ * is held by the caller.
+ */
+void rsxx_enable_ier(struct rsxx_cardinfo *card, unsigned int intr)
+{
+ if (unlikely(card->halt) ||
+ unlikely(card->eeh_state))
+ return;
+
+ __enable_intr(&card->ier_mask, intr);
+ iowrite32(card->ier_mask, card->regmap + IER);
+}
+
+void rsxx_disable_ier(struct rsxx_cardinfo *card, unsigned int intr)
+{
+ if (unlikely(card->eeh_state))
+ return;
+
+ __disable_intr(&card->ier_mask, intr);
+ iowrite32(card->ier_mask, card->regmap + IER);
+}
+
+void rsxx_enable_ier_and_isr(struct rsxx_cardinfo *card,
+ unsigned int intr)
+{
+ if (unlikely(card->halt) ||
+ unlikely(card->eeh_state))
+ return;
+
+ __enable_intr(&card->isr_mask, intr);
+ __enable_intr(&card->ier_mask, intr);
+ iowrite32(card->ier_mask, card->regmap + IER);
+}
+void rsxx_disable_ier_and_isr(struct rsxx_cardinfo *card,
+ unsigned int intr)
+{
+ if (unlikely(card->eeh_state))
+ return;
+
+ __disable_intr(&card->isr_mask, intr);
+ __disable_intr(&card->ier_mask, intr);
+ iowrite32(card->ier_mask, card->regmap + IER);
+}
+
+static irqreturn_t rsxx_isr(int irq, void *pdata)
+{
+ struct rsxx_cardinfo *card = pdata;
+ unsigned int isr;
+ int handled = 0;
+ int reread_isr;
+ int i;
+
+ spin_lock(&card->irq_lock);
+
+ do {
+ reread_isr = 0;
+
+ if (unlikely(card->eeh_state))
+ break;
+
+ isr = ioread32(card->regmap + ISR);
+ if (isr == 0xffffffff) {
+ /*
+ * A few systems seem to have an intermittent issue
+ * where PCI reads return all Fs, but retrying the read
+ * a little later will return as expected.
+ */
+ dev_info(CARD_TO_DEV(card),
+ "ISR = 0xFFFFFFFF, retrying later\n");
+ break;
+ }
+
+ isr &= card->isr_mask;
+ if (!isr)
+ break;
+
+ for (i = 0; i < card->n_targets; i++) {
+ if (isr & CR_INTR_DMA(i)) {
+ if (card->ier_mask & CR_INTR_DMA(i)) {
+ rsxx_disable_ier(card, CR_INTR_DMA(i));
+ reread_isr = 1;
+ }
+ queue_work(card->ctrl[i].done_wq,
+ &card->ctrl[i].dma_done_work);
+ handled++;
+ }
+ }
+
+ if (isr & CR_INTR_CREG) {
+ queue_work(card->creg_ctrl.creg_wq,
+ &card->creg_ctrl.done_work);
+ handled++;
+ }
+
+ if (isr & CR_INTR_EVENT) {
+ queue_work(card->event_wq, &card->event_work);
+ rsxx_disable_ier_and_isr(card, CR_INTR_EVENT);
+ handled++;
+ }
+ } while (reread_isr);
+
+ spin_unlock(&card->irq_lock);
+
+ return handled ? IRQ_HANDLED : IRQ_NONE;
+}
+
+/*----------------- Card Event Handler -------------------*/
+static const char * const rsxx_card_state_to_str(unsigned int state)
+{
+ static const char * const state_strings[] = {
+ "Unknown", "Shutdown", "Starting", "Formatting",
+ "Uninitialized", "Good", "Shutting Down",
+ "Fault", "Read Only Fault", "dStroying"
+ };
+
+ return state_strings[ffs(state)];
+}
+
+static void card_state_change(struct rsxx_cardinfo *card,
+ unsigned int new_state)
+{
+ int st;
+
+ dev_info(CARD_TO_DEV(card),
+ "card state change detected.(%s -> %s)\n",
+ rsxx_card_state_to_str(card->state),
+ rsxx_card_state_to_str(new_state));
+
+ card->state = new_state;
+
+ /* Don't attach DMA interfaces if the card has an invalid config */
+ if (!card->config_valid)
+ return;
+
+ switch (new_state) {
+ case CARD_STATE_RD_ONLY_FAULT:
+ dev_crit(CARD_TO_DEV(card),
+ "Hardware has entered read-only mode!\n");
+ /*
+ * Fall through so the DMA devices can be attached and
+ * the user can attempt to pull off their data.
+ */
+ case CARD_STATE_GOOD:
+ st = rsxx_get_card_size8(card, &card->size8);
+ if (st)
+ dev_err(CARD_TO_DEV(card),
+ "Failed attaching DMA devices\n");
+
+ if (card->config_valid)
+ set_capacity(card->gendisk, card->size8 >> 9);
+ break;
+
+ case CARD_STATE_FAULT:
+ dev_crit(CARD_TO_DEV(card),
+ "Hardware Fault reported!\n");
+ /* Fall through. */
+
+ /* Everything else, detach DMA interface if it's attached. */
+ case CARD_STATE_SHUTDOWN:
+ case CARD_STATE_STARTING:
+ case CARD_STATE_FORMATTING:
+ case CARD_STATE_UNINITIALIZED:
+ case CARD_STATE_SHUTTING_DOWN:
+ /*
+ * dStroy is a term coined by marketing to represent the low level
+ * secure erase.
+ */
+ case CARD_STATE_DSTROYING:
+ set_capacity(card->gendisk, 0);
+ break;
+ }
+}
+
+static void card_event_handler(struct work_struct *work)
+{
+ struct rsxx_cardinfo *card;
+ unsigned int state;
+ unsigned long flags;
+ int st;
+
+ card = container_of(work, struct rsxx_cardinfo, event_work);
+
+ if (unlikely(card->halt))
+ return;
+
+ /*
+ * Enable the interrupt now to avoid any weird race conditions where a
+ * state change might occur while rsxx_get_card_state() is
+ * processing a returned creg cmd.
+ */
+ spin_lock_irqsave(&card->irq_lock, flags);
+ rsxx_enable_ier_and_isr(card, CR_INTR_EVENT);
+ spin_unlock_irqrestore(&card->irq_lock, flags);
+
+ st = rsxx_get_card_state(card, &state);
+ if (st) {
+ dev_info(CARD_TO_DEV(card),
+ "Failed reading state after event.\n");
+ return;
+ }
+
+ if (card->state != state)
+ card_state_change(card, state);
+
+ if (card->creg_ctrl.creg_stats.stat & CREG_STAT_LOG_PENDING)
+ rsxx_read_hw_log(card);
+}
+
+/*----------------- Card Operations -------------------*/
+static int card_shutdown(struct rsxx_cardinfo *card)
+{
+ unsigned int state;
+ signed long start;
+ const int timeout = msecs_to_jiffies(120000);
+ int st;
+
+ /* We can't issue a shutdown if the card is in a transition state */
+ start = jiffies;
+ do {
+ st = rsxx_get_card_state(card, &state);
+ if (st)
+ return st;
+ } while (state == CARD_STATE_STARTING &&
+ (jiffies - start < timeout));
+
+ if (state == CARD_STATE_STARTING)
+ return -ETIMEDOUT;
+
+ /* Only issue a shutdown if we need to */
+ if ((state != CARD_STATE_SHUTTING_DOWN) &&
+ (state != CARD_STATE_SHUTDOWN)) {
+ st = rsxx_issue_card_cmd(card, CARD_CMD_SHUTDOWN);
+ if (st)
+ return st;
+ }
+
+ start = jiffies;
+ do {
+ st = rsxx_get_card_state(card, &state);
+ if (st)
+ return st;
+ } while (state != CARD_STATE_SHUTDOWN &&
+ (jiffies - start < timeout));
+
+ if (state != CARD_STATE_SHUTDOWN)
+ return -ETIMEDOUT;
+
+ return 0;
+}
+
+static int rsxx_eeh_frozen(struct pci_dev *dev)
+{
+ struct rsxx_cardinfo *card = pci_get_drvdata(dev);
+ int i;
+ int st;
+
+ dev_warn(&dev->dev, "IBM Flash Adapter PCI: preparing for slot reset.\n");
+
+ card->eeh_state = 1;
+ rsxx_mask_interrupts(card);
+
+ /*
+ * We need to guarantee that the write for eeh_state and masking
+ * interrupts does not become reordered. This will prevent a possible
+ * race condition with the EEH code.
+ */
+ wmb();
+
+ pci_disable_device(dev);
+
+ st = rsxx_eeh_save_issued_dmas(card);
+ if (st)
+ return st;
+
+ rsxx_eeh_save_issued_creg(card);
+
+ for (i = 0; i < card->n_targets; i++) {
+ if (card->ctrl[i].status.buf)
+ pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8,
+ card->ctrl[i].status.buf,
+ card->ctrl[i].status.dma_addr);
+ if (card->ctrl[i].cmd.buf)
+ pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8,
+ card->ctrl[i].cmd.buf,
+ card->ctrl[i].cmd.dma_addr);
+ }
+
+ return 0;
+}
+
+static void rsxx_eeh_failure(struct pci_dev *dev)
+{
+ struct rsxx_cardinfo *card = pci_get_drvdata(dev);
+ int i;
+ int cnt = 0;
+
+ dev_err(&dev->dev, "IBM Flash Adapter PCI: disabling failed card.\n");
+
+ card->eeh_state = 1;
+ card->halt = 1;
+
+ for (i = 0; i < card->n_targets; i++) {
+ spin_lock_bh(&card->ctrl[i].queue_lock);
+ cnt = rsxx_cleanup_dma_queue(&card->ctrl[i],
+ &card->ctrl[i].queue,
+ COMPLETE_DMA);
+ spin_unlock_bh(&card->ctrl[i].queue_lock);
+
+ cnt += rsxx_dma_cancel(&card->ctrl[i]);
+
+ if (cnt)
+ dev_info(CARD_TO_DEV(card),
+ "Freed %d queued DMAs on channel %d\n",
+ cnt, card->ctrl[i].id);
+ }
+}
+
+static int rsxx_eeh_fifo_flush_poll(struct rsxx_cardinfo *card)
+{
+ unsigned int status;
+ int iter = 0;
+
+ /* We need to wait for the hardware to reset */
+ while (iter++ < 10) {
+ status = ioread32(card->regmap + PCI_RECONFIG);
+
+ if (status & RSXX_FLUSH_BUSY) {
+ ssleep(1);
+ continue;
+ }
+
+ if (status & RSXX_FLUSH_TIMEOUT)
+ dev_warn(CARD_TO_DEV(card), "HW: flash controller timeout\n");
+ return 0;
+ }
+
+ /* Hardware failed resetting itself. */
+ return -1;
+}
+
+static pci_ers_result_t rsxx_error_detected(struct pci_dev *dev,
+ enum pci_channel_state error)
+{
+ int st;
+
+ if (dev->revision < RSXX_EEH_SUPPORT)
+ return PCI_ERS_RESULT_NONE;
+
+ if (error == pci_channel_io_perm_failure) {
+ rsxx_eeh_failure(dev);
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
+
+ st = rsxx_eeh_frozen(dev);
+ if (st) {
+ dev_err(&dev->dev, "Slot reset setup failed\n");
+ rsxx_eeh_failure(dev);
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
+
+ return PCI_ERS_RESULT_NEED_RESET;
+}
+
+static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev)
+{
+ struct rsxx_cardinfo *card = pci_get_drvdata(dev);
+ unsigned long flags;
+ int i;
+ int st;
+
+ dev_warn(&dev->dev,
+ "IBM Flash Adapter PCI: recovering from slot reset.\n");
+
+ st = pci_enable_device(dev);
+ if (st)
+ goto failed_hw_setup;
+
+ pci_set_master(dev);
+
+ st = rsxx_eeh_fifo_flush_poll(card);
+ if (st)
+ goto failed_hw_setup;
+
+ rsxx_dma_queue_reset(card);
+
+ for (i = 0; i < card->n_targets; i++) {
+ st = rsxx_hw_buffers_init(dev, &card->ctrl[i]);
+ if (st)
+ goto failed_hw_buffers_init;
+ }
+
+ if (card->config_valid)
+ rsxx_dma_configure(card);
+
+ /* Clears the ISR register from spurious interrupts */
+ st = ioread32(card->regmap + ISR);
+
+ card->eeh_state = 0;
+
+ spin_lock_irqsave(&card->irq_lock, flags);
+ if (card->n_targets & RSXX_MAX_TARGETS)
+ rsxx_enable_ier_and_isr(card, CR_INTR_ALL_G);
+ else
+ rsxx_enable_ier_and_isr(card, CR_INTR_ALL_C);
+ spin_unlock_irqrestore(&card->irq_lock, flags);
+
+ rsxx_kick_creg_queue(card);
+
+ for (i = 0; i < card->n_targets; i++) {
+ spin_lock(&card->ctrl[i].queue_lock);
+ if (list_empty(&card->ctrl[i].queue)) {
+ spin_unlock(&card->ctrl[i].queue_lock);
+ continue;
+ }
+ spin_unlock(&card->ctrl[i].queue_lock);
+
+ queue_work(card->ctrl[i].issue_wq,
+ &card->ctrl[i].issue_dma_work);
+ }
+
+ dev_info(&dev->dev, "IBM Flash Adapter PCI: recovery complete.\n");
+
+ return PCI_ERS_RESULT_RECOVERED;
+
+failed_hw_buffers_init:
+ for (i = 0; i < card->n_targets; i++) {
+ if (card->ctrl[i].status.buf)
+ pci_free_consistent(card->dev,
+ STATUS_BUFFER_SIZE8,
+ card->ctrl[i].status.buf,
+ card->ctrl[i].status.dma_addr);
+ if (card->ctrl[i].cmd.buf)
+ pci_free_consistent(card->dev,
+ COMMAND_BUFFER_SIZE8,
+ card->ctrl[i].cmd.buf,
+ card->ctrl[i].cmd.dma_addr);
+ }
+failed_hw_setup:
+ rsxx_eeh_failure(dev);
+ return PCI_ERS_RESULT_DISCONNECT;
+
+}
+
+/*----------------- Driver Initialization & Setup -------------------*/
+/* Returns: 0 if the driver is compatible with the device
+ -1 if the driver is NOT compatible with the device */
+static int rsxx_compatibility_check(struct rsxx_cardinfo *card)
+{
+ unsigned char pci_rev;
+
+ pci_read_config_byte(card->dev, PCI_REVISION_ID, &pci_rev);
+
+ if (pci_rev > RS70_PCI_REV_SUPPORTED)
+ return -1;
+ return 0;
+}
+
+static int rsxx_pci_probe(struct pci_dev *dev,
+ const struct pci_device_id *id)
+{
+ struct rsxx_cardinfo *card;
+ int st;
+ unsigned int sync_timeout;
+
+ dev_info(&dev->dev, "PCI-Flash SSD discovered\n");
+
+ card = kzalloc(sizeof(*card), GFP_KERNEL);
+ if (!card)
+ return -ENOMEM;
+
+ card->dev = dev;
+ pci_set_drvdata(dev, card);
+
+ do {
+ if (!ida_pre_get(&rsxx_disk_ida, GFP_KERNEL)) {
+ st = -ENOMEM;
+ goto failed_ida_get;
+ }
+
+ spin_lock(&rsxx_ida_lock);
+ st = ida_get_new(&rsxx_disk_ida, &card->disk_id);
+ spin_unlock(&rsxx_ida_lock);
+ } while (st == -EAGAIN);
+
+ if (st)
+ goto failed_ida_get;
+
+ st = pci_enable_device(dev);
+ if (st)
+ goto failed_enable;
+
+ pci_set_master(dev);
+ pci_set_dma_max_seg_size(dev, RSXX_HW_BLK_SIZE);
+
+ st = pci_set_dma_mask(dev, DMA_BIT_MASK(64));
+ if (st) {
+ dev_err(CARD_TO_DEV(card),
+ "No usable DMA configuration,aborting\n");
+ goto failed_dma_mask;
+ }
+
+ st = pci_request_regions(dev, DRIVER_NAME);
+ if (st) {
+ dev_err(CARD_TO_DEV(card),
+ "Failed to request memory region\n");
+ goto failed_request_regions;
+ }
+
+ if (pci_resource_len(dev, 0) == 0) {
+ dev_err(CARD_TO_DEV(card), "BAR0 has length 0!\n");
+ st = -ENOMEM;
+ goto failed_iomap;
+ }
+
+ card->regmap = pci_iomap(dev, 0, 0);
+ if (!card->regmap) {
+ dev_err(CARD_TO_DEV(card), "Failed to map BAR0\n");
+ st = -ENOMEM;
+ goto failed_iomap;
+ }
+
+ spin_lock_init(&card->irq_lock);
+ card->halt = 0;
+ card->eeh_state = 0;
+
+ spin_lock_irq(&card->irq_lock);
+ rsxx_disable_ier_and_isr(card, CR_INTR_ALL);
+ spin_unlock_irq(&card->irq_lock);
+
+ if (!force_legacy) {
+ st = pci_enable_msi(dev);
+ if (st)
+ dev_warn(CARD_TO_DEV(card),
+ "Failed to enable MSI\n");
+ }
+
+ st = request_irq(dev->irq, rsxx_isr, IRQF_SHARED,
+ DRIVER_NAME, card);
+ if (st) {
+ dev_err(CARD_TO_DEV(card),
+ "Failed requesting IRQ%d\n", dev->irq);
+ goto failed_irq;
+ }
+
+ /************* Setup Processor Command Interface *************/
+ st = rsxx_creg_setup(card);
+ if (st) {
+ dev_err(CARD_TO_DEV(card), "Failed to setup creg interface.\n");
+ goto failed_creg_setup;
+ }
+
+ spin_lock_irq(&card->irq_lock);
+ rsxx_enable_ier_and_isr(card, CR_INTR_CREG);
+ spin_unlock_irq(&card->irq_lock);
+
+ st = rsxx_compatibility_check(card);
+ if (st) {
+ dev_warn(CARD_TO_DEV(card),
+ "Incompatible driver detected. Please update the driver.\n");
+ st = -EINVAL;
+ goto failed_compatiblity_check;
+ }
+
+ /************* Load Card Config *************/
+ st = rsxx_load_config(card);
+ if (st)
+ dev_err(CARD_TO_DEV(card),
+ "Failed loading card config\n");
+
+ /************* Setup DMA Engine *************/
+ st = rsxx_get_num_targets(card, &card->n_targets);
+ if (st)
+ dev_info(CARD_TO_DEV(card),
+ "Failed reading the number of DMA targets\n");
+
+ card->ctrl = kzalloc(card->n_targets * sizeof(*card->ctrl), GFP_KERNEL);
+ if (!card->ctrl) {
+ st = -ENOMEM;
+ goto failed_dma_setup;
+ }
+
+ st = rsxx_dma_setup(card);
+ if (st) {
+ dev_info(CARD_TO_DEV(card),
+ "Failed to setup DMA engine\n");
+ goto failed_dma_setup;
+ }
+
+ /************* Setup Card Event Handler *************/
+ card->event_wq = create_singlethread_workqueue(DRIVER_NAME"_event");
+ if (!card->event_wq) {
+ dev_err(CARD_TO_DEV(card), "Failed card event setup.\n");
+ goto failed_event_handler;
+ }
+
+ INIT_WORK(&card->event_work, card_event_handler);
+
+ st = rsxx_setup_dev(card);
+ if (st)
+ goto failed_create_dev;
+
+ rsxx_get_card_state(card, &card->state);
+
+ dev_info(CARD_TO_DEV(card),
+ "card state: %s\n",
+ rsxx_card_state_to_str(card->state));
+
+ /*
+ * Now that the DMA Engine and devices have been setup,
+ * we can enable the event interrupt(it kicks off actions in
+ * those layers so we couldn't enable it right away.)
+ */
+ spin_lock_irq(&card->irq_lock);
+ rsxx_enable_ier_and_isr(card, CR_INTR_EVENT);
+ spin_unlock_irq(&card->irq_lock);
+
+ if (card->state == CARD_STATE_SHUTDOWN) {
+ st = rsxx_issue_card_cmd(card, CARD_CMD_STARTUP);
+ if (st)
+ dev_crit(CARD_TO_DEV(card),
+ "Failed issuing card startup\n");
+ if (sync_start) {
+ sync_timeout = SYNC_START_TIMEOUT;
+
+ dev_info(CARD_TO_DEV(card),
+ "Waiting for card to startup\n");
+
+ do {
+ ssleep(1);
+ sync_timeout--;
+
+ rsxx_get_card_state(card, &card->state);
+ } while (sync_timeout &&
+ (card->state == CARD_STATE_STARTING));
+
+ if (card->state == CARD_STATE_STARTING) {
+ dev_warn(CARD_TO_DEV(card),
+ "Card startup timed out\n");
+ card->size8 = 0;
+ } else {
+ dev_info(CARD_TO_DEV(card),
+ "card state: %s\n",
+ rsxx_card_state_to_str(card->state));
+ st = rsxx_get_card_size8(card, &card->size8);
+ if (st)
+ card->size8 = 0;
+ }
+ }
+ } else if (card->state == CARD_STATE_GOOD ||
+ card->state == CARD_STATE_RD_ONLY_FAULT) {
+ st = rsxx_get_card_size8(card, &card->size8);
+ if (st)
+ card->size8 = 0;
+ }
+
+ rsxx_attach_dev(card);
+
+ /************* Setup Debugfs *************/
+ rsxx_debugfs_dev_new(card);
+
+ return 0;
+
+failed_create_dev:
+ destroy_workqueue(card->event_wq);
+ card->event_wq = NULL;
+failed_event_handler:
+ rsxx_dma_destroy(card);
+failed_dma_setup:
+failed_compatiblity_check:
+ destroy_workqueue(card->creg_ctrl.creg_wq);
+ card->creg_ctrl.creg_wq = NULL;
+failed_creg_setup:
+ spin_lock_irq(&card->irq_lock);
+ rsxx_disable_ier_and_isr(card, CR_INTR_ALL);
+ spin_unlock_irq(&card->irq_lock);
+ free_irq(dev->irq, card);
+ if (!force_legacy)
+ pci_disable_msi(dev);
+failed_irq:
+ pci_iounmap(dev, card->regmap);
+failed_iomap:
+ pci_release_regions(dev);
+failed_request_regions:
+failed_dma_mask:
+ pci_disable_device(dev);
+failed_enable:
+ spin_lock(&rsxx_ida_lock);
+ ida_remove(&rsxx_disk_ida, card->disk_id);
+ spin_unlock(&rsxx_ida_lock);
+failed_ida_get:
+ kfree(card);
+
+ return st;
+}
+
+static void rsxx_pci_remove(struct pci_dev *dev)
+{
+ struct rsxx_cardinfo *card = pci_get_drvdata(dev);
+ unsigned long flags;
+ int st;
+ int i;
+
+ if (!card)
+ return;
+
+ dev_info(CARD_TO_DEV(card),
+ "Removing PCI-Flash SSD.\n");
+
+ rsxx_detach_dev(card);
+
+ for (i = 0; i < card->n_targets; i++) {
+ spin_lock_irqsave(&card->irq_lock, flags);
+ rsxx_disable_ier_and_isr(card, CR_INTR_DMA(i));
+ spin_unlock_irqrestore(&card->irq_lock, flags);
+ }
+
+ st = card_shutdown(card);
+ if (st)
+ dev_crit(CARD_TO_DEV(card), "Shutdown failed!\n");
+
+ /* Sync outstanding event handlers. */
+ spin_lock_irqsave(&card->irq_lock, flags);
+ rsxx_disable_ier_and_isr(card, CR_INTR_EVENT);
+ spin_unlock_irqrestore(&card->irq_lock, flags);
+
+ cancel_work_sync(&card->event_work);
+
+ rsxx_destroy_dev(card);
+ rsxx_dma_destroy(card);
+
+ spin_lock_irqsave(&card->irq_lock, flags);
+ rsxx_disable_ier_and_isr(card, CR_INTR_ALL);
+ spin_unlock_irqrestore(&card->irq_lock, flags);
+
+ /* Prevent work_structs from re-queuing themselves. */
+ card->halt = 1;
+
+ debugfs_remove_recursive(card->debugfs_dir);
+
+ free_irq(dev->irq, card);
+
+ if (!force_legacy)
+ pci_disable_msi(dev);
+
+ rsxx_creg_destroy(card);
+
+ pci_iounmap(dev, card->regmap);
+
+ pci_disable_device(dev);
+ pci_release_regions(dev);
+
+ kfree(card);
+}
+
+static int rsxx_pci_suspend(struct pci_dev *dev, pm_message_t state)
+{
+ /* We don't support suspend at this time. */
+ return -ENOSYS;
+}
+
+static void rsxx_pci_shutdown(struct pci_dev *dev)
+{
+ struct rsxx_cardinfo *card = pci_get_drvdata(dev);
+ unsigned long flags;
+ int i;
+
+ if (!card)
+ return;
+
+ dev_info(CARD_TO_DEV(card), "Shutting down PCI-Flash SSD.\n");
+
+ rsxx_detach_dev(card);
+
+ for (i = 0; i < card->n_targets; i++) {
+ spin_lock_irqsave(&card->irq_lock, flags);
+ rsxx_disable_ier_and_isr(card, CR_INTR_DMA(i));
+ spin_unlock_irqrestore(&card->irq_lock, flags);
+ }
+
+ card_shutdown(card);
+}
+
+static const struct pci_error_handlers rsxx_err_handler = {
+ .error_detected = rsxx_error_detected,
+ .slot_reset = rsxx_slot_reset,
+};
+
+static const struct pci_device_id rsxx_pci_ids[] = {
+ {PCI_DEVICE(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_FS70_FLASH)},
+ {PCI_DEVICE(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_FS80_FLASH)},
+ {0,},
+};
+
+MODULE_DEVICE_TABLE(pci, rsxx_pci_ids);
+
+static struct pci_driver rsxx_pci_driver = {
+ .name = DRIVER_NAME,
+ .id_table = rsxx_pci_ids,
+ .probe = rsxx_pci_probe,
+ .remove = rsxx_pci_remove,
+ .suspend = rsxx_pci_suspend,
+ .shutdown = rsxx_pci_shutdown,
+ .err_handler = &rsxx_err_handler,
+};
+
+static int __init rsxx_core_init(void)
+{
+ int st;
+
+ st = rsxx_dev_init();
+ if (st)
+ return st;
+
+ st = rsxx_dma_init();
+ if (st)
+ goto dma_init_failed;
+
+ st = rsxx_creg_init();
+ if (st)
+ goto creg_init_failed;
+
+ return pci_register_driver(&rsxx_pci_driver);
+
+creg_init_failed:
+ rsxx_dma_cleanup();
+dma_init_failed:
+ rsxx_dev_cleanup();
+
+ return st;
+}
+
+static void __exit rsxx_core_cleanup(void)
+{
+ pci_unregister_driver(&rsxx_pci_driver);
+ rsxx_creg_cleanup();
+ rsxx_dma_cleanup();
+ rsxx_dev_cleanup();
+}
+
+module_init(rsxx_core_init);
+module_exit(rsxx_core_cleanup);
diff --git a/drivers/block/rsxx/cregs.c b/drivers/block/rsxx/cregs.c
new file mode 100644
index 000000000..926dce9c4
--- /dev/null
+++ b/drivers/block/rsxx/cregs.c
@@ -0,0 +1,804 @@
+/*
+* Filename: cregs.c
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+* Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <linux/completion.h>
+#include <linux/slab.h>
+
+#include "rsxx_priv.h"
+
+#define CREG_TIMEOUT_MSEC 10000
+
+typedef void (*creg_cmd_cb)(struct rsxx_cardinfo *card,
+ struct creg_cmd *cmd,
+ int st);
+
+struct creg_cmd {
+ struct list_head list;
+ creg_cmd_cb cb;
+ void *cb_private;
+ unsigned int op;
+ unsigned int addr;
+ int cnt8;
+ void *buf;
+ unsigned int stream;
+ unsigned int status;
+};
+
+static struct kmem_cache *creg_cmd_pool;
+
+
+/*------------ Private Functions --------------*/
+
+#if defined(__LITTLE_ENDIAN)
+#define LITTLE_ENDIAN 1
+#elif defined(__BIG_ENDIAN)
+#define LITTLE_ENDIAN 0
+#else
+#error Unknown endianess!!! Aborting...
+#endif
+
+static int copy_to_creg_data(struct rsxx_cardinfo *card,
+ int cnt8,
+ void *buf,
+ unsigned int stream)
+{
+ int i = 0;
+ u32 *data = buf;
+
+ if (unlikely(card->eeh_state))
+ return -EIO;
+
+ for (i = 0; cnt8 > 0; i++, cnt8 -= 4) {
+ /*
+ * Firmware implementation makes it necessary to byte swap on
+ * little endian processors.
+ */
+ if (LITTLE_ENDIAN && stream)
+ iowrite32be(data[i], card->regmap + CREG_DATA(i));
+ else
+ iowrite32(data[i], card->regmap + CREG_DATA(i));
+ }
+
+ return 0;
+}
+
+
+static int copy_from_creg_data(struct rsxx_cardinfo *card,
+ int cnt8,
+ void *buf,
+ unsigned int stream)
+{
+ int i = 0;
+ u32 *data = buf;
+
+ if (unlikely(card->eeh_state))
+ return -EIO;
+
+ for (i = 0; cnt8 > 0; i++, cnt8 -= 4) {
+ /*
+ * Firmware implementation makes it necessary to byte swap on
+ * little endian processors.
+ */
+ if (LITTLE_ENDIAN && stream)
+ data[i] = ioread32be(card->regmap + CREG_DATA(i));
+ else
+ data[i] = ioread32(card->regmap + CREG_DATA(i));
+ }
+
+ return 0;
+}
+
+static void creg_issue_cmd(struct rsxx_cardinfo *card, struct creg_cmd *cmd)
+{
+ int st;
+
+ if (unlikely(card->eeh_state))
+ return;
+
+ iowrite32(cmd->addr, card->regmap + CREG_ADD);
+ iowrite32(cmd->cnt8, card->regmap + CREG_CNT);
+
+ if (cmd->op == CREG_OP_WRITE) {
+ if (cmd->buf) {
+ st = copy_to_creg_data(card, cmd->cnt8,
+ cmd->buf, cmd->stream);
+ if (st)
+ return;
+ }
+ }
+
+ if (unlikely(card->eeh_state))
+ return;
+
+ /* Setting the valid bit will kick off the command. */
+ iowrite32(cmd->op, card->regmap + CREG_CMD);
+}
+
+static void creg_kick_queue(struct rsxx_cardinfo *card)
+{
+ if (card->creg_ctrl.active || list_empty(&card->creg_ctrl.queue))
+ return;
+
+ card->creg_ctrl.active = 1;
+ card->creg_ctrl.active_cmd = list_first_entry(&card->creg_ctrl.queue,
+ struct creg_cmd, list);
+ list_del(&card->creg_ctrl.active_cmd->list);
+ card->creg_ctrl.q_depth--;
+
+ /*
+ * We have to set the timer before we push the new command. Otherwise,
+ * we could create a race condition that would occur if the timer
+ * was not canceled, and expired after the new command was pushed,
+ * but before the command was issued to hardware.
+ */
+ mod_timer(&card->creg_ctrl.cmd_timer,
+ jiffies + msecs_to_jiffies(CREG_TIMEOUT_MSEC));
+
+ creg_issue_cmd(card, card->creg_ctrl.active_cmd);
+}
+
+static int creg_queue_cmd(struct rsxx_cardinfo *card,
+ unsigned int op,
+ unsigned int addr,
+ unsigned int cnt8,
+ void *buf,
+ int stream,
+ creg_cmd_cb callback,
+ void *cb_private)
+{
+ struct creg_cmd *cmd;
+
+ /* Don't queue stuff up if we're halted. */
+ if (unlikely(card->halt))
+ return -EINVAL;
+
+ if (card->creg_ctrl.reset)
+ return -EAGAIN;
+
+ if (cnt8 > MAX_CREG_DATA8)
+ return -EINVAL;
+
+ cmd = kmem_cache_alloc(creg_cmd_pool, GFP_KERNEL);
+ if (!cmd)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&cmd->list);
+
+ cmd->op = op;
+ cmd->addr = addr;
+ cmd->cnt8 = cnt8;
+ cmd->buf = buf;
+ cmd->stream = stream;
+ cmd->cb = callback;
+ cmd->cb_private = cb_private;
+ cmd->status = 0;
+
+ spin_lock_bh(&card->creg_ctrl.lock);
+ list_add_tail(&cmd->list, &card->creg_ctrl.queue);
+ card->creg_ctrl.q_depth++;
+ creg_kick_queue(card);
+ spin_unlock_bh(&card->creg_ctrl.lock);
+
+ return 0;
+}
+
+static void creg_cmd_timed_out(unsigned long data)
+{
+ struct rsxx_cardinfo *card = (struct rsxx_cardinfo *) data;
+ struct creg_cmd *cmd;
+
+ spin_lock(&card->creg_ctrl.lock);
+ cmd = card->creg_ctrl.active_cmd;
+ card->creg_ctrl.active_cmd = NULL;
+ spin_unlock(&card->creg_ctrl.lock);
+
+ if (cmd == NULL) {
+ card->creg_ctrl.creg_stats.creg_timeout++;
+ dev_warn(CARD_TO_DEV(card),
+ "No active command associated with timeout!\n");
+ return;
+ }
+
+ if (cmd->cb)
+ cmd->cb(card, cmd, -ETIMEDOUT);
+
+ kmem_cache_free(creg_cmd_pool, cmd);
+
+
+ spin_lock(&card->creg_ctrl.lock);
+ card->creg_ctrl.active = 0;
+ creg_kick_queue(card);
+ spin_unlock(&card->creg_ctrl.lock);
+}
+
+
+static void creg_cmd_done(struct work_struct *work)
+{
+ struct rsxx_cardinfo *card;
+ struct creg_cmd *cmd;
+ int st = 0;
+
+ card = container_of(work, struct rsxx_cardinfo,
+ creg_ctrl.done_work);
+
+ /*
+ * The timer could not be cancelled for some reason,
+ * race to pop the active command.
+ */
+ if (del_timer_sync(&card->creg_ctrl.cmd_timer) == 0)
+ card->creg_ctrl.creg_stats.failed_cancel_timer++;
+
+ spin_lock_bh(&card->creg_ctrl.lock);
+ cmd = card->creg_ctrl.active_cmd;
+ card->creg_ctrl.active_cmd = NULL;
+ spin_unlock_bh(&card->creg_ctrl.lock);
+
+ if (cmd == NULL) {
+ dev_err(CARD_TO_DEV(card),
+ "Spurious creg interrupt!\n");
+ return;
+ }
+
+ card->creg_ctrl.creg_stats.stat = ioread32(card->regmap + CREG_STAT);
+ cmd->status = card->creg_ctrl.creg_stats.stat;
+ if ((cmd->status & CREG_STAT_STATUS_MASK) == 0) {
+ dev_err(CARD_TO_DEV(card),
+ "Invalid status on creg command\n");
+ /*
+ * At this point we're probably reading garbage from HW. Don't
+ * do anything else that could mess up the system and let
+ * the sync function return an error.
+ */
+ st = -EIO;
+ goto creg_done;
+ } else if (cmd->status & CREG_STAT_ERROR) {
+ st = -EIO;
+ }
+
+ if ((cmd->op == CREG_OP_READ)) {
+ unsigned int cnt8 = ioread32(card->regmap + CREG_CNT);
+
+ /* Paranoid Sanity Checks */
+ if (!cmd->buf) {
+ dev_err(CARD_TO_DEV(card),
+ "Buffer not given for read.\n");
+ st = -EIO;
+ goto creg_done;
+ }
+ if (cnt8 != cmd->cnt8) {
+ dev_err(CARD_TO_DEV(card),
+ "count mismatch\n");
+ st = -EIO;
+ goto creg_done;
+ }
+
+ st = copy_from_creg_data(card, cnt8, cmd->buf, cmd->stream);
+ }
+
+creg_done:
+ if (cmd->cb)
+ cmd->cb(card, cmd, st);
+
+ kmem_cache_free(creg_cmd_pool, cmd);
+
+ spin_lock_bh(&card->creg_ctrl.lock);
+ card->creg_ctrl.active = 0;
+ creg_kick_queue(card);
+ spin_unlock_bh(&card->creg_ctrl.lock);
+}
+
+static void creg_reset(struct rsxx_cardinfo *card)
+{
+ struct creg_cmd *cmd = NULL;
+ struct creg_cmd *tmp;
+ unsigned long flags;
+
+ /*
+ * mutex_trylock is used here because if reset_lock is taken then a
+ * reset is already happening. So, we can just go ahead and return.
+ */
+ if (!mutex_trylock(&card->creg_ctrl.reset_lock))
+ return;
+
+ card->creg_ctrl.reset = 1;
+ spin_lock_irqsave(&card->irq_lock, flags);
+ rsxx_disable_ier_and_isr(card, CR_INTR_CREG | CR_INTR_EVENT);
+ spin_unlock_irqrestore(&card->irq_lock, flags);
+
+ dev_warn(CARD_TO_DEV(card),
+ "Resetting creg interface for recovery\n");
+
+ /* Cancel outstanding commands */
+ spin_lock_bh(&card->creg_ctrl.lock);
+ list_for_each_entry_safe(cmd, tmp, &card->creg_ctrl.queue, list) {
+ list_del(&cmd->list);
+ card->creg_ctrl.q_depth--;
+ if (cmd->cb)
+ cmd->cb(card, cmd, -ECANCELED);
+ kmem_cache_free(creg_cmd_pool, cmd);
+ }
+
+ cmd = card->creg_ctrl.active_cmd;
+ card->creg_ctrl.active_cmd = NULL;
+ if (cmd) {
+ if (timer_pending(&card->creg_ctrl.cmd_timer))
+ del_timer_sync(&card->creg_ctrl.cmd_timer);
+
+ if (cmd->cb)
+ cmd->cb(card, cmd, -ECANCELED);
+ kmem_cache_free(creg_cmd_pool, cmd);
+
+ card->creg_ctrl.active = 0;
+ }
+ spin_unlock_bh(&card->creg_ctrl.lock);
+
+ card->creg_ctrl.reset = 0;
+ spin_lock_irqsave(&card->irq_lock, flags);
+ rsxx_enable_ier_and_isr(card, CR_INTR_CREG | CR_INTR_EVENT);
+ spin_unlock_irqrestore(&card->irq_lock, flags);
+
+ mutex_unlock(&card->creg_ctrl.reset_lock);
+}
+
+/* Used for synchronous accesses */
+struct creg_completion {
+ struct completion *cmd_done;
+ int st;
+ u32 creg_status;
+};
+
+static void creg_cmd_done_cb(struct rsxx_cardinfo *card,
+ struct creg_cmd *cmd,
+ int st)
+{
+ struct creg_completion *cmd_completion;
+
+ cmd_completion = cmd->cb_private;
+ BUG_ON(!cmd_completion);
+
+ cmd_completion->st = st;
+ cmd_completion->creg_status = cmd->status;
+ complete(cmd_completion->cmd_done);
+}
+
+static int __issue_creg_rw(struct rsxx_cardinfo *card,
+ unsigned int op,
+ unsigned int addr,
+ unsigned int cnt8,
+ void *buf,
+ int stream,
+ unsigned int *hw_stat)
+{
+ DECLARE_COMPLETION_ONSTACK(cmd_done);
+ struct creg_completion completion;
+ unsigned long timeout;
+ int st;
+
+ completion.cmd_done = &cmd_done;
+ completion.st = 0;
+ completion.creg_status = 0;
+
+ st = creg_queue_cmd(card, op, addr, cnt8, buf, stream, creg_cmd_done_cb,
+ &completion);
+ if (st)
+ return st;
+
+ /*
+ * This timeout is necessary for unresponsive hardware. The additional
+ * 20 seconds to used to guarantee that each cregs requests has time to
+ * complete.
+ */
+ timeout = msecs_to_jiffies(CREG_TIMEOUT_MSEC *
+ card->creg_ctrl.q_depth + 20000);
+
+ /*
+ * The creg interface is guaranteed to complete. It has a timeout
+ * mechanism that will kick in if hardware does not respond.
+ */
+ st = wait_for_completion_timeout(completion.cmd_done, timeout);
+ if (st == 0) {
+ /*
+ * This is really bad, because the kernel timer did not
+ * expire and notify us of a timeout!
+ */
+ dev_crit(CARD_TO_DEV(card),
+ "cregs timer failed\n");
+ creg_reset(card);
+ return -EIO;
+ }
+
+ *hw_stat = completion.creg_status;
+
+ if (completion.st) {
+ /*
+ * This read is needed to verify that there has not been any
+ * extreme errors that might have occurred, i.e. EEH. The
+ * function iowrite32 will not detect EEH errors, so it is
+ * necessary that we recover if such an error is the reason
+ * for the timeout. This is a dummy read.
+ */
+ ioread32(card->regmap + SCRATCH);
+
+ dev_warn(CARD_TO_DEV(card),
+ "creg command failed(%d x%08x)\n",
+ completion.st, addr);
+ return completion.st;
+ }
+
+ return 0;
+}
+
+static int issue_creg_rw(struct rsxx_cardinfo *card,
+ u32 addr,
+ unsigned int size8,
+ void *data,
+ int stream,
+ int read)
+{
+ unsigned int hw_stat;
+ unsigned int xfer;
+ unsigned int op;
+ int st;
+
+ op = read ? CREG_OP_READ : CREG_OP_WRITE;
+
+ do {
+ xfer = min_t(unsigned int, size8, MAX_CREG_DATA8);
+
+ st = __issue_creg_rw(card, op, addr, xfer,
+ data, stream, &hw_stat);
+ if (st)
+ return st;
+
+ data = (char *)data + xfer;
+ addr += xfer;
+ size8 -= xfer;
+ } while (size8);
+
+ return 0;
+}
+
+/* ---------------------------- Public API ---------------------------------- */
+int rsxx_creg_write(struct rsxx_cardinfo *card,
+ u32 addr,
+ unsigned int size8,
+ void *data,
+ int byte_stream)
+{
+ return issue_creg_rw(card, addr, size8, data, byte_stream, 0);
+}
+
+int rsxx_creg_read(struct rsxx_cardinfo *card,
+ u32 addr,
+ unsigned int size8,
+ void *data,
+ int byte_stream)
+{
+ return issue_creg_rw(card, addr, size8, data, byte_stream, 1);
+}
+
+int rsxx_get_card_state(struct rsxx_cardinfo *card, unsigned int *state)
+{
+ return rsxx_creg_read(card, CREG_ADD_CARD_STATE,
+ sizeof(*state), state, 0);
+}
+
+int rsxx_get_card_size8(struct rsxx_cardinfo *card, u64 *size8)
+{
+ unsigned int size;
+ int st;
+
+ st = rsxx_creg_read(card, CREG_ADD_CARD_SIZE,
+ sizeof(size), &size, 0);
+ if (st)
+ return st;
+
+ *size8 = (u64)size * RSXX_HW_BLK_SIZE;
+ return 0;
+}
+
+int rsxx_get_num_targets(struct rsxx_cardinfo *card,
+ unsigned int *n_targets)
+{
+ return rsxx_creg_read(card, CREG_ADD_NUM_TARGETS,
+ sizeof(*n_targets), n_targets, 0);
+}
+
+int rsxx_get_card_capabilities(struct rsxx_cardinfo *card,
+ u32 *capabilities)
+{
+ return rsxx_creg_read(card, CREG_ADD_CAPABILITIES,
+ sizeof(*capabilities), capabilities, 0);
+}
+
+int rsxx_issue_card_cmd(struct rsxx_cardinfo *card, u32 cmd)
+{
+ return rsxx_creg_write(card, CREG_ADD_CARD_CMD,
+ sizeof(cmd), &cmd, 0);
+}
+
+
+/*----------------- HW Log Functions -------------------*/
+static void hw_log_msg(struct rsxx_cardinfo *card, const char *str, int len)
+{
+ static char level;
+
+ /*
+ * New messages start with "<#>", where # is the log level. Messages
+ * that extend past the log buffer will use the previous level
+ */
+ if ((len > 3) && (str[0] == '<') && (str[2] == '>')) {
+ level = str[1];
+ str += 3; /* Skip past the log level. */
+ len -= 3;
+ }
+
+ switch (level) {
+ case '0':
+ dev_emerg(CARD_TO_DEV(card), "HW: %.*s", len, str);
+ break;
+ case '1':
+ dev_alert(CARD_TO_DEV(card), "HW: %.*s", len, str);
+ break;
+ case '2':
+ dev_crit(CARD_TO_DEV(card), "HW: %.*s", len, str);
+ break;
+ case '3':
+ dev_err(CARD_TO_DEV(card), "HW: %.*s", len, str);
+ break;
+ case '4':
+ dev_warn(CARD_TO_DEV(card), "HW: %.*s", len, str);
+ break;
+ case '5':
+ dev_notice(CARD_TO_DEV(card), "HW: %.*s", len, str);
+ break;
+ case '6':
+ dev_info(CARD_TO_DEV(card), "HW: %.*s", len, str);
+ break;
+ case '7':
+ dev_dbg(CARD_TO_DEV(card), "HW: %.*s", len, str);
+ break;
+ default:
+ dev_info(CARD_TO_DEV(card), "HW: %.*s", len, str);
+ break;
+ }
+}
+
+/*
+ * The substrncpy function copies the src string (which includes the
+ * terminating '\0' character), up to the count into the dest pointer.
+ * Returns the number of bytes copied to dest.
+ */
+static int substrncpy(char *dest, const char *src, int count)
+{
+ int max_cnt = count;
+
+ while (count) {
+ count--;
+ *dest = *src;
+ if (*dest == '\0')
+ break;
+ src++;
+ dest++;
+ }
+ return max_cnt - count;
+}
+
+
+static void read_hw_log_done(struct rsxx_cardinfo *card,
+ struct creg_cmd *cmd,
+ int st)
+{
+ char *buf;
+ char *log_str;
+ int cnt;
+ int len;
+ int off;
+
+ buf = cmd->buf;
+ off = 0;
+
+ /* Failed getting the log message */
+ if (st)
+ return;
+
+ while (off < cmd->cnt8) {
+ log_str = &card->log.buf[card->log.buf_len];
+ cnt = min(cmd->cnt8 - off, LOG_BUF_SIZE8 - card->log.buf_len);
+ len = substrncpy(log_str, &buf[off], cnt);
+
+ off += len;
+ card->log.buf_len += len;
+
+ /*
+ * Flush the log if we've hit the end of a message or if we've
+ * run out of buffer space.
+ */
+ if ((log_str[len - 1] == '\0') ||
+ (card->log.buf_len == LOG_BUF_SIZE8)) {
+ if (card->log.buf_len != 1) /* Don't log blank lines. */
+ hw_log_msg(card, card->log.buf,
+ card->log.buf_len);
+ card->log.buf_len = 0;
+ }
+
+ }
+
+ if (cmd->status & CREG_STAT_LOG_PENDING)
+ rsxx_read_hw_log(card);
+}
+
+int rsxx_read_hw_log(struct rsxx_cardinfo *card)
+{
+ int st;
+
+ st = creg_queue_cmd(card, CREG_OP_READ, CREG_ADD_LOG,
+ sizeof(card->log.tmp), card->log.tmp,
+ 1, read_hw_log_done, NULL);
+ if (st)
+ dev_err(CARD_TO_DEV(card),
+ "Failed getting log text\n");
+
+ return st;
+}
+
+/*-------------- IOCTL REG Access ------------------*/
+static int issue_reg_cmd(struct rsxx_cardinfo *card,
+ struct rsxx_reg_access *cmd,
+ int read)
+{
+ unsigned int op = read ? CREG_OP_READ : CREG_OP_WRITE;
+
+ return __issue_creg_rw(card, op, cmd->addr, cmd->cnt, cmd->data,
+ cmd->stream, &cmd->stat);
+}
+
+int rsxx_reg_access(struct rsxx_cardinfo *card,
+ struct rsxx_reg_access __user *ucmd,
+ int read)
+{
+ struct rsxx_reg_access cmd;
+ int st;
+
+ st = copy_from_user(&cmd, ucmd, sizeof(cmd));
+ if (st)
+ return -EFAULT;
+
+ if (cmd.cnt > RSXX_MAX_REG_CNT)
+ return -EFAULT;
+
+ st = issue_reg_cmd(card, &cmd, read);
+ if (st)
+ return st;
+
+ st = put_user(cmd.stat, &ucmd->stat);
+ if (st)
+ return -EFAULT;
+
+ if (read) {
+ st = copy_to_user(ucmd->data, cmd.data, cmd.cnt);
+ if (st)
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+void rsxx_eeh_save_issued_creg(struct rsxx_cardinfo *card)
+{
+ struct creg_cmd *cmd = NULL;
+
+ cmd = card->creg_ctrl.active_cmd;
+ card->creg_ctrl.active_cmd = NULL;
+
+ if (cmd) {
+ del_timer_sync(&card->creg_ctrl.cmd_timer);
+
+ spin_lock_bh(&card->creg_ctrl.lock);
+ list_add(&cmd->list, &card->creg_ctrl.queue);
+ card->creg_ctrl.q_depth++;
+ card->creg_ctrl.active = 0;
+ spin_unlock_bh(&card->creg_ctrl.lock);
+ }
+}
+
+void rsxx_kick_creg_queue(struct rsxx_cardinfo *card)
+{
+ spin_lock_bh(&card->creg_ctrl.lock);
+ if (!list_empty(&card->creg_ctrl.queue))
+ creg_kick_queue(card);
+ spin_unlock_bh(&card->creg_ctrl.lock);
+}
+
+/*------------ Initialization & Setup --------------*/
+int rsxx_creg_setup(struct rsxx_cardinfo *card)
+{
+ card->creg_ctrl.active_cmd = NULL;
+
+ card->creg_ctrl.creg_wq =
+ create_singlethread_workqueue(DRIVER_NAME"_creg");
+ if (!card->creg_ctrl.creg_wq)
+ return -ENOMEM;
+
+ INIT_WORK(&card->creg_ctrl.done_work, creg_cmd_done);
+ mutex_init(&card->creg_ctrl.reset_lock);
+ INIT_LIST_HEAD(&card->creg_ctrl.queue);
+ spin_lock_init(&card->creg_ctrl.lock);
+ setup_timer(&card->creg_ctrl.cmd_timer, creg_cmd_timed_out,
+ (unsigned long) card);
+
+ return 0;
+}
+
+void rsxx_creg_destroy(struct rsxx_cardinfo *card)
+{
+ struct creg_cmd *cmd;
+ struct creg_cmd *tmp;
+ int cnt = 0;
+
+ /* Cancel outstanding commands */
+ spin_lock_bh(&card->creg_ctrl.lock);
+ list_for_each_entry_safe(cmd, tmp, &card->creg_ctrl.queue, list) {
+ list_del(&cmd->list);
+ if (cmd->cb)
+ cmd->cb(card, cmd, -ECANCELED);
+ kmem_cache_free(creg_cmd_pool, cmd);
+ cnt++;
+ }
+
+ if (cnt)
+ dev_info(CARD_TO_DEV(card),
+ "Canceled %d queue creg commands\n", cnt);
+
+ cmd = card->creg_ctrl.active_cmd;
+ card->creg_ctrl.active_cmd = NULL;
+ if (cmd) {
+ if (timer_pending(&card->creg_ctrl.cmd_timer))
+ del_timer_sync(&card->creg_ctrl.cmd_timer);
+
+ if (cmd->cb)
+ cmd->cb(card, cmd, -ECANCELED);
+ dev_info(CARD_TO_DEV(card),
+ "Canceled active creg command\n");
+ kmem_cache_free(creg_cmd_pool, cmd);
+ }
+ spin_unlock_bh(&card->creg_ctrl.lock);
+
+ cancel_work_sync(&card->creg_ctrl.done_work);
+}
+
+
+int rsxx_creg_init(void)
+{
+ creg_cmd_pool = KMEM_CACHE(creg_cmd, SLAB_HWCACHE_ALIGN);
+ if (!creg_cmd_pool)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void rsxx_creg_cleanup(void)
+{
+ kmem_cache_destroy(creg_cmd_pool);
+}
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
new file mode 100644
index 000000000..ac8c62cb4
--- /dev/null
+++ b/drivers/block/rsxx/dev.c
@@ -0,0 +1,340 @@
+/*
+* Filename: dev.c
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+* Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+
+#include <linux/hdreg.h>
+#include <linux/genhd.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+
+#include <linux/fs.h>
+
+#include "rsxx_priv.h"
+
+static unsigned int blkdev_minors = 64;
+module_param(blkdev_minors, uint, 0444);
+MODULE_PARM_DESC(blkdev_minors, "Number of minors(partitions)");
+
+/*
+ * For now I'm making this tweakable in case any applications hit this limit.
+ * If you see a "bio too big" error in the log you will need to raise this
+ * value.
+ */
+static unsigned int blkdev_max_hw_sectors = 1024;
+module_param(blkdev_max_hw_sectors, uint, 0444);
+MODULE_PARM_DESC(blkdev_max_hw_sectors, "Max hw sectors for a single BIO");
+
+static unsigned int enable_blkdev = 1;
+module_param(enable_blkdev , uint, 0444);
+MODULE_PARM_DESC(enable_blkdev, "Enable block device interfaces");
+
+
+struct rsxx_bio_meta {
+ struct bio *bio;
+ atomic_t pending_dmas;
+ atomic_t error;
+ unsigned long start_time;
+};
+
+static struct kmem_cache *bio_meta_pool;
+
+/*----------------- Block Device Operations -----------------*/
+static int rsxx_blkdev_ioctl(struct block_device *bdev,
+ fmode_t mode,
+ unsigned int cmd,
+ unsigned long arg)
+{
+ struct rsxx_cardinfo *card = bdev->bd_disk->private_data;
+
+ switch (cmd) {
+ case RSXX_GETREG:
+ return rsxx_reg_access(card, (void __user *)arg, 1);
+ case RSXX_SETREG:
+ return rsxx_reg_access(card, (void __user *)arg, 0);
+ }
+
+ return -ENOTTY;
+}
+
+static int rsxx_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ struct rsxx_cardinfo *card = bdev->bd_disk->private_data;
+ u64 blocks = card->size8 >> 9;
+
+ /*
+ * get geometry: Fake it. I haven't found any drivers that set
+ * geo->start, so we won't either.
+ */
+ if (card->size8) {
+ geo->heads = 64;
+ geo->sectors = 16;
+ do_div(blocks, (geo->heads * geo->sectors));
+ geo->cylinders = blocks;
+ } else {
+ geo->heads = 0;
+ geo->sectors = 0;
+ geo->cylinders = 0;
+ }
+ return 0;
+}
+
+static const struct block_device_operations rsxx_fops = {
+ .owner = THIS_MODULE,
+ .getgeo = rsxx_getgeo,
+ .ioctl = rsxx_blkdev_ioctl,
+};
+
+static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio)
+{
+ generic_start_io_acct(bio_data_dir(bio), bio_sectors(bio),
+ &card->gendisk->part0);
+}
+
+static void disk_stats_complete(struct rsxx_cardinfo *card,
+ struct bio *bio,
+ unsigned long start_time)
+{
+ generic_end_io_acct(bio_data_dir(bio), &card->gendisk->part0,
+ start_time);
+}
+
+static void bio_dma_done_cb(struct rsxx_cardinfo *card,
+ void *cb_data,
+ unsigned int error)
+{
+ struct rsxx_bio_meta *meta = cb_data;
+
+ if (error)
+ atomic_set(&meta->error, 1);
+
+ if (atomic_dec_and_test(&meta->pending_dmas)) {
+ if (!card->eeh_state && card->gendisk)
+ disk_stats_complete(card, meta->bio, meta->start_time);
+
+ bio_endio(meta->bio, atomic_read(&meta->error) ? -EIO : 0);
+ kmem_cache_free(bio_meta_pool, meta);
+ }
+}
+
+static void rsxx_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct rsxx_cardinfo *card = q->queuedata;
+ struct rsxx_bio_meta *bio_meta;
+ int st = -EINVAL;
+
+ might_sleep();
+
+ if (!card)
+ goto req_err;
+
+ if (bio_end_sector(bio) > get_capacity(card->gendisk))
+ goto req_err;
+
+ if (unlikely(card->halt)) {
+ st = -EFAULT;
+ goto req_err;
+ }
+
+ if (unlikely(card->dma_fault)) {
+ st = (-EFAULT);
+ goto req_err;
+ }
+
+ if (bio->bi_iter.bi_size == 0) {
+ dev_err(CARD_TO_DEV(card), "size zero BIO!\n");
+ goto req_err;
+ }
+
+ bio_meta = kmem_cache_alloc(bio_meta_pool, GFP_KERNEL);
+ if (!bio_meta) {
+ st = -ENOMEM;
+ goto req_err;
+ }
+
+ bio_meta->bio = bio;
+ atomic_set(&bio_meta->error, 0);
+ atomic_set(&bio_meta->pending_dmas, 0);
+ bio_meta->start_time = jiffies;
+
+ if (!unlikely(card->halt))
+ disk_stats_start(card, bio);
+
+ dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n",
+ bio_data_dir(bio) ? 'W' : 'R', bio_meta,
+ (u64)bio->bi_iter.bi_sector << 9, bio->bi_iter.bi_size);
+
+ st = rsxx_dma_queue_bio(card, bio, &bio_meta->pending_dmas,
+ bio_dma_done_cb, bio_meta);
+ if (st)
+ goto queue_err;
+
+ return;
+
+queue_err:
+ kmem_cache_free(bio_meta_pool, bio_meta);
+req_err:
+ bio_endio(bio, st);
+}
+
+/*----------------- Device Setup -------------------*/
+static bool rsxx_discard_supported(struct rsxx_cardinfo *card)
+{
+ unsigned char pci_rev;
+
+ pci_read_config_byte(card->dev, PCI_REVISION_ID, &pci_rev);
+
+ return (pci_rev >= RSXX_DISCARD_SUPPORT);
+}
+
+int rsxx_attach_dev(struct rsxx_cardinfo *card)
+{
+ mutex_lock(&card->dev_lock);
+
+ /* The block device requires the stripe size from the config. */
+ if (enable_blkdev) {
+ if (card->config_valid)
+ set_capacity(card->gendisk, card->size8 >> 9);
+ else
+ set_capacity(card->gendisk, 0);
+ add_disk(card->gendisk);
+
+ card->bdev_attached = 1;
+ }
+
+ mutex_unlock(&card->dev_lock);
+
+ return 0;
+}
+
+void rsxx_detach_dev(struct rsxx_cardinfo *card)
+{
+ mutex_lock(&card->dev_lock);
+
+ if (card->bdev_attached) {
+ del_gendisk(card->gendisk);
+ card->bdev_attached = 0;
+ }
+
+ mutex_unlock(&card->dev_lock);
+}
+
+int rsxx_setup_dev(struct rsxx_cardinfo *card)
+{
+ unsigned short blk_size;
+
+ mutex_init(&card->dev_lock);
+
+ if (!enable_blkdev)
+ return 0;
+
+ card->major = register_blkdev(0, DRIVER_NAME);
+ if (card->major < 0) {
+ dev_err(CARD_TO_DEV(card), "Failed to get major number\n");
+ return -ENOMEM;
+ }
+
+ card->queue = blk_alloc_queue(GFP_KERNEL);
+ if (!card->queue) {
+ dev_err(CARD_TO_DEV(card), "Failed queue alloc\n");
+ unregister_blkdev(card->major, DRIVER_NAME);
+ return -ENOMEM;
+ }
+
+ card->gendisk = alloc_disk(blkdev_minors);
+ if (!card->gendisk) {
+ dev_err(CARD_TO_DEV(card), "Failed disk alloc\n");
+ blk_cleanup_queue(card->queue);
+ unregister_blkdev(card->major, DRIVER_NAME);
+ return -ENOMEM;
+ }
+
+ if (card->config_valid) {
+ blk_size = card->config.data.block_size;
+ blk_queue_dma_alignment(card->queue, blk_size - 1);
+ blk_queue_logical_block_size(card->queue, blk_size);
+ }
+
+ blk_queue_make_request(card->queue, rsxx_make_request);
+ blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY);
+ blk_queue_max_hw_sectors(card->queue, blkdev_max_hw_sectors);
+ blk_queue_physical_block_size(card->queue, RSXX_HW_BLK_SIZE);
+
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, card->queue);
+ queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, card->queue);
+ if (rsxx_discard_supported(card)) {
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, card->queue);
+ blk_queue_max_discard_sectors(card->queue,
+ RSXX_HW_BLK_SIZE >> 9);
+ card->queue->limits.discard_granularity = RSXX_HW_BLK_SIZE;
+ card->queue->limits.discard_alignment = RSXX_HW_BLK_SIZE;
+ card->queue->limits.discard_zeroes_data = 1;
+ }
+
+ card->queue->queuedata = card;
+
+ snprintf(card->gendisk->disk_name, sizeof(card->gendisk->disk_name),
+ "rsxx%d", card->disk_id);
+ card->gendisk->driverfs_dev = &card->dev->dev;
+ card->gendisk->major = card->major;
+ card->gendisk->first_minor = 0;
+ card->gendisk->fops = &rsxx_fops;
+ card->gendisk->private_data = card;
+ card->gendisk->queue = card->queue;
+
+ return 0;
+}
+
+void rsxx_destroy_dev(struct rsxx_cardinfo *card)
+{
+ if (!enable_blkdev)
+ return;
+
+ put_disk(card->gendisk);
+ card->gendisk = NULL;
+
+ blk_cleanup_queue(card->queue);
+ card->queue->queuedata = NULL;
+ unregister_blkdev(card->major, DRIVER_NAME);
+}
+
+int rsxx_dev_init(void)
+{
+ bio_meta_pool = KMEM_CACHE(rsxx_bio_meta, SLAB_HWCACHE_ALIGN);
+ if (!bio_meta_pool)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void rsxx_dev_cleanup(void)
+{
+ kmem_cache_destroy(bio_meta_pool);
+}
+
+
diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
new file mode 100644
index 000000000..cf8cd293a
--- /dev/null
+++ b/drivers/block/rsxx/dma.c
@@ -0,0 +1,1104 @@
+/*
+* Filename: dma.c
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+* Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#include <linux/slab.h>
+#include "rsxx_priv.h"
+
+struct rsxx_dma {
+ struct list_head list;
+ u8 cmd;
+ unsigned int laddr; /* Logical address */
+ struct {
+ u32 off;
+ u32 cnt;
+ } sub_page;
+ dma_addr_t dma_addr;
+ struct page *page;
+ unsigned int pg_off; /* Page Offset */
+ rsxx_dma_cb cb;
+ void *cb_data;
+};
+
+/* This timeout is used to detect a stalled DMA channel */
+#define DMA_ACTIVITY_TIMEOUT msecs_to_jiffies(10000)
+
+struct hw_status {
+ u8 status;
+ u8 tag;
+ __le16 count;
+ __le32 _rsvd2;
+ __le64 _rsvd3;
+} __packed;
+
+enum rsxx_dma_status {
+ DMA_SW_ERR = 0x1,
+ DMA_HW_FAULT = 0x2,
+ DMA_CANCELLED = 0x4,
+};
+
+struct hw_cmd {
+ u8 command;
+ u8 tag;
+ u8 _rsvd;
+ u8 sub_page; /* Bit[0:2]: 512byte offset */
+ /* Bit[4:6]: 512byte count */
+ __le32 device_addr;
+ __le64 host_addr;
+} __packed;
+
+enum rsxx_hw_cmd {
+ HW_CMD_BLK_DISCARD = 0x70,
+ HW_CMD_BLK_WRITE = 0x80,
+ HW_CMD_BLK_READ = 0xC0,
+ HW_CMD_BLK_RECON_READ = 0xE0,
+};
+
+enum rsxx_hw_status {
+ HW_STATUS_CRC = 0x01,
+ HW_STATUS_HARD_ERR = 0x02,
+ HW_STATUS_SOFT_ERR = 0x04,
+ HW_STATUS_FAULT = 0x08,
+};
+
+static struct kmem_cache *rsxx_dma_pool;
+
+struct dma_tracker {
+ int next_tag;
+ struct rsxx_dma *dma;
+};
+
+#define DMA_TRACKER_LIST_SIZE8 (sizeof(struct dma_tracker_list) + \
+ (sizeof(struct dma_tracker) * RSXX_MAX_OUTSTANDING_CMDS))
+
+struct dma_tracker_list {
+ spinlock_t lock;
+ int head;
+ struct dma_tracker list[0];
+};
+
+
+/*----------------- Misc Utility Functions -------------------*/
+static unsigned int rsxx_addr8_to_laddr(u64 addr8, struct rsxx_cardinfo *card)
+{
+ unsigned long long tgt_addr8;
+
+ tgt_addr8 = ((addr8 >> card->_stripe.upper_shift) &
+ card->_stripe.upper_mask) |
+ ((addr8) & card->_stripe.lower_mask);
+ do_div(tgt_addr8, RSXX_HW_BLK_SIZE);
+ return tgt_addr8;
+}
+
+static unsigned int rsxx_get_dma_tgt(struct rsxx_cardinfo *card, u64 addr8)
+{
+ unsigned int tgt;
+
+ tgt = (addr8 >> card->_stripe.target_shift) & card->_stripe.target_mask;
+
+ return tgt;
+}
+
+void rsxx_dma_queue_reset(struct rsxx_cardinfo *card)
+{
+ /* Reset all DMA Command/Status Queues */
+ iowrite32(DMA_QUEUE_RESET, card->regmap + RESET);
+}
+
+static unsigned int get_dma_size(struct rsxx_dma *dma)
+{
+ if (dma->sub_page.cnt)
+ return dma->sub_page.cnt << 9;
+ else
+ return RSXX_HW_BLK_SIZE;
+}
+
+
+/*----------------- DMA Tracker -------------------*/
+static void set_tracker_dma(struct dma_tracker_list *trackers,
+ int tag,
+ struct rsxx_dma *dma)
+{
+ trackers->list[tag].dma = dma;
+}
+
+static struct rsxx_dma *get_tracker_dma(struct dma_tracker_list *trackers,
+ int tag)
+{
+ return trackers->list[tag].dma;
+}
+
+static int pop_tracker(struct dma_tracker_list *trackers)
+{
+ int tag;
+
+ spin_lock(&trackers->lock);
+ tag = trackers->head;
+ if (tag != -1) {
+ trackers->head = trackers->list[tag].next_tag;
+ trackers->list[tag].next_tag = -1;
+ }
+ spin_unlock(&trackers->lock);
+
+ return tag;
+}
+
+static void push_tracker(struct dma_tracker_list *trackers, int tag)
+{
+ spin_lock(&trackers->lock);
+ trackers->list[tag].next_tag = trackers->head;
+ trackers->head = tag;
+ trackers->list[tag].dma = NULL;
+ spin_unlock(&trackers->lock);
+}
+
+
+/*----------------- Interrupt Coalescing -------------*/
+/*
+ * Interrupt Coalescing Register Format:
+ * Interrupt Timer (64ns units) [15:0]
+ * Interrupt Count [24:16]
+ * Reserved [31:25]
+*/
+#define INTR_COAL_LATENCY_MASK (0x0000ffff)
+
+#define INTR_COAL_COUNT_SHIFT 16
+#define INTR_COAL_COUNT_BITS 9
+#define INTR_COAL_COUNT_MASK (((1 << INTR_COAL_COUNT_BITS) - 1) << \
+ INTR_COAL_COUNT_SHIFT)
+#define INTR_COAL_LATENCY_UNITS_NS 64
+
+
+static u32 dma_intr_coal_val(u32 mode, u32 count, u32 latency)
+{
+ u32 latency_units = latency / INTR_COAL_LATENCY_UNITS_NS;
+
+ if (mode == RSXX_INTR_COAL_DISABLED)
+ return 0;
+
+ return ((count << INTR_COAL_COUNT_SHIFT) & INTR_COAL_COUNT_MASK) |
+ (latency_units & INTR_COAL_LATENCY_MASK);
+
+}
+
+static void dma_intr_coal_auto_tune(struct rsxx_cardinfo *card)
+{
+ int i;
+ u32 q_depth = 0;
+ u32 intr_coal;
+
+ if (card->config.data.intr_coal.mode != RSXX_INTR_COAL_AUTO_TUNE ||
+ unlikely(card->eeh_state))
+ return;
+
+ for (i = 0; i < card->n_targets; i++)
+ q_depth += atomic_read(&card->ctrl[i].stats.hw_q_depth);
+
+ intr_coal = dma_intr_coal_val(card->config.data.intr_coal.mode,
+ q_depth / 2,
+ card->config.data.intr_coal.latency);
+ iowrite32(intr_coal, card->regmap + INTR_COAL);
+}
+
+/*----------------- RSXX DMA Handling -------------------*/
+static void rsxx_free_dma(struct rsxx_dma_ctrl *ctrl, struct rsxx_dma *dma)
+{
+ if (dma->cmd != HW_CMD_BLK_DISCARD) {
+ if (!pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) {
+ pci_unmap_page(ctrl->card->dev, dma->dma_addr,
+ get_dma_size(dma),
+ dma->cmd == HW_CMD_BLK_WRITE ?
+ PCI_DMA_TODEVICE :
+ PCI_DMA_FROMDEVICE);
+ }
+ }
+
+ kmem_cache_free(rsxx_dma_pool, dma);
+}
+
+static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl,
+ struct rsxx_dma *dma,
+ unsigned int status)
+{
+ if (status & DMA_SW_ERR)
+ ctrl->stats.dma_sw_err++;
+ if (status & DMA_HW_FAULT)
+ ctrl->stats.dma_hw_fault++;
+ if (status & DMA_CANCELLED)
+ ctrl->stats.dma_cancelled++;
+
+ if (dma->cb)
+ dma->cb(ctrl->card, dma->cb_data, status ? 1 : 0);
+
+ rsxx_free_dma(ctrl, dma);
+}
+
+int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl,
+ struct list_head *q, unsigned int done)
+{
+ struct rsxx_dma *dma;
+ struct rsxx_dma *tmp;
+ int cnt = 0;
+
+ list_for_each_entry_safe(dma, tmp, q, list) {
+ list_del(&dma->list);
+ if (done & COMPLETE_DMA)
+ rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+ else
+ rsxx_free_dma(ctrl, dma);
+ cnt++;
+ }
+
+ return cnt;
+}
+
+static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl,
+ struct rsxx_dma *dma)
+{
+ /*
+ * Requeued DMAs go to the front of the queue so they are issued
+ * first.
+ */
+ spin_lock_bh(&ctrl->queue_lock);
+ ctrl->stats.sw_q_depth++;
+ list_add(&dma->list, &ctrl->queue);
+ spin_unlock_bh(&ctrl->queue_lock);
+}
+
+static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl,
+ struct rsxx_dma *dma,
+ u8 hw_st)
+{
+ unsigned int status = 0;
+ int requeue_cmd = 0;
+
+ dev_dbg(CARD_TO_DEV(ctrl->card),
+ "Handling DMA error(cmd x%02x, laddr x%08x st:x%02x)\n",
+ dma->cmd, dma->laddr, hw_st);
+
+ if (hw_st & HW_STATUS_CRC)
+ ctrl->stats.crc_errors++;
+ if (hw_st & HW_STATUS_HARD_ERR)
+ ctrl->stats.hard_errors++;
+ if (hw_st & HW_STATUS_SOFT_ERR)
+ ctrl->stats.soft_errors++;
+
+ switch (dma->cmd) {
+ case HW_CMD_BLK_READ:
+ if (hw_st & (HW_STATUS_CRC | HW_STATUS_HARD_ERR)) {
+ if (ctrl->card->scrub_hard) {
+ dma->cmd = HW_CMD_BLK_RECON_READ;
+ requeue_cmd = 1;
+ ctrl->stats.reads_retried++;
+ } else {
+ status |= DMA_HW_FAULT;
+ ctrl->stats.reads_failed++;
+ }
+ } else if (hw_st & HW_STATUS_FAULT) {
+ status |= DMA_HW_FAULT;
+ ctrl->stats.reads_failed++;
+ }
+
+ break;
+ case HW_CMD_BLK_RECON_READ:
+ if (hw_st & (HW_STATUS_CRC | HW_STATUS_HARD_ERR)) {
+ /* Data could not be reconstructed. */
+ status |= DMA_HW_FAULT;
+ ctrl->stats.reads_failed++;
+ }
+
+ break;
+ case HW_CMD_BLK_WRITE:
+ status |= DMA_HW_FAULT;
+ ctrl->stats.writes_failed++;
+
+ break;
+ case HW_CMD_BLK_DISCARD:
+ status |= DMA_HW_FAULT;
+ ctrl->stats.discards_failed++;
+
+ break;
+ default:
+ dev_err(CARD_TO_DEV(ctrl->card),
+ "Unknown command in DMA!(cmd: x%02x "
+ "laddr x%08x st: x%02x\n",
+ dma->cmd, dma->laddr, hw_st);
+ status |= DMA_SW_ERR;
+
+ break;
+ }
+
+ if (requeue_cmd)
+ rsxx_requeue_dma(ctrl, dma);
+ else
+ rsxx_complete_dma(ctrl, dma, status);
+}
+
+static void dma_engine_stalled(unsigned long data)
+{
+ struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data;
+ int cnt;
+
+ if (atomic_read(&ctrl->stats.hw_q_depth) == 0 ||
+ unlikely(ctrl->card->eeh_state))
+ return;
+
+ if (ctrl->cmd.idx != ioread32(ctrl->regmap + SW_CMD_IDX)) {
+ /*
+ * The dma engine was stalled because the SW_CMD_IDX write
+ * was lost. Issue it again to recover.
+ */
+ dev_warn(CARD_TO_DEV(ctrl->card),
+ "SW_CMD_IDX write was lost, re-writing...\n");
+ iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX);
+ mod_timer(&ctrl->activity_timer,
+ jiffies + DMA_ACTIVITY_TIMEOUT);
+ } else {
+ dev_warn(CARD_TO_DEV(ctrl->card),
+ "DMA channel %d has stalled, faulting interface.\n",
+ ctrl->id);
+ ctrl->card->dma_fault = 1;
+
+ /* Clean up the DMA queue */
+ spin_lock(&ctrl->queue_lock);
+ cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA);
+ spin_unlock(&ctrl->queue_lock);
+
+ cnt += rsxx_dma_cancel(ctrl);
+
+ if (cnt)
+ dev_info(CARD_TO_DEV(ctrl->card),
+ "Freed %d queued DMAs on channel %d\n",
+ cnt, ctrl->id);
+ }
+}
+
+static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl)
+{
+ struct rsxx_dma *dma;
+ int tag;
+ int cmds_pending = 0;
+ struct hw_cmd *hw_cmd_buf;
+ int dir;
+
+ hw_cmd_buf = ctrl->cmd.buf;
+
+ if (unlikely(ctrl->card->halt) ||
+ unlikely(ctrl->card->eeh_state))
+ return;
+
+ while (1) {
+ spin_lock_bh(&ctrl->queue_lock);
+ if (list_empty(&ctrl->queue)) {
+ spin_unlock_bh(&ctrl->queue_lock);
+ break;
+ }
+ spin_unlock_bh(&ctrl->queue_lock);
+
+ tag = pop_tracker(ctrl->trackers);
+ if (tag == -1)
+ break;
+
+ spin_lock_bh(&ctrl->queue_lock);
+ dma = list_entry(ctrl->queue.next, struct rsxx_dma, list);
+ list_del(&dma->list);
+ ctrl->stats.sw_q_depth--;
+ spin_unlock_bh(&ctrl->queue_lock);
+
+ /*
+ * This will catch any DMAs that slipped in right before the
+ * fault, but was queued after all the other DMAs were
+ * cancelled.
+ */
+ if (unlikely(ctrl->card->dma_fault)) {
+ push_tracker(ctrl->trackers, tag);
+ rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+ continue;
+ }
+
+ if (dma->cmd != HW_CMD_BLK_DISCARD) {
+ if (dma->cmd == HW_CMD_BLK_WRITE)
+ dir = PCI_DMA_TODEVICE;
+ else
+ dir = PCI_DMA_FROMDEVICE;
+
+ /*
+ * The function pci_map_page is placed here because we
+ * can only, by design, issue up to 255 commands to the
+ * hardware at one time per DMA channel. So the maximum
+ * amount of mapped memory would be 255 * 4 channels *
+ * 4096 Bytes which is less than 2GB, the limit of a x8
+ * Non-HWWD PCIe slot. This way the pci_map_page
+ * function should never fail because of a lack of
+ * mappable memory.
+ */
+ dma->dma_addr = pci_map_page(ctrl->card->dev, dma->page,
+ dma->pg_off, dma->sub_page.cnt << 9, dir);
+ if (pci_dma_mapping_error(ctrl->card->dev, dma->dma_addr)) {
+ push_tracker(ctrl->trackers, tag);
+ rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+ continue;
+ }
+ }
+
+ set_tracker_dma(ctrl->trackers, tag, dma);
+ hw_cmd_buf[ctrl->cmd.idx].command = dma->cmd;
+ hw_cmd_buf[ctrl->cmd.idx].tag = tag;
+ hw_cmd_buf[ctrl->cmd.idx]._rsvd = 0;
+ hw_cmd_buf[ctrl->cmd.idx].sub_page =
+ ((dma->sub_page.cnt & 0x7) << 4) |
+ (dma->sub_page.off & 0x7);
+
+ hw_cmd_buf[ctrl->cmd.idx].device_addr =
+ cpu_to_le32(dma->laddr);
+
+ hw_cmd_buf[ctrl->cmd.idx].host_addr =
+ cpu_to_le64(dma->dma_addr);
+
+ dev_dbg(CARD_TO_DEV(ctrl->card),
+ "Issue DMA%d(laddr %d tag %d) to idx %d\n",
+ ctrl->id, dma->laddr, tag, ctrl->cmd.idx);
+
+ ctrl->cmd.idx = (ctrl->cmd.idx + 1) & RSXX_CS_IDX_MASK;
+ cmds_pending++;
+
+ if (dma->cmd == HW_CMD_BLK_WRITE)
+ ctrl->stats.writes_issued++;
+ else if (dma->cmd == HW_CMD_BLK_DISCARD)
+ ctrl->stats.discards_issued++;
+ else
+ ctrl->stats.reads_issued++;
+ }
+
+ /* Let HW know we've queued commands. */
+ if (cmds_pending) {
+ atomic_add(cmds_pending, &ctrl->stats.hw_q_depth);
+ mod_timer(&ctrl->activity_timer,
+ jiffies + DMA_ACTIVITY_TIMEOUT);
+
+ if (unlikely(ctrl->card->eeh_state)) {
+ del_timer_sync(&ctrl->activity_timer);
+ return;
+ }
+
+ iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX);
+ }
+}
+
+static void rsxx_dma_done(struct rsxx_dma_ctrl *ctrl)
+{
+ struct rsxx_dma *dma;
+ unsigned long flags;
+ u16 count;
+ u8 status;
+ u8 tag;
+ struct hw_status *hw_st_buf;
+
+ hw_st_buf = ctrl->status.buf;
+
+ if (unlikely(ctrl->card->halt) ||
+ unlikely(ctrl->card->dma_fault) ||
+ unlikely(ctrl->card->eeh_state))
+ return;
+
+ count = le16_to_cpu(hw_st_buf[ctrl->status.idx].count);
+
+ while (count == ctrl->e_cnt) {
+ /*
+ * The read memory-barrier is necessary to keep aggressive
+ * processors/optimizers (such as the PPC Apple G5) from
+ * reordering the following status-buffer tag & status read
+ * *before* the count read on subsequent iterations of the
+ * loop!
+ */
+ rmb();
+
+ status = hw_st_buf[ctrl->status.idx].status;
+ tag = hw_st_buf[ctrl->status.idx].tag;
+
+ dma = get_tracker_dma(ctrl->trackers, tag);
+ if (dma == NULL) {
+ spin_lock_irqsave(&ctrl->card->irq_lock, flags);
+ rsxx_disable_ier(ctrl->card, CR_INTR_DMA_ALL);
+ spin_unlock_irqrestore(&ctrl->card->irq_lock, flags);
+
+ dev_err(CARD_TO_DEV(ctrl->card),
+ "No tracker for tag %d "
+ "(idx %d id %d)\n",
+ tag, ctrl->status.idx, ctrl->id);
+ return;
+ }
+
+ dev_dbg(CARD_TO_DEV(ctrl->card),
+ "Completing DMA%d"
+ "(laddr x%x tag %d st: x%x cnt: x%04x) from idx %d.\n",
+ ctrl->id, dma->laddr, tag, status, count,
+ ctrl->status.idx);
+
+ atomic_dec(&ctrl->stats.hw_q_depth);
+
+ mod_timer(&ctrl->activity_timer,
+ jiffies + DMA_ACTIVITY_TIMEOUT);
+
+ if (status)
+ rsxx_handle_dma_error(ctrl, dma, status);
+ else
+ rsxx_complete_dma(ctrl, dma, 0);
+
+ push_tracker(ctrl->trackers, tag);
+
+ ctrl->status.idx = (ctrl->status.idx + 1) &
+ RSXX_CS_IDX_MASK;
+ ctrl->e_cnt++;
+
+ count = le16_to_cpu(hw_st_buf[ctrl->status.idx].count);
+ }
+
+ dma_intr_coal_auto_tune(ctrl->card);
+
+ if (atomic_read(&ctrl->stats.hw_q_depth) == 0)
+ del_timer_sync(&ctrl->activity_timer);
+
+ spin_lock_irqsave(&ctrl->card->irq_lock, flags);
+ rsxx_enable_ier(ctrl->card, CR_INTR_DMA(ctrl->id));
+ spin_unlock_irqrestore(&ctrl->card->irq_lock, flags);
+
+ spin_lock_bh(&ctrl->queue_lock);
+ if (ctrl->stats.sw_q_depth)
+ queue_work(ctrl->issue_wq, &ctrl->issue_dma_work);
+ spin_unlock_bh(&ctrl->queue_lock);
+}
+
+static void rsxx_schedule_issue(struct work_struct *work)
+{
+ struct rsxx_dma_ctrl *ctrl;
+
+ ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work);
+
+ mutex_lock(&ctrl->work_lock);
+ rsxx_issue_dmas(ctrl);
+ mutex_unlock(&ctrl->work_lock);
+}
+
+static void rsxx_schedule_done(struct work_struct *work)
+{
+ struct rsxx_dma_ctrl *ctrl;
+
+ ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work);
+
+ mutex_lock(&ctrl->work_lock);
+ rsxx_dma_done(ctrl);
+ mutex_unlock(&ctrl->work_lock);
+}
+
+static int rsxx_queue_discard(struct rsxx_cardinfo *card,
+ struct list_head *q,
+ unsigned int laddr,
+ rsxx_dma_cb cb,
+ void *cb_data)
+{
+ struct rsxx_dma *dma;
+
+ dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL);
+ if (!dma)
+ return -ENOMEM;
+
+ dma->cmd = HW_CMD_BLK_DISCARD;
+ dma->laddr = laddr;
+ dma->dma_addr = 0;
+ dma->sub_page.off = 0;
+ dma->sub_page.cnt = 0;
+ dma->page = NULL;
+ dma->pg_off = 0;
+ dma->cb = cb;
+ dma->cb_data = cb_data;
+
+ dev_dbg(CARD_TO_DEV(card), "Queuing[D] laddr %x\n", dma->laddr);
+
+ list_add_tail(&dma->list, q);
+
+ return 0;
+}
+
+static int rsxx_queue_dma(struct rsxx_cardinfo *card,
+ struct list_head *q,
+ int dir,
+ unsigned int dma_off,
+ unsigned int dma_len,
+ unsigned int laddr,
+ struct page *page,
+ unsigned int pg_off,
+ rsxx_dma_cb cb,
+ void *cb_data)
+{
+ struct rsxx_dma *dma;
+
+ dma = kmem_cache_alloc(rsxx_dma_pool, GFP_KERNEL);
+ if (!dma)
+ return -ENOMEM;
+
+ dma->cmd = dir ? HW_CMD_BLK_WRITE : HW_CMD_BLK_READ;
+ dma->laddr = laddr;
+ dma->sub_page.off = (dma_off >> 9);
+ dma->sub_page.cnt = (dma_len >> 9);
+ dma->page = page;
+ dma->pg_off = pg_off;
+ dma->cb = cb;
+ dma->cb_data = cb_data;
+
+ dev_dbg(CARD_TO_DEV(card),
+ "Queuing[%c] laddr %x off %d cnt %d page %p pg_off %d\n",
+ dir ? 'W' : 'R', dma->laddr, dma->sub_page.off,
+ dma->sub_page.cnt, dma->page, dma->pg_off);
+
+ /* Queue the DMA */
+ list_add_tail(&dma->list, q);
+
+ return 0;
+}
+
+int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
+ struct bio *bio,
+ atomic_t *n_dmas,
+ rsxx_dma_cb cb,
+ void *cb_data)
+{
+ struct list_head dma_list[RSXX_MAX_TARGETS];
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+ unsigned long long addr8;
+ unsigned int laddr;
+ unsigned int bv_len;
+ unsigned int bv_off;
+ unsigned int dma_off;
+ unsigned int dma_len;
+ int dma_cnt[RSXX_MAX_TARGETS];
+ int tgt;
+ int st;
+ int i;
+
+ addr8 = bio->bi_iter.bi_sector << 9; /* sectors are 512 bytes */
+ atomic_set(n_dmas, 0);
+
+ for (i = 0; i < card->n_targets; i++) {
+ INIT_LIST_HEAD(&dma_list[i]);
+ dma_cnt[i] = 0;
+ }
+
+ if (bio->bi_rw & REQ_DISCARD) {
+ bv_len = bio->bi_iter.bi_size;
+
+ while (bv_len > 0) {
+ tgt = rsxx_get_dma_tgt(card, addr8);
+ laddr = rsxx_addr8_to_laddr(addr8, card);
+
+ st = rsxx_queue_discard(card, &dma_list[tgt], laddr,
+ cb, cb_data);
+ if (st)
+ goto bvec_err;
+
+ dma_cnt[tgt]++;
+ atomic_inc(n_dmas);
+ addr8 += RSXX_HW_BLK_SIZE;
+ bv_len -= RSXX_HW_BLK_SIZE;
+ }
+ } else {
+ bio_for_each_segment(bvec, bio, iter) {
+ bv_len = bvec.bv_len;
+ bv_off = bvec.bv_offset;
+
+ while (bv_len > 0) {
+ tgt = rsxx_get_dma_tgt(card, addr8);
+ laddr = rsxx_addr8_to_laddr(addr8, card);
+ dma_off = addr8 & RSXX_HW_BLK_MASK;
+ dma_len = min(bv_len,
+ RSXX_HW_BLK_SIZE - dma_off);
+
+ st = rsxx_queue_dma(card, &dma_list[tgt],
+ bio_data_dir(bio),
+ dma_off, dma_len,
+ laddr, bvec.bv_page,
+ bv_off, cb, cb_data);
+ if (st)
+ goto bvec_err;
+
+ dma_cnt[tgt]++;
+ atomic_inc(n_dmas);
+ addr8 += dma_len;
+ bv_off += dma_len;
+ bv_len -= dma_len;
+ }
+ }
+ }
+
+ for (i = 0; i < card->n_targets; i++) {
+ if (!list_empty(&dma_list[i])) {
+ spin_lock_bh(&card->ctrl[i].queue_lock);
+ card->ctrl[i].stats.sw_q_depth += dma_cnt[i];
+ list_splice_tail(&dma_list[i], &card->ctrl[i].queue);
+ spin_unlock_bh(&card->ctrl[i].queue_lock);
+
+ queue_work(card->ctrl[i].issue_wq,
+ &card->ctrl[i].issue_dma_work);
+ }
+ }
+
+ return 0;
+
+bvec_err:
+ for (i = 0; i < card->n_targets; i++)
+ rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i],
+ FREE_DMA);
+
+ return st;
+}
+
+
+/*----------------- DMA Engine Initialization & Setup -------------------*/
+int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl)
+{
+ ctrl->status.buf = pci_alloc_consistent(dev, STATUS_BUFFER_SIZE8,
+ &ctrl->status.dma_addr);
+ ctrl->cmd.buf = pci_alloc_consistent(dev, COMMAND_BUFFER_SIZE8,
+ &ctrl->cmd.dma_addr);
+ if (ctrl->status.buf == NULL || ctrl->cmd.buf == NULL)
+ return -ENOMEM;
+
+ memset(ctrl->status.buf, 0xac, STATUS_BUFFER_SIZE8);
+ iowrite32(lower_32_bits(ctrl->status.dma_addr),
+ ctrl->regmap + SB_ADD_LO);
+ iowrite32(upper_32_bits(ctrl->status.dma_addr),
+ ctrl->regmap + SB_ADD_HI);
+
+ memset(ctrl->cmd.buf, 0x83, COMMAND_BUFFER_SIZE8);
+ iowrite32(lower_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_LO);
+ iowrite32(upper_32_bits(ctrl->cmd.dma_addr), ctrl->regmap + CB_ADD_HI);
+
+ ctrl->status.idx = ioread32(ctrl->regmap + HW_STATUS_CNT);
+ if (ctrl->status.idx > RSXX_MAX_OUTSTANDING_CMDS) {
+ dev_crit(&dev->dev, "Failed reading status cnt x%x\n",
+ ctrl->status.idx);
+ return -EINVAL;
+ }
+ iowrite32(ctrl->status.idx, ctrl->regmap + HW_STATUS_CNT);
+ iowrite32(ctrl->status.idx, ctrl->regmap + SW_STATUS_CNT);
+
+ ctrl->cmd.idx = ioread32(ctrl->regmap + HW_CMD_IDX);
+ if (ctrl->cmd.idx > RSXX_MAX_OUTSTANDING_CMDS) {
+ dev_crit(&dev->dev, "Failed reading cmd cnt x%x\n",
+ ctrl->status.idx);
+ return -EINVAL;
+ }
+ iowrite32(ctrl->cmd.idx, ctrl->regmap + HW_CMD_IDX);
+ iowrite32(ctrl->cmd.idx, ctrl->regmap + SW_CMD_IDX);
+
+ return 0;
+}
+
+static int rsxx_dma_ctrl_init(struct pci_dev *dev,
+ struct rsxx_dma_ctrl *ctrl)
+{
+ int i;
+ int st;
+
+ memset(&ctrl->stats, 0, sizeof(ctrl->stats));
+
+ ctrl->trackers = vmalloc(DMA_TRACKER_LIST_SIZE8);
+ if (!ctrl->trackers)
+ return -ENOMEM;
+
+ ctrl->trackers->head = 0;
+ for (i = 0; i < RSXX_MAX_OUTSTANDING_CMDS; i++) {
+ ctrl->trackers->list[i].next_tag = i + 1;
+ ctrl->trackers->list[i].dma = NULL;
+ }
+ ctrl->trackers->list[RSXX_MAX_OUTSTANDING_CMDS-1].next_tag = -1;
+ spin_lock_init(&ctrl->trackers->lock);
+
+ spin_lock_init(&ctrl->queue_lock);
+ mutex_init(&ctrl->work_lock);
+ INIT_LIST_HEAD(&ctrl->queue);
+
+ setup_timer(&ctrl->activity_timer, dma_engine_stalled,
+ (unsigned long)ctrl);
+
+ ctrl->issue_wq = alloc_ordered_workqueue(DRIVER_NAME"_issue", 0);
+ if (!ctrl->issue_wq)
+ return -ENOMEM;
+
+ ctrl->done_wq = alloc_ordered_workqueue(DRIVER_NAME"_done", 0);
+ if (!ctrl->done_wq)
+ return -ENOMEM;
+
+ INIT_WORK(&ctrl->issue_dma_work, rsxx_schedule_issue);
+ INIT_WORK(&ctrl->dma_done_work, rsxx_schedule_done);
+
+ st = rsxx_hw_buffers_init(dev, ctrl);
+ if (st)
+ return st;
+
+ return 0;
+}
+
+static int rsxx_dma_stripe_setup(struct rsxx_cardinfo *card,
+ unsigned int stripe_size8)
+{
+ if (!is_power_of_2(stripe_size8)) {
+ dev_err(CARD_TO_DEV(card),
+ "stripe_size is NOT a power of 2!\n");
+ return -EINVAL;
+ }
+
+ card->_stripe.lower_mask = stripe_size8 - 1;
+
+ card->_stripe.upper_mask = ~(card->_stripe.lower_mask);
+ card->_stripe.upper_shift = ffs(card->n_targets) - 1;
+
+ card->_stripe.target_mask = card->n_targets - 1;
+ card->_stripe.target_shift = ffs(stripe_size8) - 1;
+
+ dev_dbg(CARD_TO_DEV(card), "_stripe.lower_mask = x%016llx\n",
+ card->_stripe.lower_mask);
+ dev_dbg(CARD_TO_DEV(card), "_stripe.upper_shift = x%016llx\n",
+ card->_stripe.upper_shift);
+ dev_dbg(CARD_TO_DEV(card), "_stripe.upper_mask = x%016llx\n",
+ card->_stripe.upper_mask);
+ dev_dbg(CARD_TO_DEV(card), "_stripe.target_mask = x%016llx\n",
+ card->_stripe.target_mask);
+ dev_dbg(CARD_TO_DEV(card), "_stripe.target_shift = x%016llx\n",
+ card->_stripe.target_shift);
+
+ return 0;
+}
+
+int rsxx_dma_configure(struct rsxx_cardinfo *card)
+{
+ u32 intr_coal;
+
+ intr_coal = dma_intr_coal_val(card->config.data.intr_coal.mode,
+ card->config.data.intr_coal.count,
+ card->config.data.intr_coal.latency);
+ iowrite32(intr_coal, card->regmap + INTR_COAL);
+
+ return rsxx_dma_stripe_setup(card, card->config.data.stripe_size);
+}
+
+int rsxx_dma_setup(struct rsxx_cardinfo *card)
+{
+ unsigned long flags;
+ int st;
+ int i;
+
+ dev_info(CARD_TO_DEV(card),
+ "Initializing %d DMA targets\n",
+ card->n_targets);
+
+ /* Regmap is divided up into 4K chunks. One for each DMA channel */
+ for (i = 0; i < card->n_targets; i++)
+ card->ctrl[i].regmap = card->regmap + (i * 4096);
+
+ card->dma_fault = 0;
+
+ /* Reset the DMA queues */
+ rsxx_dma_queue_reset(card);
+
+ /************* Setup DMA Control *************/
+ for (i = 0; i < card->n_targets; i++) {
+ st = rsxx_dma_ctrl_init(card->dev, &card->ctrl[i]);
+ if (st)
+ goto failed_dma_setup;
+
+ card->ctrl[i].card = card;
+ card->ctrl[i].id = i;
+ }
+
+ card->scrub_hard = 1;
+
+ if (card->config_valid)
+ rsxx_dma_configure(card);
+
+ /* Enable the interrupts after all setup has completed. */
+ for (i = 0; i < card->n_targets; i++) {
+ spin_lock_irqsave(&card->irq_lock, flags);
+ rsxx_enable_ier_and_isr(card, CR_INTR_DMA(i));
+ spin_unlock_irqrestore(&card->irq_lock, flags);
+ }
+
+ return 0;
+
+failed_dma_setup:
+ for (i = 0; i < card->n_targets; i++) {
+ struct rsxx_dma_ctrl *ctrl = &card->ctrl[i];
+
+ if (ctrl->issue_wq) {
+ destroy_workqueue(ctrl->issue_wq);
+ ctrl->issue_wq = NULL;
+ }
+
+ if (ctrl->done_wq) {
+ destroy_workqueue(ctrl->done_wq);
+ ctrl->done_wq = NULL;
+ }
+
+ if (ctrl->trackers)
+ vfree(ctrl->trackers);
+
+ if (ctrl->status.buf)
+ pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8,
+ ctrl->status.buf,
+ ctrl->status.dma_addr);
+ if (ctrl->cmd.buf)
+ pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8,
+ ctrl->cmd.buf, ctrl->cmd.dma_addr);
+ }
+
+ return st;
+}
+
+int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl)
+{
+ struct rsxx_dma *dma;
+ int i;
+ int cnt = 0;
+
+ /* Clean up issued DMAs */
+ for (i = 0; i < RSXX_MAX_OUTSTANDING_CMDS; i++) {
+ dma = get_tracker_dma(ctrl->trackers, i);
+ if (dma) {
+ atomic_dec(&ctrl->stats.hw_q_depth);
+ rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+ push_tracker(ctrl->trackers, i);
+ cnt++;
+ }
+ }
+
+ return cnt;
+}
+
+void rsxx_dma_destroy(struct rsxx_cardinfo *card)
+{
+ struct rsxx_dma_ctrl *ctrl;
+ int i;
+
+ for (i = 0; i < card->n_targets; i++) {
+ ctrl = &card->ctrl[i];
+
+ if (ctrl->issue_wq) {
+ destroy_workqueue(ctrl->issue_wq);
+ ctrl->issue_wq = NULL;
+ }
+
+ if (ctrl->done_wq) {
+ destroy_workqueue(ctrl->done_wq);
+ ctrl->done_wq = NULL;
+ }
+
+ if (timer_pending(&ctrl->activity_timer))
+ del_timer_sync(&ctrl->activity_timer);
+
+ /* Clean up the DMA queue */
+ spin_lock_bh(&ctrl->queue_lock);
+ rsxx_cleanup_dma_queue(ctrl, &ctrl->queue, COMPLETE_DMA);
+ spin_unlock_bh(&ctrl->queue_lock);
+
+ rsxx_dma_cancel(ctrl);
+
+ vfree(ctrl->trackers);
+
+ pci_free_consistent(card->dev, STATUS_BUFFER_SIZE8,
+ ctrl->status.buf, ctrl->status.dma_addr);
+ pci_free_consistent(card->dev, COMMAND_BUFFER_SIZE8,
+ ctrl->cmd.buf, ctrl->cmd.dma_addr);
+ }
+}
+
+int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
+{
+ int i;
+ int j;
+ int cnt;
+ struct rsxx_dma *dma;
+ struct list_head *issued_dmas;
+
+ issued_dmas = kzalloc(sizeof(*issued_dmas) * card->n_targets,
+ GFP_KERNEL);
+ if (!issued_dmas)
+ return -ENOMEM;
+
+ for (i = 0; i < card->n_targets; i++) {
+ INIT_LIST_HEAD(&issued_dmas[i]);
+ cnt = 0;
+ for (j = 0; j < RSXX_MAX_OUTSTANDING_CMDS; j++) {
+ dma = get_tracker_dma(card->ctrl[i].trackers, j);
+ if (dma == NULL)
+ continue;
+
+ if (dma->cmd == HW_CMD_BLK_WRITE)
+ card->ctrl[i].stats.writes_issued--;
+ else if (dma->cmd == HW_CMD_BLK_DISCARD)
+ card->ctrl[i].stats.discards_issued--;
+ else
+ card->ctrl[i].stats.reads_issued--;
+
+ if (dma->cmd != HW_CMD_BLK_DISCARD) {
+ pci_unmap_page(card->dev, dma->dma_addr,
+ get_dma_size(dma),
+ dma->cmd == HW_CMD_BLK_WRITE ?
+ PCI_DMA_TODEVICE :
+ PCI_DMA_FROMDEVICE);
+ }
+
+ list_add_tail(&dma->list, &issued_dmas[i]);
+ push_tracker(card->ctrl[i].trackers, j);
+ cnt++;
+ }
+
+ spin_lock_bh(&card->ctrl[i].queue_lock);
+ list_splice(&issued_dmas[i], &card->ctrl[i].queue);
+
+ atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth);
+ card->ctrl[i].stats.sw_q_depth += cnt;
+ card->ctrl[i].e_cnt = 0;
+ spin_unlock_bh(&card->ctrl[i].queue_lock);
+ }
+
+ kfree(issued_dmas);
+
+ return 0;
+}
+
+int rsxx_dma_init(void)
+{
+ rsxx_dma_pool = KMEM_CACHE(rsxx_dma, SLAB_HWCACHE_ALIGN);
+ if (!rsxx_dma_pool)
+ return -ENOMEM;
+
+ return 0;
+}
+
+
+void rsxx_dma_cleanup(void)
+{
+ kmem_cache_destroy(rsxx_dma_pool);
+}
+
diff --git a/drivers/block/rsxx/rsxx.h b/drivers/block/rsxx/rsxx.h
new file mode 100644
index 000000000..24ba3642b
--- /dev/null
+++ b/drivers/block/rsxx/rsxx.h
@@ -0,0 +1,47 @@
+/*
+* Filename: rsxx.h
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+* Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#ifndef __RSXX_H__
+#define __RSXX_H__
+
+/*----------------- IOCTL Definitions -------------------*/
+
+#define RSXX_MAX_DATA 8
+
+struct rsxx_reg_access {
+ __u32 addr;
+ __u32 cnt;
+ __u32 stat;
+ __u32 stream;
+ __u32 data[RSXX_MAX_DATA];
+};
+
+#define RSXX_MAX_REG_CNT (RSXX_MAX_DATA * (sizeof(__u32)))
+
+#define RSXX_IOC_MAGIC 'r'
+
+#define RSXX_GETREG _IOWR(RSXX_IOC_MAGIC, 0x20, struct rsxx_reg_access)
+#define RSXX_SETREG _IOWR(RSXX_IOC_MAGIC, 0x21, struct rsxx_reg_access)
+
+#endif /* __RSXX_H_ */
diff --git a/drivers/block/rsxx/rsxx_cfg.h b/drivers/block/rsxx/rsxx_cfg.h
new file mode 100644
index 000000000..f384c9438
--- /dev/null
+++ b/drivers/block/rsxx/rsxx_cfg.h
@@ -0,0 +1,72 @@
+/*
+* Filename: rsXX_cfg.h
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+* Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#ifndef __RSXX_CFG_H__
+#define __RSXX_CFG_H__
+
+/* NOTE: Config values will be saved in network byte order (i.e. Big endian) */
+#include <linux/types.h>
+
+/*
+ * The card config version must match the driver's expected version. If it does
+ * not, the DMA interfaces will not be attached and the user will need to
+ * initialize/upgrade the card configuration using the card config utility.
+ */
+#define RSXX_CFG_VERSION 4
+
+struct card_cfg_hdr {
+ __u32 version;
+ __u32 crc;
+};
+
+struct card_cfg_data {
+ __u32 block_size;
+ __u32 stripe_size;
+ __u32 vendor_id;
+ __u32 cache_order;
+ struct {
+ __u32 mode; /* Disabled, manual, auto-tune... */
+ __u32 count; /* Number of intr to coalesce */
+ __u32 latency;/* Max wait time (in ns) */
+ } intr_coal;
+};
+
+struct rsxx_card_cfg {
+ struct card_cfg_hdr hdr;
+ struct card_cfg_data data;
+};
+
+/* Vendor ID Values */
+#define RSXX_VENDOR_ID_IBM 0
+#define RSXX_VENDOR_ID_DSI 1
+#define RSXX_VENDOR_COUNT 2
+
+/* Interrupt Coalescing Values */
+#define RSXX_INTR_COAL_DISABLED 0
+#define RSXX_INTR_COAL_EXPLICIT 1
+#define RSXX_INTR_COAL_AUTO_TUNE 2
+
+
+#endif /* __RSXX_CFG_H__ */
+
diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h
new file mode 100644
index 000000000..6bbc64d0f
--- /dev/null
+++ b/drivers/block/rsxx/rsxx_priv.h
@@ -0,0 +1,434 @@
+/*
+* Filename: rsxx_priv.h
+*
+*
+* Authors: Joshua Morris <josh.h.morris@us.ibm.com>
+* Philip Kelleher <pjk1939@linux.vnet.ibm.com>
+*
+* (C) Copyright 2013 IBM Corporation
+*
+* This program is free software; you can redistribute it and/or
+* modify it under the terms of the GNU General Public License as
+* published by the Free Software Foundation; either version 2 of the
+* License, or (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful, but
+* WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+* General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software Foundation,
+* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+
+#ifndef __RSXX_PRIV_H__
+#define __RSXX_PRIV_H__
+
+#include <linux/version.h>
+#include <linux/semaphore.h>
+
+#include <linux/fs.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <linux/pci.h>
+#include <linux/spinlock.h>
+#include <linux/sysfs.h>
+#include <linux/workqueue.h>
+#include <linux/bio.h>
+#include <linux/vmalloc.h>
+#include <linux/timer.h>
+#include <linux/ioctl.h>
+#include <linux/delay.h>
+
+#include "rsxx.h"
+#include "rsxx_cfg.h"
+
+struct proc_cmd;
+
+#define PCI_DEVICE_ID_FS70_FLASH 0x04A9
+#define PCI_DEVICE_ID_FS80_FLASH 0x04AA
+
+#define RS70_PCI_REV_SUPPORTED 4
+
+#define DRIVER_NAME "rsxx"
+#define DRIVER_VERSION "4.0.3.2516"
+
+/* Block size is 4096 */
+#define RSXX_HW_BLK_SHIFT 12
+#define RSXX_HW_BLK_SIZE (1 << RSXX_HW_BLK_SHIFT)
+#define RSXX_HW_BLK_MASK (RSXX_HW_BLK_SIZE - 1)
+
+#define MAX_CREG_DATA8 32
+#define LOG_BUF_SIZE8 128
+
+#define RSXX_MAX_OUTSTANDING_CMDS 255
+#define RSXX_CS_IDX_MASK 0xff
+
+#define STATUS_BUFFER_SIZE8 4096
+#define COMMAND_BUFFER_SIZE8 4096
+
+#define RSXX_MAX_TARGETS 8
+
+struct dma_tracker_list;
+
+/* DMA Command/Status Buffer structure */
+struct rsxx_cs_buffer {
+ dma_addr_t dma_addr;
+ void *buf;
+ u32 idx;
+};
+
+struct rsxx_dma_stats {
+ u32 crc_errors;
+ u32 hard_errors;
+ u32 soft_errors;
+ u32 writes_issued;
+ u32 writes_failed;
+ u32 reads_issued;
+ u32 reads_failed;
+ u32 reads_retried;
+ u32 discards_issued;
+ u32 discards_failed;
+ u32 done_rescheduled;
+ u32 issue_rescheduled;
+ u32 dma_sw_err;
+ u32 dma_hw_fault;
+ u32 dma_cancelled;
+ u32 sw_q_depth; /* Number of DMAs on the SW queue. */
+ atomic_t hw_q_depth; /* Number of DMAs queued to HW. */
+};
+
+struct rsxx_dma_ctrl {
+ struct rsxx_cardinfo *card;
+ int id;
+ void __iomem *regmap;
+ struct rsxx_cs_buffer status;
+ struct rsxx_cs_buffer cmd;
+ u16 e_cnt;
+ spinlock_t queue_lock;
+ struct list_head queue;
+ struct workqueue_struct *issue_wq;
+ struct work_struct issue_dma_work;
+ struct workqueue_struct *done_wq;
+ struct work_struct dma_done_work;
+ struct timer_list activity_timer;
+ struct dma_tracker_list *trackers;
+ struct rsxx_dma_stats stats;
+ struct mutex work_lock;
+};
+
+struct rsxx_cardinfo {
+ struct pci_dev *dev;
+ unsigned int halt;
+ unsigned int eeh_state;
+
+ void __iomem *regmap;
+ spinlock_t irq_lock;
+ unsigned int isr_mask;
+ unsigned int ier_mask;
+
+ struct rsxx_card_cfg config;
+ int config_valid;
+
+ /* Embedded CPU Communication */
+ struct {
+ spinlock_t lock;
+ bool active;
+ struct creg_cmd *active_cmd;
+ struct workqueue_struct *creg_wq;
+ struct work_struct done_work;
+ struct list_head queue;
+ unsigned int q_depth;
+ /* Cache the creg status to prevent ioreads */
+ struct {
+ u32 stat;
+ u32 failed_cancel_timer;
+ u32 creg_timeout;
+ } creg_stats;
+ struct timer_list cmd_timer;
+ struct mutex reset_lock;
+ int reset;
+ } creg_ctrl;
+
+ struct {
+ char tmp[MAX_CREG_DATA8];
+ char buf[LOG_BUF_SIZE8]; /* terminated */
+ int buf_len;
+ } log;
+
+ struct workqueue_struct *event_wq;
+ struct work_struct event_work;
+ unsigned int state;
+ u64 size8;
+
+ /* Lock the device attach/detach function */
+ struct mutex dev_lock;
+
+ /* Block Device Variables */
+ bool bdev_attached;
+ int disk_id;
+ int major;
+ struct request_queue *queue;
+ struct gendisk *gendisk;
+ struct {
+ /* Used to convert a byte address to a device address. */
+ u64 lower_mask;
+ u64 upper_shift;
+ u64 upper_mask;
+ u64 target_mask;
+ u64 target_shift;
+ } _stripe;
+ unsigned int dma_fault;
+
+ int scrub_hard;
+
+ int n_targets;
+ struct rsxx_dma_ctrl *ctrl;
+
+ struct dentry *debugfs_dir;
+};
+
+enum rsxx_pci_regmap {
+ HWID = 0x00, /* Hardware Identification Register */
+ SCRATCH = 0x04, /* Scratch/Debug Register */
+ RESET = 0x08, /* Reset Register */
+ ISR = 0x10, /* Interrupt Status Register */
+ IER = 0x14, /* Interrupt Enable Register */
+ IPR = 0x18, /* Interrupt Poll Register */
+ CB_ADD_LO = 0x20, /* Command Host Buffer Address [31:0] */
+ CB_ADD_HI = 0x24, /* Command Host Buffer Address [63:32]*/
+ HW_CMD_IDX = 0x28, /* Hardware Processed Command Index */
+ SW_CMD_IDX = 0x2C, /* Software Processed Command Index */
+ SB_ADD_LO = 0x30, /* Status Host Buffer Address [31:0] */
+ SB_ADD_HI = 0x34, /* Status Host Buffer Address [63:32] */
+ HW_STATUS_CNT = 0x38, /* Hardware Status Counter */
+ SW_STATUS_CNT = 0x3C, /* Deprecated */
+ CREG_CMD = 0x40, /* CPU Command Register */
+ CREG_ADD = 0x44, /* CPU Address Register */
+ CREG_CNT = 0x48, /* CPU Count Register */
+ CREG_STAT = 0x4C, /* CPU Status Register */
+ CREG_DATA0 = 0x50, /* CPU Data Registers */
+ CREG_DATA1 = 0x54,
+ CREG_DATA2 = 0x58,
+ CREG_DATA3 = 0x5C,
+ CREG_DATA4 = 0x60,
+ CREG_DATA5 = 0x64,
+ CREG_DATA6 = 0x68,
+ CREG_DATA7 = 0x6c,
+ INTR_COAL = 0x70, /* Interrupt Coalescing Register */
+ HW_ERROR = 0x74, /* Card Error Register */
+ PCI_DEBUG0 = 0x78, /* PCI Debug Registers */
+ PCI_DEBUG1 = 0x7C,
+ PCI_DEBUG2 = 0x80,
+ PCI_DEBUG3 = 0x84,
+ PCI_DEBUG4 = 0x88,
+ PCI_DEBUG5 = 0x8C,
+ PCI_DEBUG6 = 0x90,
+ PCI_DEBUG7 = 0x94,
+ PCI_POWER_THROTTLE = 0x98,
+ PERF_CTRL = 0x9c,
+ PERF_TIMER_LO = 0xa0,
+ PERF_TIMER_HI = 0xa4,
+ PERF_RD512_LO = 0xa8,
+ PERF_RD512_HI = 0xac,
+ PERF_WR512_LO = 0xb0,
+ PERF_WR512_HI = 0xb4,
+ PCI_RECONFIG = 0xb8,
+};
+
+enum rsxx_intr {
+ CR_INTR_DMA0 = 0x00000001,
+ CR_INTR_CREG = 0x00000002,
+ CR_INTR_DMA1 = 0x00000004,
+ CR_INTR_EVENT = 0x00000008,
+ CR_INTR_DMA2 = 0x00000010,
+ CR_INTR_DMA3 = 0x00000020,
+ CR_INTR_DMA4 = 0x00000040,
+ CR_INTR_DMA5 = 0x00000080,
+ CR_INTR_DMA6 = 0x00000100,
+ CR_INTR_DMA7 = 0x00000200,
+ CR_INTR_ALL_C = 0x0000003f,
+ CR_INTR_ALL_G = 0x000003ff,
+ CR_INTR_DMA_ALL = 0x000003f5,
+ CR_INTR_ALL = 0xffffffff,
+};
+
+static inline int CR_INTR_DMA(int N)
+{
+ static const unsigned int _CR_INTR_DMA[] = {
+ CR_INTR_DMA0, CR_INTR_DMA1, CR_INTR_DMA2, CR_INTR_DMA3,
+ CR_INTR_DMA4, CR_INTR_DMA5, CR_INTR_DMA6, CR_INTR_DMA7
+ };
+ return _CR_INTR_DMA[N];
+}
+enum rsxx_pci_reset {
+ DMA_QUEUE_RESET = 0x00000001,
+};
+
+enum rsxx_hw_fifo_flush {
+ RSXX_FLUSH_BUSY = 0x00000002,
+ RSXX_FLUSH_TIMEOUT = 0x00000004,
+};
+
+enum rsxx_pci_revision {
+ RSXX_DISCARD_SUPPORT = 2,
+ RSXX_EEH_SUPPORT = 3,
+};
+
+enum rsxx_creg_cmd {
+ CREG_CMD_TAG_MASK = 0x0000FF00,
+ CREG_OP_WRITE = 0x000000C0,
+ CREG_OP_READ = 0x000000E0,
+};
+
+enum rsxx_creg_addr {
+ CREG_ADD_CARD_CMD = 0x80001000,
+ CREG_ADD_CARD_STATE = 0x80001004,
+ CREG_ADD_CARD_SIZE = 0x8000100c,
+ CREG_ADD_CAPABILITIES = 0x80001050,
+ CREG_ADD_LOG = 0x80002000,
+ CREG_ADD_NUM_TARGETS = 0x80003000,
+ CREG_ADD_CRAM = 0xA0000000,
+ CREG_ADD_CONFIG = 0xB0000000,
+};
+
+enum rsxx_creg_card_cmd {
+ CARD_CMD_STARTUP = 1,
+ CARD_CMD_SHUTDOWN = 2,
+ CARD_CMD_LOW_LEVEL_FORMAT = 3,
+ CARD_CMD_FPGA_RECONFIG_BR = 4,
+ CARD_CMD_FPGA_RECONFIG_MAIN = 5,
+ CARD_CMD_BACKUP = 6,
+ CARD_CMD_RESET = 7,
+ CARD_CMD_deprecated = 8,
+ CARD_CMD_UNINITIALIZE = 9,
+ CARD_CMD_DSTROY_EMERGENCY = 10,
+ CARD_CMD_DSTROY_NORMAL = 11,
+ CARD_CMD_DSTROY_EXTENDED = 12,
+ CARD_CMD_DSTROY_ABORT = 13,
+};
+
+enum rsxx_card_state {
+ CARD_STATE_SHUTDOWN = 0x00000001,
+ CARD_STATE_STARTING = 0x00000002,
+ CARD_STATE_FORMATTING = 0x00000004,
+ CARD_STATE_UNINITIALIZED = 0x00000008,
+ CARD_STATE_GOOD = 0x00000010,
+ CARD_STATE_SHUTTING_DOWN = 0x00000020,
+ CARD_STATE_FAULT = 0x00000040,
+ CARD_STATE_RD_ONLY_FAULT = 0x00000080,
+ CARD_STATE_DSTROYING = 0x00000100,
+};
+
+enum rsxx_led {
+ LED_DEFAULT = 0x0,
+ LED_IDENTIFY = 0x1,
+ LED_SOAK = 0x2,
+};
+
+enum rsxx_creg_flash_lock {
+ CREG_FLASH_LOCK = 1,
+ CREG_FLASH_UNLOCK = 2,
+};
+
+enum rsxx_card_capabilities {
+ CARD_CAP_SUBPAGE_WRITES = 0x00000080,
+};
+
+enum rsxx_creg_stat {
+ CREG_STAT_STATUS_MASK = 0x00000003,
+ CREG_STAT_SUCCESS = 0x1,
+ CREG_STAT_ERROR = 0x2,
+ CREG_STAT_CHAR_PENDING = 0x00000004, /* Character I/O pending bit */
+ CREG_STAT_LOG_PENDING = 0x00000008, /* HW log message pending bit */
+ CREG_STAT_TAG_MASK = 0x0000ff00,
+};
+
+enum rsxx_dma_finish {
+ FREE_DMA = 0x0,
+ COMPLETE_DMA = 0x1,
+};
+
+static inline unsigned int CREG_DATA(int N)
+{
+ return CREG_DATA0 + (N << 2);
+}
+
+/*----------------- Convenient Log Wrappers -------------------*/
+#define CARD_TO_DEV(__CARD) (&(__CARD)->dev->dev)
+
+/***** config.c *****/
+int rsxx_load_config(struct rsxx_cardinfo *card);
+
+/***** core.c *****/
+void rsxx_enable_ier(struct rsxx_cardinfo *card, unsigned int intr);
+void rsxx_disable_ier(struct rsxx_cardinfo *card, unsigned int intr);
+void rsxx_enable_ier_and_isr(struct rsxx_cardinfo *card,
+ unsigned int intr);
+void rsxx_disable_ier_and_isr(struct rsxx_cardinfo *card,
+ unsigned int intr);
+
+/***** dev.c *****/
+int rsxx_attach_dev(struct rsxx_cardinfo *card);
+void rsxx_detach_dev(struct rsxx_cardinfo *card);
+int rsxx_setup_dev(struct rsxx_cardinfo *card);
+void rsxx_destroy_dev(struct rsxx_cardinfo *card);
+int rsxx_dev_init(void);
+void rsxx_dev_cleanup(void);
+
+/***** dma.c ****/
+typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card,
+ void *cb_data,
+ unsigned int status);
+int rsxx_dma_setup(struct rsxx_cardinfo *card);
+void rsxx_dma_destroy(struct rsxx_cardinfo *card);
+int rsxx_dma_init(void);
+int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl,
+ struct list_head *q,
+ unsigned int done);
+int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl);
+void rsxx_dma_cleanup(void);
+void rsxx_dma_queue_reset(struct rsxx_cardinfo *card);
+int rsxx_dma_configure(struct rsxx_cardinfo *card);
+int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
+ struct bio *bio,
+ atomic_t *n_dmas,
+ rsxx_dma_cb cb,
+ void *cb_data);
+int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl);
+int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card);
+int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card);
+
+/***** cregs.c *****/
+int rsxx_creg_write(struct rsxx_cardinfo *card, u32 addr,
+ unsigned int size8,
+ void *data,
+ int byte_stream);
+int rsxx_creg_read(struct rsxx_cardinfo *card,
+ u32 addr,
+ unsigned int size8,
+ void *data,
+ int byte_stream);
+int rsxx_read_hw_log(struct rsxx_cardinfo *card);
+int rsxx_get_card_state(struct rsxx_cardinfo *card,
+ unsigned int *state);
+int rsxx_get_card_size8(struct rsxx_cardinfo *card, u64 *size8);
+int rsxx_get_num_targets(struct rsxx_cardinfo *card,
+ unsigned int *n_targets);
+int rsxx_get_card_capabilities(struct rsxx_cardinfo *card,
+ u32 *capabilities);
+int rsxx_issue_card_cmd(struct rsxx_cardinfo *card, u32 cmd);
+int rsxx_creg_setup(struct rsxx_cardinfo *card);
+void rsxx_creg_destroy(struct rsxx_cardinfo *card);
+int rsxx_creg_init(void);
+void rsxx_creg_cleanup(void);
+int rsxx_reg_access(struct rsxx_cardinfo *card,
+ struct rsxx_reg_access __user *ucmd,
+ int read);
+void rsxx_eeh_save_issued_creg(struct rsxx_cardinfo *card);
+void rsxx_kick_creg_queue(struct rsxx_cardinfo *card);
+
+
+
+#endif /* __DRIVERS_BLOCK_RSXX_H__ */
diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
new file mode 100644
index 000000000..1e46eb230
--- /dev/null
+++ b/drivers/block/skd_main.c
@@ -0,0 +1,5411 @@
+/* Copyright 2012 STEC, Inc.
+ *
+ * This file is licensed under the terms of the 3-clause
+ * BSD License (http://opensource.org/licenses/BSD-3-Clause)
+ * or the GNU GPL-2.0 (http://www.gnu.org/licenses/gpl-2.0.html),
+ * at your option. Both licenses are also available in the LICENSE file
+ * distributed with this project. This file may not be copied, modified,
+ * or distributed except in accordance with those terms.
+ * Gordoni Waidhofer <gwaidhofer@stec-inc.com>
+ * Initial Driver Design!
+ * Thomas Swann <tswann@stec-inc.com>
+ * Interrupt handling.
+ * Ramprasad Chinthekindi <rchinthekindi@stec-inc.com>
+ * biomode implementation.
+ * Akhil Bhansali <abhansali@stec-inc.com>
+ * Added support for DISCARD / FLUSH and FUA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/compiler.h>
+#include <linux/workqueue.h>
+#include <linux/bitops.h>
+#include <linux/delay.h>
+#include <linux/time.h>
+#include <linux/hdreg.h>
+#include <linux/dma-mapping.h>
+#include <linux/completion.h>
+#include <linux/scatterlist.h>
+#include <linux/version.h>
+#include <linux/err.h>
+#include <linux/scatterlist.h>
+#include <linux/aer.h>
+#include <linux/ctype.h>
+#include <linux/wait.h>
+#include <linux/uio.h>
+#include <scsi/scsi.h>
+#include <scsi/sg.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
+#include <asm/unaligned.h>
+
+#include "skd_s1120.h"
+
+static int skd_dbg_level;
+static int skd_isr_comp_limit = 4;
+
+enum {
+ STEC_LINK_2_5GTS = 0,
+ STEC_LINK_5GTS = 1,
+ STEC_LINK_8GTS = 2,
+ STEC_LINK_UNKNOWN = 0xFF
+};
+
+enum {
+ SKD_FLUSH_INITIALIZER,
+ SKD_FLUSH_ZERO_SIZE_FIRST,
+ SKD_FLUSH_DATA_SECOND,
+};
+
+#define SKD_ASSERT(expr) \
+ do { \
+ if (unlikely(!(expr))) { \
+ pr_err("Assertion failed! %s,%s,%s,line=%d\n", \
+ # expr, __FILE__, __func__, __LINE__); \
+ } \
+ } while (0)
+
+#define DRV_NAME "skd"
+#define DRV_VERSION "2.2.1"
+#define DRV_BUILD_ID "0260"
+#define PFX DRV_NAME ": "
+#define DRV_BIN_VERSION 0x100
+#define DRV_VER_COMPL "2.2.1." DRV_BUILD_ID
+
+MODULE_AUTHOR("bug-reports: support@stec-inc.com");
+MODULE_LICENSE("Dual BSD/GPL");
+
+MODULE_DESCRIPTION("STEC s1120 PCIe SSD block driver (b" DRV_BUILD_ID ")");
+MODULE_VERSION(DRV_VERSION "-" DRV_BUILD_ID);
+
+#define PCI_VENDOR_ID_STEC 0x1B39
+#define PCI_DEVICE_ID_S1120 0x0001
+
+#define SKD_FUA_NV (1 << 1)
+#define SKD_MINORS_PER_DEVICE 16
+
+#define SKD_MAX_QUEUE_DEPTH 200u
+
+#define SKD_PAUSE_TIMEOUT (5 * 1000)
+
+#define SKD_N_FITMSG_BYTES (512u)
+
+#define SKD_N_SPECIAL_CONTEXT 32u
+#define SKD_N_SPECIAL_FITMSG_BYTES (128u)
+
+/* SG elements are 32 bytes, so we can make this 4096 and still be under the
+ * 128KB limit. That allows 4096*4K = 16M xfer size
+ */
+#define SKD_N_SG_PER_REQ_DEFAULT 256u
+#define SKD_N_SG_PER_SPECIAL 256u
+
+#define SKD_N_COMPLETION_ENTRY 256u
+#define SKD_N_READ_CAP_BYTES (8u)
+
+#define SKD_N_INTERNAL_BYTES (512u)
+
+/* 5 bits of uniqifier, 0xF800 */
+#define SKD_ID_INCR (0x400)
+#define SKD_ID_TABLE_MASK (3u << 8u)
+#define SKD_ID_RW_REQUEST (0u << 8u)
+#define SKD_ID_INTERNAL (1u << 8u)
+#define SKD_ID_SPECIAL_REQUEST (2u << 8u)
+#define SKD_ID_FIT_MSG (3u << 8u)
+#define SKD_ID_SLOT_MASK 0x00FFu
+#define SKD_ID_SLOT_AND_TABLE_MASK 0x03FFu
+
+#define SKD_N_TIMEOUT_SLOT 4u
+#define SKD_TIMEOUT_SLOT_MASK 3u
+
+#define SKD_N_MAX_SECTORS 2048u
+
+#define SKD_MAX_RETRIES 2u
+
+#define SKD_TIMER_SECONDS(seconds) (seconds)
+#define SKD_TIMER_MINUTES(minutes) ((minutes) * (60))
+
+#define INQ_STD_NBYTES 36
+#define SKD_DISCARD_CDB_LENGTH 24
+
+enum skd_drvr_state {
+ SKD_DRVR_STATE_LOAD,
+ SKD_DRVR_STATE_IDLE,
+ SKD_DRVR_STATE_BUSY,
+ SKD_DRVR_STATE_STARTING,
+ SKD_DRVR_STATE_ONLINE,
+ SKD_DRVR_STATE_PAUSING,
+ SKD_DRVR_STATE_PAUSED,
+ SKD_DRVR_STATE_DRAINING_TIMEOUT,
+ SKD_DRVR_STATE_RESTARTING,
+ SKD_DRVR_STATE_RESUMING,
+ SKD_DRVR_STATE_STOPPING,
+ SKD_DRVR_STATE_FAULT,
+ SKD_DRVR_STATE_DISAPPEARED,
+ SKD_DRVR_STATE_PROTOCOL_MISMATCH,
+ SKD_DRVR_STATE_BUSY_ERASE,
+ SKD_DRVR_STATE_BUSY_SANITIZE,
+ SKD_DRVR_STATE_BUSY_IMMINENT,
+ SKD_DRVR_STATE_WAIT_BOOT,
+ SKD_DRVR_STATE_SYNCING,
+};
+
+#define SKD_WAIT_BOOT_TIMO SKD_TIMER_SECONDS(90u)
+#define SKD_STARTING_TIMO SKD_TIMER_SECONDS(8u)
+#define SKD_RESTARTING_TIMO SKD_TIMER_MINUTES(4u)
+#define SKD_DRAINING_TIMO SKD_TIMER_SECONDS(6u)
+#define SKD_BUSY_TIMO SKD_TIMER_MINUTES(20u)
+#define SKD_STARTED_BUSY_TIMO SKD_TIMER_SECONDS(60u)
+#define SKD_START_WAIT_SECONDS 90u
+
+enum skd_req_state {
+ SKD_REQ_STATE_IDLE,
+ SKD_REQ_STATE_SETUP,
+ SKD_REQ_STATE_BUSY,
+ SKD_REQ_STATE_COMPLETED,
+ SKD_REQ_STATE_TIMEOUT,
+ SKD_REQ_STATE_ABORTED,
+};
+
+enum skd_fit_msg_state {
+ SKD_MSG_STATE_IDLE,
+ SKD_MSG_STATE_BUSY,
+};
+
+enum skd_check_status_action {
+ SKD_CHECK_STATUS_REPORT_GOOD,
+ SKD_CHECK_STATUS_REPORT_SMART_ALERT,
+ SKD_CHECK_STATUS_REQUEUE_REQUEST,
+ SKD_CHECK_STATUS_REPORT_ERROR,
+ SKD_CHECK_STATUS_BUSY_IMMINENT,
+};
+
+struct skd_fitmsg_context {
+ enum skd_fit_msg_state state;
+
+ struct skd_fitmsg_context *next;
+
+ u32 id;
+ u16 outstanding;
+
+ u32 length;
+ u32 offset;
+
+ u8 *msg_buf;
+ dma_addr_t mb_dma_address;
+};
+
+struct skd_request_context {
+ enum skd_req_state state;
+
+ struct skd_request_context *next;
+
+ u16 id;
+ u32 fitmsg_id;
+
+ struct request *req;
+ u8 flush_cmd;
+ u8 discard_page;
+
+ u32 timeout_stamp;
+ u8 sg_data_dir;
+ struct scatterlist *sg;
+ u32 n_sg;
+ u32 sg_byte_count;
+
+ struct fit_sg_descriptor *sksg_list;
+ dma_addr_t sksg_dma_address;
+
+ struct fit_completion_entry_v1 completion;
+
+ struct fit_comp_error_info err_info;
+
+};
+#define SKD_DATA_DIR_HOST_TO_CARD 1
+#define SKD_DATA_DIR_CARD_TO_HOST 2
+#define SKD_DATA_DIR_NONE 3 /* especially for DISCARD requests. */
+
+struct skd_special_context {
+ struct skd_request_context req;
+
+ u8 orphaned;
+
+ void *data_buf;
+ dma_addr_t db_dma_address;
+
+ u8 *msg_buf;
+ dma_addr_t mb_dma_address;
+};
+
+struct skd_sg_io {
+ fmode_t mode;
+ void __user *argp;
+
+ struct sg_io_hdr sg;
+
+ u8 cdb[16];
+
+ u32 dxfer_len;
+ u32 iovcnt;
+ struct sg_iovec *iov;
+ struct sg_iovec no_iov_iov;
+
+ struct skd_special_context *skspcl;
+};
+
+typedef enum skd_irq_type {
+ SKD_IRQ_LEGACY,
+ SKD_IRQ_MSI,
+ SKD_IRQ_MSIX
+} skd_irq_type_t;
+
+#define SKD_MAX_BARS 2
+
+struct skd_device {
+ volatile void __iomem *mem_map[SKD_MAX_BARS];
+ resource_size_t mem_phys[SKD_MAX_BARS];
+ u32 mem_size[SKD_MAX_BARS];
+
+ skd_irq_type_t irq_type;
+ u32 msix_count;
+ struct skd_msix_entry *msix_entries;
+
+ struct pci_dev *pdev;
+ int pcie_error_reporting_is_enabled;
+
+ spinlock_t lock;
+ struct gendisk *disk;
+ struct request_queue *queue;
+ struct device *class_dev;
+ int gendisk_on;
+ int sync_done;
+
+ atomic_t device_count;
+ u32 devno;
+ u32 major;
+ char name[32];
+ char isr_name[30];
+
+ enum skd_drvr_state state;
+ u32 drive_state;
+
+ u32 in_flight;
+ u32 cur_max_queue_depth;
+ u32 queue_low_water_mark;
+ u32 dev_max_queue_depth;
+
+ u32 num_fitmsg_context;
+ u32 num_req_context;
+
+ u32 timeout_slot[SKD_N_TIMEOUT_SLOT];
+ u32 timeout_stamp;
+ struct skd_fitmsg_context *skmsg_free_list;
+ struct skd_fitmsg_context *skmsg_table;
+
+ struct skd_request_context *skreq_free_list;
+ struct skd_request_context *skreq_table;
+
+ struct skd_special_context *skspcl_free_list;
+ struct skd_special_context *skspcl_table;
+
+ struct skd_special_context internal_skspcl;
+ u32 read_cap_blocksize;
+ u32 read_cap_last_lba;
+ int read_cap_is_valid;
+ int inquiry_is_valid;
+ u8 inq_serial_num[13]; /*12 chars plus null term */
+ u8 id_str[80]; /* holds a composite name (pci + sernum) */
+
+ u8 skcomp_cycle;
+ u32 skcomp_ix;
+ struct fit_completion_entry_v1 *skcomp_table;
+ struct fit_comp_error_info *skerr_table;
+ dma_addr_t cq_dma_address;
+
+ wait_queue_head_t waitq;
+
+ struct timer_list timer;
+ u32 timer_countdown;
+ u32 timer_substate;
+
+ int n_special;
+ int sgs_per_request;
+ u32 last_mtd;
+
+ u32 proto_ver;
+
+ int dbg_level;
+ u32 connect_time_stamp;
+ int connect_retries;
+#define SKD_MAX_CONNECT_RETRIES 16
+ u32 drive_jiffies;
+
+ u32 timo_slot;
+
+
+ struct work_struct completion_worker;
+};
+
+#define SKD_WRITEL(DEV, VAL, OFF) skd_reg_write32(DEV, VAL, OFF)
+#define SKD_READL(DEV, OFF) skd_reg_read32(DEV, OFF)
+#define SKD_WRITEQ(DEV, VAL, OFF) skd_reg_write64(DEV, VAL, OFF)
+
+static inline u32 skd_reg_read32(struct skd_device *skdev, u32 offset)
+{
+ u32 val;
+
+ if (likely(skdev->dbg_level < 2))
+ return readl(skdev->mem_map[1] + offset);
+ else {
+ barrier();
+ val = readl(skdev->mem_map[1] + offset);
+ barrier();
+ pr_debug("%s:%s:%d offset %x = %x\n",
+ skdev->name, __func__, __LINE__, offset, val);
+ return val;
+ }
+
+}
+
+static inline void skd_reg_write32(struct skd_device *skdev, u32 val,
+ u32 offset)
+{
+ if (likely(skdev->dbg_level < 2)) {
+ writel(val, skdev->mem_map[1] + offset);
+ barrier();
+ } else {
+ barrier();
+ writel(val, skdev->mem_map[1] + offset);
+ barrier();
+ pr_debug("%s:%s:%d offset %x = %x\n",
+ skdev->name, __func__, __LINE__, offset, val);
+ }
+}
+
+static inline void skd_reg_write64(struct skd_device *skdev, u64 val,
+ u32 offset)
+{
+ if (likely(skdev->dbg_level < 2)) {
+ writeq(val, skdev->mem_map[1] + offset);
+ barrier();
+ } else {
+ barrier();
+ writeq(val, skdev->mem_map[1] + offset);
+ barrier();
+ pr_debug("%s:%s:%d offset %x = %016llx\n",
+ skdev->name, __func__, __LINE__, offset, val);
+ }
+}
+
+
+#define SKD_IRQ_DEFAULT SKD_IRQ_MSI
+static int skd_isr_type = SKD_IRQ_DEFAULT;
+
+module_param(skd_isr_type, int, 0444);
+MODULE_PARM_DESC(skd_isr_type, "Interrupt type capability."
+ " (0==legacy, 1==MSI, 2==MSI-X, default==1)");
+
+#define SKD_MAX_REQ_PER_MSG_DEFAULT 1
+static int skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT;
+
+module_param(skd_max_req_per_msg, int, 0444);
+MODULE_PARM_DESC(skd_max_req_per_msg,
+ "Maximum SCSI requests packed in a single message."
+ " (1-14, default==1)");
+
+#define SKD_MAX_QUEUE_DEPTH_DEFAULT 64
+#define SKD_MAX_QUEUE_DEPTH_DEFAULT_STR "64"
+static int skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT;
+
+module_param(skd_max_queue_depth, int, 0444);
+MODULE_PARM_DESC(skd_max_queue_depth,
+ "Maximum SCSI requests issued to s1120."
+ " (1-200, default==" SKD_MAX_QUEUE_DEPTH_DEFAULT_STR ")");
+
+static int skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT;
+module_param(skd_sgs_per_request, int, 0444);
+MODULE_PARM_DESC(skd_sgs_per_request,
+ "Maximum SG elements per block request."
+ " (1-4096, default==256)");
+
+static int skd_max_pass_thru = SKD_N_SPECIAL_CONTEXT;
+module_param(skd_max_pass_thru, int, 0444);
+MODULE_PARM_DESC(skd_max_pass_thru,
+ "Maximum SCSI pass-thru at a time." " (1-50, default==32)");
+
+module_param(skd_dbg_level, int, 0444);
+MODULE_PARM_DESC(skd_dbg_level, "s1120 debug level (0,1,2)");
+
+module_param(skd_isr_comp_limit, int, 0444);
+MODULE_PARM_DESC(skd_isr_comp_limit, "s1120 isr comp limit (0=none) default=4");
+
+/* Major device number dynamically assigned. */
+static u32 skd_major;
+
+static void skd_destruct(struct skd_device *skdev);
+static const struct block_device_operations skd_blockdev_ops;
+static void skd_send_fitmsg(struct skd_device *skdev,
+ struct skd_fitmsg_context *skmsg);
+static void skd_send_special_fitmsg(struct skd_device *skdev,
+ struct skd_special_context *skspcl);
+static void skd_request_fn(struct request_queue *rq);
+static void skd_end_request(struct skd_device *skdev,
+ struct skd_request_context *skreq, int error);
+static int skd_preop_sg_list(struct skd_device *skdev,
+ struct skd_request_context *skreq);
+static void skd_postop_sg_list(struct skd_device *skdev,
+ struct skd_request_context *skreq);
+
+static void skd_restart_device(struct skd_device *skdev);
+static int skd_quiesce_dev(struct skd_device *skdev);
+static int skd_unquiesce_dev(struct skd_device *skdev);
+static void skd_release_special(struct skd_device *skdev,
+ struct skd_special_context *skspcl);
+static void skd_disable_interrupts(struct skd_device *skdev);
+static void skd_isr_fwstate(struct skd_device *skdev);
+static void skd_recover_requests(struct skd_device *skdev, int requeue);
+static void skd_soft_reset(struct skd_device *skdev);
+
+static const char *skd_name(struct skd_device *skdev);
+const char *skd_drive_state_to_str(int state);
+const char *skd_skdev_state_to_str(enum skd_drvr_state state);
+static void skd_log_skdev(struct skd_device *skdev, const char *event);
+static void skd_log_skmsg(struct skd_device *skdev,
+ struct skd_fitmsg_context *skmsg, const char *event);
+static void skd_log_skreq(struct skd_device *skdev,
+ struct skd_request_context *skreq, const char *event);
+
+/*
+ *****************************************************************************
+ * READ/WRITE REQUESTS
+ *****************************************************************************
+ */
+static void skd_fail_all_pending(struct skd_device *skdev)
+{
+ struct request_queue *q = skdev->queue;
+ struct request *req;
+
+ for (;; ) {
+ req = blk_peek_request(q);
+ if (req == NULL)
+ break;
+ blk_start_request(req);
+ __blk_end_request_all(req, -EIO);
+ }
+}
+
+static void
+skd_prep_rw_cdb(struct skd_scsi_request *scsi_req,
+ int data_dir, unsigned lba,
+ unsigned count)
+{
+ if (data_dir == READ)
+ scsi_req->cdb[0] = 0x28;
+ else
+ scsi_req->cdb[0] = 0x2a;
+
+ scsi_req->cdb[1] = 0;
+ scsi_req->cdb[2] = (lba & 0xff000000) >> 24;
+ scsi_req->cdb[3] = (lba & 0xff0000) >> 16;
+ scsi_req->cdb[4] = (lba & 0xff00) >> 8;
+ scsi_req->cdb[5] = (lba & 0xff);
+ scsi_req->cdb[6] = 0;
+ scsi_req->cdb[7] = (count & 0xff00) >> 8;
+ scsi_req->cdb[8] = count & 0xff;
+ scsi_req->cdb[9] = 0;
+}
+
+static void
+skd_prep_zerosize_flush_cdb(struct skd_scsi_request *scsi_req,
+ struct skd_request_context *skreq)
+{
+ skreq->flush_cmd = 1;
+
+ scsi_req->cdb[0] = 0x35;
+ scsi_req->cdb[1] = 0;
+ scsi_req->cdb[2] = 0;
+ scsi_req->cdb[3] = 0;
+ scsi_req->cdb[4] = 0;
+ scsi_req->cdb[5] = 0;
+ scsi_req->cdb[6] = 0;
+ scsi_req->cdb[7] = 0;
+ scsi_req->cdb[8] = 0;
+ scsi_req->cdb[9] = 0;
+}
+
+static void
+skd_prep_discard_cdb(struct skd_scsi_request *scsi_req,
+ struct skd_request_context *skreq,
+ struct page *page,
+ u32 lba, u32 count)
+{
+ char *buf;
+ unsigned long len;
+ struct request *req;
+
+ buf = page_address(page);
+ len = SKD_DISCARD_CDB_LENGTH;
+
+ scsi_req->cdb[0] = UNMAP;
+ scsi_req->cdb[8] = len;
+
+ put_unaligned_be16(6 + 16, &buf[0]);
+ put_unaligned_be16(16, &buf[2]);
+ put_unaligned_be64(lba, &buf[8]);
+ put_unaligned_be32(count, &buf[16]);
+
+ req = skreq->req;
+ blk_add_request_payload(req, page, len);
+}
+
+static void skd_request_fn_not_online(struct request_queue *q);
+
+static void skd_request_fn(struct request_queue *q)
+{
+ struct skd_device *skdev = q->queuedata;
+ struct skd_fitmsg_context *skmsg = NULL;
+ struct fit_msg_hdr *fmh = NULL;
+ struct skd_request_context *skreq;
+ struct request *req = NULL;
+ struct skd_scsi_request *scsi_req;
+ struct page *page;
+ unsigned long io_flags;
+ int error;
+ u32 lba;
+ u32 count;
+ int data_dir;
+ u32 be_lba;
+ u32 be_count;
+ u64 be_dmaa;
+ u64 cmdctxt;
+ u32 timo_slot;
+ void *cmd_ptr;
+ int flush, fua;
+
+ if (skdev->state != SKD_DRVR_STATE_ONLINE) {
+ skd_request_fn_not_online(q);
+ return;
+ }
+
+ if (blk_queue_stopped(skdev->queue)) {
+ if (skdev->skmsg_free_list == NULL ||
+ skdev->skreq_free_list == NULL ||
+ skdev->in_flight >= skdev->queue_low_water_mark)
+ /* There is still some kind of shortage */
+ return;
+
+ queue_flag_clear(QUEUE_FLAG_STOPPED, skdev->queue);
+ }
+
+ /*
+ * Stop conditions:
+ * - There are no more native requests
+ * - There are already the maximum number of requests in progress
+ * - There are no more skd_request_context entries
+ * - There are no more FIT msg buffers
+ */
+ for (;; ) {
+
+ flush = fua = 0;
+
+ req = blk_peek_request(q);
+
+ /* Are there any native requests to start? */
+ if (req == NULL)
+ break;
+
+ lba = (u32)blk_rq_pos(req);
+ count = blk_rq_sectors(req);
+ data_dir = rq_data_dir(req);
+ io_flags = req->cmd_flags;
+
+ if (io_flags & REQ_FLUSH)
+ flush++;
+
+ if (io_flags & REQ_FUA)
+ fua++;
+
+ pr_debug("%s:%s:%d new req=%p lba=%u(0x%x) "
+ "count=%u(0x%x) dir=%d\n",
+ skdev->name, __func__, __LINE__,
+ req, lba, lba, count, count, data_dir);
+
+ /* At this point we know there is a request */
+
+ /* Are too many requets already in progress? */
+ if (skdev->in_flight >= skdev->cur_max_queue_depth) {
+ pr_debug("%s:%s:%d qdepth %d, limit %d\n",
+ skdev->name, __func__, __LINE__,
+ skdev->in_flight, skdev->cur_max_queue_depth);
+ break;
+ }
+
+ /* Is a skd_request_context available? */
+ skreq = skdev->skreq_free_list;
+ if (skreq == NULL) {
+ pr_debug("%s:%s:%d Out of req=%p\n",
+ skdev->name, __func__, __LINE__, q);
+ break;
+ }
+ SKD_ASSERT(skreq->state == SKD_REQ_STATE_IDLE);
+ SKD_ASSERT((skreq->id & SKD_ID_INCR) == 0);
+
+ /* Now we check to see if we can get a fit msg */
+ if (skmsg == NULL) {
+ if (skdev->skmsg_free_list == NULL) {
+ pr_debug("%s:%s:%d Out of msg\n",
+ skdev->name, __func__, __LINE__);
+ break;
+ }
+ }
+
+ skreq->flush_cmd = 0;
+ skreq->n_sg = 0;
+ skreq->sg_byte_count = 0;
+ skreq->discard_page = 0;
+
+ /*
+ * OK to now dequeue request from q.
+ *
+ * At this point we are comitted to either start or reject
+ * the native request. Note that skd_request_context is
+ * available but is still at the head of the free list.
+ */
+ blk_start_request(req);
+ skreq->req = req;
+ skreq->fitmsg_id = 0;
+
+ /* Either a FIT msg is in progress or we have to start one. */
+ if (skmsg == NULL) {
+ /* Are there any FIT msg buffers available? */
+ skmsg = skdev->skmsg_free_list;
+ if (skmsg == NULL) {
+ pr_debug("%s:%s:%d Out of msg skdev=%p\n",
+ skdev->name, __func__, __LINE__,
+ skdev);
+ break;
+ }
+ SKD_ASSERT(skmsg->state == SKD_MSG_STATE_IDLE);
+ SKD_ASSERT((skmsg->id & SKD_ID_INCR) == 0);
+
+ skdev->skmsg_free_list = skmsg->next;
+
+ skmsg->state = SKD_MSG_STATE_BUSY;
+ skmsg->id += SKD_ID_INCR;
+
+ /* Initialize the FIT msg header */
+ fmh = (struct fit_msg_hdr *)skmsg->msg_buf;
+ memset(fmh, 0, sizeof(*fmh));
+ fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT;
+ skmsg->length = sizeof(*fmh);
+ }
+
+ skreq->fitmsg_id = skmsg->id;
+
+ /*
+ * Note that a FIT msg may have just been started
+ * but contains no SoFIT requests yet.
+ */
+
+ /*
+ * Transcode the request, checking as we go. The outcome of
+ * the transcoding is represented by the error variable.
+ */
+ cmd_ptr = &skmsg->msg_buf[skmsg->length];
+ memset(cmd_ptr, 0, 32);
+
+ be_lba = cpu_to_be32(lba);
+ be_count = cpu_to_be32(count);
+ be_dmaa = cpu_to_be64((u64)skreq->sksg_dma_address);
+ cmdctxt = skreq->id + SKD_ID_INCR;
+
+ scsi_req = cmd_ptr;
+ scsi_req->hdr.tag = cmdctxt;
+ scsi_req->hdr.sg_list_dma_address = be_dmaa;
+
+ if (data_dir == READ)
+ skreq->sg_data_dir = SKD_DATA_DIR_CARD_TO_HOST;
+ else
+ skreq->sg_data_dir = SKD_DATA_DIR_HOST_TO_CARD;
+
+ if (io_flags & REQ_DISCARD) {
+ page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
+ if (!page) {
+ pr_err("request_fn:Page allocation failed.\n");
+ skd_end_request(skdev, skreq, -ENOMEM);
+ break;
+ }
+ skreq->discard_page = 1;
+ req->completion_data = page;
+ skd_prep_discard_cdb(scsi_req, skreq, page, lba, count);
+
+ } else if (flush == SKD_FLUSH_ZERO_SIZE_FIRST) {
+ skd_prep_zerosize_flush_cdb(scsi_req, skreq);
+ SKD_ASSERT(skreq->flush_cmd == 1);
+
+ } else {
+ skd_prep_rw_cdb(scsi_req, data_dir, lba, count);
+ }
+
+ if (fua)
+ scsi_req->cdb[1] |= SKD_FUA_NV;
+
+ if (!req->bio)
+ goto skip_sg;
+
+ error = skd_preop_sg_list(skdev, skreq);
+
+ if (error != 0) {
+ /*
+ * Complete the native request with error.
+ * Note that the request context is still at the
+ * head of the free list, and that the SoFIT request
+ * was encoded into the FIT msg buffer but the FIT
+ * msg length has not been updated. In short, the
+ * only resource that has been allocated but might
+ * not be used is that the FIT msg could be empty.
+ */
+ pr_debug("%s:%s:%d error Out\n",
+ skdev->name, __func__, __LINE__);
+ skd_end_request(skdev, skreq, error);
+ continue;
+ }
+
+skip_sg:
+ scsi_req->hdr.sg_list_len_bytes =
+ cpu_to_be32(skreq->sg_byte_count);
+
+ /* Complete resource allocations. */
+ skdev->skreq_free_list = skreq->next;
+ skreq->state = SKD_REQ_STATE_BUSY;
+ skreq->id += SKD_ID_INCR;
+
+ skmsg->length += sizeof(struct skd_scsi_request);
+ fmh->num_protocol_cmds_coalesced++;
+
+ /*
+ * Update the active request counts.
+ * Capture the timeout timestamp.
+ */
+ skreq->timeout_stamp = skdev->timeout_stamp;
+ timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK;
+ skdev->timeout_slot[timo_slot]++;
+ skdev->in_flight++;
+ pr_debug("%s:%s:%d req=0x%x busy=%d\n",
+ skdev->name, __func__, __LINE__,
+ skreq->id, skdev->in_flight);
+
+ /*
+ * If the FIT msg buffer is full send it.
+ */
+ if (skmsg->length >= SKD_N_FITMSG_BYTES ||
+ fmh->num_protocol_cmds_coalesced >= skd_max_req_per_msg) {
+ skd_send_fitmsg(skdev, skmsg);
+ skmsg = NULL;
+ fmh = NULL;
+ }
+ }
+
+ /*
+ * Is a FIT msg in progress? If it is empty put the buffer back
+ * on the free list. If it is non-empty send what we got.
+ * This minimizes latency when there are fewer requests than
+ * what fits in a FIT msg.
+ */
+ if (skmsg != NULL) {
+ /* Bigger than just a FIT msg header? */
+ if (skmsg->length > sizeof(struct fit_msg_hdr)) {
+ pr_debug("%s:%s:%d sending msg=%p, len %d\n",
+ skdev->name, __func__, __LINE__,
+ skmsg, skmsg->length);
+ skd_send_fitmsg(skdev, skmsg);
+ } else {
+ /*
+ * The FIT msg is empty. It means we got started
+ * on the msg, but the requests were rejected.
+ */
+ skmsg->state = SKD_MSG_STATE_IDLE;
+ skmsg->id += SKD_ID_INCR;
+ skmsg->next = skdev->skmsg_free_list;
+ skdev->skmsg_free_list = skmsg;
+ }
+ skmsg = NULL;
+ fmh = NULL;
+ }
+
+ /*
+ * If req is non-NULL it means there is something to do but
+ * we are out of a resource.
+ */
+ if (req)
+ blk_stop_queue(skdev->queue);
+}
+
+static void skd_end_request(struct skd_device *skdev,
+ struct skd_request_context *skreq, int error)
+{
+ struct request *req = skreq->req;
+ unsigned int io_flags = req->cmd_flags;
+
+ if ((io_flags & REQ_DISCARD) &&
+ (skreq->discard_page == 1)) {
+ pr_debug("%s:%s:%d, free the page!",
+ skdev->name, __func__, __LINE__);
+ __free_page(req->completion_data);
+ }
+
+ if (unlikely(error)) {
+ struct request *req = skreq->req;
+ char *cmd = (rq_data_dir(req) == READ) ? "read" : "write";
+ u32 lba = (u32)blk_rq_pos(req);
+ u32 count = blk_rq_sectors(req);
+
+ pr_err("(%s): Error cmd=%s sect=%u count=%u id=0x%x\n",
+ skd_name(skdev), cmd, lba, count, skreq->id);
+ } else
+ pr_debug("%s:%s:%d id=0x%x error=%d\n",
+ skdev->name, __func__, __LINE__, skreq->id, error);
+
+ __blk_end_request_all(skreq->req, error);
+}
+
+static int skd_preop_sg_list(struct skd_device *skdev,
+ struct skd_request_context *skreq)
+{
+ struct request *req = skreq->req;
+ int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD;
+ int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE;
+ struct scatterlist *sg = &skreq->sg[0];
+ int n_sg;
+ int i;
+
+ skreq->sg_byte_count = 0;
+
+ /* SKD_ASSERT(skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD ||
+ skreq->sg_data_dir == SKD_DATA_DIR_CARD_TO_HOST); */
+
+ n_sg = blk_rq_map_sg(skdev->queue, req, sg);
+ if (n_sg <= 0)
+ return -EINVAL;
+
+ /*
+ * Map scatterlist to PCI bus addresses.
+ * Note PCI might change the number of entries.
+ */
+ n_sg = pci_map_sg(skdev->pdev, sg, n_sg, pci_dir);
+ if (n_sg <= 0)
+ return -EINVAL;
+
+ SKD_ASSERT(n_sg <= skdev->sgs_per_request);
+
+ skreq->n_sg = n_sg;
+
+ for (i = 0; i < n_sg; i++) {
+ struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
+ u32 cnt = sg_dma_len(&sg[i]);
+ uint64_t dma_addr = sg_dma_address(&sg[i]);
+
+ sgd->control = FIT_SGD_CONTROL_NOT_LAST;
+ sgd->byte_count = cnt;
+ skreq->sg_byte_count += cnt;
+ sgd->host_side_addr = dma_addr;
+ sgd->dev_side_addr = 0;
+ }
+
+ skreq->sksg_list[n_sg - 1].next_desc_ptr = 0LL;
+ skreq->sksg_list[n_sg - 1].control = FIT_SGD_CONTROL_LAST;
+
+ if (unlikely(skdev->dbg_level > 1)) {
+ pr_debug("%s:%s:%d skreq=%x sksg_list=%p sksg_dma=%llx\n",
+ skdev->name, __func__, __LINE__,
+ skreq->id, skreq->sksg_list, skreq->sksg_dma_address);
+ for (i = 0; i < n_sg; i++) {
+ struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
+ pr_debug("%s:%s:%d sg[%d] count=%u ctrl=0x%x "
+ "addr=0x%llx next=0x%llx\n",
+ skdev->name, __func__, __LINE__,
+ i, sgd->byte_count, sgd->control,
+ sgd->host_side_addr, sgd->next_desc_ptr);
+ }
+ }
+
+ return 0;
+}
+
+static void skd_postop_sg_list(struct skd_device *skdev,
+ struct skd_request_context *skreq)
+{
+ int writing = skreq->sg_data_dir == SKD_DATA_DIR_HOST_TO_CARD;
+ int pci_dir = writing ? PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE;
+
+ /*
+ * restore the next ptr for next IO request so we
+ * don't have to set it every time.
+ */
+ skreq->sksg_list[skreq->n_sg - 1].next_desc_ptr =
+ skreq->sksg_dma_address +
+ ((skreq->n_sg) * sizeof(struct fit_sg_descriptor));
+ pci_unmap_sg(skdev->pdev, &skreq->sg[0], skreq->n_sg, pci_dir);
+}
+
+static void skd_request_fn_not_online(struct request_queue *q)
+{
+ struct skd_device *skdev = q->queuedata;
+ int error;
+
+ SKD_ASSERT(skdev->state != SKD_DRVR_STATE_ONLINE);
+
+ skd_log_skdev(skdev, "req_not_online");
+ switch (skdev->state) {
+ case SKD_DRVR_STATE_PAUSING:
+ case SKD_DRVR_STATE_PAUSED:
+ case SKD_DRVR_STATE_STARTING:
+ case SKD_DRVR_STATE_RESTARTING:
+ case SKD_DRVR_STATE_WAIT_BOOT:
+ /* In case of starting, we haven't started the queue,
+ * so we can't get here... but requests are
+ * possibly hanging out waiting for us because we
+ * reported the dev/skd0 already. They'll wait
+ * forever if connect doesn't complete.
+ * What to do??? delay dev/skd0 ??
+ */
+ case SKD_DRVR_STATE_BUSY:
+ case SKD_DRVR_STATE_BUSY_IMMINENT:
+ case SKD_DRVR_STATE_BUSY_ERASE:
+ case SKD_DRVR_STATE_DRAINING_TIMEOUT:
+ return;
+
+ case SKD_DRVR_STATE_BUSY_SANITIZE:
+ case SKD_DRVR_STATE_STOPPING:
+ case SKD_DRVR_STATE_SYNCING:
+ case SKD_DRVR_STATE_FAULT:
+ case SKD_DRVR_STATE_DISAPPEARED:
+ default:
+ error = -EIO;
+ break;
+ }
+
+ /* If we get here, terminate all pending block requeusts
+ * with EIO and any scsi pass thru with appropriate sense
+ */
+
+ skd_fail_all_pending(skdev);
+}
+
+/*
+ *****************************************************************************
+ * TIMER
+ *****************************************************************************
+ */
+
+static void skd_timer_tick_not_online(struct skd_device *skdev);
+
+static void skd_timer_tick(ulong arg)
+{
+ struct skd_device *skdev = (struct skd_device *)arg;
+
+ u32 timo_slot;
+ u32 overdue_timestamp;
+ unsigned long reqflags;
+ u32 state;
+
+ if (skdev->state == SKD_DRVR_STATE_FAULT)
+ /* The driver has declared fault, and we want it to
+ * stay that way until driver is reloaded.
+ */
+ return;
+
+ spin_lock_irqsave(&skdev->lock, reqflags);
+
+ state = SKD_READL(skdev, FIT_STATUS);
+ state &= FIT_SR_DRIVE_STATE_MASK;
+ if (state != skdev->drive_state)
+ skd_isr_fwstate(skdev);
+
+ if (skdev->state != SKD_DRVR_STATE_ONLINE) {
+ skd_timer_tick_not_online(skdev);
+ goto timer_func_out;
+ }
+ skdev->timeout_stamp++;
+ timo_slot = skdev->timeout_stamp & SKD_TIMEOUT_SLOT_MASK;
+
+ /*
+ * All requests that happened during the previous use of
+ * this slot should be done by now. The previous use was
+ * over 7 seconds ago.
+ */
+ if (skdev->timeout_slot[timo_slot] == 0)
+ goto timer_func_out;
+
+ /* Something is overdue */
+ overdue_timestamp = skdev->timeout_stamp - SKD_N_TIMEOUT_SLOT;
+
+ pr_debug("%s:%s:%d found %d timeouts, draining busy=%d\n",
+ skdev->name, __func__, __LINE__,
+ skdev->timeout_slot[timo_slot], skdev->in_flight);
+ pr_err("(%s): Overdue IOs (%d), busy %d\n",
+ skd_name(skdev), skdev->timeout_slot[timo_slot],
+ skdev->in_flight);
+
+ skdev->timer_countdown = SKD_DRAINING_TIMO;
+ skdev->state = SKD_DRVR_STATE_DRAINING_TIMEOUT;
+ skdev->timo_slot = timo_slot;
+ blk_stop_queue(skdev->queue);
+
+timer_func_out:
+ mod_timer(&skdev->timer, (jiffies + HZ));
+
+ spin_unlock_irqrestore(&skdev->lock, reqflags);
+}
+
+static void skd_timer_tick_not_online(struct skd_device *skdev)
+{
+ switch (skdev->state) {
+ case SKD_DRVR_STATE_IDLE:
+ case SKD_DRVR_STATE_LOAD:
+ break;
+ case SKD_DRVR_STATE_BUSY_SANITIZE:
+ pr_debug("%s:%s:%d drive busy sanitize[%x], driver[%x]\n",
+ skdev->name, __func__, __LINE__,
+ skdev->drive_state, skdev->state);
+ /* If we've been in sanitize for 3 seconds, we figure we're not
+ * going to get anymore completions, so recover requests now
+ */
+ if (skdev->timer_countdown > 0) {
+ skdev->timer_countdown--;
+ return;
+ }
+ skd_recover_requests(skdev, 0);
+ break;
+
+ case SKD_DRVR_STATE_BUSY:
+ case SKD_DRVR_STATE_BUSY_IMMINENT:
+ case SKD_DRVR_STATE_BUSY_ERASE:
+ pr_debug("%s:%s:%d busy[%x], countdown=%d\n",
+ skdev->name, __func__, __LINE__,
+ skdev->state, skdev->timer_countdown);
+ if (skdev->timer_countdown > 0) {
+ skdev->timer_countdown--;
+ return;
+ }
+ pr_debug("%s:%s:%d busy[%x], timedout=%d, restarting device.",
+ skdev->name, __func__, __LINE__,
+ skdev->state, skdev->timer_countdown);
+ skd_restart_device(skdev);
+ break;
+
+ case SKD_DRVR_STATE_WAIT_BOOT:
+ case SKD_DRVR_STATE_STARTING:
+ if (skdev->timer_countdown > 0) {
+ skdev->timer_countdown--;
+ return;
+ }
+ /* For now, we fault the drive. Could attempt resets to
+ * revcover at some point. */
+ skdev->state = SKD_DRVR_STATE_FAULT;
+
+ pr_err("(%s): DriveFault Connect Timeout (%x)\n",
+ skd_name(skdev), skdev->drive_state);
+
+ /*start the queue so we can respond with error to requests */
+ /* wakeup anyone waiting for startup complete */
+ blk_start_queue(skdev->queue);
+ skdev->gendisk_on = -1;
+ wake_up_interruptible(&skdev->waitq);
+ break;
+
+ case SKD_DRVR_STATE_ONLINE:
+ /* shouldn't get here. */
+ break;
+
+ case SKD_DRVR_STATE_PAUSING:
+ case SKD_DRVR_STATE_PAUSED:
+ break;
+
+ case SKD_DRVR_STATE_DRAINING_TIMEOUT:
+ pr_debug("%s:%s:%d "
+ "draining busy [%d] tick[%d] qdb[%d] tmls[%d]\n",
+ skdev->name, __func__, __LINE__,
+ skdev->timo_slot,
+ skdev->timer_countdown,
+ skdev->in_flight,
+ skdev->timeout_slot[skdev->timo_slot]);
+ /* if the slot has cleared we can let the I/O continue */
+ if (skdev->timeout_slot[skdev->timo_slot] == 0) {
+ pr_debug("%s:%s:%d Slot drained, starting queue.\n",
+ skdev->name, __func__, __LINE__);
+ skdev->state = SKD_DRVR_STATE_ONLINE;
+ blk_start_queue(skdev->queue);
+ return;
+ }
+ if (skdev->timer_countdown > 0) {
+ skdev->timer_countdown--;
+ return;
+ }
+ skd_restart_device(skdev);
+ break;
+
+ case SKD_DRVR_STATE_RESTARTING:
+ if (skdev->timer_countdown > 0) {
+ skdev->timer_countdown--;
+ return;
+ }
+ /* For now, we fault the drive. Could attempt resets to
+ * revcover at some point. */
+ skdev->state = SKD_DRVR_STATE_FAULT;
+ pr_err("(%s): DriveFault Reconnect Timeout (%x)\n",
+ skd_name(skdev), skdev->drive_state);
+
+ /*
+ * Recovering does two things:
+ * 1. completes IO with error
+ * 2. reclaims dma resources
+ * When is it safe to recover requests?
+ * - if the drive state is faulted
+ * - if the state is still soft reset after out timeout
+ * - if the drive registers are dead (state = FF)
+ * If it is "unsafe", we still need to recover, so we will
+ * disable pci bus mastering and disable our interrupts.
+ */
+
+ if ((skdev->drive_state == FIT_SR_DRIVE_SOFT_RESET) ||
+ (skdev->drive_state == FIT_SR_DRIVE_FAULT) ||
+ (skdev->drive_state == FIT_SR_DRIVE_STATE_MASK))
+ /* It never came out of soft reset. Try to
+ * recover the requests and then let them
+ * fail. This is to mitigate hung processes. */
+ skd_recover_requests(skdev, 0);
+ else {
+ pr_err("(%s): Disable BusMaster (%x)\n",
+ skd_name(skdev), skdev->drive_state);
+ pci_disable_device(skdev->pdev);
+ skd_disable_interrupts(skdev);
+ skd_recover_requests(skdev, 0);
+ }
+
+ /*start the queue so we can respond with error to requests */
+ /* wakeup anyone waiting for startup complete */
+ blk_start_queue(skdev->queue);
+ skdev->gendisk_on = -1;
+ wake_up_interruptible(&skdev->waitq);
+ break;
+
+ case SKD_DRVR_STATE_RESUMING:
+ case SKD_DRVR_STATE_STOPPING:
+ case SKD_DRVR_STATE_SYNCING:
+ case SKD_DRVR_STATE_FAULT:
+ case SKD_DRVR_STATE_DISAPPEARED:
+ default:
+ break;
+ }
+}
+
+static int skd_start_timer(struct skd_device *skdev)
+{
+ int rc;
+
+ init_timer(&skdev->timer);
+ setup_timer(&skdev->timer, skd_timer_tick, (ulong)skdev);
+
+ rc = mod_timer(&skdev->timer, (jiffies + HZ));
+ if (rc)
+ pr_err("%s: failed to start timer %d\n",
+ __func__, rc);
+ return rc;
+}
+
+static void skd_kill_timer(struct skd_device *skdev)
+{
+ del_timer_sync(&skdev->timer);
+}
+
+/*
+ *****************************************************************************
+ * IOCTL
+ *****************************************************************************
+ */
+static int skd_ioctl_sg_io(struct skd_device *skdev,
+ fmode_t mode, void __user *argp);
+static int skd_sg_io_get_and_check_args(struct skd_device *skdev,
+ struct skd_sg_io *sksgio);
+static int skd_sg_io_obtain_skspcl(struct skd_device *skdev,
+ struct skd_sg_io *sksgio);
+static int skd_sg_io_prep_buffering(struct skd_device *skdev,
+ struct skd_sg_io *sksgio);
+static int skd_sg_io_copy_buffer(struct skd_device *skdev,
+ struct skd_sg_io *sksgio, int dxfer_dir);
+static int skd_sg_io_send_fitmsg(struct skd_device *skdev,
+ struct skd_sg_io *sksgio);
+static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio);
+static int skd_sg_io_release_skspcl(struct skd_device *skdev,
+ struct skd_sg_io *sksgio);
+static int skd_sg_io_put_status(struct skd_device *skdev,
+ struct skd_sg_io *sksgio);
+
+static void skd_complete_special(struct skd_device *skdev,
+ volatile struct fit_completion_entry_v1
+ *skcomp,
+ volatile struct fit_comp_error_info *skerr,
+ struct skd_special_context *skspcl);
+
+static int skd_bdev_ioctl(struct block_device *bdev, fmode_t mode,
+ uint cmd_in, ulong arg)
+{
+ int rc = 0;
+ struct gendisk *disk = bdev->bd_disk;
+ struct skd_device *skdev = disk->private_data;
+ void __user *p = (void *)arg;
+
+ pr_debug("%s:%s:%d %s: CMD[%s] ioctl mode 0x%x, cmd 0x%x arg %0lx\n",
+ skdev->name, __func__, __LINE__,
+ disk->disk_name, current->comm, mode, cmd_in, arg);
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd_in) {
+ case SG_SET_TIMEOUT:
+ case SG_GET_TIMEOUT:
+ case SG_GET_VERSION_NUM:
+ rc = scsi_cmd_ioctl(disk->queue, disk, mode, cmd_in, p);
+ break;
+ case SG_IO:
+ rc = skd_ioctl_sg_io(skdev, mode, p);
+ break;
+
+ default:
+ rc = -ENOTTY;
+ break;
+ }
+
+ pr_debug("%s:%s:%d %s: completion rc %d\n",
+ skdev->name, __func__, __LINE__, disk->disk_name, rc);
+ return rc;
+}
+
+static int skd_ioctl_sg_io(struct skd_device *skdev, fmode_t mode,
+ void __user *argp)
+{
+ int rc;
+ struct skd_sg_io sksgio;
+
+ memset(&sksgio, 0, sizeof(sksgio));
+ sksgio.mode = mode;
+ sksgio.argp = argp;
+ sksgio.iov = &sksgio.no_iov_iov;
+
+ switch (skdev->state) {
+ case SKD_DRVR_STATE_ONLINE:
+ case SKD_DRVR_STATE_BUSY_IMMINENT:
+ break;
+
+ default:
+ pr_debug("%s:%s:%d drive not online\n",
+ skdev->name, __func__, __LINE__);
+ rc = -ENXIO;
+ goto out;
+ }
+
+ rc = skd_sg_io_get_and_check_args(skdev, &sksgio);
+ if (rc)
+ goto out;
+
+ rc = skd_sg_io_obtain_skspcl(skdev, &sksgio);
+ if (rc)
+ goto out;
+
+ rc = skd_sg_io_prep_buffering(skdev, &sksgio);
+ if (rc)
+ goto out;
+
+ rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_TO_DEV);
+ if (rc)
+ goto out;
+
+ rc = skd_sg_io_send_fitmsg(skdev, &sksgio);
+ if (rc)
+ goto out;
+
+ rc = skd_sg_io_await(skdev, &sksgio);
+ if (rc)
+ goto out;
+
+ rc = skd_sg_io_copy_buffer(skdev, &sksgio, SG_DXFER_FROM_DEV);
+ if (rc)
+ goto out;
+
+ rc = skd_sg_io_put_status(skdev, &sksgio);
+ if (rc)
+ goto out;
+
+ rc = 0;
+
+out:
+ skd_sg_io_release_skspcl(skdev, &sksgio);
+
+ if (sksgio.iov != NULL && sksgio.iov != &sksgio.no_iov_iov)
+ kfree(sksgio.iov);
+ return rc;
+}
+
+static int skd_sg_io_get_and_check_args(struct skd_device *skdev,
+ struct skd_sg_io *sksgio)
+{
+ struct sg_io_hdr *sgp = &sksgio->sg;
+ int i, acc;
+
+ if (!access_ok(VERIFY_WRITE, sksgio->argp, sizeof(sg_io_hdr_t))) {
+ pr_debug("%s:%s:%d access sg failed %p\n",
+ skdev->name, __func__, __LINE__, sksgio->argp);
+ return -EFAULT;
+ }
+
+ if (__copy_from_user(sgp, sksgio->argp, sizeof(sg_io_hdr_t))) {
+ pr_debug("%s:%s:%d copy_from_user sg failed %p\n",
+ skdev->name, __func__, __LINE__, sksgio->argp);
+ return -EFAULT;
+ }
+
+ if (sgp->interface_id != SG_INTERFACE_ID_ORIG) {
+ pr_debug("%s:%s:%d interface_id invalid 0x%x\n",
+ skdev->name, __func__, __LINE__, sgp->interface_id);
+ return -EINVAL;
+ }
+
+ if (sgp->cmd_len > sizeof(sksgio->cdb)) {
+ pr_debug("%s:%s:%d cmd_len invalid %d\n",
+ skdev->name, __func__, __LINE__, sgp->cmd_len);
+ return -EINVAL;
+ }
+
+ if (sgp->iovec_count > 256) {
+ pr_debug("%s:%s:%d iovec_count invalid %d\n",
+ skdev->name, __func__, __LINE__, sgp->iovec_count);
+ return -EINVAL;
+ }
+
+ if (sgp->dxfer_len > (PAGE_SIZE * SKD_N_SG_PER_SPECIAL)) {
+ pr_debug("%s:%s:%d dxfer_len invalid %d\n",
+ skdev->name, __func__, __LINE__, sgp->dxfer_len);
+ return -EINVAL;
+ }
+
+ switch (sgp->dxfer_direction) {
+ case SG_DXFER_NONE:
+ acc = -1;
+ break;
+
+ case SG_DXFER_TO_DEV:
+ acc = VERIFY_READ;
+ break;
+
+ case SG_DXFER_FROM_DEV:
+ case SG_DXFER_TO_FROM_DEV:
+ acc = VERIFY_WRITE;
+ break;
+
+ default:
+ pr_debug("%s:%s:%d dxfer_dir invalid %d\n",
+ skdev->name, __func__, __LINE__, sgp->dxfer_direction);
+ return -EINVAL;
+ }
+
+ if (copy_from_user(sksgio->cdb, sgp->cmdp, sgp->cmd_len)) {
+ pr_debug("%s:%s:%d copy_from_user cmdp failed %p\n",
+ skdev->name, __func__, __LINE__, sgp->cmdp);
+ return -EFAULT;
+ }
+
+ if (sgp->mx_sb_len != 0) {
+ if (!access_ok(VERIFY_WRITE, sgp->sbp, sgp->mx_sb_len)) {
+ pr_debug("%s:%s:%d access sbp failed %p\n",
+ skdev->name, __func__, __LINE__, sgp->sbp);
+ return -EFAULT;
+ }
+ }
+
+ if (sgp->iovec_count == 0) {
+ sksgio->iov[0].iov_base = sgp->dxferp;
+ sksgio->iov[0].iov_len = sgp->dxfer_len;
+ sksgio->iovcnt = 1;
+ sksgio->dxfer_len = sgp->dxfer_len;
+ } else {
+ struct sg_iovec *iov;
+ uint nbytes = sizeof(*iov) * sgp->iovec_count;
+ size_t iov_data_len;
+
+ iov = kmalloc(nbytes, GFP_KERNEL);
+ if (iov == NULL) {
+ pr_debug("%s:%s:%d alloc iovec failed %d\n",
+ skdev->name, __func__, __LINE__,
+ sgp->iovec_count);
+ return -ENOMEM;
+ }
+ sksgio->iov = iov;
+ sksgio->iovcnt = sgp->iovec_count;
+
+ if (copy_from_user(iov, sgp->dxferp, nbytes)) {
+ pr_debug("%s:%s:%d copy_from_user iovec failed %p\n",
+ skdev->name, __func__, __LINE__, sgp->dxferp);
+ return -EFAULT;
+ }
+
+ /*
+ * Sum up the vecs, making sure they don't overflow
+ */
+ iov_data_len = 0;
+ for (i = 0; i < sgp->iovec_count; i++) {
+ if (iov_data_len + iov[i].iov_len < iov_data_len)
+ return -EINVAL;
+ iov_data_len += iov[i].iov_len;
+ }
+
+ /* SG_IO howto says that the shorter of the two wins */
+ if (sgp->dxfer_len < iov_data_len) {
+ sksgio->iovcnt = iov_shorten((struct iovec *)iov,
+ sgp->iovec_count,
+ sgp->dxfer_len);
+ sksgio->dxfer_len = sgp->dxfer_len;
+ } else
+ sksgio->dxfer_len = iov_data_len;
+ }
+
+ if (sgp->dxfer_direction != SG_DXFER_NONE) {
+ struct sg_iovec *iov = sksgio->iov;
+ for (i = 0; i < sksgio->iovcnt; i++, iov++) {
+ if (!access_ok(acc, iov->iov_base, iov->iov_len)) {
+ pr_debug("%s:%s:%d access data failed %p/%d\n",
+ skdev->name, __func__, __LINE__,
+ iov->iov_base, (int)iov->iov_len);
+ return -EFAULT;
+ }
+ }
+ }
+
+ return 0;
+}
+
+static int skd_sg_io_obtain_skspcl(struct skd_device *skdev,
+ struct skd_sg_io *sksgio)
+{
+ struct skd_special_context *skspcl = NULL;
+ int rc;
+
+ for (;;) {
+ ulong flags;
+
+ spin_lock_irqsave(&skdev->lock, flags);
+ skspcl = skdev->skspcl_free_list;
+ if (skspcl != NULL) {
+ skdev->skspcl_free_list =
+ (struct skd_special_context *)skspcl->req.next;
+ skspcl->req.id += SKD_ID_INCR;
+ skspcl->req.state = SKD_REQ_STATE_SETUP;
+ skspcl->orphaned = 0;
+ skspcl->req.n_sg = 0;
+ }
+ spin_unlock_irqrestore(&skdev->lock, flags);
+
+ if (skspcl != NULL) {
+ rc = 0;
+ break;
+ }
+
+ pr_debug("%s:%s:%d blocking\n",
+ skdev->name, __func__, __LINE__);
+
+ rc = wait_event_interruptible_timeout(
+ skdev->waitq,
+ (skdev->skspcl_free_list != NULL),
+ msecs_to_jiffies(sksgio->sg.timeout));
+
+ pr_debug("%s:%s:%d unblocking, rc=%d\n",
+ skdev->name, __func__, __LINE__, rc);
+
+ if (rc <= 0) {
+ if (rc == 0)
+ rc = -ETIMEDOUT;
+ else
+ rc = -EINTR;
+ break;
+ }
+ /*
+ * If we get here rc > 0 meaning the timeout to
+ * wait_event_interruptible_timeout() had time left, hence the
+ * sought event -- non-empty free list -- happened.
+ * Retry the allocation.
+ */
+ }
+ sksgio->skspcl = skspcl;
+
+ return rc;
+}
+
+static int skd_skreq_prep_buffering(struct skd_device *skdev,
+ struct skd_request_context *skreq,
+ u32 dxfer_len)
+{
+ u32 resid = dxfer_len;
+
+ /*
+ * The DMA engine must have aligned addresses and byte counts.
+ */
+ resid += (-resid) & 3;
+ skreq->sg_byte_count = resid;
+
+ skreq->n_sg = 0;
+
+ while (resid > 0) {
+ u32 nbytes = PAGE_SIZE;
+ u32 ix = skreq->n_sg;
+ struct scatterlist *sg = &skreq->sg[ix];
+ struct fit_sg_descriptor *sksg = &skreq->sksg_list[ix];
+ struct page *page;
+
+ if (nbytes > resid)
+ nbytes = resid;
+
+ page = alloc_page(GFP_KERNEL);
+ if (page == NULL)
+ return -ENOMEM;
+
+ sg_set_page(sg, page, nbytes, 0);
+
+ /* TODO: This should be going through a pci_???()
+ * routine to do proper mapping. */
+ sksg->control = FIT_SGD_CONTROL_NOT_LAST;
+ sksg->byte_count = nbytes;
+
+ sksg->host_side_addr = sg_phys(sg);
+
+ sksg->dev_side_addr = 0;
+ sksg->next_desc_ptr = skreq->sksg_dma_address +
+ (ix + 1) * sizeof(*sksg);
+
+ skreq->n_sg++;
+ resid -= nbytes;
+ }
+
+ if (skreq->n_sg > 0) {
+ u32 ix = skreq->n_sg - 1;
+ struct fit_sg_descriptor *sksg = &skreq->sksg_list[ix];
+
+ sksg->control = FIT_SGD_CONTROL_LAST;
+ sksg->next_desc_ptr = 0;
+ }
+
+ if (unlikely(skdev->dbg_level > 1)) {
+ u32 i;
+
+ pr_debug("%s:%s:%d skreq=%x sksg_list=%p sksg_dma=%llx\n",
+ skdev->name, __func__, __LINE__,
+ skreq->id, skreq->sksg_list, skreq->sksg_dma_address);
+ for (i = 0; i < skreq->n_sg; i++) {
+ struct fit_sg_descriptor *sgd = &skreq->sksg_list[i];
+
+ pr_debug("%s:%s:%d sg[%d] count=%u ctrl=0x%x "
+ "addr=0x%llx next=0x%llx\n",
+ skdev->name, __func__, __LINE__,
+ i, sgd->byte_count, sgd->control,
+ sgd->host_side_addr, sgd->next_desc_ptr);
+ }
+ }
+
+ return 0;
+}
+
+static int skd_sg_io_prep_buffering(struct skd_device *skdev,
+ struct skd_sg_io *sksgio)
+{
+ struct skd_special_context *skspcl = sksgio->skspcl;
+ struct skd_request_context *skreq = &skspcl->req;
+ u32 dxfer_len = sksgio->dxfer_len;
+ int rc;
+
+ rc = skd_skreq_prep_buffering(skdev, skreq, dxfer_len);
+ /*
+ * Eventually, errors or not, skd_release_special() is called
+ * to recover allocations including partial allocations.
+ */
+ return rc;
+}
+
+static int skd_sg_io_copy_buffer(struct skd_device *skdev,
+ struct skd_sg_io *sksgio, int dxfer_dir)
+{
+ struct skd_special_context *skspcl = sksgio->skspcl;
+ u32 iov_ix = 0;
+ struct sg_iovec curiov;
+ u32 sksg_ix = 0;
+ u8 *bufp = NULL;
+ u32 buf_len = 0;
+ u32 resid = sksgio->dxfer_len;
+ int rc;
+
+ curiov.iov_len = 0;
+ curiov.iov_base = NULL;
+
+ if (dxfer_dir != sksgio->sg.dxfer_direction) {
+ if (dxfer_dir != SG_DXFER_TO_DEV ||
+ sksgio->sg.dxfer_direction != SG_DXFER_TO_FROM_DEV)
+ return 0;
+ }
+
+ while (resid > 0) {
+ u32 nbytes = PAGE_SIZE;
+
+ if (curiov.iov_len == 0) {
+ curiov = sksgio->iov[iov_ix++];
+ continue;
+ }
+
+ if (buf_len == 0) {
+ struct page *page;
+ page = sg_page(&skspcl->req.sg[sksg_ix++]);
+ bufp = page_address(page);
+ buf_len = PAGE_SIZE;
+ }
+
+ nbytes = min_t(u32, nbytes, resid);
+ nbytes = min_t(u32, nbytes, curiov.iov_len);
+ nbytes = min_t(u32, nbytes, buf_len);
+
+ if (dxfer_dir == SG_DXFER_TO_DEV)
+ rc = __copy_from_user(bufp, curiov.iov_base, nbytes);
+ else
+ rc = __copy_to_user(curiov.iov_base, bufp, nbytes);
+
+ if (rc)
+ return -EFAULT;
+
+ resid -= nbytes;
+ curiov.iov_len -= nbytes;
+ curiov.iov_base += nbytes;
+ buf_len -= nbytes;
+ }
+
+ return 0;
+}
+
+static int skd_sg_io_send_fitmsg(struct skd_device *skdev,
+ struct skd_sg_io *sksgio)
+{
+ struct skd_special_context *skspcl = sksgio->skspcl;
+ struct fit_msg_hdr *fmh = (struct fit_msg_hdr *)skspcl->msg_buf;
+ struct skd_scsi_request *scsi_req = (struct skd_scsi_request *)&fmh[1];
+
+ memset(skspcl->msg_buf, 0, SKD_N_SPECIAL_FITMSG_BYTES);
+
+ /* Initialize the FIT msg header */
+ fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT;
+ fmh->num_protocol_cmds_coalesced = 1;
+
+ /* Initialize the SCSI request */
+ if (sksgio->sg.dxfer_direction != SG_DXFER_NONE)
+ scsi_req->hdr.sg_list_dma_address =
+ cpu_to_be64(skspcl->req.sksg_dma_address);
+ scsi_req->hdr.tag = skspcl->req.id;
+ scsi_req->hdr.sg_list_len_bytes =
+ cpu_to_be32(skspcl->req.sg_byte_count);
+ memcpy(scsi_req->cdb, sksgio->cdb, sizeof(scsi_req->cdb));
+
+ skspcl->req.state = SKD_REQ_STATE_BUSY;
+ skd_send_special_fitmsg(skdev, skspcl);
+
+ return 0;
+}
+
+static int skd_sg_io_await(struct skd_device *skdev, struct skd_sg_io *sksgio)
+{
+ unsigned long flags;
+ int rc;
+
+ rc = wait_event_interruptible_timeout(skdev->waitq,
+ (sksgio->skspcl->req.state !=
+ SKD_REQ_STATE_BUSY),
+ msecs_to_jiffies(sksgio->sg.
+ timeout));
+
+ spin_lock_irqsave(&skdev->lock, flags);
+
+ if (sksgio->skspcl->req.state == SKD_REQ_STATE_ABORTED) {
+ pr_debug("%s:%s:%d skspcl %p aborted\n",
+ skdev->name, __func__, __LINE__, sksgio->skspcl);
+
+ /* Build check cond, sense and let command finish. */
+ /* For a timeout, we must fabricate completion and sense
+ * data to complete the command */
+ sksgio->skspcl->req.completion.status =
+ SAM_STAT_CHECK_CONDITION;
+
+ memset(&sksgio->skspcl->req.err_info, 0,
+ sizeof(sksgio->skspcl->req.err_info));
+ sksgio->skspcl->req.err_info.type = 0x70;
+ sksgio->skspcl->req.err_info.key = ABORTED_COMMAND;
+ sksgio->skspcl->req.err_info.code = 0x44;
+ sksgio->skspcl->req.err_info.qual = 0;
+ rc = 0;
+ } else if (sksgio->skspcl->req.state != SKD_REQ_STATE_BUSY)
+ /* No longer on the adapter. We finish. */
+ rc = 0;
+ else {
+ /* Something's gone wrong. Still busy. Timeout or
+ * user interrupted (control-C). Mark as an orphan
+ * so it will be disposed when completed. */
+ sksgio->skspcl->orphaned = 1;
+ sksgio->skspcl = NULL;
+ if (rc == 0) {
+ pr_debug("%s:%s:%d timed out %p (%u ms)\n",
+ skdev->name, __func__, __LINE__,
+ sksgio, sksgio->sg.timeout);
+ rc = -ETIMEDOUT;
+ } else {
+ pr_debug("%s:%s:%d cntlc %p\n",
+ skdev->name, __func__, __LINE__, sksgio);
+ rc = -EINTR;
+ }
+ }
+
+ spin_unlock_irqrestore(&skdev->lock, flags);
+
+ return rc;
+}
+
+static int skd_sg_io_put_status(struct skd_device *skdev,
+ struct skd_sg_io *sksgio)
+{
+ struct sg_io_hdr *sgp = &sksgio->sg;
+ struct skd_special_context *skspcl = sksgio->skspcl;
+ int resid = 0;
+
+ u32 nb = be32_to_cpu(skspcl->req.completion.num_returned_bytes);
+
+ sgp->status = skspcl->req.completion.status;
+ resid = sksgio->dxfer_len - nb;
+
+ sgp->masked_status = sgp->status & STATUS_MASK;
+ sgp->msg_status = 0;
+ sgp->host_status = 0;
+ sgp->driver_status = 0;
+ sgp->resid = resid;
+ if (sgp->masked_status || sgp->host_status || sgp->driver_status)
+ sgp->info |= SG_INFO_CHECK;
+
+ pr_debug("%s:%s:%d status %x masked %x resid 0x%x\n",
+ skdev->name, __func__, __LINE__,
+ sgp->status, sgp->masked_status, sgp->resid);
+
+ if (sgp->masked_status == SAM_STAT_CHECK_CONDITION) {
+ if (sgp->mx_sb_len > 0) {
+ struct fit_comp_error_info *ei = &skspcl->req.err_info;
+ u32 nbytes = sizeof(*ei);
+
+ nbytes = min_t(u32, nbytes, sgp->mx_sb_len);
+
+ sgp->sb_len_wr = nbytes;
+
+ if (__copy_to_user(sgp->sbp, ei, nbytes)) {
+ pr_debug("%s:%s:%d copy_to_user sense failed %p\n",
+ skdev->name, __func__, __LINE__,
+ sgp->sbp);
+ return -EFAULT;
+ }
+ }
+ }
+
+ if (__copy_to_user(sksgio->argp, sgp, sizeof(sg_io_hdr_t))) {
+ pr_debug("%s:%s:%d copy_to_user sg failed %p\n",
+ skdev->name, __func__, __LINE__, sksgio->argp);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+static int skd_sg_io_release_skspcl(struct skd_device *skdev,
+ struct skd_sg_io *sksgio)
+{
+ struct skd_special_context *skspcl = sksgio->skspcl;
+
+ if (skspcl != NULL) {
+ ulong flags;
+
+ sksgio->skspcl = NULL;
+
+ spin_lock_irqsave(&skdev->lock, flags);
+ skd_release_special(skdev, skspcl);
+ spin_unlock_irqrestore(&skdev->lock, flags);
+ }
+
+ return 0;
+}
+
+/*
+ *****************************************************************************
+ * INTERNAL REQUESTS -- generated by driver itself
+ *****************************************************************************
+ */
+
+static int skd_format_internal_skspcl(struct skd_device *skdev)
+{
+ struct skd_special_context *skspcl = &skdev->internal_skspcl;
+ struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[0];
+ struct fit_msg_hdr *fmh;
+ uint64_t dma_address;
+ struct skd_scsi_request *scsi;
+
+ fmh = (struct fit_msg_hdr *)&skspcl->msg_buf[0];
+ fmh->protocol_id = FIT_PROTOCOL_ID_SOFIT;
+ fmh->num_protocol_cmds_coalesced = 1;
+
+ scsi = (struct skd_scsi_request *)&skspcl->msg_buf[64];
+ memset(scsi, 0, sizeof(*scsi));
+ dma_address = skspcl->req.sksg_dma_address;
+ scsi->hdr.sg_list_dma_address = cpu_to_be64(dma_address);
+ sgd->control = FIT_SGD_CONTROL_LAST;
+ sgd->byte_count = 0;
+ sgd->host_side_addr = skspcl->db_dma_address;
+ sgd->dev_side_addr = 0;
+ sgd->next_desc_ptr = 0LL;
+
+ return 1;
+}
+
+#define WR_BUF_SIZE SKD_N_INTERNAL_BYTES
+
+static void skd_send_internal_skspcl(struct skd_device *skdev,
+ struct skd_special_context *skspcl,
+ u8 opcode)
+{
+ struct fit_sg_descriptor *sgd = &skspcl->req.sksg_list[0];
+ struct skd_scsi_request *scsi;
+ unsigned char *buf = skspcl->data_buf;
+ int i;
+
+ if (skspcl->req.state != SKD_REQ_STATE_IDLE)
+ /*
+ * A refresh is already in progress.
+ * Just wait for it to finish.
+ */
+ return;
+
+ SKD_ASSERT((skspcl->req.id & SKD_ID_INCR) == 0);
+ skspcl->req.state = SKD_REQ_STATE_BUSY;
+ skspcl->req.id += SKD_ID_INCR;
+
+ scsi = (struct skd_scsi_request *)&skspcl->msg_buf[64];
+ scsi->hdr.tag = skspcl->req.id;
+
+ memset(scsi->cdb, 0, sizeof(scsi->cdb));
+
+ switch (opcode) {
+ case TEST_UNIT_READY:
+ scsi->cdb[0] = TEST_UNIT_READY;
+ sgd->byte_count = 0;
+ scsi->hdr.sg_list_len_bytes = 0;
+ break;
+
+ case READ_CAPACITY:
+ scsi->cdb[0] = READ_CAPACITY;
+ sgd->byte_count = SKD_N_READ_CAP_BYTES;
+ scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
+ break;
+
+ case INQUIRY:
+ scsi->cdb[0] = INQUIRY;
+ scsi->cdb[1] = 0x01; /* evpd */
+ scsi->cdb[2] = 0x80; /* serial number page */
+ scsi->cdb[4] = 0x10;
+ sgd->byte_count = 16;
+ scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
+ break;
+
+ case SYNCHRONIZE_CACHE:
+ scsi->cdb[0] = SYNCHRONIZE_CACHE;
+ sgd->byte_count = 0;
+ scsi->hdr.sg_list_len_bytes = 0;
+ break;
+
+ case WRITE_BUFFER:
+ scsi->cdb[0] = WRITE_BUFFER;
+ scsi->cdb[1] = 0x02;
+ scsi->cdb[7] = (WR_BUF_SIZE & 0xFF00) >> 8;
+ scsi->cdb[8] = WR_BUF_SIZE & 0xFF;
+ sgd->byte_count = WR_BUF_SIZE;
+ scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
+ /* fill incrementing byte pattern */
+ for (i = 0; i < sgd->byte_count; i++)
+ buf[i] = i & 0xFF;
+ break;
+
+ case READ_BUFFER:
+ scsi->cdb[0] = READ_BUFFER;
+ scsi->cdb[1] = 0x02;
+ scsi->cdb[7] = (WR_BUF_SIZE & 0xFF00) >> 8;
+ scsi->cdb[8] = WR_BUF_SIZE & 0xFF;
+ sgd->byte_count = WR_BUF_SIZE;
+ scsi->hdr.sg_list_len_bytes = cpu_to_be32(sgd->byte_count);
+ memset(skspcl->data_buf, 0, sgd->byte_count);
+ break;
+
+ default:
+ SKD_ASSERT("Don't know what to send");
+ return;
+
+ }
+ skd_send_special_fitmsg(skdev, skspcl);
+}
+
+static void skd_refresh_device_data(struct skd_device *skdev)
+{
+ struct skd_special_context *skspcl = &skdev->internal_skspcl;
+
+ skd_send_internal_skspcl(skdev, skspcl, TEST_UNIT_READY);
+}
+
+static int skd_chk_read_buf(struct skd_device *skdev,
+ struct skd_special_context *skspcl)
+{
+ unsigned char *buf = skspcl->data_buf;
+ int i;
+
+ /* check for incrementing byte pattern */
+ for (i = 0; i < WR_BUF_SIZE; i++)
+ if (buf[i] != (i & 0xFF))
+ return 1;
+
+ return 0;
+}
+
+static void skd_log_check_status(struct skd_device *skdev, u8 status, u8 key,
+ u8 code, u8 qual, u8 fruc)
+{
+ /* If the check condition is of special interest, log a message */
+ if ((status == SAM_STAT_CHECK_CONDITION) && (key == 0x02)
+ && (code == 0x04) && (qual == 0x06)) {
+ pr_err("(%s): *** LOST_WRITE_DATA ERROR *** key/asc/"
+ "ascq/fruc %02x/%02x/%02x/%02x\n",
+ skd_name(skdev), key, code, qual, fruc);
+ }
+}
+
+static void skd_complete_internal(struct skd_device *skdev,
+ volatile struct fit_completion_entry_v1
+ *skcomp,
+ volatile struct fit_comp_error_info *skerr,
+ struct skd_special_context *skspcl)
+{
+ u8 *buf = skspcl->data_buf;
+ u8 status;
+ int i;
+ struct skd_scsi_request *scsi =
+ (struct skd_scsi_request *)&skspcl->msg_buf[64];
+
+ SKD_ASSERT(skspcl == &skdev->internal_skspcl);
+
+ pr_debug("%s:%s:%d complete internal %x\n",
+ skdev->name, __func__, __LINE__, scsi->cdb[0]);
+
+ skspcl->req.completion = *skcomp;
+ skspcl->req.state = SKD_REQ_STATE_IDLE;
+ skspcl->req.id += SKD_ID_INCR;
+
+ status = skspcl->req.completion.status;
+
+ skd_log_check_status(skdev, status, skerr->key, skerr->code,
+ skerr->qual, skerr->fruc);
+
+ switch (scsi->cdb[0]) {
+ case TEST_UNIT_READY:
+ if (status == SAM_STAT_GOOD)
+ skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER);
+ else if ((status == SAM_STAT_CHECK_CONDITION) &&
+ (skerr->key == MEDIUM_ERROR))
+ skd_send_internal_skspcl(skdev, skspcl, WRITE_BUFFER);
+ else {
+ if (skdev->state == SKD_DRVR_STATE_STOPPING) {
+ pr_debug("%s:%s:%d TUR failed, don't send anymore state 0x%x\n",
+ skdev->name, __func__, __LINE__,
+ skdev->state);
+ return;
+ }
+ pr_debug("%s:%s:%d **** TUR failed, retry skerr\n",
+ skdev->name, __func__, __LINE__);
+ skd_send_internal_skspcl(skdev, skspcl, 0x00);
+ }
+ break;
+
+ case WRITE_BUFFER:
+ if (status == SAM_STAT_GOOD)
+ skd_send_internal_skspcl(skdev, skspcl, READ_BUFFER);
+ else {
+ if (skdev->state == SKD_DRVR_STATE_STOPPING) {
+ pr_debug("%s:%s:%d write buffer failed, don't send anymore state 0x%x\n",
+ skdev->name, __func__, __LINE__,
+ skdev->state);
+ return;
+ }
+ pr_debug("%s:%s:%d **** write buffer failed, retry skerr\n",
+ skdev->name, __func__, __LINE__);
+ skd_send_internal_skspcl(skdev, skspcl, 0x00);
+ }
+ break;
+
+ case READ_BUFFER:
+ if (status == SAM_STAT_GOOD) {
+ if (skd_chk_read_buf(skdev, skspcl) == 0)
+ skd_send_internal_skspcl(skdev, skspcl,
+ READ_CAPACITY);
+ else {
+ pr_err(
+ "(%s):*** W/R Buffer mismatch %d ***\n",
+ skd_name(skdev), skdev->connect_retries);
+ if (skdev->connect_retries <
+ SKD_MAX_CONNECT_RETRIES) {
+ skdev->connect_retries++;
+ skd_soft_reset(skdev);
+ } else {
+ pr_err(
+ "(%s): W/R Buffer Connect Error\n",
+ skd_name(skdev));
+ return;
+ }
+ }
+
+ } else {
+ if (skdev->state == SKD_DRVR_STATE_STOPPING) {
+ pr_debug("%s:%s:%d "
+ "read buffer failed, don't send anymore state 0x%x\n",
+ skdev->name, __func__, __LINE__,
+ skdev->state);
+ return;
+ }
+ pr_debug("%s:%s:%d "
+ "**** read buffer failed, retry skerr\n",
+ skdev->name, __func__, __LINE__);
+ skd_send_internal_skspcl(skdev, skspcl, 0x00);
+ }
+ break;
+
+ case READ_CAPACITY:
+ skdev->read_cap_is_valid = 0;
+ if (status == SAM_STAT_GOOD) {
+ skdev->read_cap_last_lba =
+ (buf[0] << 24) | (buf[1] << 16) |
+ (buf[2] << 8) | buf[3];
+ skdev->read_cap_blocksize =
+ (buf[4] << 24) | (buf[5] << 16) |
+ (buf[6] << 8) | buf[7];
+
+ pr_debug("%s:%s:%d last lba %d, bs %d\n",
+ skdev->name, __func__, __LINE__,
+ skdev->read_cap_last_lba,
+ skdev->read_cap_blocksize);
+
+ set_capacity(skdev->disk, skdev->read_cap_last_lba + 1);
+
+ skdev->read_cap_is_valid = 1;
+
+ skd_send_internal_skspcl(skdev, skspcl, INQUIRY);
+ } else if ((status == SAM_STAT_CHECK_CONDITION) &&
+ (skerr->key == MEDIUM_ERROR)) {
+ skdev->read_cap_last_lba = ~0;
+ set_capacity(skdev->disk, skdev->read_cap_last_lba + 1);
+ pr_debug("%s:%s:%d "
+ "**** MEDIUM ERROR caused READCAP to fail, ignore failure and continue to inquiry\n",
+ skdev->name, __func__, __LINE__);
+ skd_send_internal_skspcl(skdev, skspcl, INQUIRY);
+ } else {
+ pr_debug("%s:%s:%d **** READCAP failed, retry TUR\n",
+ skdev->name, __func__, __LINE__);
+ skd_send_internal_skspcl(skdev, skspcl,
+ TEST_UNIT_READY);
+ }
+ break;
+
+ case INQUIRY:
+ skdev->inquiry_is_valid = 0;
+ if (status == SAM_STAT_GOOD) {
+ skdev->inquiry_is_valid = 1;
+
+ for (i = 0; i < 12; i++)
+ skdev->inq_serial_num[i] = buf[i + 4];
+ skdev->inq_serial_num[12] = 0;
+ }
+
+ if (skd_unquiesce_dev(skdev) < 0)
+ pr_debug("%s:%s:%d **** failed, to ONLINE device\n",
+ skdev->name, __func__, __LINE__);
+ /* connection is complete */
+ skdev->connect_retries = 0;
+ break;
+
+ case SYNCHRONIZE_CACHE:
+ if (status == SAM_STAT_GOOD)
+ skdev->sync_done = 1;
+ else
+ skdev->sync_done = -1;
+ wake_up_interruptible(&skdev->waitq);
+ break;
+
+ default:
+ SKD_ASSERT("we didn't send this");
+ }
+}
+
+/*
+ *****************************************************************************
+ * FIT MESSAGES
+ *****************************************************************************
+ */
+
+static void skd_send_fitmsg(struct skd_device *skdev,
+ struct skd_fitmsg_context *skmsg)
+{
+ u64 qcmd;
+ struct fit_msg_hdr *fmh;
+
+ pr_debug("%s:%s:%d dma address 0x%llx, busy=%d\n",
+ skdev->name, __func__, __LINE__,
+ skmsg->mb_dma_address, skdev->in_flight);
+ pr_debug("%s:%s:%d msg_buf 0x%p, offset %x\n",
+ skdev->name, __func__, __LINE__,
+ skmsg->msg_buf, skmsg->offset);
+
+ qcmd = skmsg->mb_dma_address;
+ qcmd |= FIT_QCMD_QID_NORMAL;
+
+ fmh = (struct fit_msg_hdr *)skmsg->msg_buf;
+ skmsg->outstanding = fmh->num_protocol_cmds_coalesced;
+
+ if (unlikely(skdev->dbg_level > 1)) {
+ u8 *bp = (u8 *)skmsg->msg_buf;
+ int i;
+ for (i = 0; i < skmsg->length; i += 8) {
+ pr_debug("%s:%s:%d msg[%2d] %02x %02x %02x %02x "
+ "%02x %02x %02x %02x\n",
+ skdev->name, __func__, __LINE__,
+ i, bp[i + 0], bp[i + 1], bp[i + 2],
+ bp[i + 3], bp[i + 4], bp[i + 5],
+ bp[i + 6], bp[i + 7]);
+ if (i == 0)
+ i = 64 - 8;
+ }
+ }
+
+ if (skmsg->length > 256)
+ qcmd |= FIT_QCMD_MSGSIZE_512;
+ else if (skmsg->length > 128)
+ qcmd |= FIT_QCMD_MSGSIZE_256;
+ else if (skmsg->length > 64)
+ qcmd |= FIT_QCMD_MSGSIZE_128;
+ else
+ /*
+ * This makes no sense because the FIT msg header is
+ * 64 bytes. If the msg is only 64 bytes long it has
+ * no payload.
+ */
+ qcmd |= FIT_QCMD_MSGSIZE_64;
+
+ SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND);
+
+}
+
+static void skd_send_special_fitmsg(struct skd_device *skdev,
+ struct skd_special_context *skspcl)
+{
+ u64 qcmd;
+
+ if (unlikely(skdev->dbg_level > 1)) {
+ u8 *bp = (u8 *)skspcl->msg_buf;
+ int i;
+
+ for (i = 0; i < SKD_N_SPECIAL_FITMSG_BYTES; i += 8) {
+ pr_debug("%s:%s:%d spcl[%2d] %02x %02x %02x %02x "
+ "%02x %02x %02x %02x\n",
+ skdev->name, __func__, __LINE__, i,
+ bp[i + 0], bp[i + 1], bp[i + 2], bp[i + 3],
+ bp[i + 4], bp[i + 5], bp[i + 6], bp[i + 7]);
+ if (i == 0)
+ i = 64 - 8;
+ }
+
+ pr_debug("%s:%s:%d skspcl=%p id=%04x sksg_list=%p sksg_dma=%llx\n",
+ skdev->name, __func__, __LINE__,
+ skspcl, skspcl->req.id, skspcl->req.sksg_list,
+ skspcl->req.sksg_dma_address);
+ for (i = 0; i < skspcl->req.n_sg; i++) {
+ struct fit_sg_descriptor *sgd =
+ &skspcl->req.sksg_list[i];
+
+ pr_debug("%s:%s:%d sg[%d] count=%u ctrl=0x%x "
+ "addr=0x%llx next=0x%llx\n",
+ skdev->name, __func__, __LINE__,
+ i, sgd->byte_count, sgd->control,
+ sgd->host_side_addr, sgd->next_desc_ptr);
+ }
+ }
+
+ /*
+ * Special FIT msgs are always 128 bytes: a 64-byte FIT hdr
+ * and one 64-byte SSDI command.
+ */
+ qcmd = skspcl->mb_dma_address;
+ qcmd |= FIT_QCMD_QID_NORMAL + FIT_QCMD_MSGSIZE_128;
+
+ SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND);
+}
+
+/*
+ *****************************************************************************
+ * COMPLETION QUEUE
+ *****************************************************************************
+ */
+
+static void skd_complete_other(struct skd_device *skdev,
+ volatile struct fit_completion_entry_v1 *skcomp,
+ volatile struct fit_comp_error_info *skerr);
+
+struct sns_info {
+ u8 type;
+ u8 stat;
+ u8 key;
+ u8 asc;
+ u8 ascq;
+ u8 mask;
+ enum skd_check_status_action action;
+};
+
+static struct sns_info skd_chkstat_table[] = {
+ /* Good */
+ { 0x70, 0x02, RECOVERED_ERROR, 0, 0, 0x1c,
+ SKD_CHECK_STATUS_REPORT_GOOD },
+
+ /* Smart alerts */
+ { 0x70, 0x02, NO_SENSE, 0x0B, 0x00, 0x1E, /* warnings */
+ SKD_CHECK_STATUS_REPORT_SMART_ALERT },
+ { 0x70, 0x02, NO_SENSE, 0x5D, 0x00, 0x1E, /* thresholds */
+ SKD_CHECK_STATUS_REPORT_SMART_ALERT },
+ { 0x70, 0x02, RECOVERED_ERROR, 0x0B, 0x01, 0x1F, /* temperature over trigger */
+ SKD_CHECK_STATUS_REPORT_SMART_ALERT },
+
+ /* Retry (with limits) */
+ { 0x70, 0x02, 0x0B, 0, 0, 0x1C, /* This one is for DMA ERROR */
+ SKD_CHECK_STATUS_REQUEUE_REQUEST },
+ { 0x70, 0x02, 0x06, 0x0B, 0x00, 0x1E, /* warnings */
+ SKD_CHECK_STATUS_REQUEUE_REQUEST },
+ { 0x70, 0x02, 0x06, 0x5D, 0x00, 0x1E, /* thresholds */
+ SKD_CHECK_STATUS_REQUEUE_REQUEST },
+ { 0x70, 0x02, 0x06, 0x80, 0x30, 0x1F, /* backup power */
+ SKD_CHECK_STATUS_REQUEUE_REQUEST },
+
+ /* Busy (or about to be) */
+ { 0x70, 0x02, 0x06, 0x3f, 0x01, 0x1F, /* fw changed */
+ SKD_CHECK_STATUS_BUSY_IMMINENT },
+};
+
+/*
+ * Look up status and sense data to decide how to handle the error
+ * from the device.
+ * mask says which fields must match e.g., mask=0x18 means check
+ * type and stat, ignore key, asc, ascq.
+ */
+
+static enum skd_check_status_action
+skd_check_status(struct skd_device *skdev,
+ u8 cmp_status, volatile struct fit_comp_error_info *skerr)
+{
+ int i, n;
+
+ pr_err("(%s): key/asc/ascq/fruc %02x/%02x/%02x/%02x\n",
+ skd_name(skdev), skerr->key, skerr->code, skerr->qual,
+ skerr->fruc);
+
+ pr_debug("%s:%s:%d stat: t=%02x stat=%02x k=%02x c=%02x q=%02x fruc=%02x\n",
+ skdev->name, __func__, __LINE__, skerr->type, cmp_status,
+ skerr->key, skerr->code, skerr->qual, skerr->fruc);
+
+ /* Does the info match an entry in the good category? */
+ n = sizeof(skd_chkstat_table) / sizeof(skd_chkstat_table[0]);
+ for (i = 0; i < n; i++) {
+ struct sns_info *sns = &skd_chkstat_table[i];
+
+ if (sns->mask & 0x10)
+ if (skerr->type != sns->type)
+ continue;
+
+ if (sns->mask & 0x08)
+ if (cmp_status != sns->stat)
+ continue;
+
+ if (sns->mask & 0x04)
+ if (skerr->key != sns->key)
+ continue;
+
+ if (sns->mask & 0x02)
+ if (skerr->code != sns->asc)
+ continue;
+
+ if (sns->mask & 0x01)
+ if (skerr->qual != sns->ascq)
+ continue;
+
+ if (sns->action == SKD_CHECK_STATUS_REPORT_SMART_ALERT) {
+ pr_err("(%s): SMART Alert: sense key/asc/ascq "
+ "%02x/%02x/%02x\n",
+ skd_name(skdev), skerr->key,
+ skerr->code, skerr->qual);
+ }
+ return sns->action;
+ }
+
+ /* No other match, so nonzero status means error,
+ * zero status means good
+ */
+ if (cmp_status) {
+ pr_debug("%s:%s:%d status check: error\n",
+ skdev->name, __func__, __LINE__);
+ return SKD_CHECK_STATUS_REPORT_ERROR;
+ }
+
+ pr_debug("%s:%s:%d status check good default\n",
+ skdev->name, __func__, __LINE__);
+ return SKD_CHECK_STATUS_REPORT_GOOD;
+}
+
+static void skd_resolve_req_exception(struct skd_device *skdev,
+ struct skd_request_context *skreq)
+{
+ u8 cmp_status = skreq->completion.status;
+
+ switch (skd_check_status(skdev, cmp_status, &skreq->err_info)) {
+ case SKD_CHECK_STATUS_REPORT_GOOD:
+ case SKD_CHECK_STATUS_REPORT_SMART_ALERT:
+ skd_end_request(skdev, skreq, 0);
+ break;
+
+ case SKD_CHECK_STATUS_BUSY_IMMINENT:
+ skd_log_skreq(skdev, skreq, "retry(busy)");
+ blk_requeue_request(skdev->queue, skreq->req);
+ pr_info("(%s) drive BUSY imminent\n", skd_name(skdev));
+ skdev->state = SKD_DRVR_STATE_BUSY_IMMINENT;
+ skdev->timer_countdown = SKD_TIMER_MINUTES(20);
+ skd_quiesce_dev(skdev);
+ break;
+
+ case SKD_CHECK_STATUS_REQUEUE_REQUEST:
+ if ((unsigned long) ++skreq->req->special < SKD_MAX_RETRIES) {
+ skd_log_skreq(skdev, skreq, "retry");
+ blk_requeue_request(skdev->queue, skreq->req);
+ break;
+ }
+ /* fall through to report error */
+
+ case SKD_CHECK_STATUS_REPORT_ERROR:
+ default:
+ skd_end_request(skdev, skreq, -EIO);
+ break;
+ }
+}
+
+/* assume spinlock is already held */
+static void skd_release_skreq(struct skd_device *skdev,
+ struct skd_request_context *skreq)
+{
+ u32 msg_slot;
+ struct skd_fitmsg_context *skmsg;
+
+ u32 timo_slot;
+
+ /*
+ * Reclaim the FIT msg buffer if this is
+ * the first of the requests it carried to
+ * be completed. The FIT msg buffer used to
+ * send this request cannot be reused until
+ * we are sure the s1120 card has copied
+ * it to its memory. The FIT msg might have
+ * contained several requests. As soon as
+ * any of them are completed we know that
+ * the entire FIT msg was transferred.
+ * Only the first completed request will
+ * match the FIT msg buffer id. The FIT
+ * msg buffer id is immediately updated.
+ * When subsequent requests complete the FIT
+ * msg buffer id won't match, so we know
+ * quite cheaply that it is already done.
+ */
+ msg_slot = skreq->fitmsg_id & SKD_ID_SLOT_MASK;
+ SKD_ASSERT(msg_slot < skdev->num_fitmsg_context);
+
+ skmsg = &skdev->skmsg_table[msg_slot];
+ if (skmsg->id == skreq->fitmsg_id) {
+ SKD_ASSERT(skmsg->state == SKD_MSG_STATE_BUSY);
+ SKD_ASSERT(skmsg->outstanding > 0);
+ skmsg->outstanding--;
+ if (skmsg->outstanding == 0) {
+ skmsg->state = SKD_MSG_STATE_IDLE;
+ skmsg->id += SKD_ID_INCR;
+ skmsg->next = skdev->skmsg_free_list;
+ skdev->skmsg_free_list = skmsg;
+ }
+ }
+
+ /*
+ * Decrease the number of active requests.
+ * Also decrements the count in the timeout slot.
+ */
+ SKD_ASSERT(skdev->in_flight > 0);
+ skdev->in_flight -= 1;
+
+ timo_slot = skreq->timeout_stamp & SKD_TIMEOUT_SLOT_MASK;
+ SKD_ASSERT(skdev->timeout_slot[timo_slot] > 0);
+ skdev->timeout_slot[timo_slot] -= 1;
+
+ /*
+ * Reset backpointer
+ */
+ skreq->req = NULL;
+
+ /*
+ * Reclaim the skd_request_context
+ */
+ skreq->state = SKD_REQ_STATE_IDLE;
+ skreq->id += SKD_ID_INCR;
+ skreq->next = skdev->skreq_free_list;
+ skdev->skreq_free_list = skreq;
+}
+
+#define DRIVER_INQ_EVPD_PAGE_CODE 0xDA
+
+static void skd_do_inq_page_00(struct skd_device *skdev,
+ volatile struct fit_completion_entry_v1 *skcomp,
+ volatile struct fit_comp_error_info *skerr,
+ uint8_t *cdb, uint8_t *buf)
+{
+ uint16_t insert_pt, max_bytes, drive_pages, drive_bytes, new_size;
+
+ /* Caller requested "supported pages". The driver needs to insert
+ * its page.
+ */
+ pr_debug("%s:%s:%d skd_do_driver_inquiry: modify supported pages.\n",
+ skdev->name, __func__, __LINE__);
+
+ /* If the device rejected the request because the CDB was
+ * improperly formed, then just leave.
+ */
+ if (skcomp->status == SAM_STAT_CHECK_CONDITION &&
+ skerr->key == ILLEGAL_REQUEST && skerr->code == 0x24)
+ return;
+
+ /* Get the amount of space the caller allocated */
+ max_bytes = (cdb[3] << 8) | cdb[4];
+
+ /* Get the number of pages actually returned by the device */
+ drive_pages = (buf[2] << 8) | buf[3];
+ drive_bytes = drive_pages + 4;
+ new_size = drive_pages + 1;
+
+ /* Supported pages must be in numerical order, so find where
+ * the driver page needs to be inserted into the list of
+ * pages returned by the device.
+ */
+ for (insert_pt = 4; insert_pt < drive_bytes; insert_pt++) {
+ if (buf[insert_pt] == DRIVER_INQ_EVPD_PAGE_CODE)
+ return; /* Device using this page code. abort */
+ else if (buf[insert_pt] > DRIVER_INQ_EVPD_PAGE_CODE)
+ break;
+ }
+
+ if (insert_pt < max_bytes) {
+ uint16_t u;
+
+ /* Shift everything up one byte to make room. */
+ for (u = new_size + 3; u > insert_pt; u--)
+ buf[u] = buf[u - 1];
+ buf[insert_pt] = DRIVER_INQ_EVPD_PAGE_CODE;
+
+ /* SCSI byte order increment of num_returned_bytes by 1 */
+ skcomp->num_returned_bytes =
+ be32_to_cpu(skcomp->num_returned_bytes) + 1;
+ skcomp->num_returned_bytes =
+ be32_to_cpu(skcomp->num_returned_bytes);
+ }
+
+ /* update page length field to reflect the driver's page too */
+ buf[2] = (uint8_t)((new_size >> 8) & 0xFF);
+ buf[3] = (uint8_t)((new_size >> 0) & 0xFF);
+}
+
+static void skd_get_link_info(struct pci_dev *pdev, u8 *speed, u8 *width)
+{
+ int pcie_reg;
+ u16 pci_bus_speed;
+ u8 pci_lanes;
+
+ pcie_reg = pci_find_capability(pdev, PCI_CAP_ID_EXP);
+ if (pcie_reg) {
+ u16 linksta;
+ pci_read_config_word(pdev, pcie_reg + PCI_EXP_LNKSTA, &linksta);
+
+ pci_bus_speed = linksta & 0xF;
+ pci_lanes = (linksta & 0x3F0) >> 4;
+ } else {
+ *speed = STEC_LINK_UNKNOWN;
+ *width = 0xFF;
+ return;
+ }
+
+ switch (pci_bus_speed) {
+ case 1:
+ *speed = STEC_LINK_2_5GTS;
+ break;
+ case 2:
+ *speed = STEC_LINK_5GTS;
+ break;
+ case 3:
+ *speed = STEC_LINK_8GTS;
+ break;
+ default:
+ *speed = STEC_LINK_UNKNOWN;
+ break;
+ }
+
+ if (pci_lanes <= 0x20)
+ *width = pci_lanes;
+ else
+ *width = 0xFF;
+}
+
+static void skd_do_inq_page_da(struct skd_device *skdev,
+ volatile struct fit_completion_entry_v1 *skcomp,
+ volatile struct fit_comp_error_info *skerr,
+ uint8_t *cdb, uint8_t *buf)
+{
+ struct pci_dev *pdev = skdev->pdev;
+ unsigned max_bytes;
+ struct driver_inquiry_data inq;
+ u16 val;
+
+ pr_debug("%s:%s:%d skd_do_driver_inquiry: return driver page\n",
+ skdev->name, __func__, __LINE__);
+
+ memset(&inq, 0, sizeof(inq));
+
+ inq.page_code = DRIVER_INQ_EVPD_PAGE_CODE;
+
+ skd_get_link_info(pdev, &inq.pcie_link_speed, &inq.pcie_link_lanes);
+ inq.pcie_bus_number = cpu_to_be16(pdev->bus->number);
+ inq.pcie_device_number = PCI_SLOT(pdev->devfn);
+ inq.pcie_function_number = PCI_FUNC(pdev->devfn);
+
+ pci_read_config_word(pdev, PCI_VENDOR_ID, &val);
+ inq.pcie_vendor_id = cpu_to_be16(val);
+
+ pci_read_config_word(pdev, PCI_DEVICE_ID, &val);
+ inq.pcie_device_id = cpu_to_be16(val);
+
+ pci_read_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID, &val);
+ inq.pcie_subsystem_vendor_id = cpu_to_be16(val);
+
+ pci_read_config_word(pdev, PCI_SUBSYSTEM_ID, &val);
+ inq.pcie_subsystem_device_id = cpu_to_be16(val);
+
+ /* Driver version, fixed lenth, padded with spaces on the right */
+ inq.driver_version_length = sizeof(inq.driver_version);
+ memset(&inq.driver_version, ' ', sizeof(inq.driver_version));
+ memcpy(inq.driver_version, DRV_VER_COMPL,
+ min(sizeof(inq.driver_version), strlen(DRV_VER_COMPL)));
+
+ inq.page_length = cpu_to_be16((sizeof(inq) - 4));
+
+ /* Clear the error set by the device */
+ skcomp->status = SAM_STAT_GOOD;
+ memset((void *)skerr, 0, sizeof(*skerr));
+
+ /* copy response into output buffer */
+ max_bytes = (cdb[3] << 8) | cdb[4];
+ memcpy(buf, &inq, min_t(unsigned, max_bytes, sizeof(inq)));
+
+ skcomp->num_returned_bytes =
+ be32_to_cpu(min_t(uint16_t, max_bytes, sizeof(inq)));
+}
+
+static void skd_do_driver_inq(struct skd_device *skdev,
+ volatile struct fit_completion_entry_v1 *skcomp,
+ volatile struct fit_comp_error_info *skerr,
+ uint8_t *cdb, uint8_t *buf)
+{
+ if (!buf)
+ return;
+ else if (cdb[0] != INQUIRY)
+ return; /* Not an INQUIRY */
+ else if ((cdb[1] & 1) == 0)
+ return; /* EVPD not set */
+ else if (cdb[2] == 0)
+ /* Need to add driver's page to supported pages list */
+ skd_do_inq_page_00(skdev, skcomp, skerr, cdb, buf);
+ else if (cdb[2] == DRIVER_INQ_EVPD_PAGE_CODE)
+ /* Caller requested driver's page */
+ skd_do_inq_page_da(skdev, skcomp, skerr, cdb, buf);
+}
+
+static unsigned char *skd_sg_1st_page_ptr(struct scatterlist *sg)
+{
+ if (!sg)
+ return NULL;
+ if (!sg_page(sg))
+ return NULL;
+ return sg_virt(sg);
+}
+
+static void skd_process_scsi_inq(struct skd_device *skdev,
+ volatile struct fit_completion_entry_v1
+ *skcomp,
+ volatile struct fit_comp_error_info *skerr,
+ struct skd_special_context *skspcl)
+{
+ uint8_t *buf;
+ struct fit_msg_hdr *fmh = (struct fit_msg_hdr *)skspcl->msg_buf;
+ struct skd_scsi_request *scsi_req = (struct skd_scsi_request *)&fmh[1];
+
+ dma_sync_sg_for_cpu(skdev->class_dev, skspcl->req.sg, skspcl->req.n_sg,
+ skspcl->req.sg_data_dir);
+ buf = skd_sg_1st_page_ptr(skspcl->req.sg);
+
+ if (buf)
+ skd_do_driver_inq(skdev, skcomp, skerr, scsi_req->cdb, buf);
+}
+
+
+static int skd_isr_completion_posted(struct skd_device *skdev,
+ int limit, int *enqueued)
+{
+ volatile struct fit_completion_entry_v1 *skcmp = NULL;
+ volatile struct fit_comp_error_info *skerr;
+ u16 req_id;
+ u32 req_slot;
+ struct skd_request_context *skreq;
+ u16 cmp_cntxt = 0;
+ u8 cmp_status = 0;
+ u8 cmp_cycle = 0;
+ u32 cmp_bytes = 0;
+ int rc = 0;
+ int processed = 0;
+
+ for (;; ) {
+ SKD_ASSERT(skdev->skcomp_ix < SKD_N_COMPLETION_ENTRY);
+
+ skcmp = &skdev->skcomp_table[skdev->skcomp_ix];
+ cmp_cycle = skcmp->cycle;
+ cmp_cntxt = skcmp->tag;
+ cmp_status = skcmp->status;
+ cmp_bytes = be32_to_cpu(skcmp->num_returned_bytes);
+
+ skerr = &skdev->skerr_table[skdev->skcomp_ix];
+
+ pr_debug("%s:%s:%d "
+ "cycle=%d ix=%d got cycle=%d cmdctxt=0x%x stat=%d "
+ "busy=%d rbytes=0x%x proto=%d\n",
+ skdev->name, __func__, __LINE__, skdev->skcomp_cycle,
+ skdev->skcomp_ix, cmp_cycle, cmp_cntxt, cmp_status,
+ skdev->in_flight, cmp_bytes, skdev->proto_ver);
+
+ if (cmp_cycle != skdev->skcomp_cycle) {
+ pr_debug("%s:%s:%d end of completions\n",
+ skdev->name, __func__, __LINE__);
+ break;
+ }
+ /*
+ * Update the completion queue head index and possibly
+ * the completion cycle count. 8-bit wrap-around.
+ */
+ skdev->skcomp_ix++;
+ if (skdev->skcomp_ix >= SKD_N_COMPLETION_ENTRY) {
+ skdev->skcomp_ix = 0;
+ skdev->skcomp_cycle++;
+ }
+
+ /*
+ * The command context is a unique 32-bit ID. The low order
+ * bits help locate the request. The request is usually a
+ * r/w request (see skd_start() above) or a special request.
+ */
+ req_id = cmp_cntxt;
+ req_slot = req_id & SKD_ID_SLOT_AND_TABLE_MASK;
+
+ /* Is this other than a r/w request? */
+ if (req_slot >= skdev->num_req_context) {
+ /*
+ * This is not a completion for a r/w request.
+ */
+ skd_complete_other(skdev, skcmp, skerr);
+ continue;
+ }
+
+ skreq = &skdev->skreq_table[req_slot];
+
+ /*
+ * Make sure the request ID for the slot matches.
+ */
+ if (skreq->id != req_id) {
+ pr_debug("%s:%s:%d mismatch comp_id=0x%x req_id=0x%x\n",
+ skdev->name, __func__, __LINE__,
+ req_id, skreq->id);
+ {
+ u16 new_id = cmp_cntxt;
+ pr_err("(%s): Completion mismatch "
+ "comp_id=0x%04x skreq=0x%04x new=0x%04x\n",
+ skd_name(skdev), req_id,
+ skreq->id, new_id);
+
+ continue;
+ }
+ }
+
+ SKD_ASSERT(skreq->state == SKD_REQ_STATE_BUSY);
+
+ if (skreq->state == SKD_REQ_STATE_ABORTED) {
+ pr_debug("%s:%s:%d reclaim req %p id=%04x\n",
+ skdev->name, __func__, __LINE__,
+ skreq, skreq->id);
+ /* a previously timed out command can
+ * now be cleaned up */
+ skd_release_skreq(skdev, skreq);
+ continue;
+ }
+
+ skreq->completion = *skcmp;
+ if (unlikely(cmp_status == SAM_STAT_CHECK_CONDITION)) {
+ skreq->err_info = *skerr;
+ skd_log_check_status(skdev, cmp_status, skerr->key,
+ skerr->code, skerr->qual,
+ skerr->fruc);
+ }
+ /* Release DMA resources for the request. */
+ if (skreq->n_sg > 0)
+ skd_postop_sg_list(skdev, skreq);
+
+ if (!skreq->req) {
+ pr_debug("%s:%s:%d NULL backptr skdreq %p, "
+ "req=0x%x req_id=0x%x\n",
+ skdev->name, __func__, __LINE__,
+ skreq, skreq->id, req_id);
+ } else {
+ /*
+ * Capture the outcome and post it back to the
+ * native request.
+ */
+ if (likely(cmp_status == SAM_STAT_GOOD))
+ skd_end_request(skdev, skreq, 0);
+ else
+ skd_resolve_req_exception(skdev, skreq);
+ }
+
+ /*
+ * Release the skreq, its FIT msg (if one), timeout slot,
+ * and queue depth.
+ */
+ skd_release_skreq(skdev, skreq);
+
+ /* skd_isr_comp_limit equal zero means no limit */
+ if (limit) {
+ if (++processed >= limit) {
+ rc = 1;
+ break;
+ }
+ }
+ }
+
+ if ((skdev->state == SKD_DRVR_STATE_PAUSING)
+ && (skdev->in_flight) == 0) {
+ skdev->state = SKD_DRVR_STATE_PAUSED;
+ wake_up_interruptible(&skdev->waitq);
+ }
+
+ return rc;
+}
+
+static void skd_complete_other(struct skd_device *skdev,
+ volatile struct fit_completion_entry_v1 *skcomp,
+ volatile struct fit_comp_error_info *skerr)
+{
+ u32 req_id = 0;
+ u32 req_table;
+ u32 req_slot;
+ struct skd_special_context *skspcl;
+
+ req_id = skcomp->tag;
+ req_table = req_id & SKD_ID_TABLE_MASK;
+ req_slot = req_id & SKD_ID_SLOT_MASK;
+
+ pr_debug("%s:%s:%d table=0x%x id=0x%x slot=%d\n",
+ skdev->name, __func__, __LINE__,
+ req_table, req_id, req_slot);
+
+ /*
+ * Based on the request id, determine how to dispatch this completion.
+ * This swich/case is finding the good cases and forwarding the
+ * completion entry. Errors are reported below the switch.
+ */
+ switch (req_table) {
+ case SKD_ID_RW_REQUEST:
+ /*
+ * The caller, skd_completion_posted_isr() above,
+ * handles r/w requests. The only way we get here
+ * is if the req_slot is out of bounds.
+ */
+ break;
+
+ case SKD_ID_SPECIAL_REQUEST:
+ /*
+ * Make sure the req_slot is in bounds and that the id
+ * matches.
+ */
+ if (req_slot < skdev->n_special) {
+ skspcl = &skdev->skspcl_table[req_slot];
+ if (skspcl->req.id == req_id &&
+ skspcl->req.state == SKD_REQ_STATE_BUSY) {
+ skd_complete_special(skdev,
+ skcomp, skerr, skspcl);
+ return;
+ }
+ }
+ break;
+
+ case SKD_ID_INTERNAL:
+ if (req_slot == 0) {
+ skspcl = &skdev->internal_skspcl;
+ if (skspcl->req.id == req_id &&
+ skspcl->req.state == SKD_REQ_STATE_BUSY) {
+ skd_complete_internal(skdev,
+ skcomp, skerr, skspcl);
+ return;
+ }
+ }
+ break;
+
+ case SKD_ID_FIT_MSG:
+ /*
+ * These id's should never appear in a completion record.
+ */
+ break;
+
+ default:
+ /*
+ * These id's should never appear anywhere;
+ */
+ break;
+ }
+
+ /*
+ * If we get here it is a bad or stale id.
+ */
+}
+
+static void skd_complete_special(struct skd_device *skdev,
+ volatile struct fit_completion_entry_v1
+ *skcomp,
+ volatile struct fit_comp_error_info *skerr,
+ struct skd_special_context *skspcl)
+{
+ pr_debug("%s:%s:%d completing special request %p\n",
+ skdev->name, __func__, __LINE__, skspcl);
+ if (skspcl->orphaned) {
+ /* Discard orphaned request */
+ /* ?: Can this release directly or does it need
+ * to use a worker? */
+ pr_debug("%s:%s:%d release orphaned %p\n",
+ skdev->name, __func__, __LINE__, skspcl);
+ skd_release_special(skdev, skspcl);
+ return;
+ }
+
+ skd_process_scsi_inq(skdev, skcomp, skerr, skspcl);
+
+ skspcl->req.state = SKD_REQ_STATE_COMPLETED;
+ skspcl->req.completion = *skcomp;
+ skspcl->req.err_info = *skerr;
+
+ skd_log_check_status(skdev, skspcl->req.completion.status, skerr->key,
+ skerr->code, skerr->qual, skerr->fruc);
+
+ wake_up_interruptible(&skdev->waitq);
+}
+
+/* assume spinlock is already held */
+static void skd_release_special(struct skd_device *skdev,
+ struct skd_special_context *skspcl)
+{
+ int i, was_depleted;
+
+ for (i = 0; i < skspcl->req.n_sg; i++) {
+ struct page *page = sg_page(&skspcl->req.sg[i]);
+ __free_page(page);
+ }
+
+ was_depleted = (skdev->skspcl_free_list == NULL);
+
+ skspcl->req.state = SKD_REQ_STATE_IDLE;
+ skspcl->req.id += SKD_ID_INCR;
+ skspcl->req.next =
+ (struct skd_request_context *)skdev->skspcl_free_list;
+ skdev->skspcl_free_list = (struct skd_special_context *)skspcl;
+
+ if (was_depleted) {
+ pr_debug("%s:%s:%d skspcl was depleted\n",
+ skdev->name, __func__, __LINE__);
+ /* Free list was depleted. Their might be waiters. */
+ wake_up_interruptible(&skdev->waitq);
+ }
+}
+
+static void skd_reset_skcomp(struct skd_device *skdev)
+{
+ u32 nbytes;
+ struct fit_completion_entry_v1 *skcomp;
+
+ nbytes = sizeof(*skcomp) * SKD_N_COMPLETION_ENTRY;
+ nbytes += sizeof(struct fit_comp_error_info) * SKD_N_COMPLETION_ENTRY;
+
+ memset(skdev->skcomp_table, 0, nbytes);
+
+ skdev->skcomp_ix = 0;
+ skdev->skcomp_cycle = 1;
+}
+
+/*
+ *****************************************************************************
+ * INTERRUPTS
+ *****************************************************************************
+ */
+static void skd_completion_worker(struct work_struct *work)
+{
+ struct skd_device *skdev =
+ container_of(work, struct skd_device, completion_worker);
+ unsigned long flags;
+ int flush_enqueued = 0;
+
+ spin_lock_irqsave(&skdev->lock, flags);
+
+ /*
+ * pass in limit=0, which means no limit..
+ * process everything in compq
+ */
+ skd_isr_completion_posted(skdev, 0, &flush_enqueued);
+ skd_request_fn(skdev->queue);
+
+ spin_unlock_irqrestore(&skdev->lock, flags);
+}
+
+static void skd_isr_msg_from_dev(struct skd_device *skdev);
+
+irqreturn_t
+static skd_isr(int irq, void *ptr)
+{
+ struct skd_device *skdev;
+ u32 intstat;
+ u32 ack;
+ int rc = 0;
+ int deferred = 0;
+ int flush_enqueued = 0;
+
+ skdev = (struct skd_device *)ptr;
+ spin_lock(&skdev->lock);
+
+ for (;; ) {
+ intstat = SKD_READL(skdev, FIT_INT_STATUS_HOST);
+
+ ack = FIT_INT_DEF_MASK;
+ ack &= intstat;
+
+ pr_debug("%s:%s:%d intstat=0x%x ack=0x%x\n",
+ skdev->name, __func__, __LINE__, intstat, ack);
+
+ /* As long as there is an int pending on device, keep
+ * running loop. When none, get out, but if we've never
+ * done any processing, call completion handler?
+ */
+ if (ack == 0) {
+ /* No interrupts on device, but run the completion
+ * processor anyway?
+ */
+ if (rc == 0)
+ if (likely (skdev->state
+ == SKD_DRVR_STATE_ONLINE))
+ deferred = 1;
+ break;
+ }
+
+ rc = IRQ_HANDLED;
+
+ SKD_WRITEL(skdev, ack, FIT_INT_STATUS_HOST);
+
+ if (likely((skdev->state != SKD_DRVR_STATE_LOAD) &&
+ (skdev->state != SKD_DRVR_STATE_STOPPING))) {
+ if (intstat & FIT_ISH_COMPLETION_POSTED) {
+ /*
+ * If we have already deferred completion
+ * processing, don't bother running it again
+ */
+ if (deferred == 0)
+ deferred =
+ skd_isr_completion_posted(skdev,
+ skd_isr_comp_limit, &flush_enqueued);
+ }
+
+ if (intstat & FIT_ISH_FW_STATE_CHANGE) {
+ skd_isr_fwstate(skdev);
+ if (skdev->state == SKD_DRVR_STATE_FAULT ||
+ skdev->state ==
+ SKD_DRVR_STATE_DISAPPEARED) {
+ spin_unlock(&skdev->lock);
+ return rc;
+ }
+ }
+
+ if (intstat & FIT_ISH_MSG_FROM_DEV)
+ skd_isr_msg_from_dev(skdev);
+ }
+ }
+
+ if (unlikely(flush_enqueued))
+ skd_request_fn(skdev->queue);
+
+ if (deferred)
+ schedule_work(&skdev->completion_worker);
+ else if (!flush_enqueued)
+ skd_request_fn(skdev->queue);
+
+ spin_unlock(&skdev->lock);
+
+ return rc;
+}
+
+static void skd_drive_fault(struct skd_device *skdev)
+{
+ skdev->state = SKD_DRVR_STATE_FAULT;
+ pr_err("(%s): Drive FAULT\n", skd_name(skdev));
+}
+
+static void skd_drive_disappeared(struct skd_device *skdev)
+{
+ skdev->state = SKD_DRVR_STATE_DISAPPEARED;
+ pr_err("(%s): Drive DISAPPEARED\n", skd_name(skdev));
+}
+
+static void skd_isr_fwstate(struct skd_device *skdev)
+{
+ u32 sense;
+ u32 state;
+ u32 mtd;
+ int prev_driver_state = skdev->state;
+
+ sense = SKD_READL(skdev, FIT_STATUS);
+ state = sense & FIT_SR_DRIVE_STATE_MASK;
+
+ pr_err("(%s): s1120 state %s(%d)=>%s(%d)\n",
+ skd_name(skdev),
+ skd_drive_state_to_str(skdev->drive_state), skdev->drive_state,
+ skd_drive_state_to_str(state), state);
+
+ skdev->drive_state = state;
+
+ switch (skdev->drive_state) {
+ case FIT_SR_DRIVE_INIT:
+ if (skdev->state == SKD_DRVR_STATE_PROTOCOL_MISMATCH) {
+ skd_disable_interrupts(skdev);
+ break;
+ }
+ if (skdev->state == SKD_DRVR_STATE_RESTARTING)
+ skd_recover_requests(skdev, 0);
+ if (skdev->state == SKD_DRVR_STATE_WAIT_BOOT) {
+ skdev->timer_countdown = SKD_STARTING_TIMO;
+ skdev->state = SKD_DRVR_STATE_STARTING;
+ skd_soft_reset(skdev);
+ break;
+ }
+ mtd = FIT_MXD_CONS(FIT_MTD_FITFW_INIT, 0, 0);
+ SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+ skdev->last_mtd = mtd;
+ break;
+
+ case FIT_SR_DRIVE_ONLINE:
+ skdev->cur_max_queue_depth = skd_max_queue_depth;
+ if (skdev->cur_max_queue_depth > skdev->dev_max_queue_depth)
+ skdev->cur_max_queue_depth = skdev->dev_max_queue_depth;
+
+ skdev->queue_low_water_mark =
+ skdev->cur_max_queue_depth * 2 / 3 + 1;
+ if (skdev->queue_low_water_mark < 1)
+ skdev->queue_low_water_mark = 1;
+ pr_info(
+ "(%s): Queue depth limit=%d dev=%d lowat=%d\n",
+ skd_name(skdev),
+ skdev->cur_max_queue_depth,
+ skdev->dev_max_queue_depth, skdev->queue_low_water_mark);
+
+ skd_refresh_device_data(skdev);
+ break;
+
+ case FIT_SR_DRIVE_BUSY:
+ skdev->state = SKD_DRVR_STATE_BUSY;
+ skdev->timer_countdown = SKD_BUSY_TIMO;
+ skd_quiesce_dev(skdev);
+ break;
+ case FIT_SR_DRIVE_BUSY_SANITIZE:
+ /* set timer for 3 seconds, we'll abort any unfinished
+ * commands after that expires
+ */
+ skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE;
+ skdev->timer_countdown = SKD_TIMER_SECONDS(3);
+ blk_start_queue(skdev->queue);
+ break;
+ case FIT_SR_DRIVE_BUSY_ERASE:
+ skdev->state = SKD_DRVR_STATE_BUSY_ERASE;
+ skdev->timer_countdown = SKD_BUSY_TIMO;
+ break;
+ case FIT_SR_DRIVE_OFFLINE:
+ skdev->state = SKD_DRVR_STATE_IDLE;
+ break;
+ case FIT_SR_DRIVE_SOFT_RESET:
+ switch (skdev->state) {
+ case SKD_DRVR_STATE_STARTING:
+ case SKD_DRVR_STATE_RESTARTING:
+ /* Expected by a caller of skd_soft_reset() */
+ break;
+ default:
+ skdev->state = SKD_DRVR_STATE_RESTARTING;
+ break;
+ }
+ break;
+ case FIT_SR_DRIVE_FW_BOOTING:
+ pr_debug("%s:%s:%d ISR FIT_SR_DRIVE_FW_BOOTING %s\n",
+ skdev->name, __func__, __LINE__, skdev->name);
+ skdev->state = SKD_DRVR_STATE_WAIT_BOOT;
+ skdev->timer_countdown = SKD_WAIT_BOOT_TIMO;
+ break;
+
+ case FIT_SR_DRIVE_DEGRADED:
+ case FIT_SR_PCIE_LINK_DOWN:
+ case FIT_SR_DRIVE_NEED_FW_DOWNLOAD:
+ break;
+
+ case FIT_SR_DRIVE_FAULT:
+ skd_drive_fault(skdev);
+ skd_recover_requests(skdev, 0);
+ blk_start_queue(skdev->queue);
+ break;
+
+ /* PCIe bus returned all Fs? */
+ case 0xFF:
+ pr_info("(%s): state=0x%x sense=0x%x\n",
+ skd_name(skdev), state, sense);
+ skd_drive_disappeared(skdev);
+ skd_recover_requests(skdev, 0);
+ blk_start_queue(skdev->queue);
+ break;
+ default:
+ /*
+ * Uknown FW State. Wait for a state we recognize.
+ */
+ break;
+ }
+ pr_err("(%s): Driver state %s(%d)=>%s(%d)\n",
+ skd_name(skdev),
+ skd_skdev_state_to_str(prev_driver_state), prev_driver_state,
+ skd_skdev_state_to_str(skdev->state), skdev->state);
+}
+
+static void skd_recover_requests(struct skd_device *skdev, int requeue)
+{
+ int i;
+
+ for (i = 0; i < skdev->num_req_context; i++) {
+ struct skd_request_context *skreq = &skdev->skreq_table[i];
+
+ if (skreq->state == SKD_REQ_STATE_BUSY) {
+ skd_log_skreq(skdev, skreq, "recover");
+
+ SKD_ASSERT((skreq->id & SKD_ID_INCR) != 0);
+ SKD_ASSERT(skreq->req != NULL);
+
+ /* Release DMA resources for the request. */
+ if (skreq->n_sg > 0)
+ skd_postop_sg_list(skdev, skreq);
+
+ if (requeue &&
+ (unsigned long) ++skreq->req->special <
+ SKD_MAX_RETRIES)
+ blk_requeue_request(skdev->queue, skreq->req);
+ else
+ skd_end_request(skdev, skreq, -EIO);
+
+ skreq->req = NULL;
+
+ skreq->state = SKD_REQ_STATE_IDLE;
+ skreq->id += SKD_ID_INCR;
+ }
+ if (i > 0)
+ skreq[-1].next = skreq;
+ skreq->next = NULL;
+ }
+ skdev->skreq_free_list = skdev->skreq_table;
+
+ for (i = 0; i < skdev->num_fitmsg_context; i++) {
+ struct skd_fitmsg_context *skmsg = &skdev->skmsg_table[i];
+
+ if (skmsg->state == SKD_MSG_STATE_BUSY) {
+ skd_log_skmsg(skdev, skmsg, "salvaged");
+ SKD_ASSERT((skmsg->id & SKD_ID_INCR) != 0);
+ skmsg->state = SKD_MSG_STATE_IDLE;
+ skmsg->id += SKD_ID_INCR;
+ }
+ if (i > 0)
+ skmsg[-1].next = skmsg;
+ skmsg->next = NULL;
+ }
+ skdev->skmsg_free_list = skdev->skmsg_table;
+
+ for (i = 0; i < skdev->n_special; i++) {
+ struct skd_special_context *skspcl = &skdev->skspcl_table[i];
+
+ /* If orphaned, reclaim it because it has already been reported
+ * to the process as an error (it was just waiting for
+ * a completion that didn't come, and now it will never come)
+ * If busy, change to a state that will cause it to error
+ * out in the wait routine and let it do the normal
+ * reporting and reclaiming
+ */
+ if (skspcl->req.state == SKD_REQ_STATE_BUSY) {
+ if (skspcl->orphaned) {
+ pr_debug("%s:%s:%d orphaned %p\n",
+ skdev->name, __func__, __LINE__,
+ skspcl);
+ skd_release_special(skdev, skspcl);
+ } else {
+ pr_debug("%s:%s:%d not orphaned %p\n",
+ skdev->name, __func__, __LINE__,
+ skspcl);
+ skspcl->req.state = SKD_REQ_STATE_ABORTED;
+ }
+ }
+ }
+ skdev->skspcl_free_list = skdev->skspcl_table;
+
+ for (i = 0; i < SKD_N_TIMEOUT_SLOT; i++)
+ skdev->timeout_slot[i] = 0;
+
+ skdev->in_flight = 0;
+}
+
+static void skd_isr_msg_from_dev(struct skd_device *skdev)
+{
+ u32 mfd;
+ u32 mtd;
+ u32 data;
+
+ mfd = SKD_READL(skdev, FIT_MSG_FROM_DEVICE);
+
+ pr_debug("%s:%s:%d mfd=0x%x last_mtd=0x%x\n",
+ skdev->name, __func__, __LINE__, mfd, skdev->last_mtd);
+
+ /* ignore any mtd that is an ack for something we didn't send */
+ if (FIT_MXD_TYPE(mfd) != FIT_MXD_TYPE(skdev->last_mtd))
+ return;
+
+ switch (FIT_MXD_TYPE(mfd)) {
+ case FIT_MTD_FITFW_INIT:
+ skdev->proto_ver = FIT_PROTOCOL_MAJOR_VER(mfd);
+
+ if (skdev->proto_ver != FIT_PROTOCOL_VERSION_1) {
+ pr_err("(%s): protocol mismatch\n",
+ skdev->name);
+ pr_err("(%s): got=%d support=%d\n",
+ skdev->name, skdev->proto_ver,
+ FIT_PROTOCOL_VERSION_1);
+ pr_err("(%s): please upgrade driver\n",
+ skdev->name);
+ skdev->state = SKD_DRVR_STATE_PROTOCOL_MISMATCH;
+ skd_soft_reset(skdev);
+ break;
+ }
+ mtd = FIT_MXD_CONS(FIT_MTD_GET_CMDQ_DEPTH, 0, 0);
+ SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+ skdev->last_mtd = mtd;
+ break;
+
+ case FIT_MTD_GET_CMDQ_DEPTH:
+ skdev->dev_max_queue_depth = FIT_MXD_DATA(mfd);
+ mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_DEPTH, 0,
+ SKD_N_COMPLETION_ENTRY);
+ SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+ skdev->last_mtd = mtd;
+ break;
+
+ case FIT_MTD_SET_COMPQ_DEPTH:
+ SKD_WRITEQ(skdev, skdev->cq_dma_address, FIT_MSG_TO_DEVICE_ARG);
+ mtd = FIT_MXD_CONS(FIT_MTD_SET_COMPQ_ADDR, 0, 0);
+ SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+ skdev->last_mtd = mtd;
+ break;
+
+ case FIT_MTD_SET_COMPQ_ADDR:
+ skd_reset_skcomp(skdev);
+ mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_HOST_ID, 0, skdev->devno);
+ SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+ skdev->last_mtd = mtd;
+ break;
+
+ case FIT_MTD_CMD_LOG_HOST_ID:
+ skdev->connect_time_stamp = get_seconds();
+ data = skdev->connect_time_stamp & 0xFFFF;
+ mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_LO, 0, data);
+ SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+ skdev->last_mtd = mtd;
+ break;
+
+ case FIT_MTD_CMD_LOG_TIME_STAMP_LO:
+ skdev->drive_jiffies = FIT_MXD_DATA(mfd);
+ data = (skdev->connect_time_stamp >> 16) & 0xFFFF;
+ mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_HI, 0, data);
+ SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+ skdev->last_mtd = mtd;
+ break;
+
+ case FIT_MTD_CMD_LOG_TIME_STAMP_HI:
+ skdev->drive_jiffies |= (FIT_MXD_DATA(mfd) << 16);
+ mtd = FIT_MXD_CONS(FIT_MTD_ARM_QUEUE, 0, 0);
+ SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE);
+ skdev->last_mtd = mtd;
+
+ pr_err("(%s): Time sync driver=0x%x device=0x%x\n",
+ skd_name(skdev),
+ skdev->connect_time_stamp, skdev->drive_jiffies);
+ break;
+
+ case FIT_MTD_ARM_QUEUE:
+ skdev->last_mtd = 0;
+ /*
+ * State should be, or soon will be, FIT_SR_DRIVE_ONLINE.
+ */
+ break;
+
+ default:
+ break;
+ }
+}
+
+static void skd_disable_interrupts(struct skd_device *skdev)
+{
+ u32 sense;
+
+ sense = SKD_READL(skdev, FIT_CONTROL);
+ sense &= ~FIT_CR_ENABLE_INTERRUPTS;
+ SKD_WRITEL(skdev, sense, FIT_CONTROL);
+ pr_debug("%s:%s:%d sense 0x%x\n",
+ skdev->name, __func__, __LINE__, sense);
+
+ /* Note that the 1s is written. A 1-bit means
+ * disable, a 0 means enable.
+ */
+ SKD_WRITEL(skdev, ~0, FIT_INT_MASK_HOST);
+}
+
+static void skd_enable_interrupts(struct skd_device *skdev)
+{
+ u32 val;
+
+ /* unmask interrupts first */
+ val = FIT_ISH_FW_STATE_CHANGE +
+ FIT_ISH_COMPLETION_POSTED + FIT_ISH_MSG_FROM_DEV;
+
+ /* Note that the compliment of mask is written. A 1-bit means
+ * disable, a 0 means enable. */
+ SKD_WRITEL(skdev, ~val, FIT_INT_MASK_HOST);
+ pr_debug("%s:%s:%d interrupt mask=0x%x\n",
+ skdev->name, __func__, __LINE__, ~val);
+
+ val = SKD_READL(skdev, FIT_CONTROL);
+ val |= FIT_CR_ENABLE_INTERRUPTS;
+ pr_debug("%s:%s:%d control=0x%x\n",
+ skdev->name, __func__, __LINE__, val);
+ SKD_WRITEL(skdev, val, FIT_CONTROL);
+}
+
+/*
+ *****************************************************************************
+ * START, STOP, RESTART, QUIESCE, UNQUIESCE
+ *****************************************************************************
+ */
+
+static void skd_soft_reset(struct skd_device *skdev)
+{
+ u32 val;
+
+ val = SKD_READL(skdev, FIT_CONTROL);
+ val |= (FIT_CR_SOFT_RESET);
+ pr_debug("%s:%s:%d control=0x%x\n",
+ skdev->name, __func__, __LINE__, val);
+ SKD_WRITEL(skdev, val, FIT_CONTROL);
+}
+
+static void skd_start_device(struct skd_device *skdev)
+{
+ unsigned long flags;
+ u32 sense;
+ u32 state;
+
+ spin_lock_irqsave(&skdev->lock, flags);
+
+ /* ack all ghost interrupts */
+ SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST);
+
+ sense = SKD_READL(skdev, FIT_STATUS);
+
+ pr_debug("%s:%s:%d initial status=0x%x\n",
+ skdev->name, __func__, __LINE__, sense);
+
+ state = sense & FIT_SR_DRIVE_STATE_MASK;
+ skdev->drive_state = state;
+ skdev->last_mtd = 0;
+
+ skdev->state = SKD_DRVR_STATE_STARTING;
+ skdev->timer_countdown = SKD_STARTING_TIMO;
+
+ skd_enable_interrupts(skdev);
+
+ switch (skdev->drive_state) {
+ case FIT_SR_DRIVE_OFFLINE:
+ pr_err("(%s): Drive offline...\n", skd_name(skdev));
+ break;
+
+ case FIT_SR_DRIVE_FW_BOOTING:
+ pr_debug("%s:%s:%d FIT_SR_DRIVE_FW_BOOTING %s\n",
+ skdev->name, __func__, __LINE__, skdev->name);
+ skdev->state = SKD_DRVR_STATE_WAIT_BOOT;
+ skdev->timer_countdown = SKD_WAIT_BOOT_TIMO;
+ break;
+
+ case FIT_SR_DRIVE_BUSY_SANITIZE:
+ pr_info("(%s): Start: BUSY_SANITIZE\n",
+ skd_name(skdev));
+ skdev->state = SKD_DRVR_STATE_BUSY_SANITIZE;
+ skdev->timer_countdown = SKD_STARTED_BUSY_TIMO;
+ break;
+
+ case FIT_SR_DRIVE_BUSY_ERASE:
+ pr_info("(%s): Start: BUSY_ERASE\n", skd_name(skdev));
+ skdev->state = SKD_DRVR_STATE_BUSY_ERASE;
+ skdev->timer_countdown = SKD_STARTED_BUSY_TIMO;
+ break;
+
+ case FIT_SR_DRIVE_INIT:
+ case FIT_SR_DRIVE_ONLINE:
+ skd_soft_reset(skdev);
+ break;
+
+ case FIT_SR_DRIVE_BUSY:
+ pr_err("(%s): Drive Busy...\n", skd_name(skdev));
+ skdev->state = SKD_DRVR_STATE_BUSY;
+ skdev->timer_countdown = SKD_STARTED_BUSY_TIMO;
+ break;
+
+ case FIT_SR_DRIVE_SOFT_RESET:
+ pr_err("(%s) drive soft reset in prog\n",
+ skd_name(skdev));
+ break;
+
+ case FIT_SR_DRIVE_FAULT:
+ /* Fault state is bad...soft reset won't do it...
+ * Hard reset, maybe, but does it work on device?
+ * For now, just fault so the system doesn't hang.
+ */
+ skd_drive_fault(skdev);
+ /*start the queue so we can respond with error to requests */
+ pr_debug("%s:%s:%d starting %s queue\n",
+ skdev->name, __func__, __LINE__, skdev->name);
+ blk_start_queue(skdev->queue);
+ skdev->gendisk_on = -1;
+ wake_up_interruptible(&skdev->waitq);
+ break;
+
+ case 0xFF:
+ /* Most likely the device isn't there or isn't responding
+ * to the BAR1 addresses. */
+ skd_drive_disappeared(skdev);
+ /*start the queue so we can respond with error to requests */
+ pr_debug("%s:%s:%d starting %s queue to error-out reqs\n",
+ skdev->name, __func__, __LINE__, skdev->name);
+ blk_start_queue(skdev->queue);
+ skdev->gendisk_on = -1;
+ wake_up_interruptible(&skdev->waitq);
+ break;
+
+ default:
+ pr_err("(%s) Start: unknown state %x\n",
+ skd_name(skdev), skdev->drive_state);
+ break;
+ }
+
+ state = SKD_READL(skdev, FIT_CONTROL);
+ pr_debug("%s:%s:%d FIT Control Status=0x%x\n",
+ skdev->name, __func__, __LINE__, state);
+
+ state = SKD_READL(skdev, FIT_INT_STATUS_HOST);
+ pr_debug("%s:%s:%d Intr Status=0x%x\n",
+ skdev->name, __func__, __LINE__, state);
+
+ state = SKD_READL(skdev, FIT_INT_MASK_HOST);
+ pr_debug("%s:%s:%d Intr Mask=0x%x\n",
+ skdev->name, __func__, __LINE__, state);
+
+ state = SKD_READL(skdev, FIT_MSG_FROM_DEVICE);
+ pr_debug("%s:%s:%d Msg from Dev=0x%x\n",
+ skdev->name, __func__, __LINE__, state);
+
+ state = SKD_READL(skdev, FIT_HW_VERSION);
+ pr_debug("%s:%s:%d HW version=0x%x\n",
+ skdev->name, __func__, __LINE__, state);
+
+ spin_unlock_irqrestore(&skdev->lock, flags);
+}
+
+static void skd_stop_device(struct skd_device *skdev)
+{
+ unsigned long flags;
+ struct skd_special_context *skspcl = &skdev->internal_skspcl;
+ u32 dev_state;
+ int i;
+
+ spin_lock_irqsave(&skdev->lock, flags);
+
+ if (skdev->state != SKD_DRVR_STATE_ONLINE) {
+ pr_err("(%s): skd_stop_device not online no sync\n",
+ skd_name(skdev));
+ goto stop_out;
+ }
+
+ if (skspcl->req.state != SKD_REQ_STATE_IDLE) {
+ pr_err("(%s): skd_stop_device no special\n",
+ skd_name(skdev));
+ goto stop_out;
+ }
+
+ skdev->state = SKD_DRVR_STATE_SYNCING;
+ skdev->sync_done = 0;
+
+ skd_send_internal_skspcl(skdev, skspcl, SYNCHRONIZE_CACHE);
+
+ spin_unlock_irqrestore(&skdev->lock, flags);
+
+ wait_event_interruptible_timeout(skdev->waitq,
+ (skdev->sync_done), (10 * HZ));
+
+ spin_lock_irqsave(&skdev->lock, flags);
+
+ switch (skdev->sync_done) {
+ case 0:
+ pr_err("(%s): skd_stop_device no sync\n",
+ skd_name(skdev));
+ break;
+ case 1:
+ pr_err("(%s): skd_stop_device sync done\n",
+ skd_name(skdev));
+ break;
+ default:
+ pr_err("(%s): skd_stop_device sync error\n",
+ skd_name(skdev));
+ }
+
+stop_out:
+ skdev->state = SKD_DRVR_STATE_STOPPING;
+ spin_unlock_irqrestore(&skdev->lock, flags);
+
+ skd_kill_timer(skdev);
+
+ spin_lock_irqsave(&skdev->lock, flags);
+ skd_disable_interrupts(skdev);
+
+ /* ensure all ints on device are cleared */
+ /* soft reset the device to unload with a clean slate */
+ SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST);
+ SKD_WRITEL(skdev, FIT_CR_SOFT_RESET, FIT_CONTROL);
+
+ spin_unlock_irqrestore(&skdev->lock, flags);
+
+ /* poll every 100ms, 1 second timeout */
+ for (i = 0; i < 10; i++) {
+ dev_state =
+ SKD_READL(skdev, FIT_STATUS) & FIT_SR_DRIVE_STATE_MASK;
+ if (dev_state == FIT_SR_DRIVE_INIT)
+ break;
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(msecs_to_jiffies(100));
+ }
+
+ if (dev_state != FIT_SR_DRIVE_INIT)
+ pr_err("(%s): skd_stop_device state error 0x%02x\n",
+ skd_name(skdev), dev_state);
+}
+
+/* assume spinlock is held */
+static void skd_restart_device(struct skd_device *skdev)
+{
+ u32 state;
+
+ /* ack all ghost interrupts */
+ SKD_WRITEL(skdev, FIT_INT_DEF_MASK, FIT_INT_STATUS_HOST);
+
+ state = SKD_READL(skdev, FIT_STATUS);
+
+ pr_debug("%s:%s:%d drive status=0x%x\n",
+ skdev->name, __func__, __LINE__, state);
+
+ state &= FIT_SR_DRIVE_STATE_MASK;
+ skdev->drive_state = state;
+ skdev->last_mtd = 0;
+
+ skdev->state = SKD_DRVR_STATE_RESTARTING;
+ skdev->timer_countdown = SKD_RESTARTING_TIMO;
+
+ skd_soft_reset(skdev);
+}
+
+/* assume spinlock is held */
+static int skd_quiesce_dev(struct skd_device *skdev)
+{
+ int rc = 0;
+
+ switch (skdev->state) {
+ case SKD_DRVR_STATE_BUSY:
+ case SKD_DRVR_STATE_BUSY_IMMINENT:
+ pr_debug("%s:%s:%d stopping %s queue\n",
+ skdev->name, __func__, __LINE__, skdev->name);
+ blk_stop_queue(skdev->queue);
+ break;
+ case SKD_DRVR_STATE_ONLINE:
+ case SKD_DRVR_STATE_STOPPING:
+ case SKD_DRVR_STATE_SYNCING:
+ case SKD_DRVR_STATE_PAUSING:
+ case SKD_DRVR_STATE_PAUSED:
+ case SKD_DRVR_STATE_STARTING:
+ case SKD_DRVR_STATE_RESTARTING:
+ case SKD_DRVR_STATE_RESUMING:
+ default:
+ rc = -EINVAL;
+ pr_debug("%s:%s:%d state [%d] not implemented\n",
+ skdev->name, __func__, __LINE__, skdev->state);
+ }
+ return rc;
+}
+
+/* assume spinlock is held */
+static int skd_unquiesce_dev(struct skd_device *skdev)
+{
+ int prev_driver_state = skdev->state;
+
+ skd_log_skdev(skdev, "unquiesce");
+ if (skdev->state == SKD_DRVR_STATE_ONLINE) {
+ pr_debug("%s:%s:%d **** device already ONLINE\n",
+ skdev->name, __func__, __LINE__);
+ return 0;
+ }
+ if (skdev->drive_state != FIT_SR_DRIVE_ONLINE) {
+ /*
+ * If there has been an state change to other than
+ * ONLINE, we will rely on controller state change
+ * to come back online and restart the queue.
+ * The BUSY state means that driver is ready to
+ * continue normal processing but waiting for controller
+ * to become available.
+ */
+ skdev->state = SKD_DRVR_STATE_BUSY;
+ pr_debug("%s:%s:%d drive BUSY state\n",
+ skdev->name, __func__, __LINE__);
+ return 0;
+ }
+
+ /*
+ * Drive has just come online, driver is either in startup,
+ * paused performing a task, or bust waiting for hardware.
+ */
+ switch (skdev->state) {
+ case SKD_DRVR_STATE_PAUSED:
+ case SKD_DRVR_STATE_BUSY:
+ case SKD_DRVR_STATE_BUSY_IMMINENT:
+ case SKD_DRVR_STATE_BUSY_ERASE:
+ case SKD_DRVR_STATE_STARTING:
+ case SKD_DRVR_STATE_RESTARTING:
+ case SKD_DRVR_STATE_FAULT:
+ case SKD_DRVR_STATE_IDLE:
+ case SKD_DRVR_STATE_LOAD:
+ skdev->state = SKD_DRVR_STATE_ONLINE;
+ pr_err("(%s): Driver state %s(%d)=>%s(%d)\n",
+ skd_name(skdev),
+ skd_skdev_state_to_str(prev_driver_state),
+ prev_driver_state, skd_skdev_state_to_str(skdev->state),
+ skdev->state);
+ pr_debug("%s:%s:%d **** device ONLINE...starting block queue\n",
+ skdev->name, __func__, __LINE__);
+ pr_debug("%s:%s:%d starting %s queue\n",
+ skdev->name, __func__, __LINE__, skdev->name);
+ pr_info("(%s): STEC s1120 ONLINE\n", skd_name(skdev));
+ blk_start_queue(skdev->queue);
+ skdev->gendisk_on = 1;
+ wake_up_interruptible(&skdev->waitq);
+ break;
+
+ case SKD_DRVR_STATE_DISAPPEARED:
+ default:
+ pr_debug("%s:%s:%d **** driver state %d, not implemented \n",
+ skdev->name, __func__, __LINE__,
+ skdev->state);
+ return -EBUSY;
+ }
+ return 0;
+}
+
+/*
+ *****************************************************************************
+ * PCIe MSI/MSI-X INTERRUPT HANDLERS
+ *****************************************************************************
+ */
+
+static irqreturn_t skd_reserved_isr(int irq, void *skd_host_data)
+{
+ struct skd_device *skdev = skd_host_data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&skdev->lock, flags);
+ pr_debug("%s:%s:%d MSIX = 0x%x\n",
+ skdev->name, __func__, __LINE__,
+ SKD_READL(skdev, FIT_INT_STATUS_HOST));
+ pr_err("(%s): MSIX reserved irq %d = 0x%x\n", skd_name(skdev),
+ irq, SKD_READL(skdev, FIT_INT_STATUS_HOST));
+ SKD_WRITEL(skdev, FIT_INT_RESERVED_MASK, FIT_INT_STATUS_HOST);
+ spin_unlock_irqrestore(&skdev->lock, flags);
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t skd_statec_isr(int irq, void *skd_host_data)
+{
+ struct skd_device *skdev = skd_host_data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&skdev->lock, flags);
+ pr_debug("%s:%s:%d MSIX = 0x%x\n",
+ skdev->name, __func__, __LINE__,
+ SKD_READL(skdev, FIT_INT_STATUS_HOST));
+ SKD_WRITEL(skdev, FIT_ISH_FW_STATE_CHANGE, FIT_INT_STATUS_HOST);
+ skd_isr_fwstate(skdev);
+ spin_unlock_irqrestore(&skdev->lock, flags);
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t skd_comp_q(int irq, void *skd_host_data)
+{
+ struct skd_device *skdev = skd_host_data;
+ unsigned long flags;
+ int flush_enqueued = 0;
+ int deferred;
+
+ spin_lock_irqsave(&skdev->lock, flags);
+ pr_debug("%s:%s:%d MSIX = 0x%x\n",
+ skdev->name, __func__, __LINE__,
+ SKD_READL(skdev, FIT_INT_STATUS_HOST));
+ SKD_WRITEL(skdev, FIT_ISH_COMPLETION_POSTED, FIT_INT_STATUS_HOST);
+ deferred = skd_isr_completion_posted(skdev, skd_isr_comp_limit,
+ &flush_enqueued);
+ if (flush_enqueued)
+ skd_request_fn(skdev->queue);
+
+ if (deferred)
+ schedule_work(&skdev->completion_worker);
+ else if (!flush_enqueued)
+ skd_request_fn(skdev->queue);
+
+ spin_unlock_irqrestore(&skdev->lock, flags);
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t skd_msg_isr(int irq, void *skd_host_data)
+{
+ struct skd_device *skdev = skd_host_data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&skdev->lock, flags);
+ pr_debug("%s:%s:%d MSIX = 0x%x\n",
+ skdev->name, __func__, __LINE__,
+ SKD_READL(skdev, FIT_INT_STATUS_HOST));
+ SKD_WRITEL(skdev, FIT_ISH_MSG_FROM_DEV, FIT_INT_STATUS_HOST);
+ skd_isr_msg_from_dev(skdev);
+ spin_unlock_irqrestore(&skdev->lock, flags);
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t skd_qfull_isr(int irq, void *skd_host_data)
+{
+ struct skd_device *skdev = skd_host_data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&skdev->lock, flags);
+ pr_debug("%s:%s:%d MSIX = 0x%x\n",
+ skdev->name, __func__, __LINE__,
+ SKD_READL(skdev, FIT_INT_STATUS_HOST));
+ SKD_WRITEL(skdev, FIT_INT_QUEUE_FULL, FIT_INT_STATUS_HOST);
+ spin_unlock_irqrestore(&skdev->lock, flags);
+ return IRQ_HANDLED;
+}
+
+/*
+ *****************************************************************************
+ * PCIe MSI/MSI-X SETUP
+ *****************************************************************************
+ */
+
+struct skd_msix_entry {
+ int have_irq;
+ u32 vector;
+ u32 entry;
+ struct skd_device *rsp;
+ char isr_name[30];
+};
+
+struct skd_init_msix_entry {
+ const char *name;
+ irq_handler_t handler;
+};
+
+#define SKD_MAX_MSIX_COUNT 13
+#define SKD_MIN_MSIX_COUNT 7
+#define SKD_BASE_MSIX_IRQ 4
+
+static struct skd_init_msix_entry msix_entries[SKD_MAX_MSIX_COUNT] = {
+ { "(DMA 0)", skd_reserved_isr },
+ { "(DMA 1)", skd_reserved_isr },
+ { "(DMA 2)", skd_reserved_isr },
+ { "(DMA 3)", skd_reserved_isr },
+ { "(State Change)", skd_statec_isr },
+ { "(COMPL_Q)", skd_comp_q },
+ { "(MSG)", skd_msg_isr },
+ { "(Reserved)", skd_reserved_isr },
+ { "(Reserved)", skd_reserved_isr },
+ { "(Queue Full 0)", skd_qfull_isr },
+ { "(Queue Full 1)", skd_qfull_isr },
+ { "(Queue Full 2)", skd_qfull_isr },
+ { "(Queue Full 3)", skd_qfull_isr },
+};
+
+static void skd_release_msix(struct skd_device *skdev)
+{
+ struct skd_msix_entry *qentry;
+ int i;
+
+ if (skdev->msix_entries) {
+ for (i = 0; i < skdev->msix_count; i++) {
+ qentry = &skdev->msix_entries[i];
+ skdev = qentry->rsp;
+
+ if (qentry->have_irq)
+ devm_free_irq(&skdev->pdev->dev,
+ qentry->vector, qentry->rsp);
+ }
+
+ kfree(skdev->msix_entries);
+ }
+
+ if (skdev->msix_count)
+ pci_disable_msix(skdev->pdev);
+
+ skdev->msix_count = 0;
+ skdev->msix_entries = NULL;
+}
+
+static int skd_acquire_msix(struct skd_device *skdev)
+{
+ int i, rc;
+ struct pci_dev *pdev = skdev->pdev;
+ struct msix_entry *entries;
+ struct skd_msix_entry *qentry;
+
+ entries = kzalloc(sizeof(struct msix_entry) * SKD_MAX_MSIX_COUNT,
+ GFP_KERNEL);
+ if (!entries)
+ return -ENOMEM;
+
+ for (i = 0; i < SKD_MAX_MSIX_COUNT; i++)
+ entries[i].entry = i;
+
+ rc = pci_enable_msix_exact(pdev, entries, SKD_MAX_MSIX_COUNT);
+ if (rc) {
+ pr_err("(%s): failed to enable MSI-X %d\n",
+ skd_name(skdev), rc);
+ goto msix_out;
+ }
+
+ skdev->msix_count = SKD_MAX_MSIX_COUNT;
+ skdev->msix_entries = kzalloc(sizeof(struct skd_msix_entry) *
+ skdev->msix_count, GFP_KERNEL);
+ if (!skdev->msix_entries) {
+ rc = -ENOMEM;
+ pr_err("(%s): msix table allocation error\n",
+ skd_name(skdev));
+ goto msix_out;
+ }
+
+ for (i = 0; i < skdev->msix_count; i++) {
+ qentry = &skdev->msix_entries[i];
+ qentry->vector = entries[i].vector;
+ qentry->entry = entries[i].entry;
+ qentry->rsp = NULL;
+ qentry->have_irq = 0;
+ pr_debug("%s:%s:%d %s: <%s> msix (%d) vec %d, entry %x\n",
+ skdev->name, __func__, __LINE__,
+ pci_name(pdev), skdev->name,
+ i, qentry->vector, qentry->entry);
+ }
+
+ /* Enable MSI-X vectors for the base queue */
+ for (i = 0; i < skdev->msix_count; i++) {
+ qentry = &skdev->msix_entries[i];
+ snprintf(qentry->isr_name, sizeof(qentry->isr_name),
+ "%s%d-msix %s", DRV_NAME, skdev->devno,
+ msix_entries[i].name);
+ rc = devm_request_irq(&skdev->pdev->dev, qentry->vector,
+ msix_entries[i].handler, 0,
+ qentry->isr_name, skdev);
+ if (rc) {
+ pr_err("(%s): Unable to register(%d) MSI-X "
+ "handler %d: %s\n",
+ skd_name(skdev), rc, i, qentry->isr_name);
+ goto msix_out;
+ } else {
+ qentry->have_irq = 1;
+ qentry->rsp = skdev;
+ }
+ }
+ pr_debug("%s:%s:%d %s: <%s> msix %d irq(s) enabled\n",
+ skdev->name, __func__, __LINE__,
+ pci_name(pdev), skdev->name, skdev->msix_count);
+ return 0;
+
+msix_out:
+ if (entries)
+ kfree(entries);
+ skd_release_msix(skdev);
+ return rc;
+}
+
+static int skd_acquire_irq(struct skd_device *skdev)
+{
+ int rc;
+ struct pci_dev *pdev;
+
+ pdev = skdev->pdev;
+ skdev->msix_count = 0;
+
+RETRY_IRQ_TYPE:
+ switch (skdev->irq_type) {
+ case SKD_IRQ_MSIX:
+ rc = skd_acquire_msix(skdev);
+ if (!rc)
+ pr_info("(%s): MSI-X %d irqs enabled\n",
+ skd_name(skdev), skdev->msix_count);
+ else {
+ pr_err(
+ "(%s): failed to enable MSI-X, re-trying with MSI %d\n",
+ skd_name(skdev), rc);
+ skdev->irq_type = SKD_IRQ_MSI;
+ goto RETRY_IRQ_TYPE;
+ }
+ break;
+ case SKD_IRQ_MSI:
+ snprintf(skdev->isr_name, sizeof(skdev->isr_name), "%s%d-msi",
+ DRV_NAME, skdev->devno);
+ rc = pci_enable_msi_range(pdev, 1, 1);
+ if (rc > 0) {
+ rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr, 0,
+ skdev->isr_name, skdev);
+ if (rc) {
+ pci_disable_msi(pdev);
+ pr_err(
+ "(%s): failed to allocate the MSI interrupt %d\n",
+ skd_name(skdev), rc);
+ goto RETRY_IRQ_LEGACY;
+ }
+ pr_info("(%s): MSI irq %d enabled\n",
+ skd_name(skdev), pdev->irq);
+ } else {
+RETRY_IRQ_LEGACY:
+ pr_err(
+ "(%s): failed to enable MSI, re-trying with LEGACY %d\n",
+ skd_name(skdev), rc);
+ skdev->irq_type = SKD_IRQ_LEGACY;
+ goto RETRY_IRQ_TYPE;
+ }
+ break;
+ case SKD_IRQ_LEGACY:
+ snprintf(skdev->isr_name, sizeof(skdev->isr_name),
+ "%s%d-legacy", DRV_NAME, skdev->devno);
+ rc = devm_request_irq(&pdev->dev, pdev->irq, skd_isr,
+ IRQF_SHARED, skdev->isr_name, skdev);
+ if (!rc)
+ pr_info("(%s): LEGACY irq %d enabled\n",
+ skd_name(skdev), pdev->irq);
+ else
+ pr_err("(%s): request LEGACY irq error %d\n",
+ skd_name(skdev), rc);
+ break;
+ default:
+ pr_info("(%s): irq_type %d invalid, re-set to %d\n",
+ skd_name(skdev), skdev->irq_type, SKD_IRQ_DEFAULT);
+ skdev->irq_type = SKD_IRQ_LEGACY;
+ goto RETRY_IRQ_TYPE;
+ }
+ return rc;
+}
+
+static void skd_release_irq(struct skd_device *skdev)
+{
+ switch (skdev->irq_type) {
+ case SKD_IRQ_MSIX:
+ skd_release_msix(skdev);
+ break;
+ case SKD_IRQ_MSI:
+ devm_free_irq(&skdev->pdev->dev, skdev->pdev->irq, skdev);
+ pci_disable_msi(skdev->pdev);
+ break;
+ case SKD_IRQ_LEGACY:
+ devm_free_irq(&skdev->pdev->dev, skdev->pdev->irq, skdev);
+ break;
+ default:
+ pr_err("(%s): wrong irq type %d!",
+ skd_name(skdev), skdev->irq_type);
+ break;
+ }
+}
+
+/*
+ *****************************************************************************
+ * CONSTRUCT
+ *****************************************************************************
+ */
+
+static int skd_cons_skcomp(struct skd_device *skdev)
+{
+ int rc = 0;
+ struct fit_completion_entry_v1 *skcomp;
+ u32 nbytes;
+
+ nbytes = sizeof(*skcomp) * SKD_N_COMPLETION_ENTRY;
+ nbytes += sizeof(struct fit_comp_error_info) * SKD_N_COMPLETION_ENTRY;
+
+ pr_debug("%s:%s:%d comp pci_alloc, total bytes %d entries %d\n",
+ skdev->name, __func__, __LINE__,
+ nbytes, SKD_N_COMPLETION_ENTRY);
+
+ skcomp = pci_zalloc_consistent(skdev->pdev, nbytes,
+ &skdev->cq_dma_address);
+
+ if (skcomp == NULL) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+
+ skdev->skcomp_table = skcomp;
+ skdev->skerr_table = (struct fit_comp_error_info *)((char *)skcomp +
+ sizeof(*skcomp) *
+ SKD_N_COMPLETION_ENTRY);
+
+err_out:
+ return rc;
+}
+
+static int skd_cons_skmsg(struct skd_device *skdev)
+{
+ int rc = 0;
+ u32 i;
+
+ pr_debug("%s:%s:%d skmsg_table kzalloc, struct %lu, count %u total %lu\n",
+ skdev->name, __func__, __LINE__,
+ sizeof(struct skd_fitmsg_context),
+ skdev->num_fitmsg_context,
+ sizeof(struct skd_fitmsg_context) * skdev->num_fitmsg_context);
+
+ skdev->skmsg_table = kzalloc(sizeof(struct skd_fitmsg_context)
+ *skdev->num_fitmsg_context, GFP_KERNEL);
+ if (skdev->skmsg_table == NULL) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+
+ for (i = 0; i < skdev->num_fitmsg_context; i++) {
+ struct skd_fitmsg_context *skmsg;
+
+ skmsg = &skdev->skmsg_table[i];
+
+ skmsg->id = i + SKD_ID_FIT_MSG;
+
+ skmsg->state = SKD_MSG_STATE_IDLE;
+ skmsg->msg_buf = pci_alloc_consistent(skdev->pdev,
+ SKD_N_FITMSG_BYTES + 64,
+ &skmsg->mb_dma_address);
+
+ if (skmsg->msg_buf == NULL) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+
+ skmsg->offset = (u32)((u64)skmsg->msg_buf &
+ (~FIT_QCMD_BASE_ADDRESS_MASK));
+ skmsg->msg_buf += ~FIT_QCMD_BASE_ADDRESS_MASK;
+ skmsg->msg_buf = (u8 *)((u64)skmsg->msg_buf &
+ FIT_QCMD_BASE_ADDRESS_MASK);
+ skmsg->mb_dma_address += ~FIT_QCMD_BASE_ADDRESS_MASK;
+ skmsg->mb_dma_address &= FIT_QCMD_BASE_ADDRESS_MASK;
+ memset(skmsg->msg_buf, 0, SKD_N_FITMSG_BYTES);
+
+ skmsg->next = &skmsg[1];
+ }
+
+ /* Free list is in order starting with the 0th entry. */
+ skdev->skmsg_table[i - 1].next = NULL;
+ skdev->skmsg_free_list = skdev->skmsg_table;
+
+err_out:
+ return rc;
+}
+
+static struct fit_sg_descriptor *skd_cons_sg_list(struct skd_device *skdev,
+ u32 n_sg,
+ dma_addr_t *ret_dma_addr)
+{
+ struct fit_sg_descriptor *sg_list;
+ u32 nbytes;
+
+ nbytes = sizeof(*sg_list) * n_sg;
+
+ sg_list = pci_alloc_consistent(skdev->pdev, nbytes, ret_dma_addr);
+
+ if (sg_list != NULL) {
+ uint64_t dma_address = *ret_dma_addr;
+ u32 i;
+
+ memset(sg_list, 0, nbytes);
+
+ for (i = 0; i < n_sg - 1; i++) {
+ uint64_t ndp_off;
+ ndp_off = (i + 1) * sizeof(struct fit_sg_descriptor);
+
+ sg_list[i].next_desc_ptr = dma_address + ndp_off;
+ }
+ sg_list[i].next_desc_ptr = 0LL;
+ }
+
+ return sg_list;
+}
+
+static int skd_cons_skreq(struct skd_device *skdev)
+{
+ int rc = 0;
+ u32 i;
+
+ pr_debug("%s:%s:%d skreq_table kzalloc, struct %lu, count %u total %lu\n",
+ skdev->name, __func__, __LINE__,
+ sizeof(struct skd_request_context),
+ skdev->num_req_context,
+ sizeof(struct skd_request_context) * skdev->num_req_context);
+
+ skdev->skreq_table = kzalloc(sizeof(struct skd_request_context)
+ * skdev->num_req_context, GFP_KERNEL);
+ if (skdev->skreq_table == NULL) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+
+ pr_debug("%s:%s:%d alloc sg_table sg_per_req %u scatlist %lu total %lu\n",
+ skdev->name, __func__, __LINE__,
+ skdev->sgs_per_request, sizeof(struct scatterlist),
+ skdev->sgs_per_request * sizeof(struct scatterlist));
+
+ for (i = 0; i < skdev->num_req_context; i++) {
+ struct skd_request_context *skreq;
+
+ skreq = &skdev->skreq_table[i];
+
+ skreq->id = i + SKD_ID_RW_REQUEST;
+ skreq->state = SKD_REQ_STATE_IDLE;
+
+ skreq->sg = kzalloc(sizeof(struct scatterlist) *
+ skdev->sgs_per_request, GFP_KERNEL);
+ if (skreq->sg == NULL) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+ sg_init_table(skreq->sg, skdev->sgs_per_request);
+
+ skreq->sksg_list = skd_cons_sg_list(skdev,
+ skdev->sgs_per_request,
+ &skreq->sksg_dma_address);
+
+ if (skreq->sksg_list == NULL) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+
+ skreq->next = &skreq[1];
+ }
+
+ /* Free list is in order starting with the 0th entry. */
+ skdev->skreq_table[i - 1].next = NULL;
+ skdev->skreq_free_list = skdev->skreq_table;
+
+err_out:
+ return rc;
+}
+
+static int skd_cons_skspcl(struct skd_device *skdev)
+{
+ int rc = 0;
+ u32 i, nbytes;
+
+ pr_debug("%s:%s:%d skspcl_table kzalloc, struct %lu, count %u total %lu\n",
+ skdev->name, __func__, __LINE__,
+ sizeof(struct skd_special_context),
+ skdev->n_special,
+ sizeof(struct skd_special_context) * skdev->n_special);
+
+ skdev->skspcl_table = kzalloc(sizeof(struct skd_special_context)
+ * skdev->n_special, GFP_KERNEL);
+ if (skdev->skspcl_table == NULL) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+
+ for (i = 0; i < skdev->n_special; i++) {
+ struct skd_special_context *skspcl;
+
+ skspcl = &skdev->skspcl_table[i];
+
+ skspcl->req.id = i + SKD_ID_SPECIAL_REQUEST;
+ skspcl->req.state = SKD_REQ_STATE_IDLE;
+
+ skspcl->req.next = &skspcl[1].req;
+
+ nbytes = SKD_N_SPECIAL_FITMSG_BYTES;
+
+ skspcl->msg_buf =
+ pci_zalloc_consistent(skdev->pdev, nbytes,
+ &skspcl->mb_dma_address);
+ if (skspcl->msg_buf == NULL) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+
+ skspcl->req.sg = kzalloc(sizeof(struct scatterlist) *
+ SKD_N_SG_PER_SPECIAL, GFP_KERNEL);
+ if (skspcl->req.sg == NULL) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+
+ skspcl->req.sksg_list = skd_cons_sg_list(skdev,
+ SKD_N_SG_PER_SPECIAL,
+ &skspcl->req.
+ sksg_dma_address);
+ if (skspcl->req.sksg_list == NULL) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+ }
+
+ /* Free list is in order starting with the 0th entry. */
+ skdev->skspcl_table[i - 1].req.next = NULL;
+ skdev->skspcl_free_list = skdev->skspcl_table;
+
+ return rc;
+
+err_out:
+ return rc;
+}
+
+static int skd_cons_sksb(struct skd_device *skdev)
+{
+ int rc = 0;
+ struct skd_special_context *skspcl;
+ u32 nbytes;
+
+ skspcl = &skdev->internal_skspcl;
+
+ skspcl->req.id = 0 + SKD_ID_INTERNAL;
+ skspcl->req.state = SKD_REQ_STATE_IDLE;
+
+ nbytes = SKD_N_INTERNAL_BYTES;
+
+ skspcl->data_buf = pci_zalloc_consistent(skdev->pdev, nbytes,
+ &skspcl->db_dma_address);
+ if (skspcl->data_buf == NULL) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+
+ nbytes = SKD_N_SPECIAL_FITMSG_BYTES;
+ skspcl->msg_buf = pci_zalloc_consistent(skdev->pdev, nbytes,
+ &skspcl->mb_dma_address);
+ if (skspcl->msg_buf == NULL) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+
+ skspcl->req.sksg_list = skd_cons_sg_list(skdev, 1,
+ &skspcl->req.sksg_dma_address);
+ if (skspcl->req.sksg_list == NULL) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+
+ if (!skd_format_internal_skspcl(skdev)) {
+ rc = -EINVAL;
+ goto err_out;
+ }
+
+err_out:
+ return rc;
+}
+
+static int skd_cons_disk(struct skd_device *skdev)
+{
+ int rc = 0;
+ struct gendisk *disk;
+ struct request_queue *q;
+ unsigned long flags;
+
+ disk = alloc_disk(SKD_MINORS_PER_DEVICE);
+ if (!disk) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+
+ skdev->disk = disk;
+ sprintf(disk->disk_name, DRV_NAME "%u", skdev->devno);
+
+ disk->major = skdev->major;
+ disk->first_minor = skdev->devno * SKD_MINORS_PER_DEVICE;
+ disk->fops = &skd_blockdev_ops;
+ disk->private_data = skdev;
+
+ q = blk_init_queue(skd_request_fn, &skdev->lock);
+ if (!q) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+
+ skdev->queue = q;
+ disk->queue = q;
+ q->queuedata = skdev;
+
+ blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
+ blk_queue_max_segments(q, skdev->sgs_per_request);
+ blk_queue_max_hw_sectors(q, SKD_N_MAX_SECTORS);
+
+ /* set sysfs ptimal_io_size to 8K */
+ blk_queue_io_opt(q, 8192);
+
+ /* DISCARD Flag initialization. */
+ q->limits.discard_granularity = 8192;
+ q->limits.discard_alignment = 0;
+ q->limits.max_discard_sectors = UINT_MAX >> 9;
+ q->limits.discard_zeroes_data = 1;
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
+ queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q);
+
+ spin_lock_irqsave(&skdev->lock, flags);
+ pr_debug("%s:%s:%d stopping %s queue\n",
+ skdev->name, __func__, __LINE__, skdev->name);
+ blk_stop_queue(skdev->queue);
+ spin_unlock_irqrestore(&skdev->lock, flags);
+
+err_out:
+ return rc;
+}
+
+#define SKD_N_DEV_TABLE 16u
+static u32 skd_next_devno;
+
+static struct skd_device *skd_construct(struct pci_dev *pdev)
+{
+ struct skd_device *skdev;
+ int blk_major = skd_major;
+ int rc;
+
+ skdev = kzalloc(sizeof(*skdev), GFP_KERNEL);
+
+ if (!skdev) {
+ pr_err(PFX "(%s): memory alloc failure\n",
+ pci_name(pdev));
+ return NULL;
+ }
+
+ skdev->state = SKD_DRVR_STATE_LOAD;
+ skdev->pdev = pdev;
+ skdev->devno = skd_next_devno++;
+ skdev->major = blk_major;
+ skdev->irq_type = skd_isr_type;
+ sprintf(skdev->name, DRV_NAME "%d", skdev->devno);
+ skdev->dev_max_queue_depth = 0;
+
+ skdev->num_req_context = skd_max_queue_depth;
+ skdev->num_fitmsg_context = skd_max_queue_depth;
+ skdev->n_special = skd_max_pass_thru;
+ skdev->cur_max_queue_depth = 1;
+ skdev->queue_low_water_mark = 1;
+ skdev->proto_ver = 99;
+ skdev->sgs_per_request = skd_sgs_per_request;
+ skdev->dbg_level = skd_dbg_level;
+
+ atomic_set(&skdev->device_count, 0);
+
+ spin_lock_init(&skdev->lock);
+
+ INIT_WORK(&skdev->completion_worker, skd_completion_worker);
+
+ pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__);
+ rc = skd_cons_skcomp(skdev);
+ if (rc < 0)
+ goto err_out;
+
+ pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__);
+ rc = skd_cons_skmsg(skdev);
+ if (rc < 0)
+ goto err_out;
+
+ pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__);
+ rc = skd_cons_skreq(skdev);
+ if (rc < 0)
+ goto err_out;
+
+ pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__);
+ rc = skd_cons_skspcl(skdev);
+ if (rc < 0)
+ goto err_out;
+
+ pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__);
+ rc = skd_cons_sksb(skdev);
+ if (rc < 0)
+ goto err_out;
+
+ pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__);
+ rc = skd_cons_disk(skdev);
+ if (rc < 0)
+ goto err_out;
+
+ pr_debug("%s:%s:%d VICTORY\n", skdev->name, __func__, __LINE__);
+ return skdev;
+
+err_out:
+ pr_debug("%s:%s:%d construct failed\n",
+ skdev->name, __func__, __LINE__);
+ skd_destruct(skdev);
+ return NULL;
+}
+
+/*
+ *****************************************************************************
+ * DESTRUCT (FREE)
+ *****************************************************************************
+ */
+
+static void skd_free_skcomp(struct skd_device *skdev)
+{
+ if (skdev->skcomp_table != NULL) {
+ u32 nbytes;
+
+ nbytes = sizeof(skdev->skcomp_table[0]) *
+ SKD_N_COMPLETION_ENTRY;
+ pci_free_consistent(skdev->pdev, nbytes,
+ skdev->skcomp_table, skdev->cq_dma_address);
+ }
+
+ skdev->skcomp_table = NULL;
+ skdev->cq_dma_address = 0;
+}
+
+static void skd_free_skmsg(struct skd_device *skdev)
+{
+ u32 i;
+
+ if (skdev->skmsg_table == NULL)
+ return;
+
+ for (i = 0; i < skdev->num_fitmsg_context; i++) {
+ struct skd_fitmsg_context *skmsg;
+
+ skmsg = &skdev->skmsg_table[i];
+
+ if (skmsg->msg_buf != NULL) {
+ skmsg->msg_buf += skmsg->offset;
+ skmsg->mb_dma_address += skmsg->offset;
+ pci_free_consistent(skdev->pdev, SKD_N_FITMSG_BYTES,
+ skmsg->msg_buf,
+ skmsg->mb_dma_address);
+ }
+ skmsg->msg_buf = NULL;
+ skmsg->mb_dma_address = 0;
+ }
+
+ kfree(skdev->skmsg_table);
+ skdev->skmsg_table = NULL;
+}
+
+static void skd_free_sg_list(struct skd_device *skdev,
+ struct fit_sg_descriptor *sg_list,
+ u32 n_sg, dma_addr_t dma_addr)
+{
+ if (sg_list != NULL) {
+ u32 nbytes;
+
+ nbytes = sizeof(*sg_list) * n_sg;
+
+ pci_free_consistent(skdev->pdev, nbytes, sg_list, dma_addr);
+ }
+}
+
+static void skd_free_skreq(struct skd_device *skdev)
+{
+ u32 i;
+
+ if (skdev->skreq_table == NULL)
+ return;
+
+ for (i = 0; i < skdev->num_req_context; i++) {
+ struct skd_request_context *skreq;
+
+ skreq = &skdev->skreq_table[i];
+
+ skd_free_sg_list(skdev, skreq->sksg_list,
+ skdev->sgs_per_request,
+ skreq->sksg_dma_address);
+
+ skreq->sksg_list = NULL;
+ skreq->sksg_dma_address = 0;
+
+ kfree(skreq->sg);
+ }
+
+ kfree(skdev->skreq_table);
+ skdev->skreq_table = NULL;
+}
+
+static void skd_free_skspcl(struct skd_device *skdev)
+{
+ u32 i;
+ u32 nbytes;
+
+ if (skdev->skspcl_table == NULL)
+ return;
+
+ for (i = 0; i < skdev->n_special; i++) {
+ struct skd_special_context *skspcl;
+
+ skspcl = &skdev->skspcl_table[i];
+
+ if (skspcl->msg_buf != NULL) {
+ nbytes = SKD_N_SPECIAL_FITMSG_BYTES;
+ pci_free_consistent(skdev->pdev, nbytes,
+ skspcl->msg_buf,
+ skspcl->mb_dma_address);
+ }
+
+ skspcl->msg_buf = NULL;
+ skspcl->mb_dma_address = 0;
+
+ skd_free_sg_list(skdev, skspcl->req.sksg_list,
+ SKD_N_SG_PER_SPECIAL,
+ skspcl->req.sksg_dma_address);
+
+ skspcl->req.sksg_list = NULL;
+ skspcl->req.sksg_dma_address = 0;
+
+ kfree(skspcl->req.sg);
+ }
+
+ kfree(skdev->skspcl_table);
+ skdev->skspcl_table = NULL;
+}
+
+static void skd_free_sksb(struct skd_device *skdev)
+{
+ struct skd_special_context *skspcl;
+ u32 nbytes;
+
+ skspcl = &skdev->internal_skspcl;
+
+ if (skspcl->data_buf != NULL) {
+ nbytes = SKD_N_INTERNAL_BYTES;
+
+ pci_free_consistent(skdev->pdev, nbytes,
+ skspcl->data_buf, skspcl->db_dma_address);
+ }
+
+ skspcl->data_buf = NULL;
+ skspcl->db_dma_address = 0;
+
+ if (skspcl->msg_buf != NULL) {
+ nbytes = SKD_N_SPECIAL_FITMSG_BYTES;
+ pci_free_consistent(skdev->pdev, nbytes,
+ skspcl->msg_buf, skspcl->mb_dma_address);
+ }
+
+ skspcl->msg_buf = NULL;
+ skspcl->mb_dma_address = 0;
+
+ skd_free_sg_list(skdev, skspcl->req.sksg_list, 1,
+ skspcl->req.sksg_dma_address);
+
+ skspcl->req.sksg_list = NULL;
+ skspcl->req.sksg_dma_address = 0;
+}
+
+static void skd_free_disk(struct skd_device *skdev)
+{
+ struct gendisk *disk = skdev->disk;
+
+ if (disk != NULL) {
+ struct request_queue *q = disk->queue;
+
+ if (disk->flags & GENHD_FL_UP)
+ del_gendisk(disk);
+ if (q)
+ blk_cleanup_queue(q);
+ put_disk(disk);
+ }
+ skdev->disk = NULL;
+}
+
+static void skd_destruct(struct skd_device *skdev)
+{
+ if (skdev == NULL)
+ return;
+
+
+ pr_debug("%s:%s:%d disk\n", skdev->name, __func__, __LINE__);
+ skd_free_disk(skdev);
+
+ pr_debug("%s:%s:%d sksb\n", skdev->name, __func__, __LINE__);
+ skd_free_sksb(skdev);
+
+ pr_debug("%s:%s:%d skspcl\n", skdev->name, __func__, __LINE__);
+ skd_free_skspcl(skdev);
+
+ pr_debug("%s:%s:%d skreq\n", skdev->name, __func__, __LINE__);
+ skd_free_skreq(skdev);
+
+ pr_debug("%s:%s:%d skmsg\n", skdev->name, __func__, __LINE__);
+ skd_free_skmsg(skdev);
+
+ pr_debug("%s:%s:%d skcomp\n", skdev->name, __func__, __LINE__);
+ skd_free_skcomp(skdev);
+
+ pr_debug("%s:%s:%d skdev\n", skdev->name, __func__, __LINE__);
+ kfree(skdev);
+}
+
+/*
+ *****************************************************************************
+ * BLOCK DEVICE (BDEV) GLUE
+ *****************************************************************************
+ */
+
+static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ struct skd_device *skdev;
+ u64 capacity;
+
+ skdev = bdev->bd_disk->private_data;
+
+ pr_debug("%s:%s:%d %s: CMD[%s] getgeo device\n",
+ skdev->name, __func__, __LINE__,
+ bdev->bd_disk->disk_name, current->comm);
+
+ if (skdev->read_cap_is_valid) {
+ capacity = get_capacity(skdev->disk);
+ geo->heads = 64;
+ geo->sectors = 255;
+ geo->cylinders = (capacity) / (255 * 64);
+
+ return 0;
+ }
+ return -EIO;
+}
+
+static int skd_bdev_attach(struct skd_device *skdev)
+{
+ pr_debug("%s:%s:%d add_disk\n", skdev->name, __func__, __LINE__);
+ add_disk(skdev->disk);
+ return 0;
+}
+
+static const struct block_device_operations skd_blockdev_ops = {
+ .owner = THIS_MODULE,
+ .ioctl = skd_bdev_ioctl,
+ .getgeo = skd_bdev_getgeo,
+};
+
+
+/*
+ *****************************************************************************
+ * PCIe DRIVER GLUE
+ *****************************************************************************
+ */
+
+static const struct pci_device_id skd_pci_tbl[] = {
+ { PCI_VENDOR_ID_STEC, PCI_DEVICE_ID_S1120,
+ PCI_ANY_ID, PCI_ANY_ID, 0, 0, },
+ { 0 } /* terminate list */
+};
+
+MODULE_DEVICE_TABLE(pci, skd_pci_tbl);
+
+static char *skd_pci_info(struct skd_device *skdev, char *str)
+{
+ int pcie_reg;
+
+ strcpy(str, "PCIe (");
+ pcie_reg = pci_find_capability(skdev->pdev, PCI_CAP_ID_EXP);
+
+ if (pcie_reg) {
+
+ char lwstr[6];
+ uint16_t pcie_lstat, lspeed, lwidth;
+
+ pcie_reg += 0x12;
+ pci_read_config_word(skdev->pdev, pcie_reg, &pcie_lstat);
+ lspeed = pcie_lstat & (0xF);
+ lwidth = (pcie_lstat & 0x3F0) >> 4;
+
+ if (lspeed == 1)
+ strcat(str, "2.5GT/s ");
+ else if (lspeed == 2)
+ strcat(str, "5.0GT/s ");
+ else
+ strcat(str, "<unknown> ");
+ snprintf(lwstr, sizeof(lwstr), "%dX)", lwidth);
+ strcat(str, lwstr);
+ }
+ return str;
+}
+
+static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+ int i;
+ int rc = 0;
+ char pci_str[32];
+ struct skd_device *skdev;
+
+ pr_info("STEC s1120 Driver(%s) version %s-b%s\n",
+ DRV_NAME, DRV_VERSION, DRV_BUILD_ID);
+ pr_info("(skd?:??:[%s]): vendor=%04X device=%04x\n",
+ pci_name(pdev), pdev->vendor, pdev->device);
+
+ rc = pci_enable_device(pdev);
+ if (rc)
+ return rc;
+ rc = pci_request_regions(pdev, DRV_NAME);
+ if (rc)
+ goto err_out;
+ rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (!rc) {
+ if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) {
+
+ pr_err("(%s): consistent DMA mask error %d\n",
+ pci_name(pdev), rc);
+ }
+ } else {
+ (rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)));
+ if (rc) {
+
+ pr_err("(%s): DMA mask error %d\n",
+ pci_name(pdev), rc);
+ goto err_out_regions;
+ }
+ }
+
+ if (!skd_major) {
+ rc = register_blkdev(0, DRV_NAME);
+ if (rc < 0)
+ goto err_out_regions;
+ BUG_ON(!rc);
+ skd_major = rc;
+ }
+
+ skdev = skd_construct(pdev);
+ if (skdev == NULL) {
+ rc = -ENOMEM;
+ goto err_out_regions;
+ }
+
+ skd_pci_info(skdev, pci_str);
+ pr_info("(%s): %s 64bit\n", skd_name(skdev), pci_str);
+
+ pci_set_master(pdev);
+ rc = pci_enable_pcie_error_reporting(pdev);
+ if (rc) {
+ pr_err(
+ "(%s): bad enable of PCIe error reporting rc=%d\n",
+ skd_name(skdev), rc);
+ skdev->pcie_error_reporting_is_enabled = 0;
+ } else
+ skdev->pcie_error_reporting_is_enabled = 1;
+
+
+ pci_set_drvdata(pdev, skdev);
+
+ skdev->disk->driverfs_dev = &pdev->dev;
+
+ for (i = 0; i < SKD_MAX_BARS; i++) {
+ skdev->mem_phys[i] = pci_resource_start(pdev, i);
+ skdev->mem_size[i] = (u32)pci_resource_len(pdev, i);
+ skdev->mem_map[i] = ioremap(skdev->mem_phys[i],
+ skdev->mem_size[i]);
+ if (!skdev->mem_map[i]) {
+ pr_err("(%s): Unable to map adapter memory!\n",
+ skd_name(skdev));
+ rc = -ENODEV;
+ goto err_out_iounmap;
+ }
+ pr_debug("%s:%s:%d mem_map=%p, phyd=%016llx, size=%d\n",
+ skdev->name, __func__, __LINE__,
+ skdev->mem_map[i],
+ (uint64_t)skdev->mem_phys[i], skdev->mem_size[i]);
+ }
+
+ rc = skd_acquire_irq(skdev);
+ if (rc) {
+ pr_err("(%s): interrupt resource error %d\n",
+ skd_name(skdev), rc);
+ goto err_out_iounmap;
+ }
+
+ rc = skd_start_timer(skdev);
+ if (rc)
+ goto err_out_timer;
+
+ init_waitqueue_head(&skdev->waitq);
+
+ skd_start_device(skdev);
+
+ rc = wait_event_interruptible_timeout(skdev->waitq,
+ (skdev->gendisk_on),
+ (SKD_START_WAIT_SECONDS * HZ));
+ if (skdev->gendisk_on > 0) {
+ /* device came on-line after reset */
+ skd_bdev_attach(skdev);
+ rc = 0;
+ } else {
+ /* we timed out, something is wrong with the device,
+ don't add the disk structure */
+ pr_err(
+ "(%s): error: waiting for s1120 timed out %d!\n",
+ skd_name(skdev), rc);
+ /* in case of no error; we timeout with ENXIO */
+ if (!rc)
+ rc = -ENXIO;
+ goto err_out_timer;
+ }
+
+
+#ifdef SKD_VMK_POLL_HANDLER
+ if (skdev->irq_type == SKD_IRQ_MSIX) {
+ /* MSIX completion handler is being used for coredump */
+ vmklnx_scsi_register_poll_handler(skdev->scsi_host,
+ skdev->msix_entries[5].vector,
+ skd_comp_q, skdev);
+ } else {
+ vmklnx_scsi_register_poll_handler(skdev->scsi_host,
+ skdev->pdev->irq, skd_isr,
+ skdev);
+ }
+#endif /* SKD_VMK_POLL_HANDLER */
+
+ return rc;
+
+err_out_timer:
+ skd_stop_device(skdev);
+ skd_release_irq(skdev);
+
+err_out_iounmap:
+ for (i = 0; i < SKD_MAX_BARS; i++)
+ if (skdev->mem_map[i])
+ iounmap(skdev->mem_map[i]);
+
+ if (skdev->pcie_error_reporting_is_enabled)
+ pci_disable_pcie_error_reporting(pdev);
+
+ skd_destruct(skdev);
+
+err_out_regions:
+ pci_release_regions(pdev);
+
+err_out:
+ pci_disable_device(pdev);
+ pci_set_drvdata(pdev, NULL);
+ return rc;
+}
+
+static void skd_pci_remove(struct pci_dev *pdev)
+{
+ int i;
+ struct skd_device *skdev;
+
+ skdev = pci_get_drvdata(pdev);
+ if (!skdev) {
+ pr_err("%s: no device data for PCI\n", pci_name(pdev));
+ return;
+ }
+ skd_stop_device(skdev);
+ skd_release_irq(skdev);
+
+ for (i = 0; i < SKD_MAX_BARS; i++)
+ if (skdev->mem_map[i])
+ iounmap((u32 *)skdev->mem_map[i]);
+
+ if (skdev->pcie_error_reporting_is_enabled)
+ pci_disable_pcie_error_reporting(pdev);
+
+ skd_destruct(skdev);
+
+ pci_release_regions(pdev);
+ pci_disable_device(pdev);
+ pci_set_drvdata(pdev, NULL);
+
+ return;
+}
+
+static int skd_pci_suspend(struct pci_dev *pdev, pm_message_t state)
+{
+ int i;
+ struct skd_device *skdev;
+
+ skdev = pci_get_drvdata(pdev);
+ if (!skdev) {
+ pr_err("%s: no device data for PCI\n", pci_name(pdev));
+ return -EIO;
+ }
+
+ skd_stop_device(skdev);
+
+ skd_release_irq(skdev);
+
+ for (i = 0; i < SKD_MAX_BARS; i++)
+ if (skdev->mem_map[i])
+ iounmap((u32 *)skdev->mem_map[i]);
+
+ if (skdev->pcie_error_reporting_is_enabled)
+ pci_disable_pcie_error_reporting(pdev);
+
+ pci_release_regions(pdev);
+ pci_save_state(pdev);
+ pci_disable_device(pdev);
+ pci_set_power_state(pdev, pci_choose_state(pdev, state));
+ return 0;
+}
+
+static int skd_pci_resume(struct pci_dev *pdev)
+{
+ int i;
+ int rc = 0;
+ struct skd_device *skdev;
+
+ skdev = pci_get_drvdata(pdev);
+ if (!skdev) {
+ pr_err("%s: no device data for PCI\n", pci_name(pdev));
+ return -1;
+ }
+
+ pci_set_power_state(pdev, PCI_D0);
+ pci_enable_wake(pdev, PCI_D0, 0);
+ pci_restore_state(pdev);
+
+ rc = pci_enable_device(pdev);
+ if (rc)
+ return rc;
+ rc = pci_request_regions(pdev, DRV_NAME);
+ if (rc)
+ goto err_out;
+ rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (!rc) {
+ if (pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) {
+
+ pr_err("(%s): consistent DMA mask error %d\n",
+ pci_name(pdev), rc);
+ }
+ } else {
+ rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+ if (rc) {
+
+ pr_err("(%s): DMA mask error %d\n",
+ pci_name(pdev), rc);
+ goto err_out_regions;
+ }
+ }
+
+ pci_set_master(pdev);
+ rc = pci_enable_pcie_error_reporting(pdev);
+ if (rc) {
+ pr_err("(%s): bad enable of PCIe error reporting rc=%d\n",
+ skdev->name, rc);
+ skdev->pcie_error_reporting_is_enabled = 0;
+ } else
+ skdev->pcie_error_reporting_is_enabled = 1;
+
+ for (i = 0; i < SKD_MAX_BARS; i++) {
+
+ skdev->mem_phys[i] = pci_resource_start(pdev, i);
+ skdev->mem_size[i] = (u32)pci_resource_len(pdev, i);
+ skdev->mem_map[i] = ioremap(skdev->mem_phys[i],
+ skdev->mem_size[i]);
+ if (!skdev->mem_map[i]) {
+ pr_err("(%s): Unable to map adapter memory!\n",
+ skd_name(skdev));
+ rc = -ENODEV;
+ goto err_out_iounmap;
+ }
+ pr_debug("%s:%s:%d mem_map=%p, phyd=%016llx, size=%d\n",
+ skdev->name, __func__, __LINE__,
+ skdev->mem_map[i],
+ (uint64_t)skdev->mem_phys[i], skdev->mem_size[i]);
+ }
+ rc = skd_acquire_irq(skdev);
+ if (rc) {
+
+ pr_err("(%s): interrupt resource error %d\n",
+ pci_name(pdev), rc);
+ goto err_out_iounmap;
+ }
+
+ rc = skd_start_timer(skdev);
+ if (rc)
+ goto err_out_timer;
+
+ init_waitqueue_head(&skdev->waitq);
+
+ skd_start_device(skdev);
+
+ return rc;
+
+err_out_timer:
+ skd_stop_device(skdev);
+ skd_release_irq(skdev);
+
+err_out_iounmap:
+ for (i = 0; i < SKD_MAX_BARS; i++)
+ if (skdev->mem_map[i])
+ iounmap(skdev->mem_map[i]);
+
+ if (skdev->pcie_error_reporting_is_enabled)
+ pci_disable_pcie_error_reporting(pdev);
+
+err_out_regions:
+ pci_release_regions(pdev);
+
+err_out:
+ pci_disable_device(pdev);
+ return rc;
+}
+
+static void skd_pci_shutdown(struct pci_dev *pdev)
+{
+ struct skd_device *skdev;
+
+ pr_err("skd_pci_shutdown called\n");
+
+ skdev = pci_get_drvdata(pdev);
+ if (!skdev) {
+ pr_err("%s: no device data for PCI\n", pci_name(pdev));
+ return;
+ }
+
+ pr_err("%s: calling stop\n", skd_name(skdev));
+ skd_stop_device(skdev);
+}
+
+static struct pci_driver skd_driver = {
+ .name = DRV_NAME,
+ .id_table = skd_pci_tbl,
+ .probe = skd_pci_probe,
+ .remove = skd_pci_remove,
+ .suspend = skd_pci_suspend,
+ .resume = skd_pci_resume,
+ .shutdown = skd_pci_shutdown,
+};
+
+/*
+ *****************************************************************************
+ * LOGGING SUPPORT
+ *****************************************************************************
+ */
+
+static const char *skd_name(struct skd_device *skdev)
+{
+ memset(skdev->id_str, 0, sizeof(skdev->id_str));
+
+ if (skdev->inquiry_is_valid)
+ snprintf(skdev->id_str, sizeof(skdev->id_str), "%s:%s:[%s]",
+ skdev->name, skdev->inq_serial_num,
+ pci_name(skdev->pdev));
+ else
+ snprintf(skdev->id_str, sizeof(skdev->id_str), "%s:??:[%s]",
+ skdev->name, pci_name(skdev->pdev));
+
+ return skdev->id_str;
+}
+
+const char *skd_drive_state_to_str(int state)
+{
+ switch (state) {
+ case FIT_SR_DRIVE_OFFLINE:
+ return "OFFLINE";
+ case FIT_SR_DRIVE_INIT:
+ return "INIT";
+ case FIT_SR_DRIVE_ONLINE:
+ return "ONLINE";
+ case FIT_SR_DRIVE_BUSY:
+ return "BUSY";
+ case FIT_SR_DRIVE_FAULT:
+ return "FAULT";
+ case FIT_SR_DRIVE_DEGRADED:
+ return "DEGRADED";
+ case FIT_SR_PCIE_LINK_DOWN:
+ return "INK_DOWN";
+ case FIT_SR_DRIVE_SOFT_RESET:
+ return "SOFT_RESET";
+ case FIT_SR_DRIVE_NEED_FW_DOWNLOAD:
+ return "NEED_FW";
+ case FIT_SR_DRIVE_INIT_FAULT:
+ return "INIT_FAULT";
+ case FIT_SR_DRIVE_BUSY_SANITIZE:
+ return "BUSY_SANITIZE";
+ case FIT_SR_DRIVE_BUSY_ERASE:
+ return "BUSY_ERASE";
+ case FIT_SR_DRIVE_FW_BOOTING:
+ return "FW_BOOTING";
+ default:
+ return "???";
+ }
+}
+
+const char *skd_skdev_state_to_str(enum skd_drvr_state state)
+{
+ switch (state) {
+ case SKD_DRVR_STATE_LOAD:
+ return "LOAD";
+ case SKD_DRVR_STATE_IDLE:
+ return "IDLE";
+ case SKD_DRVR_STATE_BUSY:
+ return "BUSY";
+ case SKD_DRVR_STATE_STARTING:
+ return "STARTING";
+ case SKD_DRVR_STATE_ONLINE:
+ return "ONLINE";
+ case SKD_DRVR_STATE_PAUSING:
+ return "PAUSING";
+ case SKD_DRVR_STATE_PAUSED:
+ return "PAUSED";
+ case SKD_DRVR_STATE_DRAINING_TIMEOUT:
+ return "DRAINING_TIMEOUT";
+ case SKD_DRVR_STATE_RESTARTING:
+ return "RESTARTING";
+ case SKD_DRVR_STATE_RESUMING:
+ return "RESUMING";
+ case SKD_DRVR_STATE_STOPPING:
+ return "STOPPING";
+ case SKD_DRVR_STATE_SYNCING:
+ return "SYNCING";
+ case SKD_DRVR_STATE_FAULT:
+ return "FAULT";
+ case SKD_DRVR_STATE_DISAPPEARED:
+ return "DISAPPEARED";
+ case SKD_DRVR_STATE_BUSY_ERASE:
+ return "BUSY_ERASE";
+ case SKD_DRVR_STATE_BUSY_SANITIZE:
+ return "BUSY_SANITIZE";
+ case SKD_DRVR_STATE_BUSY_IMMINENT:
+ return "BUSY_IMMINENT";
+ case SKD_DRVR_STATE_WAIT_BOOT:
+ return "WAIT_BOOT";
+
+ default:
+ return "???";
+ }
+}
+
+static const char *skd_skmsg_state_to_str(enum skd_fit_msg_state state)
+{
+ switch (state) {
+ case SKD_MSG_STATE_IDLE:
+ return "IDLE";
+ case SKD_MSG_STATE_BUSY:
+ return "BUSY";
+ default:
+ return "???";
+ }
+}
+
+static const char *skd_skreq_state_to_str(enum skd_req_state state)
+{
+ switch (state) {
+ case SKD_REQ_STATE_IDLE:
+ return "IDLE";
+ case SKD_REQ_STATE_SETUP:
+ return "SETUP";
+ case SKD_REQ_STATE_BUSY:
+ return "BUSY";
+ case SKD_REQ_STATE_COMPLETED:
+ return "COMPLETED";
+ case SKD_REQ_STATE_TIMEOUT:
+ return "TIMEOUT";
+ case SKD_REQ_STATE_ABORTED:
+ return "ABORTED";
+ default:
+ return "???";
+ }
+}
+
+static void skd_log_skdev(struct skd_device *skdev, const char *event)
+{
+ pr_debug("%s:%s:%d (%s) skdev=%p event='%s'\n",
+ skdev->name, __func__, __LINE__, skdev->name, skdev, event);
+ pr_debug("%s:%s:%d drive_state=%s(%d) driver_state=%s(%d)\n",
+ skdev->name, __func__, __LINE__,
+ skd_drive_state_to_str(skdev->drive_state), skdev->drive_state,
+ skd_skdev_state_to_str(skdev->state), skdev->state);
+ pr_debug("%s:%s:%d busy=%d limit=%d dev=%d lowat=%d\n",
+ skdev->name, __func__, __LINE__,
+ skdev->in_flight, skdev->cur_max_queue_depth,
+ skdev->dev_max_queue_depth, skdev->queue_low_water_mark);
+ pr_debug("%s:%s:%d timestamp=0x%x cycle=%d cycle_ix=%d\n",
+ skdev->name, __func__, __LINE__,
+ skdev->timeout_stamp, skdev->skcomp_cycle, skdev->skcomp_ix);
+}
+
+static void skd_log_skmsg(struct skd_device *skdev,
+ struct skd_fitmsg_context *skmsg, const char *event)
+{
+ pr_debug("%s:%s:%d (%s) skmsg=%p event='%s'\n",
+ skdev->name, __func__, __LINE__, skdev->name, skmsg, event);
+ pr_debug("%s:%s:%d state=%s(%d) id=0x%04x length=%d\n",
+ skdev->name, __func__, __LINE__,
+ skd_skmsg_state_to_str(skmsg->state), skmsg->state,
+ skmsg->id, skmsg->length);
+}
+
+static void skd_log_skreq(struct skd_device *skdev,
+ struct skd_request_context *skreq, const char *event)
+{
+ pr_debug("%s:%s:%d (%s) skreq=%p event='%s'\n",
+ skdev->name, __func__, __LINE__, skdev->name, skreq, event);
+ pr_debug("%s:%s:%d state=%s(%d) id=0x%04x fitmsg=0x%04x\n",
+ skdev->name, __func__, __LINE__,
+ skd_skreq_state_to_str(skreq->state), skreq->state,
+ skreq->id, skreq->fitmsg_id);
+ pr_debug("%s:%s:%d timo=0x%x sg_dir=%d n_sg=%d\n",
+ skdev->name, __func__, __LINE__,
+ skreq->timeout_stamp, skreq->sg_data_dir, skreq->n_sg);
+
+ if (skreq->req != NULL) {
+ struct request *req = skreq->req;
+ u32 lba = (u32)blk_rq_pos(req);
+ u32 count = blk_rq_sectors(req);
+
+ pr_debug("%s:%s:%d "
+ "req=%p lba=%u(0x%x) count=%u(0x%x) dir=%d\n",
+ skdev->name, __func__, __LINE__,
+ req, lba, lba, count, count,
+ (int)rq_data_dir(req));
+ } else
+ pr_debug("%s:%s:%d req=NULL\n",
+ skdev->name, __func__, __LINE__);
+}
+
+/*
+ *****************************************************************************
+ * MODULE GLUE
+ *****************************************************************************
+ */
+
+static int __init skd_init(void)
+{
+ pr_info(PFX " v%s-b%s loaded\n", DRV_VERSION, DRV_BUILD_ID);
+
+ switch (skd_isr_type) {
+ case SKD_IRQ_LEGACY:
+ case SKD_IRQ_MSI:
+ case SKD_IRQ_MSIX:
+ break;
+ default:
+ pr_err(PFX "skd_isr_type %d invalid, re-set to %d\n",
+ skd_isr_type, SKD_IRQ_DEFAULT);
+ skd_isr_type = SKD_IRQ_DEFAULT;
+ }
+
+ if (skd_max_queue_depth < 1 ||
+ skd_max_queue_depth > SKD_MAX_QUEUE_DEPTH) {
+ pr_err(PFX "skd_max_queue_depth %d invalid, re-set to %d\n",
+ skd_max_queue_depth, SKD_MAX_QUEUE_DEPTH_DEFAULT);
+ skd_max_queue_depth = SKD_MAX_QUEUE_DEPTH_DEFAULT;
+ }
+
+ if (skd_max_req_per_msg < 1 || skd_max_req_per_msg > 14) {
+ pr_err(PFX "skd_max_req_per_msg %d invalid, re-set to %d\n",
+ skd_max_req_per_msg, SKD_MAX_REQ_PER_MSG_DEFAULT);
+ skd_max_req_per_msg = SKD_MAX_REQ_PER_MSG_DEFAULT;
+ }
+
+ if (skd_sgs_per_request < 1 || skd_sgs_per_request > 4096) {
+ pr_err(PFX "skd_sg_per_request %d invalid, re-set to %d\n",
+ skd_sgs_per_request, SKD_N_SG_PER_REQ_DEFAULT);
+ skd_sgs_per_request = SKD_N_SG_PER_REQ_DEFAULT;
+ }
+
+ if (skd_dbg_level < 0 || skd_dbg_level > 2) {
+ pr_err(PFX "skd_dbg_level %d invalid, re-set to %d\n",
+ skd_dbg_level, 0);
+ skd_dbg_level = 0;
+ }
+
+ if (skd_isr_comp_limit < 0) {
+ pr_err(PFX "skd_isr_comp_limit %d invalid, set to %d\n",
+ skd_isr_comp_limit, 0);
+ skd_isr_comp_limit = 0;
+ }
+
+ if (skd_max_pass_thru < 1 || skd_max_pass_thru > 50) {
+ pr_err(PFX "skd_max_pass_thru %d invalid, re-set to %d\n",
+ skd_max_pass_thru, SKD_N_SPECIAL_CONTEXT);
+ skd_max_pass_thru = SKD_N_SPECIAL_CONTEXT;
+ }
+
+ return pci_register_driver(&skd_driver);
+}
+
+static void __exit skd_exit(void)
+{
+ pr_info(PFX " v%s-b%s unloading\n", DRV_VERSION, DRV_BUILD_ID);
+
+ pci_unregister_driver(&skd_driver);
+
+ if (skd_major)
+ unregister_blkdev(skd_major, DRV_NAME);
+}
+
+module_init(skd_init);
+module_exit(skd_exit);
diff --git a/drivers/block/skd_s1120.h b/drivers/block/skd_s1120.h
new file mode 100644
index 000000000..61c757ff0
--- /dev/null
+++ b/drivers/block/skd_s1120.h
@@ -0,0 +1,330 @@
+/* Copyright 2012 STEC, Inc.
+ *
+ * This file is licensed under the terms of the 3-clause
+ * BSD License (http://opensource.org/licenses/BSD-3-Clause)
+ * or the GNU GPL-2.0 (http://www.gnu.org/licenses/gpl-2.0.html),
+ * at your option. Both licenses are also available in the LICENSE file
+ * distributed with this project. This file may not be copied, modified,
+ * or distributed except in accordance with those terms.
+ */
+
+
+#ifndef SKD_S1120_H
+#define SKD_S1120_H
+
+#pragma pack(push, s1120_h, 1)
+
+/*
+ * Q-channel, 64-bit r/w
+ */
+#define FIT_Q_COMMAND 0x400u
+#define FIT_QCMD_QID_MASK (0x3 << 1)
+#define FIT_QCMD_QID0 (0x0 << 1)
+#define FIT_QCMD_QID_NORMAL FIT_QCMD_QID0
+#define FIT_QCMD_QID1 (0x1 << 1)
+#define FIT_QCMD_QID2 (0x2 << 1)
+#define FIT_QCMD_QID3 (0x3 << 1)
+#define FIT_QCMD_FLUSH_QUEUE (0ull) /* add QID */
+#define FIT_QCMD_MSGSIZE_MASK (0x3 << 4)
+#define FIT_QCMD_MSGSIZE_64 (0x0 << 4)
+#define FIT_QCMD_MSGSIZE_128 (0x1 << 4)
+#define FIT_QCMD_MSGSIZE_256 (0x2 << 4)
+#define FIT_QCMD_MSGSIZE_512 (0x3 << 4)
+#define FIT_QCMD_BASE_ADDRESS_MASK (0xFFFFFFFFFFFFFFC0ull)
+
+/*
+ * Control, 32-bit r/w
+ */
+#define FIT_CONTROL 0x500u
+#define FIT_CR_HARD_RESET (1u << 0u)
+#define FIT_CR_SOFT_RESET (1u << 1u)
+#define FIT_CR_DIS_TIMESTAMPS (1u << 6u)
+#define FIT_CR_ENABLE_INTERRUPTS (1u << 7u)
+
+/*
+ * Status, 32-bit, r/o
+ */
+#define FIT_STATUS 0x510u
+#define FIT_SR_DRIVE_STATE_MASK 0x000000FFu
+#define FIT_SR_SIGNATURE (0xFF << 8)
+#define FIT_SR_PIO_DMA (1 << 16)
+#define FIT_SR_DRIVE_OFFLINE 0x00
+#define FIT_SR_DRIVE_INIT 0x01
+/* #define FIT_SR_DRIVE_READY 0x02 */
+#define FIT_SR_DRIVE_ONLINE 0x03
+#define FIT_SR_DRIVE_BUSY 0x04
+#define FIT_SR_DRIVE_FAULT 0x05
+#define FIT_SR_DRIVE_DEGRADED 0x06
+#define FIT_SR_PCIE_LINK_DOWN 0x07
+#define FIT_SR_DRIVE_SOFT_RESET 0x08
+#define FIT_SR_DRIVE_INIT_FAULT 0x09
+#define FIT_SR_DRIVE_BUSY_SANITIZE 0x0A
+#define FIT_SR_DRIVE_BUSY_ERASE 0x0B
+#define FIT_SR_DRIVE_FW_BOOTING 0x0C
+#define FIT_SR_DRIVE_NEED_FW_DOWNLOAD 0xFE
+#define FIT_SR_DEVICE_MISSING 0xFF
+#define FIT_SR__RESERVED 0xFFFFFF00u
+
+/*
+ * FIT_STATUS - Status register data definition
+ */
+#define FIT_SR_STATE_MASK (0xFF << 0)
+#define FIT_SR_SIGNATURE (0xFF << 8)
+#define FIT_SR_PIO_DMA (1 << 16)
+
+/*
+ * Interrupt status, 32-bit r/w1c (w1c ==> write 1 to clear)
+ */
+#define FIT_INT_STATUS_HOST 0x520u
+#define FIT_ISH_FW_STATE_CHANGE (1u << 0u)
+#define FIT_ISH_COMPLETION_POSTED (1u << 1u)
+#define FIT_ISH_MSG_FROM_DEV (1u << 2u)
+#define FIT_ISH_UNDEFINED_3 (1u << 3u)
+#define FIT_ISH_UNDEFINED_4 (1u << 4u)
+#define FIT_ISH_Q0_FULL (1u << 5u)
+#define FIT_ISH_Q1_FULL (1u << 6u)
+#define FIT_ISH_Q2_FULL (1u << 7u)
+#define FIT_ISH_Q3_FULL (1u << 8u)
+#define FIT_ISH_QCMD_FIFO_OVERRUN (1u << 9u)
+#define FIT_ISH_BAD_EXP_ROM_READ (1u << 10u)
+
+#define FIT_INT_DEF_MASK \
+ (FIT_ISH_FW_STATE_CHANGE | \
+ FIT_ISH_COMPLETION_POSTED | \
+ FIT_ISH_MSG_FROM_DEV | \
+ FIT_ISH_Q0_FULL | \
+ FIT_ISH_Q1_FULL | \
+ FIT_ISH_Q2_FULL | \
+ FIT_ISH_Q3_FULL | \
+ FIT_ISH_QCMD_FIFO_OVERRUN | \
+ FIT_ISH_BAD_EXP_ROM_READ)
+
+#define FIT_INT_QUEUE_FULL \
+ (FIT_ISH_Q0_FULL | \
+ FIT_ISH_Q1_FULL | \
+ FIT_ISH_Q2_FULL | \
+ FIT_ISH_Q3_FULL)
+
+#define MSI_MSG_NWL_ERROR_0 0x00000000
+#define MSI_MSG_NWL_ERROR_1 0x00000001
+#define MSI_MSG_NWL_ERROR_2 0x00000002
+#define MSI_MSG_NWL_ERROR_3 0x00000003
+#define MSI_MSG_STATE_CHANGE 0x00000004
+#define MSI_MSG_COMPLETION_POSTED 0x00000005
+#define MSI_MSG_MSG_FROM_DEV 0x00000006
+#define MSI_MSG_RESERVED_0 0x00000007
+#define MSI_MSG_RESERVED_1 0x00000008
+#define MSI_MSG_QUEUE_0_FULL 0x00000009
+#define MSI_MSG_QUEUE_1_FULL 0x0000000A
+#define MSI_MSG_QUEUE_2_FULL 0x0000000B
+#define MSI_MSG_QUEUE_3_FULL 0x0000000C
+
+#define FIT_INT_RESERVED_MASK \
+ (FIT_ISH_UNDEFINED_3 | \
+ FIT_ISH_UNDEFINED_4)
+
+/*
+ * Interrupt mask, 32-bit r/w
+ * Bit definitions are the same as FIT_INT_STATUS_HOST
+ */
+#define FIT_INT_MASK_HOST 0x528u
+
+/*
+ * Message to device, 32-bit r/w
+ */
+#define FIT_MSG_TO_DEVICE 0x540u
+
+/*
+ * Message from device, 32-bit, r/o
+ */
+#define FIT_MSG_FROM_DEVICE 0x548u
+
+/*
+ * 32-bit messages to/from device, composition/extraction macros
+ */
+#define FIT_MXD_CONS(TYPE, PARAM, DATA) \
+ ((((TYPE) & 0xFFu) << 24u) | \
+ (((PARAM) & 0xFFu) << 16u) | \
+ (((DATA) & 0xFFFFu) << 0u))
+#define FIT_MXD_TYPE(MXD) (((MXD) >> 24u) & 0xFFu)
+#define FIT_MXD_PARAM(MXD) (((MXD) >> 16u) & 0xFFu)
+#define FIT_MXD_DATA(MXD) (((MXD) >> 0u) & 0xFFFFu)
+
+/*
+ * Types of messages to/from device
+ */
+#define FIT_MTD_FITFW_INIT 0x01u
+#define FIT_MTD_GET_CMDQ_DEPTH 0x02u
+#define FIT_MTD_SET_COMPQ_DEPTH 0x03u
+#define FIT_MTD_SET_COMPQ_ADDR 0x04u
+#define FIT_MTD_ARM_QUEUE 0x05u
+#define FIT_MTD_CMD_LOG_HOST_ID 0x07u
+#define FIT_MTD_CMD_LOG_TIME_STAMP_LO 0x08u
+#define FIT_MTD_CMD_LOG_TIME_STAMP_HI 0x09u
+#define FIT_MFD_SMART_EXCEEDED 0x10u
+#define FIT_MFD_POWER_DOWN 0x11u
+#define FIT_MFD_OFFLINE 0x12u
+#define FIT_MFD_ONLINE 0x13u
+#define FIT_MFD_FW_RESTARTING 0x14u
+#define FIT_MFD_PM_ACTIVE 0x15u
+#define FIT_MFD_PM_STANDBY 0x16u
+#define FIT_MFD_PM_SLEEP 0x17u
+#define FIT_MFD_CMD_PROGRESS 0x18u
+
+#define FIT_MTD_DEBUG 0xFEu
+#define FIT_MFD_DEBUG 0xFFu
+
+#define FIT_MFD_MASK (0xFFu)
+#define FIT_MFD_DATA_MASK (0xFFu)
+#define FIT_MFD_MSG(x) (((x) >> 24) & FIT_MFD_MASK)
+#define FIT_MFD_DATA(x) ((x) & FIT_MFD_MASK)
+
+/*
+ * Extra arg to FIT_MSG_TO_DEVICE, 64-bit r/w
+ * Used to set completion queue address (FIT_MTD_SET_COMPQ_ADDR)
+ * (was Response buffer in docs)
+ */
+#define FIT_MSG_TO_DEVICE_ARG 0x580u
+
+/*
+ * Hardware (ASIC) version, 32-bit r/o
+ */
+#define FIT_HW_VERSION 0x588u
+
+/*
+ * Scatter/gather list descriptor.
+ * 32-bytes and must be aligned on a 32-byte boundary.
+ * All fields are in little endian order.
+ */
+struct fit_sg_descriptor {
+ uint32_t control;
+ uint32_t byte_count;
+ uint64_t host_side_addr;
+ uint64_t dev_side_addr;
+ uint64_t next_desc_ptr;
+};
+
+#define FIT_SGD_CONTROL_NOT_LAST 0x000u
+#define FIT_SGD_CONTROL_LAST 0x40Eu
+
+/*
+ * Header at the beginning of a FIT message. The header
+ * is followed by SSDI requests each 64 bytes.
+ * A FIT message can be up to 512 bytes long and must start
+ * on a 64-byte boundary.
+ */
+struct fit_msg_hdr {
+ uint8_t protocol_id;
+ uint8_t num_protocol_cmds_coalesced;
+ uint8_t _reserved[62];
+};
+
+#define FIT_PROTOCOL_ID_FIT 1
+#define FIT_PROTOCOL_ID_SSDI 2
+#define FIT_PROTOCOL_ID_SOFIT 3
+
+
+#define FIT_PROTOCOL_MINOR_VER(mtd_val) ((mtd_val >> 16) & 0xF)
+#define FIT_PROTOCOL_MAJOR_VER(mtd_val) ((mtd_val >> 20) & 0xF)
+
+/*
+ * Format of a completion entry. The completion queue is circular
+ * and must have at least as many entries as the maximum number
+ * of commands that may be issued to the device.
+ *
+ * There are no head/tail pointers. The cycle value is used to
+ * infer the presence of new completion records.
+ * Initially the cycle in all entries is 0, the index is 0, and
+ * the cycle value to expect is 1. When completions are added
+ * their cycle values are set to 1. When the index wraps the
+ * cycle value to expect is incremented.
+ *
+ * Command_context is opaque and taken verbatim from the SSDI command.
+ * All other fields are big endian.
+ */
+#define FIT_PROTOCOL_VERSION_0 0
+
+/*
+ * Protocol major version 1 completion entry.
+ * The major protocol version is found in bits
+ * 20-23 of the FIT_MTD_FITFW_INIT response.
+ */
+struct fit_completion_entry_v1 {
+ uint32_t num_returned_bytes;
+ uint16_t tag;
+ uint8_t status; /* SCSI status */
+ uint8_t cycle;
+};
+#define FIT_PROTOCOL_VERSION_1 1
+#define FIT_PROTOCOL_VERSION_CURRENT FIT_PROTOCOL_VERSION_1
+
+struct fit_comp_error_info {
+ uint8_t type:7; /* 00: Bits0-6 indicates the type of sense data. */
+ uint8_t valid:1; /* 00: Bit 7 := 1 ==> info field is valid. */
+ uint8_t reserved0; /* 01: Obsolete field */
+ uint8_t key:4; /* 02: Bits0-3 indicate the sense key. */
+ uint8_t reserved2:1; /* 02: Reserved bit. */
+ uint8_t bad_length:1; /* 02: Incorrect Length Indicator */
+ uint8_t end_medium:1; /* 02: End of Medium */
+ uint8_t file_mark:1; /* 02: Filemark */
+ uint8_t info[4]; /* 03: */
+ uint8_t reserved1; /* 07: Additional Sense Length */
+ uint8_t cmd_spec[4]; /* 08: Command Specific Information */
+ uint8_t code; /* 0C: Additional Sense Code */
+ uint8_t qual; /* 0D: Additional Sense Code Qualifier */
+ uint8_t fruc; /* 0E: Field Replaceable Unit Code */
+ uint8_t sks_high:7; /* 0F: Sense Key Specific (MSB) */
+ uint8_t sks_valid:1; /* 0F: Sense Key Specific Valid */
+ uint16_t sks_low; /* 10: Sense Key Specific (LSW) */
+ uint16_t reserved3; /* 12: Part of additional sense bytes (unused) */
+ uint16_t uec; /* 14: Additional Sense Bytes */
+ uint64_t per; /* 16: Additional Sense Bytes */
+ uint8_t reserved4[2]; /* 1E: Additional Sense Bytes (unused) */
+};
+
+
+/* Task management constants */
+#define SOFT_TASK_SIMPLE 0x00
+#define SOFT_TASK_HEAD_OF_QUEUE 0x01
+#define SOFT_TASK_ORDERED 0x02
+
+/* Version zero has the last 32 bits reserved,
+ * Version one has the last 32 bits sg_list_len_bytes;
+ */
+struct skd_command_header {
+ uint64_t sg_list_dma_address;
+ uint16_t tag;
+ uint8_t attribute;
+ uint8_t add_cdb_len; /* In 32 bit words */
+ uint32_t sg_list_len_bytes;
+};
+
+struct skd_scsi_request {
+ struct skd_command_header hdr;
+ unsigned char cdb[16];
+/* unsigned char _reserved[16]; */
+};
+
+struct driver_inquiry_data {
+ uint8_t peripheral_device_type:5;
+ uint8_t qualifier:3;
+ uint8_t page_code;
+ uint16_t page_length;
+ uint16_t pcie_bus_number;
+ uint8_t pcie_device_number;
+ uint8_t pcie_function_number;
+ uint8_t pcie_link_speed;
+ uint8_t pcie_link_lanes;
+ uint16_t pcie_vendor_id;
+ uint16_t pcie_device_id;
+ uint16_t pcie_subsystem_vendor_id;
+ uint16_t pcie_subsystem_device_id;
+ uint8_t reserved1[2];
+ uint8_t reserved2[3];
+ uint8_t driver_version_length;
+ uint8_t driver_version[0x14];
+};
+
+#pragma pack(pop, s1120_h)
+
+#endif /* SKD_S1120_H */
diff --git a/drivers/block/smart1,2.h b/drivers/block/smart1,2.h
new file mode 100644
index 000000000..e5565fbae
--- /dev/null
+++ b/drivers/block/smart1,2.h
@@ -0,0 +1,278 @@
+/*
+ * Disk Array driver for Compaq SMART2 Controllers
+ * Copyright 1998 Compaq Computer Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Questions/Comments/Bugfixes to iss_storagedev@hp.com
+ *
+ * If you want to make changes, improve or add functionality to this
+ * driver, you'll probably need the Compaq Array Controller Interface
+ * Specificiation (Document number ECG086/1198)
+ */
+
+/*
+ * This file contains the controller communication implementation for
+ * Compaq SMART-1 and SMART-2 controllers. To the best of my knowledge,
+ * this should support:
+ *
+ * PCI:
+ * SMART-2/P, SMART-2DH, SMART-2SL, SMART-221, SMART-3100ES, SMART-3200
+ * Integerated SMART Array Controller, SMART-4200, SMART-4250ES
+ *
+ * EISA:
+ * SMART-2/E, SMART, IAES, IDA-2, IDA
+ */
+
+/*
+ * Memory mapped FIFO interface (SMART 42xx cards)
+ */
+static void smart4_submit_command(ctlr_info_t *h, cmdlist_t *c)
+{
+ writel(c->busaddr, h->vaddr + S42XX_REQUEST_PORT_OFFSET);
+}
+
+/*
+ * This card is the opposite of the other cards.
+ * 0 turns interrupts on...
+ * 0x08 turns them off...
+ */
+static void smart4_intr_mask(ctlr_info_t *h, unsigned long val)
+{
+ if (val)
+ { /* Turn interrupts on */
+ writel(0, h->vaddr + S42XX_REPLY_INTR_MASK_OFFSET);
+ } else /* Turn them off */
+ {
+ writel( S42XX_INTR_OFF,
+ h->vaddr + S42XX_REPLY_INTR_MASK_OFFSET);
+ }
+}
+
+/*
+ * For older cards FIFO Full = 0.
+ * On this card 0 means there is room, anything else FIFO Full.
+ *
+ */
+static unsigned long smart4_fifo_full(ctlr_info_t *h)
+{
+
+ return (!readl(h->vaddr + S42XX_REQUEST_PORT_OFFSET));
+}
+
+/* This type of controller returns -1 if the fifo is empty,
+ * Not 0 like the others.
+ * And we need to let it know we read a value out
+ */
+static unsigned long smart4_completed(ctlr_info_t *h)
+{
+ long register_value
+ = readl(h->vaddr + S42XX_REPLY_PORT_OFFSET);
+
+ /* Fifo is empty */
+ if( register_value == 0xffffffff)
+ return 0;
+
+ /* Need to let it know we got the reply */
+ /* We do this by writing a 0 to the port we just read from */
+ writel(0, h->vaddr + S42XX_REPLY_PORT_OFFSET);
+
+ return ((unsigned long) register_value);
+}
+
+ /*
+ * This hardware returns interrupt pending at a different place and
+ * it does not tell us if the fifo is empty, we will have check
+ * that by getting a 0 back from the command_completed call.
+ */
+static unsigned long smart4_intr_pending(ctlr_info_t *h)
+{
+ unsigned long register_value =
+ readl(h->vaddr + S42XX_INTR_STATUS);
+
+ if( register_value & S42XX_INTR_PENDING)
+ return FIFO_NOT_EMPTY;
+ return 0 ;
+}
+
+static struct access_method smart4_access = {
+ smart4_submit_command,
+ smart4_intr_mask,
+ smart4_fifo_full,
+ smart4_intr_pending,
+ smart4_completed,
+};
+
+/*
+ * Memory mapped FIFO interface (PCI SMART2 and SMART 3xxx cards)
+ */
+static void smart2_submit_command(ctlr_info_t *h, cmdlist_t *c)
+{
+ writel(c->busaddr, h->vaddr + COMMAND_FIFO);
+}
+
+static void smart2_intr_mask(ctlr_info_t *h, unsigned long val)
+{
+ writel(val, h->vaddr + INTR_MASK);
+}
+
+static unsigned long smart2_fifo_full(ctlr_info_t *h)
+{
+ return readl(h->vaddr + COMMAND_FIFO);
+}
+
+static unsigned long smart2_completed(ctlr_info_t *h)
+{
+ return readl(h->vaddr + COMMAND_COMPLETE_FIFO);
+}
+
+static unsigned long smart2_intr_pending(ctlr_info_t *h)
+{
+ return readl(h->vaddr + INTR_PENDING);
+}
+
+static struct access_method smart2_access = {
+ smart2_submit_command,
+ smart2_intr_mask,
+ smart2_fifo_full,
+ smart2_intr_pending,
+ smart2_completed,
+};
+
+/*
+ * IO access for SMART-2/E cards
+ */
+static void smart2e_submit_command(ctlr_info_t *h, cmdlist_t *c)
+{
+ outl(c->busaddr, h->io_mem_addr + COMMAND_FIFO);
+}
+
+static void smart2e_intr_mask(ctlr_info_t *h, unsigned long val)
+{
+ outl(val, h->io_mem_addr + INTR_MASK);
+}
+
+static unsigned long smart2e_fifo_full(ctlr_info_t *h)
+{
+ return inl(h->io_mem_addr + COMMAND_FIFO);
+}
+
+static unsigned long smart2e_completed(ctlr_info_t *h)
+{
+ return inl(h->io_mem_addr + COMMAND_COMPLETE_FIFO);
+}
+
+static unsigned long smart2e_intr_pending(ctlr_info_t *h)
+{
+ return inl(h->io_mem_addr + INTR_PENDING);
+}
+
+static struct access_method smart2e_access = {
+ smart2e_submit_command,
+ smart2e_intr_mask,
+ smart2e_fifo_full,
+ smart2e_intr_pending,
+ smart2e_completed,
+};
+
+/*
+ * IO access for older SMART-1 type cards
+ */
+#define SMART1_SYSTEM_MASK 0xC8E
+#define SMART1_SYSTEM_DOORBELL 0xC8F
+#define SMART1_LOCAL_MASK 0xC8C
+#define SMART1_LOCAL_DOORBELL 0xC8D
+#define SMART1_INTR_MASK 0xC89
+#define SMART1_LISTADDR 0xC90
+#define SMART1_LISTLEN 0xC94
+#define SMART1_TAG 0xC97
+#define SMART1_COMPLETE_ADDR 0xC98
+#define SMART1_LISTSTATUS 0xC9E
+
+#define CHANNEL_BUSY 0x01
+#define CHANNEL_CLEAR 0x02
+
+static void smart1_submit_command(ctlr_info_t *h, cmdlist_t *c)
+{
+ /*
+ * This __u16 is actually a bunch of control flags on SMART
+ * and below. We want them all to be zero.
+ */
+ c->hdr.size = 0;
+
+ outb(CHANNEL_CLEAR, h->io_mem_addr + SMART1_SYSTEM_DOORBELL);
+
+ outl(c->busaddr, h->io_mem_addr + SMART1_LISTADDR);
+ outw(c->size, h->io_mem_addr + SMART1_LISTLEN);
+
+ outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_LOCAL_DOORBELL);
+}
+
+static void smart1_intr_mask(ctlr_info_t *h, unsigned long val)
+{
+ if (val == 1) {
+ outb(0xFD, h->io_mem_addr + SMART1_SYSTEM_DOORBELL);
+ outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_LOCAL_DOORBELL);
+ outb(0x01, h->io_mem_addr + SMART1_INTR_MASK);
+ outb(0x01, h->io_mem_addr + SMART1_SYSTEM_MASK);
+ } else {
+ outb(0, h->io_mem_addr + 0xC8E);
+ }
+}
+
+static unsigned long smart1_fifo_full(ctlr_info_t *h)
+{
+ unsigned char chan;
+ chan = inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_CLEAR;
+ return chan;
+}
+
+static unsigned long smart1_completed(ctlr_info_t *h)
+{
+ unsigned char status;
+ unsigned long cmd;
+
+ if (inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_BUSY) {
+ outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_SYSTEM_DOORBELL);
+
+ cmd = inl(h->io_mem_addr + SMART1_COMPLETE_ADDR);
+ status = inb(h->io_mem_addr + SMART1_LISTSTATUS);
+
+ outb(CHANNEL_CLEAR, h->io_mem_addr + SMART1_LOCAL_DOORBELL);
+
+ /*
+ * this is x86 (actually compaq x86) only, so it's ok
+ */
+ if (cmd) ((cmdlist_t*)bus_to_virt(cmd))->req.hdr.rcode = status;
+ } else {
+ cmd = 0;
+ }
+ return cmd;
+}
+
+static unsigned long smart1_intr_pending(ctlr_info_t *h)
+{
+ unsigned char chan;
+ chan = inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_BUSY;
+ return chan;
+}
+
+static struct access_method smart1_access = {
+ smart1_submit_command,
+ smart1_intr_mask,
+ smart1_fifo_full,
+ smart1_intr_pending,
+ smart1_completed,
+};
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
new file mode 100644
index 000000000..4b911ed96
--- /dev/null
+++ b/drivers/block/sunvdc.c
@@ -0,0 +1,1129 @@
+/* sunvdc.c: Sun LDOM Virtual Disk Client.
+ *
+ * Copyright (C) 2007, 2008 David S. Miller <davem@davemloft.net>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/genhd.h>
+#include <linux/cdrom.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+
+#include <asm/vio.h>
+#include <asm/ldc.h>
+
+#define DRV_MODULE_NAME "sunvdc"
+#define PFX DRV_MODULE_NAME ": "
+#define DRV_MODULE_VERSION "1.2"
+#define DRV_MODULE_RELDATE "November 24, 2014"
+
+static char version[] =
+ DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
+MODULE_AUTHOR("David S. Miller (davem@davemloft.net)");
+MODULE_DESCRIPTION("Sun LDOM virtual disk client driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_MODULE_VERSION);
+
+#define VDC_TX_RING_SIZE 512
+
+#define WAITING_FOR_LINK_UP 0x01
+#define WAITING_FOR_TX_SPACE 0x02
+#define WAITING_FOR_GEN_CMD 0x04
+#define WAITING_FOR_ANY -1
+
+static struct workqueue_struct *sunvdc_wq;
+
+struct vdc_req_entry {
+ struct request *req;
+};
+
+struct vdc_port {
+ struct vio_driver_state vio;
+
+ struct gendisk *disk;
+
+ struct vdc_completion *cmp;
+
+ u64 req_id;
+ u64 seq;
+ struct vdc_req_entry rq_arr[VDC_TX_RING_SIZE];
+
+ unsigned long ring_cookies;
+
+ u64 max_xfer_size;
+ u32 vdisk_block_size;
+
+ u64 ldc_timeout;
+ struct timer_list ldc_reset_timer;
+ struct work_struct ldc_reset_work;
+
+ /* The server fills these in for us in the disk attribute
+ * ACK packet.
+ */
+ u64 operations;
+ u32 vdisk_size;
+ u8 vdisk_type;
+ u8 vdisk_mtype;
+
+ char disk_name[32];
+};
+
+static void vdc_ldc_reset(struct vdc_port *port);
+static void vdc_ldc_reset_work(struct work_struct *work);
+static void vdc_ldc_reset_timer(unsigned long _arg);
+
+static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio)
+{
+ return container_of(vio, struct vdc_port, vio);
+}
+
+/* Ordered from largest major to lowest */
+static struct vio_version vdc_versions[] = {
+ { .major = 1, .minor = 1 },
+ { .major = 1, .minor = 0 },
+};
+
+static inline int vdc_version_supported(struct vdc_port *port,
+ u16 major, u16 minor)
+{
+ return port->vio.ver.major == major && port->vio.ver.minor >= minor;
+}
+
+#define VDCBLK_NAME "vdisk"
+static int vdc_major;
+#define PARTITION_SHIFT 3
+
+static inline u32 vdc_tx_dring_avail(struct vio_dring_state *dr)
+{
+ return vio_dring_avail(dr, VDC_TX_RING_SIZE);
+}
+
+static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ sector_t nsect = get_capacity(disk);
+ sector_t cylinders = nsect;
+
+ geo->heads = 0xff;
+ geo->sectors = 0x3f;
+ sector_div(cylinders, geo->heads * geo->sectors);
+ geo->cylinders = cylinders;
+ if ((sector_t)(geo->cylinders + 1) * geo->heads * geo->sectors < nsect)
+ geo->cylinders = 0xffff;
+
+ return 0;
+}
+
+/* Add ioctl/CDROM_GET_CAPABILITY to support cdrom_id in udev
+ * when vdisk_mtype is VD_MEDIA_TYPE_CD or VD_MEDIA_TYPE_DVD.
+ * Needed to be able to install inside an ldom from an iso image.
+ */
+static int vdc_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned command, unsigned long argument)
+{
+ int i;
+ struct gendisk *disk;
+
+ switch (command) {
+ case CDROMMULTISESSION:
+ pr_debug(PFX "Multisession CDs not supported\n");
+ for (i = 0; i < sizeof(struct cdrom_multisession); i++)
+ if (put_user(0, (char __user *)(argument + i)))
+ return -EFAULT;
+ return 0;
+
+ case CDROM_GET_CAPABILITY:
+ disk = bdev->bd_disk;
+
+ if (bdev->bd_disk && (disk->flags & GENHD_FL_CD))
+ return 0;
+ return -EINVAL;
+
+ default:
+ pr_debug(PFX "ioctl %08x not supported\n", command);
+ return -EINVAL;
+ }
+}
+
+static const struct block_device_operations vdc_fops = {
+ .owner = THIS_MODULE,
+ .getgeo = vdc_getgeo,
+ .ioctl = vdc_ioctl,
+};
+
+static void vdc_blk_queue_start(struct vdc_port *port)
+{
+ struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+
+ /* restart blk queue when ring is half emptied. also called after
+ * handshake completes, so check for initial handshake before we've
+ * allocated a disk.
+ */
+ if (port->disk && blk_queue_stopped(port->disk->queue) &&
+ vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50) {
+ blk_start_queue(port->disk->queue);
+ }
+
+}
+
+static void vdc_finish(struct vio_driver_state *vio, int err, int waiting_for)
+{
+ if (vio->cmp &&
+ (waiting_for == -1 ||
+ vio->cmp->waiting_for == waiting_for)) {
+ vio->cmp->err = err;
+ complete(&vio->cmp->com);
+ vio->cmp = NULL;
+ }
+}
+
+static void vdc_handshake_complete(struct vio_driver_state *vio)
+{
+ struct vdc_port *port = to_vdc_port(vio);
+
+ del_timer(&port->ldc_reset_timer);
+ vdc_finish(vio, 0, WAITING_FOR_LINK_UP);
+ vdc_blk_queue_start(port);
+}
+
+static int vdc_handle_unknown(struct vdc_port *port, void *arg)
+{
+ struct vio_msg_tag *pkt = arg;
+
+ printk(KERN_ERR PFX "Received unknown msg [%02x:%02x:%04x:%08x]\n",
+ pkt->type, pkt->stype, pkt->stype_env, pkt->sid);
+ printk(KERN_ERR PFX "Resetting connection.\n");
+
+ ldc_disconnect(port->vio.lp);
+
+ return -ECONNRESET;
+}
+
+static int vdc_send_attr(struct vio_driver_state *vio)
+{
+ struct vdc_port *port = to_vdc_port(vio);
+ struct vio_disk_attr_info pkt;
+
+ memset(&pkt, 0, sizeof(pkt));
+
+ pkt.tag.type = VIO_TYPE_CTRL;
+ pkt.tag.stype = VIO_SUBTYPE_INFO;
+ pkt.tag.stype_env = VIO_ATTR_INFO;
+ pkt.tag.sid = vio_send_sid(vio);
+
+ pkt.xfer_mode = VIO_DRING_MODE;
+ pkt.vdisk_block_size = port->vdisk_block_size;
+ pkt.max_xfer_size = port->max_xfer_size;
+
+ viodbg(HS, "SEND ATTR xfer_mode[0x%x] blksz[%u] max_xfer[%llu]\n",
+ pkt.xfer_mode, pkt.vdisk_block_size, pkt.max_xfer_size);
+
+ return vio_ldc_send(&port->vio, &pkt, sizeof(pkt));
+}
+
+static int vdc_handle_attr(struct vio_driver_state *vio, void *arg)
+{
+ struct vdc_port *port = to_vdc_port(vio);
+ struct vio_disk_attr_info *pkt = arg;
+
+ viodbg(HS, "GOT ATTR stype[0x%x] ops[%llx] disk_size[%llu] disk_type[%x] "
+ "mtype[0x%x] xfer_mode[0x%x] blksz[%u] max_xfer[%llu]\n",
+ pkt->tag.stype, pkt->operations,
+ pkt->vdisk_size, pkt->vdisk_type, pkt->vdisk_mtype,
+ pkt->xfer_mode, pkt->vdisk_block_size,
+ pkt->max_xfer_size);
+
+ if (pkt->tag.stype == VIO_SUBTYPE_ACK) {
+ switch (pkt->vdisk_type) {
+ case VD_DISK_TYPE_DISK:
+ case VD_DISK_TYPE_SLICE:
+ break;
+
+ default:
+ printk(KERN_ERR PFX "%s: Bogus vdisk_type 0x%x\n",
+ vio->name, pkt->vdisk_type);
+ return -ECONNRESET;
+ }
+
+ if (pkt->vdisk_block_size > port->vdisk_block_size) {
+ printk(KERN_ERR PFX "%s: BLOCK size increased "
+ "%u --> %u\n",
+ vio->name,
+ port->vdisk_block_size, pkt->vdisk_block_size);
+ return -ECONNRESET;
+ }
+
+ port->operations = pkt->operations;
+ port->vdisk_type = pkt->vdisk_type;
+ if (vdc_version_supported(port, 1, 1)) {
+ port->vdisk_size = pkt->vdisk_size;
+ port->vdisk_mtype = pkt->vdisk_mtype;
+ }
+ if (pkt->max_xfer_size < port->max_xfer_size)
+ port->max_xfer_size = pkt->max_xfer_size;
+ port->vdisk_block_size = pkt->vdisk_block_size;
+ return 0;
+ } else {
+ printk(KERN_ERR PFX "%s: Attribute NACK\n", vio->name);
+
+ return -ECONNRESET;
+ }
+}
+
+static void vdc_end_special(struct vdc_port *port, struct vio_disk_desc *desc)
+{
+ int err = desc->status;
+
+ vdc_finish(&port->vio, -err, WAITING_FOR_GEN_CMD);
+}
+
+static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr,
+ unsigned int index)
+{
+ struct vio_disk_desc *desc = vio_dring_entry(dr, index);
+ struct vdc_req_entry *rqe = &port->rq_arr[index];
+ struct request *req;
+
+ if (unlikely(desc->hdr.state != VIO_DESC_DONE))
+ return;
+
+ ldc_unmap(port->vio.lp, desc->cookies, desc->ncookies);
+ desc->hdr.state = VIO_DESC_FREE;
+ dr->cons = vio_dring_next(dr, index);
+
+ req = rqe->req;
+ if (req == NULL) {
+ vdc_end_special(port, desc);
+ return;
+ }
+
+ rqe->req = NULL;
+
+ __blk_end_request(req, (desc->status ? -EIO : 0), desc->size);
+
+ vdc_blk_queue_start(port);
+}
+
+static int vdc_ack(struct vdc_port *port, void *msgbuf)
+{
+ struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+ struct vio_dring_data *pkt = msgbuf;
+
+ if (unlikely(pkt->dring_ident != dr->ident ||
+ pkt->start_idx != pkt->end_idx ||
+ pkt->start_idx >= VDC_TX_RING_SIZE))
+ return 0;
+
+ vdc_end_one(port, dr, pkt->start_idx);
+
+ return 0;
+}
+
+static int vdc_nack(struct vdc_port *port, void *msgbuf)
+{
+ /* XXX Implement me XXX */
+ return 0;
+}
+
+static void vdc_event(void *arg, int event)
+{
+ struct vdc_port *port = arg;
+ struct vio_driver_state *vio = &port->vio;
+ unsigned long flags;
+ int err;
+
+ spin_lock_irqsave(&vio->lock, flags);
+
+ if (unlikely(event == LDC_EVENT_RESET)) {
+ vio_link_state_change(vio, event);
+ queue_work(sunvdc_wq, &port->ldc_reset_work);
+ goto out;
+ }
+
+ if (unlikely(event == LDC_EVENT_UP)) {
+ vio_link_state_change(vio, event);
+ goto out;
+ }
+
+ if (unlikely(event != LDC_EVENT_DATA_READY)) {
+ pr_warn(PFX "Unexpected LDC event %d\n", event);
+ goto out;
+ }
+
+ err = 0;
+ while (1) {
+ union {
+ struct vio_msg_tag tag;
+ u64 raw[8];
+ } msgbuf;
+
+ err = ldc_read(vio->lp, &msgbuf, sizeof(msgbuf));
+ if (unlikely(err < 0)) {
+ if (err == -ECONNRESET)
+ vio_conn_reset(vio);
+ break;
+ }
+ if (err == 0)
+ break;
+ viodbg(DATA, "TAG [%02x:%02x:%04x:%08x]\n",
+ msgbuf.tag.type,
+ msgbuf.tag.stype,
+ msgbuf.tag.stype_env,
+ msgbuf.tag.sid);
+ err = vio_validate_sid(vio, &msgbuf.tag);
+ if (err < 0)
+ break;
+
+ if (likely(msgbuf.tag.type == VIO_TYPE_DATA)) {
+ if (msgbuf.tag.stype == VIO_SUBTYPE_ACK)
+ err = vdc_ack(port, &msgbuf);
+ else if (msgbuf.tag.stype == VIO_SUBTYPE_NACK)
+ err = vdc_nack(port, &msgbuf);
+ else
+ err = vdc_handle_unknown(port, &msgbuf);
+ } else if (msgbuf.tag.type == VIO_TYPE_CTRL) {
+ err = vio_control_pkt_engine(vio, &msgbuf);
+ } else {
+ err = vdc_handle_unknown(port, &msgbuf);
+ }
+ if (err < 0)
+ break;
+ }
+ if (err < 0)
+ vdc_finish(&port->vio, err, WAITING_FOR_ANY);
+out:
+ spin_unlock_irqrestore(&vio->lock, flags);
+}
+
+static int __vdc_tx_trigger(struct vdc_port *port)
+{
+ struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+ struct vio_dring_data hdr = {
+ .tag = {
+ .type = VIO_TYPE_DATA,
+ .stype = VIO_SUBTYPE_INFO,
+ .stype_env = VIO_DRING_DATA,
+ .sid = vio_send_sid(&port->vio),
+ },
+ .dring_ident = dr->ident,
+ .start_idx = dr->prod,
+ .end_idx = dr->prod,
+ };
+ int err, delay;
+
+ hdr.seq = dr->snd_nxt;
+ delay = 1;
+ do {
+ err = vio_ldc_send(&port->vio, &hdr, sizeof(hdr));
+ if (err > 0) {
+ dr->snd_nxt++;
+ break;
+ }
+ udelay(delay);
+ if ((delay <<= 1) > 128)
+ delay = 128;
+ } while (err == -EAGAIN);
+
+ if (err == -ENOTCONN)
+ vdc_ldc_reset(port);
+ return err;
+}
+
+static int __send_request(struct request *req)
+{
+ struct vdc_port *port = req->rq_disk->private_data;
+ struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+ struct scatterlist sg[port->ring_cookies];
+ struct vdc_req_entry *rqe;
+ struct vio_disk_desc *desc;
+ unsigned int map_perm;
+ int nsg, err, i;
+ u64 len;
+ u8 op;
+
+ map_perm = LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_IO;
+
+ if (rq_data_dir(req) == READ) {
+ map_perm |= LDC_MAP_W;
+ op = VD_OP_BREAD;
+ } else {
+ map_perm |= LDC_MAP_R;
+ op = VD_OP_BWRITE;
+ }
+
+ sg_init_table(sg, port->ring_cookies);
+ nsg = blk_rq_map_sg(req->q, req, sg);
+
+ len = 0;
+ for (i = 0; i < nsg; i++)
+ len += sg[i].length;
+
+ desc = vio_dring_cur(dr);
+
+ err = ldc_map_sg(port->vio.lp, sg, nsg,
+ desc->cookies, port->ring_cookies,
+ map_perm);
+ if (err < 0) {
+ printk(KERN_ERR PFX "ldc_map_sg() failure, err=%d.\n", err);
+ return err;
+ }
+
+ rqe = &port->rq_arr[dr->prod];
+ rqe->req = req;
+
+ desc->hdr.ack = VIO_ACK_ENABLE;
+ desc->req_id = port->req_id;
+ desc->operation = op;
+ if (port->vdisk_type == VD_DISK_TYPE_DISK) {
+ desc->slice = 0xff;
+ } else {
+ desc->slice = 0;
+ }
+ desc->status = ~0;
+ desc->offset = (blk_rq_pos(req) << 9) / port->vdisk_block_size;
+ desc->size = len;
+ desc->ncookies = err;
+
+ /* This has to be a non-SMP write barrier because we are writing
+ * to memory which is shared with the peer LDOM.
+ */
+ wmb();
+ desc->hdr.state = VIO_DESC_READY;
+
+ err = __vdc_tx_trigger(port);
+ if (err < 0) {
+ printk(KERN_ERR PFX "vdc_tx_trigger() failure, err=%d\n", err);
+ } else {
+ port->req_id++;
+ dr->prod = vio_dring_next(dr, dr->prod);
+ }
+
+ return err;
+}
+
+static void do_vdc_request(struct request_queue *rq)
+{
+ struct request *req;
+
+ while ((req = blk_peek_request(rq)) != NULL) {
+ struct vdc_port *port;
+ struct vio_dring_state *dr;
+
+ port = req->rq_disk->private_data;
+ dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+ if (unlikely(vdc_tx_dring_avail(dr) < 1))
+ goto wait;
+
+ blk_start_request(req);
+
+ if (__send_request(req) < 0) {
+ blk_requeue_request(rq, req);
+wait:
+ /* Avoid pointless unplugs. */
+ blk_stop_queue(rq);
+ break;
+ }
+ }
+}
+
+static int generic_request(struct vdc_port *port, u8 op, void *buf, int len)
+{
+ struct vio_dring_state *dr;
+ struct vio_completion comp;
+ struct vio_disk_desc *desc;
+ unsigned int map_perm;
+ unsigned long flags;
+ int op_len, err;
+ void *req_buf;
+
+ if (!(((u64)1 << (u64)op) & port->operations))
+ return -EOPNOTSUPP;
+
+ switch (op) {
+ case VD_OP_BREAD:
+ case VD_OP_BWRITE:
+ default:
+ return -EINVAL;
+
+ case VD_OP_FLUSH:
+ op_len = 0;
+ map_perm = 0;
+ break;
+
+ case VD_OP_GET_WCE:
+ op_len = sizeof(u32);
+ map_perm = LDC_MAP_W;
+ break;
+
+ case VD_OP_SET_WCE:
+ op_len = sizeof(u32);
+ map_perm = LDC_MAP_R;
+ break;
+
+ case VD_OP_GET_VTOC:
+ op_len = sizeof(struct vio_disk_vtoc);
+ map_perm = LDC_MAP_W;
+ break;
+
+ case VD_OP_SET_VTOC:
+ op_len = sizeof(struct vio_disk_vtoc);
+ map_perm = LDC_MAP_R;
+ break;
+
+ case VD_OP_GET_DISKGEOM:
+ op_len = sizeof(struct vio_disk_geom);
+ map_perm = LDC_MAP_W;
+ break;
+
+ case VD_OP_SET_DISKGEOM:
+ op_len = sizeof(struct vio_disk_geom);
+ map_perm = LDC_MAP_R;
+ break;
+
+ case VD_OP_SCSICMD:
+ op_len = 16;
+ map_perm = LDC_MAP_RW;
+ break;
+
+ case VD_OP_GET_DEVID:
+ op_len = sizeof(struct vio_disk_devid);
+ map_perm = LDC_MAP_W;
+ break;
+
+ case VD_OP_GET_EFI:
+ case VD_OP_SET_EFI:
+ return -EOPNOTSUPP;
+ break;
+ };
+
+ map_perm |= LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_IO;
+
+ op_len = (op_len + 7) & ~7;
+ req_buf = kzalloc(op_len, GFP_KERNEL);
+ if (!req_buf)
+ return -ENOMEM;
+
+ if (len > op_len)
+ len = op_len;
+
+ if (map_perm & LDC_MAP_R)
+ memcpy(req_buf, buf, len);
+
+ spin_lock_irqsave(&port->vio.lock, flags);
+
+ dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+
+ /* XXX If we want to use this code generically we have to
+ * XXX handle TX ring exhaustion etc.
+ */
+ desc = vio_dring_cur(dr);
+
+ err = ldc_map_single(port->vio.lp, req_buf, op_len,
+ desc->cookies, port->ring_cookies,
+ map_perm);
+ if (err < 0) {
+ spin_unlock_irqrestore(&port->vio.lock, flags);
+ kfree(req_buf);
+ return err;
+ }
+
+ init_completion(&comp.com);
+ comp.waiting_for = WAITING_FOR_GEN_CMD;
+ port->vio.cmp = &comp;
+
+ desc->hdr.ack = VIO_ACK_ENABLE;
+ desc->req_id = port->req_id;
+ desc->operation = op;
+ desc->slice = 0;
+ desc->status = ~0;
+ desc->offset = 0;
+ desc->size = op_len;
+ desc->ncookies = err;
+
+ /* This has to be a non-SMP write barrier because we are writing
+ * to memory which is shared with the peer LDOM.
+ */
+ wmb();
+ desc->hdr.state = VIO_DESC_READY;
+
+ err = __vdc_tx_trigger(port);
+ if (err >= 0) {
+ port->req_id++;
+ dr->prod = vio_dring_next(dr, dr->prod);
+ spin_unlock_irqrestore(&port->vio.lock, flags);
+
+ wait_for_completion(&comp.com);
+ err = comp.err;
+ } else {
+ port->vio.cmp = NULL;
+ spin_unlock_irqrestore(&port->vio.lock, flags);
+ }
+
+ if (map_perm & LDC_MAP_W)
+ memcpy(buf, req_buf, len);
+
+ kfree(req_buf);
+
+ return err;
+}
+
+static int vdc_alloc_tx_ring(struct vdc_port *port)
+{
+ struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+ unsigned long len, entry_size;
+ int ncookies;
+ void *dring;
+
+ entry_size = sizeof(struct vio_disk_desc) +
+ (sizeof(struct ldc_trans_cookie) * port->ring_cookies);
+ len = (VDC_TX_RING_SIZE * entry_size);
+
+ ncookies = VIO_MAX_RING_COOKIES;
+ dring = ldc_alloc_exp_dring(port->vio.lp, len,
+ dr->cookies, &ncookies,
+ (LDC_MAP_SHADOW |
+ LDC_MAP_DIRECT |
+ LDC_MAP_RW));
+ if (IS_ERR(dring))
+ return PTR_ERR(dring);
+
+ dr->base = dring;
+ dr->entry_size = entry_size;
+ dr->num_entries = VDC_TX_RING_SIZE;
+ dr->prod = dr->cons = 0;
+ dr->pending = VDC_TX_RING_SIZE;
+ dr->ncookies = ncookies;
+
+ return 0;
+}
+
+static void vdc_free_tx_ring(struct vdc_port *port)
+{
+ struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+
+ if (dr->base) {
+ ldc_free_exp_dring(port->vio.lp, dr->base,
+ (dr->entry_size * dr->num_entries),
+ dr->cookies, dr->ncookies);
+ dr->base = NULL;
+ dr->entry_size = 0;
+ dr->num_entries = 0;
+ dr->pending = 0;
+ dr->ncookies = 0;
+ }
+}
+
+static int vdc_port_up(struct vdc_port *port)
+{
+ struct vio_completion comp;
+
+ init_completion(&comp.com);
+ comp.err = 0;
+ comp.waiting_for = WAITING_FOR_LINK_UP;
+ port->vio.cmp = &comp;
+
+ vio_port_up(&port->vio);
+ wait_for_completion(&comp.com);
+ return comp.err;
+}
+
+static void vdc_port_down(struct vdc_port *port)
+{
+ ldc_disconnect(port->vio.lp);
+ ldc_unbind(port->vio.lp);
+ vdc_free_tx_ring(port);
+ vio_ldc_free(&port->vio);
+}
+
+static int probe_disk(struct vdc_port *port)
+{
+ struct request_queue *q;
+ struct gendisk *g;
+ int err;
+
+ err = vdc_port_up(port);
+ if (err)
+ return err;
+
+ if (vdc_version_supported(port, 1, 1)) {
+ /* vdisk_size should be set during the handshake, if it wasn't
+ * then the underlying disk is reserved by another system
+ */
+ if (port->vdisk_size == -1)
+ return -ENODEV;
+ } else {
+ struct vio_disk_geom geom;
+
+ err = generic_request(port, VD_OP_GET_DISKGEOM,
+ &geom, sizeof(geom));
+ if (err < 0) {
+ printk(KERN_ERR PFX "VD_OP_GET_DISKGEOM returns "
+ "error %d\n", err);
+ return err;
+ }
+ port->vdisk_size = ((u64)geom.num_cyl *
+ (u64)geom.num_hd *
+ (u64)geom.num_sec);
+ }
+
+ q = blk_init_queue(do_vdc_request, &port->vio.lock);
+ if (!q) {
+ printk(KERN_ERR PFX "%s: Could not allocate queue.\n",
+ port->vio.name);
+ return -ENOMEM;
+ }
+ g = alloc_disk(1 << PARTITION_SHIFT);
+ if (!g) {
+ printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n",
+ port->vio.name);
+ blk_cleanup_queue(q);
+ return -ENOMEM;
+ }
+
+ port->disk = g;
+
+ /* Each segment in a request is up to an aligned page in size. */
+ blk_queue_segment_boundary(q, PAGE_SIZE - 1);
+ blk_queue_max_segment_size(q, PAGE_SIZE);
+
+ blk_queue_max_segments(q, port->ring_cookies);
+ blk_queue_max_hw_sectors(q, port->max_xfer_size);
+ g->major = vdc_major;
+ g->first_minor = port->vio.vdev->dev_no << PARTITION_SHIFT;
+ strcpy(g->disk_name, port->disk_name);
+
+ g->fops = &vdc_fops;
+ g->queue = q;
+ g->private_data = port;
+ g->driverfs_dev = &port->vio.vdev->dev;
+
+ set_capacity(g, port->vdisk_size);
+
+ if (vdc_version_supported(port, 1, 1)) {
+ switch (port->vdisk_mtype) {
+ case VD_MEDIA_TYPE_CD:
+ pr_info(PFX "Virtual CDROM %s\n", port->disk_name);
+ g->flags |= GENHD_FL_CD;
+ g->flags |= GENHD_FL_REMOVABLE;
+ set_disk_ro(g, 1);
+ break;
+
+ case VD_MEDIA_TYPE_DVD:
+ pr_info(PFX "Virtual DVD %s\n", port->disk_name);
+ g->flags |= GENHD_FL_CD;
+ g->flags |= GENHD_FL_REMOVABLE;
+ set_disk_ro(g, 1);
+ break;
+
+ case VD_MEDIA_TYPE_FIXED:
+ pr_info(PFX "Virtual Hard disk %s\n", port->disk_name);
+ break;
+ }
+ }
+
+ pr_info(PFX "%s: %u sectors (%u MB) protocol %d.%d\n",
+ g->disk_name,
+ port->vdisk_size, (port->vdisk_size >> (20 - 9)),
+ port->vio.ver.major, port->vio.ver.minor);
+
+ add_disk(g);
+
+ return 0;
+}
+
+static struct ldc_channel_config vdc_ldc_cfg = {
+ .event = vdc_event,
+ .mtu = 64,
+ .mode = LDC_MODE_UNRELIABLE,
+};
+
+static struct vio_driver_ops vdc_vio_ops = {
+ .send_attr = vdc_send_attr,
+ .handle_attr = vdc_handle_attr,
+ .handshake_complete = vdc_handshake_complete,
+};
+
+static void print_version(void)
+{
+ static int version_printed;
+
+ if (version_printed++ == 0)
+ printk(KERN_INFO "%s", version);
+}
+
+static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
+{
+ struct mdesc_handle *hp;
+ struct vdc_port *port;
+ int err;
+ const u64 *ldc_timeout;
+
+ print_version();
+
+ hp = mdesc_grab();
+
+ err = -ENODEV;
+ if ((vdev->dev_no << PARTITION_SHIFT) & ~(u64)MINORMASK) {
+ printk(KERN_ERR PFX "Port id [%llu] too large.\n",
+ vdev->dev_no);
+ goto err_out_release_mdesc;
+ }
+
+ port = kzalloc(sizeof(*port), GFP_KERNEL);
+ err = -ENOMEM;
+ if (!port) {
+ printk(KERN_ERR PFX "Cannot allocate vdc_port.\n");
+ goto err_out_release_mdesc;
+ }
+
+ if (vdev->dev_no >= 26)
+ snprintf(port->disk_name, sizeof(port->disk_name),
+ VDCBLK_NAME "%c%c",
+ 'a' + ((int)vdev->dev_no / 26) - 1,
+ 'a' + ((int)vdev->dev_no % 26));
+ else
+ snprintf(port->disk_name, sizeof(port->disk_name),
+ VDCBLK_NAME "%c", 'a' + ((int)vdev->dev_no % 26));
+ port->vdisk_size = -1;
+
+ /* Actual wall time may be double due to do_generic_file_read() doing
+ * a readahead I/O first, and once that fails it will try to read a
+ * single page.
+ */
+ ldc_timeout = mdesc_get_property(hp, vdev->mp, "vdc-timeout", NULL);
+ port->ldc_timeout = ldc_timeout ? *ldc_timeout : 0;
+ setup_timer(&port->ldc_reset_timer, vdc_ldc_reset_timer,
+ (unsigned long)port);
+ INIT_WORK(&port->ldc_reset_work, vdc_ldc_reset_work);
+
+ err = vio_driver_init(&port->vio, vdev, VDEV_DISK,
+ vdc_versions, ARRAY_SIZE(vdc_versions),
+ &vdc_vio_ops, port->disk_name);
+ if (err)
+ goto err_out_free_port;
+
+ port->vdisk_block_size = 512;
+ port->max_xfer_size = ((128 * 1024) / port->vdisk_block_size);
+ port->ring_cookies = ((port->max_xfer_size *
+ port->vdisk_block_size) / PAGE_SIZE) + 2;
+
+ err = vio_ldc_alloc(&port->vio, &vdc_ldc_cfg, port);
+ if (err)
+ goto err_out_free_port;
+
+ err = vdc_alloc_tx_ring(port);
+ if (err)
+ goto err_out_free_ldc;
+
+ err = probe_disk(port);
+ if (err)
+ goto err_out_free_tx_ring;
+
+ dev_set_drvdata(&vdev->dev, port);
+
+ mdesc_release(hp);
+
+ return 0;
+
+err_out_free_tx_ring:
+ vdc_free_tx_ring(port);
+
+err_out_free_ldc:
+ vio_ldc_free(&port->vio);
+
+err_out_free_port:
+ kfree(port);
+
+err_out_release_mdesc:
+ mdesc_release(hp);
+ return err;
+}
+
+static int vdc_port_remove(struct vio_dev *vdev)
+{
+ struct vdc_port *port = dev_get_drvdata(&vdev->dev);
+
+ if (port) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->vio.lock, flags);
+ blk_stop_queue(port->disk->queue);
+ spin_unlock_irqrestore(&port->vio.lock, flags);
+
+ flush_work(&port->ldc_reset_work);
+ del_timer_sync(&port->ldc_reset_timer);
+ del_timer_sync(&port->vio.timer);
+
+ del_gendisk(port->disk);
+ blk_cleanup_queue(port->disk->queue);
+ put_disk(port->disk);
+ port->disk = NULL;
+
+ vdc_free_tx_ring(port);
+ vio_ldc_free(&port->vio);
+
+ dev_set_drvdata(&vdev->dev, NULL);
+
+ kfree(port);
+ }
+ return 0;
+}
+
+static void vdc_requeue_inflight(struct vdc_port *port)
+{
+ struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+ u32 idx;
+
+ for (idx = dr->cons; idx != dr->prod; idx = vio_dring_next(dr, idx)) {
+ struct vio_disk_desc *desc = vio_dring_entry(dr, idx);
+ struct vdc_req_entry *rqe = &port->rq_arr[idx];
+ struct request *req;
+
+ ldc_unmap(port->vio.lp, desc->cookies, desc->ncookies);
+ desc->hdr.state = VIO_DESC_FREE;
+ dr->cons = vio_dring_next(dr, idx);
+
+ req = rqe->req;
+ if (req == NULL) {
+ vdc_end_special(port, desc);
+ continue;
+ }
+
+ rqe->req = NULL;
+ blk_requeue_request(port->disk->queue, req);
+ }
+}
+
+static void vdc_queue_drain(struct vdc_port *port)
+{
+ struct request *req;
+
+ while ((req = blk_fetch_request(port->disk->queue)) != NULL)
+ __blk_end_request_all(req, -EIO);
+}
+
+static void vdc_ldc_reset_timer(unsigned long _arg)
+{
+ struct vdc_port *port = (struct vdc_port *) _arg;
+ struct vio_driver_state *vio = &port->vio;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vio->lock, flags);
+ if (!(port->vio.hs_state & VIO_HS_COMPLETE)) {
+ pr_warn(PFX "%s ldc down %llu seconds, draining queue\n",
+ port->disk_name, port->ldc_timeout);
+ vdc_queue_drain(port);
+ vdc_blk_queue_start(port);
+ }
+ spin_unlock_irqrestore(&vio->lock, flags);
+}
+
+static void vdc_ldc_reset_work(struct work_struct *work)
+{
+ struct vdc_port *port;
+ struct vio_driver_state *vio;
+ unsigned long flags;
+
+ port = container_of(work, struct vdc_port, ldc_reset_work);
+ vio = &port->vio;
+
+ spin_lock_irqsave(&vio->lock, flags);
+ vdc_ldc_reset(port);
+ spin_unlock_irqrestore(&vio->lock, flags);
+}
+
+static void vdc_ldc_reset(struct vdc_port *port)
+{
+ int err;
+
+ assert_spin_locked(&port->vio.lock);
+
+ pr_warn(PFX "%s ldc link reset\n", port->disk_name);
+ blk_stop_queue(port->disk->queue);
+ vdc_requeue_inflight(port);
+ vdc_port_down(port);
+
+ err = vio_ldc_alloc(&port->vio, &vdc_ldc_cfg, port);
+ if (err) {
+ pr_err(PFX "%s vio_ldc_alloc:%d\n", port->disk_name, err);
+ return;
+ }
+
+ err = vdc_alloc_tx_ring(port);
+ if (err) {
+ pr_err(PFX "%s vio_alloc_tx_ring:%d\n", port->disk_name, err);
+ goto err_free_ldc;
+ }
+
+ if (port->ldc_timeout)
+ mod_timer(&port->ldc_reset_timer,
+ round_jiffies(jiffies + HZ * port->ldc_timeout));
+ mod_timer(&port->vio.timer, round_jiffies(jiffies + HZ));
+ return;
+
+err_free_ldc:
+ vio_ldc_free(&port->vio);
+}
+
+static const struct vio_device_id vdc_port_match[] = {
+ {
+ .type = "vdc-port",
+ },
+ {},
+};
+MODULE_DEVICE_TABLE(vio, vdc_port_match);
+
+static struct vio_driver vdc_port_driver = {
+ .id_table = vdc_port_match,
+ .probe = vdc_port_probe,
+ .remove = vdc_port_remove,
+ .name = "vdc_port",
+};
+
+static int __init vdc_init(void)
+{
+ int err;
+
+ sunvdc_wq = alloc_workqueue("sunvdc", 0, 0);
+ if (!sunvdc_wq)
+ return -ENOMEM;
+
+ err = register_blkdev(0, VDCBLK_NAME);
+ if (err < 0)
+ goto out_free_wq;
+
+ vdc_major = err;
+
+ err = vio_register_driver(&vdc_port_driver);
+ if (err)
+ goto out_unregister_blkdev;
+
+ return 0;
+
+out_unregister_blkdev:
+ unregister_blkdev(vdc_major, VDCBLK_NAME);
+ vdc_major = 0;
+
+out_free_wq:
+ destroy_workqueue(sunvdc_wq);
+ return err;
+}
+
+static void __exit vdc_exit(void)
+{
+ vio_unregister_driver(&vdc_port_driver);
+ unregister_blkdev(vdc_major, VDCBLK_NAME);
+ destroy_workqueue(sunvdc_wq);
+}
+
+module_init(vdc_init);
+module_exit(vdc_exit);
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
new file mode 100644
index 000000000..b5afd495d
--- /dev/null
+++ b/drivers/block/swim.c
@@ -0,0 +1,994 @@
+/*
+ * Driver for SWIM (Sander Woz Integrated Machine) floppy controller
+ *
+ * Copyright (C) 2004,2008 Laurent Vivier <Laurent@lvivier.info>
+ *
+ * based on Alastair Bridgewater SWIM analysis, 2001
+ * based on SWIM3 driver (c) Paul Mackerras, 1996
+ * based on netBSD IWM driver (c) 1997, 1998 Hauke Fath.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * 2004-08-21 (lv) - Initial implementation
+ * 2008-10-30 (lv) - Port to 2.6
+ */
+
+#include <linux/module.h>
+#include <linux/fd.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include <linux/hdreg.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/platform_device.h>
+
+#include <asm/mac_via.h>
+
+#define CARDNAME "swim"
+
+struct sector_header {
+ unsigned char side;
+ unsigned char track;
+ unsigned char sector;
+ unsigned char size;
+ unsigned char crc0;
+ unsigned char crc1;
+} __attribute__((packed));
+
+#define DRIVER_VERSION "Version 0.2 (2008-10-30)"
+
+#define REG(x) unsigned char x, x ## _pad[0x200 - 1];
+
+struct swim {
+ REG(write_data)
+ REG(write_mark)
+ REG(write_CRC)
+ REG(write_parameter)
+ REG(write_phase)
+ REG(write_setup)
+ REG(write_mode0)
+ REG(write_mode1)
+
+ REG(read_data)
+ REG(read_mark)
+ REG(read_error)
+ REG(read_parameter)
+ REG(read_phase)
+ REG(read_setup)
+ REG(read_status)
+ REG(read_handshake)
+} __attribute__((packed));
+
+#define swim_write(base, reg, v) out_8(&(base)->write_##reg, (v))
+#define swim_read(base, reg) in_8(&(base)->read_##reg)
+
+/* IWM registers */
+
+struct iwm {
+ REG(ph0L)
+ REG(ph0H)
+ REG(ph1L)
+ REG(ph1H)
+ REG(ph2L)
+ REG(ph2H)
+ REG(ph3L)
+ REG(ph3H)
+ REG(mtrOff)
+ REG(mtrOn)
+ REG(intDrive)
+ REG(extDrive)
+ REG(q6L)
+ REG(q6H)
+ REG(q7L)
+ REG(q7H)
+} __attribute__((packed));
+
+#define iwm_write(base, reg, v) out_8(&(base)->reg, (v))
+#define iwm_read(base, reg) in_8(&(base)->reg)
+
+/* bits in phase register */
+
+#define SEEK_POSITIVE 0x070
+#define SEEK_NEGATIVE 0x074
+#define STEP 0x071
+#define MOTOR_ON 0x072
+#define MOTOR_OFF 0x076
+#define INDEX 0x073
+#define EJECT 0x077
+#define SETMFM 0x171
+#define SETGCR 0x175
+
+#define RELAX 0x033
+#define LSTRB 0x008
+
+#define CA_MASK 0x077
+
+/* Select values for swim_select and swim_readbit */
+
+#define READ_DATA_0 0x074
+#define TWOMEG_DRIVE 0x075
+#define SINGLE_SIDED 0x076
+#define DRIVE_PRESENT 0x077
+#define DISK_IN 0x170
+#define WRITE_PROT 0x171
+#define TRACK_ZERO 0x172
+#define TACHO 0x173
+#define READ_DATA_1 0x174
+#define MFM_MODE 0x175
+#define SEEK_COMPLETE 0x176
+#define ONEMEG_MEDIA 0x177
+
+/* Bits in handshake register */
+
+#define MARK_BYTE 0x01
+#define CRC_ZERO 0x02
+#define RDDATA 0x04
+#define SENSE 0x08
+#define MOTEN 0x10
+#define ERROR 0x20
+#define DAT2BYTE 0x40
+#define DAT1BYTE 0x80
+
+/* bits in setup register */
+
+#define S_INV_WDATA 0x01
+#define S_3_5_SELECT 0x02
+#define S_GCR 0x04
+#define S_FCLK_DIV2 0x08
+#define S_ERROR_CORR 0x10
+#define S_IBM_DRIVE 0x20
+#define S_GCR_WRITE 0x40
+#define S_TIMEOUT 0x80
+
+/* bits in mode register */
+
+#define CLFIFO 0x01
+#define ENBL1 0x02
+#define ENBL2 0x04
+#define ACTION 0x08
+#define WRITE_MODE 0x10
+#define HEDSEL 0x20
+#define MOTON 0x80
+
+/*----------------------------------------------------------------------------*/
+
+enum drive_location {
+ INTERNAL_DRIVE = 0x02,
+ EXTERNAL_DRIVE = 0x04,
+};
+
+enum media_type {
+ DD_MEDIA,
+ HD_MEDIA,
+};
+
+struct floppy_state {
+
+ /* physical properties */
+
+ enum drive_location location; /* internal or external drive */
+ int head_number; /* single- or double-sided drive */
+
+ /* media */
+
+ int disk_in;
+ int ejected;
+ enum media_type type;
+ int write_protected;
+
+ int total_secs;
+ int secpercyl;
+ int secpertrack;
+
+ /* in-use information */
+
+ int track;
+ int ref_count;
+
+ struct gendisk *disk;
+
+ /* parent controller */
+
+ struct swim_priv *swd;
+};
+
+enum motor_action {
+ OFF,
+ ON,
+};
+
+enum head {
+ LOWER_HEAD = 0,
+ UPPER_HEAD = 1,
+};
+
+#define FD_MAX_UNIT 2
+
+struct swim_priv {
+ struct swim __iomem *base;
+ spinlock_t lock;
+ struct request_queue *queue;
+ int floppy_count;
+ struct floppy_state unit[FD_MAX_UNIT];
+};
+
+extern int swim_read_sector_header(struct swim __iomem *base,
+ struct sector_header *header);
+extern int swim_read_sector_data(struct swim __iomem *base,
+ unsigned char *data);
+
+static DEFINE_MUTEX(swim_mutex);
+static inline void set_swim_mode(struct swim __iomem *base, int enable)
+{
+ struct iwm __iomem *iwm_base;
+ unsigned long flags;
+
+ if (!enable) {
+ swim_write(base, mode0, 0xf8);
+ return;
+ }
+
+ iwm_base = (struct iwm __iomem *)base;
+ local_irq_save(flags);
+
+ iwm_read(iwm_base, q7L);
+ iwm_read(iwm_base, mtrOff);
+ iwm_read(iwm_base, q6H);
+
+ iwm_write(iwm_base, q7H, 0x57);
+ iwm_write(iwm_base, q7H, 0x17);
+ iwm_write(iwm_base, q7H, 0x57);
+ iwm_write(iwm_base, q7H, 0x57);
+
+ local_irq_restore(flags);
+}
+
+static inline int get_swim_mode(struct swim __iomem *base)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ swim_write(base, phase, 0xf5);
+ if (swim_read(base, phase) != 0xf5)
+ goto is_iwm;
+ swim_write(base, phase, 0xf6);
+ if (swim_read(base, phase) != 0xf6)
+ goto is_iwm;
+ swim_write(base, phase, 0xf7);
+ if (swim_read(base, phase) != 0xf7)
+ goto is_iwm;
+ local_irq_restore(flags);
+ return 1;
+is_iwm:
+ local_irq_restore(flags);
+ return 0;
+}
+
+static inline void swim_select(struct swim __iomem *base, int sel)
+{
+ swim_write(base, phase, RELAX);
+
+ via1_set_head(sel & 0x100);
+
+ swim_write(base, phase, sel & CA_MASK);
+}
+
+static inline void swim_action(struct swim __iomem *base, int action)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ swim_select(base, action);
+ udelay(1);
+ swim_write(base, phase, (LSTRB<<4) | LSTRB);
+ udelay(1);
+ swim_write(base, phase, (LSTRB<<4) | ((~LSTRB) & 0x0F));
+ udelay(1);
+
+ local_irq_restore(flags);
+}
+
+static inline int swim_readbit(struct swim __iomem *base, int bit)
+{
+ int stat;
+
+ swim_select(base, bit);
+
+ udelay(10);
+
+ stat = swim_read(base, handshake);
+
+ return (stat & SENSE) == 0;
+}
+
+static inline void swim_drive(struct swim __iomem *base,
+ enum drive_location location)
+{
+ if (location == INTERNAL_DRIVE) {
+ swim_write(base, mode0, EXTERNAL_DRIVE); /* clear drive 1 bit */
+ swim_write(base, mode1, INTERNAL_DRIVE); /* set drive 0 bit */
+ } else if (location == EXTERNAL_DRIVE) {
+ swim_write(base, mode0, INTERNAL_DRIVE); /* clear drive 0 bit */
+ swim_write(base, mode1, EXTERNAL_DRIVE); /* set drive 1 bit */
+ }
+}
+
+static inline void swim_motor(struct swim __iomem *base,
+ enum motor_action action)
+{
+ if (action == ON) {
+ int i;
+
+ swim_action(base, MOTOR_ON);
+
+ for (i = 0; i < 2*HZ; i++) {
+ swim_select(base, RELAX);
+ if (swim_readbit(base, MOTOR_ON))
+ break;
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(1);
+ }
+ } else if (action == OFF) {
+ swim_action(base, MOTOR_OFF);
+ swim_select(base, RELAX);
+ }
+}
+
+static inline void swim_eject(struct swim __iomem *base)
+{
+ int i;
+
+ swim_action(base, EJECT);
+
+ for (i = 0; i < 2*HZ; i++) {
+ swim_select(base, RELAX);
+ if (!swim_readbit(base, DISK_IN))
+ break;
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(1);
+ }
+ swim_select(base, RELAX);
+}
+
+static inline void swim_head(struct swim __iomem *base, enum head head)
+{
+ /* wait drive is ready */
+
+ if (head == UPPER_HEAD)
+ swim_select(base, READ_DATA_1);
+ else if (head == LOWER_HEAD)
+ swim_select(base, READ_DATA_0);
+}
+
+static inline int swim_step(struct swim __iomem *base)
+{
+ int wait;
+
+ swim_action(base, STEP);
+
+ for (wait = 0; wait < HZ; wait++) {
+
+ current->state = TASK_INTERRUPTIBLE;
+ schedule_timeout(1);
+
+ swim_select(base, RELAX);
+ if (!swim_readbit(base, STEP))
+ return 0;
+ }
+ return -1;
+}
+
+static inline int swim_track00(struct swim __iomem *base)
+{
+ int try;
+
+ swim_action(base, SEEK_NEGATIVE);
+
+ for (try = 0; try < 100; try++) {
+
+ swim_select(base, RELAX);
+ if (swim_readbit(base, TRACK_ZERO))
+ break;
+
+ if (swim_step(base))
+ return -1;
+ }
+
+ if (swim_readbit(base, TRACK_ZERO))
+ return 0;
+
+ return -1;
+}
+
+static inline int swim_seek(struct swim __iomem *base, int step)
+{
+ if (step == 0)
+ return 0;
+
+ if (step < 0) {
+ swim_action(base, SEEK_NEGATIVE);
+ step = -step;
+ } else
+ swim_action(base, SEEK_POSITIVE);
+
+ for ( ; step > 0; step--) {
+ if (swim_step(base))
+ return -1;
+ }
+
+ return 0;
+}
+
+static inline int swim_track(struct floppy_state *fs, int track)
+{
+ struct swim __iomem *base = fs->swd->base;
+ int ret;
+
+ ret = swim_seek(base, track - fs->track);
+
+ if (ret == 0)
+ fs->track = track;
+ else {
+ swim_track00(base);
+ fs->track = 0;
+ }
+
+ return ret;
+}
+
+static int floppy_eject(struct floppy_state *fs)
+{
+ struct swim __iomem *base = fs->swd->base;
+
+ swim_drive(base, fs->location);
+ swim_motor(base, OFF);
+ swim_eject(base);
+
+ fs->disk_in = 0;
+ fs->ejected = 1;
+
+ return 0;
+}
+
+static inline int swim_read_sector(struct floppy_state *fs,
+ int side, int track,
+ int sector, unsigned char *buffer)
+{
+ struct swim __iomem *base = fs->swd->base;
+ unsigned long flags;
+ struct sector_header header;
+ int ret = -1;
+ short i;
+
+ swim_track(fs, track);
+
+ swim_write(base, mode1, MOTON);
+ swim_head(base, side);
+ swim_write(base, mode0, side);
+
+ local_irq_save(flags);
+ for (i = 0; i < 36; i++) {
+ ret = swim_read_sector_header(base, &header);
+ if (!ret && (header.sector == sector)) {
+ /* found */
+
+ ret = swim_read_sector_data(base, buffer);
+ break;
+ }
+ }
+ local_irq_restore(flags);
+
+ swim_write(base, mode0, MOTON);
+
+ if ((header.side != side) || (header.track != track) ||
+ (header.sector != sector))
+ return 0;
+
+ return ret;
+}
+
+static int floppy_read_sectors(struct floppy_state *fs,
+ int req_sector, int sectors_nb,
+ unsigned char *buffer)
+{
+ struct swim __iomem *base = fs->swd->base;
+ int ret;
+ int side, track, sector;
+ int i, try;
+
+
+ swim_drive(base, fs->location);
+ for (i = req_sector; i < req_sector + sectors_nb; i++) {
+ int x;
+ track = i / fs->secpercyl;
+ x = i % fs->secpercyl;
+ side = x / fs->secpertrack;
+ sector = x % fs->secpertrack + 1;
+
+ try = 5;
+ do {
+ ret = swim_read_sector(fs, side, track, sector,
+ buffer);
+ if (try-- == 0)
+ return -EIO;
+ } while (ret != 512);
+
+ buffer += ret;
+ }
+
+ return 0;
+}
+
+static void redo_fd_request(struct request_queue *q)
+{
+ struct request *req;
+ struct floppy_state *fs;
+
+ req = blk_fetch_request(q);
+ while (req) {
+ int err = -EIO;
+
+ fs = req->rq_disk->private_data;
+ if (blk_rq_pos(req) >= fs->total_secs)
+ goto done;
+ if (!fs->disk_in)
+ goto done;
+ if (rq_data_dir(req) == WRITE && fs->write_protected)
+ goto done;
+
+ switch (rq_data_dir(req)) {
+ case WRITE:
+ /* NOT IMPLEMENTED */
+ break;
+ case READ:
+ err = floppy_read_sectors(fs, blk_rq_pos(req),
+ blk_rq_cur_sectors(req),
+ bio_data(req->bio));
+ break;
+ }
+ done:
+ if (!__blk_end_request_cur(req, err))
+ req = blk_fetch_request(q);
+ }
+}
+
+static void do_fd_request(struct request_queue *q)
+{
+ redo_fd_request(q);
+}
+
+static struct floppy_struct floppy_type[4] = {
+ { 0, 0, 0, 0, 0, 0x00, 0x00, 0x00, 0x00, NULL }, /* no testing */
+ { 720, 9, 1, 80, 0, 0x2A, 0x02, 0xDF, 0x50, NULL }, /* 360KB SS 3.5"*/
+ { 1440, 9, 2, 80, 0, 0x2A, 0x02, 0xDF, 0x50, NULL }, /* 720KB 3.5" */
+ { 2880, 18, 2, 80, 0, 0x1B, 0x00, 0xCF, 0x6C, NULL }, /* 1.44MB 3.5" */
+};
+
+static int get_floppy_geometry(struct floppy_state *fs, int type,
+ struct floppy_struct **g)
+{
+ if (type >= ARRAY_SIZE(floppy_type))
+ return -EINVAL;
+
+ if (type)
+ *g = &floppy_type[type];
+ else if (fs->type == HD_MEDIA) /* High-Density media */
+ *g = &floppy_type[3];
+ else if (fs->head_number == 2) /* double-sided */
+ *g = &floppy_type[2];
+ else
+ *g = &floppy_type[1];
+
+ return 0;
+}
+
+static void setup_medium(struct floppy_state *fs)
+{
+ struct swim __iomem *base = fs->swd->base;
+
+ if (swim_readbit(base, DISK_IN)) {
+ struct floppy_struct *g;
+ fs->disk_in = 1;
+ fs->write_protected = swim_readbit(base, WRITE_PROT);
+ fs->type = swim_readbit(base, ONEMEG_MEDIA);
+
+ if (swim_track00(base))
+ printk(KERN_ERR
+ "SWIM: cannot move floppy head to track 0\n");
+
+ swim_track00(base);
+
+ get_floppy_geometry(fs, 0, &g);
+ fs->total_secs = g->size;
+ fs->secpercyl = g->head * g->sect;
+ fs->secpertrack = g->sect;
+ fs->track = 0;
+ } else {
+ fs->disk_in = 0;
+ }
+}
+
+static int floppy_open(struct block_device *bdev, fmode_t mode)
+{
+ struct floppy_state *fs = bdev->bd_disk->private_data;
+ struct swim __iomem *base = fs->swd->base;
+ int err;
+
+ if (fs->ref_count == -1 || (fs->ref_count && mode & FMODE_EXCL))
+ return -EBUSY;
+
+ if (mode & FMODE_EXCL)
+ fs->ref_count = -1;
+ else
+ fs->ref_count++;
+
+ swim_write(base, setup, S_IBM_DRIVE | S_FCLK_DIV2);
+ udelay(10);
+ swim_drive(base, INTERNAL_DRIVE);
+ swim_motor(base, ON);
+ swim_action(base, SETMFM);
+ if (fs->ejected)
+ setup_medium(fs);
+ if (!fs->disk_in) {
+ err = -ENXIO;
+ goto out;
+ }
+
+ if (mode & FMODE_NDELAY)
+ return 0;
+
+ if (mode & (FMODE_READ|FMODE_WRITE)) {
+ check_disk_change(bdev);
+ if ((mode & FMODE_WRITE) && fs->write_protected) {
+ err = -EROFS;
+ goto out;
+ }
+ }
+ return 0;
+out:
+ if (fs->ref_count < 0)
+ fs->ref_count = 0;
+ else if (fs->ref_count > 0)
+ --fs->ref_count;
+
+ if (fs->ref_count == 0)
+ swim_motor(base, OFF);
+ return err;
+}
+
+static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
+{
+ int ret;
+
+ mutex_lock(&swim_mutex);
+ ret = floppy_open(bdev, mode);
+ mutex_unlock(&swim_mutex);
+
+ return ret;
+}
+
+static void floppy_release(struct gendisk *disk, fmode_t mode)
+{
+ struct floppy_state *fs = disk->private_data;
+ struct swim __iomem *base = fs->swd->base;
+
+ mutex_lock(&swim_mutex);
+ if (fs->ref_count < 0)
+ fs->ref_count = 0;
+ else if (fs->ref_count > 0)
+ --fs->ref_count;
+
+ if (fs->ref_count == 0)
+ swim_motor(base, OFF);
+ mutex_unlock(&swim_mutex);
+}
+
+static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long param)
+{
+ struct floppy_state *fs = bdev->bd_disk->private_data;
+ int err;
+
+ if ((cmd & 0x80) && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ switch (cmd) {
+ case FDEJECT:
+ if (fs->ref_count != 1)
+ return -EBUSY;
+ mutex_lock(&swim_mutex);
+ err = floppy_eject(fs);
+ mutex_unlock(&swim_mutex);
+ return err;
+
+ case FDGETPRM:
+ if (copy_to_user((void __user *) param, (void *) &floppy_type,
+ sizeof(struct floppy_struct)))
+ return -EFAULT;
+ break;
+
+ default:
+ printk(KERN_DEBUG "SWIM floppy_ioctl: unknown cmd %d\n",
+ cmd);
+ return -ENOSYS;
+ }
+ return 0;
+}
+
+static int floppy_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ struct floppy_state *fs = bdev->bd_disk->private_data;
+ struct floppy_struct *g;
+ int ret;
+
+ ret = get_floppy_geometry(fs, 0, &g);
+ if (ret)
+ return ret;
+
+ geo->heads = g->head;
+ geo->sectors = g->sect;
+ geo->cylinders = g->track;
+
+ return 0;
+}
+
+static unsigned int floppy_check_events(struct gendisk *disk,
+ unsigned int clearing)
+{
+ struct floppy_state *fs = disk->private_data;
+
+ return fs->ejected ? DISK_EVENT_MEDIA_CHANGE : 0;
+}
+
+static int floppy_revalidate(struct gendisk *disk)
+{
+ struct floppy_state *fs = disk->private_data;
+ struct swim __iomem *base = fs->swd->base;
+
+ swim_drive(base, fs->location);
+
+ if (fs->ejected)
+ setup_medium(fs);
+
+ if (!fs->disk_in)
+ swim_motor(base, OFF);
+ else
+ fs->ejected = 0;
+
+ return !fs->disk_in;
+}
+
+static const struct block_device_operations floppy_fops = {
+ .owner = THIS_MODULE,
+ .open = floppy_unlocked_open,
+ .release = floppy_release,
+ .ioctl = floppy_ioctl,
+ .getgeo = floppy_getgeo,
+ .check_events = floppy_check_events,
+ .revalidate_disk = floppy_revalidate,
+};
+
+static struct kobject *floppy_find(dev_t dev, int *part, void *data)
+{
+ struct swim_priv *swd = data;
+ int drive = (*part & 3);
+
+ if (drive > swd->floppy_count)
+ return NULL;
+
+ *part = 0;
+ return get_disk(swd->unit[drive].disk);
+}
+
+static int swim_add_floppy(struct swim_priv *swd, enum drive_location location)
+{
+ struct floppy_state *fs = &swd->unit[swd->floppy_count];
+ struct swim __iomem *base = swd->base;
+
+ fs->location = location;
+
+ swim_drive(base, location);
+
+ swim_motor(base, OFF);
+
+ if (swim_readbit(base, SINGLE_SIDED))
+ fs->head_number = 1;
+ else
+ fs->head_number = 2;
+ fs->ref_count = 0;
+ fs->ejected = 1;
+
+ swd->floppy_count++;
+
+ return 0;
+}
+
+static int swim_floppy_init(struct swim_priv *swd)
+{
+ int err;
+ int drive;
+ struct swim __iomem *base = swd->base;
+
+ /* scan floppy drives */
+
+ swim_drive(base, INTERNAL_DRIVE);
+ if (swim_readbit(base, DRIVE_PRESENT))
+ swim_add_floppy(swd, INTERNAL_DRIVE);
+ swim_drive(base, EXTERNAL_DRIVE);
+ if (swim_readbit(base, DRIVE_PRESENT))
+ swim_add_floppy(swd, EXTERNAL_DRIVE);
+
+ /* register floppy drives */
+
+ err = register_blkdev(FLOPPY_MAJOR, "fd");
+ if (err) {
+ printk(KERN_ERR "Unable to get major %d for SWIM floppy\n",
+ FLOPPY_MAJOR);
+ return -EBUSY;
+ }
+
+ for (drive = 0; drive < swd->floppy_count; drive++) {
+ swd->unit[drive].disk = alloc_disk(1);
+ if (swd->unit[drive].disk == NULL) {
+ err = -ENOMEM;
+ goto exit_put_disks;
+ }
+ swd->unit[drive].swd = swd;
+ }
+
+ spin_lock_init(&swd->lock);
+ swd->queue = blk_init_queue(do_fd_request, &swd->lock);
+ if (!swd->queue) {
+ err = -ENOMEM;
+ goto exit_put_disks;
+ }
+
+ for (drive = 0; drive < swd->floppy_count; drive++) {
+ swd->unit[drive].disk->flags = GENHD_FL_REMOVABLE;
+ swd->unit[drive].disk->major = FLOPPY_MAJOR;
+ swd->unit[drive].disk->first_minor = drive;
+ sprintf(swd->unit[drive].disk->disk_name, "fd%d", drive);
+ swd->unit[drive].disk->fops = &floppy_fops;
+ swd->unit[drive].disk->private_data = &swd->unit[drive];
+ swd->unit[drive].disk->queue = swd->queue;
+ set_capacity(swd->unit[drive].disk, 2880);
+ add_disk(swd->unit[drive].disk);
+ }
+
+ blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE,
+ floppy_find, NULL, swd);
+
+ return 0;
+
+exit_put_disks:
+ unregister_blkdev(FLOPPY_MAJOR, "fd");
+ while (drive--)
+ put_disk(swd->unit[drive].disk);
+ return err;
+}
+
+static int swim_probe(struct platform_device *dev)
+{
+ struct resource *res;
+ struct swim __iomem *swim_base;
+ struct swim_priv *swd;
+ int ret;
+
+ res = platform_get_resource(dev, IORESOURCE_MEM, 0);
+ if (!res) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ if (!request_mem_region(res->start, resource_size(res), CARDNAME)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ swim_base = ioremap(res->start, resource_size(res));
+ if (!swim_base) {
+ ret = -ENOMEM;
+ goto out_release_io;
+ }
+
+ /* probe device */
+
+ set_swim_mode(swim_base, 1);
+ if (!get_swim_mode(swim_base)) {
+ printk(KERN_INFO "SWIM device not found !\n");
+ ret = -ENODEV;
+ goto out_iounmap;
+ }
+
+ /* set platform driver data */
+
+ swd = kzalloc(sizeof(struct swim_priv), GFP_KERNEL);
+ if (!swd) {
+ ret = -ENOMEM;
+ goto out_iounmap;
+ }
+ platform_set_drvdata(dev, swd);
+
+ swd->base = swim_base;
+
+ ret = swim_floppy_init(swd);
+ if (ret)
+ goto out_kfree;
+
+ return 0;
+
+out_kfree:
+ kfree(swd);
+out_iounmap:
+ iounmap(swim_base);
+out_release_io:
+ release_mem_region(res->start, resource_size(res));
+out:
+ return ret;
+}
+
+static int swim_remove(struct platform_device *dev)
+{
+ struct swim_priv *swd = platform_get_drvdata(dev);
+ int drive;
+ struct resource *res;
+
+ blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
+
+ for (drive = 0; drive < swd->floppy_count; drive++) {
+ del_gendisk(swd->unit[drive].disk);
+ put_disk(swd->unit[drive].disk);
+ }
+
+ unregister_blkdev(FLOPPY_MAJOR, "fd");
+
+ blk_cleanup_queue(swd->queue);
+
+ /* eject floppies */
+
+ for (drive = 0; drive < swd->floppy_count; drive++)
+ floppy_eject(&swd->unit[drive]);
+
+ iounmap(swd->base);
+
+ res = platform_get_resource(dev, IORESOURCE_MEM, 0);
+ if (res)
+ release_mem_region(res->start, resource_size(res));
+
+ kfree(swd);
+
+ return 0;
+}
+
+static struct platform_driver swim_driver = {
+ .probe = swim_probe,
+ .remove = swim_remove,
+ .driver = {
+ .name = CARDNAME,
+ },
+};
+
+static int __init swim_init(void)
+{
+ printk(KERN_INFO "SWIM floppy driver %s\n", DRIVER_VERSION);
+
+ return platform_driver_register(&swim_driver);
+}
+module_init(swim_init);
+
+static void __exit swim_exit(void)
+{
+ platform_driver_unregister(&swim_driver);
+}
+module_exit(swim_exit);
+
+MODULE_DESCRIPTION("Driver for SWIM floppy controller");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Laurent Vivier <laurent@lvivier.info>");
+MODULE_ALIAS_BLOCKDEV_MAJOR(FLOPPY_MAJOR);
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
new file mode 100644
index 000000000..c264f2d28
--- /dev/null
+++ b/drivers/block/swim3.c
@@ -0,0 +1,1289 @@
+/*
+ * Driver for the SWIM3 (Super Woz Integrated Machine 3)
+ * floppy controller found on Power Macintoshes.
+ *
+ * Copyright (C) 1996 Paul Mackerras.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * TODO:
+ * handle 2 drives
+ * handle GCR disks
+ */
+
+#undef DEBUG
+
+#include <linux/stddef.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/delay.h>
+#include <linux/fd.h>
+#include <linux/ioctl.h>
+#include <linux/blkdev.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+#include <asm/dbdma.h>
+#include <asm/prom.h>
+#include <asm/uaccess.h>
+#include <asm/mediabay.h>
+#include <asm/machdep.h>
+#include <asm/pmac_feature.h>
+
+#define MAX_FLOPPIES 2
+
+static DEFINE_MUTEX(swim3_mutex);
+static struct gendisk *disks[MAX_FLOPPIES];
+
+enum swim_state {
+ idle,
+ locating,
+ seeking,
+ settling,
+ do_transfer,
+ jogging,
+ available,
+ revalidating,
+ ejecting
+};
+
+#define REG(x) unsigned char x; char x ## _pad[15];
+
+/*
+ * The names for these registers mostly represent speculation on my part.
+ * It will be interesting to see how close they are to the names Apple uses.
+ */
+struct swim3 {
+ REG(data);
+ REG(timer); /* counts down at 1MHz */
+ REG(error);
+ REG(mode);
+ REG(select); /* controls CA0, CA1, CA2 and LSTRB signals */
+ REG(setup);
+ REG(control); /* writing bits clears them */
+ REG(status); /* writing bits sets them in control */
+ REG(intr);
+ REG(nseek); /* # tracks to seek */
+ REG(ctrack); /* current track number */
+ REG(csect); /* current sector number */
+ REG(gap3); /* size of gap 3 in track format */
+ REG(sector); /* sector # to read or write */
+ REG(nsect); /* # sectors to read or write */
+ REG(intr_enable);
+};
+
+#define control_bic control
+#define control_bis status
+
+/* Bits in select register */
+#define CA_MASK 7
+#define LSTRB 8
+
+/* Bits in control register */
+#define DO_SEEK 0x80
+#define FORMAT 0x40
+#define SELECT 0x20
+#define WRITE_SECTORS 0x10
+#define DO_ACTION 0x08
+#define DRIVE2_ENABLE 0x04
+#define DRIVE_ENABLE 0x02
+#define INTR_ENABLE 0x01
+
+/* Bits in status register */
+#define FIFO_1BYTE 0x80
+#define FIFO_2BYTE 0x40
+#define ERROR 0x20
+#define DATA 0x08
+#define RDDATA 0x04
+#define INTR_PENDING 0x02
+#define MARK_BYTE 0x01
+
+/* Bits in intr and intr_enable registers */
+#define ERROR_INTR 0x20
+#define DATA_CHANGED 0x10
+#define TRANSFER_DONE 0x08
+#define SEEN_SECTOR 0x04
+#define SEEK_DONE 0x02
+#define TIMER_DONE 0x01
+
+/* Bits in error register */
+#define ERR_DATA_CRC 0x80
+#define ERR_ADDR_CRC 0x40
+#define ERR_OVERRUN 0x04
+#define ERR_UNDERRUN 0x01
+
+/* Bits in setup register */
+#define S_SW_RESET 0x80
+#define S_GCR_WRITE 0x40
+#define S_IBM_DRIVE 0x20
+#define S_TEST_MODE 0x10
+#define S_FCLK_DIV2 0x08
+#define S_GCR 0x04
+#define S_COPY_PROT 0x02
+#define S_INV_WDATA 0x01
+
+/* Select values for swim3_action */
+#define SEEK_POSITIVE 0
+#define SEEK_NEGATIVE 4
+#define STEP 1
+#define MOTOR_ON 2
+#define MOTOR_OFF 6
+#define INDEX 3
+#define EJECT 7
+#define SETMFM 9
+#define SETGCR 13
+
+/* Select values for swim3_select and swim3_readbit */
+#define STEP_DIR 0
+#define STEPPING 1
+#define MOTOR_ON 2
+#define RELAX 3 /* also eject in progress */
+#define READ_DATA_0 4
+#define TWOMEG_DRIVE 5
+#define SINGLE_SIDED 6 /* drive or diskette is 4MB type? */
+#define DRIVE_PRESENT 7
+#define DISK_IN 8
+#define WRITE_PROT 9
+#define TRACK_ZERO 10
+#define TACHO 11
+#define READ_DATA_1 12
+#define MFM_MODE 13
+#define SEEK_COMPLETE 14
+#define ONEMEG_MEDIA 15
+
+/* Definitions of values used in writing and formatting */
+#define DATA_ESCAPE 0x99
+#define GCR_SYNC_EXC 0x3f
+#define GCR_SYNC_CONV 0x80
+#define GCR_FIRST_MARK 0xd5
+#define GCR_SECOND_MARK 0xaa
+#define GCR_ADDR_MARK "\xd5\xaa\x00"
+#define GCR_DATA_MARK "\xd5\xaa\x0b"
+#define GCR_SLIP_BYTE "\x27\xaa"
+#define GCR_SELF_SYNC "\x3f\xbf\x1e\x34\x3c\x3f"
+
+#define DATA_99 "\x99\x99"
+#define MFM_ADDR_MARK "\x99\xa1\x99\xa1\x99\xa1\x99\xfe"
+#define MFM_INDEX_MARK "\x99\xc2\x99\xc2\x99\xc2\x99\xfc"
+#define MFM_GAP_LEN 12
+
+struct floppy_state {
+ enum swim_state state;
+ struct swim3 __iomem *swim3; /* hardware registers */
+ struct dbdma_regs __iomem *dma; /* DMA controller registers */
+ int swim3_intr; /* interrupt number for SWIM3 */
+ int dma_intr; /* interrupt number for DMA channel */
+ int cur_cyl; /* cylinder head is on, or -1 */
+ int cur_sector; /* last sector we saw go past */
+ int req_cyl; /* the cylinder for the current r/w request */
+ int head; /* head number ditto */
+ int req_sector; /* sector number ditto */
+ int scount; /* # sectors we're transferring at present */
+ int retries;
+ int settle_time;
+ int secpercyl; /* disk geometry information */
+ int secpertrack;
+ int total_secs;
+ int write_prot; /* 1 if write-protected, 0 if not, -1 dunno */
+ struct dbdma_cmd *dma_cmd;
+ int ref_count;
+ int expect_cyl;
+ struct timer_list timeout;
+ int timeout_pending;
+ int ejected;
+ wait_queue_head_t wait;
+ int wanted;
+ struct macio_dev *mdev;
+ char dbdma_cmd_space[5 * sizeof(struct dbdma_cmd)];
+ int index;
+ struct request *cur_req;
+};
+
+#define swim3_err(fmt, arg...) dev_err(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+#define swim3_warn(fmt, arg...) dev_warn(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+#define swim3_info(fmt, arg...) dev_info(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+
+#ifdef DEBUG
+#define swim3_dbg(fmt, arg...) dev_dbg(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+#else
+#define swim3_dbg(fmt, arg...) do { } while(0)
+#endif
+
+static struct floppy_state floppy_states[MAX_FLOPPIES];
+static int floppy_count = 0;
+static DEFINE_SPINLOCK(swim3_lock);
+
+static unsigned short write_preamble[] = {
+ 0x4e4e, 0x4e4e, 0x4e4e, 0x4e4e, 0x4e4e, /* gap field */
+ 0, 0, 0, 0, 0, 0, /* sync field */
+ 0x99a1, 0x99a1, 0x99a1, 0x99fb, /* data address mark */
+ 0x990f /* no escape for 512 bytes */
+};
+
+static unsigned short write_postamble[] = {
+ 0x9904, /* insert CRC */
+ 0x4e4e, 0x4e4e,
+ 0x9908, /* stop writing */
+ 0, 0, 0, 0, 0, 0
+};
+
+static void seek_track(struct floppy_state *fs, int n);
+static void init_dma(struct dbdma_cmd *cp, int cmd, void *buf, int count);
+static void act(struct floppy_state *fs);
+static void scan_timeout(unsigned long data);
+static void seek_timeout(unsigned long data);
+static void settle_timeout(unsigned long data);
+static void xfer_timeout(unsigned long data);
+static irqreturn_t swim3_interrupt(int irq, void *dev_id);
+/*static void fd_dma_interrupt(int irq, void *dev_id);*/
+static int grab_drive(struct floppy_state *fs, enum swim_state state,
+ int interruptible);
+static void release_drive(struct floppy_state *fs);
+static int fd_eject(struct floppy_state *fs);
+static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long param);
+static int floppy_open(struct block_device *bdev, fmode_t mode);
+static void floppy_release(struct gendisk *disk, fmode_t mode);
+static unsigned int floppy_check_events(struct gendisk *disk,
+ unsigned int clearing);
+static int floppy_revalidate(struct gendisk *disk);
+
+static bool swim3_end_request(struct floppy_state *fs, int err, unsigned int nr_bytes)
+{
+ struct request *req = fs->cur_req;
+ int rc;
+
+ swim3_dbg(" end request, err=%d nr_bytes=%d, cur_req=%p\n",
+ err, nr_bytes, req);
+
+ if (err)
+ nr_bytes = blk_rq_cur_bytes(req);
+ rc = __blk_end_request(req, err, nr_bytes);
+ if (rc)
+ return true;
+ fs->cur_req = NULL;
+ return false;
+}
+
+static void swim3_select(struct floppy_state *fs, int sel)
+{
+ struct swim3 __iomem *sw = fs->swim3;
+
+ out_8(&sw->select, RELAX);
+ if (sel & 8)
+ out_8(&sw->control_bis, SELECT);
+ else
+ out_8(&sw->control_bic, SELECT);
+ out_8(&sw->select, sel & CA_MASK);
+}
+
+static void swim3_action(struct floppy_state *fs, int action)
+{
+ struct swim3 __iomem *sw = fs->swim3;
+
+ swim3_select(fs, action);
+ udelay(1);
+ out_8(&sw->select, sw->select | LSTRB);
+ udelay(2);
+ out_8(&sw->select, sw->select & ~LSTRB);
+ udelay(1);
+}
+
+static int swim3_readbit(struct floppy_state *fs, int bit)
+{
+ struct swim3 __iomem *sw = fs->swim3;
+ int stat;
+
+ swim3_select(fs, bit);
+ udelay(1);
+ stat = in_8(&sw->status);
+ return (stat & DATA) == 0;
+}
+
+static void start_request(struct floppy_state *fs)
+{
+ struct request *req;
+ unsigned long x;
+
+ swim3_dbg("start request, initial state=%d\n", fs->state);
+
+ if (fs->state == idle && fs->wanted) {
+ fs->state = available;
+ wake_up(&fs->wait);
+ return;
+ }
+ while (fs->state == idle) {
+ swim3_dbg("start request, idle loop, cur_req=%p\n", fs->cur_req);
+ if (!fs->cur_req) {
+ fs->cur_req = blk_fetch_request(disks[fs->index]->queue);
+ swim3_dbg(" fetched request %p\n", fs->cur_req);
+ if (!fs->cur_req)
+ break;
+ }
+ req = fs->cur_req;
+
+ if (fs->mdev->media_bay &&
+ check_media_bay(fs->mdev->media_bay) != MB_FD) {
+ swim3_dbg("%s", " media bay absent, dropping req\n");
+ swim3_end_request(fs, -ENODEV, 0);
+ continue;
+ }
+
+#if 0 /* This is really too verbose */
+ swim3_dbg("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n",
+ req->rq_disk->disk_name, req->cmd,
+ (long)blk_rq_pos(req), blk_rq_sectors(req),
+ bio_data(req->bio));
+ swim3_dbg(" errors=%d current_nr_sectors=%u\n",
+ req->errors, blk_rq_cur_sectors(req));
+#endif
+
+ if (blk_rq_pos(req) >= fs->total_secs) {
+ swim3_dbg(" pos out of bounds (%ld, max is %ld)\n",
+ (long)blk_rq_pos(req), (long)fs->total_secs);
+ swim3_end_request(fs, -EIO, 0);
+ continue;
+ }
+ if (fs->ejected) {
+ swim3_dbg("%s", " disk ejected\n");
+ swim3_end_request(fs, -EIO, 0);
+ continue;
+ }
+
+ if (rq_data_dir(req) == WRITE) {
+ if (fs->write_prot < 0)
+ fs->write_prot = swim3_readbit(fs, WRITE_PROT);
+ if (fs->write_prot) {
+ swim3_dbg("%s", " try to write, disk write protected\n");
+ swim3_end_request(fs, -EIO, 0);
+ continue;
+ }
+ }
+
+ /* Do not remove the cast. blk_rq_pos(req) is now a
+ * sector_t and can be 64 bits, but it will never go
+ * past 32 bits for this driver anyway, so we can
+ * safely cast it down and not have to do a 64/32
+ * division
+ */
+ fs->req_cyl = ((long)blk_rq_pos(req)) / fs->secpercyl;
+ x = ((long)blk_rq_pos(req)) % fs->secpercyl;
+ fs->head = x / fs->secpertrack;
+ fs->req_sector = x % fs->secpertrack + 1;
+ fs->state = do_transfer;
+ fs->retries = 0;
+
+ act(fs);
+ }
+}
+
+static void do_fd_request(struct request_queue * q)
+{
+ start_request(q->queuedata);
+}
+
+static void set_timeout(struct floppy_state *fs, int nticks,
+ void (*proc)(unsigned long))
+{
+ if (fs->timeout_pending)
+ del_timer(&fs->timeout);
+ fs->timeout.expires = jiffies + nticks;
+ fs->timeout.function = proc;
+ fs->timeout.data = (unsigned long) fs;
+ add_timer(&fs->timeout);
+ fs->timeout_pending = 1;
+}
+
+static inline void scan_track(struct floppy_state *fs)
+{
+ struct swim3 __iomem *sw = fs->swim3;
+
+ swim3_select(fs, READ_DATA_0);
+ in_8(&sw->intr); /* clear SEEN_SECTOR bit */
+ in_8(&sw->error);
+ out_8(&sw->intr_enable, SEEN_SECTOR);
+ out_8(&sw->control_bis, DO_ACTION);
+ /* enable intr when track found */
+ set_timeout(fs, HZ, scan_timeout); /* enable timeout */
+}
+
+static inline void seek_track(struct floppy_state *fs, int n)
+{
+ struct swim3 __iomem *sw = fs->swim3;
+
+ if (n >= 0) {
+ swim3_action(fs, SEEK_POSITIVE);
+ sw->nseek = n;
+ } else {
+ swim3_action(fs, SEEK_NEGATIVE);
+ sw->nseek = -n;
+ }
+ fs->expect_cyl = (fs->cur_cyl >= 0)? fs->cur_cyl + n: -1;
+ swim3_select(fs, STEP);
+ in_8(&sw->error);
+ /* enable intr when seek finished */
+ out_8(&sw->intr_enable, SEEK_DONE);
+ out_8(&sw->control_bis, DO_SEEK);
+ set_timeout(fs, 3*HZ, seek_timeout); /* enable timeout */
+ fs->settle_time = 0;
+}
+
+static inline void init_dma(struct dbdma_cmd *cp, int cmd,
+ void *buf, int count)
+{
+ cp->req_count = cpu_to_le16(count);
+ cp->command = cpu_to_le16(cmd);
+ cp->phy_addr = cpu_to_le32(virt_to_bus(buf));
+ cp->xfer_status = 0;
+}
+
+static inline void setup_transfer(struct floppy_state *fs)
+{
+ int n;
+ struct swim3 __iomem *sw = fs->swim3;
+ struct dbdma_cmd *cp = fs->dma_cmd;
+ struct dbdma_regs __iomem *dr = fs->dma;
+ struct request *req = fs->cur_req;
+
+ if (blk_rq_cur_sectors(req) <= 0) {
+ swim3_warn("%s", "Transfer 0 sectors ?\n");
+ return;
+ }
+ if (rq_data_dir(req) == WRITE)
+ n = 1;
+ else {
+ n = fs->secpertrack - fs->req_sector + 1;
+ if (n > blk_rq_cur_sectors(req))
+ n = blk_rq_cur_sectors(req);
+ }
+
+ swim3_dbg(" setup xfer at sect %d (of %d) head %d for %d\n",
+ fs->req_sector, fs->secpertrack, fs->head, n);
+
+ fs->scount = n;
+ swim3_select(fs, fs->head? READ_DATA_1: READ_DATA_0);
+ out_8(&sw->sector, fs->req_sector);
+ out_8(&sw->nsect, n);
+ out_8(&sw->gap3, 0);
+ out_le32(&dr->cmdptr, virt_to_bus(cp));
+ if (rq_data_dir(req) == WRITE) {
+ /* Set up 3 dma commands: write preamble, data, postamble */
+ init_dma(cp, OUTPUT_MORE, write_preamble, sizeof(write_preamble));
+ ++cp;
+ init_dma(cp, OUTPUT_MORE, bio_data(req->bio), 512);
+ ++cp;
+ init_dma(cp, OUTPUT_LAST, write_postamble, sizeof(write_postamble));
+ } else {
+ init_dma(cp, INPUT_LAST, bio_data(req->bio), n * 512);
+ }
+ ++cp;
+ out_le16(&cp->command, DBDMA_STOP);
+ out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
+ in_8(&sw->error);
+ out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
+ if (rq_data_dir(req) == WRITE)
+ out_8(&sw->control_bis, WRITE_SECTORS);
+ in_8(&sw->intr);
+ out_le32(&dr->control, (RUN << 16) | RUN);
+ /* enable intr when transfer complete */
+ out_8(&sw->intr_enable, TRANSFER_DONE);
+ out_8(&sw->control_bis, DO_ACTION);
+ set_timeout(fs, 2*HZ, xfer_timeout); /* enable timeout */
+}
+
+static void act(struct floppy_state *fs)
+{
+ for (;;) {
+ swim3_dbg(" act loop, state=%d, req_cyl=%d, cur_cyl=%d\n",
+ fs->state, fs->req_cyl, fs->cur_cyl);
+
+ switch (fs->state) {
+ case idle:
+ return; /* XXX shouldn't get here */
+
+ case locating:
+ if (swim3_readbit(fs, TRACK_ZERO)) {
+ swim3_dbg("%s", " locate track 0\n");
+ fs->cur_cyl = 0;
+ if (fs->req_cyl == 0)
+ fs->state = do_transfer;
+ else
+ fs->state = seeking;
+ break;
+ }
+ scan_track(fs);
+ return;
+
+ case seeking:
+ if (fs->cur_cyl < 0) {
+ fs->expect_cyl = -1;
+ fs->state = locating;
+ break;
+ }
+ if (fs->req_cyl == fs->cur_cyl) {
+ swim3_warn("%s", "Whoops, seeking 0\n");
+ fs->state = do_transfer;
+ break;
+ }
+ seek_track(fs, fs->req_cyl - fs->cur_cyl);
+ return;
+
+ case settling:
+ /* check for SEEK_COMPLETE after 30ms */
+ fs->settle_time = (HZ + 32) / 33;
+ set_timeout(fs, fs->settle_time, settle_timeout);
+ return;
+
+ case do_transfer:
+ if (fs->cur_cyl != fs->req_cyl) {
+ if (fs->retries > 5) {
+ swim3_err("Wrong cylinder in transfer, want: %d got %d\n",
+ fs->req_cyl, fs->cur_cyl);
+ swim3_end_request(fs, -EIO, 0);
+ fs->state = idle;
+ return;
+ }
+ fs->state = seeking;
+ break;
+ }
+ setup_transfer(fs);
+ return;
+
+ case jogging:
+ seek_track(fs, -5);
+ return;
+
+ default:
+ swim3_err("Unknown state %d\n", fs->state);
+ return;
+ }
+ }
+}
+
+static void scan_timeout(unsigned long data)
+{
+ struct floppy_state *fs = (struct floppy_state *) data;
+ struct swim3 __iomem *sw = fs->swim3;
+ unsigned long flags;
+
+ swim3_dbg("* scan timeout, state=%d\n", fs->state);
+
+ spin_lock_irqsave(&swim3_lock, flags);
+ fs->timeout_pending = 0;
+ out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
+ out_8(&sw->select, RELAX);
+ out_8(&sw->intr_enable, 0);
+ fs->cur_cyl = -1;
+ if (fs->retries > 5) {
+ swim3_end_request(fs, -EIO, 0);
+ fs->state = idle;
+ start_request(fs);
+ } else {
+ fs->state = jogging;
+ act(fs);
+ }
+ spin_unlock_irqrestore(&swim3_lock, flags);
+}
+
+static void seek_timeout(unsigned long data)
+{
+ struct floppy_state *fs = (struct floppy_state *) data;
+ struct swim3 __iomem *sw = fs->swim3;
+ unsigned long flags;
+
+ swim3_dbg("* seek timeout, state=%d\n", fs->state);
+
+ spin_lock_irqsave(&swim3_lock, flags);
+ fs->timeout_pending = 0;
+ out_8(&sw->control_bic, DO_SEEK);
+ out_8(&sw->select, RELAX);
+ out_8(&sw->intr_enable, 0);
+ swim3_err("%s", "Seek timeout\n");
+ swim3_end_request(fs, -EIO, 0);
+ fs->state = idle;
+ start_request(fs);
+ spin_unlock_irqrestore(&swim3_lock, flags);
+}
+
+static void settle_timeout(unsigned long data)
+{
+ struct floppy_state *fs = (struct floppy_state *) data;
+ struct swim3 __iomem *sw = fs->swim3;
+ unsigned long flags;
+
+ swim3_dbg("* settle timeout, state=%d\n", fs->state);
+
+ spin_lock_irqsave(&swim3_lock, flags);
+ fs->timeout_pending = 0;
+ if (swim3_readbit(fs, SEEK_COMPLETE)) {
+ out_8(&sw->select, RELAX);
+ fs->state = locating;
+ act(fs);
+ goto unlock;
+ }
+ out_8(&sw->select, RELAX);
+ if (fs->settle_time < 2*HZ) {
+ ++fs->settle_time;
+ set_timeout(fs, 1, settle_timeout);
+ goto unlock;
+ }
+ swim3_err("%s", "Seek settle timeout\n");
+ swim3_end_request(fs, -EIO, 0);
+ fs->state = idle;
+ start_request(fs);
+ unlock:
+ spin_unlock_irqrestore(&swim3_lock, flags);
+}
+
+static void xfer_timeout(unsigned long data)
+{
+ struct floppy_state *fs = (struct floppy_state *) data;
+ struct swim3 __iomem *sw = fs->swim3;
+ struct dbdma_regs __iomem *dr = fs->dma;
+ unsigned long flags;
+ int n;
+
+ swim3_dbg("* xfer timeout, state=%d\n", fs->state);
+
+ spin_lock_irqsave(&swim3_lock, flags);
+ fs->timeout_pending = 0;
+ out_le32(&dr->control, RUN << 16);
+ /* We must wait a bit for dbdma to stop */
+ for (n = 0; (in_le32(&dr->status) & ACTIVE) && n < 1000; n++)
+ udelay(1);
+ out_8(&sw->intr_enable, 0);
+ out_8(&sw->control_bic, WRITE_SECTORS | DO_ACTION);
+ out_8(&sw->select, RELAX);
+ swim3_err("Timeout %sing sector %ld\n",
+ (rq_data_dir(fs->cur_req)==WRITE? "writ": "read"),
+ (long)blk_rq_pos(fs->cur_req));
+ swim3_end_request(fs, -EIO, 0);
+ fs->state = idle;
+ start_request(fs);
+ spin_unlock_irqrestore(&swim3_lock, flags);
+}
+
+static irqreturn_t swim3_interrupt(int irq, void *dev_id)
+{
+ struct floppy_state *fs = (struct floppy_state *) dev_id;
+ struct swim3 __iomem *sw = fs->swim3;
+ int intr, err, n;
+ int stat, resid;
+ struct dbdma_regs __iomem *dr;
+ struct dbdma_cmd *cp;
+ unsigned long flags;
+ struct request *req = fs->cur_req;
+
+ swim3_dbg("* interrupt, state=%d\n", fs->state);
+
+ spin_lock_irqsave(&swim3_lock, flags);
+ intr = in_8(&sw->intr);
+ err = (intr & ERROR_INTR)? in_8(&sw->error): 0;
+ if ((intr & ERROR_INTR) && fs->state != do_transfer)
+ swim3_err("Non-transfer error interrupt: state=%d, dir=%x, intr=%x, err=%x\n",
+ fs->state, rq_data_dir(req), intr, err);
+ switch (fs->state) {
+ case locating:
+ if (intr & SEEN_SECTOR) {
+ out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
+ out_8(&sw->select, RELAX);
+ out_8(&sw->intr_enable, 0);
+ del_timer(&fs->timeout);
+ fs->timeout_pending = 0;
+ if (sw->ctrack == 0xff) {
+ swim3_err("%s", "Seen sector but cyl=ff?\n");
+ fs->cur_cyl = -1;
+ if (fs->retries > 5) {
+ swim3_end_request(fs, -EIO, 0);
+ fs->state = idle;
+ start_request(fs);
+ } else {
+ fs->state = jogging;
+ act(fs);
+ }
+ break;
+ }
+ fs->cur_cyl = sw->ctrack;
+ fs->cur_sector = sw->csect;
+ if (fs->expect_cyl != -1 && fs->expect_cyl != fs->cur_cyl)
+ swim3_err("Expected cyl %d, got %d\n",
+ fs->expect_cyl, fs->cur_cyl);
+ fs->state = do_transfer;
+ act(fs);
+ }
+ break;
+ case seeking:
+ case jogging:
+ if (sw->nseek == 0) {
+ out_8(&sw->control_bic, DO_SEEK);
+ out_8(&sw->select, RELAX);
+ out_8(&sw->intr_enable, 0);
+ del_timer(&fs->timeout);
+ fs->timeout_pending = 0;
+ if (fs->state == seeking)
+ ++fs->retries;
+ fs->state = settling;
+ act(fs);
+ }
+ break;
+ case settling:
+ out_8(&sw->intr_enable, 0);
+ del_timer(&fs->timeout);
+ fs->timeout_pending = 0;
+ act(fs);
+ break;
+ case do_transfer:
+ if ((intr & (ERROR_INTR | TRANSFER_DONE)) == 0)
+ break;
+ out_8(&sw->intr_enable, 0);
+ out_8(&sw->control_bic, WRITE_SECTORS | DO_ACTION);
+ out_8(&sw->select, RELAX);
+ del_timer(&fs->timeout);
+ fs->timeout_pending = 0;
+ dr = fs->dma;
+ cp = fs->dma_cmd;
+ if (rq_data_dir(req) == WRITE)
+ ++cp;
+ /*
+ * Check that the main data transfer has finished.
+ * On writing, the swim3 sometimes doesn't use
+ * up all the bytes of the postamble, so we can still
+ * see DMA active here. That doesn't matter as long
+ * as all the sector data has been transferred.
+ */
+ if ((intr & ERROR_INTR) == 0 && cp->xfer_status == 0) {
+ /* wait a little while for DMA to complete */
+ for (n = 0; n < 100; ++n) {
+ if (cp->xfer_status != 0)
+ break;
+ udelay(1);
+ barrier();
+ }
+ }
+ /* turn off DMA */
+ out_le32(&dr->control, (RUN | PAUSE) << 16);
+ stat = le16_to_cpu(cp->xfer_status);
+ resid = le16_to_cpu(cp->res_count);
+ if (intr & ERROR_INTR) {
+ n = fs->scount - 1 - resid / 512;
+ if (n > 0) {
+ blk_update_request(req, 0, n << 9);
+ fs->req_sector += n;
+ }
+ if (fs->retries < 5) {
+ ++fs->retries;
+ act(fs);
+ } else {
+ swim3_err("Error %sing block %ld (err=%x)\n",
+ rq_data_dir(req) == WRITE? "writ": "read",
+ (long)blk_rq_pos(req), err);
+ swim3_end_request(fs, -EIO, 0);
+ fs->state = idle;
+ }
+ } else {
+ if ((stat & ACTIVE) == 0 || resid != 0) {
+ /* musta been an error */
+ swim3_err("fd dma error: stat=%x resid=%d\n", stat, resid);
+ swim3_err(" state=%d, dir=%x, intr=%x, err=%x\n",
+ fs->state, rq_data_dir(req), intr, err);
+ swim3_end_request(fs, -EIO, 0);
+ fs->state = idle;
+ start_request(fs);
+ break;
+ }
+ fs->retries = 0;
+ if (swim3_end_request(fs, 0, fs->scount << 9)) {
+ fs->req_sector += fs->scount;
+ if (fs->req_sector > fs->secpertrack) {
+ fs->req_sector -= fs->secpertrack;
+ if (++fs->head > 1) {
+ fs->head = 0;
+ ++fs->req_cyl;
+ }
+ }
+ act(fs);
+ } else
+ fs->state = idle;
+ }
+ if (fs->state == idle)
+ start_request(fs);
+ break;
+ default:
+ swim3_err("Don't know what to do in state %d\n", fs->state);
+ }
+ spin_unlock_irqrestore(&swim3_lock, flags);
+ return IRQ_HANDLED;
+}
+
+/*
+static void fd_dma_interrupt(int irq, void *dev_id)
+{
+}
+*/
+
+/* Called under the mutex to grab exclusive access to a drive */
+static int grab_drive(struct floppy_state *fs, enum swim_state state,
+ int interruptible)
+{
+ unsigned long flags;
+
+ swim3_dbg("%s", "-> grab drive\n");
+
+ spin_lock_irqsave(&swim3_lock, flags);
+ if (fs->state != idle && fs->state != available) {
+ ++fs->wanted;
+ /* this will enable irqs in order to sleep */
+ if (!interruptible)
+ wait_event_lock_irq(fs->wait,
+ fs->state == available,
+ swim3_lock);
+ else if (wait_event_interruptible_lock_irq(fs->wait,
+ fs->state == available,
+ swim3_lock)) {
+ --fs->wanted;
+ spin_unlock_irqrestore(&swim3_lock, flags);
+ return -EINTR;
+ }
+ --fs->wanted;
+ }
+ fs->state = state;
+ spin_unlock_irqrestore(&swim3_lock, flags);
+
+ return 0;
+}
+
+static void release_drive(struct floppy_state *fs)
+{
+ unsigned long flags;
+
+ swim3_dbg("%s", "-> release drive\n");
+
+ spin_lock_irqsave(&swim3_lock, flags);
+ fs->state = idle;
+ start_request(fs);
+ spin_unlock_irqrestore(&swim3_lock, flags);
+}
+
+static int fd_eject(struct floppy_state *fs)
+{
+ int err, n;
+
+ err = grab_drive(fs, ejecting, 1);
+ if (err)
+ return err;
+ swim3_action(fs, EJECT);
+ for (n = 20; n > 0; --n) {
+ if (signal_pending(current)) {
+ err = -EINTR;
+ break;
+ }
+ swim3_select(fs, RELAX);
+ schedule_timeout_interruptible(1);
+ if (swim3_readbit(fs, DISK_IN) == 0)
+ break;
+ }
+ swim3_select(fs, RELAX);
+ udelay(150);
+ fs->ejected = 1;
+ release_drive(fs);
+ return err;
+}
+
+static struct floppy_struct floppy_type =
+ { 2880,18,2,80,0,0x1B,0x00,0xCF,0x6C,NULL }; /* 7 1.44MB 3.5" */
+
+static int floppy_locked_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long param)
+{
+ struct floppy_state *fs = bdev->bd_disk->private_data;
+ int err;
+
+ if ((cmd & 0x80) && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (fs->mdev->media_bay &&
+ check_media_bay(fs->mdev->media_bay) != MB_FD)
+ return -ENXIO;
+
+ switch (cmd) {
+ case FDEJECT:
+ if (fs->ref_count != 1)
+ return -EBUSY;
+ err = fd_eject(fs);
+ return err;
+ case FDGETPRM:
+ if (copy_to_user((void __user *) param, &floppy_type,
+ sizeof(struct floppy_struct)))
+ return -EFAULT;
+ return 0;
+ }
+ return -ENOTTY;
+}
+
+static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long param)
+{
+ int ret;
+
+ mutex_lock(&swim3_mutex);
+ ret = floppy_locked_ioctl(bdev, mode, cmd, param);
+ mutex_unlock(&swim3_mutex);
+
+ return ret;
+}
+
+static int floppy_open(struct block_device *bdev, fmode_t mode)
+{
+ struct floppy_state *fs = bdev->bd_disk->private_data;
+ struct swim3 __iomem *sw = fs->swim3;
+ int n, err = 0;
+
+ if (fs->ref_count == 0) {
+ if (fs->mdev->media_bay &&
+ check_media_bay(fs->mdev->media_bay) != MB_FD)
+ return -ENXIO;
+ out_8(&sw->setup, S_IBM_DRIVE | S_FCLK_DIV2);
+ out_8(&sw->control_bic, 0xff);
+ out_8(&sw->mode, 0x95);
+ udelay(10);
+ out_8(&sw->intr_enable, 0);
+ out_8(&sw->control_bis, DRIVE_ENABLE | INTR_ENABLE);
+ swim3_action(fs, MOTOR_ON);
+ fs->write_prot = -1;
+ fs->cur_cyl = -1;
+ for (n = 0; n < 2 * HZ; ++n) {
+ if (n >= HZ/30 && swim3_readbit(fs, SEEK_COMPLETE))
+ break;
+ if (signal_pending(current)) {
+ err = -EINTR;
+ break;
+ }
+ swim3_select(fs, RELAX);
+ schedule_timeout_interruptible(1);
+ }
+ if (err == 0 && (swim3_readbit(fs, SEEK_COMPLETE) == 0
+ || swim3_readbit(fs, DISK_IN) == 0))
+ err = -ENXIO;
+ swim3_action(fs, SETMFM);
+ swim3_select(fs, RELAX);
+
+ } else if (fs->ref_count == -1 || mode & FMODE_EXCL)
+ return -EBUSY;
+
+ if (err == 0 && (mode & FMODE_NDELAY) == 0
+ && (mode & (FMODE_READ|FMODE_WRITE))) {
+ check_disk_change(bdev);
+ if (fs->ejected)
+ err = -ENXIO;
+ }
+
+ if (err == 0 && (mode & FMODE_WRITE)) {
+ if (fs->write_prot < 0)
+ fs->write_prot = swim3_readbit(fs, WRITE_PROT);
+ if (fs->write_prot)
+ err = -EROFS;
+ }
+
+ if (err) {
+ if (fs->ref_count == 0) {
+ swim3_action(fs, MOTOR_OFF);
+ out_8(&sw->control_bic, DRIVE_ENABLE | INTR_ENABLE);
+ swim3_select(fs, RELAX);
+ }
+ return err;
+ }
+
+ if (mode & FMODE_EXCL)
+ fs->ref_count = -1;
+ else
+ ++fs->ref_count;
+
+ return 0;
+}
+
+static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
+{
+ int ret;
+
+ mutex_lock(&swim3_mutex);
+ ret = floppy_open(bdev, mode);
+ mutex_unlock(&swim3_mutex);
+
+ return ret;
+}
+
+static void floppy_release(struct gendisk *disk, fmode_t mode)
+{
+ struct floppy_state *fs = disk->private_data;
+ struct swim3 __iomem *sw = fs->swim3;
+
+ mutex_lock(&swim3_mutex);
+ if (fs->ref_count > 0 && --fs->ref_count == 0) {
+ swim3_action(fs, MOTOR_OFF);
+ out_8(&sw->control_bic, 0xff);
+ swim3_select(fs, RELAX);
+ }
+ mutex_unlock(&swim3_mutex);
+}
+
+static unsigned int floppy_check_events(struct gendisk *disk,
+ unsigned int clearing)
+{
+ struct floppy_state *fs = disk->private_data;
+ return fs->ejected ? DISK_EVENT_MEDIA_CHANGE : 0;
+}
+
+static int floppy_revalidate(struct gendisk *disk)
+{
+ struct floppy_state *fs = disk->private_data;
+ struct swim3 __iomem *sw;
+ int ret, n;
+
+ if (fs->mdev->media_bay &&
+ check_media_bay(fs->mdev->media_bay) != MB_FD)
+ return -ENXIO;
+
+ sw = fs->swim3;
+ grab_drive(fs, revalidating, 0);
+ out_8(&sw->intr_enable, 0);
+ out_8(&sw->control_bis, DRIVE_ENABLE);
+ swim3_action(fs, MOTOR_ON); /* necessary? */
+ fs->write_prot = -1;
+ fs->cur_cyl = -1;
+ mdelay(1);
+ for (n = HZ; n > 0; --n) {
+ if (swim3_readbit(fs, SEEK_COMPLETE))
+ break;
+ if (signal_pending(current))
+ break;
+ swim3_select(fs, RELAX);
+ schedule_timeout_interruptible(1);
+ }
+ ret = swim3_readbit(fs, SEEK_COMPLETE) == 0
+ || swim3_readbit(fs, DISK_IN) == 0;
+ if (ret)
+ swim3_action(fs, MOTOR_OFF);
+ else {
+ fs->ejected = 0;
+ swim3_action(fs, SETMFM);
+ }
+ swim3_select(fs, RELAX);
+
+ release_drive(fs);
+ return ret;
+}
+
+static const struct block_device_operations floppy_fops = {
+ .open = floppy_unlocked_open,
+ .release = floppy_release,
+ .ioctl = floppy_ioctl,
+ .check_events = floppy_check_events,
+ .revalidate_disk= floppy_revalidate,
+};
+
+static void swim3_mb_event(struct macio_dev* mdev, int mb_state)
+{
+ struct floppy_state *fs = macio_get_drvdata(mdev);
+ struct swim3 __iomem *sw;
+
+ if (!fs)
+ return;
+
+ sw = fs->swim3;
+
+ if (mb_state != MB_FD)
+ return;
+
+ /* Clear state */
+ out_8(&sw->intr_enable, 0);
+ in_8(&sw->intr);
+ in_8(&sw->error);
+}
+
+static int swim3_add_device(struct macio_dev *mdev, int index)
+{
+ struct device_node *swim = mdev->ofdev.dev.of_node;
+ struct floppy_state *fs = &floppy_states[index];
+ int rc = -EBUSY;
+
+ /* Do this first for message macros */
+ memset(fs, 0, sizeof(*fs));
+ fs->mdev = mdev;
+ fs->index = index;
+
+ /* Check & Request resources */
+ if (macio_resource_count(mdev) < 2) {
+ swim3_err("%s", "No address in device-tree\n");
+ return -ENXIO;
+ }
+ if (macio_irq_count(mdev) < 1) {
+ swim3_err("%s", "No interrupt in device-tree\n");
+ return -ENXIO;
+ }
+ if (macio_request_resource(mdev, 0, "swim3 (mmio)")) {
+ swim3_err("%s", "Can't request mmio resource\n");
+ return -EBUSY;
+ }
+ if (macio_request_resource(mdev, 1, "swim3 (dma)")) {
+ swim3_err("%s", "Can't request dma resource\n");
+ macio_release_resource(mdev, 0);
+ return -EBUSY;
+ }
+ dev_set_drvdata(&mdev->ofdev.dev, fs);
+
+ if (mdev->media_bay == NULL)
+ pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 1);
+
+ fs->state = idle;
+ fs->swim3 = (struct swim3 __iomem *)
+ ioremap(macio_resource_start(mdev, 0), 0x200);
+ if (fs->swim3 == NULL) {
+ swim3_err("%s", "Couldn't map mmio registers\n");
+ rc = -ENOMEM;
+ goto out_release;
+ }
+ fs->dma = (struct dbdma_regs __iomem *)
+ ioremap(macio_resource_start(mdev, 1), 0x200);
+ if (fs->dma == NULL) {
+ swim3_err("%s", "Couldn't map dma registers\n");
+ iounmap(fs->swim3);
+ rc = -ENOMEM;
+ goto out_release;
+ }
+ fs->swim3_intr = macio_irq(mdev, 0);
+ fs->dma_intr = macio_irq(mdev, 1);
+ fs->cur_cyl = -1;
+ fs->cur_sector = -1;
+ fs->secpercyl = 36;
+ fs->secpertrack = 18;
+ fs->total_secs = 2880;
+ init_waitqueue_head(&fs->wait);
+
+ fs->dma_cmd = (struct dbdma_cmd *) DBDMA_ALIGN(fs->dbdma_cmd_space);
+ memset(fs->dma_cmd, 0, 2 * sizeof(struct dbdma_cmd));
+ fs->dma_cmd[1].command = cpu_to_le16(DBDMA_STOP);
+
+ if (mdev->media_bay == NULL || check_media_bay(mdev->media_bay) == MB_FD)
+ swim3_mb_event(mdev, MB_FD);
+
+ if (request_irq(fs->swim3_intr, swim3_interrupt, 0, "SWIM3", fs)) {
+ swim3_err("%s", "Couldn't request interrupt\n");
+ pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 0);
+ goto out_unmap;
+ return -EBUSY;
+ }
+
+ init_timer(&fs->timeout);
+
+ swim3_info("SWIM3 floppy controller %s\n",
+ mdev->media_bay ? "in media bay" : "");
+
+ return 0;
+
+ out_unmap:
+ iounmap(fs->dma);
+ iounmap(fs->swim3);
+
+ out_release:
+ macio_release_resource(mdev, 0);
+ macio_release_resource(mdev, 1);
+
+ return rc;
+}
+
+static int swim3_attach(struct macio_dev *mdev,
+ const struct of_device_id *match)
+{
+ struct gendisk *disk;
+ int index, rc;
+
+ index = floppy_count++;
+ if (index >= MAX_FLOPPIES)
+ return -ENXIO;
+
+ /* Add the drive */
+ rc = swim3_add_device(mdev, index);
+ if (rc)
+ return rc;
+ /* Now register that disk. Same comment about failure handling */
+ disk = disks[index] = alloc_disk(1);
+ if (disk == NULL)
+ return -ENOMEM;
+ disk->queue = blk_init_queue(do_fd_request, &swim3_lock);
+ if (disk->queue == NULL) {
+ put_disk(disk);
+ return -ENOMEM;
+ }
+ disk->queue->queuedata = &floppy_states[index];
+
+ if (index == 0) {
+ /* If we failed, there isn't much we can do as the driver is still
+ * too dumb to remove the device, just bail out
+ */
+ if (register_blkdev(FLOPPY_MAJOR, "fd"))
+ return 0;
+ }
+
+ disk->major = FLOPPY_MAJOR;
+ disk->first_minor = index;
+ disk->fops = &floppy_fops;
+ disk->private_data = &floppy_states[index];
+ disk->flags |= GENHD_FL_REMOVABLE;
+ sprintf(disk->disk_name, "fd%d", index);
+ set_capacity(disk, 2880);
+ add_disk(disk);
+
+ return 0;
+}
+
+static struct of_device_id swim3_match[] =
+{
+ {
+ .name = "swim3",
+ },
+ {
+ .compatible = "ohare-swim3"
+ },
+ {
+ .compatible = "swim3"
+ },
+ { /* end of list */ }
+};
+
+static struct macio_driver swim3_driver =
+{
+ .driver = {
+ .name = "swim3",
+ .of_match_table = swim3_match,
+ },
+ .probe = swim3_attach,
+#ifdef CONFIG_PMAC_MEDIABAY
+ .mediabay_event = swim3_mb_event,
+#endif
+#if 0
+ .suspend = swim3_suspend,
+ .resume = swim3_resume,
+#endif
+};
+
+
+int swim3_init(void)
+{
+ macio_register_driver(&swim3_driver);
+ return 0;
+}
+
+module_init(swim3_init)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul Mackerras");
+MODULE_ALIAS_BLOCKDEV_MAJOR(FLOPPY_MAJOR);
diff --git a/drivers/block/swim_asm.S b/drivers/block/swim_asm.S
new file mode 100644
index 000000000..c96682068
--- /dev/null
+++ b/drivers/block/swim_asm.S
@@ -0,0 +1,247 @@
+/*
+ * low-level functions for the SWIM floppy controller
+ *
+ * needs assembly language because is very timing dependent
+ * this controller exists only on macintosh 680x0 based
+ *
+ * Copyright (C) 2004,2008 Laurent Vivier <Laurent@lvivier.info>
+ *
+ * based on Alastair Bridgewater SWIM analysis, 2001
+ * based on netBSD IWM driver (c) 1997, 1998 Hauke Fath.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * 2004-08-21 (lv) - Initial implementation
+ * 2008-11-05 (lv) - add get_swim_mode
+ */
+
+ .equ write_data, 0x0000
+ .equ write_mark, 0x0200
+ .equ write_CRC, 0x0400
+ .equ write_parameter,0x0600
+ .equ write_phase, 0x0800
+ .equ write_setup, 0x0a00
+ .equ write_mode0, 0x0c00
+ .equ write_mode1, 0x0e00
+ .equ read_data, 0x1000
+ .equ read_mark, 0x1200
+ .equ read_error, 0x1400
+ .equ read_parameter, 0x1600
+ .equ read_phase, 0x1800
+ .equ read_setup, 0x1a00
+ .equ read_status, 0x1c00
+ .equ read_handshake, 0x1e00
+
+ .equ o_side, 0
+ .equ o_track, 1
+ .equ o_sector, 2
+ .equ o_size, 3
+ .equ o_crc0, 4
+ .equ o_crc1, 5
+
+ .equ seek_time, 30000
+ .equ max_retry, 40
+ .equ sector_size, 512
+
+ .global swim_read_sector_header
+swim_read_sector_header:
+ link %a6, #0
+ moveml %d1-%d5/%a0-%a4,%sp@-
+ movel %a6@(0x0c), %a4
+ bsr mfm_read_addrmark
+ moveml %sp@+, %d1-%d5/%a0-%a4
+ unlk %a6
+ rts
+
+sector_address_mark:
+ .byte 0xa1, 0xa1, 0xa1, 0xfe
+sector_data_mark:
+ .byte 0xa1, 0xa1, 0xa1, 0xfb
+
+mfm_read_addrmark:
+ movel %a6@(0x08), %a3
+ lea %a3@(read_handshake), %a2
+ lea %a3@(read_mark), %a3
+ moveq #-1, %d0
+ movew #seek_time, %d2
+
+wait_header_init:
+ tstb %a3@(read_error - read_mark)
+ moveb #0x18, %a3@(write_mode0 - read_mark)
+ moveb #0x01, %a3@(write_mode1 - read_mark)
+ moveb #0x01, %a3@(write_mode0 - read_mark)
+ tstb %a3@(read_error - read_mark)
+ moveb #0x08, %a3@(write_mode1 - read_mark)
+
+ lea sector_address_mark, %a0
+ moveq #3, %d1
+
+wait_addr_mark_byte:
+
+ tstb %a2@
+ dbmi %d2, wait_addr_mark_byte
+ bpl header_exit
+
+ moveb %a3@, %d3
+ cmpb %a0@+, %d3
+ dbne %d1, wait_addr_mark_byte
+ bne wait_header_init
+
+ moveq #max_retry, %d2
+
+amark0: tstb %a2@
+ dbmi %d2, amark0
+ bpl signal_nonyb
+
+ moveb %a3@, %a4@(o_track)
+
+ moveq #max_retry, %d2
+
+amark1: tstb %a2@
+ dbmi %d2, amark1
+ bpl signal_nonyb
+
+ moveb %a3@, %a4@(o_side)
+
+ moveq #max_retry, %d2
+
+amark2: tstb %a2@
+ dbmi %d2, amark2
+ bpl signal_nonyb
+
+ moveb %a3@, %a4@(o_sector)
+
+ moveq #max_retry, %d2
+
+amark3: tstb %a2@
+ dbmi %d2, amark3
+ bpl signal_nonyb
+
+ moveb %a3@, %a4@(o_size)
+
+ moveq #max_retry, %d2
+
+crc0: tstb %a2@
+ dbmi %d2, crc0
+ bpl signal_nonyb
+
+ moveb %a3@, %a4@(o_crc0)
+
+ moveq #max_retry, %d2
+
+crc1: tstb %a2@
+ dbmi %d2, crc1
+ bpl signal_nonyb
+
+ moveb %a3@, %a4@(o_crc1)
+
+ tstb %a3@(read_error - read_mark)
+
+header_exit:
+ moveq #0, %d0
+ moveb #0x18, %a3@(write_mode0 - read_mark)
+ rts
+signal_nonyb:
+ moveq #-1, %d0
+ moveb #0x18, %a3@(write_mode0 - read_mark)
+ rts
+
+ .global swim_read_sector_data
+swim_read_sector_data:
+ link %a6, #0
+ moveml %d1-%d5/%a0-%a5,%sp@-
+ movel %a6@(0x0c), %a4
+ bsr mfm_read_data
+ moveml %sp@+, %d1-%d5/%a0-%a5
+ unlk %a6
+ rts
+
+mfm_read_data:
+ movel %a6@(0x08), %a3
+ lea %a3@(read_handshake), %a2
+ lea %a3@(read_data), %a5
+ lea %a3@(read_mark), %a3
+ movew #seek_time, %d2
+
+wait_data_init:
+ tstb %a3@(read_error - read_mark)
+ moveb #0x18, %a3@(write_mode0 - read_mark)
+ moveb #0x01, %a3@(write_mode1 - read_mark)
+ moveb #0x01, %a3@(write_mode0 - read_mark)
+ tstb %a3@(read_error - read_mark)
+ moveb #0x08, %a3@(write_mode1 - read_mark)
+
+ lea sector_data_mark, %a0
+ moveq #3, %d1
+
+ /* wait data address mark */
+
+wait_data_mark_byte:
+
+ tstb %a2@
+ dbmi %d2, wait_data_mark_byte
+ bpl data_exit
+
+ moveb %a3@, %d3
+ cmpb %a0@+, %d3
+ dbne %d1, wait_data_mark_byte
+ bne wait_data_init
+
+ /* read data */
+
+ tstb %a3@(read_error - read_mark)
+
+ movel #sector_size-1, %d4 /* sector size */
+read_new_data:
+ movew #max_retry, %d2
+read_data_loop:
+ moveb %a2@, %d5
+ andb #0xc0, %d5
+ dbne %d2, read_data_loop
+ beq data_exit
+ moveb %a5@, %a4@+
+ andb #0x40, %d5
+ dbne %d4, read_new_data
+ beq exit_loop
+ moveb %a5@, %a4@+
+ dbra %d4, read_new_data
+exit_loop:
+
+ /* read CRC */
+
+ movew #max_retry, %d2
+data_crc0:
+
+ tstb %a2@
+ dbmi %d2, data_crc0
+ bpl data_exit
+
+ moveb %a3@, %d5
+
+ moveq #max_retry, %d2
+
+data_crc1:
+
+ tstb %a2@
+ dbmi %d2, data_crc1
+ bpl data_exit
+
+ moveb %a3@, %d5
+
+ tstb %a3@(read_error - read_mark)
+
+ moveb #0x18, %a3@(write_mode0 - read_mark)
+
+ /* return number of bytes read */
+
+ movel #sector_size, %d0
+ addw #1, %d4
+ subl %d4, %d0
+ rts
+data_exit:
+ moveb #0x18, %a3@(write_mode0 - read_mark)
+ moveq #-1, %d0
+ rts
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
new file mode 100644
index 000000000..5d552857d
--- /dev/null
+++ b/drivers/block/sx8.c
@@ -0,0 +1,1749 @@
+/*
+ * sx8.c: Driver for Promise SATA SX8 looks-like-I2O hardware
+ *
+ * Copyright 2004-2005 Red Hat, Inc.
+ *
+ * Author/maintainer: Jeff Garzik <jgarzik@pobox.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/compiler.h>
+#include <linux/workqueue.h>
+#include <linux/bitops.h>
+#include <linux/delay.h>
+#include <linux/time.h>
+#include <linux/hdreg.h>
+#include <linux/dma-mapping.h>
+#include <linux/completion.h>
+#include <linux/scatterlist.h>
+#include <asm/io.h>
+#include <asm/uaccess.h>
+
+#if 0
+#define CARM_DEBUG
+#define CARM_VERBOSE_DEBUG
+#else
+#undef CARM_DEBUG
+#undef CARM_VERBOSE_DEBUG
+#endif
+#undef CARM_NDEBUG
+
+#define DRV_NAME "sx8"
+#define DRV_VERSION "1.0"
+#define PFX DRV_NAME ": "
+
+MODULE_AUTHOR("Jeff Garzik");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Promise SATA SX8 block driver");
+MODULE_VERSION(DRV_VERSION);
+
+/*
+ * SX8 hardware has a single message queue for all ATA ports.
+ * When this driver was written, the hardware (firmware?) would
+ * corrupt data eventually, if more than one request was outstanding.
+ * As one can imagine, having 8 ports bottlenecking on a single
+ * command hurts performance.
+ *
+ * Based on user reports, later versions of the hardware (firmware?)
+ * seem to be able to survive with more than one command queued.
+ *
+ * Therefore, we default to the safe option -- 1 command -- but
+ * allow the user to increase this.
+ *
+ * SX8 should be able to support up to ~60 queued commands (CARM_MAX_REQ),
+ * but problems seem to occur when you exceed ~30, even on newer hardware.
+ */
+static int max_queue = 1;
+module_param(max_queue, int, 0444);
+MODULE_PARM_DESC(max_queue, "Maximum number of queued commands. (min==1, max==30, safe==1)");
+
+
+#define NEXT_RESP(idx) ((idx + 1) % RMSG_Q_LEN)
+
+/* 0xf is just arbitrary, non-zero noise; this is sorta like poisoning */
+#define TAG_ENCODE(tag) (((tag) << 16) | 0xf)
+#define TAG_DECODE(tag) (((tag) >> 16) & 0x1f)
+#define TAG_VALID(tag) ((((tag) & 0xf) == 0xf) && (TAG_DECODE(tag) < 32))
+
+/* note: prints function name for you */
+#ifdef CARM_DEBUG
+#define DPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args)
+#ifdef CARM_VERBOSE_DEBUG
+#define VPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args)
+#else
+#define VPRINTK(fmt, args...)
+#endif /* CARM_VERBOSE_DEBUG */
+#else
+#define DPRINTK(fmt, args...)
+#define VPRINTK(fmt, args...)
+#endif /* CARM_DEBUG */
+
+#ifdef CARM_NDEBUG
+#define assert(expr)
+#else
+#define assert(expr) \
+ if(unlikely(!(expr))) { \
+ printk(KERN_ERR "Assertion failed! %s,%s,%s,line=%d\n", \
+ #expr, __FILE__, __func__, __LINE__); \
+ }
+#endif
+
+/* defines only for the constants which don't work well as enums */
+struct carm_host;
+
+enum {
+ /* adapter-wide limits */
+ CARM_MAX_PORTS = 8,
+ CARM_SHM_SIZE = (4096 << 7),
+ CARM_MINORS_PER_MAJOR = 256 / CARM_MAX_PORTS,
+ CARM_MAX_WAIT_Q = CARM_MAX_PORTS + 1,
+
+ /* command message queue limits */
+ CARM_MAX_REQ = 64, /* max command msgs per host */
+ CARM_MSG_LOW_WATER = (CARM_MAX_REQ / 4), /* refill mark */
+
+ /* S/G limits, host-wide and per-request */
+ CARM_MAX_REQ_SG = 32, /* max s/g entries per request */
+ CARM_MAX_HOST_SG = 600, /* max s/g entries per host */
+ CARM_SG_LOW_WATER = (CARM_MAX_HOST_SG / 4), /* re-fill mark */
+
+ /* hardware registers */
+ CARM_IHQP = 0x1c,
+ CARM_INT_STAT = 0x10, /* interrupt status */
+ CARM_INT_MASK = 0x14, /* interrupt mask */
+ CARM_HMUC = 0x18, /* host message unit control */
+ RBUF_ADDR_LO = 0x20, /* response msg DMA buf low 32 bits */
+ RBUF_ADDR_HI = 0x24, /* response msg DMA buf high 32 bits */
+ RBUF_BYTE_SZ = 0x28,
+ CARM_RESP_IDX = 0x2c,
+ CARM_CMS0 = 0x30, /* command message size reg 0 */
+ CARM_LMUC = 0x48,
+ CARM_HMPHA = 0x6c,
+ CARM_INITC = 0xb5,
+
+ /* bits in CARM_INT_{STAT,MASK} */
+ INT_RESERVED = 0xfffffff0,
+ INT_WATCHDOG = (1 << 3), /* watchdog timer */
+ INT_Q_OVERFLOW = (1 << 2), /* cmd msg q overflow */
+ INT_Q_AVAILABLE = (1 << 1), /* cmd msg q has free space */
+ INT_RESPONSE = (1 << 0), /* response msg available */
+ INT_ACK_MASK = INT_WATCHDOG | INT_Q_OVERFLOW,
+ INT_DEF_MASK = INT_RESERVED | INT_Q_OVERFLOW |
+ INT_RESPONSE,
+
+ /* command messages, and related register bits */
+ CARM_HAVE_RESP = 0x01,
+ CARM_MSG_READ = 1,
+ CARM_MSG_WRITE = 2,
+ CARM_MSG_VERIFY = 3,
+ CARM_MSG_GET_CAPACITY = 4,
+ CARM_MSG_FLUSH = 5,
+ CARM_MSG_IOCTL = 6,
+ CARM_MSG_ARRAY = 8,
+ CARM_MSG_MISC = 9,
+ CARM_CME = (1 << 2),
+ CARM_RME = (1 << 1),
+ CARM_WZBC = (1 << 0),
+ CARM_RMI = (1 << 0),
+ CARM_Q_FULL = (1 << 3),
+ CARM_MSG_SIZE = 288,
+ CARM_Q_LEN = 48,
+
+ /* CARM_MSG_IOCTL messages */
+ CARM_IOC_SCAN_CHAN = 5, /* scan channels for devices */
+ CARM_IOC_GET_TCQ = 13, /* get tcq/ncq depth */
+ CARM_IOC_SET_TCQ = 14, /* set tcq/ncq depth */
+
+ IOC_SCAN_CHAN_NODEV = 0x1f,
+ IOC_SCAN_CHAN_OFFSET = 0x40,
+
+ /* CARM_MSG_ARRAY messages */
+ CARM_ARRAY_INFO = 0,
+
+ ARRAY_NO_EXIST = (1 << 31),
+
+ /* response messages */
+ RMSG_SZ = 8, /* sizeof(struct carm_response) */
+ RMSG_Q_LEN = 48, /* resp. msg list length */
+ RMSG_OK = 1, /* bit indicating msg was successful */
+ /* length of entire resp. msg buffer */
+ RBUF_LEN = RMSG_SZ * RMSG_Q_LEN,
+
+ PDC_SHM_SIZE = (4096 << 7), /* length of entire h/w buffer */
+
+ /* CARM_MSG_MISC messages */
+ MISC_GET_FW_VER = 2,
+ MISC_ALLOC_MEM = 3,
+ MISC_SET_TIME = 5,
+
+ /* MISC_GET_FW_VER feature bits */
+ FW_VER_4PORT = (1 << 2), /* 1=4 ports, 0=8 ports */
+ FW_VER_NON_RAID = (1 << 1), /* 1=non-RAID firmware, 0=RAID */
+ FW_VER_ZCR = (1 << 0), /* zero channel RAID (whatever that is) */
+
+ /* carm_host flags */
+ FL_NON_RAID = FW_VER_NON_RAID,
+ FL_4PORT = FW_VER_4PORT,
+ FL_FW_VER_MASK = (FW_VER_NON_RAID | FW_VER_4PORT),
+ FL_DAC = (1 << 16),
+ FL_DYN_MAJOR = (1 << 17),
+};
+
+enum {
+ CARM_SG_BOUNDARY = 0xffffUL, /* s/g segment boundary */
+};
+
+enum scatter_gather_types {
+ SGT_32BIT = 0,
+ SGT_64BIT = 1,
+};
+
+enum host_states {
+ HST_INVALID, /* invalid state; never used */
+ HST_ALLOC_BUF, /* setting up master SHM area */
+ HST_ERROR, /* we never leave here */
+ HST_PORT_SCAN, /* start dev scan */
+ HST_DEV_SCAN_START, /* start per-device probe */
+ HST_DEV_SCAN, /* continue per-device probe */
+ HST_DEV_ACTIVATE, /* activate devices we found */
+ HST_PROBE_FINISHED, /* probe is complete */
+ HST_PROBE_START, /* initiate probe */
+ HST_SYNC_TIME, /* tell firmware what time it is */
+ HST_GET_FW_VER, /* get firmware version, adapter port cnt */
+};
+
+#ifdef CARM_DEBUG
+static const char *state_name[] = {
+ "HST_INVALID",
+ "HST_ALLOC_BUF",
+ "HST_ERROR",
+ "HST_PORT_SCAN",
+ "HST_DEV_SCAN_START",
+ "HST_DEV_SCAN",
+ "HST_DEV_ACTIVATE",
+ "HST_PROBE_FINISHED",
+ "HST_PROBE_START",
+ "HST_SYNC_TIME",
+ "HST_GET_FW_VER",
+};
+#endif
+
+struct carm_port {
+ unsigned int port_no;
+ struct gendisk *disk;
+ struct carm_host *host;
+
+ /* attached device characteristics */
+ u64 capacity;
+ char name[41];
+ u16 dev_geom_head;
+ u16 dev_geom_sect;
+ u16 dev_geom_cyl;
+};
+
+struct carm_request {
+ unsigned int tag;
+ int n_elem;
+ unsigned int msg_type;
+ unsigned int msg_subtype;
+ unsigned int msg_bucket;
+ struct request *rq;
+ struct carm_port *port;
+ struct scatterlist sg[CARM_MAX_REQ_SG];
+};
+
+struct carm_host {
+ unsigned long flags;
+ void __iomem *mmio;
+ void *shm;
+ dma_addr_t shm_dma;
+
+ int major;
+ int id;
+ char name[32];
+
+ spinlock_t lock;
+ struct pci_dev *pdev;
+ unsigned int state;
+ u32 fw_ver;
+
+ struct request_queue *oob_q;
+ unsigned int n_oob;
+
+ unsigned int hw_sg_used;
+
+ unsigned int resp_idx;
+
+ unsigned int wait_q_prod;
+ unsigned int wait_q_cons;
+ struct request_queue *wait_q[CARM_MAX_WAIT_Q];
+
+ unsigned int n_msgs;
+ u64 msg_alloc;
+ struct carm_request req[CARM_MAX_REQ];
+ void *msg_base;
+ dma_addr_t msg_dma;
+
+ int cur_scan_dev;
+ unsigned long dev_active;
+ unsigned long dev_present;
+ struct carm_port port[CARM_MAX_PORTS];
+
+ struct work_struct fsm_task;
+
+ struct completion probe_comp;
+};
+
+struct carm_response {
+ __le32 ret_handle;
+ __le32 status;
+} __attribute__((packed));
+
+struct carm_msg_sg {
+ __le32 start;
+ __le32 len;
+} __attribute__((packed));
+
+struct carm_msg_rw {
+ u8 type;
+ u8 id;
+ u8 sg_count;
+ u8 sg_type;
+ __le32 handle;
+ __le32 lba;
+ __le16 lba_count;
+ __le16 lba_high;
+ struct carm_msg_sg sg[32];
+} __attribute__((packed));
+
+struct carm_msg_allocbuf {
+ u8 type;
+ u8 subtype;
+ u8 n_sg;
+ u8 sg_type;
+ __le32 handle;
+ __le32 addr;
+ __le32 len;
+ __le32 evt_pool;
+ __le32 n_evt;
+ __le32 rbuf_pool;
+ __le32 n_rbuf;
+ __le32 msg_pool;
+ __le32 n_msg;
+ struct carm_msg_sg sg[8];
+} __attribute__((packed));
+
+struct carm_msg_ioctl {
+ u8 type;
+ u8 subtype;
+ u8 array_id;
+ u8 reserved1;
+ __le32 handle;
+ __le32 data_addr;
+ u32 reserved2;
+} __attribute__((packed));
+
+struct carm_msg_sync_time {
+ u8 type;
+ u8 subtype;
+ u16 reserved1;
+ __le32 handle;
+ u32 reserved2;
+ __le32 timestamp;
+} __attribute__((packed));
+
+struct carm_msg_get_fw_ver {
+ u8 type;
+ u8 subtype;
+ u16 reserved1;
+ __le32 handle;
+ __le32 data_addr;
+ u32 reserved2;
+} __attribute__((packed));
+
+struct carm_fw_ver {
+ __le32 version;
+ u8 features;
+ u8 reserved1;
+ u16 reserved2;
+} __attribute__((packed));
+
+struct carm_array_info {
+ __le32 size;
+
+ __le16 size_hi;
+ __le16 stripe_size;
+
+ __le32 mode;
+
+ __le16 stripe_blk_sz;
+ __le16 reserved1;
+
+ __le16 cyl;
+ __le16 head;
+
+ __le16 sect;
+ u8 array_id;
+ u8 reserved2;
+
+ char name[40];
+
+ __le32 array_status;
+
+ /* device list continues beyond this point? */
+} __attribute__((packed));
+
+static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent);
+static void carm_remove_one (struct pci_dev *pdev);
+static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo);
+
+static const struct pci_device_id carm_pci_tbl[] = {
+ { PCI_VENDOR_ID_PROMISE, 0x8000, PCI_ANY_ID, PCI_ANY_ID, 0, 0, },
+ { PCI_VENDOR_ID_PROMISE, 0x8002, PCI_ANY_ID, PCI_ANY_ID, 0, 0, },
+ { } /* terminate list */
+};
+MODULE_DEVICE_TABLE(pci, carm_pci_tbl);
+
+static struct pci_driver carm_driver = {
+ .name = DRV_NAME,
+ .id_table = carm_pci_tbl,
+ .probe = carm_init_one,
+ .remove = carm_remove_one,
+};
+
+static const struct block_device_operations carm_bd_ops = {
+ .owner = THIS_MODULE,
+ .getgeo = carm_bdev_getgeo,
+};
+
+static unsigned int carm_host_id;
+static unsigned long carm_major_alloc;
+
+
+
+static int carm_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ struct carm_port *port = bdev->bd_disk->private_data;
+
+ geo->heads = (u8) port->dev_geom_head;
+ geo->sectors = (u8) port->dev_geom_sect;
+ geo->cylinders = port->dev_geom_cyl;
+ return 0;
+}
+
+static const u32 msg_sizes[] = { 32, 64, 128, CARM_MSG_SIZE };
+
+static inline int carm_lookup_bucket(u32 msg_size)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(msg_sizes); i++)
+ if (msg_size <= msg_sizes[i])
+ return i;
+
+ return -ENOENT;
+}
+
+static void carm_init_buckets(void __iomem *mmio)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(msg_sizes); i++)
+ writel(msg_sizes[i], mmio + CARM_CMS0 + (4 * i));
+}
+
+static inline void *carm_ref_msg(struct carm_host *host,
+ unsigned int msg_idx)
+{
+ return host->msg_base + (msg_idx * CARM_MSG_SIZE);
+}
+
+static inline dma_addr_t carm_ref_msg_dma(struct carm_host *host,
+ unsigned int msg_idx)
+{
+ return host->msg_dma + (msg_idx * CARM_MSG_SIZE);
+}
+
+static int carm_send_msg(struct carm_host *host,
+ struct carm_request *crq)
+{
+ void __iomem *mmio = host->mmio;
+ u32 msg = (u32) carm_ref_msg_dma(host, crq->tag);
+ u32 cm_bucket = crq->msg_bucket;
+ u32 tmp;
+ int rc = 0;
+
+ VPRINTK("ENTER\n");
+
+ tmp = readl(mmio + CARM_HMUC);
+ if (tmp & CARM_Q_FULL) {
+#if 0
+ tmp = readl(mmio + CARM_INT_MASK);
+ tmp |= INT_Q_AVAILABLE;
+ writel(tmp, mmio + CARM_INT_MASK);
+ readl(mmio + CARM_INT_MASK); /* flush */
+#endif
+ DPRINTK("host msg queue full\n");
+ rc = -EBUSY;
+ } else {
+ writel(msg | (cm_bucket << 1), mmio + CARM_IHQP);
+ readl(mmio + CARM_IHQP); /* flush */
+ }
+
+ return rc;
+}
+
+static struct carm_request *carm_get_request(struct carm_host *host)
+{
+ unsigned int i;
+
+ /* obey global hardware limit on S/G entries */
+ if (host->hw_sg_used >= (CARM_MAX_HOST_SG - CARM_MAX_REQ_SG))
+ return NULL;
+
+ for (i = 0; i < max_queue; i++)
+ if ((host->msg_alloc & (1ULL << i)) == 0) {
+ struct carm_request *crq = &host->req[i];
+ crq->port = NULL;
+ crq->n_elem = 0;
+
+ host->msg_alloc |= (1ULL << i);
+ host->n_msgs++;
+
+ assert(host->n_msgs <= CARM_MAX_REQ);
+ sg_init_table(crq->sg, CARM_MAX_REQ_SG);
+ return crq;
+ }
+
+ DPRINTK("no request available, returning NULL\n");
+ return NULL;
+}
+
+static int carm_put_request(struct carm_host *host, struct carm_request *crq)
+{
+ assert(crq->tag < max_queue);
+
+ if (unlikely((host->msg_alloc & (1ULL << crq->tag)) == 0))
+ return -EINVAL; /* tried to clear a tag that was not active */
+
+ assert(host->hw_sg_used >= crq->n_elem);
+
+ host->msg_alloc &= ~(1ULL << crq->tag);
+ host->hw_sg_used -= crq->n_elem;
+ host->n_msgs--;
+
+ return 0;
+}
+
+static struct carm_request *carm_get_special(struct carm_host *host)
+{
+ unsigned long flags;
+ struct carm_request *crq = NULL;
+ struct request *rq;
+ int tries = 5000;
+
+ while (tries-- > 0) {
+ spin_lock_irqsave(&host->lock, flags);
+ crq = carm_get_request(host);
+ spin_unlock_irqrestore(&host->lock, flags);
+
+ if (crq)
+ break;
+ msleep(10);
+ }
+
+ if (!crq)
+ return NULL;
+
+ rq = blk_get_request(host->oob_q, WRITE /* bogus */, GFP_KERNEL);
+ if (IS_ERR(rq)) {
+ spin_lock_irqsave(&host->lock, flags);
+ carm_put_request(host, crq);
+ spin_unlock_irqrestore(&host->lock, flags);
+ return NULL;
+ }
+
+ crq->rq = rq;
+ return crq;
+}
+
+static int carm_array_info (struct carm_host *host, unsigned int array_idx)
+{
+ struct carm_msg_ioctl *ioc;
+ unsigned int idx;
+ u32 msg_data;
+ dma_addr_t msg_dma;
+ struct carm_request *crq;
+ int rc;
+
+ crq = carm_get_special(host);
+ if (!crq) {
+ rc = -ENOMEM;
+ goto err_out;
+ }
+
+ idx = crq->tag;
+
+ ioc = carm_ref_msg(host, idx);
+ msg_dma = carm_ref_msg_dma(host, idx);
+ msg_data = (u32) (msg_dma + sizeof(struct carm_array_info));
+
+ crq->msg_type = CARM_MSG_ARRAY;
+ crq->msg_subtype = CARM_ARRAY_INFO;
+ rc = carm_lookup_bucket(sizeof(struct carm_msg_ioctl) +
+ sizeof(struct carm_array_info));
+ BUG_ON(rc < 0);
+ crq->msg_bucket = (u32) rc;
+
+ memset(ioc, 0, sizeof(*ioc));
+ ioc->type = CARM_MSG_ARRAY;
+ ioc->subtype = CARM_ARRAY_INFO;
+ ioc->array_id = (u8) array_idx;
+ ioc->handle = cpu_to_le32(TAG_ENCODE(idx));
+ ioc->data_addr = cpu_to_le32(msg_data);
+
+ spin_lock_irq(&host->lock);
+ assert(host->state == HST_DEV_SCAN_START ||
+ host->state == HST_DEV_SCAN);
+ spin_unlock_irq(&host->lock);
+
+ DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
+ crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+ crq->rq->special = crq;
+ blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
+
+ return 0;
+
+err_out:
+ spin_lock_irq(&host->lock);
+ host->state = HST_ERROR;
+ spin_unlock_irq(&host->lock);
+ return rc;
+}
+
+typedef unsigned int (*carm_sspc_t)(struct carm_host *, unsigned int, void *);
+
+static int carm_send_special (struct carm_host *host, carm_sspc_t func)
+{
+ struct carm_request *crq;
+ struct carm_msg_ioctl *ioc;
+ void *mem;
+ unsigned int idx, msg_size;
+ int rc;
+
+ crq = carm_get_special(host);
+ if (!crq)
+ return -ENOMEM;
+
+ idx = crq->tag;
+
+ mem = carm_ref_msg(host, idx);
+
+ msg_size = func(host, idx, mem);
+
+ ioc = mem;
+ crq->msg_type = ioc->type;
+ crq->msg_subtype = ioc->subtype;
+ rc = carm_lookup_bucket(msg_size);
+ BUG_ON(rc < 0);
+ crq->msg_bucket = (u32) rc;
+
+ DPRINTK("blk_execute_rq_nowait, tag == %u\n", idx);
+ crq->rq->cmd_type = REQ_TYPE_SPECIAL;
+ crq->rq->special = crq;
+ blk_execute_rq_nowait(host->oob_q, NULL, crq->rq, true, NULL);
+
+ return 0;
+}
+
+static unsigned int carm_fill_sync_time(struct carm_host *host,
+ unsigned int idx, void *mem)
+{
+ struct timeval tv;
+ struct carm_msg_sync_time *st = mem;
+
+ do_gettimeofday(&tv);
+
+ memset(st, 0, sizeof(*st));
+ st->type = CARM_MSG_MISC;
+ st->subtype = MISC_SET_TIME;
+ st->handle = cpu_to_le32(TAG_ENCODE(idx));
+ st->timestamp = cpu_to_le32(tv.tv_sec);
+
+ return sizeof(struct carm_msg_sync_time);
+}
+
+static unsigned int carm_fill_alloc_buf(struct carm_host *host,
+ unsigned int idx, void *mem)
+{
+ struct carm_msg_allocbuf *ab = mem;
+
+ memset(ab, 0, sizeof(*ab));
+ ab->type = CARM_MSG_MISC;
+ ab->subtype = MISC_ALLOC_MEM;
+ ab->handle = cpu_to_le32(TAG_ENCODE(idx));
+ ab->n_sg = 1;
+ ab->sg_type = SGT_32BIT;
+ ab->addr = cpu_to_le32(host->shm_dma + (PDC_SHM_SIZE >> 1));
+ ab->len = cpu_to_le32(PDC_SHM_SIZE >> 1);
+ ab->evt_pool = cpu_to_le32(host->shm_dma + (16 * 1024));
+ ab->n_evt = cpu_to_le32(1024);
+ ab->rbuf_pool = cpu_to_le32(host->shm_dma);
+ ab->n_rbuf = cpu_to_le32(RMSG_Q_LEN);
+ ab->msg_pool = cpu_to_le32(host->shm_dma + RBUF_LEN);
+ ab->n_msg = cpu_to_le32(CARM_Q_LEN);
+ ab->sg[0].start = cpu_to_le32(host->shm_dma + (PDC_SHM_SIZE >> 1));
+ ab->sg[0].len = cpu_to_le32(65536);
+
+ return sizeof(struct carm_msg_allocbuf);
+}
+
+static unsigned int carm_fill_scan_channels(struct carm_host *host,
+ unsigned int idx, void *mem)
+{
+ struct carm_msg_ioctl *ioc = mem;
+ u32 msg_data = (u32) (carm_ref_msg_dma(host, idx) +
+ IOC_SCAN_CHAN_OFFSET);
+
+ memset(ioc, 0, sizeof(*ioc));
+ ioc->type = CARM_MSG_IOCTL;
+ ioc->subtype = CARM_IOC_SCAN_CHAN;
+ ioc->handle = cpu_to_le32(TAG_ENCODE(idx));
+ ioc->data_addr = cpu_to_le32(msg_data);
+
+ /* fill output data area with "no device" default values */
+ mem += IOC_SCAN_CHAN_OFFSET;
+ memset(mem, IOC_SCAN_CHAN_NODEV, CARM_MAX_PORTS);
+
+ return IOC_SCAN_CHAN_OFFSET + CARM_MAX_PORTS;
+}
+
+static unsigned int carm_fill_get_fw_ver(struct carm_host *host,
+ unsigned int idx, void *mem)
+{
+ struct carm_msg_get_fw_ver *ioc = mem;
+ u32 msg_data = (u32) (carm_ref_msg_dma(host, idx) + sizeof(*ioc));
+
+ memset(ioc, 0, sizeof(*ioc));
+ ioc->type = CARM_MSG_MISC;
+ ioc->subtype = MISC_GET_FW_VER;
+ ioc->handle = cpu_to_le32(TAG_ENCODE(idx));
+ ioc->data_addr = cpu_to_le32(msg_data);
+
+ return sizeof(struct carm_msg_get_fw_ver) +
+ sizeof(struct carm_fw_ver);
+}
+
+static inline void carm_end_request_queued(struct carm_host *host,
+ struct carm_request *crq,
+ int error)
+{
+ struct request *req = crq->rq;
+ int rc;
+
+ __blk_end_request_all(req, error);
+
+ rc = carm_put_request(host, crq);
+ assert(rc == 0);
+}
+
+static inline void carm_push_q (struct carm_host *host, struct request_queue *q)
+{
+ unsigned int idx = host->wait_q_prod % CARM_MAX_WAIT_Q;
+
+ blk_stop_queue(q);
+ VPRINTK("STOPPED QUEUE %p\n", q);
+
+ host->wait_q[idx] = q;
+ host->wait_q_prod++;
+ BUG_ON(host->wait_q_prod == host->wait_q_cons); /* overrun */
+}
+
+static inline struct request_queue *carm_pop_q(struct carm_host *host)
+{
+ unsigned int idx;
+
+ if (host->wait_q_prod == host->wait_q_cons)
+ return NULL;
+
+ idx = host->wait_q_cons % CARM_MAX_WAIT_Q;
+ host->wait_q_cons++;
+
+ return host->wait_q[idx];
+}
+
+static inline void carm_round_robin(struct carm_host *host)
+{
+ struct request_queue *q = carm_pop_q(host);
+ if (q) {
+ blk_start_queue(q);
+ VPRINTK("STARTED QUEUE %p\n", q);
+ }
+}
+
+static inline void carm_end_rq(struct carm_host *host, struct carm_request *crq,
+ int error)
+{
+ carm_end_request_queued(host, crq, error);
+ if (max_queue == 1)
+ carm_round_robin(host);
+ else if ((host->n_msgs <= CARM_MSG_LOW_WATER) &&
+ (host->hw_sg_used <= CARM_SG_LOW_WATER)) {
+ carm_round_robin(host);
+ }
+}
+
+static void carm_oob_rq_fn(struct request_queue *q)
+{
+ struct carm_host *host = q->queuedata;
+ struct carm_request *crq;
+ struct request *rq;
+ int rc;
+
+ while (1) {
+ DPRINTK("get req\n");
+ rq = blk_fetch_request(q);
+ if (!rq)
+ break;
+
+ crq = rq->special;
+ assert(crq != NULL);
+ assert(crq->rq == rq);
+
+ crq->n_elem = 0;
+
+ DPRINTK("send req\n");
+ rc = carm_send_msg(host, crq);
+ if (rc) {
+ blk_requeue_request(q, rq);
+ carm_push_q(host, q);
+ return; /* call us again later, eventually */
+ }
+ }
+}
+
+static void carm_rq_fn(struct request_queue *q)
+{
+ struct carm_port *port = q->queuedata;
+ struct carm_host *host = port->host;
+ struct carm_msg_rw *msg;
+ struct carm_request *crq;
+ struct request *rq;
+ struct scatterlist *sg;
+ int writing = 0, pci_dir, i, n_elem, rc;
+ u32 tmp;
+ unsigned int msg_size;
+
+queue_one_request:
+ VPRINTK("get req\n");
+ rq = blk_peek_request(q);
+ if (!rq)
+ return;
+
+ crq = carm_get_request(host);
+ if (!crq) {
+ carm_push_q(host, q);
+ return; /* call us again later, eventually */
+ }
+ crq->rq = rq;
+
+ blk_start_request(rq);
+
+ if (rq_data_dir(rq) == WRITE) {
+ writing = 1;
+ pci_dir = PCI_DMA_TODEVICE;
+ } else {
+ pci_dir = PCI_DMA_FROMDEVICE;
+ }
+
+ /* get scatterlist from block layer */
+ sg = &crq->sg[0];
+ n_elem = blk_rq_map_sg(q, rq, sg);
+ if (n_elem <= 0) {
+ carm_end_rq(host, crq, -EIO);
+ return; /* request with no s/g entries? */
+ }
+
+ /* map scatterlist to PCI bus addresses */
+ n_elem = pci_map_sg(host->pdev, sg, n_elem, pci_dir);
+ if (n_elem <= 0) {
+ carm_end_rq(host, crq, -EIO);
+ return; /* request with no s/g entries? */
+ }
+ crq->n_elem = n_elem;
+ crq->port = port;
+ host->hw_sg_used += n_elem;
+
+ /*
+ * build read/write message
+ */
+
+ VPRINTK("build msg\n");
+ msg = (struct carm_msg_rw *) carm_ref_msg(host, crq->tag);
+
+ if (writing) {
+ msg->type = CARM_MSG_WRITE;
+ crq->msg_type = CARM_MSG_WRITE;
+ } else {
+ msg->type = CARM_MSG_READ;
+ crq->msg_type = CARM_MSG_READ;
+ }
+
+ msg->id = port->port_no;
+ msg->sg_count = n_elem;
+ msg->sg_type = SGT_32BIT;
+ msg->handle = cpu_to_le32(TAG_ENCODE(crq->tag));
+ msg->lba = cpu_to_le32(blk_rq_pos(rq) & 0xffffffff);
+ tmp = (blk_rq_pos(rq) >> 16) >> 16;
+ msg->lba_high = cpu_to_le16( (u16) tmp );
+ msg->lba_count = cpu_to_le16(blk_rq_sectors(rq));
+
+ msg_size = sizeof(struct carm_msg_rw) - sizeof(msg->sg);
+ for (i = 0; i < n_elem; i++) {
+ struct carm_msg_sg *carm_sg = &msg->sg[i];
+ carm_sg->start = cpu_to_le32(sg_dma_address(&crq->sg[i]));
+ carm_sg->len = cpu_to_le32(sg_dma_len(&crq->sg[i]));
+ msg_size += sizeof(struct carm_msg_sg);
+ }
+
+ rc = carm_lookup_bucket(msg_size);
+ BUG_ON(rc < 0);
+ crq->msg_bucket = (u32) rc;
+
+ /*
+ * queue read/write message to hardware
+ */
+
+ VPRINTK("send msg, tag == %u\n", crq->tag);
+ rc = carm_send_msg(host, crq);
+ if (rc) {
+ carm_put_request(host, crq);
+ blk_requeue_request(q, rq);
+ carm_push_q(host, q);
+ return; /* call us again later, eventually */
+ }
+
+ goto queue_one_request;
+}
+
+static void carm_handle_array_info(struct carm_host *host,
+ struct carm_request *crq, u8 *mem,
+ int error)
+{
+ struct carm_port *port;
+ u8 *msg_data = mem + sizeof(struct carm_array_info);
+ struct carm_array_info *desc = (struct carm_array_info *) msg_data;
+ u64 lo, hi;
+ int cur_port;
+ size_t slen;
+
+ DPRINTK("ENTER\n");
+
+ carm_end_rq(host, crq, error);
+
+ if (error)
+ goto out;
+ if (le32_to_cpu(desc->array_status) & ARRAY_NO_EXIST)
+ goto out;
+
+ cur_port = host->cur_scan_dev;
+
+ /* should never occur */
+ if ((cur_port < 0) || (cur_port >= CARM_MAX_PORTS)) {
+ printk(KERN_ERR PFX "BUG: cur_scan_dev==%d, array_id==%d\n",
+ cur_port, (int) desc->array_id);
+ goto out;
+ }
+
+ port = &host->port[cur_port];
+
+ lo = (u64) le32_to_cpu(desc->size);
+ hi = (u64) le16_to_cpu(desc->size_hi);
+
+ port->capacity = lo | (hi << 32);
+ port->dev_geom_head = le16_to_cpu(desc->head);
+ port->dev_geom_sect = le16_to_cpu(desc->sect);
+ port->dev_geom_cyl = le16_to_cpu(desc->cyl);
+
+ host->dev_active |= (1 << cur_port);
+
+ strncpy(port->name, desc->name, sizeof(port->name));
+ port->name[sizeof(port->name) - 1] = 0;
+ slen = strlen(port->name);
+ while (slen && (port->name[slen - 1] == ' ')) {
+ port->name[slen - 1] = 0;
+ slen--;
+ }
+
+ printk(KERN_INFO DRV_NAME "(%s): port %u device %Lu sectors\n",
+ pci_name(host->pdev), port->port_no,
+ (unsigned long long) port->capacity);
+ printk(KERN_INFO DRV_NAME "(%s): port %u device \"%s\"\n",
+ pci_name(host->pdev), port->port_no, port->name);
+
+out:
+ assert(host->state == HST_DEV_SCAN);
+ schedule_work(&host->fsm_task);
+}
+
+static void carm_handle_scan_chan(struct carm_host *host,
+ struct carm_request *crq, u8 *mem,
+ int error)
+{
+ u8 *msg_data = mem + IOC_SCAN_CHAN_OFFSET;
+ unsigned int i, dev_count = 0;
+ int new_state = HST_DEV_SCAN_START;
+
+ DPRINTK("ENTER\n");
+
+ carm_end_rq(host, crq, error);
+
+ if (error) {
+ new_state = HST_ERROR;
+ goto out;
+ }
+
+ /* TODO: scan and support non-disk devices */
+ for (i = 0; i < 8; i++)
+ if (msg_data[i] == 0) { /* direct-access device (disk) */
+ host->dev_present |= (1 << i);
+ dev_count++;
+ }
+
+ printk(KERN_INFO DRV_NAME "(%s): found %u interesting devices\n",
+ pci_name(host->pdev), dev_count);
+
+out:
+ assert(host->state == HST_PORT_SCAN);
+ host->state = new_state;
+ schedule_work(&host->fsm_task);
+}
+
+static void carm_handle_generic(struct carm_host *host,
+ struct carm_request *crq, int error,
+ int cur_state, int next_state)
+{
+ DPRINTK("ENTER\n");
+
+ carm_end_rq(host, crq, error);
+
+ assert(host->state == cur_state);
+ if (error)
+ host->state = HST_ERROR;
+ else
+ host->state = next_state;
+ schedule_work(&host->fsm_task);
+}
+
+static inline void carm_handle_rw(struct carm_host *host,
+ struct carm_request *crq, int error)
+{
+ int pci_dir;
+
+ VPRINTK("ENTER\n");
+
+ if (rq_data_dir(crq->rq) == WRITE)
+ pci_dir = PCI_DMA_TODEVICE;
+ else
+ pci_dir = PCI_DMA_FROMDEVICE;
+
+ pci_unmap_sg(host->pdev, &crq->sg[0], crq->n_elem, pci_dir);
+
+ carm_end_rq(host, crq, error);
+}
+
+static inline void carm_handle_resp(struct carm_host *host,
+ __le32 ret_handle_le, u32 status)
+{
+ u32 handle = le32_to_cpu(ret_handle_le);
+ unsigned int msg_idx;
+ struct carm_request *crq;
+ int error = (status == RMSG_OK) ? 0 : -EIO;
+ u8 *mem;
+
+ VPRINTK("ENTER, handle == 0x%x\n", handle);
+
+ if (unlikely(!TAG_VALID(handle))) {
+ printk(KERN_ERR DRV_NAME "(%s): BUG: invalid tag 0x%x\n",
+ pci_name(host->pdev), handle);
+ return;
+ }
+
+ msg_idx = TAG_DECODE(handle);
+ VPRINTK("tag == %u\n", msg_idx);
+
+ crq = &host->req[msg_idx];
+
+ /* fast path */
+ if (likely(crq->msg_type == CARM_MSG_READ ||
+ crq->msg_type == CARM_MSG_WRITE)) {
+ carm_handle_rw(host, crq, error);
+ return;
+ }
+
+ mem = carm_ref_msg(host, msg_idx);
+
+ switch (crq->msg_type) {
+ case CARM_MSG_IOCTL: {
+ switch (crq->msg_subtype) {
+ case CARM_IOC_SCAN_CHAN:
+ carm_handle_scan_chan(host, crq, mem, error);
+ break;
+ default:
+ /* unknown / invalid response */
+ goto err_out;
+ }
+ break;
+ }
+
+ case CARM_MSG_MISC: {
+ switch (crq->msg_subtype) {
+ case MISC_ALLOC_MEM:
+ carm_handle_generic(host, crq, error,
+ HST_ALLOC_BUF, HST_SYNC_TIME);
+ break;
+ case MISC_SET_TIME:
+ carm_handle_generic(host, crq, error,
+ HST_SYNC_TIME, HST_GET_FW_VER);
+ break;
+ case MISC_GET_FW_VER: {
+ struct carm_fw_ver *ver = (struct carm_fw_ver *)
+ (mem + sizeof(struct carm_msg_get_fw_ver));
+ if (!error) {
+ host->fw_ver = le32_to_cpu(ver->version);
+ host->flags |= (ver->features & FL_FW_VER_MASK);
+ }
+ carm_handle_generic(host, crq, error,
+ HST_GET_FW_VER, HST_PORT_SCAN);
+ break;
+ }
+ default:
+ /* unknown / invalid response */
+ goto err_out;
+ }
+ break;
+ }
+
+ case CARM_MSG_ARRAY: {
+ switch (crq->msg_subtype) {
+ case CARM_ARRAY_INFO:
+ carm_handle_array_info(host, crq, mem, error);
+ break;
+ default:
+ /* unknown / invalid response */
+ goto err_out;
+ }
+ break;
+ }
+
+ default:
+ /* unknown / invalid response */
+ goto err_out;
+ }
+
+ return;
+
+err_out:
+ printk(KERN_WARNING DRV_NAME "(%s): BUG: unhandled message type %d/%d\n",
+ pci_name(host->pdev), crq->msg_type, crq->msg_subtype);
+ carm_end_rq(host, crq, -EIO);
+}
+
+static inline void carm_handle_responses(struct carm_host *host)
+{
+ void __iomem *mmio = host->mmio;
+ struct carm_response *resp = (struct carm_response *) host->shm;
+ unsigned int work = 0;
+ unsigned int idx = host->resp_idx % RMSG_Q_LEN;
+
+ while (1) {
+ u32 status = le32_to_cpu(resp[idx].status);
+
+ if (status == 0xffffffff) {
+ VPRINTK("ending response on index %u\n", idx);
+ writel(idx << 3, mmio + CARM_RESP_IDX);
+ break;
+ }
+
+ /* response to a message we sent */
+ else if ((status & (1 << 31)) == 0) {
+ VPRINTK("handling msg response on index %u\n", idx);
+ carm_handle_resp(host, resp[idx].ret_handle, status);
+ resp[idx].status = cpu_to_le32(0xffffffff);
+ }
+
+ /* asynchronous events the hardware throws our way */
+ else if ((status & 0xff000000) == (1 << 31)) {
+ u8 *evt_type_ptr = (u8 *) &resp[idx];
+ u8 evt_type = *evt_type_ptr;
+ printk(KERN_WARNING DRV_NAME "(%s): unhandled event type %d\n",
+ pci_name(host->pdev), (int) evt_type);
+ resp[idx].status = cpu_to_le32(0xffffffff);
+ }
+
+ idx = NEXT_RESP(idx);
+ work++;
+ }
+
+ VPRINTK("EXIT, work==%u\n", work);
+ host->resp_idx += work;
+}
+
+static irqreturn_t carm_interrupt(int irq, void *__host)
+{
+ struct carm_host *host = __host;
+ void __iomem *mmio;
+ u32 mask;
+ int handled = 0;
+ unsigned long flags;
+
+ if (!host) {
+ VPRINTK("no host\n");
+ return IRQ_NONE;
+ }
+
+ spin_lock_irqsave(&host->lock, flags);
+
+ mmio = host->mmio;
+
+ /* reading should also clear interrupts */
+ mask = readl(mmio + CARM_INT_STAT);
+
+ if (mask == 0 || mask == 0xffffffff) {
+ VPRINTK("no work, mask == 0x%x\n", mask);
+ goto out;
+ }
+
+ if (mask & INT_ACK_MASK)
+ writel(mask, mmio + CARM_INT_STAT);
+
+ if (unlikely(host->state == HST_INVALID)) {
+ VPRINTK("not initialized yet, mask = 0x%x\n", mask);
+ goto out;
+ }
+
+ if (mask & CARM_HAVE_RESP) {
+ handled = 1;
+ carm_handle_responses(host);
+ }
+
+out:
+ spin_unlock_irqrestore(&host->lock, flags);
+ VPRINTK("EXIT\n");
+ return IRQ_RETVAL(handled);
+}
+
+static void carm_fsm_task (struct work_struct *work)
+{
+ struct carm_host *host =
+ container_of(work, struct carm_host, fsm_task);
+ unsigned long flags;
+ unsigned int state;
+ int rc, i, next_dev;
+ int reschedule = 0;
+ int new_state = HST_INVALID;
+
+ spin_lock_irqsave(&host->lock, flags);
+ state = host->state;
+ spin_unlock_irqrestore(&host->lock, flags);
+
+ DPRINTK("ENTER, state == %s\n", state_name[state]);
+
+ switch (state) {
+ case HST_PROBE_START:
+ new_state = HST_ALLOC_BUF;
+ reschedule = 1;
+ break;
+
+ case HST_ALLOC_BUF:
+ rc = carm_send_special(host, carm_fill_alloc_buf);
+ if (rc) {
+ new_state = HST_ERROR;
+ reschedule = 1;
+ }
+ break;
+
+ case HST_SYNC_TIME:
+ rc = carm_send_special(host, carm_fill_sync_time);
+ if (rc) {
+ new_state = HST_ERROR;
+ reschedule = 1;
+ }
+ break;
+
+ case HST_GET_FW_VER:
+ rc = carm_send_special(host, carm_fill_get_fw_ver);
+ if (rc) {
+ new_state = HST_ERROR;
+ reschedule = 1;
+ }
+ break;
+
+ case HST_PORT_SCAN:
+ rc = carm_send_special(host, carm_fill_scan_channels);
+ if (rc) {
+ new_state = HST_ERROR;
+ reschedule = 1;
+ }
+ break;
+
+ case HST_DEV_SCAN_START:
+ host->cur_scan_dev = -1;
+ new_state = HST_DEV_SCAN;
+ reschedule = 1;
+ break;
+
+ case HST_DEV_SCAN:
+ next_dev = -1;
+ for (i = host->cur_scan_dev + 1; i < CARM_MAX_PORTS; i++)
+ if (host->dev_present & (1 << i)) {
+ next_dev = i;
+ break;
+ }
+
+ if (next_dev >= 0) {
+ host->cur_scan_dev = next_dev;
+ rc = carm_array_info(host, next_dev);
+ if (rc) {
+ new_state = HST_ERROR;
+ reschedule = 1;
+ }
+ } else {
+ new_state = HST_DEV_ACTIVATE;
+ reschedule = 1;
+ }
+ break;
+
+ case HST_DEV_ACTIVATE: {
+ int activated = 0;
+ for (i = 0; i < CARM_MAX_PORTS; i++)
+ if (host->dev_active & (1 << i)) {
+ struct carm_port *port = &host->port[i];
+ struct gendisk *disk = port->disk;
+
+ set_capacity(disk, port->capacity);
+ add_disk(disk);
+ activated++;
+ }
+
+ printk(KERN_INFO DRV_NAME "(%s): %d ports activated\n",
+ pci_name(host->pdev), activated);
+
+ new_state = HST_PROBE_FINISHED;
+ reschedule = 1;
+ break;
+ }
+
+ case HST_PROBE_FINISHED:
+ complete(&host->probe_comp);
+ break;
+
+ case HST_ERROR:
+ /* FIXME: TODO */
+ break;
+
+ default:
+ /* should never occur */
+ printk(KERN_ERR PFX "BUG: unknown state %d\n", state);
+ assert(0);
+ break;
+ }
+
+ if (new_state != HST_INVALID) {
+ spin_lock_irqsave(&host->lock, flags);
+ host->state = new_state;
+ spin_unlock_irqrestore(&host->lock, flags);
+ }
+ if (reschedule)
+ schedule_work(&host->fsm_task);
+}
+
+static int carm_init_wait(void __iomem *mmio, u32 bits, unsigned int test_bit)
+{
+ unsigned int i;
+
+ for (i = 0; i < 50000; i++) {
+ u32 tmp = readl(mmio + CARM_LMUC);
+ udelay(100);
+
+ if (test_bit) {
+ if ((tmp & bits) == bits)
+ return 0;
+ } else {
+ if ((tmp & bits) == 0)
+ return 0;
+ }
+
+ cond_resched();
+ }
+
+ printk(KERN_ERR PFX "carm_init_wait timeout, bits == 0x%x, test_bit == %s\n",
+ bits, test_bit ? "yes" : "no");
+ return -EBUSY;
+}
+
+static void carm_init_responses(struct carm_host *host)
+{
+ void __iomem *mmio = host->mmio;
+ unsigned int i;
+ struct carm_response *resp = (struct carm_response *) host->shm;
+
+ for (i = 0; i < RMSG_Q_LEN; i++)
+ resp[i].status = cpu_to_le32(0xffffffff);
+
+ writel(0, mmio + CARM_RESP_IDX);
+}
+
+static int carm_init_host(struct carm_host *host)
+{
+ void __iomem *mmio = host->mmio;
+ u32 tmp;
+ u8 tmp8;
+ int rc;
+
+ DPRINTK("ENTER\n");
+
+ writel(0, mmio + CARM_INT_MASK);
+
+ tmp8 = readb(mmio + CARM_INITC);
+ if (tmp8 & 0x01) {
+ tmp8 &= ~0x01;
+ writeb(tmp8, mmio + CARM_INITC);
+ readb(mmio + CARM_INITC); /* flush */
+
+ DPRINTK("snooze...\n");
+ msleep(5000);
+ }
+
+ tmp = readl(mmio + CARM_HMUC);
+ if (tmp & CARM_CME) {
+ DPRINTK("CME bit present, waiting\n");
+ rc = carm_init_wait(mmio, CARM_CME, 1);
+ if (rc) {
+ DPRINTK("EXIT, carm_init_wait 1 failed\n");
+ return rc;
+ }
+ }
+ if (tmp & CARM_RME) {
+ DPRINTK("RME bit present, waiting\n");
+ rc = carm_init_wait(mmio, CARM_RME, 1);
+ if (rc) {
+ DPRINTK("EXIT, carm_init_wait 2 failed\n");
+ return rc;
+ }
+ }
+
+ tmp &= ~(CARM_RME | CARM_CME);
+ writel(tmp, mmio + CARM_HMUC);
+ readl(mmio + CARM_HMUC); /* flush */
+
+ rc = carm_init_wait(mmio, CARM_RME | CARM_CME, 0);
+ if (rc) {
+ DPRINTK("EXIT, carm_init_wait 3 failed\n");
+ return rc;
+ }
+
+ carm_init_buckets(mmio);
+
+ writel(host->shm_dma & 0xffffffff, mmio + RBUF_ADDR_LO);
+ writel((host->shm_dma >> 16) >> 16, mmio + RBUF_ADDR_HI);
+ writel(RBUF_LEN, mmio + RBUF_BYTE_SZ);
+
+ tmp = readl(mmio + CARM_HMUC);
+ tmp |= (CARM_RME | CARM_CME | CARM_WZBC);
+ writel(tmp, mmio + CARM_HMUC);
+ readl(mmio + CARM_HMUC); /* flush */
+
+ rc = carm_init_wait(mmio, CARM_RME | CARM_CME, 1);
+ if (rc) {
+ DPRINTK("EXIT, carm_init_wait 4 failed\n");
+ return rc;
+ }
+
+ writel(0, mmio + CARM_HMPHA);
+ writel(INT_DEF_MASK, mmio + CARM_INT_MASK);
+
+ carm_init_responses(host);
+
+ /* start initialization, probing state machine */
+ spin_lock_irq(&host->lock);
+ assert(host->state == HST_INVALID);
+ host->state = HST_PROBE_START;
+ spin_unlock_irq(&host->lock);
+ schedule_work(&host->fsm_task);
+
+ DPRINTK("EXIT\n");
+ return 0;
+}
+
+static int carm_init_disks(struct carm_host *host)
+{
+ unsigned int i;
+ int rc = 0;
+
+ for (i = 0; i < CARM_MAX_PORTS; i++) {
+ struct gendisk *disk;
+ struct request_queue *q;
+ struct carm_port *port;
+
+ port = &host->port[i];
+ port->host = host;
+ port->port_no = i;
+
+ disk = alloc_disk(CARM_MINORS_PER_MAJOR);
+ if (!disk) {
+ rc = -ENOMEM;
+ break;
+ }
+
+ port->disk = disk;
+ sprintf(disk->disk_name, DRV_NAME "/%u",
+ (unsigned int) (host->id * CARM_MAX_PORTS) + i);
+ disk->major = host->major;
+ disk->first_minor = i * CARM_MINORS_PER_MAJOR;
+ disk->fops = &carm_bd_ops;
+ disk->private_data = port;
+
+ q = blk_init_queue(carm_rq_fn, &host->lock);
+ if (!q) {
+ rc = -ENOMEM;
+ break;
+ }
+ disk->queue = q;
+ blk_queue_max_segments(q, CARM_MAX_REQ_SG);
+ blk_queue_segment_boundary(q, CARM_SG_BOUNDARY);
+
+ q->queuedata = port;
+ }
+
+ return rc;
+}
+
+static void carm_free_disks(struct carm_host *host)
+{
+ unsigned int i;
+
+ for (i = 0; i < CARM_MAX_PORTS; i++) {
+ struct gendisk *disk = host->port[i].disk;
+ if (disk) {
+ struct request_queue *q = disk->queue;
+
+ if (disk->flags & GENHD_FL_UP)
+ del_gendisk(disk);
+ if (q)
+ blk_cleanup_queue(q);
+ put_disk(disk);
+ }
+ }
+}
+
+static int carm_init_shm(struct carm_host *host)
+{
+ host->shm = pci_alloc_consistent(host->pdev, CARM_SHM_SIZE,
+ &host->shm_dma);
+ if (!host->shm)
+ return -ENOMEM;
+
+ host->msg_base = host->shm + RBUF_LEN;
+ host->msg_dma = host->shm_dma + RBUF_LEN;
+
+ memset(host->shm, 0xff, RBUF_LEN);
+ memset(host->msg_base, 0, PDC_SHM_SIZE - RBUF_LEN);
+
+ return 0;
+}
+
+static int carm_init_one (struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+ struct carm_host *host;
+ unsigned int pci_dac;
+ int rc;
+ struct request_queue *q;
+ unsigned int i;
+
+ printk_once(KERN_DEBUG DRV_NAME " version " DRV_VERSION "\n");
+
+ rc = pci_enable_device(pdev);
+ if (rc)
+ return rc;
+
+ rc = pci_request_regions(pdev, DRV_NAME);
+ if (rc)
+ goto err_out;
+
+#ifdef IF_64BIT_DMA_IS_POSSIBLE /* grrrr... */
+ rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (!rc) {
+ rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (rc) {
+ printk(KERN_ERR DRV_NAME "(%s): consistent DMA mask failure\n",
+ pci_name(pdev));
+ goto err_out_regions;
+ }
+ pci_dac = 1;
+ } else {
+#endif
+ rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+ if (rc) {
+ printk(KERN_ERR DRV_NAME "(%s): DMA mask failure\n",
+ pci_name(pdev));
+ goto err_out_regions;
+ }
+ pci_dac = 0;
+#ifdef IF_64BIT_DMA_IS_POSSIBLE /* grrrr... */
+ }
+#endif
+
+ host = kzalloc(sizeof(*host), GFP_KERNEL);
+ if (!host) {
+ printk(KERN_ERR DRV_NAME "(%s): memory alloc failure\n",
+ pci_name(pdev));
+ rc = -ENOMEM;
+ goto err_out_regions;
+ }
+
+ host->pdev = pdev;
+ host->flags = pci_dac ? FL_DAC : 0;
+ spin_lock_init(&host->lock);
+ INIT_WORK(&host->fsm_task, carm_fsm_task);
+ init_completion(&host->probe_comp);
+
+ for (i = 0; i < ARRAY_SIZE(host->req); i++)
+ host->req[i].tag = i;
+
+ host->mmio = ioremap(pci_resource_start(pdev, 0),
+ pci_resource_len(pdev, 0));
+ if (!host->mmio) {
+ printk(KERN_ERR DRV_NAME "(%s): MMIO alloc failure\n",
+ pci_name(pdev));
+ rc = -ENOMEM;
+ goto err_out_kfree;
+ }
+
+ rc = carm_init_shm(host);
+ if (rc) {
+ printk(KERN_ERR DRV_NAME "(%s): DMA SHM alloc failure\n",
+ pci_name(pdev));
+ goto err_out_iounmap;
+ }
+
+ q = blk_init_queue(carm_oob_rq_fn, &host->lock);
+ if (!q) {
+ printk(KERN_ERR DRV_NAME "(%s): OOB queue alloc failure\n",
+ pci_name(pdev));
+ rc = -ENOMEM;
+ goto err_out_pci_free;
+ }
+ host->oob_q = q;
+ q->queuedata = host;
+
+ /*
+ * Figure out which major to use: 160, 161, or dynamic
+ */
+ if (!test_and_set_bit(0, &carm_major_alloc))
+ host->major = 160;
+ else if (!test_and_set_bit(1, &carm_major_alloc))
+ host->major = 161;
+ else
+ host->flags |= FL_DYN_MAJOR;
+
+ host->id = carm_host_id;
+ sprintf(host->name, DRV_NAME "%d", carm_host_id);
+
+ rc = register_blkdev(host->major, host->name);
+ if (rc < 0)
+ goto err_out_free_majors;
+ if (host->flags & FL_DYN_MAJOR)
+ host->major = rc;
+
+ rc = carm_init_disks(host);
+ if (rc)
+ goto err_out_blkdev_disks;
+
+ pci_set_master(pdev);
+
+ rc = request_irq(pdev->irq, carm_interrupt, IRQF_SHARED, DRV_NAME, host);
+ if (rc) {
+ printk(KERN_ERR DRV_NAME "(%s): irq alloc failure\n",
+ pci_name(pdev));
+ goto err_out_blkdev_disks;
+ }
+
+ rc = carm_init_host(host);
+ if (rc)
+ goto err_out_free_irq;
+
+ DPRINTK("waiting for probe_comp\n");
+ wait_for_completion(&host->probe_comp);
+
+ printk(KERN_INFO "%s: pci %s, ports %d, io %llx, irq %u, major %d\n",
+ host->name, pci_name(pdev), (int) CARM_MAX_PORTS,
+ (unsigned long long)pci_resource_start(pdev, 0),
+ pdev->irq, host->major);
+
+ carm_host_id++;
+ pci_set_drvdata(pdev, host);
+ return 0;
+
+err_out_free_irq:
+ free_irq(pdev->irq, host);
+err_out_blkdev_disks:
+ carm_free_disks(host);
+ unregister_blkdev(host->major, host->name);
+err_out_free_majors:
+ if (host->major == 160)
+ clear_bit(0, &carm_major_alloc);
+ else if (host->major == 161)
+ clear_bit(1, &carm_major_alloc);
+ blk_cleanup_queue(host->oob_q);
+err_out_pci_free:
+ pci_free_consistent(pdev, CARM_SHM_SIZE, host->shm, host->shm_dma);
+err_out_iounmap:
+ iounmap(host->mmio);
+err_out_kfree:
+ kfree(host);
+err_out_regions:
+ pci_release_regions(pdev);
+err_out:
+ pci_disable_device(pdev);
+ return rc;
+}
+
+static void carm_remove_one (struct pci_dev *pdev)
+{
+ struct carm_host *host = pci_get_drvdata(pdev);
+
+ if (!host) {
+ printk(KERN_ERR PFX "BUG: no host data for PCI(%s)\n",
+ pci_name(pdev));
+ return;
+ }
+
+ free_irq(pdev->irq, host);
+ carm_free_disks(host);
+ unregister_blkdev(host->major, host->name);
+ if (host->major == 160)
+ clear_bit(0, &carm_major_alloc);
+ else if (host->major == 161)
+ clear_bit(1, &carm_major_alloc);
+ blk_cleanup_queue(host->oob_q);
+ pci_free_consistent(pdev, CARM_SHM_SIZE, host->shm, host->shm_dma);
+ iounmap(host->mmio);
+ kfree(host);
+ pci_release_regions(pdev);
+ pci_disable_device(pdev);
+}
+
+module_pci_driver(carm_driver);
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
new file mode 100644
index 000000000..4cf81b5bf
--- /dev/null
+++ b/drivers/block/umem.c
@@ -0,0 +1,1136 @@
+/*
+ * mm.c - Micro Memory(tm) PCI memory board block device driver - v2.3
+ *
+ * (C) 2001 San Mehat <nettwerk@valinux.com>
+ * (C) 2001 Johannes Erdfelt <jerdfelt@valinux.com>
+ * (C) 2001 NeilBrown <neilb@cse.unsw.edu.au>
+ *
+ * This driver for the Micro Memory PCI Memory Module with Battery Backup
+ * is Copyright Micro Memory Inc 2001-2002. All rights reserved.
+ *
+ * This driver is released to the public under the terms of the
+ * GNU GENERAL PUBLIC LICENSE version 2
+ * See the file COPYING for details.
+ *
+ * This driver provides a standard block device interface for Micro Memory(tm)
+ * PCI based RAM boards.
+ * 10/05/01: Phap Nguyen - Rebuilt the driver
+ * 10/22/01: Phap Nguyen - v2.1 Added disk partitioning
+ * 29oct2001:NeilBrown - Use make_request_fn instead of request_fn
+ * - use stand disk partitioning (so fdisk works).
+ * 08nov2001:NeilBrown - change driver name from "mm" to "umem"
+ * - incorporate into main kernel
+ * 08apr2002:NeilBrown - Move some of interrupt handle to tasklet
+ * - use spin_lock_bh instead of _irq
+ * - Never block on make_request. queue
+ * bh's instead.
+ * - unregister umem from devfs at mod unload
+ * - Change version to 2.3
+ * 07Nov2001:Phap Nguyen - Select pci read command: 06, 12, 15 (Decimal)
+ * 07Jan2002: P. Nguyen - Used PCI Memory Write & Invalidate for DMA
+ * 15May2002:NeilBrown - convert to bio for 2.5
+ * 17May2002:NeilBrown - remove init_mem initialisation. Instead detect
+ * - a sequence of writes that cover the card, and
+ * - set initialised bit then.
+ */
+
+#undef DEBUG /* #define DEBUG if you want debugging info (pr_debug) */
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/gfp.h>
+#include <linux/ioctl.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/timer.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+
+#include <linux/fcntl.h> /* O_ACCMODE */
+#include <linux/hdreg.h> /* HDIO_GETGEO */
+
+#include "umem.h"
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#define MM_MAXCARDS 4
+#define MM_RAHEAD 2 /* two sectors */
+#define MM_BLKSIZE 1024 /* 1k blocks */
+#define MM_HARDSECT 512 /* 512-byte hardware sectors */
+#define MM_SHIFT 6 /* max 64 partitions on 4 cards */
+
+/*
+ * Version Information
+ */
+
+#define DRIVER_NAME "umem"
+#define DRIVER_VERSION "v2.3"
+#define DRIVER_AUTHOR "San Mehat, Johannes Erdfelt, NeilBrown"
+#define DRIVER_DESC "Micro Memory(tm) PCI memory board block driver"
+
+static int debug;
+/* #define HW_TRACE(x) writeb(x,cards[0].csr_remap + MEMCTRLSTATUS_MAGIC) */
+#define HW_TRACE(x)
+
+#define DEBUG_LED_ON_TRANSFER 0x01
+#define DEBUG_BATTERY_POLLING 0x02
+
+module_param(debug, int, 0644);
+MODULE_PARM_DESC(debug, "Debug bitmask");
+
+static int pci_read_cmd = 0x0C; /* Read Multiple */
+module_param(pci_read_cmd, int, 0);
+MODULE_PARM_DESC(pci_read_cmd, "PCI read command");
+
+static int pci_write_cmd = 0x0F; /* Write and Invalidate */
+module_param(pci_write_cmd, int, 0);
+MODULE_PARM_DESC(pci_write_cmd, "PCI write command");
+
+static int pci_cmds;
+
+static int major_nr;
+
+#include <linux/blkdev.h>
+#include <linux/blkpg.h>
+
+struct cardinfo {
+ struct pci_dev *dev;
+
+ unsigned char __iomem *csr_remap;
+ unsigned int mm_size; /* size in kbytes */
+
+ unsigned int init_size; /* initial segment, in sectors,
+ * that we know to
+ * have been written
+ */
+ struct bio *bio, *currentbio, **biotail;
+ struct bvec_iter current_iter;
+
+ struct request_queue *queue;
+
+ struct mm_page {
+ dma_addr_t page_dma;
+ struct mm_dma_desc *desc;
+ int cnt, headcnt;
+ struct bio *bio, **biotail;
+ struct bvec_iter iter;
+ } mm_pages[2];
+#define DESC_PER_PAGE ((PAGE_SIZE*2)/sizeof(struct mm_dma_desc))
+
+ int Active, Ready;
+
+ struct tasklet_struct tasklet;
+ unsigned int dma_status;
+
+ struct {
+ int good;
+ int warned;
+ unsigned long last_change;
+ } battery[2];
+
+ spinlock_t lock;
+ int check_batteries;
+
+ int flags;
+};
+
+static struct cardinfo cards[MM_MAXCARDS];
+static struct timer_list battery_timer;
+
+static int num_cards;
+
+static struct gendisk *mm_gendisk[MM_MAXCARDS];
+
+static void check_batteries(struct cardinfo *card);
+
+static int get_userbit(struct cardinfo *card, int bit)
+{
+ unsigned char led;
+
+ led = readb(card->csr_remap + MEMCTRLCMD_LEDCTRL);
+ return led & bit;
+}
+
+static int set_userbit(struct cardinfo *card, int bit, unsigned char state)
+{
+ unsigned char led;
+
+ led = readb(card->csr_remap + MEMCTRLCMD_LEDCTRL);
+ if (state)
+ led |= bit;
+ else
+ led &= ~bit;
+ writeb(led, card->csr_remap + MEMCTRLCMD_LEDCTRL);
+
+ return 0;
+}
+
+/*
+ * NOTE: For the power LED, use the LED_POWER_* macros since they differ
+ */
+static void set_led(struct cardinfo *card, int shift, unsigned char state)
+{
+ unsigned char led;
+
+ led = readb(card->csr_remap + MEMCTRLCMD_LEDCTRL);
+ if (state == LED_FLIP)
+ led ^= (1<<shift);
+ else {
+ led &= ~(0x03 << shift);
+ led |= (state << shift);
+ }
+ writeb(led, card->csr_remap + MEMCTRLCMD_LEDCTRL);
+
+}
+
+#ifdef MM_DIAG
+static void dump_regs(struct cardinfo *card)
+{
+ unsigned char *p;
+ int i, i1;
+
+ p = card->csr_remap;
+ for (i = 0; i < 8; i++) {
+ printk(KERN_DEBUG "%p ", p);
+
+ for (i1 = 0; i1 < 16; i1++)
+ printk("%02x ", *p++);
+
+ printk("\n");
+ }
+}
+#endif
+
+static void dump_dmastat(struct cardinfo *card, unsigned int dmastat)
+{
+ dev_printk(KERN_DEBUG, &card->dev->dev, "DMAstat - ");
+ if (dmastat & DMASCR_ANY_ERR)
+ printk(KERN_CONT "ANY_ERR ");
+ if (dmastat & DMASCR_MBE_ERR)
+ printk(KERN_CONT "MBE_ERR ");
+ if (dmastat & DMASCR_PARITY_ERR_REP)
+ printk(KERN_CONT "PARITY_ERR_REP ");
+ if (dmastat & DMASCR_PARITY_ERR_DET)
+ printk(KERN_CONT "PARITY_ERR_DET ");
+ if (dmastat & DMASCR_SYSTEM_ERR_SIG)
+ printk(KERN_CONT "SYSTEM_ERR_SIG ");
+ if (dmastat & DMASCR_TARGET_ABT)
+ printk(KERN_CONT "TARGET_ABT ");
+ if (dmastat & DMASCR_MASTER_ABT)
+ printk(KERN_CONT "MASTER_ABT ");
+ if (dmastat & DMASCR_CHAIN_COMPLETE)
+ printk(KERN_CONT "CHAIN_COMPLETE ");
+ if (dmastat & DMASCR_DMA_COMPLETE)
+ printk(KERN_CONT "DMA_COMPLETE ");
+ printk("\n");
+}
+
+/*
+ * Theory of request handling
+ *
+ * Each bio is assigned to one mm_dma_desc - which may not be enough FIXME
+ * We have two pages of mm_dma_desc, holding about 64 descriptors
+ * each. These are allocated at init time.
+ * One page is "Ready" and is either full, or can have request added.
+ * The other page might be "Active", which DMA is happening on it.
+ *
+ * Whenever IO on the active page completes, the Ready page is activated
+ * and the ex-Active page is clean out and made Ready.
+ * Otherwise the Ready page is only activated when it becomes full.
+ *
+ * If a request arrives while both pages a full, it is queued, and b_rdev is
+ * overloaded to record whether it was a read or a write.
+ *
+ * The interrupt handler only polls the device to clear the interrupt.
+ * The processing of the result is done in a tasklet.
+ */
+
+static void mm_start_io(struct cardinfo *card)
+{
+ /* we have the lock, we know there is
+ * no IO active, and we know that card->Active
+ * is set
+ */
+ struct mm_dma_desc *desc;
+ struct mm_page *page;
+ int offset;
+
+ /* make the last descriptor end the chain */
+ page = &card->mm_pages[card->Active];
+ pr_debug("start_io: %d %d->%d\n",
+ card->Active, page->headcnt, page->cnt - 1);
+ desc = &page->desc[page->cnt-1];
+
+ desc->control_bits |= cpu_to_le32(DMASCR_CHAIN_COMP_EN);
+ desc->control_bits &= ~cpu_to_le32(DMASCR_CHAIN_EN);
+ desc->sem_control_bits = desc->control_bits;
+
+
+ if (debug & DEBUG_LED_ON_TRANSFER)
+ set_led(card, LED_REMOVE, LED_ON);
+
+ desc = &page->desc[page->headcnt];
+ writel(0, card->csr_remap + DMA_PCI_ADDR);
+ writel(0, card->csr_remap + DMA_PCI_ADDR + 4);
+
+ writel(0, card->csr_remap + DMA_LOCAL_ADDR);
+ writel(0, card->csr_remap + DMA_LOCAL_ADDR + 4);
+
+ writel(0, card->csr_remap + DMA_TRANSFER_SIZE);
+ writel(0, card->csr_remap + DMA_TRANSFER_SIZE + 4);
+
+ writel(0, card->csr_remap + DMA_SEMAPHORE_ADDR);
+ writel(0, card->csr_remap + DMA_SEMAPHORE_ADDR + 4);
+
+ offset = ((char *)desc) - ((char *)page->desc);
+ writel(cpu_to_le32((page->page_dma+offset) & 0xffffffff),
+ card->csr_remap + DMA_DESCRIPTOR_ADDR);
+ /* Force the value to u64 before shifting otherwise >> 32 is undefined C
+ * and on some ports will do nothing ! */
+ writel(cpu_to_le32(((u64)page->page_dma)>>32),
+ card->csr_remap + DMA_DESCRIPTOR_ADDR + 4);
+
+ /* Go, go, go */
+ writel(cpu_to_le32(DMASCR_GO | DMASCR_CHAIN_EN | pci_cmds),
+ card->csr_remap + DMA_STATUS_CTRL);
+}
+
+static int add_bio(struct cardinfo *card);
+
+static void activate(struct cardinfo *card)
+{
+ /* if No page is Active, and Ready is
+ * not empty, then switch Ready page
+ * to active and start IO.
+ * Then add any bh's that are available to Ready
+ */
+
+ do {
+ while (add_bio(card))
+ ;
+
+ if (card->Active == -1 &&
+ card->mm_pages[card->Ready].cnt > 0) {
+ card->Active = card->Ready;
+ card->Ready = 1-card->Ready;
+ mm_start_io(card);
+ }
+
+ } while (card->Active == -1 && add_bio(card));
+}
+
+static inline void reset_page(struct mm_page *page)
+{
+ page->cnt = 0;
+ page->headcnt = 0;
+ page->bio = NULL;
+ page->biotail = &page->bio;
+}
+
+/*
+ * If there is room on Ready page, take
+ * one bh off list and add it.
+ * return 1 if there was room, else 0.
+ */
+static int add_bio(struct cardinfo *card)
+{
+ struct mm_page *p;
+ struct mm_dma_desc *desc;
+ dma_addr_t dma_handle;
+ int offset;
+ struct bio *bio;
+ struct bio_vec vec;
+ int rw;
+
+ bio = card->currentbio;
+ if (!bio && card->bio) {
+ card->currentbio = card->bio;
+ card->current_iter = card->bio->bi_iter;
+ card->bio = card->bio->bi_next;
+ if (card->bio == NULL)
+ card->biotail = &card->bio;
+ card->currentbio->bi_next = NULL;
+ return 1;
+ }
+ if (!bio)
+ return 0;
+
+ rw = bio_rw(bio);
+ if (card->mm_pages[card->Ready].cnt >= DESC_PER_PAGE)
+ return 0;
+
+ vec = bio_iter_iovec(bio, card->current_iter);
+
+ dma_handle = pci_map_page(card->dev,
+ vec.bv_page,
+ vec.bv_offset,
+ vec.bv_len,
+ (rw == READ) ?
+ PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE);
+
+ p = &card->mm_pages[card->Ready];
+ desc = &p->desc[p->cnt];
+ p->cnt++;
+ if (p->bio == NULL)
+ p->iter = card->current_iter;
+ if ((p->biotail) != &bio->bi_next) {
+ *(p->biotail) = bio;
+ p->biotail = &(bio->bi_next);
+ bio->bi_next = NULL;
+ }
+
+ desc->data_dma_handle = dma_handle;
+
+ desc->pci_addr = cpu_to_le64((u64)desc->data_dma_handle);
+ desc->local_addr = cpu_to_le64(card->current_iter.bi_sector << 9);
+ desc->transfer_size = cpu_to_le32(vec.bv_len);
+ offset = (((char *)&desc->sem_control_bits) - ((char *)p->desc));
+ desc->sem_addr = cpu_to_le64((u64)(p->page_dma+offset));
+ desc->zero1 = desc->zero2 = 0;
+ offset = (((char *)(desc+1)) - ((char *)p->desc));
+ desc->next_desc_addr = cpu_to_le64(p->page_dma+offset);
+ desc->control_bits = cpu_to_le32(DMASCR_GO|DMASCR_ERR_INT_EN|
+ DMASCR_PARITY_INT_EN|
+ DMASCR_CHAIN_EN |
+ DMASCR_SEM_EN |
+ pci_cmds);
+ if (rw == WRITE)
+ desc->control_bits |= cpu_to_le32(DMASCR_TRANSFER_READ);
+ desc->sem_control_bits = desc->control_bits;
+
+
+ bio_advance_iter(bio, &card->current_iter, vec.bv_len);
+ if (!card->current_iter.bi_size)
+ card->currentbio = NULL;
+
+ return 1;
+}
+
+static void process_page(unsigned long data)
+{
+ /* check if any of the requests in the page are DMA_COMPLETE,
+ * and deal with them appropriately.
+ * If we find a descriptor without DMA_COMPLETE in the semaphore, then
+ * dma must have hit an error on that descriptor, so use dma_status
+ * instead and assume that all following descriptors must be re-tried.
+ */
+ struct mm_page *page;
+ struct bio *return_bio = NULL;
+ struct cardinfo *card = (struct cardinfo *)data;
+ unsigned int dma_status = card->dma_status;
+
+ spin_lock_bh(&card->lock);
+ if (card->Active < 0)
+ goto out_unlock;
+ page = &card->mm_pages[card->Active];
+
+ while (page->headcnt < page->cnt) {
+ struct bio *bio = page->bio;
+ struct mm_dma_desc *desc = &page->desc[page->headcnt];
+ int control = le32_to_cpu(desc->sem_control_bits);
+ int last = 0;
+ struct bio_vec vec;
+
+ if (!(control & DMASCR_DMA_COMPLETE)) {
+ control = dma_status;
+ last = 1;
+ }
+
+ page->headcnt++;
+ vec = bio_iter_iovec(bio, page->iter);
+ bio_advance_iter(bio, &page->iter, vec.bv_len);
+
+ if (!page->iter.bi_size) {
+ page->bio = bio->bi_next;
+ if (page->bio)
+ page->iter = page->bio->bi_iter;
+ }
+
+ pci_unmap_page(card->dev, desc->data_dma_handle,
+ vec.bv_len,
+ (control & DMASCR_TRANSFER_READ) ?
+ PCI_DMA_TODEVICE : PCI_DMA_FROMDEVICE);
+ if (control & DMASCR_HARD_ERROR) {
+ /* error */
+ clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ dev_printk(KERN_WARNING, &card->dev->dev,
+ "I/O error on sector %d/%d\n",
+ le32_to_cpu(desc->local_addr)>>9,
+ le32_to_cpu(desc->transfer_size));
+ dump_dmastat(card, control);
+ } else if ((bio->bi_rw & REQ_WRITE) &&
+ le32_to_cpu(desc->local_addr) >> 9 ==
+ card->init_size) {
+ card->init_size += le32_to_cpu(desc->transfer_size) >> 9;
+ if (card->init_size >> 1 >= card->mm_size) {
+ dev_printk(KERN_INFO, &card->dev->dev,
+ "memory now initialised\n");
+ set_userbit(card, MEMORY_INITIALIZED, 1);
+ }
+ }
+ if (bio != page->bio) {
+ bio->bi_next = return_bio;
+ return_bio = bio;
+ }
+
+ if (last)
+ break;
+ }
+
+ if (debug & DEBUG_LED_ON_TRANSFER)
+ set_led(card, LED_REMOVE, LED_OFF);
+
+ if (card->check_batteries) {
+ card->check_batteries = 0;
+ check_batteries(card);
+ }
+ if (page->headcnt >= page->cnt) {
+ reset_page(page);
+ card->Active = -1;
+ activate(card);
+ } else {
+ /* haven't finished with this one yet */
+ pr_debug("do some more\n");
+ mm_start_io(card);
+ }
+ out_unlock:
+ spin_unlock_bh(&card->lock);
+
+ while (return_bio) {
+ struct bio *bio = return_bio;
+
+ return_bio = bio->bi_next;
+ bio->bi_next = NULL;
+ bio_endio(bio, 0);
+ }
+}
+
+static void mm_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+ struct cardinfo *card = cb->data;
+
+ spin_lock_irq(&card->lock);
+ activate(card);
+ spin_unlock_irq(&card->lock);
+ kfree(cb);
+}
+
+static int mm_check_plugged(struct cardinfo *card)
+{
+ return !!blk_check_plugged(mm_unplug, card, sizeof(struct blk_plug_cb));
+}
+
+static void mm_make_request(struct request_queue *q, struct bio *bio)
+{
+ struct cardinfo *card = q->queuedata;
+ pr_debug("mm_make_request %llu %u\n",
+ (unsigned long long)bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size);
+
+ spin_lock_irq(&card->lock);
+ *card->biotail = bio;
+ bio->bi_next = NULL;
+ card->biotail = &bio->bi_next;
+ if (bio->bi_rw & REQ_SYNC || !mm_check_plugged(card))
+ activate(card);
+ spin_unlock_irq(&card->lock);
+
+ return;
+}
+
+static irqreturn_t mm_interrupt(int irq, void *__card)
+{
+ struct cardinfo *card = (struct cardinfo *) __card;
+ unsigned int dma_status;
+ unsigned short cfg_status;
+
+HW_TRACE(0x30);
+
+ dma_status = le32_to_cpu(readl(card->csr_remap + DMA_STATUS_CTRL));
+
+ if (!(dma_status & (DMASCR_ERROR_MASK | DMASCR_CHAIN_COMPLETE))) {
+ /* interrupt wasn't for me ... */
+ return IRQ_NONE;
+ }
+
+ /* clear COMPLETION interrupts */
+ if (card->flags & UM_FLAG_NO_BYTE_STATUS)
+ writel(cpu_to_le32(DMASCR_DMA_COMPLETE|DMASCR_CHAIN_COMPLETE),
+ card->csr_remap + DMA_STATUS_CTRL);
+ else
+ writeb((DMASCR_DMA_COMPLETE|DMASCR_CHAIN_COMPLETE) >> 16,
+ card->csr_remap + DMA_STATUS_CTRL + 2);
+
+ /* log errors and clear interrupt status */
+ if (dma_status & DMASCR_ANY_ERR) {
+ unsigned int data_log1, data_log2;
+ unsigned int addr_log1, addr_log2;
+ unsigned char stat, count, syndrome, check;
+
+ stat = readb(card->csr_remap + MEMCTRLCMD_ERRSTATUS);
+
+ data_log1 = le32_to_cpu(readl(card->csr_remap +
+ ERROR_DATA_LOG));
+ data_log2 = le32_to_cpu(readl(card->csr_remap +
+ ERROR_DATA_LOG + 4));
+ addr_log1 = le32_to_cpu(readl(card->csr_remap +
+ ERROR_ADDR_LOG));
+ addr_log2 = readb(card->csr_remap + ERROR_ADDR_LOG + 4);
+
+ count = readb(card->csr_remap + ERROR_COUNT);
+ syndrome = readb(card->csr_remap + ERROR_SYNDROME);
+ check = readb(card->csr_remap + ERROR_CHECK);
+
+ dump_dmastat(card, dma_status);
+
+ if (stat & 0x01)
+ dev_printk(KERN_ERR, &card->dev->dev,
+ "Memory access error detected (err count %d)\n",
+ count);
+ if (stat & 0x02)
+ dev_printk(KERN_ERR, &card->dev->dev,
+ "Multi-bit EDC error\n");
+
+ dev_printk(KERN_ERR, &card->dev->dev,
+ "Fault Address 0x%02x%08x, Fault Data 0x%08x%08x\n",
+ addr_log2, addr_log1, data_log2, data_log1);
+ dev_printk(KERN_ERR, &card->dev->dev,
+ "Fault Check 0x%02x, Fault Syndrome 0x%02x\n",
+ check, syndrome);
+
+ writeb(0, card->csr_remap + ERROR_COUNT);
+ }
+
+ if (dma_status & DMASCR_PARITY_ERR_REP) {
+ dev_printk(KERN_ERR, &card->dev->dev,
+ "PARITY ERROR REPORTED\n");
+ pci_read_config_word(card->dev, PCI_STATUS, &cfg_status);
+ pci_write_config_word(card->dev, PCI_STATUS, cfg_status);
+ }
+
+ if (dma_status & DMASCR_PARITY_ERR_DET) {
+ dev_printk(KERN_ERR, &card->dev->dev,
+ "PARITY ERROR DETECTED\n");
+ pci_read_config_word(card->dev, PCI_STATUS, &cfg_status);
+ pci_write_config_word(card->dev, PCI_STATUS, cfg_status);
+ }
+
+ if (dma_status & DMASCR_SYSTEM_ERR_SIG) {
+ dev_printk(KERN_ERR, &card->dev->dev, "SYSTEM ERROR\n");
+ pci_read_config_word(card->dev, PCI_STATUS, &cfg_status);
+ pci_write_config_word(card->dev, PCI_STATUS, cfg_status);
+ }
+
+ if (dma_status & DMASCR_TARGET_ABT) {
+ dev_printk(KERN_ERR, &card->dev->dev, "TARGET ABORT\n");
+ pci_read_config_word(card->dev, PCI_STATUS, &cfg_status);
+ pci_write_config_word(card->dev, PCI_STATUS, cfg_status);
+ }
+
+ if (dma_status & DMASCR_MASTER_ABT) {
+ dev_printk(KERN_ERR, &card->dev->dev, "MASTER ABORT\n");
+ pci_read_config_word(card->dev, PCI_STATUS, &cfg_status);
+ pci_write_config_word(card->dev, PCI_STATUS, cfg_status);
+ }
+
+ /* and process the DMA descriptors */
+ card->dma_status = dma_status;
+ tasklet_schedule(&card->tasklet);
+
+HW_TRACE(0x36);
+
+ return IRQ_HANDLED;
+}
+
+/*
+ * If both batteries are good, no LED
+ * If either battery has been warned, solid LED
+ * If both batteries are bad, flash the LED quickly
+ * If either battery is bad, flash the LED semi quickly
+ */
+static void set_fault_to_battery_status(struct cardinfo *card)
+{
+ if (card->battery[0].good && card->battery[1].good)
+ set_led(card, LED_FAULT, LED_OFF);
+ else if (card->battery[0].warned || card->battery[1].warned)
+ set_led(card, LED_FAULT, LED_ON);
+ else if (!card->battery[0].good && !card->battery[1].good)
+ set_led(card, LED_FAULT, LED_FLASH_7_0);
+ else
+ set_led(card, LED_FAULT, LED_FLASH_3_5);
+}
+
+static void init_battery_timer(void);
+
+static int check_battery(struct cardinfo *card, int battery, int status)
+{
+ if (status != card->battery[battery].good) {
+ card->battery[battery].good = !card->battery[battery].good;
+ card->battery[battery].last_change = jiffies;
+
+ if (card->battery[battery].good) {
+ dev_printk(KERN_ERR, &card->dev->dev,
+ "Battery %d now good\n", battery + 1);
+ card->battery[battery].warned = 0;
+ } else
+ dev_printk(KERN_ERR, &card->dev->dev,
+ "Battery %d now FAILED\n", battery + 1);
+
+ return 1;
+ } else if (!card->battery[battery].good &&
+ !card->battery[battery].warned &&
+ time_after_eq(jiffies, card->battery[battery].last_change +
+ (HZ * 60 * 60 * 5))) {
+ dev_printk(KERN_ERR, &card->dev->dev,
+ "Battery %d still FAILED after 5 hours\n", battery + 1);
+ card->battery[battery].warned = 1;
+
+ return 1;
+ }
+
+ return 0;
+}
+
+static void check_batteries(struct cardinfo *card)
+{
+ /* NOTE: this must *never* be called while the card
+ * is doing (bus-to-card) DMA, or you will need the
+ * reset switch
+ */
+ unsigned char status;
+ int ret1, ret2;
+
+ status = readb(card->csr_remap + MEMCTRLSTATUS_BATTERY);
+ if (debug & DEBUG_BATTERY_POLLING)
+ dev_printk(KERN_DEBUG, &card->dev->dev,
+ "checking battery status, 1 = %s, 2 = %s\n",
+ (status & BATTERY_1_FAILURE) ? "FAILURE" : "OK",
+ (status & BATTERY_2_FAILURE) ? "FAILURE" : "OK");
+
+ ret1 = check_battery(card, 0, !(status & BATTERY_1_FAILURE));
+ ret2 = check_battery(card, 1, !(status & BATTERY_2_FAILURE));
+
+ if (ret1 || ret2)
+ set_fault_to_battery_status(card);
+}
+
+static void check_all_batteries(unsigned long ptr)
+{
+ int i;
+
+ for (i = 0; i < num_cards; i++)
+ if (!(cards[i].flags & UM_FLAG_NO_BATT)) {
+ struct cardinfo *card = &cards[i];
+ spin_lock_bh(&card->lock);
+ if (card->Active >= 0)
+ card->check_batteries = 1;
+ else
+ check_batteries(card);
+ spin_unlock_bh(&card->lock);
+ }
+
+ init_battery_timer();
+}
+
+static void init_battery_timer(void)
+{
+ init_timer(&battery_timer);
+ battery_timer.function = check_all_batteries;
+ battery_timer.expires = jiffies + (HZ * 60);
+ add_timer(&battery_timer);
+}
+
+static void del_battery_timer(void)
+{
+ del_timer(&battery_timer);
+}
+
+/*
+ * Note no locks taken out here. In a worst case scenario, we could drop
+ * a chunk of system memory. But that should never happen, since validation
+ * happens at open or mount time, when locks are held.
+ *
+ * That's crap, since doing that while some partitions are opened
+ * or mounted will give you really nasty results.
+ */
+static int mm_revalidate(struct gendisk *disk)
+{
+ struct cardinfo *card = disk->private_data;
+ set_capacity(disk, card->mm_size << 1);
+ return 0;
+}
+
+static int mm_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ struct cardinfo *card = bdev->bd_disk->private_data;
+ int size = card->mm_size * (1024 / MM_HARDSECT);
+
+ /*
+ * get geometry: we have to fake one... trim the size to a
+ * multiple of 2048 (1M): tell we have 32 sectors, 64 heads,
+ * whatever cylinders.
+ */
+ geo->heads = 64;
+ geo->sectors = 32;
+ geo->cylinders = size / (geo->heads * geo->sectors);
+ return 0;
+}
+
+static const struct block_device_operations mm_fops = {
+ .owner = THIS_MODULE,
+ .getgeo = mm_getgeo,
+ .revalidate_disk = mm_revalidate,
+};
+
+static int mm_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
+{
+ int ret = -ENODEV;
+ struct cardinfo *card = &cards[num_cards];
+ unsigned char mem_present;
+ unsigned char batt_status;
+ unsigned int saved_bar, data;
+ unsigned long csr_base;
+ unsigned long csr_len;
+ int magic_number;
+ static int printed_version;
+
+ if (!printed_version++)
+ printk(KERN_INFO DRIVER_VERSION " : " DRIVER_DESC "\n");
+
+ ret = pci_enable_device(dev);
+ if (ret)
+ return ret;
+
+ pci_write_config_byte(dev, PCI_LATENCY_TIMER, 0xF8);
+ pci_set_master(dev);
+
+ card->dev = dev;
+
+ csr_base = pci_resource_start(dev, 0);
+ csr_len = pci_resource_len(dev, 0);
+ if (!csr_base || !csr_len)
+ return -ENODEV;
+
+ dev_printk(KERN_INFO, &dev->dev,
+ "Micro Memory(tm) controller found (PCI Mem Module (Battery Backup))\n");
+
+ if (pci_set_dma_mask(dev, DMA_BIT_MASK(64)) &&
+ pci_set_dma_mask(dev, DMA_BIT_MASK(32))) {
+ dev_printk(KERN_WARNING, &dev->dev, "NO suitable DMA found\n");
+ return -ENOMEM;
+ }
+
+ ret = pci_request_regions(dev, DRIVER_NAME);
+ if (ret) {
+ dev_printk(KERN_ERR, &card->dev->dev,
+ "Unable to request memory region\n");
+ goto failed_req_csr;
+ }
+
+ card->csr_remap = ioremap_nocache(csr_base, csr_len);
+ if (!card->csr_remap) {
+ dev_printk(KERN_ERR, &card->dev->dev,
+ "Unable to remap memory region\n");
+ ret = -ENOMEM;
+
+ goto failed_remap_csr;
+ }
+
+ dev_printk(KERN_INFO, &card->dev->dev,
+ "CSR 0x%08lx -> 0x%p (0x%lx)\n",
+ csr_base, card->csr_remap, csr_len);
+
+ switch (card->dev->device) {
+ case 0x5415:
+ card->flags |= UM_FLAG_NO_BYTE_STATUS | UM_FLAG_NO_BATTREG;
+ magic_number = 0x59;
+ break;
+
+ case 0x5425:
+ card->flags |= UM_FLAG_NO_BYTE_STATUS;
+ magic_number = 0x5C;
+ break;
+
+ case 0x6155:
+ card->flags |= UM_FLAG_NO_BYTE_STATUS |
+ UM_FLAG_NO_BATTREG | UM_FLAG_NO_BATT;
+ magic_number = 0x99;
+ break;
+
+ default:
+ magic_number = 0x100;
+ break;
+ }
+
+ if (readb(card->csr_remap + MEMCTRLSTATUS_MAGIC) != magic_number) {
+ dev_printk(KERN_ERR, &card->dev->dev, "Magic number invalid\n");
+ ret = -ENOMEM;
+ goto failed_magic;
+ }
+
+ card->mm_pages[0].desc = pci_alloc_consistent(card->dev,
+ PAGE_SIZE * 2,
+ &card->mm_pages[0].page_dma);
+ card->mm_pages[1].desc = pci_alloc_consistent(card->dev,
+ PAGE_SIZE * 2,
+ &card->mm_pages[1].page_dma);
+ if (card->mm_pages[0].desc == NULL ||
+ card->mm_pages[1].desc == NULL) {
+ dev_printk(KERN_ERR, &card->dev->dev, "alloc failed\n");
+ goto failed_alloc;
+ }
+ reset_page(&card->mm_pages[0]);
+ reset_page(&card->mm_pages[1]);
+ card->Ready = 0; /* page 0 is ready */
+ card->Active = -1; /* no page is active */
+ card->bio = NULL;
+ card->biotail = &card->bio;
+
+ card->queue = blk_alloc_queue(GFP_KERNEL);
+ if (!card->queue)
+ goto failed_alloc;
+
+ blk_queue_make_request(card->queue, mm_make_request);
+ card->queue->queue_lock = &card->lock;
+ card->queue->queuedata = card;
+
+ tasklet_init(&card->tasklet, process_page, (unsigned long)card);
+
+ card->check_batteries = 0;
+
+ mem_present = readb(card->csr_remap + MEMCTRLSTATUS_MEMORY);
+ switch (mem_present) {
+ case MEM_128_MB:
+ card->mm_size = 1024 * 128;
+ break;
+ case MEM_256_MB:
+ card->mm_size = 1024 * 256;
+ break;
+ case MEM_512_MB:
+ card->mm_size = 1024 * 512;
+ break;
+ case MEM_1_GB:
+ card->mm_size = 1024 * 1024;
+ break;
+ case MEM_2_GB:
+ card->mm_size = 1024 * 2048;
+ break;
+ default:
+ card->mm_size = 0;
+ break;
+ }
+
+ /* Clear the LED's we control */
+ set_led(card, LED_REMOVE, LED_OFF);
+ set_led(card, LED_FAULT, LED_OFF);
+
+ batt_status = readb(card->csr_remap + MEMCTRLSTATUS_BATTERY);
+
+ card->battery[0].good = !(batt_status & BATTERY_1_FAILURE);
+ card->battery[1].good = !(batt_status & BATTERY_2_FAILURE);
+ card->battery[0].last_change = card->battery[1].last_change = jiffies;
+
+ if (card->flags & UM_FLAG_NO_BATT)
+ dev_printk(KERN_INFO, &card->dev->dev,
+ "Size %d KB\n", card->mm_size);
+ else {
+ dev_printk(KERN_INFO, &card->dev->dev,
+ "Size %d KB, Battery 1 %s (%s), Battery 2 %s (%s)\n",
+ card->mm_size,
+ batt_status & BATTERY_1_DISABLED ? "Disabled" : "Enabled",
+ card->battery[0].good ? "OK" : "FAILURE",
+ batt_status & BATTERY_2_DISABLED ? "Disabled" : "Enabled",
+ card->battery[1].good ? "OK" : "FAILURE");
+
+ set_fault_to_battery_status(card);
+ }
+
+ pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, &saved_bar);
+ data = 0xffffffff;
+ pci_write_config_dword(dev, PCI_BASE_ADDRESS_1, data);
+ pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, &data);
+ pci_write_config_dword(dev, PCI_BASE_ADDRESS_1, saved_bar);
+ data &= 0xfffffff0;
+ data = ~data;
+ data += 1;
+
+ if (request_irq(dev->irq, mm_interrupt, IRQF_SHARED, DRIVER_NAME,
+ card)) {
+ dev_printk(KERN_ERR, &card->dev->dev,
+ "Unable to allocate IRQ\n");
+ ret = -ENODEV;
+ goto failed_req_irq;
+ }
+
+ dev_printk(KERN_INFO, &card->dev->dev,
+ "Window size %d bytes, IRQ %d\n", data, dev->irq);
+
+ spin_lock_init(&card->lock);
+
+ pci_set_drvdata(dev, card);
+
+ if (pci_write_cmd != 0x0F) /* If not Memory Write & Invalidate */
+ pci_write_cmd = 0x07; /* then Memory Write command */
+
+ if (pci_write_cmd & 0x08) { /* use Memory Write and Invalidate */
+ unsigned short cfg_command;
+ pci_read_config_word(dev, PCI_COMMAND, &cfg_command);
+ cfg_command |= 0x10; /* Memory Write & Invalidate Enable */
+ pci_write_config_word(dev, PCI_COMMAND, cfg_command);
+ }
+ pci_cmds = (pci_read_cmd << 28) | (pci_write_cmd << 24);
+
+ num_cards++;
+
+ if (!get_userbit(card, MEMORY_INITIALIZED)) {
+ dev_printk(KERN_INFO, &card->dev->dev,
+ "memory NOT initialized. Consider over-writing whole device.\n");
+ card->init_size = 0;
+ } else {
+ dev_printk(KERN_INFO, &card->dev->dev,
+ "memory already initialized\n");
+ card->init_size = card->mm_size;
+ }
+
+ /* Enable ECC */
+ writeb(EDC_STORE_CORRECT, card->csr_remap + MEMCTRLCMD_ERRCTRL);
+
+ return 0;
+
+ failed_req_irq:
+ failed_alloc:
+ if (card->mm_pages[0].desc)
+ pci_free_consistent(card->dev, PAGE_SIZE*2,
+ card->mm_pages[0].desc,
+ card->mm_pages[0].page_dma);
+ if (card->mm_pages[1].desc)
+ pci_free_consistent(card->dev, PAGE_SIZE*2,
+ card->mm_pages[1].desc,
+ card->mm_pages[1].page_dma);
+ failed_magic:
+ iounmap(card->csr_remap);
+ failed_remap_csr:
+ pci_release_regions(dev);
+ failed_req_csr:
+
+ return ret;
+}
+
+static void mm_pci_remove(struct pci_dev *dev)
+{
+ struct cardinfo *card = pci_get_drvdata(dev);
+
+ tasklet_kill(&card->tasklet);
+ free_irq(dev->irq, card);
+ iounmap(card->csr_remap);
+
+ if (card->mm_pages[0].desc)
+ pci_free_consistent(card->dev, PAGE_SIZE*2,
+ card->mm_pages[0].desc,
+ card->mm_pages[0].page_dma);
+ if (card->mm_pages[1].desc)
+ pci_free_consistent(card->dev, PAGE_SIZE*2,
+ card->mm_pages[1].desc,
+ card->mm_pages[1].page_dma);
+ blk_cleanup_queue(card->queue);
+
+ pci_release_regions(dev);
+ pci_disable_device(dev);
+}
+
+static const struct pci_device_id mm_pci_ids[] = {
+ {PCI_DEVICE(PCI_VENDOR_ID_MICRO_MEMORY, PCI_DEVICE_ID_MICRO_MEMORY_5415CN)},
+ {PCI_DEVICE(PCI_VENDOR_ID_MICRO_MEMORY, PCI_DEVICE_ID_MICRO_MEMORY_5425CN)},
+ {PCI_DEVICE(PCI_VENDOR_ID_MICRO_MEMORY, PCI_DEVICE_ID_MICRO_MEMORY_6155)},
+ {
+ .vendor = 0x8086,
+ .device = 0xB555,
+ .subvendor = 0x1332,
+ .subdevice = 0x5460,
+ .class = 0x050000,
+ .class_mask = 0,
+ }, { /* end: all zeroes */ }
+};
+
+MODULE_DEVICE_TABLE(pci, mm_pci_ids);
+
+static struct pci_driver mm_pci_driver = {
+ .name = DRIVER_NAME,
+ .id_table = mm_pci_ids,
+ .probe = mm_pci_probe,
+ .remove = mm_pci_remove,
+};
+
+static int __init mm_init(void)
+{
+ int retval, i;
+ int err;
+
+ retval = pci_register_driver(&mm_pci_driver);
+ if (retval)
+ return -ENOMEM;
+
+ err = major_nr = register_blkdev(0, DRIVER_NAME);
+ if (err < 0) {
+ pci_unregister_driver(&mm_pci_driver);
+ return -EIO;
+ }
+
+ for (i = 0; i < num_cards; i++) {
+ mm_gendisk[i] = alloc_disk(1 << MM_SHIFT);
+ if (!mm_gendisk[i])
+ goto out;
+ }
+
+ for (i = 0; i < num_cards; i++) {
+ struct gendisk *disk = mm_gendisk[i];
+ sprintf(disk->disk_name, "umem%c", 'a'+i);
+ spin_lock_init(&cards[i].lock);
+ disk->major = major_nr;
+ disk->first_minor = i << MM_SHIFT;
+ disk->fops = &mm_fops;
+ disk->private_data = &cards[i];
+ disk->queue = cards[i].queue;
+ set_capacity(disk, cards[i].mm_size << 1);
+ add_disk(disk);
+ }
+
+ init_battery_timer();
+ printk(KERN_INFO "MM: desc_per_page = %ld\n", DESC_PER_PAGE);
+/* printk("mm_init: Done. 10-19-01 9:00\n"); */
+ return 0;
+
+out:
+ pci_unregister_driver(&mm_pci_driver);
+ unregister_blkdev(major_nr, DRIVER_NAME);
+ while (i--)
+ put_disk(mm_gendisk[i]);
+ return -ENOMEM;
+}
+
+static void __exit mm_cleanup(void)
+{
+ int i;
+
+ del_battery_timer();
+
+ for (i = 0; i < num_cards ; i++) {
+ del_gendisk(mm_gendisk[i]);
+ put_disk(mm_gendisk[i]);
+ }
+
+ pci_unregister_driver(&mm_pci_driver);
+
+ unregister_blkdev(major_nr, DRIVER_NAME);
+}
+
+module_init(mm_init);
+module_exit(mm_cleanup);
+
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/umem.h b/drivers/block/umem.h
new file mode 100644
index 000000000..375c68974
--- /dev/null
+++ b/drivers/block/umem.h
@@ -0,0 +1,133 @@
+
+/*
+ * This file contains defines for the
+ * Micro Memory MM5415
+ * family PCI Memory Module with Battery Backup.
+ *
+ * Copyright Micro Memory INC 2001. All rights reserved.
+ * Release under the terms of the GNU GENERAL PUBLIC LICENSE version 2.
+ * See the file COPYING.
+ */
+
+#ifndef _DRIVERS_BLOCK_MM_H
+#define _DRIVERS_BLOCK_MM_H
+
+
+#define IRQ_TIMEOUT (1 * HZ)
+
+/* CSR register definition */
+#define MEMCTRLSTATUS_MAGIC 0x00
+#define MM_MAGIC_VALUE (unsigned char)0x59
+
+#define MEMCTRLSTATUS_BATTERY 0x04
+#define BATTERY_1_DISABLED 0x01
+#define BATTERY_1_FAILURE 0x02
+#define BATTERY_2_DISABLED 0x04
+#define BATTERY_2_FAILURE 0x08
+
+#define MEMCTRLSTATUS_MEMORY 0x07
+#define MEM_128_MB 0xfe
+#define MEM_256_MB 0xfc
+#define MEM_512_MB 0xf8
+#define MEM_1_GB 0xf0
+#define MEM_2_GB 0xe0
+
+#define MEMCTRLCMD_LEDCTRL 0x08
+#define LED_REMOVE 2
+#define LED_FAULT 4
+#define LED_POWER 6
+#define LED_FLIP 255
+#define LED_OFF 0x00
+#define LED_ON 0x01
+#define LED_FLASH_3_5 0x02
+#define LED_FLASH_7_0 0x03
+#define LED_POWER_ON 0x00
+#define LED_POWER_OFF 0x01
+#define USER_BIT1 0x01
+#define USER_BIT2 0x02
+
+#define MEMORY_INITIALIZED USER_BIT1
+
+#define MEMCTRLCMD_ERRCTRL 0x0C
+#define EDC_NONE_DEFAULT 0x00
+#define EDC_NONE 0x01
+#define EDC_STORE_READ 0x02
+#define EDC_STORE_CORRECT 0x03
+
+#define MEMCTRLCMD_ERRCNT 0x0D
+#define MEMCTRLCMD_ERRSTATUS 0x0E
+
+#define ERROR_DATA_LOG 0x20
+#define ERROR_ADDR_LOG 0x28
+#define ERROR_COUNT 0x3D
+#define ERROR_SYNDROME 0x3E
+#define ERROR_CHECK 0x3F
+
+#define DMA_PCI_ADDR 0x40
+#define DMA_LOCAL_ADDR 0x48
+#define DMA_TRANSFER_SIZE 0x50
+#define DMA_DESCRIPTOR_ADDR 0x58
+#define DMA_SEMAPHORE_ADDR 0x60
+#define DMA_STATUS_CTRL 0x68
+#define DMASCR_GO 0x00001
+#define DMASCR_TRANSFER_READ 0x00002
+#define DMASCR_CHAIN_EN 0x00004
+#define DMASCR_SEM_EN 0x00010
+#define DMASCR_DMA_COMP_EN 0x00020
+#define DMASCR_CHAIN_COMP_EN 0x00040
+#define DMASCR_ERR_INT_EN 0x00080
+#define DMASCR_PARITY_INT_EN 0x00100
+#define DMASCR_ANY_ERR 0x00800
+#define DMASCR_MBE_ERR 0x01000
+#define DMASCR_PARITY_ERR_REP 0x02000
+#define DMASCR_PARITY_ERR_DET 0x04000
+#define DMASCR_SYSTEM_ERR_SIG 0x08000
+#define DMASCR_TARGET_ABT 0x10000
+#define DMASCR_MASTER_ABT 0x20000
+#define DMASCR_DMA_COMPLETE 0x40000
+#define DMASCR_CHAIN_COMPLETE 0x80000
+
+/*
+3.SOME PCs HAVE HOST BRIDGES WHICH APPARENTLY DO NOT CORRECTLY HANDLE
+READ-LINE (0xE) OR READ-MULTIPLE (0xC) PCI COMMAND CODES DURING DMA
+TRANSFERS. IN OTHER SYSTEMS THESE COMMAND CODES WILL CAUSE THE HOST BRIDGE
+TO ALLOW LONGER BURSTS DURING DMA READ OPERATIONS. THE UPPER FOUR BITS
+(31..28) OF THE DMA CSR HAVE BEEN MADE PROGRAMMABLE, SO THAT EITHER A 0x6,
+AN 0xE OR A 0xC CAN BE WRITTEN TO THEM TO SET THE COMMAND CODE USED DURING
+DMA READ OPERATIONS.
+*/
+#define DMASCR_READ 0x60000000
+#define DMASCR_READLINE 0xE0000000
+#define DMASCR_READMULTI 0xC0000000
+
+
+#define DMASCR_ERROR_MASK (DMASCR_MASTER_ABT | DMASCR_TARGET_ABT | DMASCR_SYSTEM_ERR_SIG | DMASCR_PARITY_ERR_DET | DMASCR_MBE_ERR | DMASCR_ANY_ERR)
+#define DMASCR_HARD_ERROR (DMASCR_MASTER_ABT | DMASCR_TARGET_ABT | DMASCR_SYSTEM_ERR_SIG | DMASCR_PARITY_ERR_DET | DMASCR_MBE_ERR)
+
+#define WINDOWMAP_WINNUM 0x7B
+
+#define DMA_READ_FROM_HOST 0
+#define DMA_WRITE_TO_HOST 1
+
+struct mm_dma_desc {
+ __le64 pci_addr;
+ __le64 local_addr;
+ __le32 transfer_size;
+ u32 zero1;
+ __le64 next_desc_addr;
+ __le64 sem_addr;
+ __le32 control_bits;
+ u32 zero2;
+
+ dma_addr_t data_dma_handle;
+
+ /* Copy of the bits */
+ __le64 sem_control_bits;
+} __attribute__((aligned(8)));
+
+/* bits for card->flags */
+#define UM_FLAG_DMA_IN_REGS 1
+#define UM_FLAG_NO_BYTE_STATUS 2
+#define UM_FLAG_NO_BATTREG 4
+#define UM_FLAG_NO_BATT 8
+#endif
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
new file mode 100644
index 000000000..5ea2f0bbb
--- /dev/null
+++ b/drivers/block/virtio_blk.c
@@ -0,0 +1,901 @@
+//#define DEBUG
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/virtio.h>
+#include <linux/virtio_blk.h>
+#include <linux/scatterlist.h>
+#include <linux/string_helpers.h>
+#include <scsi/scsi_cmnd.h>
+#include <linux/idr.h>
+#include <linux/blk-mq.h>
+#include <linux/numa.h>
+
+#define PART_BITS 4
+#define VQ_NAME_LEN 16
+
+static int major;
+static DEFINE_IDA(vd_index_ida);
+
+static struct workqueue_struct *virtblk_wq;
+
+struct virtio_blk_vq {
+ struct virtqueue *vq;
+ spinlock_t lock;
+ char name[VQ_NAME_LEN];
+} ____cacheline_aligned_in_smp;
+
+struct virtio_blk {
+ struct virtio_device *vdev;
+
+ /* The disk structure for the kernel. */
+ struct gendisk *disk;
+
+ /* Block layer tags. */
+ struct blk_mq_tag_set tag_set;
+
+ /* Process context for config space updates */
+ struct work_struct config_work;
+
+ /* What host tells us, plus 2 for header & tailer. */
+ unsigned int sg_elems;
+
+ /* Ida index - used to track minor number allocations. */
+ int index;
+
+ /* num of vqs */
+ int num_vqs;
+ struct virtio_blk_vq *vqs;
+};
+
+struct virtblk_req {
+ struct request *req;
+ struct virtio_blk_outhdr out_hdr;
+ struct virtio_scsi_inhdr in_hdr;
+ u8 status;
+ struct scatterlist sg[];
+};
+
+static inline int virtblk_result(struct virtblk_req *vbr)
+{
+ switch (vbr->status) {
+ case VIRTIO_BLK_S_OK:
+ return 0;
+ case VIRTIO_BLK_S_UNSUPP:
+ return -ENOTTY;
+ default:
+ return -EIO;
+ }
+}
+
+static int __virtblk_add_req(struct virtqueue *vq,
+ struct virtblk_req *vbr,
+ struct scatterlist *data_sg,
+ bool have_data)
+{
+ struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6];
+ unsigned int num_out = 0, num_in = 0;
+ __virtio32 type = vbr->out_hdr.type & ~cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT);
+
+ sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
+ sgs[num_out++] = &hdr;
+
+ /*
+ * If this is a packet command we need a couple of additional headers.
+ * Behind the normal outhdr we put a segment with the scsi command
+ * block, and before the normal inhdr we put the sense data and the
+ * inhdr with additional status information.
+ */
+ if (type == cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_SCSI_CMD)) {
+ sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len);
+ sgs[num_out++] = &cmd;
+ }
+
+ if (have_data) {
+ if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT))
+ sgs[num_out++] = data_sg;
+ else
+ sgs[num_out + num_in++] = data_sg;
+ }
+
+ if (type == cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_SCSI_CMD)) {
+ sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
+ sgs[num_out + num_in++] = &sense;
+ sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr));
+ sgs[num_out + num_in++] = &inhdr;
+ }
+
+ sg_init_one(&status, &vbr->status, sizeof(vbr->status));
+ sgs[num_out + num_in++] = &status;
+
+ return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
+}
+
+static inline void virtblk_request_done(struct request *req)
+{
+ struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
+ struct virtio_blk *vblk = req->q->queuedata;
+ int error = virtblk_result(vbr);
+
+ if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
+ req->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual);
+ req->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len);
+ req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors);
+ } else if (req->cmd_type == REQ_TYPE_SPECIAL) {
+ req->errors = (error != 0);
+ }
+
+ blk_mq_end_request(req, error);
+}
+
+static void virtblk_done(struct virtqueue *vq)
+{
+ struct virtio_blk *vblk = vq->vdev->priv;
+ bool req_done = false;
+ int qid = vq->index;
+ struct virtblk_req *vbr;
+ unsigned long flags;
+ unsigned int len;
+
+ spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
+ do {
+ virtqueue_disable_cb(vq);
+ while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
+ blk_mq_complete_request(vbr->req);
+ req_done = true;
+ }
+ if (unlikely(virtqueue_is_broken(vq)))
+ break;
+ } while (!virtqueue_enable_cb(vq));
+
+ /* In case queue is stopped waiting for more buffers. */
+ if (req_done)
+ blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
+ spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
+}
+
+static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
+ const struct blk_mq_queue_data *bd)
+{
+ struct virtio_blk *vblk = hctx->queue->queuedata;
+ struct request *req = bd->rq;
+ struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
+ unsigned long flags;
+ unsigned int num;
+ int qid = hctx->queue_num;
+ int err;
+ bool notify = false;
+
+ BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
+
+ vbr->req = req;
+ if (req->cmd_flags & REQ_FLUSH) {
+ vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_FLUSH);
+ vbr->out_hdr.sector = 0;
+ vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
+ } else {
+ switch (req->cmd_type) {
+ case REQ_TYPE_FS:
+ vbr->out_hdr.type = 0;
+ vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, blk_rq_pos(vbr->req));
+ vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
+ break;
+ case REQ_TYPE_BLOCK_PC:
+ vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_SCSI_CMD);
+ vbr->out_hdr.sector = 0;
+ vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
+ break;
+ case REQ_TYPE_SPECIAL:
+ vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
+ vbr->out_hdr.sector = 0;
+ vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req));
+ break;
+ default:
+ /* We don't put anything else in the queue. */
+ BUG();
+ }
+ }
+
+ blk_mq_start_request(req);
+
+ num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg);
+ if (num) {
+ if (rq_data_dir(vbr->req) == WRITE)
+ vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT);
+ else
+ vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN);
+ }
+
+ spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
+ err = __virtblk_add_req(vblk->vqs[qid].vq, vbr, vbr->sg, num);
+ if (err) {
+ virtqueue_kick(vblk->vqs[qid].vq);
+ blk_mq_stop_hw_queue(hctx);
+ spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
+ /* Out of mem doesn't actually happen, since we fall back
+ * to direct descriptors */
+ if (err == -ENOMEM || err == -ENOSPC)
+ return BLK_MQ_RQ_QUEUE_BUSY;
+ return BLK_MQ_RQ_QUEUE_ERROR;
+ }
+
+ if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
+ notify = true;
+ spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
+
+ if (notify)
+ virtqueue_notify(vblk->vqs[qid].vq);
+ return BLK_MQ_RQ_QUEUE_OK;
+}
+
+/* return id (s/n) string for *disk to *id_str
+ */
+static int virtblk_get_id(struct gendisk *disk, char *id_str)
+{
+ struct virtio_blk *vblk = disk->private_data;
+ struct request *req;
+ struct bio *bio;
+ int err;
+
+ bio = bio_map_kern(vblk->disk->queue, id_str, VIRTIO_BLK_ID_BYTES,
+ GFP_KERNEL);
+ if (IS_ERR(bio))
+ return PTR_ERR(bio);
+
+ req = blk_make_request(vblk->disk->queue, bio, GFP_KERNEL);
+ if (IS_ERR(req)) {
+ bio_put(bio);
+ return PTR_ERR(req);
+ }
+
+ req->cmd_type = REQ_TYPE_SPECIAL;
+ err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
+ blk_put_request(req);
+
+ return err;
+}
+
+static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long data)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ struct virtio_blk *vblk = disk->private_data;
+
+ /*
+ * Only allow the generic SCSI ioctls if the host can support it.
+ */
+ if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI))
+ return -ENOTTY;
+
+ return scsi_cmd_blk_ioctl(bdev, mode, cmd,
+ (void __user *)data);
+}
+
+/* We provide getgeo only to please some old bootloader/partitioning tools */
+static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
+{
+ struct virtio_blk *vblk = bd->bd_disk->private_data;
+
+ /* see if the host passed in geometry config */
+ if (virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_GEOMETRY)) {
+ virtio_cread(vblk->vdev, struct virtio_blk_config,
+ geometry.cylinders, &geo->cylinders);
+ virtio_cread(vblk->vdev, struct virtio_blk_config,
+ geometry.heads, &geo->heads);
+ virtio_cread(vblk->vdev, struct virtio_blk_config,
+ geometry.sectors, &geo->sectors);
+ } else {
+ /* some standard values, similar to sd */
+ geo->heads = 1 << 6;
+ geo->sectors = 1 << 5;
+ geo->cylinders = get_capacity(bd->bd_disk) >> 11;
+ }
+ return 0;
+}
+
+static const struct block_device_operations virtblk_fops = {
+ .ioctl = virtblk_ioctl,
+ .owner = THIS_MODULE,
+ .getgeo = virtblk_getgeo,
+};
+
+static int index_to_minor(int index)
+{
+ return index << PART_BITS;
+}
+
+static int minor_to_index(int minor)
+{
+ return minor >> PART_BITS;
+}
+
+static ssize_t virtblk_serial_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ int err;
+
+ /* sysfs gives us a PAGE_SIZE buffer */
+ BUILD_BUG_ON(PAGE_SIZE < VIRTIO_BLK_ID_BYTES);
+
+ buf[VIRTIO_BLK_ID_BYTES] = '\0';
+ err = virtblk_get_id(disk, buf);
+ if (!err)
+ return strlen(buf);
+
+ if (err == -EIO) /* Unsupported? Make it empty. */
+ return 0;
+
+ return err;
+}
+
+static DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL);
+
+static void virtblk_config_changed_work(struct work_struct *work)
+{
+ struct virtio_blk *vblk =
+ container_of(work, struct virtio_blk, config_work);
+ struct virtio_device *vdev = vblk->vdev;
+ struct request_queue *q = vblk->disk->queue;
+ char cap_str_2[10], cap_str_10[10];
+ char *envp[] = { "RESIZE=1", NULL };
+ u64 capacity;
+
+ /* Host must always specify the capacity. */
+ virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);
+
+ /* If capacity is too big, truncate with warning. */
+ if ((sector_t)capacity != capacity) {
+ dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
+ (unsigned long long)capacity);
+ capacity = (sector_t)-1;
+ }
+
+ string_get_size(capacity, queue_logical_block_size(q),
+ STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
+ string_get_size(capacity, queue_logical_block_size(q),
+ STRING_UNITS_10, cap_str_10, sizeof(cap_str_10));
+
+ dev_notice(&vdev->dev,
+ "new size: %llu %d-byte logical blocks (%s/%s)\n",
+ (unsigned long long)capacity,
+ queue_logical_block_size(q),
+ cap_str_10, cap_str_2);
+
+ set_capacity(vblk->disk, capacity);
+ revalidate_disk(vblk->disk);
+ kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp);
+}
+
+static void virtblk_config_changed(struct virtio_device *vdev)
+{
+ struct virtio_blk *vblk = vdev->priv;
+
+ queue_work(virtblk_wq, &vblk->config_work);
+}
+
+static int init_vq(struct virtio_blk *vblk)
+{
+ int err = 0;
+ int i;
+ vq_callback_t **callbacks;
+ const char **names;
+ struct virtqueue **vqs;
+ unsigned short num_vqs;
+ struct virtio_device *vdev = vblk->vdev;
+
+ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ,
+ struct virtio_blk_config, num_queues,
+ &num_vqs);
+ if (err)
+ num_vqs = 1;
+
+ vblk->vqs = kmalloc(sizeof(*vblk->vqs) * num_vqs, GFP_KERNEL);
+ if (!vblk->vqs) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ names = kmalloc(sizeof(*names) * num_vqs, GFP_KERNEL);
+ if (!names)
+ goto err_names;
+
+ callbacks = kmalloc(sizeof(*callbacks) * num_vqs, GFP_KERNEL);
+ if (!callbacks)
+ goto err_callbacks;
+
+ vqs = kmalloc(sizeof(*vqs) * num_vqs, GFP_KERNEL);
+ if (!vqs)
+ goto err_vqs;
+
+ for (i = 0; i < num_vqs; i++) {
+ callbacks[i] = virtblk_done;
+ snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%d", i);
+ names[i] = vblk->vqs[i].name;
+ }
+
+ /* Discover virtqueues and write information to configuration. */
+ err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
+ if (err)
+ goto err_find_vqs;
+
+ for (i = 0; i < num_vqs; i++) {
+ spin_lock_init(&vblk->vqs[i].lock);
+ vblk->vqs[i].vq = vqs[i];
+ }
+ vblk->num_vqs = num_vqs;
+
+ err_find_vqs:
+ kfree(vqs);
+ err_vqs:
+ kfree(callbacks);
+ err_callbacks:
+ kfree(names);
+ err_names:
+ if (err)
+ kfree(vblk->vqs);
+ out:
+ return err;
+}
+
+/*
+ * Legacy naming scheme used for virtio devices. We are stuck with it for
+ * virtio blk but don't ever use it for any new driver.
+ */
+static int virtblk_name_format(char *prefix, int index, char *buf, int buflen)
+{
+ const int base = 'z' - 'a' + 1;
+ char *begin = buf + strlen(prefix);
+ char *end = buf + buflen;
+ char *p;
+ int unit;
+
+ p = end - 1;
+ *p = '\0';
+ unit = base;
+ do {
+ if (p == begin)
+ return -EINVAL;
+ *--p = 'a' + (index % unit);
+ index = (index / unit) - 1;
+ } while (index >= 0);
+
+ memmove(begin, p, end - p);
+ memcpy(buf, prefix, strlen(prefix));
+
+ return 0;
+}
+
+static int virtblk_get_cache_mode(struct virtio_device *vdev)
+{
+ u8 writeback;
+ int err;
+
+ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE,
+ struct virtio_blk_config, wce,
+ &writeback);
+ if (err)
+ writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE) ||
+ virtio_has_feature(vdev, VIRTIO_F_VERSION_1);
+
+ return writeback;
+}
+
+static void virtblk_update_cache_mode(struct virtio_device *vdev)
+{
+ u8 writeback = virtblk_get_cache_mode(vdev);
+ struct virtio_blk *vblk = vdev->priv;
+
+ if (writeback)
+ blk_queue_flush(vblk->disk->queue, REQ_FLUSH);
+ else
+ blk_queue_flush(vblk->disk->queue, 0);
+
+ revalidate_disk(vblk->disk);
+}
+
+static const char *const virtblk_cache_types[] = {
+ "write through", "write back"
+};
+
+static ssize_t
+virtblk_cache_type_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct virtio_blk *vblk = disk->private_data;
+ struct virtio_device *vdev = vblk->vdev;
+ int i;
+
+ BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE));
+ for (i = ARRAY_SIZE(virtblk_cache_types); --i >= 0; )
+ if (sysfs_streq(buf, virtblk_cache_types[i]))
+ break;
+
+ if (i < 0)
+ return -EINVAL;
+
+ virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i);
+ virtblk_update_cache_mode(vdev);
+ return count;
+}
+
+static ssize_t
+virtblk_cache_type_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct virtio_blk *vblk = disk->private_data;
+ u8 writeback = virtblk_get_cache_mode(vblk->vdev);
+
+ BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types));
+ return snprintf(buf, 40, "%s\n", virtblk_cache_types[writeback]);
+}
+
+static const struct device_attribute dev_attr_cache_type_ro =
+ __ATTR(cache_type, S_IRUGO,
+ virtblk_cache_type_show, NULL);
+static const struct device_attribute dev_attr_cache_type_rw =
+ __ATTR(cache_type, S_IRUGO|S_IWUSR,
+ virtblk_cache_type_show, virtblk_cache_type_store);
+
+static int virtblk_init_request(void *data, struct request *rq,
+ unsigned int hctx_idx, unsigned int request_idx,
+ unsigned int numa_node)
+{
+ struct virtio_blk *vblk = data;
+ struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
+
+ sg_init_table(vbr->sg, vblk->sg_elems);
+ return 0;
+}
+
+static struct blk_mq_ops virtio_mq_ops = {
+ .queue_rq = virtio_queue_rq,
+ .map_queue = blk_mq_map_queue,
+ .complete = virtblk_request_done,
+ .init_request = virtblk_init_request,
+};
+
+static unsigned int virtblk_queue_depth;
+module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
+
+static int virtblk_probe(struct virtio_device *vdev)
+{
+ struct virtio_blk *vblk;
+ struct request_queue *q;
+ int err, index;
+
+ u64 cap;
+ u32 v, blk_size, sg_elems, opt_io_size;
+ u16 min_io_size;
+ u8 physical_block_exp, alignment_offset;
+
+ if (!vdev->config->get) {
+ dev_err(&vdev->dev, "%s failure: config access disabled\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
+ GFP_KERNEL);
+ if (err < 0)
+ goto out;
+ index = err;
+
+ /* We need to know how many segments before we allocate. */
+ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX,
+ struct virtio_blk_config, seg_max,
+ &sg_elems);
+
+ /* We need at least one SG element, whatever they say. */
+ if (err || !sg_elems)
+ sg_elems = 1;
+
+ /* We need an extra sg elements at head and tail. */
+ sg_elems += 2;
+ vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
+ if (!vblk) {
+ err = -ENOMEM;
+ goto out_free_index;
+ }
+
+ vblk->vdev = vdev;
+ vblk->sg_elems = sg_elems;
+
+ INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
+
+ err = init_vq(vblk);
+ if (err)
+ goto out_free_vblk;
+
+ /* FIXME: How many partitions? How long is a piece of string? */
+ vblk->disk = alloc_disk(1 << PART_BITS);
+ if (!vblk->disk) {
+ err = -ENOMEM;
+ goto out_free_vq;
+ }
+
+ /* Default queue sizing is to fill the ring. */
+ if (!virtblk_queue_depth) {
+ virtblk_queue_depth = vblk->vqs[0].vq->num_free;
+ /* ... but without indirect descs, we use 2 descs per req */
+ if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
+ virtblk_queue_depth /= 2;
+ }
+
+ memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
+ vblk->tag_set.ops = &virtio_mq_ops;
+ vblk->tag_set.queue_depth = virtblk_queue_depth;
+ vblk->tag_set.numa_node = NUMA_NO_NODE;
+ vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+ vblk->tag_set.cmd_size =
+ sizeof(struct virtblk_req) +
+ sizeof(struct scatterlist) * sg_elems;
+ vblk->tag_set.driver_data = vblk;
+ vblk->tag_set.nr_hw_queues = vblk->num_vqs;
+
+ err = blk_mq_alloc_tag_set(&vblk->tag_set);
+ if (err)
+ goto out_put_disk;
+
+ q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set);
+ if (IS_ERR(q)) {
+ err = -ENOMEM;
+ goto out_free_tags;
+ }
+
+ q->queuedata = vblk;
+
+ virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
+
+ vblk->disk->major = major;
+ vblk->disk->first_minor = index_to_minor(index);
+ vblk->disk->private_data = vblk;
+ vblk->disk->fops = &virtblk_fops;
+ vblk->disk->driverfs_dev = &vdev->dev;
+ vblk->index = index;
+
+ /* configure queue flush support */
+ virtblk_update_cache_mode(vdev);
+
+ /* If disk is read-only in the host, the guest should obey */
+ if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
+ set_disk_ro(vblk->disk, 1);
+
+ /* Host must always specify the capacity. */
+ virtio_cread(vdev, struct virtio_blk_config, capacity, &cap);
+
+ /* If capacity is too big, truncate with warning. */
+ if ((sector_t)cap != cap) {
+ dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
+ (unsigned long long)cap);
+ cap = (sector_t)-1;
+ }
+ set_capacity(vblk->disk, cap);
+
+ /* We can handle whatever the host told us to handle. */
+ blk_queue_max_segments(q, vblk->sg_elems-2);
+
+ /* No need to bounce any requests */
+ blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
+
+ /* No real sector limit. */
+ blk_queue_max_hw_sectors(q, -1U);
+
+ /* Host can optionally specify maximum segment size and number of
+ * segments. */
+ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX,
+ struct virtio_blk_config, size_max, &v);
+ if (!err)
+ blk_queue_max_segment_size(q, v);
+ else
+ blk_queue_max_segment_size(q, -1U);
+
+ /* Host can optionally specify the block size of the device */
+ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
+ struct virtio_blk_config, blk_size,
+ &blk_size);
+ if (!err)
+ blk_queue_logical_block_size(q, blk_size);
+ else
+ blk_size = queue_logical_block_size(q);
+
+ /* Use topology information if available */
+ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
+ struct virtio_blk_config, physical_block_exp,
+ &physical_block_exp);
+ if (!err && physical_block_exp)
+ blk_queue_physical_block_size(q,
+ blk_size * (1 << physical_block_exp));
+
+ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
+ struct virtio_blk_config, alignment_offset,
+ &alignment_offset);
+ if (!err && alignment_offset)
+ blk_queue_alignment_offset(q, blk_size * alignment_offset);
+
+ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
+ struct virtio_blk_config, min_io_size,
+ &min_io_size);
+ if (!err && min_io_size)
+ blk_queue_io_min(q, blk_size * min_io_size);
+
+ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
+ struct virtio_blk_config, opt_io_size,
+ &opt_io_size);
+ if (!err && opt_io_size)
+ blk_queue_io_opt(q, blk_size * opt_io_size);
+
+ virtio_device_ready(vdev);
+
+ add_disk(vblk->disk);
+ err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial);
+ if (err)
+ goto out_del_disk;
+
+ if (virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE))
+ err = device_create_file(disk_to_dev(vblk->disk),
+ &dev_attr_cache_type_rw);
+ else
+ err = device_create_file(disk_to_dev(vblk->disk),
+ &dev_attr_cache_type_ro);
+ if (err)
+ goto out_del_disk;
+ return 0;
+
+out_del_disk:
+ del_gendisk(vblk->disk);
+ blk_cleanup_queue(vblk->disk->queue);
+out_free_tags:
+ blk_mq_free_tag_set(&vblk->tag_set);
+out_put_disk:
+ put_disk(vblk->disk);
+out_free_vq:
+ vdev->config->del_vqs(vdev);
+out_free_vblk:
+ kfree(vblk);
+out_free_index:
+ ida_simple_remove(&vd_index_ida, index);
+out:
+ return err;
+}
+
+static void virtblk_remove(struct virtio_device *vdev)
+{
+ struct virtio_blk *vblk = vdev->priv;
+ int index = vblk->index;
+ int refc;
+
+ /* Make sure no work handler is accessing the device. */
+ flush_work(&vblk->config_work);
+
+ del_gendisk(vblk->disk);
+ blk_cleanup_queue(vblk->disk->queue);
+
+ blk_mq_free_tag_set(&vblk->tag_set);
+
+ /* Stop all the virtqueues. */
+ vdev->config->reset(vdev);
+
+ refc = atomic_read(&disk_to_dev(vblk->disk)->kobj.kref.refcount);
+ put_disk(vblk->disk);
+ vdev->config->del_vqs(vdev);
+ kfree(vblk->vqs);
+ kfree(vblk);
+
+ /* Only free device id if we don't have any users */
+ if (refc == 1)
+ ida_simple_remove(&vd_index_ida, index);
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int virtblk_freeze(struct virtio_device *vdev)
+{
+ struct virtio_blk *vblk = vdev->priv;
+
+ /* Ensure we don't receive any more interrupts */
+ vdev->config->reset(vdev);
+
+ /* Make sure no work handler is accessing the device. */
+ flush_work(&vblk->config_work);
+
+ blk_mq_stop_hw_queues(vblk->disk->queue);
+
+ vdev->config->del_vqs(vdev);
+ return 0;
+}
+
+static int virtblk_restore(struct virtio_device *vdev)
+{
+ struct virtio_blk *vblk = vdev->priv;
+ int ret;
+
+ ret = init_vq(vdev->priv);
+ if (ret)
+ return ret;
+
+ virtio_device_ready(vdev);
+
+ blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
+ return 0;
+}
+#endif
+
+static const struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+static unsigned int features_legacy[] = {
+ VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
+ VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
+ VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
+ VIRTIO_BLK_F_MQ,
+}
+;
+static unsigned int features[] = {
+ VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
+ VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
+ VIRTIO_BLK_F_TOPOLOGY,
+ VIRTIO_BLK_F_MQ,
+};
+
+static struct virtio_driver virtio_blk = {
+ .feature_table = features,
+ .feature_table_size = ARRAY_SIZE(features),
+ .feature_table_legacy = features_legacy,
+ .feature_table_size_legacy = ARRAY_SIZE(features_legacy),
+ .driver.name = KBUILD_MODNAME,
+ .driver.owner = THIS_MODULE,
+ .id_table = id_table,
+ .probe = virtblk_probe,
+ .remove = virtblk_remove,
+ .config_changed = virtblk_config_changed,
+#ifdef CONFIG_PM_SLEEP
+ .freeze = virtblk_freeze,
+ .restore = virtblk_restore,
+#endif
+};
+
+static int __init init(void)
+{
+ int error;
+
+ virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
+ if (!virtblk_wq)
+ return -ENOMEM;
+
+ major = register_blkdev(0, "virtblk");
+ if (major < 0) {
+ error = major;
+ goto out_destroy_workqueue;
+ }
+
+ error = register_virtio_driver(&virtio_blk);
+ if (error)
+ goto out_unregister_blkdev;
+ return 0;
+
+out_unregister_blkdev:
+ unregister_blkdev(major, "virtblk");
+out_destroy_workqueue:
+ destroy_workqueue(virtblk_wq);
+ return error;
+}
+
+static void __exit fini(void)
+{
+ unregister_virtio_driver(&virtio_blk);
+ unregister_blkdev(major, "virtblk");
+ destroy_workqueue(virtblk_wq);
+}
+module_init(init);
+module_exit(fini);
+
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_DESCRIPTION("Virtio block driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/xen-blkback/Makefile b/drivers/block/xen-blkback/Makefile
new file mode 100644
index 000000000..e491c1b76
--- /dev/null
+++ b/drivers/block/xen-blkback/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o
+
+xen-blkback-y := blkback.o xenbus.o
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
new file mode 100644
index 000000000..713fc9ff1
--- /dev/null
+++ b/drivers/block/xen-blkback/blkback.c
@@ -0,0 +1,1456 @@
+/******************************************************************************
+ *
+ * Back-end of the driver for virtual block devices. This portion of the
+ * driver exports a 'unified' block-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A
+ * reference front-end implementation can be found in:
+ * drivers/block/xen-blkfront.c
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Copyright (c) 2005, Christopher Clark
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "xen-blkback: " fmt
+
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/bitmap.h>
+
+#include <xen/events.h>
+#include <xen/page.h>
+#include <xen/xen.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include <xen/balloon.h>
+#include <xen/grant_table.h>
+#include "common.h"
+
+/*
+ * Maximum number of unused free pages to keep in the internal buffer.
+ * Setting this to a value too low will reduce memory used in each backend,
+ * but can have a performance penalty.
+ *
+ * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can
+ * be set to a lower value that might degrade performance on some intensive
+ * IO workloads.
+ */
+
+static int xen_blkif_max_buffer_pages = 1024;
+module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644);
+MODULE_PARM_DESC(max_buffer_pages,
+"Maximum number of free pages to keep in each block backend buffer");
+
+/*
+ * Maximum number of grants to map persistently in blkback. For maximum
+ * performance this should be the total numbers of grants that can be used
+ * to fill the ring, but since this might become too high, specially with
+ * the use of indirect descriptors, we set it to a value that provides good
+ * performance without using too much memory.
+ *
+ * When the list of persistent grants is full we clean it up using a LRU
+ * algorithm.
+ */
+
+static int xen_blkif_max_pgrants = 1056;
+module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
+MODULE_PARM_DESC(max_persistent_grants,
+ "Maximum number of grants to map persistently");
+
+/*
+ * The LRU mechanism to clean the lists of persistent grants needs to
+ * be executed periodically. The time interval between consecutive executions
+ * of the purge mechanism is set in ms.
+ */
+#define LRU_INTERVAL 100
+
+/*
+ * When the persistent grants list is full we will remove unused grants
+ * from the list. The percent number of grants to be removed at each LRU
+ * execution.
+ */
+#define LRU_PERCENT_CLEAN 5
+
+/* Run-time switchable: /sys/module/blkback/parameters/ */
+static unsigned int log_stats;
+module_param(log_stats, int, 0644);
+
+#define BLKBACK_INVALID_HANDLE (~0)
+
+/* Number of free pages to remove on each call to gnttab_free_pages */
+#define NUM_BATCH_FREE_PAGES 10
+
+static inline int get_free_page(struct xen_blkif *blkif, struct page **page)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkif->free_pages_lock, flags);
+ if (list_empty(&blkif->free_pages)) {
+ BUG_ON(blkif->free_pages_num != 0);
+ spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+ return gnttab_alloc_pages(1, page);
+ }
+ BUG_ON(blkif->free_pages_num == 0);
+ page[0] = list_first_entry(&blkif->free_pages, struct page, lru);
+ list_del(&page[0]->lru);
+ blkif->free_pages_num--;
+ spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+
+ return 0;
+}
+
+static inline void put_free_pages(struct xen_blkif *blkif, struct page **page,
+ int num)
+{
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&blkif->free_pages_lock, flags);
+ for (i = 0; i < num; i++)
+ list_add(&page[i]->lru, &blkif->free_pages);
+ blkif->free_pages_num += num;
+ spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+}
+
+static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num)
+{
+ /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
+ struct page *page[NUM_BATCH_FREE_PAGES];
+ unsigned int num_pages = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkif->free_pages_lock, flags);
+ while (blkif->free_pages_num > num) {
+ BUG_ON(list_empty(&blkif->free_pages));
+ page[num_pages] = list_first_entry(&blkif->free_pages,
+ struct page, lru);
+ list_del(&page[num_pages]->lru);
+ blkif->free_pages_num--;
+ if (++num_pages == NUM_BATCH_FREE_PAGES) {
+ spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+ gnttab_free_pages(num_pages, page);
+ spin_lock_irqsave(&blkif->free_pages_lock, flags);
+ num_pages = 0;
+ }
+ }
+ spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+ if (num_pages != 0)
+ gnttab_free_pages(num_pages, page);
+}
+
+#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
+
+static int do_block_io_op(struct xen_blkif *blkif);
+static int dispatch_rw_block_io(struct xen_blkif *blkif,
+ struct blkif_request *req,
+ struct pending_req *pending_req);
+static void make_response(struct xen_blkif *blkif, u64 id,
+ unsigned short op, int st);
+
+#define foreach_grant_safe(pos, n, rbtree, node) \
+ for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \
+ (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL; \
+ &(pos)->node != NULL; \
+ (pos) = container_of(n, typeof(*(pos)), node), \
+ (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
+
+
+/*
+ * We don't need locking around the persistent grant helpers
+ * because blkback uses a single-thread for each backed, so we
+ * can be sure that this functions will never be called recursively.
+ *
+ * The only exception to that is put_persistent_grant, that can be called
+ * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
+ * bit operations to modify the flags of a persistent grant and to count
+ * the number of used grants.
+ */
+static int add_persistent_gnt(struct xen_blkif *blkif,
+ struct persistent_gnt *persistent_gnt)
+{
+ struct rb_node **new = NULL, *parent = NULL;
+ struct persistent_gnt *this;
+
+ if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) {
+ if (!blkif->vbd.overflow_max_grants)
+ blkif->vbd.overflow_max_grants = 1;
+ return -EBUSY;
+ }
+ /* Figure out where to put new node */
+ new = &blkif->persistent_gnts.rb_node;
+ while (*new) {
+ this = container_of(*new, struct persistent_gnt, node);
+
+ parent = *new;
+ if (persistent_gnt->gnt < this->gnt)
+ new = &((*new)->rb_left);
+ else if (persistent_gnt->gnt > this->gnt)
+ new = &((*new)->rb_right);
+ else {
+ pr_alert_ratelimited("trying to add a gref that's already in the tree\n");
+ return -EINVAL;
+ }
+ }
+
+ bitmap_zero(persistent_gnt->flags, PERSISTENT_GNT_FLAGS_SIZE);
+ set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
+ /* Add new node and rebalance tree. */
+ rb_link_node(&(persistent_gnt->node), parent, new);
+ rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts);
+ blkif->persistent_gnt_c++;
+ atomic_inc(&blkif->persistent_gnt_in_use);
+ return 0;
+}
+
+static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
+ grant_ref_t gref)
+{
+ struct persistent_gnt *data;
+ struct rb_node *node = NULL;
+
+ node = blkif->persistent_gnts.rb_node;
+ while (node) {
+ data = container_of(node, struct persistent_gnt, node);
+
+ if (gref < data->gnt)
+ node = node->rb_left;
+ else if (gref > data->gnt)
+ node = node->rb_right;
+ else {
+ if(test_bit(PERSISTENT_GNT_ACTIVE, data->flags)) {
+ pr_alert_ratelimited("requesting a grant already in use\n");
+ return NULL;
+ }
+ set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
+ atomic_inc(&blkif->persistent_gnt_in_use);
+ return data;
+ }
+ }
+ return NULL;
+}
+
+static void put_persistent_gnt(struct xen_blkif *blkif,
+ struct persistent_gnt *persistent_gnt)
+{
+ if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
+ pr_alert_ratelimited("freeing a grant already unused\n");
+ set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
+ clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
+ atomic_dec(&blkif->persistent_gnt_in_use);
+}
+
+static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
+ unsigned int num)
+{
+ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct persistent_gnt *persistent_gnt;
+ struct rb_node *n;
+ int segs_to_unmap = 0;
+ struct gntab_unmap_queue_data unmap_data;
+
+ unmap_data.pages = pages;
+ unmap_data.unmap_ops = unmap;
+ unmap_data.kunmap_ops = NULL;
+
+ foreach_grant_safe(persistent_gnt, n, root, node) {
+ BUG_ON(persistent_gnt->handle ==
+ BLKBACK_INVALID_HANDLE);
+ gnttab_set_unmap_op(&unmap[segs_to_unmap],
+ (unsigned long) pfn_to_kaddr(page_to_pfn(
+ persistent_gnt->page)),
+ GNTMAP_host_map,
+ persistent_gnt->handle);
+
+ pages[segs_to_unmap] = persistent_gnt->page;
+
+ if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
+ !rb_next(&persistent_gnt->node)) {
+
+ unmap_data.count = segs_to_unmap;
+ BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
+
+ put_free_pages(blkif, pages, segs_to_unmap);
+ segs_to_unmap = 0;
+ }
+
+ rb_erase(&persistent_gnt->node, root);
+ kfree(persistent_gnt);
+ num--;
+ }
+ BUG_ON(num != 0);
+}
+
+void xen_blkbk_unmap_purged_grants(struct work_struct *work)
+{
+ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct persistent_gnt *persistent_gnt;
+ int segs_to_unmap = 0;
+ struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work);
+ struct gntab_unmap_queue_data unmap_data;
+
+ unmap_data.pages = pages;
+ unmap_data.unmap_ops = unmap;
+ unmap_data.kunmap_ops = NULL;
+
+ while(!list_empty(&blkif->persistent_purge_list)) {
+ persistent_gnt = list_first_entry(&blkif->persistent_purge_list,
+ struct persistent_gnt,
+ remove_node);
+ list_del(&persistent_gnt->remove_node);
+
+ gnttab_set_unmap_op(&unmap[segs_to_unmap],
+ vaddr(persistent_gnt->page),
+ GNTMAP_host_map,
+ persistent_gnt->handle);
+
+ pages[segs_to_unmap] = persistent_gnt->page;
+
+ if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+ unmap_data.count = segs_to_unmap;
+ BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
+ put_free_pages(blkif, pages, segs_to_unmap);
+ segs_to_unmap = 0;
+ }
+ kfree(persistent_gnt);
+ }
+ if (segs_to_unmap > 0) {
+ unmap_data.count = segs_to_unmap;
+ BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
+ put_free_pages(blkif, pages, segs_to_unmap);
+ }
+}
+
+static void purge_persistent_gnt(struct xen_blkif *blkif)
+{
+ struct persistent_gnt *persistent_gnt;
+ struct rb_node *n;
+ unsigned int num_clean, total;
+ bool scan_used = false, clean_used = false;
+ struct rb_root *root;
+
+ if (blkif->persistent_gnt_c < xen_blkif_max_pgrants ||
+ (blkif->persistent_gnt_c == xen_blkif_max_pgrants &&
+ !blkif->vbd.overflow_max_grants)) {
+ return;
+ }
+
+ if (work_pending(&blkif->persistent_purge_work)) {
+ pr_alert_ratelimited("Scheduled work from previous purge is still pending, cannot purge list\n");
+ return;
+ }
+
+ num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
+ num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
+ num_clean = min(blkif->persistent_gnt_c, num_clean);
+ if ((num_clean == 0) ||
+ (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use))))
+ return;
+
+ /*
+ * At this point, we can assure that there will be no calls
+ * to get_persistent_grant (because we are executing this code from
+ * xen_blkif_schedule), there can only be calls to put_persistent_gnt,
+ * which means that the number of currently used grants will go down,
+ * but never up, so we will always be able to remove the requested
+ * number of grants.
+ */
+
+ total = num_clean;
+
+ pr_debug("Going to purge %u persistent grants\n", num_clean);
+
+ BUG_ON(!list_empty(&blkif->persistent_purge_list));
+ root = &blkif->persistent_gnts;
+purge_list:
+ foreach_grant_safe(persistent_gnt, n, root, node) {
+ BUG_ON(persistent_gnt->handle ==
+ BLKBACK_INVALID_HANDLE);
+
+ if (clean_used) {
+ clear_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
+ continue;
+ }
+
+ if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
+ continue;
+ if (!scan_used &&
+ (test_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags)))
+ continue;
+
+ rb_erase(&persistent_gnt->node, root);
+ list_add(&persistent_gnt->remove_node,
+ &blkif->persistent_purge_list);
+ if (--num_clean == 0)
+ goto finished;
+ }
+ /*
+ * If we get here it means we also need to start cleaning
+ * grants that were used since last purge in order to cope
+ * with the requested num
+ */
+ if (!scan_used && !clean_used) {
+ pr_debug("Still missing %u purged frames\n", num_clean);
+ scan_used = true;
+ goto purge_list;
+ }
+finished:
+ if (!clean_used) {
+ pr_debug("Finished scanning for grants to clean, removing used flag\n");
+ clean_used = true;
+ goto purge_list;
+ }
+
+ blkif->persistent_gnt_c -= (total - num_clean);
+ blkif->vbd.overflow_max_grants = 0;
+
+ /* We can defer this work */
+ schedule_work(&blkif->persistent_purge_work);
+ pr_debug("Purged %u/%u\n", (total - num_clean), total);
+ return;
+}
+
+/*
+ * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
+ */
+static struct pending_req *alloc_req(struct xen_blkif *blkif)
+{
+ struct pending_req *req = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&blkif->pending_free_lock, flags);
+ if (!list_empty(&blkif->pending_free)) {
+ req = list_entry(blkif->pending_free.next, struct pending_req,
+ free_list);
+ list_del(&req->free_list);
+ }
+ spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
+ return req;
+}
+
+/*
+ * Return the 'pending_req' structure back to the freepool. We also
+ * wake up the thread if it was waiting for a free page.
+ */
+static void free_req(struct xen_blkif *blkif, struct pending_req *req)
+{
+ unsigned long flags;
+ int was_empty;
+
+ spin_lock_irqsave(&blkif->pending_free_lock, flags);
+ was_empty = list_empty(&blkif->pending_free);
+ list_add(&req->free_list, &blkif->pending_free);
+ spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
+ if (was_empty)
+ wake_up(&blkif->pending_free_wq);
+}
+
+/*
+ * Routines for managing virtual block devices (vbds).
+ */
+static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
+ int operation)
+{
+ struct xen_vbd *vbd = &blkif->vbd;
+ int rc = -EACCES;
+
+ if ((operation != READ) && vbd->readonly)
+ goto out;
+
+ if (likely(req->nr_sects)) {
+ blkif_sector_t end = req->sector_number + req->nr_sects;
+
+ if (unlikely(end < req->sector_number))
+ goto out;
+ if (unlikely(end > vbd_sz(vbd)))
+ goto out;
+ }
+
+ req->dev = vbd->pdevice;
+ req->bdev = vbd->bdev;
+ rc = 0;
+
+ out:
+ return rc;
+}
+
+static void xen_vbd_resize(struct xen_blkif *blkif)
+{
+ struct xen_vbd *vbd = &blkif->vbd;
+ struct xenbus_transaction xbt;
+ int err;
+ struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
+ unsigned long long new_size = vbd_sz(vbd);
+
+ pr_info("VBD Resize: Domid: %d, Device: (%d, %d)\n",
+ blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
+ pr_info("VBD Resize: new size %llu\n", new_size);
+ vbd->size = new_size;
+again:
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ pr_warn("Error starting transaction\n");
+ return;
+ }
+ err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
+ (unsigned long long)vbd_sz(vbd));
+ if (err) {
+ pr_warn("Error writing new size\n");
+ goto abort;
+ }
+ /*
+ * Write the current state; we will use this to synchronize
+ * the front-end. If the current state is "connected" the
+ * front-end will get the new size information online.
+ */
+ err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
+ if (err) {
+ pr_warn("Error writing the state\n");
+ goto abort;
+ }
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN)
+ goto again;
+ if (err)
+ pr_warn("Error ending transaction\n");
+ return;
+abort:
+ xenbus_transaction_end(xbt, 1);
+}
+
+/*
+ * Notification from the guest OS.
+ */
+static void blkif_notify_work(struct xen_blkif *blkif)
+{
+ blkif->waiting_reqs = 1;
+ wake_up(&blkif->wq);
+}
+
+irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
+{
+ blkif_notify_work(dev_id);
+ return IRQ_HANDLED;
+}
+
+/*
+ * SCHEDULER FUNCTIONS
+ */
+
+static void print_stats(struct xen_blkif *blkif)
+{
+ pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu"
+ " | ds %4llu | pg: %4u/%4d\n",
+ current->comm, blkif->st_oo_req,
+ blkif->st_rd_req, blkif->st_wr_req,
+ blkif->st_f_req, blkif->st_ds_req,
+ blkif->persistent_gnt_c,
+ xen_blkif_max_pgrants);
+ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+ blkif->st_rd_req = 0;
+ blkif->st_wr_req = 0;
+ blkif->st_oo_req = 0;
+ blkif->st_ds_req = 0;
+}
+
+int xen_blkif_schedule(void *arg)
+{
+ struct xen_blkif *blkif = arg;
+ struct xen_vbd *vbd = &blkif->vbd;
+ unsigned long timeout;
+ int ret;
+
+ xen_blkif_get(blkif);
+
+ while (!kthread_should_stop()) {
+ if (try_to_freeze())
+ continue;
+ if (unlikely(vbd->size != vbd_sz(vbd)))
+ xen_vbd_resize(blkif);
+
+ timeout = msecs_to_jiffies(LRU_INTERVAL);
+
+ timeout = wait_event_interruptible_timeout(
+ blkif->wq,
+ blkif->waiting_reqs || kthread_should_stop(),
+ timeout);
+ if (timeout == 0)
+ goto purge_gnt_list;
+ timeout = wait_event_interruptible_timeout(
+ blkif->pending_free_wq,
+ !list_empty(&blkif->pending_free) ||
+ kthread_should_stop(),
+ timeout);
+ if (timeout == 0)
+ goto purge_gnt_list;
+
+ blkif->waiting_reqs = 0;
+ smp_mb(); /* clear flag *before* checking for work */
+
+ ret = do_block_io_op(blkif);
+ if (ret > 0)
+ blkif->waiting_reqs = 1;
+ if (ret == -EACCES)
+ wait_event_interruptible(blkif->shutdown_wq,
+ kthread_should_stop());
+
+purge_gnt_list:
+ if (blkif->vbd.feature_gnt_persistent &&
+ time_after(jiffies, blkif->next_lru)) {
+ purge_persistent_gnt(blkif);
+ blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
+ }
+
+ /* Shrink if we have more than xen_blkif_max_buffer_pages */
+ shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);
+
+ if (log_stats && time_after(jiffies, blkif->st_print))
+ print_stats(blkif);
+ }
+
+ /* Drain pending purge work */
+ flush_work(&blkif->persistent_purge_work);
+
+ if (log_stats)
+ print_stats(blkif);
+
+ blkif->xenblkd = NULL;
+ xen_blkif_put(blkif);
+
+ return 0;
+}
+
+/*
+ * Remove persistent grants and empty the pool of free pages
+ */
+void xen_blkbk_free_caches(struct xen_blkif *blkif)
+{
+ /* Free all persistent grant pages */
+ if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
+ free_persistent_gnts(blkif, &blkif->persistent_gnts,
+ blkif->persistent_gnt_c);
+
+ BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
+ blkif->persistent_gnt_c = 0;
+
+ /* Since we are shutting down remove all pages from the buffer */
+ shrink_free_pagepool(blkif, 0 /* All */);
+}
+
+static unsigned int xen_blkbk_unmap_prepare(
+ struct xen_blkif *blkif,
+ struct grant_page **pages,
+ unsigned int num,
+ struct gnttab_unmap_grant_ref *unmap_ops,
+ struct page **unmap_pages)
+{
+ unsigned int i, invcount = 0;
+
+ for (i = 0; i < num; i++) {
+ if (pages[i]->persistent_gnt != NULL) {
+ put_persistent_gnt(blkif, pages[i]->persistent_gnt);
+ continue;
+ }
+ if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
+ continue;
+ unmap_pages[invcount] = pages[i]->page;
+ gnttab_set_unmap_op(&unmap_ops[invcount], vaddr(pages[i]->page),
+ GNTMAP_host_map, pages[i]->handle);
+ pages[i]->handle = BLKBACK_INVALID_HANDLE;
+ invcount++;
+ }
+
+ return invcount;
+}
+
+static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
+{
+ struct pending_req* pending_req = (struct pending_req*) (data->data);
+ struct xen_blkif *blkif = pending_req->blkif;
+
+ /* BUG_ON used to reproduce existing behaviour,
+ but is this the best way to deal with this? */
+ BUG_ON(result);
+
+ put_free_pages(blkif, data->pages, data->count);
+ make_response(blkif, pending_req->id,
+ pending_req->operation, pending_req->status);
+ free_req(blkif, pending_req);
+ /*
+ * Make sure the request is freed before releasing blkif,
+ * or there could be a race between free_req and the
+ * cleanup done in xen_blkif_free during shutdown.
+ *
+ * NB: The fact that we might try to wake up pending_free_wq
+ * before drain_complete (in case there's a drain going on)
+ * it's not a problem with our current implementation
+ * because we can assure there's no thread waiting on
+ * pending_free_wq if there's a drain going on, but it has
+ * to be taken into account if the current model is changed.
+ */
+ if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) {
+ complete(&blkif->drain_complete);
+ }
+ xen_blkif_put(blkif);
+}
+
+static void xen_blkbk_unmap_and_respond(struct pending_req *req)
+{
+ struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data;
+ struct xen_blkif *blkif = req->blkif;
+ struct grant_page **pages = req->segments;
+ unsigned int invcount;
+
+ invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_pages,
+ req->unmap, req->unmap_pages);
+
+ work->data = req;
+ work->done = xen_blkbk_unmap_and_respond_callback;
+ work->unmap_ops = req->unmap;
+ work->kunmap_ops = NULL;
+ work->pages = req->unmap_pages;
+ work->count = invcount;
+
+ gnttab_unmap_refs_async(&req->gnttab_unmap_data);
+}
+
+
+/*
+ * Unmap the grant references.
+ *
+ * This could accumulate ops up to the batch size to reduce the number
+ * of hypercalls, but since this is only used in error paths there's
+ * no real need.
+ */
+static void xen_blkbk_unmap(struct xen_blkif *blkif,
+ struct grant_page *pages[],
+ int num)
+{
+ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ unsigned int invcount = 0;
+ int ret;
+
+ while (num) {
+ unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+ invcount = xen_blkbk_unmap_prepare(blkif, pages, batch,
+ unmap, unmap_pages);
+ if (invcount) {
+ ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
+ BUG_ON(ret);
+ put_free_pages(blkif, unmap_pages, invcount);
+ }
+ pages += batch;
+ num -= batch;
+ }
+}
+
+static int xen_blkbk_map(struct xen_blkif *blkif,
+ struct grant_page *pages[],
+ int num, bool ro)
+{
+ struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ struct persistent_gnt *persistent_gnt = NULL;
+ phys_addr_t addr = 0;
+ int i, seg_idx, new_map_idx;
+ int segs_to_map = 0;
+ int ret = 0;
+ int last_map = 0, map_until = 0;
+ int use_persistent_gnts;
+
+ use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
+
+ /*
+ * Fill out preq.nr_sects with proper amount of sectors, and setup
+ * assign map[..] with the PFN of the page in our domain with the
+ * corresponding grant reference for each page.
+ */
+again:
+ for (i = map_until; i < num; i++) {
+ uint32_t flags;
+
+ if (use_persistent_gnts)
+ persistent_gnt = get_persistent_gnt(
+ blkif,
+ pages[i]->gref);
+
+ if (persistent_gnt) {
+ /*
+ * We are using persistent grants and
+ * the grant is already mapped
+ */
+ pages[i]->page = persistent_gnt->page;
+ pages[i]->persistent_gnt = persistent_gnt;
+ } else {
+ if (get_free_page(blkif, &pages[i]->page))
+ goto out_of_memory;
+ addr = vaddr(pages[i]->page);
+ pages_to_gnt[segs_to_map] = pages[i]->page;
+ pages[i]->persistent_gnt = NULL;
+ flags = GNTMAP_host_map;
+ if (!use_persistent_gnts && ro)
+ flags |= GNTMAP_readonly;
+ gnttab_set_map_op(&map[segs_to_map++], addr,
+ flags, pages[i]->gref,
+ blkif->domid);
+ }
+ map_until = i + 1;
+ if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
+ break;
+ }
+
+ if (segs_to_map) {
+ ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
+ BUG_ON(ret);
+ }
+
+ /*
+ * Now swizzle the MFN in our domain with the MFN from the other domain
+ * so that when we access vaddr(pending_req,i) it has the contents of
+ * the page from the other domain.
+ */
+ for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
+ if (!pages[seg_idx]->persistent_gnt) {
+ /* This is a newly mapped grant */
+ BUG_ON(new_map_idx >= segs_to_map);
+ if (unlikely(map[new_map_idx].status != 0)) {
+ pr_debug("invalid buffer -- could not remap it\n");
+ put_free_pages(blkif, &pages[seg_idx]->page, 1);
+ pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
+ ret |= 1;
+ goto next;
+ }
+ pages[seg_idx]->handle = map[new_map_idx].handle;
+ } else {
+ continue;
+ }
+ if (use_persistent_gnts &&
+ blkif->persistent_gnt_c < xen_blkif_max_pgrants) {
+ /*
+ * We are using persistent grants, the grant is
+ * not mapped but we might have room for it.
+ */
+ persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
+ GFP_KERNEL);
+ if (!persistent_gnt) {
+ /*
+ * If we don't have enough memory to
+ * allocate the persistent_gnt struct
+ * map this grant non-persistenly
+ */
+ goto next;
+ }
+ persistent_gnt->gnt = map[new_map_idx].ref;
+ persistent_gnt->handle = map[new_map_idx].handle;
+ persistent_gnt->page = pages[seg_idx]->page;
+ if (add_persistent_gnt(blkif,
+ persistent_gnt)) {
+ kfree(persistent_gnt);
+ persistent_gnt = NULL;
+ goto next;
+ }
+ pages[seg_idx]->persistent_gnt = persistent_gnt;
+ pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
+ persistent_gnt->gnt, blkif->persistent_gnt_c,
+ xen_blkif_max_pgrants);
+ goto next;
+ }
+ if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
+ blkif->vbd.overflow_max_grants = 1;
+ pr_debug("domain %u, device %#x is using maximum number of persistent grants\n",
+ blkif->domid, blkif->vbd.handle);
+ }
+ /*
+ * We could not map this grant persistently, so use it as
+ * a non-persistent grant.
+ */
+next:
+ new_map_idx++;
+ }
+ segs_to_map = 0;
+ last_map = map_until;
+ if (map_until != num)
+ goto again;
+
+ return ret;
+
+out_of_memory:
+ pr_alert("%s: out of memory\n", __func__);
+ put_free_pages(blkif, pages_to_gnt, segs_to_map);
+ return -ENOMEM;
+}
+
+static int xen_blkbk_map_seg(struct pending_req *pending_req)
+{
+ int rc;
+
+ rc = xen_blkbk_map(pending_req->blkif, pending_req->segments,
+ pending_req->nr_pages,
+ (pending_req->operation != BLKIF_OP_READ));
+
+ return rc;
+}
+
+static int xen_blkbk_parse_indirect(struct blkif_request *req,
+ struct pending_req *pending_req,
+ struct seg_buf seg[],
+ struct phys_req *preq)
+{
+ struct grant_page **pages = pending_req->indirect_pages;
+ struct xen_blkif *blkif = pending_req->blkif;
+ int indirect_grefs, rc, n, nseg, i;
+ struct blkif_request_segment *segments = NULL;
+
+ nseg = pending_req->nr_pages;
+ indirect_grefs = INDIRECT_PAGES(nseg);
+ BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
+
+ for (i = 0; i < indirect_grefs; i++)
+ pages[i]->gref = req->u.indirect.indirect_grefs[i];
+
+ rc = xen_blkbk_map(blkif, pages, indirect_grefs, true);
+ if (rc)
+ goto unmap;
+
+ for (n = 0, i = 0; n < nseg; n++) {
+ if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
+ /* Map indirect segments */
+ if (segments)
+ kunmap_atomic(segments);
+ segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
+ }
+ i = n % SEGS_PER_INDIRECT_FRAME;
+ pending_req->segments[n]->gref = segments[i].gref;
+ seg[n].nsec = segments[i].last_sect -
+ segments[i].first_sect + 1;
+ seg[n].offset = (segments[i].first_sect << 9);
+ if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) ||
+ (segments[i].last_sect < segments[i].first_sect)) {
+ rc = -EINVAL;
+ goto unmap;
+ }
+ preq->nr_sects += seg[n].nsec;
+ }
+
+unmap:
+ if (segments)
+ kunmap_atomic(segments);
+ xen_blkbk_unmap(blkif, pages, indirect_grefs);
+ return rc;
+}
+
+static int dispatch_discard_io(struct xen_blkif *blkif,
+ struct blkif_request *req)
+{
+ int err = 0;
+ int status = BLKIF_RSP_OKAY;
+ struct block_device *bdev = blkif->vbd.bdev;
+ unsigned long secure;
+ struct phys_req preq;
+
+ xen_blkif_get(blkif);
+
+ preq.sector_number = req->u.discard.sector_number;
+ preq.nr_sects = req->u.discard.nr_sectors;
+
+ err = xen_vbd_translate(&preq, blkif, WRITE);
+ if (err) {
+ pr_warn("access denied: DISCARD [%llu->%llu] on dev=%04x\n",
+ preq.sector_number,
+ preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
+ goto fail_response;
+ }
+ blkif->st_ds_req++;
+
+ secure = (blkif->vbd.discard_secure &&
+ (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
+ BLKDEV_DISCARD_SECURE : 0;
+
+ err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
+ req->u.discard.nr_sectors,
+ GFP_KERNEL, secure);
+fail_response:
+ if (err == -EOPNOTSUPP) {
+ pr_debug("discard op failed, not supported\n");
+ status = BLKIF_RSP_EOPNOTSUPP;
+ } else if (err)
+ status = BLKIF_RSP_ERROR;
+
+ make_response(blkif, req->u.discard.id, req->operation, status);
+ xen_blkif_put(blkif);
+ return err;
+}
+
+static int dispatch_other_io(struct xen_blkif *blkif,
+ struct blkif_request *req,
+ struct pending_req *pending_req)
+{
+ free_req(blkif, pending_req);
+ make_response(blkif, req->u.other.id, req->operation,
+ BLKIF_RSP_EOPNOTSUPP);
+ return -EIO;
+}
+
+static void xen_blk_drain_io(struct xen_blkif *blkif)
+{
+ atomic_set(&blkif->drain, 1);
+ do {
+ if (atomic_read(&blkif->inflight) == 0)
+ break;
+ wait_for_completion_interruptible_timeout(
+ &blkif->drain_complete, HZ);
+
+ if (!atomic_read(&blkif->drain))
+ break;
+ } while (!kthread_should_stop());
+ atomic_set(&blkif->drain, 0);
+}
+
+/*
+ * Completion callback on the bio's. Called as bh->b_end_io()
+ */
+
+static void __end_block_io_op(struct pending_req *pending_req, int error)
+{
+ /* An error fails the entire request. */
+ if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
+ (error == -EOPNOTSUPP)) {
+ pr_debug("flush diskcache op failed, not supported\n");
+ xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0);
+ pending_req->status = BLKIF_RSP_EOPNOTSUPP;
+ } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
+ (error == -EOPNOTSUPP)) {
+ pr_debug("write barrier op failed, not supported\n");
+ xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0);
+ pending_req->status = BLKIF_RSP_EOPNOTSUPP;
+ } else if (error) {
+ pr_debug("Buffer not up-to-date at end of operation,"
+ " error=%d\n", error);
+ pending_req->status = BLKIF_RSP_ERROR;
+ }
+
+ /*
+ * If all of the bio's have completed it is time to unmap
+ * the grant references associated with 'request' and provide
+ * the proper response on the ring.
+ */
+ if (atomic_dec_and_test(&pending_req->pendcnt))
+ xen_blkbk_unmap_and_respond(pending_req);
+}
+
+/*
+ * bio callback.
+ */
+static void end_block_io_op(struct bio *bio, int error)
+{
+ __end_block_io_op(bio->bi_private, error);
+ bio_put(bio);
+}
+
+
+
+/*
+ * Function to copy the from the ring buffer the 'struct blkif_request'
+ * (which has the sectors we want, number of them, grant references, etc),
+ * and transmute it to the block API to hand it over to the proper block disk.
+ */
+static int
+__do_block_io_op(struct xen_blkif *blkif)
+{
+ union blkif_back_rings *blk_rings = &blkif->blk_rings;
+ struct blkif_request req;
+ struct pending_req *pending_req;
+ RING_IDX rc, rp;
+ int more_to_do = 0;
+
+ rc = blk_rings->common.req_cons;
+ rp = blk_rings->common.sring->req_prod;
+ rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+ if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
+ rc = blk_rings->common.rsp_prod_pvt;
+ pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
+ rp, rc, rp - rc, blkif->vbd.pdevice);
+ return -EACCES;
+ }
+ while (rc != rp) {
+
+ if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
+ break;
+
+ if (kthread_should_stop()) {
+ more_to_do = 1;
+ break;
+ }
+
+ pending_req = alloc_req(blkif);
+ if (NULL == pending_req) {
+ blkif->st_oo_req++;
+ more_to_do = 1;
+ break;
+ }
+
+ switch (blkif->blk_protocol) {
+ case BLKIF_PROTOCOL_NATIVE:
+ memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
+ break;
+ case BLKIF_PROTOCOL_X86_32:
+ blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
+ break;
+ case BLKIF_PROTOCOL_X86_64:
+ blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
+ break;
+ default:
+ BUG();
+ }
+ blk_rings->common.req_cons = ++rc; /* before make_response() */
+
+ /* Apply all sanity checks to /private copy/ of request. */
+ barrier();
+
+ switch (req.operation) {
+ case BLKIF_OP_READ:
+ case BLKIF_OP_WRITE:
+ case BLKIF_OP_WRITE_BARRIER:
+ case BLKIF_OP_FLUSH_DISKCACHE:
+ case BLKIF_OP_INDIRECT:
+ if (dispatch_rw_block_io(blkif, &req, pending_req))
+ goto done;
+ break;
+ case BLKIF_OP_DISCARD:
+ free_req(blkif, pending_req);
+ if (dispatch_discard_io(blkif, &req))
+ goto done;
+ break;
+ default:
+ if (dispatch_other_io(blkif, &req, pending_req))
+ goto done;
+ break;
+ }
+
+ /* Yield point for this unbounded loop. */
+ cond_resched();
+ }
+done:
+ return more_to_do;
+}
+
+static int
+do_block_io_op(struct xen_blkif *blkif)
+{
+ union blkif_back_rings *blk_rings = &blkif->blk_rings;
+ int more_to_do;
+
+ do {
+ more_to_do = __do_block_io_op(blkif);
+ if (more_to_do)
+ break;
+
+ RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
+ } while (more_to_do);
+
+ return more_to_do;
+}
+/*
+ * Transmutation of the 'struct blkif_request' to a proper 'struct bio'
+ * and call the 'submit_bio' to pass it to the underlying storage.
+ */
+static int dispatch_rw_block_io(struct xen_blkif *blkif,
+ struct blkif_request *req,
+ struct pending_req *pending_req)
+{
+ struct phys_req preq;
+ struct seg_buf *seg = pending_req->seg;
+ unsigned int nseg;
+ struct bio *bio = NULL;
+ struct bio **biolist = pending_req->biolist;
+ int i, nbio = 0;
+ int operation;
+ struct blk_plug plug;
+ bool drain = false;
+ struct grant_page **pages = pending_req->segments;
+ unsigned short req_operation;
+
+ req_operation = req->operation == BLKIF_OP_INDIRECT ?
+ req->u.indirect.indirect_op : req->operation;
+ if ((req->operation == BLKIF_OP_INDIRECT) &&
+ (req_operation != BLKIF_OP_READ) &&
+ (req_operation != BLKIF_OP_WRITE)) {
+ pr_debug("Invalid indirect operation (%u)\n", req_operation);
+ goto fail_response;
+ }
+
+ switch (req_operation) {
+ case BLKIF_OP_READ:
+ blkif->st_rd_req++;
+ operation = READ;
+ break;
+ case BLKIF_OP_WRITE:
+ blkif->st_wr_req++;
+ operation = WRITE_ODIRECT;
+ break;
+ case BLKIF_OP_WRITE_BARRIER:
+ drain = true;
+ case BLKIF_OP_FLUSH_DISKCACHE:
+ blkif->st_f_req++;
+ operation = WRITE_FLUSH;
+ break;
+ default:
+ operation = 0; /* make gcc happy */
+ goto fail_response;
+ break;
+ }
+
+ /* Check that the number of segments is sane. */
+ nseg = req->operation == BLKIF_OP_INDIRECT ?
+ req->u.indirect.nr_segments : req->u.rw.nr_segments;
+
+ if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
+ unlikely((req->operation != BLKIF_OP_INDIRECT) &&
+ (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
+ unlikely((req->operation == BLKIF_OP_INDIRECT) &&
+ (nseg > MAX_INDIRECT_SEGMENTS))) {
+ pr_debug("Bad number of segments in request (%d)\n", nseg);
+ /* Haven't submitted any bio's yet. */
+ goto fail_response;
+ }
+
+ preq.nr_sects = 0;
+
+ pending_req->blkif = blkif;
+ pending_req->id = req->u.rw.id;
+ pending_req->operation = req_operation;
+ pending_req->status = BLKIF_RSP_OKAY;
+ pending_req->nr_pages = nseg;
+
+ if (req->operation != BLKIF_OP_INDIRECT) {
+ preq.dev = req->u.rw.handle;
+ preq.sector_number = req->u.rw.sector_number;
+ for (i = 0; i < nseg; i++) {
+ pages[i]->gref = req->u.rw.seg[i].gref;
+ seg[i].nsec = req->u.rw.seg[i].last_sect -
+ req->u.rw.seg[i].first_sect + 1;
+ seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
+ if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
+ (req->u.rw.seg[i].last_sect <
+ req->u.rw.seg[i].first_sect))
+ goto fail_response;
+ preq.nr_sects += seg[i].nsec;
+ }
+ } else {
+ preq.dev = req->u.indirect.handle;
+ preq.sector_number = req->u.indirect.sector_number;
+ if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
+ goto fail_response;
+ }
+
+ if (xen_vbd_translate(&preq, blkif, operation) != 0) {
+ pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
+ operation == READ ? "read" : "write",
+ preq.sector_number,
+ preq.sector_number + preq.nr_sects,
+ blkif->vbd.pdevice);
+ goto fail_response;
+ }
+
+ /*
+ * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
+ * is set there.
+ */
+ for (i = 0; i < nseg; i++) {
+ if (((int)preq.sector_number|(int)seg[i].nsec) &
+ ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
+ pr_debug("Misaligned I/O request from domain %d\n",
+ blkif->domid);
+ goto fail_response;
+ }
+ }
+
+ /* Wait on all outstanding I/O's and once that has been completed
+ * issue the WRITE_FLUSH.
+ */
+ if (drain)
+ xen_blk_drain_io(pending_req->blkif);
+
+ /*
+ * If we have failed at this point, we need to undo the M2P override,
+ * set gnttab_set_unmap_op on all of the grant references and perform
+ * the hypercall to unmap the grants - that is all done in
+ * xen_blkbk_unmap.
+ */
+ if (xen_blkbk_map_seg(pending_req))
+ goto fail_flush;
+
+ /*
+ * This corresponding xen_blkif_put is done in __end_block_io_op, or
+ * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
+ */
+ xen_blkif_get(blkif);
+ atomic_inc(&blkif->inflight);
+
+ for (i = 0; i < nseg; i++) {
+ while ((bio == NULL) ||
+ (bio_add_page(bio,
+ pages[i]->page,
+ seg[i].nsec << 9,
+ seg[i].offset) == 0)) {
+
+ int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES);
+ bio = bio_alloc(GFP_KERNEL, nr_iovecs);
+ if (unlikely(bio == NULL))
+ goto fail_put_bio;
+
+ biolist[nbio++] = bio;
+ bio->bi_bdev = preq.bdev;
+ bio->bi_private = pending_req;
+ bio->bi_end_io = end_block_io_op;
+ bio->bi_iter.bi_sector = preq.sector_number;
+ }
+
+ preq.sector_number += seg[i].nsec;
+ }
+
+ /* This will be hit if the operation was a flush or discard. */
+ if (!bio) {
+ BUG_ON(operation != WRITE_FLUSH);
+
+ bio = bio_alloc(GFP_KERNEL, 0);
+ if (unlikely(bio == NULL))
+ goto fail_put_bio;
+
+ biolist[nbio++] = bio;
+ bio->bi_bdev = preq.bdev;
+ bio->bi_private = pending_req;
+ bio->bi_end_io = end_block_io_op;
+ }
+
+ atomic_set(&pending_req->pendcnt, nbio);
+ blk_start_plug(&plug);
+
+ for (i = 0; i < nbio; i++)
+ submit_bio(operation, biolist[i]);
+
+ /* Let the I/Os go.. */
+ blk_finish_plug(&plug);
+
+ if (operation == READ)
+ blkif->st_rd_sect += preq.nr_sects;
+ else if (operation & WRITE)
+ blkif->st_wr_sect += preq.nr_sects;
+
+ return 0;
+
+ fail_flush:
+ xen_blkbk_unmap(blkif, pending_req->segments,
+ pending_req->nr_pages);
+ fail_response:
+ /* Haven't submitted any bio's yet. */
+ make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
+ free_req(blkif, pending_req);
+ msleep(1); /* back off a bit */
+ return -EIO;
+
+ fail_put_bio:
+ for (i = 0; i < nbio; i++)
+ bio_put(biolist[i]);
+ atomic_set(&pending_req->pendcnt, 1);
+ __end_block_io_op(pending_req, -EINVAL);
+ msleep(1); /* back off a bit */
+ return -EIO;
+}
+
+
+
+/*
+ * Put a response on the ring on how the operation fared.
+ */
+static void make_response(struct xen_blkif *blkif, u64 id,
+ unsigned short op, int st)
+{
+ struct blkif_response resp;
+ unsigned long flags;
+ union blkif_back_rings *blk_rings = &blkif->blk_rings;
+ int notify;
+
+ resp.id = id;
+ resp.operation = op;
+ resp.status = st;
+
+ spin_lock_irqsave(&blkif->blk_ring_lock, flags);
+ /* Place on the response ring for the relevant domain. */
+ switch (blkif->blk_protocol) {
+ case BLKIF_PROTOCOL_NATIVE:
+ memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
+ &resp, sizeof(resp));
+ break;
+ case BLKIF_PROTOCOL_X86_32:
+ memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
+ &resp, sizeof(resp));
+ break;
+ case BLKIF_PROTOCOL_X86_64:
+ memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
+ &resp, sizeof(resp));
+ break;
+ default:
+ BUG();
+ }
+ blk_rings->common.rsp_prod_pvt++;
+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
+ spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
+ if (notify)
+ notify_remote_via_irq(blkif->irq);
+}
+
+static int __init xen_blkif_init(void)
+{
+ int rc = 0;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ rc = xen_blkif_interface_init();
+ if (rc)
+ goto failed_init;
+
+ rc = xen_blkif_xenbus_init();
+ if (rc)
+ goto failed_init;
+
+ failed_init:
+ return rc;
+}
+
+module_init(xen_blkif_init);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("xen-backend:vbd");
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
new file mode 100644
index 000000000..f620b5d3f
--- /dev/null
+++ b/drivers/block/xen-blkback/common.h
@@ -0,0 +1,492 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_BLKIF__BACKEND__COMMON_H__
+#define __XEN_BLKIF__BACKEND__COMMON_H__
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+#include <linux/io.h>
+#include <linux/rbtree.h>
+#include <asm/setup.h>
+#include <asm/pgalloc.h>
+#include <asm/hypervisor.h>
+#include <xen/grant_table.h>
+#include <xen/xenbus.h>
+#include <xen/interface/io/ring.h>
+#include <xen/interface/io/blkif.h>
+#include <xen/interface/io/protocols.h>
+
+/*
+ * This is the maximum number of segments that would be allowed in indirect
+ * requests. This value will also be passed to the frontend.
+ */
+#define MAX_INDIRECT_SEGMENTS 256
+
+#define SEGS_PER_INDIRECT_FRAME \
+ (PAGE_SIZE/sizeof(struct blkif_request_segment))
+#define MAX_INDIRECT_PAGES \
+ ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
+#define INDIRECT_PAGES(_segs) \
+ ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
+
+/* Not a real protocol. Used to generate ring structs which contain
+ * the elements common to all protocols only. This way we get a
+ * compiler-checkable way to use common struct elements, so we can
+ * avoid using switch(protocol) in a number of places. */
+struct blkif_common_request {
+ char dummy;
+};
+struct blkif_common_response {
+ char dummy;
+};
+
+struct blkif_x86_32_request_rw {
+ uint8_t nr_segments; /* number of segments */
+ blkif_vdev_t handle; /* only for read/write requests */
+ uint64_t id; /* private guest value, echoed in resp */
+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
+ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+} __attribute__((__packed__));
+
+struct blkif_x86_32_request_discard {
+ uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */
+ blkif_vdev_t _pad1; /* was "handle" for read/write requests */
+ uint64_t id; /* private guest value, echoed in resp */
+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
+ uint64_t nr_sectors;
+} __attribute__((__packed__));
+
+struct blkif_x86_32_request_other {
+ uint8_t _pad1;
+ blkif_vdev_t _pad2;
+ uint64_t id; /* private guest value, echoed in resp */
+} __attribute__((__packed__));
+
+struct blkif_x86_32_request_indirect {
+ uint8_t indirect_op;
+ uint16_t nr_segments;
+ uint64_t id;
+ blkif_sector_t sector_number;
+ blkif_vdev_t handle;
+ uint16_t _pad1;
+ grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
+ /*
+ * The maximum number of indirect segments (and pages) that will
+ * be used is determined by MAX_INDIRECT_SEGMENTS, this value
+ * is also exported to the guest (via xenstore
+ * feature-max-indirect-segments entry), so the frontend knows how
+ * many indirect segments the backend supports.
+ */
+ uint64_t _pad2; /* make it 64 byte aligned */
+} __attribute__((__packed__));
+
+struct blkif_x86_32_request {
+ uint8_t operation; /* BLKIF_OP_??? */
+ union {
+ struct blkif_x86_32_request_rw rw;
+ struct blkif_x86_32_request_discard discard;
+ struct blkif_x86_32_request_other other;
+ struct blkif_x86_32_request_indirect indirect;
+ } u;
+} __attribute__((__packed__));
+
+/* i386 protocol version */
+#pragma pack(push, 4)
+struct blkif_x86_32_response {
+ uint64_t id; /* copied from request */
+ uint8_t operation; /* copied from request */
+ int16_t status; /* BLKIF_RSP_??? */
+};
+#pragma pack(pop)
+/* x86_64 protocol version */
+
+struct blkif_x86_64_request_rw {
+ uint8_t nr_segments; /* number of segments */
+ blkif_vdev_t handle; /* only for read/write requests */
+ uint32_t _pad1; /* offsetof(blkif_reqest..,u.rw.id)==8 */
+ uint64_t id;
+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
+ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+} __attribute__((__packed__));
+
+struct blkif_x86_64_request_discard {
+ uint8_t flag; /* BLKIF_DISCARD_SECURE or zero */
+ blkif_vdev_t _pad1; /* was "handle" for read/write requests */
+ uint32_t _pad2; /* offsetof(blkif_..,u.discard.id)==8 */
+ uint64_t id;
+ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */
+ uint64_t nr_sectors;
+} __attribute__((__packed__));
+
+struct blkif_x86_64_request_other {
+ uint8_t _pad1;
+ blkif_vdev_t _pad2;
+ uint32_t _pad3; /* offsetof(blkif_..,u.discard.id)==8 */
+ uint64_t id; /* private guest value, echoed in resp */
+} __attribute__((__packed__));
+
+struct blkif_x86_64_request_indirect {
+ uint8_t indirect_op;
+ uint16_t nr_segments;
+ uint32_t _pad1; /* offsetof(blkif_..,u.indirect.id)==8 */
+ uint64_t id;
+ blkif_sector_t sector_number;
+ blkif_vdev_t handle;
+ uint16_t _pad2;
+ grant_ref_t indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
+ /*
+ * The maximum number of indirect segments (and pages) that will
+ * be used is determined by MAX_INDIRECT_SEGMENTS, this value
+ * is also exported to the guest (via xenstore
+ * feature-max-indirect-segments entry), so the frontend knows how
+ * many indirect segments the backend supports.
+ */
+ uint32_t _pad3; /* make it 64 byte aligned */
+} __attribute__((__packed__));
+
+struct blkif_x86_64_request {
+ uint8_t operation; /* BLKIF_OP_??? */
+ union {
+ struct blkif_x86_64_request_rw rw;
+ struct blkif_x86_64_request_discard discard;
+ struct blkif_x86_64_request_other other;
+ struct blkif_x86_64_request_indirect indirect;
+ } u;
+} __attribute__((__packed__));
+
+struct blkif_x86_64_response {
+ uint64_t __attribute__((__aligned__(8))) id;
+ uint8_t operation; /* copied from request */
+ int16_t status; /* BLKIF_RSP_??? */
+};
+
+DEFINE_RING_TYPES(blkif_common, struct blkif_common_request,
+ struct blkif_common_response);
+DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request,
+ struct blkif_x86_32_response);
+DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request,
+ struct blkif_x86_64_response);
+
+union blkif_back_rings {
+ struct blkif_back_ring native;
+ struct blkif_common_back_ring common;
+ struct blkif_x86_32_back_ring x86_32;
+ struct blkif_x86_64_back_ring x86_64;
+};
+
+enum blkif_protocol {
+ BLKIF_PROTOCOL_NATIVE = 1,
+ BLKIF_PROTOCOL_X86_32 = 2,
+ BLKIF_PROTOCOL_X86_64 = 3,
+};
+
+/*
+ * Default protocol if the frontend doesn't specify one.
+ */
+#ifdef CONFIG_X86
+# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_X86_32
+#else
+# define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_NATIVE
+#endif
+
+struct xen_vbd {
+ /* What the domain refers to this vbd as. */
+ blkif_vdev_t handle;
+ /* Non-zero -> read-only */
+ unsigned char readonly;
+ /* VDISK_xxx */
+ unsigned char type;
+ /* phys device that this vbd maps to. */
+ u32 pdevice;
+ struct block_device *bdev;
+ /* Cached size parameter. */
+ sector_t size;
+ unsigned int flush_support:1;
+ unsigned int discard_secure:1;
+ unsigned int feature_gnt_persistent:1;
+ unsigned int overflow_max_grants:1;
+};
+
+struct backend_info;
+
+/* Number of available flags */
+#define PERSISTENT_GNT_FLAGS_SIZE 2
+/* This persistent grant is currently in use */
+#define PERSISTENT_GNT_ACTIVE 0
+/*
+ * This persistent grant has been used, this flag is set when we remove the
+ * PERSISTENT_GNT_ACTIVE, to know that this grant has been used recently.
+ */
+#define PERSISTENT_GNT_WAS_ACTIVE 1
+
+/* Number of requests that we can fit in a ring */
+#define XEN_BLKIF_REQS 32
+
+struct persistent_gnt {
+ struct page *page;
+ grant_ref_t gnt;
+ grant_handle_t handle;
+ DECLARE_BITMAP(flags, PERSISTENT_GNT_FLAGS_SIZE);
+ struct rb_node node;
+ struct list_head remove_node;
+};
+
+struct xen_blkif {
+ /* Unique identifier for this interface. */
+ domid_t domid;
+ unsigned int handle;
+ /* Physical parameters of the comms window. */
+ unsigned int irq;
+ /* Comms information. */
+ enum blkif_protocol blk_protocol;
+ union blkif_back_rings blk_rings;
+ void *blk_ring;
+ /* The VBD attached to this interface. */
+ struct xen_vbd vbd;
+ /* Back pointer to the backend_info. */
+ struct backend_info *be;
+ /* Private fields. */
+ spinlock_t blk_ring_lock;
+ atomic_t refcnt;
+
+ wait_queue_head_t wq;
+ /* for barrier (drain) requests */
+ struct completion drain_complete;
+ atomic_t drain;
+ atomic_t inflight;
+ /* One thread per one blkif. */
+ struct task_struct *xenblkd;
+ unsigned int waiting_reqs;
+
+ /* tree to store persistent grants */
+ struct rb_root persistent_gnts;
+ unsigned int persistent_gnt_c;
+ atomic_t persistent_gnt_in_use;
+ unsigned long next_lru;
+
+ /* used by the kworker that offload work from the persistent purge */
+ struct list_head persistent_purge_list;
+ struct work_struct persistent_purge_work;
+
+ /* buffer of free pages to map grant refs */
+ spinlock_t free_pages_lock;
+ int free_pages_num;
+ struct list_head free_pages;
+
+ /* List of all 'pending_req' available */
+ struct list_head pending_free;
+ /* And its spinlock. */
+ spinlock_t pending_free_lock;
+ wait_queue_head_t pending_free_wq;
+
+ /* statistics */
+ unsigned long st_print;
+ unsigned long long st_rd_req;
+ unsigned long long st_wr_req;
+ unsigned long long st_oo_req;
+ unsigned long long st_f_req;
+ unsigned long long st_ds_req;
+ unsigned long long st_rd_sect;
+ unsigned long long st_wr_sect;
+
+ struct work_struct free_work;
+ /* Thread shutdown wait queue. */
+ wait_queue_head_t shutdown_wq;
+};
+
+struct seg_buf {
+ unsigned long offset;
+ unsigned int nsec;
+};
+
+struct grant_page {
+ struct page *page;
+ struct persistent_gnt *persistent_gnt;
+ grant_handle_t handle;
+ grant_ref_t gref;
+};
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
+ */
+struct pending_req {
+ struct xen_blkif *blkif;
+ u64 id;
+ int nr_pages;
+ atomic_t pendcnt;
+ unsigned short operation;
+ int status;
+ struct list_head free_list;
+ struct grant_page *segments[MAX_INDIRECT_SEGMENTS];
+ /* Indirect descriptors */
+ struct grant_page *indirect_pages[MAX_INDIRECT_PAGES];
+ struct seg_buf seg[MAX_INDIRECT_SEGMENTS];
+ struct bio *biolist[MAX_INDIRECT_SEGMENTS];
+ struct gnttab_unmap_grant_ref unmap[MAX_INDIRECT_SEGMENTS];
+ struct page *unmap_pages[MAX_INDIRECT_SEGMENTS];
+ struct gntab_unmap_queue_data gnttab_unmap_data;
+};
+
+
+#define vbd_sz(_v) ((_v)->bdev->bd_part ? \
+ (_v)->bdev->bd_part->nr_sects : \
+ get_capacity((_v)->bdev->bd_disk))
+
+#define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define xen_blkif_put(_b) \
+ do { \
+ if (atomic_dec_and_test(&(_b)->refcnt)) \
+ schedule_work(&(_b)->free_work);\
+ } while (0)
+
+struct phys_req {
+ unsigned short dev;
+ blkif_sector_t nr_sects;
+ struct block_device *bdev;
+ blkif_sector_t sector_number;
+};
+int xen_blkif_interface_init(void);
+
+int xen_blkif_xenbus_init(void);
+
+irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
+int xen_blkif_schedule(void *arg);
+int xen_blkif_purge_persistent(void *arg);
+void xen_blkbk_free_caches(struct xen_blkif *blkif);
+
+int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
+ struct backend_info *be, int state);
+
+int xen_blkbk_barrier(struct xenbus_transaction xbt,
+ struct backend_info *be, int state);
+struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
+void xen_blkbk_unmap_purged_grants(struct work_struct *work);
+
+static inline void blkif_get_x86_32_req(struct blkif_request *dst,
+ struct blkif_x86_32_request *src)
+{
+ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
+ dst->operation = src->operation;
+ switch (src->operation) {
+ case BLKIF_OP_READ:
+ case BLKIF_OP_WRITE:
+ case BLKIF_OP_WRITE_BARRIER:
+ case BLKIF_OP_FLUSH_DISKCACHE:
+ dst->u.rw.nr_segments = src->u.rw.nr_segments;
+ dst->u.rw.handle = src->u.rw.handle;
+ dst->u.rw.id = src->u.rw.id;
+ dst->u.rw.sector_number = src->u.rw.sector_number;
+ barrier();
+ if (n > dst->u.rw.nr_segments)
+ n = dst->u.rw.nr_segments;
+ for (i = 0; i < n; i++)
+ dst->u.rw.seg[i] = src->u.rw.seg[i];
+ break;
+ case BLKIF_OP_DISCARD:
+ dst->u.discard.flag = src->u.discard.flag;
+ dst->u.discard.id = src->u.discard.id;
+ dst->u.discard.sector_number = src->u.discard.sector_number;
+ dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
+ break;
+ case BLKIF_OP_INDIRECT:
+ dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
+ dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
+ dst->u.indirect.handle = src->u.indirect.handle;
+ dst->u.indirect.id = src->u.indirect.id;
+ dst->u.indirect.sector_number = src->u.indirect.sector_number;
+ barrier();
+ j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
+ for (i = 0; i < j; i++)
+ dst->u.indirect.indirect_grefs[i] =
+ src->u.indirect.indirect_grefs[i];
+ break;
+ default:
+ /*
+ * Don't know how to translate this op. Only get the
+ * ID so failure can be reported to the frontend.
+ */
+ dst->u.other.id = src->u.other.id;
+ break;
+ }
+}
+
+static inline void blkif_get_x86_64_req(struct blkif_request *dst,
+ struct blkif_x86_64_request *src)
+{
+ int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
+ dst->operation = src->operation;
+ switch (src->operation) {
+ case BLKIF_OP_READ:
+ case BLKIF_OP_WRITE:
+ case BLKIF_OP_WRITE_BARRIER:
+ case BLKIF_OP_FLUSH_DISKCACHE:
+ dst->u.rw.nr_segments = src->u.rw.nr_segments;
+ dst->u.rw.handle = src->u.rw.handle;
+ dst->u.rw.id = src->u.rw.id;
+ dst->u.rw.sector_number = src->u.rw.sector_number;
+ barrier();
+ if (n > dst->u.rw.nr_segments)
+ n = dst->u.rw.nr_segments;
+ for (i = 0; i < n; i++)
+ dst->u.rw.seg[i] = src->u.rw.seg[i];
+ break;
+ case BLKIF_OP_DISCARD:
+ dst->u.discard.flag = src->u.discard.flag;
+ dst->u.discard.id = src->u.discard.id;
+ dst->u.discard.sector_number = src->u.discard.sector_number;
+ dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
+ break;
+ case BLKIF_OP_INDIRECT:
+ dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
+ dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
+ dst->u.indirect.handle = src->u.indirect.handle;
+ dst->u.indirect.id = src->u.indirect.id;
+ dst->u.indirect.sector_number = src->u.indirect.sector_number;
+ barrier();
+ j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
+ for (i = 0; i < j; i++)
+ dst->u.indirect.indirect_grefs[i] =
+ src->u.indirect.indirect_grefs[i];
+ break;
+ default:
+ /*
+ * Don't know how to translate this op. Only get the
+ * ID so failure can be reported to the frontend.
+ */
+ dst->u.other.id = src->u.other.id;
+ break;
+ }
+}
+
+#endif /* __XEN_BLKIF__BACKEND__COMMON_H__ */
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
new file mode 100644
index 000000000..6ab69ad61
--- /dev/null
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -0,0 +1,934 @@
+/* Xenbus code for blkif backend
+ Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+ Copyright (C) 2005 XenSource Ltd
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+*/
+
+#define pr_fmt(fmt) "xen-blkback: " fmt
+
+#include <stdarg.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <xen/events.h>
+#include <xen/grant_table.h>
+#include "common.h"
+
+/* Enlarge the array size in order to fully show blkback name. */
+#define BLKBACK_NAME_LEN (20)
+
+struct backend_info {
+ struct xenbus_device *dev;
+ struct xen_blkif *blkif;
+ struct xenbus_watch backend_watch;
+ unsigned major;
+ unsigned minor;
+ char *mode;
+};
+
+static struct kmem_cache *xen_blkif_cachep;
+static void connect(struct backend_info *);
+static int connect_ring(struct backend_info *);
+static void backend_changed(struct xenbus_watch *, const char **,
+ unsigned int);
+static void xen_blkif_free(struct xen_blkif *blkif);
+static void xen_vbd_free(struct xen_vbd *vbd);
+
+struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be)
+{
+ return be->dev;
+}
+
+/*
+ * The last request could free the device from softirq context and
+ * xen_blkif_free() can sleep.
+ */
+static void xen_blkif_deferred_free(struct work_struct *work)
+{
+ struct xen_blkif *blkif;
+
+ blkif = container_of(work, struct xen_blkif, free_work);
+ xen_blkif_free(blkif);
+}
+
+static int blkback_name(struct xen_blkif *blkif, char *buf)
+{
+ char *devpath, *devname;
+ struct xenbus_device *dev = blkif->be->dev;
+
+ devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL);
+ if (IS_ERR(devpath))
+ return PTR_ERR(devpath);
+
+ devname = strstr(devpath, "/dev/");
+ if (devname != NULL)
+ devname += strlen("/dev/");
+ else
+ devname = devpath;
+
+ snprintf(buf, BLKBACK_NAME_LEN, "blkback.%d.%s", blkif->domid, devname);
+ kfree(devpath);
+
+ return 0;
+}
+
+static void xen_update_blkif_status(struct xen_blkif *blkif)
+{
+ int err;
+ char name[BLKBACK_NAME_LEN];
+
+ /* Not ready to connect? */
+ if (!blkif->irq || !blkif->vbd.bdev)
+ return;
+
+ /* Already connected? */
+ if (blkif->be->dev->state == XenbusStateConnected)
+ return;
+
+ /* Attempt to connect: exit if we fail to. */
+ connect(blkif->be);
+ if (blkif->be->dev->state != XenbusStateConnected)
+ return;
+
+ err = blkback_name(blkif, name);
+ if (err) {
+ xenbus_dev_error(blkif->be->dev, err, "get blkback dev name");
+ return;
+ }
+
+ err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping);
+ if (err) {
+ xenbus_dev_error(blkif->be->dev, err, "block flush");
+ return;
+ }
+ invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
+
+ blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, "%s", name);
+ if (IS_ERR(blkif->xenblkd)) {
+ err = PTR_ERR(blkif->xenblkd);
+ blkif->xenblkd = NULL;
+ xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
+ return;
+ }
+}
+
+static struct xen_blkif *xen_blkif_alloc(domid_t domid)
+{
+ struct xen_blkif *blkif;
+ struct pending_req *req, *n;
+ int i, j;
+
+ BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
+
+ blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL);
+ if (!blkif)
+ return ERR_PTR(-ENOMEM);
+
+ blkif->domid = domid;
+ spin_lock_init(&blkif->blk_ring_lock);
+ atomic_set(&blkif->refcnt, 1);
+ init_waitqueue_head(&blkif->wq);
+ init_completion(&blkif->drain_complete);
+ atomic_set(&blkif->drain, 0);
+ blkif->st_print = jiffies;
+ blkif->persistent_gnts.rb_node = NULL;
+ spin_lock_init(&blkif->free_pages_lock);
+ INIT_LIST_HEAD(&blkif->free_pages);
+ INIT_LIST_HEAD(&blkif->persistent_purge_list);
+ blkif->free_pages_num = 0;
+ atomic_set(&blkif->persistent_gnt_in_use, 0);
+ atomic_set(&blkif->inflight, 0);
+ INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants);
+
+ INIT_LIST_HEAD(&blkif->pending_free);
+ INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
+
+ for (i = 0; i < XEN_BLKIF_REQS; i++) {
+ req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ goto fail;
+ list_add_tail(&req->free_list,
+ &blkif->pending_free);
+ for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
+ req->segments[j] = kzalloc(sizeof(*req->segments[0]),
+ GFP_KERNEL);
+ if (!req->segments[j])
+ goto fail;
+ }
+ for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
+ req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]),
+ GFP_KERNEL);
+ if (!req->indirect_pages[j])
+ goto fail;
+ }
+ }
+ spin_lock_init(&blkif->pending_free_lock);
+ init_waitqueue_head(&blkif->pending_free_wq);
+ init_waitqueue_head(&blkif->shutdown_wq);
+
+ return blkif;
+
+fail:
+ list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
+ list_del(&req->free_list);
+ for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
+ if (!req->segments[j])
+ break;
+ kfree(req->segments[j]);
+ }
+ for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
+ if (!req->indirect_pages[j])
+ break;
+ kfree(req->indirect_pages[j]);
+ }
+ kfree(req);
+ }
+
+ kmem_cache_free(xen_blkif_cachep, blkif);
+
+ return ERR_PTR(-ENOMEM);
+}
+
+static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
+ unsigned int evtchn)
+{
+ int err;
+
+ /* Already connected through? */
+ if (blkif->irq)
+ return 0;
+
+ err = xenbus_map_ring_valloc(blkif->be->dev, &gref, 1,
+ &blkif->blk_ring);
+ if (err < 0)
+ return err;
+
+ switch (blkif->blk_protocol) {
+ case BLKIF_PROTOCOL_NATIVE:
+ {
+ struct blkif_sring *sring;
+ sring = (struct blkif_sring *)blkif->blk_ring;
+ BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_32:
+ {
+ struct blkif_x86_32_sring *sring_x86_32;
+ sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
+ BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_64:
+ {
+ struct blkif_x86_64_sring *sring_x86_64;
+ sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
+ BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
+ break;
+ }
+ default:
+ BUG();
+ }
+
+ err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn,
+ xen_blkif_be_int, 0,
+ "blkif-backend", blkif);
+ if (err < 0) {
+ xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring);
+ blkif->blk_rings.common.sring = NULL;
+ return err;
+ }
+ blkif->irq = err;
+
+ return 0;
+}
+
+static int xen_blkif_disconnect(struct xen_blkif *blkif)
+{
+ if (blkif->xenblkd) {
+ kthread_stop(blkif->xenblkd);
+ wake_up(&blkif->shutdown_wq);
+ blkif->xenblkd = NULL;
+ }
+
+ /* The above kthread_stop() guarantees that at this point we
+ * don't have any discard_io or other_io requests. So, checking
+ * for inflight IO is enough.
+ */
+ if (atomic_read(&blkif->inflight) > 0)
+ return -EBUSY;
+
+ if (blkif->irq) {
+ unbind_from_irqhandler(blkif->irq, blkif);
+ blkif->irq = 0;
+ }
+
+ if (blkif->blk_rings.common.sring) {
+ xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring);
+ blkif->blk_rings.common.sring = NULL;
+ }
+
+ /* Remove all persistent grants and the cache of ballooned pages. */
+ xen_blkbk_free_caches(blkif);
+
+ return 0;
+}
+
+static void xen_blkif_free(struct xen_blkif *blkif)
+{
+ struct pending_req *req, *n;
+ int i = 0, j;
+
+ xen_blkif_disconnect(blkif);
+ xen_vbd_free(&blkif->vbd);
+
+ /* Make sure everything is drained before shutting down */
+ BUG_ON(blkif->persistent_gnt_c != 0);
+ BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0);
+ BUG_ON(blkif->free_pages_num != 0);
+ BUG_ON(!list_empty(&blkif->persistent_purge_list));
+ BUG_ON(!list_empty(&blkif->free_pages));
+ BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
+
+ /* Check that there is no request in use */
+ list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
+ list_del(&req->free_list);
+
+ for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
+ kfree(req->segments[j]);
+
+ for (j = 0; j < MAX_INDIRECT_PAGES; j++)
+ kfree(req->indirect_pages[j]);
+
+ kfree(req);
+ i++;
+ }
+
+ WARN_ON(i != XEN_BLKIF_REQS);
+
+ kmem_cache_free(xen_blkif_cachep, blkif);
+}
+
+int __init xen_blkif_interface_init(void)
+{
+ xen_blkif_cachep = kmem_cache_create("blkif_cache",
+ sizeof(struct xen_blkif),
+ 0, 0, NULL);
+ if (!xen_blkif_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/*
+ * sysfs interface for VBD I/O requests
+ */
+
+#define VBD_SHOW(name, format, args...) \
+ static ssize_t show_##name(struct device *_dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+ { \
+ struct xenbus_device *dev = to_xenbus_device(_dev); \
+ struct backend_info *be = dev_get_drvdata(&dev->dev); \
+ \
+ return sprintf(buf, format, ##args); \
+ } \
+ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+VBD_SHOW(oo_req, "%llu\n", be->blkif->st_oo_req);
+VBD_SHOW(rd_req, "%llu\n", be->blkif->st_rd_req);
+VBD_SHOW(wr_req, "%llu\n", be->blkif->st_wr_req);
+VBD_SHOW(f_req, "%llu\n", be->blkif->st_f_req);
+VBD_SHOW(ds_req, "%llu\n", be->blkif->st_ds_req);
+VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect);
+VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect);
+
+static struct attribute *xen_vbdstat_attrs[] = {
+ &dev_attr_oo_req.attr,
+ &dev_attr_rd_req.attr,
+ &dev_attr_wr_req.attr,
+ &dev_attr_f_req.attr,
+ &dev_attr_ds_req.attr,
+ &dev_attr_rd_sect.attr,
+ &dev_attr_wr_sect.attr,
+ NULL
+};
+
+static struct attribute_group xen_vbdstat_group = {
+ .name = "statistics",
+ .attrs = xen_vbdstat_attrs,
+};
+
+VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
+VBD_SHOW(mode, "%s\n", be->mode);
+
+static int xenvbd_sysfs_addif(struct xenbus_device *dev)
+{
+ int error;
+
+ error = device_create_file(&dev->dev, &dev_attr_physical_device);
+ if (error)
+ goto fail1;
+
+ error = device_create_file(&dev->dev, &dev_attr_mode);
+ if (error)
+ goto fail2;
+
+ error = sysfs_create_group(&dev->dev.kobj, &xen_vbdstat_group);
+ if (error)
+ goto fail3;
+
+ return 0;
+
+fail3: sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group);
+fail2: device_remove_file(&dev->dev, &dev_attr_mode);
+fail1: device_remove_file(&dev->dev, &dev_attr_physical_device);
+ return error;
+}
+
+static void xenvbd_sysfs_delif(struct xenbus_device *dev)
+{
+ sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group);
+ device_remove_file(&dev->dev, &dev_attr_mode);
+ device_remove_file(&dev->dev, &dev_attr_physical_device);
+}
+
+
+static void xen_vbd_free(struct xen_vbd *vbd)
+{
+ if (vbd->bdev)
+ blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE);
+ vbd->bdev = NULL;
+}
+
+static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
+ unsigned major, unsigned minor, int readonly,
+ int cdrom)
+{
+ struct xen_vbd *vbd;
+ struct block_device *bdev;
+ struct request_queue *q;
+
+ vbd = &blkif->vbd;
+ vbd->handle = handle;
+ vbd->readonly = readonly;
+ vbd->type = 0;
+
+ vbd->pdevice = MKDEV(major, minor);
+
+ bdev = blkdev_get_by_dev(vbd->pdevice, vbd->readonly ?
+ FMODE_READ : FMODE_WRITE, NULL);
+
+ if (IS_ERR(bdev)) {
+ pr_warn("xen_vbd_create: device %08x could not be opened\n",
+ vbd->pdevice);
+ return -ENOENT;
+ }
+
+ vbd->bdev = bdev;
+ if (vbd->bdev->bd_disk == NULL) {
+ pr_warn("xen_vbd_create: device %08x doesn't exist\n",
+ vbd->pdevice);
+ xen_vbd_free(vbd);
+ return -ENOENT;
+ }
+ vbd->size = vbd_sz(vbd);
+
+ if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom)
+ vbd->type |= VDISK_CDROM;
+ if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
+ vbd->type |= VDISK_REMOVABLE;
+
+ q = bdev_get_queue(bdev);
+ if (q && q->flush_flags)
+ vbd->flush_support = true;
+
+ if (q && blk_queue_secdiscard(q))
+ vbd->discard_secure = true;
+
+ pr_debug("Successful creation of handle=%04x (dom=%u)\n",
+ handle, blkif->domid);
+ return 0;
+}
+static int xen_blkbk_remove(struct xenbus_device *dev)
+{
+ struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+ pr_debug("%s %p %d\n", __func__, dev, dev->otherend_id);
+
+ if (be->major || be->minor)
+ xenvbd_sysfs_delif(dev);
+
+ if (be->backend_watch.node) {
+ unregister_xenbus_watch(&be->backend_watch);
+ kfree(be->backend_watch.node);
+ be->backend_watch.node = NULL;
+ }
+
+ dev_set_drvdata(&dev->dev, NULL);
+
+ if (be->blkif) {
+ xen_blkif_disconnect(be->blkif);
+ xen_blkif_put(be->blkif);
+ }
+
+ kfree(be->mode);
+ kfree(be);
+ return 0;
+}
+
+int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
+ struct backend_info *be, int state)
+{
+ struct xenbus_device *dev = be->dev;
+ int err;
+
+ err = xenbus_printf(xbt, dev->nodename, "feature-flush-cache",
+ "%d", state);
+ if (err)
+ dev_warn(&dev->dev, "writing feature-flush-cache (%d)", err);
+
+ return err;
+}
+
+static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be)
+{
+ struct xenbus_device *dev = be->dev;
+ struct xen_blkif *blkif = be->blkif;
+ int err;
+ int state = 0, discard_enable;
+ struct block_device *bdev = be->blkif->vbd.bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ err = xenbus_scanf(XBT_NIL, dev->nodename, "discard-enable", "%d",
+ &discard_enable);
+ if (err == 1 && !discard_enable)
+ return;
+
+ if (blk_queue_discard(q)) {
+ err = xenbus_printf(xbt, dev->nodename,
+ "discard-granularity", "%u",
+ q->limits.discard_granularity);
+ if (err) {
+ dev_warn(&dev->dev, "writing discard-granularity (%d)", err);
+ return;
+ }
+ err = xenbus_printf(xbt, dev->nodename,
+ "discard-alignment", "%u",
+ q->limits.discard_alignment);
+ if (err) {
+ dev_warn(&dev->dev, "writing discard-alignment (%d)", err);
+ return;
+ }
+ state = 1;
+ /* Optional. */
+ err = xenbus_printf(xbt, dev->nodename,
+ "discard-secure", "%d",
+ blkif->vbd.discard_secure);
+ if (err) {
+ dev_warn(&dev->dev, "writing discard-secure (%d)", err);
+ return;
+ }
+ }
+ err = xenbus_printf(xbt, dev->nodename, "feature-discard",
+ "%d", state);
+ if (err)
+ dev_warn(&dev->dev, "writing feature-discard (%d)", err);
+}
+int xen_blkbk_barrier(struct xenbus_transaction xbt,
+ struct backend_info *be, int state)
+{
+ struct xenbus_device *dev = be->dev;
+ int err;
+
+ err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
+ "%d", state);
+ if (err)
+ dev_warn(&dev->dev, "writing feature-barrier (%d)", err);
+
+ return err;
+}
+
+/*
+ * Entry point to this code when a new device is created. Allocate the basic
+ * structures, and watch the store waiting for the hotplug scripts to tell us
+ * the device's physical major and minor numbers. Switch to InitWait.
+ */
+static int xen_blkbk_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ int err;
+ struct backend_info *be = kzalloc(sizeof(struct backend_info),
+ GFP_KERNEL);
+
+ /* match the pr_debug in xen_blkbk_remove */
+ pr_debug("%s %p %d\n", __func__, dev, dev->otherend_id);
+
+ if (!be) {
+ xenbus_dev_fatal(dev, -ENOMEM,
+ "allocating backend structure");
+ return -ENOMEM;
+ }
+ be->dev = dev;
+ dev_set_drvdata(&dev->dev, be);
+
+ be->blkif = xen_blkif_alloc(dev->otherend_id);
+ if (IS_ERR(be->blkif)) {
+ err = PTR_ERR(be->blkif);
+ be->blkif = NULL;
+ xenbus_dev_fatal(dev, err, "creating block interface");
+ goto fail;
+ }
+
+ /* setup back pointer */
+ be->blkif->be = be;
+
+ err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed,
+ "%s/%s", dev->nodename, "physical-device");
+ if (err)
+ goto fail;
+
+ err = xenbus_switch_state(dev, XenbusStateInitWait);
+ if (err)
+ goto fail;
+
+ return 0;
+
+fail:
+ pr_warn("%s failed\n", __func__);
+ xen_blkbk_remove(dev);
+ return err;
+}
+
+
+/*
+ * Callback received when the hotplug scripts have placed the physical-device
+ * node. Read it and the mode node, and create a vbd. If the frontend is
+ * ready, connect.
+ */
+static void backend_changed(struct xenbus_watch *watch,
+ const char **vec, unsigned int len)
+{
+ int err;
+ unsigned major;
+ unsigned minor;
+ struct backend_info *be
+ = container_of(watch, struct backend_info, backend_watch);
+ struct xenbus_device *dev = be->dev;
+ int cdrom = 0;
+ unsigned long handle;
+ char *device_type;
+
+ pr_debug("%s %p %d\n", __func__, dev, dev->otherend_id);
+
+ err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
+ &major, &minor);
+ if (XENBUS_EXIST_ERR(err)) {
+ /*
+ * Since this watch will fire once immediately after it is
+ * registered, we expect this. Ignore it, and wait for the
+ * hotplug scripts.
+ */
+ return;
+ }
+ if (err != 2) {
+ xenbus_dev_fatal(dev, err, "reading physical-device");
+ return;
+ }
+
+ if (be->major | be->minor) {
+ if (be->major != major || be->minor != minor)
+ pr_warn("changing physical device (from %x:%x to %x:%x) not supported.\n",
+ be->major, be->minor, major, minor);
+ return;
+ }
+
+ be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
+ if (IS_ERR(be->mode)) {
+ err = PTR_ERR(be->mode);
+ be->mode = NULL;
+ xenbus_dev_fatal(dev, err, "reading mode");
+ return;
+ }
+
+ device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL);
+ if (!IS_ERR(device_type)) {
+ cdrom = strcmp(device_type, "cdrom") == 0;
+ kfree(device_type);
+ }
+
+ /* Front end dir is a number, which is used as the handle. */
+ err = kstrtoul(strrchr(dev->otherend, '/') + 1, 0, &handle);
+ if (err)
+ return;
+
+ be->major = major;
+ be->minor = minor;
+
+ err = xen_vbd_create(be->blkif, handle, major, minor,
+ !strchr(be->mode, 'w'), cdrom);
+
+ if (err)
+ xenbus_dev_fatal(dev, err, "creating vbd structure");
+ else {
+ err = xenvbd_sysfs_addif(dev);
+ if (err) {
+ xen_vbd_free(&be->blkif->vbd);
+ xenbus_dev_fatal(dev, err, "creating sysfs entries");
+ }
+ }
+
+ if (err) {
+ kfree(be->mode);
+ be->mode = NULL;
+ be->major = 0;
+ be->minor = 0;
+ } else {
+ /* We're potentially connected now */
+ xen_update_blkif_status(be->blkif);
+ }
+}
+
+
+/*
+ * Callback received when the frontend's state changes.
+ */
+static void frontend_changed(struct xenbus_device *dev,
+ enum xenbus_state frontend_state)
+{
+ struct backend_info *be = dev_get_drvdata(&dev->dev);
+ int err;
+
+ pr_debug("%s %p %s\n", __func__, dev, xenbus_strstate(frontend_state));
+
+ switch (frontend_state) {
+ case XenbusStateInitialising:
+ if (dev->state == XenbusStateClosed) {
+ pr_info("%s: prepare for reconnect\n", dev->nodename);
+ xenbus_switch_state(dev, XenbusStateInitWait);
+ }
+ break;
+
+ case XenbusStateInitialised:
+ case XenbusStateConnected:
+ /*
+ * Ensure we connect even when two watches fire in
+ * close succession and we miss the intermediate value
+ * of frontend_state.
+ */
+ if (dev->state == XenbusStateConnected)
+ break;
+
+ /*
+ * Enforce precondition before potential leak point.
+ * xen_blkif_disconnect() is idempotent.
+ */
+ err = xen_blkif_disconnect(be->blkif);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "pending I/O");
+ break;
+ }
+
+ err = connect_ring(be);
+ if (err)
+ break;
+ xen_update_blkif_status(be->blkif);
+ break;
+
+ case XenbusStateClosing:
+ xenbus_switch_state(dev, XenbusStateClosing);
+ break;
+
+ case XenbusStateClosed:
+ xen_blkif_disconnect(be->blkif);
+ xenbus_switch_state(dev, XenbusStateClosed);
+ if (xenbus_dev_is_online(dev))
+ break;
+ /* fall through if not online */
+ case XenbusStateUnknown:
+ /* implies xen_blkif_disconnect() via xen_blkbk_remove() */
+ device_unregister(&dev->dev);
+ break;
+
+ default:
+ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+ frontend_state);
+ break;
+ }
+}
+
+
+/* ** Connection ** */
+
+
+/*
+ * Write the physical details regarding the block device to the store, and
+ * switch to Connected state.
+ */
+static void connect(struct backend_info *be)
+{
+ struct xenbus_transaction xbt;
+ int err;
+ struct xenbus_device *dev = be->dev;
+
+ pr_debug("%s %s\n", __func__, dev->otherend);
+
+ /* Supply the information about the device the frontend needs */
+again:
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "starting transaction");
+ return;
+ }
+
+ /* If we can't advertise it is OK. */
+ xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support);
+
+ xen_blkbk_discard(xbt, be);
+
+ xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support);
+
+ err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u", 1);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "writing %s/feature-persistent",
+ dev->nodename);
+ goto abort;
+ }
+ err = xenbus_printf(xbt, dev->nodename, "feature-max-indirect-segments", "%u",
+ MAX_INDIRECT_SEGMENTS);
+ if (err)
+ dev_warn(&dev->dev, "writing %s/feature-max-indirect-segments (%d)",
+ dev->nodename, err);
+
+ err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
+ (unsigned long long)vbd_sz(&be->blkif->vbd));
+ if (err) {
+ xenbus_dev_fatal(dev, err, "writing %s/sectors",
+ dev->nodename);
+ goto abort;
+ }
+
+ /* FIXME: use a typename instead */
+ err = xenbus_printf(xbt, dev->nodename, "info", "%u",
+ be->blkif->vbd.type |
+ (be->blkif->vbd.readonly ? VDISK_READONLY : 0));
+ if (err) {
+ xenbus_dev_fatal(dev, err, "writing %s/info",
+ dev->nodename);
+ goto abort;
+ }
+ err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
+ (unsigned long)
+ bdev_logical_block_size(be->blkif->vbd.bdev));
+ if (err) {
+ xenbus_dev_fatal(dev, err, "writing %s/sector-size",
+ dev->nodename);
+ goto abort;
+ }
+ err = xenbus_printf(xbt, dev->nodename, "physical-sector-size", "%u",
+ bdev_physical_block_size(be->blkif->vbd.bdev));
+ if (err)
+ xenbus_dev_error(dev, err, "writing %s/physical-sector-size",
+ dev->nodename);
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err == -EAGAIN)
+ goto again;
+ if (err)
+ xenbus_dev_fatal(dev, err, "ending transaction");
+
+ err = xenbus_switch_state(dev, XenbusStateConnected);
+ if (err)
+ xenbus_dev_fatal(dev, err, "%s: switching to Connected state",
+ dev->nodename);
+
+ return;
+ abort:
+ xenbus_transaction_end(xbt, 1);
+}
+
+
+static int connect_ring(struct backend_info *be)
+{
+ struct xenbus_device *dev = be->dev;
+ unsigned long ring_ref;
+ unsigned int evtchn;
+ unsigned int pers_grants;
+ char protocol[64] = "";
+ int err;
+
+ pr_debug("%s %s\n", __func__, dev->otherend);
+
+ err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
+ &ring_ref, "event-channel", "%u", &evtchn, NULL);
+ if (err) {
+ xenbus_dev_fatal(dev, err,
+ "reading %s/ring-ref and event-channel",
+ dev->otherend);
+ return err;
+ }
+
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
+ err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
+ "%63s", protocol, NULL);
+ if (err)
+ strcpy(protocol, "unspecified, assuming default");
+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
+ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+ else {
+ xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
+ return -1;
+ }
+ err = xenbus_gather(XBT_NIL, dev->otherend,
+ "feature-persistent", "%u",
+ &pers_grants, NULL);
+ if (err)
+ pers_grants = 0;
+
+ be->blkif->vbd.feature_gnt_persistent = pers_grants;
+ be->blkif->vbd.overflow_max_grants = 0;
+
+ pr_info("ring-ref %ld, event-channel %d, protocol %d (%s) %s\n",
+ ring_ref, evtchn, be->blkif->blk_protocol, protocol,
+ pers_grants ? "persistent grants" : "");
+
+ /* Map the shared frame, irq etc. */
+ err = xen_blkif_map(be->blkif, ring_ref, evtchn);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
+ ring_ref, evtchn);
+ return err;
+ }
+
+ return 0;
+}
+
+static const struct xenbus_device_id xen_blkbk_ids[] = {
+ { "vbd" },
+ { "" }
+};
+
+static struct xenbus_driver xen_blkbk_driver = {
+ .ids = xen_blkbk_ids,
+ .probe = xen_blkbk_probe,
+ .remove = xen_blkbk_remove,
+ .otherend_changed = frontend_changed
+};
+
+int xen_blkif_xenbus_init(void)
+{
+ return xenbus_register_backend(&xen_blkbk_driver);
+}
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
new file mode 100644
index 000000000..2c61cf8c6
--- /dev/null
+++ b/drivers/block/xen-blkfront.c
@@ -0,0 +1,2126 @@
+/*
+ * blkfront.c
+ *
+ * XenLinux virtual block device driver.
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
+ * Copyright (c) 2004, Christian Limpach
+ * Copyright (c) 2004, Andrew Warfield
+ * Copyright (c) 2005, Christopher Clark
+ * Copyright (c) 2005, XenSource Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/cdrom.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/scatterlist.h>
+#include <linux/bitmap.h>
+#include <linux/list.h>
+
+#include <xen/xen.h>
+#include <xen/xenbus.h>
+#include <xen/grant_table.h>
+#include <xen/events.h>
+#include <xen/page.h>
+#include <xen/platform_pci.h>
+
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/blkif.h>
+#include <xen/interface/io/protocols.h>
+
+#include <asm/xen/hypervisor.h>
+
+enum blkif_state {
+ BLKIF_STATE_DISCONNECTED,
+ BLKIF_STATE_CONNECTED,
+ BLKIF_STATE_SUSPENDED,
+};
+
+struct grant {
+ grant_ref_t gref;
+ unsigned long pfn;
+ struct list_head node;
+};
+
+struct blk_shadow {
+ struct blkif_request req;
+ struct request *request;
+ struct grant **grants_used;
+ struct grant **indirect_grants;
+ struct scatterlist *sg;
+};
+
+struct split_bio {
+ struct bio *bio;
+ atomic_t pending;
+ int err;
+};
+
+static DEFINE_MUTEX(blkfront_mutex);
+static const struct block_device_operations xlvbd_block_fops;
+
+/*
+ * Maximum number of segments in indirect requests, the actual value used by
+ * the frontend driver is the minimum of this value and the value provided
+ * by the backend driver.
+ */
+
+static unsigned int xen_blkif_max_segments = 32;
+module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
+MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
+
+#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
+
+/*
+ * We have one of these per vbd, whether ide, scsi or 'other'. They
+ * hang in private_data off the gendisk structure. We may end up
+ * putting all kinds of interesting stuff here :-)
+ */
+struct blkfront_info
+{
+ spinlock_t io_lock;
+ struct mutex mutex;
+ struct xenbus_device *xbdev;
+ struct gendisk *gd;
+ int vdevice;
+ blkif_vdev_t handle;
+ enum blkif_state connected;
+ int ring_ref;
+ struct blkif_front_ring ring;
+ unsigned int evtchn, irq;
+ struct request_queue *rq;
+ struct work_struct work;
+ struct gnttab_free_callback callback;
+ struct blk_shadow shadow[BLK_RING_SIZE];
+ struct list_head grants;
+ struct list_head indirect_pages;
+ unsigned int persistent_gnts_c;
+ unsigned long shadow_free;
+ unsigned int feature_flush;
+ unsigned int feature_discard:1;
+ unsigned int feature_secdiscard:1;
+ unsigned int discard_granularity;
+ unsigned int discard_alignment;
+ unsigned int feature_persistent:1;
+ unsigned int max_indirect_segments;
+ int is_ready;
+};
+
+static unsigned int nr_minors;
+static unsigned long *minors;
+static DEFINE_SPINLOCK(minor_lock);
+
+#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
+ (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
+#define GRANT_INVALID_REF 0
+
+#define PARTS_PER_DISK 16
+#define PARTS_PER_EXT_DISK 256
+
+#define BLKIF_MAJOR(dev) ((dev)>>8)
+#define BLKIF_MINOR(dev) ((dev) & 0xff)
+
+#define EXT_SHIFT 28
+#define EXTENDED (1<<EXT_SHIFT)
+#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
+#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
+#define EMULATED_HD_DISK_MINOR_OFFSET (0)
+#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
+#define EMULATED_SD_DISK_MINOR_OFFSET (0)
+#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_SD_DISK_MINOR_OFFSET / 256)
+
+#define DEV_NAME "xvd" /* name in /dev */
+
+#define SEGS_PER_INDIRECT_FRAME \
+ (PAGE_SIZE/sizeof(struct blkif_request_segment))
+#define INDIRECT_GREFS(_segs) \
+ ((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
+
+static int blkfront_setup_indirect(struct blkfront_info *info);
+
+static int get_id_from_freelist(struct blkfront_info *info)
+{
+ unsigned long free = info->shadow_free;
+ BUG_ON(free >= BLK_RING_SIZE);
+ info->shadow_free = info->shadow[free].req.u.rw.id;
+ info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
+ return free;
+}
+
+static int add_id_to_freelist(struct blkfront_info *info,
+ unsigned long id)
+{
+ if (info->shadow[id].req.u.rw.id != id)
+ return -EINVAL;
+ if (info->shadow[id].request == NULL)
+ return -EINVAL;
+ info->shadow[id].req.u.rw.id = info->shadow_free;
+ info->shadow[id].request = NULL;
+ info->shadow_free = id;
+ return 0;
+}
+
+static int fill_grant_buffer(struct blkfront_info *info, int num)
+{
+ struct page *granted_page;
+ struct grant *gnt_list_entry, *n;
+ int i = 0;
+
+ while(i < num) {
+ gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO);
+ if (!gnt_list_entry)
+ goto out_of_memory;
+
+ if (info->feature_persistent) {
+ granted_page = alloc_page(GFP_NOIO);
+ if (!granted_page) {
+ kfree(gnt_list_entry);
+ goto out_of_memory;
+ }
+ gnt_list_entry->pfn = page_to_pfn(granted_page);
+ }
+
+ gnt_list_entry->gref = GRANT_INVALID_REF;
+ list_add(&gnt_list_entry->node, &info->grants);
+ i++;
+ }
+
+ return 0;
+
+out_of_memory:
+ list_for_each_entry_safe(gnt_list_entry, n,
+ &info->grants, node) {
+ list_del(&gnt_list_entry->node);
+ if (info->feature_persistent)
+ __free_page(pfn_to_page(gnt_list_entry->pfn));
+ kfree(gnt_list_entry);
+ i--;
+ }
+ BUG_ON(i != 0);
+ return -ENOMEM;
+}
+
+static struct grant *get_grant(grant_ref_t *gref_head,
+ unsigned long pfn,
+ struct blkfront_info *info)
+{
+ struct grant *gnt_list_entry;
+ unsigned long buffer_mfn;
+
+ BUG_ON(list_empty(&info->grants));
+ gnt_list_entry = list_first_entry(&info->grants, struct grant,
+ node);
+ list_del(&gnt_list_entry->node);
+
+ if (gnt_list_entry->gref != GRANT_INVALID_REF) {
+ info->persistent_gnts_c--;
+ return gnt_list_entry;
+ }
+
+ /* Assign a gref to this page */
+ gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
+ BUG_ON(gnt_list_entry->gref == -ENOSPC);
+ if (!info->feature_persistent) {
+ BUG_ON(!pfn);
+ gnt_list_entry->pfn = pfn;
+ }
+ buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn);
+ gnttab_grant_foreign_access_ref(gnt_list_entry->gref,
+ info->xbdev->otherend_id,
+ buffer_mfn, 0);
+ return gnt_list_entry;
+}
+
+static const char *op_name(int op)
+{
+ static const char *const names[] = {
+ [BLKIF_OP_READ] = "read",
+ [BLKIF_OP_WRITE] = "write",
+ [BLKIF_OP_WRITE_BARRIER] = "barrier",
+ [BLKIF_OP_FLUSH_DISKCACHE] = "flush",
+ [BLKIF_OP_DISCARD] = "discard" };
+
+ if (op < 0 || op >= ARRAY_SIZE(names))
+ return "unknown";
+
+ if (!names[op])
+ return "reserved";
+
+ return names[op];
+}
+static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
+{
+ unsigned int end = minor + nr;
+ int rc;
+
+ if (end > nr_minors) {
+ unsigned long *bitmap, *old;
+
+ bitmap = kcalloc(BITS_TO_LONGS(end), sizeof(*bitmap),
+ GFP_KERNEL);
+ if (bitmap == NULL)
+ return -ENOMEM;
+
+ spin_lock(&minor_lock);
+ if (end > nr_minors) {
+ old = minors;
+ memcpy(bitmap, minors,
+ BITS_TO_LONGS(nr_minors) * sizeof(*bitmap));
+ minors = bitmap;
+ nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG;
+ } else
+ old = bitmap;
+ spin_unlock(&minor_lock);
+ kfree(old);
+ }
+
+ spin_lock(&minor_lock);
+ if (find_next_bit(minors, end, minor) >= end) {
+ bitmap_set(minors, minor, nr);
+ rc = 0;
+ } else
+ rc = -EBUSY;
+ spin_unlock(&minor_lock);
+
+ return rc;
+}
+
+static void xlbd_release_minors(unsigned int minor, unsigned int nr)
+{
+ unsigned int end = minor + nr;
+
+ BUG_ON(end > nr_minors);
+ spin_lock(&minor_lock);
+ bitmap_clear(minors, minor, nr);
+ spin_unlock(&minor_lock);
+}
+
+static void blkif_restart_queue_callback(void *arg)
+{
+ struct blkfront_info *info = (struct blkfront_info *)arg;
+ schedule_work(&info->work);
+}
+
+static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
+{
+ /* We don't have real geometry info, but let's at least return
+ values consistent with the size of the device */
+ sector_t nsect = get_capacity(bd->bd_disk);
+ sector_t cylinders = nsect;
+
+ hg->heads = 0xff;
+ hg->sectors = 0x3f;
+ sector_div(cylinders, hg->heads * hg->sectors);
+ hg->cylinders = cylinders;
+ if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
+ hg->cylinders = 0xffff;
+ return 0;
+}
+
+static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned command, unsigned long argument)
+{
+ struct blkfront_info *info = bdev->bd_disk->private_data;
+ int i;
+
+ dev_dbg(&info->xbdev->dev, "command: 0x%x, argument: 0x%lx\n",
+ command, (long)argument);
+
+ switch (command) {
+ case CDROMMULTISESSION:
+ dev_dbg(&info->xbdev->dev, "FIXME: support multisession CDs later\n");
+ for (i = 0; i < sizeof(struct cdrom_multisession); i++)
+ if (put_user(0, (char __user *)(argument + i)))
+ return -EFAULT;
+ return 0;
+
+ case CDROM_GET_CAPABILITY: {
+ struct gendisk *gd = info->gd;
+ if (gd->flags & GENHD_FL_CD)
+ return 0;
+ return -EINVAL;
+ }
+
+ default:
+ /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
+ command);*/
+ return -EINVAL; /* same return as native Linux */
+ }
+
+ return 0;
+}
+
+/*
+ * Generate a Xen blkfront IO request from a blk layer request. Reads
+ * and writes are handled as expected.
+ *
+ * @req: a request struct
+ */
+static int blkif_queue_request(struct request *req)
+{
+ struct blkfront_info *info = req->rq_disk->private_data;
+ struct blkif_request *ring_req;
+ unsigned long id;
+ unsigned int fsect, lsect;
+ int i, ref, n;
+ struct blkif_request_segment *segments = NULL;
+
+ /*
+ * Used to store if we are able to queue the request by just using
+ * existing persistent grants, or if we have to get new grants,
+ * as there are not sufficiently many free.
+ */
+ bool new_persistent_gnts;
+ grant_ref_t gref_head;
+ struct grant *gnt_list_entry = NULL;
+ struct scatterlist *sg;
+ int nseg, max_grefs;
+
+ if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
+ return 1;
+
+ max_grefs = req->nr_phys_segments;
+ if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
+ /*
+ * If we are using indirect segments we need to account
+ * for the indirect grefs used in the request.
+ */
+ max_grefs += INDIRECT_GREFS(req->nr_phys_segments);
+
+ /* Check if we have enough grants to allocate a requests */
+ if (info->persistent_gnts_c < max_grefs) {
+ new_persistent_gnts = 1;
+ if (gnttab_alloc_grant_references(
+ max_grefs - info->persistent_gnts_c,
+ &gref_head) < 0) {
+ gnttab_request_free_callback(
+ &info->callback,
+ blkif_restart_queue_callback,
+ info,
+ max_grefs);
+ return 1;
+ }
+ } else
+ new_persistent_gnts = 0;
+
+ /* Fill out a communications ring structure. */
+ ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
+ id = get_id_from_freelist(info);
+ info->shadow[id].request = req;
+
+ if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
+ ring_req->operation = BLKIF_OP_DISCARD;
+ ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
+ ring_req->u.discard.id = id;
+ ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
+ if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
+ ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
+ else
+ ring_req->u.discard.flag = 0;
+ } else {
+ BUG_ON(info->max_indirect_segments == 0 &&
+ req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
+ BUG_ON(info->max_indirect_segments &&
+ req->nr_phys_segments > info->max_indirect_segments);
+ nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg);
+ ring_req->u.rw.id = id;
+ if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+ /*
+ * The indirect operation can only be a BLKIF_OP_READ or
+ * BLKIF_OP_WRITE
+ */
+ BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA));
+ ring_req->operation = BLKIF_OP_INDIRECT;
+ ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
+ BLKIF_OP_WRITE : BLKIF_OP_READ;
+ ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
+ ring_req->u.indirect.handle = info->handle;
+ ring_req->u.indirect.nr_segments = nseg;
+ } else {
+ ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
+ ring_req->u.rw.handle = info->handle;
+ ring_req->operation = rq_data_dir(req) ?
+ BLKIF_OP_WRITE : BLKIF_OP_READ;
+ if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
+ /*
+ * Ideally we can do an unordered flush-to-disk. In case the
+ * backend onlysupports barriers, use that. A barrier request
+ * a superset of FUA, so we can implement it the same
+ * way. (It's also a FLUSH+FUA, since it is
+ * guaranteed ordered WRT previous writes.)
+ */
+ switch (info->feature_flush &
+ ((REQ_FLUSH|REQ_FUA))) {
+ case REQ_FLUSH|REQ_FUA:
+ ring_req->operation =
+ BLKIF_OP_WRITE_BARRIER;
+ break;
+ case REQ_FLUSH:
+ ring_req->operation =
+ BLKIF_OP_FLUSH_DISKCACHE;
+ break;
+ default:
+ ring_req->operation = 0;
+ }
+ }
+ ring_req->u.rw.nr_segments = nseg;
+ }
+ for_each_sg(info->shadow[id].sg, sg, nseg, i) {
+ fsect = sg->offset >> 9;
+ lsect = fsect + (sg->length >> 9) - 1;
+
+ if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
+ (i % SEGS_PER_INDIRECT_FRAME == 0)) {
+ unsigned long uninitialized_var(pfn);
+
+ if (segments)
+ kunmap_atomic(segments);
+
+ n = i / SEGS_PER_INDIRECT_FRAME;
+ if (!info->feature_persistent) {
+ struct page *indirect_page;
+
+ /* Fetch a pre-allocated page to use for indirect grefs */
+ BUG_ON(list_empty(&info->indirect_pages));
+ indirect_page = list_first_entry(&info->indirect_pages,
+ struct page, lru);
+ list_del(&indirect_page->lru);
+ pfn = page_to_pfn(indirect_page);
+ }
+ gnt_list_entry = get_grant(&gref_head, pfn, info);
+ info->shadow[id].indirect_grants[n] = gnt_list_entry;
+ segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
+ ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
+ }
+
+ gnt_list_entry = get_grant(&gref_head, page_to_pfn(sg_page(sg)), info);
+ ref = gnt_list_entry->gref;
+
+ info->shadow[id].grants_used[i] = gnt_list_entry;
+
+ if (rq_data_dir(req) && info->feature_persistent) {
+ char *bvec_data;
+ void *shared_data;
+
+ BUG_ON(sg->offset + sg->length > PAGE_SIZE);
+
+ shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
+ bvec_data = kmap_atomic(sg_page(sg));
+
+ /*
+ * this does not wipe data stored outside the
+ * range sg->offset..sg->offset+sg->length.
+ * Therefore, blkback *could* see data from
+ * previous requests. This is OK as long as
+ * persistent grants are shared with just one
+ * domain. It may need refactoring if this
+ * changes
+ */
+ memcpy(shared_data + sg->offset,
+ bvec_data + sg->offset,
+ sg->length);
+
+ kunmap_atomic(bvec_data);
+ kunmap_atomic(shared_data);
+ }
+ if (ring_req->operation != BLKIF_OP_INDIRECT) {
+ ring_req->u.rw.seg[i] =
+ (struct blkif_request_segment) {
+ .gref = ref,
+ .first_sect = fsect,
+ .last_sect = lsect };
+ } else {
+ n = i % SEGS_PER_INDIRECT_FRAME;
+ segments[n] =
+ (struct blkif_request_segment) {
+ .gref = ref,
+ .first_sect = fsect,
+ .last_sect = lsect };
+ }
+ }
+ if (segments)
+ kunmap_atomic(segments);
+ }
+
+ info->ring.req_prod_pvt++;
+
+ /* Keep a private copy so we can reissue requests when recovering. */
+ info->shadow[id].req = *ring_req;
+
+ if (new_persistent_gnts)
+ gnttab_free_grant_references(gref_head);
+
+ return 0;
+}
+
+
+static inline void flush_requests(struct blkfront_info *info)
+{
+ int notify;
+
+ RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
+
+ if (notify)
+ notify_remote_via_irq(info->irq);
+}
+
+static inline bool blkif_request_flush_invalid(struct request *req,
+ struct blkfront_info *info)
+{
+ return ((req->cmd_type != REQ_TYPE_FS) ||
+ ((req->cmd_flags & REQ_FLUSH) &&
+ !(info->feature_flush & REQ_FLUSH)) ||
+ ((req->cmd_flags & REQ_FUA) &&
+ !(info->feature_flush & REQ_FUA)));
+}
+
+/*
+ * do_blkif_request
+ * read a block; request is in a request queue
+ */
+static void do_blkif_request(struct request_queue *rq)
+{
+ struct blkfront_info *info = NULL;
+ struct request *req;
+ int queued;
+
+ pr_debug("Entered do_blkif_request\n");
+
+ queued = 0;
+
+ while ((req = blk_peek_request(rq)) != NULL) {
+ info = req->rq_disk->private_data;
+
+ if (RING_FULL(&info->ring))
+ goto wait;
+
+ blk_start_request(req);
+
+ if (blkif_request_flush_invalid(req, info)) {
+ __blk_end_request_all(req, -EOPNOTSUPP);
+ continue;
+ }
+
+ pr_debug("do_blk_req %p: cmd %p, sec %lx, "
+ "(%u/%u) [%s]\n",
+ req, req->cmd, (unsigned long)blk_rq_pos(req),
+ blk_rq_cur_sectors(req), blk_rq_sectors(req),
+ rq_data_dir(req) ? "write" : "read");
+
+ if (blkif_queue_request(req)) {
+ blk_requeue_request(rq, req);
+wait:
+ /* Avoid pointless unplugs. */
+ blk_stop_queue(rq);
+ break;
+ }
+
+ queued++;
+ }
+
+ if (queued != 0)
+ flush_requests(info);
+}
+
+static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
+ unsigned int physical_sector_size,
+ unsigned int segments)
+{
+ struct request_queue *rq;
+ struct blkfront_info *info = gd->private_data;
+
+ rq = blk_init_queue(do_blkif_request, &info->io_lock);
+ if (rq == NULL)
+ return -1;
+
+ queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
+
+ if (info->feature_discard) {
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, rq);
+ blk_queue_max_discard_sectors(rq, get_capacity(gd));
+ rq->limits.discard_granularity = info->discard_granularity;
+ rq->limits.discard_alignment = info->discard_alignment;
+ if (info->feature_secdiscard)
+ queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq);
+ }
+
+ /* Hard sector size and max sectors impersonate the equiv. hardware. */
+ blk_queue_logical_block_size(rq, sector_size);
+ blk_queue_physical_block_size(rq, physical_sector_size);
+ blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512);
+
+ /* Each segment in a request is up to an aligned page in size. */
+ blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
+ blk_queue_max_segment_size(rq, PAGE_SIZE);
+
+ /* Ensure a merged request will fit in a single I/O ring slot. */
+ blk_queue_max_segments(rq, segments);
+
+ /* Make sure buffer addresses are sector-aligned. */
+ blk_queue_dma_alignment(rq, 511);
+
+ /* Make sure we don't use bounce buffers. */
+ blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY);
+
+ gd->queue = rq;
+
+ return 0;
+}
+
+static const char *flush_info(unsigned int feature_flush)
+{
+ switch (feature_flush & ((REQ_FLUSH | REQ_FUA))) {
+ case REQ_FLUSH|REQ_FUA:
+ return "barrier: enabled;";
+ case REQ_FLUSH:
+ return "flush diskcache: enabled;";
+ default:
+ return "barrier or flush: disabled;";
+ }
+}
+
+static void xlvbd_flush(struct blkfront_info *info)
+{
+ blk_queue_flush(info->rq, info->feature_flush);
+ pr_info("blkfront: %s: %s %s %s %s %s\n",
+ info->gd->disk_name, flush_info(info->feature_flush),
+ "persistent grants:", info->feature_persistent ?
+ "enabled;" : "disabled;", "indirect descriptors:",
+ info->max_indirect_segments ? "enabled;" : "disabled;");
+}
+
+static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
+{
+ int major;
+ major = BLKIF_MAJOR(vdevice);
+ *minor = BLKIF_MINOR(vdevice);
+ switch (major) {
+ case XEN_IDE0_MAJOR:
+ *offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
+ *minor = ((*minor / 64) * PARTS_PER_DISK) +
+ EMULATED_HD_DISK_MINOR_OFFSET;
+ break;
+ case XEN_IDE1_MAJOR:
+ *offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
+ *minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
+ EMULATED_HD_DISK_MINOR_OFFSET;
+ break;
+ case XEN_SCSI_DISK0_MAJOR:
+ *offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
+ *minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
+ break;
+ case XEN_SCSI_DISK1_MAJOR:
+ case XEN_SCSI_DISK2_MAJOR:
+ case XEN_SCSI_DISK3_MAJOR:
+ case XEN_SCSI_DISK4_MAJOR:
+ case XEN_SCSI_DISK5_MAJOR:
+ case XEN_SCSI_DISK6_MAJOR:
+ case XEN_SCSI_DISK7_MAJOR:
+ *offset = (*minor / PARTS_PER_DISK) +
+ ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
+ EMULATED_SD_DISK_NAME_OFFSET;
+ *minor = *minor +
+ ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
+ EMULATED_SD_DISK_MINOR_OFFSET;
+ break;
+ case XEN_SCSI_DISK8_MAJOR:
+ case XEN_SCSI_DISK9_MAJOR:
+ case XEN_SCSI_DISK10_MAJOR:
+ case XEN_SCSI_DISK11_MAJOR:
+ case XEN_SCSI_DISK12_MAJOR:
+ case XEN_SCSI_DISK13_MAJOR:
+ case XEN_SCSI_DISK14_MAJOR:
+ case XEN_SCSI_DISK15_MAJOR:
+ *offset = (*minor / PARTS_PER_DISK) +
+ ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
+ EMULATED_SD_DISK_NAME_OFFSET;
+ *minor = *minor +
+ ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
+ EMULATED_SD_DISK_MINOR_OFFSET;
+ break;
+ case XENVBD_MAJOR:
+ *offset = *minor / PARTS_PER_DISK;
+ break;
+ default:
+ printk(KERN_WARNING "blkfront: your disk configuration is "
+ "incorrect, please use an xvd device instead\n");
+ return -ENODEV;
+ }
+ return 0;
+}
+
+static char *encode_disk_name(char *ptr, unsigned int n)
+{
+ if (n >= 26)
+ ptr = encode_disk_name(ptr, n / 26 - 1);
+ *ptr = 'a' + n % 26;
+ return ptr + 1;
+}
+
+static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
+ struct blkfront_info *info,
+ u16 vdisk_info, u16 sector_size,
+ unsigned int physical_sector_size)
+{
+ struct gendisk *gd;
+ int nr_minors = 1;
+ int err;
+ unsigned int offset;
+ int minor;
+ int nr_parts;
+ char *ptr;
+
+ BUG_ON(info->gd != NULL);
+ BUG_ON(info->rq != NULL);
+
+ if ((info->vdevice>>EXT_SHIFT) > 1) {
+ /* this is above the extended range; something is wrong */
+ printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice);
+ return -ENODEV;
+ }
+
+ if (!VDEV_IS_EXTENDED(info->vdevice)) {
+ err = xen_translate_vdev(info->vdevice, &minor, &offset);
+ if (err)
+ return err;
+ nr_parts = PARTS_PER_DISK;
+ } else {
+ minor = BLKIF_MINOR_EXT(info->vdevice);
+ nr_parts = PARTS_PER_EXT_DISK;
+ offset = minor / nr_parts;
+ if (xen_hvm_domain() && offset < EMULATED_HD_DISK_NAME_OFFSET + 4)
+ printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
+ "emulated IDE disks,\n\t choose an xvd device name"
+ "from xvde on\n", info->vdevice);
+ }
+ if (minor >> MINORBITS) {
+ pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n",
+ info->vdevice, minor);
+ return -ENODEV;
+ }
+
+ if ((minor % nr_parts) == 0)
+ nr_minors = nr_parts;
+
+ err = xlbd_reserve_minors(minor, nr_minors);
+ if (err)
+ goto out;
+ err = -ENODEV;
+
+ gd = alloc_disk(nr_minors);
+ if (gd == NULL)
+ goto release;
+
+ strcpy(gd->disk_name, DEV_NAME);
+ ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
+ BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN);
+ if (nr_minors > 1)
+ *ptr = 0;
+ else
+ snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr,
+ "%d", minor & (nr_parts - 1));
+
+ gd->major = XENVBD_MAJOR;
+ gd->first_minor = minor;
+ gd->fops = &xlvbd_block_fops;
+ gd->private_data = info;
+ gd->driverfs_dev = &(info->xbdev->dev);
+ set_capacity(gd, capacity);
+
+ if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size,
+ info->max_indirect_segments ? :
+ BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+ del_gendisk(gd);
+ goto release;
+ }
+
+ info->rq = gd->queue;
+ info->gd = gd;
+
+ xlvbd_flush(info);
+
+ if (vdisk_info & VDISK_READONLY)
+ set_disk_ro(gd, 1);
+
+ if (vdisk_info & VDISK_REMOVABLE)
+ gd->flags |= GENHD_FL_REMOVABLE;
+
+ if (vdisk_info & VDISK_CDROM)
+ gd->flags |= GENHD_FL_CD;
+
+ return 0;
+
+ release:
+ xlbd_release_minors(minor, nr_minors);
+ out:
+ return err;
+}
+
+static void xlvbd_release_gendisk(struct blkfront_info *info)
+{
+ unsigned int minor, nr_minors;
+ unsigned long flags;
+
+ if (info->rq == NULL)
+ return;
+
+ spin_lock_irqsave(&info->io_lock, flags);
+
+ /* No more blkif_request(). */
+ blk_stop_queue(info->rq);
+
+ /* No more gnttab callback work. */
+ gnttab_cancel_free_callback(&info->callback);
+ spin_unlock_irqrestore(&info->io_lock, flags);
+
+ /* Flush gnttab callback work. Must be done with no locks held. */
+ flush_work(&info->work);
+
+ del_gendisk(info->gd);
+
+ minor = info->gd->first_minor;
+ nr_minors = info->gd->minors;
+ xlbd_release_minors(minor, nr_minors);
+
+ blk_cleanup_queue(info->rq);
+ info->rq = NULL;
+
+ put_disk(info->gd);
+ info->gd = NULL;
+}
+
+static void kick_pending_request_queues(struct blkfront_info *info)
+{
+ if (!RING_FULL(&info->ring)) {
+ /* Re-enable calldowns. */
+ blk_start_queue(info->rq);
+ /* Kick things off immediately. */
+ do_blkif_request(info->rq);
+ }
+}
+
+static void blkif_restart_queue(struct work_struct *work)
+{
+ struct blkfront_info *info = container_of(work, struct blkfront_info, work);
+
+ spin_lock_irq(&info->io_lock);
+ if (info->connected == BLKIF_STATE_CONNECTED)
+ kick_pending_request_queues(info);
+ spin_unlock_irq(&info->io_lock);
+}
+
+static void blkif_free(struct blkfront_info *info, int suspend)
+{
+ struct grant *persistent_gnt;
+ struct grant *n;
+ int i, j, segs;
+
+ /* Prevent new requests being issued until we fix things up. */
+ spin_lock_irq(&info->io_lock);
+ info->connected = suspend ?
+ BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
+ /* No more blkif_request(). */
+ if (info->rq)
+ blk_stop_queue(info->rq);
+
+ /* Remove all persistent grants */
+ if (!list_empty(&info->grants)) {
+ list_for_each_entry_safe(persistent_gnt, n,
+ &info->grants, node) {
+ list_del(&persistent_gnt->node);
+ if (persistent_gnt->gref != GRANT_INVALID_REF) {
+ gnttab_end_foreign_access(persistent_gnt->gref,
+ 0, 0UL);
+ info->persistent_gnts_c--;
+ }
+ if (info->feature_persistent)
+ __free_page(pfn_to_page(persistent_gnt->pfn));
+ kfree(persistent_gnt);
+ }
+ }
+ BUG_ON(info->persistent_gnts_c != 0);
+
+ /*
+ * Remove indirect pages, this only happens when using indirect
+ * descriptors but not persistent grants
+ */
+ if (!list_empty(&info->indirect_pages)) {
+ struct page *indirect_page, *n;
+
+ BUG_ON(info->feature_persistent);
+ list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
+ list_del(&indirect_page->lru);
+ __free_page(indirect_page);
+ }
+ }
+
+ for (i = 0; i < BLK_RING_SIZE; i++) {
+ /*
+ * Clear persistent grants present in requests already
+ * on the shared ring
+ */
+ if (!info->shadow[i].request)
+ goto free_shadow;
+
+ segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
+ info->shadow[i].req.u.indirect.nr_segments :
+ info->shadow[i].req.u.rw.nr_segments;
+ for (j = 0; j < segs; j++) {
+ persistent_gnt = info->shadow[i].grants_used[j];
+ gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
+ if (info->feature_persistent)
+ __free_page(pfn_to_page(persistent_gnt->pfn));
+ kfree(persistent_gnt);
+ }
+
+ if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT)
+ /*
+ * If this is not an indirect operation don't try to
+ * free indirect segments
+ */
+ goto free_shadow;
+
+ for (j = 0; j < INDIRECT_GREFS(segs); j++) {
+ persistent_gnt = info->shadow[i].indirect_grants[j];
+ gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
+ __free_page(pfn_to_page(persistent_gnt->pfn));
+ kfree(persistent_gnt);
+ }
+
+free_shadow:
+ kfree(info->shadow[i].grants_used);
+ info->shadow[i].grants_used = NULL;
+ kfree(info->shadow[i].indirect_grants);
+ info->shadow[i].indirect_grants = NULL;
+ kfree(info->shadow[i].sg);
+ info->shadow[i].sg = NULL;
+ }
+
+ /* No more gnttab callback work. */
+ gnttab_cancel_free_callback(&info->callback);
+ spin_unlock_irq(&info->io_lock);
+
+ /* Flush gnttab callback work. Must be done with no locks held. */
+ flush_work(&info->work);
+
+ /* Free resources associated with old device channel. */
+ if (info->ring_ref != GRANT_INVALID_REF) {
+ gnttab_end_foreign_access(info->ring_ref, 0,
+ (unsigned long)info->ring.sring);
+ info->ring_ref = GRANT_INVALID_REF;
+ info->ring.sring = NULL;
+ }
+ if (info->irq)
+ unbind_from_irqhandler(info->irq, info);
+ info->evtchn = info->irq = 0;
+
+}
+
+static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
+ struct blkif_response *bret)
+{
+ int i = 0;
+ struct scatterlist *sg;
+ char *bvec_data;
+ void *shared_data;
+ int nseg;
+
+ nseg = s->req.operation == BLKIF_OP_INDIRECT ?
+ s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
+
+ if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
+ /*
+ * Copy the data received from the backend into the bvec.
+ * Since bv_offset can be different than 0, and bv_len different
+ * than PAGE_SIZE, we have to keep track of the current offset,
+ * to be sure we are copying the data from the right shared page.
+ */
+ for_each_sg(s->sg, sg, nseg, i) {
+ BUG_ON(sg->offset + sg->length > PAGE_SIZE);
+ shared_data = kmap_atomic(
+ pfn_to_page(s->grants_used[i]->pfn));
+ bvec_data = kmap_atomic(sg_page(sg));
+ memcpy(bvec_data + sg->offset,
+ shared_data + sg->offset,
+ sg->length);
+ kunmap_atomic(bvec_data);
+ kunmap_atomic(shared_data);
+ }
+ }
+ /* Add the persistent grant into the list of free grants */
+ for (i = 0; i < nseg; i++) {
+ if (gnttab_query_foreign_access(s->grants_used[i]->gref)) {
+ /*
+ * If the grant is still mapped by the backend (the
+ * backend has chosen to make this grant persistent)
+ * we add it at the head of the list, so it will be
+ * reused first.
+ */
+ if (!info->feature_persistent)
+ pr_alert_ratelimited("backed has not unmapped grant: %u\n",
+ s->grants_used[i]->gref);
+ list_add(&s->grants_used[i]->node, &info->grants);
+ info->persistent_gnts_c++;
+ } else {
+ /*
+ * If the grant is not mapped by the backend we end the
+ * foreign access and add it to the tail of the list,
+ * so it will not be picked again unless we run out of
+ * persistent grants.
+ */
+ gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL);
+ s->grants_used[i]->gref = GRANT_INVALID_REF;
+ list_add_tail(&s->grants_used[i]->node, &info->grants);
+ }
+ }
+ if (s->req.operation == BLKIF_OP_INDIRECT) {
+ for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
+ if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) {
+ if (!info->feature_persistent)
+ pr_alert_ratelimited("backed has not unmapped grant: %u\n",
+ s->indirect_grants[i]->gref);
+ list_add(&s->indirect_grants[i]->node, &info->grants);
+ info->persistent_gnts_c++;
+ } else {
+ struct page *indirect_page;
+
+ gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL);
+ /*
+ * Add the used indirect page back to the list of
+ * available pages for indirect grefs.
+ */
+ indirect_page = pfn_to_page(s->indirect_grants[i]->pfn);
+ list_add(&indirect_page->lru, &info->indirect_pages);
+ s->indirect_grants[i]->gref = GRANT_INVALID_REF;
+ list_add_tail(&s->indirect_grants[i]->node, &info->grants);
+ }
+ }
+ }
+}
+
+static irqreturn_t blkif_interrupt(int irq, void *dev_id)
+{
+ struct request *req;
+ struct blkif_response *bret;
+ RING_IDX i, rp;
+ unsigned long flags;
+ struct blkfront_info *info = (struct blkfront_info *)dev_id;
+ int error;
+
+ spin_lock_irqsave(&info->io_lock, flags);
+
+ if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
+ spin_unlock_irqrestore(&info->io_lock, flags);
+ return IRQ_HANDLED;
+ }
+
+ again:
+ rp = info->ring.sring->rsp_prod;
+ rmb(); /* Ensure we see queued responses up to 'rp'. */
+
+ for (i = info->ring.rsp_cons; i != rp; i++) {
+ unsigned long id;
+
+ bret = RING_GET_RESPONSE(&info->ring, i);
+ id = bret->id;
+ /*
+ * The backend has messed up and given us an id that we would
+ * never have given to it (we stamp it up to BLK_RING_SIZE -
+ * look in get_id_from_freelist.
+ */
+ if (id >= BLK_RING_SIZE) {
+ WARN(1, "%s: response to %s has incorrect id (%ld)\n",
+ info->gd->disk_name, op_name(bret->operation), id);
+ /* We can't safely get the 'struct request' as
+ * the id is busted. */
+ continue;
+ }
+ req = info->shadow[id].request;
+
+ if (bret->operation != BLKIF_OP_DISCARD)
+ blkif_completion(&info->shadow[id], info, bret);
+
+ if (add_id_to_freelist(info, id)) {
+ WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
+ info->gd->disk_name, op_name(bret->operation), id);
+ continue;
+ }
+
+ error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
+ switch (bret->operation) {
+ case BLKIF_OP_DISCARD:
+ if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
+ struct request_queue *rq = info->rq;
+ printk(KERN_WARNING "blkfront: %s: %s op failed\n",
+ info->gd->disk_name, op_name(bret->operation));
+ error = -EOPNOTSUPP;
+ info->feature_discard = 0;
+ info->feature_secdiscard = 0;
+ queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
+ queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
+ }
+ __blk_end_request_all(req, error);
+ break;
+ case BLKIF_OP_FLUSH_DISKCACHE:
+ case BLKIF_OP_WRITE_BARRIER:
+ if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
+ printk(KERN_WARNING "blkfront: %s: %s op failed\n",
+ info->gd->disk_name, op_name(bret->operation));
+ error = -EOPNOTSUPP;
+ }
+ if (unlikely(bret->status == BLKIF_RSP_ERROR &&
+ info->shadow[id].req.u.rw.nr_segments == 0)) {
+ printk(KERN_WARNING "blkfront: %s: empty %s op failed\n",
+ info->gd->disk_name, op_name(bret->operation));
+ error = -EOPNOTSUPP;
+ }
+ if (unlikely(error)) {
+ if (error == -EOPNOTSUPP)
+ error = 0;
+ info->feature_flush = 0;
+ xlvbd_flush(info);
+ }
+ /* fall through */
+ case BLKIF_OP_READ:
+ case BLKIF_OP_WRITE:
+ if (unlikely(bret->status != BLKIF_RSP_OKAY))
+ dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
+ "request: %x\n", bret->status);
+
+ __blk_end_request_all(req, error);
+ break;
+ default:
+ BUG();
+ }
+ }
+
+ info->ring.rsp_cons = i;
+
+ if (i != info->ring.req_prod_pvt) {
+ int more_to_do;
+ RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
+ if (more_to_do)
+ goto again;
+ } else
+ info->ring.sring->rsp_event = i + 1;
+
+ kick_pending_request_queues(info);
+
+ spin_unlock_irqrestore(&info->io_lock, flags);
+
+ return IRQ_HANDLED;
+}
+
+
+static int setup_blkring(struct xenbus_device *dev,
+ struct blkfront_info *info)
+{
+ struct blkif_sring *sring;
+ grant_ref_t gref;
+ int err;
+
+ info->ring_ref = GRANT_INVALID_REF;
+
+ sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
+ if (!sring) {
+ xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
+ return -ENOMEM;
+ }
+ SHARED_RING_INIT(sring);
+ FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
+
+ err = xenbus_grant_ring(dev, info->ring.sring, 1, &gref);
+ if (err < 0) {
+ free_page((unsigned long)sring);
+ info->ring.sring = NULL;
+ goto fail;
+ }
+ info->ring_ref = gref;
+
+ err = xenbus_alloc_evtchn(dev, &info->evtchn);
+ if (err)
+ goto fail;
+
+ err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0,
+ "blkif", info);
+ if (err <= 0) {
+ xenbus_dev_fatal(dev, err,
+ "bind_evtchn_to_irqhandler failed");
+ goto fail;
+ }
+ info->irq = err;
+
+ return 0;
+fail:
+ blkif_free(info, 0);
+ return err;
+}
+
+
+/* Common code used when first setting up, and when resuming. */
+static int talk_to_blkback(struct xenbus_device *dev,
+ struct blkfront_info *info)
+{
+ const char *message = NULL;
+ struct xenbus_transaction xbt;
+ int err;
+
+ /* Create shared ring, alloc event channel. */
+ err = setup_blkring(dev, info);
+ if (err)
+ goto out;
+
+again:
+ err = xenbus_transaction_start(&xbt);
+ if (err) {
+ xenbus_dev_fatal(dev, err, "starting transaction");
+ goto destroy_blkring;
+ }
+
+ err = xenbus_printf(xbt, dev->nodename,
+ "ring-ref", "%u", info->ring_ref);
+ if (err) {
+ message = "writing ring-ref";
+ goto abort_transaction;
+ }
+ err = xenbus_printf(xbt, dev->nodename,
+ "event-channel", "%u", info->evtchn);
+ if (err) {
+ message = "writing event-channel";
+ goto abort_transaction;
+ }
+ err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
+ XEN_IO_PROTO_ABI_NATIVE);
+ if (err) {
+ message = "writing protocol";
+ goto abort_transaction;
+ }
+ err = xenbus_printf(xbt, dev->nodename,
+ "feature-persistent", "%u", 1);
+ if (err)
+ dev_warn(&dev->dev,
+ "writing persistent grants feature to xenbus");
+
+ err = xenbus_transaction_end(xbt, 0);
+ if (err) {
+ if (err == -EAGAIN)
+ goto again;
+ xenbus_dev_fatal(dev, err, "completing transaction");
+ goto destroy_blkring;
+ }
+
+ xenbus_switch_state(dev, XenbusStateInitialised);
+
+ return 0;
+
+ abort_transaction:
+ xenbus_transaction_end(xbt, 1);
+ if (message)
+ xenbus_dev_fatal(dev, err, "%s", message);
+ destroy_blkring:
+ blkif_free(info, 0);
+ out:
+ return err;
+}
+
+/**
+ * Entry point to this code when a new device is created. Allocate the basic
+ * structures and the ring buffer for communication with the backend, and
+ * inform the backend of the appropriate details for those. Switch to
+ * Initialised state.
+ */
+static int blkfront_probe(struct xenbus_device *dev,
+ const struct xenbus_device_id *id)
+{
+ int err, vdevice, i;
+ struct blkfront_info *info;
+
+ /* FIXME: Use dynamic device id if this is not set. */
+ err = xenbus_scanf(XBT_NIL, dev->nodename,
+ "virtual-device", "%i", &vdevice);
+ if (err != 1) {
+ /* go looking in the extended area instead */
+ err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext",
+ "%i", &vdevice);
+ if (err != 1) {
+ xenbus_dev_fatal(dev, err, "reading virtual-device");
+ return err;
+ }
+ }
+
+ if (xen_hvm_domain()) {
+ char *type;
+ int len;
+ /* no unplug has been done: do not hook devices != xen vbds */
+ if (xen_has_pv_and_legacy_disk_devices()) {
+ int major;
+
+ if (!VDEV_IS_EXTENDED(vdevice))
+ major = BLKIF_MAJOR(vdevice);
+ else
+ major = XENVBD_MAJOR;
+
+ if (major != XENVBD_MAJOR) {
+ printk(KERN_INFO
+ "%s: HVM does not support vbd %d as xen block device\n",
+ __func__, vdevice);
+ return -ENODEV;
+ }
+ }
+ /* do not create a PV cdrom device if we are an HVM guest */
+ type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len);
+ if (IS_ERR(type))
+ return -ENODEV;
+ if (strncmp(type, "cdrom", 5) == 0) {
+ kfree(type);
+ return -ENODEV;
+ }
+ kfree(type);
+ }
+ info = kzalloc(sizeof(*info), GFP_KERNEL);
+ if (!info) {
+ xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
+ return -ENOMEM;
+ }
+
+ mutex_init(&info->mutex);
+ spin_lock_init(&info->io_lock);
+ info->xbdev = dev;
+ info->vdevice = vdevice;
+ INIT_LIST_HEAD(&info->grants);
+ INIT_LIST_HEAD(&info->indirect_pages);
+ info->persistent_gnts_c = 0;
+ info->connected = BLKIF_STATE_DISCONNECTED;
+ INIT_WORK(&info->work, blkif_restart_queue);
+
+ for (i = 0; i < BLK_RING_SIZE; i++)
+ info->shadow[i].req.u.rw.id = i+1;
+ info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
+
+ /* Front end dir is a number, which is used as the id. */
+ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
+ dev_set_drvdata(&dev->dev, info);
+
+ err = talk_to_blkback(dev, info);
+ if (err) {
+ kfree(info);
+ dev_set_drvdata(&dev->dev, NULL);
+ return err;
+ }
+
+ return 0;
+}
+
+static void split_bio_end(struct bio *bio, int error)
+{
+ struct split_bio *split_bio = bio->bi_private;
+
+ if (error)
+ split_bio->err = error;
+
+ if (atomic_dec_and_test(&split_bio->pending)) {
+ split_bio->bio->bi_phys_segments = 0;
+ bio_endio(split_bio->bio, split_bio->err);
+ kfree(split_bio);
+ }
+ bio_put(bio);
+}
+
+static int blkif_recover(struct blkfront_info *info)
+{
+ int i;
+ struct request *req, *n;
+ struct blk_shadow *copy;
+ int rc;
+ struct bio *bio, *cloned_bio;
+ struct bio_list bio_list, merge_bio;
+ unsigned int segs, offset;
+ int pending, size;
+ struct split_bio *split_bio;
+ struct list_head requests;
+
+ /* Stage 1: Make a safe copy of the shadow state. */
+ copy = kmemdup(info->shadow, sizeof(info->shadow),
+ GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
+ if (!copy)
+ return -ENOMEM;
+
+ /* Stage 2: Set up free list. */
+ memset(&info->shadow, 0, sizeof(info->shadow));
+ for (i = 0; i < BLK_RING_SIZE; i++)
+ info->shadow[i].req.u.rw.id = i+1;
+ info->shadow_free = info->ring.req_prod_pvt;
+ info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
+
+ rc = blkfront_setup_indirect(info);
+ if (rc) {
+ kfree(copy);
+ return rc;
+ }
+
+ segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
+ blk_queue_max_segments(info->rq, segs);
+ bio_list_init(&bio_list);
+ INIT_LIST_HEAD(&requests);
+ for (i = 0; i < BLK_RING_SIZE; i++) {
+ /* Not in use? */
+ if (!copy[i].request)
+ continue;
+
+ /*
+ * Get the bios in the request so we can re-queue them.
+ */
+ if (copy[i].request->cmd_flags &
+ (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+ /*
+ * Flush operations don't contain bios, so
+ * we need to requeue the whole request
+ */
+ list_add(&copy[i].request->queuelist, &requests);
+ continue;
+ }
+ merge_bio.head = copy[i].request->bio;
+ merge_bio.tail = copy[i].request->biotail;
+ bio_list_merge(&bio_list, &merge_bio);
+ copy[i].request->bio = NULL;
+ blk_end_request_all(copy[i].request, 0);
+ }
+
+ kfree(copy);
+
+ /*
+ * Empty the queue, this is important because we might have
+ * requests in the queue with more segments than what we
+ * can handle now.
+ */
+ spin_lock_irq(&info->io_lock);
+ while ((req = blk_fetch_request(info->rq)) != NULL) {
+ if (req->cmd_flags &
+ (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+ list_add(&req->queuelist, &requests);
+ continue;
+ }
+ merge_bio.head = req->bio;
+ merge_bio.tail = req->biotail;
+ bio_list_merge(&bio_list, &merge_bio);
+ req->bio = NULL;
+ if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
+ pr_alert("diskcache flush request found!\n");
+ __blk_end_request_all(req, 0);
+ }
+ spin_unlock_irq(&info->io_lock);
+
+ xenbus_switch_state(info->xbdev, XenbusStateConnected);
+
+ spin_lock_irq(&info->io_lock);
+
+ /* Now safe for us to use the shared ring */
+ info->connected = BLKIF_STATE_CONNECTED;
+
+ /* Kick any other new requests queued since we resumed */
+ kick_pending_request_queues(info);
+
+ list_for_each_entry_safe(req, n, &requests, queuelist) {
+ /* Requeue pending requests (flush or discard) */
+ list_del_init(&req->queuelist);
+ BUG_ON(req->nr_phys_segments > segs);
+ blk_requeue_request(info->rq, req);
+ }
+ spin_unlock_irq(&info->io_lock);
+
+ while ((bio = bio_list_pop(&bio_list)) != NULL) {
+ /* Traverse the list of pending bios and re-queue them */
+ if (bio_segments(bio) > segs) {
+ /*
+ * This bio has more segments than what we can
+ * handle, we have to split it.
+ */
+ pending = (bio_segments(bio) + segs - 1) / segs;
+ split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO);
+ BUG_ON(split_bio == NULL);
+ atomic_set(&split_bio->pending, pending);
+ split_bio->bio = bio;
+ for (i = 0; i < pending; i++) {
+ offset = (i * segs * PAGE_SIZE) >> 9;
+ size = min((unsigned int)(segs * PAGE_SIZE) >> 9,
+ (unsigned int)bio_sectors(bio) - offset);
+ cloned_bio = bio_clone(bio, GFP_NOIO);
+ BUG_ON(cloned_bio == NULL);
+ bio_trim(cloned_bio, offset, size);
+ cloned_bio->bi_private = split_bio;
+ cloned_bio->bi_end_io = split_bio_end;
+ submit_bio(cloned_bio->bi_rw, cloned_bio);
+ }
+ /*
+ * Now we have to wait for all those smaller bios to
+ * end, so we can also end the "parent" bio.
+ */
+ continue;
+ }
+ /* We don't need to split this bio */
+ submit_bio(bio->bi_rw, bio);
+ }
+
+ return 0;
+}
+
+/**
+ * We are reconnecting to the backend, due to a suspend/resume, or a backend
+ * driver restart. We tear down our blkif structure and recreate it, but
+ * leave the device-layer structures intact so that this is transparent to the
+ * rest of the kernel.
+ */
+static int blkfront_resume(struct xenbus_device *dev)
+{
+ struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+ int err;
+
+ dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
+
+ blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
+
+ err = talk_to_blkback(dev, info);
+
+ /*
+ * We have to wait for the backend to switch to
+ * connected state, since we want to read which
+ * features it supports.
+ */
+
+ return err;
+}
+
+static void
+blkfront_closing(struct blkfront_info *info)
+{
+ struct xenbus_device *xbdev = info->xbdev;
+ struct block_device *bdev = NULL;
+
+ mutex_lock(&info->mutex);
+
+ if (xbdev->state == XenbusStateClosing) {
+ mutex_unlock(&info->mutex);
+ return;
+ }
+
+ if (info->gd)
+ bdev = bdget_disk(info->gd, 0);
+
+ mutex_unlock(&info->mutex);
+
+ if (!bdev) {
+ xenbus_frontend_closed(xbdev);
+ return;
+ }
+
+ mutex_lock(&bdev->bd_mutex);
+
+ if (bdev->bd_openers) {
+ xenbus_dev_error(xbdev, -EBUSY,
+ "Device in use; refusing to close");
+ xenbus_switch_state(xbdev, XenbusStateClosing);
+ } else {
+ xlvbd_release_gendisk(info);
+ xenbus_frontend_closed(xbdev);
+ }
+
+ mutex_unlock(&bdev->bd_mutex);
+ bdput(bdev);
+}
+
+static void blkfront_setup_discard(struct blkfront_info *info)
+{
+ int err;
+ unsigned int discard_granularity;
+ unsigned int discard_alignment;
+ unsigned int discard_secure;
+
+ info->feature_discard = 1;
+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+ "discard-granularity", "%u", &discard_granularity,
+ "discard-alignment", "%u", &discard_alignment,
+ NULL);
+ if (!err) {
+ info->discard_granularity = discard_granularity;
+ info->discard_alignment = discard_alignment;
+ }
+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+ "discard-secure", "%d", &discard_secure,
+ NULL);
+ if (!err)
+ info->feature_secdiscard = !!discard_secure;
+}
+
+static int blkfront_setup_indirect(struct blkfront_info *info)
+{
+ unsigned int indirect_segments, segs;
+ int err, i;
+
+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+ "feature-max-indirect-segments", "%u", &indirect_segments,
+ NULL);
+ if (err) {
+ info->max_indirect_segments = 0;
+ segs = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+ } else {
+ info->max_indirect_segments = min(indirect_segments,
+ xen_blkif_max_segments);
+ segs = info->max_indirect_segments;
+ }
+
+ err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE);
+ if (err)
+ goto out_of_memory;
+
+ if (!info->feature_persistent && info->max_indirect_segments) {
+ /*
+ * We are using indirect descriptors but not persistent
+ * grants, we need to allocate a set of pages that can be
+ * used for mapping indirect grefs
+ */
+ int num = INDIRECT_GREFS(segs) * BLK_RING_SIZE;
+
+ BUG_ON(!list_empty(&info->indirect_pages));
+ for (i = 0; i < num; i++) {
+ struct page *indirect_page = alloc_page(GFP_NOIO);
+ if (!indirect_page)
+ goto out_of_memory;
+ list_add(&indirect_page->lru, &info->indirect_pages);
+ }
+ }
+
+ for (i = 0; i < BLK_RING_SIZE; i++) {
+ info->shadow[i].grants_used = kzalloc(
+ sizeof(info->shadow[i].grants_used[0]) * segs,
+ GFP_NOIO);
+ info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO);
+ if (info->max_indirect_segments)
+ info->shadow[i].indirect_grants = kzalloc(
+ sizeof(info->shadow[i].indirect_grants[0]) *
+ INDIRECT_GREFS(segs),
+ GFP_NOIO);
+ if ((info->shadow[i].grants_used == NULL) ||
+ (info->shadow[i].sg == NULL) ||
+ (info->max_indirect_segments &&
+ (info->shadow[i].indirect_grants == NULL)))
+ goto out_of_memory;
+ sg_init_table(info->shadow[i].sg, segs);
+ }
+
+
+ return 0;
+
+out_of_memory:
+ for (i = 0; i < BLK_RING_SIZE; i++) {
+ kfree(info->shadow[i].grants_used);
+ info->shadow[i].grants_used = NULL;
+ kfree(info->shadow[i].sg);
+ info->shadow[i].sg = NULL;
+ kfree(info->shadow[i].indirect_grants);
+ info->shadow[i].indirect_grants = NULL;
+ }
+ if (!list_empty(&info->indirect_pages)) {
+ struct page *indirect_page, *n;
+ list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
+ list_del(&indirect_page->lru);
+ __free_page(indirect_page);
+ }
+ }
+ return -ENOMEM;
+}
+
+/*
+ * Invoked when the backend is finally 'ready' (and has told produced
+ * the details about the physical device - #sectors, size, etc).
+ */
+static void blkfront_connect(struct blkfront_info *info)
+{
+ unsigned long long sectors;
+ unsigned long sector_size;
+ unsigned int physical_sector_size;
+ unsigned int binfo;
+ int err;
+ int barrier, flush, discard, persistent;
+
+ switch (info->connected) {
+ case BLKIF_STATE_CONNECTED:
+ /*
+ * Potentially, the back-end may be signalling
+ * a capacity change; update the capacity.
+ */
+ err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+ "sectors", "%Lu", &sectors);
+ if (XENBUS_EXIST_ERR(err))
+ return;
+ printk(KERN_INFO "Setting capacity to %Lu\n",
+ sectors);
+ set_capacity(info->gd, sectors);
+ revalidate_disk(info->gd);
+
+ return;
+ case BLKIF_STATE_SUSPENDED:
+ /*
+ * If we are recovering from suspension, we need to wait
+ * for the backend to announce it's features before
+ * reconnecting, at least we need to know if the backend
+ * supports indirect descriptors, and how many.
+ */
+ blkif_recover(info);
+ return;
+
+ default:
+ break;
+ }
+
+ dev_dbg(&info->xbdev->dev, "%s:%s.\n",
+ __func__, info->xbdev->otherend);
+
+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+ "sectors", "%llu", &sectors,
+ "info", "%u", &binfo,
+ "sector-size", "%lu", &sector_size,
+ NULL);
+ if (err) {
+ xenbus_dev_fatal(info->xbdev, err,
+ "reading backend fields at %s",
+ info->xbdev->otherend);
+ return;
+ }
+
+ /*
+ * physcial-sector-size is a newer field, so old backends may not
+ * provide this. Assume physical sector size to be the same as
+ * sector_size in that case.
+ */
+ err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+ "physical-sector-size", "%u", &physical_sector_size);
+ if (err != 1)
+ physical_sector_size = sector_size;
+
+ info->feature_flush = 0;
+
+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+ "feature-barrier", "%d", &barrier,
+ NULL);
+
+ /*
+ * If there's no "feature-barrier" defined, then it means
+ * we're dealing with a very old backend which writes
+ * synchronously; nothing to do.
+ *
+ * If there are barriers, then we use flush.
+ */
+ if (!err && barrier)
+ info->feature_flush = REQ_FLUSH | REQ_FUA;
+ /*
+ * And if there is "feature-flush-cache" use that above
+ * barriers.
+ */
+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+ "feature-flush-cache", "%d", &flush,
+ NULL);
+
+ if (!err && flush)
+ info->feature_flush = REQ_FLUSH;
+
+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+ "feature-discard", "%d", &discard,
+ NULL);
+
+ if (!err && discard)
+ blkfront_setup_discard(info);
+
+ err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+ "feature-persistent", "%u", &persistent,
+ NULL);
+ if (err)
+ info->feature_persistent = 0;
+ else
+ info->feature_persistent = persistent;
+
+ err = blkfront_setup_indirect(info);
+ if (err) {
+ xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
+ info->xbdev->otherend);
+ return;
+ }
+
+ err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
+ physical_sector_size);
+ if (err) {
+ xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
+ info->xbdev->otherend);
+ return;
+ }
+
+ xenbus_switch_state(info->xbdev, XenbusStateConnected);
+
+ /* Kick pending requests. */
+ spin_lock_irq(&info->io_lock);
+ info->connected = BLKIF_STATE_CONNECTED;
+ kick_pending_request_queues(info);
+ spin_unlock_irq(&info->io_lock);
+
+ add_disk(info->gd);
+
+ info->is_ready = 1;
+}
+
+/**
+ * Callback received when the backend's state changes.
+ */
+static void blkback_changed(struct xenbus_device *dev,
+ enum xenbus_state backend_state)
+{
+ struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+
+ dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
+
+ switch (backend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateInitWait:
+ case XenbusStateInitialised:
+ case XenbusStateReconfiguring:
+ case XenbusStateReconfigured:
+ case XenbusStateUnknown:
+ break;
+
+ case XenbusStateConnected:
+ blkfront_connect(info);
+ break;
+
+ case XenbusStateClosed:
+ if (dev->state == XenbusStateClosed)
+ break;
+ /* Missed the backend's Closing state -- fallthrough */
+ case XenbusStateClosing:
+ blkfront_closing(info);
+ break;
+ }
+}
+
+static int blkfront_remove(struct xenbus_device *xbdev)
+{
+ struct blkfront_info *info = dev_get_drvdata(&xbdev->dev);
+ struct block_device *bdev = NULL;
+ struct gendisk *disk;
+
+ dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename);
+
+ blkif_free(info, 0);
+
+ mutex_lock(&info->mutex);
+
+ disk = info->gd;
+ if (disk)
+ bdev = bdget_disk(disk, 0);
+
+ info->xbdev = NULL;
+ mutex_unlock(&info->mutex);
+
+ if (!bdev) {
+ kfree(info);
+ return 0;
+ }
+
+ /*
+ * The xbdev was removed before we reached the Closed
+ * state. See if it's safe to remove the disk. If the bdev
+ * isn't closed yet, we let release take care of it.
+ */
+
+ mutex_lock(&bdev->bd_mutex);
+ info = disk->private_data;
+
+ dev_warn(disk_to_dev(disk),
+ "%s was hot-unplugged, %d stale handles\n",
+ xbdev->nodename, bdev->bd_openers);
+
+ if (info && !bdev->bd_openers) {
+ xlvbd_release_gendisk(info);
+ disk->private_data = NULL;
+ kfree(info);
+ }
+
+ mutex_unlock(&bdev->bd_mutex);
+ bdput(bdev);
+
+ return 0;
+}
+
+static int blkfront_is_ready(struct xenbus_device *dev)
+{
+ struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+
+ return info->is_ready && info->xbdev;
+}
+
+static int blkif_open(struct block_device *bdev, fmode_t mode)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ struct blkfront_info *info;
+ int err = 0;
+
+ mutex_lock(&blkfront_mutex);
+
+ info = disk->private_data;
+ if (!info) {
+ /* xbdev gone */
+ err = -ERESTARTSYS;
+ goto out;
+ }
+
+ mutex_lock(&info->mutex);
+
+ if (!info->gd)
+ /* xbdev is closed */
+ err = -ERESTARTSYS;
+
+ mutex_unlock(&info->mutex);
+
+out:
+ mutex_unlock(&blkfront_mutex);
+ return err;
+}
+
+static void blkif_release(struct gendisk *disk, fmode_t mode)
+{
+ struct blkfront_info *info = disk->private_data;
+ struct block_device *bdev;
+ struct xenbus_device *xbdev;
+
+ mutex_lock(&blkfront_mutex);
+
+ bdev = bdget_disk(disk, 0);
+
+ if (!bdev) {
+ WARN(1, "Block device %s yanked out from us!\n", disk->disk_name);
+ goto out_mutex;
+ }
+ if (bdev->bd_openers)
+ goto out;
+
+ /*
+ * Check if we have been instructed to close. We will have
+ * deferred this request, because the bdev was still open.
+ */
+
+ mutex_lock(&info->mutex);
+ xbdev = info->xbdev;
+
+ if (xbdev && xbdev->state == XenbusStateClosing) {
+ /* pending switch to state closed */
+ dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
+ xlvbd_release_gendisk(info);
+ xenbus_frontend_closed(info->xbdev);
+ }
+
+ mutex_unlock(&info->mutex);
+
+ if (!xbdev) {
+ /* sudden device removal */
+ dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
+ xlvbd_release_gendisk(info);
+ disk->private_data = NULL;
+ kfree(info);
+ }
+
+out:
+ bdput(bdev);
+out_mutex:
+ mutex_unlock(&blkfront_mutex);
+}
+
+static const struct block_device_operations xlvbd_block_fops =
+{
+ .owner = THIS_MODULE,
+ .open = blkif_open,
+ .release = blkif_release,
+ .getgeo = blkif_getgeo,
+ .ioctl = blkif_ioctl,
+};
+
+
+static const struct xenbus_device_id blkfront_ids[] = {
+ { "vbd" },
+ { "" }
+};
+
+static struct xenbus_driver blkfront_driver = {
+ .ids = blkfront_ids,
+ .probe = blkfront_probe,
+ .remove = blkfront_remove,
+ .resume = blkfront_resume,
+ .otherend_changed = blkback_changed,
+ .is_ready = blkfront_is_ready,
+};
+
+static int __init xlblk_init(void)
+{
+ int ret;
+
+ if (!xen_domain())
+ return -ENODEV;
+
+ if (!xen_has_pv_disk_devices())
+ return -ENODEV;
+
+ if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
+ printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
+ XENVBD_MAJOR, DEV_NAME);
+ return -ENODEV;
+ }
+
+ ret = xenbus_register_frontend(&blkfront_driver);
+ if (ret) {
+ unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
+ return ret;
+ }
+
+ return 0;
+}
+module_init(xlblk_init);
+
+
+static void __exit xlblk_exit(void)
+{
+ xenbus_unregister_driver(&blkfront_driver);
+ unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
+ kfree(minors);
+}
+module_exit(xlblk_exit);
+
+MODULE_DESCRIPTION("Xen virtual block device frontend");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
+MODULE_ALIAS("xen:vbd");
+MODULE_ALIAS("xenblk");
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
new file mode 100644
index 000000000..c4328d9d9
--- /dev/null
+++ b/drivers/block/xsysace.c
@@ -0,0 +1,1245 @@
+/*
+ * Xilinx SystemACE device driver
+ *
+ * Copyright 2007 Secret Lab Technologies Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+/*
+ * The SystemACE chip is designed to configure FPGAs by loading an FPGA
+ * bitstream from a file on a CF card and squirting it into FPGAs connected
+ * to the SystemACE JTAG chain. It also has the advantage of providing an
+ * MPU interface which can be used to control the FPGA configuration process
+ * and to use the attached CF card for general purpose storage.
+ *
+ * This driver is a block device driver for the SystemACE.
+ *
+ * Initialization:
+ * The driver registers itself as a platform_device driver at module
+ * load time. The platform bus will take care of calling the
+ * ace_probe() method for all SystemACE instances in the system. Any
+ * number of SystemACE instances are supported. ace_probe() calls
+ * ace_setup() which initialized all data structures, reads the CF
+ * id structure and registers the device.
+ *
+ * Processing:
+ * Just about all of the heavy lifting in this driver is performed by
+ * a Finite State Machine (FSM). The driver needs to wait on a number
+ * of events; some raised by interrupts, some which need to be polled
+ * for. Describing all of the behaviour in a FSM seems to be the
+ * easiest way to keep the complexity low and make it easy to
+ * understand what the driver is doing. If the block ops or the
+ * request function need to interact with the hardware, then they
+ * simply need to flag the request and kick of FSM processing.
+ *
+ * The FSM itself is atomic-safe code which can be run from any
+ * context. The general process flow is:
+ * 1. obtain the ace->lock spinlock.
+ * 2. loop on ace_fsm_dostate() until the ace->fsm_continue flag is
+ * cleared.
+ * 3. release the lock.
+ *
+ * Individual states do not sleep in any way. If a condition needs to
+ * be waited for then the state much clear the fsm_continue flag and
+ * either schedule the FSM to be run again at a later time, or expect
+ * an interrupt to call the FSM when the desired condition is met.
+ *
+ * In normal operation, the FSM is processed at interrupt context
+ * either when the driver's tasklet is scheduled, or when an irq is
+ * raised by the hardware. The tasklet can be scheduled at any time.
+ * The request method in particular schedules the tasklet when a new
+ * request has been indicated by the block layer. Once started, the
+ * FSM proceeds as far as it can processing the request until it
+ * needs on a hardware event. At this point, it must yield execution.
+ *
+ * A state has two options when yielding execution:
+ * 1. ace_fsm_yield()
+ * - Call if need to poll for event.
+ * - clears the fsm_continue flag to exit the processing loop
+ * - reschedules the tasklet to run again as soon as possible
+ * 2. ace_fsm_yieldirq()
+ * - Call if an irq is expected from the HW
+ * - clears the fsm_continue flag to exit the processing loop
+ * - does not reschedule the tasklet so the FSM will not be processed
+ * again until an irq is received.
+ * After calling a yield function, the state must return control back
+ * to the FSM main loop.
+ *
+ * Additionally, the driver maintains a kernel timer which can process
+ * the FSM. If the FSM gets stalled, typically due to a missed
+ * interrupt, then the kernel timer will expire and the driver can
+ * continue where it left off.
+ *
+ * To Do:
+ * - Add FPGA configuration control interface.
+ * - Request major number from lanana
+ */
+
+#undef DEBUG
+
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include <linux/ata.h>
+#include <linux/hdreg.h>
+#include <linux/platform_device.h>
+#if defined(CONFIG_OF)
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/of_platform.h>
+#endif
+
+MODULE_AUTHOR("Grant Likely <grant.likely@secretlab.ca>");
+MODULE_DESCRIPTION("Xilinx SystemACE device driver");
+MODULE_LICENSE("GPL");
+
+/* SystemACE register definitions */
+#define ACE_BUSMODE (0x00)
+
+#define ACE_STATUS (0x04)
+#define ACE_STATUS_CFGLOCK (0x00000001)
+#define ACE_STATUS_MPULOCK (0x00000002)
+#define ACE_STATUS_CFGERROR (0x00000004) /* config controller error */
+#define ACE_STATUS_CFCERROR (0x00000008) /* CF controller error */
+#define ACE_STATUS_CFDETECT (0x00000010)
+#define ACE_STATUS_DATABUFRDY (0x00000020)
+#define ACE_STATUS_DATABUFMODE (0x00000040)
+#define ACE_STATUS_CFGDONE (0x00000080)
+#define ACE_STATUS_RDYFORCFCMD (0x00000100)
+#define ACE_STATUS_CFGMODEPIN (0x00000200)
+#define ACE_STATUS_CFGADDR_MASK (0x0000e000)
+#define ACE_STATUS_CFBSY (0x00020000)
+#define ACE_STATUS_CFRDY (0x00040000)
+#define ACE_STATUS_CFDWF (0x00080000)
+#define ACE_STATUS_CFDSC (0x00100000)
+#define ACE_STATUS_CFDRQ (0x00200000)
+#define ACE_STATUS_CFCORR (0x00400000)
+#define ACE_STATUS_CFERR (0x00800000)
+
+#define ACE_ERROR (0x08)
+#define ACE_CFGLBA (0x0c)
+#define ACE_MPULBA (0x10)
+
+#define ACE_SECCNTCMD (0x14)
+#define ACE_SECCNTCMD_RESET (0x0100)
+#define ACE_SECCNTCMD_IDENTIFY (0x0200)
+#define ACE_SECCNTCMD_READ_DATA (0x0300)
+#define ACE_SECCNTCMD_WRITE_DATA (0x0400)
+#define ACE_SECCNTCMD_ABORT (0x0600)
+
+#define ACE_VERSION (0x16)
+#define ACE_VERSION_REVISION_MASK (0x00FF)
+#define ACE_VERSION_MINOR_MASK (0x0F00)
+#define ACE_VERSION_MAJOR_MASK (0xF000)
+
+#define ACE_CTRL (0x18)
+#define ACE_CTRL_FORCELOCKREQ (0x0001)
+#define ACE_CTRL_LOCKREQ (0x0002)
+#define ACE_CTRL_FORCECFGADDR (0x0004)
+#define ACE_CTRL_FORCECFGMODE (0x0008)
+#define ACE_CTRL_CFGMODE (0x0010)
+#define ACE_CTRL_CFGSTART (0x0020)
+#define ACE_CTRL_CFGSEL (0x0040)
+#define ACE_CTRL_CFGRESET (0x0080)
+#define ACE_CTRL_DATABUFRDYIRQ (0x0100)
+#define ACE_CTRL_ERRORIRQ (0x0200)
+#define ACE_CTRL_CFGDONEIRQ (0x0400)
+#define ACE_CTRL_RESETIRQ (0x0800)
+#define ACE_CTRL_CFGPROG (0x1000)
+#define ACE_CTRL_CFGADDR_MASK (0xe000)
+
+#define ACE_FATSTAT (0x1c)
+
+#define ACE_NUM_MINORS 16
+#define ACE_SECTOR_SIZE (512)
+#define ACE_FIFO_SIZE (32)
+#define ACE_BUF_PER_SECTOR (ACE_SECTOR_SIZE / ACE_FIFO_SIZE)
+
+#define ACE_BUS_WIDTH_8 0
+#define ACE_BUS_WIDTH_16 1
+
+struct ace_reg_ops;
+
+struct ace_device {
+ /* driver state data */
+ int id;
+ int media_change;
+ int users;
+ struct list_head list;
+
+ /* finite state machine data */
+ struct tasklet_struct fsm_tasklet;
+ uint fsm_task; /* Current activity (ACE_TASK_*) */
+ uint fsm_state; /* Current state (ACE_FSM_STATE_*) */
+ uint fsm_continue_flag; /* cleared to exit FSM mainloop */
+ uint fsm_iter_num;
+ struct timer_list stall_timer;
+
+ /* Transfer state/result, use for both id and block request */
+ struct request *req; /* request being processed */
+ void *data_ptr; /* pointer to I/O buffer */
+ int data_count; /* number of buffers remaining */
+ int data_result; /* Result of transfer; 0 := success */
+
+ int id_req_count; /* count of id requests */
+ int id_result;
+ struct completion id_completion; /* used when id req finishes */
+ int in_irq;
+
+ /* Details of hardware device */
+ resource_size_t physaddr;
+ void __iomem *baseaddr;
+ int irq;
+ int bus_width; /* 0 := 8 bit; 1 := 16 bit */
+ struct ace_reg_ops *reg_ops;
+ int lock_count;
+
+ /* Block device data structures */
+ spinlock_t lock;
+ struct device *dev;
+ struct request_queue *queue;
+ struct gendisk *gd;
+
+ /* Inserted CF card parameters */
+ u16 cf_id[ATA_ID_WORDS];
+};
+
+static DEFINE_MUTEX(xsysace_mutex);
+static int ace_major;
+
+/* ---------------------------------------------------------------------
+ * Low level register access
+ */
+
+struct ace_reg_ops {
+ u16(*in) (struct ace_device * ace, int reg);
+ void (*out) (struct ace_device * ace, int reg, u16 val);
+ void (*datain) (struct ace_device * ace);
+ void (*dataout) (struct ace_device * ace);
+};
+
+/* 8 Bit bus width */
+static u16 ace_in_8(struct ace_device *ace, int reg)
+{
+ void __iomem *r = ace->baseaddr + reg;
+ return in_8(r) | (in_8(r + 1) << 8);
+}
+
+static void ace_out_8(struct ace_device *ace, int reg, u16 val)
+{
+ void __iomem *r = ace->baseaddr + reg;
+ out_8(r, val);
+ out_8(r + 1, val >> 8);
+}
+
+static void ace_datain_8(struct ace_device *ace)
+{
+ void __iomem *r = ace->baseaddr + 0x40;
+ u8 *dst = ace->data_ptr;
+ int i = ACE_FIFO_SIZE;
+ while (i--)
+ *dst++ = in_8(r++);
+ ace->data_ptr = dst;
+}
+
+static void ace_dataout_8(struct ace_device *ace)
+{
+ void __iomem *r = ace->baseaddr + 0x40;
+ u8 *src = ace->data_ptr;
+ int i = ACE_FIFO_SIZE;
+ while (i--)
+ out_8(r++, *src++);
+ ace->data_ptr = src;
+}
+
+static struct ace_reg_ops ace_reg_8_ops = {
+ .in = ace_in_8,
+ .out = ace_out_8,
+ .datain = ace_datain_8,
+ .dataout = ace_dataout_8,
+};
+
+/* 16 bit big endian bus attachment */
+static u16 ace_in_be16(struct ace_device *ace, int reg)
+{
+ return in_be16(ace->baseaddr + reg);
+}
+
+static void ace_out_be16(struct ace_device *ace, int reg, u16 val)
+{
+ out_be16(ace->baseaddr + reg, val);
+}
+
+static void ace_datain_be16(struct ace_device *ace)
+{
+ int i = ACE_FIFO_SIZE / 2;
+ u16 *dst = ace->data_ptr;
+ while (i--)
+ *dst++ = in_le16(ace->baseaddr + 0x40);
+ ace->data_ptr = dst;
+}
+
+static void ace_dataout_be16(struct ace_device *ace)
+{
+ int i = ACE_FIFO_SIZE / 2;
+ u16 *src = ace->data_ptr;
+ while (i--)
+ out_le16(ace->baseaddr + 0x40, *src++);
+ ace->data_ptr = src;
+}
+
+/* 16 bit little endian bus attachment */
+static u16 ace_in_le16(struct ace_device *ace, int reg)
+{
+ return in_le16(ace->baseaddr + reg);
+}
+
+static void ace_out_le16(struct ace_device *ace, int reg, u16 val)
+{
+ out_le16(ace->baseaddr + reg, val);
+}
+
+static void ace_datain_le16(struct ace_device *ace)
+{
+ int i = ACE_FIFO_SIZE / 2;
+ u16 *dst = ace->data_ptr;
+ while (i--)
+ *dst++ = in_be16(ace->baseaddr + 0x40);
+ ace->data_ptr = dst;
+}
+
+static void ace_dataout_le16(struct ace_device *ace)
+{
+ int i = ACE_FIFO_SIZE / 2;
+ u16 *src = ace->data_ptr;
+ while (i--)
+ out_be16(ace->baseaddr + 0x40, *src++);
+ ace->data_ptr = src;
+}
+
+static struct ace_reg_ops ace_reg_be16_ops = {
+ .in = ace_in_be16,
+ .out = ace_out_be16,
+ .datain = ace_datain_be16,
+ .dataout = ace_dataout_be16,
+};
+
+static struct ace_reg_ops ace_reg_le16_ops = {
+ .in = ace_in_le16,
+ .out = ace_out_le16,
+ .datain = ace_datain_le16,
+ .dataout = ace_dataout_le16,
+};
+
+static inline u16 ace_in(struct ace_device *ace, int reg)
+{
+ return ace->reg_ops->in(ace, reg);
+}
+
+static inline u32 ace_in32(struct ace_device *ace, int reg)
+{
+ return ace_in(ace, reg) | (ace_in(ace, reg + 2) << 16);
+}
+
+static inline void ace_out(struct ace_device *ace, int reg, u16 val)
+{
+ ace->reg_ops->out(ace, reg, val);
+}
+
+static inline void ace_out32(struct ace_device *ace, int reg, u32 val)
+{
+ ace_out(ace, reg, val);
+ ace_out(ace, reg + 2, val >> 16);
+}
+
+/* ---------------------------------------------------------------------
+ * Debug support functions
+ */
+
+#if defined(DEBUG)
+static void ace_dump_mem(void *base, int len)
+{
+ const char *ptr = base;
+ int i, j;
+
+ for (i = 0; i < len; i += 16) {
+ printk(KERN_INFO "%.8x:", i);
+ for (j = 0; j < 16; j++) {
+ if (!(j % 4))
+ printk(" ");
+ printk("%.2x", ptr[i + j]);
+ }
+ printk(" ");
+ for (j = 0; j < 16; j++)
+ printk("%c", isprint(ptr[i + j]) ? ptr[i + j] : '.');
+ printk("\n");
+ }
+}
+#else
+static inline void ace_dump_mem(void *base, int len)
+{
+}
+#endif
+
+static void ace_dump_regs(struct ace_device *ace)
+{
+ dev_info(ace->dev,
+ " ctrl: %.8x seccnt/cmd: %.4x ver:%.4x\n"
+ " status:%.8x mpu_lba:%.8x busmode:%4x\n"
+ " error: %.8x cfg_lba:%.8x fatstat:%.4x\n",
+ ace_in32(ace, ACE_CTRL),
+ ace_in(ace, ACE_SECCNTCMD),
+ ace_in(ace, ACE_VERSION),
+ ace_in32(ace, ACE_STATUS),
+ ace_in32(ace, ACE_MPULBA),
+ ace_in(ace, ACE_BUSMODE),
+ ace_in32(ace, ACE_ERROR),
+ ace_in32(ace, ACE_CFGLBA), ace_in(ace, ACE_FATSTAT));
+}
+
+static void ace_fix_driveid(u16 *id)
+{
+#if defined(__BIG_ENDIAN)
+ int i;
+
+ /* All half words have wrong byte order; swap the bytes */
+ for (i = 0; i < ATA_ID_WORDS; i++, id++)
+ *id = le16_to_cpu(*id);
+#endif
+}
+
+/* ---------------------------------------------------------------------
+ * Finite State Machine (FSM) implementation
+ */
+
+/* FSM tasks; used to direct state transitions */
+#define ACE_TASK_IDLE 0
+#define ACE_TASK_IDENTIFY 1
+#define ACE_TASK_READ 2
+#define ACE_TASK_WRITE 3
+#define ACE_FSM_NUM_TASKS 4
+
+/* FSM state definitions */
+#define ACE_FSM_STATE_IDLE 0
+#define ACE_FSM_STATE_REQ_LOCK 1
+#define ACE_FSM_STATE_WAIT_LOCK 2
+#define ACE_FSM_STATE_WAIT_CFREADY 3
+#define ACE_FSM_STATE_IDENTIFY_PREPARE 4
+#define ACE_FSM_STATE_IDENTIFY_TRANSFER 5
+#define ACE_FSM_STATE_IDENTIFY_COMPLETE 6
+#define ACE_FSM_STATE_REQ_PREPARE 7
+#define ACE_FSM_STATE_REQ_TRANSFER 8
+#define ACE_FSM_STATE_REQ_COMPLETE 9
+#define ACE_FSM_STATE_ERROR 10
+#define ACE_FSM_NUM_STATES 11
+
+/* Set flag to exit FSM loop and reschedule tasklet */
+static inline void ace_fsm_yield(struct ace_device *ace)
+{
+ dev_dbg(ace->dev, "ace_fsm_yield()\n");
+ tasklet_schedule(&ace->fsm_tasklet);
+ ace->fsm_continue_flag = 0;
+}
+
+/* Set flag to exit FSM loop and wait for IRQ to reschedule tasklet */
+static inline void ace_fsm_yieldirq(struct ace_device *ace)
+{
+ dev_dbg(ace->dev, "ace_fsm_yieldirq()\n");
+
+ if (!ace->irq)
+ /* No IRQ assigned, so need to poll */
+ tasklet_schedule(&ace->fsm_tasklet);
+ ace->fsm_continue_flag = 0;
+}
+
+/* Get the next read/write request; ending requests that we don't handle */
+static struct request *ace_get_next_request(struct request_queue *q)
+{
+ struct request *req;
+
+ while ((req = blk_peek_request(q)) != NULL) {
+ if (req->cmd_type == REQ_TYPE_FS)
+ break;
+ blk_start_request(req);
+ __blk_end_request_all(req, -EIO);
+ }
+ return req;
+}
+
+static void ace_fsm_dostate(struct ace_device *ace)
+{
+ struct request *req;
+ u32 status;
+ u16 val;
+ int count;
+
+#if defined(DEBUG)
+ dev_dbg(ace->dev, "fsm_state=%i, id_req_count=%i\n",
+ ace->fsm_state, ace->id_req_count);
+#endif
+
+ /* Verify that there is actually a CF in the slot. If not, then
+ * bail out back to the idle state and wake up all the waiters */
+ status = ace_in32(ace, ACE_STATUS);
+ if ((status & ACE_STATUS_CFDETECT) == 0) {
+ ace->fsm_state = ACE_FSM_STATE_IDLE;
+ ace->media_change = 1;
+ set_capacity(ace->gd, 0);
+ dev_info(ace->dev, "No CF in slot\n");
+
+ /* Drop all in-flight and pending requests */
+ if (ace->req) {
+ __blk_end_request_all(ace->req, -EIO);
+ ace->req = NULL;
+ }
+ while ((req = blk_fetch_request(ace->queue)) != NULL)
+ __blk_end_request_all(req, -EIO);
+
+ /* Drop back to IDLE state and notify waiters */
+ ace->fsm_state = ACE_FSM_STATE_IDLE;
+ ace->id_result = -EIO;
+ while (ace->id_req_count) {
+ complete(&ace->id_completion);
+ ace->id_req_count--;
+ }
+ }
+
+ switch (ace->fsm_state) {
+ case ACE_FSM_STATE_IDLE:
+ /* See if there is anything to do */
+ if (ace->id_req_count || ace_get_next_request(ace->queue)) {
+ ace->fsm_iter_num++;
+ ace->fsm_state = ACE_FSM_STATE_REQ_LOCK;
+ mod_timer(&ace->stall_timer, jiffies + HZ);
+ if (!timer_pending(&ace->stall_timer))
+ add_timer(&ace->stall_timer);
+ break;
+ }
+ del_timer(&ace->stall_timer);
+ ace->fsm_continue_flag = 0;
+ break;
+
+ case ACE_FSM_STATE_REQ_LOCK:
+ if (ace_in(ace, ACE_STATUS) & ACE_STATUS_MPULOCK) {
+ /* Already have the lock, jump to next state */
+ ace->fsm_state = ACE_FSM_STATE_WAIT_CFREADY;
+ break;
+ }
+
+ /* Request the lock */
+ val = ace_in(ace, ACE_CTRL);
+ ace_out(ace, ACE_CTRL, val | ACE_CTRL_LOCKREQ);
+ ace->fsm_state = ACE_FSM_STATE_WAIT_LOCK;
+ break;
+
+ case ACE_FSM_STATE_WAIT_LOCK:
+ if (ace_in(ace, ACE_STATUS) & ACE_STATUS_MPULOCK) {
+ /* got the lock; move to next state */
+ ace->fsm_state = ACE_FSM_STATE_WAIT_CFREADY;
+ break;
+ }
+
+ /* wait a bit for the lock */
+ ace_fsm_yield(ace);
+ break;
+
+ case ACE_FSM_STATE_WAIT_CFREADY:
+ status = ace_in32(ace, ACE_STATUS);
+ if (!(status & ACE_STATUS_RDYFORCFCMD) ||
+ (status & ACE_STATUS_CFBSY)) {
+ /* CF card isn't ready; it needs to be polled */
+ ace_fsm_yield(ace);
+ break;
+ }
+
+ /* Device is ready for command; determine what to do next */
+ if (ace->id_req_count)
+ ace->fsm_state = ACE_FSM_STATE_IDENTIFY_PREPARE;
+ else
+ ace->fsm_state = ACE_FSM_STATE_REQ_PREPARE;
+ break;
+
+ case ACE_FSM_STATE_IDENTIFY_PREPARE:
+ /* Send identify command */
+ ace->fsm_task = ACE_TASK_IDENTIFY;
+ ace->data_ptr = ace->cf_id;
+ ace->data_count = ACE_BUF_PER_SECTOR;
+ ace_out(ace, ACE_SECCNTCMD, ACE_SECCNTCMD_IDENTIFY);
+
+ /* As per datasheet, put config controller in reset */
+ val = ace_in(ace, ACE_CTRL);
+ ace_out(ace, ACE_CTRL, val | ACE_CTRL_CFGRESET);
+
+ /* irq handler takes over from this point; wait for the
+ * transfer to complete */
+ ace->fsm_state = ACE_FSM_STATE_IDENTIFY_TRANSFER;
+ ace_fsm_yieldirq(ace);
+ break;
+
+ case ACE_FSM_STATE_IDENTIFY_TRANSFER:
+ /* Check that the sysace is ready to receive data */
+ status = ace_in32(ace, ACE_STATUS);
+ if (status & ACE_STATUS_CFBSY) {
+ dev_dbg(ace->dev, "CFBSY set; t=%i iter=%i dc=%i\n",
+ ace->fsm_task, ace->fsm_iter_num,
+ ace->data_count);
+ ace_fsm_yield(ace);
+ break;
+ }
+ if (!(status & ACE_STATUS_DATABUFRDY)) {
+ ace_fsm_yield(ace);
+ break;
+ }
+
+ /* Transfer the next buffer */
+ ace->reg_ops->datain(ace);
+ ace->data_count--;
+
+ /* If there are still buffers to be transfers; jump out here */
+ if (ace->data_count != 0) {
+ ace_fsm_yieldirq(ace);
+ break;
+ }
+
+ /* transfer finished; kick state machine */
+ dev_dbg(ace->dev, "identify finished\n");
+ ace->fsm_state = ACE_FSM_STATE_IDENTIFY_COMPLETE;
+ break;
+
+ case ACE_FSM_STATE_IDENTIFY_COMPLETE:
+ ace_fix_driveid(ace->cf_id);
+ ace_dump_mem(ace->cf_id, 512); /* Debug: Dump out disk ID */
+
+ if (ace->data_result) {
+ /* Error occurred, disable the disk */
+ ace->media_change = 1;
+ set_capacity(ace->gd, 0);
+ dev_err(ace->dev, "error fetching CF id (%i)\n",
+ ace->data_result);
+ } else {
+ ace->media_change = 0;
+
+ /* Record disk parameters */
+ set_capacity(ace->gd,
+ ata_id_u32(ace->cf_id, ATA_ID_LBA_CAPACITY));
+ dev_info(ace->dev, "capacity: %i sectors\n",
+ ata_id_u32(ace->cf_id, ATA_ID_LBA_CAPACITY));
+ }
+
+ /* We're done, drop to IDLE state and notify waiters */
+ ace->fsm_state = ACE_FSM_STATE_IDLE;
+ ace->id_result = ace->data_result;
+ while (ace->id_req_count) {
+ complete(&ace->id_completion);
+ ace->id_req_count--;
+ }
+ break;
+
+ case ACE_FSM_STATE_REQ_PREPARE:
+ req = ace_get_next_request(ace->queue);
+ if (!req) {
+ ace->fsm_state = ACE_FSM_STATE_IDLE;
+ break;
+ }
+ blk_start_request(req);
+
+ /* Okay, it's a data request, set it up for transfer */
+ dev_dbg(ace->dev,
+ "request: sec=%llx hcnt=%x, ccnt=%x, dir=%i\n",
+ (unsigned long long)blk_rq_pos(req),
+ blk_rq_sectors(req), blk_rq_cur_sectors(req),
+ rq_data_dir(req));
+
+ ace->req = req;
+ ace->data_ptr = bio_data(req->bio);
+ ace->data_count = blk_rq_cur_sectors(req) * ACE_BUF_PER_SECTOR;
+ ace_out32(ace, ACE_MPULBA, blk_rq_pos(req) & 0x0FFFFFFF);
+
+ count = blk_rq_sectors(req);
+ if (rq_data_dir(req)) {
+ /* Kick off write request */
+ dev_dbg(ace->dev, "write data\n");
+ ace->fsm_task = ACE_TASK_WRITE;
+ ace_out(ace, ACE_SECCNTCMD,
+ count | ACE_SECCNTCMD_WRITE_DATA);
+ } else {
+ /* Kick off read request */
+ dev_dbg(ace->dev, "read data\n");
+ ace->fsm_task = ACE_TASK_READ;
+ ace_out(ace, ACE_SECCNTCMD,
+ count | ACE_SECCNTCMD_READ_DATA);
+ }
+
+ /* As per datasheet, put config controller in reset */
+ val = ace_in(ace, ACE_CTRL);
+ ace_out(ace, ACE_CTRL, val | ACE_CTRL_CFGRESET);
+
+ /* Move to the transfer state. The systemace will raise
+ * an interrupt once there is something to do
+ */
+ ace->fsm_state = ACE_FSM_STATE_REQ_TRANSFER;
+ if (ace->fsm_task == ACE_TASK_READ)
+ ace_fsm_yieldirq(ace); /* wait for data ready */
+ break;
+
+ case ACE_FSM_STATE_REQ_TRANSFER:
+ /* Check that the sysace is ready to receive data */
+ status = ace_in32(ace, ACE_STATUS);
+ if (status & ACE_STATUS_CFBSY) {
+ dev_dbg(ace->dev,
+ "CFBSY set; t=%i iter=%i c=%i dc=%i irq=%i\n",
+ ace->fsm_task, ace->fsm_iter_num,
+ blk_rq_cur_sectors(ace->req) * 16,
+ ace->data_count, ace->in_irq);
+ ace_fsm_yield(ace); /* need to poll CFBSY bit */
+ break;
+ }
+ if (!(status & ACE_STATUS_DATABUFRDY)) {
+ dev_dbg(ace->dev,
+ "DATABUF not set; t=%i iter=%i c=%i dc=%i irq=%i\n",
+ ace->fsm_task, ace->fsm_iter_num,
+ blk_rq_cur_sectors(ace->req) * 16,
+ ace->data_count, ace->in_irq);
+ ace_fsm_yieldirq(ace);
+ break;
+ }
+
+ /* Transfer the next buffer */
+ if (ace->fsm_task == ACE_TASK_WRITE)
+ ace->reg_ops->dataout(ace);
+ else
+ ace->reg_ops->datain(ace);
+ ace->data_count--;
+
+ /* If there are still buffers to be transfers; jump out here */
+ if (ace->data_count != 0) {
+ ace_fsm_yieldirq(ace);
+ break;
+ }
+
+ /* bio finished; is there another one? */
+ if (__blk_end_request_cur(ace->req, 0)) {
+ /* dev_dbg(ace->dev, "next block; h=%u c=%u\n",
+ * blk_rq_sectors(ace->req),
+ * blk_rq_cur_sectors(ace->req));
+ */
+ ace->data_ptr = bio_data(ace->req->bio);
+ ace->data_count = blk_rq_cur_sectors(ace->req) * 16;
+ ace_fsm_yieldirq(ace);
+ break;
+ }
+
+ ace->fsm_state = ACE_FSM_STATE_REQ_COMPLETE;
+ break;
+
+ case ACE_FSM_STATE_REQ_COMPLETE:
+ ace->req = NULL;
+
+ /* Finished request; go to idle state */
+ ace->fsm_state = ACE_FSM_STATE_IDLE;
+ break;
+
+ default:
+ ace->fsm_state = ACE_FSM_STATE_IDLE;
+ break;
+ }
+}
+
+static void ace_fsm_tasklet(unsigned long data)
+{
+ struct ace_device *ace = (void *)data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ace->lock, flags);
+
+ /* Loop over state machine until told to stop */
+ ace->fsm_continue_flag = 1;
+ while (ace->fsm_continue_flag)
+ ace_fsm_dostate(ace);
+
+ spin_unlock_irqrestore(&ace->lock, flags);
+}
+
+static void ace_stall_timer(unsigned long data)
+{
+ struct ace_device *ace = (void *)data;
+ unsigned long flags;
+
+ dev_warn(ace->dev,
+ "kicking stalled fsm; state=%i task=%i iter=%i dc=%i\n",
+ ace->fsm_state, ace->fsm_task, ace->fsm_iter_num,
+ ace->data_count);
+ spin_lock_irqsave(&ace->lock, flags);
+
+ /* Rearm the stall timer *before* entering FSM (which may then
+ * delete the timer) */
+ mod_timer(&ace->stall_timer, jiffies + HZ);
+
+ /* Loop over state machine until told to stop */
+ ace->fsm_continue_flag = 1;
+ while (ace->fsm_continue_flag)
+ ace_fsm_dostate(ace);
+
+ spin_unlock_irqrestore(&ace->lock, flags);
+}
+
+/* ---------------------------------------------------------------------
+ * Interrupt handling routines
+ */
+static int ace_interrupt_checkstate(struct ace_device *ace)
+{
+ u32 sreg = ace_in32(ace, ACE_STATUS);
+ u16 creg = ace_in(ace, ACE_CTRL);
+
+ /* Check for error occurrence */
+ if ((sreg & (ACE_STATUS_CFGERROR | ACE_STATUS_CFCERROR)) &&
+ (creg & ACE_CTRL_ERRORIRQ)) {
+ dev_err(ace->dev, "transfer failure\n");
+ ace_dump_regs(ace);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static irqreturn_t ace_interrupt(int irq, void *dev_id)
+{
+ u16 creg;
+ struct ace_device *ace = dev_id;
+
+ /* be safe and get the lock */
+ spin_lock(&ace->lock);
+ ace->in_irq = 1;
+
+ /* clear the interrupt */
+ creg = ace_in(ace, ACE_CTRL);
+ ace_out(ace, ACE_CTRL, creg | ACE_CTRL_RESETIRQ);
+ ace_out(ace, ACE_CTRL, creg);
+
+ /* check for IO failures */
+ if (ace_interrupt_checkstate(ace))
+ ace->data_result = -EIO;
+
+ if (ace->fsm_task == 0) {
+ dev_err(ace->dev,
+ "spurious irq; stat=%.8x ctrl=%.8x cmd=%.4x\n",
+ ace_in32(ace, ACE_STATUS), ace_in32(ace, ACE_CTRL),
+ ace_in(ace, ACE_SECCNTCMD));
+ dev_err(ace->dev, "fsm_task=%i fsm_state=%i data_count=%i\n",
+ ace->fsm_task, ace->fsm_state, ace->data_count);
+ }
+
+ /* Loop over state machine until told to stop */
+ ace->fsm_continue_flag = 1;
+ while (ace->fsm_continue_flag)
+ ace_fsm_dostate(ace);
+
+ /* done with interrupt; drop the lock */
+ ace->in_irq = 0;
+ spin_unlock(&ace->lock);
+
+ return IRQ_HANDLED;
+}
+
+/* ---------------------------------------------------------------------
+ * Block ops
+ */
+static void ace_request(struct request_queue * q)
+{
+ struct request *req;
+ struct ace_device *ace;
+
+ req = ace_get_next_request(q);
+
+ if (req) {
+ ace = req->rq_disk->private_data;
+ tasklet_schedule(&ace->fsm_tasklet);
+ }
+}
+
+static unsigned int ace_check_events(struct gendisk *gd, unsigned int clearing)
+{
+ struct ace_device *ace = gd->private_data;
+ dev_dbg(ace->dev, "ace_check_events(): %i\n", ace->media_change);
+
+ return ace->media_change ? DISK_EVENT_MEDIA_CHANGE : 0;
+}
+
+static int ace_revalidate_disk(struct gendisk *gd)
+{
+ struct ace_device *ace = gd->private_data;
+ unsigned long flags;
+
+ dev_dbg(ace->dev, "ace_revalidate_disk()\n");
+
+ if (ace->media_change) {
+ dev_dbg(ace->dev, "requesting cf id and scheduling tasklet\n");
+
+ spin_lock_irqsave(&ace->lock, flags);
+ ace->id_req_count++;
+ spin_unlock_irqrestore(&ace->lock, flags);
+
+ tasklet_schedule(&ace->fsm_tasklet);
+ wait_for_completion(&ace->id_completion);
+ }
+
+ dev_dbg(ace->dev, "revalidate complete\n");
+ return ace->id_result;
+}
+
+static int ace_open(struct block_device *bdev, fmode_t mode)
+{
+ struct ace_device *ace = bdev->bd_disk->private_data;
+ unsigned long flags;
+
+ dev_dbg(ace->dev, "ace_open() users=%i\n", ace->users + 1);
+
+ mutex_lock(&xsysace_mutex);
+ spin_lock_irqsave(&ace->lock, flags);
+ ace->users++;
+ spin_unlock_irqrestore(&ace->lock, flags);
+
+ check_disk_change(bdev);
+ mutex_unlock(&xsysace_mutex);
+
+ return 0;
+}
+
+static void ace_release(struct gendisk *disk, fmode_t mode)
+{
+ struct ace_device *ace = disk->private_data;
+ unsigned long flags;
+ u16 val;
+
+ dev_dbg(ace->dev, "ace_release() users=%i\n", ace->users - 1);
+
+ mutex_lock(&xsysace_mutex);
+ spin_lock_irqsave(&ace->lock, flags);
+ ace->users--;
+ if (ace->users == 0) {
+ val = ace_in(ace, ACE_CTRL);
+ ace_out(ace, ACE_CTRL, val & ~ACE_CTRL_LOCKREQ);
+ }
+ spin_unlock_irqrestore(&ace->lock, flags);
+ mutex_unlock(&xsysace_mutex);
+}
+
+static int ace_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+ struct ace_device *ace = bdev->bd_disk->private_data;
+ u16 *cf_id = ace->cf_id;
+
+ dev_dbg(ace->dev, "ace_getgeo()\n");
+
+ geo->heads = cf_id[ATA_ID_HEADS];
+ geo->sectors = cf_id[ATA_ID_SECTORS];
+ geo->cylinders = cf_id[ATA_ID_CYLS];
+
+ return 0;
+}
+
+static const struct block_device_operations ace_fops = {
+ .owner = THIS_MODULE,
+ .open = ace_open,
+ .release = ace_release,
+ .check_events = ace_check_events,
+ .revalidate_disk = ace_revalidate_disk,
+ .getgeo = ace_getgeo,
+};
+
+/* --------------------------------------------------------------------
+ * SystemACE device setup/teardown code
+ */
+static int ace_setup(struct ace_device *ace)
+{
+ u16 version;
+ u16 val;
+ int rc;
+
+ dev_dbg(ace->dev, "ace_setup(ace=0x%p)\n", ace);
+ dev_dbg(ace->dev, "physaddr=0x%llx irq=%i\n",
+ (unsigned long long)ace->physaddr, ace->irq);
+
+ spin_lock_init(&ace->lock);
+ init_completion(&ace->id_completion);
+
+ /*
+ * Map the device
+ */
+ ace->baseaddr = ioremap(ace->physaddr, 0x80);
+ if (!ace->baseaddr)
+ goto err_ioremap;
+
+ /*
+ * Initialize the state machine tasklet and stall timer
+ */
+ tasklet_init(&ace->fsm_tasklet, ace_fsm_tasklet, (unsigned long)ace);
+ setup_timer(&ace->stall_timer, ace_stall_timer, (unsigned long)ace);
+
+ /*
+ * Initialize the request queue
+ */
+ ace->queue = blk_init_queue(ace_request, &ace->lock);
+ if (ace->queue == NULL)
+ goto err_blk_initq;
+ blk_queue_logical_block_size(ace->queue, 512);
+
+ /*
+ * Allocate and initialize GD structure
+ */
+ ace->gd = alloc_disk(ACE_NUM_MINORS);
+ if (!ace->gd)
+ goto err_alloc_disk;
+
+ ace->gd->major = ace_major;
+ ace->gd->first_minor = ace->id * ACE_NUM_MINORS;
+ ace->gd->fops = &ace_fops;
+ ace->gd->queue = ace->queue;
+ ace->gd->private_data = ace;
+ snprintf(ace->gd->disk_name, 32, "xs%c", ace->id + 'a');
+
+ /* set bus width */
+ if (ace->bus_width == ACE_BUS_WIDTH_16) {
+ /* 0x0101 should work regardless of endianess */
+ ace_out_le16(ace, ACE_BUSMODE, 0x0101);
+
+ /* read it back to determine endianess */
+ if (ace_in_le16(ace, ACE_BUSMODE) == 0x0001)
+ ace->reg_ops = &ace_reg_le16_ops;
+ else
+ ace->reg_ops = &ace_reg_be16_ops;
+ } else {
+ ace_out_8(ace, ACE_BUSMODE, 0x00);
+ ace->reg_ops = &ace_reg_8_ops;
+ }
+
+ /* Make sure version register is sane */
+ version = ace_in(ace, ACE_VERSION);
+ if ((version == 0) || (version == 0xFFFF))
+ goto err_read;
+
+ /* Put sysace in a sane state by clearing most control reg bits */
+ ace_out(ace, ACE_CTRL, ACE_CTRL_FORCECFGMODE |
+ ACE_CTRL_DATABUFRDYIRQ | ACE_CTRL_ERRORIRQ);
+
+ /* Now we can hook up the irq handler */
+ if (ace->irq) {
+ rc = request_irq(ace->irq, ace_interrupt, 0, "systemace", ace);
+ if (rc) {
+ /* Failure - fall back to polled mode */
+ dev_err(ace->dev, "request_irq failed\n");
+ ace->irq = 0;
+ }
+ }
+
+ /* Enable interrupts */
+ val = ace_in(ace, ACE_CTRL);
+ val |= ACE_CTRL_DATABUFRDYIRQ | ACE_CTRL_ERRORIRQ;
+ ace_out(ace, ACE_CTRL, val);
+
+ /* Print the identification */
+ dev_info(ace->dev, "Xilinx SystemACE revision %i.%i.%i\n",
+ (version >> 12) & 0xf, (version >> 8) & 0x0f, version & 0xff);
+ dev_dbg(ace->dev, "physaddr 0x%llx, mapped to 0x%p, irq=%i\n",
+ (unsigned long long) ace->physaddr, ace->baseaddr, ace->irq);
+
+ ace->media_change = 1;
+ ace_revalidate_disk(ace->gd);
+
+ /* Make the sysace device 'live' */
+ add_disk(ace->gd);
+
+ return 0;
+
+err_read:
+ put_disk(ace->gd);
+err_alloc_disk:
+ blk_cleanup_queue(ace->queue);
+err_blk_initq:
+ iounmap(ace->baseaddr);
+err_ioremap:
+ dev_info(ace->dev, "xsysace: error initializing device at 0x%llx\n",
+ (unsigned long long) ace->physaddr);
+ return -ENOMEM;
+}
+
+static void ace_teardown(struct ace_device *ace)
+{
+ if (ace->gd) {
+ del_gendisk(ace->gd);
+ put_disk(ace->gd);
+ }
+
+ if (ace->queue)
+ blk_cleanup_queue(ace->queue);
+
+ tasklet_kill(&ace->fsm_tasklet);
+
+ if (ace->irq)
+ free_irq(ace->irq, ace);
+
+ iounmap(ace->baseaddr);
+}
+
+static int ace_alloc(struct device *dev, int id, resource_size_t physaddr,
+ int irq, int bus_width)
+{
+ struct ace_device *ace;
+ int rc;
+ dev_dbg(dev, "ace_alloc(%p)\n", dev);
+
+ if (!physaddr) {
+ rc = -ENODEV;
+ goto err_noreg;
+ }
+
+ /* Allocate and initialize the ace device structure */
+ ace = kzalloc(sizeof(struct ace_device), GFP_KERNEL);
+ if (!ace) {
+ rc = -ENOMEM;
+ goto err_alloc;
+ }
+
+ ace->dev = dev;
+ ace->id = id;
+ ace->physaddr = physaddr;
+ ace->irq = irq;
+ ace->bus_width = bus_width;
+
+ /* Call the setup code */
+ rc = ace_setup(ace);
+ if (rc)
+ goto err_setup;
+
+ dev_set_drvdata(dev, ace);
+ return 0;
+
+err_setup:
+ dev_set_drvdata(dev, NULL);
+ kfree(ace);
+err_alloc:
+err_noreg:
+ dev_err(dev, "could not initialize device, err=%i\n", rc);
+ return rc;
+}
+
+static void ace_free(struct device *dev)
+{
+ struct ace_device *ace = dev_get_drvdata(dev);
+ dev_dbg(dev, "ace_free(%p)\n", dev);
+
+ if (ace) {
+ ace_teardown(ace);
+ dev_set_drvdata(dev, NULL);
+ kfree(ace);
+ }
+}
+
+/* ---------------------------------------------------------------------
+ * Platform Bus Support
+ */
+
+static int ace_probe(struct platform_device *dev)
+{
+ resource_size_t physaddr = 0;
+ int bus_width = ACE_BUS_WIDTH_16; /* FIXME: should not be hard coded */
+ u32 id = dev->id;
+ int irq = 0;
+ int i;
+
+ dev_dbg(&dev->dev, "ace_probe(%p)\n", dev);
+
+ /* device id and bus width */
+ if (of_property_read_u32(dev->dev.of_node, "port-number", &id))
+ id = 0;
+ if (of_find_property(dev->dev.of_node, "8-bit", NULL))
+ bus_width = ACE_BUS_WIDTH_8;
+
+ for (i = 0; i < dev->num_resources; i++) {
+ if (dev->resource[i].flags & IORESOURCE_MEM)
+ physaddr = dev->resource[i].start;
+ if (dev->resource[i].flags & IORESOURCE_IRQ)
+ irq = dev->resource[i].start;
+ }
+
+ /* Call the bus-independent setup code */
+ return ace_alloc(&dev->dev, id, physaddr, irq, bus_width);
+}
+
+/*
+ * Platform bus remove() method
+ */
+static int ace_remove(struct platform_device *dev)
+{
+ ace_free(&dev->dev);
+ return 0;
+}
+
+#if defined(CONFIG_OF)
+/* Match table for of_platform binding */
+static const struct of_device_id ace_of_match[] = {
+ { .compatible = "xlnx,opb-sysace-1.00.b", },
+ { .compatible = "xlnx,opb-sysace-1.00.c", },
+ { .compatible = "xlnx,xps-sysace-1.00.a", },
+ { .compatible = "xlnx,sysace", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, ace_of_match);
+#else /* CONFIG_OF */
+#define ace_of_match NULL
+#endif /* CONFIG_OF */
+
+static struct platform_driver ace_platform_driver = {
+ .probe = ace_probe,
+ .remove = ace_remove,
+ .driver = {
+ .name = "xsysace",
+ .of_match_table = ace_of_match,
+ },
+};
+
+/* ---------------------------------------------------------------------
+ * Module init/exit routines
+ */
+static int __init ace_init(void)
+{
+ int rc;
+
+ ace_major = register_blkdev(ace_major, "xsysace");
+ if (ace_major <= 0) {
+ rc = -ENOMEM;
+ goto err_blk;
+ }
+
+ rc = platform_driver_register(&ace_platform_driver);
+ if (rc)
+ goto err_plat;
+
+ pr_info("Xilinx SystemACE device driver, major=%i\n", ace_major);
+ return 0;
+
+err_plat:
+ unregister_blkdev(ace_major, "xsysace");
+err_blk:
+ printk(KERN_ERR "xsysace: registration failed; err=%i\n", rc);
+ return rc;
+}
+module_init(ace_init);
+
+static void __exit ace_exit(void)
+{
+ pr_debug("Unregistering Xilinx SystemACE driver\n");
+ platform_driver_unregister(&ace_platform_driver);
+ unregister_blkdev(ace_major, "xsysace");
+}
+module_exit(ace_exit);
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
new file mode 100644
index 000000000..968f9e52e
--- /dev/null
+++ b/drivers/block/z2ram.c
@@ -0,0 +1,418 @@
+/*
+** z2ram - Amiga pseudo-driver to access 16bit-RAM in ZorroII space
+** as a block device, to be used as a RAM disk or swap space
+**
+** Copyright (C) 1994 by Ingo Wilken (Ingo.Wilken@informatik.uni-oldenburg.de)
+**
+** ++Geert: support for zorro_unused_z2ram, better range checking
+** ++roman: translate accesses via an array
+** ++Milan: support for ChipRAM usage
+** ++yambo: converted to 2.0 kernel
+** ++yambo: modularized and support added for 3 minor devices including:
+** MAJOR MINOR DESCRIPTION
+** ----- ----- ----------------------------------------------
+** 37 0 Use Zorro II and Chip ram
+** 37 1 Use only Zorro II ram
+** 37 2 Use only Chip ram
+** 37 4-7 Use memory list entry 1-4 (first is 0)
+** ++jskov: support for 1-4th memory list entry.
+**
+** Permission to use, copy, modify, and distribute this software and its
+** documentation for any purpose and without fee is hereby granted, provided
+** that the above copyright notice appear in all copies and that both that
+** copyright notice and this permission notice appear in supporting
+** documentation. This software is provided "as is" without express or
+** implied warranty.
+*/
+
+#define DEVICE_NAME "Z2RAM"
+
+#include <linux/major.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/bitops.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+
+#include <asm/setup.h>
+#include <asm/amigahw.h>
+#include <asm/pgtable.h>
+
+#include <linux/zorro.h>
+
+
+#define Z2MINOR_COMBINED (0)
+#define Z2MINOR_Z2ONLY (1)
+#define Z2MINOR_CHIPONLY (2)
+#define Z2MINOR_MEMLIST1 (4)
+#define Z2MINOR_MEMLIST2 (5)
+#define Z2MINOR_MEMLIST3 (6)
+#define Z2MINOR_MEMLIST4 (7)
+#define Z2MINOR_COUNT (8) /* Move this down when adding a new minor */
+
+#define Z2RAM_CHUNK1024 ( Z2RAM_CHUNKSIZE >> 10 )
+
+static DEFINE_MUTEX(z2ram_mutex);
+static u_long *z2ram_map = NULL;
+static u_long z2ram_size = 0;
+static int z2_count = 0;
+static int chip_count = 0;
+static int list_count = 0;
+static int current_device = -1;
+
+static DEFINE_SPINLOCK(z2ram_lock);
+
+static struct gendisk *z2ram_gendisk;
+
+static void do_z2_request(struct request_queue *q)
+{
+ struct request *req;
+
+ req = blk_fetch_request(q);
+ while (req) {
+ unsigned long start = blk_rq_pos(req) << 9;
+ unsigned long len = blk_rq_cur_bytes(req);
+ int err = 0;
+
+ if (start + len > z2ram_size) {
+ pr_err(DEVICE_NAME ": bad access: block=%llu, "
+ "count=%u\n",
+ (unsigned long long)blk_rq_pos(req),
+ blk_rq_cur_sectors(req));
+ err = -EIO;
+ goto done;
+ }
+ while (len) {
+ unsigned long addr = start & Z2RAM_CHUNKMASK;
+ unsigned long size = Z2RAM_CHUNKSIZE - addr;
+ void *buffer = bio_data(req->bio);
+
+ if (len < size)
+ size = len;
+ addr += z2ram_map[ start >> Z2RAM_CHUNKSHIFT ];
+ if (rq_data_dir(req) == READ)
+ memcpy(buffer, (char *)addr, size);
+ else
+ memcpy((char *)addr, buffer, size);
+ start += size;
+ len -= size;
+ }
+ done:
+ if (!__blk_end_request_cur(req, err))
+ req = blk_fetch_request(q);
+ }
+}
+
+static void
+get_z2ram( void )
+{
+ int i;
+
+ for ( i = 0; i < Z2RAM_SIZE / Z2RAM_CHUNKSIZE; i++ )
+ {
+ if ( test_bit( i, zorro_unused_z2ram ) )
+ {
+ z2_count++;
+ z2ram_map[z2ram_size++] = (unsigned long)ZTWO_VADDR(Z2RAM_START) +
+ (i << Z2RAM_CHUNKSHIFT);
+ clear_bit( i, zorro_unused_z2ram );
+ }
+ }
+
+ return;
+}
+
+static void
+get_chipram( void )
+{
+
+ while ( amiga_chip_avail() > ( Z2RAM_CHUNKSIZE * 4 ) )
+ {
+ chip_count++;
+ z2ram_map[ z2ram_size ] =
+ (u_long)amiga_chip_alloc( Z2RAM_CHUNKSIZE, "z2ram" );
+
+ if ( z2ram_map[ z2ram_size ] == 0 )
+ {
+ break;
+ }
+
+ z2ram_size++;
+ }
+
+ return;
+}
+
+static int z2_open(struct block_device *bdev, fmode_t mode)
+{
+ int device;
+ int max_z2_map = ( Z2RAM_SIZE / Z2RAM_CHUNKSIZE ) *
+ sizeof( z2ram_map[0] );
+ int max_chip_map = ( amiga_chip_size / Z2RAM_CHUNKSIZE ) *
+ sizeof( z2ram_map[0] );
+ int rc = -ENOMEM;
+
+ device = MINOR(bdev->bd_dev);
+
+ mutex_lock(&z2ram_mutex);
+ if ( current_device != -1 && current_device != device )
+ {
+ rc = -EBUSY;
+ goto err_out;
+ }
+
+ if ( current_device == -1 )
+ {
+ z2_count = 0;
+ chip_count = 0;
+ list_count = 0;
+ z2ram_size = 0;
+
+ /* Use a specific list entry. */
+ if (device >= Z2MINOR_MEMLIST1 && device <= Z2MINOR_MEMLIST4) {
+ int index = device - Z2MINOR_MEMLIST1 + 1;
+ unsigned long size, paddr, vaddr;
+
+ if (index >= m68k_realnum_memory) {
+ printk( KERN_ERR DEVICE_NAME
+ ": no such entry in z2ram_map\n" );
+ goto err_out;
+ }
+
+ paddr = m68k_memory[index].addr;
+ size = m68k_memory[index].size & ~(Z2RAM_CHUNKSIZE-1);
+
+#ifdef __powerpc__
+ /* FIXME: ioremap doesn't build correct memory tables. */
+ {
+ vfree(vmalloc (size));
+ }
+
+ vaddr = (unsigned long) __ioremap (paddr, size,
+ _PAGE_WRITETHRU);
+
+#else
+ vaddr = (unsigned long)z_remap_nocache_nonser(paddr, size);
+#endif
+ z2ram_map =
+ kmalloc((size/Z2RAM_CHUNKSIZE)*sizeof(z2ram_map[0]),
+ GFP_KERNEL);
+ if ( z2ram_map == NULL )
+ {
+ printk( KERN_ERR DEVICE_NAME
+ ": cannot get mem for z2ram_map\n" );
+ goto err_out;
+ }
+
+ while (size) {
+ z2ram_map[ z2ram_size++ ] = vaddr;
+ size -= Z2RAM_CHUNKSIZE;
+ vaddr += Z2RAM_CHUNKSIZE;
+ list_count++;
+ }
+
+ if ( z2ram_size != 0 )
+ printk( KERN_INFO DEVICE_NAME
+ ": using %iK List Entry %d Memory\n",
+ list_count * Z2RAM_CHUNK1024, index );
+ } else
+
+ switch ( device )
+ {
+ case Z2MINOR_COMBINED:
+
+ z2ram_map = kmalloc( max_z2_map + max_chip_map, GFP_KERNEL );
+ if ( z2ram_map == NULL )
+ {
+ printk( KERN_ERR DEVICE_NAME
+ ": cannot get mem for z2ram_map\n" );
+ goto err_out;
+ }
+
+ get_z2ram();
+ get_chipram();
+
+ if ( z2ram_size != 0 )
+ printk( KERN_INFO DEVICE_NAME
+ ": using %iK Zorro II RAM and %iK Chip RAM (Total %dK)\n",
+ z2_count * Z2RAM_CHUNK1024,
+ chip_count * Z2RAM_CHUNK1024,
+ ( z2_count + chip_count ) * Z2RAM_CHUNK1024 );
+
+ break;
+
+ case Z2MINOR_Z2ONLY:
+ z2ram_map = kmalloc( max_z2_map, GFP_KERNEL );
+ if ( z2ram_map == NULL )
+ {
+ printk( KERN_ERR DEVICE_NAME
+ ": cannot get mem for z2ram_map\n" );
+ goto err_out;
+ }
+
+ get_z2ram();
+
+ if ( z2ram_size != 0 )
+ printk( KERN_INFO DEVICE_NAME
+ ": using %iK of Zorro II RAM\n",
+ z2_count * Z2RAM_CHUNK1024 );
+
+ break;
+
+ case Z2MINOR_CHIPONLY:
+ z2ram_map = kmalloc( max_chip_map, GFP_KERNEL );
+ if ( z2ram_map == NULL )
+ {
+ printk( KERN_ERR DEVICE_NAME
+ ": cannot get mem for z2ram_map\n" );
+ goto err_out;
+ }
+
+ get_chipram();
+
+ if ( z2ram_size != 0 )
+ printk( KERN_INFO DEVICE_NAME
+ ": using %iK Chip RAM\n",
+ chip_count * Z2RAM_CHUNK1024 );
+
+ break;
+
+ default:
+ rc = -ENODEV;
+ goto err_out;
+
+ break;
+ }
+
+ if ( z2ram_size == 0 )
+ {
+ printk( KERN_NOTICE DEVICE_NAME
+ ": no unused ZII/Chip RAM found\n" );
+ goto err_out_kfree;
+ }
+
+ current_device = device;
+ z2ram_size <<= Z2RAM_CHUNKSHIFT;
+ set_capacity(z2ram_gendisk, z2ram_size >> 9);
+ }
+
+ mutex_unlock(&z2ram_mutex);
+ return 0;
+
+err_out_kfree:
+ kfree(z2ram_map);
+err_out:
+ mutex_unlock(&z2ram_mutex);
+ return rc;
+}
+
+static void
+z2_release(struct gendisk *disk, fmode_t mode)
+{
+ mutex_lock(&z2ram_mutex);
+ if ( current_device == -1 ) {
+ mutex_unlock(&z2ram_mutex);
+ return;
+ }
+ mutex_unlock(&z2ram_mutex);
+ /*
+ * FIXME: unmap memory
+ */
+}
+
+static const struct block_device_operations z2_fops =
+{
+ .owner = THIS_MODULE,
+ .open = z2_open,
+ .release = z2_release,
+};
+
+static struct kobject *z2_find(dev_t dev, int *part, void *data)
+{
+ *part = 0;
+ return get_disk(z2ram_gendisk);
+}
+
+static struct request_queue *z2_queue;
+
+static int __init
+z2_init(void)
+{
+ int ret;
+
+ if (!MACH_IS_AMIGA)
+ return -ENODEV;
+
+ ret = -EBUSY;
+ if (register_blkdev(Z2RAM_MAJOR, DEVICE_NAME))
+ goto err;
+
+ ret = -ENOMEM;
+ z2ram_gendisk = alloc_disk(1);
+ if (!z2ram_gendisk)
+ goto out_disk;
+
+ z2_queue = blk_init_queue(do_z2_request, &z2ram_lock);
+ if (!z2_queue)
+ goto out_queue;
+
+ z2ram_gendisk->major = Z2RAM_MAJOR;
+ z2ram_gendisk->first_minor = 0;
+ z2ram_gendisk->fops = &z2_fops;
+ sprintf(z2ram_gendisk->disk_name, "z2ram");
+
+ z2ram_gendisk->queue = z2_queue;
+ add_disk(z2ram_gendisk);
+ blk_register_region(MKDEV(Z2RAM_MAJOR, 0), Z2MINOR_COUNT, THIS_MODULE,
+ z2_find, NULL, NULL);
+
+ return 0;
+
+out_queue:
+ put_disk(z2ram_gendisk);
+out_disk:
+ unregister_blkdev(Z2RAM_MAJOR, DEVICE_NAME);
+err:
+ return ret;
+}
+
+static void __exit z2_exit(void)
+{
+ int i, j;
+ blk_unregister_region(MKDEV(Z2RAM_MAJOR, 0), Z2MINOR_COUNT);
+ unregister_blkdev(Z2RAM_MAJOR, DEVICE_NAME);
+ del_gendisk(z2ram_gendisk);
+ put_disk(z2ram_gendisk);
+ blk_cleanup_queue(z2_queue);
+
+ if ( current_device != -1 )
+ {
+ i = 0;
+
+ for ( j = 0 ; j < z2_count; j++ )
+ {
+ set_bit( i++, zorro_unused_z2ram );
+ }
+
+ for ( j = 0 ; j < chip_count; j++ )
+ {
+ if ( z2ram_map[ i ] )
+ {
+ amiga_chip_free( (void *) z2ram_map[ i++ ] );
+ }
+ }
+
+ if ( z2ram_map != NULL )
+ {
+ kfree( z2ram_map );
+ }
+ }
+
+ return;
+}
+
+module_init(z2_init);
+module_exit(z2_exit);
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
new file mode 100644
index 000000000..6489c0fd0
--- /dev/null
+++ b/drivers/block/zram/Kconfig
@@ -0,0 +1,34 @@
+config ZRAM
+ tristate "Compressed RAM block device support"
+ depends on BLOCK && SYSFS && ZSMALLOC
+ select LZO_COMPRESS
+ select LZO_DECOMPRESS
+ default n
+ help
+ Creates virtual block devices called /dev/zramX (X = 0, 1, ...).
+ Pages written to these disks are compressed and stored in memory
+ itself. These disks allow very fast I/O and compression provides
+ good amounts of memory savings.
+
+ It has several use cases, for example: /tmp storage, use as swap
+ disks and maybe many more.
+
+ See zram.txt for more information.
+
+config ZRAM_LZ4_COMPRESS
+ bool "Enable LZ4 algorithm support"
+ depends on ZRAM
+ select LZ4_COMPRESS
+ select LZ4_DECOMPRESS
+ default n
+ help
+ This option enables LZ4 compression algorithm support. Compression
+ algorithm can be changed using `comp_algorithm' device attribute.
+
+config ZRAM_DEBUG
+ bool "Compressed RAM block device debug support"
+ depends on ZRAM
+ default n
+ help
+ This option adds additional debugging code to the compressed
+ RAM block device driver.
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
new file mode 100644
index 000000000..be0763ff5
--- /dev/null
+++ b/drivers/block/zram/Makefile
@@ -0,0 +1,5 @@
+zram-y := zcomp_lzo.o zcomp.o zram_drv.o
+
+zram-$(CONFIG_ZRAM_LZ4_COMPRESS) += zcomp_lz4.o
+
+obj-$(CONFIG_ZRAM) += zram.o
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
new file mode 100644
index 000000000..f1ff39a3d
--- /dev/null
+++ b/drivers/block/zram/zcomp.c
@@ -0,0 +1,353 @@
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+
+#include "zcomp.h"
+#include "zcomp_lzo.h"
+#ifdef CONFIG_ZRAM_LZ4_COMPRESS
+#include "zcomp_lz4.h"
+#endif
+
+/*
+ * single zcomp_strm backend
+ */
+struct zcomp_strm_single {
+ struct mutex strm_lock;
+ struct zcomp_strm *zstrm;
+};
+
+/*
+ * multi zcomp_strm backend
+ */
+struct zcomp_strm_multi {
+ /* protect strm list */
+ spinlock_t strm_lock;
+ /* max possible number of zstrm streams */
+ int max_strm;
+ /* number of available zstrm streams */
+ int avail_strm;
+ /* list of available strms */
+ struct list_head idle_strm;
+ wait_queue_head_t strm_wait;
+};
+
+static struct zcomp_backend *backends[] = {
+ &zcomp_lzo,
+#ifdef CONFIG_ZRAM_LZ4_COMPRESS
+ &zcomp_lz4,
+#endif
+ NULL
+};
+
+static struct zcomp_backend *find_backend(const char *compress)
+{
+ int i = 0;
+ while (backends[i]) {
+ if (sysfs_streq(compress, backends[i]->name))
+ break;
+ i++;
+ }
+ return backends[i];
+}
+
+static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm)
+{
+ if (zstrm->private)
+ comp->backend->destroy(zstrm->private);
+ free_pages((unsigned long)zstrm->buffer, 1);
+ kfree(zstrm);
+}
+
+/*
+ * allocate new zcomp_strm structure with ->private initialized by
+ * backend, return NULL on error
+ */
+static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp)
+{
+ struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL);
+ if (!zstrm)
+ return NULL;
+
+ zstrm->private = comp->backend->create();
+ /*
+ * allocate 2 pages. 1 for compressed data, plus 1 extra for the
+ * case when compressed size is larger than the original one
+ */
+ zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
+ if (!zstrm->private || !zstrm->buffer) {
+ zcomp_strm_free(comp, zstrm);
+ zstrm = NULL;
+ }
+ return zstrm;
+}
+
+/*
+ * get idle zcomp_strm or wait until other process release
+ * (zcomp_strm_release()) one for us
+ */
+static struct zcomp_strm *zcomp_strm_multi_find(struct zcomp *comp)
+{
+ struct zcomp_strm_multi *zs = comp->stream;
+ struct zcomp_strm *zstrm;
+
+ while (1) {
+ spin_lock(&zs->strm_lock);
+ if (!list_empty(&zs->idle_strm)) {
+ zstrm = list_entry(zs->idle_strm.next,
+ struct zcomp_strm, list);
+ list_del(&zstrm->list);
+ spin_unlock(&zs->strm_lock);
+ return zstrm;
+ }
+ /* zstrm streams limit reached, wait for idle stream */
+ if (zs->avail_strm >= zs->max_strm) {
+ spin_unlock(&zs->strm_lock);
+ wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
+ continue;
+ }
+ /* allocate new zstrm stream */
+ zs->avail_strm++;
+ spin_unlock(&zs->strm_lock);
+
+ zstrm = zcomp_strm_alloc(comp);
+ if (!zstrm) {
+ spin_lock(&zs->strm_lock);
+ zs->avail_strm--;
+ spin_unlock(&zs->strm_lock);
+ wait_event(zs->strm_wait, !list_empty(&zs->idle_strm));
+ continue;
+ }
+ break;
+ }
+ return zstrm;
+}
+
+/* add stream back to idle list and wake up waiter or free the stream */
+static void zcomp_strm_multi_release(struct zcomp *comp, struct zcomp_strm *zstrm)
+{
+ struct zcomp_strm_multi *zs = comp->stream;
+
+ spin_lock(&zs->strm_lock);
+ if (zs->avail_strm <= zs->max_strm) {
+ list_add(&zstrm->list, &zs->idle_strm);
+ spin_unlock(&zs->strm_lock);
+ wake_up(&zs->strm_wait);
+ return;
+ }
+
+ zs->avail_strm--;
+ spin_unlock(&zs->strm_lock);
+ zcomp_strm_free(comp, zstrm);
+}
+
+/* change max_strm limit */
+static bool zcomp_strm_multi_set_max_streams(struct zcomp *comp, int num_strm)
+{
+ struct zcomp_strm_multi *zs = comp->stream;
+ struct zcomp_strm *zstrm;
+
+ spin_lock(&zs->strm_lock);
+ zs->max_strm = num_strm;
+ /*
+ * if user has lowered the limit and there are idle streams,
+ * immediately free as much streams (and memory) as we can.
+ */
+ while (zs->avail_strm > num_strm && !list_empty(&zs->idle_strm)) {
+ zstrm = list_entry(zs->idle_strm.next,
+ struct zcomp_strm, list);
+ list_del(&zstrm->list);
+ zcomp_strm_free(comp, zstrm);
+ zs->avail_strm--;
+ }
+ spin_unlock(&zs->strm_lock);
+ return true;
+}
+
+static void zcomp_strm_multi_destroy(struct zcomp *comp)
+{
+ struct zcomp_strm_multi *zs = comp->stream;
+ struct zcomp_strm *zstrm;
+
+ while (!list_empty(&zs->idle_strm)) {
+ zstrm = list_entry(zs->idle_strm.next,
+ struct zcomp_strm, list);
+ list_del(&zstrm->list);
+ zcomp_strm_free(comp, zstrm);
+ }
+ kfree(zs);
+}
+
+static int zcomp_strm_multi_create(struct zcomp *comp, int max_strm)
+{
+ struct zcomp_strm *zstrm;
+ struct zcomp_strm_multi *zs;
+
+ comp->destroy = zcomp_strm_multi_destroy;
+ comp->strm_find = zcomp_strm_multi_find;
+ comp->strm_release = zcomp_strm_multi_release;
+ comp->set_max_streams = zcomp_strm_multi_set_max_streams;
+ zs = kmalloc(sizeof(struct zcomp_strm_multi), GFP_KERNEL);
+ if (!zs)
+ return -ENOMEM;
+
+ comp->stream = zs;
+ spin_lock_init(&zs->strm_lock);
+ INIT_LIST_HEAD(&zs->idle_strm);
+ init_waitqueue_head(&zs->strm_wait);
+ zs->max_strm = max_strm;
+ zs->avail_strm = 1;
+
+ zstrm = zcomp_strm_alloc(comp);
+ if (!zstrm) {
+ kfree(zs);
+ return -ENOMEM;
+ }
+ list_add(&zstrm->list, &zs->idle_strm);
+ return 0;
+}
+
+static struct zcomp_strm *zcomp_strm_single_find(struct zcomp *comp)
+{
+ struct zcomp_strm_single *zs = comp->stream;
+ mutex_lock(&zs->strm_lock);
+ return zs->zstrm;
+}
+
+static void zcomp_strm_single_release(struct zcomp *comp,
+ struct zcomp_strm *zstrm)
+{
+ struct zcomp_strm_single *zs = comp->stream;
+ mutex_unlock(&zs->strm_lock);
+}
+
+static bool zcomp_strm_single_set_max_streams(struct zcomp *comp, int num_strm)
+{
+ /* zcomp_strm_single support only max_comp_streams == 1 */
+ return false;
+}
+
+static void zcomp_strm_single_destroy(struct zcomp *comp)
+{
+ struct zcomp_strm_single *zs = comp->stream;
+ zcomp_strm_free(comp, zs->zstrm);
+ kfree(zs);
+}
+
+static int zcomp_strm_single_create(struct zcomp *comp)
+{
+ struct zcomp_strm_single *zs;
+
+ comp->destroy = zcomp_strm_single_destroy;
+ comp->strm_find = zcomp_strm_single_find;
+ comp->strm_release = zcomp_strm_single_release;
+ comp->set_max_streams = zcomp_strm_single_set_max_streams;
+ zs = kmalloc(sizeof(struct zcomp_strm_single), GFP_KERNEL);
+ if (!zs)
+ return -ENOMEM;
+
+ comp->stream = zs;
+ mutex_init(&zs->strm_lock);
+ zs->zstrm = zcomp_strm_alloc(comp);
+ if (!zs->zstrm) {
+ kfree(zs);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+/* show available compressors */
+ssize_t zcomp_available_show(const char *comp, char *buf)
+{
+ ssize_t sz = 0;
+ int i = 0;
+
+ while (backends[i]) {
+ if (sysfs_streq(comp, backends[i]->name))
+ sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
+ "[%s] ", backends[i]->name);
+ else
+ sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
+ "%s ", backends[i]->name);
+ i++;
+ }
+ sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n");
+ return sz;
+}
+
+bool zcomp_set_max_streams(struct zcomp *comp, int num_strm)
+{
+ return comp->set_max_streams(comp, num_strm);
+}
+
+struct zcomp_strm *zcomp_strm_find(struct zcomp *comp)
+{
+ return comp->strm_find(comp);
+}
+
+void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm)
+{
+ comp->strm_release(comp, zstrm);
+}
+
+int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm,
+ const unsigned char *src, size_t *dst_len)
+{
+ return comp->backend->compress(src, zstrm->buffer, dst_len,
+ zstrm->private);
+}
+
+int zcomp_decompress(struct zcomp *comp, const unsigned char *src,
+ size_t src_len, unsigned char *dst)
+{
+ return comp->backend->decompress(src, src_len, dst);
+}
+
+void zcomp_destroy(struct zcomp *comp)
+{
+ comp->destroy(comp);
+ kfree(comp);
+}
+
+/*
+ * search available compressors for requested algorithm.
+ * allocate new zcomp and initialize it. return compressing
+ * backend pointer or ERR_PTR if things went bad. ERR_PTR(-EINVAL)
+ * if requested algorithm is not supported, ERR_PTR(-ENOMEM) in
+ * case of allocation error.
+ */
+struct zcomp *zcomp_create(const char *compress, int max_strm)
+{
+ struct zcomp *comp;
+ struct zcomp_backend *backend;
+
+ backend = find_backend(compress);
+ if (!backend)
+ return ERR_PTR(-EINVAL);
+
+ comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL);
+ if (!comp)
+ return ERR_PTR(-ENOMEM);
+
+ comp->backend = backend;
+ if (max_strm > 1)
+ zcomp_strm_multi_create(comp, max_strm);
+ else
+ zcomp_strm_single_create(comp);
+ if (!comp->stream) {
+ kfree(comp);
+ return ERR_PTR(-ENOMEM);
+ }
+ return comp;
+}
diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
new file mode 100644
index 000000000..c59d1fca7
--- /dev/null
+++ b/drivers/block/zram/zcomp.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _ZCOMP_H_
+#define _ZCOMP_H_
+
+#include <linux/mutex.h>
+
+struct zcomp_strm {
+ /* compression/decompression buffer */
+ void *buffer;
+ /*
+ * The private data of the compression stream, only compression
+ * stream backend can touch this (e.g. compression algorithm
+ * working memory)
+ */
+ void *private;
+ /* used in multi stream backend, protected by backend strm_lock */
+ struct list_head list;
+};
+
+/* static compression backend */
+struct zcomp_backend {
+ int (*compress)(const unsigned char *src, unsigned char *dst,
+ size_t *dst_len, void *private);
+
+ int (*decompress)(const unsigned char *src, size_t src_len,
+ unsigned char *dst);
+
+ void *(*create)(void);
+ void (*destroy)(void *private);
+
+ const char *name;
+};
+
+/* dynamic per-device compression frontend */
+struct zcomp {
+ void *stream;
+ struct zcomp_backend *backend;
+
+ struct zcomp_strm *(*strm_find)(struct zcomp *comp);
+ void (*strm_release)(struct zcomp *comp, struct zcomp_strm *zstrm);
+ bool (*set_max_streams)(struct zcomp *comp, int num_strm);
+ void (*destroy)(struct zcomp *comp);
+};
+
+ssize_t zcomp_available_show(const char *comp, char *buf);
+
+struct zcomp *zcomp_create(const char *comp, int max_strm);
+void zcomp_destroy(struct zcomp *comp);
+
+struct zcomp_strm *zcomp_strm_find(struct zcomp *comp);
+void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm);
+
+int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm,
+ const unsigned char *src, size_t *dst_len);
+
+int zcomp_decompress(struct zcomp *comp, const unsigned char *src,
+ size_t src_len, unsigned char *dst);
+
+bool zcomp_set_max_streams(struct zcomp *comp, int num_strm);
+#endif /* _ZCOMP_H_ */
diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c
new file mode 100644
index 000000000..f2afb7e98
--- /dev/null
+++ b/drivers/block/zram/zcomp_lz4.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/lz4.h>
+
+#include "zcomp_lz4.h"
+
+static void *zcomp_lz4_create(void)
+{
+ return kzalloc(LZ4_MEM_COMPRESS, GFP_KERNEL);
+}
+
+static void zcomp_lz4_destroy(void *private)
+{
+ kfree(private);
+}
+
+static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst,
+ size_t *dst_len, void *private)
+{
+ /* return : Success if return 0 */
+ return lz4_compress(src, PAGE_SIZE, dst, dst_len, private);
+}
+
+static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len,
+ unsigned char *dst)
+{
+ size_t dst_len = PAGE_SIZE;
+ /* return : Success if return 0 */
+ return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len);
+}
+
+struct zcomp_backend zcomp_lz4 = {
+ .compress = zcomp_lz4_compress,
+ .decompress = zcomp_lz4_decompress,
+ .create = zcomp_lz4_create,
+ .destroy = zcomp_lz4_destroy,
+ .name = "lz4",
+};
diff --git a/drivers/block/zram/zcomp_lz4.h b/drivers/block/zram/zcomp_lz4.h
new file mode 100644
index 000000000..60613fb29
--- /dev/null
+++ b/drivers/block/zram/zcomp_lz4.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _ZCOMP_LZ4_H_
+#define _ZCOMP_LZ4_H_
+
+#include "zcomp.h"
+
+extern struct zcomp_backend zcomp_lz4;
+
+#endif /* _ZCOMP_LZ4_H_ */
diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c
new file mode 100644
index 000000000..da1bc47d5
--- /dev/null
+++ b/drivers/block/zram/zcomp_lzo.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/lzo.h>
+
+#include "zcomp_lzo.h"
+
+static void *lzo_create(void)
+{
+ return kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+}
+
+static void lzo_destroy(void *private)
+{
+ kfree(private);
+}
+
+static int lzo_compress(const unsigned char *src, unsigned char *dst,
+ size_t *dst_len, void *private)
+{
+ int ret = lzo1x_1_compress(src, PAGE_SIZE, dst, dst_len, private);
+ return ret == LZO_E_OK ? 0 : ret;
+}
+
+static int lzo_decompress(const unsigned char *src, size_t src_len,
+ unsigned char *dst)
+{
+ size_t dst_len = PAGE_SIZE;
+ int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len);
+ return ret == LZO_E_OK ? 0 : ret;
+}
+
+struct zcomp_backend zcomp_lzo = {
+ .compress = lzo_compress,
+ .decompress = lzo_decompress,
+ .create = lzo_create,
+ .destroy = lzo_destroy,
+ .name = "lzo",
+};
diff --git a/drivers/block/zram/zcomp_lzo.h b/drivers/block/zram/zcomp_lzo.h
new file mode 100644
index 000000000..128c5807f
--- /dev/null
+++ b/drivers/block/zram/zcomp_lzo.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _ZCOMP_LZO_H_
+#define _ZCOMP_LZO_H_
+
+#include "zcomp.h"
+
+extern struct zcomp_backend zcomp_lzo;
+
+#endif /* _ZCOMP_LZO_H_ */
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
new file mode 100644
index 000000000..6e134f475
--- /dev/null
+++ b/drivers/block/zram/zram_drv.c
@@ -0,0 +1,1322 @@
+/*
+ * Compressed RAM block device
+ *
+ * Copyright (C) 2008, 2009, 2010 Nitin Gupta
+ * 2012, 2013 Minchan Kim
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ *
+ */
+
+#define KMSG_COMPONENT "zram"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#ifdef CONFIG_ZRAM_DEBUG
+#define DEBUG
+#endif
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/device.h>
+#include <linux/genhd.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+#include <linux/err.h>
+
+#include "zram_drv.h"
+
+/* Globals */
+static int zram_major;
+static struct zram *zram_devices;
+static const char *default_compressor = "lzo";
+
+/* Module params (documentation at end) */
+static unsigned int num_devices = 1;
+
+static inline void deprecated_attr_warn(const char *name)
+{
+ pr_warn_once("%d (%s) Attribute %s (and others) will be removed. %s\n",
+ task_pid_nr(current),
+ current->comm,
+ name,
+ "See zram documentation.");
+}
+
+#define ZRAM_ATTR_RO(name) \
+static ssize_t name##_show(struct device *d, \
+ struct device_attribute *attr, char *b) \
+{ \
+ struct zram *zram = dev_to_zram(d); \
+ \
+ deprecated_attr_warn(__stringify(name)); \
+ return scnprintf(b, PAGE_SIZE, "%llu\n", \
+ (u64)atomic64_read(&zram->stats.name)); \
+} \
+static DEVICE_ATTR_RO(name);
+
+static inline bool init_done(struct zram *zram)
+{
+ return zram->disksize;
+}
+
+static inline struct zram *dev_to_zram(struct device *dev)
+{
+ return (struct zram *)dev_to_disk(dev)->private_data;
+}
+
+static ssize_t compact_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ unsigned long nr_migrated;
+ struct zram *zram = dev_to_zram(dev);
+ struct zram_meta *meta;
+
+ down_read(&zram->init_lock);
+ if (!init_done(zram)) {
+ up_read(&zram->init_lock);
+ return -EINVAL;
+ }
+
+ meta = zram->meta;
+ nr_migrated = zs_compact(meta->mem_pool);
+ atomic64_add(nr_migrated, &zram->stats.num_migrated);
+ up_read(&zram->init_lock);
+
+ return len;
+}
+
+static ssize_t disksize_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct zram *zram = dev_to_zram(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
+}
+
+static ssize_t initstate_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ u32 val;
+ struct zram *zram = dev_to_zram(dev);
+
+ down_read(&zram->init_lock);
+ val = init_done(zram);
+ up_read(&zram->init_lock);
+
+ return scnprintf(buf, PAGE_SIZE, "%u\n", val);
+}
+
+static ssize_t orig_data_size_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct zram *zram = dev_to_zram(dev);
+
+ deprecated_attr_warn("orig_data_size");
+ return scnprintf(buf, PAGE_SIZE, "%llu\n",
+ (u64)(atomic64_read(&zram->stats.pages_stored)) << PAGE_SHIFT);
+}
+
+static ssize_t mem_used_total_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ u64 val = 0;
+ struct zram *zram = dev_to_zram(dev);
+
+ deprecated_attr_warn("mem_used_total");
+ down_read(&zram->init_lock);
+ if (init_done(zram)) {
+ struct zram_meta *meta = zram->meta;
+ val = zs_get_total_pages(meta->mem_pool);
+ }
+ up_read(&zram->init_lock);
+
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
+}
+
+static ssize_t max_comp_streams_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int val;
+ struct zram *zram = dev_to_zram(dev);
+
+ down_read(&zram->init_lock);
+ val = zram->max_comp_streams;
+ up_read(&zram->init_lock);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+
+static ssize_t mem_limit_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ u64 val;
+ struct zram *zram = dev_to_zram(dev);
+
+ deprecated_attr_warn("mem_limit");
+ down_read(&zram->init_lock);
+ val = zram->limit_pages;
+ up_read(&zram->init_lock);
+
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
+}
+
+static ssize_t mem_limit_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ u64 limit;
+ char *tmp;
+ struct zram *zram = dev_to_zram(dev);
+
+ limit = memparse(buf, &tmp);
+ if (buf == tmp) /* no chars parsed, invalid input */
+ return -EINVAL;
+
+ down_write(&zram->init_lock);
+ zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
+ up_write(&zram->init_lock);
+
+ return len;
+}
+
+static ssize_t mem_used_max_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ u64 val = 0;
+ struct zram *zram = dev_to_zram(dev);
+
+ deprecated_attr_warn("mem_used_max");
+ down_read(&zram->init_lock);
+ if (init_done(zram))
+ val = atomic_long_read(&zram->stats.max_used_pages);
+ up_read(&zram->init_lock);
+
+ return scnprintf(buf, PAGE_SIZE, "%llu\n", val << PAGE_SHIFT);
+}
+
+static ssize_t mem_used_max_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ int err;
+ unsigned long val;
+ struct zram *zram = dev_to_zram(dev);
+
+ err = kstrtoul(buf, 10, &val);
+ if (err || val != 0)
+ return -EINVAL;
+
+ down_read(&zram->init_lock);
+ if (init_done(zram)) {
+ struct zram_meta *meta = zram->meta;
+ atomic_long_set(&zram->stats.max_used_pages,
+ zs_get_total_pages(meta->mem_pool));
+ }
+ up_read(&zram->init_lock);
+
+ return len;
+}
+
+static ssize_t max_comp_streams_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ int num;
+ struct zram *zram = dev_to_zram(dev);
+ int ret;
+
+ ret = kstrtoint(buf, 0, &num);
+ if (ret < 0)
+ return ret;
+ if (num < 1)
+ return -EINVAL;
+
+ down_write(&zram->init_lock);
+ if (init_done(zram)) {
+ if (!zcomp_set_max_streams(zram->comp, num)) {
+ pr_info("Cannot change max compression streams\n");
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ zram->max_comp_streams = num;
+ ret = len;
+out:
+ up_write(&zram->init_lock);
+ return ret;
+}
+
+static ssize_t comp_algorithm_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ size_t sz;
+ struct zram *zram = dev_to_zram(dev);
+
+ down_read(&zram->init_lock);
+ sz = zcomp_available_show(zram->compressor, buf);
+ up_read(&zram->init_lock);
+
+ return sz;
+}
+
+static ssize_t comp_algorithm_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ struct zram *zram = dev_to_zram(dev);
+ down_write(&zram->init_lock);
+ if (init_done(zram)) {
+ up_write(&zram->init_lock);
+ pr_info("Can't change algorithm for initialized device\n");
+ return -EBUSY;
+ }
+ strlcpy(zram->compressor, buf, sizeof(zram->compressor));
+ up_write(&zram->init_lock);
+ return len;
+}
+
+/* flag operations needs meta->tb_lock */
+static int zram_test_flag(struct zram_meta *meta, u32 index,
+ enum zram_pageflags flag)
+{
+ return meta->table[index].value & BIT(flag);
+}
+
+static void zram_set_flag(struct zram_meta *meta, u32 index,
+ enum zram_pageflags flag)
+{
+ meta->table[index].value |= BIT(flag);
+}
+
+static void zram_clear_flag(struct zram_meta *meta, u32 index,
+ enum zram_pageflags flag)
+{
+ meta->table[index].value &= ~BIT(flag);
+}
+
+static size_t zram_get_obj_size(struct zram_meta *meta, u32 index)
+{
+ return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1);
+}
+
+static void zram_set_obj_size(struct zram_meta *meta,
+ u32 index, size_t size)
+{
+ unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT;
+
+ meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size;
+}
+
+static inline int is_partial_io(struct bio_vec *bvec)
+{
+ return bvec->bv_len != PAGE_SIZE;
+}
+
+/*
+ * Check if request is within bounds and aligned on zram logical blocks.
+ */
+static inline int valid_io_request(struct zram *zram,
+ sector_t start, unsigned int size)
+{
+ u64 end, bound;
+
+ /* unaligned request */
+ if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
+ return 0;
+ if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
+ return 0;
+
+ end = start + (size >> SECTOR_SHIFT);
+ bound = zram->disksize >> SECTOR_SHIFT;
+ /* out of range range */
+ if (unlikely(start >= bound || end > bound || start > end))
+ return 0;
+
+ /* I/O request is valid */
+ return 1;
+}
+
+static void zram_meta_free(struct zram_meta *meta, u64 disksize)
+{
+ size_t num_pages = disksize >> PAGE_SHIFT;
+ size_t index;
+
+ /* Free all pages that are still in this zram device */
+ for (index = 0; index < num_pages; index++) {
+ unsigned long handle = meta->table[index].handle;
+
+ if (!handle)
+ continue;
+
+ zs_free(meta->mem_pool, handle);
+ }
+
+ zs_destroy_pool(meta->mem_pool);
+ vfree(meta->table);
+ kfree(meta);
+}
+
+static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize)
+{
+ size_t num_pages;
+ char pool_name[8];
+ struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);
+
+ if (!meta)
+ return NULL;
+
+ num_pages = disksize >> PAGE_SHIFT;
+ meta->table = vzalloc(num_pages * sizeof(*meta->table));
+ if (!meta->table) {
+ pr_err("Error allocating zram address table\n");
+ goto out_error;
+ }
+
+ snprintf(pool_name, sizeof(pool_name), "zram%d", device_id);
+ meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM);
+ if (!meta->mem_pool) {
+ pr_err("Error creating memory pool\n");
+ goto out_error;
+ }
+
+ return meta;
+
+out_error:
+ vfree(meta->table);
+ kfree(meta);
+ return NULL;
+}
+
+static inline bool zram_meta_get(struct zram *zram)
+{
+ if (atomic_inc_not_zero(&zram->refcount))
+ return true;
+ return false;
+}
+
+static inline void zram_meta_put(struct zram *zram)
+{
+ atomic_dec(&zram->refcount);
+}
+
+static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
+{
+ if (*offset + bvec->bv_len >= PAGE_SIZE)
+ (*index)++;
+ *offset = (*offset + bvec->bv_len) % PAGE_SIZE;
+}
+
+static int page_zero_filled(void *ptr)
+{
+ unsigned int pos;
+ unsigned long *page;
+
+ page = (unsigned long *)ptr;
+
+ for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
+ if (page[pos])
+ return 0;
+ }
+
+ return 1;
+}
+
+static void handle_zero_page(struct bio_vec *bvec)
+{
+ struct page *page = bvec->bv_page;
+ void *user_mem;
+
+ user_mem = kmap_atomic(page);
+ if (is_partial_io(bvec))
+ memset(user_mem + bvec->bv_offset, 0, bvec->bv_len);
+ else
+ clear_page(user_mem);
+ kunmap_atomic(user_mem);
+
+ flush_dcache_page(page);
+}
+
+
+/*
+ * To protect concurrent access to the same index entry,
+ * caller should hold this table index entry's bit_spinlock to
+ * indicate this index entry is accessing.
+ */
+static void zram_free_page(struct zram *zram, size_t index)
+{
+ struct zram_meta *meta = zram->meta;
+ unsigned long handle = meta->table[index].handle;
+
+ if (unlikely(!handle)) {
+ /*
+ * No memory is allocated for zero filled pages.
+ * Simply clear zero page flag.
+ */
+ if (zram_test_flag(meta, index, ZRAM_ZERO)) {
+ zram_clear_flag(meta, index, ZRAM_ZERO);
+ atomic64_dec(&zram->stats.zero_pages);
+ }
+ return;
+ }
+
+ zs_free(meta->mem_pool, handle);
+
+ atomic64_sub(zram_get_obj_size(meta, index),
+ &zram->stats.compr_data_size);
+ atomic64_dec(&zram->stats.pages_stored);
+
+ meta->table[index].handle = 0;
+ zram_set_obj_size(meta, index, 0);
+}
+
+static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
+{
+ int ret = 0;
+ unsigned char *cmem;
+ struct zram_meta *meta = zram->meta;
+ unsigned long handle;
+ size_t size;
+
+ bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+ handle = meta->table[index].handle;
+ size = zram_get_obj_size(meta, index);
+
+ if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
+ bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+ clear_page(mem);
+ return 0;
+ }
+
+ cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
+ if (size == PAGE_SIZE)
+ copy_page(mem, cmem);
+ else
+ ret = zcomp_decompress(zram->comp, cmem, size, mem);
+ zs_unmap_object(meta->mem_pool, handle);
+ bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+
+ /* Should NEVER happen. Return bio error if it does. */
+ if (unlikely(ret)) {
+ pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
+ u32 index, int offset)
+{
+ int ret;
+ struct page *page;
+ unsigned char *user_mem, *uncmem = NULL;
+ struct zram_meta *meta = zram->meta;
+ page = bvec->bv_page;
+
+ bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+ if (unlikely(!meta->table[index].handle) ||
+ zram_test_flag(meta, index, ZRAM_ZERO)) {
+ bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+ handle_zero_page(bvec);
+ return 0;
+ }
+ bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+
+ if (is_partial_io(bvec))
+ /* Use a temporary buffer to decompress the page */
+ uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
+
+ user_mem = kmap_atomic(page);
+ if (!is_partial_io(bvec))
+ uncmem = user_mem;
+
+ if (!uncmem) {
+ pr_info("Unable to allocate temp memory\n");
+ ret = -ENOMEM;
+ goto out_cleanup;
+ }
+
+ ret = zram_decompress_page(zram, uncmem, index);
+ /* Should NEVER happen. Return bio error if it does. */
+ if (unlikely(ret))
+ goto out_cleanup;
+
+ if (is_partial_io(bvec))
+ memcpy(user_mem + bvec->bv_offset, uncmem + offset,
+ bvec->bv_len);
+
+ flush_dcache_page(page);
+ ret = 0;
+out_cleanup:
+ kunmap_atomic(user_mem);
+ if (is_partial_io(bvec))
+ kfree(uncmem);
+ return ret;
+}
+
+static inline void update_used_max(struct zram *zram,
+ const unsigned long pages)
+{
+ unsigned long old_max, cur_max;
+
+ old_max = atomic_long_read(&zram->stats.max_used_pages);
+
+ do {
+ cur_max = old_max;
+ if (pages > cur_max)
+ old_max = atomic_long_cmpxchg(
+ &zram->stats.max_used_pages, cur_max, pages);
+ } while (old_max != cur_max);
+}
+
+static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
+ int offset)
+{
+ int ret = 0;
+ size_t clen;
+ unsigned long handle;
+ struct page *page;
+ unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
+ struct zram_meta *meta = zram->meta;
+ struct zcomp_strm *zstrm;
+ bool locked = false;
+ unsigned long alloced_pages;
+
+ page = bvec->bv_page;
+ if (is_partial_io(bvec)) {
+ /*
+ * This is a partial IO. We need to read the full page
+ * before to write the changes.
+ */
+ uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
+ if (!uncmem) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = zram_decompress_page(zram, uncmem, index);
+ if (ret)
+ goto out;
+ }
+
+ zstrm = zcomp_strm_find(zram->comp);
+ locked = true;
+ user_mem = kmap_atomic(page);
+
+ if (is_partial_io(bvec)) {
+ memcpy(uncmem + offset, user_mem + bvec->bv_offset,
+ bvec->bv_len);
+ kunmap_atomic(user_mem);
+ user_mem = NULL;
+ } else {
+ uncmem = user_mem;
+ }
+
+ if (page_zero_filled(uncmem)) {
+ if (user_mem)
+ kunmap_atomic(user_mem);
+ /* Free memory associated with this sector now. */
+ bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+ zram_free_page(zram, index);
+ zram_set_flag(meta, index, ZRAM_ZERO);
+ bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+
+ atomic64_inc(&zram->stats.zero_pages);
+ ret = 0;
+ goto out;
+ }
+
+ ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen);
+ if (!is_partial_io(bvec)) {
+ kunmap_atomic(user_mem);
+ user_mem = NULL;
+ uncmem = NULL;
+ }
+
+ if (unlikely(ret)) {
+ pr_err("Compression failed! err=%d\n", ret);
+ goto out;
+ }
+ src = zstrm->buffer;
+ if (unlikely(clen > max_zpage_size)) {
+ clen = PAGE_SIZE;
+ if (is_partial_io(bvec))
+ src = uncmem;
+ }
+
+ handle = zs_malloc(meta->mem_pool, clen);
+ if (!handle) {
+ pr_info("Error allocating memory for compressed page: %u, size=%zu\n",
+ index, clen);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ alloced_pages = zs_get_total_pages(meta->mem_pool);
+ if (zram->limit_pages && alloced_pages > zram->limit_pages) {
+ zs_free(meta->mem_pool, handle);
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ update_used_max(zram, alloced_pages);
+
+ cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
+
+ if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
+ src = kmap_atomic(page);
+ copy_page(cmem, src);
+ kunmap_atomic(src);
+ } else {
+ memcpy(cmem, src, clen);
+ }
+
+ zcomp_strm_release(zram->comp, zstrm);
+ locked = false;
+ zs_unmap_object(meta->mem_pool, handle);
+
+ /*
+ * Free memory associated with this sector
+ * before overwriting unused sectors.
+ */
+ bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+ zram_free_page(zram, index);
+
+ meta->table[index].handle = handle;
+ zram_set_obj_size(meta, index, clen);
+ bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+
+ /* Update stats */
+ atomic64_add(clen, &zram->stats.compr_data_size);
+ atomic64_inc(&zram->stats.pages_stored);
+out:
+ if (locked)
+ zcomp_strm_release(zram->comp, zstrm);
+ if (is_partial_io(bvec))
+ kfree(uncmem);
+ return ret;
+}
+
+static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
+ int offset, int rw)
+{
+ unsigned long start_time = jiffies;
+ int ret;
+
+ generic_start_io_acct(rw, bvec->bv_len >> SECTOR_SHIFT,
+ &zram->disk->part0);
+
+ if (rw == READ) {
+ atomic64_inc(&zram->stats.num_reads);
+ ret = zram_bvec_read(zram, bvec, index, offset);
+ } else {
+ atomic64_inc(&zram->stats.num_writes);
+ ret = zram_bvec_write(zram, bvec, index, offset);
+ }
+
+ generic_end_io_acct(rw, &zram->disk->part0, start_time);
+
+ if (unlikely(ret)) {
+ if (rw == READ)
+ atomic64_inc(&zram->stats.failed_reads);
+ else
+ atomic64_inc(&zram->stats.failed_writes);
+ }
+
+ return ret;
+}
+
+/*
+ * zram_bio_discard - handler on discard request
+ * @index: physical block index in PAGE_SIZE units
+ * @offset: byte offset within physical block
+ */
+static void zram_bio_discard(struct zram *zram, u32 index,
+ int offset, struct bio *bio)
+{
+ size_t n = bio->bi_iter.bi_size;
+ struct zram_meta *meta = zram->meta;
+
+ /*
+ * zram manages data in physical block size units. Because logical block
+ * size isn't identical with physical block size on some arch, we
+ * could get a discard request pointing to a specific offset within a
+ * certain physical block. Although we can handle this request by
+ * reading that physiclal block and decompressing and partially zeroing
+ * and re-compressing and then re-storing it, this isn't reasonable
+ * because our intent with a discard request is to save memory. So
+ * skipping this logical block is appropriate here.
+ */
+ if (offset) {
+ if (n <= (PAGE_SIZE - offset))
+ return;
+
+ n -= (PAGE_SIZE - offset);
+ index++;
+ }
+
+ while (n >= PAGE_SIZE) {
+ bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+ zram_free_page(zram, index);
+ bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+ atomic64_inc(&zram->stats.notify_free);
+ index++;
+ n -= PAGE_SIZE;
+ }
+}
+
+static void zram_reset_device(struct zram *zram)
+{
+ struct zram_meta *meta;
+ struct zcomp *comp;
+ u64 disksize;
+
+ down_write(&zram->init_lock);
+
+ zram->limit_pages = 0;
+
+ if (!init_done(zram)) {
+ up_write(&zram->init_lock);
+ return;
+ }
+
+ meta = zram->meta;
+ comp = zram->comp;
+ disksize = zram->disksize;
+ /*
+ * Refcount will go down to 0 eventually and r/w handler
+ * cannot handle further I/O so it will bail out by
+ * check zram_meta_get.
+ */
+ zram_meta_put(zram);
+ /*
+ * We want to free zram_meta in process context to avoid
+ * deadlock between reclaim path and any other locks.
+ */
+ wait_event(zram->io_done, atomic_read(&zram->refcount) == 0);
+
+ /* Reset stats */
+ memset(&zram->stats, 0, sizeof(zram->stats));
+ zram->disksize = 0;
+ zram->max_comp_streams = 1;
+
+ set_capacity(zram->disk, 0);
+ part_stat_set_all(&zram->disk->part0, 0);
+
+ up_write(&zram->init_lock);
+ /* I/O operation under all of CPU are done so let's free */
+ zram_meta_free(meta, disksize);
+ zcomp_destroy(comp);
+}
+
+static ssize_t disksize_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ u64 disksize;
+ struct zcomp *comp;
+ struct zram_meta *meta;
+ struct zram *zram = dev_to_zram(dev);
+ int err;
+
+ disksize = memparse(buf, NULL);
+ if (!disksize)
+ return -EINVAL;
+
+ disksize = PAGE_ALIGN(disksize);
+ meta = zram_meta_alloc(zram->disk->first_minor, disksize);
+ if (!meta)
+ return -ENOMEM;
+
+ comp = zcomp_create(zram->compressor, zram->max_comp_streams);
+ if (IS_ERR(comp)) {
+ pr_info("Cannot initialise %s compressing backend\n",
+ zram->compressor);
+ err = PTR_ERR(comp);
+ goto out_free_meta;
+ }
+
+ down_write(&zram->init_lock);
+ if (init_done(zram)) {
+ pr_info("Cannot change disksize for initialized device\n");
+ err = -EBUSY;
+ goto out_destroy_comp;
+ }
+
+ init_waitqueue_head(&zram->io_done);
+ atomic_set(&zram->refcount, 1);
+ zram->meta = meta;
+ zram->comp = comp;
+ zram->disksize = disksize;
+ set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
+ up_write(&zram->init_lock);
+
+ /*
+ * Revalidate disk out of the init_lock to avoid lockdep splat.
+ * It's okay because disk's capacity is protected by init_lock
+ * so that revalidate_disk always sees up-to-date capacity.
+ */
+ revalidate_disk(zram->disk);
+
+ return len;
+
+out_destroy_comp:
+ up_write(&zram->init_lock);
+ zcomp_destroy(comp);
+out_free_meta:
+ zram_meta_free(meta, disksize);
+ return err;
+}
+
+static ssize_t reset_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t len)
+{
+ int ret;
+ unsigned short do_reset;
+ struct zram *zram;
+ struct block_device *bdev;
+
+ zram = dev_to_zram(dev);
+ bdev = bdget_disk(zram->disk, 0);
+
+ if (!bdev)
+ return -ENOMEM;
+
+ mutex_lock(&bdev->bd_mutex);
+ /* Do not reset an active device! */
+ if (bdev->bd_openers) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ ret = kstrtou16(buf, 10, &do_reset);
+ if (ret)
+ goto out;
+
+ if (!do_reset) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Make sure all pending I/O is finished */
+ fsync_bdev(bdev);
+ zram_reset_device(zram);
+
+ mutex_unlock(&bdev->bd_mutex);
+ revalidate_disk(zram->disk);
+ bdput(bdev);
+
+ return len;
+
+out:
+ mutex_unlock(&bdev->bd_mutex);
+ bdput(bdev);
+ return ret;
+}
+
+static void __zram_make_request(struct zram *zram, struct bio *bio)
+{
+ int offset, rw;
+ u32 index;
+ struct bio_vec bvec;
+ struct bvec_iter iter;
+
+ index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
+ offset = (bio->bi_iter.bi_sector &
+ (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
+
+ if (unlikely(bio->bi_rw & REQ_DISCARD)) {
+ zram_bio_discard(zram, index, offset, bio);
+ bio_endio(bio, 0);
+ return;
+ }
+
+ rw = bio_data_dir(bio);
+ bio_for_each_segment(bvec, bio, iter) {
+ int max_transfer_size = PAGE_SIZE - offset;
+
+ if (bvec.bv_len > max_transfer_size) {
+ /*
+ * zram_bvec_rw() can only make operation on a single
+ * zram page. Split the bio vector.
+ */
+ struct bio_vec bv;
+
+ bv.bv_page = bvec.bv_page;
+ bv.bv_len = max_transfer_size;
+ bv.bv_offset = bvec.bv_offset;
+
+ if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0)
+ goto out;
+
+ bv.bv_len = bvec.bv_len - max_transfer_size;
+ bv.bv_offset += max_transfer_size;
+ if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0)
+ goto out;
+ } else
+ if (zram_bvec_rw(zram, &bvec, index, offset, rw) < 0)
+ goto out;
+
+ update_position(&index, &offset, &bvec);
+ }
+
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+ bio_endio(bio, 0);
+ return;
+
+out:
+ bio_io_error(bio);
+}
+
+/*
+ * Handler function for all zram I/O requests.
+ */
+static void zram_make_request(struct request_queue *queue, struct bio *bio)
+{
+ struct zram *zram = queue->queuedata;
+
+ if (unlikely(!zram_meta_get(zram)))
+ goto error;
+
+ if (!valid_io_request(zram, bio->bi_iter.bi_sector,
+ bio->bi_iter.bi_size)) {
+ atomic64_inc(&zram->stats.invalid_io);
+ goto put_zram;
+ }
+
+ __zram_make_request(zram, bio);
+ zram_meta_put(zram);
+ return;
+put_zram:
+ zram_meta_put(zram);
+error:
+ bio_io_error(bio);
+}
+
+static void zram_slot_free_notify(struct block_device *bdev,
+ unsigned long index)
+{
+ struct zram *zram;
+ struct zram_meta *meta;
+
+ zram = bdev->bd_disk->private_data;
+ meta = zram->meta;
+
+ bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value);
+ zram_free_page(zram, index);
+ bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value);
+ atomic64_inc(&zram->stats.notify_free);
+}
+
+static int zram_rw_page(struct block_device *bdev, sector_t sector,
+ struct page *page, int rw)
+{
+ int offset, err = -EIO;
+ u32 index;
+ struct zram *zram;
+ struct bio_vec bv;
+
+ zram = bdev->bd_disk->private_data;
+ if (unlikely(!zram_meta_get(zram)))
+ goto out;
+
+ if (!valid_io_request(zram, sector, PAGE_SIZE)) {
+ atomic64_inc(&zram->stats.invalid_io);
+ err = -EINVAL;
+ goto put_zram;
+ }
+
+ index = sector >> SECTORS_PER_PAGE_SHIFT;
+ offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT;
+
+ bv.bv_page = page;
+ bv.bv_len = PAGE_SIZE;
+ bv.bv_offset = 0;
+
+ err = zram_bvec_rw(zram, &bv, index, offset, rw);
+put_zram:
+ zram_meta_put(zram);
+out:
+ /*
+ * If I/O fails, just return error(ie, non-zero) without
+ * calling page_endio.
+ * It causes resubmit the I/O with bio request by upper functions
+ * of rw_page(e.g., swap_readpage, __swap_writepage) and
+ * bio->bi_end_io does things to handle the error
+ * (e.g., SetPageError, set_page_dirty and extra works).
+ */
+ if (err == 0)
+ page_endio(page, rw, 0);
+ return err;
+}
+
+static const struct block_device_operations zram_devops = {
+ .swap_slot_free_notify = zram_slot_free_notify,
+ .rw_page = zram_rw_page,
+ .owner = THIS_MODULE
+};
+
+static DEVICE_ATTR_WO(compact);
+static DEVICE_ATTR_RW(disksize);
+static DEVICE_ATTR_RO(initstate);
+static DEVICE_ATTR_WO(reset);
+static DEVICE_ATTR_RO(orig_data_size);
+static DEVICE_ATTR_RO(mem_used_total);
+static DEVICE_ATTR_RW(mem_limit);
+static DEVICE_ATTR_RW(mem_used_max);
+static DEVICE_ATTR_RW(max_comp_streams);
+static DEVICE_ATTR_RW(comp_algorithm);
+
+static ssize_t io_stat_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct zram *zram = dev_to_zram(dev);
+ ssize_t ret;
+
+ down_read(&zram->init_lock);
+ ret = scnprintf(buf, PAGE_SIZE,
+ "%8llu %8llu %8llu %8llu\n",
+ (u64)atomic64_read(&zram->stats.failed_reads),
+ (u64)atomic64_read(&zram->stats.failed_writes),
+ (u64)atomic64_read(&zram->stats.invalid_io),
+ (u64)atomic64_read(&zram->stats.notify_free));
+ up_read(&zram->init_lock);
+
+ return ret;
+}
+
+static ssize_t mm_stat_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct zram *zram = dev_to_zram(dev);
+ u64 orig_size, mem_used = 0;
+ long max_used;
+ ssize_t ret;
+
+ down_read(&zram->init_lock);
+ if (init_done(zram))
+ mem_used = zs_get_total_pages(zram->meta->mem_pool);
+
+ orig_size = atomic64_read(&zram->stats.pages_stored);
+ max_used = atomic_long_read(&zram->stats.max_used_pages);
+
+ ret = scnprintf(buf, PAGE_SIZE,
+ "%8llu %8llu %8llu %8lu %8ld %8llu %8llu\n",
+ orig_size << PAGE_SHIFT,
+ (u64)atomic64_read(&zram->stats.compr_data_size),
+ mem_used << PAGE_SHIFT,
+ zram->limit_pages << PAGE_SHIFT,
+ max_used << PAGE_SHIFT,
+ (u64)atomic64_read(&zram->stats.zero_pages),
+ (u64)atomic64_read(&zram->stats.num_migrated));
+ up_read(&zram->init_lock);
+
+ return ret;
+}
+
+static DEVICE_ATTR_RO(io_stat);
+static DEVICE_ATTR_RO(mm_stat);
+ZRAM_ATTR_RO(num_reads);
+ZRAM_ATTR_RO(num_writes);
+ZRAM_ATTR_RO(failed_reads);
+ZRAM_ATTR_RO(failed_writes);
+ZRAM_ATTR_RO(invalid_io);
+ZRAM_ATTR_RO(notify_free);
+ZRAM_ATTR_RO(zero_pages);
+ZRAM_ATTR_RO(compr_data_size);
+
+static struct attribute *zram_disk_attrs[] = {
+ &dev_attr_disksize.attr,
+ &dev_attr_initstate.attr,
+ &dev_attr_reset.attr,
+ &dev_attr_num_reads.attr,
+ &dev_attr_num_writes.attr,
+ &dev_attr_failed_reads.attr,
+ &dev_attr_failed_writes.attr,
+ &dev_attr_compact.attr,
+ &dev_attr_invalid_io.attr,
+ &dev_attr_notify_free.attr,
+ &dev_attr_zero_pages.attr,
+ &dev_attr_orig_data_size.attr,
+ &dev_attr_compr_data_size.attr,
+ &dev_attr_mem_used_total.attr,
+ &dev_attr_mem_limit.attr,
+ &dev_attr_mem_used_max.attr,
+ &dev_attr_max_comp_streams.attr,
+ &dev_attr_comp_algorithm.attr,
+ &dev_attr_io_stat.attr,
+ &dev_attr_mm_stat.attr,
+ NULL,
+};
+
+static struct attribute_group zram_disk_attr_group = {
+ .attrs = zram_disk_attrs,
+};
+
+static int create_device(struct zram *zram, int device_id)
+{
+ struct request_queue *queue;
+ int ret = -ENOMEM;
+
+ init_rwsem(&zram->init_lock);
+
+ queue = blk_alloc_queue(GFP_KERNEL);
+ if (!queue) {
+ pr_err("Error allocating disk queue for device %d\n",
+ device_id);
+ goto out;
+ }
+
+ blk_queue_make_request(queue, zram_make_request);
+
+ /* gendisk structure */
+ zram->disk = alloc_disk(1);
+ if (!zram->disk) {
+ pr_warn("Error allocating disk structure for device %d\n",
+ device_id);
+ ret = -ENOMEM;
+ goto out_free_queue;
+ }
+
+ zram->disk->major = zram_major;
+ zram->disk->first_minor = device_id;
+ zram->disk->fops = &zram_devops;
+ zram->disk->queue = queue;
+ zram->disk->queue->queuedata = zram;
+ zram->disk->private_data = zram;
+ snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
+
+ /* Actual capacity set using syfs (/sys/block/zram<id>/disksize */
+ set_capacity(zram->disk, 0);
+ /* zram devices sort of resembles non-rotational disks */
+ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
+ queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
+ /*
+ * To ensure that we always get PAGE_SIZE aligned
+ * and n*PAGE_SIZED sized I/O requests.
+ */
+ blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
+ blk_queue_logical_block_size(zram->disk->queue,
+ ZRAM_LOGICAL_BLOCK_SIZE);
+ blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
+ blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
+ zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
+ zram->disk->queue->limits.max_discard_sectors = UINT_MAX;
+ /*
+ * zram_bio_discard() will clear all logical blocks if logical block
+ * size is identical with physical block size(PAGE_SIZE). But if it is
+ * different, we will skip discarding some parts of logical blocks in
+ * the part of the request range which isn't aligned to physical block
+ * size. So we can't ensure that all discarded logical blocks are
+ * zeroed.
+ */
+ if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
+ zram->disk->queue->limits.discard_zeroes_data = 1;
+ else
+ zram->disk->queue->limits.discard_zeroes_data = 0;
+ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, zram->disk->queue);
+
+ add_disk(zram->disk);
+
+ ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
+ &zram_disk_attr_group);
+ if (ret < 0) {
+ pr_warn("Error creating sysfs group");
+ goto out_free_disk;
+ }
+ strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor));
+ zram->meta = NULL;
+ zram->max_comp_streams = 1;
+ return 0;
+
+out_free_disk:
+ del_gendisk(zram->disk);
+ put_disk(zram->disk);
+out_free_queue:
+ blk_cleanup_queue(queue);
+out:
+ return ret;
+}
+
+static void destroy_devices(unsigned int nr)
+{
+ struct zram *zram;
+ unsigned int i;
+
+ for (i = 0; i < nr; i++) {
+ zram = &zram_devices[i];
+ /*
+ * Remove sysfs first, so no one will perform a disksize
+ * store while we destroy the devices
+ */
+ sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
+ &zram_disk_attr_group);
+
+ zram_reset_device(zram);
+
+ blk_cleanup_queue(zram->disk->queue);
+ del_gendisk(zram->disk);
+ put_disk(zram->disk);
+ }
+
+ kfree(zram_devices);
+ unregister_blkdev(zram_major, "zram");
+ pr_info("Destroyed %u device(s)\n", nr);
+}
+
+static int __init zram_init(void)
+{
+ int ret, dev_id;
+
+ if (num_devices > max_num_devices) {
+ pr_warn("Invalid value for num_devices: %u\n",
+ num_devices);
+ return -EINVAL;
+ }
+
+ zram_major = register_blkdev(0, "zram");
+ if (zram_major <= 0) {
+ pr_warn("Unable to get major number\n");
+ return -EBUSY;
+ }
+
+ /* Allocate the device array and initialize each one */
+ zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL);
+ if (!zram_devices) {
+ unregister_blkdev(zram_major, "zram");
+ return -ENOMEM;
+ }
+
+ for (dev_id = 0; dev_id < num_devices; dev_id++) {
+ ret = create_device(&zram_devices[dev_id], dev_id);
+ if (ret)
+ goto out_error;
+ }
+
+ pr_info("Created %u device(s)\n", num_devices);
+ return 0;
+
+out_error:
+ destroy_devices(dev_id);
+ return ret;
+}
+
+static void __exit zram_exit(void)
+{
+ destroy_devices(num_devices);
+}
+
+module_init(zram_init);
+module_exit(zram_exit);
+
+module_param(num_devices, uint, 0);
+MODULE_PARM_DESC(num_devices, "Number of zram devices");
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
+MODULE_DESCRIPTION("Compressed RAM Block Device");
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
new file mode 100644
index 000000000..570c598f4
--- /dev/null
+++ b/drivers/block/zram/zram_drv.h
@@ -0,0 +1,125 @@
+/*
+ * Compressed RAM block device
+ *
+ * Copyright (C) 2008, 2009, 2010 Nitin Gupta
+ * 2012, 2013 Minchan Kim
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ *
+ */
+
+#ifndef _ZRAM_DRV_H_
+#define _ZRAM_DRV_H_
+
+#include <linux/spinlock.h>
+#include <linux/zsmalloc.h>
+
+#include "zcomp.h"
+
+/*
+ * Some arbitrary value. This is just to catch
+ * invalid value for num_devices module parameter.
+ */
+static const unsigned max_num_devices = 32;
+
+/*-- Configurable parameters */
+
+/*
+ * Pages that compress to size greater than this are stored
+ * uncompressed in memory.
+ */
+static const size_t max_zpage_size = PAGE_SIZE / 4 * 3;
+
+/*
+ * NOTE: max_zpage_size must be less than or equal to:
+ * ZS_MAX_ALLOC_SIZE. Otherwise, zs_malloc() would
+ * always return failure.
+ */
+
+/*-- End of configurable params */
+
+#define SECTOR_SHIFT 9
+#define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
+#define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT)
+#define ZRAM_LOGICAL_BLOCK_SHIFT 12
+#define ZRAM_LOGICAL_BLOCK_SIZE (1 << ZRAM_LOGICAL_BLOCK_SHIFT)
+#define ZRAM_SECTOR_PER_LOGICAL_BLOCK \
+ (1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT))
+
+
+/*
+ * The lower ZRAM_FLAG_SHIFT bits of table.value is for
+ * object size (excluding header), the higher bits is for
+ * zram_pageflags.
+ *
+ * zram is mainly used for memory efficiency so we want to keep memory
+ * footprint small so we can squeeze size and flags into a field.
+ * The lower ZRAM_FLAG_SHIFT bits is for object size (excluding header),
+ * the higher bits is for zram_pageflags.
+ */
+#define ZRAM_FLAG_SHIFT 24
+
+/* Flags for zram pages (table[page_no].value) */
+enum zram_pageflags {
+ /* Page consists entirely of zeros */
+ ZRAM_ZERO = ZRAM_FLAG_SHIFT,
+ ZRAM_ACCESS, /* page is now accessed */
+
+ __NR_ZRAM_PAGEFLAGS,
+};
+
+/*-- Data structures */
+
+/* Allocated for each disk page */
+struct zram_table_entry {
+ unsigned long handle;
+ unsigned long value;
+};
+
+struct zram_stats {
+ atomic64_t compr_data_size; /* compressed size of pages stored */
+ atomic64_t num_reads; /* failed + successful */
+ atomic64_t num_writes; /* --do-- */
+ atomic64_t num_migrated; /* no. of migrated object */
+ atomic64_t failed_reads; /* can happen when memory is too low */
+ atomic64_t failed_writes; /* can happen when memory is too low */
+ atomic64_t invalid_io; /* non-page-aligned I/O requests */
+ atomic64_t notify_free; /* no. of swap slot free notifications */
+ atomic64_t zero_pages; /* no. of zero filled pages */
+ atomic64_t pages_stored; /* no. of pages currently stored */
+ atomic_long_t max_used_pages; /* no. of maximum pages stored */
+};
+
+struct zram_meta {
+ struct zram_table_entry *table;
+ struct zs_pool *mem_pool;
+};
+
+struct zram {
+ struct zram_meta *meta;
+ struct zcomp *comp;
+ struct gendisk *disk;
+ /* Prevent concurrent execution of device init */
+ struct rw_semaphore init_lock;
+ /*
+ * the number of pages zram can consume for storing compressed data
+ */
+ unsigned long limit_pages;
+ int max_comp_streams;
+
+ struct zram_stats stats;
+ atomic_t refcount; /* refcount for zram_meta */
+ /* wait all IO under all of cpu are done */
+ wait_queue_head_t io_done;
+ /*
+ * This is the limit on amount of *uncompressed* worth of data
+ * we can store in a disk.
+ */
+ u64 disksize; /* bytes */
+ char compressor[10];
+};
+#endif