diff options
Diffstat (limited to 'drivers/block/drbd/drbd_state.c')
-rw-r--r-- | drivers/block/drbd/drbd_state.c | 428 |
1 files changed, 416 insertions, 12 deletions
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 2d7dd269b..5a7ef7873 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -29,6 +29,7 @@ #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" +#include "drbd_state_change.h" struct after_state_chg_work { struct drbd_work w; @@ -37,6 +38,7 @@ struct after_state_chg_work { union drbd_state ns; enum chg_state_flags flags; struct completion *done; + struct drbd_state_change *state_change; }; enum sanitize_state_warnings { @@ -48,9 +50,248 @@ enum sanitize_state_warnings { IMPLICITLY_UPGRADED_PDSK, }; +static void count_objects(struct drbd_resource *resource, + unsigned int *n_devices, + unsigned int *n_connections) +{ + struct drbd_device *device; + struct drbd_connection *connection; + int vnr; + + *n_devices = 0; + *n_connections = 0; + + idr_for_each_entry(&resource->devices, device, vnr) + (*n_devices)++; + for_each_connection(connection, resource) + (*n_connections)++; +} + +static struct drbd_state_change *alloc_state_change(unsigned int n_devices, unsigned int n_connections, gfp_t gfp) +{ + struct drbd_state_change *state_change; + unsigned int size, n; + + size = sizeof(struct drbd_state_change) + + n_devices * sizeof(struct drbd_device_state_change) + + n_connections * sizeof(struct drbd_connection_state_change) + + n_devices * n_connections * sizeof(struct drbd_peer_device_state_change); + state_change = kmalloc(size, gfp); + if (!state_change) + return NULL; + state_change->n_devices = n_devices; + state_change->n_connections = n_connections; + state_change->devices = (void *)(state_change + 1); + state_change->connections = (void *)&state_change->devices[n_devices]; + state_change->peer_devices = (void *)&state_change->connections[n_connections]; + state_change->resource->resource = NULL; + for (n = 0; n < n_devices; n++) + state_change->devices[n].device = NULL; + for (n = 0; n < n_connections; n++) + 
state_change->connections[n].connection = NULL; + return state_change; +} + +struct drbd_state_change *remember_old_state(struct drbd_resource *resource, gfp_t gfp) +{ + struct drbd_state_change *state_change; + struct drbd_device *device; + unsigned int n_devices; + struct drbd_connection *connection; + unsigned int n_connections; + int vnr; + + struct drbd_device_state_change *device_state_change; + struct drbd_peer_device_state_change *peer_device_state_change; + struct drbd_connection_state_change *connection_state_change; + + /* Caller holds req_lock spinlock. + * No state, no device IDR, no connections lists can change. */ + count_objects(resource, &n_devices, &n_connections); + state_change = alloc_state_change(n_devices, n_connections, gfp); + if (!state_change) + return NULL; + + kref_get(&resource->kref); + state_change->resource->resource = resource; + state_change->resource->role[OLD] = + conn_highest_role(first_connection(resource)); + state_change->resource->susp[OLD] = resource->susp; + state_change->resource->susp_nod[OLD] = resource->susp_nod; + state_change->resource->susp_fen[OLD] = resource->susp_fen; + + connection_state_change = state_change->connections; + for_each_connection(connection, resource) { + kref_get(&connection->kref); + connection_state_change->connection = connection; + connection_state_change->cstate[OLD] = + connection->cstate; + connection_state_change->peer_role[OLD] = + conn_highest_peer(connection); + connection_state_change++; + } + + device_state_change = state_change->devices; + peer_device_state_change = state_change->peer_devices; + idr_for_each_entry(&resource->devices, device, vnr) { + kref_get(&device->kref); + device_state_change->device = device; + device_state_change->disk_state[OLD] = device->state.disk; + + /* The peer_devices for each device have to be enumerated in + the order of the connections. We may not use for_each_peer_device() here. 
*/ + for_each_connection(connection, resource) { + struct drbd_peer_device *peer_device; + + peer_device = conn_peer_device(connection, device->vnr); + peer_device_state_change->peer_device = peer_device; + peer_device_state_change->disk_state[OLD] = + device->state.pdsk; + peer_device_state_change->repl_state[OLD] = + max_t(enum drbd_conns, + C_WF_REPORT_PARAMS, device->state.conn); + peer_device_state_change->resync_susp_user[OLD] = + device->state.user_isp; + peer_device_state_change->resync_susp_peer[OLD] = + device->state.peer_isp; + peer_device_state_change->resync_susp_dependency[OLD] = + device->state.aftr_isp; + peer_device_state_change++; + } + device_state_change++; + } + + return state_change; +} + +static void remember_new_state(struct drbd_state_change *state_change) +{ + struct drbd_resource_state_change *resource_state_change; + struct drbd_resource *resource; + unsigned int n; + + if (!state_change) + return; + + resource_state_change = &state_change->resource[0]; + resource = resource_state_change->resource; + + resource_state_change->role[NEW] = + conn_highest_role(first_connection(resource)); + resource_state_change->susp[NEW] = resource->susp; + resource_state_change->susp_nod[NEW] = resource->susp_nod; + resource_state_change->susp_fen[NEW] = resource->susp_fen; + + for (n = 0; n < state_change->n_devices; n++) { + struct drbd_device_state_change *device_state_change = + &state_change->devices[n]; + struct drbd_device *device = device_state_change->device; + + device_state_change->disk_state[NEW] = device->state.disk; + } + + for (n = 0; n < state_change->n_connections; n++) { + struct drbd_connection_state_change *connection_state_change = + &state_change->connections[n]; + struct drbd_connection *connection = + connection_state_change->connection; + + connection_state_change->cstate[NEW] = connection->cstate; + connection_state_change->peer_role[NEW] = + conn_highest_peer(connection); + } + + for (n = 0; n < state_change->n_devices * 
state_change->n_connections; n++) { + struct drbd_peer_device_state_change *peer_device_state_change = + &state_change->peer_devices[n]; + struct drbd_device *device = + peer_device_state_change->peer_device->device; + union drbd_dev_state state = device->state; + + peer_device_state_change->disk_state[NEW] = state.pdsk; + peer_device_state_change->repl_state[NEW] = + max_t(enum drbd_conns, C_WF_REPORT_PARAMS, state.conn); + peer_device_state_change->resync_susp_user[NEW] = + state.user_isp; + peer_device_state_change->resync_susp_peer[NEW] = + state.peer_isp; + peer_device_state_change->resync_susp_dependency[NEW] = + state.aftr_isp; + } +} + +void copy_old_to_new_state_change(struct drbd_state_change *state_change) +{ + struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; + unsigned int n_device, n_connection, n_peer_device, n_peer_devices; + +#define OLD_TO_NEW(x) \ + (x[NEW] = x[OLD]) + + OLD_TO_NEW(resource_state_change->role); + OLD_TO_NEW(resource_state_change->susp); + OLD_TO_NEW(resource_state_change->susp_nod); + OLD_TO_NEW(resource_state_change->susp_fen); + + for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) { + struct drbd_connection_state_change *connection_state_change = + &state_change->connections[n_connection]; + + OLD_TO_NEW(connection_state_change->peer_role); + OLD_TO_NEW(connection_state_change->cstate); + } + + for (n_device = 0; n_device < state_change->n_devices; n_device++) { + struct drbd_device_state_change *device_state_change = + &state_change->devices[n_device]; + + OLD_TO_NEW(device_state_change->disk_state); + } + + n_peer_devices = state_change->n_devices * state_change->n_connections; + for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) { + struct drbd_peer_device_state_change *p = + &state_change->peer_devices[n_peer_device]; + + OLD_TO_NEW(p->disk_state); + OLD_TO_NEW(p->repl_state); + OLD_TO_NEW(p->resync_susp_user); + 
OLD_TO_NEW(p->resync_susp_peer); + OLD_TO_NEW(p->resync_susp_dependency); + } + +#undef OLD_TO_NEW +} + +void forget_state_change(struct drbd_state_change *state_change) +{ + unsigned int n; + + if (!state_change) + return; + + if (state_change->resource->resource) + kref_put(&state_change->resource->resource->kref, drbd_destroy_resource); + for (n = 0; n < state_change->n_devices; n++) { + struct drbd_device *device = state_change->devices[n].device; + + if (device) + kref_put(&device->kref, drbd_destroy_device); + } + for (n = 0; n < state_change->n_connections; n++) { + struct drbd_connection *connection = + state_change->connections[n].connection; + + if (connection) + kref_put(&connection->kref, drbd_destroy_connection); + } + kfree(state_change); +} + static int w_after_state_ch(struct drbd_work *w, int unused); static void after_state_ch(struct drbd_device *device, union drbd_state os, - union drbd_state ns, enum chg_state_flags flags); + union drbd_state ns, enum chg_state_flags flags, + struct drbd_state_change *); static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state); static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *); static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); @@ -93,6 +334,7 @@ static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2) return R_SECONDARY; return R_UNKNOWN; } + static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2) { if (role1 == R_UNKNOWN || role2 == R_UNKNOWN) @@ -937,7 +1179,7 @@ void drbd_resume_al(struct drbd_device *device) drbd_info(device, "Resumed AL updates\n"); } -/* helper for __drbd_set_state */ +/* helper for _drbd_set_state */ static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) { if (first_peer_device(device)->connection->agreed_pro_version < 90) @@ -965,17 +1207,17 @@ static void set_ov_position(struct drbd_device *device, enum 
drbd_conns cs) } /** - * __drbd_set_state() - Set a new DRBD state + * _drbd_set_state() - Set a new DRBD state * @device: DRBD device. * @ns: new state. * @flags: Flags * @done: Optional completion, that will get completed after the after_state_ch() finished * - * Caller needs to hold req_lock, and global_state_lock. Do not call directly. + * Caller needs to hold req_lock. Do not call directly. */ enum drbd_state_rv -__drbd_set_state(struct drbd_device *device, union drbd_state ns, - enum chg_state_flags flags, struct completion *done) +_drbd_set_state(struct drbd_device *device, union drbd_state ns, + enum chg_state_flags flags, struct completion *done) { struct drbd_peer_device *peer_device = first_peer_device(device); struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; @@ -983,6 +1225,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, enum drbd_state_rv rv = SS_SUCCESS; enum sanitize_state_warnings ssw; struct after_state_chg_work *ascw; + struct drbd_state_change *state_change; os = drbd_read_state(device); @@ -1037,6 +1280,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, if (!is_sync_state(os.conn) && is_sync_state(ns.conn)) clear_bit(RS_DONE, &device->flags); + /* FIXME: Have any flags been set earlier in this function already? */ + state_change = remember_old_state(device->resource, GFP_ATOMIC); + /* changes to local_cnt and device flags should be visible before * changes to state, which again should be visible before anything else * depending on that change happens. 
*/ @@ -1047,6 +1293,8 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, device->resource->susp_fen = ns.susp_fen; smp_wmb(); + remember_new_state(state_change); + /* put replicated vs not-replicated requests in seperate epochs */ if (drbd_should_do_remote((union drbd_dev_state)os.i) != drbd_should_do_remote((union drbd_dev_state)ns.i)) @@ -1184,6 +1432,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, ascw->w.cb = w_after_state_ch; ascw->device = device; ascw->done = done; + ascw->state_change = state_change; drbd_queue_work(&connection->sender_work, &ascw->w); } else { @@ -1199,7 +1448,8 @@ static int w_after_state_ch(struct drbd_work *w, int unused) container_of(w, struct after_state_chg_work, w); struct drbd_device *device = ascw->device; - after_state_ch(device, ascw->os, ascw->ns, ascw->flags); + after_state_ch(device, ascw->os, ascw->ns, ascw->flags, ascw->state_change); + forget_state_change(ascw->state_change); if (ascw->flags & CS_WAIT_COMPLETE) complete(ascw->done); kfree(ascw); @@ -1234,7 +1484,7 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, D_ASSERT(device, current == first_peer_device(device)->connection->worker.task); /* open coded non-blocking drbd_suspend_io(device); */ - set_bit(SUSPEND_IO, &device->flags); + atomic_inc(&device->suspend_cnt); drbd_bm_lock(device, why, flags); rv = io_fn(device); @@ -1245,6 +1495,139 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, return rv; } +void notify_resource_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_resource_state_change *resource_state_change, + enum drbd_notification_type type) +{ + struct drbd_resource *resource = resource_state_change->resource; + struct resource_info resource_info = { + .res_role = resource_state_change->role[NEW], + .res_susp = resource_state_change->susp[NEW], + .res_susp_nod = resource_state_change->susp_nod[NEW], + .res_susp_fen = resource_state_change->susp_fen[NEW], + }; + + 
notify_resource_state(skb, seq, resource, &resource_info, type); +} + +void notify_connection_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_connection_state_change *connection_state_change, + enum drbd_notification_type type) +{ + struct drbd_connection *connection = connection_state_change->connection; + struct connection_info connection_info = { + .conn_connection_state = connection_state_change->cstate[NEW], + .conn_role = connection_state_change->peer_role[NEW], + }; + + notify_connection_state(skb, seq, connection, &connection_info, type); +} + +void notify_device_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_device_state_change *device_state_change, + enum drbd_notification_type type) +{ + struct drbd_device *device = device_state_change->device; + struct device_info device_info = { + .dev_disk_state = device_state_change->disk_state[NEW], + }; + + notify_device_state(skb, seq, device, &device_info, type); +} + +void notify_peer_device_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_peer_device_state_change *p, + enum drbd_notification_type type) +{ + struct drbd_peer_device *peer_device = p->peer_device; + struct peer_device_info peer_device_info = { + .peer_repl_state = p->repl_state[NEW], + .peer_disk_state = p->disk_state[NEW], + .peer_resync_susp_user = p->resync_susp_user[NEW], + .peer_resync_susp_peer = p->resync_susp_peer[NEW], + .peer_resync_susp_dependency = p->resync_susp_dependency[NEW], + }; + + notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type); +} + +static void broadcast_state_change(struct drbd_state_change *state_change) +{ + struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; + bool resource_state_has_changed; + unsigned int n_device, n_connection, n_peer_device, n_peer_devices; + void (*last_func)(struct sk_buff *, unsigned int, void *, + enum drbd_notification_type) = NULL; + void *uninitialized_var(last_arg); + 
+#define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW]) +#define FINAL_STATE_CHANGE(type) \ + ({ if (last_func) \ + last_func(NULL, 0, last_arg, type); \ + }) +#define REMEMBER_STATE_CHANGE(func, arg, type) \ + ({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \ + last_func = (typeof(last_func))func; \ + last_arg = arg; \ + }) + + mutex_lock(&notification_mutex); + + resource_state_has_changed = + HAS_CHANGED(resource_state_change->role) || + HAS_CHANGED(resource_state_change->susp) || + HAS_CHANGED(resource_state_change->susp_nod) || + HAS_CHANGED(resource_state_change->susp_fen); + + if (resource_state_has_changed) + REMEMBER_STATE_CHANGE(notify_resource_state_change, + resource_state_change, NOTIFY_CHANGE); + + for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) { + struct drbd_connection_state_change *connection_state_change = + &state_change->connections[n_connection]; + + if (HAS_CHANGED(connection_state_change->peer_role) || + HAS_CHANGED(connection_state_change->cstate)) + REMEMBER_STATE_CHANGE(notify_connection_state_change, + connection_state_change, NOTIFY_CHANGE); + } + + for (n_device = 0; n_device < state_change->n_devices; n_device++) { + struct drbd_device_state_change *device_state_change = + &state_change->devices[n_device]; + + if (HAS_CHANGED(device_state_change->disk_state)) + REMEMBER_STATE_CHANGE(notify_device_state_change, + device_state_change, NOTIFY_CHANGE); + } + + n_peer_devices = state_change->n_devices * state_change->n_connections; + for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) { + struct drbd_peer_device_state_change *p = + &state_change->peer_devices[n_peer_device]; + + if (HAS_CHANGED(p->disk_state) || + HAS_CHANGED(p->repl_state) || + HAS_CHANGED(p->resync_susp_user) || + HAS_CHANGED(p->resync_susp_peer) || + HAS_CHANGED(p->resync_susp_dependency)) + REMEMBER_STATE_CHANGE(notify_peer_device_state_change, + p, NOTIFY_CHANGE); + } + + FINAL_STATE_CHANGE(NOTIFY_CHANGE); + 
mutex_unlock(&notification_mutex); + +#undef HAS_CHANGED +#undef FINAL_STATE_CHANGE +#undef REMEMBER_STATE_CHANGE +} + /** * after_state_ch() - Perform after state change actions that may sleep * @device: DRBD device. @@ -1253,13 +1636,16 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, * @flags: Flags */ static void after_state_ch(struct drbd_device *device, union drbd_state os, - union drbd_state ns, enum chg_state_flags flags) + union drbd_state ns, enum chg_state_flags flags, + struct drbd_state_change *state_change) { struct drbd_resource *resource = device->resource; struct drbd_peer_device *peer_device = first_peer_device(device); struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; struct sib_info sib; + broadcast_state_change(state_change); + sib.sib_reason = SIB_STATE_CHANGE; sib.os = os; sib.ns = ns; @@ -1377,7 +1763,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, } if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) { - if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && + if (os.peer != R_PRIMARY && ns.peer == R_PRIMARY && device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { drbd_uuid_new_current(device); drbd_send_uuids(peer_device); @@ -1444,7 +1830,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, if (os.disk != D_FAILED && ns.disk == D_FAILED) { enum drbd_io_error_p eh = EP_PASS_ON; int was_io_error = 0; - /* corresponding get_ldev was in __drbd_set_state, to serialize + /* corresponding get_ldev was in _drbd_set_state, to serialize * our cleanup here with the transition to D_DISKLESS. * But is is still not save to dreference ldev here, since * we might come from an failed Attach before ldev was set. 
*/ @@ -1455,6 +1841,10 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags); + /* Intentionally call this handler first, before drbd_send_state(). + * See: 2932204 drbd: call local-io-error handler early + * People may chose to hard-reset the box from this handler. + * It is useful if this looks like a "regular node crash". */ if (was_io_error && eh == EP_CALL_HELPER) drbd_khelper(device, "local-io-error"); @@ -1572,6 +1962,7 @@ struct after_conn_state_chg_work { union drbd_state ns_max; /* new, max state, over all devices */ enum chg_state_flags flags; struct drbd_connection *connection; + struct drbd_state_change *state_change; }; static int w_after_conn_state_ch(struct drbd_work *w, int unused) @@ -1584,6 +1975,8 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) struct drbd_peer_device *peer_device; int vnr; + broadcast_state_change(acscw->state_change); + forget_state_change(acscw->state_change); kfree(acscw); /* Upon network configuration, we need to start the receiver */ @@ -1593,6 +1986,13 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) { struct net_conf *old_conf; + mutex_lock(&notification_mutex); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + notify_peer_device_state(NULL, 0, peer_device, NULL, + NOTIFY_DESTROY | NOTIFY_CONTINUES); + notify_connection_state(NULL, 0, connection, NULL, NOTIFY_DESTROY); + mutex_unlock(&notification_mutex); + mutex_lock(&connection->resource->conf_update); old_conf = connection->net_conf; connection->my_addr_len = 0; @@ -1759,7 +2159,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) ns.disk = os.disk; - rv = __drbd_set_state(device, ns, flags, NULL); + rv = _drbd_set_state(device, ns, flags, NULL); if (rv < 
SS_SUCCESS) BUG(); @@ -1823,6 +2223,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u enum drbd_conns oc = connection->cstate; union drbd_state ns_max, ns_min, os; bool have_mutex = false; + struct drbd_state_change *state_change; if (mask.conn) { rv = is_valid_conn_transition(oc, val.conn); @@ -1868,10 +2269,12 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u goto abort; } + state_change = remember_old_state(connection->resource, GFP_ATOMIC); conn_old_common_state(connection, &os, &flags); flags |= CS_DC_SUSP; conn_set_state(connection, mask, val, &ns_min, &ns_max, flags); conn_pr_state_change(connection, os, ns_max, flags); + remember_new_state(state_change); acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC); if (acscw) { @@ -1882,6 +2285,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u acscw->w.cb = w_after_conn_state_ch; kref_get(&connection->kref); acscw->connection = connection; + acscw->state_change = state_change; drbd_queue_work(&connection->sender_work, &acscw->w); } else { drbd_err(connection, "Could not kmalloc an acscw\n"); |