From 57f0f512b273f60d52568b8c6b77e17f5636edc0 Mon Sep 17 00:00:00 2001 From: André Fabian Silva Delgado Date: Wed, 5 Aug 2015 17:04:01 -0300 Subject: Initial import --- net/ceph/Kconfig | 44 + net/ceph/Makefile | 15 + net/ceph/armor.c | 105 ++ net/ceph/auth.c | 340 ++++ net/ceph/auth_none.c | 137 ++ net/ceph/auth_none.h | 29 + net/ceph/auth_x.c | 782 +++++++++ net/ceph/auth_x.h | 52 + net/ceph/auth_x_protocol.h | 90 ++ net/ceph/buffer.c | 58 + net/ceph/ceph_common.c | 725 +++++++++ net/ceph/ceph_fs.c | 78 + net/ceph/ceph_hash.c | 121 ++ net/ceph/ceph_strings.c | 44 + net/ceph/crush/crush.c | 143 ++ net/ceph/crush/crush_ln_table.h | 166 ++ net/ceph/crush/hash.c | 149 ++ net/ceph/crush/mapper.c | 927 +++++++++++ net/ceph/crypto.c | 583 +++++++ net/ceph/crypto.h | 51 + net/ceph/debugfs.c | 309 ++++ net/ceph/messenger.c | 3395 +++++++++++++++++++++++++++++++++++++++ net/ceph/mon_client.c | 1121 +++++++++++++ net/ceph/msgpool.c | 83 + net/ceph/osd_client.c | 3008 ++++++++++++++++++++++++++++++++++ net/ceph/osdmap.c | 1754 ++++++++++++++++++++ net/ceph/pagelist.c | 150 ++ net/ceph/pagevec.c | 202 +++ net/ceph/snapshot.c | 78 + 29 files changed, 14739 insertions(+) create mode 100644 net/ceph/Kconfig create mode 100644 net/ceph/Makefile create mode 100644 net/ceph/armor.c create mode 100644 net/ceph/auth.c create mode 100644 net/ceph/auth_none.c create mode 100644 net/ceph/auth_none.h create mode 100644 net/ceph/auth_x.c create mode 100644 net/ceph/auth_x.h create mode 100644 net/ceph/auth_x_protocol.h create mode 100644 net/ceph/buffer.c create mode 100644 net/ceph/ceph_common.c create mode 100644 net/ceph/ceph_fs.c create mode 100644 net/ceph/ceph_hash.c create mode 100644 net/ceph/ceph_strings.c create mode 100644 net/ceph/crush/crush.c create mode 100644 net/ceph/crush/crush_ln_table.h create mode 100644 net/ceph/crush/hash.c create mode 100644 net/ceph/crush/mapper.c create mode 100644 net/ceph/crypto.c create mode 100644 net/ceph/crypto.h create mode 100644 
net/ceph/debugfs.c create mode 100644 net/ceph/messenger.c create mode 100644 net/ceph/mon_client.c create mode 100644 net/ceph/msgpool.c create mode 100644 net/ceph/osd_client.c create mode 100644 net/ceph/osdmap.c create mode 100644 net/ceph/pagelist.c create mode 100644 net/ceph/pagevec.c create mode 100644 net/ceph/snapshot.c (limited to 'net/ceph') diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig new file mode 100644 index 000000000..f8cceb99e --- /dev/null +++ b/net/ceph/Kconfig @@ -0,0 +1,44 @@ +config CEPH_LIB + tristate "Ceph core library" + depends on INET + select LIBCRC32C + select CRYPTO_AES + select CRYPTO_CBC + select CRYPTO + select KEYS + default n + help + Choose Y or M here to include cephlib, which provides the + common functionality to both the Ceph filesystem and + to the rados block device (rbd). + + More information at http://ceph.newdream.net/. + + If unsure, say N. + +config CEPH_LIB_PRETTYDEBUG + bool "Include file:line in ceph debug output" + depends on CEPH_LIB + default n + help + If you say Y here, debug output will include a filename and + line to aid debugging. This increases kernel size and slows + execution slightly when debug call sites are enabled (e.g., + via CONFIG_DYNAMIC_DEBUG). + + If unsure, say N. + +config CEPH_LIB_USE_DNS_RESOLVER + bool "Use in-kernel support for DNS lookup" + depends on CEPH_LIB + select DNS_RESOLVER + default n + help + If you say Y here, hostnames (e.g. monitor addresses) will + be resolved using the CONFIG_DNS_RESOLVER facility. + + For information on how to use CONFIG_DNS_RESOLVER consult + Documentation/networking/dns_resolver.txt + + If unsure, say N. + diff --git a/net/ceph/Makefile b/net/ceph/Makefile new file mode 100644 index 000000000..958d98569 --- /dev/null +++ b/net/ceph/Makefile @@ -0,0 +1,15 @@ +# +# Makefile for CEPH filesystem. 
+# +obj-$(CONFIG_CEPH_LIB) += libceph.o + +libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ + mon_client.o \ + osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \ + debugfs.o \ + auth.o auth_none.o \ + crypto.o armor.o \ + auth_x.o \ + ceph_fs.o ceph_strings.o ceph_hash.o \ + pagevec.o snapshot.o + diff --git a/net/ceph/armor.c b/net/ceph/armor.c new file mode 100644 index 000000000..1fc1ee11d --- /dev/null +++ b/net/ceph/armor.c @@ -0,0 +1,105 @@ + +#include + +int ceph_armor(char *dst, const char *src, const char *end); +int ceph_unarmor(char *dst, const char *src, const char *end); + +/* + * base64 encode/decode. + */ + +static const char *pem_key = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +static int encode_bits(int c) +{ + return pem_key[c]; +} + +static int decode_bits(char c) +{ + if (c >= 'A' && c <= 'Z') + return c - 'A'; + if (c >= 'a' && c <= 'z') + return c - 'a' + 26; + if (c >= '0' && c <= '9') + return c - '0' + 52; + if (c == '+') + return 62; + if (c == '/') + return 63; + if (c == '=') + return 0; /* just non-negative, please */ + return -EINVAL; +} + +int ceph_armor(char *dst, const char *src, const char *end) +{ + int olen = 0; + int line = 0; + + while (src < end) { + unsigned char a, b, c; + + a = *src++; + *dst++ = encode_bits(a >> 2); + if (src < end) { + b = *src++; + *dst++ = encode_bits(((a & 3) << 4) | (b >> 4)); + if (src < end) { + c = *src++; + *dst++ = encode_bits(((b & 15) << 2) | + (c >> 6)); + *dst++ = encode_bits(c & 63); + } else { + *dst++ = encode_bits((b & 15) << 2); + *dst++ = '='; + } + } else { + *dst++ = encode_bits(((a & 3) << 4)); + *dst++ = '='; + *dst++ = '='; + } + olen += 4; + line += 4; + if (line == 64) { + line = 0; + *(dst++) = '\n'; + olen++; + } + } + return olen; +} + +int ceph_unarmor(char *dst, const char *src, const char *end) +{ + int olen = 0; + + while (src < end) { + int a, b, c, d; + + if (src[0] == '\n') { + src++; + continue; + } + 
if (src + 4 > end) + return -EINVAL; + a = decode_bits(src[0]); + b = decode_bits(src[1]); + c = decode_bits(src[2]); + d = decode_bits(src[3]); + if (a < 0 || b < 0 || c < 0 || d < 0) + return -EINVAL; + + *dst++ = (a << 2) | (b >> 4); + if (src[2] == '=') + return olen + 1; + *dst++ = ((b & 15) << 4) | (c >> 2); + if (src[3] == '=') + return olen + 2; + *dst++ = ((c & 3) << 6) | d; + olen += 3; + src += 4; + } + return olen; +} diff --git a/net/ceph/auth.c b/net/ceph/auth.c new file mode 100644 index 000000000..6b923bcaa --- /dev/null +++ b/net/ceph/auth.c @@ -0,0 +1,340 @@ +#include + +#include +#include +#include + +#include +#include +#include +#include +#include "auth_none.h" +#include "auth_x.h" + + +/* + * get protocol handler + */ +static u32 supported_protocols[] = { + CEPH_AUTH_NONE, + CEPH_AUTH_CEPHX +}; + +static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) +{ + switch (protocol) { + case CEPH_AUTH_NONE: + return ceph_auth_none_init(ac); + case CEPH_AUTH_CEPHX: + return ceph_x_init(ac); + default: + return -ENOENT; + } +} + +/* + * setup, teardown. + */ +struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_crypto_key *key) +{ + struct ceph_auth_client *ac; + int ret; + + dout("auth_init name '%s'\n", name); + + ret = -ENOMEM; + ac = kzalloc(sizeof(*ac), GFP_NOFS); + if (!ac) + goto out; + + mutex_init(&ac->mutex); + ac->negotiating = true; + if (name) + ac->name = name; + else + ac->name = CEPH_AUTH_NAME_DEFAULT; + dout("auth_init name %s\n", ac->name); + ac->key = key; + return ac; + +out: + return ERR_PTR(ret); +} + +void ceph_auth_destroy(struct ceph_auth_client *ac) +{ + dout("auth_destroy %p\n", ac); + if (ac->ops) + ac->ops->destroy(ac); + kfree(ac); +} + +/* + * Reset occurs when reconnecting to the monitor. 
+ */ +void ceph_auth_reset(struct ceph_auth_client *ac) +{ + mutex_lock(&ac->mutex); + dout("auth_reset %p\n", ac); + if (ac->ops && !ac->negotiating) + ac->ops->reset(ac); + ac->negotiating = true; + mutex_unlock(&ac->mutex); +} + +int ceph_entity_name_encode(const char *name, void **p, void *end) +{ + int len = strlen(name); + + if (*p + 2*sizeof(u32) + len > end) + return -ERANGE; + ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT); + ceph_encode_32(p, len); + ceph_encode_copy(p, name, len); + return 0; +} + +/* + * Initiate protocol negotiation with monitor. Include entity name + * and list supported protocols. + */ +int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len) +{ + struct ceph_mon_request_header *monhdr = buf; + void *p = monhdr + 1, *end = buf + len, *lenp; + int i, num; + int ret; + + mutex_lock(&ac->mutex); + dout("auth_build_hello\n"); + monhdr->have_version = 0; + monhdr->session_mon = cpu_to_le16(-1); + monhdr->session_mon_tid = 0; + + ceph_encode_32(&p, 0); /* no protocol, yet */ + + lenp = p; + p += sizeof(u32); + + ceph_decode_need(&p, end, 1 + sizeof(u32), bad); + ceph_encode_8(&p, 1); + num = ARRAY_SIZE(supported_protocols); + ceph_encode_32(&p, num); + ceph_decode_need(&p, end, num * sizeof(u32), bad); + for (i = 0; i < num; i++) + ceph_encode_32(&p, supported_protocols[i]); + + ret = ceph_entity_name_encode(ac->name, &p, end); + if (ret < 0) + goto out; + ceph_decode_need(&p, end, sizeof(u64), bad); + ceph_encode_64(&p, ac->global_id); + + ceph_encode_32(&lenp, p - lenp - sizeof(u32)); + ret = p - buf; +out: + mutex_unlock(&ac->mutex); + return ret; + +bad: + ret = -ERANGE; + goto out; +} + +static int ceph_build_auth_request(struct ceph_auth_client *ac, + void *msg_buf, size_t msg_len) +{ + struct ceph_mon_request_header *monhdr = msg_buf; + void *p = monhdr + 1; + void *end = msg_buf + msg_len; + int ret; + + monhdr->have_version = 0; + monhdr->session_mon = cpu_to_le16(-1); + monhdr->session_mon_tid = 0; + + 
ceph_encode_32(&p, ac->protocol); + + ret = ac->ops->build_request(ac, p + sizeof(u32), end); + if (ret < 0) { + pr_err("error %d building auth method %s request\n", ret, + ac->ops->name); + goto out; + } + dout(" built request %d bytes\n", ret); + ceph_encode_32(&p, ret); + ret = p + ret - msg_buf; +out: + return ret; +} + +/* + * Handle auth message from monitor. + */ +int ceph_handle_auth_reply(struct ceph_auth_client *ac, + void *buf, size_t len, + void *reply_buf, size_t reply_len) +{ + void *p = buf; + void *end = buf + len; + int protocol; + s32 result; + u64 global_id; + void *payload, *payload_end; + int payload_len; + char *result_msg; + int result_msg_len; + int ret = -EINVAL; + + mutex_lock(&ac->mutex); + dout("handle_auth_reply %p %p\n", p, end); + ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad); + protocol = ceph_decode_32(&p); + result = ceph_decode_32(&p); + global_id = ceph_decode_64(&p); + payload_len = ceph_decode_32(&p); + payload = p; + p += payload_len; + ceph_decode_need(&p, end, sizeof(u32), bad); + result_msg_len = ceph_decode_32(&p); + result_msg = p; + p += result_msg_len; + if (p != end) + goto bad; + + dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len, + result_msg, global_id, payload_len); + + payload_end = payload + payload_len; + + if (global_id && ac->global_id != global_id) { + dout(" set global_id %lld -> %lld\n", ac->global_id, global_id); + ac->global_id = global_id; + } + + if (ac->negotiating) { + /* server does not support our protocols? */ + if (!protocol && result < 0) { + ret = result; + goto out; + } + /* set up (new) protocol handler? 
*/ + if (ac->protocol && ac->protocol != protocol) { + ac->ops->destroy(ac); + ac->protocol = 0; + ac->ops = NULL; + } + if (ac->protocol != protocol) { + ret = ceph_auth_init_protocol(ac, protocol); + if (ret) { + pr_err("error %d on auth protocol %d init\n", + ret, protocol); + goto out; + } + } + + ac->negotiating = false; + } + + ret = ac->ops->handle_reply(ac, result, payload, payload_end); + if (ret == -EAGAIN) { + ret = ceph_build_auth_request(ac, reply_buf, reply_len); + } else if (ret) { + pr_err("auth method '%s' error %d\n", ac->ops->name, ret); + } + +out: + mutex_unlock(&ac->mutex); + return ret; + +bad: + pr_err("failed to decode auth msg\n"); + ret = -EINVAL; + goto out; +} + +int ceph_build_auth(struct ceph_auth_client *ac, + void *msg_buf, size_t msg_len) +{ + int ret = 0; + + mutex_lock(&ac->mutex); + if (!ac->protocol) + ret = ceph_auth_build_hello(ac, msg_buf, msg_len); + else if (ac->ops->should_authenticate(ac)) + ret = ceph_build_auth_request(ac, msg_buf, msg_len); + mutex_unlock(&ac->mutex); + return ret; +} + +int ceph_auth_is_authenticated(struct ceph_auth_client *ac) +{ + int ret = 0; + + mutex_lock(&ac->mutex); + if (ac->ops) + ret = ac->ops->is_authenticated(ac); + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(ceph_auth_is_authenticated); + +int ceph_auth_create_authorizer(struct ceph_auth_client *ac, + int peer_type, + struct ceph_auth_handshake *auth) +{ + int ret = 0; + + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->create_authorizer) + ret = ac->ops->create_authorizer(ac, peer_type, auth); + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(ceph_auth_create_authorizer); + +void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac, + struct ceph_authorizer *a) +{ + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->destroy_authorizer) + ac->ops->destroy_authorizer(ac, a); + mutex_unlock(&ac->mutex); +} +EXPORT_SYMBOL(ceph_auth_destroy_authorizer); + +int ceph_auth_update_authorizer(struct ceph_auth_client 
*ac, + int peer_type, + struct ceph_auth_handshake *a) +{ + int ret = 0; + + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->update_authorizer) + ret = ac->ops->update_authorizer(ac, peer_type, a); + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(ceph_auth_update_authorizer); + +int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a, size_t len) +{ + int ret = 0; + + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->verify_authorizer_reply) + ret = ac->ops->verify_authorizer_reply(ac, a, len); + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(ceph_auth_verify_authorizer_reply); + +void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type) +{ + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->invalidate_authorizer) + ac->ops->invalidate_authorizer(ac, peer_type); + mutex_unlock(&ac->mutex); +} +EXPORT_SYMBOL(ceph_auth_invalidate_authorizer); diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c new file mode 100644 index 000000000..8c93fa8d8 --- /dev/null +++ b/net/ceph/auth_none.c @@ -0,0 +1,137 @@ + +#include + +#include +#include +#include +#include + +#include +#include + +#include "auth_none.h" + +static void reset(struct ceph_auth_client *ac) +{ + struct ceph_auth_none_info *xi = ac->private; + + xi->starting = true; + xi->built_authorizer = false; +} + +static void destroy(struct ceph_auth_client *ac) +{ + kfree(ac->private); + ac->private = NULL; +} + +static int is_authenticated(struct ceph_auth_client *ac) +{ + struct ceph_auth_none_info *xi = ac->private; + + return !xi->starting; +} + +static int should_authenticate(struct ceph_auth_client *ac) +{ + struct ceph_auth_none_info *xi = ac->private; + + return xi->starting; +} + +static int build_request(struct ceph_auth_client *ac, void *buf, void *end) +{ + return 0; +} + +/* + * the generic auth code decode the global_id, and we carry no actual + * authenticate state, so nothing happens here. 
+ */ +static int handle_reply(struct ceph_auth_client *ac, int result, + void *buf, void *end) +{ + struct ceph_auth_none_info *xi = ac->private; + + xi->starting = false; + return result; +} + +/* + * build an 'authorizer' with our entity_name and global_id. we can + * reuse a single static copy since it is identical for all services + * we connect to. + */ +static int ceph_auth_none_create_authorizer( + struct ceph_auth_client *ac, int peer_type, + struct ceph_auth_handshake *auth) +{ + struct ceph_auth_none_info *ai = ac->private; + struct ceph_none_authorizer *au = &ai->au; + void *p, *end; + int ret; + + if (!ai->built_authorizer) { + p = au->buf; + end = p + sizeof(au->buf); + ceph_encode_8(&p, 1); + ret = ceph_entity_name_encode(ac->name, &p, end - 8); + if (ret < 0) + goto bad; + ceph_decode_need(&p, end, sizeof(u64), bad2); + ceph_encode_64(&p, ac->global_id); + au->buf_len = p - (void *)au->buf; + ai->built_authorizer = true; + dout("built authorizer len %d\n", au->buf_len); + } + + auth->authorizer = (struct ceph_authorizer *) au; + auth->authorizer_buf = au->buf; + auth->authorizer_buf_len = au->buf_len; + auth->authorizer_reply_buf = au->reply_buf; + auth->authorizer_reply_buf_len = sizeof (au->reply_buf); + + return 0; + +bad2: + ret = -ERANGE; +bad: + return ret; +} + +static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac, + struct ceph_authorizer *a) +{ + /* nothing to do */ +} + +static const struct ceph_auth_client_ops ceph_auth_none_ops = { + .name = "none", + .reset = reset, + .destroy = destroy, + .is_authenticated = is_authenticated, + .should_authenticate = should_authenticate, + .build_request = build_request, + .handle_reply = handle_reply, + .create_authorizer = ceph_auth_none_create_authorizer, + .destroy_authorizer = ceph_auth_none_destroy_authorizer, +}; + +int ceph_auth_none_init(struct ceph_auth_client *ac) +{ + struct ceph_auth_none_info *xi; + + dout("ceph_auth_none_init %p\n", ac); + xi = kzalloc(sizeof(*xi), 
GFP_NOFS); + if (!xi) + return -ENOMEM; + + xi->starting = true; + xi->built_authorizer = false; + + ac->protocol = CEPH_AUTH_NONE; + ac->private = xi; + ac->ops = &ceph_auth_none_ops; + return 0; +} + diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h new file mode 100644 index 000000000..059a3ce4b --- /dev/null +++ b/net/ceph/auth_none.h @@ -0,0 +1,29 @@ +#ifndef _FS_CEPH_AUTH_NONE_H +#define _FS_CEPH_AUTH_NONE_H + +#include +#include + +/* + * null security mode. + * + * we use a single static authorizer that simply encodes our entity name + * and global id. + */ + +struct ceph_none_authorizer { + char buf[128]; + int buf_len; + char reply_buf[0]; +}; + +struct ceph_auth_none_info { + bool starting; + bool built_authorizer; + struct ceph_none_authorizer au; /* we only need one; it's static */ +}; + +int ceph_auth_none_init(struct ceph_auth_client *ac); + +#endif + diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c new file mode 100644 index 000000000..ba6eb1722 --- /dev/null +++ b/net/ceph/auth_x.c @@ -0,0 +1,782 @@ + +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include "crypto.h" +#include "auth_x.h" +#include "auth_x_protocol.h" + +static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); + +static int ceph_x_is_authenticated(struct ceph_auth_client *ac) +{ + struct ceph_x_info *xi = ac->private; + int need; + + ceph_x_validate_tickets(ac, &need); + dout("ceph_x_is_authenticated want=%d need=%d have=%d\n", + ac->want_keys, need, xi->have_keys); + return (ac->want_keys & xi->have_keys) == ac->want_keys; +} + +static int ceph_x_should_authenticate(struct ceph_auth_client *ac) +{ + struct ceph_x_info *xi = ac->private; + int need; + + ceph_x_validate_tickets(ac, &need); + dout("ceph_x_should_authenticate want=%d need=%d have=%d\n", + ac->want_keys, need, xi->have_keys); + return need != 0; +} + +static int ceph_x_encrypt_buflen(int ilen) +{ + return sizeof(struct ceph_x_encrypt_header) + ilen + 
16 + + sizeof(u32); +} + +static int ceph_x_encrypt(struct ceph_crypto_key *secret, + void *ibuf, int ilen, void *obuf, size_t olen) +{ + struct ceph_x_encrypt_header head = { + .struct_v = 1, + .magic = cpu_to_le64(CEPHX_ENC_MAGIC) + }; + size_t len = olen - sizeof(u32); + int ret; + + ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len, + &head, sizeof(head), ibuf, ilen); + if (ret) + return ret; + ceph_encode_32(&obuf, len); + return len + sizeof(u32); +} + +static int ceph_x_decrypt(struct ceph_crypto_key *secret, + void **p, void *end, void **obuf, size_t olen) +{ + struct ceph_x_encrypt_header head; + size_t head_len = sizeof(head); + int len, ret; + + len = ceph_decode_32(p); + if (*p + len > end) + return -EINVAL; + + dout("ceph_x_decrypt len %d\n", len); + if (*obuf == NULL) { + *obuf = kmalloc(len, GFP_NOFS); + if (!*obuf) + return -ENOMEM; + olen = len; + } + + ret = ceph_decrypt2(secret, &head, &head_len, *obuf, &olen, *p, len); + if (ret) + return ret; + if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC) + return -EPERM; + *p += len; + return olen; +} + +/* + * get existing (or insert new) ticket handler + */ +static struct ceph_x_ticket_handler * +get_ticket_handler(struct ceph_auth_client *ac, int service) +{ + struct ceph_x_ticket_handler *th; + struct ceph_x_info *xi = ac->private; + struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node; + + while (*p) { + parent = *p; + th = rb_entry(parent, struct ceph_x_ticket_handler, node); + if (service < th->service) + p = &(*p)->rb_left; + else if (service > th->service) + p = &(*p)->rb_right; + else + return th; + } + + /* add it */ + th = kzalloc(sizeof(*th), GFP_NOFS); + if (!th) + return ERR_PTR(-ENOMEM); + th->service = service; + rb_link_node(&th->node, parent, p); + rb_insert_color(&th->node, &xi->ticket_handlers); + return th; +} + +static void remove_ticket_handler(struct ceph_auth_client *ac, + struct ceph_x_ticket_handler *th) +{ + struct ceph_x_info *xi = 
ac->private; + + dout("remove_ticket_handler %p %d\n", th, th->service); + rb_erase(&th->node, &xi->ticket_handlers); + ceph_crypto_key_destroy(&th->session_key); + if (th->ticket_blob) + ceph_buffer_put(th->ticket_blob); + kfree(th); +} + +static int process_one_ticket(struct ceph_auth_client *ac, + struct ceph_crypto_key *secret, + void **p, void *end) +{ + struct ceph_x_info *xi = ac->private; + int type; + u8 tkt_struct_v, blob_struct_v; + struct ceph_x_ticket_handler *th; + void *dbuf = NULL; + void *dp, *dend; + int dlen; + char is_enc; + struct timespec validity; + struct ceph_crypto_key old_key; + void *ticket_buf = NULL; + void *tp, *tpend; + void **ptp; + struct ceph_timespec new_validity; + struct ceph_crypto_key new_session_key; + struct ceph_buffer *new_ticket_blob; + unsigned long new_expires, new_renew_after; + u64 new_secret_id; + int ret; + + ceph_decode_need(p, end, sizeof(u32) + 1, bad); + + type = ceph_decode_32(p); + dout(" ticket type %d %s\n", type, ceph_entity_type_name(type)); + + tkt_struct_v = ceph_decode_8(p); + if (tkt_struct_v != 1) + goto bad; + + th = get_ticket_handler(ac, type); + if (IS_ERR(th)) { + ret = PTR_ERR(th); + goto out; + } + + /* blob for me */ + dlen = ceph_x_decrypt(secret, p, end, &dbuf, 0); + if (dlen <= 0) { + ret = dlen; + goto out; + } + dout(" decrypted %d bytes\n", dlen); + dp = dbuf; + dend = dp + dlen; + + tkt_struct_v = ceph_decode_8(&dp); + if (tkt_struct_v != 1) + goto bad; + + memcpy(&old_key, &th->session_key, sizeof(old_key)); + ret = ceph_crypto_key_decode(&new_session_key, &dp, dend); + if (ret) + goto out; + + ceph_decode_copy(&dp, &new_validity, sizeof(new_validity)); + ceph_decode_timespec(&validity, &new_validity); + new_expires = get_seconds() + validity.tv_sec; + new_renew_after = new_expires - (validity.tv_sec / 4); + dout(" expires=%lu renew_after=%lu\n", new_expires, + new_renew_after); + + /* ticket blob for service */ + ceph_decode_8_safe(p, end, is_enc, bad); + if (is_enc) { + /* encrypted 
*/ + dout(" encrypted ticket\n"); + dlen = ceph_x_decrypt(&old_key, p, end, &ticket_buf, 0); + if (dlen < 0) { + ret = dlen; + goto out; + } + tp = ticket_buf; + ptp = &tp; + tpend = *ptp + dlen; + } else { + /* unencrypted */ + ptp = p; + tpend = end; + } + ceph_decode_32_safe(ptp, tpend, dlen, bad); + dout(" ticket blob is %d bytes\n", dlen); + ceph_decode_need(ptp, tpend, 1 + sizeof(u64), bad); + blob_struct_v = ceph_decode_8(ptp); + new_secret_id = ceph_decode_64(ptp); + ret = ceph_decode_buffer(&new_ticket_blob, ptp, tpend); + if (ret) + goto out; + + /* all is well, update our ticket */ + ceph_crypto_key_destroy(&th->session_key); + if (th->ticket_blob) + ceph_buffer_put(th->ticket_blob); + th->session_key = new_session_key; + th->ticket_blob = new_ticket_blob; + th->validity = new_validity; + th->secret_id = new_secret_id; + th->expires = new_expires; + th->renew_after = new_renew_after; + dout(" got ticket service %d (%s) secret_id %lld len %d\n", + type, ceph_entity_type_name(type), th->secret_id, + (int)th->ticket_blob->vec.iov_len); + xi->have_keys |= th->service; + +out: + kfree(ticket_buf); + kfree(dbuf); + return ret; + +bad: + ret = -EINVAL; + goto out; +} + +static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, + struct ceph_crypto_key *secret, + void *buf, void *end) +{ + void *p = buf; + u8 reply_struct_v; + u32 num; + int ret; + + ceph_decode_8_safe(&p, end, reply_struct_v, bad); + if (reply_struct_v != 1) + return -EINVAL; + + ceph_decode_32_safe(&p, end, num, bad); + dout("%d tickets\n", num); + + while (num--) { + ret = process_one_ticket(ac, secret, &p, end); + if (ret) + return ret; + } + + return 0; + +bad: + return -EINVAL; +} + +static int ceph_x_build_authorizer(struct ceph_auth_client *ac, + struct ceph_x_ticket_handler *th, + struct ceph_x_authorizer *au) +{ + int maxlen; + struct ceph_x_authorize_a *msg_a; + struct ceph_x_authorize_b msg_b; + void *p, *end; + int ret; + int ticket_blob_len = + (th->ticket_blob ? 
th->ticket_blob->vec.iov_len : 0); + + dout("build_authorizer for %s %p\n", + ceph_entity_type_name(th->service), au); + + ceph_crypto_key_destroy(&au->session_key); + ret = ceph_crypto_key_clone(&au->session_key, &th->session_key); + if (ret) + return ret; + + maxlen = sizeof(*msg_a) + sizeof(msg_b) + + ceph_x_encrypt_buflen(ticket_blob_len); + dout(" need len %d\n", maxlen); + if (au->buf && au->buf->alloc_len < maxlen) { + ceph_buffer_put(au->buf); + au->buf = NULL; + } + if (!au->buf) { + au->buf = ceph_buffer_new(maxlen, GFP_NOFS); + if (!au->buf) { + ceph_crypto_key_destroy(&au->session_key); + return -ENOMEM; + } + } + au->service = th->service; + au->secret_id = th->secret_id; + + msg_a = au->buf->vec.iov_base; + msg_a->struct_v = 1; + msg_a->global_id = cpu_to_le64(ac->global_id); + msg_a->service_id = cpu_to_le32(th->service); + msg_a->ticket_blob.struct_v = 1; + msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id); + msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len); + if (ticket_blob_len) { + memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base, + th->ticket_blob->vec.iov_len); + } + dout(" th %p secret_id %lld %lld\n", th, th->secret_id, + le64_to_cpu(msg_a->ticket_blob.secret_id)); + + p = msg_a + 1; + p += ticket_blob_len; + end = au->buf->vec.iov_base + au->buf->vec.iov_len; + + get_random_bytes(&au->nonce, sizeof(au->nonce)); + msg_b.struct_v = 1; + msg_b.nonce = cpu_to_le64(au->nonce); + ret = ceph_x_encrypt(&au->session_key, &msg_b, sizeof(msg_b), + p, end - p); + if (ret < 0) + goto out_buf; + p += ret; + au->buf->vec.iov_len = p - au->buf->vec.iov_base; + dout(" built authorizer nonce %llx len %d\n", au->nonce, + (int)au->buf->vec.iov_len); + BUG_ON(au->buf->vec.iov_len > maxlen); + return 0; + +out_buf: + ceph_buffer_put(au->buf); + au->buf = NULL; + return ret; +} + +static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th, + void **p, void *end) +{ + ceph_decode_need(p, end, 1 + sizeof(u64), bad); + 
ceph_encode_8(p, 1); + ceph_encode_64(p, th->secret_id); + if (th->ticket_blob) { + const char *buf = th->ticket_blob->vec.iov_base; + u32 len = th->ticket_blob->vec.iov_len; + + ceph_encode_32_safe(p, end, len, bad); + ceph_encode_copy_safe(p, end, buf, len, bad); + } else { + ceph_encode_32_safe(p, end, 0, bad); + } + + return 0; +bad: + return -ERANGE; +} + +static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) +{ + int want = ac->want_keys; + struct ceph_x_info *xi = ac->private; + int service; + + *pneed = ac->want_keys & ~(xi->have_keys); + + for (service = 1; service <= want; service <<= 1) { + struct ceph_x_ticket_handler *th; + + if (!(ac->want_keys & service)) + continue; + + if (*pneed & service) + continue; + + th = get_ticket_handler(ac, service); + + if (IS_ERR(th)) { + *pneed |= service; + continue; + } + + if (get_seconds() >= th->renew_after) + *pneed |= service; + if (get_seconds() >= th->expires) + xi->have_keys &= ~service; + } +} + + +static int ceph_x_build_request(struct ceph_auth_client *ac, + void *buf, void *end) +{ + struct ceph_x_info *xi = ac->private; + int need; + struct ceph_x_request_header *head = buf; + int ret; + struct ceph_x_ticket_handler *th = + get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); + + if (IS_ERR(th)) + return PTR_ERR(th); + + ceph_x_validate_tickets(ac, &need); + + dout("build_request want %x have %x need %x\n", + ac->want_keys, xi->have_keys, need); + + if (need & CEPH_ENTITY_TYPE_AUTH) { + struct ceph_x_authenticate *auth = (void *)(head + 1); + void *p = auth + 1; + struct ceph_x_challenge_blob tmp; + char tmp_enc[40]; + u64 *u; + + if (p > end) + return -ERANGE; + + dout(" get_auth_session_key\n"); + head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY); + + /* encrypt and hash */ + get_random_bytes(&auth->client_challenge, sizeof(u64)); + tmp.client_challenge = auth->client_challenge; + tmp.server_challenge = cpu_to_le64(xi->server_challenge); + ret = ceph_x_encrypt(&xi->secret, &tmp, 
sizeof(tmp), + tmp_enc, sizeof(tmp_enc)); + if (ret < 0) + return ret; + + auth->struct_v = 1; + auth->key = 0; + for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++) + auth->key ^= *(__le64 *)u; + dout(" server_challenge %llx client_challenge %llx key %llx\n", + xi->server_challenge, le64_to_cpu(auth->client_challenge), + le64_to_cpu(auth->key)); + + /* now encode the old ticket if exists */ + ret = ceph_x_encode_ticket(th, &p, end); + if (ret < 0) + return ret; + + return p - buf; + } + + if (need) { + void *p = head + 1; + struct ceph_x_service_ticket_request *req; + + if (p > end) + return -ERANGE; + head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY); + + ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer); + if (ret) + return ret; + ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base, + xi->auth_authorizer.buf->vec.iov_len); + + req = p; + req->keys = cpu_to_le32(need); + p += sizeof(*req); + return p - buf; + } + + return 0; +} + +static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, + void *buf, void *end) +{ + struct ceph_x_info *xi = ac->private; + struct ceph_x_reply_header *head = buf; + struct ceph_x_ticket_handler *th; + int len = end - buf; + int op; + int ret; + + if (result) + return result; /* XXX hmm? 
*/ + + if (xi->starting) { + /* it's a hello */ + struct ceph_x_server_challenge *sc = buf; + + if (len != sizeof(*sc)) + return -EINVAL; + xi->server_challenge = le64_to_cpu(sc->server_challenge); + dout("handle_reply got server challenge %llx\n", + xi->server_challenge); + xi->starting = false; + xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH; + return -EAGAIN; + } + + op = le16_to_cpu(head->op); + result = le32_to_cpu(head->result); + dout("handle_reply op %d result %d\n", op, result); + switch (op) { + case CEPHX_GET_AUTH_SESSION_KEY: + /* verify auth key */ + ret = ceph_x_proc_ticket_reply(ac, &xi->secret, + buf + sizeof(*head), end); + break; + + case CEPHX_GET_PRINCIPAL_SESSION_KEY: + th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); + if (IS_ERR(th)) + return PTR_ERR(th); + ret = ceph_x_proc_ticket_reply(ac, &th->session_key, + buf + sizeof(*head), end); + break; + + default: + return -EINVAL; + } + if (ret) + return ret; + if (ac->want_keys == xi->have_keys) + return 0; + return -EAGAIN; +} + +static int ceph_x_create_authorizer( + struct ceph_auth_client *ac, int peer_type, + struct ceph_auth_handshake *auth) +{ + struct ceph_x_authorizer *au; + struct ceph_x_ticket_handler *th; + int ret; + + th = get_ticket_handler(ac, peer_type); + if (IS_ERR(th)) + return PTR_ERR(th); + + au = kzalloc(sizeof(*au), GFP_NOFS); + if (!au) + return -ENOMEM; + + ret = ceph_x_build_authorizer(ac, th, au); + if (ret) { + kfree(au); + return ret; + } + + auth->authorizer = (struct ceph_authorizer *) au; + auth->authorizer_buf = au->buf->vec.iov_base; + auth->authorizer_buf_len = au->buf->vec.iov_len; + auth->authorizer_reply_buf = au->reply_buf; + auth->authorizer_reply_buf_len = sizeof (au->reply_buf); + auth->sign_message = ac->ops->sign_message; + auth->check_message_signature = ac->ops->check_message_signature; + + return 0; +} + +static int ceph_x_update_authorizer( + struct ceph_auth_client *ac, int peer_type, + struct ceph_auth_handshake *auth) +{ + struct ceph_x_authorizer 
*au; + struct ceph_x_ticket_handler *th; + + th = get_ticket_handler(ac, peer_type); + if (IS_ERR(th)) + return PTR_ERR(th); + + au = (struct ceph_x_authorizer *)auth->authorizer; + if (au->secret_id < th->secret_id) { + dout("ceph_x_update_authorizer service %u secret %llu < %llu\n", + au->service, au->secret_id, th->secret_id); + return ceph_x_build_authorizer(ac, th, au); + } + return 0; +} + +static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, + struct ceph_authorizer *a, size_t len) +{ + struct ceph_x_authorizer *au = (void *)a; + int ret = 0; + struct ceph_x_authorize_reply reply; + void *preply = &reply; + void *p = au->reply_buf; + void *end = p + sizeof(au->reply_buf); + + ret = ceph_x_decrypt(&au->session_key, &p, end, &preply, sizeof(reply)); + if (ret < 0) + return ret; + if (ret != sizeof(reply)) + return -EPERM; + + if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one)) + ret = -EPERM; + else + ret = 0; + dout("verify_authorizer_reply nonce %llx got %llx ret %d\n", + au->nonce, le64_to_cpu(reply.nonce_plus_one), ret); + return ret; +} + +static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac, + struct ceph_authorizer *a) +{ + struct ceph_x_authorizer *au = (void *)a; + + ceph_crypto_key_destroy(&au->session_key); + ceph_buffer_put(au->buf); + kfree(au); +} + + +static void ceph_x_reset(struct ceph_auth_client *ac) +{ + struct ceph_x_info *xi = ac->private; + + dout("reset\n"); + xi->starting = true; + xi->server_challenge = 0; +} + +static void ceph_x_destroy(struct ceph_auth_client *ac) +{ + struct ceph_x_info *xi = ac->private; + struct rb_node *p; + + dout("ceph_x_destroy %p\n", ac); + ceph_crypto_key_destroy(&xi->secret); + + while ((p = rb_first(&xi->ticket_handlers)) != NULL) { + struct ceph_x_ticket_handler *th = + rb_entry(p, struct ceph_x_ticket_handler, node); + remove_ticket_handler(ac, th); + } + + if (xi->auth_authorizer.buf) + ceph_buffer_put(xi->auth_authorizer.buf); + + kfree(ac->private); + ac->private = 
NULL; +} + +static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, + int peer_type) +{ + struct ceph_x_ticket_handler *th; + + th = get_ticket_handler(ac, peer_type); + if (!IS_ERR(th)) + memset(&th->validity, 0, sizeof(th->validity)); +} + +static int calcu_signature(struct ceph_x_authorizer *au, + struct ceph_msg *msg, __le64 *sig) +{ + int ret; + char tmp_enc[40]; + __le32 tmp[5] = { + cpu_to_le32(16), msg->hdr.crc, msg->footer.front_crc, + msg->footer.middle_crc, msg->footer.data_crc, + }; + ret = ceph_x_encrypt(&au->session_key, &tmp, sizeof(tmp), + tmp_enc, sizeof(tmp_enc)); + if (ret < 0) + return ret; + *sig = *(__le64*)(tmp_enc + 4); + return 0; +} + +static int ceph_x_sign_message(struct ceph_auth_handshake *auth, + struct ceph_msg *msg) +{ + int ret; + if (!auth->authorizer) + return 0; + ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer, + msg, &msg->footer.sig); + if (ret < 0) + return ret; + msg->footer.flags |= CEPH_MSG_FOOTER_SIGNED; + return 0; +} + +static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth, + struct ceph_msg *msg) +{ + __le64 sig_check; + int ret; + + if (!auth->authorizer) + return 0; + ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer, + msg, &sig_check); + if (ret < 0) + return ret; + if (sig_check == msg->footer.sig) + return 0; + if (msg->footer.flags & CEPH_MSG_FOOTER_SIGNED) + dout("ceph_x_check_message_signature %p has signature %llx " + "expect %llx\n", msg, msg->footer.sig, sig_check); + else + dout("ceph_x_check_message_signature %p sender did not set " + "CEPH_MSG_FOOTER_SIGNED\n", msg); + return -EBADMSG; +} + +static const struct ceph_auth_client_ops ceph_x_ops = { + .name = "x", + .is_authenticated = ceph_x_is_authenticated, + .should_authenticate = ceph_x_should_authenticate, + .build_request = ceph_x_build_request, + .handle_reply = ceph_x_handle_reply, + .create_authorizer = ceph_x_create_authorizer, + .update_authorizer = ceph_x_update_authorizer, 
+ .verify_authorizer_reply = ceph_x_verify_authorizer_reply, + .destroy_authorizer = ceph_x_destroy_authorizer, + .invalidate_authorizer = ceph_x_invalidate_authorizer, + .reset = ceph_x_reset, + .destroy = ceph_x_destroy, + .sign_message = ceph_x_sign_message, + .check_message_signature = ceph_x_check_message_signature, +}; + + +int ceph_x_init(struct ceph_auth_client *ac) +{ + struct ceph_x_info *xi; + int ret; + + dout("ceph_x_init %p\n", ac); + ret = -ENOMEM; + xi = kzalloc(sizeof(*xi), GFP_NOFS); + if (!xi) + goto out; + + ret = -EINVAL; + if (!ac->key) { + pr_err("no secret set (for auth_x protocol)\n"); + goto out_nomem; + } + + ret = ceph_crypto_key_clone(&xi->secret, ac->key); + if (ret < 0) { + pr_err("cannot clone key: %d\n", ret); + goto out_nomem; + } + + xi->starting = true; + xi->ticket_handlers = RB_ROOT; + + ac->protocol = CEPH_AUTH_CEPHX; + ac->private = xi; + ac->ops = &ceph_x_ops; + return 0; + +out_nomem: + kfree(xi); +out: + return ret; +} + + diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h new file mode 100644 index 000000000..e8b7c6917 --- /dev/null +++ b/net/ceph/auth_x.h @@ -0,0 +1,52 @@ +#ifndef _FS_CEPH_AUTH_X_H +#define _FS_CEPH_AUTH_X_H + +#include + +#include + +#include "crypto.h" +#include "auth_x_protocol.h" + +/* + * Handle ticket for a single service. 
+ */ +struct ceph_x_ticket_handler { + struct rb_node node; + unsigned int service; + + struct ceph_crypto_key session_key; + struct ceph_timespec validity; + + u64 secret_id; + struct ceph_buffer *ticket_blob; + + unsigned long renew_after, expires; +}; + + +struct ceph_x_authorizer { + struct ceph_crypto_key session_key; + struct ceph_buffer *buf; + unsigned int service; + u64 nonce; + u64 secret_id; + char reply_buf[128]; /* big enough for encrypted blob */ +}; + +struct ceph_x_info { + struct ceph_crypto_key secret; + + bool starting; + u64 server_challenge; + + unsigned int have_keys; + struct rb_root ticket_handlers; + + struct ceph_x_authorizer auth_authorizer; +}; + +int ceph_x_init(struct ceph_auth_client *ac); + +#endif + diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h new file mode 100644 index 000000000..671d30576 --- /dev/null +++ b/net/ceph/auth_x_protocol.h @@ -0,0 +1,90 @@ +#ifndef __FS_CEPH_AUTH_X_PROTOCOL +#define __FS_CEPH_AUTH_X_PROTOCOL + +#define CEPHX_GET_AUTH_SESSION_KEY 0x0100 +#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200 +#define CEPHX_GET_ROTATING_KEY 0x0400 + +/* common bits */ +struct ceph_x_ticket_blob { + __u8 struct_v; + __le64 secret_id; + __le32 blob_len; + char blob[]; +} __attribute__ ((packed)); + + +/* common request/reply headers */ +struct ceph_x_request_header { + __le16 op; +} __attribute__ ((packed)); + +struct ceph_x_reply_header { + __le16 op; + __le32 result; +} __attribute__ ((packed)); + + +/* authenticate handshake */ + +/* initial hello (no reply header) */ +struct ceph_x_server_challenge { + __u8 struct_v; + __le64 server_challenge; +} __attribute__ ((packed)); + +struct ceph_x_authenticate { + __u8 struct_v; + __le64 client_challenge; + __le64 key; + /* ticket blob */ +} __attribute__ ((packed)); + +struct ceph_x_service_ticket_request { + __u8 struct_v; + __le32 keys; +} __attribute__ ((packed)); + +struct ceph_x_challenge_blob { + __le64 server_challenge; + __le64 client_challenge; +} 
__attribute__ ((packed)); + + + +/* authorize handshake */ + +/* + * The authorizer consists of two pieces: + * a - service id, ticket blob + * b - encrypted with session key + */ +struct ceph_x_authorize_a { + __u8 struct_v; + __le64 global_id; + __le32 service_id; + struct ceph_x_ticket_blob ticket_blob; +} __attribute__ ((packed)); + +struct ceph_x_authorize_b { + __u8 struct_v; + __le64 nonce; +} __attribute__ ((packed)); + +struct ceph_x_authorize_reply { + __u8 struct_v; + __le64 nonce_plus_one; +} __attribute__ ((packed)); + + +/* + * encyption bundle + */ +#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull + +struct ceph_x_encrypt_header { + __u8 struct_v; + __le64 magic; +} __attribute__ ((packed)); + +#endif diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c new file mode 100644 index 000000000..add5f921a --- /dev/null +++ b/net/ceph/buffer.c @@ -0,0 +1,58 @@ + +#include + +#include +#include + +#include +#include +#include /* for ceph_kvmalloc */ + +struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) +{ + struct ceph_buffer *b; + + b = kmalloc(sizeof(*b), gfp); + if (!b) + return NULL; + + b->vec.iov_base = ceph_kvmalloc(len, gfp); + if (!b->vec.iov_base) { + kfree(b); + return NULL; + } + + kref_init(&b->kref); + b->alloc_len = len; + b->vec.iov_len = len; + dout("buffer_new %p\n", b); + return b; +} +EXPORT_SYMBOL(ceph_buffer_new); + +void ceph_buffer_release(struct kref *kref) +{ + struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref); + + dout("buffer_release %p\n", b); + kvfree(b->vec.iov_base); + kfree(b); +} +EXPORT_SYMBOL(ceph_buffer_release); + +int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end) +{ + size_t len; + + ceph_decode_need(p, end, sizeof(u32), bad); + len = ceph_decode_32(p); + dout("decode_buffer len %d\n", (int)len); + ceph_decode_need(p, end, len, bad); + *b = ceph_buffer_new(len, GFP_NOFS); + if (!*b) + return -ENOMEM; + ceph_decode_copy(p, (*b)->vec.iov_base, len); + return 0; +bad: + return 
/*
 * Return a pointer to the last path component of the first @len bytes
 * of @s, i.e. to the byte just after the last '/' in that range
 * (/foo/bar/baz -> baz).  If the range contains no '/', @s itself is
 * returned; if it ends in '/', the result points at s + len.
 */
const char *ceph_file_part(const char *s, int len)
{
	const char *tail;

	/* walk backwards from the end until a '/' precedes the cursor */
	for (tail = s + len; tail != s; tail--) {
		if (tail[-1] == '/')
			break;
	}
	return tail;
}
/*
 * strcmp() that tolerates NULL arguments.
 *
 * Two NULLs compare equal (0).  A non-NULL @s1 against a NULL @s2
 * yields -1, a NULL @s1 against a non-NULL @s2 yields 1.  When both are
 * non-NULL the result is that of strcmp(s1, s2).
 */
static int strcmp_null(const char *s1, const char *s2)
{
	if (s1 == NULL)
		return (s2 == NULL) ? 0 : 1;
	if (s2 == NULL)
		return -1;
	return strcmp(s1, s2);
}
return -1; + if (opt1->key->len != opt2->key->len) + return -1; + if (opt1->key->key && !opt2->key->key) + return -1; + if (!opt1->key->key && opt2->key->key) + return 1; + if (opt1->key->key && opt2->key->key) { + ret = memcmp(opt1->key->key, opt2->key->key, opt1->key->len); + if (ret) + return ret; + } + } + + /* any matching mon ip implies a match */ + for (i = 0; i < opt1->num_mon; i++) { + if (ceph_monmap_contains(client->monc.monmap, + &opt1->mon_addr[i])) + return 0; + } + return -1; +} +EXPORT_SYMBOL(ceph_compare_options); + +void *ceph_kvmalloc(size_t size, gfp_t flags) +{ + if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { + void *ptr = kmalloc(size, flags | __GFP_NOWARN); + if (ptr) + return ptr; + } + + return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); +} + + +static int parse_fsid(const char *str, struct ceph_fsid *fsid) +{ + int i = 0; + char tmp[3]; + int err = -EINVAL; + int d; + + dout("parse_fsid '%s'\n", str); + tmp[2] = 0; + while (*str && i < 16) { + if (ispunct(*str)) { + str++; + continue; + } + if (!isxdigit(str[0]) || !isxdigit(str[1])) + break; + tmp[0] = str[0]; + tmp[1] = str[1]; + if (sscanf(tmp, "%x", &d) < 1) + break; + fsid->fsid[i] = d & 0xff; + i++; + str += 2; + } + + if (i == 16) + err = 0; + dout("parse_fsid ret %d got fsid %pU", err, fsid); + return err; +} + +/* + * ceph options + */ +enum { + Opt_osdtimeout, + Opt_osdkeepalivetimeout, + Opt_mount_timeout, + Opt_osd_idle_ttl, + Opt_last_int, + /* int args above */ + Opt_fsid, + Opt_name, + Opt_secret, + Opt_key, + Opt_ip, + Opt_last_string, + /* string args above */ + Opt_share, + Opt_noshare, + Opt_crc, + Opt_nocrc, + Opt_cephx_require_signatures, + Opt_nocephx_require_signatures, + Opt_tcp_nodelay, + Opt_notcp_nodelay, +}; + +static match_table_t opt_tokens = { + {Opt_osdtimeout, "osdtimeout=%d"}, + {Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, + {Opt_mount_timeout, "mount_timeout=%d"}, + {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, + /* int args above */ + 
{Opt_fsid, "fsid=%s"}, + {Opt_name, "name=%s"}, + {Opt_secret, "secret=%s"}, + {Opt_key, "key=%s"}, + {Opt_ip, "ip=%s"}, + /* string args above */ + {Opt_share, "share"}, + {Opt_noshare, "noshare"}, + {Opt_crc, "crc"}, + {Opt_nocrc, "nocrc"}, + {Opt_cephx_require_signatures, "cephx_require_signatures"}, + {Opt_nocephx_require_signatures, "nocephx_require_signatures"}, + {Opt_tcp_nodelay, "tcp_nodelay"}, + {Opt_notcp_nodelay, "notcp_nodelay"}, + {-1, NULL} +}; + +void ceph_destroy_options(struct ceph_options *opt) +{ + dout("destroy_options %p\n", opt); + kfree(opt->name); + if (opt->key) { + ceph_crypto_key_destroy(opt->key); + kfree(opt->key); + } + kfree(opt->mon_addr); + kfree(opt); +} +EXPORT_SYMBOL(ceph_destroy_options); + +/* get secret from key store */ +static int get_secret(struct ceph_crypto_key *dst, const char *name) { + struct key *ukey; + int key_err; + int err = 0; + struct ceph_crypto_key *ckey; + + ukey = request_key(&key_type_ceph, name, NULL); + if (!ukey || IS_ERR(ukey)) { + /* request_key errors don't map nicely to mount(2) + errors; don't even try, but still printk */ + key_err = PTR_ERR(ukey); + switch (key_err) { + case -ENOKEY: + pr_warn("ceph: Mount failed due to key not found: %s\n", + name); + break; + case -EKEYEXPIRED: + pr_warn("ceph: Mount failed due to expired key: %s\n", + name); + break; + case -EKEYREVOKED: + pr_warn("ceph: Mount failed due to revoked key: %s\n", + name); + break; + default: + pr_warn("ceph: Mount failed due to unknown key error %d: %s\n", + key_err, name); + } + err = -EPERM; + goto out; + } + + ckey = ukey->payload.data; + err = ceph_crypto_key_clone(dst, ckey); + if (err) + goto out_key; + /* pass through, err is 0 */ + +out_key: + key_put(ukey); +out: + return err; +} + +struct ceph_options * +ceph_parse_options(char *options, const char *dev_name, + const char *dev_name_end, + int (*parse_extra_token)(char *c, void *private), + void *private) +{ + struct ceph_options *opt; + const char *c; + int err = 
-ENOMEM; + substring_t argstr[MAX_OPT_ARGS]; + + if (current->nsproxy->net_ns != &init_net) + return ERR_PTR(-EINVAL); + + opt = kzalloc(sizeof(*opt), GFP_KERNEL); + if (!opt) + return ERR_PTR(-ENOMEM); + opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr), + GFP_KERNEL); + if (!opt->mon_addr) + goto out; + + dout("parse_options %p options '%s' dev_name '%s'\n", opt, options, + dev_name); + + /* start with defaults */ + opt->flags = CEPH_OPT_DEFAULT; + opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; + opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ + opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */ + + /* get mon ip(s) */ + /* ip1[:port1][,ip2[:port2]...] */ + err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr, + CEPH_MAX_MON, &opt->num_mon); + if (err < 0) + goto out; + + /* parse mount options */ + while ((c = strsep(&options, ",")) != NULL) { + int token, intval, ret; + if (!*c) + continue; + err = -EINVAL; + token = match_token((char *)c, opt_tokens, argstr); + if (token < 0 && parse_extra_token) { + /* extra? 
*/ + err = parse_extra_token((char *)c, private); + if (err < 0) { + pr_err("bad option at '%s'\n", c); + goto out; + } + continue; + } + if (token < Opt_last_int) { + ret = match_int(&argstr[0], &intval); + if (ret < 0) { + pr_err("bad mount option arg (not int) " + "at '%s'\n", c); + continue; + } + dout("got int token %d val %d\n", token, intval); + } else if (token > Opt_last_int && token < Opt_last_string) { + dout("got string token %d val %s\n", token, + argstr[0].from); + } else { + dout("got token %d\n", token); + } + switch (token) { + case Opt_ip: + err = ceph_parse_ips(argstr[0].from, + argstr[0].to, + &opt->my_addr, + 1, NULL); + if (err < 0) + goto out; + opt->flags |= CEPH_OPT_MYIP; + break; + + case Opt_fsid: + err = parse_fsid(argstr[0].from, &opt->fsid); + if (err == 0) + opt->flags |= CEPH_OPT_FSID; + break; + case Opt_name: + opt->name = kstrndup(argstr[0].from, + argstr[0].to-argstr[0].from, + GFP_KERNEL); + break; + case Opt_secret: + opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); + if (!opt->key) { + err = -ENOMEM; + goto out; + } + err = ceph_crypto_key_unarmor(opt->key, argstr[0].from); + if (err < 0) + goto out; + break; + case Opt_key: + opt->key = kzalloc(sizeof(*opt->key), GFP_KERNEL); + if (!opt->key) { + err = -ENOMEM; + goto out; + } + err = get_secret(opt->key, argstr[0].from); + if (err < 0) + goto out; + break; + + /* misc */ + case Opt_osdtimeout: + pr_warn("ignoring deprecated osdtimeout option\n"); + break; + case Opt_osdkeepalivetimeout: + opt->osd_keepalive_timeout = intval; + break; + case Opt_osd_idle_ttl: + opt->osd_idle_ttl = intval; + break; + case Opt_mount_timeout: + opt->mount_timeout = intval; + break; + + case Opt_share: + opt->flags &= ~CEPH_OPT_NOSHARE; + break; + case Opt_noshare: + opt->flags |= CEPH_OPT_NOSHARE; + break; + + case Opt_crc: + opt->flags &= ~CEPH_OPT_NOCRC; + break; + case Opt_nocrc: + opt->flags |= CEPH_OPT_NOCRC; + break; + + case Opt_cephx_require_signatures: + opt->flags &= 
~CEPH_OPT_NOMSGAUTH; + break; + case Opt_nocephx_require_signatures: + opt->flags |= CEPH_OPT_NOMSGAUTH; + break; + + case Opt_tcp_nodelay: + opt->flags |= CEPH_OPT_TCP_NODELAY; + break; + case Opt_notcp_nodelay: + opt->flags &= ~CEPH_OPT_TCP_NODELAY; + break; + + default: + BUG_ON(token); + } + } + + /* success */ + return opt; + +out: + ceph_destroy_options(opt); + return ERR_PTR(err); +} +EXPORT_SYMBOL(ceph_parse_options); + +int ceph_print_client_options(struct seq_file *m, struct ceph_client *client) +{ + struct ceph_options *opt = client->options; + size_t pos = m->count; + + if (opt->name) + seq_printf(m, "name=%s,", opt->name); + if (opt->key) + seq_puts(m, "secret=,"); + + if (opt->flags & CEPH_OPT_FSID) + seq_printf(m, "fsid=%pU,", &opt->fsid); + if (opt->flags & CEPH_OPT_NOSHARE) + seq_puts(m, "noshare,"); + if (opt->flags & CEPH_OPT_NOCRC) + seq_puts(m, "nocrc,"); + if (opt->flags & CEPH_OPT_NOMSGAUTH) + seq_puts(m, "nocephx_require_signatures,"); + if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) + seq_puts(m, "notcp_nodelay,"); + + if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) + seq_printf(m, "mount_timeout=%d,", opt->mount_timeout); + if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) + seq_printf(m, "osd_idle_ttl=%d,", opt->osd_idle_ttl); + if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) + seq_printf(m, "osdkeepalivetimeout=%d,", + opt->osd_keepalive_timeout); + + /* drop redundant comma */ + if (m->count != pos) + m->count--; + + return 0; +} +EXPORT_SYMBOL(ceph_print_client_options); + +u64 ceph_client_id(struct ceph_client *client) +{ + return client->monc.auth->global_id; +} +EXPORT_SYMBOL(ceph_client_id); + +/* + * create a fresh client instance + */ +struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, + u64 supported_features, + u64 required_features) +{ + struct ceph_client *client; + struct ceph_entity_addr *myaddr = NULL; + int err = -ENOMEM; + + client = kzalloc(sizeof(*client), GFP_KERNEL); 
+ if (client == NULL) + return ERR_PTR(-ENOMEM); + + client->private = private; + client->options = opt; + + mutex_init(&client->mount_mutex); + init_waitqueue_head(&client->auth_wq); + client->auth_err = 0; + + if (!ceph_test_opt(client, NOMSGAUTH)) + required_features |= CEPH_FEATURE_MSG_AUTH; + + client->extra_mon_dispatch = NULL; + client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT | + supported_features; + client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT | + required_features; + + /* msgr */ + if (ceph_test_opt(client, MYIP)) + myaddr = &client->options->my_addr; + + ceph_messenger_init(&client->msgr, myaddr, + client->supported_features, + client->required_features, + ceph_test_opt(client, NOCRC), + ceph_test_opt(client, TCP_NODELAY)); + + /* subsystems */ + err = ceph_monc_init(&client->monc, client); + if (err < 0) + goto fail; + err = ceph_osdc_init(&client->osdc, client); + if (err < 0) + goto fail_monc; + + return client; + +fail_monc: + ceph_monc_stop(&client->monc); +fail: + kfree(client); + return ERR_PTR(err); +} +EXPORT_SYMBOL(ceph_create_client); + +void ceph_destroy_client(struct ceph_client *client) +{ + dout("destroy_client %p\n", client); + + atomic_set(&client->msgr.stopping, 1); + + /* unmount */ + ceph_osdc_stop(&client->osdc); + + ceph_monc_stop(&client->monc); + + ceph_debugfs_client_cleanup(client); + + ceph_destroy_options(client->options); + + kfree(client); + dout("destroy_client %p done\n", client); +} +EXPORT_SYMBOL(ceph_destroy_client); + +/* + * true if we have the mon map (and have thus joined the cluster) + */ +static int have_mon_and_osd_map(struct ceph_client *client) +{ + return client->monc.monmap && client->monc.monmap->epoch && + client->osdc.osdmap && client->osdc.osdmap->epoch; +} + +/* + * mount: join the ceph cluster, and open root directory. 
+ */ +int __ceph_open_session(struct ceph_client *client, unsigned long started) +{ + int err; + unsigned long timeout = client->options->mount_timeout * HZ; + + /* open session, and wait for mon and osd maps */ + err = ceph_monc_open_session(&client->monc); + if (err < 0) + return err; + + while (!have_mon_and_osd_map(client)) { + err = -EIO; + if (timeout && time_after_eq(jiffies, started + timeout)) + return err; + + /* wait */ + dout("mount waiting for mon_map\n"); + err = wait_event_interruptible_timeout(client->auth_wq, + have_mon_and_osd_map(client) || (client->auth_err < 0), + timeout); + if (err == -EINTR || err == -ERESTARTSYS) + return err; + if (client->auth_err < 0) + return client->auth_err; + } + + return 0; +} +EXPORT_SYMBOL(__ceph_open_session); + + +int ceph_open_session(struct ceph_client *client) +{ + int ret; + unsigned long started = jiffies; /* note the start time */ + + dout("open_session start\n"); + mutex_lock(&client->mount_mutex); + + ret = __ceph_open_session(client, started); + + mutex_unlock(&client->mount_mutex); + return ret; +} +EXPORT_SYMBOL(ceph_open_session); + + +static int __init init_ceph_lib(void) +{ + int ret = 0; + + ret = ceph_debugfs_init(); + if (ret < 0) + goto out; + + ret = ceph_crypto_init(); + if (ret < 0) + goto out_debugfs; + + ret = ceph_msgr_init(); + if (ret < 0) + goto out_crypto; + + ret = ceph_osdc_setup(); + if (ret < 0) + goto out_msgr; + + pr_info("loaded (mon/osd proto %d/%d)\n", + CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL); + + return 0; + +out_msgr: + ceph_msgr_exit(); +out_crypto: + ceph_crypto_shutdown(); +out_debugfs: + ceph_debugfs_cleanup(); +out: + return ret; +} + +static void __exit exit_ceph_lib(void) +{ + dout("exit_ceph_lib\n"); + ceph_osdc_cleanup(); + ceph_msgr_exit(); + ceph_crypto_shutdown(); + ceph_debugfs_cleanup(); +} + +module_init(init_ceph_lib); +module_exit(exit_ceph_lib); + +MODULE_AUTHOR("Sage Weil "); +MODULE_AUTHOR("Yehuda Sadeh "); +MODULE_AUTHOR("Patience Warnick "); 
+MODULE_DESCRIPTION("Ceph filesystem for Linux"); +MODULE_LICENSE("GPL"); diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c new file mode 100644 index 000000000..41466ccb9 --- /dev/null +++ b/net/ceph/ceph_fs.c @@ -0,0 +1,78 @@ +/* + * Some non-inline ceph helpers + */ +#include +#include + +/* + * return true if @layout appears to be valid + */ +int ceph_file_layout_is_valid(const struct ceph_file_layout *layout) +{ + __u32 su = le32_to_cpu(layout->fl_stripe_unit); + __u32 sc = le32_to_cpu(layout->fl_stripe_count); + __u32 os = le32_to_cpu(layout->fl_object_size); + + /* stripe unit, object size must be non-zero, 64k increment */ + if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1))) + return 0; + if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1))) + return 0; + /* object size must be a multiple of stripe unit */ + if (os < su || os % su) + return 0; + /* stripe count must be non-zero */ + if (!sc) + return 0; + return 1; +} + + +int ceph_flags_to_mode(int flags) +{ + int mode; + +#ifdef O_DIRECTORY /* fixme */ + if ((flags & O_DIRECTORY) == O_DIRECTORY) + return CEPH_FILE_MODE_PIN; +#endif + + switch (flags & O_ACCMODE) { + case O_WRONLY: + mode = CEPH_FILE_MODE_WR; + break; + case O_RDONLY: + mode = CEPH_FILE_MODE_RD; + break; + case O_RDWR: + case O_ACCMODE: /* this is what the VFS does */ + mode = CEPH_FILE_MODE_RDWR; + break; + } +#ifdef O_LAZY + if (flags & O_LAZY) + mode |= CEPH_FILE_MODE_LAZY; +#endif + + return mode; +} +EXPORT_SYMBOL(ceph_flags_to_mode); + +int ceph_caps_for_mode(int mode) +{ + int caps = CEPH_CAP_PIN; + + if (mode & CEPH_FILE_MODE_RD) + caps |= CEPH_CAP_FILE_SHARED | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE; + if (mode & CEPH_FILE_MODE_WR) + caps |= CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | + CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL | + CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; + if (mode & CEPH_FILE_MODE_LAZY) + caps |= CEPH_CAP_FILE_LAZYIO; + + return caps; +} +EXPORT_SYMBOL(ceph_caps_for_mode); diff --git 
a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c new file mode 100644 index 000000000..67bb1f11e --- /dev/null +++ b/net/ceph/ceph_hash.c @@ -0,0 +1,121 @@ + +#include +#include + +/* + * Robert Jenkin's hash function. + * http://burtleburtle.net/bob/hash/evahash.html + * This is in the public domain. + */ +#define mix(a, b, c) \ + do { \ + a = a - b; a = a - c; a = a ^ (c >> 13); \ + b = b - c; b = b - a; b = b ^ (a << 8); \ + c = c - a; c = c - b; c = c ^ (b >> 13); \ + a = a - b; a = a - c; a = a ^ (c >> 12); \ + b = b - c; b = b - a; b = b ^ (a << 16); \ + c = c - a; c = c - b; c = c ^ (b >> 5); \ + a = a - b; a = a - c; a = a ^ (c >> 3); \ + b = b - c; b = b - a; b = b ^ (a << 10); \ + c = c - a; c = c - b; c = c ^ (b >> 15); \ + } while (0) + +unsigned int ceph_str_hash_rjenkins(const char *str, unsigned int length) +{ + const unsigned char *k = (const unsigned char *)str; + __u32 a, b, c; /* the internal state */ + __u32 len; /* how many key bytes still need mixing */ + + /* Set up the internal state */ + len = length; + a = 0x9e3779b9; /* the golden ratio; an arbitrary value */ + b = a; + c = 0; /* variable initialization of internal state */ + + /* handle most of the key */ + while (len >= 12) { + a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) + + ((__u32)k[3] << 24)); + b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) + + ((__u32)k[7] << 24)); + c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) + + ((__u32)k[11] << 24)); + mix(a, b, c); + k = k + 12; + len = len - 12; + } + + /* handle the last 11 bytes */ + c = c + length; + switch (len) { /* all the case statements fall through */ + case 11: + c = c + ((__u32)k[10] << 24); + case 10: + c = c + ((__u32)k[9] << 16); + case 9: + c = c + ((__u32)k[8] << 8); + /* the first byte of c is reserved for the length */ + case 8: + b = b + ((__u32)k[7] << 24); + case 7: + b = b + ((__u32)k[6] << 16); + case 6: + b = b + ((__u32)k[5] << 8); + case 5: + b = b + k[4]; + case 4: + a = a + 
/*
 * linux dcache hash
 *
 * Folds the first @length bytes of @str, each taken as an unsigned
 * char, into the accumulator as
 *	acc = (acc + (byte << 4) + (byte >> 4)) * 11
 * and returns the accumulator converted to unsigned int.  An empty
 * input hashes to 0.
 */
unsigned int ceph_str_hash_linux(const char *str, unsigned int length)
{
	unsigned long acc = 0;
	unsigned int i;

	for (i = 0; i < length; i++) {
		unsigned char byte = str[i];

		acc = (acc + (byte << 4) + (byte >> 4)) * 11;
	}
	return acc;
}
default: + return "???"; + } +} diff --git a/net/ceph/crush/crush.c b/net/ceph/crush/crush.c new file mode 100644 index 000000000..9d84ce4ea --- /dev/null +++ b/net/ceph/crush/crush.c @@ -0,0 +1,143 @@ + +#ifdef __KERNEL__ +# include +#else +# include +# include +# define kfree(x) do { if (x) free(x); } while (0) +# define BUG_ON(x) assert(!(x)) +#endif + +#include + +const char *crush_bucket_alg_name(int alg) +{ + switch (alg) { + case CRUSH_BUCKET_UNIFORM: return "uniform"; + case CRUSH_BUCKET_LIST: return "list"; + case CRUSH_BUCKET_TREE: return "tree"; + case CRUSH_BUCKET_STRAW: return "straw"; + case CRUSH_BUCKET_STRAW2: return "straw2"; + default: return "unknown"; + } +} + +/** + * crush_get_bucket_item_weight - Get weight of an item in given bucket + * @b: bucket pointer + * @p: item index in bucket + */ +int crush_get_bucket_item_weight(const struct crush_bucket *b, int p) +{ + if ((__u32)p >= b->size) + return 0; + + switch (b->alg) { + case CRUSH_BUCKET_UNIFORM: + return ((struct crush_bucket_uniform *)b)->item_weight; + case CRUSH_BUCKET_LIST: + return ((struct crush_bucket_list *)b)->item_weights[p]; + case CRUSH_BUCKET_TREE: + return ((struct crush_bucket_tree *)b)->node_weights[crush_calc_tree_node(p)]; + case CRUSH_BUCKET_STRAW: + return ((struct crush_bucket_straw *)b)->item_weights[p]; + case CRUSH_BUCKET_STRAW2: + return ((struct crush_bucket_straw2 *)b)->item_weights[p]; + } + return 0; +} + +void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b) +{ + kfree(b->h.perm); + kfree(b->h.items); + kfree(b); +} + +void crush_destroy_bucket_list(struct crush_bucket_list *b) +{ + kfree(b->item_weights); + kfree(b->sum_weights); + kfree(b->h.perm); + kfree(b->h.items); + kfree(b); +} + +void crush_destroy_bucket_tree(struct crush_bucket_tree *b) +{ + kfree(b->h.perm); + kfree(b->h.items); + kfree(b->node_weights); + kfree(b); +} + +void crush_destroy_bucket_straw(struct crush_bucket_straw *b) +{ + kfree(b->straws); + kfree(b->item_weights); + 
kfree(b->h.perm); + kfree(b->h.items); + kfree(b); +} + +void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b) +{ + kfree(b->item_weights); + kfree(b->h.perm); + kfree(b->h.items); + kfree(b); +} + +void crush_destroy_bucket(struct crush_bucket *b) +{ + switch (b->alg) { + case CRUSH_BUCKET_UNIFORM: + crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b); + break; + case CRUSH_BUCKET_LIST: + crush_destroy_bucket_list((struct crush_bucket_list *)b); + break; + case CRUSH_BUCKET_TREE: + crush_destroy_bucket_tree((struct crush_bucket_tree *)b); + break; + case CRUSH_BUCKET_STRAW: + crush_destroy_bucket_straw((struct crush_bucket_straw *)b); + break; + case CRUSH_BUCKET_STRAW2: + crush_destroy_bucket_straw2((struct crush_bucket_straw2 *)b); + break; + } +} + +/** + * crush_destroy - Destroy a crush_map + * @map: crush_map pointer + */ +void crush_destroy(struct crush_map *map) +{ + /* buckets */ + if (map->buckets) { + __s32 b; + for (b = 0; b < map->max_buckets; b++) { + if (map->buckets[b] == NULL) + continue; + crush_destroy_bucket(map->buckets[b]); + } + kfree(map->buckets); + } + + /* rules */ + if (map->rules) { + __u32 b; + for (b = 0; b < map->max_rules; b++) + crush_destroy_rule(map->rules[b]); + kfree(map->rules); + } + + kfree(map); +} + +void crush_destroy_rule(struct crush_rule *rule) +{ + kfree(rule); +} diff --git a/net/ceph/crush/crush_ln_table.h b/net/ceph/crush/crush_ln_table.h new file mode 100644 index 000000000..6192c7fc9 --- /dev/null +++ b/net/ceph/crush/crush_ln_table.h @@ -0,0 +1,166 @@ +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Intel Corporation All Rights Reserved + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. 
+ * + */ + +#if defined(__linux__) +#include +#elif defined(__FreeBSD__) +#include +#endif + +#ifndef CEPH_CRUSH_LN_H +#define CEPH_CRUSH_LN_H + + +// RH_LH_tbl[2*k] = 2^48/(1.0+k/128.0) +// RH_LH_tbl[2*k+1] = 2^48*log2(1.0+k/128.0) + +static int64_t __RH_LH_tbl[128*2+2] = { + 0x0001000000000000ll, 0x0000000000000000ll, 0x0000fe03f80fe040ll, 0x000002dfca16dde1ll, + 0x0000fc0fc0fc0fc1ll, 0x000005b9e5a170b4ll, 0x0000fa232cf25214ll, 0x0000088e68ea899all, + 0x0000f83e0f83e0f9ll, 0x00000b5d69bac77ell, 0x0000f6603d980f67ll, 0x00000e26fd5c8555ll, + 0x0000f4898d5f85bcll, 0x000010eb389fa29fll, 0x0000f2b9d6480f2cll, 0x000013aa2fdd27f1ll, + 0x0000f0f0f0f0f0f1ll, 0x00001663f6fac913ll, 0x0000ef2eb71fc435ll, 0x00001918a16e4633ll, + 0x0000ed7303b5cc0fll, 0x00001bc84240adabll, 0x0000ebbdb2a5c162ll, 0x00001e72ec117fa5ll, + 0x0000ea0ea0ea0ea1ll, 0x00002118b119b4f3ll, 0x0000e865ac7b7604ll, 0x000023b9a32eaa56ll, + 0x0000e6c2b4481cd9ll, 0x00002655d3c4f15cll, 0x0000e525982af70dll, 0x000028ed53f307eell, + 0x0000e38e38e38e39ll, 0x00002b803473f7adll, 0x0000e1fc780e1fc8ll, 0x00002e0e85a9de04ll, + 0x0000e070381c0e08ll, 0x0000309857a05e07ll, 0x0000dee95c4ca038ll, 0x0000331dba0efce1ll, + 0x0000dd67c8a60dd7ll, 0x0000359ebc5b69d9ll, 0x0000dbeb61eed19dll, 0x0000381b6d9bb29bll, + 0x0000da740da740dbll, 0x00003a93dc9864b2ll, 0x0000d901b2036407ll, 0x00003d0817ce9cd4ll, + 0x0000d79435e50d7all, 0x00003f782d7204d0ll, 0x0000d62b80d62b81ll, 0x000041e42b6ec0c0ll, + 0x0000d4c77b03531ell, 0x0000444c1f6b4c2dll, 0x0000d3680d3680d4ll, 0x000046b016ca47c1ll, + 0x0000d20d20d20d21ll, 0x000049101eac381cll, 0x0000d0b69fcbd259ll, 0x00004b6c43f1366all, + 0x0000cf6474a8819fll, 0x00004dc4933a9337ll, 0x0000ce168a772509ll, 0x0000501918ec6c11ll, + 0x0000cccccccccccdll, 0x00005269e12f346ell, 0x0000cb8727c065c4ll, 0x000054b6f7f1325all, + 0x0000ca4587e6b750ll, 0x0000570068e7ef5all, 0x0000c907da4e8712ll, 0x000059463f919deell, + 0x0000c7ce0c7ce0c8ll, 0x00005b8887367433ll, 0x0000c6980c6980c7ll, 0x00005dc74ae9fbecll, + 
0x0000c565c87b5f9ell, 0x00006002958c5871ll, 0x0000c4372f855d83ll, 0x0000623a71cb82c8ll, + 0x0000c30c30c30c31ll, 0x0000646eea247c5cll, 0x0000c1e4bbd595f7ll, 0x000066a008e4788cll, + 0x0000c0c0c0c0c0c1ll, 0x000068cdd829fd81ll, 0x0000bfa02fe80bfbll, 0x00006af861e5fc7dll, + 0x0000be82fa0be830ll, 0x00006d1fafdce20all, 0x0000bd6910470767ll, 0x00006f43cba79e40ll, + 0x0000bc52640bc527ll, 0x00007164beb4a56dll, 0x0000bb3ee721a54ell, 0x000073829248e961ll, + 0x0000ba2e8ba2e8bbll, 0x0000759d4f80cba8ll, 0x0000b92143fa36f6ll, 0x000077b4ff5108d9ll, + 0x0000b81702e05c0cll, 0x000079c9aa879d53ll, 0x0000b70fbb5a19bfll, 0x00007bdb59cca388ll, + 0x0000b60b60b60b61ll, 0x00007dea15a32c1bll, 0x0000b509e68a9b95ll, 0x00007ff5e66a0ffell, + 0x0000b40b40b40b41ll, 0x000081fed45cbccbll, 0x0000b30f63528918ll, 0x00008404e793fb81ll, + 0x0000b21642c8590cll, 0x000086082806b1d5ll, 0x0000b11fd3b80b12ll, 0x000088089d8a9e47ll, + 0x0000b02c0b02c0b1ll, 0x00008a064fd50f2all, 0x0000af3addc680b0ll, 0x00008c01467b94bbll, + 0x0000ae4c415c9883ll, 0x00008df988f4ae80ll, 0x0000ad602b580ad7ll, 0x00008fef1e987409ll, + 0x0000ac7691840ac8ll, 0x000091e20ea1393ell, 0x0000ab8f69e2835all, 0x000093d2602c2e5fll, + 0x0000aaaaaaaaaaabll, 0x000095c01a39fbd6ll, 0x0000a9c84a47a080ll, 0x000097ab43af59f9ll, + 0x0000a8e83f5717c1ll, 0x00009993e355a4e5ll, 0x0000a80a80a80a81ll, 0x00009b79ffdb6c8bll, + 0x0000a72f0539782all, 0x00009d5d9fd5010bll, 0x0000a655c4392d7cll, 0x00009f3ec9bcfb80ll, + 0x0000a57eb50295fbll, 0x0000a11d83f4c355ll, 0x0000a4a9cf1d9684ll, 0x0000a2f9d4c51039ll, + 0x0000a3d70a3d70a4ll, 0x0000a4d3c25e68dcll, 0x0000a3065e3fae7dll, 0x0000a6ab52d99e76ll, + 0x0000a237c32b16d0ll, 0x0000a8808c384547ll, 0x0000a16b312ea8fdll, 0x0000aa5374652a1cll, + 0x0000a0a0a0a0a0a1ll, 0x0000ac241134c4e9ll, 0x00009fd809fd80a0ll, 0x0000adf26865a8a1ll, + 0x00009f1165e72549ll, 0x0000afbe7fa0f04dll, 0x00009e4cad23dd60ll, 0x0000b1885c7aa982ll, + 0x00009d89d89d89d9ll, 0x0000b35004723c46ll, 0x00009cc8e160c3fcll, 0x0000b5157cf2d078ll, + 
0x00009c09c09c09c1ll, 0x0000b6d8cb53b0call, 0x00009b4c6f9ef03bll, 0x0000b899f4d8ab63ll, + 0x00009a90e7d95bc7ll, 0x0000ba58feb2703all, 0x000099d722dabde6ll, 0x0000bc15edfeed32ll, + 0x0000991f1a515886ll, 0x0000bdd0c7c9a817ll, 0x00009868c809868dll, 0x0000bf89910c1678ll, + 0x000097b425ed097cll, 0x0000c1404eadf383ll, 0x000097012e025c05ll, 0x0000c2f5058593d9ll, + 0x0000964fda6c0965ll, 0x0000c4a7ba58377cll, 0x000095a02568095bll, 0x0000c65871da59ddll, + 0x000094f2094f2095ll, 0x0000c80730b00016ll, 0x0000944580944581ll, 0x0000c9b3fb6d0559ll, + 0x0000939a85c4093all, 0x0000cb5ed69565afll, 0x000092f113840498ll, 0x0000cd07c69d8702ll, + 0x0000924924924925ll, 0x0000ceaecfea8085ll, 0x000091a2b3c4d5e7ll, 0x0000d053f6d26089ll, + 0x000090fdbc090fdcll, 0x0000d1f73f9c70c0ll, 0x0000905a38633e07ll, 0x0000d398ae817906ll, + 0x00008fb823ee08fcll, 0x0000d53847ac00a6ll, 0x00008f1779d9fdc4ll, 0x0000d6d60f388e41ll, + 0x00008e78356d1409ll, 0x0000d8720935e643ll, 0x00008dda5202376all, 0x0000da0c39a54804ll, + 0x00008d3dcb08d3ddll, 0x0000dba4a47aa996ll, 0x00008ca29c046515ll, 0x0000dd3b4d9cf24bll, + 0x00008c08c08c08c1ll, 0x0000ded038e633f3ll, 0x00008b70344a139cll, 0x0000e0636a23e2eell, + 0x00008ad8f2fba939ll, 0x0000e1f4e5170d02ll, 0x00008a42f870566all, 0x0000e384ad748f0ell, + 0x000089ae4089ae41ll, 0x0000e512c6e54998ll, 0x0000891ac73ae982ll, 0x0000e69f35065448ll, + 0x0000888888888889ll, 0x0000e829fb693044ll, 0x000087f78087f781ll, 0x0000e9b31d93f98ell, + 0x00008767ab5f34e5ll, 0x0000eb3a9f019750ll, 0x000086d905447a35ll, 0x0000ecc08321eb30ll, + 0x0000864b8a7de6d2ll, 0x0000ee44cd59ffabll, 0x000085bf37612cefll, 0x0000efc781043579ll, + 0x0000853408534086ll, 0x0000f148a170700all, 0x000084a9f9c8084bll, 0x0000f2c831e44116ll, + 0x0000842108421085ll, 0x0000f446359b1353ll, 0x0000839930523fbfll, 0x0000f5c2afc65447ll, + 0x000083126e978d50ll, 0x0000f73da38d9d4all, 0x0000828cbfbeb9a1ll, 0x0000f8b7140edbb1ll, + 0x0000820820820821ll, 0x0000fa2f045e7832ll, 0x000081848da8faf1ll, 0x0000fba577877d7dll, + 
0x0000810204081021ll, 0x0000fd1a708bbe11ll, 0x0000808080808081ll, 0x0000fe8df263f957ll, + 0x0000800000000000ll, 0x0000ffff00000000ll, + }; + + + // LL_tbl[k] = 2^48*log2(1.0+k/2^15); +static int64_t __LL_tbl[256] = { + 0x0000000000000000ull, 0x00000002e2a60a00ull, 0x000000070cb64ec5ull, 0x00000009ef50ce67ull, + 0x0000000cd1e588fdull, 0x0000000fb4747e9cull, 0x0000001296fdaf5eull, 0x0000001579811b58ull, + 0x000000185bfec2a1ull, 0x0000001b3e76a552ull, 0x0000001e20e8c380ull, 0x0000002103551d43ull, + 0x00000023e5bbb2b2ull, 0x00000026c81c83e4ull, 0x00000029aa7790f0ull, 0x0000002c8cccd9edull, + 0x0000002f6f1c5ef2ull, 0x0000003251662017ull, 0x0000003533aa1d71ull, 0x0000003815e8571aull, + 0x0000003af820cd26ull, 0x0000003dda537faeull, 0x00000040bc806ec8ull, 0x000000439ea79a8cull, + 0x0000004680c90310ull, 0x0000004962e4a86cull, 0x0000004c44fa8ab6ull, 0x0000004f270aaa06ull, + 0x0000005209150672ull, 0x00000054eb19a013ull, 0x00000057cd1876fdull, 0x0000005aaf118b4aull, + 0x0000005d9104dd0full, 0x0000006072f26c64ull, 0x0000006354da3960ull, 0x0000006636bc441aull, + 0x0000006918988ca8ull, 0x0000006bfa6f1322ull, 0x0000006edc3fd79full, 0x00000071be0ada35ull, + 0x000000749fd01afdull, 0x00000077818f9a0cull, 0x0000007a6349577aull, 0x0000007d44fd535eull, + 0x0000008026ab8dceull, 0x00000083085406e3ull, 0x00000085e9f6beb2ull, 0x00000088cb93b552ull, + 0x0000008bad2aeadcull, 0x0000008e8ebc5f65ull, 0x0000009170481305ull, 0x0000009451ce05d3ull, + 0x00000097334e37e5ull, 0x0000009a14c8a953ull, 0x0000009cf63d5a33ull, 0x0000009fd7ac4a9dull, + 0x000000a2b07f3458ull, 0x000000a59a78ea6aull, 0x000000a87bd699fbull, 0x000000ab5d2e8970ull, + 0x000000ae3e80b8e3ull, 0x000000b11fcd2869ull, 0x000000b40113d818ull, 0x000000b6e254c80aull, + 0x000000b9c38ff853ull, 0x000000bca4c5690cull, 0x000000bf85f51a4aull, 0x000000c2671f0c26ull, + 0x000000c548433eb6ull, 0x000000c82961b211ull, 0x000000cb0a7a664dull, 0x000000cdeb8d5b82ull, + 0x000000d0cc9a91c8ull, 0x000000d3ada20933ull, 0x000000d68ea3c1ddull, 
0x000000d96f9fbbdbull, + 0x000000dc5095f744ull, 0x000000df31867430ull, 0x000000e2127132b5ull, 0x000000e4f35632eaull, + 0x000000e7d43574e6ull, 0x000000eab50ef8c1ull, 0x000000ed95e2be90ull, 0x000000f076b0c66cull, + 0x000000f35779106aull, 0x000000f6383b9ca2ull, 0x000000f918f86b2aull, 0x000000fbf9af7c1aull, + 0x000000feda60cf88ull, 0x00000101bb0c658cull, 0x000001049bb23e3cull, 0x000001077c5259afull, + 0x0000010a5cecb7fcull, 0x0000010d3d81593aull, 0x000001101e103d7full, 0x00000112fe9964e4ull, + 0x00000115df1ccf7eull, 0x00000118bf9a7d64ull, 0x0000011ba0126eadull, 0x0000011e8084a371ull, + 0x0000012160f11bc6ull, 0x000001244157d7c3ull, 0x0000012721b8d77full, 0x0000012a02141b10ull, + 0x0000012ce269a28eull, 0x0000012fc2b96e0full, 0x00000132a3037daaull, 0x000001358347d177ull, + 0x000001386386698cull, 0x0000013b43bf45ffull, 0x0000013e23f266e9ull, 0x00000141041fcc5eull, + 0x00000143e4477678ull, 0x00000146c469654bull, 0x00000149a48598f0ull, 0x0000014c849c117cull, + 0x0000014f64accf08ull, 0x0000015244b7d1a9ull, 0x0000015524bd1976ull, 0x0000015804bca687ull, + 0x0000015ae4b678f2ull, 0x0000015dc4aa90ceull, 0x00000160a498ee31ull, 0x0000016384819134ull, + 0x00000166646479ecull, 0x000001694441a870ull, 0x0000016c24191cd7ull, 0x0000016df6ca19bdull, + 0x00000171e3b6d7aaull, 0x00000174c37d1e44ull, 0x00000177a33dab1cull, 0x0000017a82f87e49ull, + 0x0000017d62ad97e2ull, 0x00000180425cf7feull, 0x00000182b07f3458ull, 0x0000018601aa8c19ull, + 0x00000188e148c046ull, 0x0000018bc0e13b52ull, 0x0000018ea073fd52ull, 0x000001918001065dull, + 0x000001945f88568bull, 0x000001973f09edf2ull, 0x0000019a1e85ccaaull, 0x0000019cfdfbf2c8ull, + 0x0000019fdd6c6063ull, 0x000001a2bcd71593ull, 0x000001a59c3c126eull, 0x000001a87b9b570bull, + 0x000001ab5af4e380ull, 0x000001ae3a48b7e5ull, 0x000001b11996d450ull, 0x000001b3f8df38d9ull, + 0x000001b6d821e595ull, 0x000001b9b75eda9bull, 0x000001bc96961803ull, 0x000001bf75c79de3ull, + 0x000001c254f36c51ull, 0x000001c534198365ull, 0x000001c81339e336ull, 0x000001caf2548bd9ull, + 
0x000001cdd1697d67ull, 0x000001d0b078b7f5ull, 0x000001d38f823b9aull, 0x000001d66e86086dull, + 0x000001d94d841e86ull, 0x000001dc2c7c7df9ull, 0x000001df0b6f26dfull, 0x000001e1ea5c194eull, + 0x000001e4c943555dull, 0x000001e7a824db23ull, 0x000001ea8700aab5ull, 0x000001ed65d6c42bull, + 0x000001f044a7279dull, 0x000001f32371d51full, 0x000001f60236cccaull, 0x000001f8e0f60eb3ull, + 0x000001fbbfaf9af3ull, 0x000001fe9e63719eull, 0x000002017d1192ccull, 0x000002045bb9fe94ull, + 0x000002073a5cb50dull, 0x00000209c06e6212ull, 0x0000020cf791026aull, 0x0000020fd622997cull, + 0x00000212b07f3458ull, 0x000002159334a8d8ull, 0x0000021871b52150ull, 0x0000021b502fe517ull, + 0x0000021d6a73a78full, 0x000002210d144eeeull, 0x00000223eb7df52cull, 0x00000226c9e1e713ull, + 0x00000229a84024bbull, 0x0000022c23679b4eull, 0x0000022f64eb83a8ull, 0x000002324338a51bull, + 0x00000235218012a9ull, 0x00000237ffc1cc69ull, 0x0000023a2c3b0ea4ull, 0x0000023d13ee805bull, + 0x0000024035e9221full, 0x00000243788faf25ull, 0x0000024656b4e735ull, 0x00000247ed646bfeull, + 0x0000024c12ee3d98ull, 0x0000024ef1025c1aull, 0x00000251cf10c799ull, 0x0000025492644d65ull, + 0x000002578b1c85eeull, 0x0000025a6919d8f0ull, 0x0000025d13ee805bull, 0x0000026025036716ull, + 0x0000026296453882ull, 0x00000265e0d62b53ull, 0x00000268beb701f3ull, 0x0000026b9c92265eull, + 0x0000026d32f798a9ull, 0x00000271583758ebull, 0x000002743601673bull, 0x0000027713c5c3b0ull, + 0x00000279f1846e5full, 0x0000027ccf3d6761ull, 0x0000027e6580aecbull, 0x000002828a9e44b3ull, + 0x0000028568462932ull, 0x00000287bdbf5255ull, 0x0000028b2384de4aull, 0x0000028d13ee805bull, + 0x0000029035e9221full, 0x0000029296453882ull, 0x0000029699bdfb61ull, 0x0000029902a37aabull, + 0x0000029c54b864c9ull, 0x0000029deabd1083ull, 0x000002a20f9c0bb5ull, 0x000002a4c7605d61ull, + 0x000002a7bdbf5255ull, 0x000002a96056dafcull, 0x000002ac3daf14efull, 0x000002af1b019ecaull, + 0x000002b296453882ull, 0x000002b5d022d80full, 0x000002b8fa471cb3ull, 0x000002ba9012e713ull, + 0x000002bd6d4901ccull, 
/*
 * Hash a single 32-bit value.
 *
 * The result starts as crush_hash_seed XOR @a and is then passed twice
 * through crush_hashmix() together with a copy of the input and two
 * fixed constants.  crush_hashmix() overwrites all three of its
 * arguments, so the order of the two calls matters.
 */
static __u32 crush_hash32_rjenkins1(__u32 a)
{
	__u32 hash = crush_hash_seed ^ a;
	__u32 b = a;		/* scratch copy of the input for the first mix */
	__u32 x = 231232;	/* fixed filler word */
	__u32 y = 1232;		/* fixed filler word */
	crush_hashmix(b, x, hash);
	/* y and a are clobbered here too, but neither is read afterwards */
	crush_hashmix(y, a, hash);
	return hash;
}
crush_hashmix(a, b, hash); + crush_hashmix(c, d, hash); + crush_hashmix(a, x, hash); + crush_hashmix(y, b, hash); + crush_hashmix(c, x, hash); + crush_hashmix(y, d, hash); + return hash; +} + +static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d, + __u32 e) +{ + __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e; + __u32 x = 231232; + __u32 y = 1232; + crush_hashmix(a, b, hash); + crush_hashmix(c, d, hash); + crush_hashmix(e, x, hash); + crush_hashmix(y, a, hash); + crush_hashmix(b, x, hash); + crush_hashmix(y, c, hash); + crush_hashmix(d, x, hash); + crush_hashmix(y, e, hash); + return hash; +} + + +__u32 crush_hash32(int type, __u32 a) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1(a); + default: + return 0; + } +} + +__u32 crush_hash32_2(int type, __u32 a, __u32 b) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1_2(a, b); + default: + return 0; + } +} + +__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1_3(a, b, c); + default: + return 0; + } +} + +__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1_4(a, b, c, d); + default: + return 0; + } +} + +__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return crush_hash32_rjenkins1_5(a, b, c, d, e); + default: + return 0; + } +} + +const char *crush_hash_name(int type) +{ + switch (type) { + case CRUSH_HASH_RJENKINS1: + return "rjenkins1"; + default: + return "unknown"; + } +} diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c new file mode 100644 index 000000000..5b47736d2 --- /dev/null +++ b/net/ceph/crush/mapper.c @@ -0,0 +1,927 @@ + +#ifdef __KERNEL__ +# include +# include +# include +# include +# ifndef dprintk +# define dprintk(args...) 
+# endif +#else +# include +# include +# include +# include +# define BUG_ON(x) assert(!(x)) +# define dprintk(args...) /* printf(args) */ +# define kmalloc(x, f) malloc(x) +# define kfree(x) free(x) +#endif + +#include +#include +#include "crush_ln_table.h" + +/* + * Implement the core CRUSH mapping algorithm. + */ + +/** + * crush_find_rule - find a crush_rule id for a given ruleset, type, and size. + * @map: the crush_map + * @ruleset: the storage ruleset id (user defined) + * @type: storage ruleset type (user defined) + * @size: output set size + */ +int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size) +{ + __u32 i; + + for (i = 0; i < map->max_rules; i++) { + if (map->rules[i] && + map->rules[i]->mask.ruleset == ruleset && + map->rules[i]->mask.type == type && + map->rules[i]->mask.min_size <= size && + map->rules[i]->mask.max_size >= size) + return i; + } + return -1; +} + + +/* + * bucket choose methods + * + * For each bucket algorithm, we have a "choose" method that, given a + * crush input @x and replica position (usually, position in output set) @r, + * will produce an item in the bucket. + */ + +/* + * Choose based on a random permutation of the bucket. + * + * We used to use some prime number arithmetic to do this, but it + * wasn't very random, and had some other bad behaviors. Instead, we + * calculate an actual random permutation of the bucket members. + * Since this is expensive, we optimize for the r=0 case, which + * captures the vast majority of calls. 
/*
 * Pick the item at position (@r mod bucket->size) of a pseudo-random
 * permutation of the bucket that is derived from @x.
 *
 * The permutation is cached in the bucket and built lazily:
 *   bucket->perm_x  - the @x the cached permutation belongs to
 *   bucket->perm_n  - how many leading entries of bucket->perm[] are final;
 *                     0xffff is a marker meaning "only perm[0] was filled
 *                     in by the r=0 shortcut"
 *
 * bucket->size is used as a modulus and must therefore be non-zero.
 * The function writes to the bucket's perm state on every call.
 */
static int bucket_perm_choose(struct crush_bucket *bucket,
			      int x, int r)
{
	unsigned int pr = r % bucket->size;
	unsigned int i, s;

	/* start a new permutation if @x has changed */
	if (bucket->perm_x != (__u32)x || bucket->perm_n == 0) {
		dprintk("bucket %d new x=%d\n", bucket->id, x);
		bucket->perm_x = x;

		/* optimize common r=0 case */
		if (pr == 0) {
			/*
			 * Only the first slot is needed: compute it directly
			 * and leave the rest of perm[] unset for now.
			 */
			s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
				bucket->size;
			bucket->perm[0] = s;
			bucket->perm_n = 0xffff;  /* magic value, see below */
			goto out;
		}

		/* start from the identity permutation with nothing finalized */
		for (i = 0; i < bucket->size; i++)
			bucket->perm[i] = i;
		bucket->perm_n = 0;
	} else if (bucket->perm_n == 0xffff) {
		/* clean up after the r=0 case above */
		/*
		 * perm[0] already holds the shortcut's pick; fill the other
		 * slots with the identity and put 0 where that pick used to
		 * live, i.e. finish the swap of entries 0 and perm[0].
		 */
		for (i = 1; i < bucket->size; i++)
			bucket->perm[i] = i;
		bucket->perm[bucket->perm[0]] = 0;
		bucket->perm_n = 1;
	}

	/* calculate permutation up to pr */
	for (i = 0; i < bucket->perm_n; i++)
		dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
	while (bucket->perm_n <= pr) {
		unsigned int p = bucket->perm_n;
		/* no point in swapping the final entry */
		if (p < bucket->size - 1) {
			/* choose one of the not-yet-finalized entries [p, size) */
			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
				(bucket->size - p);
			if (i) {
				/* swap it into slot p; i == 0 keeps perm[p] as is */
				unsigned int t = bucket->perm[p + i];
				bucket->perm[p + i] = bucket->perm[p];
				bucket->perm[p] = t;
			}
			dprintk(" perm_choose swap %d with %d\n", p, p+i);
		}
		bucket->perm_n++;
	}
	for (i = 0; i < bucket->size; i++)
		dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);

	s = bucket->perm[pr];
out:
	/* s indexes bucket->items[], it is not the item id itself */
	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
		bucket->size, x, r, pr, s);
	return bucket->items[s];
}
/* (binary) tree */

/*
 * Number of trailing zero bits in @n, i.e. the level of node @n above
 * the odd-numbered (terminal) nodes.  @n must be non-zero.
 */
static int height(int n)
{
	int level;

	for (level = 0; !(n & 1); level++)
		n >>= 1;
	return level;
}

/* Left child of non-terminal node @x: half a subtree width below it. */
static int left(int x)
{
	const int half = 1 << (height(x) - 1);

	return x - half;
}

/* Right child of non-terminal node @x: half a subtree width above it. */
static int right(int x)
{
	const int half = 1 << (height(x) - 1);

	return x + half;
}

/* Non-zero when @x is odd, which is how terminal nodes are numbered. */
static int terminal(int x)
{
	return (x & 1);
}
/*
 * compute 2^44*log2(input+1)
 *
 * Fixed-point base-2 logarithm built from two lookup tables
 * (__RH_LH_tbl and __LL_tbl): the input is normalized, a coarse
 * log2 is read from the first table, a fine correction from the
 * second, and both are added to the integer exponent.
 *
 * NOTE(review): the table indexing below only stays in range when
 * xin + 1 <= 0x10000; larger inputs look unsupported -- confirm
 * against callers.  If xin + 1 wraps to 0 the normalization loop
 * never terminates.
 */
uint64_t crush_ln(unsigned xin)
{
    unsigned x=xin, x1;
    int iexpon, index1, index2;
    uint64_t RH, LH, LL, xl64, result;

    /* work on input+1 so that an input of 0 is well defined */
    x++;

    // normalize input
    // (shift left until bit 15 or bit 16 is set; iexpon counts down
    //  from 15 once per shift)
    iexpon = 15;
    while(!(x&0x18000)) { x<<=1; iexpon--; }

    /* table entries come in (RH, LH) pairs, hence the <<1 */
    index1 = (x>>8)<<1;
    // RH ~ 2^56/index1
    RH = __RH_LH_tbl[index1 - 256];
    // LH ~ 2^48 * log2(index1/256)
    LH = __RH_LH_tbl[index1 + 1 - 256];

    // RH*x ~ 2^48 * (2^15 + xf), xf<2^8
    xl64 = (int64_t)x * RH;
    xl64 >>= 48;
    x1 = xl64;

    /* integer part of the logarithm, scaled by 2^44 */
    result = iexpon;
    result <<= (12 + 32);

    /* low 8 bits of the residual select the fine correction */
    index2 = x1 & 0xff;
    // LL ~ 2^48*log2(1.0+index2/2^15)
    LL = __LL_tbl[index2];

    LH = LH + LL;

    /* tables are scaled by 2^48; drop 4 bits to reach the 2^44 scale */
    LH >>= (48-12 - 32);
    result += LH;

    return result;
}
+ * + * the natural log lookup table maps [0,0xffff] + * (corresponding to real numbers [1/0x10000, 1] to + * [0, 0xffffffffffff] (corresponding to real numbers + * [-11.090355,0]). + */ + ln = crush_ln(u) - 0x1000000000000ll; + + /* + * divide by 16.16 fixed-point weight. note + * that the ln value is negative, so a larger + * weight means a larger (less negative) value + * for draw. + */ + draw = div64_s64(ln, w); + } else { + draw = S64_MIN; + } + + if (i == 0 || draw > high_draw) { + high = i; + high_draw = draw; + } + } + return bucket->h.items[high]; +} + + +static int crush_bucket_choose(struct crush_bucket *in, int x, int r) +{ + dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); + BUG_ON(in->size == 0); + switch (in->alg) { + case CRUSH_BUCKET_UNIFORM: + return bucket_uniform_choose((struct crush_bucket_uniform *)in, + x, r); + case CRUSH_BUCKET_LIST: + return bucket_list_choose((struct crush_bucket_list *)in, + x, r); + case CRUSH_BUCKET_TREE: + return bucket_tree_choose((struct crush_bucket_tree *)in, + x, r); + case CRUSH_BUCKET_STRAW: + return bucket_straw_choose((struct crush_bucket_straw *)in, + x, r); + case CRUSH_BUCKET_STRAW2: + return bucket_straw2_choose((struct crush_bucket_straw2 *)in, + x, r); + default: + dprintk("unknown bucket %d alg %d\n", in->id, in->alg); + return in->items[0]; + } +} + + +/* + * true if device is marked "out" (failed, fully offloaded) + * of the cluster + */ +static int is_out(const struct crush_map *map, + const __u32 *weight, int weight_max, + int item, int x) +{ + if (item >= weight_max) + return 1; + if (weight[item] >= 0x10000) + return 0; + if (weight[item] == 0) + return 1; + if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff) + < weight[item]) + return 0; + return 1; +} + +/** + * crush_choose_firstn - choose numrep distinct items of given type + * @map: the crush_map + * @bucket: the bucket we are choose an item from + * @x: crush input value + * @numrep: the number of items to choose + * 
@type: the type of item to choose + * @out: pointer to output vector + * @outpos: our position in that vector + * @out_size: size of the out vector + * @tries: number of attempts to make + * @recurse_tries: number of attempts to have recursive chooseleaf make + * @local_retries: localized retries + * @local_fallback_retries: localized fallback retries + * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) + * @vary_r: pass r to recursive calls + * @out2: second output vector for leaf items (if @recurse_to_leaf) + * @parent_r: r value passed from the parent + */ +static int crush_choose_firstn(const struct crush_map *map, + struct crush_bucket *bucket, + const __u32 *weight, int weight_max, + int x, int numrep, int type, + int *out, int outpos, + int out_size, + unsigned int tries, + unsigned int recurse_tries, + unsigned int local_retries, + unsigned int local_fallback_retries, + int recurse_to_leaf, + unsigned int vary_r, + int *out2, + int parent_r) +{ + int rep; + unsigned int ftotal, flocal; + int retry_descent, retry_bucket, skip_rep; + struct crush_bucket *in = bucket; + int r; + int i; + int item = 0; + int itemtype; + int collide, reject; + int count = out_size; + + dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n", + recurse_to_leaf ? 
"_LEAF" : "", + bucket->id, x, outpos, numrep, + tries, recurse_tries, local_retries, local_fallback_retries, + parent_r); + + for (rep = outpos; rep < numrep && count > 0 ; rep++) { + /* keep trying until we get a non-out, non-colliding item */ + ftotal = 0; + skip_rep = 0; + do { + retry_descent = 0; + in = bucket; /* initial bucket */ + + /* choose through intervening buckets */ + flocal = 0; + do { + collide = 0; + retry_bucket = 0; + r = rep + parent_r; + /* r' = r + f_total */ + r += ftotal; + + /* bucket choose */ + if (in->size == 0) { + reject = 1; + goto reject; + } + if (local_fallback_retries > 0 && + flocal >= (in->size>>1) && + flocal > local_fallback_retries) + item = bucket_perm_choose(in, x, r); + else + item = crush_bucket_choose(in, x, r); + if (item >= map->max_devices) { + dprintk(" bad item %d\n", item); + skip_rep = 1; + break; + } + + /* desired type? */ + if (item < 0) + itemtype = map->buckets[-1-item]->type; + else + itemtype = 0; + dprintk(" item %d type %d\n", item, itemtype); + + /* keep going? */ + if (itemtype != type) { + if (item >= 0 || + (-1-item) >= map->max_buckets) { + dprintk(" bad item type %d\n", type); + skip_rep = 1; + break; + } + in = map->buckets[-1-item]; + retry_bucket = 1; + continue; + } + + /* collision? */ + for (i = 0; i < outpos; i++) { + if (out[i] == item) { + collide = 1; + break; + } + } + + reject = 0; + if (!collide && recurse_to_leaf) { + if (item < 0) { + int sub_r; + if (vary_r) + sub_r = r >> (vary_r-1); + else + sub_r = 0; + if (crush_choose_firstn(map, + map->buckets[-1-item], + weight, weight_max, + x, outpos+1, 0, + out2, outpos, count, + recurse_tries, 0, + local_retries, + local_fallback_retries, + 0, + vary_r, + NULL, + sub_r) <= outpos) + /* didn't get leaf */ + reject = 1; + } else { + /* we already have a leaf! */ + out2[outpos] = item; + } + } + + if (!reject) { + /* out? 
*/ + if (itemtype == 0) + reject = is_out(map, weight, + weight_max, + item, x); + else + reject = 0; + } + +reject: + if (reject || collide) { + ftotal++; + flocal++; + + if (collide && flocal <= local_retries) + /* retry locally a few times */ + retry_bucket = 1; + else if (local_fallback_retries > 0 && + flocal <= in->size + local_fallback_retries) + /* exhaustive bucket search */ + retry_bucket = 1; + else if (ftotal < tries) + /* then retry descent */ + retry_descent = 1; + else + /* else give up */ + skip_rep = 1; + dprintk(" reject %d collide %d " + "ftotal %u flocal %u\n", + reject, collide, ftotal, + flocal); + } + } while (retry_bucket); + } while (retry_descent); + + if (skip_rep) { + dprintk("skip rep\n"); + continue; + } + + dprintk("CHOOSE got %d\n", item); + out[outpos] = item; + outpos++; + count--; + } + + dprintk("CHOOSE returns %d\n", outpos); + return outpos; +} + + +/** + * crush_choose_indep: alternative breadth-first positionally stable mapping + * + */ +static void crush_choose_indep(const struct crush_map *map, + struct crush_bucket *bucket, + const __u32 *weight, int weight_max, + int x, int left, int numrep, int type, + int *out, int outpos, + unsigned int tries, + unsigned int recurse_tries, + int recurse_to_leaf, + int *out2, + int parent_r) +{ + struct crush_bucket *in = bucket; + int endpos = outpos + left; + int rep; + unsigned int ftotal; + int r; + int i; + int item = 0; + int itemtype; + int collide; + + dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? 
"_LEAF" : "", + bucket->id, x, outpos, numrep); + + /* initially my result is undefined */ + for (rep = outpos; rep < endpos; rep++) { + out[rep] = CRUSH_ITEM_UNDEF; + if (out2) + out2[rep] = CRUSH_ITEM_UNDEF; + } + + for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) { + for (rep = outpos; rep < endpos; rep++) { + if (out[rep] != CRUSH_ITEM_UNDEF) + continue; + + in = bucket; /* initial bucket */ + + /* choose through intervening buckets */ + for (;;) { + /* note: we base the choice on the position + * even in the nested call. that means that + * if the first layer chooses the same bucket + * in a different position, we will tend to + * choose a different item in that bucket. + * this will involve more devices in data + * movement and tend to distribute the load. + */ + r = rep + parent_r; + + /* be careful */ + if (in->alg == CRUSH_BUCKET_UNIFORM && + in->size % numrep == 0) + /* r'=r+(n+1)*f_total */ + r += (numrep+1) * ftotal; + else + /* r' = r + n*f_total */ + r += numrep * ftotal; + + /* bucket choose */ + if (in->size == 0) { + dprintk(" empty bucket\n"); + break; + } + + item = crush_bucket_choose(in, x, r); + if (item >= map->max_devices) { + dprintk(" bad item %d\n", item); + out[rep] = CRUSH_ITEM_NONE; + if (out2) + out2[rep] = CRUSH_ITEM_NONE; + left--; + break; + } + + /* desired type? */ + if (item < 0) + itemtype = map->buckets[-1-item]->type; + else + itemtype = 0; + dprintk(" item %d type %d\n", item, itemtype); + + /* keep going? */ + if (itemtype != type) { + if (item >= 0 || + (-1-item) >= map->max_buckets) { + dprintk(" bad item type %d\n", type); + out[rep] = CRUSH_ITEM_NONE; + if (out2) + out2[rep] = + CRUSH_ITEM_NONE; + left--; + break; + } + in = map->buckets[-1-item]; + continue; + } + + /* collision? 
*/ + collide = 0; + for (i = outpos; i < endpos; i++) { + if (out[i] == item) { + collide = 1; + break; + } + } + if (collide) + break; + + if (recurse_to_leaf) { + if (item < 0) { + crush_choose_indep(map, + map->buckets[-1-item], + weight, weight_max, + x, 1, numrep, 0, + out2, rep, + recurse_tries, 0, + 0, NULL, r); + if (out2[rep] == CRUSH_ITEM_NONE) { + /* placed nothing; no leaf */ + break; + } + } else { + /* we already have a leaf! */ + out2[rep] = item; + } + } + + /* out? */ + if (itemtype == 0 && + is_out(map, weight, weight_max, item, x)) + break; + + /* yay! */ + out[rep] = item; + left--; + break; + } + } + } + for (rep = outpos; rep < endpos; rep++) { + if (out[rep] == CRUSH_ITEM_UNDEF) { + out[rep] = CRUSH_ITEM_NONE; + } + if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) { + out2[rep] = CRUSH_ITEM_NONE; + } + } +} + +/** + * crush_do_rule - calculate a mapping with the given input and rule + * @map: the crush_map + * @ruleno: the rule id + * @x: hash input + * @result: pointer to result vector + * @result_max: maximum result size + * @weight: weight vector (for map leaves) + * @weight_max: size of weight vector + * @scratch: scratch vector for private use; must be >= 3 * result_max + */ +int crush_do_rule(const struct crush_map *map, + int ruleno, int x, int *result, int result_max, + const __u32 *weight, int weight_max, + int *scratch) +{ + int result_len; + int *a = scratch; + int *b = scratch + result_max; + int *c = scratch + result_max*2; + int recurse_to_leaf; + int *w; + int wsize = 0; + int *o; + int osize; + int *tmp; + struct crush_rule *rule; + __u32 step; + int i, j; + int numrep; + int out_size; + /* + * the original choose_total_tries value was off by one (it + * counted "retries" and not "tries"). add one. 
+ */ + int choose_tries = map->choose_total_tries + 1; + int choose_leaf_tries = 0; + /* + * the local tries values were counted as "retries", though, + * and need no adjustment + */ + int choose_local_retries = map->choose_local_tries; + int choose_local_fallback_retries = map->choose_local_fallback_tries; + + int vary_r = map->chooseleaf_vary_r; + + if ((__u32)ruleno >= map->max_rules) { + dprintk(" bad ruleno %d\n", ruleno); + return 0; + } + + rule = map->rules[ruleno]; + result_len = 0; + w = a; + o = b; + + for (step = 0; step < rule->len; step++) { + int firstn = 0; + struct crush_rule_step *curstep = &rule->steps[step]; + + switch (curstep->op) { + case CRUSH_RULE_TAKE: + w[0] = curstep->arg1; + wsize = 1; + break; + + case CRUSH_RULE_SET_CHOOSE_TRIES: + if (curstep->arg1 > 0) + choose_tries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSELEAF_TRIES: + if (curstep->arg1 > 0) + choose_leaf_tries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: + if (curstep->arg1 >= 0) + choose_local_retries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: + if (curstep->arg1 >= 0) + choose_local_fallback_retries = curstep->arg1; + break; + + case CRUSH_RULE_SET_CHOOSELEAF_VARY_R: + if (curstep->arg1 >= 0) + vary_r = curstep->arg1; + break; + + case CRUSH_RULE_CHOOSELEAF_FIRSTN: + case CRUSH_RULE_CHOOSE_FIRSTN: + firstn = 1; + /* fall through */ + case CRUSH_RULE_CHOOSELEAF_INDEP: + case CRUSH_RULE_CHOOSE_INDEP: + if (wsize == 0) + break; + + recurse_to_leaf = + curstep->op == + CRUSH_RULE_CHOOSELEAF_FIRSTN || + curstep->op == + CRUSH_RULE_CHOOSELEAF_INDEP; + + /* reset output */ + osize = 0; + + for (i = 0; i < wsize; i++) { + /* + * see CRUSH_N, CRUSH_N_MINUS macros. 
+ * basically, numrep <= 0 means relative to + * the provided result_max + */ + numrep = curstep->arg1; + if (numrep <= 0) { + numrep += result_max; + if (numrep <= 0) + continue; + } + j = 0; + if (firstn) { + int recurse_tries; + if (choose_leaf_tries) + recurse_tries = + choose_leaf_tries; + else if (map->chooseleaf_descend_once) + recurse_tries = 1; + else + recurse_tries = choose_tries; + osize += crush_choose_firstn( + map, + map->buckets[-1-w[i]], + weight, weight_max, + x, numrep, + curstep->arg2, + o+osize, j, + result_max-osize, + choose_tries, + recurse_tries, + choose_local_retries, + choose_local_fallback_retries, + recurse_to_leaf, + vary_r, + c+osize, + 0); + } else { + out_size = ((numrep < (result_max-osize)) ? + numrep : (result_max-osize)); + crush_choose_indep( + map, + map->buckets[-1-w[i]], + weight, weight_max, + x, out_size, numrep, + curstep->arg2, + o+osize, j, + choose_tries, + choose_leaf_tries ? + choose_leaf_tries : 1, + recurse_to_leaf, + c+osize, + 0); + osize += out_size; + } + } + + if (recurse_to_leaf) + /* copy final _leaf_ values to output set */ + memcpy(o, c, osize*sizeof(*o)); + + /* swap o and w arrays */ + tmp = o; + o = w; + w = tmp; + wsize = osize; + break; + + + case CRUSH_RULE_EMIT: + for (i = 0; i < wsize && result_len < result_max; i++) { + result[result_len] = w[i]; + result_len++; + } + wsize = 0; + break; + + default: + dprintk(" unknown op %d at step %d\n", + curstep->op, step); + break; + } + } + return result_len; +} + + diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c new file mode 100644 index 000000000..790fe89d9 --- /dev/null +++ b/net/ceph/crypto.c @@ -0,0 +1,583 @@ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include "crypto.h" + +int ceph_crypto_key_clone(struct ceph_crypto_key *dst, + const struct ceph_crypto_key *src) +{ + memcpy(dst, src, sizeof(struct ceph_crypto_key)); + dst->key = kmemdup(src->key, src->len, GFP_NOFS); + if (!dst->key) + return 
-ENOMEM; + return 0; +} + +int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end) +{ + if (*p + sizeof(u16) + sizeof(key->created) + + sizeof(u16) + key->len > end) + return -ERANGE; + ceph_encode_16(p, key->type); + ceph_encode_copy(p, &key->created, sizeof(key->created)); + ceph_encode_16(p, key->len); + ceph_encode_copy(p, key->key, key->len); + return 0; +} + +int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end) +{ + ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad); + key->type = ceph_decode_16(p); + ceph_decode_copy(p, &key->created, sizeof(key->created)); + key->len = ceph_decode_16(p); + ceph_decode_need(p, end, key->len, bad); + key->key = kmalloc(key->len, GFP_NOFS); + if (!key->key) + return -ENOMEM; + ceph_decode_copy(p, key->key, key->len); + return 0; + +bad: + dout("failed to decode crypto key\n"); + return -EINVAL; +} + +int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey) +{ + int inlen = strlen(inkey); + int blen = inlen * 3 / 4; + void *buf, *p; + int ret; + + dout("crypto_key_unarmor %s\n", inkey); + buf = kmalloc(blen, GFP_NOFS); + if (!buf) + return -ENOMEM; + blen = ceph_unarmor(buf, inkey, inkey+inlen); + if (blen < 0) { + kfree(buf); + return blen; + } + + p = buf; + ret = ceph_crypto_key_decode(key, &p, p + blen); + kfree(buf); + if (ret) + return ret; + dout("crypto_key_unarmor key %p type %d len %d\n", key, + key->type, key->len); + return 0; +} + + + +#define AES_KEY_SIZE 16 + +static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void) +{ + return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); +} + +static const u8 *aes_iv = (u8 *)CEPH_AES_IV; + +/* + * Should be used for buffers allocated with ceph_kvmalloc(). + * Currently these are encrypt out-buffer (ceph_buffer) and decrypt + * in-buffer (msg front). + * + * Dispose of @sgt with teardown_sgtable(). 
+ * + * @prealloc_sg is to avoid memory allocation inside sg_alloc_table() + * in cases where a single sg is sufficient. No attempt to reduce the + * number of sgs by squeezing physically contiguous pages together is + * made though, for simplicity. + */ +static int setup_sgtable(struct sg_table *sgt, struct scatterlist *prealloc_sg, + const void *buf, unsigned int buf_len) +{ + struct scatterlist *sg; + const bool is_vmalloc = is_vmalloc_addr(buf); + unsigned int off = offset_in_page(buf); + unsigned int chunk_cnt = 1; + unsigned int chunk_len = PAGE_ALIGN(off + buf_len); + int i; + int ret; + + if (buf_len == 0) { + memset(sgt, 0, sizeof(*sgt)); + return -EINVAL; + } + + if (is_vmalloc) { + chunk_cnt = chunk_len >> PAGE_SHIFT; + chunk_len = PAGE_SIZE; + } + + if (chunk_cnt > 1) { + ret = sg_alloc_table(sgt, chunk_cnt, GFP_NOFS); + if (ret) + return ret; + } else { + WARN_ON(chunk_cnt != 1); + sg_init_table(prealloc_sg, 1); + sgt->sgl = prealloc_sg; + sgt->nents = sgt->orig_nents = 1; + } + + for_each_sg(sgt->sgl, sg, sgt->orig_nents, i) { + struct page *page; + unsigned int len = min(chunk_len - off, buf_len); + + if (is_vmalloc) + page = vmalloc_to_page(buf); + else + page = virt_to_page(buf); + + sg_set_page(sg, page, len, off); + + off = 0; + buf += len; + buf_len -= len; + } + WARN_ON(buf_len != 0); + + return 0; +} + +static void teardown_sgtable(struct sg_table *sgt) +{ + if (sgt->orig_nents > 1) + sg_free_table(sgt); +} + +static int ceph_aes_encrypt(const void *key, int key_len, + void *dst, size_t *dst_len, + const void *src, size_t src_len) +{ + struct scatterlist sg_in[2], prealloc_sg; + struct sg_table sg_out; + struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); + struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 }; + int ret; + void *iv; + int ivsize; + size_t zero_padding = (0x10 - (src_len & 0x0f)); + char pad[16]; + + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + memset(pad, zero_padding, zero_padding); + + *dst_len = src_len + 
zero_padding; + + sg_init_table(sg_in, 2); + sg_set_buf(&sg_in[0], src, src_len); + sg_set_buf(&sg_in[1], pad, zero_padding); + ret = setup_sgtable(&sg_out, &prealloc_sg, dst, *dst_len); + if (ret) + goto out_tfm; + + crypto_blkcipher_setkey((void *)tfm, key, key_len); + iv = crypto_blkcipher_crt(tfm)->iv; + ivsize = crypto_blkcipher_ivsize(tfm); + memcpy(iv, aes_iv, ivsize); + + /* + print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1, + key, key_len, 1); + print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1, + src, src_len, 1); + print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1, + pad, zero_padding, 1); + */ + ret = crypto_blkcipher_encrypt(&desc, sg_out.sgl, sg_in, + src_len + zero_padding); + if (ret < 0) { + pr_err("ceph_aes_crypt failed %d\n", ret); + goto out_sg; + } + /* + print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1, + dst, *dst_len, 1); + */ + +out_sg: + teardown_sgtable(&sg_out); +out_tfm: + crypto_free_blkcipher(tfm); + return ret; +} + +static int ceph_aes_encrypt2(const void *key, int key_len, void *dst, + size_t *dst_len, + const void *src1, size_t src1_len, + const void *src2, size_t src2_len) +{ + struct scatterlist sg_in[3], prealloc_sg; + struct sg_table sg_out; + struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); + struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 }; + int ret; + void *iv; + int ivsize; + size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f)); + char pad[16]; + + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + memset(pad, zero_padding, zero_padding); + + *dst_len = src1_len + src2_len + zero_padding; + + sg_init_table(sg_in, 3); + sg_set_buf(&sg_in[0], src1, src1_len); + sg_set_buf(&sg_in[1], src2, src2_len); + sg_set_buf(&sg_in[2], pad, zero_padding); + ret = setup_sgtable(&sg_out, &prealloc_sg, dst, *dst_len); + if (ret) + goto out_tfm; + + crypto_blkcipher_setkey((void *)tfm, key, key_len); + iv = crypto_blkcipher_crt(tfm)->iv; + ivsize = 
crypto_blkcipher_ivsize(tfm); + memcpy(iv, aes_iv, ivsize); + + /* + print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1, + key, key_len, 1); + print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1, + src1, src1_len, 1); + print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1, + src2, src2_len, 1); + print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1, + pad, zero_padding, 1); + */ + ret = crypto_blkcipher_encrypt(&desc, sg_out.sgl, sg_in, + src1_len + src2_len + zero_padding); + if (ret < 0) { + pr_err("ceph_aes_crypt2 failed %d\n", ret); + goto out_sg; + } + /* + print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1, + dst, *dst_len, 1); + */ + +out_sg: + teardown_sgtable(&sg_out); +out_tfm: + crypto_free_blkcipher(tfm); + return ret; +} + +static int ceph_aes_decrypt(const void *key, int key_len, + void *dst, size_t *dst_len, + const void *src, size_t src_len) +{ + struct sg_table sg_in; + struct scatterlist sg_out[2], prealloc_sg; + struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); + struct blkcipher_desc desc = { .tfm = tfm }; + char pad[16]; + void *iv; + int ivsize; + int ret; + int last_byte; + + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + sg_init_table(sg_out, 2); + sg_set_buf(&sg_out[0], dst, *dst_len); + sg_set_buf(&sg_out[1], pad, sizeof(pad)); + ret = setup_sgtable(&sg_in, &prealloc_sg, src, src_len); + if (ret) + goto out_tfm; + + crypto_blkcipher_setkey((void *)tfm, key, key_len); + iv = crypto_blkcipher_crt(tfm)->iv; + ivsize = crypto_blkcipher_ivsize(tfm); + memcpy(iv, aes_iv, ivsize); + + /* + print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1, + key, key_len, 1); + print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1, + src, src_len, 1); + */ + ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in.sgl, src_len); + if (ret < 0) { + pr_err("ceph_aes_decrypt failed %d\n", ret); + goto out_sg; + } + + if (src_len <= *dst_len) + last_byte = ((char *)dst)[src_len - 1]; + else + 
last_byte = pad[src_len - *dst_len - 1]; + if (last_byte <= 16 && src_len >= last_byte) { + *dst_len = src_len - last_byte; + } else { + pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n", + last_byte, (int)src_len); + return -EPERM; /* bad padding */ + } + /* + print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1, + dst, *dst_len, 1); + */ + +out_sg: + teardown_sgtable(&sg_in); +out_tfm: + crypto_free_blkcipher(tfm); + return ret; +} + +static int ceph_aes_decrypt2(const void *key, int key_len, + void *dst1, size_t *dst1_len, + void *dst2, size_t *dst2_len, + const void *src, size_t src_len) +{ + struct sg_table sg_in; + struct scatterlist sg_out[3], prealloc_sg; + struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); + struct blkcipher_desc desc = { .tfm = tfm }; + char pad[16]; + void *iv; + int ivsize; + int ret; + int last_byte; + + if (IS_ERR(tfm)) + return PTR_ERR(tfm); + + sg_init_table(sg_out, 3); + sg_set_buf(&sg_out[0], dst1, *dst1_len); + sg_set_buf(&sg_out[1], dst2, *dst2_len); + sg_set_buf(&sg_out[2], pad, sizeof(pad)); + ret = setup_sgtable(&sg_in, &prealloc_sg, src, src_len); + if (ret) + goto out_tfm; + + crypto_blkcipher_setkey((void *)tfm, key, key_len); + iv = crypto_blkcipher_crt(tfm)->iv; + ivsize = crypto_blkcipher_ivsize(tfm); + memcpy(iv, aes_iv, ivsize); + + /* + print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1, + key, key_len, 1); + print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1, + src, src_len, 1); + */ + ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in.sgl, src_len); + if (ret < 0) { + pr_err("ceph_aes_decrypt failed %d\n", ret); + goto out_sg; + } + + if (src_len <= *dst1_len) + last_byte = ((char *)dst1)[src_len - 1]; + else if (src_len <= *dst1_len + *dst2_len) + last_byte = ((char *)dst2)[src_len - *dst1_len - 1]; + else + last_byte = pad[src_len - *dst1_len - *dst2_len - 1]; + if (last_byte <= 16 && src_len >= last_byte) { + src_len -= last_byte; + } else { + 
pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n", + last_byte, (int)src_len); + return -EPERM; /* bad padding */ + } + + if (src_len < *dst1_len) { + *dst1_len = src_len; + *dst2_len = 0; + } else { + *dst2_len = src_len - *dst1_len; + } + /* + print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1, + dst1, *dst1_len, 1); + print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1, + dst2, *dst2_len, 1); + */ + +out_sg: + teardown_sgtable(&sg_in); +out_tfm: + crypto_free_blkcipher(tfm); + return ret; +} + + +int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, + const void *src, size_t src_len) +{ + switch (secret->type) { + case CEPH_CRYPTO_NONE: + if (*dst_len < src_len) + return -ERANGE; + memcpy(dst, src, src_len); + *dst_len = src_len; + return 0; + + case CEPH_CRYPTO_AES: + return ceph_aes_decrypt(secret->key, secret->len, dst, + dst_len, src, src_len); + + default: + return -EINVAL; + } +} + +int ceph_decrypt2(struct ceph_crypto_key *secret, + void *dst1, size_t *dst1_len, + void *dst2, size_t *dst2_len, + const void *src, size_t src_len) +{ + size_t t; + + switch (secret->type) { + case CEPH_CRYPTO_NONE: + if (*dst1_len + *dst2_len < src_len) + return -ERANGE; + t = min(*dst1_len, src_len); + memcpy(dst1, src, t); + *dst1_len = t; + src += t; + src_len -= t; + if (src_len) { + t = min(*dst2_len, src_len); + memcpy(dst2, src, t); + *dst2_len = t; + } + return 0; + + case CEPH_CRYPTO_AES: + return ceph_aes_decrypt2(secret->key, secret->len, + dst1, dst1_len, dst2, dst2_len, + src, src_len); + + default: + return -EINVAL; + } +} + +int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, + const void *src, size_t src_len) +{ + switch (secret->type) { + case CEPH_CRYPTO_NONE: + if (*dst_len < src_len) + return -ERANGE; + memcpy(dst, src, src_len); + *dst_len = src_len; + return 0; + + case CEPH_CRYPTO_AES: + return ceph_aes_encrypt(secret->key, secret->len, dst, + dst_len, src, src_len); + + 
default: + return -EINVAL; + } +} + +int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, + const void *src1, size_t src1_len, + const void *src2, size_t src2_len) +{ + switch (secret->type) { + case CEPH_CRYPTO_NONE: + if (*dst_len < src1_len + src2_len) + return -ERANGE; + memcpy(dst, src1, src1_len); + memcpy(dst + src1_len, src2, src2_len); + *dst_len = src1_len + src2_len; + return 0; + + case CEPH_CRYPTO_AES: + return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len, + src1, src1_len, src2, src2_len); + + default: + return -EINVAL; + } +} + +static int ceph_key_preparse(struct key_preparsed_payload *prep) +{ + struct ceph_crypto_key *ckey; + size_t datalen = prep->datalen; + int ret; + void *p; + + ret = -EINVAL; + if (datalen <= 0 || datalen > 32767 || !prep->data) + goto err; + + ret = -ENOMEM; + ckey = kmalloc(sizeof(*ckey), GFP_KERNEL); + if (!ckey) + goto err; + + /* TODO ceph_crypto_key_decode should really take const input */ + p = (void *)prep->data; + ret = ceph_crypto_key_decode(ckey, &p, (char*)prep->data+datalen); + if (ret < 0) + goto err_ckey; + + prep->payload[0] = ckey; + prep->quotalen = datalen; + return 0; + +err_ckey: + kfree(ckey); +err: + return ret; +} + +static void ceph_key_free_preparse(struct key_preparsed_payload *prep) +{ + struct ceph_crypto_key *ckey = prep->payload[0]; + ceph_crypto_key_destroy(ckey); + kfree(ckey); +} + +static void ceph_key_destroy(struct key *key) +{ + struct ceph_crypto_key *ckey = key->payload.data; + + ceph_crypto_key_destroy(ckey); + kfree(ckey); +} + +struct key_type key_type_ceph = { + .name = "ceph", + .preparse = ceph_key_preparse, + .free_preparse = ceph_key_free_preparse, + .instantiate = generic_key_instantiate, + .destroy = ceph_key_destroy, +}; + +int ceph_crypto_init(void) { + return register_key_type(&key_type_ceph); +} + +void ceph_crypto_shutdown(void) { + unregister_key_type(&key_type_ceph); +} diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h new file mode 
100644 index 000000000..d1498224c --- /dev/null +++ b/net/ceph/crypto.h @@ -0,0 +1,51 @@ +#ifndef _FS_CEPH_CRYPTO_H +#define _FS_CEPH_CRYPTO_H + +#include +#include + +/* + * cryptographic secret + */ +struct ceph_crypto_key { + int type; + struct ceph_timespec created; + int len; + void *key; +}; + +static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key) +{ + if (key) + kfree(key->key); +} + +int ceph_crypto_key_clone(struct ceph_crypto_key *dst, + const struct ceph_crypto_key *src); +int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end); +int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end); +int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in); + +/* crypto.c */ +int ceph_decrypt(struct ceph_crypto_key *secret, + void *dst, size_t *dst_len, + const void *src, size_t src_len); +int ceph_encrypt(struct ceph_crypto_key *secret, + void *dst, size_t *dst_len, + const void *src, size_t src_len); +int ceph_decrypt2(struct ceph_crypto_key *secret, + void *dst1, size_t *dst1_len, + void *dst2, size_t *dst2_len, + const void *src, size_t src_len); +int ceph_encrypt2(struct ceph_crypto_key *secret, + void *dst, size_t *dst_len, + const void *src1, size_t src1_len, + const void *src2, size_t src2_len); +int ceph_crypto_init(void); +void ceph_crypto_shutdown(void); + +/* armor.c */ +int ceph_armor(char *dst, const char *src, const char *end); +int ceph_unarmor(char *dst, const char *src, const char *end); + +#endif diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c new file mode 100644 index 000000000..593dc2eab --- /dev/null +++ b/net/ceph/debugfs.c @@ -0,0 +1,309 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef CONFIG_DEBUG_FS + +/* + * Implement /sys/kernel/debug/ceph fun + * + * /sys/kernel/debug/ceph/client* - an instance of the ceph client + * .../osdmap - current osdmap + * .../monmap - current monmap + * 
.../osdc - active osd requests + * .../monc - mon client state + * .../client_options - libceph-only (i.e. not rbd or cephfs) options + * .../dentry_lru - dump contents of dentry lru + * .../caps - expose cap (reservation) stats + * .../bdi - symlink to ../../bdi/something + */ + +static struct dentry *ceph_debugfs_dir; + +static int monmap_show(struct seq_file *s, void *p) +{ + int i; + struct ceph_client *client = s->private; + + if (client->monc.monmap == NULL) + return 0; + + seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); + for (i = 0; i < client->monc.monmap->num_mon; i++) { + struct ceph_entity_inst *inst = + &client->monc.monmap->mon_inst[i]; + + seq_printf(s, "\t%s%lld\t%s\n", + ENTITY_NAME(inst->name), + ceph_pr_addr(&inst->addr.in_addr)); + } + return 0; +} + +static int osdmap_show(struct seq_file *s, void *p) +{ + int i; + struct ceph_client *client = s->private; + struct ceph_osdmap *map = client->osdc.osdmap; + struct rb_node *n; + + if (map == NULL) + return 0; + + seq_printf(s, "epoch %d\n", map->epoch); + seq_printf(s, "flags%s%s\n", + (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "", + (map->flags & CEPH_OSDMAP_FULL) ? 
" FULL" : ""); + + for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { + struct ceph_pg_pool_info *pool = + rb_entry(n, struct ceph_pg_pool_info, node); + + seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n", + pool->id, pool->pg_num, pool->pg_num_mask, + pool->read_tier, pool->write_tier); + } + for (i = 0; i < map->max_osd; i++) { + struct ceph_entity_addr *addr = &map->osd_addr[i]; + int state = map->osd_state[i]; + char sb[64]; + + seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n", + i, ceph_pr_addr(&addr->in_addr), + ((map->osd_weight[i]*100) >> 16), + ceph_osdmap_state_str(sb, sizeof(sb), state), + ((ceph_get_primary_affinity(map, i)*100) >> 16)); + } + for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) { + struct ceph_pg_mapping *pg = + rb_entry(n, struct ceph_pg_mapping, node); + + seq_printf(s, "pg_temp %llu.%x [", pg->pgid.pool, + pg->pgid.seed); + for (i = 0; i < pg->pg_temp.len; i++) + seq_printf(s, "%s%d", (i == 0 ? "" : ","), + pg->pg_temp.osds[i]); + seq_printf(s, "]\n"); + } + for (n = rb_first(&map->primary_temp); n; n = rb_next(n)) { + struct ceph_pg_mapping *pg = + rb_entry(n, struct ceph_pg_mapping, node); + + seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool, + pg->pgid.seed, pg->primary_temp.osd); + } + + return 0; +} + +static int monc_show(struct seq_file *s, void *p) +{ + struct ceph_client *client = s->private; + struct ceph_mon_generic_request *req; + struct ceph_mon_client *monc = &client->monc; + struct rb_node *rp; + + mutex_lock(&monc->mutex); + + if (monc->have_mdsmap) + seq_printf(s, "have mdsmap %u\n", (unsigned int)monc->have_mdsmap); + if (monc->have_osdmap) + seq_printf(s, "have osdmap %u\n", (unsigned int)monc->have_osdmap); + if (monc->want_next_osdmap) + seq_printf(s, "want next osdmap\n"); + + for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { + __u16 op; + req = rb_entry(rp, struct ceph_mon_generic_request, node); + op = le16_to_cpu(req->request->hdr.type); + if (op == 
CEPH_MSG_STATFS) + seq_printf(s, "%llu statfs\n", req->tid); + else if (op == CEPH_MSG_MON_GET_VERSION) + seq_printf(s, "%llu mon_get_version", req->tid); + else + seq_printf(s, "%llu unknown\n", req->tid); + } + + mutex_unlock(&monc->mutex); + return 0; +} + +static int osdc_show(struct seq_file *s, void *pp) +{ + struct ceph_client *client = s->private; + struct ceph_osd_client *osdc = &client->osdc; + struct rb_node *p; + + mutex_lock(&osdc->request_mutex); + for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { + struct ceph_osd_request *req; + unsigned int i; + int opcode; + + req = rb_entry(p, struct ceph_osd_request, r_node); + + seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1, + req->r_pgid.pool, req->r_pgid.seed); + + seq_printf(s, "%.*s", req->r_base_oid.name_len, + req->r_base_oid.name); + + if (req->r_reassert_version.epoch) + seq_printf(s, "\t%u'%llu", + (unsigned int)le32_to_cpu(req->r_reassert_version.epoch), + le64_to_cpu(req->r_reassert_version.version)); + else + seq_printf(s, "\t"); + + for (i = 0; i < req->r_num_ops; i++) { + opcode = req->r_ops[i].op; + seq_printf(s, "%s%s", (i == 0 ? 
"\t" : ","), + ceph_osd_op_name(opcode)); + } + + seq_printf(s, "\n"); + } + mutex_unlock(&osdc->request_mutex); + return 0; +} + +static int client_options_show(struct seq_file *s, void *p) +{ + struct ceph_client *client = s->private; + int ret; + + ret = ceph_print_client_options(s, client); + if (ret) + return ret; + + seq_putc(s, '\n'); + return 0; +} + +CEPH_DEFINE_SHOW_FUNC(monmap_show) +CEPH_DEFINE_SHOW_FUNC(osdmap_show) +CEPH_DEFINE_SHOW_FUNC(monc_show) +CEPH_DEFINE_SHOW_FUNC(osdc_show) +CEPH_DEFINE_SHOW_FUNC(client_options_show) + +int ceph_debugfs_init(void) +{ + ceph_debugfs_dir = debugfs_create_dir("ceph", NULL); + if (!ceph_debugfs_dir) + return -ENOMEM; + return 0; +} + +void ceph_debugfs_cleanup(void) +{ + debugfs_remove(ceph_debugfs_dir); +} + +int ceph_debugfs_client_init(struct ceph_client *client) +{ + int ret = -ENOMEM; + char name[80]; + + snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid, + client->monc.auth->global_id); + + dout("ceph_debugfs_client_init %p %s\n", client, name); + + BUG_ON(client->debugfs_dir); + client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); + if (!client->debugfs_dir) + goto out; + + client->monc.debugfs_file = debugfs_create_file("monc", + 0600, + client->debugfs_dir, + client, + &monc_show_fops); + if (!client->monc.debugfs_file) + goto out; + + client->osdc.debugfs_file = debugfs_create_file("osdc", + 0600, + client->debugfs_dir, + client, + &osdc_show_fops); + if (!client->osdc.debugfs_file) + goto out; + + client->debugfs_monmap = debugfs_create_file("monmap", + 0600, + client->debugfs_dir, + client, + &monmap_show_fops); + if (!client->debugfs_monmap) + goto out; + + client->debugfs_osdmap = debugfs_create_file("osdmap", + 0600, + client->debugfs_dir, + client, + &osdmap_show_fops); + if (!client->debugfs_osdmap) + goto out; + + client->debugfs_options = debugfs_create_file("client_options", + 0600, + client->debugfs_dir, + client, + &client_options_show_fops); + if 
(!client->debugfs_options) + goto out; + + return 0; + +out: + ceph_debugfs_client_cleanup(client); + return ret; +} + +void ceph_debugfs_client_cleanup(struct ceph_client *client) +{ + dout("ceph_debugfs_client_cleanup %p\n", client); + debugfs_remove(client->debugfs_options); + debugfs_remove(client->debugfs_osdmap); + debugfs_remove(client->debugfs_monmap); + debugfs_remove(client->osdc.debugfs_file); + debugfs_remove(client->monc.debugfs_file); + debugfs_remove(client->debugfs_dir); +} + +#else /* CONFIG_DEBUG_FS */ + +int ceph_debugfs_init(void) +{ + return 0; +} + +void ceph_debugfs_cleanup(void) +{ +} + +int ceph_debugfs_client_init(struct ceph_client *client) +{ + return 0; +} + +void ceph_debugfs_client_cleanup(struct ceph_client *client) +{ +} + +#endif /* CONFIG_DEBUG_FS */ + +EXPORT_SYMBOL(ceph_debugfs_init); +EXPORT_SYMBOL(ceph_debugfs_cleanup); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c new file mode 100644 index 000000000..967080a9f --- /dev/null +++ b/net/ceph/messenger.c @@ -0,0 +1,3395 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_BLOCK +#include +#endif /* CONFIG_BLOCK */ +#include +#include + +#include +#include +#include +#include +#include +#include + +#define list_entry_next(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) + +/* + * Ceph uses the messenger to exchange ceph_msg messages with other + * hosts in the system. The messenger provides ordered and reliable + * delivery. We tolerate TCP disconnects by reconnecting (with + * exponential backoff) in the case of a fault (disconnection, bad + * crc, protocol error). Acks allow sent messages to be discarded by + * the sender. + */ + +/* + * We track the state of the socket on a given connection using + * values defined below. The transition to a new socket state is + * handled by a function which verifies we aren't coming from an + * unexpected state. 
+ * + * -------- + * | NEW* | transient initial state + * -------- + * | con_sock_state_init() + * v + * ---------- + * | CLOSED | initialized, but no socket (and no + * ---------- TCP connection) + * ^ \ + * | \ con_sock_state_connecting() + * | ---------------------- + * | \ + * + con_sock_state_closed() \ + * |+--------------------------- \ + * | \ \ \ + * | ----------- \ \ + * | | CLOSING | socket event; \ \ + * | ----------- await close \ \ + * | ^ \ | + * | | \ | + * | + con_sock_state_closing() \ | + * | / \ | | + * | / --------------- | | + * | / \ v v + * | / -------------- + * | / -----------------| CONNECTING | socket created, TCP + * | | / -------------- connect initiated + * | | | con_sock_state_connected() + * | | v + * ------------- + * | CONNECTED | TCP connection established + * ------------- + * + * State values for ceph_connection->sock_state; NEW is assumed to be 0. + */ + +#define CON_SOCK_STATE_NEW 0 /* -> CLOSED */ +#define CON_SOCK_STATE_CLOSED 1 /* -> CONNECTING */ +#define CON_SOCK_STATE_CONNECTING 2 /* -> CONNECTED or -> CLOSING */ +#define CON_SOCK_STATE_CONNECTED 3 /* -> CLOSING or -> CLOSED */ +#define CON_SOCK_STATE_CLOSING 4 /* -> CLOSED */ + +/* + * connection states + */ +#define CON_STATE_CLOSED 1 /* -> PREOPEN */ +#define CON_STATE_PREOPEN 2 /* -> CONNECTING, CLOSED */ +#define CON_STATE_CONNECTING 3 /* -> NEGOTIATING, CLOSED */ +#define CON_STATE_NEGOTIATING 4 /* -> OPEN, CLOSED */ +#define CON_STATE_OPEN 5 /* -> STANDBY, CLOSED */ +#define CON_STATE_STANDBY 6 /* -> PREOPEN, CLOSED */ + +/* + * ceph_connection flag bits + */ +#define CON_FLAG_LOSSYTX 0 /* we can close channel or drop + * messages on errors */ +#define CON_FLAG_KEEPALIVE_PENDING 1 /* we need to send a keepalive */ +#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */ +#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ +#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ + +static bool con_flag_valid(unsigned long 
con_flag) +{ + switch (con_flag) { + case CON_FLAG_LOSSYTX: + case CON_FLAG_KEEPALIVE_PENDING: + case CON_FLAG_WRITE_PENDING: + case CON_FLAG_SOCK_CLOSED: + case CON_FLAG_BACKOFF: + return true; + default: + return false; + } +} + +static void con_flag_clear(struct ceph_connection *con, unsigned long con_flag) +{ + BUG_ON(!con_flag_valid(con_flag)); + + clear_bit(con_flag, &con->flags); +} + +static void con_flag_set(struct ceph_connection *con, unsigned long con_flag) +{ + BUG_ON(!con_flag_valid(con_flag)); + + set_bit(con_flag, &con->flags); +} + +static bool con_flag_test(struct ceph_connection *con, unsigned long con_flag) +{ + BUG_ON(!con_flag_valid(con_flag)); + + return test_bit(con_flag, &con->flags); +} + +static bool con_flag_test_and_clear(struct ceph_connection *con, + unsigned long con_flag) +{ + BUG_ON(!con_flag_valid(con_flag)); + + return test_and_clear_bit(con_flag, &con->flags); +} + +static bool con_flag_test_and_set(struct ceph_connection *con, + unsigned long con_flag) +{ + BUG_ON(!con_flag_valid(con_flag)); + + return test_and_set_bit(con_flag, &con->flags); +} + +/* Slab caches for frequently-allocated structures */ + +static struct kmem_cache *ceph_msg_cache; +static struct kmem_cache *ceph_msg_data_cache; + +/* static tag bytes (protocol control messages) */ +static char tag_msg = CEPH_MSGR_TAG_MSG; +static char tag_ack = CEPH_MSGR_TAG_ACK; +static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; + +#ifdef CONFIG_LOCKDEP +static struct lock_class_key socket_class; +#endif + +/* + * When skipping (ignoring) a block of input we read it into a "skip + * buffer," which is this many bytes in size. + */ +#define SKIP_BUF_SIZE 1024 + +static void queue_con(struct ceph_connection *con); +static void cancel_con(struct ceph_connection *con); +static void con_work(struct work_struct *); +static void con_fault(struct ceph_connection *con); + +/* + * Nicely render a sockaddr as a string. An array of formatted + * strings is used, to approximate reentrancy. 
+ */ +#define ADDR_STR_COUNT_LOG 5 /* log2(# address strings in array) */ +#define ADDR_STR_COUNT (1 << ADDR_STR_COUNT_LOG) +#define ADDR_STR_COUNT_MASK (ADDR_STR_COUNT - 1) +#define MAX_ADDR_STR_LEN 64 /* 54 is enough */ + +static char addr_str[ADDR_STR_COUNT][MAX_ADDR_STR_LEN]; +static atomic_t addr_str_seq = ATOMIC_INIT(0); + +static struct page *zero_page; /* used in certain error cases */ + +const char *ceph_pr_addr(const struct sockaddr_storage *ss) +{ + int i; + char *s; + struct sockaddr_in *in4 = (struct sockaddr_in *) ss; + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss; + + i = atomic_inc_return(&addr_str_seq) & ADDR_STR_COUNT_MASK; + s = addr_str[i]; + + switch (ss->ss_family) { + case AF_INET: + snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%hu", &in4->sin_addr, + ntohs(in4->sin_port)); + break; + + case AF_INET6: + snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%hu", &in6->sin6_addr, + ntohs(in6->sin6_port)); + break; + + default: + snprintf(s, MAX_ADDR_STR_LEN, "(unknown sockaddr family %hu)", + ss->ss_family); + } + + return s; +} +EXPORT_SYMBOL(ceph_pr_addr); + +static void encode_my_addr(struct ceph_messenger *msgr) +{ + memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr)); + ceph_encode_addr(&msgr->my_enc_addr); +} + +/* + * work queue for all reading and writing to/from the socket. 
+ */ +static struct workqueue_struct *ceph_msgr_wq; + +static int ceph_msgr_slab_init(void) +{ + BUG_ON(ceph_msg_cache); + ceph_msg_cache = kmem_cache_create("ceph_msg", + sizeof (struct ceph_msg), + __alignof__(struct ceph_msg), 0, NULL); + + if (!ceph_msg_cache) + return -ENOMEM; + + BUG_ON(ceph_msg_data_cache); + ceph_msg_data_cache = kmem_cache_create("ceph_msg_data", + sizeof (struct ceph_msg_data), + __alignof__(struct ceph_msg_data), + 0, NULL); + if (ceph_msg_data_cache) + return 0; + + kmem_cache_destroy(ceph_msg_cache); + ceph_msg_cache = NULL; + + return -ENOMEM; +} + +static void ceph_msgr_slab_exit(void) +{ + BUG_ON(!ceph_msg_data_cache); + kmem_cache_destroy(ceph_msg_data_cache); + ceph_msg_data_cache = NULL; + + BUG_ON(!ceph_msg_cache); + kmem_cache_destroy(ceph_msg_cache); + ceph_msg_cache = NULL; +} + +static void _ceph_msgr_exit(void) +{ + if (ceph_msgr_wq) { + destroy_workqueue(ceph_msgr_wq); + ceph_msgr_wq = NULL; + } + + ceph_msgr_slab_exit(); + + BUG_ON(zero_page == NULL); + kunmap(zero_page); + page_cache_release(zero_page); + zero_page = NULL; +} + +int ceph_msgr_init(void) +{ + BUG_ON(zero_page != NULL); + zero_page = ZERO_PAGE(0); + page_cache_get(zero_page); + + if (ceph_msgr_slab_init()) + return -ENOMEM; + + /* + * The number of active work items is limited by the number of + * connections, so leave @max_active at default. 
+ */ + ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_MEM_RECLAIM, 0); + if (ceph_msgr_wq) + return 0; + + pr_err("msgr_init failed to create workqueue\n"); + _ceph_msgr_exit(); + + return -ENOMEM; +} +EXPORT_SYMBOL(ceph_msgr_init); + +void ceph_msgr_exit(void) +{ + BUG_ON(ceph_msgr_wq == NULL); + + _ceph_msgr_exit(); +} +EXPORT_SYMBOL(ceph_msgr_exit); + +void ceph_msgr_flush(void) +{ + flush_workqueue(ceph_msgr_wq); +} +EXPORT_SYMBOL(ceph_msgr_flush); + +/* Connection socket state transition functions */ + +static void con_sock_state_init(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); + if (WARN_ON(old_state != CON_SOCK_STATE_NEW)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CLOSED); +} + +static void con_sock_state_connecting(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING); + if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CONNECTING); +} + +static void con_sock_state_connected(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED); + if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CONNECTED); +} + +static void con_sock_state_closing(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING); + if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING && + old_state != CON_SOCK_STATE_CONNECTED && + old_state != CON_SOCK_STATE_CLOSING)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p 
sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CLOSING); +} + +static void con_sock_state_closed(struct ceph_connection *con) +{ + int old_state; + + old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED); + if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED && + old_state != CON_SOCK_STATE_CLOSING && + old_state != CON_SOCK_STATE_CONNECTING && + old_state != CON_SOCK_STATE_CLOSED)) + printk("%s: unexpected old state %d\n", __func__, old_state); + dout("%s con %p sock %d -> %d\n", __func__, con, old_state, + CON_SOCK_STATE_CLOSED); +} + +/* + * socket callback functions + */ + +/* data available on socket, or listen socket received a connect */ +static void ceph_sock_data_ready(struct sock *sk) +{ + struct ceph_connection *con = sk->sk_user_data; + if (atomic_read(&con->msgr->stopping)) { + return; + } + + if (sk->sk_state != TCP_CLOSE_WAIT) { + dout("%s on %p state = %lu, queueing work\n", __func__, + con, con->state); + queue_con(con); + } +} + +/* socket has buffer space for writing */ +static void ceph_sock_write_space(struct sock *sk) +{ + struct ceph_connection *con = sk->sk_user_data; + + /* only queue to workqueue if there is data we want to write, + * and there is sufficient space in the socket buffer to accept + * more data. clear SOCK_NOSPACE so that ceph_sock_write_space() + * doesn't get called again until try_write() fills the socket + * buffer. See net/ipv4/tcp_input.c:tcp_check_space() + * and net/core/stream.c:sk_stream_write_space(). 
+ */ + if (con_flag_test(con, CON_FLAG_WRITE_PENDING)) { + if (sk_stream_is_writeable(sk)) { + dout("%s %p queueing write work\n", __func__, con); + clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + queue_con(con); + } + } else { + dout("%s %p nothing to write\n", __func__, con); + } +} + +/* socket's state has changed */ +static void ceph_sock_state_change(struct sock *sk) +{ + struct ceph_connection *con = sk->sk_user_data; + + dout("%s %p state = %lu sk_state = %u\n", __func__, + con, con->state, sk->sk_state); + + switch (sk->sk_state) { + case TCP_CLOSE: + dout("%s TCP_CLOSE\n", __func__); + case TCP_CLOSE_WAIT: + dout("%s TCP_CLOSE_WAIT\n", __func__); + con_sock_state_closing(con); + con_flag_set(con, CON_FLAG_SOCK_CLOSED); + queue_con(con); + break; + case TCP_ESTABLISHED: + dout("%s TCP_ESTABLISHED\n", __func__); + con_sock_state_connected(con); + queue_con(con); + break; + default: /* Everything else is uninteresting */ + break; + } +} + +/* + * set up socket callbacks + */ +static void set_sock_callbacks(struct socket *sock, + struct ceph_connection *con) +{ + struct sock *sk = sock->sk; + sk->sk_user_data = con; + sk->sk_data_ready = ceph_sock_data_ready; + sk->sk_write_space = ceph_sock_write_space; + sk->sk_state_change = ceph_sock_state_change; +} + + +/* + * socket helpers + */ + +/* + * initiate connection to a remote socket. 
+ */ +static int ceph_tcp_connect(struct ceph_connection *con) +{ + struct sockaddr_storage *paddr = &con->peer_addr.in_addr; + struct socket *sock; + int ret; + + BUG_ON(con->sock); + ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM, + IPPROTO_TCP, &sock); + if (ret) + return ret; + sock->sk->sk_allocation = GFP_NOFS; + +#ifdef CONFIG_LOCKDEP + lockdep_set_class(&sock->sk->sk_lock, &socket_class); +#endif + + set_sock_callbacks(sock, con); + + dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr)); + + con_sock_state_connecting(con); + ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), + O_NONBLOCK); + if (ret == -EINPROGRESS) { + dout("connect %s EINPROGRESS sk_state = %u\n", + ceph_pr_addr(&con->peer_addr.in_addr), + sock->sk->sk_state); + } else if (ret < 0) { + pr_err("connect %s error %d\n", + ceph_pr_addr(&con->peer_addr.in_addr), ret); + sock_release(sock); + return ret; + } + + if (con->msgr->tcp_nodelay) { + int optval = 1; + + ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char *)&optval, sizeof(optval)); + if (ret) + pr_err("kernel_setsockopt(TCP_NODELAY) failed: %d", + ret); + } + + con->sock = sock; + return 0; +} + +static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len) +{ + struct kvec iov = {buf, len}; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + r = kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags); + if (r == -EAGAIN) + r = 0; + return r; +} + +static int ceph_tcp_recvpage(struct socket *sock, struct page *page, + int page_offset, size_t length) +{ + void *kaddr; + int ret; + + BUG_ON(page_offset + length > PAGE_SIZE); + + kaddr = kmap(page); + BUG_ON(!kaddr); + ret = ceph_tcp_recvmsg(sock, kaddr + page_offset, length); + kunmap(page); + + return ret; +} + +/* + * write something. @more is true if caller will be sending more data + * shortly. 
+ */ +static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov, + size_t kvlen, size_t len, int more) +{ + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL }; + int r; + + if (more) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */ + + r = kernel_sendmsg(sock, &msg, iov, kvlen, len); + if (r == -EAGAIN) + r = 0; + return r; +} + +static int __ceph_tcp_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, bool more) +{ + int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR); + int ret; + + ret = kernel_sendpage(sock, page, offset, size, flags); + if (ret == -EAGAIN) + ret = 0; + + return ret; +} + +static int ceph_tcp_sendpage(struct socket *sock, struct page *page, + int offset, size_t size, bool more) +{ + int ret; + struct kvec iov; + + /* sendpage cannot properly handle pages with page_count == 0, + * we need to fallback to sendmsg if that's the case */ + if (page_count(page) >= 1) + return __ceph_tcp_sendpage(sock, page, offset, size, more); + + iov.iov_base = kmap(page) + offset; + iov.iov_len = size; + ret = ceph_tcp_sendmsg(sock, &iov, 1, size, more); + kunmap(page); + + return ret; +} + +/* + * Shutdown/close the socket for the given connection. + */ +static int con_close_socket(struct ceph_connection *con) +{ + int rc = 0; + + dout("con_close_socket on %p sock %p\n", con, con->sock); + if (con->sock) { + rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR); + sock_release(con->sock); + con->sock = NULL; + } + + /* + * Forcibly clear the SOCK_CLOSED flag. It gets set + * independent of the connection mutex, and we could have + * received a socket close event before we had the chance to + * shut the socket down. + */ + con_flag_clear(con, CON_FLAG_SOCK_CLOSED); + + con_sock_state_closed(con); + return rc; +} + +/* + * Reset a connection. Discard all incoming and outgoing messages + * and clear *_seq state. 
+ */ +static void ceph_msg_remove(struct ceph_msg *msg) +{ + list_del_init(&msg->list_head); + BUG_ON(msg->con == NULL); + msg->con->ops->put(msg->con); + msg->con = NULL; + + ceph_msg_put(msg); +} +static void ceph_msg_remove_list(struct list_head *head) +{ + while (!list_empty(head)) { + struct ceph_msg *msg = list_first_entry(head, struct ceph_msg, + list_head); + ceph_msg_remove(msg); + } +} + +static void reset_connection(struct ceph_connection *con) +{ + /* reset connection, out_queue, msg_ and connect_seq */ + /* discard existing out_queue and msg_seq */ + dout("reset_connection %p\n", con); + ceph_msg_remove_list(&con->out_queue); + ceph_msg_remove_list(&con->out_sent); + + if (con->in_msg) { + BUG_ON(con->in_msg->con != con); + con->in_msg->con = NULL; + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + con->ops->put(con); + } + + con->connect_seq = 0; + con->out_seq = 0; + if (con->out_msg) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; + } + con->in_seq = 0; + con->in_seq_acked = 0; +} + +/* + * mark a peer down. drop any open connections. + */ +void ceph_con_close(struct ceph_connection *con) +{ + mutex_lock(&con->mutex); + dout("con_close %p peer %s\n", con, + ceph_pr_addr(&con->peer_addr.in_addr)); + con->state = CON_STATE_CLOSED; + + con_flag_clear(con, CON_FLAG_LOSSYTX); /* so we retry next connect */ + con_flag_clear(con, CON_FLAG_KEEPALIVE_PENDING); + con_flag_clear(con, CON_FLAG_WRITE_PENDING); + con_flag_clear(con, CON_FLAG_BACKOFF); + + reset_connection(con); + con->peer_global_seq = 0; + cancel_con(con); + con_close_socket(con); + mutex_unlock(&con->mutex); +} +EXPORT_SYMBOL(ceph_con_close); + +/* + * Reopen a closed connection, with a new peer address. 
+ */ +void ceph_con_open(struct ceph_connection *con, + __u8 entity_type, __u64 entity_num, + struct ceph_entity_addr *addr) +{ + mutex_lock(&con->mutex); + dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr)); + + WARN_ON(con->state != CON_STATE_CLOSED); + con->state = CON_STATE_PREOPEN; + + con->peer_name.type = (__u8) entity_type; + con->peer_name.num = cpu_to_le64(entity_num); + + memcpy(&con->peer_addr, addr, sizeof(*addr)); + con->delay = 0; /* reset backoff memory */ + mutex_unlock(&con->mutex); + queue_con(con); +} +EXPORT_SYMBOL(ceph_con_open); + +/* + * return true if this connection ever successfully opened + */ +bool ceph_con_opened(struct ceph_connection *con) +{ + return con->connect_seq > 0; +} + +/* + * initialize a new connection. + */ +void ceph_con_init(struct ceph_connection *con, void *private, + const struct ceph_connection_operations *ops, + struct ceph_messenger *msgr) +{ + dout("con_init %p\n", con); + memset(con, 0, sizeof(*con)); + con->private = private; + con->ops = ops; + con->msgr = msgr; + + con_sock_state_init(con); + + mutex_init(&con->mutex); + INIT_LIST_HEAD(&con->out_queue); + INIT_LIST_HEAD(&con->out_sent); + INIT_DELAYED_WORK(&con->work, con_work); + + con->state = CON_STATE_CLOSED; +} +EXPORT_SYMBOL(ceph_con_init); + + +/* + * We maintain a global counter to order connection attempts. Get + * a unique seq greater than @gt. 
+ */ +static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) +{ + u32 ret; + + spin_lock(&msgr->global_seq_lock); + if (msgr->global_seq < gt) + msgr->global_seq = gt; + ret = ++msgr->global_seq; + spin_unlock(&msgr->global_seq_lock); + return ret; +} + +static void con_out_kvec_reset(struct ceph_connection *con) +{ + con->out_kvec_left = 0; + con->out_kvec_bytes = 0; + con->out_kvec_cur = &con->out_kvec[0]; +} + +static void con_out_kvec_add(struct ceph_connection *con, + size_t size, void *data) +{ + int index; + + index = con->out_kvec_left; + BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); + + con->out_kvec[index].iov_len = size; + con->out_kvec[index].iov_base = data; + con->out_kvec_left++; + con->out_kvec_bytes += size; +} + +#ifdef CONFIG_BLOCK + +/* + * For a bio data item, a piece is whatever remains of the next + * entry in the current bio iovec, or the first entry in the next + * bio in the list. + */ +static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, + size_t length) +{ + struct ceph_msg_data *data = cursor->data; + struct bio *bio; + + BUG_ON(data->type != CEPH_MSG_DATA_BIO); + + bio = data->bio; + BUG_ON(!bio); + + cursor->resid = min(length, data->bio_length); + cursor->bio = bio; + cursor->bvec_iter = bio->bi_iter; + cursor->last_piece = + cursor->resid <= bio_iter_len(bio, cursor->bvec_iter); +} + +static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, + size_t *length) +{ + struct ceph_msg_data *data = cursor->data; + struct bio *bio; + struct bio_vec bio_vec; + + BUG_ON(data->type != CEPH_MSG_DATA_BIO); + + bio = cursor->bio; + BUG_ON(!bio); + + bio_vec = bio_iter_iovec(bio, cursor->bvec_iter); + + *page_offset = (size_t) bio_vec.bv_offset; + BUG_ON(*page_offset >= PAGE_SIZE); + if (cursor->last_piece) /* pagelist offset is always 0 */ + *length = cursor->resid; + else + *length = (size_t) bio_vec.bv_len; + BUG_ON(*length > cursor->resid); + BUG_ON(*page_offset + 
*length > PAGE_SIZE); + + return bio_vec.bv_page; +} + +static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + struct bio *bio; + struct bio_vec bio_vec; + + BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO); + + bio = cursor->bio; + BUG_ON(!bio); + + bio_vec = bio_iter_iovec(bio, cursor->bvec_iter); + + /* Advance the cursor offset */ + + BUG_ON(cursor->resid < bytes); + cursor->resid -= bytes; + + bio_advance_iter(bio, &cursor->bvec_iter, bytes); + + if (bytes < bio_vec.bv_len) + return false; /* more bytes to process in this segment */ + + /* Move on to the next segment, and possibly the next bio */ + + if (!cursor->bvec_iter.bi_size) { + bio = bio->bi_next; + cursor->bio = bio; + if (bio) + cursor->bvec_iter = bio->bi_iter; + else + memset(&cursor->bvec_iter, 0, + sizeof(cursor->bvec_iter)); + } + + if (!cursor->last_piece) { + BUG_ON(!cursor->resid); + BUG_ON(!bio); + /* A short read is OK, so use <= rather than == */ + if (cursor->resid <= bio_iter_len(bio, cursor->bvec_iter)) + cursor->last_piece = true; + } + + return true; +} +#endif /* CONFIG_BLOCK */ + +/* + * For a page array, a piece comes from the first page in the array + * that has not already been fully consumed. 
+ */ +static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor, + size_t length) +{ + struct ceph_msg_data *data = cursor->data; + int page_count; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGES); + + BUG_ON(!data->pages); + BUG_ON(!data->length); + + cursor->resid = min(length, data->length); + page_count = calc_pages_for(data->alignment, (u64)data->length); + cursor->page_offset = data->alignment & ~PAGE_MASK; + cursor->page_index = 0; + BUG_ON(page_count > (int)USHRT_MAX); + cursor->page_count = (unsigned short)page_count; + BUG_ON(length > SIZE_MAX - cursor->page_offset); + cursor->last_piece = cursor->page_offset + cursor->resid <= PAGE_SIZE; +} + +static struct page * +ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length) +{ + struct ceph_msg_data *data = cursor->data; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGES); + + BUG_ON(cursor->page_index >= cursor->page_count); + BUG_ON(cursor->page_offset >= PAGE_SIZE); + + *page_offset = cursor->page_offset; + if (cursor->last_piece) + *length = cursor->resid; + else + *length = PAGE_SIZE - *page_offset; + + return data->pages[cursor->page_index]; +} + +static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES); + + BUG_ON(cursor->page_offset + bytes > PAGE_SIZE); + + /* Advance the cursor page offset */ + + cursor->resid -= bytes; + cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK; + if (!bytes || cursor->page_offset) + return false; /* more bytes to process in the current page */ + + if (!cursor->resid) + return false; /* no more data */ + + /* Move on to the next page; offset is already at 0 */ + + BUG_ON(cursor->page_index >= cursor->page_count); + cursor->page_index++; + cursor->last_piece = cursor->resid <= PAGE_SIZE; + + return true; +} + +/* + * For a pagelist, a piece is whatever remains to be consumed in the + * first page in the list, or 
the front of the next page. + */ +static void +ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor, + size_t length) +{ + struct ceph_msg_data *data = cursor->data; + struct ceph_pagelist *pagelist; + struct page *page; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); + + pagelist = data->pagelist; + BUG_ON(!pagelist); + + if (!length) + return; /* pagelist can be assigned but empty */ + + BUG_ON(list_empty(&pagelist->head)); + page = list_first_entry(&pagelist->head, struct page, lru); + + cursor->resid = min(length, pagelist->length); + cursor->page = page; + cursor->offset = 0; + cursor->last_piece = cursor->resid <= PAGE_SIZE; +} + +static struct page * +ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length) +{ + struct ceph_msg_data *data = cursor->data; + struct ceph_pagelist *pagelist; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); + + pagelist = data->pagelist; + BUG_ON(!pagelist); + + BUG_ON(!cursor->page); + BUG_ON(cursor->offset + cursor->resid != pagelist->length); + + /* offset of first page in pagelist is always 0 */ + *page_offset = cursor->offset & ~PAGE_MASK; + if (cursor->last_piece) + *length = cursor->resid; + else + *length = PAGE_SIZE - *page_offset; + + return cursor->page; +} + +static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + struct ceph_msg_data *data = cursor->data; + struct ceph_pagelist *pagelist; + + BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST); + + pagelist = data->pagelist; + BUG_ON(!pagelist); + + BUG_ON(cursor->offset + cursor->resid != pagelist->length); + BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE); + + /* Advance the cursor offset */ + + cursor->resid -= bytes; + cursor->offset += bytes; + /* offset of first page in pagelist is always 0 */ + if (!bytes || cursor->offset & ~PAGE_MASK) + return false; /* more bytes to process in the current page */ + + if (!cursor->resid) + return false; /* no 
more data */ + + /* Move on to the next page */ + + BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); + cursor->page = list_entry_next(cursor->page, lru); + cursor->last_piece = cursor->resid <= PAGE_SIZE; + + return true; +} + +/* + * Message data is handled (sent or received) in pieces, where each + * piece resides on a single page. The network layer might not + * consume an entire piece at once. A data item's cursor keeps + * track of which piece is next to process and how much remains to + * be processed in that piece. It also tracks whether the current + * piece is the last one in the data item. + */ +static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) +{ + size_t length = cursor->total_resid; + + switch (cursor->data->type) { + case CEPH_MSG_DATA_PAGELIST: + ceph_msg_data_pagelist_cursor_init(cursor, length); + break; + case CEPH_MSG_DATA_PAGES: + ceph_msg_data_pages_cursor_init(cursor, length); + break; +#ifdef CONFIG_BLOCK + case CEPH_MSG_DATA_BIO: + ceph_msg_data_bio_cursor_init(cursor, length); + break; +#endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_NONE: + default: + /* BUG(); */ + break; + } + cursor->need_crc = true; +} + +static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) +{ + struct ceph_msg_data_cursor *cursor = &msg->cursor; + struct ceph_msg_data *data; + + BUG_ON(!length); + BUG_ON(length > msg->data_length); + BUG_ON(list_empty(&msg->data)); + + cursor->data_head = &msg->data; + cursor->total_resid = length; + data = list_first_entry(&msg->data, struct ceph_msg_data, links); + cursor->data = data; + + __ceph_msg_data_cursor_init(cursor); +} + +/* + * Return the page containing the next piece to process for a given + * data item, and supply the page offset and length of that piece. + * Indicate whether this is the last piece in this data item. 
+ */ +static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, + size_t *page_offset, size_t *length, + bool *last_piece) +{ + struct page *page; + + switch (cursor->data->type) { + case CEPH_MSG_DATA_PAGELIST: + page = ceph_msg_data_pagelist_next(cursor, page_offset, length); + break; + case CEPH_MSG_DATA_PAGES: + page = ceph_msg_data_pages_next(cursor, page_offset, length); + break; +#ifdef CONFIG_BLOCK + case CEPH_MSG_DATA_BIO: + page = ceph_msg_data_bio_next(cursor, page_offset, length); + break; +#endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_NONE: + default: + page = NULL; + break; + } + BUG_ON(!page); + BUG_ON(*page_offset + *length > PAGE_SIZE); + BUG_ON(!*length); + if (last_piece) + *last_piece = cursor->last_piece; + + return page; +} + +/* + * Returns true if the result moves the cursor on to the next piece + * of the data item. + */ +static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, + size_t bytes) +{ + bool new_piece; + + BUG_ON(bytes > cursor->resid); + switch (cursor->data->type) { + case CEPH_MSG_DATA_PAGELIST: + new_piece = ceph_msg_data_pagelist_advance(cursor, bytes); + break; + case CEPH_MSG_DATA_PAGES: + new_piece = ceph_msg_data_pages_advance(cursor, bytes); + break; +#ifdef CONFIG_BLOCK + case CEPH_MSG_DATA_BIO: + new_piece = ceph_msg_data_bio_advance(cursor, bytes); + break; +#endif /* CONFIG_BLOCK */ + case CEPH_MSG_DATA_NONE: + default: + BUG(); + break; + } + cursor->total_resid -= bytes; + + if (!cursor->resid && cursor->total_resid) { + WARN_ON(!cursor->last_piece); + BUG_ON(list_is_last(&cursor->data->links, cursor->data_head)); + cursor->data = list_entry_next(cursor->data, links); + __ceph_msg_data_cursor_init(cursor); + new_piece = true; + } + cursor->need_crc = new_piece; + + return new_piece; +} + +static void prepare_message_data(struct ceph_msg *msg, u32 data_len) +{ + BUG_ON(!msg); + BUG_ON(!data_len); + + /* Initialize data cursor */ + + ceph_msg_data_cursor_init(msg, (size_t)data_len); 
+} + +/* + * Prepare footer for currently outgoing message, and finish things + * off. Assumes out_kvec* are already valid.. we just add on to the end. + */ +static void prepare_write_message_footer(struct ceph_connection *con) +{ + struct ceph_msg *m = con->out_msg; + int v = con->out_kvec_left; + + m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; + + dout("prepare_write_message_footer %p\n", con); + con->out_kvec_is_msg = true; + con->out_kvec[v].iov_base = &m->footer; + if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { + if (con->ops->sign_message) + con->ops->sign_message(con, m); + else + m->footer.sig = 0; + con->out_kvec[v].iov_len = sizeof(m->footer); + con->out_kvec_bytes += sizeof(m->footer); + } else { + m->old_footer.flags = m->footer.flags; + con->out_kvec[v].iov_len = sizeof(m->old_footer); + con->out_kvec_bytes += sizeof(m->old_footer); + } + con->out_kvec_left++; + con->out_more = m->more_to_follow; + con->out_msg_done = true; +} + +/* + * Prepare headers for the next outgoing message. + */ +static void prepare_write_message(struct ceph_connection *con) +{ + struct ceph_msg *m; + u32 crc; + + con_out_kvec_reset(con); + con->out_kvec_is_msg = true; + con->out_msg_done = false; + + /* Sneak an ack in there first? If we can get it into the same + * TCP packet that's a good thing. */ + if (con->in_seq > con->in_seq_acked) { + con->in_seq_acked = con->in_seq; + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof (con->out_temp_ack), + &con->out_temp_ack); + } + + BUG_ON(list_empty(&con->out_queue)); + m = list_first_entry(&con->out_queue, struct ceph_msg, list_head); + con->out_msg = m; + BUG_ON(m->con != con); + + /* put message on sent list */ + ceph_msg_get(m); + list_move_tail(&m->list_head, &con->out_sent); + + /* + * only assign outgoing seq # if we haven't sent this message + * yet. if it is requeued, resend with it's original seq. 
+ */ + if (m->needs_out_seq) { + m->hdr.seq = cpu_to_le64(++con->out_seq); + m->needs_out_seq = false; + } + WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); + + dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", + m, con->out_seq, le16_to_cpu(m->hdr.type), + le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), + m->data_length); + BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); + + /* tag + hdr + front + middle */ + con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); + con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); + con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); + + if (m->middle) + con_out_kvec_add(con, m->middle->vec.iov_len, + m->middle->vec.iov_base); + + /* fill in crc (except data pages), footer */ + crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); + con->out_msg->hdr.crc = cpu_to_le32(crc); + con->out_msg->footer.flags = 0; + + crc = crc32c(0, m->front.iov_base, m->front.iov_len); + con->out_msg->footer.front_crc = cpu_to_le32(crc); + if (m->middle) { + crc = crc32c(0, m->middle->vec.iov_base, + m->middle->vec.iov_len); + con->out_msg->footer.middle_crc = cpu_to_le32(crc); + } else + con->out_msg->footer.middle_crc = 0; + dout("%s front_crc %u middle_crc %u\n", __func__, + le32_to_cpu(con->out_msg->footer.front_crc), + le32_to_cpu(con->out_msg->footer.middle_crc)); + + /* is there a data payload? */ + con->out_msg->footer.data_crc = 0; + if (m->data_length) { + prepare_message_data(con->out_msg, m->data_length); + con->out_more = 1; /* data + footer will follow */ + } else { + /* no, queue up footer too and be done */ + prepare_write_message_footer(con); + } + + con_flag_set(con, CON_FLAG_WRITE_PENDING); +} + +/* + * Prepare an ack. 
+ */ +static void prepare_write_ack(struct ceph_connection *con) +{ + dout("prepare_write_ack %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con_out_kvec_reset(con); + + con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); + + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof (con->out_temp_ack), + &con->out_temp_ack); + + con->out_more = 1; /* more will follow.. eventually.. */ + con_flag_set(con, CON_FLAG_WRITE_PENDING); +} + +/* + * Prepare to share the seq during handshake + */ +static void prepare_write_seq(struct ceph_connection *con) +{ + dout("prepare_write_seq %p %llu -> %llu\n", con, + con->in_seq_acked, con->in_seq); + con->in_seq_acked = con->in_seq; + + con_out_kvec_reset(con); + + con->out_temp_ack = cpu_to_le64(con->in_seq_acked); + con_out_kvec_add(con, sizeof (con->out_temp_ack), + &con->out_temp_ack); + + con_flag_set(con, CON_FLAG_WRITE_PENDING); +} + +/* + * Prepare to write keepalive byte. + */ +static void prepare_write_keepalive(struct ceph_connection *con) +{ + dout("prepare_write_keepalive %p\n", con); + con_out_kvec_reset(con); + con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); + con_flag_set(con, CON_FLAG_WRITE_PENDING); +} + +/* + * Connection negotiation. 
+ */ + +static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection *con, + int *auth_proto) +{ + struct ceph_auth_handshake *auth; + + if (!con->ops->get_authorizer) { + con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; + con->out_connect.authorizer_len = 0; + return NULL; + } + + /* Can't hold the mutex while getting authorizer */ + mutex_unlock(&con->mutex); + auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry); + mutex_lock(&con->mutex); + + if (IS_ERR(auth)) + return auth; + if (con->state != CON_STATE_NEGOTIATING) + return ERR_PTR(-EAGAIN); + + con->auth_reply_buf = auth->authorizer_reply_buf; + con->auth_reply_buf_len = auth->authorizer_reply_buf_len; + return auth; +} + +/* + * We connected to a peer and are saying hello. + */ +static void prepare_write_banner(struct ceph_connection *con) +{ + con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); + con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), + &con->msgr->my_enc_addr); + + con->out_more = 0; + con_flag_set(con, CON_FLAG_WRITE_PENDING); +} + +static int prepare_write_connect(struct ceph_connection *con) +{ + unsigned int global_seq = get_global_seq(con->msgr, 0); + int proto; + int auth_proto; + struct ceph_auth_handshake *auth; + + switch (con->peer_name.type) { + case CEPH_ENTITY_TYPE_MON: + proto = CEPH_MONC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_OSD: + proto = CEPH_OSDC_PROTOCOL; + break; + case CEPH_ENTITY_TYPE_MDS: + proto = CEPH_MDSC_PROTOCOL; + break; + default: + BUG(); + } + + dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, + con->connect_seq, global_seq, proto); + + con->out_connect.features = cpu_to_le64(con->msgr->supported_features); + con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); + con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); + con->out_connect.global_seq = cpu_to_le32(global_seq); + con->out_connect.protocol_version = cpu_to_le32(proto); + con->out_connect.flags = 0; + + 
auth_proto = CEPH_AUTH_UNKNOWN; + auth = get_connect_authorizer(con, &auth_proto); + if (IS_ERR(auth)) + return PTR_ERR(auth); + + con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); + con->out_connect.authorizer_len = auth ? + cpu_to_le32(auth->authorizer_buf_len) : 0; + + con_out_kvec_add(con, sizeof (con->out_connect), + &con->out_connect); + if (auth && auth->authorizer_buf_len) + con_out_kvec_add(con, auth->authorizer_buf_len, + auth->authorizer_buf); + + con->out_more = 0; + con_flag_set(con, CON_FLAG_WRITE_PENDING); + + return 0; +} + +/* + * write as much of pending kvecs to the socket as we can. + * 1 -> done + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_kvec(struct ceph_connection *con) +{ + int ret; + + dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes); + while (con->out_kvec_bytes > 0) { + ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur, + con->out_kvec_left, con->out_kvec_bytes, + con->out_more); + if (ret <= 0) + goto out; + con->out_kvec_bytes -= ret; + if (con->out_kvec_bytes == 0) + break; /* done */ + + /* account for full iov entries consumed */ + while (ret >= con->out_kvec_cur->iov_len) { + BUG_ON(!con->out_kvec_left); + ret -= con->out_kvec_cur->iov_len; + con->out_kvec_cur++; + con->out_kvec_left--; + } + /* and for a partially-consumed entry */ + if (ret) { + con->out_kvec_cur->iov_len -= ret; + con->out_kvec_cur->iov_base += ret; + } + } + con->out_kvec_left = 0; + con->out_kvec_is_msg = false; + ret = 1; +out: + dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, + con->out_kvec_bytes, con->out_kvec_left, ret); + return ret; /* done! */ +} + +static u32 ceph_crc32c_page(u32 crc, struct page *page, + unsigned int page_offset, + unsigned int length) +{ + char *kaddr; + + kaddr = kmap(page); + BUG_ON(kaddr == NULL); + crc = crc32c(crc, kaddr + page_offset, length); + kunmap(page); + + return crc; +} +/* + * Write as much message data payload as we can. 
If we finish, queue + * up the footer. + * 1 -> done, footer is now queued in out_kvec[]. + * 0 -> socket full, but more to do + * <0 -> error + */ +static int write_partial_message_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->out_msg; + struct ceph_msg_data_cursor *cursor = &msg->cursor; + bool do_datacrc = !con->msgr->nocrc; + u32 crc; + + dout("%s %p msg %p\n", __func__, con, msg); + + if (list_empty(&msg->data)) + return -EINVAL; + + /* + * Iterate through each page that contains data to be + * written, and send as much as possible for each. + * + * If we are calculating the data crc (the default), we will + * need to map the page. If we have no pages, they have + * been revoked, so use the zero page. + */ + crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0; + while (cursor->resid) { + struct page *page; + size_t page_offset; + size_t length; + bool last_piece; + bool need_crc; + int ret; + + page = ceph_msg_data_next(&msg->cursor, &page_offset, &length, + &last_piece); + ret = ceph_tcp_sendpage(con->sock, page, page_offset, + length, last_piece); + if (ret <= 0) { + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + + return ret; + } + if (do_datacrc && cursor->need_crc) + crc = ceph_crc32c_page(crc, page, page_offset, length); + need_crc = ceph_msg_data_advance(&msg->cursor, (size_t)ret); + } + + dout("%s %p msg %p done\n", __func__, con, msg); + + /* prepare and queue up footer, too */ + if (do_datacrc) + msg->footer.data_crc = cpu_to_le32(crc); + else + msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; + con_out_kvec_reset(con); + prepare_write_message_footer(con); + + return 1; /* must return > 0 to indicate success */ +} + +/* + * write some zeros + */ +static int write_partial_skip(struct ceph_connection *con) +{ + int ret; + + while (con->out_skip > 0) { + size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); + + ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true); + if (ret <= 0) + goto out; + con->out_skip 
-= ret; + } + ret = 1; +out: + return ret; +} + +/* + * Prepare to read connection handshake, or an ack. + */ +static void prepare_read_banner(struct ceph_connection *con) +{ + dout("prepare_read_banner %p\n", con); + con->in_base_pos = 0; +} + +static void prepare_read_connect(struct ceph_connection *con) +{ + dout("prepare_read_connect %p\n", con); + con->in_base_pos = 0; +} + +static void prepare_read_ack(struct ceph_connection *con) +{ + dout("prepare_read_ack %p\n", con); + con->in_base_pos = 0; +} + +static void prepare_read_seq(struct ceph_connection *con) +{ + dout("prepare_read_seq %p\n", con); + con->in_base_pos = 0; + con->in_tag = CEPH_MSGR_TAG_SEQ; +} + +static void prepare_read_tag(struct ceph_connection *con) +{ + dout("prepare_read_tag %p\n", con); + con->in_base_pos = 0; + con->in_tag = CEPH_MSGR_TAG_READY; +} + +/* + * Prepare to read a message. + */ +static int prepare_read_message(struct ceph_connection *con) +{ + dout("prepare_read_message %p\n", con); + BUG_ON(con->in_msg != NULL); + con->in_base_pos = 0; + con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0; + return 0; +} + + +static int read_partial(struct ceph_connection *con, + int end, int size, void *object) +{ + while (con->in_base_pos < end) { + int left = end - con->in_base_pos; + int have = size - left; + int ret = ceph_tcp_recvmsg(con->sock, object + have, left); + if (ret <= 0) + return ret; + con->in_base_pos += ret; + } + return 1; +} + + +/* + * Read all or part of the connect-side handshake on a new connection + */ +static int read_partial_banner(struct ceph_connection *con) +{ + int size; + int end; + int ret; + + dout("read_partial_banner %p at %d\n", con, con->in_base_pos); + + /* peer's banner */ + size = strlen(CEPH_BANNER); + end = size; + ret = read_partial(con, end, size, con->in_banner); + if (ret <= 0) + goto out; + + size = sizeof (con->actual_peer_addr); + end += size; + ret = read_partial(con, end, size, &con->actual_peer_addr); + if (ret <= 0) + goto 
out; + + size = sizeof (con->peer_addr_for_me); + end += size; + ret = read_partial(con, end, size, &con->peer_addr_for_me); + if (ret <= 0) + goto out; + +out: + return ret; +} + +static int read_partial_connect(struct ceph_connection *con) +{ + int size; + int end; + int ret; + + dout("read_partial_connect %p at %d\n", con, con->in_base_pos); + + size = sizeof (con->in_reply); + end = size; + ret = read_partial(con, end, size, &con->in_reply); + if (ret <= 0) + goto out; + + size = le32_to_cpu(con->in_reply.authorizer_len); + end += size; + ret = read_partial(con, end, size, con->auth_reply_buf); + if (ret <= 0) + goto out; + + dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", + con, (int)con->in_reply.tag, + le32_to_cpu(con->in_reply.connect_seq), + le32_to_cpu(con->in_reply.global_seq)); +out: + return ret; + +} + +/* + * Verify the hello banner looks okay. + */ +static int verify_hello(struct ceph_connection *con) +{ + if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { + pr_err("connect to %s got bad banner\n", + ceph_pr_addr(&con->peer_addr.in_addr)); + con->error_msg = "protocol error, bad banner"; + return -1; + } + return 0; +} + +static bool addr_is_blank(struct sockaddr_storage *ss) +{ + switch (ss->ss_family) { + case AF_INET: + return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0; + case AF_INET6: + return + ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 && + ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 && + ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 && + ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0; + } + return false; +} + +static int addr_port(struct sockaddr_storage *ss) +{ + switch (ss->ss_family) { + case AF_INET: + return ntohs(((struct sockaddr_in *)ss)->sin_port); + case AF_INET6: + return ntohs(((struct sockaddr_in6 *)ss)->sin6_port); + } + return 0; +} + +static void addr_set_port(struct sockaddr_storage *ss, int p) +{ + switch (ss->ss_family) { + case AF_INET: 
+ ((struct sockaddr_in *)ss)->sin_port = htons(p); + break; + case AF_INET6: + ((struct sockaddr_in6 *)ss)->sin6_port = htons(p); + break; + } +} + +/* + * Unlike other *_pton function semantics, zero indicates success. + */ +static int ceph_pton(const char *str, size_t len, struct sockaddr_storage *ss, + char delim, const char **ipend) +{ + struct sockaddr_in *in4 = (struct sockaddr_in *) ss; + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) ss; + + memset(ss, 0, sizeof(*ss)); + + if (in4_pton(str, len, (u8 *)&in4->sin_addr.s_addr, delim, ipend)) { + ss->ss_family = AF_INET; + return 0; + } + + if (in6_pton(str, len, (u8 *)&in6->sin6_addr.s6_addr, delim, ipend)) { + ss->ss_family = AF_INET6; + return 0; + } + + return -EINVAL; +} + +/* + * Extract hostname string and resolve using kernel DNS facility. + */ +#ifdef CONFIG_CEPH_LIB_USE_DNS_RESOLVER +static int ceph_dns_resolve_name(const char *name, size_t namelen, + struct sockaddr_storage *ss, char delim, const char **ipend) +{ + const char *end, *delim_p; + char *colon_p, *ip_addr = NULL; + int ip_len, ret; + + /* + * The end of the hostname occurs immediately preceding the delimiter or + * the port marker (':') where the delimiter takes precedence. + */ + delim_p = memchr(name, delim, namelen); + colon_p = memchr(name, ':', namelen); + + if (delim_p && colon_p) + end = delim_p < colon_p ? delim_p : colon_p; + else if (!delim_p && colon_p) + end = colon_p; + else { + end = delim_p; + if (!end) /* case: hostname:/ */ + end = name + namelen; + } + + if (end <= name) + return -EINVAL; + + /* do dns_resolve upcall */ + ip_len = dns_query(NULL, name, end - name, NULL, &ip_addr, NULL); + if (ip_len > 0) + ret = ceph_pton(ip_addr, ip_len, ss, -1, NULL); + else + ret = -ESRCH; + + kfree(ip_addr); + + *ipend = end; + + pr_info("resolve '%.*s' (ret=%d): %s\n", (int)(end - name), name, + ret, ret ? 
"failed" : ceph_pr_addr(ss)); + + return ret; +} +#else +static inline int ceph_dns_resolve_name(const char *name, size_t namelen, + struct sockaddr_storage *ss, char delim, const char **ipend) +{ + return -EINVAL; +} +#endif + +/* + * Parse a server name (IP or hostname). If a valid IP address is not found + * then try to extract a hostname to resolve using userspace DNS upcall. + */ +static int ceph_parse_server_name(const char *name, size_t namelen, + struct sockaddr_storage *ss, char delim, const char **ipend) +{ + int ret; + + ret = ceph_pton(name, namelen, ss, delim, ipend); + if (ret) + ret = ceph_dns_resolve_name(name, namelen, ss, delim, ipend); + + return ret; +} + +/* + * Parse an ip[:port] list into an addr array. Use the default + * monitor port if a port isn't specified. + */ +int ceph_parse_ips(const char *c, const char *end, + struct ceph_entity_addr *addr, + int max_count, int *count) +{ + int i, ret = -EINVAL; + const char *p = c; + + dout("parse_ips on '%.*s'\n", (int)(end-c), c); + for (i = 0; i < max_count; i++) { + const char *ipend; + struct sockaddr_storage *ss = &addr[i].in_addr; + int port; + char delim = ','; + + if (*p == '[') { + delim = ']'; + p++; + } + + ret = ceph_parse_server_name(p, end - p, ss, delim, &ipend); + if (ret) + goto bad; + ret = -EINVAL; + + p = ipend; + + if (delim == ']') { + if (*p != ']') { + dout("missing matching ']'\n"); + goto bad; + } + p++; + } + + /* port? 
*/ + if (p < end && *p == ':') { + port = 0; + p++; + while (p < end && *p >= '0' && *p <= '9') { + port = (port * 10) + (*p - '0'); + p++; + } + if (port == 0) + port = CEPH_MON_PORT; + else if (port > 65535) + goto bad; + } else { + port = CEPH_MON_PORT; + } + + addr_set_port(ss, port); + + dout("parse_ips got %s\n", ceph_pr_addr(ss)); + + if (p == end) + break; + if (*p != ',') + goto bad; + p++; + } + + if (p != end) + goto bad; + + if (count) + *count = i + 1; + return 0; + +bad: + pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); + return ret; +} +EXPORT_SYMBOL(ceph_parse_ips); + +static int process_banner(struct ceph_connection *con) +{ + dout("process_banner on %p\n", con); + + if (verify_hello(con) < 0) + return -1; + + ceph_decode_addr(&con->actual_peer_addr); + ceph_decode_addr(&con->peer_addr_for_me); + + /* + * Make sure the other end is who we wanted. note that the other + * end may not yet know their ip address, so if it's 0.0.0.0, give + * them the benefit of the doubt. + */ + if (memcmp(&con->peer_addr, &con->actual_peer_addr, + sizeof(con->peer_addr)) != 0 && + !(addr_is_blank(&con->actual_peer_addr.in_addr) && + con->actual_peer_addr.nonce == con->peer_addr.nonce)) { + pr_warn("wrong peer, want %s/%d, got %s/%d\n", + ceph_pr_addr(&con->peer_addr.in_addr), + (int)le32_to_cpu(con->peer_addr.nonce), + ceph_pr_addr(&con->actual_peer_addr.in_addr), + (int)le32_to_cpu(con->actual_peer_addr.nonce)); + con->error_msg = "wrong peer at address"; + return -1; + } + + /* + * did we learn our address? 
+ */ + if (addr_is_blank(&con->msgr->inst.addr.in_addr)) { + int port = addr_port(&con->msgr->inst.addr.in_addr); + + memcpy(&con->msgr->inst.addr.in_addr, + &con->peer_addr_for_me.in_addr, + sizeof(con->peer_addr_for_me.in_addr)); + addr_set_port(&con->msgr->inst.addr.in_addr, port); + encode_my_addr(con->msgr); + dout("process_banner learned my addr is %s\n", + ceph_pr_addr(&con->msgr->inst.addr.in_addr)); + } + + return 0; +} + +static int process_connect(struct ceph_connection *con) +{ + u64 sup_feat = con->msgr->supported_features; + u64 req_feat = con->msgr->required_features; + u64 server_feat = ceph_sanitize_features( + le64_to_cpu(con->in_reply.features)); + int ret; + + dout("process_connect on %p tag %d\n", con, (int)con->in_tag); + + switch (con->in_reply.tag) { + case CEPH_MSGR_TAG_FEATURES: + pr_err("%s%lld %s feature set mismatch," + " my %llx < server's %llx, missing %llx\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr.in_addr), + sup_feat, server_feat, server_feat & ~sup_feat); + con->error_msg = "missing required protocol features"; + reset_connection(con); + return -1; + + case CEPH_MSGR_TAG_BADPROTOVER: + pr_err("%s%lld %s protocol version mismatch," + " my %d != server's %d\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr.in_addr), + le32_to_cpu(con->out_connect.protocol_version), + le32_to_cpu(con->in_reply.protocol_version)); + con->error_msg = "protocol version mismatch"; + reset_connection(con); + return -1; + + case CEPH_MSGR_TAG_BADAUTHORIZER: + con->auth_retry++; + dout("process_connect %p got BADAUTHORIZER attempt %d\n", con, + con->auth_retry); + if (con->auth_retry == 2) { + con->error_msg = "connect authorization failure"; + return -1; + } + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_RESETSESSION: + /* + * If we connected with a large connect_seq but the peer + * has no record of a session 
with us (no connection, or + * connect_seq == 0), they will send RESETSESION to indicate + * that they must have reset their session, and may have + * dropped messages. + */ + dout("process_connect got RESET peer seq %u\n", + le32_to_cpu(con->in_reply.connect_seq)); + pr_err("%s%lld %s connection reset\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr.in_addr)); + reset_connection(con); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + + /* Tell ceph about it. */ + mutex_unlock(&con->mutex); + pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name)); + if (con->ops->peer_reset) + con->ops->peer_reset(con); + mutex_lock(&con->mutex); + if (con->state != CON_STATE_NEGOTIATING) + return -EAGAIN; + break; + + case CEPH_MSGR_TAG_RETRY_SESSION: + /* + * If we sent a smaller connect_seq than the peer has, try + * again with a larger value. + */ + dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", + le32_to_cpu(con->out_connect.connect_seq), + le32_to_cpu(con->in_reply.connect_seq)); + con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_RETRY_GLOBAL: + /* + * If we sent a smaller global_seq than the peer has, try + * again with a larger value. 
+ */ + dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", + con->peer_global_seq, + le32_to_cpu(con->in_reply.global_seq)); + get_global_seq(con->msgr, + le32_to_cpu(con->in_reply.global_seq)); + con_out_kvec_reset(con); + ret = prepare_write_connect(con); + if (ret < 0) + return ret; + prepare_read_connect(con); + break; + + case CEPH_MSGR_TAG_SEQ: + case CEPH_MSGR_TAG_READY: + if (req_feat & ~server_feat) { + pr_err("%s%lld %s protocol feature mismatch," + " my required %llx > server's %llx, need %llx\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr.in_addr), + req_feat, server_feat, req_feat & ~server_feat); + con->error_msg = "missing required protocol features"; + reset_connection(con); + return -1; + } + + WARN_ON(con->state != CON_STATE_NEGOTIATING); + con->state = CON_STATE_OPEN; + con->auth_retry = 0; /* we authenticated; clear flag */ + con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); + con->connect_seq++; + con->peer_features = server_feat; + dout("process_connect got READY gseq %d cseq %d (%d)\n", + con->peer_global_seq, + le32_to_cpu(con->in_reply.connect_seq), + con->connect_seq); + WARN_ON(con->connect_seq != + le32_to_cpu(con->in_reply.connect_seq)); + + if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) + con_flag_set(con, CON_FLAG_LOSSYTX); + + con->delay = 0; /* reset backoff memory */ + + if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) { + prepare_write_seq(con); + prepare_read_seq(con); + } else { + prepare_read_tag(con); + } + break; + + case CEPH_MSGR_TAG_WAIT: + /* + * If there is a connection race (we are opening + * connections to each other), one of us may just have + * to WAIT. This shouldn't happen if we are the + * client. 
+ */ + con->error_msg = "protocol error, got WAIT as client"; + return -1; + + default: + con->error_msg = "protocol error, garbage tag during connect"; + return -1; + } + return 0; +} + + +/* + * read (part of) an ack + */ +static int read_partial_ack(struct ceph_connection *con) +{ + int size = sizeof (con->in_temp_ack); + int end = size; + + return read_partial(con, end, size, &con->in_temp_ack); +} + +/* + * We can finally discard anything that's been acked. + */ +static void process_ack(struct ceph_connection *con) +{ + struct ceph_msg *m; + u64 ack = le64_to_cpu(con->in_temp_ack); + u64 seq; + + while (!list_empty(&con->out_sent)) { + m = list_first_entry(&con->out_sent, struct ceph_msg, + list_head); + seq = le64_to_cpu(m->hdr.seq); + if (seq > ack) + break; + dout("got ack for seq %llu type %d at %p\n", seq, + le16_to_cpu(m->hdr.type), m); + m->ack_stamp = jiffies; + ceph_msg_remove(m); + } + prepare_read_tag(con); +} + + +static int read_partial_message_section(struct ceph_connection *con, + struct kvec *section, + unsigned int sec_len, u32 *crc) +{ + int ret, left; + + BUG_ON(!section); + + while (section->iov_len < sec_len) { + BUG_ON(section->iov_base == NULL); + left = sec_len - section->iov_len; + ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base + + section->iov_len, left); + if (ret <= 0) + return ret; + section->iov_len += ret; + } + if (section->iov_len == sec_len) + *crc = crc32c(0, section->iov_base, section->iov_len); + + return 1; +} + +static int read_partial_msg_data(struct ceph_connection *con) +{ + struct ceph_msg *msg = con->in_msg; + struct ceph_msg_data_cursor *cursor = &msg->cursor; + const bool do_datacrc = !con->msgr->nocrc; + struct page *page; + size_t page_offset; + size_t length; + u32 crc = 0; + int ret; + + BUG_ON(!msg); + if (list_empty(&msg->data)) + return -EIO; + + if (do_datacrc) + crc = con->in_data_crc; + while (cursor->resid) { + page = ceph_msg_data_next(&msg->cursor, &page_offset, &length, + NULL); + ret = 
ceph_tcp_recvpage(con->sock, page, page_offset, length); + if (ret <= 0) { + if (do_datacrc) + con->in_data_crc = crc; + + return ret; + } + + if (do_datacrc) + crc = ceph_crc32c_page(crc, page, page_offset, ret); + (void) ceph_msg_data_advance(&msg->cursor, (size_t)ret); + } + if (do_datacrc) + con->in_data_crc = crc; + + return 1; /* must return > 0 to indicate success */ +} + +/* + * read (part of) a message. + */ +static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); + +static int read_partial_message(struct ceph_connection *con) +{ + struct ceph_msg *m = con->in_msg; + int size; + int end; + int ret; + unsigned int front_len, middle_len, data_len; + bool do_datacrc = !con->msgr->nocrc; + bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); + u64 seq; + u32 crc; + + dout("read_partial_message con %p msg %p\n", con, m); + + /* header */ + size = sizeof (con->in_hdr); + end = size; + ret = read_partial(con, end, size, &con->in_hdr); + if (ret <= 0) + return ret; + + crc = crc32c(0, &con->in_hdr, offsetof(struct ceph_msg_header, crc)); + if (cpu_to_le32(crc) != con->in_hdr.crc) { + pr_err("read_partial_message bad hdr crc %u != expected %u\n", + crc, con->in_hdr.crc); + return -EBADMSG; + } + + front_len = le32_to_cpu(con->in_hdr.front_len); + if (front_len > CEPH_MSG_MAX_FRONT_LEN) + return -EIO; + middle_len = le32_to_cpu(con->in_hdr.middle_len); + if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN) + return -EIO; + data_len = le32_to_cpu(con->in_hdr.data_len); + if (data_len > CEPH_MSG_MAX_DATA_LEN) + return -EIO; + + /* verify seq# */ + seq = le64_to_cpu(con->in_hdr.seq); + if ((s64)seq - (s64)con->in_seq < 1) { + pr_info("skipping %s%lld %s seq %lld expected %lld\n", + ENTITY_NAME(con->peer_name), + ceph_pr_addr(&con->peer_addr.in_addr), + seq, con->in_seq + 1); + con->in_base_pos = -front_len - middle_len - data_len - + sizeof(m->footer); + con->in_tag = CEPH_MSGR_TAG_READY; + return 0; + } else if ((s64)seq - (s64)con->in_seq > 1) { + 
pr_err("read_partial_message bad seq %lld expected %lld\n", + seq, con->in_seq + 1); + con->error_msg = "bad message sequence # for incoming message"; + return -EBADE; + } + + /* allocate message? */ + if (!con->in_msg) { + int skip = 0; + + dout("got hdr type %d front %d data %d\n", con->in_hdr.type, + front_len, data_len); + ret = ceph_con_in_msg_alloc(con, &skip); + if (ret < 0) + return ret; + + BUG_ON(!con->in_msg ^ skip); + if (con->in_msg && data_len > con->in_msg->data_length) { + pr_warn("%s skipping long message (%u > %zd)\n", + __func__, data_len, con->in_msg->data_length); + ceph_msg_put(con->in_msg); + con->in_msg = NULL; + skip = 1; + } + if (skip) { + /* skip this message */ + dout("alloc_msg said skip message\n"); + con->in_base_pos = -front_len - middle_len - data_len - + sizeof(m->footer); + con->in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + return 0; + } + + BUG_ON(!con->in_msg); + BUG_ON(con->in_msg->con != con); + m = con->in_msg; + m->front.iov_len = 0; /* haven't read it yet */ + if (m->middle) + m->middle->vec.iov_len = 0; + + /* prepare for data payload, if any */ + + if (data_len) + prepare_message_data(con->in_msg, data_len); + } + + /* front */ + ret = read_partial_message_section(con, &m->front, front_len, + &con->in_front_crc); + if (ret <= 0) + return ret; + + /* middle */ + if (m->middle) { + ret = read_partial_message_section(con, &m->middle->vec, + middle_len, + &con->in_middle_crc); + if (ret <= 0) + return ret; + } + + /* (page) data */ + if (data_len) { + ret = read_partial_msg_data(con); + if (ret <= 0) + return ret; + } + + /* footer */ + if (need_sign) + size = sizeof(m->footer); + else + size = sizeof(m->old_footer); + + end += size; + ret = read_partial(con, end, size, &m->footer); + if (ret <= 0) + return ret; + + if (!need_sign) { + m->footer.flags = m->old_footer.flags; + m->footer.sig = 0; + } + + dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", + m, front_len, m->footer.front_crc, middle_len, + 
m->footer.middle_crc, data_len, m->footer.data_crc); + + /* crc ok? */ + if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) { + pr_err("read_partial_message %p front crc %u != exp. %u\n", + m, con->in_front_crc, m->footer.front_crc); + return -EBADMSG; + } + if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) { + pr_err("read_partial_message %p middle crc %u != exp %u\n", + m, con->in_middle_crc, m->footer.middle_crc); + return -EBADMSG; + } + if (do_datacrc && + (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 && + con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { + pr_err("read_partial_message %p data crc %u != exp. %u\n", m, + con->in_data_crc, le32_to_cpu(m->footer.data_crc)); + return -EBADMSG; + } + + if (need_sign && con->ops->check_message_signature && + con->ops->check_message_signature(con, m)) { + pr_err("read_partial_message %p signature check failed\n", m); + return -EBADMSG; + } + + return 1; /* done! */ +} + +/* + * Process message. This happens in the worker thread. The callback should + * be careful not to do anything that waits on other incoming messages or it + * may deadlock. + */ +static void process_message(struct ceph_connection *con) +{ + struct ceph_msg *msg; + + BUG_ON(con->in_msg->con != con); + con->in_msg->con = NULL; + msg = con->in_msg; + con->in_msg = NULL; + con->ops->put(con); + + /* if first message, set peer_name */ + if (con->peer_name.type == 0) + con->peer_name = msg->hdr.src; + + con->in_seq++; + mutex_unlock(&con->mutex); + + dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n", + msg, le64_to_cpu(msg->hdr.seq), + ENTITY_NAME(msg->hdr.src), + le16_to_cpu(msg->hdr.type), + ceph_msg_type_name(le16_to_cpu(msg->hdr.type)), + le32_to_cpu(msg->hdr.front_len), + le32_to_cpu(msg->hdr.data_len), + con->in_front_crc, con->in_middle_crc, con->in_data_crc); + con->ops->dispatch(con, msg); + + mutex_lock(&con->mutex); +} + + +/* + * Write something to the socket. 
Called in a worker thread when the + * socket appears to be writeable and we have something ready to send. + */ +static int try_write(struct ceph_connection *con) +{ + int ret = 1; + + dout("try_write start %p state %lu\n", con, con->state); + +more: + dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); + + /* open the socket first? */ + if (con->state == CON_STATE_PREOPEN) { + BUG_ON(con->sock); + con->state = CON_STATE_CONNECTING; + + con_out_kvec_reset(con); + prepare_write_banner(con); + prepare_read_banner(con); + + BUG_ON(con->in_msg); + con->in_tag = CEPH_MSGR_TAG_READY; + dout("try_write initiating connect on %p new state %lu\n", + con, con->state); + ret = ceph_tcp_connect(con); + if (ret < 0) { + con->error_msg = "connect error"; + goto out; + } + } + +more_kvec: + /* kvec data queued? */ + if (con->out_skip) { + ret = write_partial_skip(con); + if (ret <= 0) + goto out; + } + if (con->out_kvec_left) { + ret = write_partial_kvec(con); + if (ret <= 0) + goto out; + } + + /* msg pages? */ + if (con->out_msg) { + if (con->out_msg_done) { + ceph_msg_put(con->out_msg); + con->out_msg = NULL; /* we're done with this one */ + goto do_next; + } + + ret = write_partial_message_data(con); + if (ret == 1) + goto more_kvec; /* we need to send the footer, too! */ + if (ret == 0) + goto out; + if (ret < 0) { + dout("try_write write_partial_message_data err %d\n", + ret); + goto out; + } + } + +do_next: + if (con->state == CON_STATE_OPEN) { + /* is anything else pending? */ + if (!list_empty(&con->out_queue)) { + prepare_write_message(con); + goto more; + } + if (con->in_seq > con->in_seq_acked) { + prepare_write_ack(con); + goto more; + } + if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) { + prepare_write_keepalive(con); + goto more; + } + } + + /* Nothing to do! 
*/ + con_flag_clear(con, CON_FLAG_WRITE_PENDING); + dout("try_write nothing else to write.\n"); + ret = 0; +out: + dout("try_write done on %p ret %d\n", con, ret); + return ret; +} + + + +/* + * Read what we can from the socket. + */ +static int try_read(struct ceph_connection *con) +{ + int ret = -1; + +more: + dout("try_read start on %p state %lu\n", con, con->state); + if (con->state != CON_STATE_CONNECTING && + con->state != CON_STATE_NEGOTIATING && + con->state != CON_STATE_OPEN) + return 0; + + BUG_ON(!con->sock); + + dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, + con->in_base_pos); + + if (con->state == CON_STATE_CONNECTING) { + dout("try_read connecting\n"); + ret = read_partial_banner(con); + if (ret <= 0) + goto out; + ret = process_banner(con); + if (ret < 0) + goto out; + + con->state = CON_STATE_NEGOTIATING; + + /* + * Received banner is good, exchange connection info. + * Do not reset out_kvec, as sending our banner raced + * with receiving peer banner after connect completed. + */ + ret = prepare_write_connect(con); + if (ret < 0) + goto out; + prepare_read_connect(con); + + /* Send connection info before awaiting response */ + goto out; + } + + if (con->state == CON_STATE_NEGOTIATING) { + dout("try_read negotiating\n"); + ret = read_partial_connect(con); + if (ret <= 0) + goto out; + ret = process_connect(con); + if (ret < 0) + goto out; + goto more; + } + + WARN_ON(con->state != CON_STATE_OPEN); + + if (con->in_base_pos < 0) { + /* + * skipping + discarding content. + * + * FIXME: there must be a better way to do this! + */ + static char buf[SKIP_BUF_SIZE]; + int skip = min((int) sizeof (buf), -con->in_base_pos); + + dout("skipping %d / %d bytes\n", skip, -con->in_base_pos); + ret = ceph_tcp_recvmsg(con->sock, buf, skip); + if (ret <= 0) + goto out; + con->in_base_pos += ret; + if (con->in_base_pos) + goto more; + } + if (con->in_tag == CEPH_MSGR_TAG_READY) { + /* + * what's next? 
+ */ + ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1); + if (ret <= 0) + goto out; + dout("try_read got tag %d\n", (int)con->in_tag); + switch (con->in_tag) { + case CEPH_MSGR_TAG_MSG: + prepare_read_message(con); + break; + case CEPH_MSGR_TAG_ACK: + prepare_read_ack(con); + break; + case CEPH_MSGR_TAG_CLOSE: + con_close_socket(con); + con->state = CON_STATE_CLOSED; + goto out; + default: + goto bad_tag; + } + } + if (con->in_tag == CEPH_MSGR_TAG_MSG) { + ret = read_partial_message(con); + if (ret <= 0) { + switch (ret) { + case -EBADMSG: + con->error_msg = "bad crc"; + /* fall through */ + case -EBADE: + ret = -EIO; + break; + case -EIO: + con->error_msg = "io error"; + break; + } + goto out; + } + if (con->in_tag == CEPH_MSGR_TAG_READY) + goto more; + process_message(con); + if (con->state == CON_STATE_OPEN) + prepare_read_tag(con); + goto more; + } + if (con->in_tag == CEPH_MSGR_TAG_ACK || + con->in_tag == CEPH_MSGR_TAG_SEQ) { + /* + * the final handshake seq exchange is semantically + * equivalent to an ACK + */ + ret = read_partial_ack(con); + if (ret <= 0) + goto out; + process_ack(con); + goto more; + } + +out: + dout("try_read done on %p ret %d\n", con, ret); + return ret; + +bad_tag: + pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag); + con->error_msg = "protocol error, garbage tag"; + ret = -1; + goto out; +} + + +/* + * Atomically queue work on a connection after the specified delay. + * Bump @con reference to avoid races with connection teardown. + * Returns 0 if work was queued, or an error code otherwise. 
+ */
+static int queue_con_delay(struct ceph_connection *con, unsigned long delay)
+{
+	/* take a reference on behalf of the work item before queueing it */
+	if (!con->ops->get(con)) {
+		dout("%s %p ref count 0\n", __func__, con);
+		return -ENOENT;
+	}
+
+	/* work was already pending: give back the reference we just took */
+	if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) {
+		dout("%s %p - already queued\n", __func__, con);
+		con->ops->put(con);
+		return -EBUSY;
+	}
+
+	dout("%s %p %lu\n", __func__, con, delay);
+	return 0;
+}
+
+/*
+ * Queue work on a connection with no delay.  The result of
+ * queue_con_delay() is deliberately ignored.
+ */
+static void queue_con(struct ceph_connection *con)
+{
+	(void) queue_con_delay(con, 0);
+}
+
+/*
+ * Cancel pending work on a connection.  A reference is dropped only
+ * when cancel_delayed_work() reports that work was actually cancelled,
+ * mirroring the reference queue_con_delay() takes when it queues work.
+ */
+static void cancel_con(struct ceph_connection *con)
+{
+	if (cancel_delayed_work(&con->work)) {
+		dout("%s %p\n", __func__, con);
+		con->ops->put(con);
+	}
+}
+
+/*
+ * Test and clear CON_FLAG_SOCK_CLOSED.  Returns false if the flag was
+ * not set.  Otherwise records an error message naming the current
+ * connection state in con->error_msg and returns true.  An unknown
+ * state is treated as fatal (BUG()).
+ */
+static bool con_sock_closed(struct ceph_connection *con)
+{
+	if (!con_flag_test_and_clear(con, CON_FLAG_SOCK_CLOSED))
+		return false;
+
+/* expands to one switch case per state, stringifying the state name */
+#define CASE(x)								\
+	case CON_STATE_ ## x:						\
+		con->error_msg = "socket closed (con state " #x ")";	\
+		break;
+
+	switch (con->state) {
+	CASE(CLOSED);
+	CASE(PREOPEN);
+	CASE(CONNECTING);
+	CASE(NEGOTIATING);
+	CASE(OPEN);
+	CASE(STANDBY);
+	default:
+		pr_warn("%s con %p unrecognized state %lu\n",
+			__func__, con, con->state);
+		con->error_msg = "unrecognized con state";
+		BUG();
+		break;
+	}
+#undef CASE
+
+	return true;
+}
+
+/*
+ * Test and clear CON_FLAG_BACKOFF.  If it was set, requeue the work
+ * after con->delay (rounded by round_jiffies_relative()) and return
+ * true; the caller stops processing for now.  If requeueing fails the
+ * flag is set again; true is still returned in that case.
+ */
+static bool con_backoff(struct ceph_connection *con)
+{
+	int ret;
+
+	if (!con_flag_test_and_clear(con, CON_FLAG_BACKOFF))
+		return false;
+
+	ret = queue_con_delay(con, round_jiffies_relative(con->delay));
+	if (ret) {
+		dout("%s: con %p FAILED to back off %lu\n", __func__,
+			con, con->delay);
+		/* -ENOENT means the get() failed; treated as a fatal bug here */
+		BUG_ON(ret == -ENOENT);
+		con_flag_set(con, CON_FLAG_BACKOFF);
+	}
+
+	return true;
+}
+
+/* Finish fault handling; con->mutex must *not* be held here */
+
+static void con_fault_finish(struct ceph_connection *con)
+{
+	/*
+	 * in case we faulted due to authentication, invalidate our
+	 * current tickets so that we can get new ones.
+	 */
+	if (con->auth_retry && con->ops->invalidate_authorizer) {
+		dout("calling invalidate_authorizer()\n");
+		con->ops->invalidate_authorizer(con);
+	}
+
+	/* the fault callback is optional */
+	if (con->ops->fault)
+		con->ops->fault(con);
+}
+
+/*
+ * Do some work on a connection.  Drop a connection ref when we're done.
+ *
+ * Runs with con->mutex held around the read/write attempts and
+ * con_fault(); con_fault_finish() and the final put happen after the
+ * mutex is released.
+ */
+static void con_work(struct work_struct *work)
+{
+	struct ceph_connection *con = container_of(work, struct ceph_connection,
+						   work.work);
+	bool fault;
+
+	mutex_lock(&con->mutex);
+	while (true) {
+		int ret;
+
+		/* fault is assigned here on every pass before any break */
+		if ((fault = con_sock_closed(con))) {
+			dout("%s: con %p SOCK_CLOSED\n", __func__, con);
+			break;
+		}
+		if (con_backoff(con)) {
+			dout("%s: con %p BACKOFF\n", __func__, con);
+			break;
+		}
+		if (con->state == CON_STATE_STANDBY) {
+			dout("%s: con %p STANDBY\n", __func__, con);
+			break;
+		}
+		if (con->state == CON_STATE_CLOSED) {
+			dout("%s: con %p CLOSED\n", __func__, con);
+			BUG_ON(con->sock);
+			break;
+		}
+		if (con->state == CON_STATE_PREOPEN) {
+			dout("%s: con %p PREOPEN\n", __func__, con);
+			BUG_ON(con->sock);
+		}
+
+		ret = try_read(con);
+		if (ret < 0) {
+			/* -EAGAIN: re-run the state checks from the top */
+			if (ret == -EAGAIN)
+				continue;
+			if (!con->error_msg)
+				con->error_msg = "socket error on read";
+			fault = true;
+			break;
+		}
+
+		ret = try_write(con);
+		if (ret < 0) {
+			if (ret == -EAGAIN)
+				continue;
+			if (!con->error_msg)
+				con->error_msg = "socket error on write";
+			fault = true;
+		}
+
+		break;	/* If we make it to here, we're done */
+	}
+	if (fault)
+		con_fault(con);
+	mutex_unlock(&con->mutex);
+
+	if (fault)
+		con_fault_finish(con);
+
+	con->ops->put(con);
+}
+
+/*
+ * Generic error/fault handler.  A retry mechanism is used with
+ * exponential backoff
+ */
+static void con_fault(struct ceph_connection *con)
+{
+	dout("fault %p state %lu to peer %s\n",
+	     con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
+
+	pr_warn("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+		ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
+	con->error_msg = NULL;
+
+	WARN_ON(con->state != CON_STATE_CONNECTING &&
+	       con->state != CON_STATE_NEGOTIATING &&
+	       con->state != CON_STATE_OPEN);
+
+	con_close_socket(con);
+
+	/* lossy connections are not retried: mark CLOSED and stop here */
+	if (con_flag_test(con, CON_FLAG_LOSSYTX)) {
+		dout("fault on LOSSYTX channel, marking CLOSED\n");
+		con->state = CON_STATE_CLOSED;
+		return;
+	}
+
+	/* drop a partially received message and the con ref it held */
+	if (con->in_msg) {
+		BUG_ON(con->in_msg->con != con);
+		con->in_msg->con = NULL;
+		ceph_msg_put(con->in_msg);
+		con->in_msg = NULL;
+		con->ops->put(con);
+	}
+
+	/* Requeue anything that hasn't been acked */
+	list_splice_init(&con->out_sent, &con->out_queue);
+
+	/* If there are no messages queued or keepalive pending, place
+	 * the connection in a STANDBY state */
+	if (list_empty(&con->out_queue) &&
+	    !con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING)) {
+		dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
+		con_flag_clear(con, CON_FLAG_WRITE_PENDING);
+		con->state = CON_STATE_STANDBY;
+	} else {
+		/* retry after a delay. */
+		con->state = CON_STATE_PREOPEN;
+		/* start at the base interval, then double up to the maximum */
+		if (con->delay == 0)
+			con->delay = BASE_DELAY_INTERVAL;
+		else if (con->delay < MAX_DELAY_INTERVAL)
+			con->delay *= 2;
+		con_flag_set(con, CON_FLAG_BACKOFF);
+		queue_con(con);
+	}
+}
+
+
+
+/*
+ * initialize a new messenger instance
+ *
+ * Stores the feature masks and the nocrc/tcp_nodelay options, copies
+ * @myaddr when one is given, and then always resets the address type
+ * to 0 and picks a random nonce.
+ */
+void ceph_messenger_init(struct ceph_messenger *msgr,
+			 struct ceph_entity_addr *myaddr,
+			 u64 supported_features,
+			 u64 required_features,
+			 bool nocrc,
+			 bool tcp_nodelay)
+{
+	msgr->supported_features = supported_features;
+	msgr->required_features = required_features;
+
+	spin_lock_init(&msgr->global_seq_lock);
+
+	if (myaddr)
+		msgr->inst.addr = *myaddr;
+
+	/* select a random nonce */
+	msgr->inst.addr.type = 0;
+	get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
+	encode_my_addr(msgr);
+	msgr->nocrc = nocrc;
+	msgr->tcp_nodelay = tcp_nodelay;
+
+	atomic_set(&msgr->stopping, 0);
+
+	dout("%s %p\n", __func__, msgr);
+}
+EXPORT_SYMBOL(ceph_messenger_init);
+
+/*
+ * If the connection is in STANDBY, move it to PREOPEN and bump
+ * connect_seq.  Neither WRITE_PENDING nor KEEPALIVE_PENDING is
+ * expected to be set at that point (WARN_ON otherwise).
+ */
+static void clear_standby(struct ceph_connection *con)
+{
+	/* come back from STANDBY? */
+	if (con->state == CON_STATE_STANDBY) {
+		dout("clear_standby %p and ++connect_seq\n", con);
+		con->state = CON_STATE_PREOPEN;
+		con->connect_seq++;
+		WARN_ON(con_flag_test(con, CON_FLAG_WRITE_PENDING));
+		WARN_ON(con_flag_test(con, CON_FLAG_KEEPALIVE_PENDING));
+	}
+}
+
+/*
+ * Queue up an outgoing message on the given connection.
+ */
+void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	/* set src+dst */
+	msg->hdr.src = con->msgr->inst.name;
+	BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
+	msg->needs_out_seq = true;
+
+	mutex_lock(&con->mutex);
+
+	/* a closed connection drops the message: the caller's ref is put */
+	if (con->state == CON_STATE_CLOSED) {
+		dout("con_send %p closed, dropping %p\n", con, msg);
+		ceph_msg_put(msg);
+		mutex_unlock(&con->mutex);
+		return;
+	}
+
+	/* the queued message holds a reference on the connection */
+	BUG_ON(msg->con != NULL);
+	msg->con = con->ops->get(con);
+	BUG_ON(msg->con == NULL);
+
+	BUG_ON(!list_empty(&msg->list_head));
+	list_add_tail(&msg->list_head, &con->out_queue);
+	dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
+	     ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
+	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+	     le32_to_cpu(msg->hdr.front_len),
+	     le32_to_cpu(msg->hdr.middle_len),
+	     le32_to_cpu(msg->hdr.data_len));
+
+	clear_standby(con);
+	mutex_unlock(&con->mutex);
+
+	/* if there wasn't anything waiting to send before, queue
+	 * new work */
+	if (con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0)
+		queue_con(con);
+}
+EXPORT_SYMBOL(ceph_con_send);
+
+/*
+ * Revoke a message that was previously queued for send
+ *
+ * A no-op when msg->con is NULL.  Otherwise, under con->mutex, the
+ * message is unlinked from whatever list it is on and/or detached from
+ * con->out_msg; each of those two cases drops one message reference
+ * and clears hdr.seq.
+ */
+void ceph_msg_revoke(struct ceph_msg *msg)
+{
+	struct ceph_connection *con = msg->con;
+
+	if (!con)
+		return;		/* Message not in our possession */
+
+	mutex_lock(&con->mutex);
+	if (!list_empty(&msg->list_head)) {
+		dout("%s %p msg %p - was on queue\n", __func__, con, msg);
+		list_del_init(&msg->list_head);
+		BUG_ON(msg->con == NULL);
+		msg->con->ops->put(msg->con);
+		msg->con = NULL;
+		msg->hdr.seq = 0;
+
+		ceph_msg_put(msg);
+	}
+	if (con->out_msg == msg) {
+		dout("%s %p msg %p - was sending\n", __func__, con, msg);
+		con->out_msg = NULL;
+		/* bytes of this message still in the kvec are marked to skip */
+		if (con->out_kvec_is_msg) {
+			con->out_skip = con->out_kvec_bytes;
+			con->out_kvec_is_msg = false;
+		}
+		msg->hdr.seq = 0;
+
+		ceph_msg_put(msg);
+	}
+	mutex_unlock(&con->mutex);
+}
+
+/*
+ * Revoke a message that we may be reading data into
+ *
+ * A no-op unless @msg is the connection's current in_msg.  In that
+ * case in_base_pos is moved negative by the header, front, middle,
+ * data and footer sizes so the rest of the message is skipped, the
+ * message reference is dropped, and in_seq is advanced.
+ */
+void ceph_msg_revoke_incoming(struct ceph_msg *msg)
+{
+	struct ceph_connection *con;
+
+	BUG_ON(msg == NULL);
+	if (!msg->con) {
+		dout("%s msg %p null con\n", __func__, msg);
+
+		return;		/* Message not in our possession */
+	}
+
+	con = msg->con;
+	mutex_lock(&con->mutex);
+	if (con->in_msg == msg) {
+		unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
+		unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len);
+		unsigned int data_len = le32_to_cpu(con->in_hdr.data_len);
+
+		/* skip rest of message */
+		dout("%s %p msg %p revoked\n", __func__, con, msg);
+		con->in_base_pos = con->in_base_pos -
+				sizeof(struct ceph_msg_header) -
+				front_len -
+				middle_len -
+				data_len -
+				sizeof(struct ceph_msg_footer);
+		ceph_msg_put(con->in_msg);
+		con->in_msg = NULL;
+		con->in_tag = CEPH_MSGR_TAG_READY;
+		con->in_seq++;
+	} else {
+		dout("%s %p in_msg %p msg %p no-op\n",
+		     __func__, con, con->in_msg, msg);
+	}
+	mutex_unlock(&con->mutex);
+}
+
+/*
+ * Queue a keepalive byte to ensure the tcp connection is alive.
+ */ +void ceph_con_keepalive(struct ceph_connection *con) +{ + dout("con_keepalive %p\n", con); + mutex_lock(&con->mutex); + clear_standby(con); + mutex_unlock(&con->mutex); + if (con_flag_test_and_set(con, CON_FLAG_KEEPALIVE_PENDING) == 0 && + con_flag_test_and_set(con, CON_FLAG_WRITE_PENDING) == 0) + queue_con(con); +} +EXPORT_SYMBOL(ceph_con_keepalive); + +static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type) +{ + struct ceph_msg_data *data; + + if (WARN_ON(!ceph_msg_data_type_valid(type))) + return NULL; + + data = kmem_cache_zalloc(ceph_msg_data_cache, GFP_NOFS); + if (data) + data->type = type; + INIT_LIST_HEAD(&data->links); + + return data; +} + +static void ceph_msg_data_destroy(struct ceph_msg_data *data) +{ + if (!data) + return; + + WARN_ON(!list_empty(&data->links)); + if (data->type == CEPH_MSG_DATA_PAGELIST) + ceph_pagelist_release(data->pagelist); + kmem_cache_free(ceph_msg_data_cache, data); +} + +void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, + size_t length, size_t alignment) +{ + struct ceph_msg_data *data; + + BUG_ON(!pages); + BUG_ON(!length); + + data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES); + BUG_ON(!data); + data->pages = pages; + data->length = length; + data->alignment = alignment & ~PAGE_MASK; + + list_add_tail(&data->links, &msg->data); + msg->data_length += length; +} +EXPORT_SYMBOL(ceph_msg_data_add_pages); + +void ceph_msg_data_add_pagelist(struct ceph_msg *msg, + struct ceph_pagelist *pagelist) +{ + struct ceph_msg_data *data; + + BUG_ON(!pagelist); + BUG_ON(!pagelist->length); + + data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST); + BUG_ON(!data); + data->pagelist = pagelist; + + list_add_tail(&data->links, &msg->data); + msg->data_length += pagelist->length; +} +EXPORT_SYMBOL(ceph_msg_data_add_pagelist); + +#ifdef CONFIG_BLOCK +void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio, + size_t length) +{ + struct ceph_msg_data *data; + + BUG_ON(!bio); + + data = 
ceph_msg_data_create(CEPH_MSG_DATA_BIO); + BUG_ON(!data); + data->bio = bio; + data->bio_length = length; + + list_add_tail(&data->links, &msg->data); + msg->data_length += length; +} +EXPORT_SYMBOL(ceph_msg_data_add_bio); +#endif /* CONFIG_BLOCK */ + +/* + * construct a new message with given type, size + * the new msg has a ref count of 1. + */ +struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, + bool can_fail) +{ + struct ceph_msg *m; + + m = kmem_cache_zalloc(ceph_msg_cache, flags); + if (m == NULL) + goto out; + + m->hdr.type = cpu_to_le16(type); + m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); + m->hdr.front_len = cpu_to_le32(front_len); + + INIT_LIST_HEAD(&m->list_head); + kref_init(&m->kref); + INIT_LIST_HEAD(&m->data); + + /* front */ + if (front_len) { + m->front.iov_base = ceph_kvmalloc(front_len, flags); + if (m->front.iov_base == NULL) { + dout("ceph_msg_new can't allocate %d bytes\n", + front_len); + goto out2; + } + } else { + m->front.iov_base = NULL; + } + m->front_alloc_len = m->front.iov_len = front_len; + + dout("ceph_msg_new %p front %d\n", m, front_len); + return m; + +out2: + ceph_msg_put(m); +out: + if (!can_fail) { + pr_err("msg_new can't create type %d front %d\n", type, + front_len); + WARN_ON(1); + } else { + dout("msg_new can't create type %d front %d\n", type, + front_len); + } + return NULL; +} +EXPORT_SYMBOL(ceph_msg_new); + +/* + * Allocate "middle" portion of a message, if it is needed and wasn't + * allocated by alloc_msg. This allows us to read a small fixed-size + * per-type header in the front and then gracefully fail (i.e., + * propagate the error to the caller based on info in the front) when + * the middle is too large. 
+ */
+static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	int type = le16_to_cpu(msg->hdr.type);
+	int middle_len = le32_to_cpu(msg->hdr.middle_len);
+
+	dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
+	     ceph_msg_type_name(type), middle_len);
+	/* callers must only ask when a middle is needed and not yet present */
+	BUG_ON(!middle_len);
+	BUG_ON(msg->middle);
+
+	msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
+	if (!msg->middle)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ * Allocate a message for receiving an incoming message on a
+ * connection, and save the result in con->in_msg.  Uses the
+ * connection's private alloc_msg op if available.
+ *
+ * Returns 0 on success, or a negative error code.
+ *
+ * On success, if we set *skip = 1:
+ *  - the next message should be skipped and ignored.
+ *  - con->in_msg == NULL
+ * or if we set *skip = 0:
+ *  - con->in_msg is non-null.
+ * On error (ENOMEM, EAGAIN, ...),
+ *  - con->in_msg == NULL
+ */
+static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
+{
+	struct ceph_msg_header *hdr = &con->in_hdr;
+	int middle_len = le32_to_cpu(hdr->middle_len);
+	struct ceph_msg *msg;
+	int ret = 0;
+
+	BUG_ON(con->in_msg != NULL);
+	BUG_ON(!con->ops->alloc_msg);
+
+	/* con->mutex is dropped around the alloc_msg callback */
+	mutex_unlock(&con->mutex);
+	msg = con->ops->alloc_msg(con, hdr, skip);
+	mutex_lock(&con->mutex);
+	/* the state may have changed while the mutex was released */
+	if (con->state != CON_STATE_OPEN) {
+		if (msg)
+			ceph_msg_put(msg);
+		return -EAGAIN;
+	}
+	if (msg) {
+		BUG_ON(*skip);
+		con->in_msg = msg;
+		con->in_msg->con = con->ops->get(con);
+		BUG_ON(con->in_msg->con == NULL);
+	} else {
+		/*
+		 * Null message pointer means either we should skip
+		 * this message or we couldn't allocate memory.  The
+		 * former is not an error.
+		 */
+		if (*skip)
+			return 0;
+
+		con->error_msg = "error allocating memory for incoming message";
+		return -ENOMEM;
+	}
+	memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
+
+	if (middle_len && !con->in_msg->middle) {
+		ret = ceph_alloc_middle(con, con->in_msg);
+		if (ret < 0) {
+			ceph_msg_put(con->in_msg);
+			con->in_msg = NULL;
+		}
+	}
+
+	return ret;
+}
+
+
+/*
+ * Free a generically kmalloc'd message.
+ */
+static void ceph_msg_free(struct ceph_msg *m)
+{
+	dout("%s %p\n", __func__, m);
+	kvfree(m->front.iov_base);
+	kmem_cache_free(ceph_msg_cache, m);
+}
+
+/*
+ * kref release callback for a message: drops the middle buffer and all
+ * data items, then returns the message to its pool if it has one, or
+ * frees it otherwise.
+ */
+static void ceph_msg_release(struct kref *kref)
+{
+	struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
+	LIST_HEAD(data);
+	struct list_head *links;
+	struct list_head *next;
+
+	dout("%s %p\n", __func__, m);
+	WARN_ON(!list_empty(&m->list_head));
+
+	/* drop middle, data, if any */
+	if (m->middle) {
+		ceph_buffer_put(m->middle);
+		m->middle = NULL;
+	}
+
+	/* move the items to a local list, then destroy them one by one */
+	list_splice_init(&m->data, &data);
+	list_for_each_safe(links, next, &data) {
+		/* note: shadows the list head named "data" above */
+		struct ceph_msg_data *data;
+
+		data = list_entry(links, struct ceph_msg_data, links);
+		list_del_init(links);
+		ceph_msg_data_destroy(data);
+	}
+	m->data_length = 0;
+
+	if (m->pool)
+		ceph_msgpool_put(m->pool, m);
+	else
+		ceph_msg_free(m);
+}
+
+/* Take a reference on a message; returns @msg for convenience. */
+struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
+{
+	dout("%s %p (was %d)\n", __func__, msg,
+	     atomic_read(&msg->kref.refcount));
+	kref_get(&msg->kref);
+	return msg;
+}
+EXPORT_SYMBOL(ceph_msg_get);
+
+/* Drop a reference on a message; the last put runs ceph_msg_release(). */
+void ceph_msg_put(struct ceph_msg *msg)
+{
+	dout("%s %p (was %d)\n", __func__, msg,
+	     atomic_read(&msg->kref.refcount));
+	kref_put(&msg->kref, ceph_msg_release);
+}
+EXPORT_SYMBOL(ceph_msg_put);
+
+/*
+ * Hex-dump a message's header, front, middle (if present) and footer
+ * at debug level.
+ */
+void ceph_msg_dump(struct ceph_msg *msg)
+{
+	pr_debug("msg_dump %p (front_alloc_len %d length %zd)\n", msg,
+		 msg->front_alloc_len, msg->data_length);
+	print_hex_dump(KERN_DEBUG, "header: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       &msg->hdr, sizeof(msg->hdr), true);
+	print_hex_dump(KERN_DEBUG, " front: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       msg->front.iov_base, msg->front.iov_len, true);
+	if (msg->middle)
+		print_hex_dump(KERN_DEBUG, "middle: ",
+			       DUMP_PREFIX_OFFSET, 16, 1,
+			       msg->middle->vec.iov_base,
+			       msg->middle->vec.iov_len, true);
+	print_hex_dump(KERN_DEBUG, "footer: ",
+		       DUMP_PREFIX_OFFSET, 16, 1,
+		       &msg->footer, sizeof(msg->footer), true);
+}
+EXPORT_SYMBOL(ceph_msg_dump);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
new file mode 100644
index 000000000..2b3cf05e8
--- /dev/null
+++ b/net/ceph/mon_client.c
@@ -0,0 +1,1121 @@
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Interact with Ceph monitor cluster.  Handle requests for new map
+ * versions, and periodically resend as needed.  Also implement
+ * statfs() and umount().
+ *
+ * A small cluster of Ceph "monitors" are responsible for managing critical
+ * cluster configuration and state information.  An odd number (e.g., 3, 5)
+ * of cmon daemons use a modified version of the Paxos part-time parliament
+ * algorithm to manage the MDS map (mds cluster membership), OSD map, and
+ * list of clients who have mounted the file system.
+ *
+ * We maintain an open, active session with a monitor at all times in order to
+ * receive timely MDSMap updates.  We periodically send a keepalive byte on the
+ * TCP socket to ensure we detect a failure.  If the connection does break, we
+ * randomly hunt for a new monitor.  Once the connection is reestablished, we
+ * resend any outstanding requests.
+ */
+
+static const struct ceph_connection_operations mon_con_ops;
+
+static int __validate_auth(struct ceph_mon_client *monc);
+
+/*
+ * Decode a monmap blob (e.g., during mount).
+ */ +struct ceph_monmap *ceph_monmap_decode(void *p, void *end) +{ + struct ceph_monmap *m = NULL; + int i, err = -EINVAL; + struct ceph_fsid fsid; + u32 epoch, num_mon; + u16 version; + u32 len; + + ceph_decode_32_safe(&p, end, len, bad); + ceph_decode_need(&p, end, len, bad); + + dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p)); + + ceph_decode_16_safe(&p, end, version, bad); + + ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad); + ceph_decode_copy(&p, &fsid, sizeof(fsid)); + epoch = ceph_decode_32(&p); + + num_mon = ceph_decode_32(&p); + ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad); + + if (num_mon >= CEPH_MAX_MON) + goto bad; + m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS); + if (m == NULL) + return ERR_PTR(-ENOMEM); + m->fsid = fsid; + m->epoch = epoch; + m->num_mon = num_mon; + ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0])); + for (i = 0; i < num_mon; i++) + ceph_decode_addr(&m->mon_inst[i].addr); + + dout("monmap_decode epoch %d, num_mon %d\n", m->epoch, + m->num_mon); + for (i = 0; i < m->num_mon; i++) + dout("monmap_decode mon%d is %s\n", i, + ceph_pr_addr(&m->mon_inst[i].addr.in_addr)); + return m; + +bad: + dout("monmap_decode failed with %d\n", err); + kfree(m); + return ERR_PTR(err); +} + +/* + * return true if *addr is included in the monmap. + */ +int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr) +{ + int i; + + for (i = 0; i < m->num_mon; i++) + if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0) + return 1; + return 0; +} + +/* + * Send an auth request. + */ +static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len) +{ + monc->pending_auth = 1; + monc->m_auth->front.iov_len = len; + monc->m_auth->hdr.front_len = cpu_to_le32(len); + ceph_msg_revoke(monc->m_auth); + ceph_msg_get(monc->m_auth); /* keep our ref */ + ceph_con_send(&monc->con, monc->m_auth); +} + +/* + * Close monitor session, if any. 
+ */ +static void __close_session(struct ceph_mon_client *monc) +{ + dout("__close_session closing mon%d\n", monc->cur_mon); + ceph_msg_revoke(monc->m_auth); + ceph_msg_revoke_incoming(monc->m_auth_reply); + ceph_msg_revoke(monc->m_subscribe); + ceph_msg_revoke_incoming(monc->m_subscribe_ack); + ceph_con_close(&monc->con); + monc->cur_mon = -1; + monc->pending_auth = 0; + ceph_auth_reset(monc->auth); +} + +/* + * Open a session with a (new) monitor. + */ +static int __open_session(struct ceph_mon_client *monc) +{ + char r; + int ret; + + if (monc->cur_mon < 0) { + get_random_bytes(&r, 1); + monc->cur_mon = r % monc->monmap->num_mon; + dout("open_session num=%d r=%d -> mon%d\n", + monc->monmap->num_mon, r, monc->cur_mon); + monc->sub_sent = 0; + monc->sub_renew_after = jiffies; /* i.e., expired */ + monc->want_next_osdmap = !!monc->want_next_osdmap; + + dout("open_session mon%d opening\n", monc->cur_mon); + ceph_con_open(&monc->con, + CEPH_ENTITY_TYPE_MON, monc->cur_mon, + &monc->monmap->mon_inst[monc->cur_mon].addr); + + /* initiatiate authentication handshake */ + ret = ceph_auth_build_hello(monc->auth, + monc->m_auth->front.iov_base, + monc->m_auth->front_alloc_len); + __send_prepared_auth_request(monc, ret); + } else { + dout("open_session mon%d already open\n", monc->cur_mon); + } + return 0; +} + +static bool __sub_expired(struct ceph_mon_client *monc) +{ + return time_after_eq(jiffies, monc->sub_renew_after); +} + +/* + * Reschedule delayed work timer. + */ +static void __schedule_delayed(struct ceph_mon_client *monc) +{ + unsigned int delay; + + if (monc->cur_mon < 0 || __sub_expired(monc)) + delay = 10 * HZ; + else + delay = 20 * HZ; + dout("__schedule_delayed after %u\n", delay); + schedule_delayed_work(&monc->delayed_work, delay); +} + +/* + * Send subscribe request for mdsmap and/or osdmap. 
+ */ +static void __send_subscribe(struct ceph_mon_client *monc) +{ + dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n", + (unsigned int)monc->sub_sent, __sub_expired(monc), + monc->want_next_osdmap); + if ((__sub_expired(monc) && !monc->sub_sent) || + monc->want_next_osdmap == 1) { + struct ceph_msg *msg = monc->m_subscribe; + struct ceph_mon_subscribe_item *i; + void *p, *end; + int num; + + p = msg->front.iov_base; + end = p + msg->front_alloc_len; + + num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; + ceph_encode_32(&p, num); + + if (monc->want_next_osdmap) { + dout("__send_subscribe to 'osdmap' %u\n", + (unsigned int)monc->have_osdmap); + ceph_encode_string(&p, end, "osdmap", 6); + i = p; + i->have = cpu_to_le64(monc->have_osdmap); + i->onetime = 1; + p += sizeof(*i); + monc->want_next_osdmap = 2; /* requested */ + } + if (monc->want_mdsmap) { + dout("__send_subscribe to 'mdsmap' %u+\n", + (unsigned int)monc->have_mdsmap); + ceph_encode_string(&p, end, "mdsmap", 6); + i = p; + i->have = cpu_to_le64(monc->have_mdsmap); + i->onetime = 0; + p += sizeof(*i); + } + ceph_encode_string(&p, end, "monmap", 6); + i = p; + i->have = 0; + i->onetime = 0; + p += sizeof(*i); + + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + ceph_msg_revoke(msg); + ceph_con_send(&monc->con, ceph_msg_get(msg)); + + monc->sub_sent = jiffies | 1; /* never 0 */ + } +} + +static void handle_subscribe_ack(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + unsigned int seconds; + struct ceph_mon_subscribe_ack *h = msg->front.iov_base; + + if (msg->front.iov_len < sizeof(*h)) + goto bad; + seconds = le32_to_cpu(h->duration); + + mutex_lock(&monc->mutex); + if (monc->hunting) { + pr_info("mon%d %s session established\n", + monc->cur_mon, + ceph_pr_addr(&monc->con.peer_addr.in_addr)); + monc->hunting = false; + } + dout("handle_subscribe_ack after %d seconds\n", seconds); + monc->sub_renew_after = monc->sub_sent + 
(seconds >> 1)*HZ - 1; + monc->sub_sent = 0; + mutex_unlock(&monc->mutex); + return; +bad: + pr_err("got corrupt subscribe-ack msg\n"); + ceph_msg_dump(msg); +} + +/* + * Keep track of which maps we have + */ +int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got) +{ + mutex_lock(&monc->mutex); + monc->have_mdsmap = got; + mutex_unlock(&monc->mutex); + return 0; +} +EXPORT_SYMBOL(ceph_monc_got_mdsmap); + +int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) +{ + mutex_lock(&monc->mutex); + monc->have_osdmap = got; + monc->want_next_osdmap = 0; + mutex_unlock(&monc->mutex); + return 0; +} + +/* + * Register interest in the next osdmap + */ +void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) +{ + dout("request_next_osdmap have %u\n", monc->have_osdmap); + mutex_lock(&monc->mutex); + if (!monc->want_next_osdmap) + monc->want_next_osdmap = 1; + if (monc->want_next_osdmap < 2) + __send_subscribe(monc); + mutex_unlock(&monc->mutex); +} +EXPORT_SYMBOL(ceph_monc_request_next_osdmap); + +int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, + unsigned long timeout) +{ + unsigned long started = jiffies; + int ret; + + mutex_lock(&monc->mutex); + while (monc->have_osdmap < epoch) { + mutex_unlock(&monc->mutex); + + if (timeout != 0 && time_after_eq(jiffies, started + timeout)) + return -ETIMEDOUT; + + ret = wait_event_interruptible_timeout(monc->client->auth_wq, + monc->have_osdmap >= epoch, timeout); + if (ret < 0) + return ret; + + mutex_lock(&monc->mutex); + } + + mutex_unlock(&monc->mutex); + return 0; +} +EXPORT_SYMBOL(ceph_monc_wait_osdmap); + +/* + * + */ +int ceph_monc_open_session(struct ceph_mon_client *monc) +{ + mutex_lock(&monc->mutex); + __open_session(monc); + __schedule_delayed(monc); + mutex_unlock(&monc->mutex); + return 0; +} +EXPORT_SYMBOL(ceph_monc_open_session); + +/* + * We require the fsid and global_id in order to initialize our + * debugfs dir. 
+ */
+static bool have_debugfs_info(struct ceph_mon_client *monc)
+{
+	dout("have_debugfs_info fsid %d globalid %lld\n",
+	     (int)monc->client->have_fsid, monc->auth->global_id);
+	return monc->client->have_fsid && monc->auth->global_id > 0;
+}
+
+/*
+ * The monitor responds with mount ack indicate mount success.  The
+ * included client ticket allows the client to talk to MDSs and OSDs.
+ *
+ * Decodes the monmap carried in @msg, checks its fsid, and installs
+ * it in place of the previous monmap, which is freed.  Waiters on
+ * client->auth_wq are woken on every path, including decode failure.
+ */
+static void ceph_monc_handle_map(struct ceph_mon_client *monc,
+				 struct ceph_msg *msg)
+{
+	struct ceph_client *client = monc->client;
+	struct ceph_monmap *monmap = NULL, *old = monc->monmap;
+	void *p, *end;
+	int had_debugfs_info, init_debugfs = 0;
+
+	mutex_lock(&monc->mutex);
+
+	had_debugfs_info = have_debugfs_info(monc);
+
+	dout("handle_monmap\n");
+	p = msg->front.iov_base;
+	end = p + msg->front.iov_len;
+
+	monmap = ceph_monmap_decode(p, end);
+	if (IS_ERR(monmap)) {
+		pr_err("problem decoding monmap, %d\n",
+		       (int)PTR_ERR(monmap));
+		goto out;
+	}
+
+	/* discard the new map if the fsid check fails */
+	if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
+		kfree(monmap);
+		goto out;
+	}
+
+	client->monc.monmap = monmap;
+	kfree(old);
+
+	if (!client->have_fsid) {
+		client->have_fsid = true;
+		if (!had_debugfs_info && have_debugfs_info(monc)) {
+			pr_info("client%lld fsid %pU\n",
+				ceph_client_id(monc->client),
+				&monc->client->fsid);
+			init_debugfs = 1;
+		}
+		mutex_unlock(&monc->mutex);
+
+		if (init_debugfs) {
+			/*
+			 * do debugfs initialization without mutex to avoid
+			 * creating a locking dependency
+			 */
+			ceph_debugfs_client_init(monc->client);
+		}
+
+		/* the mutex was already released above */
+		goto out_unlocked;
+	}
+out:
+	mutex_unlock(&monc->mutex);
+out_unlocked:
+	wake_up_all(&client->auth_wq);
+}
+
+/*
+ * generic requests (currently statfs, mon_get_version)
+ */
+/* Look up a generic request by tid in the rbtree; NULL if not found. */
+static struct ceph_mon_generic_request *__lookup_generic_req(
+	struct ceph_mon_client *monc, u64 tid)
+{
+	struct ceph_mon_generic_request *req;
+	struct rb_node *n = monc->generic_request_tree.rb_node;
+
+	while (n) {
+		req = rb_entry(n, struct ceph_mon_generic_request, node);
+		if (tid < req->tid)
+			n = n->rb_left;
+		else if (tid > req->tid)
+			n = n->rb_right;
+		else
+			return req;
+	}
+	return NULL;
+}
+
+/*
+ * Insert a generic request into the rbtree, keyed by tid.  A duplicate
+ * tid is treated as a fatal bug.
+ */
+static void __insert_generic_request(struct ceph_mon_client *monc,
+			    struct ceph_mon_generic_request *new)
+{
+	struct rb_node **p = &monc->generic_request_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct ceph_mon_generic_request *req = NULL;
+
+	while (*p) {
+		parent = *p;
+		req = rb_entry(parent, struct ceph_mon_generic_request, node);
+		if (new->tid < req->tid)
+			p = &(*p)->rb_left;
+		else if (new->tid > req->tid)
+			p = &(*p)->rb_right;
+		else
+			BUG();
+	}
+
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, &monc->generic_request_tree);
+}
+
+/*
+ * kref release callback: drop the reply and request messages, if any,
+ * and free the request.
+ */
+static void release_generic_request(struct kref *kref)
+{
+	struct ceph_mon_generic_request *req =
+		container_of(kref, struct ceph_mon_generic_request, kref);
+
+	if (req->reply)
+		ceph_msg_put(req->reply);
+	if (req->request)
+		ceph_msg_put(req->request);
+
+	kfree(req);
+}
+
+/* Drop a reference on a generic request. */
+static void put_generic_request(struct ceph_mon_generic_request *req)
+{
+	kref_put(&req->kref, release_generic_request);
+}
+
+/* Take a reference on a generic request. */
+static void get_generic_request(struct ceph_mon_generic_request *req)
+{
+	kref_get(&req->kref);
+}
+
+/*
+ * Find the preallocated reply message for the request whose tid is in
+ * @hdr.  Returns the reply with an extra reference and *skip = 0, or
+ * NULL with *skip = 1 when no such request is registered.
+ */
+static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
+					 struct ceph_msg_header *hdr,
+					 int *skip)
+{
+	struct ceph_mon_client *monc = con->private;
+	struct ceph_mon_generic_request *req;
+	u64 tid = le64_to_cpu(hdr->tid);
+	struct ceph_msg *m;
+
+	mutex_lock(&monc->mutex);
+	req = __lookup_generic_req(monc, tid);
+	if (!req) {
+		dout("get_generic_reply %lld dne\n", tid);
+		*skip = 1;
+		m = NULL;
+	} else {
+		dout("get_generic_reply %lld got %p\n", tid, req->reply);
+		*skip = 0;
+		m = ceph_msg_get(req->reply);
+		/*
+		 * we don't need to track the connection reading into
+		 * this reply because we only have one open connection
+		 * at a time, ever.
+		 */
+	}
+	mutex_unlock(&monc->mutex);
+	return m;
+}
+
+/*
+ * Register @req, send it and wait for its completion.
+ *
+ * Must be called with monc->mutex held; the mutex is released while
+ * waiting and held again on return.  A @tid of 0 means allocate the
+ * next tid.  Returns the wait's error if it was interrupted, otherwise
+ * req->result.
+ */
+static int __do_generic_request(struct ceph_mon_client *monc, u64 tid,
+				struct ceph_mon_generic_request *req)
+{
+	int err;
+
+	/* register request */
+	req->tid = tid != 0 ? tid : ++monc->last_tid;
+	req->request->hdr.tid = cpu_to_le64(req->tid);
+	__insert_generic_request(monc, req);
+	monc->num_generic_requests++;
+	ceph_con_send(&monc->con, ceph_msg_get(req->request));
+	mutex_unlock(&monc->mutex);
+
+	err = wait_for_completion_interruptible(&req->completion);
+
+	/* unregister under the mutex whether or not the wait succeeded */
+	mutex_lock(&monc->mutex);
+	rb_erase(&req->node, &monc->generic_request_tree);
+	monc->num_generic_requests--;
+
+	if (!err)
+		err = req->result;
+	return err;
+}
+
+/* Locked wrapper around __do_generic_request() that allocates a new tid. */
+static int do_generic_request(struct ceph_mon_client *monc,
+			      struct ceph_mon_generic_request *req)
+{
+	int err;
+
+	mutex_lock(&monc->mutex);
+	err = __do_generic_request(monc, 0, req);
+	mutex_unlock(&monc->mutex);
+
+	return err;
+}
+
+/*
+ * statfs
+ */
+/*
+ * Handle a statfs reply: copy the result into the matching request's
+ * buffer and complete it.  A reply whose front is not exactly the size
+ * of a statfs reply is logged and dumped.
+ */
+static void handle_statfs_reply(struct ceph_mon_client *monc,
+				struct ceph_msg *msg)
+{
+	struct ceph_mon_generic_request *req;
+	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
+	u64 tid = le64_to_cpu(msg->hdr.tid);
+
+	if (msg->front.iov_len != sizeof(*reply))
+		goto bad;
+	dout("handle_statfs_reply %p tid %llu\n", msg, tid);
+
+	mutex_lock(&monc->mutex);
+	req = __lookup_generic_req(monc, tid);
+	if (req) {
+		*(struct ceph_statfs *)req->buf = reply->st;
+		req->result = 0;
+		/* hold a ref so req stays valid after the mutex is dropped */
+		get_generic_request(req);
+	}
+	mutex_unlock(&monc->mutex);
+	if (req) {
+		complete_all(&req->completion);
+		put_generic_request(req);
+	}
+	return;
+
+bad:
+	pr_err("corrupt statfs reply, tid %llu\n", tid);
+	ceph_msg_dump(msg);
+}
+
+/*
+ * Do a synchronous statfs().
+ */ +int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf) +{ + struct ceph_mon_generic_request *req; + struct ceph_mon_statfs *h; + int err; + + req = kzalloc(sizeof(*req), GFP_NOFS); + if (!req) + return -ENOMEM; + + kref_init(&req->kref); + req->buf = buf; + init_completion(&req->completion); + + err = -ENOMEM; + req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS, + true); + if (!req->request) + goto out; + req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS, + true); + if (!req->reply) + goto out; + + /* fill out request */ + h = req->request->front.iov_base; + h->monhdr.have_version = 0; + h->monhdr.session_mon = cpu_to_le16(-1); + h->monhdr.session_mon_tid = 0; + h->fsid = monc->monmap->fsid; + + err = do_generic_request(monc, req); + +out: + put_generic_request(req); + return err; +} +EXPORT_SYMBOL(ceph_monc_do_statfs); + +static void handle_get_version_reply(struct ceph_mon_client *monc, + struct ceph_msg *msg) +{ + struct ceph_mon_generic_request *req; + u64 tid = le64_to_cpu(msg->hdr.tid); + void *p = msg->front.iov_base; + void *end = p + msg->front_alloc_len; + u64 handle; + + dout("%s %p tid %llu\n", __func__, msg, tid); + + ceph_decode_need(&p, end, 2*sizeof(u64), bad); + handle = ceph_decode_64(&p); + if (tid != 0 && tid != handle) + goto bad; + + mutex_lock(&monc->mutex); + req = __lookup_generic_req(monc, handle); + if (req) { + *(u64 *)req->buf = ceph_decode_64(&p); + req->result = 0; + get_generic_request(req); + } + mutex_unlock(&monc->mutex); + if (req) { + complete_all(&req->completion); + put_generic_request(req); + } + + return; +bad: + pr_err("corrupt mon_get_version reply, tid %llu\n", tid); + ceph_msg_dump(msg); +} + +/* + * Send MMonGetVersion and wait for the reply. 
+ *
+ * @what: one of "mdsmap", "osdmap" or "monmap"
+ */
+int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what,
+			     u64 *newest)
+{
+	struct ceph_mon_generic_request *req;
+	void *p, *end;
+	u64 tid;
+	int err;
+
+	req = kzalloc(sizeof(*req), GFP_NOFS);
+	if (!req)
+		return -ENOMEM;
+
+	kref_init(&req->kref);
+	req->buf = newest;	/* the reply handler stores the version here */
+	init_completion(&req->completion);
+
+	req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION,
+				    sizeof(u64) + sizeof(u32) + strlen(what), /* u64 handle + (presumably) u32 length prefix + name */
+				    GFP_NOFS, true);
+	if (!req->request) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024,
+				  GFP_NOFS, true);
+	if (!req->reply) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	p = req->request->front.iov_base;
+	end = p + req->request->front_alloc_len;
+
+	/* fill out request */
+	mutex_lock(&monc->mutex);
+	tid = ++monc->last_tid;	/* the tid doubles as the handle in the payload */
+	ceph_encode_64(&p, tid); /* handle */
+	ceph_encode_string(&p, end, what, strlen(what));
+
+	err = __do_generic_request(monc, tid, req); /* drops and retakes monc->mutex */
+
+	mutex_unlock(&monc->mutex);
+out:
+	put_generic_request(req);
+	return err;
+}
+EXPORT_SYMBOL(ceph_monc_do_get_version);
+
+/*
+ * Resend pending generic requests.
+ *
+ * For every request in monc->generic_request_tree this calls
+ * ceph_msg_revoke() on the request message and
+ * ceph_msg_revoke_incoming() on the reply, then queues the request on
+ * monc->con again with a fresh message reference.
+ */
+static void __resend_generic_request(struct ceph_mon_client *monc)
+{
+	struct ceph_mon_generic_request *req;
+	struct rb_node *p;
+
+	for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
+		req = rb_entry(p, struct ceph_mon_generic_request, node);
+		ceph_msg_revoke(req->request);
+		ceph_msg_revoke_incoming(req->reply);
+		ceph_con_send(&monc->con, ceph_msg_get(req->request));
+	}
+}
+
+/*
+ * Delayed work. If we haven't mounted yet, retry. Otherwise,
+ * renew/retry subscription as needed (in case it is timing out, or we
+ * got an ENOMEM). And keep the monitor connection alive.
+ */
+static void delayed_work(struct work_struct *work)
+{
+	struct ceph_mon_client *monc =
+		container_of(work, struct ceph_mon_client, delayed_work.work);
+
+	dout("monc delayed_work\n");
+	mutex_lock(&monc->mutex);
+	if (monc->hunting) {
+		__close_session(monc);
+		__open_session(monc);  /* continue hunting */
+	} else {
+		ceph_con_keepalive(&monc->con);
+
+		__validate_auth(monc);
+
+		if (ceph_auth_is_authenticated(monc->auth))
+			__send_subscribe(monc);
+	}
+	__schedule_delayed(monc);	/* runs on both branches */
+	mutex_unlock(&monc->mutex);
+}
+
+/*
+ * On startup, we build a temporary monmap populated with the IPs
+ * provided by mount(2).
+ *
+ * Each entry takes the configured address with its nonce cleared, the
+ * MON entity type and its index as the entity number.  Returns 0, or
+ * -ENOMEM if the map cannot be allocated.
+ */
+static int build_initial_monmap(struct ceph_mon_client *monc)
+{
+	struct ceph_options *opt = monc->client->options;
+	struct ceph_entity_addr *mon_addr = opt->mon_addr;
+	int num_mon = opt->num_mon;
+	int i;
+
+	/* build initial monmap */
+	monc->monmap = kzalloc(sizeof(*monc->monmap) +
+			       num_mon*sizeof(monc->monmap->mon_inst[0]),
+			       GFP_KERNEL);
+	if (!monc->monmap)
+		return -ENOMEM;
+	for (i = 0; i < num_mon; i++) {
+		monc->monmap->mon_inst[i].addr = mon_addr[i];
+		monc->monmap->mon_inst[i].addr.nonce = 0;
+		monc->monmap->mon_inst[i].name.type =
+			CEPH_ENTITY_TYPE_MON;
+		monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
+	}
+	monc->monmap->num_mon = num_mon;
+	return 0;
+}
+/*
+ * Initialise @monc for client @cl: zero it, build the initial monmap,
+ * set up the auth state, preallocate the subscribe and auth messages,
+ * then initialise the connection and the bookkeeping fields.
+ *
+ * Returns 0 on success.  On failure returns a negative errno after
+ * releasing whatever had been set up, in reverse order.
+ */
+int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
+{
+	int err = 0;
+
+	dout("init\n");
+	memset(monc, 0, sizeof(*monc));
+	monc->client = cl;
+	monc->monmap = NULL;
+	mutex_init(&monc->mutex);
+
+	err = build_initial_monmap(monc);
+	if (err)
+		goto out;
+
+	/* connection (set up by ceph_con_init() below, after the messages) */
+	/* authentication */
+	monc->auth = ceph_auth_init(cl->options->name,
+				    cl->options->key);
+	if (IS_ERR(monc->auth)) {
+		err = PTR_ERR(monc->auth);
+		goto out_monmap;
+	}
+	monc->auth->want_keys =
+		CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
+		CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
+
+	/* msgs */
+	err = -ENOMEM;	/* the only failure mode of the allocations below */
+	monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
+				     sizeof(struct ceph_mon_subscribe_ack),
+				     GFP_NOFS, true);
+	if (!monc->m_subscribe_ack)
+		goto out_auth;
+
+	monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS,
+					 true);
+	if (!monc->m_subscribe)
+		goto out_subscribe_ack;
+
+	monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS,
+					  true);
+	if (!monc->m_auth_reply)
+		goto out_subscribe;
+
+	monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS, true);
+	monc->pending_auth = 0;
+	if (!monc->m_auth)
+		goto out_auth_reply;
+
+	ceph_con_init(&monc->con, monc, &mon_con_ops,
+		      &monc->client->msgr);
+
+	monc->cur_mon = -1;
+	monc->hunting = true;
+	monc->sub_renew_after = jiffies;
+	monc->sub_sent = 0;
+
+	INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
+	monc->generic_request_tree = RB_ROOT;
+	monc->num_generic_requests = 0;
+	monc->last_tid = 0;
+
+	monc->have_mdsmap = 0;
+	monc->have_osdmap = 0;
+	monc->want_next_osdmap = 1;
+	return 0;
+
+out_auth_reply:
+	ceph_msg_put(monc->m_auth_reply);
+out_subscribe:
+	ceph_msg_put(monc->m_subscribe);
+out_subscribe_ack:
+	ceph_msg_put(monc->m_subscribe_ack);
+out_auth:
+	ceph_auth_destroy(monc->auth);
+out_monmap:
+	kfree(monc->monmap);
+out:
+	return err;
+}
+EXPORT_SYMBOL(ceph_monc_init);
+/*
+ * Shut the monitor client down: cancel the delayed work, close the
+ * session under monc->mutex, flush the messenger, then destroy the auth
+ * state and release the preallocated messages and the monmap.
+ */
+void ceph_monc_stop(struct ceph_mon_client *monc)
+{
+	dout("stop\n");
+	cancel_delayed_work_sync(&monc->delayed_work);
+
+	mutex_lock(&monc->mutex);
+	__close_session(monc);
+
+	mutex_unlock(&monc->mutex);
+
+	/*
+	 * flush msgr queue before we destroy ourselves to ensure that:
+	 *  - any work that references our embedded con is finished.
+	 *  - any osd_client or other work that may reference an authorizer
+	 *    finishes before we shut down the auth subsystem.
+	 */
+	ceph_msgr_flush();
+
+	ceph_auth_destroy(monc->auth);
+
+	ceph_msg_put(monc->m_auth);
+	ceph_msg_put(monc->m_auth_reply);
+	ceph_msg_put(monc->m_subscribe);
+	ceph_msg_put(monc->m_subscribe_ack);
+
+	kfree(monc->monmap);
+}
+EXPORT_SYMBOL(ceph_monc_stop);
+/*
+ * Process an auth reply from the monitor.
+ *
+ * ret < 0: record the error in client->auth_err and wake auth_wq.
+ * ret > 0: send the next prepared auth request of that length.
+ * ret == 0 and newly authenticated: set the messenger's entity name
+ * from the auth global_id, then send the subscription and resend any
+ * pending generic requests.
+ *
+ * If the debugfs prerequisites became available during this call,
+ * debugfs is initialised after the mutex has been dropped.
+ */
+static void handle_auth_reply(struct ceph_mon_client *monc,
+			      struct ceph_msg *msg)
+{
+	int ret;
+	int was_auth = 0;
+	int had_debugfs_info, init_debugfs = 0;
+
+	mutex_lock(&monc->mutex);
+	had_debugfs_info = have_debugfs_info(monc);
+	was_auth = ceph_auth_is_authenticated(monc->auth);
+	monc->pending_auth = 0;
+	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
+				     msg->front.iov_len,
+				     monc->m_auth->front.iov_base,
+				     monc->m_auth->front_alloc_len);
+	if (ret < 0) {
+		monc->client->auth_err = ret;
+		wake_up_all(&monc->client->auth_wq);
+	} else if (ret > 0) {
+		__send_prepared_auth_request(monc, ret);
+	} else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
+		dout("authenticated, starting session\n");
+
+		monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
+		monc->client->msgr.inst.name.num =
+					cpu_to_le64(monc->auth->global_id);
+
+		__send_subscribe(monc);
+		__resend_generic_request(monc);
+	}
+
+	if (!had_debugfs_info && have_debugfs_info(monc)) {
+		pr_info("client%lld fsid %pU\n",
+			ceph_client_id(monc->client),
+			&monc->client->fsid);
+		init_debugfs = 1;
+	}
+	mutex_unlock(&monc->mutex);
+
+	if (init_debugfs) {
+		/*
+		 * do debugfs initialization without mutex to avoid
+		 * creating a locking dependency
+		 */
+		ceph_debugfs_client_init(monc->client);
+	}
+}
+/*
+ * Build and send an auth request unless one is already pending.
+ * Returns 0 if a request is pending or was just sent; otherwise the
+ * non-positive result of ceph_build_auth().
+ */
+static int __validate_auth(struct ceph_mon_client *monc)
+{
+	int ret;
+
+	if (monc->pending_auth)
+		return 0;
+
+	ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
+			      monc->m_auth->front_alloc_len);
+	if (ret <= 0)
+		return ret; /* either an error, or no need to authenticate */
+	__send_prepared_auth_request(monc, ret);
+	return 0;
+}
+/*
+ * Locked wrapper around __validate_auth().
+ */
+int ceph_monc_validate_auth(struct ceph_mon_client *monc)
+{
+	int ret;
+
+	mutex_lock(&monc->mutex);
+	ret = __validate_auth(monc);
+	mutex_unlock(&monc->mutex);
+	return ret;
+}
+EXPORT_SYMBOL(ceph_monc_validate_auth);
+
+/*
+ * handle incoming message
+ *
+ * Route @msg to its handler by header type.  Types not handled here are
+ * offered to client->extra_mon_dispatch, if set; anything still
+ * unhandled is logged.  The message reference is dropped with
+ * ceph_msg_put() once the switch is done.
+ * NOTE(review): the early return for a NULL monc skips that put --
+ * confirm this matches what the messenger expects.
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+	struct ceph_mon_client *monc = con->private;
+	int type = le16_to_cpu(msg->hdr.type);
+
+	if (!monc)
+		return;
+
+	switch (type) {
+	case CEPH_MSG_AUTH_REPLY:
+		handle_auth_reply(monc, msg);
+		break;
+
+	case CEPH_MSG_MON_SUBSCRIBE_ACK:
+		handle_subscribe_ack(monc, msg);
+		break;
+
+	case CEPH_MSG_STATFS_REPLY:
+		handle_statfs_reply(monc, msg);
+		break;
+
+	case CEPH_MSG_MON_GET_VERSION_REPLY:
+		handle_get_version_reply(monc, msg);
+		break;
+
+	case CEPH_MSG_MON_MAP:
+		ceph_monc_handle_map(monc, msg);
+		break;
+
+	case CEPH_MSG_OSD_MAP:
+		ceph_osdc_handle_map(&monc->client->osdc, msg);
+		break;
+
+	default:
+		/* can the chained handler handle it? */
+		if (monc->client->extra_mon_dispatch &&
+		    monc->client->extra_mon_dispatch(monc->client, msg) == 0)
+			break;
+
+		pr_err("received unknown message type %d %s\n", type,
+		       ceph_msg_type_name(type));
+	}
+	ceph_msg_put(msg);
+}
+
+/*
+ * Allocate memory for incoming message
+ *
+ * Subscribe acks and auth replies reuse the preallocated messages;
+ * statfs replies, and get_version replies with a non-zero tid, use the
+ * reply message of the matching generic request; map messages get a
+ * freshly allocated message.  Unknown types set *skip.  If the incoming
+ * front does not fit the chosen message, that message is dropped and a
+ * new one of the right size is allocated instead.
+ */
+static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
+				      struct ceph_msg_header *hdr,
+				      int *skip)
+{
+	struct ceph_mon_client *monc = con->private;
+	int type = le16_to_cpu(hdr->type);
+	int front_len = le32_to_cpu(hdr->front_len);
+	struct ceph_msg *m = NULL;
+
+	*skip = 0;
+
+	switch (type) {
+	case CEPH_MSG_MON_SUBSCRIBE_ACK:
+		m = ceph_msg_get(monc->m_subscribe_ack);
+		break;
+	case CEPH_MSG_STATFS_REPLY:
+		return get_generic_reply(con, hdr, skip);
+	case CEPH_MSG_AUTH_REPLY:
+		m = ceph_msg_get(monc->m_auth_reply);
+		break;
+	case CEPH_MSG_MON_GET_VERSION_REPLY:
+		if (le64_to_cpu(hdr->tid) != 0)
+			return get_generic_reply(con, hdr, skip);
+
+		/*
+		 * Older OSDs don't set reply tid even if the original
+		 * request had a non-zero tid.  Work around this weirdness
+		 * by falling through to the allocate case.
+		 *
+		 * (intentional fall through)
+		 */
+	case CEPH_MSG_MON_MAP:
+	case CEPH_MSG_MDS_MAP:
+	case CEPH_MSG_OSD_MAP:
+		m = ceph_msg_new(type, front_len, GFP_NOFS, false);
+		if (!m)
+			return NULL;	/* ENOMEM--return skip == 0 */
+		break;
+	}
+
+	if (!m) {
+		pr_info("alloc_msg unknown type %d\n", type);
+		*skip = 1;
+	} else if (front_len > m->front_alloc_len) {
+		pr_warn("mon_alloc_msg front %d > prealloc %d (%u#%llu)\n",
+			front_len, m->front_alloc_len,
+			(unsigned int)con->peer_name.type,
+			le64_to_cpu(con->peer_name.num));
+		ceph_msg_put(m);
+		m = ceph_msg_new(type, front_len, GFP_NOFS, false);
+	}
+
+	return m;
+}
+
+/*
+ * If the monitor connection resets, pick a new monitor and resubmit
+ * any pending requests.
+ *
+ * The session is closed in every case.  If we were not already hunting,
+ * hunting starts and a new session is opened right away; if we were,
+ * the retry is left to the delayed work.
+ */
+static void mon_fault(struct ceph_connection *con)
+{
+	struct ceph_mon_client *monc = con->private;
+
+	if (!monc)
+		return;
+
+	dout("mon_fault\n");
+	mutex_lock(&monc->mutex);
+	if (!con->private)	/* checked again now that the mutex is held */
+		goto out;
+
+	if (!monc->hunting)
+		pr_info("mon%d %s session lost, "
+			"hunting for new mon\n", monc->cur_mon,
+			ceph_pr_addr(&monc->con.peer_addr.in_addr));
+
+	__close_session(monc);
+	if (!monc->hunting) {
+		/* start hunting */
+		monc->hunting = true;
+		__open_session(monc);
+	} else {
+		/* already hunting, let's wait a bit */
+		__schedule_delayed(monc);
+	}
+out:
+	mutex_unlock(&monc->mutex);
+}
+
+/*
+ * We can ignore refcounting on the connection struct, as all references
+ * will come from the messenger workqueue, which is drained prior to
+ * mon_client destruction.
+ */ +static struct ceph_connection *con_get(struct ceph_connection *con) +{ + return con; +} + +static void con_put(struct ceph_connection *con) +{ +} + +static const struct ceph_connection_operations mon_con_ops = { + .get = con_get, + .put = con_put, + .dispatch = dispatch, + .fault = mon_fault, + .alloc_msg = mon_alloc_msg, +}; diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c new file mode 100644 index 000000000..ddec1c10a --- /dev/null +++ b/net/ceph/msgpool.c @@ -0,0 +1,83 @@ +#include + +#include +#include +#include +#include + +#include + +static void *msgpool_alloc(gfp_t gfp_mask, void *arg) +{ + struct ceph_msgpool *pool = arg; + struct ceph_msg *msg; + + msg = ceph_msg_new(pool->type, pool->front_len, gfp_mask, true); + if (!msg) { + dout("msgpool_alloc %s failed\n", pool->name); + } else { + dout("msgpool_alloc %s %p\n", pool->name, msg); + msg->pool = pool; + } + return msg; +} + +static void msgpool_free(void *element, void *arg) +{ + struct ceph_msgpool *pool = arg; + struct ceph_msg *msg = element; + + dout("msgpool_release %s %p\n", pool->name, msg); + msg->pool = NULL; + ceph_msg_put(msg); +} + +int ceph_msgpool_init(struct ceph_msgpool *pool, int type, + int front_len, int size, bool blocking, const char *name) +{ + dout("msgpool %s init\n", name); + pool->type = type; + pool->front_len = front_len; + pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool); + if (!pool->pool) + return -ENOMEM; + pool->name = name; + return 0; +} + +void ceph_msgpool_destroy(struct ceph_msgpool *pool) +{ + dout("msgpool %s destroy\n", pool->name); + mempool_destroy(pool->pool); +} + +struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, + int front_len) +{ + struct ceph_msg *msg; + + if (front_len > pool->front_len) { + dout("msgpool_get %s need front %d, pool size is %d\n", + pool->name, front_len, pool->front_len); + WARN_ON(1); + + /* try to alloc a fresh message */ + return ceph_msg_new(pool->type, front_len, GFP_NOFS, false); + } + + 
msg = mempool_alloc(pool->pool, GFP_NOFS); + dout("msgpool_get %s %p\n", pool->name, msg); + return msg; +} + +void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) +{ + dout("msgpool_put %s %p\n", pool->name, msg); + + /* reset msg front_len; user may have changed it */ + msg->front.iov_len = pool->front_len; + msg->hdr.front_len = cpu_to_le32(pool->front_len); + + kref_init(&msg->kref); /* retake single ref */ + mempool_free(msg, pool->pool); +} diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c new file mode 100644 index 000000000..c4ec92392 --- /dev/null +++ b/net/ceph/osd_client.c @@ -0,0 +1,3008 @@ + +#include + +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_BLOCK +#include +#endif + +#include +#include +#include +#include +#include +#include + +#define OSD_OP_FRONT_LEN 4096 +#define OSD_OPREPLY_FRONT_LEN 512 + +static struct kmem_cache *ceph_osd_request_cache; + +static const struct ceph_connection_operations osd_con_ops; + +static void __send_queued(struct ceph_osd_client *osdc); +static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd); +static void __register_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); +static void __unregister_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); +static void __unregister_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); +static void __enqueue_request(struct ceph_osd_request *req); +static void __send_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req); + +/* + * Implement client access to distributed object storage cluster. + * + * All data objects are stored within a cluster/cloud of OSDs, or + * "object storage devices." (Note that Ceph OSDs have _nothing_ to + * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply + * remote daemons serving up and coordinating consistent and safe + * access to storage. 
+ * + * Cluster membership and the mapping of data objects onto storage devices + * are described by the osd map. + * + * We keep track of pending OSD requests (read, write), resubmit + * requests to different OSDs when the cluster topology/data layout + * change, or retry the affected requests when the communications + * channel with an OSD is reset. + */ + +/* + * calculate the mapping of a file extent onto an object, and fill out the + * request accordingly. shorten extent as necessary if it crosses an + * object boundary. + * + * fill osd op in request message. + */ +static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, + u64 *objnum, u64 *objoff, u64 *objlen) +{ + u64 orig_len = *plen; + int r; + + /* object extent? */ + r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum, + objoff, objlen); + if (r < 0) + return r; + if (*objlen < orig_len) { + *plen = *objlen; + dout(" skipping last %llu, final file extent %llu~%llu\n", + orig_len - *plen, off, *plen); + } + + dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen); + + return 0; +} + +static void ceph_osd_data_init(struct ceph_osd_data *osd_data) +{ + memset(osd_data, 0, sizeof (*osd_data)); + osd_data->type = CEPH_OSD_DATA_TYPE_NONE; +} + +static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, + struct page **pages, u64 length, u32 alignment, + bool pages_from_pool, bool own_pages) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_PAGES; + osd_data->pages = pages; + osd_data->length = length; + osd_data->alignment = alignment; + osd_data->pages_from_pool = pages_from_pool; + osd_data->own_pages = own_pages; +} + +static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, + struct ceph_pagelist *pagelist) +{ + osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST; + osd_data->pagelist = pagelist; +} + +#ifdef CONFIG_BLOCK +static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data, + struct bio *bio, size_t bio_length) +{ + osd_data->type = 
CEPH_OSD_DATA_TYPE_BIO; + osd_data->bio = bio; + osd_data->bio_length = bio_length; +} +#endif /* CONFIG_BLOCK */ + +#define osd_req_op_data(oreq, whch, typ, fld) \ + ({ \ + BUG_ON(whch >= (oreq)->r_num_ops); \ + &(oreq)->r_ops[whch].typ.fld; \ + }) + +static struct ceph_osd_data * +osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which) +{ + BUG_ON(which >= osd_req->r_num_ops); + + return &osd_req->r_ops[which].raw_data_in; +} + +struct ceph_osd_data * +osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req, + unsigned int which) +{ + return osd_req_op_data(osd_req, which, extent, osd_data); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data); + +struct ceph_osd_data * +osd_req_op_cls_response_data(struct ceph_osd_request *osd_req, + unsigned int which) +{ + return osd_req_op_data(osd_req, which, cls, response_data); +} +EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */ + +void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, + u64 length, u32 alignment, + bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_raw_data_in(osd_req, which); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_raw_data_in_pages); + +void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, + u64 length, u32 alignment, + bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages); + +void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_pagelist *pagelist) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); 
+ ceph_osd_data_pagelist_init(osd_data, pagelist); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist); + +#ifdef CONFIG_BLOCK +void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, + unsigned int which, struct bio *bio, size_t bio_length) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); + ceph_osd_data_bio_init(osd_data, bio, bio_length); +} +EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); +#endif /* CONFIG_BLOCK */ + +static void osd_req_op_cls_request_info_pagelist( + struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_pagelist *pagelist) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, cls, request_info); + ceph_osd_data_pagelist_init(osd_data, pagelist); +} + +void osd_req_op_cls_request_data_pagelist( + struct ceph_osd_request *osd_req, + unsigned int which, struct ceph_pagelist *pagelist) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, cls, request_data); + ceph_osd_data_pagelist_init(osd_data, pagelist); +} +EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); + +void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, cls, request_data); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); +} +EXPORT_SYMBOL(osd_req_op_cls_request_data_pages); + +void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req, + unsigned int which, struct page **pages, u64 length, + u32 alignment, bool pages_from_pool, bool own_pages) +{ + struct ceph_osd_data *osd_data; + + osd_data = osd_req_op_data(osd_req, which, cls, response_data); + ceph_osd_data_pages_init(osd_data, pages, length, alignment, + pages_from_pool, own_pages); +} 
+EXPORT_SYMBOL(osd_req_op_cls_response_data_pages); + +static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data) +{ + switch (osd_data->type) { + case CEPH_OSD_DATA_TYPE_NONE: + return 0; + case CEPH_OSD_DATA_TYPE_PAGES: + return osd_data->length; + case CEPH_OSD_DATA_TYPE_PAGELIST: + return (u64)osd_data->pagelist->length; +#ifdef CONFIG_BLOCK + case CEPH_OSD_DATA_TYPE_BIO: + return (u64)osd_data->bio_length; +#endif /* CONFIG_BLOCK */ + default: + WARN(true, "unrecognized data type %d\n", (int)osd_data->type); + return 0; + } +} + +static void ceph_osd_data_release(struct ceph_osd_data *osd_data) +{ + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) { + int num_pages; + + num_pages = calc_pages_for((u64)osd_data->alignment, + (u64)osd_data->length); + ceph_release_page_vector(osd_data->pages, num_pages); + } + ceph_osd_data_init(osd_data); +} + +static void osd_req_op_data_release(struct ceph_osd_request *osd_req, + unsigned int which) +{ + struct ceph_osd_req_op *op; + + BUG_ON(which >= osd_req->r_num_ops); + op = &osd_req->r_ops[which]; + + switch (op->op) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_WRITE: + ceph_osd_data_release(&op->extent.osd_data); + break; + case CEPH_OSD_OP_CALL: + ceph_osd_data_release(&op->cls.request_info); + ceph_osd_data_release(&op->cls.request_data); + ceph_osd_data_release(&op->cls.response_data); + break; + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_CMPXATTR: + ceph_osd_data_release(&op->xattr.osd_data); + break; + default: + break; + } +} + +/* + * requests + */ +static void ceph_osdc_release_request(struct kref *kref) +{ + struct ceph_osd_request *req = container_of(kref, + struct ceph_osd_request, r_kref); + unsigned int which; + + dout("%s %p (r_request %p r_reply %p)\n", __func__, req, + req->r_request, req->r_reply); + WARN_ON(!RB_EMPTY_NODE(&req->r_node)); + WARN_ON(!list_empty(&req->r_req_lru_item)); + WARN_ON(!list_empty(&req->r_osd_item)); + WARN_ON(!list_empty(&req->r_linger_item)); + 
WARN_ON(!list_empty(&req->r_linger_osd_item)); + WARN_ON(req->r_osd); + + if (req->r_request) + ceph_msg_put(req->r_request); + if (req->r_reply) { + ceph_msg_revoke_incoming(req->r_reply); + ceph_msg_put(req->r_reply); + } + + for (which = 0; which < req->r_num_ops; which++) + osd_req_op_data_release(req, which); + + ceph_put_snap_context(req->r_snapc); + if (req->r_mempool) + mempool_free(req, req->r_osdc->req_mempool); + else + kmem_cache_free(ceph_osd_request_cache, req); + +} + +void ceph_osdc_get_request(struct ceph_osd_request *req) +{ + dout("%s %p (was %d)\n", __func__, req, + atomic_read(&req->r_kref.refcount)); + kref_get(&req->r_kref); +} +EXPORT_SYMBOL(ceph_osdc_get_request); + +void ceph_osdc_put_request(struct ceph_osd_request *req) +{ + dout("%s %p (was %d)\n", __func__, req, + atomic_read(&req->r_kref.refcount)); + kref_put(&req->r_kref, ceph_osdc_release_request); +} +EXPORT_SYMBOL(ceph_osdc_put_request); + +struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, + struct ceph_snap_context *snapc, + unsigned int num_ops, + bool use_mempool, + gfp_t gfp_flags) +{ + struct ceph_osd_request *req; + struct ceph_msg *msg; + size_t msg_size; + + BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX); + BUG_ON(num_ops > CEPH_OSD_MAX_OP); + + msg_size = 4 + 4 + 8 + 8 + 4+8; + msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ + msg_size += 1 + 8 + 4 + 4; /* pg_t */ + msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ + msg_size += 2 + num_ops*sizeof(struct ceph_osd_op); + msg_size += 8; /* snapid */ + msg_size += 8; /* snap_seq */ + msg_size += 8 * (snapc ? 
snapc->num_snaps : 0); /* snaps */ + msg_size += 4; + + if (use_mempool) { + req = mempool_alloc(osdc->req_mempool, gfp_flags); + memset(req, 0, sizeof(*req)); + } else { + req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags); + } + if (req == NULL) + return NULL; + + req->r_osdc = osdc; + req->r_mempool = use_mempool; + req->r_num_ops = num_ops; + + kref_init(&req->r_kref); + init_completion(&req->r_completion); + init_completion(&req->r_safe_completion); + RB_CLEAR_NODE(&req->r_node); + INIT_LIST_HEAD(&req->r_unsafe_item); + INIT_LIST_HEAD(&req->r_linger_item); + INIT_LIST_HEAD(&req->r_linger_osd_item); + INIT_LIST_HEAD(&req->r_req_lru_item); + INIT_LIST_HEAD(&req->r_osd_item); + + req->r_base_oloc.pool = -1; + req->r_target_oloc.pool = -1; + + /* create reply message */ + if (use_mempool) + msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); + else + msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, + OSD_OPREPLY_FRONT_LEN, gfp_flags, true); + if (!msg) { + ceph_osdc_put_request(req); + return NULL; + } + req->r_reply = msg; + + /* create request message; allow space for oid */ + if (use_mempool) + msg = ceph_msgpool_get(&osdc->msgpool_op, 0); + else + msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags, true); + if (!msg) { + ceph_osdc_put_request(req); + return NULL; + } + + memset(msg->front.iov_base, 0, msg->front.iov_len); + + req->r_request = msg; + + return req; +} +EXPORT_SYMBOL(ceph_osdc_alloc_request); + +static bool osd_req_opcode_valid(u16 opcode) +{ + switch (opcode) { +#define GENERATE_CASE(op, opcode, str) case CEPH_OSD_OP_##op: return true; +__CEPH_FORALL_OSD_OPS(GENERATE_CASE) +#undef GENERATE_CASE + default: + return false; + } +} + +/* + * This is an osd op init function for opcodes that have no data or + * other information associated with them. It also serves as a + * common init routine for all the other init functions, below. 
+ */ +static struct ceph_osd_req_op * +_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, + u16 opcode) +{ + struct ceph_osd_req_op *op; + + BUG_ON(which >= osd_req->r_num_ops); + BUG_ON(!osd_req_opcode_valid(opcode)); + + op = &osd_req->r_ops[which]; + memset(op, 0, sizeof (*op)); + op->op = opcode; + + return op; +} +/* + * Exported form of _osd_req_op_init(): initializes op @which with @opcode + * and discards the returned op pointer. + */ +void osd_req_op_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode) +{ + (void)_osd_req_op_init(osd_req, which, opcode); +} +EXPORT_SYMBOL(osd_req_op_init); +/* + * Initialize op @which as an extent op. @opcode must be READ, WRITE, ZERO + * or TRUNCATE (anything else trips the BUG_ON below). Only a WRITE adds + * @length to the op payload_len; for the other opcodes it stays 0. + */ +void osd_req_op_extent_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, + u64 offset, u64 length, + u64 truncate_size, u32 truncate_seq) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + size_t payload_len = 0; + + BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && + opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE); + + op->extent.offset = offset; + op->extent.length = length; + op->extent.truncate_size = truncate_size; + op->extent.truncate_seq = truncate_seq; + if (opcode == CEPH_OSD_OP_WRITE) + payload_len += length; + + op->payload_len = payload_len; +} +EXPORT_SYMBOL(osd_req_op_extent_init); +/* + * Shrink the extent length of op @which to @length and subtract the + * difference from payload_len. An unchanged length is a no-op; a larger + * one is a BUG. + * + * NOTE(review): payload_len is reduced regardless of the opcode, while + * osd_req_op_extent_init() only adds the length to payload_len for WRITE. + * Confirm this is only ever called on write ops. + */ +void osd_req_op_extent_update(struct ceph_osd_request *osd_req, + unsigned int which, u64 length) +{ + struct ceph_osd_req_op *op; + u64 previous; + + BUG_ON(which >= osd_req->r_num_ops); + op = &osd_req->r_ops[which]; + previous = op->extent.length; + + if (length == previous) + return; /* Nothing to do */ + BUG_ON(length > previous); + + op->extent.length = length; + op->payload_len -= previous - length; +} +EXPORT_SYMBOL(osd_req_op_extent_update); +/* + * Initialize op @which as a CEPH_OSD_OP_CALL op. The @class and @method + * names are appended to a newly allocated pagelist, which is then passed to + * osd_req_op_cls_request_info_pagelist(). A failed pagelist allocation, or + * a name longer than U8_MAX, is a BUG. + * + * NOTE(review): if ceph_pagelist_append() can fail, that is not detected + * here. + */ +void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, + u16 opcode, const char *class, const char *method) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + struct ceph_pagelist *pagelist; + size_t payload_len = 0; + size_t size; + + BUG_ON(opcode != CEPH_OSD_OP_CALL); + +
pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); + BUG_ON(!pagelist); + ceph_pagelist_init(pagelist); + + op->cls.class_name = class; + size = strlen(class); + BUG_ON(size > (size_t) U8_MAX); + op->cls.class_len = size; + ceph_pagelist_append(pagelist, class, size); + payload_len += size; + + op->cls.method_name = method; + size = strlen(method); + BUG_ON(size > (size_t) U8_MAX); + op->cls.method_len = size; + ceph_pagelist_append(pagelist, method, size); + payload_len += size; + + osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); + + op->cls.argc = 0; /* currently unused */ + + op->payload_len = payload_len; +} +EXPORT_SYMBOL(osd_req_op_cls_init); +/* + * Initialize op @which as a SETXATTR or CMPXATTR op. @name followed by the + * @size bytes at @value are appended to a newly allocated pagelist, which + * becomes the op xattr.osd_data. Returns 0 on success or -ENOMEM if the + * pagelist structure cannot be allocated. + * + * NOTE(review): if ceph_pagelist_append() can fail, that is not detected + * here. + */ +int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, + u16 opcode, const char *name, const void *value, + size_t size, u8 cmp_op, u8 cmp_mode) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + struct ceph_pagelist *pagelist; + size_t payload_len; + + BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); + + pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); + if (!pagelist) + return -ENOMEM; + + ceph_pagelist_init(pagelist); + + payload_len = strlen(name); + op->xattr.name_len = payload_len; + ceph_pagelist_append(pagelist, name, payload_len); + + op->xattr.value_len = size; + ceph_pagelist_append(pagelist, value, size); + payload_len += size; + + op->xattr.cmp_op = cmp_op; + op->xattr.cmp_mode = cmp_mode; + + ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); + op->payload_len = payload_len; + return 0; +} +EXPORT_SYMBOL(osd_req_op_xattr_init); +/* + * Initialize op @which as a WATCH or NOTIFY_ACK op carrying @cookie and + * @version. watch.flag is set to 1 only for a WATCH op with non-zero @flag; + * otherwise it keeps the 0 left by the memset in _osd_req_op_init(). + */ +void osd_req_op_watch_init(struct ceph_osd_request *osd_req, + unsigned int which, u16 opcode, + u64 cookie, u64 version, int flag) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); + + BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH); + + op->watch.cookie = cookie; + op->watch.ver = version; + if (opcode == CEPH_OSD_OP_WATCH &&
flag) + op->watch.flag = (u8)1; +} +EXPORT_SYMBOL(osd_req_op_watch_init); +/* + * Initialize op @which as a CEPH_OSD_OP_SETALLOCHINT op with the given + * expected object and write sizes, and mark it FAILOK. + */ +void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, + unsigned int which, + u64 expected_object_size, + u64 expected_write_size) +{ + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, + CEPH_OSD_OP_SETALLOCHINT); + + op->alloc_hint.expected_object_size = expected_object_size; + op->alloc_hint.expected_write_size = expected_write_size; + + /* + * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed + * not worth a feature bit. Set FAILOK per-op flag to make + * sure older osds don't trip over an unsupported opcode. + */ + op->flags |= CEPH_OSD_OP_FLAG_FAILOK; +} +EXPORT_SYMBOL(osd_req_op_alloc_hint_init); +/* + * Attach @osd_data to @msg according to its type: pages (skipped when the + * length is 0), pagelist (a zero length is a BUG) or, with CONFIG_BLOCK, + * bio. Any other type must be CEPH_OSD_DATA_TYPE_NONE. + */ +static void ceph_osdc_msg_data_add(struct ceph_msg *msg, + struct ceph_osd_data *osd_data) +{ + u64 length = ceph_osd_data_length(osd_data); + + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { + BUG_ON(length > (u64) SIZE_MAX); + if (length) + ceph_msg_data_add_pages(msg, osd_data->pages, + length, osd_data->alignment); + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) { + BUG_ON(!length); + ceph_msg_data_add_pagelist(msg, osd_data->pagelist); +#ifdef CONFIG_BLOCK + } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) { + ceph_msg_data_add_bio(msg, osd_data->bio, length); +#endif + } else { + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE); + } +} +/* + * Encode op @which of @req into the little-endian @dst and attach the op + * data items to the request or reply message of @req. Returns the number + * of request data bytes contributed by the op, or 0 (after a warning) for + * an unrecognized or unsupported opcode. + * + * NOTE(review): for a CALL op with request data this adds data_length to + * src->payload_len, so encoding the same op twice would count it twice. + */ +static u64 osd_req_encode_op(struct ceph_osd_request *req, + struct ceph_osd_op *dst, unsigned int which) +{ + struct ceph_osd_req_op *src; + struct ceph_osd_data *osd_data; + u64 request_data_len = 0; + u64 data_length; + + BUG_ON(which >= req->r_num_ops); + src = &req->r_ops[which]; + if (WARN_ON(!osd_req_opcode_valid(src->op))) { + pr_err("unrecognized osd opcode %d\n", src->op); + + return 0; + } + + switch (src->op) { + case CEPH_OSD_OP_STAT: + osd_data = &src->raw_data_in; + ceph_osdc_msg_data_add(req->r_reply, osd_data); + break; + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_WRITE: + case
CEPH_OSD_OP_ZERO: + case CEPH_OSD_OP_TRUNCATE: + if (src->op == CEPH_OSD_OP_WRITE) + request_data_len = src->extent.length; + dst->extent.offset = cpu_to_le64(src->extent.offset); + dst->extent.length = cpu_to_le64(src->extent.length); + dst->extent.truncate_size = + cpu_to_le64(src->extent.truncate_size); + dst->extent.truncate_seq = + cpu_to_le32(src->extent.truncate_seq); + osd_data = &src->extent.osd_data; + if (src->op == CEPH_OSD_OP_WRITE) + ceph_osdc_msg_data_add(req->r_request, osd_data); + else + ceph_osdc_msg_data_add(req->r_reply, osd_data); + break; + case CEPH_OSD_OP_CALL: + dst->cls.class_len = src->cls.class_len; + dst->cls.method_len = src->cls.method_len; + osd_data = &src->cls.request_info; + ceph_osdc_msg_data_add(req->r_request, osd_data); + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST); + request_data_len = osd_data->pagelist->length; + + osd_data = &src->cls.request_data; + data_length = ceph_osd_data_length(osd_data); + if (data_length) { + BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE); + dst->cls.indata_len = cpu_to_le32(data_length); + ceph_osdc_msg_data_add(req->r_request, osd_data); + src->payload_len += data_length; + request_data_len += data_length; + } + osd_data = &src->cls.response_data; + ceph_osdc_msg_data_add(req->r_reply, osd_data); + break; + case CEPH_OSD_OP_STARTSYNC: + break; + case CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_WATCH: + dst->watch.cookie = cpu_to_le64(src->watch.cookie); + dst->watch.ver = cpu_to_le64(src->watch.ver); + dst->watch.flag = src->watch.flag; + break; + case CEPH_OSD_OP_SETALLOCHINT: + dst->alloc_hint.expected_object_size = + cpu_to_le64(src->alloc_hint.expected_object_size); + dst->alloc_hint.expected_write_size = + cpu_to_le64(src->alloc_hint.expected_write_size); + break; + case CEPH_OSD_OP_SETXATTR: + case CEPH_OSD_OP_CMPXATTR: + dst->xattr.name_len = cpu_to_le32(src->xattr.name_len); + dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); + dst->xattr.cmp_op = src->xattr.cmp_op;
+ dst->xattr.cmp_mode = src->xattr.cmp_mode; + osd_data = &src->xattr.osd_data; + ceph_osdc_msg_data_add(req->r_request, osd_data); + request_data_len = osd_data->pagelist->length; + break; + case CEPH_OSD_OP_CREATE: + case CEPH_OSD_OP_DELETE: + break; + default: + pr_err("unsupported osd opcode %s\n", + ceph_osd_op_name(src->op)); + WARN_ON(1); + + return 0; + } + + dst->op = cpu_to_le16(src->op); + dst->flags = cpu_to_le32(src->flags); + dst->payload_len = cpu_to_le32(src->payload_len); + + return request_data_len; +} + +/* + * build new request AND message, calculate layout, and adjust file + * extent as needed. + * + * if the file was recently truncated, we include information about its + * old and new size so that the object can be updated appropriately. (we + * avoid synchronously deleting truncated objects because it's slow.) + * + * if @do_sync, include a 'startsync' command so that the osd will flush + * data quickly. + * + * NOTE(review): this signature has no @do_sync parameter and the body adds + * no startsync op, so the paragraph above looks stale. + * + * @opcode must be READ, WRITE, ZERO, TRUNCATE, CREATE or DELETE. CREATE and + * DELETE get a plain op; the others get an extent op for the object range + * computed by calc_layout(). Returns the new request, or an ERR_PTR() if + * the allocation or calc_layout() fails. + */ +struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, + struct ceph_file_layout *layout, + struct ceph_vino vino, + u64 off, u64 *plen, + unsigned int which, int num_ops, + int opcode, int flags, + struct ceph_snap_context *snapc, + u32 truncate_seq, + u64 truncate_size, + bool use_mempool) +{ + struct ceph_osd_request *req; + u64 objnum = 0; + u64 objoff = 0; + u64 objlen = 0; + int r; + + BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && + opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE && + opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE); + + req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, + GFP_NOFS); + if (!req) + return ERR_PTR(-ENOMEM); + + req->r_flags = flags; + + /* calculate max write size */ + r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen); + if (r < 0) { + ceph_osdc_put_request(req); + return ERR_PTR(r); + } + + if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { + osd_req_op_init(req, which, opcode); + } else
{ + u32 object_size = le32_to_cpu(layout->fl_object_size); + u32 object_base = off - objoff; + if (!(truncate_seq == 1 && truncate_size == -1ULL)) { + if (truncate_size <= object_base) { + truncate_size = 0; + } else { + truncate_size -= object_base; + if (truncate_size > object_size) + truncate_size = object_size; + } + } + osd_req_op_extent_init(req, which, opcode, objoff, objlen, + truncate_size, truncate_seq); + } + + req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); + + snprintf(req->r_base_oid.name, sizeof(req->r_base_oid.name), + "%llx.%08llx", vino.ino, objnum); + req->r_base_oid.name_len = strlen(req->r_base_oid.name); + + return req; +} +EXPORT_SYMBOL(ceph_osdc_new_request); + +/* + * We keep osd requests in an rbtree, sorted by ->r_tid. + * + * Insert @new into osdc->requests; a request with the same tid already in + * the tree is a BUG. + */ +static void __insert_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *new) +{ + struct rb_node **p = &osdc->requests.rb_node; + struct rb_node *parent = NULL; + struct ceph_osd_request *req = NULL; + + while (*p) { + parent = *p; + req = rb_entry(parent, struct ceph_osd_request, r_node); + if (new->r_tid < req->r_tid) + p = &(*p)->rb_left; + else if (new->r_tid > req->r_tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->r_node, parent, p); + rb_insert_color(&new->r_node, &osdc->requests); +} +/* + * Return the request in osdc->requests whose r_tid equals @tid, or NULL if + * there is none. + */ +static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc, + u64 tid) +{ + struct ceph_osd_request *req; + struct rb_node *n = osdc->requests.rb_node; + + while (n) { + req = rb_entry(n, struct ceph_osd_request, r_node); + if (tid < req->r_tid) + n = n->rb_left; + else if (tid > req->r_tid) + n = n->rb_right; + else + return req; + } + return NULL; +} +/* + * Like __lookup_request(), except that when the descent would go left from + * a node that has no left child, that node (whose r_tid is greater than + * @tid) is returned. Returns NULL if the descent runs off the right side. + */ +static struct ceph_osd_request * +__lookup_request_ge(struct ceph_osd_client *osdc, + u64 tid) +{ + struct ceph_osd_request *req; + struct rb_node *n = osdc->requests.rb_node; + + while (n) { + req = rb_entry(n, struct ceph_osd_request, r_node); + if (tid < req->r_tid) { + if (!n->rb_left) + return req; + n =
n->rb_left; + } else if (tid > req->r_tid) { + n = n->rb_right; + } else { + return req; + } + } + return NULL; +} +/* + * Re-register @req (as a normal request) so that it gets a new tid, then + * requeue it on the osd it was assigned to on entry. + */ +static void __kick_linger_request(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + struct ceph_osd *osd = req->r_osd; + + /* + * Linger requests need to be resent with a new tid to avoid + * the dup op detection logic on the OSDs. Achieve this with + * a re-register dance instead of open-coding. + */ + ceph_osdc_get_request(req); + if (!list_empty(&req->r_linger_item)) + __unregister_linger_request(osdc, req); + else + __unregister_request(osdc, req); + __register_request(osdc, req); + ceph_osdc_put_request(req); + + /* + * Unless request has been registered as both normal and + * lingering, __unregister{,_linger}_request clears r_osd. + * However, here we need to preserve r_osd to make sure we + * requeue on the same OSD. + */ + WARN_ON(req->r_osd || !osd); + req->r_osd = osd; + + dout("%s requeueing %p tid %llu\n", __func__, req, req->r_tid); + __enqueue_request(req); +} + +/* + * Resubmit requests pending on the given osd. + * + * Nothing is requeued if __reset_osd() returns an error. + */ +static void __kick_osd_requests(struct ceph_osd_client *osdc, + struct ceph_osd *osd) +{ + struct ceph_osd_request *req, *nreq; + LIST_HEAD(resend); + LIST_HEAD(resend_linger); + int err; + + dout("%s osd%d\n", __func__, osd->o_osd); + err = __reset_osd(osdc, osd); + if (err) + return; + + /* + * Build up a list of requests to resend by traversing the + * osd's list of requests. Requests for a given object are + * sent in tid order, and that is also the order they're + * kept on this list. Therefore all requests that are in + * flight will be found first, followed by all requests that + * have not yet been sent. And to resend requests while + * preserving this order we will want to put any sent + * requests back on the front of the osd client's unsent + * list.
+ * + * So we build a separate ordered list of already-sent + * requests for the affected osd and splice it onto the + * front of the osd client's unsent list. Once we've seen a + * request that has not yet been sent we're done. Those + * requests are already sitting right where they belong. + */ + list_for_each_entry(req, &osd->o_requests, r_osd_item) { + if (!req->r_sent) + break; + + if (!req->r_linger) { + dout("%s requeueing %p tid %llu\n", __func__, req, + req->r_tid); + list_move_tail(&req->r_req_lru_item, &resend); + req->r_flags |= CEPH_OSD_FLAG_RETRY; + } else { + list_move_tail(&req->r_req_lru_item, &resend_linger); + } + } + list_splice(&resend, &osdc->req_unsent); + + /* + * Both registered and not yet registered linger requests are + * enqueued with a new tid on the same OSD. We add/move them + * to req_unsent/o_requests at the end to keep things in tid + * order. + */ + list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, + r_linger_osd_item) { + WARN_ON(!list_empty(&req->r_req_lru_item)); + __kick_linger_request(req); + } + + list_for_each_entry_safe(req, nreq, &resend_linger, r_req_lru_item) + __kick_linger_request(req); +} + +/* + * If the osd connection drops, we need to resubmit all requests. + * + * Does nothing if @con has no osd attached (con->private is NULL). + * Otherwise takes map_sem for read and request_mutex around the kick and + * the resend of queued requests. + */ +static void osd_reset(struct ceph_connection *con) +{ + struct ceph_osd *osd = con->private; + struct ceph_osd_client *osdc; + + if (!osd) + return; + dout("osd_reset osd%d\n", osd->o_osd); + osdc = osd->o_osdc; + down_read(&osdc->map_sem); + mutex_lock(&osdc->request_mutex); + __kick_osd_requests(osdc, osd); + __send_queued(osdc); + mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); +} + +/* + * Track open sessions with osds.
+ */ +static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) +{ + struct ceph_osd *osd; + + osd = kzalloc(sizeof(*osd), GFP_NOFS); + if (!osd) + return NULL; + + atomic_set(&osd->o_ref, 1); + osd->o_osdc = osdc; + osd->o_osd = onum; + RB_CLEAR_NODE(&osd->o_node); + INIT_LIST_HEAD(&osd->o_requests); + INIT_LIST_HEAD(&osd->o_linger_requests); + INIT_LIST_HEAD(&osd->o_osd_lru); + osd->o_incarnation = 1; + + ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr); + + INIT_LIST_HEAD(&osd->o_keepalive_item); + return osd; +} +/* + * Take a reference on @osd unless its refcount has already dropped to + * zero. Returns @osd on success, NULL otherwise. + */ +static struct ceph_osd *get_osd(struct ceph_osd *osd) +{ + if (atomic_inc_not_zero(&osd->o_ref)) { + dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1, + atomic_read(&osd->o_ref)); + return osd; + } else { + dout("get_osd %p FAIL\n", osd); + return NULL; + } +} +/* + * Drop a reference on @osd. On the last put the authorizer, if any, is + * destroyed and the osd is freed. + */ +static void put_osd(struct ceph_osd *osd) +{ + dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), + atomic_read(&osd->o_ref) - 1); + if (atomic_dec_and_test(&osd->o_ref)) { + struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; + + if (osd->o_auth.authorizer) + ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer); + kfree(osd); + } +} + +/* + * remove an osd from our map + * + * Unlinks @osd from the lru list and from the osdc->osds rbtree. Warns if + * the osd still has requests or linger requests attached. + */ +static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) +{ + dout("%s %p osd%d\n", __func__, osd, osd->o_osd); + WARN_ON(!list_empty(&osd->o_requests)); + WARN_ON(!list_empty(&osd->o_linger_requests)); + + list_del_init(&osd->o_osd_lru); + rb_erase(&osd->o_node, &osdc->osds); + RB_CLEAR_NODE(&osd->o_node); +} +/* + * If @osd is still linked in the rbtree: close its connection, unlink it + * and drop a reference. A second call on the same osd is a no-op. + */ +static void remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) +{ + dout("%s %p osd%d\n", __func__, osd, osd->o_osd); + + if (!RB_EMPTY_NODE(&osd->o_node)) { + ceph_con_close(&osd->o_con); + __remove_osd(osdc, osd); + put_osd(osd); + } +} +/* + * Remove every osd in osdc->osds, holding request_mutex. + */ +static void remove_all_osds(struct ceph_osd_client *osdc) +{ + dout("%s %p\n", __func__, osdc); + mutex_lock(&osdc->request_mutex); + while (!RB_EMPTY_ROOT(&osdc->osds))
{ + struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds), + struct ceph_osd, o_node); + remove_osd(osdc, osd); + } + mutex_unlock(&osdc->request_mutex); +} +/* + * Append @osd to the tail of osdc->osd_lru and set its lru_ttl to + * osd_idle_ttl seconds from now. The osd must not already be on the lru. + */ +static void __move_osd_to_lru(struct ceph_osd_client *osdc, + struct ceph_osd *osd) +{ + dout("%s %p\n", __func__, osd); + BUG_ON(!list_empty(&osd->o_osd_lru)); + + list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); + osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ; +} +/* + * Put @osd on the lru only if it has neither requests nor linger requests. + */ +static void maybe_move_osd_to_lru(struct ceph_osd_client *osdc, + struct ceph_osd *osd) +{ + dout("%s %p\n", __func__, osd); + + if (list_empty(&osd->o_requests) && + list_empty(&osd->o_linger_requests)) + __move_osd_to_lru(osdc, osd); +} +/* + * Take @osd off the lru list if it is on it. + */ +static void __remove_osd_from_lru(struct ceph_osd *osd) +{ + dout("__remove_osd_from_lru %p\n", osd); + if (!list_empty(&osd->o_osd_lru)) + list_del_init(&osd->o_osd_lru); +} +/* + * Under request_mutex, walk osdc->osd_lru from the head and remove each osd + * whose lru_ttl has passed, stopping at the first one that has not expired. + */ +static void remove_old_osds(struct ceph_osd_client *osdc) +{ + struct ceph_osd *osd, *nosd; + + dout("__remove_old_osds %p\n", osdc); + mutex_lock(&osdc->request_mutex); + list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) { + if (time_before(jiffies, osd->lru_ttl)) + break; + remove_osd(osdc, osd); + } + mutex_unlock(&osdc->request_mutex); +} + +/* + * reset osd connection + * + * Returns -ENODEV if the osd has no requests of either kind (it is removed + * instead), -EAGAIN if the osd address in the osdmap is unchanged and the + * connection was never opened (the messenger is left to retry), or 0 after + * closing and reopening the connection and bumping o_incarnation. + */ +static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd) +{ + struct ceph_entity_addr *peer_addr; + + dout("__reset_osd %p osd%d\n", osd, osd->o_osd); + if (list_empty(&osd->o_requests) && + list_empty(&osd->o_linger_requests)) { + remove_osd(osdc, osd); + return -ENODEV; + } + + peer_addr = &osdc->osdmap->osd_addr[osd->o_osd]; + if (!memcmp(peer_addr, &osd->o_con.peer_addr, sizeof (*peer_addr)) && + !ceph_con_opened(&osd->o_con)) { + struct ceph_osd_request *req; + + dout("osd addr hasn't changed and connection never opened, " + "letting msgr retry\n"); + /* touch each r_stamp for handle_timeout()'s benefit */ + list_for_each_entry(req, &osd->o_requests, r_osd_item) + req->r_stamp = jiffies; + + return -EAGAIN;
+ } + + ceph_con_close(&osd->o_con); + ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd, peer_addr); + osd->o_incarnation++; + + return 0; +} +/* + * Insert @new into the osdc->osds rbtree, keyed by o_osd. An osd with the + * same id already in the tree is a BUG. + */ +static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new) +{ + struct rb_node **p = &osdc->osds.rb_node; + struct rb_node *parent = NULL; + struct ceph_osd *osd = NULL; + + dout("__insert_osd %p osd%d\n", new, new->o_osd); + while (*p) { + parent = *p; + osd = rb_entry(parent, struct ceph_osd, o_node); + if (new->o_osd < osd->o_osd) + p = &(*p)->rb_left; + else if (new->o_osd > osd->o_osd) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->o_node, parent, p); + rb_insert_color(&new->o_node, &osdc->osds); +} +/* + * Return the osd with id @o from the osdc->osds rbtree, or NULL if absent. + */ +static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o) +{ + struct ceph_osd *osd; + struct rb_node *n = osdc->osds.rb_node; + + while (n) { + osd = rb_entry(n, struct ceph_osd, o_node); + if (o < osd->o_osd) + n = n->rb_left; + else if (o > osd->o_osd) + n = n->rb_right; + else + return osd; + } + return NULL; +} +/* + * Schedule timeout_work to run osd_keepalive_timeout seconds from now. + */ +static void __schedule_osd_timeout(struct ceph_osd_client *osdc) +{ + schedule_delayed_work(&osdc->timeout_work, + osdc->client->options->osd_keepalive_timeout * HZ); +} +/* + * Cancel a pending timeout_work. + */ +static void __cancel_osd_timeout(struct ceph_osd_client *osdc) +{ + cancel_delayed_work(&osdc->timeout_work); +} + +/* + * Register request, assign tid. If this is the first request, set up + * the timeout event.
+ */ +static void __register_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + req->r_tid = ++osdc->last_tid; + req->r_request->hdr.tid = cpu_to_le64(req->r_tid); + dout("__register_request %p tid %lld\n", req, req->r_tid); + __insert_request(osdc, req); + ceph_osdc_get_request(req); + osdc->num_requests++; + if (osdc->num_requests == 1) { + dout(" first request, scheduling timeout\n"); + __schedule_osd_timeout(osdc); + } +} + +/* + * called under osdc->request_mutex + * + * Undo __register_request(): erase @req from the rbtree, revoke its message + * and unlink it from its osd (if any), take it off its lru list and drop + * the reference taken at registration. r_osd is kept if the request is + * still on the osd linger list. A request that is not in the rbtree is + * left untouched. The timeout is cancelled when no requests remain. + */ +static void __unregister_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + if (RB_EMPTY_NODE(&req->r_node)) { + dout("__unregister_request %p tid %lld not registered\n", + req, req->r_tid); + return; + } + + dout("__unregister_request %p tid %lld\n", req, req->r_tid); + rb_erase(&req->r_node, &osdc->requests); + RB_CLEAR_NODE(&req->r_node); + osdc->num_requests--; + + if (req->r_osd) { + /* make sure the original request isn't in flight. */ + ceph_msg_revoke(req->r_request); + + list_del_init(&req->r_osd_item); + maybe_move_osd_to_lru(osdc, req->r_osd); + if (list_empty(&req->r_linger_osd_item)) + req->r_osd = NULL; + } + + list_del_init(&req->r_req_lru_item); + ceph_osdc_put_request(req); + + if (osdc->num_requests == 0) { + dout(" no requests, canceling timeout\n"); + __cancel_osd_timeout(osdc); + } +} + +/* + * Cancel a previously queued request message + * + * Only acts if the request was sent and still has an osd: the message is + * revoked and r_sent is cleared. + */ +static void __cancel_request(struct ceph_osd_request *req) +{ + if (req->r_sent && req->r_osd) { + ceph_msg_revoke(req->r_request); + req->r_sent = 0; + } +} +/* + * Take a reference on @req and add it to osdc->req_linger and, if it has an + * osd, to that osd linger list. Warns if r_linger is not set. + */ +static void __register_linger_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + dout("%s %p tid %llu\n", __func__, req, req->r_tid); + WARN_ON(!req->r_linger); + + ceph_osdc_get_request(req); + list_add_tail(&req->r_linger_item, &osdc->req_linger); + if (req->r_osd) + list_add_tail(&req->r_linger_osd_item, + &req->r_osd->o_linger_requests); +} +/* + * Undo __register_linger_request(). A request that is not on req_linger is + * left untouched. r_osd is kept if the request is still on the osd list of + * normal requests. + */ +static void __unregister_linger_request(struct
ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + WARN_ON(!req->r_linger); + + if (list_empty(&req->r_linger_item)) { + dout("%s %p tid %llu not registered\n", __func__, req, + req->r_tid); + return; + } + + dout("%s %p tid %llu\n", __func__, req, req->r_tid); + list_del_init(&req->r_linger_item); + + if (req->r_osd) { + list_del_init(&req->r_linger_osd_item); + maybe_move_osd_to_lru(osdc, req->r_osd); + if (list_empty(&req->r_osd_item)) + req->r_osd = NULL; + } + ceph_osdc_put_request(req); +} +/* + * Mark @req as a linger request by setting r_linger (once). @osdc is not + * used by the body. + */ +void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + if (!req->r_linger) { + dout("set_request_linger %p\n", req); + req->r_linger = 1; + } +} +EXPORT_SYMBOL(ceph_osdc_set_request_linger); + +/* + * Returns whether a request should be blocked from being sent + * based on the current osdmap and osd_client settings. + * + * Reads are blocked by the PAUSERD osdmap flag; writes are blocked by + * either PAUSEWR or FULL. + * + * Caller should hold map_sem for read. + */ +static bool __req_should_be_paused(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + bool pauserd = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD); + bool pausewr = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + return (req->r_flags & CEPH_OSD_FLAG_READ && pauserd) || + (req->r_flags & CEPH_OSD_FLAG_WRITE && pausewr); +} + +/* + * Calculate mapping of a request to a PG. Takes tiering into account.
+ */ +static int __calc_request_pg(struct ceph_osdmap *osdmap, + struct ceph_osd_request *req, + struct ceph_pg *pg_out) +{ + bool need_check_tiering; + + need_check_tiering = false; + if (req->r_target_oloc.pool == -1) { + req->r_target_oloc = req->r_base_oloc; /* struct copy */ + need_check_tiering = true; + } + if (req->r_target_oid.name_len == 0) { + ceph_oid_copy(&req->r_target_oid, &req->r_base_oid); + need_check_tiering = true; + } + + /* tiering is only applied when the target was just filled in above */ + if (need_check_tiering && + (req->r_flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { + struct ceph_pg_pool_info *pi; + + pi = ceph_pg_pool_by_id(osdmap, req->r_target_oloc.pool); + if (pi) { + if ((req->r_flags & CEPH_OSD_FLAG_READ) && + pi->read_tier >= 0) + req->r_target_oloc.pool = pi->read_tier; + if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + pi->write_tier >= 0) + req->r_target_oloc.pool = pi->write_tier; + } + /* !pi is caught in ceph_oloc_oid_to_pg() */ + } + + return ceph_oloc_oid_to_pg(osdmap, &req->r_target_oloc, + &req->r_target_oid, pg_out); +} +/* + * Queue @req for sending. With an osd assigned, the osd is taken off the + * lru, the request is added to the osd request list and moved to the tail + * of req_unsent. Without an osd it is moved to the tail of req_notarget. + */ +static void __enqueue_request(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + + dout("%s %p tid %llu to osd%d\n", __func__, req, req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1); + + if (req->r_osd) { + __remove_osd_from_lru(req->r_osd); + list_add_tail(&req->r_osd_item, &req->r_osd->o_requests); + list_move_tail(&req->r_req_lru_item, &osdc->req_unsent); + } else { + list_move_tail(&req->r_req_lru_item, &osdc->req_notarget); + } +} + +/* + * Pick an osd (the first 'up' osd in the pg), allocate the osd struct + * (as needed), and set the request r_osd appropriately. If there is + * no up osd, set r_osd to NULL. Move the request to the appropriate list + * (unsent, homeless) or leave on in-flight lru. + * + * Return 0 if unchanged, 1 if changed, or negative on error. + * + * Caller should hold map_sem for read and request_mutex.
+ */ +static int __map_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req, int force_resend) +{ + struct ceph_pg pgid; + int acting[CEPH_PG_MAX_SIZE]; + int num, o; + int err; + bool was_paused; + + dout("map_request %p tid %lld\n", req, req->r_tid); + + err = __calc_request_pg(osdc->osdmap, req, &pgid); + if (err) { + list_move(&req->r_req_lru_item, &osdc->req_notarget); + return err; + } + req->r_pgid = pgid; + + num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o); + if (num < 0) + num = 0; + + /* a request that has just been unpaused must be (re)sent */ + was_paused = req->r_paused; + req->r_paused = __req_should_be_paused(osdc, req); + if (was_paused && !req->r_paused) + force_resend = 1; + + if ((!force_resend && + req->r_osd && req->r_osd->o_osd == o && + req->r_sent >= req->r_osd->o_incarnation && + req->r_num_pg_osds == num && + memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) || + (req->r_osd == NULL && o == -1) || + req->r_paused) + return 0; /* no change */ + + dout("map_request tid %llu pgid %lld.%x osd%d (was osd%d)\n", + req->r_tid, pgid.pool, pgid.seed, o, + req->r_osd ?
req->r_osd->o_osd : -1); + + /* record full pg acting set */ + memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num); + req->r_num_pg_osds = num; + + /* detach from the previous osd before looking up the new one */ + if (req->r_osd) { + __cancel_request(req); + list_del_init(&req->r_osd_item); + list_del_init(&req->r_linger_osd_item); + req->r_osd = NULL; + } + + req->r_osd = __lookup_osd(osdc, o); + if (!req->r_osd && o >= 0) { + err = -ENOMEM; + req->r_osd = create_osd(osdc, o); + if (!req->r_osd) { + list_move(&req->r_req_lru_item, &osdc->req_notarget); + goto out; + } + + dout("map_request osd %p is osd%d\n", req->r_osd, o); + __insert_osd(osdc, req->r_osd); + + ceph_con_open(&req->r_osd->o_con, + CEPH_ENTITY_TYPE_OSD, o, + &osdc->osdmap->osd_addr[o]); + } + + __enqueue_request(req); + err = 1; /* osd or pg changed */ + +out: + return err; +} + +/* + * Fill in the fields of the request message that change on every send + * (osdmap epoch, flags, target pool, pgid, attempts, reassert version), + * stamp the request, move it to the tail of req_lru and hand the message + * to the connection of its osd. r_sent records the osd incarnation. + * + * req->r_osd is dereferenced unconditionally, so it must be set. + * + * caller should hold map_sem (for read) and request_mutex + */ +static void __send_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + void *p; + + dout("send_request %p tid %llu to osd%d flags %d pg %lld.%x\n", + req, req->r_tid, req->r_osd->o_osd, req->r_flags, + (unsigned long long)req->r_pgid.pool, req->r_pgid.seed); + + /* fill in message content that changes each time we send it */ + put_unaligned_le32(osdc->osdmap->epoch, req->r_request_osdmap_epoch); + put_unaligned_le32(req->r_flags, req->r_request_flags); + put_unaligned_le64(req->r_target_oloc.pool, req->r_request_pool); + p = req->r_request_pgid; + ceph_encode_64(&p, req->r_pgid.pool); + ceph_encode_32(&p, req->r_pgid.seed); + put_unaligned_le64(1, req->r_request_attempts); /* FIXME */ + memcpy(req->r_request_reassert_version, &req->r_reassert_version, + sizeof(req->r_reassert_version)); + + req->r_stamp = jiffies; + list_move_tail(&req->r_req_lru_item, &osdc->req_lru); + + ceph_msg_get(req->r_request); /* send consumes a ref */ + + req->r_sent = req->r_osd->o_incarnation; + + ceph_con_send(&req->r_osd->o_con, req->r_request); +} + +/* + * Send any requests in the queue (req_unsent).
+ */ +static void __send_queued(struct ceph_osd_client *osdc) +{ + struct ceph_osd_request *req, *tmp; + + dout("__send_queued\n"); + list_for_each_entry_safe(req, tmp, &osdc->req_unsent, r_req_lru_item) + __send_request(osdc, req); +} + +/* + * Register @req, map it to an osd and send whatever is queued. + * + * If mapping fails and @nofail is set, the request stays registered and 0 + * is returned; otherwise it is unregistered and the error is returned. If + * the mapping yields no osd, the next osdmap is requested instead of + * sending. + * + * Caller should hold map_sem for read and request_mutex. + */ +static int __ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req, + bool nofail) +{ + int rc; + + __register_request(osdc, req); + req->r_sent = 0; + req->r_got_reply = 0; + rc = __map_request(osdc, req, 0); + if (rc < 0) { + if (nofail) { + dout("osdc_start_request failed map, " + " will retry %lld\n", req->r_tid); + rc = 0; + } else { + __unregister_request(osdc, req); + } + return rc; + } + + if (req->r_osd == NULL) { + dout("send_request %p no up osds in pg\n", req); + ceph_monc_request_next_osdmap(&osdc->client->monc); + } else { + __send_queued(osdc); + } + + return 0; +} + +/* + * Timeout callback, called every N seconds when 1 or more osd + * requests have been active for more than N seconds. When this + * happens, we ping all OSDs with requests who have timed out to + * ensure any communications channel reset is detected. Reset the + * request timeouts another N seconds in the future as we go. + * Reschedule the timeout event another N seconds in future (unless + * there are no open requests). + * + * NOTE(review): as written, the body calls __schedule_osd_timeout() + * unconditionally and does not touch r_stamp; the last two sentences + * above may not describe this version of the function. + */ +static void handle_timeout(struct work_struct *work) +{ + struct ceph_osd_client *osdc = + container_of(work, struct ceph_osd_client, timeout_work.work); + struct ceph_osd_request *req; + struct ceph_osd *osd; + unsigned long keepalive = + osdc->client->options->osd_keepalive_timeout * HZ; + struct list_head slow_osds; + dout("timeout\n"); + down_read(&osdc->map_sem); + + ceph_monc_request_next_osdmap(&osdc->client->monc); + + mutex_lock(&osdc->request_mutex); + + /* + * ping osds that are a bit slow.
this ensures that if there + * is a break in the TCP connection we will notice, and reopen + * a connection with that osd (from the fault callback). + */ + INIT_LIST_HEAD(&slow_osds); + list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) { + if (time_before(jiffies, req->r_stamp + keepalive)) + break; + + osd = req->r_osd; + BUG_ON(!osd); + dout(" tid %llu is slow, will send keepalive on osd%d\n", + req->r_tid, osd->o_osd); + list_move_tail(&osd->o_keepalive_item, &slow_osds); + } + while (!list_empty(&slow_osds)) { + osd = list_entry(slow_osds.next, struct ceph_osd, + o_keepalive_item); + list_del_init(&osd->o_keepalive_item); + ceph_con_keepalive(&osd->o_con); + } + + __schedule_osd_timeout(osdc); + __send_queued(osdc); + mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); +} +/* + * Delayed-work callback that removes expired osds from the lru (with + * map_sem held for read) and then reschedules itself after a quarter of + * osd_idle_ttl, rounded by round_jiffies_relative(). + */ +static void handle_osds_timeout(struct work_struct *work) +{ + struct ceph_osd_client *osdc = + container_of(work, struct ceph_osd_client, + osds_timeout_work.work); + unsigned long delay = + osdc->client->options->osd_idle_ttl * HZ >> 2; + + dout("osds timeout\n"); + down_read(&osdc->map_sem); + remove_old_osds(osdc); + up_read(&osdc->map_sem); + + schedule_delayed_work(&osdc->osds_timeout_work, + round_jiffies_relative(delay)); +} +/* + * Decode an encoded ceph_object_locator at *@p into @oloc. Only the pool + * is stored. struct_v below 3, struct_cv above 6, a non-empty key or + * nspace, or a hash other than -1 are rejected. On success *@p is set to + * the end of the encoded struct. Returns 0 or -EINVAL. + * + * NOTE(review): after the single length check, the individual fields are + * decoded without further bounds checks against the encoded length. This + * presumably relies on the sender length covering every field for its + * struct_v; confirm. + */ +static int ceph_oloc_decode(void **p, void *end, + struct ceph_object_locator *oloc) +{ + u8 struct_v, struct_cv; + u32 len; + void *struct_end; + int ret = 0; + + ceph_decode_need(p, end, 1 + 1 + 4, e_inval); + struct_v = ceph_decode_8(p); + struct_cv = ceph_decode_8(p); + if (struct_v < 3) { + pr_warn("got v %d < 3 cv %d of ceph_object_locator\n", + struct_v, struct_cv); + goto e_inval; + } + if (struct_cv > 6) { + pr_warn("got v %d cv %d > 6 of ceph_object_locator\n", + struct_v, struct_cv); + goto e_inval; + } + len = ceph_decode_32(p); + ceph_decode_need(p, end, len, e_inval); + struct_end = *p + len; + + oloc->pool = ceph_decode_64(p); + *p += 4; /* skip preferred */ + + len = ceph_decode_32(p); + if (len > 0) { +
pr_warn("ceph_object_locator::key is set\n"); + goto e_inval; + } + + if (struct_v >= 5) { + len = ceph_decode_32(p); + if (len > 0) { + pr_warn("ceph_object_locator::nspace is set\n"); + goto e_inval; + } + } + + if (struct_v >= 6) { + s64 hash = ceph_decode_64(p); + if (hash != -1) { + pr_warn("ceph_object_locator::hash is set\n"); + goto e_inval; + } + } + + /* skip the rest */ + *p = struct_end; +out: + return ret; + +e_inval: + ret = -EINVAL; + goto out; +} +/* + * Decode an encoded ceph_request_redirect at *@p into @redir. Only the + * object locator is kept; struct_cv above 1 or a non-empty object_name are + * rejected, and osd_instructions is skipped. On success *@p is set to the + * end of the encoded struct. Returns 0, -EINVAL, or the error from + * ceph_oloc_decode(). + * + * NOTE(review): the object_name and osd_instructions lengths are read + * without a bounds check of their own; confirm the outer length check is + * considered sufficient. + */ +static int ceph_redirect_decode(void **p, void *end, + struct ceph_request_redirect *redir) +{ + u8 struct_v, struct_cv; + u32 len; + void *struct_end; + int ret; + + ceph_decode_need(p, end, 1 + 1 + 4, e_inval); + struct_v = ceph_decode_8(p); + struct_cv = ceph_decode_8(p); + if (struct_cv > 1) { + pr_warn("got v %d cv %d > 1 of ceph_request_redirect\n", + struct_v, struct_cv); + goto e_inval; + } + len = ceph_decode_32(p); + ceph_decode_need(p, end, len, e_inval); + struct_end = *p + len; + + ret = ceph_oloc_decode(p, end, &redir->oloc); + if (ret) + goto out; + + len = ceph_decode_32(p); + if (len > 0) { + pr_warn("ceph_request_redirect::object_name is set\n"); + goto e_inval; + } + + len = ceph_decode_32(p); + *p += len; /* skip osd_instructions */ + + /* skip the rest */ + *p = struct_end; +out: + return ret; + +e_inval: + ret = -EINVAL; + goto out; +} +/* + * Wake everything waiting on the r_safe_completion of @req. + */ +static void complete_request(struct ceph_osd_request *req) +{ + complete_all(&req->r_safe_completion); /* fsync waiter */ +} + +/* + * handle osd op reply. either call the callback if it is specified, + * or do the completion to wake up the waiting thread.
+ */ +static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg, + struct ceph_connection *con) +{ + void *p, *end; + struct ceph_osd_request *req; + struct ceph_request_redirect redir; + u64 tid; + int object_len; + unsigned int numops; + int payload_len, flags; + s32 result; + s32 retry_attempt; + struct ceph_pg pg; + int err; + u32 reassert_epoch; + u64 reassert_version; + u32 osdmap_epoch; + int already_completed; + u32 bytes; + unsigned int i; + + tid = le64_to_cpu(msg->hdr.tid); + dout("handle_reply %p tid %llu\n", msg, tid); + + p = msg->front.iov_base; + end = p + msg->front.iov_len; + + ceph_decode_need(&p, end, 4, bad); + object_len = ceph_decode_32(&p); + ceph_decode_need(&p, end, object_len, bad); + p += object_len; + + err = ceph_decode_pgid(&p, end, &pg); + if (err) + goto bad; + + ceph_decode_need(&p, end, 8 + 4 + 4 + 8 + 4, bad); + flags = ceph_decode_64(&p); + result = ceph_decode_32(&p); + reassert_epoch = ceph_decode_32(&p); + reassert_version = ceph_decode_64(&p); + osdmap_epoch = ceph_decode_32(&p); + + /* lookup */ + down_read(&osdc->map_sem); + mutex_lock(&osdc->request_mutex); + req = __lookup_request(osdc, tid); + if (req == NULL) { + dout("handle_reply tid %llu dne\n", tid); + goto bad_mutex; + } + ceph_osdc_get_request(req); + + dout("handle_reply %p tid %llu req %p result %d\n", msg, tid, + req, result); + + ceph_decode_need(&p, end, 4, bad_put); + numops = ceph_decode_32(&p); + if (numops > CEPH_OSD_MAX_OP) + goto bad_put; + if (numops != req->r_num_ops) + goto bad_put; + payload_len = 0; + ceph_decode_need(&p, end, numops * sizeof(struct ceph_osd_op), bad_put); + for (i = 0; i < numops; i++) { + struct ceph_osd_op *op = p; + int len; + + len = le32_to_cpu(op->payload_len); + req->r_reply_op_len[i] = len; + dout(" op %d has %d bytes\n", i, len); + payload_len += len; + p += sizeof(*op); + } + bytes = le32_to_cpu(msg->hdr.data_len); + if (payload_len != bytes) { + pr_warn("sum of op payload lens %d != data_len %d\n", + 
payload_len, bytes); + goto bad_put; + } + + ceph_decode_need(&p, end, 4 + numops * 4, bad_put); + retry_attempt = ceph_decode_32(&p); + for (i = 0; i < numops; i++) + req->r_reply_op_result[i] = ceph_decode_32(&p); + + if (le16_to_cpu(msg->hdr.version) >= 6) { + p += 8 + 4; /* skip replay_version */ + p += 8; /* skip user_version */ + + err = ceph_redirect_decode(&p, end, &redir); + if (err) + goto bad_put; + } else { + redir.oloc.pool = -1; + } + + if (redir.oloc.pool != -1) { + dout("redirect pool %lld\n", redir.oloc.pool); + + __unregister_request(osdc, req); + + req->r_target_oloc = redir.oloc; /* struct */ + + /* + * Start redirect requests with nofail=true. If + * mapping fails, request will end up on the notarget + * list, waiting for the new osdmap (which can take + * a while), even though the original request mapped + * successfully. In the future we might want to follow + * original request's nofail setting here. + */ + err = __ceph_osdc_start_request(osdc, req, true); + BUG_ON(err); + + goto out_unlock; + } + + already_completed = req->r_got_reply; + if (!req->r_got_reply) { + req->r_result = result; + dout("handle_reply result %d bytes %d\n", req->r_result, + bytes); + if (req->r_result == 0) + req->r_result = bytes; + + /* in case this is a write and we need to replay, */ + req->r_reassert_version.epoch = cpu_to_le32(reassert_epoch); + req->r_reassert_version.version = cpu_to_le64(reassert_version); + + req->r_got_reply = 1; + } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) { + dout("handle_reply tid %llu dup ack\n", tid); + goto out_unlock; + } + + dout("handle_reply tid %llu flags %d\n", tid, flags); + + if (req->r_linger && (flags & CEPH_OSD_FLAG_ONDISK)) + __register_linger_request(osdc, req); + + /* either this is a read, or we got the safe response */ + if (result < 0 || + (flags & CEPH_OSD_FLAG_ONDISK) || + ((flags & CEPH_OSD_FLAG_WRITE) == 0)) + __unregister_request(osdc, req); + + mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); 
+ + if (!already_completed) { + if (req->r_unsafe_callback && + result >= 0 && !(flags & CEPH_OSD_FLAG_ONDISK)) + req->r_unsafe_callback(req, true); + if (req->r_callback) + req->r_callback(req, msg); + else + complete_all(&req->r_completion); + } + + if (flags & CEPH_OSD_FLAG_ONDISK) { + if (req->r_unsafe_callback && already_completed) + req->r_unsafe_callback(req, false); + complete_request(req); + } + +out: + dout("req=%p req->r_linger=%d\n", req, req->r_linger); + ceph_osdc_put_request(req); + return; +out_unlock: + mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); + goto out; + +bad_put: + req->r_result = -EIO; + __unregister_request(osdc, req); + if (req->r_callback) + req->r_callback(req, msg); + else + complete_all(&req->r_completion); + complete_request(req); + ceph_osdc_put_request(req); +bad_mutex: + mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); +bad: + pr_err("corrupt osd_op_reply got %d %d\n", + (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len)); + ceph_msg_dump(msg); +} + +static void reset_changed_osds(struct ceph_osd_client *osdc) +{ + struct rb_node *p, *n; + + dout("%s %p\n", __func__, osdc); + for (p = rb_first(&osdc->osds); p; p = n) { + struct ceph_osd *osd = rb_entry(p, struct ceph_osd, o_node); + + n = rb_next(p); + if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) || + memcmp(&osd->o_con.peer_addr, + ceph_osd_addr(osdc->osdmap, + osd->o_osd), + sizeof(struct ceph_entity_addr)) != 0) + __reset_osd(osdc, osd); + } +} + +/* + * Requeue requests whose mapping to an OSD has changed. If requests map to + * no osd, request a new map. + * + * Caller should hold map_sem for read. + */ +static void kick_requests(struct ceph_osd_client *osdc, bool force_resend, + bool force_resend_writes) +{ + struct ceph_osd_request *req, *nreq; + struct rb_node *p; + int needmap = 0; + int err; + bool force_resend_req; + + dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "", + force_resend_writes ? 
" (force resend writes)" : ""); + mutex_lock(&osdc->request_mutex); + for (p = rb_first(&osdc->requests); p; ) { + req = rb_entry(p, struct ceph_osd_request, r_node); + p = rb_next(p); + + /* + * For linger requests that have not yet been + * registered, move them to the linger list; they'll + * be sent to the osd in the loop below. Unregister + * the request before re-registering it as a linger + * request to ensure the __map_request() below + * will decide it needs to be sent. + */ + if (req->r_linger && list_empty(&req->r_linger_item)) { + dout("%p tid %llu restart on osd%d\n", + req, req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1); + ceph_osdc_get_request(req); + __unregister_request(osdc, req); + __register_linger_request(osdc, req); + ceph_osdc_put_request(req); + continue; + } + + force_resend_req = force_resend || + (force_resend_writes && + req->r_flags & CEPH_OSD_FLAG_WRITE); + err = __map_request(osdc, req, force_resend_req); + if (err < 0) + continue; /* error */ + if (req->r_osd == NULL) { + dout("%p tid %llu maps to no osd\n", req, req->r_tid); + needmap++; /* request a newer map */ + } else if (err > 0) { + if (!req->r_linger) { + dout("%p tid %llu requeued on osd%d\n", req, + req->r_tid, + req->r_osd ? req->r_osd->o_osd : -1); + req->r_flags |= CEPH_OSD_FLAG_RETRY; + } + } + } + + list_for_each_entry_safe(req, nreq, &osdc->req_linger, + r_linger_item) { + dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); + + err = __map_request(osdc, req, + force_resend || force_resend_writes); + dout("__map_request returned %d\n", err); + if (err < 0) + continue; /* hrm! */ + if (req->r_osd == NULL || err > 0) { + if (req->r_osd == NULL) { + dout("lingering %p tid %llu maps to no osd\n", + req, req->r_tid); + /* + * A homeless lingering request makes + * no sense, as it's job is to keep + * a particular OSD connection open. 
+ * Request a newer map and kick the + * request, knowing that it won't be + * resent until we actually get a map + * that can tell us where to send it. + */ + needmap++; + } + + dout("kicking lingering %p tid %llu osd%d\n", req, + req->r_tid, req->r_osd ? req->r_osd->o_osd : -1); + __register_request(osdc, req); + __unregister_linger_request(osdc, req); + } + } + reset_changed_osds(osdc); + mutex_unlock(&osdc->request_mutex); + + if (needmap) { + dout("%d requests for down osds, need new map\n", needmap); + ceph_monc_request_next_osdmap(&osdc->client->monc); + } +} + + +/* + * Process updated osd map. + * + * The message contains any number of incremental and full maps, normally + * indicating some sort of topology change in the cluster. Kick requests + * off to different OSDs as needed. + */ +void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) +{ + void *p, *end, *next; + u32 nr_maps, maplen; + u32 epoch; + struct ceph_osdmap *newmap = NULL, *oldmap; + int err; + struct ceph_fsid fsid; + bool was_full; + + dout("handle_map have %u\n", osdc->osdmap ? 
osdc->osdmap->epoch : 0); + p = msg->front.iov_base; + end = p + msg->front.iov_len; + + /* verify fsid */ + ceph_decode_need(&p, end, sizeof(fsid), bad); + ceph_decode_copy(&p, &fsid, sizeof(fsid)); + if (ceph_check_fsid(osdc->client, &fsid) < 0) + return; + + down_write(&osdc->map_sem); + + was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL); + + /* incremental maps */ + ceph_decode_32_safe(&p, end, nr_maps, bad); + dout(" %d inc maps\n", nr_maps); + while (nr_maps > 0) { + ceph_decode_need(&p, end, 2*sizeof(u32), bad); + epoch = ceph_decode_32(&p); + maplen = ceph_decode_32(&p); + ceph_decode_need(&p, end, maplen, bad); + next = p + maplen; + if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) { + dout("applying incremental map %u len %d\n", + epoch, maplen); + newmap = osdmap_apply_incremental(&p, next, + osdc->osdmap, + &osdc->client->msgr); + if (IS_ERR(newmap)) { + err = PTR_ERR(newmap); + goto bad; + } + BUG_ON(!newmap); + if (newmap != osdc->osdmap) { + ceph_osdmap_destroy(osdc->osdmap); + osdc->osdmap = newmap; + } + was_full = was_full || + ceph_osdmap_flag(osdc->osdmap, + CEPH_OSDMAP_FULL); + kick_requests(osdc, 0, was_full); + } else { + dout("ignoring incremental map %u len %d\n", + epoch, maplen); + } + p = next; + nr_maps--; + } + if (newmap) + goto done; + + /* full maps */ + ceph_decode_32_safe(&p, end, nr_maps, bad); + dout(" %d full maps\n", nr_maps); + while (nr_maps) { + ceph_decode_need(&p, end, 2*sizeof(u32), bad); + epoch = ceph_decode_32(&p); + maplen = ceph_decode_32(&p); + ceph_decode_need(&p, end, maplen, bad); + if (nr_maps > 1) { + dout("skipping non-latest full map %u len %d\n", + epoch, maplen); + } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) { + dout("skipping full map %u len %d, " + "older than our %u\n", epoch, maplen, + osdc->osdmap->epoch); + } else { + int skipped_map = 0; + + dout("taking full map %u len %d\n", epoch, maplen); + newmap = ceph_osdmap_decode(&p, p+maplen); + if (IS_ERR(newmap)) { + err = 
PTR_ERR(newmap); + goto bad; + } + BUG_ON(!newmap); + oldmap = osdc->osdmap; + osdc->osdmap = newmap; + if (oldmap) { + if (oldmap->epoch + 1 < newmap->epoch) + skipped_map = 1; + ceph_osdmap_destroy(oldmap); + } + was_full = was_full || + ceph_osdmap_flag(osdc->osdmap, + CEPH_OSDMAP_FULL); + kick_requests(osdc, skipped_map, was_full); + } + p += maplen; + nr_maps--; + } + + if (!osdc->osdmap) + goto bad; +done: + downgrade_write(&osdc->map_sem); + ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); + + /* + * subscribe to subsequent osdmap updates if full to ensure + * we find out when we are no longer full and stop returning + * ENOSPC. + */ + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSERD) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_PAUSEWR)) + ceph_monc_request_next_osdmap(&osdc->client->monc); + + mutex_lock(&osdc->request_mutex); + __send_queued(osdc); + mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); + wake_up_all(&osdc->client->auth_wq); + return; + +bad: + pr_err("osdc handle_map corrupt msg\n"); + ceph_msg_dump(msg); + up_write(&osdc->map_sem); +} + +/* + * watch/notify callback event infrastructure + * + * These callbacks are used both for watch and notify operations. 
+ */ +static void __release_event(struct kref *kref) +{ + struct ceph_osd_event *event = + container_of(kref, struct ceph_osd_event, kref); + + dout("__release_event %p\n", event); + kfree(event); +} + +static void get_event(struct ceph_osd_event *event) +{ + kref_get(&event->kref); +} + +void ceph_osdc_put_event(struct ceph_osd_event *event) +{ + kref_put(&event->kref, __release_event); +} +EXPORT_SYMBOL(ceph_osdc_put_event); + +static void __insert_event(struct ceph_osd_client *osdc, + struct ceph_osd_event *new) +{ + struct rb_node **p = &osdc->event_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_osd_event *event = NULL; + + while (*p) { + parent = *p; + event = rb_entry(parent, struct ceph_osd_event, node); + if (new->cookie < event->cookie) + p = &(*p)->rb_left; + else if (new->cookie > event->cookie) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, &osdc->event_tree); +} + +static struct ceph_osd_event *__find_event(struct ceph_osd_client *osdc, + u64 cookie) +{ + struct rb_node **p = &osdc->event_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_osd_event *event = NULL; + + while (*p) { + parent = *p; + event = rb_entry(parent, struct ceph_osd_event, node); + if (cookie < event->cookie) + p = &(*p)->rb_left; + else if (cookie > event->cookie) + p = &(*p)->rb_right; + else + return event; + } + return NULL; +} + +static void __remove_event(struct ceph_osd_event *event) +{ + struct ceph_osd_client *osdc = event->osdc; + + if (!RB_EMPTY_NODE(&event->node)) { + dout("__remove_event removed %p\n", event); + rb_erase(&event->node, &osdc->event_tree); + ceph_osdc_put_event(event); + } else { + dout("__remove_event didn't remove %p\n", event); + } +} + +int ceph_osdc_create_event(struct ceph_osd_client *osdc, + void (*event_cb)(u64, u64, u8, void *), + void *data, struct ceph_osd_event **pevent) +{ + struct ceph_osd_event *event; + + event = kmalloc(sizeof(*event), GFP_NOIO); + 
if (!event) + return -ENOMEM; + + dout("create_event %p\n", event); + event->cb = event_cb; + event->one_shot = 0; + event->data = data; + event->osdc = osdc; + INIT_LIST_HEAD(&event->osd_node); + RB_CLEAR_NODE(&event->node); + kref_init(&event->kref); /* one ref for us */ + kref_get(&event->kref); /* one ref for the caller */ + + spin_lock(&osdc->event_lock); + event->cookie = ++osdc->event_count; + __insert_event(osdc, event); + spin_unlock(&osdc->event_lock); + + *pevent = event; + return 0; +} +EXPORT_SYMBOL(ceph_osdc_create_event); + +void ceph_osdc_cancel_event(struct ceph_osd_event *event) +{ + struct ceph_osd_client *osdc = event->osdc; + + dout("cancel_event %p\n", event); + spin_lock(&osdc->event_lock); + __remove_event(event); + spin_unlock(&osdc->event_lock); + ceph_osdc_put_event(event); /* caller's */ +} +EXPORT_SYMBOL(ceph_osdc_cancel_event); + + +static void do_event_work(struct work_struct *work) +{ + struct ceph_osd_event_work *event_work = + container_of(work, struct ceph_osd_event_work, work); + struct ceph_osd_event *event = event_work->event; + u64 ver = event_work->ver; + u64 notify_id = event_work->notify_id; + u8 opcode = event_work->opcode; + + dout("do_event_work completing %p\n", event); + event->cb(ver, notify_id, opcode, event->data); + dout("do_event_work completed %p\n", event); + ceph_osdc_put_event(event); + kfree(event_work); +} + + +/* + * Process osd watch notifications + */ +static void handle_watch_notify(struct ceph_osd_client *osdc, + struct ceph_msg *msg) +{ + void *p, *end; + u8 proto_ver; + u64 cookie, ver, notify_id; + u8 opcode; + struct ceph_osd_event *event; + struct ceph_osd_event_work *event_work; + + p = msg->front.iov_base; + end = p + msg->front.iov_len; + + ceph_decode_8_safe(&p, end, proto_ver, bad); + ceph_decode_8_safe(&p, end, opcode, bad); + ceph_decode_64_safe(&p, end, cookie, bad); + ceph_decode_64_safe(&p, end, ver, bad); + ceph_decode_64_safe(&p, end, notify_id, bad); + + spin_lock(&osdc->event_lock); + 
event = __find_event(osdc, cookie); + if (event) { + BUG_ON(event->one_shot); + get_event(event); + } + spin_unlock(&osdc->event_lock); + dout("handle_watch_notify cookie %lld ver %lld event %p\n", + cookie, ver, event); + if (event) { + event_work = kmalloc(sizeof(*event_work), GFP_NOIO); + if (!event_work) { + pr_err("couldn't allocate event_work\n"); + ceph_osdc_put_event(event); + return; + } + INIT_WORK(&event_work->work, do_event_work); + event_work->event = event; + event_work->ver = ver; + event_work->notify_id = notify_id; + event_work->opcode = opcode; + + queue_work(osdc->notify_wq, &event_work->work); + } + + return; + +bad: + pr_err("osdc handle_watch_notify corrupt msg\n"); +} + +/* + * build new request AND message + * + */ +void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, + struct ceph_snap_context *snapc, u64 snap_id, + struct timespec *mtime) +{ + struct ceph_msg *msg = req->r_request; + void *p; + size_t msg_size; + int flags = req->r_flags; + u64 data_len; + unsigned int i; + + req->r_snapid = snap_id; + req->r_snapc = ceph_get_snap_context(snapc); + + /* encode request */ + msg->hdr.version = cpu_to_le16(4); + + p = msg->front.iov_base; + ceph_encode_32(&p, 1); /* client_inc is always 1 */ + req->r_request_osdmap_epoch = p; + p += 4; + req->r_request_flags = p; + p += 4; + if (req->r_flags & CEPH_OSD_FLAG_WRITE) + ceph_encode_timespec(p, mtime); + p += sizeof(struct ceph_timespec); + req->r_request_reassert_version = p; + p += sizeof(struct ceph_eversion); /* will get filled in */ + + /* oloc */ + ceph_encode_8(&p, 4); + ceph_encode_8(&p, 4); + ceph_encode_32(&p, 8 + 4 + 4); + req->r_request_pool = p; + p += 8; + ceph_encode_32(&p, -1); /* preferred */ + ceph_encode_32(&p, 0); /* key len */ + + ceph_encode_8(&p, 1); + req->r_request_pgid = p; + p += 8 + 4; + ceph_encode_32(&p, -1); /* preferred */ + + /* oid */ + ceph_encode_32(&p, req->r_base_oid.name_len); + memcpy(p, req->r_base_oid.name, req->r_base_oid.name_len); + 
dout("oid '%.*s' len %d\n", req->r_base_oid.name_len, + req->r_base_oid.name, req->r_base_oid.name_len); + p += req->r_base_oid.name_len; + + /* ops--can imply data */ + ceph_encode_16(&p, (u16)req->r_num_ops); + data_len = 0; + for (i = 0; i < req->r_num_ops; i++) { + data_len += osd_req_encode_op(req, p, i); + p += sizeof(struct ceph_osd_op); + } + + /* snaps */ + ceph_encode_64(&p, req->r_snapid); + ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); + ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); + if (req->r_snapc) { + for (i = 0; i < snapc->num_snaps; i++) { + ceph_encode_64(&p, req->r_snapc->snaps[i]); + } + } + + req->r_request_attempts = p; + p += 4; + + /* data */ + if (flags & CEPH_OSD_FLAG_WRITE) { + u16 data_off; + + /* + * The header "data_off" is a hint to the receiver + * allowing it to align received data into its + * buffers such that there's no need to re-copy + * it before writing it to disk (direct I/O). + */ + data_off = (u16) (off & 0xffff); + req->r_request->hdr.data_off = cpu_to_le16(data_off); + } + req->r_request->hdr.data_len = cpu_to_le32(data_len); + + BUG_ON(p > msg->front.iov_base + msg->front.iov_len); + msg_size = p - msg->front.iov_base; + msg->front.iov_len = msg_size; + msg->hdr.front_len = cpu_to_le32(msg_size); + + dout("build_request msg_size was %d\n", (int)msg_size); +} +EXPORT_SYMBOL(ceph_osdc_build_request); + +/* + * Register request, send initial attempt. + */ +int ceph_osdc_start_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req, + bool nofail) +{ + int rc; + + down_read(&osdc->map_sem); + mutex_lock(&osdc->request_mutex); + + rc = __ceph_osdc_start_request(osdc, req, nofail); + + mutex_unlock(&osdc->request_mutex); + up_read(&osdc->map_sem); + + return rc; +} +EXPORT_SYMBOL(ceph_osdc_start_request); + +/* + * Unregister a registered request. The request is not completed (i.e. + * no callbacks or wakeups) - higher layers are supposed to know what + * they are canceling. 
+ */ +void ceph_osdc_cancel_request(struct ceph_osd_request *req) +{ + struct ceph_osd_client *osdc = req->r_osdc; + + mutex_lock(&osdc->request_mutex); + if (req->r_linger) + __unregister_linger_request(osdc, req); + __unregister_request(osdc, req); + mutex_unlock(&osdc->request_mutex); + + dout("%s %p tid %llu canceled\n", __func__, req, req->r_tid); +} +EXPORT_SYMBOL(ceph_osdc_cancel_request); + +/* + * wait for a request to complete + */ +int ceph_osdc_wait_request(struct ceph_osd_client *osdc, + struct ceph_osd_request *req) +{ + int rc; + + dout("%s %p tid %llu\n", __func__, req, req->r_tid); + + rc = wait_for_completion_interruptible(&req->r_completion); + if (rc < 0) { + dout("%s %p tid %llu interrupted\n", __func__, req, req->r_tid); + ceph_osdc_cancel_request(req); + complete_request(req); + return rc; + } + + dout("%s %p tid %llu result %d\n", __func__, req, req->r_tid, + req->r_result); + return req->r_result; +} +EXPORT_SYMBOL(ceph_osdc_wait_request); + +/* + * sync - wait for all in-flight requests to flush. avoid starvation. 
+ */ +void ceph_osdc_sync(struct ceph_osd_client *osdc) +{ + struct ceph_osd_request *req; + u64 last_tid, next_tid = 0; + + mutex_lock(&osdc->request_mutex); + last_tid = osdc->last_tid; + while (1) { + req = __lookup_request_ge(osdc, next_tid); + if (!req) + break; + if (req->r_tid > last_tid) + break; + + next_tid = req->r_tid + 1; + if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0) + continue; + + ceph_osdc_get_request(req); + mutex_unlock(&osdc->request_mutex); + dout("sync waiting on tid %llu (last is %llu)\n", + req->r_tid, last_tid); + wait_for_completion(&req->r_safe_completion); + mutex_lock(&osdc->request_mutex); + ceph_osdc_put_request(req); + } + mutex_unlock(&osdc->request_mutex); + dout("sync done (thru tid %llu)\n", last_tid); +} +EXPORT_SYMBOL(ceph_osdc_sync); + +/* + * Call all pending notify callbacks - for use after a watch is + * unregistered, to make sure no more callbacks for it will be invoked + */ +void ceph_osdc_flush_notifies(struct ceph_osd_client *osdc) +{ + flush_workqueue(osdc->notify_wq); +} +EXPORT_SYMBOL(ceph_osdc_flush_notifies); + + +/* + * init, shutdown + */ +int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) +{ + int err; + + dout("init\n"); + osdc->client = client; + osdc->osdmap = NULL; + init_rwsem(&osdc->map_sem); + init_completion(&osdc->map_waiters); + osdc->last_requested_map = 0; + mutex_init(&osdc->request_mutex); + osdc->last_tid = 0; + osdc->osds = RB_ROOT; + INIT_LIST_HEAD(&osdc->osd_lru); + osdc->requests = RB_ROOT; + INIT_LIST_HEAD(&osdc->req_lru); + INIT_LIST_HEAD(&osdc->req_unsent); + INIT_LIST_HEAD(&osdc->req_notarget); + INIT_LIST_HEAD(&osdc->req_linger); + osdc->num_requests = 0; + INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); + INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); + spin_lock_init(&osdc->event_lock); + osdc->event_tree = RB_ROOT; + osdc->event_count = 0; + + schedule_delayed_work(&osdc->osds_timeout_work, + 
round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ)); + + err = -ENOMEM; + osdc->req_mempool = mempool_create_kmalloc_pool(10, + sizeof(struct ceph_osd_request)); + if (!osdc->req_mempool) + goto out; + + err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, + OSD_OP_FRONT_LEN, 10, true, + "osd_op"); + if (err < 0) + goto out_mempool; + err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, + OSD_OPREPLY_FRONT_LEN, 10, true, + "osd_op_reply"); + if (err < 0) + goto out_msgpool; + + err = -ENOMEM; + osdc->notify_wq = create_singlethread_workqueue("ceph-watch-notify"); + if (!osdc->notify_wq) + goto out_msgpool_reply; + + return 0; + +out_msgpool_reply: + ceph_msgpool_destroy(&osdc->msgpool_op_reply); +out_msgpool: + ceph_msgpool_destroy(&osdc->msgpool_op); +out_mempool: + mempool_destroy(osdc->req_mempool); +out: + return err; +} + +void ceph_osdc_stop(struct ceph_osd_client *osdc) +{ + flush_workqueue(osdc->notify_wq); + destroy_workqueue(osdc->notify_wq); + cancel_delayed_work_sync(&osdc->timeout_work); + cancel_delayed_work_sync(&osdc->osds_timeout_work); + if (osdc->osdmap) { + ceph_osdmap_destroy(osdc->osdmap); + osdc->osdmap = NULL; + } + remove_all_osds(osdc); + mempool_destroy(osdc->req_mempool); + ceph_msgpool_destroy(&osdc->msgpool_op); + ceph_msgpool_destroy(&osdc->msgpool_op_reply); +} + +/* + * Read some contiguous pages. If we cross a stripe boundary, shorten + * *plen. Return number of bytes read, or error. 
+ */ +int ceph_osdc_readpages(struct ceph_osd_client *osdc, + struct ceph_vino vino, struct ceph_file_layout *layout, + u64 off, u64 *plen, + u32 truncate_seq, u64 truncate_size, + struct page **pages, int num_pages, int page_align) +{ + struct ceph_osd_request *req; + int rc = 0; + + dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, + vino.snap, off, *plen); + req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1, + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, + NULL, truncate_seq, truncate_size, + false); + if (IS_ERR(req)) + return PTR_ERR(req); + + /* it may be a short read due to an object boundary */ + + osd_req_op_extent_osd_data_pages(req, 0, + pages, *plen, page_align, false, false); + + dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", + off, *plen, *plen, page_align); + + ceph_osdc_build_request(req, off, NULL, vino.snap, NULL); + + rc = ceph_osdc_start_request(osdc, req, false); + if (!rc) + rc = ceph_osdc_wait_request(osdc, req); + + ceph_osdc_put_request(req); + dout("readpages result %d\n", rc); + return rc; +} +EXPORT_SYMBOL(ceph_osdc_readpages); + +/* + * do a synchronous write on N pages + */ +int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, + struct ceph_file_layout *layout, + struct ceph_snap_context *snapc, + u64 off, u64 len, + u32 truncate_seq, u64 truncate_size, + struct timespec *mtime, + struct page **pages, int num_pages) +{ + struct ceph_osd_request *req; + int rc = 0; + int page_align = off & ~PAGE_MASK; + + BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */ + req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, + snapc, truncate_seq, truncate_size, + true); + if (IS_ERR(req)) + return PTR_ERR(req); + + /* it may be a short write due to an object boundary */ + osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, + false, false); + dout("writepages %llu~%llu (%llu 
bytes)\n", off, len, len); + + ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime); + + rc = ceph_osdc_start_request(osdc, req, true); + if (!rc) + rc = ceph_osdc_wait_request(osdc, req); + + ceph_osdc_put_request(req); + if (rc == 0) + rc = len; + dout("writepages result %d\n", rc); + return rc; +} +EXPORT_SYMBOL(ceph_osdc_writepages); + +int ceph_osdc_setup(void) +{ + BUG_ON(ceph_osd_request_cache); + ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", + sizeof (struct ceph_osd_request), + __alignof__(struct ceph_osd_request), + 0, NULL); + + return ceph_osd_request_cache ? 0 : -ENOMEM; +} +EXPORT_SYMBOL(ceph_osdc_setup); + +void ceph_osdc_cleanup(void) +{ + BUG_ON(!ceph_osd_request_cache); + kmem_cache_destroy(ceph_osd_request_cache); + ceph_osd_request_cache = NULL; +} +EXPORT_SYMBOL(ceph_osdc_cleanup); + +/* + * handle incoming message + */ +static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_osd *osd = con->private; + struct ceph_osd_client *osdc; + int type = le16_to_cpu(msg->hdr.type); + + if (!osd) + goto out; + osdc = osd->o_osdc; + + switch (type) { + case CEPH_MSG_OSD_MAP: + ceph_osdc_handle_map(osdc, msg); + break; + case CEPH_MSG_OSD_OPREPLY: + handle_reply(osdc, msg, con); + break; + case CEPH_MSG_WATCH_NOTIFY: + handle_watch_notify(osdc, msg); + break; + + default: + pr_err("received unknown message type %d %s\n", type, + ceph_msg_type_name(type)); + } +out: + ceph_msg_put(msg); +} + +/* + * lookup and return message for incoming reply. set up reply message + * pages. 
+ */ +static struct ceph_msg *get_reply(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) +{ + struct ceph_osd *osd = con->private; + struct ceph_osd_client *osdc = osd->o_osdc; + struct ceph_msg *m; + struct ceph_osd_request *req; + int front_len = le32_to_cpu(hdr->front_len); + int data_len = le32_to_cpu(hdr->data_len); + u64 tid; + + tid = le64_to_cpu(hdr->tid); + mutex_lock(&osdc->request_mutex); + req = __lookup_request(osdc, tid); + if (!req) { + *skip = 1; + m = NULL; + dout("get_reply unknown tid %llu from osd%d\n", tid, + osd->o_osd); + goto out; + } + + if (req->r_reply->con) + dout("%s revoking msg %p from old con %p\n", __func__, + req->r_reply, req->r_reply->con); + ceph_msg_revoke_incoming(req->r_reply); + + if (front_len > req->r_reply->front_alloc_len) { + pr_warn("get_reply front %d > preallocated %d (%u#%llu)\n", + front_len, req->r_reply->front_alloc_len, + (unsigned int)con->peer_name.type, + le64_to_cpu(con->peer_name.num)); + m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, + false); + if (!m) + goto out; + ceph_msg_put(req->r_reply); + req->r_reply = m; + } + m = ceph_msg_get(req->r_reply); + + if (data_len > 0) { + struct ceph_osd_data *osd_data; + + /* + * XXX This is assuming there is only one op containing + * XXX page data. Probably OK for reads, but this + * XXX ought to be done more generally. 
+ */ + osd_data = osd_req_op_extent_osd_data(req, 0); + if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { + if (osd_data->pages && + unlikely(osd_data->length < data_len)) { + + pr_warn("tid %lld reply has %d bytes we had only %llu bytes ready\n", + tid, data_len, osd_data->length); + *skip = 1; + ceph_msg_put(m); + m = NULL; + goto out; + } + } + } + *skip = 0; + dout("get_reply tid %lld %p\n", tid, m); + +out: + mutex_unlock(&osdc->request_mutex); + return m; + +} + +static struct ceph_msg *alloc_msg(struct ceph_connection *con, + struct ceph_msg_header *hdr, + int *skip) +{ + struct ceph_osd *osd = con->private; + int type = le16_to_cpu(hdr->type); + int front = le32_to_cpu(hdr->front_len); + + *skip = 0; + switch (type) { + case CEPH_MSG_OSD_MAP: + case CEPH_MSG_WATCH_NOTIFY: + return ceph_msg_new(type, front, GFP_NOFS, false); + case CEPH_MSG_OSD_OPREPLY: + return get_reply(con, hdr, skip); + default: + pr_info("alloc_msg unexpected msg type %d from osd%d\n", type, + osd->o_osd); + *skip = 1; + return NULL; + } +} + +/* + * Wrappers to refcount containing ceph_osd struct + */ +static struct ceph_connection *get_osd_con(struct ceph_connection *con) +{ + struct ceph_osd *osd = con->private; + if (get_osd(osd)) + return con; + return NULL; +} + +static void put_osd_con(struct ceph_connection *con) +{ + struct ceph_osd *osd = con->private; + put_osd(osd); +} + +/* + * authentication + */ +/* + * Note: returned pointer is the address of a structure that's + * managed separately. Caller must *not* attempt to free it. 
+ */ +static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, + int *proto, int force_new) +{ + struct ceph_osd *o = con->private; + struct ceph_osd_client *osdc = o->o_osdc; + struct ceph_auth_client *ac = osdc->client->monc.auth; + struct ceph_auth_handshake *auth = &o->o_auth; + + if (force_new && auth->authorizer) { + ceph_auth_destroy_authorizer(ac, auth->authorizer); + auth->authorizer = NULL; + } + if (!auth->authorizer) { + int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, + auth); + if (ret) + return ERR_PTR(ret); + } else { + int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD, + auth); + if (ret) + return ERR_PTR(ret); + } + *proto = ac->protocol; + + return auth; +} + + +static int verify_authorizer_reply(struct ceph_connection *con, int len) +{ + struct ceph_osd *o = con->private; + struct ceph_osd_client *osdc = o->o_osdc; + struct ceph_auth_client *ac = osdc->client->monc.auth; + + return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer, len); +} + +static int invalidate_authorizer(struct ceph_connection *con) +{ + struct ceph_osd *o = con->private; + struct ceph_osd_client *osdc = o->o_osdc; + struct ceph_auth_client *ac = osdc->client->monc.auth; + + ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD); + return ceph_monc_validate_auth(&osdc->client->monc); +} + +static int sign_message(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_handshake *auth = &o->o_auth; + return ceph_auth_sign_message(auth, msg); +} + +static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg) +{ + struct ceph_osd *o = con->private; + struct ceph_auth_handshake *auth = &o->o_auth; + return ceph_auth_check_message_signature(auth, msg); +} + +static const struct ceph_connection_operations osd_con_ops = { + .get = get_osd_con, + .put = put_osd_con, + .dispatch = dispatch, + .get_authorizer = get_authorizer, + 
.verify_authorizer_reply = verify_authorizer_reply, + .invalidate_authorizer = invalidate_authorizer, + .alloc_msg = alloc_msg, + .sign_message = sign_message, + .check_message_signature = check_message_signature, + .fault = osd_reset, +}; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c new file mode 100644 index 000000000..4a3125836 --- /dev/null +++ b/net/ceph/osdmap.c @@ -0,0 +1,1754 @@ + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +char *ceph_osdmap_state_str(char *str, int len, int state) +{ + if (!len) + return str; + + if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP)) + snprintf(str, len, "exists, up"); + else if (state & CEPH_OSD_EXISTS) + snprintf(str, len, "exists"); + else if (state & CEPH_OSD_UP) + snprintf(str, len, "up"); + else + snprintf(str, len, "doesn't exist"); + + return str; +} + +/* maps */ + +static int calc_bits_of(unsigned int t) +{ + int b = 0; + while (t) { + t = t >> 1; + b++; + } + return b; +} + +/* + * the foo_mask is the smallest value 2^n-1 that is >= foo. 
+ */ +static void calc_pg_masks(struct ceph_pg_pool_info *pi) +{ + pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1; + pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1; +} + +/* + * decode crush map + */ +static int crush_decode_uniform_bucket(void **p, void *end, + struct crush_bucket_uniform *b) +{ + dout("crush_decode_uniform_bucket %p to %p\n", *p, end); + ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad); + b->item_weight = ceph_decode_32(p); + return 0; +bad: + return -EINVAL; +} + +static int crush_decode_list_bucket(void **p, void *end, + struct crush_bucket_list *b) +{ + int j; + dout("crush_decode_list_bucket %p to %p\n", *p, end); + b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->item_weights == NULL) + return -ENOMEM; + b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->sum_weights == NULL) + return -ENOMEM; + ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); + for (j = 0; j < b->h.size; j++) { + b->item_weights[j] = ceph_decode_32(p); + b->sum_weights[j] = ceph_decode_32(p); + } + return 0; +bad: + return -EINVAL; +} + +static int crush_decode_tree_bucket(void **p, void *end, + struct crush_bucket_tree *b) +{ + int j; + dout("crush_decode_tree_bucket %p to %p\n", *p, end); + ceph_decode_8_safe(p, end, b->num_nodes, bad); + b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS); + if (b->node_weights == NULL) + return -ENOMEM; + ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad); + for (j = 0; j < b->num_nodes; j++) + b->node_weights[j] = ceph_decode_32(p); + return 0; +bad: + return -EINVAL; +} + +static int crush_decode_straw_bucket(void **p, void *end, + struct crush_bucket_straw *b) +{ + int j; + dout("crush_decode_straw_bucket %p to %p\n", *p, end); + b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->item_weights == NULL) + return -ENOMEM; + b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->straws == NULL) + return -ENOMEM; + 
ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); + for (j = 0; j < b->h.size; j++) { + b->item_weights[j] = ceph_decode_32(p); + b->straws[j] = ceph_decode_32(p); + } + return 0; +bad: + return -EINVAL; +} + +static int crush_decode_straw2_bucket(void **p, void *end, + struct crush_bucket_straw2 *b) +{ + int j; + dout("crush_decode_straw2_bucket %p to %p\n", *p, end); + b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); + if (b->item_weights == NULL) + return -ENOMEM; + ceph_decode_need(p, end, b->h.size * sizeof(u32), bad); + for (j = 0; j < b->h.size; j++) + b->item_weights[j] = ceph_decode_32(p); + return 0; +bad: + return -EINVAL; +} + +static int skip_name_map(void **p, void *end) +{ + int len; + ceph_decode_32_safe(p, end, len ,bad); + while (len--) { + int strlen; + *p += sizeof(u32); + ceph_decode_32_safe(p, end, strlen, bad); + *p += strlen; +} + return 0; +bad: + return -EINVAL; +} + +static struct crush_map *crush_decode(void *pbyval, void *end) +{ + struct crush_map *c; + int err = -EINVAL; + int i, j; + void **p = &pbyval; + void *start = pbyval; + u32 magic; + u32 num_name_maps; + + dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); + + c = kzalloc(sizeof(*c), GFP_NOFS); + if (c == NULL) + return ERR_PTR(-ENOMEM); + + /* set tunables to default values */ + c->choose_local_tries = 2; + c->choose_local_fallback_tries = 5; + c->choose_total_tries = 19; + c->chooseleaf_descend_once = 0; + + ceph_decode_need(p, end, 4*sizeof(u32), bad); + magic = ceph_decode_32(p); + if (magic != CRUSH_MAGIC) { + pr_err("crush_decode magic %x != current %x\n", + (unsigned int)magic, (unsigned int)CRUSH_MAGIC); + goto bad; + } + c->max_buckets = ceph_decode_32(p); + c->max_rules = ceph_decode_32(p); + c->max_devices = ceph_decode_32(p); + + c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS); + if (c->buckets == NULL) + goto badmem; + c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS); + if (c->rules == NULL) + 
goto badmem; + + /* buckets */ + for (i = 0; i < c->max_buckets; i++) { + int size = 0; + u32 alg; + struct crush_bucket *b; + + ceph_decode_32_safe(p, end, alg, bad); + if (alg == 0) { + c->buckets[i] = NULL; + continue; + } + dout("crush_decode bucket %d off %x %p to %p\n", + i, (int)(*p-start), *p, end); + + switch (alg) { + case CRUSH_BUCKET_UNIFORM: + size = sizeof(struct crush_bucket_uniform); + break; + case CRUSH_BUCKET_LIST: + size = sizeof(struct crush_bucket_list); + break; + case CRUSH_BUCKET_TREE: + size = sizeof(struct crush_bucket_tree); + break; + case CRUSH_BUCKET_STRAW: + size = sizeof(struct crush_bucket_straw); + break; + case CRUSH_BUCKET_STRAW2: + size = sizeof(struct crush_bucket_straw2); + break; + default: + err = -EINVAL; + goto bad; + } + BUG_ON(size == 0); + b = c->buckets[i] = kzalloc(size, GFP_NOFS); + if (b == NULL) + goto badmem; + + ceph_decode_need(p, end, 4*sizeof(u32), bad); + b->id = ceph_decode_32(p); + b->type = ceph_decode_16(p); + b->alg = ceph_decode_8(p); + b->hash = ceph_decode_8(p); + b->weight = ceph_decode_32(p); + b->size = ceph_decode_32(p); + + dout("crush_decode bucket size %d off %x %p to %p\n", + b->size, (int)(*p-start), *p, end); + + b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); + if (b->items == NULL) + goto badmem; + b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS); + if (b->perm == NULL) + goto badmem; + b->perm_n = 0; + + ceph_decode_need(p, end, b->size*sizeof(u32), bad); + for (j = 0; j < b->size; j++) + b->items[j] = ceph_decode_32(p); + + switch (b->alg) { + case CRUSH_BUCKET_UNIFORM: + err = crush_decode_uniform_bucket(p, end, + (struct crush_bucket_uniform *)b); + if (err < 0) + goto bad; + break; + case CRUSH_BUCKET_LIST: + err = crush_decode_list_bucket(p, end, + (struct crush_bucket_list *)b); + if (err < 0) + goto bad; + break; + case CRUSH_BUCKET_TREE: + err = crush_decode_tree_bucket(p, end, + (struct crush_bucket_tree *)b); + if (err < 0) + goto bad; + break; + case CRUSH_BUCKET_STRAW: 
+ err = crush_decode_straw_bucket(p, end, + (struct crush_bucket_straw *)b); + if (err < 0) + goto bad; + break; + case CRUSH_BUCKET_STRAW2: + err = crush_decode_straw2_bucket(p, end, + (struct crush_bucket_straw2 *)b); + if (err < 0) + goto bad; + break; + } + } + + /* rules */ + dout("rule vec is %p\n", c->rules); + for (i = 0; i < c->max_rules; i++) { + u32 yes; + struct crush_rule *r; + + ceph_decode_32_safe(p, end, yes, bad); + if (!yes) { + dout("crush_decode NO rule %d off %x %p to %p\n", + i, (int)(*p-start), *p, end); + c->rules[i] = NULL; + continue; + } + + dout("crush_decode rule %d off %x %p to %p\n", + i, (int)(*p-start), *p, end); + + /* len */ + ceph_decode_32_safe(p, end, yes, bad); +#if BITS_PER_LONG == 32 + err = -EINVAL; + if (yes > (ULONG_MAX - sizeof(*r)) + / sizeof(struct crush_rule_step)) + goto bad; +#endif + r = c->rules[i] = kmalloc(sizeof(*r) + + yes*sizeof(struct crush_rule_step), + GFP_NOFS); + if (r == NULL) + goto badmem; + dout(" rule %d is at %p\n", i, r); + r->len = yes; + ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */ + ceph_decode_need(p, end, r->len*3*sizeof(u32), bad); + for (j = 0; j < r->len; j++) { + r->steps[j].op = ceph_decode_32(p); + r->steps[j].arg1 = ceph_decode_32(p); + r->steps[j].arg2 = ceph_decode_32(p); + } + } + + /* ignore trailing name maps. 
*/ + for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) { + err = skip_name_map(p, end); + if (err < 0) + goto done; + } + + /* tunables */ + ceph_decode_need(p, end, 3*sizeof(u32), done); + c->choose_local_tries = ceph_decode_32(p); + c->choose_local_fallback_tries = ceph_decode_32(p); + c->choose_total_tries = ceph_decode_32(p); + dout("crush decode tunable choose_local_tries = %d", + c->choose_local_tries); + dout("crush decode tunable choose_local_fallback_tries = %d", + c->choose_local_fallback_tries); + dout("crush decode tunable choose_total_tries = %d", + c->choose_total_tries); + + ceph_decode_need(p, end, sizeof(u32), done); + c->chooseleaf_descend_once = ceph_decode_32(p); + dout("crush decode tunable chooseleaf_descend_once = %d", + c->chooseleaf_descend_once); + + ceph_decode_need(p, end, sizeof(u8), done); + c->chooseleaf_vary_r = ceph_decode_8(p); + dout("crush decode tunable chooseleaf_vary_r = %d", + c->chooseleaf_vary_r); + +done: + dout("crush_decode success\n"); + return c; + +badmem: + err = -ENOMEM; +bad: + dout("crush_decode fail %d\n", err); + crush_destroy(c); + return ERR_PTR(err); +} + +/* + * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid + * to a set of osds) and primary_temp (explicit primary setting) + */ +static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) +{ + if (l.pool < r.pool) + return -1; + if (l.pool > r.pool) + return 1; + if (l.seed < r.seed) + return -1; + if (l.seed > r.seed) + return 1; + return 0; +} + +static int __insert_pg_mapping(struct ceph_pg_mapping *new, + struct rb_root *root) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct ceph_pg_mapping *pg = NULL; + int c; + + dout("__insert_pg_mapping %llx %p\n", *(u64 *)&new->pgid, new); + while (*p) { + parent = *p; + pg = rb_entry(parent, struct ceph_pg_mapping, node); + c = pgid_cmp(new->pgid, pg->pgid); + if (c < 0) + p = &(*p)->rb_left; + else if (c > 0) + p = &(*p)->rb_right; + else + return 
-EEXIST; + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, root); + return 0; +} + +static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, + struct ceph_pg pgid) +{ + struct rb_node *n = root->rb_node; + struct ceph_pg_mapping *pg; + int c; + + while (n) { + pg = rb_entry(n, struct ceph_pg_mapping, node); + c = pgid_cmp(pgid, pg->pgid); + if (c < 0) { + n = n->rb_left; + } else if (c > 0) { + n = n->rb_right; + } else { + dout("__lookup_pg_mapping %lld.%x got %p\n", + pgid.pool, pgid.seed, pg); + return pg; + } + } + return NULL; +} + +static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid) +{ + struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid); + + if (pg) { + dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed, + pg); + rb_erase(&pg->node, root); + kfree(pg); + return 0; + } + dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed); + return -ENOENT; +} + +/* + * rbtree of pg pool info + */ +static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct ceph_pg_pool_info *pi = NULL; + + while (*p) { + parent = *p; + pi = rb_entry(parent, struct ceph_pg_pool_info, node); + if (new->id < pi->id) + p = &(*p)->rb_left; + else if (new->id > pi->id) + p = &(*p)->rb_right; + else + return -EEXIST; + } + + rb_link_node(&new->node, parent, p); + rb_insert_color(&new->node, root); + return 0; +} + +static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, u64 id) +{ + struct ceph_pg_pool_info *pi; + struct rb_node *n = root->rb_node; + + while (n) { + pi = rb_entry(n, struct ceph_pg_pool_info, node); + if (id < pi->id) + n = n->rb_left; + else if (id > pi->id) + n = n->rb_right; + else + return pi; + } + return NULL; +} + +struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id) +{ + return __lookup_pg_pool(&map->pg_pools, id); +} + +const char 
*ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) +{ + struct ceph_pg_pool_info *pi; + + if (id == CEPH_NOPOOL) + return NULL; + + if (WARN_ON_ONCE(id > (u64) INT_MAX)) + return NULL; + + pi = __lookup_pg_pool(&map->pg_pools, (int) id); + + return pi ? pi->name : NULL; +} +EXPORT_SYMBOL(ceph_pg_pool_name_by_id); + +int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name) +{ + struct rb_node *rbp; + + for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) { + struct ceph_pg_pool_info *pi = + rb_entry(rbp, struct ceph_pg_pool_info, node); + if (pi->name && strcmp(pi->name, name) == 0) + return pi->id; + } + return -ENOENT; +} +EXPORT_SYMBOL(ceph_pg_poolid_by_name); + +static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) +{ + rb_erase(&pi->node, root); + kfree(pi->name); + kfree(pi); +} + +static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) +{ + u8 ev, cv; + unsigned len, num; + void *pool_end; + + ceph_decode_need(p, end, 2 + 4, bad); + ev = ceph_decode_8(p); /* encoding version */ + cv = ceph_decode_8(p); /* compat version */ + if (ev < 5) { + pr_warn("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv); + return -EINVAL; + } + if (cv > 9) { + pr_warn("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv); + return -EINVAL; + } + len = ceph_decode_32(p); + ceph_decode_need(p, end, len, bad); + pool_end = *p + len; + + pi->type = ceph_decode_8(p); + pi->size = ceph_decode_8(p); + pi->crush_ruleset = ceph_decode_8(p); + pi->object_hash = ceph_decode_8(p); + + pi->pg_num = ceph_decode_32(p); + pi->pgp_num = ceph_decode_32(p); + + *p += 4 + 4; /* skip lpg* */ + *p += 4; /* skip last_change */ + *p += 8 + 4; /* skip snap_seq, snap_epoch */ + + /* skip snaps */ + num = ceph_decode_32(p); + while (num--) { + *p += 8; /* snapid key */ + *p += 1 + 1; /* versions */ + len = ceph_decode_32(p); + *p += len; + } + + /* skip removed_snaps */ + num = ceph_decode_32(p); + *p += num * (8 + 8); + + *p += 8; /* skip 
auid */ + pi->flags = ceph_decode_64(p); + *p += 4; /* skip crash_replay_interval */ + + if (ev >= 7) + *p += 1; /* skip min_size */ + + if (ev >= 8) + *p += 8 + 8; /* skip quota_max_* */ + + if (ev >= 9) { + /* skip tiers */ + num = ceph_decode_32(p); + *p += num * 8; + + *p += 8; /* skip tier_of */ + *p += 1; /* skip cache_mode */ + + pi->read_tier = ceph_decode_64(p); + pi->write_tier = ceph_decode_64(p); + } else { + pi->read_tier = -1; + pi->write_tier = -1; + } + + /* ignore the rest */ + + *p = pool_end; + calc_pg_masks(pi); + return 0; + +bad: + return -EINVAL; +} + +static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map) +{ + struct ceph_pg_pool_info *pi; + u32 num, len; + u64 pool; + + ceph_decode_32_safe(p, end, num, bad); + dout(" %d pool names\n", num); + while (num--) { + ceph_decode_64_safe(p, end, pool, bad); + ceph_decode_32_safe(p, end, len, bad); + dout(" pool %llu len %d\n", pool, len); + ceph_decode_need(p, end, len, bad); + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (pi) { + char *name = kstrndup(*p, len, GFP_NOFS); + + if (!name) + return -ENOMEM; + kfree(pi->name); + pi->name = name; + dout(" name is %s\n", pi->name); + } + *p += len; + } + return 0; + +bad: + return -EINVAL; +} + +/* + * osd map + */ +void ceph_osdmap_destroy(struct ceph_osdmap *map) +{ + dout("osdmap_destroy %p\n", map); + if (map->crush) + crush_destroy(map->crush); + while (!RB_EMPTY_ROOT(&map->pg_temp)) { + struct ceph_pg_mapping *pg = + rb_entry(rb_first(&map->pg_temp), + struct ceph_pg_mapping, node); + rb_erase(&pg->node, &map->pg_temp); + kfree(pg); + } + while (!RB_EMPTY_ROOT(&map->primary_temp)) { + struct ceph_pg_mapping *pg = + rb_entry(rb_first(&map->primary_temp), + struct ceph_pg_mapping, node); + rb_erase(&pg->node, &map->primary_temp); + kfree(pg); + } + while (!RB_EMPTY_ROOT(&map->pg_pools)) { + struct ceph_pg_pool_info *pi = + rb_entry(rb_first(&map->pg_pools), + struct ceph_pg_pool_info, node); + __remove_pg_pool(&map->pg_pools, 
pi); + } + kfree(map->osd_state); + kfree(map->osd_weight); + kfree(map->osd_addr); + kfree(map->osd_primary_affinity); + kfree(map); +} + +/* + * Adjust max_osd value, (re)allocate arrays. + * + * The new elements are properly initialized. + */ +static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) +{ + u8 *state; + u32 *weight; + struct ceph_entity_addr *addr; + int i; + + state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); + if (!state) + return -ENOMEM; + map->osd_state = state; + + weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); + if (!weight) + return -ENOMEM; + map->osd_weight = weight; + + addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); + if (!addr) + return -ENOMEM; + map->osd_addr = addr; + + for (i = map->max_osd; i < max; i++) { + map->osd_state[i] = 0; + map->osd_weight[i] = CEPH_OSD_OUT; + memset(map->osd_addr + i, 0, sizeof(*map->osd_addr)); + } + + if (map->osd_primary_affinity) { + u32 *affinity; + + affinity = krealloc(map->osd_primary_affinity, + max*sizeof(*affinity), GFP_NOFS); + if (!affinity) + return -ENOMEM; + map->osd_primary_affinity = affinity; + + for (i = map->max_osd; i < max; i++) + map->osd_primary_affinity[i] = + CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + } + + map->max_osd = max; + + return 0; +} + +#define OSDMAP_WRAPPER_COMPAT_VER 7 +#define OSDMAP_CLIENT_DATA_COMPAT_VER 1 + +/* + * Return 0 or error. On success, *v is set to 0 for old (v6) osdmaps, + * to struct_v of the client_data section for new (v7 and above) + * osdmaps. 
+ */ +static int get_osdmap_client_data_v(void **p, void *end, + const char *prefix, u8 *v) +{ + u8 struct_v; + + ceph_decode_8_safe(p, end, struct_v, e_inval); + if (struct_v >= 7) { + u8 struct_compat; + + ceph_decode_8_safe(p, end, struct_compat, e_inval); + if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) { + pr_warn("got v %d cv %d > %d of %s ceph_osdmap\n", + struct_v, struct_compat, + OSDMAP_WRAPPER_COMPAT_VER, prefix); + return -EINVAL; + } + *p += 4; /* ignore wrapper struct_len */ + + ceph_decode_8_safe(p, end, struct_v, e_inval); + ceph_decode_8_safe(p, end, struct_compat, e_inval); + if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) { + pr_warn("got v %d cv %d > %d of %s ceph_osdmap client data\n", + struct_v, struct_compat, + OSDMAP_CLIENT_DATA_COMPAT_VER, prefix); + return -EINVAL; + } + *p += 4; /* ignore client data struct_len */ + } else { + u16 version; + + *p -= 1; + ceph_decode_16_safe(p, end, version, e_inval); + if (version < 6) { + pr_warn("got v %d < 6 of %s ceph_osdmap\n", + version, prefix); + return -EINVAL; + } + + /* old osdmap enconding */ + struct_v = 0; + } + + *v = struct_v; + return 0; + +e_inval: + return -EINVAL; +} + +static int __decode_pools(void **p, void *end, struct ceph_osdmap *map, + bool incremental) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + struct ceph_pg_pool_info *pi; + u64 pool; + int ret; + + ceph_decode_64_safe(p, end, pool, e_inval); + + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (!incremental || !pi) { + pi = kzalloc(sizeof(*pi), GFP_NOFS); + if (!pi) + return -ENOMEM; + + pi->id = pool; + + ret = __insert_pg_pool(&map->pg_pools, pi); + if (ret) { + kfree(pi); + return ret; + } + } + + ret = decode_pool(p, end, pi); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_pools(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pools(p, end, map, false); +} + +static int decode_new_pools(void **p, void *end, struct 
ceph_osdmap *map) +{ + return __decode_pools(p, end, map, true); +} + +static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map, + bool incremental) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + struct ceph_pg pgid; + u32 len, i; + int ret; + + ret = ceph_decode_pgid(p, end, &pgid); + if (ret) + return ret; + + ceph_decode_32_safe(p, end, len, e_inval); + + ret = __remove_pg_mapping(&map->pg_temp, pgid); + BUG_ON(!incremental && ret != -ENOENT); + + if (!incremental || len > 0) { + struct ceph_pg_mapping *pg; + + ceph_decode_need(p, end, len*sizeof(u32), e_inval); + + if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) + return -EINVAL; + + pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS); + if (!pg) + return -ENOMEM; + + pg->pgid = pgid; + pg->pg_temp.len = len; + for (i = 0; i < len; i++) + pg->pg_temp.osds[i] = ceph_decode_32(p); + + ret = __insert_pg_mapping(pg, &map->pg_temp); + if (ret) { + kfree(pg); + return ret; + } + } + } + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pg_temp(p, end, map, false); +} + +static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_pg_temp(p, end, map, true); +} + +static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map, + bool incremental) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + struct ceph_pg pgid; + u32 osd; + int ret; + + ret = ceph_decode_pgid(p, end, &pgid); + if (ret) + return ret; + + ceph_decode_32_safe(p, end, osd, e_inval); + + ret = __remove_pg_mapping(&map->primary_temp, pgid); + BUG_ON(!incremental && ret != -ENOENT); + + if (!incremental || osd != (u32)-1) { + struct ceph_pg_mapping *pg; + + pg = kzalloc(sizeof(*pg), GFP_NOFS); + if (!pg) + return -ENOMEM; + + pg->pgid = pgid; + pg->primary_temp.osd = osd; + + ret = __insert_pg_mapping(pg, &map->primary_temp); + if (ret) { + 
kfree(pg); + return ret; + } + } + } + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map) +{ + return __decode_primary_temp(p, end, map, false); +} + +static int decode_new_primary_temp(void **p, void *end, + struct ceph_osdmap *map) +{ + return __decode_primary_temp(p, end, map, true); +} + +u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) +{ + BUG_ON(osd >= map->max_osd); + + if (!map->osd_primary_affinity) + return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + + return map->osd_primary_affinity[osd]; +} + +static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) +{ + BUG_ON(osd >= map->max_osd); + + if (!map->osd_primary_affinity) { + int i; + + map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32), + GFP_NOFS); + if (!map->osd_primary_affinity) + return -ENOMEM; + + for (i = 0; i < map->max_osd; i++) + map->osd_primary_affinity[i] = + CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + } + + map->osd_primary_affinity[osd] = aff; + + return 0; +} + +static int decode_primary_affinity(void **p, void *end, + struct ceph_osdmap *map) +{ + u32 len, i; + + ceph_decode_32_safe(p, end, len, e_inval); + if (len == 0) { + kfree(map->osd_primary_affinity); + map->osd_primary_affinity = NULL; + return 0; + } + if (len != map->max_osd) + goto e_inval; + + ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval); + + for (i = 0; i < map->max_osd; i++) { + int ret; + + ret = set_primary_affinity(map, i, ceph_decode_32(p)); + if (ret) + return ret; + } + + return 0; + +e_inval: + return -EINVAL; +} + +static int decode_new_primary_affinity(void **p, void *end, + struct ceph_osdmap *map) +{ + u32 n; + + ceph_decode_32_safe(p, end, n, e_inval); + while (n--) { + u32 osd, aff; + int ret; + + ceph_decode_32_safe(p, end, osd, e_inval); + ceph_decode_32_safe(p, end, aff, e_inval); + + ret = set_primary_affinity(map, osd, aff); + if (ret) + return ret; + + pr_info("osd%d primary-affinity 
0x%x\n", osd, aff); + } + + return 0; + +e_inval: + return -EINVAL; +} + +/* + * decode a full map. + */ +static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) +{ + u8 struct_v; + u32 epoch = 0; + void *start = *p; + u32 max; + u32 len, i; + int err; + + dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); + + err = get_osdmap_client_data_v(p, end, "full", &struct_v); + if (err) + goto bad; + + /* fsid, epoch, created, modified */ + ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) + + sizeof(map->created) + sizeof(map->modified), e_inval); + ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); + epoch = map->epoch = ceph_decode_32(p); + ceph_decode_copy(p, &map->created, sizeof(map->created)); + ceph_decode_copy(p, &map->modified, sizeof(map->modified)); + + /* pools */ + err = decode_pools(p, end, map); + if (err) + goto bad; + + /* pool_name */ + err = decode_pool_names(p, end, map); + if (err) + goto bad; + + ceph_decode_32_safe(p, end, map->pool_max, e_inval); + + ceph_decode_32_safe(p, end, map->flags, e_inval); + + /* max_osd */ + ceph_decode_32_safe(p, end, max, e_inval); + + /* (re)alloc osd arrays */ + err = osdmap_set_max_osd(map, max); + if (err) + goto bad; + + /* osd_state, osd_weight, osd_addrs->client_addr */ + ceph_decode_need(p, end, 3*sizeof(u32) + + map->max_osd*(1 + sizeof(*map->osd_weight) + + sizeof(*map->osd_addr)), e_inval); + + if (ceph_decode_32(p) != map->max_osd) + goto e_inval; + + ceph_decode_copy(p, map->osd_state, map->max_osd); + + if (ceph_decode_32(p) != map->max_osd) + goto e_inval; + + for (i = 0; i < map->max_osd; i++) + map->osd_weight[i] = ceph_decode_32(p); + + if (ceph_decode_32(p) != map->max_osd) + goto e_inval; + + ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); + for (i = 0; i < map->max_osd; i++) + ceph_decode_addr(&map->osd_addr[i]); + + /* pg_temp */ + err = decode_pg_temp(p, end, map); + if (err) + goto bad; + + /* primary_temp */ + if (struct_v >= 1) { + 
err = decode_primary_temp(p, end, map); + if (err) + goto bad; + } + + /* primary_affinity */ + if (struct_v >= 2) { + err = decode_primary_affinity(p, end, map); + if (err) + goto bad; + } else { + /* XXX can this happen? */ + kfree(map->osd_primary_affinity); + map->osd_primary_affinity = NULL; + } + + /* crush */ + ceph_decode_32_safe(p, end, len, e_inval); + map->crush = crush_decode(*p, min(*p + len, end)); + if (IS_ERR(map->crush)) { + err = PTR_ERR(map->crush); + map->crush = NULL; + goto bad; + } + *p += len; + + /* ignore the rest */ + *p = end; + + dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); + return 0; + +e_inval: + err = -EINVAL; +bad: + pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n", + err, epoch, (int)(*p - start), *p, start, end); + print_hex_dump(KERN_DEBUG, "osdmap: ", + DUMP_PREFIX_OFFSET, 16, 1, + start, end - start, true); + return err; +} + +/* + * Allocate and decode a full map. + */ +struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) +{ + struct ceph_osdmap *map; + int ret; + + map = kzalloc(sizeof(*map), GFP_NOFS); + if (!map) + return ERR_PTR(-ENOMEM); + + map->pg_temp = RB_ROOT; + map->primary_temp = RB_ROOT; + mutex_init(&map->crush_scratch_mutex); + + ret = osdmap_decode(p, end, map); + if (ret) { + ceph_osdmap_destroy(map); + return ERR_PTR(ret); + } + + return map; +} + +/* + * decode and apply an incremental map update. 
+ */ +struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, + struct ceph_osdmap *map, + struct ceph_messenger *msgr) +{ + struct crush_map *newcrush = NULL; + struct ceph_fsid fsid; + u32 epoch = 0; + struct ceph_timespec modified; + s32 len; + u64 pool; + __s64 new_pool_max; + __s32 new_flags, max; + void *start = *p; + int err; + u8 struct_v; + + dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); + + err = get_osdmap_client_data_v(p, end, "inc", &struct_v); + if (err) + goto bad; + + /* fsid, epoch, modified, new_pool_max, new_flags */ + ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) + + sizeof(u64) + sizeof(u32), e_inval); + ceph_decode_copy(p, &fsid, sizeof(fsid)); + epoch = ceph_decode_32(p); + BUG_ON(epoch != map->epoch+1); + ceph_decode_copy(p, &modified, sizeof(modified)); + new_pool_max = ceph_decode_64(p); + new_flags = ceph_decode_32(p); + + /* full map? */ + ceph_decode_32_safe(p, end, len, e_inval); + if (len > 0) { + dout("apply_incremental full map len %d, %p to %p\n", + len, *p, end); + return ceph_osdmap_decode(p, min(*p+len, end)); + } + + /* new crush? */ + ceph_decode_32_safe(p, end, len, e_inval); + if (len > 0) { + newcrush = crush_decode(*p, min(*p+len, end)); + if (IS_ERR(newcrush)) { + err = PTR_ERR(newcrush); + newcrush = NULL; + goto bad; + } + *p += len; + } + + /* new flags? */ + if (new_flags >= 0) + map->flags = new_flags; + if (new_pool_max >= 0) + map->pool_max = new_pool_max; + + /* new max? 
*/ + ceph_decode_32_safe(p, end, max, e_inval); + if (max >= 0) { + err = osdmap_set_max_osd(map, max); + if (err) + goto bad; + } + + map->epoch++; + map->modified = modified; + if (newcrush) { + if (map->crush) + crush_destroy(map->crush); + map->crush = newcrush; + newcrush = NULL; + } + + /* new_pools */ + err = decode_new_pools(p, end, map); + if (err) + goto bad; + + /* new_pool_names */ + err = decode_pool_names(p, end, map); + if (err) + goto bad; + + /* old_pool */ + ceph_decode_32_safe(p, end, len, e_inval); + while (len--) { + struct ceph_pg_pool_info *pi; + + ceph_decode_64_safe(p, end, pool, e_inval); + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (pi) + __remove_pg_pool(&map->pg_pools, pi); + } + + /* new_up */ + ceph_decode_32_safe(p, end, len, e_inval); + while (len--) { + u32 osd; + struct ceph_entity_addr addr; + ceph_decode_32_safe(p, end, osd, e_inval); + ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval); + ceph_decode_addr(&addr); + pr_info("osd%d up\n", osd); + BUG_ON(osd >= map->max_osd); + map->osd_state[osd] |= CEPH_OSD_UP; + map->osd_addr[osd] = addr; + } + + /* new_state */ + ceph_decode_32_safe(p, end, len, e_inval); + while (len--) { + u32 osd; + u8 xorstate; + ceph_decode_32_safe(p, end, osd, e_inval); + xorstate = **(u8 **)p; + (*p)++; /* clean flag */ + if (xorstate == 0) + xorstate = CEPH_OSD_UP; + if (xorstate & CEPH_OSD_UP) + pr_info("osd%d down\n", osd); + if (osd < map->max_osd) + map->osd_state[osd] ^= xorstate; + } + + /* new_weight */ + ceph_decode_32_safe(p, end, len, e_inval); + while (len--) { + u32 osd, off; + ceph_decode_need(p, end, sizeof(u32)*2, e_inval); + osd = ceph_decode_32(p); + off = ceph_decode_32(p); + pr_info("osd%d weight 0x%x %s\n", osd, off, + off == CEPH_OSD_IN ? "(in)" : + (off == CEPH_OSD_OUT ? 
"(out)" : "")); + if (osd < map->max_osd) + map->osd_weight[osd] = off; + } + + /* new_pg_temp */ + err = decode_new_pg_temp(p, end, map); + if (err) + goto bad; + + /* new_primary_temp */ + if (struct_v >= 1) { + err = decode_new_primary_temp(p, end, map); + if (err) + goto bad; + } + + /* new_primary_affinity */ + if (struct_v >= 2) { + err = decode_new_primary_affinity(p, end, map); + if (err) + goto bad; + } + + /* ignore the rest */ + *p = end; + + dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); + return map; + +e_inval: + err = -EINVAL; +bad: + pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n", + err, epoch, (int)(*p - start), *p, start, end); + print_hex_dump(KERN_DEBUG, "osdmap: ", + DUMP_PREFIX_OFFSET, 16, 1, + start, end - start, true); + if (newcrush) + crush_destroy(newcrush); + return ERR_PTR(err); +} + + + + +/* + * calculate file layout from given offset, length. + * fill in correct oid, logical length, and object extent + * offset, length. + * + * for now, we write only a single su, until we can + * pass a stride back to the caller. 
+ */ +int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, + u64 off, u64 len, + u64 *ono, + u64 *oxoff, u64 *oxlen) +{ + u32 osize = le32_to_cpu(layout->fl_object_size); + u32 su = le32_to_cpu(layout->fl_stripe_unit); + u32 sc = le32_to_cpu(layout->fl_stripe_count); + u32 bl, stripeno, stripepos, objsetno; + u32 su_per_object; + u64 t, su_offset; + + dout("mapping %llu~%llu osize %u fl_su %u\n", off, len, + osize, su); + if (su == 0 || sc == 0) + goto invalid; + su_per_object = osize / su; + if (su_per_object == 0) + goto invalid; + dout("osize %u / su %u = su_per_object %u\n", osize, su, + su_per_object); + + if ((su & ~PAGE_MASK) != 0) + goto invalid; + + /* bl = *off / su; */ + t = off; + do_div(t, su); + bl = t; + dout("off %llu / su %u = bl %u\n", off, su, bl); + + stripeno = bl / sc; + stripepos = bl % sc; + objsetno = stripeno / su_per_object; + + *ono = objsetno * sc + stripepos; + dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned int)*ono); + + /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */ + t = off; + su_offset = do_div(t, su); + *oxoff = su_offset + (stripeno % su_per_object) * su; + + /* + * Calculate the length of the extent being written to the selected + * object. This is the minimum of the full length requested (len) or + * the remainder of the current stripe being written to. + */ + *oxlen = min_t(u64, len, su - su_offset); + + dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); + return 0; + +invalid: + dout(" invalid layout\n"); + *ono = 0; + *oxoff = 0; + *oxlen = 0; + return -EINVAL; +} +EXPORT_SYMBOL(ceph_calc_file_object_mapping); + +/* + * Calculate mapping of a (oloc, oid) pair to a PG. Should only be + * called with target's (oloc, oid), since tiering isn't taken into + * account. 
+ */ +int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap, + struct ceph_object_locator *oloc, + struct ceph_object_id *oid, + struct ceph_pg *pg_out) +{ + struct ceph_pg_pool_info *pi; + + pi = __lookup_pg_pool(&osdmap->pg_pools, oloc->pool); + if (!pi) + return -EIO; + + pg_out->pool = oloc->pool; + pg_out->seed = ceph_str_hash(pi->object_hash, oid->name, + oid->name_len); + + dout("%s '%.*s' pgid %llu.%x\n", __func__, oid->name_len, oid->name, + pg_out->pool, pg_out->seed); + return 0; +} +EXPORT_SYMBOL(ceph_oloc_oid_to_pg); + +static int do_crush(struct ceph_osdmap *map, int ruleno, int x, + int *result, int result_max, + const __u32 *weight, int weight_max) +{ + int r; + + BUG_ON(result_max > CEPH_PG_MAX_SIZE); + + mutex_lock(&map->crush_scratch_mutex); + r = crush_do_rule(map->crush, ruleno, x, result, result_max, + weight, weight_max, map->crush_scratch_ary); + mutex_unlock(&map->crush_scratch_mutex); + + return r; +} + +/* + * Calculate raw (crush) set for given pgid. + * + * Return raw set length, or error. + */ +static int pg_to_raw_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pool, + struct ceph_pg pgid, u32 pps, int *osds) +{ + int ruleno; + int len; + + /* crush */ + ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, + pool->type, pool->size); + if (ruleno < 0) { + pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", + pgid.pool, pool->crush_ruleset, pool->type, + pool->size); + return -ENOENT; + } + + len = do_crush(osdmap, ruleno, pps, osds, + min_t(int, pool->size, CEPH_PG_MAX_SIZE), + osdmap->osd_weight, osdmap->max_osd); + if (len < 0) { + pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", + len, ruleno, pgid.pool, pool->crush_ruleset, + pool->type, pool->size); + return len; + } + + return len; +} + +/* + * Given raw set, calculate up set and up primary. + * + * Return up set length. *primary is set to up primary osd id, or -1 + * if up set is empty. 
+ */ +static int raw_to_up_osds(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pool, + int *osds, int len, int *primary) +{ + int up_primary = -1; + int i; + + if (ceph_can_shift_osds(pool)) { + int removed = 0; + + for (i = 0; i < len; i++) { + if (ceph_osd_is_down(osdmap, osds[i])) { + removed++; + continue; + } + if (removed) + osds[i - removed] = osds[i]; + } + + len -= removed; + if (len > 0) + up_primary = osds[0]; + } else { + for (i = len - 1; i >= 0; i--) { + if (ceph_osd_is_down(osdmap, osds[i])) + osds[i] = CRUSH_ITEM_NONE; + else + up_primary = osds[i]; + } + } + + *primary = up_primary; + return len; +} + +static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps, + struct ceph_pg_pool_info *pool, + int *osds, int len, int *primary) +{ + int i; + int pos = -1; + + /* + * Do we have any non-default primary_affinity values for these + * osds? + */ + if (!osdmap->osd_primary_affinity) + return; + + for (i = 0; i < len; i++) { + int osd = osds[i]; + + if (osd != CRUSH_ITEM_NONE && + osdmap->osd_primary_affinity[osd] != + CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { + break; + } + } + if (i == len) + return; + + /* + * Pick the primary. Feed both the seed (for the pg) and the + * osd into the hash/rng so that a proportional fraction of an + * osd's pgs get rejected as primary. + */ + for (i = 0; i < len; i++) { + int osd = osds[i]; + u32 aff; + + if (osd == CRUSH_ITEM_NONE) + continue; + + aff = osdmap->osd_primary_affinity[osd]; + if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY && + (crush_hash32_2(CRUSH_HASH_RJENKINS1, + pps, osd) >> 16) >= aff) { + /* + * We chose not to use this primary. Note it + * anyway as a fallback in case we don't pick + * anyone else, but keep looking. 
+ */ + if (pos < 0) + pos = i; + } else { + pos = i; + break; + } + } + if (pos < 0) + return; + + *primary = osds[pos]; + + if (ceph_can_shift_osds(pool) && pos > 0) { + /* move the new primary to the front */ + for (i = pos; i > 0; i--) + osds[i] = osds[i - 1]; + osds[0] = *primary; + } +} + +/* + * Given up set, apply pg_temp and primary_temp mappings. + * + * Return acting set length. *primary is set to acting primary osd id, + * or -1 if acting set is empty. + */ +static int apply_temps(struct ceph_osdmap *osdmap, + struct ceph_pg_pool_info *pool, struct ceph_pg pgid, + int *osds, int len, int *primary) +{ + struct ceph_pg_mapping *pg; + int temp_len; + int temp_primary; + int i; + + /* raw_pg -> pg */ + pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, + pool->pg_num_mask); + + /* pg_temp? */ + pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); + if (pg) { + temp_len = 0; + temp_primary = -1; + + for (i = 0; i < pg->pg_temp.len; i++) { + if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { + if (ceph_can_shift_osds(pool)) + continue; + else + osds[temp_len++] = CRUSH_ITEM_NONE; + } else { + osds[temp_len++] = pg->pg_temp.osds[i]; + } + } + + /* apply pg_temp's primary */ + for (i = 0; i < temp_len; i++) { + if (osds[i] != CRUSH_ITEM_NONE) { + temp_primary = osds[i]; + break; + } + } + } else { + temp_len = len; + temp_primary = *primary; + } + + /* primary_temp? */ + pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); + if (pg) + temp_primary = pg->primary_temp.osd; + + *primary = temp_primary; + return temp_len; +} + +/* + * Calculate acting set for given pgid. + * + * Return acting set length, or error. *primary is set to acting + * primary osd id, or -1 if acting set is empty or on error. 
+ */ +int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, + int *osds, int *primary) +{ + struct ceph_pg_pool_info *pool; + u32 pps; + int len; + + pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); + if (!pool) { + *primary = -1; + return -ENOENT; + } + + if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { + /* hash pool id and seed so that pool PGs do not overlap */ + pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, + ceph_stable_mod(pgid.seed, pool->pgp_num, + pool->pgp_num_mask), + pgid.pool); + } else { + /* + * legacy behavior: add ps and pool together. this is + * not a great approach because the PGs from each pool + * will overlap on top of each other: 0.5 == 1.4 == + * 2.3 == ... + */ + pps = ceph_stable_mod(pgid.seed, pool->pgp_num, + pool->pgp_num_mask) + + (unsigned)pgid.pool; + } + + len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds); + if (len < 0) { + *primary = -1; + return len; + } + + len = raw_to_up_osds(osdmap, pool, osds, len, primary); + + apply_primary_affinity(osdmap, pps, pool, osds, len, primary); + + len = apply_temps(osdmap, pool, pgid, osds, len, primary); + + return len; +} + +/* + * Return primary osd for given pgid, or -1 if none. 
+ */ +int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) +{ + int osds[CEPH_PG_MAX_SIZE]; + int primary; + + ceph_calc_pg_acting(osdmap, pgid, osds, &primary); + + return primary; +} +EXPORT_SYMBOL(ceph_calc_pg_primary); diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c new file mode 100644 index 000000000..c7c220a73 --- /dev/null +++ b/net/ceph/pagelist.c @@ -0,0 +1,150 @@ +#include +#include +#include +#include +#include +#include + +static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) +{ + if (pl->mapped_tail) { + struct page *page = list_entry(pl->head.prev, struct page, lru); + kunmap(page); + pl->mapped_tail = NULL; + } +} + +void ceph_pagelist_release(struct ceph_pagelist *pl) +{ + if (!atomic_dec_and_test(&pl->refcnt)) + return; + ceph_pagelist_unmap_tail(pl); + while (!list_empty(&pl->head)) { + struct page *page = list_first_entry(&pl->head, struct page, + lru); + list_del(&page->lru); + __free_page(page); + } + ceph_pagelist_free_reserve(pl); + kfree(pl); +} +EXPORT_SYMBOL(ceph_pagelist_release); + +static int ceph_pagelist_addpage(struct ceph_pagelist *pl) +{ + struct page *page; + + if (!pl->num_pages_free) { + page = __page_cache_alloc(GFP_NOFS); + } else { + page = list_first_entry(&pl->free_list, struct page, lru); + list_del(&page->lru); + --pl->num_pages_free; + } + if (!page) + return -ENOMEM; + pl->room += PAGE_SIZE; + ceph_pagelist_unmap_tail(pl); + list_add_tail(&page->lru, &pl->head); + pl->mapped_tail = kmap(page); + return 0; +} + +int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len) +{ + while (pl->room < len) { + size_t bit = pl->room; + int ret; + + memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), + buf, bit); + pl->length += bit; + pl->room -= bit; + buf += bit; + len -= bit; + ret = ceph_pagelist_addpage(pl); + if (ret) + return ret; + } + + memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len); + pl->length += len; + pl->room -= len; + return 0; +} 
+EXPORT_SYMBOL(ceph_pagelist_append); + +/* Allocate enough pages for a pagelist to append the given amount + * of data without without allocating. + * Returns: 0 on success, -ENOMEM on error. + */ +int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space) +{ + if (space <= pl->room) + return 0; + space -= pl->room; + space = (space + PAGE_SIZE - 1) >> PAGE_SHIFT; /* conv to num pages */ + + while (space > pl->num_pages_free) { + struct page *page = __page_cache_alloc(GFP_NOFS); + if (!page) + return -ENOMEM; + list_add_tail(&page->lru, &pl->free_list); + ++pl->num_pages_free; + } + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_reserve); + +/* Free any pages that have been preallocated. */ +int ceph_pagelist_free_reserve(struct ceph_pagelist *pl) +{ + while (!list_empty(&pl->free_list)) { + struct page *page = list_first_entry(&pl->free_list, + struct page, lru); + list_del(&page->lru); + __free_page(page); + --pl->num_pages_free; + } + BUG_ON(pl->num_pages_free); + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_free_reserve); + +/* Create a truncation point. */ +void ceph_pagelist_set_cursor(struct ceph_pagelist *pl, + struct ceph_pagelist_cursor *c) +{ + c->pl = pl; + c->page_lru = pl->head.prev; + c->room = pl->room; +} +EXPORT_SYMBOL(ceph_pagelist_set_cursor); + +/* Truncate a pagelist to the given point. Move extra pages to reserve. + * This won't sleep. 
+ * Returns: 0 on success, + * -EINVAL if the pagelist doesn't match the trunc point pagelist + */ +int ceph_pagelist_truncate(struct ceph_pagelist *pl, + struct ceph_pagelist_cursor *c) +{ + struct page *page; + + if (pl != c->pl) + return -EINVAL; + ceph_pagelist_unmap_tail(pl); + while (pl->head.prev != c->page_lru) { + page = list_entry(pl->head.prev, struct page, lru); + /* move from pagelist to reserve */ + list_move_tail(&page->lru, &pl->free_list); + ++pl->num_pages_free; + } + pl->room = c->room; + if (!list_empty(&pl->head)) { + page = list_entry(pl->head.prev, struct page, lru); + pl->mapped_tail = kmap(page); + } + return 0; +} +EXPORT_SYMBOL(ceph_pagelist_truncate); diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c new file mode 100644 index 000000000..096d91447 --- /dev/null +++ b/net/ceph/pagevec.c @@ -0,0 +1,202 @@ +#include + +#include +#include +#include +#include +#include +#include + +#include + +/* + * build a vector of user pages + */ +struct page **ceph_get_direct_page_vector(const void __user *data, + int num_pages, bool write_page) +{ + struct page **pages; + int got = 0; + int rc = 0; + + pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS); + if (!pages) + return ERR_PTR(-ENOMEM); + + while (got < num_pages) { + rc = get_user_pages_unlocked(current, current->mm, + (unsigned long)data + ((unsigned long)got * PAGE_SIZE), + num_pages - got, write_page, 0, pages + got); + if (rc < 0) + break; + BUG_ON(rc == 0); + got += rc; + } + if (rc < 0) + goto fail; + return pages; + +fail: + ceph_put_page_vector(pages, got, false); + return ERR_PTR(rc); +} +EXPORT_SYMBOL(ceph_get_direct_page_vector); + +void ceph_put_page_vector(struct page **pages, int num_pages, bool dirty) +{ + int i; + + for (i = 0; i < num_pages; i++) { + if (dirty) + set_page_dirty_lock(pages[i]); + put_page(pages[i]); + } + if (is_vmalloc_addr(pages)) + vfree(pages); + else + kfree(pages); +} +EXPORT_SYMBOL(ceph_put_page_vector); + +void ceph_release_page_vector(struct page 
**pages, int num_pages) +{ + int i; + + for (i = 0; i < num_pages; i++) + __free_pages(pages[i], 0); + kfree(pages); +} +EXPORT_SYMBOL(ceph_release_page_vector); + +/* + * allocate a vector new pages + */ +struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) +{ + struct page **pages; + int i; + + pages = kmalloc(sizeof(*pages) * num_pages, flags); + if (!pages) + return ERR_PTR(-ENOMEM); + for (i = 0; i < num_pages; i++) { + pages[i] = __page_cache_alloc(flags); + if (pages[i] == NULL) { + ceph_release_page_vector(pages, i); + return ERR_PTR(-ENOMEM); + } + } + return pages; +} +EXPORT_SYMBOL(ceph_alloc_page_vector); + +/* + * copy user data into a page vector + */ +int ceph_copy_user_to_page_vector(struct page **pages, + const void __user *data, + loff_t off, size_t len) +{ + int i = 0; + int po = off & ~PAGE_CACHE_MASK; + int left = len; + int l, bad; + + while (left > 0) { + l = min_t(int, PAGE_CACHE_SIZE-po, left); + bad = copy_from_user(page_address(pages[i]) + po, data, l); + if (bad == l) + return -EFAULT; + data += l - bad; + left -= l - bad; + po += l - bad; + if (po == PAGE_CACHE_SIZE) { + po = 0; + i++; + } + } + return len; +} +EXPORT_SYMBOL(ceph_copy_user_to_page_vector); + +void ceph_copy_to_page_vector(struct page **pages, + const void *data, + loff_t off, size_t len) +{ + int i = 0; + size_t po = off & ~PAGE_CACHE_MASK; + size_t left = len; + + while (left > 0) { + size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + + memcpy(page_address(pages[i]) + po, data, l); + data += l; + left -= l; + po += l; + if (po == PAGE_CACHE_SIZE) { + po = 0; + i++; + } + } +} +EXPORT_SYMBOL(ceph_copy_to_page_vector); + +void ceph_copy_from_page_vector(struct page **pages, + void *data, + loff_t off, size_t len) +{ + int i = 0; + size_t po = off & ~PAGE_CACHE_MASK; + size_t left = len; + + while (left > 0) { + size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left); + + memcpy(data, page_address(pages[i]) + po, l); + data += l; + left -= l; + po += l; + if (po 
== PAGE_CACHE_SIZE) { + po = 0; + i++; + } + } +} +EXPORT_SYMBOL(ceph_copy_from_page_vector); + +/* + * Zero an extent within a page vector. Offset is relative to the + * start of the first page. + */ +void ceph_zero_page_vector_range(int off, int len, struct page **pages) +{ + int i = off >> PAGE_CACHE_SHIFT; + + off &= ~PAGE_CACHE_MASK; + + dout("zero_page_vector_page %u~%u\n", off, len); + + /* leading partial page? */ + if (off) { + int end = min((int)PAGE_CACHE_SIZE, off + len); + dout("zeroing %d %p head from %d\n", i, pages[i], + (int)off); + zero_user_segment(pages[i], off, end); + len -= (end - off); + i++; + } + while (len >= PAGE_CACHE_SIZE) { + dout("zeroing %d %p len=%d\n", i, pages[i], len); + zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE); + len -= PAGE_CACHE_SIZE; + i++; + } + /* trailing partial page? */ + if (len) { + dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len); + zero_user_segment(pages[i], 0, len); + } +} +EXPORT_SYMBOL(ceph_zero_page_vector_range); + diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c new file mode 100644 index 000000000..154683f5f --- /dev/null +++ b/net/ceph/snapshot.c @@ -0,0 +1,78 @@ +/* + * snapshot.c Ceph snapshot context utility routines (part of libceph) + * + * Copyright (C) 2013 Inktank Storage, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ */ + +#include + +#include +#include +#include + +/* + * Ceph snapshot contexts are reference counted objects, and the + * returned structure holds a single reference. Acquire additional + * references with ceph_get_snap_context(), and release them with + * ceph_put_snap_context(). When the reference count reaches zero + * the entire structure is freed. + */ + +/* + * Create a new ceph snapshot context large enough to hold the + * indicated number of snapshot ids (which can be 0). Caller has + * to fill in snapc->seq and snapc->snaps[0..snap_count-1]. + * + * Returns a null pointer if an error occurs. + */ +struct ceph_snap_context *ceph_create_snap_context(u32 snap_count, + gfp_t gfp_flags) +{ + struct ceph_snap_context *snapc; + size_t size; + + size = sizeof (struct ceph_snap_context); + size += snap_count * sizeof (snapc->snaps[0]); + snapc = kzalloc(size, gfp_flags); + if (!snapc) + return NULL; + + atomic_set(&snapc->nref, 1); + snapc->num_snaps = snap_count; + + return snapc; +} +EXPORT_SYMBOL(ceph_create_snap_context); + +struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc) +{ + if (sc) + atomic_inc(&sc->nref); + return sc; +} +EXPORT_SYMBOL(ceph_get_snap_context); + +void ceph_put_snap_context(struct ceph_snap_context *sc) +{ + if (!sc) + return; + if (atomic_dec_and_test(&sc->nref)) { + /*printk(" deleting snap_context %p\n", sc);*/ + kfree(sc); + } +} +EXPORT_SYMBOL(ceph_put_snap_context); -- cgit v1.2.3-54-g00ecf