diff options
author | Sage Weil | 2012-07-10 11:53:34 -0700 |
---|---|---|
committer | Sage Weil | 2012-07-30 09:29:52 -0700 |
commit | a16cb1f70799c851410d9dca0a24122e258df06c (patch) | |
tree | 0290805d3594d770da3937da7fa68841c1e294ac /net/ceph | |
parent | cd43045c2de60f40a0aea49bfb252a2eafe58f8c (diff) |
libceph: fix messenger retry
In ancient times, the messenger could both initiate and accept connections.
An artifact if that was data structures to store/process an incoming
ceph_msg_connect request and send an outgoing ceph_msg_connect_reply.
Sadly, the negotiation code was referencing those structures and ignoring
important information (like the peer's connect_seq) from the correct ones.
Among other things, this fixes tight reconnect loops where the server sends
RETRY_SESSION and we (the client) retries with the same connect_seq as last
time. This bug pretty easily triggered by injecting socket failures on the
MDS and running some fs workload like workunits/direct_io/test_sync_io.
Signed-off-by: Sage Weil <sage@inktank.com>
Diffstat (limited to 'net/ceph')
-rw-r--r-- | net/ceph/messenger.c | 12 |
1 files changed, 6 insertions, 6 deletions
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 09ada7924874..16814d1f4774 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1540,7 +1540,7 @@ static int process_connect(struct ceph_connection *con) * dropped messages. */ dout("process_connect got RESET peer seq %u\n", - le32_to_cpu(con->in_connect.connect_seq)); + le32_to_cpu(con->in_reply.connect_seq)); pr_err("%s%lld %s connection reset\n", ENTITY_NAME(con->peer_name), ceph_pr_addr(&con->peer_addr.in_addr)); @@ -1566,10 +1566,10 @@ static int process_connect(struct ceph_connection *con) * If we sent a smaller connect_seq than the peer has, try * again with a larger value. */ - dout("process_connect got RETRY my seq = %u, peer_seq = %u\n", + dout("process_connect got RETRY_SESSION my seq %u, peer %u\n", le32_to_cpu(con->out_connect.connect_seq), - le32_to_cpu(con->in_connect.connect_seq)); - con->connect_seq = le32_to_cpu(con->in_connect.connect_seq); + le32_to_cpu(con->in_reply.connect_seq)); + con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); ret = prepare_write_connect(con); if (ret < 0) return ret; @@ -1583,9 +1583,9 @@ static int process_connect(struct ceph_connection *con) */ dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n", con->peer_global_seq, - le32_to_cpu(con->in_connect.global_seq)); + le32_to_cpu(con->in_reply.global_seq)); get_global_seq(con->msgr, - le32_to_cpu(con->in_connect.global_seq)); + le32_to_cpu(con->in_reply.global_seq)); ret = prepare_write_connect(con); if (ret < 0) return ret; |