diff options
author | Linus Torvalds | 2018-06-06 18:39:49 -0700 |
---|---|---|
committer | Linus Torvalds | 2018-06-06 18:39:49 -0700 |
commit | 1c8c5a9d38f607c0b6fd12c91cbe1a4418762a21 (patch) | |
tree | dcc97181d4d187252e0cc8fdf29d9b365fa3ffd0 /net/smc | |
parent | 285767604576148fc1be7fcd112e4a90eb0d6ad2 (diff) | |
parent | 7170e6045a6a8b33f4fa5753589dc77b16198e2d (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:
1) Add Maglev hashing scheduler to IPVS, from Inju Song.
2) Lots of new TC subsystem tests from Roman Mashak.
3) Add TCP zero copy receive and fix delayed acks and autotuning with
SO_RCVLOWAT, from Eric Dumazet.
4) Add XDP_REDIRECT support to mlx5 driver, from Jesper Dangaard
Brouer.
5) Add ttl inherit support to vxlan, from Hangbin Liu.
6) Properly separate ipv6 routes into their logically independant
components. fib6_info for the routing table, and fib6_nh for sets of
nexthops, which thus can be shared. From David Ahern.
7) Add bpf_xdp_adjust_tail helper, which can be used to generate ICMP
messages from XDP programs. From Nikita V. Shirokov.
8) Lots of long overdue cleanups to the r8169 driver, from Heiner
Kallweit.
9) Add BTF ("BPF Type Format"), from Martin KaFai Lau.
10) Add traffic condition monitoring to iwlwifi, from Luca Coelho.
11) Plumb extack down into fib_rules, from Roopa Prabhu.
12) Add Flower classifier offload support to igb, from Vinicius Costa
Gomes.
13) Add UDP GSO support, from Willem de Bruijn.
14) Add documentation for eBPF helpers, from Quentin Monnet.
15) Add TLS tx offload to mlx5, from Ilya Lesokhin.
16) Allow applications to be given the number of bytes available to read
on a socket via a control message returned from recvmsg(), from
Soheil Hassas Yeganeh.
17) Add x86_32 eBPF JIT compiler, from Wang YanQing.
18) Add AF_XDP sockets, with zerocopy support infrastructure as well.
From Björn Töpel.
19) Remove indirect load support from all of the BPF JITs and handle
these operations in the verifier by translating them into native BPF
instead. From Daniel Borkmann.
20) Add GRO support to ipv6 gre tunnels, from Eran Ben Elisha.
21) Allow XDP programs to do lookups in the main kernel routing tables
for forwarding. From David Ahern.
22) Allow drivers to store hardware state into an ELF section of kernel
dump vmcore files, and use it in cxgb4. From Rahul Lakkireddy.
23) Various RACK and loss detection improvements in TCP, from Yuchung
Cheng.
24) Add TCP SACK compression, from Eric Dumazet.
25) Add User Mode Helper support and basic bpfilter infrastructure, from
Alexei Starovoitov.
26) Support ports and protocol values in RTM_GETROUTE, from Roopa
Prabhu.
27) Support bulking in ->ndo_xdp_xmit() API, from Jesper Dangaard
Brouer.
28) Add lots of forwarding selftests, from Petr Machata.
29) Add generic network device failover driver, from Sridhar Samudrala.
* ra.kernel.org:/pub/scm/linux/kernel/git/davem/net-next: (1959 commits)
strparser: Add __strp_unpause and use it in ktls.
rxrpc: Fix terminal retransmission connection ID to include the channel
net: hns3: Optimize PF CMDQ interrupt switching process
net: hns3: Fix for VF mailbox receiving unknown message
net: hns3: Fix for VF mailbox cannot receiving PF response
bnx2x: use the right constant
Revert "net: sched: cls: Fix offloading when ingress dev is vxlan"
net: dsa: b53: Fix for brcm tag issue in Cygnus SoC
enic: fix UDP rss bits
netdev-FAQ: clarify DaveM's position for stable backports
rtnetlink: validate attributes in do_setlink()
mlxsw: Add extack messages for port_{un, }split failures
netdevsim: Add extack error message for devlink reload
devlink: Add extack to reload and port_{un, }split operations
net: metrics: add proper netlink validation
ipmr: fix error path when ipmr_new_table fails
ip6mr: only set ip6mr_table from setsockopt when ip6mr_new_table succeeds
net: hns3: remove unused hclgevf_cfg_func_mta_filter
netfilter: provide udp*_lib_lookup for nf_tproxy
qed*: Utilize FW 8.37.2.0
...
Diffstat (limited to 'net/smc')
-rw-r--r-- | net/smc/af_smc.c | 803 | ||||
-rw-r--r-- | net/smc/smc.h | 68 | ||||
-rw-r--r-- | net/smc/smc_cdc.c | 101 | ||||
-rw-r--r-- | net/smc/smc_cdc.h | 15 | ||||
-rw-r--r-- | net/smc/smc_clc.c | 6 | ||||
-rw-r--r-- | net/smc/smc_clc.h | 2 | ||||
-rw-r--r-- | net/smc/smc_core.c | 199 | ||||
-rw-r--r-- | net/smc/smc_core.h | 29 | ||||
-rw-r--r-- | net/smc/smc_diag.c | 44 | ||||
-rw-r--r-- | net/smc/smc_ib.c | 13 | ||||
-rw-r--r-- | net/smc/smc_llc.c | 242 | ||||
-rw-r--r-- | net/smc/smc_llc.h | 8 | ||||
-rw-r--r-- | net/smc/smc_rx.c | 308 | ||||
-rw-r--r-- | net/smc/smc_rx.h | 11 | ||||
-rw-r--r-- | net/smc/smc_tx.c | 111 | ||||
-rw-r--r-- | net/smc/smc_tx.h | 5 | ||||
-rw-r--r-- | net/smc/smc_wr.c | 1 |
17 files changed, 1406 insertions, 560 deletions
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 544bab42f925..973b4471b532 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -8,8 +8,6 @@ * * Initial restrictions: * - support for alternate links postponed - * - partial support for non-blocking sockets only - * - support for urgent data postponed * * Copyright IBM Corp. 2016, 2018 * @@ -29,6 +27,7 @@ #include <net/sock.h> #include <net/tcp.h> #include <net/smc.h> +#include <asm/ioctls.h> #include "smc.h" #include "smc_clc.h" @@ -45,11 +44,6 @@ static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group * creation */ -struct smc_lgr_list smc_lgr_list = { /* established link groups */ - .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), - .list = LIST_HEAD_INIT(smc_lgr_list.list), -}; - static void smc_tcp_listen_work(struct work_struct *); static void smc_set_keepalive(struct sock *sk, int val) @@ -192,8 +186,10 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_protocol = protocol; smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); + INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); INIT_LIST_HEAD(&smc->accept_q); spin_lock_init(&smc->accept_q_lock); + spin_lock_init(&smc->conn.send_lock); sk->sk_prot->hash(sk); sk_refcnt_debug_inc(sk); @@ -292,19 +288,28 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } -/* register a new rmb */ -static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc) +/* register a new rmb, optionally send confirm_rkey msg to register with peer */ +static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, + bool conf_rkey) { /* register memory region for new rmb */ if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) { rmb_desc->regerr = 1; return -EFAULT; } + if (!conf_rkey) + return 0; + /* exchange confirm_rkey msg with peer */ + if (smc_llc_do_confirm_rkey(link, rmb_desc)) { + rmb_desc->regerr = 1; + return -EFAULT; + } return 0; } static int smc_clnt_conf_first_link(struct smc_sock *smc) { + struct net *net = sock_net(smc->clcsock->sk); struct smc_link_group *lgr = smc->conn.lgr; struct smc_link *link; int rest; @@ -332,7 +337,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) smc_wr_remember_qp_attr(link); - if (smc_reg_rmb(link, smc->conn.rmb_desc)) + if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_INTERR; /* send CONFIRM LINK response over RoCE fabric */ @@ -362,7 +367,7 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) if (rc < 0) return SMC_CLC_DECL_TCL; - link->state = SMC_LNK_ACTIVE; + smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); return 0; } @@ -370,10 +375,13 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) static void smc_conn_save_peer_info(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *clc) { - smc->conn.peer_conn_idx = clc->conn_idx; + int bufsize = smc_uncompress_bufsize(clc->rmbe_size); + + smc->conn.peer_rmbe_idx = clc->rmbe_idx; smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token); - smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size); + smc->conn.peer_rmbe_size = bufsize; atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size); + smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1); } static void smc_link_save_peer_info(struct smc_link *link, @@ -386,160 +394,186 @@ static void smc_link_save_peer_info(struct smc_link *link, link->peer_mtu = clc->qp_mtu; } -/* setup for RDMA connection of client */ -static int smc_connect_rdma(struct smc_sock *smc) +/* fall back during connect */ +static int smc_connect_fallback(struct smc_sock *smc) { - struct smc_clc_msg_accept_confirm aclc; - int local_contact = SMC_FIRST_CONTACT; - struct smc_ib_device *smcibdev; - struct smc_link *link; - u8 srv_first_contact; - int reason_code = 0; - int rc = 0; - u8 ibport; + smc->use_fallback = true; + smc_copy_sock_settings_to_clc(smc); + if (smc->sk.sk_state == SMC_INIT) + smc->sk.sk_state = SMC_ACTIVE; + return 0; +} - sock_hold(&smc->sk); /* sock put in passive closing */ +/* decline and fall back during connect */ +static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code) +{ + int rc; - if (!tcp_sk(smc->clcsock->sk)->syn_smc) { - /* peer has not signalled SMC-capability */ - smc->use_fallback = true; - goto out_connected; + if (reason_code < 0) /* error, fallback is not possible */ + return reason_code; + if (reason_code != SMC_CLC_DECL_REPLY) { + rc = smc_clc_send_decline(smc, reason_code); + if (rc < 0) + return rc; } + return smc_connect_fallback(smc); +} - /* IPSec connections opt out of SMC-R optimizations */ - if (using_ipsec(smc)) { - reason_code = SMC_CLC_DECL_IPSEC; - goto decline_rdma; - } +/* abort connecting */ +static int smc_connect_abort(struct smc_sock *smc, int reason_code, + int local_contact) +{ + if (local_contact == SMC_FIRST_CONTACT) + smc_lgr_forget(smc->conn.lgr); + mutex_unlock(&smc_create_lgr_pending); + smc_conn_free(&smc->conn); + if (reason_code < 0 && smc->sk.sk_state == SMC_INIT) + sock_put(&smc->sk); /* passive closing */ + return reason_code; +} + +/* check if there is a rdma device available for this connection. */ +/* called for connect and listen */ +static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev, + u8 *ibport) +{ + int reason_code = 0; /* PNET table look up: search active ib_device and port * within same PNETID that also contains the ethernet device * used for the internal TCP socket */ - smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport); - if (!smcibdev) { + smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport); + if (!(*ibdev)) reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ - goto decline_rdma; - } + + return reason_code; +} + +/* CLC handshake during connect */ +static int smc_connect_clc(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *aclc, + struct smc_ib_device *ibdev, u8 ibport) +{ + int rc = 0; /* do inband token exchange */ - reason_code = smc_clc_send_proposal(smc, smcibdev, ibport); - if (reason_code < 0) { - rc = reason_code; - goto out_err; - } - if (reason_code > 0) /* configuration error */ - goto decline_rdma; + rc = smc_clc_send_proposal(smc, ibdev, ibport); + if (rc) + return rc; /* receive SMC Accept CLC message */ - reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc), - SMC_CLC_ACCEPT); - if (reason_code < 0) { - rc = reason_code; - goto out_err; - } - if (reason_code > 0) - goto decline_rdma; + return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT); +} + +/* setup for RDMA connection of client */ +static int smc_connect_rdma(struct smc_sock *smc, + struct smc_clc_msg_accept_confirm *aclc, + struct smc_ib_device *ibdev, u8 ibport) +{ + int local_contact = SMC_FIRST_CONTACT; + struct smc_link *link; + int reason_code = 0; - srv_first_contact = aclc.hdr.flag; mutex_lock(&smc_create_lgr_pending); - local_contact = smc_conn_create(smc, smcibdev, ibport, &aclc.lcl, - srv_first_contact); + local_contact = smc_conn_create(smc, ibdev, ibport, &aclc->lcl, + aclc->hdr.flag); if (local_contact < 0) { - rc = local_contact; - if (rc == -ENOMEM) + if (local_contact == -ENOMEM) reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ - else if (rc == -ENOLINK) + else if (local_contact == -ENOLINK) reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */ - goto decline_rdma_unlock; + else + reason_code = SMC_CLC_DECL_INTERR; /* other error */ + return smc_connect_abort(smc, reason_code, 0); } link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK]; - smc_conn_save_peer_info(smc, &aclc); + smc_conn_save_peer_info(smc, aclc); /* create send buffer and rmb */ - rc = smc_buf_create(smc); - if (rc) { - reason_code = SMC_CLC_DECL_MEM; - goto decline_rdma_unlock; - } + if (smc_buf_create(smc)) + return smc_connect_abort(smc, SMC_CLC_DECL_MEM, local_contact); if (local_contact == SMC_FIRST_CONTACT) - smc_link_save_peer_info(link, &aclc); + smc_link_save_peer_info(link, aclc); - rc = smc_rmb_rtoken_handling(&smc->conn, &aclc); - if (rc) { - reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma_unlock; - } + if (smc_rmb_rtoken_handling(&smc->conn, aclc)) + return smc_connect_abort(smc, SMC_CLC_DECL_INTERR, + local_contact); smc_close_init(smc); smc_rx_init(smc); if (local_contact == SMC_FIRST_CONTACT) { - rc = smc_ib_ready_link(link); - if (rc) { - reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma_unlock; - } + if (smc_ib_ready_link(link)) + return smc_connect_abort(smc, SMC_CLC_DECL_INTERR, + local_contact); } else { - if (!smc->conn.rmb_desc->reused) { - if (smc_reg_rmb(link, smc->conn.rmb_desc)) { - reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma_unlock; - } - } + if (!smc->conn.rmb_desc->reused && + smc_reg_rmb(link, smc->conn.rmb_desc, true)) + return smc_connect_abort(smc, SMC_CLC_DECL_INTERR, + local_contact); } smc_rmb_sync_sg_for_device(&smc->conn); - rc = smc_clc_send_confirm(smc); - if (rc) - goto out_err_unlock; + reason_code = smc_clc_send_confirm(smc); + if (reason_code) + return smc_connect_abort(smc, reason_code, local_contact); + + smc_tx_init(smc); if (local_contact == SMC_FIRST_CONTACT) { /* QP confirmation over RoCE fabric */ reason_code = smc_clnt_conf_first_link(smc); - if (reason_code < 0) { - rc = reason_code; - goto out_err_unlock; - } - if (reason_code > 0) - goto decline_rdma_unlock; + if (reason_code) + return smc_connect_abort(smc, reason_code, + local_contact); } - mutex_unlock(&smc_create_lgr_pending); - smc_tx_init(smc); -out_connected: smc_copy_sock_settings_to_clc(smc); if (smc->sk.sk_state == SMC_INIT) smc->sk.sk_state = SMC_ACTIVE; - return rc ? rc : local_contact; + return 0; +} -decline_rdma_unlock: - if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(smc->conn.lgr); - mutex_unlock(&smc_create_lgr_pending); - smc_conn_free(&smc->conn); -decline_rdma: - /* RDMA setup failed, switch back to TCP */ - smc->use_fallback = true; - if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - rc = smc_clc_send_decline(smc, reason_code); - if (rc < 0) - goto out_err; - } - goto out_connected; +/* perform steps before actually connecting */ +static int __smc_connect(struct smc_sock *smc) +{ + struct smc_clc_msg_accept_confirm aclc; + struct smc_ib_device *ibdev; + int rc = 0; + u8 ibport; -out_err_unlock: - if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(smc->conn.lgr); - mutex_unlock(&smc_create_lgr_pending); - smc_conn_free(&smc->conn); -out_err: - if (smc->sk.sk_state == SMC_INIT) - sock_put(&smc->sk); /* passive closing */ - return rc; + sock_hold(&smc->sk); /* sock put in passive closing */ + + if (smc->use_fallback) + return smc_connect_fallback(smc); + + /* if peer has not signalled SMC-capability, fall back */ + if (!tcp_sk(smc->clcsock->sk)->syn_smc) + return smc_connect_fallback(smc); + + /* IPSec connections opt out of SMC-R optimizations */ + if (using_ipsec(smc)) + return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC); + + /* check if a RDMA device is available; if not, fall back */ + if (smc_check_rdma(smc, &ibdev, &ibport)) + return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR); + + /* perform CLC handshake */ + rc = smc_connect_clc(smc, &aclc, ibdev, ibport); + if (rc) + return smc_connect_decline_fallback(smc, rc); + + /* connect using rdma */ + rc = smc_connect_rdma(smc, &aclc, ibdev, ibport); + if (rc) + return smc_connect_decline_fallback(smc, rc); + + return 0; } static int smc_connect(struct socket *sock, struct sockaddr *addr, @@ -575,8 +609,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, if (rc) goto out; - /* setup RDMA connection */ - rc = smc_connect_rdma(smc); + rc = __smc_connect(smc); if (rc < 0) goto out; else @@ -716,6 +749,7 @@ void smc_close_non_accepted(struct sock *sk) static int smc_serv_conf_first_link(struct smc_sock *smc) { + struct net *net = sock_net(smc->clcsock->sk); struct smc_link_group *lgr = smc->conn.lgr; struct smc_link *link; int rest; @@ -723,7 +757,7 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) link = &lgr->lnk[SMC_SINGLE_LINK]; - if (smc_reg_rmb(link, smc->conn.rmb_desc)) + if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) return SMC_CLC_DECL_INTERR; /* send CONFIRM LINK request to client over the RoCE fabric */ @@ -768,184 +802,244 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) return rc; } - link->state = SMC_LNK_ACTIVE; + smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); return 0; } -/* setup for RDMA connection of server */ -static void smc_listen_work(struct work_struct *work) +/* listen worker: finish */ +static void smc_listen_out(struct smc_sock *new_smc) { - struct smc_sock *new_smc = container_of(work, struct smc_sock, - smc_listen_work); - struct smc_clc_msg_proposal_prefix *pclc_prfx; - struct socket *newclcsock = new_smc->clcsock; struct smc_sock *lsmc = new_smc->listen_smc; - struct smc_clc_msg_accept_confirm cclc; - int local_contact = SMC_REUSE_CONTACT; struct sock *newsmcsk = &new_smc->sk; - struct smc_clc_msg_proposal *pclc; - struct smc_ib_device *smcibdev; - u8 buf[SMC_CLC_MAX_LEN]; - struct smc_link *link; - int reason_code = 0; - int rc = 0; - u8 ibport; - /* check if peer is smc capable */ - if (!tcp_sk(newclcsock->sk)->syn_smc) { - new_smc->use_fallback = true; - goto out_connected; + lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); + if (lsmc->sk.sk_state == SMC_LISTEN) { + smc_accept_enqueue(&lsmc->sk, newsmcsk); + } else { /* no longer listening */ + smc_close_non_accepted(newsmcsk); } + release_sock(&lsmc->sk); - /* do inband token exchange - - *wait for and receive SMC Proposal CLC message - */ - reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf), - SMC_CLC_PROPOSAL); - if (reason_code < 0) - goto out_err; - if (reason_code > 0) - goto decline_rdma; + /* Wake up accept */ + lsmc->sk.sk_data_ready(&lsmc->sk); + sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */ +} - /* IPSec connections opt out of SMC-R optimizations */ - if (using_ipsec(new_smc)) { - reason_code = SMC_CLC_DECL_IPSEC; - goto decline_rdma; - } +/* listen worker: finish in state connected */ +static void smc_listen_out_connected(struct smc_sock *new_smc) +{ + struct sock *newsmcsk = &new_smc->sk; - /* PNET table look up: search active ib_device and port - * within same PNETID that also contains the ethernet device - * used for the internal TCP socket - */ - smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport); - if (!smcibdev) { - reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ - goto decline_rdma; + sk_refcnt_debug_inc(newsmcsk); + if (newsmcsk->sk_state == SMC_INIT) + newsmcsk->sk_state = SMC_ACTIVE; + + smc_listen_out(new_smc); +} + +/* listen worker: finish in error state */ +static void smc_listen_out_err(struct smc_sock *new_smc) +{ + struct sock *newsmcsk = &new_smc->sk; + + if (newsmcsk->sk_state == SMC_INIT) + sock_put(&new_smc->sk); /* passive closing */ + newsmcsk->sk_state = SMC_CLOSED; + smc_conn_free(&new_smc->conn); + + smc_listen_out(new_smc); +} + +/* listen worker: decline and fall back if possible */ +static void smc_listen_decline(struct smc_sock *new_smc, int reason_code, + int local_contact) +{ + /* RDMA setup failed, switch back to TCP */ + if (local_contact == SMC_FIRST_CONTACT) + smc_lgr_forget(new_smc->conn.lgr); + if (reason_code < 0) { /* error, no fallback possible */ + smc_listen_out_err(new_smc); + return; + } + smc_conn_free(&new_smc->conn); + new_smc->use_fallback = true; + if (reason_code && reason_code != SMC_CLC_DECL_REPLY) { + if (smc_clc_send_decline(new_smc, reason_code) < 0) { + smc_listen_out_err(new_smc); + return; + } } + smc_listen_out_connected(new_smc); +} + +/* listen worker: check prefixes */ +static int smc_listen_rdma_check(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc) +{ + struct smc_clc_msg_proposal_prefix *pclc_prfx; + struct socket *newclcsock = new_smc->clcsock; - pclc = (struct smc_clc_msg_proposal *)&buf; pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (smc_clc_prfx_match(newclcsock, pclc_prfx)) + return SMC_CLC_DECL_CNFERR; - rc = smc_clc_prfx_match(newclcsock, pclc_prfx); - if (rc) { - reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ - goto decline_rdma; - } + return 0; +} +/* listen worker: initialize connection and buffers */ +static int smc_listen_rdma_init(struct smc_sock *new_smc, + struct smc_clc_msg_proposal *pclc, + struct smc_ib_device *ibdev, u8 ibport, + int *local_contact) +{ /* allocate connection / link group */ - mutex_lock(&smc_create_lgr_pending); - local_contact = smc_conn_create(new_smc, smcibdev, ibport, &pclc->lcl, - 0); - if (local_contact < 0) { - rc = local_contact; - if (rc == -ENOMEM) - reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/ - goto decline_rdma_unlock; + *local_contact = smc_conn_create(new_smc, ibdev, ibport, &pclc->lcl, 0); + if (*local_contact < 0) { + if (*local_contact == -ENOMEM) + return SMC_CLC_DECL_MEM;/* insufficient memory*/ + return SMC_CLC_DECL_INTERR; /* other error */ } - link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; /* create send buffer and rmb */ - rc = smc_buf_create(new_smc); - if (rc) { - reason_code = SMC_CLC_DECL_MEM; - goto decline_rdma_unlock; - } + if (smc_buf_create(new_smc)) + return SMC_CLC_DECL_MEM; - smc_close_init(new_smc); - smc_rx_init(new_smc); + return 0; +} + +/* listen worker: register buffers */ +static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) +{ + struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; if (local_contact != SMC_FIRST_CONTACT) { if (!new_smc->conn.rmb_desc->reused) { - if (smc_reg_rmb(link, new_smc->conn.rmb_desc)) { - reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma_unlock; - } + if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) + return SMC_CLC_DECL_INTERR; } } smc_rmb_sync_sg_for_device(&new_smc->conn); - rc = smc_clc_send_accept(new_smc, local_contact); - if (rc) - goto out_err_unlock; + return 0; +} + +/* listen worker: finish RDMA setup */ +static void smc_listen_rdma_finish(struct smc_sock *new_smc, + struct smc_clc_msg_accept_confirm *cclc, + int local_contact) +{ + struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + int reason_code = 0; - /* receive SMC Confirm CLC message */ - reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), - SMC_CLC_CONFIRM); - if (reason_code < 0) - goto out_err_unlock; - if (reason_code > 0) - goto decline_rdma_unlock; - smc_conn_save_peer_info(new_smc, &cclc); if (local_contact == SMC_FIRST_CONTACT) - smc_link_save_peer_info(link, &cclc); + smc_link_save_peer_info(link, cclc); - rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc); - if (rc) { + if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) { reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma_unlock; + goto decline; } if (local_contact == SMC_FIRST_CONTACT) { - rc = smc_ib_ready_link(link); - if (rc) { + if (smc_ib_ready_link(link)) { reason_code = SMC_CLC_DECL_INTERR; - goto decline_rdma_unlock; + goto decline; } /* QP confirmation over RoCE fabric */ reason_code = smc_serv_conf_first_link(new_smc); - if (reason_code < 0) - /* peer is not aware of a problem */ - goto out_err_unlock; - if (reason_code > 0) - goto decline_rdma_unlock; + if (reason_code) + goto decline; } + return; - smc_tx_init(new_smc); +decline: mutex_unlock(&smc_create_lgr_pending); + smc_listen_decline(new_smc, reason_code, local_contact); +} -out_connected: - sk_refcnt_debug_inc(newsmcsk); - if (newsmcsk->sk_state == SMC_INIT) - newsmcsk->sk_state = SMC_ACTIVE; -enqueue: - lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); - if (lsmc->sk.sk_state == SMC_LISTEN) { - smc_accept_enqueue(&lsmc->sk, newsmcsk); - } else { /* no longer listening */ - smc_close_non_accepted(newsmcsk); +/* setup for RDMA connection of server */ +static void smc_listen_work(struct work_struct *work) +{ + struct smc_sock *new_smc = container_of(work, struct smc_sock, + smc_listen_work); + struct socket *newclcsock = new_smc->clcsock; + struct smc_clc_msg_accept_confirm cclc; + struct smc_clc_msg_proposal *pclc; + struct smc_ib_device *ibdev; + u8 buf[SMC_CLC_MAX_LEN]; + int local_contact = 0; + int reason_code = 0; + int rc = 0; + u8 ibport; + + if (new_smc->use_fallback) { + smc_listen_out_connected(new_smc); + return; } - release_sock(&lsmc->sk); - /* Wake up accept */ - lsmc->sk.sk_data_ready(&lsmc->sk); - sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */ - return; + /* check if peer is smc capable */ + if (!tcp_sk(newclcsock->sk)->syn_smc) { + new_smc->use_fallback = true; + smc_listen_out_connected(new_smc); + return; + } -decline_rdma_unlock: - if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(new_smc->conn.lgr); - mutex_unlock(&smc_create_lgr_pending); -decline_rdma: - /* RDMA setup failed, switch back to TCP */ - smc_conn_free(&new_smc->conn); - new_smc->use_fallback = true; - if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { - if (smc_clc_send_decline(new_smc, reason_code) < 0) - goto out_err; + /* do inband token exchange - + * wait for and receive SMC Proposal CLC message + */ + pclc = (struct smc_clc_msg_proposal *)&buf; + reason_code = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN, + SMC_CLC_PROPOSAL); + if (reason_code) { + smc_listen_decline(new_smc, reason_code, 0); + return; } - goto out_connected; -out_err_unlock: - if (local_contact == SMC_FIRST_CONTACT) - smc_lgr_forget(new_smc->conn.lgr); + /* IPSec connections opt out of SMC-R optimizations */ + if (using_ipsec(new_smc)) { + smc_listen_decline(new_smc, SMC_CLC_DECL_IPSEC, 0); + return; + } + + mutex_lock(&smc_create_lgr_pending); + smc_close_init(new_smc); + smc_rx_init(new_smc); + smc_tx_init(new_smc); + + /* check if RDMA is available */ + if (smc_check_rdma(new_smc, &ibdev, &ibport) || + smc_listen_rdma_check(new_smc, pclc) || + smc_listen_rdma_init(new_smc, pclc, ibdev, ibport, + &local_contact) || + smc_listen_rdma_reg(new_smc, local_contact)) { + /* SMC not supported, decline */ + mutex_unlock(&smc_create_lgr_pending); + smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact); + return; + } + + /* send SMC Accept CLC message */ + rc = smc_clc_send_accept(new_smc, local_contact); + if (rc) { + mutex_unlock(&smc_create_lgr_pending); + smc_listen_decline(new_smc, rc, local_contact); + return; + } + + /* receive SMC Confirm CLC message */ + reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), + SMC_CLC_CONFIRM); + if (reason_code) { + mutex_unlock(&smc_create_lgr_pending); + smc_listen_decline(new_smc, reason_code, local_contact); + return; + } + + /* finish worker */ + smc_listen_rdma_finish(new_smc, &cclc, local_contact); + smc_conn_save_peer_info(new_smc, &cclc); mutex_unlock(&smc_create_lgr_pending); -out_err: - if (newsmcsk->sk_state == SMC_INIT) - sock_put(&new_smc->sk); /* passive closing */ - newsmcsk->sk_state = SMC_CLOSED; - smc_conn_free(&new_smc->conn); - goto enqueue; /* queue new sock with sk_err set */ + smc_listen_out_connected(new_smc); } static void smc_tcp_listen_work(struct work_struct *work) @@ -965,7 +1059,7 @@ static void smc_tcp_listen_work(struct work_struct *work) continue; new_smc->listen_smc = lsmc; - new_smc->use_fallback = false; /* assume rdma capability first*/ + new_smc->use_fallback = lsmc->use_fallback; sock_hold(lsk); /* sock_put in smc_listen_work */ INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); smc_copy_sock_settings_to_smc(new_smc); @@ -1001,7 +1095,8 @@ static int smc_listen(struct socket *sock, int backlog) * them to the clc socket -- copy smc socket options to clc socket */ smc_copy_sock_settings_to_clc(smc); - tcp_sk(smc->clcsock->sk)->syn_smc = 1; + if (!smc->use_fallback) + tcp_sk(smc->clcsock->sk)->syn_smc = 1; rc = kernel_listen(smc->clcsock, backlog); if (rc) @@ -1034,6 +1129,7 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, if (lsmc->sk.sk_state != SMC_LISTEN) { rc = -EINVAL; + release_sock(sk); goto out; } @@ -1061,9 +1157,29 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, if (!rc) rc = sock_error(nsk); + release_sock(sk); + if (rc) + goto out; + + if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) { + /* wait till data arrives on the socket */ + timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept * + MSEC_PER_SEC); + if (smc_sk(nsk)->use_fallback) { + struct sock *clcsk = smc_sk(nsk)->clcsock->sk; + + lock_sock(clcsk); + if (skb_queue_empty(&clcsk->sk_receive_queue)) + sk_wait_data(clcsk, &timeo, NULL); + release_sock(clcsk); + } else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) { + lock_sock(nsk); + smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available); + release_sock(nsk); + } + } out: - release_sock(sk); sock_put(sk); /* sock_hold above */ return rc; } @@ -1094,6 +1210,16 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) (sk->sk_state != SMC_APPCLOSEWAIT1) && (sk->sk_state != SMC_INIT)) goto out; + + if (msg->msg_flags & MSG_FASTOPEN) { + if (sk->sk_state == SMC_INIT) { + smc->use_fallback = true; + } else { + rc = -EINVAL; + goto out; + } + } + if (smc->use_fallback) rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len); else @@ -1122,10 +1248,12 @@ static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, goto out; } - if (smc->use_fallback) + if (smc->use_fallback) { rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags); - else - rc = smc_rx_recvmsg(smc, msg, len, flags); + } else { + msg->msg_namelen = 0; + rc = smc_rx_recvmsg(smc, msg, NULL, len, flags); + } out: release_sock(sk); @@ -1172,7 +1300,7 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, if (sk->sk_state == SMC_INIT && mask & EPOLLOUT && smc->clcsock->sk->sk_state != TCP_CLOSE) { - rc = smc_connect_rdma(smc); + rc = __smc_connect(smc); if (rc < 0) mask |= EPOLLERR; /* success cases including fallback */ @@ -1208,6 +1336,8 @@ static __poll_t smc_poll(struct file *file, struct socket *sock, if (sk->sk_state == SMC_APPCLOSEWAIT1) mask |= EPOLLIN; } + if (smc->conn.urg_state == SMC_URG_VALID) + mask |= EPOLLPRI; } release_sock(sk); @@ -1273,14 +1403,64 @@ static int smc_setsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; struct smc_sock *smc; + int val, rc; smc = smc_sk(sk); /* generic setsockopts reaching us here always apply to the * CLC socket */ - return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, - optval, optlen); + rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname, + optval, optlen); + if (smc->clcsock->sk->sk_err) { + sk->sk_err = smc->clcsock->sk->sk_err; + sk->sk_error_report(sk); + } + if (rc) + return rc; + + if (optlen < sizeof(int)) + return -EINVAL; + get_user(val, (int __user *)optval); + + lock_sock(sk); + switch (optname) { + case TCP_ULP: + case TCP_FASTOPEN: + case TCP_FASTOPEN_CONNECT: + case TCP_FASTOPEN_KEY: + case TCP_FASTOPEN_NO_COOKIE: + /* option not supported by SMC */ + if (sk->sk_state == SMC_INIT) { + smc->use_fallback = true; + } else { + if (!smc->use_fallback) + rc = -EINVAL; + } + break; + case TCP_NODELAY: + if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) { + if (val && !smc->use_fallback) + mod_delayed_work(system_wq, &smc->conn.tx_work, + 0); + } + break; + case TCP_CORK: + if (sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) { + if (!val && !smc->use_fallback) + mod_delayed_work(system_wq, &smc->conn.tx_work, + 0); + } + break; + case TCP_DEFER_ACCEPT: + smc->sockopt_defer_accept = val; + break; + default: + break; + } + release_sock(sk); + + return rc; } static int smc_getsockopt(struct socket *sock, int level, int optname, @@ -1297,13 +1477,71 @@ static int smc_getsockopt(struct socket *sock, int level, int optname, static int smc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { + union smc_host_cursor cons, urg; + struct smc_connection *conn; struct smc_sock *smc; + int answ; smc = smc_sk(sock->sk); - if (smc->use_fallback) + conn = &smc->conn; + if (smc->use_fallback) { + if (!smc->clcsock) + return -EBADF; return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg); - else - return sock_no_ioctl(sock, cmd, arg); + } + switch (cmd) { + case SIOCINQ: /* same as FIONREAD */ + if (smc->sk.sk_state == SMC_LISTEN) + return -EINVAL; + if (smc->sk.sk_state == SMC_INIT || + smc->sk.sk_state == SMC_CLOSED) + answ = 0; + else + answ = atomic_read(&smc->conn.bytes_to_rcv); + break; + case SIOCOUTQ: + /* output queue size (not send + not acked) */ + if (smc->sk.sk_state == SMC_LISTEN) + return -EINVAL; + if (smc->sk.sk_state == SMC_INIT || + smc->sk.sk_state == SMC_CLOSED) + answ = 0; + else + answ = smc->conn.sndbuf_desc->len - + atomic_read(&smc->conn.sndbuf_space); + break; + case SIOCOUTQNSD: + /* output queue size (not send only) */ + if (smc->sk.sk_state == SMC_LISTEN) + return -EINVAL; + if (smc->sk.sk_state == SMC_INIT || + smc->sk.sk_state == SMC_CLOSED) + answ = 0; + else + answ = smc_tx_prepared_sends(&smc->conn); + break; + case SIOCATMARK: + if (smc->sk.sk_state == SMC_LISTEN) + return -EINVAL; + if (smc->sk.sk_state == SMC_INIT || + smc->sk.sk_state == SMC_CLOSED) { + answ = 0; + } else { + smc_curs_write(&cons, + smc_curs_read(&conn->local_tx_ctrl.cons, conn), + conn); + smc_curs_write(&urg, + smc_curs_read(&conn->urg_curs, conn), + conn); + answ = smc_curs_diff(conn->rmb_desc->len, + &cons, &urg) == 1; + } + break; + default: + return -ENOIOCTLCMD; + } + + return put_user(answ, (int __user *)arg); } static ssize_t smc_sendpage(struct socket *sock, struct page *page, @@ -1330,9 +1568,15 @@ out: return rc; } +/* Map the affected portions of the rmbe into an spd, note the number of bytes + * to splice in conn->splice_pending, and press 'go'. Delays consumer cursor + * updates till whenever a respective page has been fully processed. + * Note that subsequent recv() calls have to wait till all splice() processing + * completed. + */ static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, - unsigned int flags) + unsigned int flags) { struct sock *sk = sock->sk; struct smc_sock *smc; @@ -1340,16 +1584,34 @@ static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos, smc = smc_sk(sk); lock_sock(sk); - if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED)) + + if (sk->sk_state == SMC_INIT || + sk->sk_state == SMC_LISTEN || + sk->sk_state == SMC_CLOSED) + goto out; + + if (sk->sk_state == SMC_PEERFINCLOSEWAIT) { + rc = 0; goto out; + } + if (smc->use_fallback) { rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos, pipe, len, flags); } else { - rc = -EOPNOTSUPP; + if (*ppos) { + rc = -ESPIPE; + goto out; + } + if (flags & SPLICE_F_NONBLOCK) + flags = MSG_DONTWAIT; + else + flags = 0; + rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags); } out: release_sock(sk); + return rc; } @@ -1482,18 +1744,7 @@ out_pnet: static void __exit smc_exit(void) { - struct smc_link_group *lgr, *lg; - LIST_HEAD(lgr_freeing_list); - - spin_lock_bh(&smc_lgr_list.lock); - if (!list_empty(&smc_lgr_list.list)) - list_splice_init(&smc_lgr_list.list, &lgr_freeing_list); - spin_unlock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { - list_del_init(&lgr->list); - cancel_delayed_work_sync(&lgr->free_work); - smc_lgr_free(lgr); /* free link group */ - } + smc_core_exit(); static_branch_disable(&tcp_have_smc); smc_ib_unregister_client(); sock_unregister(PF_SMC); diff --git a/net/smc/smc.h b/net/smc/smc.h index e4829a2f46ba..51ae1f10d81a 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -114,11 +114,17 @@ struct smc_host_cdc_msg { /* Connection Data Control message */ u8 reserved[18]; } __aligned(8); +enum smc_urg_state { + SMC_URG_VALID, /* data present */ + SMC_URG_NOTYET, /* data pending */ + SMC_URG_READ /* data was already read */ +}; + struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ u32 alert_token_local; /* unique conn. id */ - u8 peer_conn_idx; /* from tcp handshake */ + u8 peer_rmbe_idx; /* from tcp handshake */ int peer_rmbe_size; /* size of peer rx buffer */ atomic_t peer_rmbe_space;/* remaining free bytes in peer * rmbe @@ -126,9 +132,7 @@ struct smc_connection { int rtoken_idx; /* idx to peer RMB rkey/addr */ struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */ - int sndbuf_size; /* sndbuf size <== sock wmem */ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ - int rmbe_size; /* RMBE size <== sock rmem */ int rmbe_size_short;/* compressed notation */ int rmbe_update_limit; /* lower limit for consumer @@ -153,6 +157,7 @@ struct smc_connection { u16 tx_cdc_seq; /* sequence # for CDC send */ spinlock_t send_lock; /* protect wr_sends */ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ + u32 tx_off; /* base offset in peer rmb */ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. * .prod cf. TCP rcv_nxt @@ -161,9 +166,21 @@ struct smc_connection { union smc_host_cursor rx_curs_confirmed; /* confirmed to peer * source of snd_una ? */ + union smc_host_cursor urg_curs; /* points at urgent byte */ + enum smc_urg_state urg_state; + bool urg_tx_pend; /* urgent data staged */ + bool urg_rx_skip_pend; + /* indicate urgent oob data + * read, but previous regular + * data still pending + */ + char urg_rx_byte; /* urgent byte */ atomic_t bytes_to_rcv; /* arrived data, * not yet received */ + atomic_t splice_pending; /* number of spliced bytes + * pending processing + */ #ifndef KERNEL_HAS_ATOMIC64 spinlock_t acurs_lock; /* protect cursors */ #endif @@ -180,6 +197,10 @@ struct smc_sock { /* smc sock container */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ bool use_fallback; /* fallback to tcp */ + int sockopt_defer_accept; + /* sockopt TCP_DEFER_ACCEPT + * value + */ u8 wait_close_tx_prepared : 1; /* shutdown wr or close * started, waiting for unsent @@ -214,41 +235,6 @@ static inline u32 ntoh24(u8 *net) return be32_to_cpu(t); } -#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ - -#define SMC_RMBE_SIZES 16 /* number of distinct sizes for an RMBE */ -/* theoretically, the RFC states that largest size would be 512K, - * i.e. compressed 5 and thus 6 sizes (0..5), despite - * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15) - */ - -/* convert the RMB size into the compressed notation - minimum 16K. - * In contrast to plain ilog2, this rounds towards the next power of 2, - * so the socket application gets at least its desired sndbuf / rcvbuf size. - */ -static inline u8 smc_compress_bufsize(int size) -{ - u8 compressed; - - if (size <= SMC_BUF_MIN_SIZE) - return 0; - - size = (size - 1) >> 14; - compressed = ilog2(size) + 1; - if (compressed >= SMC_RMBE_SIZES) - compressed = SMC_RMBE_SIZES - 1; - return compressed; -} - -/* convert the RMB size from compressed notation into integer */ -static inline int smc_uncompress_bufsize(u8 compressed) -{ - u32 size; - - size = 0x00000001 << (((int)compressed) + 14); - return (int)size; -} - #ifdef CONFIG_XFRM static inline bool using_ipsec(struct smc_sock *smc) { @@ -262,12 +248,6 @@ static inline bool using_ipsec(struct smc_sock *smc) } #endif -struct smc_clc_msg_local; - -void smc_conn_free(struct smc_connection *conn); -int smc_conn_create(struct smc_sock *smc, - struct smc_ib_device *smcibdev, u8 ibport, - struct smc_clc_msg_local *lcl, int srv_first_contact); struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock); void smc_close_non_accepted(struct sock *sk); diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index b42395d24cba..a7e8d63fc8ae 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -44,13 +44,13 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, smc = container_of(cdcpend->conn, struct smc_sock, conn); bh_lock_sock(&smc->sk); if (!wc_status) { - diff = smc_curs_diff(cdcpend->conn->sndbuf_size, + diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len, &cdcpend->conn->tx_curs_fin, &cdcpend->cursor); /* sndbuf_space is decreased in smc_sendmsg */ smp_mb__before_atomic(); atomic_add(diff, &cdcpend->conn->sndbuf_space); - /* guarantee 0 <= sndbuf_space <= sndbuf_size */ + /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ smp_mb__after_atomic(); smc_curs_write(&cdcpend->conn->tx_curs_fin, smc_curs_read(&cdcpend->cursor, cdcpend->conn), @@ -82,7 +82,7 @@ static inline void smc_cdc_add_pending_send(struct smc_connection *conn, sizeof(struct smc_cdc_msg) > SMC_WR_BUF_SIZE, "must increase SMC_WR_BUF_SIZE to at least sizeof(struct smc_cdc_msg)"); BUILD_BUG_ON_MSG( - offsetof(struct smc_cdc_msg, reserved) > SMC_WR_TX_SIZE, + sizeof(struct smc_cdc_msg) != SMC_WR_TX_SIZE, "must adapt SMC_WR_TX_SIZE to sizeof(struct smc_cdc_msg); if not all smc_wr upper layer protocols use the same message size any more, must start to set link->wr_tx_sges[i].length on each individual smc_wr_tx_send()"); BUILD_BUG_ON_MSG( sizeof(struct smc_cdc_tx_pend) > SMC_WR_TX_PEND_PRIV_SIZE, @@ -164,20 +164,35 @@ static inline bool smc_cdc_before(u16 seq1, u16 seq2) return (s16)(seq1 - seq2) < 0; } +static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc, + int *diff_prod) +{ + struct smc_connection *conn = &smc->conn; + char *base; + + /* new data included urgent business */ + smc_curs_write(&conn->urg_curs, + smc_curs_read(&conn->local_rx_ctrl.prod, conn), + conn); + conn->urg_state = SMC_URG_VALID; + if (!sock_flag(&smc->sk, SOCK_URGINLINE)) + /* we'll skip the urgent byte, so don't account for it */ + (*diff_prod)--; + base = (char *)conn->rmb_desc->cpu_addr; + if (conn->urg_curs.count) + conn->urg_rx_byte = *(base + conn->urg_curs.count - 1); + else + conn->urg_rx_byte = *(base + conn->rmb_desc->len - 1); + sk_send_sigurg(&smc->sk); +} + static void smc_cdc_msg_recv_action(struct smc_sock *smc, - struct smc_link *link, struct smc_cdc_msg *cdc) { union smc_host_cursor cons_old, prod_old; struct smc_connection *conn = &smc->conn; int diff_cons, diff_prod; - if (!cdc->prod_flags.failover_validation) { - if (smc_cdc_before(ntohs(cdc->seqno), - conn->local_rx_ctrl.seqno)) - /* received seqno is old */ - return; - } smc_curs_write(&prod_old, smc_curs_read(&conn->local_rx_ctrl.prod, conn), conn); @@ -198,18 +213,28 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, smp_mb__after_atomic(); } - diff_prod = smc_curs_diff(conn->rmbe_size, &prod_old, + diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old, &conn->local_rx_ctrl.prod); if (diff_prod) { + if (conn->local_rx_ctrl.prod_flags.urg_data_present) + smc_cdc_handle_urg_data_arrival(smc, &diff_prod); /* bytes_to_rcv is decreased in smc_recvmsg */ smp_mb__before_atomic(); atomic_add(diff_prod, &conn->bytes_to_rcv); - /* guarantee 0 <= bytes_to_rcv <= rmbe_size */ + /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */ smp_mb__after_atomic(); smc->sk.sk_data_ready(&smc->sk); - } else if ((conn->local_rx_ctrl.prod_flags.write_blocked) || - (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req)) { - smc->sk.sk_data_ready(&smc->sk); + } else { + if (conn->local_rx_ctrl.prod_flags.write_blocked || + conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || + conn->local_rx_ctrl.prod_flags.urg_data_pending) { + if (conn->local_rx_ctrl.prod_flags.urg_data_pending) + conn->urg_state = SMC_URG_NOTYET; + /* force immediate tx of current consumer cursor, but + * under send_lock to guarantee arrival in seqno-order + */ + smc_tx_sndbuf_nonempty(conn); + } } /* piggy backed tx info */ @@ -219,6 +244,12 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, /* trigger socket release if connection closed */ smc_close_wake_tx_prepared(smc); } + if (diff_cons && conn->urg_tx_pend && + atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) { + /* urg data confirmed by peer, indicate we're ready for more */ + conn->urg_tx_pend = false; + smc->sk.sk_write_space(&smc->sk); + } if (conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) { smc->sk.sk_err = ECONNRESET; @@ -236,26 +267,11 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, } /* called under tasklet context */ -static inline void smc_cdc_msg_recv(struct smc_cdc_msg *cdc, - struct smc_link *link, u64 wr_id) +static void smc_cdc_msg_recv(struct smc_sock *smc, struct smc_cdc_msg *cdc) { - struct smc_link_group *lgr = container_of(link, struct smc_link_group, - lnk[SMC_SINGLE_LINK]); - struct smc_connection *connection; - struct smc_sock *smc; - - /* lookup connection */ - read_lock_bh(&lgr->conns_lock); - connection = smc_lgr_find_conn(ntohl(cdc->token), lgr); - if (!connection) { - read_unlock_bh(&lgr->conns_lock); - return; - } - smc = container_of(connection, struct smc_sock, conn); sock_hold(&smc->sk); - read_unlock_bh(&lgr->conns_lock); bh_lock_sock(&smc->sk); - smc_cdc_msg_recv_action(smc, link, cdc); + smc_cdc_msg_recv_action(smc, cdc); bh_unlock_sock(&smc->sk); sock_put(&smc->sk); /* no free sk in softirq-context */ } @@ -266,12 +282,31 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) { struct smc_link *link = (struct smc_link *)wc->qp->qp_context; struct smc_cdc_msg *cdc = buf; + struct smc_connection *conn; + struct smc_link_group *lgr; + struct smc_sock *smc; if (wc->byte_len < offsetof(struct smc_cdc_msg, reserved)) return; /* short message */ if (cdc->len != SMC_WR_TX_SIZE) return; /* invalid message */ - smc_cdc_msg_recv(cdc, link, wc->wr_id); + + /* lookup connection */ + lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + read_lock_bh(&lgr->conns_lock); + conn = smc_lgr_find_conn(ntohl(cdc->token), lgr); + read_unlock_bh(&lgr->conns_lock); + if (!conn) + return; + smc = container_of(conn, struct smc_sock, conn); + + if (!cdc->prod_flags.failover_validation) { + if (smc_cdc_before(ntohs(cdc->seqno), + conn->local_rx_ctrl.seqno)) + /* received seqno is old */ + return; + } + smc_cdc_msg_recv(smc, cdc); } static struct smc_wr_rx_handler smc_cdc_rx_handlers[] = { diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index ab240b37ad11..f60082fee5b8 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -48,7 +48,7 @@ struct smc_cdc_msg { struct smc_cdc_producer_flags prod_flags; struct smc_cdc_conn_state_flags conn_state_flags; u8 reserved[18]; -} __aligned(8); +} __packed; /* format defined in RFC7609 */ static inline bool smc_cdc_rxed_any_close(struct smc_connection *conn) { @@ -146,6 +146,19 @@ static inline int smc_curs_diff(unsigned int size, return max_t(int, 0, (new->count - old->count)); } +/* calculate cursor difference between old and new - returns negative + * value in case old > new + */ +static inline int smc_curs_comp(unsigned int size, + union smc_host_cursor *old, + union smc_host_cursor *new) +{ + if (old->wrap > new->wrap || + (old->wrap == new->wrap && old->count > new->count)) + return -smc_curs_diff(size, new, old); + return smc_curs_diff(size, old, new); +} + static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer, union smc_host_cursor *local, struct smc_connection *conn) diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 3a988c22f627..717449b1da0b 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -316,7 +316,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, if (clcm->type == SMC_CLC_DECLINE) { reason_code = SMC_CLC_DECL_REPLY; if (((struct smc_clc_msg_decline *)buf)->hdr.flag) { - smc->conn.lgr->sync_err = true; + smc->conn.lgr->sync_err = 1; smc_lgr_terminate(smc->conn.lgr); } } @@ -442,7 +442,7 @@ int smc_clc_send_confirm(struct smc_sock *smc) hton24(cclc.qpn, link->roce_qp->qp_num); cclc.rmb_rkey = htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); - cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */ + cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ cclc.rmbe_alert_token = htonl(conn->alert_token_local); cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); cclc.rmbe_size = conn->rmbe_size_short; @@ -494,7 +494,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) hton24(aclc.qpn, link->roce_qp->qp_num); aclc.rmb_rkey = htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); - aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */ + aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */ aclc.rmbe_alert_token = htonl(conn->alert_token_local); aclc.qp_mtu = link->path_mtu; aclc.rmbe_size = conn->rmbe_size_short, diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 63bf1dc2c1f9..41ff9ea96139 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -97,7 +97,7 @@ struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ struct smc_clc_msg_local lcl; u8 qpn[3]; /* QP number */ __be32 rmb_rkey; /* RMB rkey */ - u8 conn_idx; /* Connection index, which RMBE in RMB */ + u8 rmbe_idx; /* Index of RMBE in RMB */ __be32 rmbe_alert_token;/* unique connection id */ #if defined(__BIG_ENDIAN_BITFIELD) u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index d4bd01bb44e1..add82b0266f3 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -28,12 +28,16 @@ #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) -#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10) +#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ) -static u32 smc_lgr_num; /* unique link group number */ +static struct smc_lgr_list smc_lgr_list = { /* established link groups */ + .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), + .list = LIST_HEAD_INIT(smc_lgr_list.list), + .num = 0, +}; -static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk, - bool is_rmb); +static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, + struct smc_buf_desc *buf_desc); static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) { @@ -148,8 +152,11 @@ static void smc_lgr_free_work(struct work_struct *work) list_del_init(&lgr->list); /* remove from smc_lgr_list */ free: spin_unlock_bh(&smc_lgr_list.lock); - if (!delayed_work_pending(&lgr->free_work)) + if (!delayed_work_pending(&lgr->free_work)) { + if (lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE) + smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); smc_lgr_free(lgr); + } } /* create a new SMC link group */ @@ -169,7 +176,7 @@ static int smc_lgr_create(struct smc_sock *smc, goto out; } lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; - lgr->sync_err = false; + lgr->sync_err = 0; memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); lgr->vlan_id = vlan_id; rwlock_init(&lgr->sndbufs_lock); @@ -178,8 +185,8 @@ static int smc_lgr_create(struct smc_sock *smc, INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); } - smc_lgr_num += SMC_LGR_NUM_INCR; - memcpy(&lgr->id, (u8 *)&smc_lgr_num, SMC_LGR_ID_SIZE); + smc_lgr_list.num += SMC_LGR_NUM_INCR; + memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); lgr->conns_all = RB_ROOT; @@ -194,9 +201,12 @@ static int smc_lgr_create(struct smc_sock *smc, smc_ib_setup_per_ibdev(smcibdev); get_random_bytes(rndvec, sizeof(rndvec)); lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); - rc = smc_wr_alloc_link_mem(lnk); + rc = smc_llc_link_init(lnk); if (rc) goto free_lgr; + rc = smc_wr_alloc_link_mem(lnk); + if (rc) + goto clear_llc_lnk; rc = smc_ib_create_protection_domain(lnk); if (rc) goto free_link_mem; @@ -206,10 +216,6 @@ static int smc_lgr_create(struct smc_sock *smc, rc = smc_wr_create_link(lnk); if (rc) goto destroy_qp; - init_completion(&lnk->llc_confirm); - init_completion(&lnk->llc_confirm_resp); - init_completion(&lnk->llc_add); - init_completion(&lnk->llc_add_resp); smc->conn.lgr = lgr; rwlock_init(&lgr->conns_lock); @@ -224,6 +230,8 @@ dealloc_pd: smc_ib_dealloc_protection_domain(lnk); free_link_mem: smc_wr_free_link_mem(lnk); +clear_llc_lnk: + smc_llc_link_clear(lnk); free_lgr: kfree(lgr); out: @@ -232,26 +240,21 @@ out: static void smc_buf_unuse(struct smc_connection *conn) { - if (conn->sndbuf_desc) { + if (conn->sndbuf_desc) conn->sndbuf_desc->used = 0; - conn->sndbuf_size = 0; - } if (conn->rmb_desc) { if (!conn->rmb_desc->regerr) { conn->rmb_desc->reused = 1; conn->rmb_desc->used = 0; - conn->rmbe_size = 0; } else { /* buf registration failed, reuse not possible */ struct smc_link_group *lgr = conn->lgr; - struct smc_link *lnk; write_lock_bh(&lgr->rmbs_lock); list_del(&conn->rmb_desc->list); write_unlock_bh(&lgr->rmbs_lock); - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - smc_buf_free(conn->rmb_desc, lnk, true); + smc_buf_free(lgr, true, conn->rmb_desc); } } } @@ -269,6 +272,7 @@ void smc_conn_free(struct smc_connection *conn) static void smc_link_clear(struct smc_link *lnk) { lnk->peer_qpn = 0; + smc_llc_link_clear(lnk); smc_ib_modify_qp_reset(lnk); smc_wr_free_link(lnk); smc_ib_destroy_queue_pair(lnk); @@ -276,9 +280,11 @@ static void smc_link_clear(struct smc_link *lnk) smc_wr_free_link_mem(lnk); } -static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk, - bool is_rmb) +static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, + struct smc_buf_desc *buf_desc) { + struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + if (is_rmb) { if (buf_desc->mr_rx[SMC_SINGLE_LINK]) smc_ib_put_memory_region( @@ -290,14 +296,13 @@ static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk, DMA_TO_DEVICE); } sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]); - if (buf_desc->cpu_addr) - free_pages((unsigned long)buf_desc->cpu_addr, buf_desc->order); + if (buf_desc->pages) + __free_pages(buf_desc->pages, buf_desc->order); kfree(buf_desc); } static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; struct smc_buf_desc *buf_desc, *bf_desc; struct list_head *buf_list; int i; @@ -310,7 +315,7 @@ static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) list_for_each_entry_safe(buf_desc, bf_desc, buf_list, list) { list_del(&buf_desc->list); - smc_buf_free(buf_desc, lnk, is_rmb); + smc_buf_free(lgr, is_rmb, buf_desc); } } } @@ -341,13 +346,18 @@ void smc_lgr_forget(struct smc_link_group *lgr) } /* terminate linkgroup abnormally */ -void smc_lgr_terminate(struct smc_link_group *lgr) +static void __smc_lgr_terminate(struct smc_link_group *lgr) { struct smc_connection *conn; struct smc_sock *smc; struct rb_node *node; - smc_lgr_forget(lgr); + if (lgr->terminating) + return; /* lgr already terminating */ + lgr->terminating = 1; + if (!list_empty(&lgr->list)) /* forget lgr */ + list_del_init(&lgr->list); + smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); write_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); @@ -368,13 +378,35 @@ void smc_lgr_terminate(struct smc_link_group *lgr) smc_lgr_schedule_free_work(lgr); } +void smc_lgr_terminate(struct smc_link_group *lgr) +{ + spin_lock_bh(&smc_lgr_list.lock); + __smc_lgr_terminate(lgr); + spin_unlock_bh(&smc_lgr_list.lock); +} + +/* Called when IB port is terminated */ +void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr, *l; + + spin_lock_bh(&smc_lgr_list.lock); + list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { + if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && + lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) + __smc_lgr_terminate(lgr); + } + spin_unlock_bh(&smc_lgr_list.lock); +} + /* Determine vlan of internal TCP socket. * @vlan_id: address to store the determined vlan id into */ static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) { struct dst_entry *dst = sk_dst_get(clcsock->sk); - int rc = 0; + struct net_device *ndev; + int i, nest_lvl, rc = 0; *vlan_id = 0; if (!dst) { @@ -386,8 +418,27 @@ static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) goto out_rel; } - if (is_vlan_dev(dst->dev)) - *vlan_id = vlan_dev_vlan_id(dst->dev); + ndev = dst->dev; + if (is_vlan_dev(ndev)) { + *vlan_id = vlan_dev_vlan_id(ndev); + goto out_rel; + } + + rtnl_lock(); + nest_lvl = dev_get_nest_level(ndev); + for (i = 0; i < nest_lvl; i++) { + struct list_head *lower = &ndev->adj_list.lower; + + if (list_empty(lower)) + break; + lower = lower->next; + ndev = (struct net_device *)netdev_lower_get_next(ndev, &lower); + if (is_vlan_dev(ndev)) { + *vlan_id = vlan_dev_vlan_id(ndev); + break; + } + } + rtnl_unlock(); out_rel: dst_release(dst); @@ -432,10 +483,10 @@ int smc_conn_create(struct smc_sock *smc, struct smc_clc_msg_local *lcl, int srv_first_contact) { struct smc_connection *conn = &smc->conn; + int local_contact = SMC_FIRST_CONTACT; struct smc_link_group *lgr; unsigned short vlan_id; enum smc_lgr_role role; - int local_contact = SMC_FIRST_CONTACT; int rc = 0; role = smc->listen_smc ? SMC_SERV : SMC_CLNT; @@ -493,6 +544,7 @@ create: } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; + conn->urg_state = SMC_URG_READ; #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&conn->acurs_lock); #endif @@ -501,14 +553,39 @@ out: return rc ? rc : local_contact; } +/* convert the RMB size into the compressed notation - minimum 16K. + * In contrast to plain ilog2, this rounds towards the next power of 2, + * so the socket application gets at least its desired sndbuf / rcvbuf size. + */ +static u8 smc_compress_bufsize(int size) +{ + u8 compressed; + + if (size <= SMC_BUF_MIN_SIZE) + return 0; + + size = (size - 1) >> 14; + compressed = ilog2(size) + 1; + if (compressed >= SMC_RMBE_SIZES) + compressed = SMC_RMBE_SIZES - 1; + return compressed; +} + +/* convert the RMB size from compressed notation into integer */ +int smc_uncompress_bufsize(u8 compressed) +{ + u32 size; + + size = 0x00000001 << (((int)compressed) + 14); + return (int)size; +} + /* try to reuse a sndbuf or rmb description slot for a certain * buffer size; if not available, return NULL */ -static inline -struct smc_buf_desc *smc_buf_get_slot(struct smc_link_group *lgr, - int compressed_bufsize, - rwlock_t *lock, - struct list_head *buf_list) +static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, + rwlock_t *lock, + struct list_head *buf_list) { struct smc_buf_desc *buf_slot; @@ -544,23 +621,23 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, if (!buf_desc) return ERR_PTR(-ENOMEM); - buf_desc->cpu_addr = - (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | - __GFP_NOMEMALLOC | - __GFP_NORETRY | __GFP_ZERO, - get_order(bufsize)); - if (!buf_desc->cpu_addr) { + buf_desc->order = get_order(bufsize); + buf_desc->pages = alloc_pages(GFP_KERNEL | __GFP_NOWARN | + __GFP_NOMEMALLOC | __GFP_COMP | + __GFP_NORETRY | __GFP_ZERO, + buf_desc->order); + if (!buf_desc->pages) { kfree(buf_desc); return ERR_PTR(-EAGAIN); } - buf_desc->order = get_order(bufsize); + buf_desc->cpu_addr = (void *)page_address(buf_desc->pages); /* build the sg table from the pages */ lnk = &lgr->lnk[SMC_SINGLE_LINK]; rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1, GFP_KERNEL); if (rc) { - smc_buf_free(buf_desc, lnk, is_rmb); + smc_buf_free(lgr, is_rmb, buf_desc); return ERR_PTR(rc); } sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl, @@ -571,7 +648,7 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); /* SMC protocol depends on mapping to one DMA address only */ if (rc != 1) { - smc_buf_free(buf_desc, lnk, is_rmb); + smc_buf_free(lgr, is_rmb, buf_desc); return ERR_PTR(-EAGAIN); } @@ -582,19 +659,20 @@ static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, IB_ACCESS_LOCAL_WRITE, buf_desc); if (rc) { - smc_buf_free(buf_desc, lnk, is_rmb); + smc_buf_free(lgr, is_rmb, buf_desc); return ERR_PTR(rc); } } + buf_desc->len = bufsize; return buf_desc; } static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) { + struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; - struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); struct list_head *buf_list; int bufsize, bufsize_short; int sk_buf_size; @@ -622,7 +700,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) continue; /* check for reusable slot in the link group */ - buf_desc = smc_buf_get_slot(lgr, bufsize_short, lock, buf_list); + buf_desc = smc_buf_get_slot(bufsize_short, lock, buf_list); if (buf_desc) { memset(buf_desc->cpu_addr, 0, bufsize); break; /* found reusable slot */ @@ -646,14 +724,12 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) if (is_rmb) { conn->rmb_desc = buf_desc; - conn->rmbe_size = bufsize; conn->rmbe_size_short = bufsize_short; smc->sk.sk_rcvbuf = bufsize * 2; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize); } else { conn->sndbuf_desc = buf_desc; - conn->sndbuf_size = bufsize; smc->sk.sk_sndbuf = bufsize * 2; atomic_set(&conn->sndbuf_space, bufsize); } @@ -709,8 +785,7 @@ int smc_buf_create(struct smc_sock *smc) /* create rmb */ rc = __smc_buf_create(smc, true); if (rc) - smc_buf_free(smc->conn.sndbuf_desc, - &smc->conn.lgr->lnk[SMC_SINGLE_LINK], false); + smc_buf_free(smc->conn.lgr, false, smc->conn.sndbuf_desc); return rc; } @@ -777,3 +852,21 @@ int smc_rmb_rtoken_handling(struct smc_connection *conn, return conn->rtoken_idx; return 0; } + +/* Called (from smc_exit) when module is removed */ +void smc_core_exit(void) +{ + struct smc_link_group *lgr, *lg; + LIST_HEAD(lgr_freeing_list); + + spin_lock_bh(&smc_lgr_list.lock); + if (!list_empty(&smc_lgr_list.list)) + list_splice_init(&smc_lgr_list.list, &lgr_freeing_list); + spin_unlock_bh(&smc_lgr_list.lock); + list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) { + list_del_init(&lgr->list); + smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); + cancel_delayed_work_sync(&lgr->free_work); + smc_lgr_free(lgr); /* free link group */ + } +} diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 5dfcb15d529f..93cb3523bf50 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -23,10 +23,9 @@ struct smc_lgr_list { /* list of link group definition */ struct list_head list; spinlock_t lock; /* protects list of link groups */ + u32 num; /* unique link group number */ }; -extern struct smc_lgr_list smc_lgr_list; /* list of link groups */ - enum smc_lgr_role { /* possible roles of a link group */ SMC_CLNT, /* client */ SMC_SERV /* server */ @@ -79,6 +78,7 @@ struct smc_link { dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ u64 wr_rx_id; /* seq # of last recv WR */ u32 wr_rx_cnt; /* number of WR recv buffers */ + unsigned long wr_rx_tstamp; /* jiffies when last buf rx */ struct ib_reg_wr wr_reg; /* WR register memory region */ wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ @@ -95,12 +95,18 @@ struct smc_link { u8 link_id; /* unique # within link group */ enum smc_link_state state; /* state of link */ + struct workqueue_struct *llc_wq; /* single thread work queue */ struct completion llc_confirm; /* wait for rx of conf link */ struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */ int llc_confirm_rc; /* rc from confirm link msg */ int llc_confirm_resp_rc; /* rc from conf_resp msg */ struct completion llc_add; /* wait for rx of add link */ struct completion llc_add_resp; /* wait for rx of add link rsp*/ + struct delayed_work llc_testlink_wrk; /* testlink worker */ + struct completion llc_testlink_resp; /* wait for rx of testlink */ + int llc_testlink_time; /* testlink interval */ + struct completion llc_confirm_rkey; /* wait 4 rx of cnf rkey */ + int llc_confirm_rkey_rc; /* rc from cnf rkey msg */ }; /* For now we just allow one parallel link per link group. The SMC protocol @@ -116,6 +122,8 @@ struct smc_link { struct smc_buf_desc { struct list_head list; void *cpu_addr; /* virtual address of buffer */ + struct page *pages; + int len; /* length of buffer */ struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];/* virtual buffer */ struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; /* for rmb only: memory region @@ -133,6 +141,12 @@ struct smc_rtoken { /* address/key of remote RMB */ }; #define SMC_LGR_ID_SIZE 4 +#define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ +#define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */ +/* theoretically, the RFC states that largest size would be 512K, + * i.e. compressed 5 and thus 6 sizes (0..5), despite + * struct smc_clc_msg_accept_confirm.rmbe_size being a 4 bit value (0..15) + */ struct smc_link_group { struct list_head list; @@ -158,7 +172,8 @@ struct smc_link_group { u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ - bool sync_err; /* lgr no longer fits to peer */ + u8 sync_err : 1; /* lgr no longer fits to peer */ + u8 terminating : 1;/* lgr is terminating */ }; /* Find the connection associated with the given alert token in the link group. @@ -196,11 +211,14 @@ static inline struct smc_connection *smc_lgr_find_conn( struct smc_sock; struct smc_clc_msg_accept_confirm; +struct smc_clc_msg_local; void smc_lgr_free(struct smc_link_group *lgr); void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_terminate(struct smc_link_group *lgr); +void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); int smc_buf_create(struct smc_sock *smc); +int smc_uncompress_bufsize(u8 compressed); int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc); int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey); @@ -209,4 +227,9 @@ void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); void smc_rmb_sync_sg_for_device(struct smc_connection *conn); +void smc_conn_free(struct smc_connection *conn); +int smc_conn_create(struct smc_sock *smc, + struct smc_ib_device *smcibdev, u8 ibport, + struct smc_clc_msg_local *lcl, int srv_first_contact); +void smc_core_exit(void); #endif diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 427b91c1c964..839354402215 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -38,17 +38,27 @@ static void smc_diag_msg_common_fill(struct smc_diag_msg *r, struct sock *sk) { struct smc_sock *smc = smc_sk(sk); - r->diag_family = sk->sk_family; if (!smc->clcsock) return; r->id.idiag_sport = htons(smc->clcsock->sk->sk_num); r->id.idiag_dport = smc->clcsock->sk->sk_dport; r->id.idiag_if = smc->clcsock->sk->sk_bound_dev_if; sock_diag_save_cookie(sk, r->id.idiag_cookie); - memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); - memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); - r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr; - r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr; + if (sk->sk_protocol == SMCPROTO_SMC) { + r->diag_family = PF_INET; + memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); + memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); + r->id.idiag_src[0] = smc->clcsock->sk->sk_rcv_saddr; + r->id.idiag_dst[0] = smc->clcsock->sk->sk_daddr; +#if IS_ENABLED(CONFIG_IPV6) + } else if (sk->sk_protocol == SMCPROTO_SMC6) { + r->diag_family = PF_INET6; + memcpy(&r->id.idiag_src, &smc->clcsock->sk->sk_v6_rcv_saddr, + sizeof(smc->clcsock->sk->sk_v6_rcv_saddr)); + memcpy(&r->id.idiag_dst, &smc->clcsock->sk->sk_v6_daddr, + sizeof(smc->clcsock->sk->sk_v6_daddr)); +#endif + } } static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, @@ -91,8 +101,9 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, struct smc_connection *conn = &smc->conn; struct smc_diag_conninfo cinfo = { .token = conn->alert_token_local, - .sndbuf_size = conn->sndbuf_size, - .rmbe_size = conn->rmbe_size, + .sndbuf_size = conn->sndbuf_desc ? + conn->sndbuf_desc->len : 0, + .rmbe_size = conn->rmb_desc ? conn->rmb_desc->len : 0, .peer_rmbe_size = conn->peer_rmbe_size, .rx_prod.wrap = conn->local_rx_ctrl.prod.wrap, @@ -153,7 +164,8 @@ errout: return -EMSGSIZE; } -static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +static int smc_diag_dump_proto(struct proto *prot, struct sk_buff *skb, + struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlattr *bc = NULL; @@ -161,8 +173,8 @@ static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) struct sock *sk; int rc = 0; - read_lock(&smc_proto.h.smc_hash->lock); - head = &smc_proto.h.smc_hash->ht; + read_lock(&prot->h.smc_hash->lock); + head = &prot->h.smc_hash->ht; if (hlist_empty(head)) goto out; @@ -175,7 +187,17 @@ static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) } out: - read_unlock(&smc_proto.h.smc_hash->lock); + read_unlock(&prot->h.smc_hash->lock); + return rc; +} + +static int smc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int rc = 0; + + rc = smc_diag_dump_proto(&smc_proto, skb, cb); + if (!rc) + rc = smc_diag_dump_proto(&smc_proto6, skb, cb); return rc; } diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 26df554f7588..0eed7ab9f28b 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -143,17 +143,6 @@ out: return rc; } -static void smc_ib_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) -{ - struct smc_link_group *lgr, *l; - - list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && - lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) - smc_lgr_terminate(lgr); - } -} - /* process context wrapper for might_sleep smc_ib_remember_port_attr */ static void smc_ib_port_event_work(struct work_struct *work) { @@ -165,7 +154,7 @@ static void smc_ib_port_event_work(struct work_struct *work) smc_ib_remember_port_attr(smcibdev, port_idx + 1); clear_bit(port_idx, &smcibdev->port_event_mask); if (!smc_ib_port_active(smcibdev, port_idx + 1)) - smc_ib_port_terminate(smcibdev, port_idx + 1); + smc_port_terminate(smcibdev, port_idx + 1); } } diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index ea4b21981b4b..5800a6b43d83 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -214,12 +214,11 @@ int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[], return rc; } -/* send ADD LINK request or response */ -int smc_llc_send_add_link(struct smc_link *link, u8 mac[], - union ib_gid *gid, - enum smc_llc_reqresp reqresp) +/* send LLC confirm rkey request */ +static int smc_llc_send_confirm_rkey(struct smc_link *link, + struct smc_buf_desc *rmb_desc) { - struct smc_llc_msg_add_link *addllc; + struct smc_llc_msg_confirm_rkey *rkeyllc; struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; int rc; @@ -227,7 +226,25 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) return rc; - addllc = (struct smc_llc_msg_add_link *)wr_buf; + rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf; + memset(rkeyllc, 0, sizeof(*rkeyllc)); + rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY; + rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey); + rkeyllc->rtoken[0].rmb_key = + htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64( + (u64)sg_dma_address(rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + return rc; +} + +/* prepare an add link message */ +static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc, + struct smc_link *link, u8 mac[], + union ib_gid *gid, + enum smc_llc_reqresp reqresp) +{ memset(addllc, 0, sizeof(*addllc)); addllc->hd.common.type = SMC_LLC_ADD_LINK; addllc->hd.length = sizeof(struct smc_llc_msg_add_link); @@ -239,16 +256,14 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], } memcpy(addllc->sender_mac, mac, ETH_ALEN); memcpy(addllc->sender_gid, gid, SMC_GID_SIZE); - /* send llc message */ - rc = smc_wr_tx_send(link, pend); - return rc; } -/* send DELETE LINK request or response */ -int smc_llc_send_delete_link(struct smc_link *link, - enum smc_llc_reqresp reqresp) +/* send ADD LINK request or response */ +int smc_llc_send_add_link(struct smc_link *link, u8 mac[], + union ib_gid *gid, + enum smc_llc_reqresp reqresp) { - struct smc_llc_msg_del_link *delllc; + struct smc_llc_msg_add_link *addllc; struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; int rc; @@ -256,7 +271,18 @@ int smc_llc_send_delete_link(struct smc_link *link, rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) return rc; - delllc = (struct smc_llc_msg_del_link *)wr_buf; + addllc = (struct smc_llc_msg_add_link *)wr_buf; + smc_llc_prep_add_link(addllc, link, mac, gid, reqresp); + /* send llc message */ + rc = smc_wr_tx_send(link, pend); + return rc; +} + +/* prepare a delete link message */ +static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc, + struct smc_link *link, + enum smc_llc_reqresp reqresp) +{ memset(delllc, 0, sizeof(*delllc)); delllc->hd.common.type = SMC_LLC_DELETE_LINK; delllc->hd.length = sizeof(struct smc_llc_msg_add_link); @@ -266,14 +292,29 @@ int smc_llc_send_delete_link(struct smc_link *link, delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; delllc->link_num = link->link_id; +} + +/* send DELETE LINK request or response */ +int smc_llc_send_delete_link(struct smc_link *link, + enum smc_llc_reqresp reqresp) +{ + struct smc_llc_msg_del_link *delllc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; + + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + delllc = (struct smc_llc_msg_del_link *)wr_buf; + smc_llc_prep_delete_link(delllc, link, reqresp); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; } -/* send LLC test link request or response */ -int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16], - enum smc_llc_reqresp reqresp) +/* send LLC test link request */ +static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) { struct smc_llc_msg_test_link *testllc; struct smc_wr_tx_pend_priv *pend; @@ -287,28 +328,52 @@ int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16], memset(testllc, 0, sizeof(*testllc)); testllc->hd.common.type = SMC_LLC_TEST_LINK; testllc->hd.length = sizeof(struct smc_llc_msg_test_link); - if (reqresp == SMC_LLC_RESP) - testllc->hd.flags |= SMC_LLC_FLAG_RESP; memcpy(testllc->user_data, user_data, sizeof(testllc->user_data)); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; } -/* send a prepared message */ -static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen) +struct smc_llc_send_work { + struct work_struct work; + struct smc_link *link; + int llclen; + union smc_llc_msg llcbuf; +}; + +/* worker that sends a prepared message */ +static void smc_llc_send_message_work(struct work_struct *work) { + struct smc_llc_send_work *llcwrk = container_of(work, + struct smc_llc_send_work, work); struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; int rc; - rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (llcwrk->link->state == SMC_LNK_INACTIVE) + goto out; + rc = smc_llc_add_pending_send(llcwrk->link, &wr_buf, &pend); if (rc) - return rc; - memcpy(wr_buf, llcbuf, llclen); - /* send llc message */ - rc = smc_wr_tx_send(link, pend); - return rc; + goto out; + memcpy(wr_buf, &llcwrk->llcbuf, llcwrk->llclen); + smc_wr_tx_send(llcwrk->link, pend); +out: + kfree(llcwrk); +} + +/* copy llcbuf and schedule an llc send on link */ +static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen) +{ + struct smc_llc_send_work *wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); + + if (!wrk) + return -ENOMEM; + INIT_WORK(&wrk->work, smc_llc_send_message_work); + wrk->link = link; + wrk->llclen = llclen; + memcpy(&wrk->llcbuf, llcbuf, llclen); + queue_work(link->llc_wq, &wrk->work); + return 0; } /********************************* receive ***********************************/ @@ -359,17 +424,18 @@ static void smc_llc_rx_add_link(struct smc_link *link, } if (lgr->role == SMC_SERV) { - smc_llc_send_add_link(link, + smc_llc_prep_add_link(llc, link, link->smcibdev->mac[link->ibport - 1], &link->smcibdev->gid[link->ibport - 1], SMC_LLC_REQ); } else { - smc_llc_send_add_link(link, + smc_llc_prep_add_link(llc, link, link->smcibdev->mac[link->ibport - 1], &link->smcibdev->gid[link->ibport - 1], SMC_LLC_RESP); } + smc_llc_send_message(link, llc, sizeof(*llc)); } } @@ -385,9 +451,11 @@ static void smc_llc_rx_delete_link(struct smc_link *link, } else { if (lgr->role == SMC_SERV) { smc_lgr_forget(lgr); - smc_llc_send_delete_link(link, SMC_LLC_REQ); + smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ); + smc_llc_send_message(link, llc, sizeof(*llc)); } else { - smc_llc_send_delete_link(link, SMC_LLC_RESP); + smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP); + smc_llc_send_message(link, llc, sizeof(*llc)); smc_lgr_terminate(lgr); } } @@ -397,9 +465,11 @@ static void smc_llc_rx_test_link(struct smc_link *link, struct smc_llc_msg_test_link *llc) { if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - /* unused as long as we don't send this type of msg */ + if (link->state == SMC_LNK_ACTIVE) + complete(&link->llc_testlink_resp); } else { - smc_llc_send_test_link(link, llc->user_data, SMC_LLC_RESP); + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc, sizeof(*llc)); } } @@ -412,7 +482,9 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link, lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - /* unused as long as we don't send this type of msg */ + link->llc_confirm_rkey_rc = llc->hd.flags & + SMC_LLC_FLAG_RKEY_NEG; + complete(&link->llc_confirm_rkey); } else { rc = smc_rtoken_add(lgr, llc->rtoken[0].rmb_vaddr, @@ -423,7 +495,7 @@ static void smc_llc_rx_confirm_rkey(struct smc_link *link, llc->hd.flags |= SMC_LLC_FLAG_RESP; if (rc < 0) llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - smc_llc_send_message(link, (void *)llc, sizeof(*llc)); + smc_llc_send_message(link, llc, sizeof(*llc)); } } @@ -435,7 +507,7 @@ static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, } else { /* ignore rtokens for other links, we have only one link */ llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, (void *)llc, sizeof(*llc)); + smc_llc_send_message(link, llc, sizeof(*llc)); } } @@ -463,7 +535,7 @@ static void smc_llc_rx_delete_rkey(struct smc_link *link, } llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, (void *)llc, sizeof(*llc)); + smc_llc_send_message(link, llc, sizeof(*llc)); } } @@ -476,6 +548,8 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) return; /* short message */ if (llc->raw.hdr.length != sizeof(*llc)) return; /* invalid message */ + if (link->state == SMC_LNK_INACTIVE) + return; /* link not active, drop msg */ switch (llc->raw.hdr.common.type) { case SMC_LLC_TEST_LINK: @@ -502,6 +576,100 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) } } +/***************************** worker, utils *********************************/ + +static void smc_llc_testlink_work(struct work_struct *work) +{ + struct smc_link *link = container_of(to_delayed_work(work), + struct smc_link, llc_testlink_wrk); + unsigned long next_interval; + struct smc_link_group *lgr; + unsigned long expire_time; + u8 user_data[16] = { 0 }; + int rc; + + lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + if (link->state != SMC_LNK_ACTIVE) + return; /* don't reschedule worker */ + expire_time = link->wr_rx_tstamp + link->llc_testlink_time; + if (time_is_after_jiffies(expire_time)) { + next_interval = expire_time - jiffies; + goto out; + } + reinit_completion(&link->llc_testlink_resp); + smc_llc_send_test_link(link, user_data); + /* receive TEST LINK response over RoCE fabric */ + rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp, + SMC_LLC_WAIT_TIME); + if (rc <= 0) { + smc_lgr_terminate(lgr); + return; + } + next_interval = link->llc_testlink_time; +out: + queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk, + next_interval); +} + +int smc_llc_link_init(struct smc_link *link) +{ + struct smc_link_group *lgr = container_of(link, struct smc_link_group, + lnk[SMC_SINGLE_LINK]); + link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM, + *((u32 *)lgr->id), + link->link_id); + if (!link->llc_wq) + return -ENOMEM; + init_completion(&link->llc_confirm); + init_completion(&link->llc_confirm_resp); + init_completion(&link->llc_add); + init_completion(&link->llc_add_resp); + init_completion(&link->llc_confirm_rkey); + init_completion(&link->llc_testlink_resp); + INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); + return 0; +} + +void smc_llc_link_active(struct smc_link *link, int testlink_time) +{ + link->state = SMC_LNK_ACTIVE; + if (testlink_time) { + link->llc_testlink_time = testlink_time * HZ; + queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk, + link->llc_testlink_time); + } +} + +/* called in tasklet context */ +void smc_llc_link_inactive(struct smc_link *link) +{ + link->state = SMC_LNK_INACTIVE; + cancel_delayed_work(&link->llc_testlink_wrk); +} + +/* called in worker context */ +void smc_llc_link_clear(struct smc_link *link) +{ + flush_workqueue(link->llc_wq); + destroy_workqueue(link->llc_wq); +} + +/* register a new rtoken at the remote peer */ +int smc_llc_do_confirm_rkey(struct smc_link *link, + struct smc_buf_desc *rmb_desc) +{ + int rc; + + reinit_completion(&link->llc_confirm_rkey); + smc_llc_send_confirm_rkey(link, rmb_desc); + /* receive CONFIRM RKEY response from server over RoCE fabric */ + rc = wait_for_completion_interruptible_timeout(&link->llc_confirm_rkey, + SMC_LLC_WAIT_TIME); + if (rc <= 0 || link->llc_confirm_rkey_rc) + return -EFAULT; + return 0; +} + /***************************** init, exit, misc ******************************/ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index e4a7d5e234d5..65c8645e96a1 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -42,8 +42,12 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], union ib_gid *gid, enum smc_llc_reqresp reqresp); int smc_llc_send_delete_link(struct smc_link *link, enum smc_llc_reqresp reqresp); -int smc_llc_send_test_link(struct smc_link *lnk, u8 user_data[16], - enum smc_llc_reqresp reqresp); +int smc_llc_link_init(struct smc_link *link); +void smc_llc_link_active(struct smc_link *link, int testlink_time); +void smc_llc_link_inactive(struct smc_link *link); +void smc_llc_link_clear(struct smc_link *link); +int smc_llc_do_confirm_rkey(struct smc_link *link, + struct smc_buf_desc *rmb_desc); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index eff4e0d0bb31..3d77b383cccd 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -22,11 +22,10 @@ #include "smc_tx.h" /* smc_tx_consumer_update() */ #include "smc_rx.h" -/* callback implementation for sk.sk_data_ready() - * to wakeup rcvbuf consumers that blocked with smc_rx_wait_data(). +/* callback implementation to wakeup consumers blocked with smc_rx_wait(). * indirectly called by smc_cdc_msg_recv_action(). */ -static void smc_rx_data_ready(struct sock *sk) +static void smc_rx_wake_up(struct sock *sk) { struct socket_wq *wq; @@ -44,28 +43,180 @@ static void smc_rx_data_ready(struct sock *sk) rcu_read_unlock(); } +/* Update consumer cursor + * @conn connection to update + * @cons consumer cursor + * @len number of Bytes consumed + * Returns: + * 1 if we should end our receive, 0 otherwise + */ +static int smc_rx_update_consumer(struct smc_sock *smc, + union smc_host_cursor cons, size_t len) +{ + struct smc_connection *conn = &smc->conn; + struct sock *sk = &smc->sk; + bool force = false; + int diff, rc = 0; + + smc_curs_add(conn->rmb_desc->len, &cons, len); + + /* did we process urgent data? */ + if (conn->urg_state == SMC_URG_VALID || conn->urg_rx_skip_pend) { + diff = smc_curs_comp(conn->rmb_desc->len, &cons, + &conn->urg_curs); + if (sock_flag(sk, SOCK_URGINLINE)) { + if (diff == 0) { + force = true; + rc = 1; + conn->urg_state = SMC_URG_READ; + } + } else { + if (diff == 1) { + /* skip urgent byte */ + force = true; + smc_curs_add(conn->rmb_desc->len, &cons, 1); + conn->urg_rx_skip_pend = false; + } else if (diff < -1) + /* we read past urgent byte */ + conn->urg_state = SMC_URG_READ; + } + } + + smc_curs_write(&conn->local_tx_ctrl.cons, smc_curs_read(&cons, conn), + conn); + + /* send consumer cursor update if required */ + /* similar to advertising new TCP rcv_wnd if required */ + smc_tx_consumer_update(conn, force); + + return rc; +} + +static void smc_rx_update_cons(struct smc_sock *smc, size_t len) +{ + struct smc_connection *conn = &smc->conn; + union smc_host_cursor cons; + + smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn), + conn); + smc_rx_update_consumer(smc, cons, len); +} + +struct smc_spd_priv { + struct smc_sock *smc; + size_t len; +}; + +static void smc_rx_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct smc_spd_priv *priv = (struct smc_spd_priv *)buf->private; + struct smc_sock *smc = priv->smc; + struct smc_connection *conn; + struct sock *sk = &smc->sk; + + if (sk->sk_state == SMC_CLOSED || + sk->sk_state == SMC_PEERFINCLOSEWAIT || + sk->sk_state == SMC_APPFINCLOSEWAIT) + goto out; + conn = &smc->conn; + lock_sock(sk); + smc_rx_update_cons(smc, priv->len); + release_sock(sk); + if (atomic_sub_and_test(priv->len, &conn->splice_pending)) + smc_rx_wake_up(sk); +out: + kfree(priv); + put_page(buf->page); + sock_put(sk); +} + +static int smc_rx_pipe_buf_nosteal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +static const struct pipe_buf_operations smc_pipe_ops = { + .can_merge = 0, + .confirm = generic_pipe_buf_confirm, + .release = smc_rx_pipe_buf_release, + .steal = smc_rx_pipe_buf_nosteal, + .get = generic_pipe_buf_get +}; + +static void smc_rx_spd_release(struct splice_pipe_desc *spd, + unsigned int i) +{ + put_page(spd->pages[i]); +} + +static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len, + struct smc_sock *smc) +{ + struct splice_pipe_desc spd; + struct partial_page partial; + struct smc_spd_priv *priv; + struct page *page; + int bytes; + + page = virt_to_page(smc->conn.rmb_desc->cpu_addr); + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + priv->len = len; + priv->smc = smc; + partial.offset = src - (char *)smc->conn.rmb_desc->cpu_addr; + partial.len = len; + partial.private = (unsigned long)priv; + + spd.nr_pages_max = 1; + spd.nr_pages = 1; + spd.pages = &page; + spd.partial = &partial; + spd.ops = &smc_pipe_ops; + spd.spd_release = smc_rx_spd_release; + + bytes = splice_to_pipe(pipe, &spd); + if (bytes > 0) { + sock_hold(&smc->sk); + get_page(smc->conn.rmb_desc->pages); + atomic_add(bytes, &smc->conn.splice_pending); + } + + return bytes; +} + +static int smc_rx_data_available_and_no_splice_pend(struct smc_connection *conn) +{ + return atomic_read(&conn->bytes_to_rcv) && + !atomic_read(&conn->splice_pending); +} + /* blocks rcvbuf consumer until >=len bytes available or timeout or interrupted * @smc smc socket * @timeo pointer to max seconds to wait, pointer to value 0 for no timeout + * @fcrit add'l criterion to evaluate as function pointer * Returns: * 1 if at least 1 byte available in rcvbuf or if socket error/shutdown. * 0 otherwise (nothing in rcvbuf nor timeout, e.g. interrupted). */ -static int smc_rx_wait_data(struct smc_sock *smc, long *timeo) +int smc_rx_wait(struct smc_sock *smc, long *timeo, + int (*fcrit)(struct smc_connection *conn)) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; struct sock *sk = &smc->sk; int rc; - if (atomic_read(&conn->bytes_to_rcv)) + if (fcrit(conn)) return 1; sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); add_wait_queue(sk_sleep(sk), &wait); rc = sk_wait_event(sk, timeo, sk->sk_err || sk->sk_shutdown & RCV_SHUTDOWN || - atomic_read(&conn->bytes_to_rcv) || + fcrit(conn) || smc_cdc_rxed_any_close_or_senddone(conn), &wait); remove_wait_queue(sk_sleep(sk), &wait); @@ -73,65 +224,115 @@ static int smc_rx_wait_data(struct smc_sock *smc, long *timeo) return rc; } -/* rcvbuf consumer: main API called by socket layer. - * called under sk lock. +static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len, + int flags) +{ + struct smc_connection *conn = &smc->conn; + union smc_host_cursor cons; + struct sock *sk = &smc->sk; + int rc = 0; + + if (sock_flag(sk, SOCK_URGINLINE) || + !(conn->urg_state == SMC_URG_VALID) || + conn->urg_state == SMC_URG_READ) + return -EINVAL; + + if (conn->urg_state == SMC_URG_VALID) { + if (!(flags & MSG_PEEK)) + smc->conn.urg_state = SMC_URG_READ; + msg->msg_flags |= MSG_OOB; + if (len > 0) { + if (!(flags & MSG_TRUNC)) + rc = memcpy_to_msg(msg, &conn->urg_rx_byte, 1); + len = 1; + smc_curs_write(&cons, + smc_curs_read(&conn->local_tx_ctrl.cons, + conn), + conn); + if (smc_curs_diff(conn->rmb_desc->len, &cons, + &conn->urg_curs) > 1) + conn->urg_rx_skip_pend = true; + /* Urgent Byte was already accounted for, but trigger + * skipping the urgent byte in non-inline case + */ + if (!(flags & MSG_PEEK)) + smc_rx_update_consumer(smc, cons, 0); + } else { + msg->msg_flags |= MSG_TRUNC; + } + + return rc ? -EFAULT : len; + } + + if (sk->sk_state == SMC_CLOSED || sk->sk_shutdown & RCV_SHUTDOWN) + return 0; + + return -EAGAIN; +} + +/* smc_rx_recvmsg - receive data from RMBE + * @msg: copy data to receive buffer + * @pipe: copy data to pipe if set - indicates splice() call + * + * rcvbuf consumer: main API called by socket layer. + * Called under sk lock. */ -int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, - int flags) +int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, + struct pipe_inode_info *pipe, size_t len, int flags) { size_t copylen, read_done = 0, read_remaining = len; size_t chunk_len, chunk_off, chunk_len_sum; struct smc_connection *conn = &smc->conn; + int (*func)(struct smc_connection *conn); union smc_host_cursor cons; int readable, chunk; char *rcvbuf_base; struct sock *sk; + int splbytes; long timeo; int target; /* Read at least these many bytes */ int rc; if (unlikely(flags & MSG_ERRQUEUE)) return -EINVAL; /* future work for sk.sk_family == AF_SMC */ - if (flags & MSG_OOB) - return -EINVAL; /* future work */ sk = &smc->sk; if (sk->sk_state == SMC_LISTEN) return -ENOTCONN; + if (flags & MSG_OOB) + return smc_rx_recv_urg(smc, msg, len, flags); timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); - msg->msg_namelen = 0; /* we currently use 1 RMBE per RMB, so RMBE == RMB base addr */ rcvbuf_base = conn->rmb_desc->cpu_addr; do { /* while (read_remaining) */ - if (read_done >= target) + if (read_done >= target || (pipe && read_done)) break; if (atomic_read(&conn->bytes_to_rcv)) goto copy; + else if (conn->urg_state == SMC_URG_VALID) + /* we received a single urgent Byte - skip */ + smc_rx_update_cons(smc, 0); + + if (sk->sk_shutdown & RCV_SHUTDOWN || + smc_cdc_rxed_any_close_or_senddone(conn) || + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort) + break; if (read_done) { if (sk->sk_err || sk->sk_state == SMC_CLOSED || - sk->sk_shutdown & RCV_SHUTDOWN || !timeo || - signal_pending(current) || - smc_cdc_rxed_any_close_or_senddone(conn) || - conn->local_tx_ctrl.conn_state_flags. - peer_conn_abort) + signal_pending(current)) break; } else { if (sk->sk_err) { read_done = sock_error(sk); break; } - if (sk->sk_shutdown & RCV_SHUTDOWN || - smc_cdc_rxed_any_close_or_senddone(conn) || - conn->local_tx_ctrl.conn_state_flags. - peer_conn_abort) - break; if (sk->sk_state == SMC_CLOSED) { if (!sock_flag(sk, SOCK_DONE)) { /* This occurs when user tries to read @@ -150,32 +351,56 @@ int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, return -EAGAIN; } - if (!atomic_read(&conn->bytes_to_rcv)) { - smc_rx_wait_data(smc, &timeo); + if (!smc_rx_data_available(conn)) { + smc_rx_wait(smc, &timeo, smc_rx_data_available); continue; } copy: /* initialize variables for 1st iteration of subsequent loop */ - /* could be just 1 byte, even after smc_rx_wait_data above */ + /* could be just 1 byte, even after waiting on data above */ readable = atomic_read(&conn->bytes_to_rcv); - /* not more than what user space asked for */ - copylen = min_t(size_t, read_remaining, readable); + splbytes = atomic_read(&conn->splice_pending); + if (!readable || (msg && splbytes)) { + if (splbytes) + func = smc_rx_data_available_and_no_splice_pend; + else + func = smc_rx_data_available; + smc_rx_wait(smc, &timeo, func); + continue; + } + smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn), conn); + /* subsequent splice() calls pick up where previous left */ + if (splbytes) + smc_curs_add(conn->rmb_desc->len, &cons, splbytes); + if (conn->urg_state == SMC_URG_VALID && + sock_flag(&smc->sk, SOCK_URGINLINE) && + readable > 1) + readable--; /* always stop at urgent Byte */ + /* not more than what user space asked for */ + copylen = min_t(size_t, read_remaining, readable); /* determine chunks where to read from rcvbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ - chunk_len = min_t(size_t, - copylen, conn->rmbe_size - cons.count); + chunk_len = min_t(size_t, copylen, conn->rmb_desc->len - + cons.count); chunk_len_sum = chunk_len; chunk_off = cons.count; smc_rmb_sync_sg_for_cpu(conn); for (chunk = 0; chunk < 2; chunk++) { if (!(flags & MSG_TRUNC)) { - rc = memcpy_to_msg(msg, rcvbuf_base + chunk_off, - chunk_len); - if (rc) { + if (msg) { + rc = memcpy_to_msg(msg, rcvbuf_base + + chunk_off, + chunk_len); + } else { + rc = smc_rx_splice(pipe, rcvbuf_base + + chunk_off, chunk_len, + smc); + } + if (rc < 0) { if (!read_done) read_done = -EFAULT; smc_rmb_sync_sg_for_device(conn); @@ -196,18 +421,13 @@ copy: /* update cursors */ if (!(flags & MSG_PEEK)) { - smc_curs_add(conn->rmbe_size, &cons, copylen); /* increased in recv tasklet smc_cdc_msg_rcv() */ smp_mb__before_atomic(); atomic_sub(copylen, &conn->bytes_to_rcv); - /* guarantee 0 <= bytes_to_rcv <= rmbe_size */ + /* guarantee 0 <= bytes_to_rcv <= rmb_desc->len */ smp_mb__after_atomic(); - smc_curs_write(&conn->local_tx_ctrl.cons, - smc_curs_read(&cons, conn), - conn); - /* send consumer cursor update if required */ - /* similar to advertising new TCP rcv_wnd if required */ - smc_tx_consumer_update(conn); + if (msg && smc_rx_update_consumer(smc, cons, copylen)) + goto out; } } while (read_remaining); out: @@ -217,5 +437,7 @@ out: /* Initialize receive properties on connection establishment. NB: not __init! */ void smc_rx_init(struct smc_sock *smc) { - smc->sk.sk_data_ready = smc_rx_data_ready; + smc->sk.sk_data_ready = smc_rx_wake_up; + atomic_set(&smc->conn.splice_pending, 0); + smc->conn.urg_state = SMC_URG_READ; } diff --git a/net/smc/smc_rx.h b/net/smc/smc_rx.h index 3a32b59bf06c..db823c97d824 100644 --- a/net/smc/smc_rx.h +++ b/net/smc/smc_rx.h @@ -18,7 +18,14 @@ #include "smc.h" void smc_rx_init(struct smc_sock *smc); -int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, size_t len, - int flags); + +int smc_rx_recvmsg(struct smc_sock *smc, struct msghdr *msg, + struct pipe_inode_info *pipe, size_t len, int flags); +int smc_rx_wait(struct smc_sock *smc, long *timeo, + int (*fcrit)(struct smc_connection *conn)); +static inline int smc_rx_data_available(struct smc_connection *conn) +{ + return atomic_read(&conn->bytes_to_rcv); +} #endif /* SMC_RX_H */ diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 72f004c9c9b1..cee666400752 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -19,6 +19,7 @@ #include <linux/sched/signal.h> #include <net/sock.h> +#include <net/tcp.h> #include "smc.h" #include "smc_wr.h" @@ -26,11 +27,12 @@ #include "smc_tx.h" #define SMC_TX_WORK_DELAY HZ +#define SMC_TX_CORK_DELAY (HZ >> 2) /* 250 ms */ /***************************** sndbuf producer *******************************/ /* callback implementation for sk.sk_write_space() - * to wakeup sndbuf producers that blocked with smc_tx_wait_memory(). + * to wakeup sndbuf producers that blocked with smc_tx_wait(). * called under sk_socket lock. */ static void smc_tx_write_space(struct sock *sk) @@ -54,7 +56,7 @@ static void smc_tx_write_space(struct sock *sk) } } -/* Wakeup sndbuf producers that blocked with smc_tx_wait_memory(). +/* Wakeup sndbuf producers that blocked with smc_tx_wait(). * Cf. tcp_data_snd_check()=>tcp_check_space()=>tcp_new_space(). */ void smc_tx_sndbuf_nonfull(struct smc_sock *smc) @@ -64,8 +66,10 @@ void smc_tx_sndbuf_nonfull(struct smc_sock *smc) smc->sk.sk_write_space(&smc->sk); } -/* blocks sndbuf producer until at least one byte of free space available */ -static int smc_tx_wait_memory(struct smc_sock *smc, int flags) +/* blocks sndbuf producer until at least one byte of free space available + * or urgent Byte was consumed + */ +static int smc_tx_wait(struct smc_sock *smc, int flags) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct smc_connection *conn = &smc->conn; @@ -101,20 +105,28 @@ static int smc_tx_wait_memory(struct smc_sock *smc, int flags) break; } sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); - if (atomic_read(&conn->sndbuf_space)) - break; /* at least 1 byte of free space available */ + if (atomic_read(&conn->sndbuf_space) && !conn->urg_tx_pend) + break; /* at least 1 byte of free & no urgent data */ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk_wait_event(sk, &timeo, sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN) || smc_cdc_rxed_any_close(conn) || - atomic_read(&conn->sndbuf_space), + (atomic_read(&conn->sndbuf_space) && + !conn->urg_tx_pend), &wait); } remove_wait_queue(sk_sleep(sk), &wait); return rc; } +static bool smc_tx_is_corked(struct smc_sock *smc) +{ + struct tcp_sock *tp = tcp_sk(smc->clcsock->sk); + + return (tp->nonagle & TCP_NAGLE_CORK) ? true : false; +} + /* sndbuf producer: main API called by socket layer. * called under sock lock. */ @@ -148,8 +160,11 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) if (smc_cdc_rxed_any_close(conn)) return send_done ?: -ECONNRESET; - if (!atomic_read(&conn->sndbuf_space)) { - rc = smc_tx_wait_memory(smc, msg->msg_flags); + if (msg->msg_flags & MSG_OOB) + conn->local_tx_ctrl.prod_flags.urg_data_pending = 1; + + if (!atomic_read(&conn->sndbuf_space) || conn->urg_tx_pend) { + rc = smc_tx_wait(smc, msg->msg_flags); if (rc) { if (send_done) return send_done; @@ -159,7 +174,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) } /* initialize variables for 1st iteration of subsequent loop */ - /* could be just 1 byte, even after smc_tx_wait_memory above */ + /* could be just 1 byte, even after smc_tx_wait above */ writespace = atomic_read(&conn->sndbuf_space); /* not more than what user space asked for */ copylen = min_t(size_t, send_remaining, writespace); @@ -171,8 +186,8 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) tx_cnt_prep = prep.count; /* determine chunks where to write into sndbuf */ /* either unwrapped case, or 1st chunk of wrapped case */ - chunk_len = min_t(size_t, - copylen, conn->sndbuf_size - tx_cnt_prep); + chunk_len = min_t(size_t, copylen, conn->sndbuf_desc->len - + tx_cnt_prep); chunk_len_sum = chunk_len; chunk_off = tx_cnt_prep; smc_sndbuf_sync_sg_for_cpu(conn); @@ -197,19 +212,30 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len) } smc_sndbuf_sync_sg_for_device(conn); /* update cursors */ - smc_curs_add(conn->sndbuf_size, &prep, copylen); + smc_curs_add(conn->sndbuf_desc->len, &prep, copylen); smc_curs_write(&conn->tx_curs_prep, smc_curs_read(&prep, conn), conn); /* increased in send tasklet smc_cdc_tx_handler() */ smp_mb__before_atomic(); atomic_sub(copylen, &conn->sndbuf_space); - /* guarantee 0 <= sndbuf_space <= sndbuf_size */ + /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ smp_mb__after_atomic(); /* since we just produced more new data into sndbuf, * trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ - smc_tx_sndbuf_nonempty(conn); + if ((msg->msg_flags & MSG_OOB) && !send_remaining) + conn->urg_tx_pend = true; + if ((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc)) && + (atomic_read(&conn->sndbuf_space) > + (conn->sndbuf_desc->len >> 1))) + /* for a corked socket defer the RDMA writes if there + * is still sufficient sndbuf_space available + */ + schedule_delayed_work(&conn->tx_work, + SMC_TX_CORK_DELAY); + else + smc_tx_sndbuf_nonempty(conn); } /* while (msg_data_left(msg)) */ return send_done; @@ -243,7 +269,7 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, rdma_wr.remote_addr = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr + /* RMBE within RMB */ - ((conn->peer_conn_idx - 1) * conn->peer_rmbe_size) + + conn->tx_off + /* offset within RMBE */ peer_rmbe_offset; rdma_wr.rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; @@ -268,7 +294,7 @@ static inline void smc_tx_advance_cursors(struct smc_connection *conn, atomic_sub(len, &conn->peer_rmbe_space); /* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */ smp_mb__after_atomic(); - smc_curs_add(conn->sndbuf_size, sent, len); + smc_curs_add(conn->sndbuf_desc->len, sent, len); } /* sndbuf consumer: prepare all necessary (src&dst) chunks of data transmit; @@ -281,6 +307,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) union smc_host_cursor sent, prep, prod, cons; struct ib_sge sges[SMC_IB_MAX_SEND_SGE]; struct smc_link_group *lgr = conn->lgr; + struct smc_cdc_producer_flags *pflags; int to_send, rmbespace; struct smc_link *link; dma_addr_t dma_addr; @@ -291,7 +318,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn); smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn); /* cf. wmem_alloc - (snd_max - snd_una) */ - to_send = smc_curs_diff(conn->sndbuf_size, &sent, &prep); + to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); if (to_send <= 0) return 0; @@ -308,7 +335,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) conn); /* if usable snd_wnd closes ask peer to advertise once it opens again */ - conn->local_tx_ctrl.prod_flags.write_blocked = (to_send >= rmbespace); + pflags = &conn->local_tx_ctrl.prod_flags; + pflags->write_blocked = (to_send >= rmbespace); /* cf. usable snd_wnd */ len = min(to_send, rmbespace); @@ -333,12 +361,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) dst_len_sum = dst_len; src_off = sent.count; /* dst_len determines the maximum src_len */ - if (sent.count + dst_len <= conn->sndbuf_size) { + if (sent.count + dst_len <= conn->sndbuf_desc->len) { /* unwrapped src case: single chunk of entire dst_len */ src_len = dst_len; } else { /* wrapped src case: 2 chunks of sum dst_len; start with 1st: */ - src_len = conn->sndbuf_size - sent.count; + src_len = conn->sndbuf_desc->len - sent.count; } src_len_sum = src_len; dma_addr = sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); @@ -350,8 +378,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) sges[srcchunk].lkey = link->roce_pd->local_dma_lkey; num_sges++; src_off += src_len; - if (src_off >= conn->sndbuf_size) - src_off -= conn->sndbuf_size; + if (src_off >= conn->sndbuf_desc->len) + src_off -= conn->sndbuf_desc->len; /* modulo in send ring */ if (src_len_sum == dst_len) break; /* either on 1st or 2nd iteration */ @@ -369,10 +397,12 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) dst_len = len - dst_len; /* remainder */ dst_len_sum += dst_len; src_len = min_t(int, - dst_len, conn->sndbuf_size - sent.count); + dst_len, conn->sndbuf_desc->len - sent.count); src_len_sum = src_len; } + if (conn->urg_tx_pend && len == to_send) + pflags->urg_data_present = 1; smc_tx_advance_cursors(conn, &prod, &sent, len); /* update connection's cursors with advanced local cursors */ smc_curs_write(&conn->local_tx_ctrl.prod, @@ -392,6 +422,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn) */ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) { + struct smc_cdc_producer_flags *pflags; struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; int rc; @@ -409,20 +440,27 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn) } rc = 0; if (conn->alert_token_local) /* connection healthy */ - schedule_delayed_work(&conn->tx_work, - SMC_TX_WORK_DELAY); + mod_delayed_work(system_wq, &conn->tx_work, + SMC_TX_WORK_DELAY); } goto out_unlock; } - rc = smc_tx_rdma_writes(conn); - if (rc) { - smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], - (struct smc_wr_tx_pend_priv *)pend); - goto out_unlock; + if (!conn->local_tx_ctrl.prod_flags.urg_data_present) { + rc = smc_tx_rdma_writes(conn); + if (rc) { + smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], + (struct smc_wr_tx_pend_priv *)pend); + goto out_unlock; + } } rc = smc_cdc_msg_send(conn, wr_buf, pend); + pflags = &conn->local_tx_ctrl.prod_flags; + if (!rc && pflags->urg_data_present) { + pflags->urg_data_pending = 0; + pflags->urg_data_present = 0; + } out_unlock: spin_unlock_bh(&conn->send_lock); @@ -432,7 +470,7 @@ out_unlock: /* Wakeup sndbuf consumers from process context * since there is more data to transmit */ -static void smc_tx_work(struct work_struct *work) +void smc_tx_work(struct work_struct *work) { struct smc_connection *conn = container_of(to_delayed_work(work), struct smc_connection, @@ -455,7 +493,7 @@ out: release_sock(&smc->sk); } -void smc_tx_consumer_update(struct smc_connection *conn) +void smc_tx_consumer_update(struct smc_connection *conn, bool force) { union smc_host_cursor cfed, cons; int to_confirm; @@ -466,11 +504,12 @@ void smc_tx_consumer_update(struct smc_connection *conn) smc_curs_write(&cfed, smc_curs_read(&conn->rx_curs_confirmed, conn), conn); - to_confirm = smc_curs_diff(conn->rmbe_size, &cfed, &cons); + to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons); if (conn->local_rx_ctrl.prod_flags.cons_curs_upd_req || + force || ((to_confirm > conn->rmbe_update_limit) && - ((to_confirm > (conn->rmbe_size / 2)) || + ((to_confirm > (conn->rmb_desc->len / 2)) || conn->local_rx_ctrl.prod_flags.write_blocked))) { if ((smc_cdc_get_slot_and_msg_send(conn) < 0) && conn->alert_token_local) { /* connection healthy */ @@ -494,6 +533,4 @@ void smc_tx_consumer_update(struct smc_connection *conn) void smc_tx_init(struct smc_sock *smc) { smc->sk.sk_write_space = smc_tx_write_space; - INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); - spin_lock_init(&smc->conn.send_lock); } diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h index 78255964fa4d..9d2238909fa0 100644 --- a/net/smc/smc_tx.h +++ b/net/smc/smc_tx.h @@ -24,13 +24,14 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn) smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn); smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn); - return smc_curs_diff(conn->sndbuf_size, &sent, &prep); + return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep); } +void smc_tx_work(struct work_struct *work); void smc_tx_init(struct smc_sock *smc); int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len); int smc_tx_sndbuf_nonempty(struct smc_connection *conn); void smc_tx_sndbuf_nonfull(struct smc_sock *smc); -void smc_tx_consumer_update(struct smc_connection *conn); +void smc_tx_consumer_update(struct smc_connection *conn, bool force); #endif /* SMC_TX_H */ diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 1b8af23e6e2b..cc7c1bb60fe8 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -376,6 +376,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) for (i = 0; i < num; i++) { link = wc[i].qp->qp_context; if (wc[i].status == IB_WC_SUCCESS) { + link->wr_rx_tstamp = jiffies; smc_wr_rx_demultiplex(&wc[i]); smc_wr_rx_post(link); /* refill WR RX */ } else { |