diff options
Diffstat (limited to 'net/smc')
-rw-r--r-- | net/smc/af_smc.c | 192 | ||||
-rw-r--r-- | net/smc/smc.h | 7 | ||||
-rw-r--r-- | net/smc/smc_cdc.c | 88 | ||||
-rw-r--r-- | net/smc/smc_cdc.h | 21 | ||||
-rw-r--r-- | net/smc/smc_clc.c | 12 | ||||
-rw-r--r-- | net/smc/smc_clc.h | 2 | ||||
-rw-r--r-- | net/smc/smc_core.c | 1175 | ||||
-rw-r--r-- | net/smc/smc_core.h | 156 | ||||
-rw-r--r-- | net/smc/smc_ib.c | 77 | ||||
-rw-r--r-- | net/smc/smc_ib.h | 10 | ||||
-rw-r--r-- | net/smc/smc_ism.c | 9 | ||||
-rw-r--r-- | net/smc/smc_llc.c | 1645 | ||||
-rw-r--r-- | net/smc/smc_llc.h | 69 | ||||
-rw-r--r-- | net/smc/smc_pnet.c | 593 | ||||
-rw-r--r-- | net/smc/smc_pnet.h | 7 | ||||
-rw-r--r-- | net/smc/smc_tx.c | 25 | ||||
-rw-r--r-- | net/smc/smc_wr.c | 62 | ||||
-rw-r--r-- | net/smc/smc_wr.h | 3 |
18 files changed, 3183 insertions, 970 deletions
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6fd44bdb0fc3..903321543838 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -337,50 +337,61 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } -/* register a new rmb, send confirm_rkey msg to register with peer */ -static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, - bool conf_rkey) -{ - if (!rmb_desc->wr_reg) { - /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) { - rmb_desc->regerr = 1; - return -EFAULT; - } - rmb_desc->wr_reg = 1; +/* register the new rmb on all links */ +static int smcr_lgr_reg_rmbs(struct smc_link *link, + struct smc_buf_desc *rmb_desc) +{ + struct smc_link_group *lgr = link->lgr; + int i, rc = 0; + + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (rc) + return rc; + /* protect against parallel smc_llc_cli_rkey_exchange() and + * parallel smcr_link_reg_rmb() + */ + mutex_lock(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_ACTIVE) + continue; + rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc); + if (rc) + goto out; } - if (!conf_rkey) - return 0; + /* exchange confirm_rkey msg with peer */ - if (smc_llc_do_confirm_rkey(link, rmb_desc)) { - rmb_desc->regerr = 1; - return -EFAULT; + rc = smc_llc_do_confirm_rkey(link, rmb_desc); + if (rc) { + rc = -EFAULT; + goto out; } - return 0; + rmb_desc->is_conf_rkey = true; +out: + mutex_unlock(&lgr->llc_conf_mutex); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + return rc; } -static int smc_clnt_conf_first_link(struct smc_sock *smc) +static int smcr_clnt_conf_first_link(struct smc_sock *smc) { - struct net *net = sock_net(smc->clcsock->sk); - struct smc_link_group *lgr = smc->conn.lgr; - struct smc_link *link; - int rest; + struct smc_link *link = smc->conn.lnk; + struct smc_llc_qentry *qentry; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; /* receive CONFIRM LINK request from server over RoCE fabric */ - rest = wait_for_completion_interruptible_timeout( - &link->llc_confirm, - SMC_LLC_WAIT_FIRST_TIME); - if (rest <= 0) { + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } - - if (link->llc_confirm_rc) + smc_llc_save_peer_uid(qentry); + rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ); + smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); + if (rc) return SMC_CLC_DECL_RMBE_EC; rc = smc_ib_modify_qp_rts(link); @@ -389,34 +400,34 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) smc_wr_remember_qp_attr(link); - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; + /* confirm_rkey is implicit on 1st contact */ + smc->conn.rmb_desc->is_conf_rkey = true; + /* send CONFIRM LINK response over RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP); if (rc < 0) return SMC_CLC_DECL_TIMEOUT_CL; - /* receive ADD LINK request from server over RoCE fabric */ - rest = wait_for_completion_interruptible_timeout(&link->llc_add, - SMC_LLC_WAIT_TIME); - if (rest <= 0) { + smc_llc_link_active(link); + smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); + + /* optional 2nd link, receive ADD LINK request from server */ + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc; + if (rc == -EAGAIN) + rc = 0; /* no DECLINE received, go with one link */ + return rc; } - - /* send add link reject message, only one link supported for now */ - rc = smc_llc_send_add_link(link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_RESP); - if (rc < 0) - return SMC_CLC_DECL_TIMEOUT_AL; - - smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); - + smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); + smc_llc_cli_add_link(link, qentry); return 0; } @@ -596,8 +607,8 @@ static int smc_connect_rdma(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { + int i, reason_code = 0; struct smc_link *link; - int reason_code = 0; ini->is_smcd = false; ini->ib_lcl = &aclc->lcl; @@ -610,10 +621,28 @@ static int smc_connect_rdma(struct smc_sock *smc, mutex_unlock(&smc_client_lgr_pending); return reason_code; } - link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK]; smc_conn_save_peer_info(smc, aclc); + if (ini->cln_first_contact == SMC_FIRST_CONTACT) { + link = smc->conn.lnk; + } else { + /* set link that was assigned by server */ + link = NULL; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *l = &smc->conn.lgr->lnk[i]; + + if (l->peer_qpn == ntoh24(aclc->qpn)) { + link = l; + break; + } + } + if (!link) + return smc_connect_abort(smc, SMC_CLC_DECL_NOSRVLINK, + ini->cln_first_contact); + smc->conn.lnk = link; + } + /* create send buffer and rmb */ if (smc_buf_create(smc, false)) return smc_connect_abort(smc, SMC_CLC_DECL_MEM, @@ -622,7 +651,7 @@ static int smc_connect_rdma(struct smc_sock *smc, if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, aclc); - if (smc_rmb_rtoken_handling(&smc->conn, aclc)) + if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK, ini->cln_first_contact); @@ -634,7 +663,7 @@ static int smc_connect_rdma(struct smc_sock *smc, return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, ini->cln_first_contact); } else { - if (smc_reg_rmb(link, smc->conn.rmb_desc, true)) + if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, ini->cln_first_contact); } @@ -649,7 +678,9 @@ static int smc_connect_rdma(struct smc_sock *smc, if (ini->cln_first_contact == SMC_FIRST_CONTACT) { /* QP confirmation over RoCE fabric */ - reason_code = smc_clnt_conf_first_link(smc); + smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); + reason_code = smcr_clnt_conf_first_link(smc); + smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); if (reason_code) return smc_connect_abort(smc, reason_code, ini->cln_first_contact); @@ -999,17 +1030,13 @@ void smc_close_non_accepted(struct sock *sk) sock_put(sk); /* final sock_put */ } -static int smc_serv_conf_first_link(struct smc_sock *smc) +static int smcr_serv_conf_first_link(struct smc_sock *smc) { - struct net *net = sock_net(smc->clcsock->sk); - struct smc_link_group *lgr = smc->conn.lgr; - struct smc_link *link; - int rest; + struct smc_link *link = smc->conn.lnk; + struct smc_llc_qentry *qentry; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; - - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; /* send CONFIRM LINK request to client over the RoCE fabric */ @@ -1018,40 +1045,29 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) return SMC_CLC_DECL_TIMEOUT_CL; /* receive CONFIRM LINK response from client over the RoCE fabric */ - rest = wait_for_completion_interruptible_timeout( - &link->llc_confirm_resp, - SMC_LLC_WAIT_FIRST_TIME); - if (rest <= 0) { + qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } - - if (link->llc_confirm_resp_rc) + smc_llc_save_peer_uid(qentry); + rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP); + smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); + if (rc) return SMC_CLC_DECL_RMBE_EC; - /* send ADD LINK request to client over the RoCE fabric */ - rc = smc_llc_send_add_link(link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_REQ); - if (rc < 0) - return SMC_CLC_DECL_TIMEOUT_AL; - - /* receive ADD LINK response from client over the RoCE fabric */ - rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp, - SMC_LLC_WAIT_TIME); - if (rest <= 0) { - struct smc_clc_msg_decline dclc; - - rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc; - } + /* confirm_rkey is implicit on 1st contact */ + smc->conn.rmb_desc->is_conf_rkey = true; - smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); + smc_llc_link_active(link); + smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); + /* initial contact - try to establish second link */ + smc_llc_srv_add_link(link); return 0; } @@ -1194,10 +1210,10 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, /* listen worker: register buffers */ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) { - struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + struct smc_connection *conn = &new_smc->conn; if (local_contact != SMC_FIRST_CONTACT) { - if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) + if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; } smc_rmb_sync_sg_for_device(&new_smc->conn); @@ -1210,13 +1226,13 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, struct smc_clc_msg_accept_confirm *cclc, int local_contact) { - struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = new_smc->conn.lnk; int reason_code = 0; if (local_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, cclc); - if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) { + if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) { reason_code = SMC_CLC_DECL_ERR_RTOK; goto decline; } @@ -1227,7 +1243,9 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, goto decline; } /* QP confirmation over RoCE fabric */ - reason_code = smc_serv_conf_first_link(new_smc); + smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); + reason_code = smcr_serv_conf_first_link(new_smc); + smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); if (reason_code) goto decline; } diff --git a/net/smc/smc.h b/net/smc/smc.h index be11ba41190f..6f1c42da7a4c 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -121,6 +121,7 @@ enum smc_urg_state { struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ + struct smc_link *lnk; /* assigned SMC-R link */ u32 alert_token_local; /* unique conn. id */ u8 peer_rmbe_idx; /* from tcp handshake */ int peer_rmbe_size; /* size of peer rx buffer */ @@ -142,6 +143,9 @@ struct smc_connection { * .prod cf. TCP snd_nxt * .cons cf. TCP sends ack */ + union smc_host_cursor local_tx_ctrl_fin; + /* prod crsr - confirmed by peer + */ union smc_host_cursor tx_curs_prep; /* tx - prepared data * snd_max..wmem_alloc */ @@ -153,6 +157,7 @@ struct smc_connection { */ atomic_t sndbuf_space; /* remaining space in sndbuf */ u16 tx_cdc_seq; /* sequence # for CDC send */ + u16 tx_cdc_seq_fin; /* sequence # - tx completed */ spinlock_t send_lock; /* protect wr_sends */ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ @@ -183,12 +188,14 @@ struct smc_connection { spinlock_t acurs_lock; /* protect cursors */ #endif struct work_struct close_work; /* peer sent some closing */ + struct work_struct abort_work; /* abort the connection */ struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */ u8 rx_off; /* receive offset: * 0 for SMC-R, 32 for SMC-D */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ + u8 out_of_sync : 1; /* out of sync with peer */ }; struct smc_sock { /* smc sock container */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 164f1584861b..a47e8855e045 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -47,17 +47,20 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ smp_mb__after_atomic(); smc_curs_copy(&conn->tx_curs_fin, &cdcpend->cursor, conn); + smc_curs_copy(&conn->local_tx_ctrl_fin, &cdcpend->p_cursor, + conn); + conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; } smc_tx_sndbuf_nonfull(smc); bh_unlock_sock(&smc->sk); } int smc_cdc_get_free_slot(struct smc_connection *conn, + struct smc_link *link, struct smc_wr_buf **wr_buf, struct smc_rdma_wr **wr_rdma_buf, struct smc_cdc_tx_pend **pend) { - struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; int rc; rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, @@ -91,12 +94,10 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend) { + struct smc_link *link = conn->lnk; union smc_host_cursor cfed; - struct smc_link *link; int rc; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; - smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; @@ -106,22 +107,60 @@ int smc_cdc_msg_send(struct smc_connection *conn, if (!rc) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; + } else { + conn->tx_cdc_seq--; + conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; } return rc; } +/* send a validation msg indicating the move of a conn to an other QP link */ +int smcr_cdc_msg_send_validation(struct smc_connection *conn, + struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf) +{ + struct smc_host_cdc_msg *local = &conn->local_tx_ctrl; + struct smc_link *link = conn->lnk; + struct smc_cdc_msg *peer; + int rc; + + peer = (struct smc_cdc_msg *)wr_buf; + peer->common.type = local->common.type; + peer->len = local->len; + peer->seqno = htons(conn->tx_cdc_seq_fin); /* seqno last compl. tx */ + peer->token = htonl(local->token); + peer->prod_flags.failover_validation = 1; + + rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); + return rc; +} + static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) { struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; + struct smc_link *link; + bool again = false; int rc; - rc = smc_cdc_get_free_slot(conn, &wr_buf, NULL, &pend); +again: + link = conn->lnk; + rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend); if (rc) return rc; spin_lock_bh(&conn->send_lock); + if (link != conn->lnk) { + /* link of connection changed, try again one time*/ + spin_unlock_bh(&conn->send_lock); + smc_wr_tx_put_slot(link, + (struct smc_wr_tx_pend_priv *)pend); + if (again) + return -ENOLINK; + again = true; + goto again; + } rc = smc_cdc_msg_send(conn, wr_buf, pend); spin_unlock_bh(&conn->send_lock); return rc; @@ -165,7 +204,7 @@ static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend) void smc_cdc_tx_dismiss_slots(struct smc_connection *conn) { - struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = conn->lnk; smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE, smc_cdc_tx_filter, smc_cdc_tx_dismisser, @@ -239,6 +278,28 @@ static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc, sk_send_sigurg(&smc->sk); } +static void smc_cdc_msg_validate(struct smc_sock *smc, struct smc_cdc_msg *cdc, + struct smc_link *link) +{ + struct smc_connection *conn = &smc->conn; + u16 recv_seq = ntohs(cdc->seqno); + s16 diff; + + /* check that seqnum was seen before */ + diff = conn->local_rx_ctrl.seqno - recv_seq; + if (diff < 0) { /* diff larger than 0x7fff */ + /* drop connection */ + conn->out_of_sync = 1; /* prevent any further receives */ + spin_lock_bh(&conn->send_lock); + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + conn->lnk = link; + spin_unlock_bh(&conn->send_lock); + sock_hold(&smc->sk); /* sock_put in abort_work */ + if (!schedule_work(&conn->abort_work)) + sock_put(&smc->sk); + } +} + static void smc_cdc_msg_recv_action(struct smc_sock *smc, struct smc_cdc_msg *cdc) { @@ -369,16 +430,19 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) read_lock_bh(&lgr->conns_lock); conn = smc_lgr_find_conn(ntohl(cdc->token), lgr); read_unlock_bh(&lgr->conns_lock); - if (!conn) + if (!conn || conn->out_of_sync) return; smc = container_of(conn, struct smc_sock, conn); - if (!cdc->prod_flags.failover_validation) { - if (smc_cdc_before(ntohs(cdc->seqno), - conn->local_rx_ctrl.seqno)) - /* received seqno is old */ - return; + if (cdc->prod_flags.failover_validation) { + smc_cdc_msg_validate(smc, cdc, link); + return; } + if (smc_cdc_before(ntohs(cdc->seqno), + conn->local_rx_ctrl.seqno)) + /* received seqno is old */ + return; + smc_cdc_msg_recv(smc, cdc); } diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 861dc24c588c..0a0a89abd38b 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -97,23 +97,6 @@ static inline void smc_curs_add(int size, union smc_host_cursor *curs, } } -/* SMC cursors are 8 bytes long and require atomic reading and writing */ -static inline u64 smc_curs_read(union smc_host_cursor *curs, - struct smc_connection *conn) -{ -#ifndef KERNEL_HAS_ATOMIC64 - unsigned long flags; - u64 ret; - - spin_lock_irqsave(&conn->acurs_lock, flags); - ret = curs->acurs; - spin_unlock_irqrestore(&conn->acurs_lock, flags); - return ret; -#else - return atomic64_read(&curs->acurs); -#endif -} - /* Copy cursor src into tgt */ static inline void smc_curs_copy(union smc_host_cursor *tgt, union smc_host_cursor *src, @@ -304,6 +287,7 @@ struct smc_cdc_tx_pend { }; int smc_cdc_get_free_slot(struct smc_connection *conn, + struct smc_link *link, struct smc_wr_buf **wr_buf, struct smc_rdma_wr **wr_rdma_buf, struct smc_cdc_tx_pend **pend); @@ -312,6 +296,9 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend); int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); int smcd_cdc_msg_send(struct smc_connection *conn); +int smcr_cdc_msg_send_validation(struct smc_connection *conn, + struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf); int smc_cdc_init(void) __init; void smcd_cdc_rx_init(struct smc_connection *conn); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index ea0068f0173c..d5627df24215 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -496,7 +496,7 @@ int smc_clc_send_confirm(struct smc_sock *smc) sizeof(SMCD_EYECATCHER)); } else { /* SMC-R specific settings */ - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + link = conn->lnk; memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); cclc.hdr.path = SMC_TYPE_R; @@ -508,13 +508,13 @@ int smc_clc_send_confirm(struct smc_sock *smc) ETH_ALEN); hton24(cclc.qpn, link->roce_qp->qp_num); cclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ cclc.rmbe_alert_token = htonl(conn->alert_token_local); cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); cclc.rmbe_size = conn->rmbe_size_short; cclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(cclc.psn, link->psn_initial); memcpy(cclc.smcr_trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); @@ -572,7 +572,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); aclc.hdr.path = SMC_TYPE_R; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + link = conn->lnk; memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); memcpy(&aclc.lcl.gid, link->gid, SMC_GID_SIZE); @@ -580,13 +580,13 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) ETH_ALEN); hton24(aclc.qpn, link->roce_qp->qp_num); aclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */ aclc.rmbe_alert_token = htonl(conn->alert_token_local); aclc.qp_mtu = link->path_mtu; aclc.rmbe_size = conn->rmbe_size_short, aclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(aclc.psn, link->psn_initial); memcpy(aclc.smcr_trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index ca209272e5fa..465876701b75 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -44,6 +44,8 @@ #define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP prefix / subnet mismatch */ #define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/ #define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */ +#define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */ +#define SMC_CLC_DECL_NOSRVLINK 0x030b0000 /* SMC-R link from srv not found */ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 824c5211b027..7964a21e5e6f 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -44,10 +44,20 @@ static struct smc_lgr_list smc_lgr_list = { /* established link groups */ static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */ static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted); +struct smc_ib_up_work { + struct work_struct work; + struct smc_link_group *lgr; + struct smc_ib_device *smcibdev; + u8 ibport; +}; + static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc); static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft); +static void smc_link_up_work(struct work_struct *work); +static void smc_link_down_work(struct work_struct *work); + /* return head of link group list and its lock for a given link group */ static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, spinlock_t **lgr_lock) @@ -111,16 +121,60 @@ static void smc_lgr_add_alert_token(struct smc_connection *conn) rb_insert_color(&conn->alert_node, &conn->lgr->conns_all); } +/* assign an SMC-R link to the connection */ +static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) +{ + enum smc_link_state expected = first ? SMC_LNK_ACTIVATING : + SMC_LNK_ACTIVE; + int i, j; + + /* do link balancing */ + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &conn->lgr->lnk[i]; + + if (lnk->state != expected || lnk->link_is_asym) + continue; + if (conn->lgr->role == SMC_CLNT) { + conn->lnk = lnk; /* temporary, SMC server assigns link*/ + break; + } + if (conn->lgr->conns_num % 2) { + for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) { + struct smc_link *lnk2; + + lnk2 = &conn->lgr->lnk[j]; + if (lnk2->state == expected && + !lnk2->link_is_asym) { + conn->lnk = lnk2; + break; + } + } + } + if (!conn->lnk) + conn->lnk = lnk; + break; + } + if (!conn->lnk) + return SMC_CLC_DECL_NOACTLINK; + return 0; +} + /* Register connection in link group by assigning an alert token * registered in a search tree. * Requires @conns_lock * Note that '0' is a reserved value and not assigned. */ -static void smc_lgr_register_conn(struct smc_connection *conn) +static int smc_lgr_register_conn(struct smc_connection *conn, bool first) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); static atomic_t nexttoken = ATOMIC_INIT(0); + int rc; + if (!conn->lgr->is_smcd) { + rc = smcr_lgr_conn_assign_link(conn, first); + if (rc) + return rc; + } /* find a new alert_token_local value not yet used by some connection * in this link group */ @@ -132,6 +186,7 @@ static void smc_lgr_register_conn(struct smc_connection *conn) } smc_lgr_add_alert_token(conn); conn->lgr->conns_num++; + return 0; } /* Unregister connection and reset the alert token of the given connection< @@ -166,27 +221,33 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) void smc_lgr_cleanup_early(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + struct list_head *lgr_list; + spinlock_t *lgr_lock; if (!lgr) return; smc_conn_free(conn); - smc_lgr_forget(lgr); + lgr_list = smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); + /* do not use this link group for new connections */ + if (!list_empty(lgr_list)) + list_del_init(lgr_list); + spin_unlock_bh(lgr_lock); smc_lgr_schedule_free_work_fast(lgr); } -/* Send delete link, either as client to request the initiation - * of the DELETE LINK sequence from server; or as server to - * initiate the delete processing. See smc_llc_rx_delete_link(). - */ -static int smc_link_send_delete(struct smc_link *lnk, bool orderly) +static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr) { - if (lnk->state == SMC_LNK_ACTIVE && - !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) { - smc_llc_link_deleting(lnk); - return 0; + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + + if (smc_link_usable(lnk)) + lnk->state = SMC_LNK_INACTIVE; } - return -ENOTCONN; + wake_up_interruptible_all(&lgr->llc_waiter); } static void smc_lgr_free(struct smc_link_group *lgr); @@ -197,7 +258,6 @@ static void smc_lgr_free_work(struct work_struct *work) struct smc_link_group, free_work); spinlock_t *lgr_lock; - struct smc_link *lnk; bool conns; smc_lgr_list_head(lgr, &lgr_lock); @@ -214,26 +274,17 @@ static void smc_lgr_free_work(struct work_struct *work) return; } list_del_init(&lgr->list); /* remove from smc_lgr_list */ - - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - if (!lgr->is_smcd && !lgr->terminating) { - /* try to send del link msg, on error free lgr immediately */ - if (lnk->state == SMC_LNK_ACTIVE && - !smc_link_send_delete(lnk, true)) { - /* reschedule in case we never receive a response */ - smc_lgr_schedule_free_work(lgr); - spin_unlock_bh(lgr_lock); - return; - } - } lgr->freeing = 1; /* this instance does the freeing, no new schedule */ spin_unlock_bh(lgr_lock); cancel_delayed_work(&lgr->free_work); - if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(lnk); + if (!lgr->is_smcd && !lgr->terminating) + smc_llc_send_link_delete_all(lgr, true, + SMC_LLC_DEL_PROG_INIT_TERM); if (lgr->is_smcd && !lgr->terminating) smc_ism_signal_shutdown(lgr); + if (!lgr->is_smcd) + smcr_lgr_link_deactivate_all(lgr); smc_lgr_free(lgr); } @@ -245,6 +296,89 @@ static void smc_lgr_terminate_work(struct work_struct *work) __smc_lgr_terminate(lgr, true); } +/* return next unique link id for the lgr */ +static u8 smcr_next_link_id(struct smc_link_group *lgr) +{ + u8 link_id; + int i; + + while (1) { + link_id = ++lgr->next_link_id; + if (!link_id) /* skip zero as link_id */ + link_id = ++lgr->next_link_id; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (smc_link_usable(&lgr->lnk[i]) && + lgr->lnk[i].link_id == link_id) + continue; + } + break; + } + return link_id; +} + +int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, + u8 link_idx, struct smc_init_info *ini) +{ + u8 rndvec[3]; + int rc; + + get_device(&ini->ib_dev->ibdev->dev); + atomic_inc(&ini->ib_dev->lnk_cnt); + lnk->state = SMC_LNK_ACTIVATING; + lnk->link_id = smcr_next_link_id(lgr); + lnk->lgr = lgr; + lnk->link_idx = link_idx; + lnk->smcibdev = ini->ib_dev; + lnk->ibport = ini->ib_port; + lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; + smc_llc_link_set_uid(lnk); + INIT_WORK(&lnk->link_down_wrk, smc_link_down_work); + if (!ini->ib_dev->initialized) { + rc = (int)smc_ib_setup_per_ibdev(ini->ib_dev); + if (rc) + goto out; + } + get_random_bytes(rndvec, sizeof(rndvec)); + lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + + (rndvec[2] << 16); + rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, + ini->vlan_id, lnk->gid, &lnk->sgid_index); + if (rc) + goto out; + rc = smc_llc_link_init(lnk); + if (rc) + goto out; + rc = smc_wr_alloc_link_mem(lnk); + if (rc) + goto clear_llc_lnk; + rc = smc_ib_create_protection_domain(lnk); + if (rc) + goto free_link_mem; + rc = smc_ib_create_queue_pair(lnk); + if (rc) + goto dealloc_pd; + rc = smc_wr_create_link(lnk); + if (rc) + goto destroy_qp; + return 0; + +destroy_qp: + smc_ib_destroy_queue_pair(lnk); +dealloc_pd: + smc_ib_dealloc_protection_domain(lnk); +free_link_mem: + smc_wr_free_link_mem(lnk); +clear_llc_lnk: + smc_llc_link_clear(lnk, false); +out: + put_device(&ini->ib_dev->ibdev->dev); + memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; + if (!atomic_dec_return(&ini->ib_dev->lnk_cnt)) + wake_up(&ini->ib_dev->lnks_deleted); + return rc; +} + /* create a new SMC link group */ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { @@ -252,7 +386,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) struct list_head *lgr_list; struct smc_link *lnk; spinlock_t *lgr_lock; - u8 rndvec[3]; + u8 link_idx; int rc = 0; int i; @@ -274,13 +408,14 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->freefast = 0; lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; - rwlock_init(&lgr->sndbufs_lock); - rwlock_init(&lgr->rmbs_lock); + mutex_init(&lgr->sndbufs_lock); + mutex_init(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); } + lgr->next_link_id = 0; smc_lgr_list.num += SMC_LGR_NUM_INCR; memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); @@ -297,48 +432,21 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) atomic_inc(&ini->ism_dev->lgr_cnt); } else { /* SMC-R specific settings */ - get_device(&ini->ib_dev->ibdev->dev); lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); + memcpy(lgr->pnet_id, ini->ib_dev->pnetid[ini->ib_port - 1], + SMC_MAX_PNETID_LEN); + smc_llc_lgr_init(lgr, smc); - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - /* initialize link */ - lnk->state = SMC_LNK_ACTIVATING; - lnk->link_id = SMC_SINGLE_LINK; - lnk->smcibdev = ini->ib_dev; - lnk->ibport = ini->ib_port; - lgr_list = &smc_lgr_list.list; - lgr_lock = &smc_lgr_list.lock; - lnk->path_mtu = - ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; - if (!ini->ib_dev->initialized) - smc_ib_setup_per_ibdev(ini->ib_dev); - get_random_bytes(rndvec, sizeof(rndvec)); - lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + - (rndvec[2] << 16); - rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, - ini->vlan_id, lnk->gid, - &lnk->sgid_index); + link_idx = SMC_SINGLE_LINK; + lnk = &lgr->lnk[link_idx]; + rc = smcr_link_init(lgr, lnk, link_idx, ini); if (rc) goto free_lgr; - rc = smc_llc_link_init(lnk); - if (rc) - goto free_lgr; - rc = smc_wr_alloc_link_mem(lnk); - if (rc) - goto clear_llc_lnk; - rc = smc_ib_create_protection_domain(lnk); - if (rc) - goto free_link_mem; - rc = smc_ib_create_queue_pair(lnk); - if (rc) - goto dealloc_pd; - rc = smc_wr_create_link(lnk); - if (rc) - goto destroy_qp; + lgr_list = &smc_lgr_list.list; + lgr_lock = &smc_lgr_list.lock; atomic_inc(&lgr_cnt); - atomic_inc(&ini->ib_dev->lnk_cnt); } smc->conn.lgr = lgr; spin_lock_bh(lgr_lock); @@ -346,14 +454,6 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) spin_unlock_bh(lgr_lock); return 0; -destroy_qp: - smc_ib_destroy_queue_pair(lnk); -dealloc_pd: - smc_ib_dealloc_protection_domain(lnk); -free_link_mem: - smc_wr_free_link_mem(lnk); -clear_llc_lnk: - smc_llc_link_clear(lnk); free_lgr: kfree(lgr); ism_put_vlan: @@ -369,29 +469,186 @@ out: return rc; } +static int smc_write_space(struct smc_connection *conn) +{ + int buffer_len = conn->peer_rmbe_size; + union smc_host_cursor prod; + union smc_host_cursor cons; + int space; + + smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); + smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); + /* determine rx_buf space */ + space = buffer_len - smc_curs_diff(buffer_len, &cons, &prod); + return space; +} + +static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf) +{ + struct smc_connection *conn = &smc->conn; + union smc_host_cursor cons, fin; + int rc = 0; + int diff; + + smc_curs_copy(&conn->tx_curs_sent, &conn->tx_curs_fin, conn); + smc_curs_copy(&fin, &conn->local_tx_ctrl_fin, conn); + /* set prod cursor to old state, enforce tx_rdma_writes() */ + smc_curs_copy(&conn->local_tx_ctrl.prod, &fin, conn); + smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); + + if (smc_curs_comp(conn->peer_rmbe_size, &cons, &fin) < 0) { + /* cons cursor advanced more than fin, and prod was set + * fin above, so now prod is smaller than cons. Fix that. + */ + diff = smc_curs_diff(conn->peer_rmbe_size, &fin, &cons); + smc_curs_add(conn->sndbuf_desc->len, + &conn->tx_curs_sent, diff); + smc_curs_add(conn->sndbuf_desc->len, + &conn->tx_curs_fin, diff); + + smp_mb__before_atomic(); + atomic_add(diff, &conn->sndbuf_space); + smp_mb__after_atomic(); + + smc_curs_add(conn->peer_rmbe_size, + &conn->local_tx_ctrl.prod, diff); + smc_curs_add(conn->peer_rmbe_size, + &conn->local_tx_ctrl_fin, diff); + } + /* recalculate, value is used by tx_rdma_writes() */ + atomic_set(&smc->conn.peer_rmbe_space, smc_write_space(conn)); + + if (smc->sk.sk_state != SMC_INIT && + smc->sk.sk_state != SMC_CLOSED) { + rc = smcr_cdc_msg_send_validation(conn, pend, wr_buf); + if (!rc) { + schedule_delayed_work(&conn->tx_work, 0); + smc->sk.sk_data_ready(&smc->sk); + } + } else { + smc_wr_tx_put_slot(conn->lnk, + (struct smc_wr_tx_pend_priv *)pend); + } + return rc; +} + +struct smc_link *smc_switch_conns(struct smc_link_group *lgr, + struct smc_link *from_lnk, bool is_dev_err) +{ + struct smc_link *to_lnk = NULL; + struct smc_cdc_tx_pend *pend; + struct smc_connection *conn; + struct smc_wr_buf *wr_buf; + struct smc_sock *smc; + struct rb_node *node; + int i, rc = 0; + + /* link is inactive, wake up tx waiters */ + smc_wr_wakeup_tx_wait(from_lnk); + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_ACTIVE || + i == from_lnk->link_idx) + continue; + if (is_dev_err && from_lnk->smcibdev == lgr->lnk[i].smcibdev && + from_lnk->ibport == lgr->lnk[i].ibport) { + continue; + } + to_lnk = &lgr->lnk[i]; + break; + } + if (!to_lnk) { + smc_lgr_terminate_sched(lgr); + return NULL; + } +again: + read_lock_bh(&lgr->conns_lock); + for (node = rb_first(&lgr->conns_all); node; node = rb_next(node)) { + conn = rb_entry(node, struct smc_connection, alert_node); + if (conn->lnk != from_lnk) + continue; + smc = container_of(conn, struct smc_sock, conn); + /* conn->lnk not yet set in SMC_INIT state */ + if (smc->sk.sk_state == SMC_INIT) + continue; + if (smc->sk.sk_state == SMC_CLOSED || + smc->sk.sk_state == SMC_PEERCLOSEWAIT1 || + smc->sk.sk_state == SMC_PEERCLOSEWAIT2 || + smc->sk.sk_state == SMC_APPFINCLOSEWAIT || + smc->sk.sk_state == SMC_APPCLOSEWAIT1 || + smc->sk.sk_state == SMC_APPCLOSEWAIT2 || + smc->sk.sk_state == SMC_PEERFINCLOSEWAIT || + smc->sk.sk_state == SMC_PEERABORTWAIT || + smc->sk.sk_state == SMC_PROCESSABORT) { + spin_lock_bh(&conn->send_lock); + conn->lnk = to_lnk; + spin_unlock_bh(&conn->send_lock); + continue; + } + sock_hold(&smc->sk); + read_unlock_bh(&lgr->conns_lock); + /* pre-fetch buffer outside of send_lock, might sleep */ + rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend); + if (rc) { + smcr_link_down_cond_sched(to_lnk); + return NULL; + } + /* avoid race with smcr_tx_sndbuf_nonempty() */ + spin_lock_bh(&conn->send_lock); + conn->lnk = to_lnk; + rc = smc_switch_cursor(smc, pend, wr_buf); + spin_unlock_bh(&conn->send_lock); + sock_put(&smc->sk); + if (rc) { + smcr_link_down_cond_sched(to_lnk); + return NULL; + } + goto again; + } + read_unlock_bh(&lgr->conns_lock); + return to_lnk; +} + +static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, + struct smc_link_group *lgr) +{ + int rc; + + if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) { + /* unregister rmb with peer */ + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (!rc) { + /* protect against smc_llc_cli_rkey_exchange() */ + mutex_lock(&lgr->llc_conf_mutex); + smc_llc_do_delete_rkey(lgr, rmb_desc); + rmb_desc->is_conf_rkey = false; + mutex_unlock(&lgr->llc_conf_mutex); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + } + } + + if (rmb_desc->is_reg_err) { + /* buf registration failed, reuse not possible */ + mutex_lock(&lgr->rmbs_lock); + list_del(&rmb_desc->list); + mutex_unlock(&lgr->rmbs_lock); + + smc_buf_free(lgr, true, rmb_desc); + } else { + rmb_desc->used = 0; + } +} + static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { if (conn->sndbuf_desc) conn->sndbuf_desc->used = 0; - if (conn->rmb_desc) { - if (!conn->rmb_desc->regerr) { - if (!lgr->is_smcd && !list_empty(&lgr->list)) { - /* unregister rmb with peer */ - smc_llc_do_delete_rkey( - &lgr->lnk[SMC_SINGLE_LINK], - conn->rmb_desc); - } - conn->rmb_desc->used = 0; - } else { - /* buf registration failed, reuse not possible */ - write_lock_bh(&lgr->rmbs_lock); - list_del(&conn->rmb_desc->list); - write_unlock_bh(&lgr->rmbs_lock); - - smc_buf_free(lgr, true, conn->rmb_desc); - } - } + if (conn->rmb_desc && lgr->is_smcd) + conn->rmb_desc->used = 0; + else if (conn->rmb_desc) + smcr_buf_unuse(conn->rmb_desc, lgr); } /* remove a finished connection from its link group */ @@ -407,6 +664,8 @@ void smc_conn_free(struct smc_connection *conn) tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_tx_dismiss_slots(conn); + if (current_work() != &conn->abort_work) + cancel_work_sync(&conn->abort_work); } if (!list_empty(&lgr->list)) { smc_lgr_unregister_conn(conn); @@ -417,35 +676,91 @@ void smc_conn_free(struct smc_connection *conn) smc_lgr_schedule_free_work(lgr); } -static void smc_link_clear(struct smc_link *lnk) +/* unregister a link from a buf_desc */ +static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb, + struct smc_link *lnk) +{ + if (is_rmb) + buf_desc->is_reg_mr[lnk->link_idx] = false; + if (!buf_desc->is_map_ib[lnk->link_idx]) + return; + if (is_rmb) { + if (buf_desc->mr_rx[lnk->link_idx]) { + smc_ib_put_memory_region( + buf_desc->mr_rx[lnk->link_idx]); + buf_desc->mr_rx[lnk->link_idx] = NULL; + } + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); + } else { + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); + } + sg_free_table(&buf_desc->sgt[lnk->link_idx]); + buf_desc->is_map_ib[lnk->link_idx] = false; +} + +/* unmap all buffers of lgr for a deleted link */ +static void smcr_buf_unmap_lgr(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_buf_desc *buf_desc, *bf; + int i; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + mutex_lock(&lgr->rmbs_lock); + list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) + smcr_buf_unmap_link(buf_desc, true, lnk); + mutex_unlock(&lgr->rmbs_lock); + mutex_lock(&lgr->sndbufs_lock); + list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], + list) + smcr_buf_unmap_link(buf_desc, false, lnk); + mutex_unlock(&lgr->sndbufs_lock); + } +} + +static void smcr_rtoken_clear_link(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + int i; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + lgr->rtokens[i][lnk->link_idx].rkey = 0; + lgr->rtokens[i][lnk->link_idx].dma_addr = 0; + } +} + +/* must be called under lgr->llc_conf_mutex lock */ +void smcr_link_clear(struct smc_link *lnk, bool log) { + struct smc_ib_device *smcibdev; + + if (!lnk->lgr || lnk->state == SMC_LNK_UNUSED) + return; lnk->peer_qpn = 0; - smc_llc_link_clear(lnk); + smc_llc_link_clear(lnk, log); + smcr_buf_unmap_lgr(lnk); + smcr_rtoken_clear_link(lnk); smc_ib_modify_qp_reset(lnk); smc_wr_free_link(lnk); smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); - if (!atomic_dec_return(&lnk->smcibdev->lnk_cnt)) - wake_up(&lnk->smcibdev->lnks_deleted); + put_device(&lnk->smcibdev->ibdev->dev); + smcibdev = lnk->smcibdev; + memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; + if (!atomic_dec_return(&smcibdev->lnk_cnt)) + wake_up(&smcibdev->lnks_deleted); } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); - if (is_rmb) { - if (buf_desc->mr_rx[SMC_SINGLE_LINK]) - smc_ib_put_memory_region( - buf_desc->mr_rx[SMC_SINGLE_LINK]); - smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, - DMA_FROM_DEVICE); - } else { - smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, - DMA_TO_DEVICE); - } - sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]); if (buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); kfree(buf_desc); @@ -503,6 +818,18 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) /* remove a link group */ static void smc_lgr_free(struct smc_link_group *lgr) { + int i; + + if (!lgr->is_smcd) { + mutex_lock(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_UNUSED) + smcr_link_clear(&lgr->lnk[i], false); + } + mutex_unlock(&lgr->llc_conf_mutex); + smc_llc_lgr_clear(lgr); + } + smc_lgr_free_bufs(lgr); if (lgr->is_smcd) { if (!lgr->terminating) { @@ -512,27 +839,12 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) wake_up(&lgr->smcd->lgrs_deleted); } else { - smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); - put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } kfree(lgr); } -void smc_lgr_forget(struct smc_link_group *lgr) -{ - struct list_head *lgr_list; - spinlock_t *lgr_lock; - - lgr_list = smc_lgr_list_head(lgr, &lgr_lock); - spin_lock_bh(lgr_lock); - /* do not use this link group for new connections */ - if (!list_empty(lgr_list)) - list_del_init(lgr_list); - spin_unlock_bh(lgr_lock); -} - static void smcd_unregister_all_dmbs(struct smc_link_group *lgr) { int i; @@ -587,10 +899,12 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); put_device(&lgr->smcd->dev); } else { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + u32 rsn = lgr->llc_termination_rsn; - if (lnk->state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(lnk); + if (!rsn) + rsn = SMC_LLC_DEL_PROG_INIT_TERM; + smc_llc_send_link_delete_all(lgr, false, rsn); + smcr_lgr_link_deactivate_all(lgr); } } @@ -606,11 +920,9 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) if (lgr->terminating) return; /* lgr already terminating */ - if (!soft) - cancel_delayed_work_sync(&lgr->free_work); + /* cancel free_work sync, will terminate when lgr->freeing is set */ + cancel_delayed_work_sync(&lgr->free_work); lgr->terminating = 1; - if (!lgr->is_smcd) - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); /* kill remaining link group connections */ read_lock_bh(&lgr->conns_lock); @@ -629,10 +941,7 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) } read_unlock_bh(&lgr->conns_lock); smc_lgr_cleanup(lgr); - if (soft) - smc_lgr_schedule_free_work_fast(lgr); - else - smc_lgr_free(lgr); + smc_lgr_free(lgr); } /* unlink link group and schedule termination */ @@ -647,33 +956,11 @@ void smc_lgr_terminate_sched(struct smc_link_group *lgr) return; /* lgr already terminating */ } list_del_init(&lgr->list); + lgr->freeing = 1; spin_unlock_bh(lgr_lock); schedule_work(&lgr->terminate_work); } -/* Called when IB port is terminated */ -void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) -{ - struct smc_link_group *lgr, *l; - LIST_HEAD(lgr_free_list); - - spin_lock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (!lgr->is_smcd && - lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && - lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) { - list_move(&lgr->list, &lgr_free_list); - lgr->freeing = 1; - } - } - spin_unlock_bh(&smc_lgr_list.lock); - - list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { - list_del_init(&lgr->list); - __smc_lgr_terminate(lgr, false); - } -} - /* Called when peer lgr shutdown (regularly or abnormally) is received */ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) { @@ -688,6 +975,7 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) if (peer_gid) /* peer triggered termination */ lgr->peer_shutdown = 1; list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; } } spin_unlock_bh(&dev->lgr_lock); @@ -728,6 +1016,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); + int i; spin_lock_bh(&smc_lgr_list.lock); if (!smcibdev) { @@ -736,9 +1025,9 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) lgr->freeing = 1; } else { list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { - if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev) { - list_move(&lgr->list, &lgr_free_list); - lgr->freeing = 1; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].smcibdev == smcibdev) + smcr_link_down_cond_sched(&lgr->lnk[i]); } } } @@ -746,6 +1035,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { list_del_init(&lgr->list); + smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_OP_INIT_TERM); __smc_lgr_terminate(lgr, false); } @@ -759,6 +1049,225 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) } } +/* set new lgr type and clear all asymmetric link tagging */ +void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type) +{ + char *lgr_type = ""; + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (smc_link_usable(&lgr->lnk[i])) + lgr->lnk[i].link_is_asym = false; + if (lgr->type == new_type) + return; + lgr->type = new_type; + + switch (lgr->type) { + case SMC_LGR_NONE: + lgr_type = "NONE"; + break; + case SMC_LGR_SINGLE: + lgr_type = "SINGLE"; + break; + case SMC_LGR_SYMMETRIC: + lgr_type = "SYMMETRIC"; + break; + case SMC_LGR_ASYMMETRIC_PEER: + lgr_type = "ASYMMETRIC_PEER"; + break; + case SMC_LGR_ASYMMETRIC_LOCAL: + lgr_type = "ASYMMETRIC_LOCAL"; + break; + } + pr_warn_ratelimited("smc: SMC-R lg %*phN state changed: " + "%s, pnetid %.16s\n", SMC_LGR_ID_SIZE, &lgr->id, + lgr_type, lgr->pnet_id); +} + +/* set new lgr type and tag a link as asymmetric */ +void smcr_lgr_set_type_asym(struct smc_link_group *lgr, + enum smc_lgr_type new_type, int asym_lnk_idx) +{ + smcr_lgr_set_type(lgr, new_type); + lgr->lnk[asym_lnk_idx].link_is_asym = true; +} + +/* abort connection, abort_work scheduled from tasklet context */ +static void smc_conn_abort_work(struct work_struct *work) +{ + struct smc_connection *conn = container_of(work, + struct smc_connection, + abort_work); + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + smc_conn_kill(conn, true); + sock_put(&smc->sk); /* sock_hold done by schedulers of abort_work */ +} + +/* link is up - establish alternate link if applicable */ +static void smcr_link_up(struct smc_link_group *lgr, + struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link *link = NULL; + + if (list_empty(&lgr->list) || + lgr->type == SMC_LGR_SYMMETRIC || + lgr->type == SMC_LGR_ASYMMETRIC_PEER) + return; + + if (lgr->role == SMC_SERV) { + /* trigger local add link processing */ + link = smc_llc_usable_link(lgr); + if (!link) + return; + smc_llc_srv_add_link_local(link); + } else { + /* invite server to start add link processing */ + u8 gid[SMC_GID_SIZE]; + + if (smc_ib_determine_gid(smcibdev, ibport, lgr->vlan_id, gid, + NULL)) + return; + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* some other llc task is ongoing */ + wait_event_interruptible_timeout(lgr->llc_waiter, + (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), + SMC_LLC_WAIT_TIME); + } + if (list_empty(&lgr->list) || + !smc_ib_port_active(smcibdev, ibport)) + return; /* lgr or device no longer active */ + link = smc_llc_usable_link(lgr); + if (!link) + return; + smc_llc_send_add_link(link, smcibdev->mac[ibport - 1], gid, + NULL, SMC_LLC_REQ); + } +} + +void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_ib_up_work *ib_work; + struct smc_link_group *lgr, *n; + + list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { + if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, + SMC_MAX_PNETID_LEN) || + lgr->type == SMC_LGR_SYMMETRIC || + lgr->type == SMC_LGR_ASYMMETRIC_PEER) + continue; + ib_work = kmalloc(sizeof(*ib_work), GFP_KERNEL); + if (!ib_work) + continue; + INIT_WORK(&ib_work->work, smc_link_up_work); + ib_work->lgr = lgr; + ib_work->smcibdev = smcibdev; + ib_work->ibport = ibport; + schedule_work(&ib_work->work); + } +} + +/* link is down - switch connections to alternate link, + * must be called under lgr->llc_conf_mutex lock + */ +static void smcr_link_down(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_link *to_lnk; + int del_link_id; + + if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list)) + return; + + smc_ib_modify_qp_reset(lnk); + to_lnk = smc_switch_conns(lgr, lnk, true); + if (!to_lnk) { /* no backup link available */ + smcr_link_clear(lnk, true); + return; + } + smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); + del_link_id = lnk->link_id; + + if (lgr->role == SMC_SERV) { + /* trigger local delete link processing */ + smc_llc_srv_delete_link_local(to_lnk, del_link_id); + } else { + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* another llc task is ongoing */ + mutex_unlock(&lgr->llc_conf_mutex); + wait_event_interruptible_timeout(lgr->llc_waiter, + (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), + SMC_LLC_WAIT_TIME); + mutex_lock(&lgr->llc_conf_mutex); + } + smc_llc_send_delete_link(to_lnk, del_link_id, SMC_LLC_REQ, true, + SMC_LLC_DEL_LOST_PATH); + } +} + +/* must be called under lgr->llc_conf_mutex lock */ +void smcr_link_down_cond(struct smc_link *lnk) +{ + if (smc_link_downing(&lnk->state)) + smcr_link_down(lnk); +} + +/* will get the lgr->llc_conf_mutex lock */ +void smcr_link_down_cond_sched(struct smc_link *lnk) +{ + if (smc_link_downing(&lnk->state)) + schedule_work(&lnk->link_down_wrk); +} + +void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr, *n; + int i; + + list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { + if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, + SMC_MAX_PNETID_LEN)) + continue; /* lgr is not affected */ + if (list_empty(&lgr->list)) + continue; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + + if (smc_link_usable(lnk) && + lnk->smcibdev == smcibdev && lnk->ibport == ibport) + smcr_link_down_cond_sched(lnk); + } + } +} + +static void smc_link_up_work(struct work_struct *work) +{ + struct smc_ib_up_work *ib_work = container_of(work, + struct smc_ib_up_work, + work); + struct smc_link_group *lgr = ib_work->lgr; + + if (list_empty(&lgr->list)) + goto out; + smcr_link_up(lgr, ib_work->smcibdev, ib_work->ibport); +out: + kfree(ib_work); +} + +static void smc_link_down_work(struct work_struct *work) +{ + struct smc_link *link = container_of(work, struct smc_link, + link_down_wrk); + struct smc_link_group *lgr = link->lgr; + + if (list_empty(&lgr->list)) + return; + wake_up_interruptible_all(&lgr->llc_waiter); + mutex_lock(&lgr->llc_conf_mutex); + smcr_link_down(link); + mutex_unlock(&lgr->llc_conf_mutex); +} + /* Determine vlan of internal TCP socket. * @vlan_id: address to store the determined vlan id into */ @@ -810,15 +1319,21 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, struct smc_clc_msg_local *lcl, enum smc_lgr_role role, u32 clcqpn) { - return !memcmp(lgr->peer_systemid, lcl->id_for_peer, - SMC_SYSTEMID_LEN) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, - SMC_GID_SIZE) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, - sizeof(lcl->mac)) && - lgr->role == role && - (lgr->role == SMC_SERV || - lgr->lnk[SMC_SINGLE_LINK].peer_qpn == clcqpn); + int i; + + if (memcmp(lgr->peer_systemid, lcl->id_for_peer, SMC_SYSTEMID_LEN) || + lgr->role != role) + return false; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_ACTIVE) + continue; + if ((lgr->role == SMC_SERV || lgr->lnk[i].peer_qpn == clcqpn) && + !memcmp(lgr->lnk[i].peer_gid, &lcl->gid, SMC_GID_SIZE) && + !memcmp(lgr->lnk[i].peer_mac, lcl->mac, sizeof(lcl->mac))) + return true; + } + return false; } static bool smcd_lgr_match(struct smc_link_group *lgr, @@ -859,15 +1374,17 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) /* link group found */ ini->cln_first_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; - smc_lgr_register_conn(conn); /* add smc conn to lgr */ - if (delayed_work_pending(&lgr->free_work)) - cancel_delayed_work(&lgr->free_work); + rc = smc_lgr_register_conn(conn, false); write_unlock_bh(&lgr->conns_lock); + if (!rc && delayed_work_pending(&lgr->free_work)) + cancel_delayed_work(&lgr->free_work); break; } write_unlock_bh(&lgr->conns_lock); } spin_unlock_bh(lgr_lock); + if (rc) + return rc; if (role == SMC_CLNT && !ini->srv_first_contact && ini->cln_first_contact == SMC_FIRST_CONTACT) { @@ -885,12 +1402,15 @@ create: goto out; lgr = conn->lgr; write_lock_bh(&lgr->conns_lock); - smc_lgr_register_conn(conn); /* add smc conn to lgr */ + rc = smc_lgr_register_conn(conn, true); write_unlock_bh(&lgr->conns_lock); + if (rc) + goto out; } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; + INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); smcd_cdc_rx_init(conn); /* init tasklet for this conn */ @@ -934,19 +1454,19 @@ int smc_uncompress_bufsize(u8 compressed) * buffer size; if not available, return NULL */ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, - rwlock_t *lock, + struct mutex *lock, struct list_head *buf_list) { struct smc_buf_desc *buf_slot; - read_lock_bh(lock); + mutex_lock(lock); list_for_each_entry(buf_slot, buf_list, list) { if (cmpxchg(&buf_slot->used, 0, 1) == 0) { - read_unlock_bh(lock); + mutex_unlock(lock); return buf_slot; } } - read_unlock_bh(lock); + mutex_unlock(lock); return NULL; } @@ -959,12 +1479,135 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size) return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } +/* map an rmb buf to a link */ +static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, + struct smc_link *lnk) +{ + int rc; + + if (buf_desc->is_map_ib[lnk->link_idx]) + return 0; + + rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL); + if (rc) + return rc; + sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, + buf_desc->cpu_addr, buf_desc->len); + + /* map sg table to DMA address */ + rc = smc_ib_buf_map_sg(lnk, buf_desc, + is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + /* SMC protocol depends on mapping to one DMA address only */ + if (rc != 1) { + rc = -EAGAIN; + goto free_table; + } + + /* create a new memory region for the RMB */ + if (is_rmb) { + rc = smc_ib_get_memory_region(lnk->roce_pd, + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_LOCAL_WRITE, + buf_desc, lnk->link_idx); + if (rc) + goto buf_unmap; + smc_ib_sync_sg_for_device(lnk, buf_desc, DMA_FROM_DEVICE); + } + buf_desc->is_map_ib[lnk->link_idx] = true; + return 0; + +buf_unmap: + smc_ib_buf_unmap_sg(lnk, buf_desc, + is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); +free_table: + sg_free_table(&buf_desc->sgt[lnk->link_idx]); + return rc; +} + +/* register a new rmb on IB device, + * must be called under lgr->llc_conf_mutex lock + */ +int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc) +{ + if (list_empty(&link->lgr->list)) + return -ENOLINK; + if (!rmb_desc->is_reg_mr[link->link_idx]) { + /* register memory region for new rmb */ + if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) { + rmb_desc->is_reg_err = true; + return -EFAULT; + } + rmb_desc->is_reg_mr[link->link_idx] = true; + } + return 0; +} + +static int _smcr_buf_map_lgr(struct smc_link *lnk, struct mutex *lock, + struct list_head *lst, bool is_rmb) +{ + struct smc_buf_desc *buf_desc, *bf; + int rc = 0; + + mutex_lock(lock); + list_for_each_entry_safe(buf_desc, bf, lst, list) { + if (!buf_desc->used) + continue; + rc = smcr_buf_map_link(buf_desc, is_rmb, lnk); + if (rc) + goto out; + } +out: + mutex_unlock(lock); + return rc; +} + +/* map all used buffers of lgr for a new link */ +int smcr_buf_map_lgr(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + int i, rc = 0; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + rc = _smcr_buf_map_lgr(lnk, &lgr->rmbs_lock, + &lgr->rmbs[i], true); + if (rc) + return rc; + rc = _smcr_buf_map_lgr(lnk, &lgr->sndbufs_lock, + &lgr->sndbufs[i], false); + if (rc) + return rc; + } + return 0; +} + +/* register all used buffers of lgr for a new link, + * must be called under lgr->llc_conf_mutex lock + */ +int smcr_buf_reg_lgr(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_buf_desc *buf_desc, *bf; + int i, rc = 0; + + mutex_lock(&lgr->rmbs_lock); + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) { + if (!buf_desc->used) + continue; + rc = smcr_link_reg_rmb(lnk, buf_desc); + if (rc) + goto out; + } + } +out: + mutex_unlock(&lgr->rmbs_lock); + return rc; +} + static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, bool is_rmb, int bufsize) { struct smc_buf_desc *buf_desc; - struct smc_link *lnk; - int rc; /* try to alloc a new buffer */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); @@ -981,41 +1624,33 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, return ERR_PTR(-EAGAIN); } buf_desc->cpu_addr = (void *)page_address(buf_desc->pages); + buf_desc->len = bufsize; + return buf_desc; +} - /* build the sg table from the pages */ - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1, - GFP_KERNEL); - if (rc) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(rc); - } - sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl, - buf_desc->cpu_addr, bufsize); +/* map buf_desc on all usable links, + * unused buffers stay mapped as long as the link is up + */ +static int smcr_buf_map_usable_links(struct smc_link_group *lgr, + struct smc_buf_desc *buf_desc, bool is_rmb) +{ + int i, rc = 0; - /* map sg table to DMA address */ - rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc, - is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); - /* SMC protocol depends on mapping to one DMA address only */ - if (rc != 1) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(-EAGAIN); - } + /* protect against parallel link reconfiguration */ + mutex_lock(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; - /* create a new memory region for the RMB */ - if (is_rmb) { - rc = smc_ib_get_memory_region(lnk->roce_pd, - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE, - buf_desc); - if (rc) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(rc); + if (!smc_link_usable(lnk)) + continue; + if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) { + rc = -ENOMEM; + goto out; } } - - buf_desc->len = bufsize; - return buf_desc; +out: + mutex_unlock(&lgr->llc_conf_mutex); + return rc; } #define SMCD_DMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ @@ -1062,8 +1697,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; int bufsize, bufsize_short; + struct mutex *lock; /* lock buffer list */ int sk_buf_size; - rwlock_t *lock; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ @@ -1104,15 +1739,22 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) continue; buf_desc->used = 1; - write_lock_bh(lock); + mutex_lock(lock); list_add(&buf_desc->list, buf_list); - write_unlock_bh(lock); + mutex_unlock(lock); break; /* found */ } if (IS_ERR(buf_desc)) return -ENOMEM; + if (!is_smcd) { + if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { + smcr_buf_unuse(buf_desc, lgr); + return -ENOMEM; + } + } + if (is_rmb) { conn->rmb_desc = buf_desc; conn->rmbe_size_short = bufsize_short; @@ -1132,42 +1774,44 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; - - if (!conn->lgr || conn->lgr->is_smcd) + if (!conn->lgr || conn->lgr->is_smcd || !smc_link_usable(conn->lnk)) return; - smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->sndbuf_desc, DMA_TO_DEVICE); + smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; - - if (!conn->lgr || conn->lgr->is_smcd) + if (!conn->lgr || conn->lgr->is_smcd || !smc_link_usable(conn->lnk)) return; - smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->sndbuf_desc, DMA_TO_DEVICE); + smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; + int i; if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->rmb_desc, DMA_FROM_DEVICE); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&conn->lgr->lnk[i])) + continue; + smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc, + DMA_FROM_DEVICE); + } } void smc_rmb_sync_sg_for_device(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; + int i; if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->rmb_desc, DMA_FROM_DEVICE); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&conn->lgr->lnk[i])) + continue; + smc_ib_sync_sg_for_device(&conn->lgr->lnk[i], conn->rmb_desc, + DMA_FROM_DEVICE); + } } /* create the send and receive buffer for an SMC socket; @@ -1202,16 +1846,64 @@ static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) return -ENOSPC; } +static int smc_rtoken_find_by_link(struct smc_link_group *lgr, int lnk_idx, + u32 rkey) +{ + int i; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + if (test_bit(i, lgr->rtokens_used_mask) && + lgr->rtokens[i][lnk_idx].rkey == rkey) + return i; + } + return -ENOENT; +} + +/* set rtoken for a new link to an existing rmb */ +void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, + __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey) +{ + int rtok_idx; + + rtok_idx = smc_rtoken_find_by_link(lgr, link_idx, ntohl(nw_rkey_known)); + if (rtok_idx == -ENOENT) + return; + lgr->rtokens[rtok_idx][link_idx_new].rkey = ntohl(nw_rkey); + lgr->rtokens[rtok_idx][link_idx_new].dma_addr = be64_to_cpu(nw_vaddr); +} + +/* set rtoken for a new link whose link_id is given */ +void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, + __be64 nw_vaddr, __be32 nw_rkey) +{ + u64 dma_addr = be64_to_cpu(nw_vaddr); + u32 rkey = ntohl(nw_rkey); + bool found = false; + int link_idx; + + for (link_idx = 0; link_idx < SMC_LINKS_PER_LGR_MAX; link_idx++) { + if (lgr->lnk[link_idx].link_id == link_id) { + found = true; + break; + } + } + if (!found) + return; + lgr->rtokens[rtok_idx][link_idx].rkey = rkey; + lgr->rtokens[rtok_idx][link_idx].dma_addr = dma_addr; +} + /* add a new rtoken from peer */ -int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey) +int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) { + struct smc_link_group *lgr = smc_get_lgr(lnk); u64 dma_addr = be64_to_cpu(nw_vaddr); u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { - if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) && - (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) && + if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && + lgr->rtokens[i][lnk->link_idx].dma_addr == dma_addr && test_bit(i, lgr->rtokens_used_mask)) { /* already in list */ return i; @@ -1220,23 +1912,25 @@ int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey) i = smc_rmb_reserve_rtoken_idx(lgr); if (i < 0) return i; - lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey; - lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr; + lgr->rtokens[i][lnk->link_idx].rkey = rkey; + lgr->rtokens[i][lnk->link_idx].dma_addr = dma_addr; return i; } -/* delete an rtoken */ -int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey) +/* delete an rtoken from all links */ +int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) { + struct smc_link_group *lgr = smc_get_lgr(lnk); u32 rkey = ntohl(nw_rkey); - int i; + int i, j; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { - if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey && + if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && test_bit(i, lgr->rtokens_used_mask)) { - lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0; - lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0; - + for (j = 0; j < SMC_LINKS_PER_LGR_MAX; j++) { + lgr->rtokens[i][j].rkey = 0; + lgr->rtokens[i][j].dma_addr = 0; + } clear_bit(i, lgr->rtokens_used_mask); return 0; } @@ -1246,9 +1940,10 @@ int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey) /* save rkey and dma_addr received from peer during clc handshake */ int smc_rmb_rtoken_handling(struct smc_connection *conn, + struct smc_link *lnk, struct smc_clc_msg_accept_confirm *clc) { - conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr, + conn->rtoken_idx = smc_rtoken_add(lnk, clc->rmb_dma_addr, clc->rmb_rkey); if (conn->rtoken_idx < 0) return conn->rtoken_idx; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 8041db20c753..86d160f0d187 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -32,10 +32,10 @@ enum smc_lgr_role { /* possible roles of a link group */ }; enum smc_link_state { /* possible states of a link */ + SMC_LNK_UNUSED, /* link is unused */ SMC_LNK_INACTIVE, /* link is inactive */ SMC_LNK_ACTIVATING, /* link is being activated */ SMC_LNK_ACTIVE, /* link is active */ - SMC_LNK_DELETING, /* link is being deleted */ }; #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ @@ -70,6 +70,8 @@ struct smc_rdma_wr { /* work requests per message struct ib_rdma_wr wr_tx_rdma[SMC_MAX_RDMA_WRITES]; }; +#define SMC_LGR_ID_SIZE 4 + struct smc_link { struct smc_ib_device *smcibdev; /* ib-device */ u8 ibport; /* port - values 1 | 2 */ @@ -85,6 +87,7 @@ struct smc_link { struct smc_rdma_sges *wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/ struct smc_rdma_wr *wr_tx_rdmas; /* WR RDMA WRITE */ struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */ + struct completion *wr_tx_compl; /* WR send CQE completion */ /* above four vectors have wr_tx_cnt elements and use the same index */ dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */ atomic_long_t wr_tx_id; /* seq # of last sent WR */ @@ -115,29 +118,23 @@ struct smc_link { u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ + u8 link_uid[SMC_LGR_ID_SIZE]; /* unique lnk id */ + u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */ + u8 link_idx; /* index in lgr link array */ + u8 link_is_asym; /* is link asymmetric? */ + struct smc_link_group *lgr; /* parent link group */ + struct work_struct link_down_wrk; /* wrk to bring link down */ enum smc_link_state state; /* state of link */ - struct workqueue_struct *llc_wq; /* single thread work queue */ - struct completion llc_confirm; /* wait for rx of conf link */ - struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */ - int llc_confirm_rc; /* rc from confirm link msg */ - int llc_confirm_resp_rc; /* rc from conf_resp msg */ - struct completion llc_add; /* wait for rx of add link */ - struct completion llc_add_resp; /* wait for rx of add link rsp*/ struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ - struct completion llc_confirm_rkey; /* wait 4 rx of cnf rkey */ - int llc_confirm_rkey_rc; /* rc from cnf rkey msg */ - struct completion llc_delete_rkey; /* wait 4 rx of del rkey */ - int llc_delete_rkey_rc; /* rc from del rkey msg */ - struct mutex llc_delete_rkey_mutex; /* serialize usage */ }; /* For now we just allow one parallel link per link group. The SMC protocol * allows more (up to 8). */ -#define SMC_LINKS_PER_LGR_MAX 1 +#define SMC_LINKS_PER_LGR_MAX 3 #define SMC_SINGLE_LINK 0 #define SMC_FIRST_CONTACT 1 /* first contact to a peer */ @@ -150,25 +147,32 @@ struct smc_buf_desc { struct page *pages; int len; /* length of buffer */ u32 used; /* currently used / unused */ - u8 wr_reg : 1; /* mem region registered */ - u8 regerr : 1; /* err during registration */ union { struct { /* SMC-R */ - struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; - /* virtual buffer */ - struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; - /* for rmb only: memory region - * incl. rkey provided to peer - */ - u32 order; /* allocation order */ + struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; + /* virtual buffer */ + struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; + /* for rmb only: memory region + * incl. rkey provided to peer + */ + u32 order; /* allocation order */ + + u8 is_conf_rkey; + /* confirm_rkey done */ + u8 is_reg_mr[SMC_LINKS_PER_LGR_MAX]; + /* mem region registered */ + u8 is_map_ib[SMC_LINKS_PER_LGR_MAX]; + /* mem region mapped to lnk */ + u8 is_reg_err; + /* buffer registration err */ }; struct { /* SMC-D */ - unsigned short sba_idx; - /* SBA index number */ - u64 token; - /* DMB token number */ - dma_addr_t dma_addr; - /* DMA address */ + unsigned short sba_idx; + /* SBA index number */ + u64 token; + /* DMB token number */ + dma_addr_t dma_addr; + /* DMA address */ }; }; }; @@ -178,7 +182,6 @@ struct smc_rtoken { /* address/key of remote RMB */ u32 rkey; }; -#define SMC_LGR_ID_SIZE 4 #define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ #define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */ /* theoretically, the RFC states that largest size would be 512K, @@ -188,6 +191,28 @@ struct smc_rtoken { /* address/key of remote RMB */ struct smcd_dev; +enum smc_lgr_type { /* redundancy state of lgr */ + SMC_LGR_NONE, /* no active links, lgr to be deleted */ + SMC_LGR_SINGLE, /* 1 active RNIC on each peer */ + SMC_LGR_SYMMETRIC, /* 2 active RNICs on each peer */ + SMC_LGR_ASYMMETRIC_PEER, /* local has 2, peer 1 active RNICs */ + SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */ +}; + +enum smc_llc_flowtype { + SMC_LLC_FLOW_NONE = 0, + SMC_LLC_FLOW_ADD_LINK = 2, + SMC_LLC_FLOW_DEL_LINK = 4, + SMC_LLC_FLOW_RKEY = 6, +}; + +struct smc_llc_qentry; + +struct smc_llc_flow { + enum smc_llc_flowtype type; + struct smc_llc_qentry *qentry; +}; + struct smc_link_group { struct list_head list; struct rb_root conns_all; /* connection tree */ @@ -196,9 +221,9 @@ struct smc_link_group { unsigned short vlan_id; /* vlan id of link group */ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ - rwlock_t sndbufs_lock; /* protects tx buffers */ + struct mutex sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ - rwlock_t rmbs_lock; /* protects rx buffers */ + struct mutex rmbs_lock; /* protects rx buffers */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ @@ -222,6 +247,35 @@ struct smc_link_group { /* remote addr/key pairs */ DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); /* used rtoken elements */ + u8 next_link_id; + enum smc_lgr_type type; + /* redundancy state */ + u8 pnet_id[SMC_MAX_PNETID_LEN + 1]; + /* pnet id of this lgr */ + struct list_head llc_event_q; + /* queue for llc events */ + spinlock_t llc_event_q_lock; + /* protects llc_event_q */ + struct mutex llc_conf_mutex; + /* protects lgr reconfig. */ + struct work_struct llc_add_link_work; + struct work_struct llc_del_link_work; + struct work_struct llc_event_work; + /* llc event worker */ + wait_queue_head_t llc_waiter; + /* w4 next llc event */ + struct smc_llc_flow llc_flow_lcl; + /* llc local control field */ + struct smc_llc_flow llc_flow_rmt; + /* llc remote control field */ + struct smc_llc_qentry *delayed_event; + /* arrived when flow active */ + spinlock_t llc_flow_lock; + /* protects llc flow */ + int llc_testlink_time; + /* link keep alive time */ + u32 llc_termination_rsn; + /* rsn code for termination */ }; struct { /* SMC-D */ u64 peer_gid; @@ -285,24 +339,36 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } +/* returns true if the specified link is usable */ +static inline bool smc_link_usable(struct smc_link *lnk) +{ + if (lnk->state == SMC_LNK_UNUSED || lnk->state == SMC_LNK_INACTIVE) + return false; + return true; +} + struct smc_sock; struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; -void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_cleanup_early(struct smc_connection *conn); void smc_lgr_terminate_sched(struct smc_link_group *lgr); -void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); +void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); +void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan); void smc_smcd_terminate_all(struct smcd_dev *dev); void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); -int smc_rmb_rtoken_handling(struct smc_connection *conn, +int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, struct smc_clc_msg_accept_confirm *clc); -int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey); -int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey); +int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey); +int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey); +void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, + __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey); +void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, + __be64 nw_vaddr, __be32 nw_rkey); void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); @@ -315,8 +381,22 @@ void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); int smc_core_init(void); void smc_core_exit(void); +int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, + u8 link_idx, struct smc_init_info *ini); +void smcr_link_clear(struct smc_link *lnk, bool log); +int smcr_buf_map_lgr(struct smc_link *lnk); +int smcr_buf_reg_lgr(struct smc_link *lnk); +void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); +void smcr_lgr_set_type_asym(struct smc_link_group *lgr, + enum smc_lgr_type new_type, int asym_lnk_idx); +int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); +struct smc_link *smc_switch_conns(struct smc_link_group *lgr, + struct smc_link *from_lnk, bool is_dev_err); +void smcr_link_down_cond(struct smc_link *lnk); +void smcr_link_down_cond_sched(struct smc_link *lnk); + static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { - return container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + return link->lgr; } #endif diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 04b6fefb8bce..f0a5064bf9bd 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -249,9 +249,10 @@ static void smc_ib_port_event_work(struct work_struct *work) clear_bit(port_idx, &smcibdev->port_event_mask); if (!smc_ib_port_active(smcibdev, port_idx + 1)) { set_bit(port_idx, smcibdev->ports_going_away); - smc_port_terminate(smcibdev, port_idx + 1); + smcr_port_err(smcibdev, port_idx + 1); } else { clear_bit(port_idx, smcibdev->ports_going_away); + smcr_port_add(smcibdev, port_idx + 1); } } } @@ -389,15 +390,15 @@ void smc_ib_put_memory_region(struct ib_mr *mr) ib_dereg_mr(mr); } -static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot) +static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) { unsigned int offset = 0; int sg_num; /* map the largest prefix of a dma mapped SG list */ - sg_num = ib_map_mr_sg(buf_slot->mr_rx[SMC_SINGLE_LINK], - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + sg_num = ib_map_mr_sg(buf_slot->mr_rx[link_idx], + buf_slot->sgt[link_idx].sgl, + buf_slot->sgt[link_idx].orig_nents, &offset, PAGE_SIZE); return sg_num; @@ -405,29 +406,29 @@ static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot) /* Allocate a memory region and map the dma mapped SG list of buf_slot */ int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct smc_buf_desc *buf_slot) + struct smc_buf_desc *buf_slot, u8 link_idx) { - if (buf_slot->mr_rx[SMC_SINGLE_LINK]) + if (buf_slot->mr_rx[link_idx]) return 0; /* already done */ - buf_slot->mr_rx[SMC_SINGLE_LINK] = + buf_slot->mr_rx[link_idx] = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order); - if (IS_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK])) { + if (IS_ERR(buf_slot->mr_rx[link_idx])) { int rc; - rc = PTR_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK]); - buf_slot->mr_rx[SMC_SINGLE_LINK] = NULL; + rc = PTR_ERR(buf_slot->mr_rx[link_idx]); + buf_slot->mr_rx[link_idx] = NULL; return rc; } - if (smc_ib_map_mr_sg(buf_slot) != 1) + if (smc_ib_map_mr_sg(buf_slot, link_idx) != 1) return -EINVAL; return 0; } /* synchronize buffer usage for cpu access */ -void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { @@ -435,11 +436,11 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, unsigned int i; /* for now there is just one DMA address */ - for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg, - buf_slot->sgt[SMC_SINGLE_LINK].nents, i) { + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { if (!sg_dma_len(sg)) break; - ib_dma_sync_single_for_cpu(smcibdev->ibdev, + ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev, sg_dma_address(sg), sg_dma_len(sg), data_direction); @@ -447,7 +448,7 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, } /* synchronize buffer usage for device access */ -void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_device(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { @@ -455,11 +456,11 @@ void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, unsigned int i; /* for now there is just one DMA address */ - for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg, - buf_slot->sgt[SMC_SINGLE_LINK].nents, i) { + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { if (!sg_dma_len(sg)) break; - ib_dma_sync_single_for_device(smcibdev->ibdev, + ib_dma_sync_single_for_device(lnk->smcibdev->ibdev, sg_dma_address(sg), sg_dma_len(sg), data_direction); @@ -467,15 +468,15 @@ void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, } /* Map a new TX or RX buffer SG-table to DMA */ -int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, +int smc_ib_buf_map_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { int mapped_nents; - mapped_nents = ib_dma_map_sg(smcibdev->ibdev, - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev, + buf_slot->sgt[lnk->link_idx].sgl, + buf_slot->sgt[lnk->link_idx].orig_nents, data_direction); if (!mapped_nents) return -ENOMEM; @@ -483,18 +484,18 @@ int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, return mapped_nents; } -void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, +void smc_ib_buf_unmap_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { - if (!buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address) + if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address) return; /* already unmapped */ - ib_dma_unmap_sg(smcibdev->ibdev, - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + ib_dma_unmap_sg(lnk->smcibdev->ibdev, + buf_slot->sgt[lnk->link_idx].sgl, + buf_slot->sgt[lnk->link_idx].orig_nents, data_direction); - buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0; + buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; } long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) @@ -574,13 +575,23 @@ static void smc_ib_add_dev(struct ib_device *ibdev) /* trigger reading of the port attributes */ port_cnt = smcibdev->ibdev->phys_port_cnt; + pr_warn_ratelimited("smc: adding ib device %s with port count %d\n", + smcibdev->ibdev->name, port_cnt); for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) { set_bit(i, &smcibdev->port_event_mask); /* determine pnetids of the port */ - smc_pnetid_by_dev_port(ibdev->dev.parent, i, - smcibdev->pnetid[i]); + if (smc_pnetid_by_dev_port(ibdev->dev.parent, i, + smcibdev->pnetid[i])) + smc_pnetid_by_table_ib(smcibdev, i + 1); + pr_warn_ratelimited("smc: ib device %s port %d has pnetid " + "%.16s%s\n", + smcibdev->ibdev->name, i + 1, + smcibdev->pnetid[i], + smcibdev->pnetid_by_user[i] ? + " (user defined)" : + ""); } schedule_work(&smcibdev->port_event_work); } @@ -597,6 +608,8 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) spin_lock(&smc_ib_devices.lock); list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ spin_unlock(&smc_ib_devices.lock); + pr_warn_ratelimited("smc: removing ib device %s\n", + smcibdev->ibdev->name); smc_smcr_terminate_all(smcibdev); smc_ib_cleanup_per_ibdev(smcibdev); ib_unregister_event_handler(&smcibdev->event_handler); diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 5c2b115d36da..e6a696ae15f3 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -59,10 +59,10 @@ struct smc_link; int smc_ib_register_client(void) __init; void smc_ib_unregister_client(void); bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); -int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, +int smc_ib_buf_map_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); -void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, +void smc_ib_buf_unmap_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); void smc_ib_dealloc_protection_domain(struct smc_link *lnk); @@ -74,12 +74,12 @@ int smc_ib_modify_qp_rts(struct smc_link *lnk); int smc_ib_modify_qp_reset(struct smc_link *lnk); long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct smc_buf_desc *buf_slot); + struct smc_buf_desc *buf_slot, u8 link_idx); void smc_ib_put_memory_region(struct ib_mr *mr); -void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); -void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_device(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 5c4727d5066e..91f85fc09fb8 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -296,7 +296,8 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, device_initialize(&smcd->dev); dev_set_name(&smcd->dev, name); smcd->ops = ops; - smc_pnetid_by_dev_port(parent, 0, smcd->pnetid); + if (smc_pnetid_by_dev_port(parent, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); @@ -320,12 +321,18 @@ int smcd_register_dev(struct smcd_dev *smcd) list_add_tail(&smcd->list, &smcd_dev_list.list); spin_unlock(&smcd_dev_list.lock); + pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", + dev_name(&smcd->dev), smcd->pnetid, + smcd->pnetid_by_user ? " (user defined)" : ""); + return device_add(&smcd->dev); } EXPORT_SYMBOL_GPL(smcd_register_dev); void smcd_unregister_dev(struct smcd_dev *smcd) { + pr_warn_ratelimited("smc: removing smcd device %s\n", + dev_name(&smcd->dev)); spin_lock(&smcd_dev_list.lock); list_del_init(&smcd->list); spin_unlock(&smcd_dev_list.lock); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 0e52aab53d97..391237b601fe 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -17,6 +17,7 @@ #include "smc_core.h" #include "smc_clc.h" #include "smc_llc.h" +#include "smc_pnet.h" #define SMC_LLC_DATA_LEN 40 @@ -58,11 +59,34 @@ struct smc_llc_msg_add_link { /* type 0x02 */ u8 sender_gid[SMC_GID_SIZE]; u8 sender_qp_num[3]; u8 link_num; - u8 flags2; /* QP mtu */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 reserved3 : 4, + qp_mtu : 4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 qp_mtu : 4, + reserved3 : 4; +#endif u8 initial_psn[3]; u8 reserved[8]; }; +struct smc_llc_msg_add_link_cont_rt { + __be32 rmb_key; + __be32 rmb_key_new; + __be64 rmb_vaddr_new; +}; + +#define SMC_LLC_RKEYS_PER_CONT_MSG 2 + +struct smc_llc_msg_add_link_cont { /* type 0x03 */ + struct smc_llc_hdr hd; + u8 link_num; + u8 num_rkeys; + u8 reserved2[2]; + struct smc_llc_msg_add_link_cont_rt rt[SMC_LLC_RKEYS_PER_CONT_MSG]; + u8 reserved[4]; +} __packed; /* format defined in RFC7609 */ + #define SMC_LLC_FLAG_DEL_LINK_ALL 0x40 #define SMC_LLC_FLAG_DEL_LINK_ORDERLY 0x20 @@ -98,13 +122,8 @@ struct smc_llc_msg_confirm_rkey { /* type 0x06 */ u8 reserved; }; -struct smc_llc_msg_confirm_rkey_cont { /* type 0x08 */ - struct smc_llc_hdr hd; - u8 num_rkeys; - struct smc_rmb_rtoken rtoken[SMC_LLC_RKEYS_PER_MSG]; -}; - #define SMC_LLC_DEL_RKEY_MAX 8 +#define SMC_LLC_FLAG_RKEY_RETRY 0x10 #define SMC_LLC_FLAG_RKEY_NEG 0x20 struct smc_llc_msg_delete_rkey { /* type 0x09 */ @@ -119,10 +138,10 @@ struct smc_llc_msg_delete_rkey { /* type 0x09 */ union smc_llc_msg { struct smc_llc_msg_confirm_link confirm_link; struct smc_llc_msg_add_link add_link; + struct smc_llc_msg_add_link_cont add_link_cont; struct smc_llc_msg_del_link delete_link; struct smc_llc_msg_confirm_rkey confirm_rkey; - struct smc_llc_msg_confirm_rkey_cont confirm_rkey_cont; struct smc_llc_msg_delete_rkey delete_rkey; struct smc_llc_msg_test_link test_link; @@ -134,6 +153,162 @@ union smc_llc_msg { #define SMC_LLC_FLAG_RESP 0x80 +struct smc_llc_qentry { + struct list_head list; + struct smc_link *link; + union smc_llc_msg msg; +}; + +static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc); + +struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow) +{ + struct smc_llc_qentry *qentry = flow->qentry; + + flow->qentry = NULL; + return qentry; +} + +void smc_llc_flow_qentry_del(struct smc_llc_flow *flow) +{ + struct smc_llc_qentry *qentry; + + if (flow->qentry) { + qentry = flow->qentry; + flow->qentry = NULL; + kfree(qentry); + } +} + +static inline void smc_llc_flow_qentry_set(struct smc_llc_flow *flow, + struct smc_llc_qentry *qentry) +{ + flow->qentry = qentry; +} + +/* try to start a new llc flow, initiated by an incoming llc msg */ +static bool smc_llc_flow_start(struct smc_llc_flow *flow, + struct smc_llc_qentry *qentry) +{ + struct smc_link_group *lgr = qentry->link->lgr; + + spin_lock_bh(&lgr->llc_flow_lock); + if (flow->type) { + /* a flow is already active */ + if ((qentry->msg.raw.hdr.common.type == SMC_LLC_ADD_LINK || + qentry->msg.raw.hdr.common.type == SMC_LLC_DELETE_LINK) && + !lgr->delayed_event) { + lgr->delayed_event = qentry; + } else { + /* forget this llc request */ + kfree(qentry); + } + spin_unlock_bh(&lgr->llc_flow_lock); + return false; + } + switch (qentry->msg.raw.hdr.common.type) { + case SMC_LLC_ADD_LINK: + flow->type = SMC_LLC_FLOW_ADD_LINK; + break; + case SMC_LLC_DELETE_LINK: + flow->type = SMC_LLC_FLOW_DEL_LINK; + break; + case SMC_LLC_CONFIRM_RKEY: + case SMC_LLC_DELETE_RKEY: + flow->type = SMC_LLC_FLOW_RKEY; + break; + default: + flow->type = SMC_LLC_FLOW_NONE; + } + if (qentry == lgr->delayed_event) + lgr->delayed_event = NULL; + spin_unlock_bh(&lgr->llc_flow_lock); + smc_llc_flow_qentry_set(flow, qentry); + return true; +} + +/* start a new local llc flow, wait till current flow finished */ +int smc_llc_flow_initiate(struct smc_link_group *lgr, + enum smc_llc_flowtype type) +{ + enum smc_llc_flowtype allowed_remote = SMC_LLC_FLOW_NONE; + int rc; + + /* all flows except confirm_rkey and delete_rkey are exclusive, + * confirm/delete rkey flows can run concurrently (local and remote) + */ + if (type == SMC_LLC_FLOW_RKEY) + allowed_remote = SMC_LLC_FLOW_RKEY; +again: + if (list_empty(&lgr->list)) + return -ENODEV; + spin_lock_bh(&lgr->llc_flow_lock); + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE && + (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE || + lgr->llc_flow_rmt.type == allowed_remote)) { + lgr->llc_flow_lcl.type = type; + spin_unlock_bh(&lgr->llc_flow_lock); + return 0; + } + spin_unlock_bh(&lgr->llc_flow_lock); + rc = wait_event_interruptible_timeout(lgr->llc_waiter, + (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE && + (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE || + lgr->llc_flow_rmt.type == allowed_remote)), + SMC_LLC_WAIT_TIME); + if (!rc) + return -ETIMEDOUT; + goto again; +} + +/* finish the current llc flow */ +void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow) +{ + spin_lock_bh(&lgr->llc_flow_lock); + memset(flow, 0, sizeof(*flow)); + flow->type = SMC_LLC_FLOW_NONE; + spin_unlock_bh(&lgr->llc_flow_lock); + if (!list_empty(&lgr->list) && lgr->delayed_event && + flow == &lgr->llc_flow_lcl) + schedule_work(&lgr->llc_event_work); + else + wake_up_interruptible(&lgr->llc_waiter); +} + +/* lnk is optional and used for early wakeup when link goes down, useful in + * cases where we wait for a response on the link after we sent a request + */ +struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, + struct smc_link *lnk, + int time_out, u8 exp_msg) +{ + struct smc_llc_flow *flow = &lgr->llc_flow_lcl; + + wait_event_interruptible_timeout(lgr->llc_waiter, + (flow->qentry || + (lnk && !smc_link_usable(lnk)) || + list_empty(&lgr->list)), + time_out); + if (!flow->qentry || + (lnk && !smc_link_usable(lnk)) || list_empty(&lgr->list)) { + smc_llc_flow_qentry_del(flow); + goto out; + } + if (exp_msg && flow->qentry->msg.raw.hdr.common.type != exp_msg) { + if (exp_msg == SMC_LLC_ADD_LINK && + flow->qentry->msg.raw.hdr.common.type == + SMC_LLC_DELETE_LINK) { + /* flow_start will delay the unexpected msg */ + smc_llc_flow_start(&lgr->llc_flow_lcl, + smc_llc_flow_qentry_clr(flow)); + return NULL; + } + smc_llc_flow_qentry_del(flow); + } +out: + return flow->qentry; +} + /********************************** send *************************************/ struct smc_llc_tx_pend { @@ -186,7 +361,6 @@ static int smc_llc_add_pending_send(struct smc_link *link, int smc_llc_send_confirm_link(struct smc_link *link, enum smc_llc_reqresp reqresp) { - struct smc_link_group *lgr = smc_get_lgr(link); struct smc_llc_msg_confirm_link *confllc; struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; @@ -207,35 +381,52 @@ int smc_llc_send_confirm_link(struct smc_link *link, memcpy(confllc->sender_gid, link->gid, SMC_GID_SIZE); hton24(confllc->sender_qp_num, link->roce_qp->qp_num); confllc->link_num = link->link_id; - memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE); - confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* enforce peer resp. */ + memcpy(confllc->link_uid, link->link_uid, SMC_LGR_ID_SIZE); + confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; } /* send LLC confirm rkey request */ -static int smc_llc_send_confirm_rkey(struct smc_link *link, +static int smc_llc_send_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc) { struct smc_llc_msg_confirm_rkey *rkeyllc; struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; - int rc; + struct smc_link *link; + int i, rc, rtok_ix; - rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + rc = smc_llc_add_pending_send(send_link, &wr_buf, &pend); if (rc) return rc; rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf; memset(rkeyllc, 0, sizeof(*rkeyllc)); rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY; rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey); + + rtok_ix = 1; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + link = &send_link->lgr->lnk[i]; + if (link->state == SMC_LNK_ACTIVE && link != send_link) { + rkeyllc->rtoken[rtok_ix].link_id = link->link_id; + rkeyllc->rtoken[rtok_ix].rmb_key = + htonl(rmb_desc->mr_rx[link->link_idx]->rkey); + rkeyllc->rtoken[rtok_ix].rmb_vaddr = cpu_to_be64( + (u64)sg_dma_address( + rmb_desc->sgt[link->link_idx].sgl)); + rtok_ix++; + } + } + /* rkey of send_link is in rtoken[0] */ + rkeyllc->rtoken[0].num_rkeys = rtok_ix - 1; rkeyllc->rtoken[0].rmb_key = - htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(rmb_desc->mr_rx[send_link->link_idx]->rkey); rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address(rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + (u64)sg_dma_address(rmb_desc->sgt[send_link->link_idx].sgl)); /* send llc message */ - rc = smc_wr_tx_send(link, pend); + rc = smc_wr_tx_send(send_link, pend); return rc; } @@ -256,32 +447,15 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY; rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey); rkeyllc->num_rkeys = 1; - rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; } -/* prepare an add link message */ -static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc, - struct smc_link *link, u8 mac[], u8 gid[], - enum smc_llc_reqresp reqresp) -{ - memset(addllc, 0, sizeof(*addllc)); - addllc->hd.common.type = SMC_LLC_ADD_LINK; - addllc->hd.length = sizeof(struct smc_llc_msg_add_link); - if (reqresp == SMC_LLC_RESP) { - addllc->hd.flags |= SMC_LLC_FLAG_RESP; - /* always reject more links for now */ - addllc->hd.flags |= SMC_LLC_FLAG_ADD_LNK_REJ; - addllc->hd.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH; - } - memcpy(addllc->sender_mac, mac, ETH_ALEN); - memcpy(addllc->sender_gid, gid, SMC_GID_SIZE); -} - /* send ADD LINK request or response */ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], + struct smc_link *link_new, enum smc_llc_reqresp reqresp) { struct smc_llc_msg_add_link *addllc; @@ -293,32 +467,33 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], if (rc) return rc; addllc = (struct smc_llc_msg_add_link *)wr_buf; - smc_llc_prep_add_link(addllc, link, mac, gid, reqresp); + + memset(addllc, 0, sizeof(*addllc)); + addllc->hd.common.type = SMC_LLC_ADD_LINK; + addllc->hd.length = sizeof(struct smc_llc_msg_add_link); + if (reqresp == SMC_LLC_RESP) + addllc->hd.flags |= SMC_LLC_FLAG_RESP; + memcpy(addllc->sender_mac, mac, ETH_ALEN); + memcpy(addllc->sender_gid, gid, SMC_GID_SIZE); + if (link_new) { + addllc->link_num = link_new->link_id; + hton24(addllc->sender_qp_num, link_new->roce_qp->qp_num); + hton24(addllc->initial_psn, link_new->psn_initial); + if (reqresp == SMC_LLC_REQ) + addllc->qp_mtu = link_new->path_mtu; + else + addllc->qp_mtu = min(link_new->path_mtu, + link_new->peer_mtu); + } /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; } -/* prepare a delete link message */ -static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc, - struct smc_link *link, - enum smc_llc_reqresp reqresp, bool orderly) -{ - memset(delllc, 0, sizeof(*delllc)); - delllc->hd.common.type = SMC_LLC_DELETE_LINK; - delllc->hd.length = sizeof(struct smc_llc_msg_add_link); - if (reqresp == SMC_LLC_RESP) - delllc->hd.flags |= SMC_LLC_FLAG_RESP; - /* DEL_LINK_ALL because only 1 link supported */ - delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; - if (orderly) - delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; - delllc->link_num = link->link_id; -} - /* send DELETE LINK request or response */ -int smc_llc_send_delete_link(struct smc_link *link, - enum smc_llc_reqresp reqresp, bool orderly) +int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, + enum smc_llc_reqresp reqresp, bool orderly, + u32 reason) { struct smc_llc_msg_del_link *delllc; struct smc_wr_tx_pend_priv *pend; @@ -329,7 +504,19 @@ int smc_llc_send_delete_link(struct smc_link *link, if (rc) return rc; delllc = (struct smc_llc_msg_del_link *)wr_buf; - smc_llc_prep_delete_link(delllc, link, reqresp, orderly); + + memset(delllc, 0, sizeof(*delllc)); + delllc->hd.common.type = SMC_LLC_DELETE_LINK; + delllc->hd.length = sizeof(struct smc_llc_msg_del_link); + if (reqresp == SMC_LLC_RESP) + delllc->hd.flags |= SMC_LLC_FLAG_RESP; + if (orderly) + delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + if (link_del_id) + delllc->link_num = link_del_id; + else + delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; + delllc->reason = htonl(reason); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; @@ -356,238 +543,1094 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) return rc; } -struct smc_llc_send_work { - struct work_struct work; - struct smc_link *link; - int llclen; - union smc_llc_msg llcbuf; -}; - -/* worker that sends a prepared message */ -static void smc_llc_send_message_work(struct work_struct *work) +/* schedule an llc send on link, may wait for buffers */ +static int smc_llc_send_message(struct smc_link *link, void *llcbuf) { - struct smc_llc_send_work *llcwrk = container_of(work, - struct smc_llc_send_work, work); struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; int rc; - if (llcwrk->link->state == SMC_LNK_INACTIVE) - goto out; - rc = smc_llc_add_pending_send(llcwrk->link, &wr_buf, &pend); + if (!smc_link_usable(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - goto out; - memcpy(wr_buf, &llcwrk->llcbuf, llcwrk->llclen); - smc_wr_tx_send(llcwrk->link, pend); -out: - kfree(llcwrk); + return rc; + memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); + return smc_wr_tx_send(link, pend); } -/* copy llcbuf and schedule an llc send on link */ -static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen) +/* schedule an llc send on link, may wait for buffers, + * and wait for send completion notification. + * @return 0 on success + */ +static int smc_llc_send_message_wait(struct smc_link *link, void *llcbuf) { - struct smc_llc_send_work *wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; - if (!wrk) - return -ENOMEM; - INIT_WORK(&wrk->work, smc_llc_send_message_work); - wrk->link = link; - wrk->llclen = llclen; - memcpy(&wrk->llcbuf, llcbuf, llclen); - queue_work(link->llc_wq, &wrk->work); - return 0; + if (!smc_link_usable(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); + return smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME); } /********************************* receive ***********************************/ -static void smc_llc_rx_confirm_link(struct smc_link *link, - struct smc_llc_msg_confirm_link *llc) +static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, + enum smc_lgr_type lgr_new_t) { - struct smc_link_group *lgr = smc_get_lgr(link); - int conf_rc; + int i; + + if (lgr->type == SMC_LGR_SYMMETRIC || + (lgr->type != SMC_LGR_SINGLE && + (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER))) + return -EMLINK; + + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) { + for (i = SMC_LINKS_PER_LGR_MAX - 1; i >= 0; i--) + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + return i; + } else { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + return i; + } + return -EMLINK; +} - /* RMBE eyecatchers are not supported */ - if (llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC) - conf_rc = 0; - else - conf_rc = ENOTSUPP; +/* return first buffer from any of the next buf lists */ +static struct smc_buf_desc *_smc_llc_get_next_rmb(struct smc_link_group *lgr, + int *buf_lst) +{ + struct smc_buf_desc *buf_pos; + + while (*buf_lst < SMC_RMBE_SIZES) { + buf_pos = list_first_entry_or_null(&lgr->rmbs[*buf_lst], + struct smc_buf_desc, list); + if (buf_pos) + return buf_pos; + (*buf_lst)++; + } + return NULL; +} + +/* return next rmb from buffer lists */ +static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr, + int *buf_lst, + struct smc_buf_desc *buf_pos) +{ + struct smc_buf_desc *buf_next; + + if (!buf_pos || list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) { + (*buf_lst)++; + return _smc_llc_get_next_rmb(lgr, buf_lst); + } + buf_next = list_next_entry(buf_pos, list); + return buf_next; +} + +static struct smc_buf_desc *smc_llc_get_first_rmb(struct smc_link_group *lgr, + int *buf_lst) +{ + *buf_lst = 0; + return smc_llc_get_next_rmb(lgr, buf_lst, NULL); +} + +/* send one add_link_continue msg */ +static int smc_llc_add_link_cont(struct smc_link *link, + struct smc_link *link_new, u8 *num_rkeys_todo, + int *buf_lst, struct smc_buf_desc **buf_pos) +{ + struct smc_llc_msg_add_link_cont *addc_llc; + struct smc_link_group *lgr = link->lgr; + int prim_lnk_idx, lnk_idx, i, rc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + struct smc_buf_desc *rmb; + u8 n; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (lgr->role == SMC_SERV && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_resp_rc = conf_rc; - complete(&link->llc_confirm_resp); + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + addc_llc = (struct smc_llc_msg_add_link_cont *)wr_buf; + memset(addc_llc, 0, sizeof(*addc_llc)); + + prim_lnk_idx = link->link_idx; + lnk_idx = link_new->link_idx; + addc_llc->link_num = link_new->link_id; + addc_llc->num_rkeys = *num_rkeys_todo; + n = *num_rkeys_todo; + for (i = 0; i < min_t(u8, n, SMC_LLC_RKEYS_PER_CONT_MSG); i++) { + if (!*buf_pos) { + addc_llc->num_rkeys = addc_llc->num_rkeys - + *num_rkeys_todo; + *num_rkeys_todo = 0; + break; } - } else { - if (lgr->role == SMC_CLNT && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_rc = conf_rc; - link->link_id = llc->link_num; - complete(&link->llc_confirm); + rmb = *buf_pos; + + addc_llc->rt[i].rmb_key = htonl(rmb->mr_rx[prim_lnk_idx]->rkey); + addc_llc->rt[i].rmb_key_new = htonl(rmb->mr_rx[lnk_idx]->rkey); + addc_llc->rt[i].rmb_vaddr_new = + cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); + + (*num_rkeys_todo)--; + *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos); + while (*buf_pos && !(*buf_pos)->used) + *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos); + } + addc_llc->hd.common.type = SMC_LLC_ADD_LINK_CONT; + addc_llc->hd.length = sizeof(struct smc_llc_msg_add_link_cont); + if (lgr->role == SMC_CLNT) + addc_llc->hd.flags |= SMC_LLC_FLAG_RESP; + return smc_wr_tx_send(link, pend); +} + +static int smc_llc_cli_rkey_exchange(struct smc_link *link, + struct smc_link *link_new) +{ + struct smc_llc_msg_add_link_cont *addc_llc; + struct smc_link_group *lgr = link->lgr; + u8 max, num_rkeys_send, num_rkeys_recv; + struct smc_llc_qentry *qentry; + struct smc_buf_desc *buf_pos; + int buf_lst; + int rc = 0; + int i; + + mutex_lock(&lgr->rmbs_lock); + num_rkeys_send = lgr->conns_num; + buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); + do { + qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK_CONT); + if (!qentry) { + rc = -ETIMEDOUT; + break; + } + addc_llc = &qentry->msg.add_link_cont; + num_rkeys_recv = addc_llc->num_rkeys; + max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG); + for (i = 0; i < max; i++) { + smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, + addc_llc->rt[i].rmb_key, + addc_llc->rt[i].rmb_vaddr_new, + addc_llc->rt[i].rmb_key_new); + num_rkeys_recv--; } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + rc = smc_llc_add_link_cont(link, link_new, &num_rkeys_send, + &buf_lst, &buf_pos); + if (rc) + break; + } while (num_rkeys_send || num_rkeys_recv); + + mutex_unlock(&lgr->rmbs_lock); + return rc; +} + +/* prepare and send an add link reject response */ +static int smc_llc_cli_add_link_reject(struct smc_llc_qentry *qentry) +{ + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_ADD_LNK_REJ; + qentry->msg.raw.hdr.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH; + return smc_llc_send_message(qentry->link, &qentry->msg); +} + +static int smc_llc_cli_conf_link(struct smc_link *link, + struct smc_init_info *ini, + struct smc_link *link_new, + enum smc_lgr_type lgr_new_t) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_llc_qentry *qentry = NULL; + int rc = 0; + + /* receive CONFIRM LINK request over RoCE fabric */ + qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_FIRST_TIME, 0); + if (!qentry) { + rc = smc_llc_send_delete_link(link, link_new->link_id, + SMC_LLC_REQ, false, + SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + if (qentry->msg.raw.hdr.common.type != SMC_LLC_CONFIRM_LINK) { + /* received DELETE_LINK instead */ + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return -ENOLINK; + } + smc_llc_save_peer_uid(qentry); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + + rc = smc_ib_modify_qp_rts(link_new); + if (rc) { + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; } + smc_wr_remember_qp_attr(link_new); + + rc = smcr_buf_reg_lgr(link_new); + if (rc) { + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + + /* send CONFIRM LINK response over RoCE fabric */ + rc = smc_llc_send_confirm_link(link_new, SMC_LLC_RESP); + if (rc) { + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + smc_llc_link_active(link_new); + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) + smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx); + else + smcr_lgr_set_type(lgr, lgr_new_t); + return 0; } -static void smc_llc_rx_add_link(struct smc_link *link, - struct smc_llc_msg_add_link *llc) +static void smc_llc_save_add_link_info(struct smc_link *link, + struct smc_llc_msg_add_link *add_llc) { + link->peer_qpn = ntoh24(add_llc->sender_qp_num); + memcpy(link->peer_gid, add_llc->sender_gid, SMC_GID_SIZE); + memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN); + link->peer_psn = ntoh24(add_llc->initial_psn); + link->peer_mtu = add_llc->qp_mtu; +} + +/* as an SMC client, process an add link request */ +int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) +{ + struct smc_llc_msg_add_link *llc = &qentry->msg.add_link; + enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; struct smc_link_group *lgr = smc_get_lgr(link); + struct smc_link *lnk_new = NULL; + struct smc_init_info ini; + int lnk_idx, rc = 0; + + ini.vlan_id = lgr->vlan_id; + smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev); + if (!memcmp(llc->sender_gid, link->peer_gid, SMC_GID_SIZE) && + !memcmp(llc->sender_mac, link->peer_mac, ETH_ALEN)) { + if (!ini.ib_dev) + goto out_reject; + lgr_new_t = SMC_LGR_ASYMMETRIC_PEER; + } + if (!ini.ib_dev) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini.ib_dev = link->smcibdev; + ini.ib_port = link->ibport; + } + lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t); + if (lnk_idx < 0) + goto out_reject; + lnk_new = &lgr->lnk[lnk_idx]; + rc = smcr_link_init(lgr, lnk_new, lnk_idx, &ini); + if (rc) + goto out_reject; + smc_llc_save_add_link_info(lnk_new, llc); + lnk_new->link_id = llc->link_num; /* SMC server assigns link id */ + smc_llc_link_set_uid(lnk_new); - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (link->state == SMC_LNK_ACTIVATING) - complete(&link->llc_add_resp); - } else { - if (link->state == SMC_LNK_ACTIVATING) { - complete(&link->llc_add); - return; - } + rc = smc_ib_ready_link(lnk_new); + if (rc) + goto out_clear_lnk; - if (lgr->role == SMC_SERV) { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_REQ); + rc = smcr_buf_map_lgr(lnk_new); + if (rc) + goto out_clear_lnk; - } else { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_RESP); + rc = smc_llc_send_add_link(link, + lnk_new->smcibdev->mac[ini.ib_port - 1], + lnk_new->gid, lnk_new, SMC_LLC_RESP); + if (rc) + goto out_clear_lnk; + rc = smc_llc_cli_rkey_exchange(link, lnk_new); + if (rc) { + rc = 0; + goto out_clear_lnk; + } + rc = smc_llc_cli_conf_link(link, &ini, lnk_new, lgr_new_t); + if (!rc) + goto out; +out_clear_lnk: + smcr_link_clear(lnk_new, false); +out_reject: + smc_llc_cli_add_link_reject(qentry); +out: + kfree(qentry); + return rc; +} + +static void smc_llc_process_cli_add_link(struct smc_link_group *lgr) +{ + struct smc_llc_qentry *qentry; + + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + + mutex_lock(&lgr->llc_conf_mutex); + smc_llc_cli_add_link(qentry->link, qentry); + mutex_unlock(&lgr->llc_conf_mutex); +} + +static int smc_llc_active_link_count(struct smc_link_group *lgr) +{ + int i, link_count = 0; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&lgr->lnk[i])) + continue; + link_count++; + } + return link_count; +} + +/* find the asymmetric link when 3 links are established */ +static struct smc_link *smc_llc_find_asym_link(struct smc_link_group *lgr) +{ + int asym_idx = -ENOENT; + int i, j, k; + bool found; + + /* determine asymmetric link */ + found = false; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) { + if (!smc_link_usable(&lgr->lnk[i]) || + !smc_link_usable(&lgr->lnk[j])) + continue; + if (!memcmp(lgr->lnk[i].gid, lgr->lnk[j].gid, + SMC_GID_SIZE)) { + found = true; /* asym_lnk is i or j */ + break; + } } - smc_llc_send_message(link, llc, sizeof(*llc)); + if (found) + break; } + if (!found) + goto out; /* no asymmetric link */ + for (k = 0; k < SMC_LINKS_PER_LGR_MAX; k++) { + if (!smc_link_usable(&lgr->lnk[k])) + continue; + if (k != i && + !memcmp(lgr->lnk[i].peer_gid, lgr->lnk[k].peer_gid, + SMC_GID_SIZE)) { + asym_idx = i; + break; + } + if (k != j && + !memcmp(lgr->lnk[j].peer_gid, lgr->lnk[k].peer_gid, + SMC_GID_SIZE)) { + asym_idx = j; + break; + } + } +out: + return (asym_idx < 0) ? NULL : &lgr->lnk[asym_idx]; } -static void smc_llc_rx_delete_link(struct smc_link *link, - struct smc_llc_msg_del_link *llc) +static void smc_llc_delete_asym_link(struct smc_link_group *lgr) { - struct smc_link_group *lgr = smc_get_lgr(link); + struct smc_link *lnk_new = NULL, *lnk_asym; + struct smc_llc_qentry *qentry; + int rc; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (lgr->role == SMC_SERV) - smc_lgr_schedule_free_work_fast(lgr); - } else { - smc_lgr_forget(lgr); - smc_llc_link_deleting(link); - if (lgr->role == SMC_SERV) { - /* client asks to delete this link, send request */ - smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true); - } else { - /* server requests to delete this link, send response */ - smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); + lnk_asym = smc_llc_find_asym_link(lgr); + if (!lnk_asym) + return; /* no asymmetric link */ + if (!smc_link_downing(&lnk_asym->state)) + return; + lnk_new = smc_switch_conns(lgr, lnk_asym, false); + smc_wr_tx_wait_no_pending_sends(lnk_asym); + if (!lnk_new) + goto out_free; + /* change flow type from ADD_LINK into DEL_LINK */ + lgr->llc_flow_lcl.type = SMC_LLC_FLOW_DEL_LINK; + rc = smc_llc_send_delete_link(lnk_new, lnk_asym->link_id, SMC_LLC_REQ, + true, SMC_LLC_DEL_NO_ASYM_NEEDED); + if (rc) { + smcr_link_down_cond(lnk_new); + goto out_free; + } + qentry = smc_llc_wait(lgr, lnk_new, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_LINK); + if (!qentry) { + smcr_link_down_cond(lnk_new); + goto out_free; + } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); +out_free: + smcr_link_clear(lnk_asym, true); +} + +static int smc_llc_srv_rkey_exchange(struct smc_link *link, + struct smc_link *link_new) +{ + struct smc_llc_msg_add_link_cont *addc_llc; + struct smc_link_group *lgr = link->lgr; + u8 max, num_rkeys_send, num_rkeys_recv; + struct smc_llc_qentry *qentry = NULL; + struct smc_buf_desc *buf_pos; + int buf_lst; + int rc = 0; + int i; + + mutex_lock(&lgr->rmbs_lock); + num_rkeys_send = lgr->conns_num; + buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); + do { + smc_llc_add_link_cont(link, link_new, &num_rkeys_send, + &buf_lst, &buf_pos); + qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK_CONT); + if (!qentry) { + rc = -ETIMEDOUT; + goto out; } - smc_llc_send_message(link, llc, sizeof(*llc)); - smc_lgr_terminate_sched(lgr); + addc_llc = &qentry->msg.add_link_cont; + num_rkeys_recv = addc_llc->num_rkeys; + max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG); + for (i = 0; i < max; i++) { + smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, + addc_llc->rt[i].rmb_key, + addc_llc->rt[i].rmb_vaddr_new, + addc_llc->rt[i].rmb_key_new); + num_rkeys_recv--; + } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + } while (num_rkeys_send || num_rkeys_recv); +out: + mutex_unlock(&lgr->rmbs_lock); + return rc; +} + +static int smc_llc_srv_conf_link(struct smc_link *link, + struct smc_link *link_new, + enum smc_lgr_type lgr_new_t) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_llc_qentry *qentry = NULL; + int rc; + + /* send CONFIRM LINK request over the RoCE fabric */ + rc = smc_llc_send_confirm_link(link_new, SMC_LLC_REQ); + if (rc) + return -ENOLINK; + /* receive CONFIRM LINK response over the RoCE fabric */ + qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_FIRST_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { + /* send DELETE LINK */ + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; } + smc_llc_save_peer_uid(qentry); + smc_llc_link_active(link_new); + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) + smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx); + else + smcr_lgr_set_type(lgr, lgr_new_t); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return 0; } -static void smc_llc_rx_test_link(struct smc_link *link, - struct smc_llc_msg_test_link *llc) +int smc_llc_srv_add_link(struct smc_link *link) { - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (link->state == SMC_LNK_ACTIVE) - complete(&link->llc_testlink_resp); - } else { - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; + struct smc_link_group *lgr = link->lgr; + struct smc_llc_msg_add_link *add_llc; + struct smc_llc_qentry *qentry = NULL; + struct smc_link *link_new; + struct smc_init_info ini; + int lnk_idx, rc = 0; + + /* ignore client add link recommendation, start new flow */ + ini.vlan_id = lgr->vlan_id; + smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev); + if (!ini.ib_dev) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini.ib_dev = link->smcibdev; + ini.ib_port = link->ibport; + } + lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t); + if (lnk_idx < 0) + return 0; + + rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, &ini); + if (rc) + return rc; + link_new = &lgr->lnk[lnk_idx]; + rc = smc_llc_send_add_link(link, + link_new->smcibdev->mac[ini.ib_port - 1], + link_new->gid, link_new, SMC_LLC_REQ); + if (rc) + goto out_err; + /* receive ADD LINK response over the RoCE fabric */ + qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME, SMC_LLC_ADD_LINK); + if (!qentry) { + rc = -ETIMEDOUT; + goto out_err; } + add_llc = &qentry->msg.add_link; + if (add_llc->hd.flags & SMC_LLC_FLAG_ADD_LNK_REJ) { + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + rc = -ENOLINK; + goto out_err; + } + if (lgr->type == SMC_LGR_SINGLE && + (!memcmp(add_llc->sender_gid, link->peer_gid, SMC_GID_SIZE) && + !memcmp(add_llc->sender_mac, link->peer_mac, ETH_ALEN))) { + lgr_new_t = SMC_LGR_ASYMMETRIC_PEER; + } + smc_llc_save_add_link_info(link_new, add_llc); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + + rc = smc_ib_ready_link(link_new); + if (rc) + goto out_err; + rc = smcr_buf_map_lgr(link_new); + if (rc) + goto out_err; + rc = smcr_buf_reg_lgr(link_new); + if (rc) + goto out_err; + rc = smc_llc_srv_rkey_exchange(link, link_new); + if (rc) + goto out_err; + rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t); + if (rc) + goto out_err; + return 0; +out_err: + smcr_link_clear(link_new, false); + return rc; } -static void smc_llc_rx_confirm_rkey(struct smc_link *link, - struct smc_llc_msg_confirm_rkey *llc) +static void smc_llc_process_srv_add_link(struct smc_link_group *lgr) { + struct smc_link *link = lgr->llc_flow_lcl.qentry->link; int rc; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - link->llc_confirm_rkey_rc = llc->hd.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_confirm_rkey); - } else { - rc = smc_rtoken_add(smc_get_lgr(link), - llc->rtoken[0].rmb_vaddr, - llc->rtoken[0].rmb_key); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); - /* ignore rtokens for other links, we have only one link */ + mutex_lock(&lgr->llc_conf_mutex); + rc = smc_llc_srv_add_link(link); + if (!rc && lgr->type == SMC_LGR_SYMMETRIC) { + /* delete any asymmetric link */ + smc_llc_delete_asym_link(lgr); + } + mutex_unlock(&lgr->llc_conf_mutex); +} + +/* enqueue a local add_link req to trigger a new add_link flow, only as SERV */ +void smc_llc_srv_add_link_local(struct smc_link *link) +{ + struct smc_llc_msg_add_link add_llc = {0}; + + add_llc.hd.length = sizeof(add_llc); + add_llc.hd.common.type = SMC_LLC_ADD_LINK; + /* no dev and port needed, we as server ignore client data anyway */ + smc_llc_enqueue(link, (union smc_llc_msg *)&add_llc); +} + +/* worker to process an add link message */ +static void smc_llc_add_link_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_add_link_work); - llc->hd.flags |= SMC_LLC_FLAG_RESP; - if (rc < 0) - llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - smc_llc_send_message(link, llc, sizeof(*llc)); + if (list_empty(&lgr->list)) { + /* link group is terminating */ + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + goto out; } + + if (lgr->role == SMC_CLNT) + smc_llc_process_cli_add_link(lgr); + else + smc_llc_process_srv_add_link(lgr); +out: + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } -static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, - struct smc_llc_msg_confirm_rkey_cont *llc) +/* enqueue a local del_link msg to trigger a new del_link flow, + * called only for role SMC_SERV + */ +void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id) { - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - /* unused as long as we don't send this type of msg */ - } else { - /* ignore rtokens for other links, we have only one link */ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + struct smc_llc_msg_del_link del_llc = {0}; + + del_llc.hd.length = sizeof(del_llc); + del_llc.hd.common.type = SMC_LLC_DELETE_LINK; + del_llc.link_num = del_link_id; + del_llc.reason = htonl(SMC_LLC_DEL_LOST_PATH); + del_llc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + smc_llc_enqueue(link, (union smc_llc_msg *)&del_llc); +} + +static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) +{ + struct smc_link *lnk_del = NULL, *lnk_asym, *lnk; + struct smc_llc_msg_del_link *del_llc; + struct smc_llc_qentry *qentry; + int active_links; + int lnk_idx; + + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + lnk = qentry->link; + del_llc = &qentry->msg.delete_link; + + if (del_llc->hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) { + smc_lgr_terminate_sched(lgr); + goto out; + } + mutex_lock(&lgr->llc_conf_mutex); + /* delete single link */ + for (lnk_idx = 0; lnk_idx < SMC_LINKS_PER_LGR_MAX; lnk_idx++) { + if (lgr->lnk[lnk_idx].link_id != del_llc->link_num) + continue; + lnk_del = &lgr->lnk[lnk_idx]; + break; + } + del_llc->hd.flags |= SMC_LLC_FLAG_RESP; + if (!lnk_del) { + /* link was not found */ + del_llc->reason = htonl(SMC_LLC_DEL_NOLNK); + smc_llc_send_message(lnk, &qentry->msg); + goto out_unlock; + } + lnk_asym = smc_llc_find_asym_link(lgr); + + del_llc->reason = 0; + smc_llc_send_message(lnk, &qentry->msg); /* response */ + + if (smc_link_downing(&lnk_del->state)) { + smc_switch_conns(lgr, lnk_del, false); + smc_wr_tx_wait_no_pending_sends(lnk_del); + } + smcr_link_clear(lnk_del, true); + + active_links = smc_llc_active_link_count(lgr); + if (lnk_del == lnk_asym) { + /* expected deletion of asym link, don't change lgr state */ + } else if (active_links == 1) { + smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); + } else if (!active_links) { + smcr_lgr_set_type(lgr, SMC_LGR_NONE); + smc_lgr_terminate_sched(lgr); } +out_unlock: + mutex_unlock(&lgr->llc_conf_mutex); +out: + kfree(qentry); } -static void smc_llc_rx_delete_rkey(struct smc_link *link, - struct smc_llc_msg_delete_rkey *llc) +/* try to send a DELETE LINK ALL request on any active link, + * waiting for send completion + */ +void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn) { - u8 err_mask = 0; - int i, max; + struct smc_llc_msg_del_link delllc = {0}; + int i; + + delllc.hd.common.type = SMC_LLC_DELETE_LINK; + delllc.hd.length = sizeof(delllc); + if (ord) + delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; + delllc.reason = htonl(rsn); + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&lgr->lnk[i])) + continue; + if (!smc_llc_send_message_wait(&lgr->lnk[i], &delllc)) + break; + } +} - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - link->llc_delete_rkey_rc = llc->hd.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_delete_rkey); - } else { - max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); - for (i = 0; i < max; i++) { - if (smc_rtoken_delete(smc_get_lgr(link), llc->rkey[i])) - err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); +static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) +{ + struct smc_llc_msg_del_link *del_llc; + struct smc_link *lnk, *lnk_del; + struct smc_llc_qentry *qentry; + int active_links; + int i; + + mutex_lock(&lgr->llc_conf_mutex); + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + lnk = qentry->link; + del_llc = &qentry->msg.delete_link; + + if (qentry->msg.delete_link.hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) { + /* delete entire lgr */ + smc_llc_send_link_delete_all(lgr, true, ntohl( + qentry->msg.delete_link.reason)); + smc_lgr_terminate_sched(lgr); + goto out; + } + /* delete single link */ + lnk_del = NULL; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].link_id == del_llc->link_num) { + lnk_del = &lgr->lnk[i]; + break; } + } + if (!lnk_del) + goto out; /* asymmetric link already deleted */ - if (err_mask) { - llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - llc->err_mask = err_mask; + if (smc_link_downing(&lnk_del->state)) { + smc_switch_conns(lgr, lnk_del, false); + smc_wr_tx_wait_no_pending_sends(lnk_del); + } + if (!list_empty(&lgr->list)) { + /* qentry is either a request from peer (send it back to + * initiate the DELETE_LINK processing), or a locally + * enqueued DELETE_LINK request (forward it) + */ + if (!smc_llc_send_message(lnk, &qentry->msg)) { + struct smc_llc_qentry *qentry2; + + qentry2 = smc_llc_wait(lgr, lnk, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_LINK); + if (qentry2) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); } + } + smcr_link_clear(lnk_del, true); - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + active_links = smc_llc_active_link_count(lgr); + if (active_links == 1) { + smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); + } else if (!active_links) { + smcr_lgr_set_type(lgr, SMC_LGR_NONE); + smc_lgr_terminate_sched(lgr); } + + if (lgr->type == SMC_LGR_SINGLE && !list_empty(&lgr->list)) { + /* trigger setup of asymm alt link */ + smc_llc_srv_add_link_local(lnk); + } +out: + mutex_unlock(&lgr->llc_conf_mutex); + kfree(qentry); } -static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +static void smc_llc_delete_link_work(struct work_struct *work) { - struct smc_link *link = (struct smc_link *)wc->qp->qp_context; - union smc_llc_msg *llc = buf; + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_del_link_work); - if (wc->byte_len < sizeof(*llc)) - return; /* short message */ - if (llc->raw.hdr.length != sizeof(*llc)) - return; /* invalid message */ - if (link->state == SMC_LNK_INACTIVE) - return; /* link not active, drop msg */ + if (list_empty(&lgr->list)) { + /* link group is terminating */ + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + goto out; + } + + if (lgr->role == SMC_CLNT) + smc_llc_process_cli_delete_link(lgr); + else + smc_llc_process_srv_delete_link(lgr); +out: + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); +} + +/* process a confirm_rkey request from peer, remote flow */ +static void smc_llc_rmt_conf_rkey(struct smc_link_group *lgr) +{ + struct smc_llc_msg_confirm_rkey *llc; + struct smc_llc_qentry *qentry; + struct smc_link *link; + int num_entries; + int rk_idx; + int i; + + qentry = lgr->llc_flow_rmt.qentry; + llc = &qentry->msg.confirm_rkey; + link = qentry->link; + + num_entries = llc->rtoken[0].num_rkeys; + /* first rkey entry is for receiving link */ + rk_idx = smc_rtoken_add(link, + llc->rtoken[0].rmb_vaddr, + llc->rtoken[0].rmb_key); + if (rk_idx < 0) + goto out_err; + + for (i = 1; i <= min_t(u8, num_entries, SMC_LLC_RKEYS_PER_MSG - 1); i++) + smc_rtoken_set2(lgr, rk_idx, llc->rtoken[i].link_id, + llc->rtoken[i].rmb_vaddr, + llc->rtoken[i].rmb_key); + /* max links is 3 so there is no need to support conf_rkey_cont msgs */ + goto out; +out_err: + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->hd.flags |= SMC_LLC_FLAG_RKEY_RETRY; +out: + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); +} + +/* process a delete_rkey request from peer, remote flow */ +static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) +{ + struct smc_llc_msg_delete_rkey *llc; + struct smc_llc_qentry *qentry; + struct smc_link *link; + u8 err_mask = 0; + int i, max; + + qentry = lgr->llc_flow_rmt.qentry; + llc = &qentry->msg.delete_rkey; + link = qentry->link; + + max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); + for (i = 0; i < max; i++) { + if (smc_rtoken_delete(link, llc->rkey[i])) + err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); + } + if (err_mask) { + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->err_mask = err_mask; + } + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); +} + +static void smc_llc_protocol_violation(struct smc_link_group *lgr, u8 type) +{ + pr_warn_ratelimited("smc: SMC-R lg %*phN LLC protocol violation: " + "llc_type %d\n", SMC_LGR_ID_SIZE, &lgr->id, type); + smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_PROT_VIOL); + smc_lgr_terminate_sched(lgr); +} + +/* flush the llc event queue */ +static void smc_llc_event_flush(struct smc_link_group *lgr) +{ + struct smc_llc_qentry *qentry, *q; + + spin_lock_bh(&lgr->llc_event_q_lock); + list_for_each_entry_safe(qentry, q, &lgr->llc_event_q, list) { + list_del_init(&qentry->list); + kfree(qentry); + } + spin_unlock_bh(&lgr->llc_event_q_lock); +} + +static void smc_llc_event_handler(struct smc_llc_qentry *qentry) +{ + union smc_llc_msg *llc = &qentry->msg; + struct smc_link *link = qentry->link; + struct smc_link_group *lgr = link->lgr; + + if (!smc_link_usable(link)) + goto out; switch (llc->raw.hdr.common.type) { case SMC_LLC_TEST_LINK: - smc_llc_rx_test_link(link, &llc->test_link); - break; - case SMC_LLC_CONFIRM_LINK: - smc_llc_rx_confirm_link(link, &llc->confirm_link); + llc->test_link.hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); break; case SMC_LLC_ADD_LINK: - smc_llc_rx_add_link(link, &llc->add_link); + if (list_empty(&lgr->list)) + goto out; /* lgr is terminating */ + if (lgr->role == SMC_CLNT) { + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK) { + /* a flow is waiting for this message */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + wake_up_interruptible(&lgr->llc_waiter); + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, + qentry)) { + schedule_work(&lgr->llc_add_link_work); + } + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { + /* as smc server, handle client suggestion */ + schedule_work(&lgr->llc_add_link_work); + } + return; + case SMC_LLC_CONFIRM_LINK: + case SMC_LLC_ADD_LINK_CONT: + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* a flow is waiting for this message */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry); + wake_up_interruptible(&lgr->llc_waiter); + return; + } break; case SMC_LLC_DELETE_LINK: - smc_llc_rx_delete_link(link, &llc->delete_link); - break; + if (lgr->role == SMC_CLNT) { + /* server requests to delete this link, send response */ + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* DEL LINK REQ during ADD LINK SEQ */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + wake_up_interruptible(&lgr->llc_waiter); + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, + qentry)) { + schedule_work(&lgr->llc_del_link_work); + } + } else { + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK && + !lgr->llc_flow_lcl.qentry) { + /* DEL LINK REQ during ADD LINK SEQ */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + wake_up_interruptible(&lgr->llc_waiter); + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, + qentry)) { + schedule_work(&lgr->llc_del_link_work); + } + } + return; case SMC_LLC_CONFIRM_RKEY: - smc_llc_rx_confirm_rkey(link, &llc->confirm_rkey); - break; + /* new request from remote, assign to remote flow */ + if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) { + /* process here, does not wait for more llc msgs */ + smc_llc_rmt_conf_rkey(lgr); + smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); + } + return; case SMC_LLC_CONFIRM_RKEY_CONT: - smc_llc_rx_confirm_rkey_cont(link, &llc->confirm_rkey_cont); + /* not used because max links is 3, and 3 rkeys fit into + * one CONFIRM_RKEY message + */ break; case SMC_LLC_DELETE_RKEY: - smc_llc_rx_delete_rkey(link, &llc->delete_rkey); + /* new request from remote, assign to remote flow */ + if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) { + /* process here, does not wait for more llc msgs */ + smc_llc_rmt_delete_rkey(lgr); + smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); + } + return; + default: + smc_llc_protocol_violation(lgr, llc->raw.hdr.common.type); break; } +out: + kfree(qentry); +} + +/* worker to process llc messages on the event queue */ +static void smc_llc_event_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_event_work); + struct smc_llc_qentry *qentry; + + if (!lgr->llc_flow_lcl.type && lgr->delayed_event) { + if (smc_link_usable(lgr->delayed_event->link)) { + smc_llc_event_handler(lgr->delayed_event); + } else { + qentry = lgr->delayed_event; + lgr->delayed_event = NULL; + kfree(qentry); + } + } + +again: + spin_lock_bh(&lgr->llc_event_q_lock); + if (!list_empty(&lgr->llc_event_q)) { + qentry = list_first_entry(&lgr->llc_event_q, + struct smc_llc_qentry, list); + list_del_init(&qentry->list); + spin_unlock_bh(&lgr->llc_event_q_lock); + smc_llc_event_handler(qentry); + goto again; + } + spin_unlock_bh(&lgr->llc_event_q_lock); +} + +/* process llc responses in tasklet context */ +static void smc_llc_rx_response(struct smc_link *link, + struct smc_llc_qentry *qentry) +{ + u8 llc_type = qentry->msg.raw.hdr.common.type; + + switch (llc_type) { + case SMC_LLC_TEST_LINK: + if (link->state == SMC_LNK_ACTIVE) + complete(&link->llc_testlink_resp); + break; + case SMC_LLC_ADD_LINK: + case SMC_LLC_DELETE_LINK: + case SMC_LLC_CONFIRM_LINK: + case SMC_LLC_ADD_LINK_CONT: + case SMC_LLC_CONFIRM_RKEY: + case SMC_LLC_DELETE_RKEY: + /* assign responses to the local flow, we requested them */ + smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry); + wake_up_interruptible(&link->lgr->llc_waiter); + return; + case SMC_LLC_CONFIRM_RKEY_CONT: + /* not used because max links is 3 */ + break; + default: + smc_llc_protocol_violation(link->lgr, llc_type); + break; + } + kfree(qentry); +} + +static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_llc_qentry *qentry; + unsigned long flags; + + qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC); + if (!qentry) + return; + qentry->link = link; + INIT_LIST_HEAD(&qentry->list); + memcpy(&qentry->msg, llc, sizeof(union smc_llc_msg)); + + /* process responses immediately */ + if (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) { + smc_llc_rx_response(link, qentry); + return; + } + + /* add requests to event queue */ + spin_lock_irqsave(&lgr->llc_event_q_lock, flags); + list_add_tail(&qentry->list, &lgr->llc_event_q); + spin_unlock_irqrestore(&lgr->llc_event_q_lock, flags); + schedule_work(&link->lgr->llc_event_work); +} + +/* copy received msg and add it to the event queue */ +static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + union smc_llc_msg *llc = buf; + + if (wc->byte_len < sizeof(*llc)) + return; /* short message */ + if (llc->raw.hdr.length != sizeof(*llc)) + return; /* invalid message */ + + smc_llc_enqueue(link, llc); } /***************************** worker, utils *********************************/ @@ -613,112 +1656,162 @@ static void smc_llc_testlink_work(struct work_struct *work) /* receive TEST LINK response over RoCE fabric */ rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp, SMC_LLC_WAIT_TIME); + if (link->state != SMC_LNK_ACTIVE) + return; /* link state changed */ if (rc <= 0) { - smc_lgr_terminate_sched(smc_get_lgr(link)); + smcr_link_down_cond_sched(link); return; } next_interval = link->llc_testlink_time; out: - queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk, - next_interval); + schedule_delayed_work(&link->llc_testlink_wrk, next_interval); } -int smc_llc_link_init(struct smc_link *link) +void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) { - struct smc_link_group *lgr = smc_get_lgr(link); - link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM, - *((u32 *)lgr->id), - link->link_id); - if (!link->llc_wq) - return -ENOMEM; - init_completion(&link->llc_confirm); - init_completion(&link->llc_confirm_resp); - init_completion(&link->llc_add); - init_completion(&link->llc_add_resp); - init_completion(&link->llc_confirm_rkey); - init_completion(&link->llc_delete_rkey); - mutex_init(&link->llc_delete_rkey_mutex); - init_completion(&link->llc_testlink_resp); - INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); - return 0; + struct net *net = sock_net(smc->clcsock->sk); + + INIT_WORK(&lgr->llc_event_work, smc_llc_event_work); + INIT_WORK(&lgr->llc_add_link_work, smc_llc_add_link_work); + INIT_WORK(&lgr->llc_del_link_work, smc_llc_delete_link_work); + INIT_LIST_HEAD(&lgr->llc_event_q); + spin_lock_init(&lgr->llc_event_q_lock); + spin_lock_init(&lgr->llc_flow_lock); + init_waitqueue_head(&lgr->llc_waiter); + mutex_init(&lgr->llc_conf_mutex); + lgr->llc_testlink_time = net->ipv4.sysctl_tcp_keepalive_time; } -void smc_llc_link_active(struct smc_link *link, int testlink_time) +/* called after lgr was removed from lgr_list */ +void smc_llc_lgr_clear(struct smc_link_group *lgr) { - link->state = SMC_LNK_ACTIVE; - if (testlink_time) { - link->llc_testlink_time = testlink_time * HZ; - queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk, - link->llc_testlink_time); + smc_llc_event_flush(lgr); + wake_up_interruptible_all(&lgr->llc_waiter); + cancel_work_sync(&lgr->llc_event_work); + cancel_work_sync(&lgr->llc_add_link_work); + cancel_work_sync(&lgr->llc_del_link_work); + if (lgr->delayed_event) { + kfree(lgr->delayed_event); + lgr->delayed_event = NULL; } } -void smc_llc_link_deleting(struct smc_link *link) +int smc_llc_link_init(struct smc_link *link) { - link->state = SMC_LNK_DELETING; - smc_wr_wakeup_tx_wait(link); + init_completion(&link->llc_testlink_resp); + INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); + return 0; } -/* called in tasklet context */ -void smc_llc_link_inactive(struct smc_link *link) +void smc_llc_link_active(struct smc_link *link) { - link->state = SMC_LNK_INACTIVE; - cancel_delayed_work(&link->llc_testlink_wrk); - smc_wr_wakeup_reg_wait(link); - smc_wr_wakeup_tx_wait(link); + pr_warn_ratelimited("smc: SMC-R lg %*phN link added: id %*phN, " + "peerid %*phN, ibdev %s, ibport %d\n", + SMC_LGR_ID_SIZE, &link->lgr->id, + SMC_LGR_ID_SIZE, &link->link_uid, + SMC_LGR_ID_SIZE, &link->peer_link_uid, + link->smcibdev->ibdev->name, link->ibport); + link->state = SMC_LNK_ACTIVE; + if (link->lgr->llc_testlink_time) { + link->llc_testlink_time = link->lgr->llc_testlink_time * HZ; + schedule_delayed_work(&link->llc_testlink_wrk, + link->llc_testlink_time); + } } /* called in worker context */ -void smc_llc_link_clear(struct smc_link *link) +void smc_llc_link_clear(struct smc_link *link, bool log) { - flush_workqueue(link->llc_wq); - destroy_workqueue(link->llc_wq); + if (log) + pr_warn_ratelimited("smc: SMC-R lg %*phN link removed: id %*phN" + ", peerid %*phN, ibdev %s, ibport %d\n", + SMC_LGR_ID_SIZE, &link->lgr->id, + SMC_LGR_ID_SIZE, &link->link_uid, + SMC_LGR_ID_SIZE, &link->peer_link_uid, + link->smcibdev->ibdev->name, link->ibport); + complete(&link->llc_testlink_resp); + cancel_delayed_work_sync(&link->llc_testlink_wrk); + smc_wr_wakeup_reg_wait(link); + smc_wr_wakeup_tx_wait(link); } -/* register a new rtoken at the remote peer */ -int smc_llc_do_confirm_rkey(struct smc_link *link, +/* register a new rtoken at the remote peer (for all links) */ +int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc) { - int rc; + struct smc_link_group *lgr = send_link->lgr; + struct smc_llc_qentry *qentry = NULL; + int rc = 0; - /* protected by mutex smc_create_lgr_pending */ - reinit_completion(&link->llc_confirm_rkey); - rc = smc_llc_send_confirm_rkey(link, rmb_desc); + rc = smc_llc_send_confirm_rkey(send_link, rmb_desc); if (rc) - return rc; + goto out; /* receive CONFIRM RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout(&link->llc_confirm_rkey, - SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_confirm_rkey_rc) - return -EFAULT; - return 0; + qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_RKEY); + if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG)) + rc = -EFAULT; +out: + if (qentry) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return rc; } /* unregister an rtoken at the remote peer */ -int smc_llc_do_delete_rkey(struct smc_link *link, +int smc_llc_do_delete_rkey(struct smc_link_group *lgr, struct smc_buf_desc *rmb_desc) { + struct smc_llc_qentry *qentry = NULL; + struct smc_link *send_link; int rc = 0; - mutex_lock(&link->llc_delete_rkey_mutex); - if (link->state != SMC_LNK_ACTIVE) - goto out; - reinit_completion(&link->llc_delete_rkey); - rc = smc_llc_send_delete_rkey(link, rmb_desc); + send_link = smc_llc_usable_link(lgr); + if (!send_link) + return -ENOLINK; + + /* protected by llc_flow control */ + rc = smc_llc_send_delete_rkey(send_link, rmb_desc); if (rc) goto out; /* receive DELETE RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout(&link->llc_delete_rkey, - SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_delete_rkey_rc) + qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_RKEY); + if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG)) rc = -EFAULT; - else - rc = 0; out: - mutex_unlock(&link->llc_delete_rkey_mutex); + if (qentry) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); return rc; } +void smc_llc_link_set_uid(struct smc_link *link) +{ + __be32 link_uid; + + link_uid = htonl(*((u32 *)link->lgr->id) + link->link_id); + memcpy(link->link_uid, &link_uid, SMC_LGR_ID_SIZE); +} + +/* save peers link user id, used for debug purposes */ +void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry) +{ + memcpy(qentry->link->peer_link_uid, qentry->msg.confirm_link.link_uid, + SMC_LGR_ID_SIZE); +} + +/* evaluate confirm link request or response */ +int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, + enum smc_llc_reqresp type) +{ + if (type == SMC_LLC_REQ) { /* SMC server assigns link_id */ + qentry->link->link_id = qentry->msg.confirm_link.link_num; + smc_llc_link_set_uid(qentry->link); + } + if (!(qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) + return -ENOTSUPP; + return 0; +} + /***************************** init, exit, misc ******************************/ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { @@ -736,6 +1829,10 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { }, { .handler = smc_llc_rx_handler, + .type = SMC_LLC_ADD_LINK_CONT + }, + { + .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_LINK }, { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 461c0c3ef76e..a5d2fe3eea61 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -28,6 +28,7 @@ enum smc_llc_reqresp { enum smc_llc_msg_type { SMC_LLC_CONFIRM_LINK = 0x01, SMC_LLC_ADD_LINK = 0x02, + SMC_LLC_ADD_LINK_CONT = 0x03, SMC_LLC_DELETE_LINK = 0x04, SMC_LLC_CONFIRM_RKEY = 0x06, SMC_LLC_TEST_LINK = 0x07, @@ -35,22 +36,74 @@ enum smc_llc_msg_type { SMC_LLC_DELETE_RKEY = 0x09, }; +#define smc_link_downing(state) \ + (cmpxchg(state, SMC_LNK_ACTIVE, SMC_LNK_INACTIVE) == SMC_LNK_ACTIVE) + +/* LLC DELETE LINK Request Reason Codes */ +#define SMC_LLC_DEL_LOST_PATH 0x00010000 +#define SMC_LLC_DEL_OP_INIT_TERM 0x00020000 +#define SMC_LLC_DEL_PROG_INIT_TERM 0x00030000 +#define SMC_LLC_DEL_PROT_VIOL 0x00040000 +#define SMC_LLC_DEL_NO_ASYM_NEEDED 0x00050000 +/* LLC DELETE LINK Response Reason Codes */ +#define SMC_LLC_DEL_NOLNK 0x00100000 /* Unknown Link ID (no link) */ +#define SMC_LLC_DEL_NOLGR 0x00200000 /* Unknown Link Group */ + +/* returns a usable link of the link group, or NULL */ +static inline struct smc_link *smc_llc_usable_link(struct smc_link_group *lgr) +{ + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (smc_link_usable(&lgr->lnk[i])) + return &lgr->lnk[i]; + return NULL; +} + +/* set the termination reason code for the link group */ +static inline void smc_llc_set_termination_rsn(struct smc_link_group *lgr, + u32 rsn) +{ + if (!lgr->llc_termination_rsn) + lgr->llc_termination_rsn = rsn; +} + /* transmit */ int smc_llc_send_confirm_link(struct smc_link *lnk, enum smc_llc_reqresp reqresp); int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], + struct smc_link *link_new, enum smc_llc_reqresp reqresp); -int smc_llc_send_delete_link(struct smc_link *link, - enum smc_llc_reqresp reqresp, bool orderly); +int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, + enum smc_llc_reqresp reqresp, bool orderly, + u32 reason); +void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id); +void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); +void smc_llc_lgr_clear(struct smc_link_group *lgr); int smc_llc_link_init(struct smc_link *link); -void smc_llc_link_active(struct smc_link *link, int testlink_time); -void smc_llc_link_deleting(struct smc_link *link); -void smc_llc_link_inactive(struct smc_link *link); -void smc_llc_link_clear(struct smc_link *link); -int smc_llc_do_confirm_rkey(struct smc_link *link, +void smc_llc_link_active(struct smc_link *link); +void smc_llc_link_clear(struct smc_link *link, bool log); +int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc); -int smc_llc_do_delete_rkey(struct smc_link *link, +int smc_llc_do_delete_rkey(struct smc_link_group *lgr, struct smc_buf_desc *rmb_desc); +int smc_llc_flow_initiate(struct smc_link_group *lgr, + enum smc_llc_flowtype type); +void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow); +int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, + enum smc_llc_reqresp type); +void smc_llc_link_set_uid(struct smc_link *link); +void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry); +struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, + struct smc_link *lnk, + int time_out, u8 exp_msg); +struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow); +void smc_llc_flow_qentry_del(struct smc_llc_flow *flow); +void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, + u32 rsn); +int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry); +int smc_llc_srv_add_link(struct smc_link *link); +void smc_llc_srv_add_link_local(struct smc_link *link); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 2a5ed47c3e08..014d91b9778e 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -32,7 +32,7 @@ static struct net_device *pnet_find_base_ndev(struct net_device *ndev); -static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { +static const struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { [SMC_PNETID_NAME] = { .type = NLA_NUL_STRING, .len = SMC_MAX_PNETID_LEN @@ -50,29 +50,26 @@ static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { static struct genl_family smc_pnet_nl_family; -/** - * struct smc_user_pnetentry - pnet identifier name entry for/from user - * @list: List node. - * @pnet_name: Pnet identifier name - * @ndev: pointer to network device. - * @smcibdev: Pointer to IB device. - * @ib_port: Port of IB device. - * @smcd_dev: Pointer to smcd device. - */ -struct smc_user_pnetentry { - struct list_head list; - char pnet_name[SMC_MAX_PNETID_LEN + 1]; - struct net_device *ndev; - struct smc_ib_device *smcibdev; - u8 ib_port; - struct smcd_dev *smcd_dev; +enum smc_pnet_nametype { + SMC_PNET_ETH = 1, + SMC_PNET_IB = 2, }; /* pnet entry stored in pnet table */ struct smc_pnetentry { struct list_head list; char pnet_name[SMC_MAX_PNETID_LEN + 1]; - struct net_device *ndev; + enum smc_pnet_nametype type; + union { + struct { + char eth_name[IFNAMSIZ + 1]; + struct net_device *ndev; + }; + struct { + char ib_name[IB_DEVICE_NAME_MAX + 1]; + u8 ib_port; + }; + }; }; /* Check if two given pnetids match */ @@ -106,14 +103,21 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - /* remove netdevices */ + /* remove table entry */ write_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (!pnet_name || smc_pnet_match(pnetelem->pnet_name, pnet_name)) { list_del(&pnetelem->list); - dev_put(pnetelem->ndev); + if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) { + dev_put(pnetelem->ndev); + pr_warn_ratelimited("smc: net device %s " + "erased user defined " + "pnetid %.16s\n", + pnetelem->eth_name, + pnetelem->pnet_name); + } kfree(pnetelem); rc = 0; } @@ -132,6 +136,12 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) (!pnet_name || smc_pnet_match(pnet_name, ibdev->pnetid[ibport]))) { + pr_warn_ratelimited("smc: ib device %s ibport " + "%d erased user defined " + "pnetid %.16s\n", + ibdev->ibdev->name, + ibport + 1, + ibdev->pnetid[ibport]); memset(ibdev->pnetid[ibport], 0, SMC_MAX_PNETID_LEN); ibdev->pnetid_by_user[ibport] = false; @@ -146,6 +156,10 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) if (smcd_dev->pnetid_by_user && (!pnet_name || smc_pnet_match(pnet_name, smcd_dev->pnetid))) { + pr_warn_ratelimited("smc: smcd device %s " + "erased user defined pnetid " + "%.16s\n", dev_name(&smcd_dev->dev), + smcd_dev->pnetid); memset(smcd_dev->pnetid, 0, SMC_MAX_PNETID_LEN); smcd_dev->pnetid_by_user = false; rc = 0; @@ -155,9 +169,9 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) return rc; } -/* Remove a pnet entry mentioning a given network device from the pnet table. +/* Add the reference to a given network device to the pnet table. */ -static int smc_pnet_remove_by_ndev(struct net_device *ndev) +static int smc_pnet_add_by_ndev(struct net_device *ndev) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; @@ -171,11 +185,15 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) write_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { - if (pnetelem->ndev == ndev) { - list_del(&pnetelem->list); - dev_put(pnetelem->ndev); - kfree(pnetelem); + if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev && + !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) { + dev_hold(ndev); + pnetelem->ndev = ndev; rc = 0; + pr_warn_ratelimited("smc: adding net device %s with " + "user defined pnetid %.16s\n", + pnetelem->eth_name, + pnetelem->pnet_name); break; } } @@ -183,80 +201,71 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) return rc; } -/* Append a pnetid to the end of the pnet table if not already on this list. +/* Remove the reference to a given network device from the pnet table. */ -static int smc_pnet_enter(struct smc_pnettable *pnettable, - struct smc_user_pnetentry *new_pnetelem) +static int smc_pnet_remove_by_ndev(struct net_device *ndev) { - u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; - u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; - struct smc_pnetentry *tmp_pnetelem; - struct smc_pnetentry *pnetelem; - bool new_smcddev = false; - struct net_device *ndev; - bool new_netdev = true; - bool new_ibdev = false; - - if (new_pnetelem->smcibdev) { - struct smc_ib_device *ib_dev = new_pnetelem->smcibdev; - int ib_port = new_pnetelem->ib_port; + struct smc_pnetentry *pnetelem, *tmp_pe; + struct smc_pnettable *pnettable; + struct net *net = dev_net(ndev); + struct smc_net *sn; + int rc = -ENOENT; - spin_lock(&smc_ib_devices.lock); - if (smc_pnet_match(ib_dev->pnetid[ib_port - 1], pnet_null)) { - memcpy(ib_dev->pnetid[ib_port - 1], - new_pnetelem->pnet_name, SMC_MAX_PNETID_LEN); - ib_dev->pnetid_by_user[ib_port - 1] = true; - new_ibdev = true; - } - spin_unlock(&smc_ib_devices.lock); - } - if (new_pnetelem->smcd_dev) { - struct smcd_dev *smcd_dev = new_pnetelem->smcd_dev; + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; - spin_lock(&smcd_dev_list.lock); - if (smc_pnet_match(smcd_dev->pnetid, pnet_null)) { - memcpy(smcd_dev->pnetid, new_pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - smcd_dev->pnetid_by_user = true; - new_smcddev = true; + write_lock(&pnettable->lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { + if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) { + dev_put(pnetelem->ndev); + pnetelem->ndev = NULL; + rc = 0; + pr_warn_ratelimited("smc: removing net device %s with " + "user defined pnetid %.16s\n", + pnetelem->eth_name, + pnetelem->pnet_name); + break; } - spin_unlock(&smcd_dev_list.lock); } + write_unlock(&pnettable->lock); + return rc; +} - if (!new_pnetelem->ndev) - return (new_ibdev || new_smcddev) ? 0 : -EEXIST; +/* Apply pnetid to ib device when no pnetid is set. + */ +static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port, + char *pnet_name) +{ + u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; + bool applied = false; - /* check if (base) netdev already has a pnetid. If there is one, we do - * not want to add a pnet table entry - */ - ndev = pnet_find_base_ndev(new_pnetelem->ndev); - if (!smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, - ndev_pnetid)) - return (new_ibdev || new_smcddev) ? 0 : -EEXIST; + spin_lock(&smc_ib_devices.lock); + if (smc_pnet_match(ib_dev->pnetid[ib_port - 1], pnet_null)) { + memcpy(ib_dev->pnetid[ib_port - 1], pnet_name, + SMC_MAX_PNETID_LEN); + ib_dev->pnetid_by_user[ib_port - 1] = true; + applied = true; + } + spin_unlock(&smc_ib_devices.lock); + return applied; +} - /* add a new netdev entry to the pnet table if there isn't one */ - tmp_pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL); - if (!tmp_pnetelem) - return -ENOMEM; - memcpy(tmp_pnetelem->pnet_name, new_pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - tmp_pnetelem->ndev = new_pnetelem->ndev; +/* Apply pnetid to smcd device when no pnetid is set. + */ +static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name) +{ + u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; + bool applied = false; - write_lock(&pnettable->lock); - list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { - if (pnetelem->ndev == new_pnetelem->ndev) - new_netdev = false; - } - if (new_netdev) { - dev_hold(tmp_pnetelem->ndev); - list_add_tail(&tmp_pnetelem->list, &pnettable->pnetlist); - write_unlock(&pnettable->lock); - } else { - write_unlock(&pnettable->lock); - kfree(tmp_pnetelem); + spin_lock(&smcd_dev_list.lock); + if (smc_pnet_match(smcd_dev->pnetid, pnet_null)) { + memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN); + smcd_dev->pnetid_by_user = true; + applied = true; } - - return (new_netdev || new_ibdev || new_smcddev) ? 0 : -EEXIST; + spin_unlock(&smcd_dev_list.lock); + return applied; } /* The limit for pnetid is 16 characters. @@ -323,57 +332,184 @@ out: return smcd_dev; } -/* Parse the supplied netlink attributes and fill a pnetentry structure. - * For ethernet and infiniband device names verify that the devices exist. +static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, + char *eth_name, char *pnet_name) +{ + struct smc_pnetentry *tmp_pe, *new_pe; + struct net_device *ndev, *base_ndev; + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + bool new_netdev; + int rc; + + /* check if (base) netdev already has a pnetid. If there is one, we do + * not want to add a pnet table entry + */ + rc = -EEXIST; + ndev = dev_get_by_name(net, eth_name); /* dev_hold() */ + if (ndev) { + base_ndev = pnet_find_base_ndev(ndev); + if (!smc_pnetid_by_dev_port(base_ndev->dev.parent, + base_ndev->dev_port, ndev_pnetid)) + goto out_put; + } + + /* add a new netdev entry to the pnet table if there isn't one */ + rc = -ENOMEM; + new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); + if (!new_pe) + goto out_put; + new_pe->type = SMC_PNET_ETH; + memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); + strncpy(new_pe->eth_name, eth_name, IFNAMSIZ); + new_pe->ndev = ndev; + + rc = -EEXIST; + new_netdev = true; + write_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_ETH && + !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) { + new_netdev = false; + break; + } + } + if (new_netdev) { + list_add_tail(&new_pe->list, &pnettable->pnetlist); + write_unlock(&pnettable->lock); + } else { + write_unlock(&pnettable->lock); + kfree(new_pe); + goto out_put; + } + if (ndev) + pr_warn_ratelimited("smc: net device %s " + "applied user defined pnetid %.16s\n", + new_pe->eth_name, new_pe->pnet_name); + return 0; + +out_put: + if (ndev) + dev_put(ndev); + return rc; +} + +static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, + u8 ib_port, char *pnet_name) +{ + struct smc_pnetentry *tmp_pe, *new_pe; + struct smc_ib_device *ib_dev; + bool smcddev_applied = true; + bool ibdev_applied = true; + struct smcd_dev *smcd_dev; + bool new_ibdev; + + /* try to apply the pnetid to active devices */ + ib_dev = smc_pnet_find_ib(ib_name); + if (ib_dev) { + ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name); + if (ibdev_applied) + pr_warn_ratelimited("smc: ib device %s ibport %d " + "applied user defined pnetid " + "%.16s\n", ib_dev->ibdev->name, + ib_port, + ib_dev->pnetid[ib_port - 1]); + } + smcd_dev = smc_pnet_find_smcd(ib_name); + if (smcd_dev) { + smcddev_applied = smc_pnet_apply_smcd(smcd_dev, pnet_name); + if (smcddev_applied) + pr_warn_ratelimited("smc: smcd device %s " + "applied user defined pnetid " + "%.16s\n", dev_name(&smcd_dev->dev), + smcd_dev->pnetid); + } + /* Apply fails when a device has a hardware-defined pnetid set, do not + * add a pnet table entry in that case. + */ + if (!ibdev_applied || !smcddev_applied) + return -EEXIST; + + /* add a new ib entry to the pnet table if there isn't one */ + new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); + if (!new_pe) + return -ENOMEM; + new_pe->type = SMC_PNET_IB; + memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); + strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX); + new_pe->ib_port = ib_port; + + new_ibdev = true; + write_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + new_ibdev = false; + break; + } + } + if (new_ibdev) { + list_add_tail(&new_pe->list, &pnettable->pnetlist); + write_unlock(&pnettable->lock); + } else { + write_unlock(&pnettable->lock); + kfree(new_pe); + } + return (new_ibdev) ? 0 : -EEXIST; +} + +/* Append a pnetid to the end of the pnet table if not already on this list. */ -static int smc_pnet_fill_entry(struct net *net, - struct smc_user_pnetentry *pnetelem, - struct nlattr *tb[]) +static int smc_pnet_enter(struct net *net, struct nlattr *tb[]) { - char *string, *ibname; + char pnet_name[SMC_MAX_PNETID_LEN + 1]; + struct smc_pnettable *pnettable; + bool new_netdev = false; + bool new_ibdev = false; + struct smc_net *sn; + u8 ibport = 1; + char *string; int rc; - memset(pnetelem, 0, sizeof(*pnetelem)); - INIT_LIST_HEAD(&pnetelem->list); + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; rc = -EINVAL; if (!tb[SMC_PNETID_NAME]) goto error; string = (char *)nla_data(tb[SMC_PNETID_NAME]); - if (!smc_pnetid_valid(string, pnetelem->pnet_name)) + if (!smc_pnetid_valid(string, pnet_name)) goto error; - rc = -EINVAL; if (tb[SMC_PNETID_ETHNAME]) { string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); - pnetelem->ndev = dev_get_by_name(net, string); - if (!pnetelem->ndev) + rc = smc_pnet_add_eth(pnettable, net, string, pnet_name); + if (!rc) + new_netdev = true; + else if (rc != -EEXIST) goto error; } /* if this is not the initial namespace, stop here */ if (net != &init_net) - return 0; + return new_netdev ? 0 : -EEXIST; rc = -EINVAL; if (tb[SMC_PNETID_IBNAME]) { - ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]); - ibname = strim(ibname); - pnetelem->smcibdev = smc_pnet_find_ib(ibname); - pnetelem->smcd_dev = smc_pnet_find_smcd(ibname); - if (!pnetelem->smcibdev && !pnetelem->smcd_dev) - goto error; - if (pnetelem->smcibdev) { - if (!tb[SMC_PNETID_IBPORT]) - goto error; - pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]); - if (pnetelem->ib_port < 1 || - pnetelem->ib_port > SMC_MAX_PORTS) + string = (char *)nla_data(tb[SMC_PNETID_IBNAME]); + string = strim(string); + if (tb[SMC_PNETID_IBPORT]) { + ibport = nla_get_u8(tb[SMC_PNETID_IBPORT]); + if (ibport < 1 || ibport > SMC_MAX_PORTS) goto error; } + rc = smc_pnet_add_ib(pnettable, string, ibport, pnet_name); + if (!rc) + new_ibdev = true; + else if (rc != -EEXIST) + goto error; } - - return 0; + return (new_netdev || new_ibdev) ? 0 : -EEXIST; error: return rc; @@ -381,28 +517,22 @@ error: /* Convert an smc_pnetentry to a netlink attribute sequence */ static int smc_pnet_set_nla(struct sk_buff *msg, - struct smc_user_pnetentry *pnetelem) + struct smc_pnetentry *pnetelem) { if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name)) return -1; - if (pnetelem->ndev) { + if (pnetelem->type == SMC_PNET_ETH) { if (nla_put_string(msg, SMC_PNETID_ETHNAME, - pnetelem->ndev->name)) + pnetelem->eth_name)) return -1; } else { if (nla_put_string(msg, SMC_PNETID_ETHNAME, "n/a")) return -1; } - if (pnetelem->smcibdev) { - if (nla_put_string(msg, SMC_PNETID_IBNAME, - dev_name(pnetelem->smcibdev->ibdev->dev.parent)) || + if (pnetelem->type == SMC_PNET_IB) { + if (nla_put_string(msg, SMC_PNETID_IBNAME, pnetelem->ib_name) || nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port)) return -1; - } else if (pnetelem->smcd_dev) { - if (nla_put_string(msg, SMC_PNETID_IBNAME, - dev_name(&pnetelem->smcd_dev->dev)) || - nla_put_u8(msg, SMC_PNETID_IBPORT, 1)) - return -1; } else { if (nla_put_string(msg, SMC_PNETID_IBNAME, "n/a") || nla_put_u8(msg, SMC_PNETID_IBPORT, 0xff)) @@ -415,21 +545,8 @@ static int smc_pnet_set_nla(struct sk_buff *msg, static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); - struct smc_user_pnetentry pnetelem; - struct smc_pnettable *pnettable; - struct smc_net *sn; - int rc; - - /* get pnettable for namespace */ - sn = net_generic(net, smc_net_id); - pnettable = &sn->pnettable; - rc = smc_pnet_fill_entry(net, &pnetelem, info->attrs); - if (!rc) - rc = smc_pnet_enter(pnettable, &pnetelem); - if (pnetelem.ndev) - dev_put(pnetelem.ndev); - return rc; + return smc_pnet_enter(net, info->attrs); } static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info) @@ -450,7 +567,7 @@ static int smc_pnet_dump_start(struct netlink_callback *cb) static int smc_pnet_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq, u32 flags, - struct smc_user_pnetentry *pnetelem) + struct smc_pnetentry *pnetelem) { void *hdr; @@ -469,91 +586,32 @@ static int smc_pnet_dumpinfo(struct sk_buff *skb, static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u8 *pnetid, int start_idx) { - struct smc_user_pnetentry tmp_entry; struct smc_pnettable *pnettable; struct smc_pnetentry *pnetelem; - struct smc_ib_device *ibdev; - struct smcd_dev *smcd_dev; struct smc_net *sn; int idx = 0; - int ibport; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - /* dump netdevices */ + /* dump pnettable entries */ read_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid)) continue; if (idx++ < start_idx) continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - tmp_entry.ndev = pnetelem->ndev; + /* if this is not the initial namespace, dump only netdev */ + if (net != &init_net && pnetelem->type != SMC_PNET_ETH) + continue; if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, - &tmp_entry)) { + pnetelem)) { --idx; break; } } read_unlock(&pnettable->lock); - - /* if this is not the initial namespace, stop here */ - if (net != &init_net) - return idx; - - /* dump ib devices */ - spin_lock(&smc_ib_devices.lock); - list_for_each_entry(ibdev, &smc_ib_devices.list, list) { - for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) { - if (ibdev->pnetid_by_user[ibport]) { - if (pnetid && - !smc_pnet_match(ibdev->pnetid[ibport], - pnetid)) - continue; - if (idx++ < start_idx) - continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, - ibdev->pnetid[ibport], - SMC_MAX_PNETID_LEN); - tmp_entry.smcibdev = ibdev; - tmp_entry.ib_port = ibport + 1; - if (smc_pnet_dumpinfo(skb, portid, seq, - NLM_F_MULTI, - &tmp_entry)) { - --idx; - break; - } - } - } - } - spin_unlock(&smc_ib_devices.lock); - - /* dump smcd devices */ - spin_lock(&smcd_dev_list.lock); - list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (smcd_dev->pnetid_by_user) { - if (pnetid && !smc_pnet_match(smcd_dev->pnetid, pnetid)) - continue; - if (idx++ < start_idx) - continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, smcd_dev->pnetid, - SMC_MAX_PNETID_LEN); - tmp_entry.smcd_dev = smcd_dev; - if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, - &tmp_entry)) { - --idx; - break; - } - } - } - spin_unlock(&smcd_dev_list.lock); - return idx; } @@ -659,6 +717,9 @@ static int smc_pnet_netdev_event(struct notifier_block *this, case NETDEV_UNREGISTER: smc_pnet_remove_by_ndev(event_dev); return NOTIFY_OK; + case NETDEV_REGISTER: + smc_pnet_add_by_ndev(event_dev); + return NOTIFY_OK; default: return NOTIFY_DONE; } @@ -744,7 +805,7 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, read_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { - if (ndev == pnetelem->ndev) { + if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) { /* get pnetid of netdev device */ memcpy(pnetid, pnetelem->pnet_name, SMC_MAX_PNETID_LEN); rc = 0; @@ -755,6 +816,45 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, return rc; } +/* find a roce device for the given pnetid */ +static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, + struct smc_init_info *ini, + struct smc_ib_device *known_dev) +{ + struct smc_ib_device *ibdev; + int i; + + ini->ib_dev = NULL; + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + if (ibdev == known_dev) + continue; + for (i = 1; i <= SMC_MAX_PORTS; i++) { + if (!rdma_is_port_valid(ibdev->ibdev, i)) + continue; + if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) && + smc_ib_port_active(ibdev, i) && + !test_bit(i - 1, ibdev->ports_going_away) && + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, + ini->ib_gid, NULL)) { + ini->ib_dev = ibdev; + ini->ib_port = i; + goto out; + } + } + } +out: + spin_unlock(&smc_ib_devices.lock); +} + +/* find alternate roce device with same pnet_id and vlan_id */ +void smc_pnet_find_alt_roce(struct smc_link_group *lgr, + struct smc_init_info *ini, + struct smc_ib_device *known_dev) +{ + _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev); +} + /* if handshake network device belongs to a roce device, return its * IB device and port */ @@ -801,8 +901,6 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; - struct smc_ib_device *ibdev; - int i; ndev = pnet_find_base_ndev(ndev); if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, @@ -811,25 +909,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, smc_pnet_find_rdma_dev(ndev, ini); return; /* pnetid could not be determined */ } - - spin_lock(&smc_ib_devices.lock); - list_for_each_entry(ibdev, &smc_ib_devices.list, list) { - for (i = 1; i <= SMC_MAX_PORTS; i++) { - if (!rdma_is_port_valid(ibdev->ibdev, i)) - continue; - if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) && - smc_ib_port_active(ibdev, i) && - !test_bit(i - 1, ibdev->ports_going_away) && - !smc_ib_determine_gid(ibdev, i, ini->vlan_id, - ini->ib_gid, NULL)) { - ini->ib_dev = ibdev; - ini->ib_port = i; - goto out; - } - } - } -out: - spin_unlock(&smc_ib_devices.lock); + _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL); } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, @@ -895,3 +975,60 @@ out_rel: out: return; } + +/* Lookup and apply a pnet table entry to the given ib device. + */ +int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) +{ + char *ib_name = smcibdev->ibdev->name; + struct smc_pnettable *pnettable; + struct smc_pnetentry *tmp_pe; + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for init namespace */ + sn = net_generic(&init_net, smc_net_id); + pnettable = &sn->pnettable; + + read_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) && + tmp_pe->ib_port == ib_port) { + smc_pnet_apply_ib(smcibdev, ib_port, tmp_pe->pnet_name); + rc = 0; + break; + } + } + read_unlock(&pnettable->lock); + + return rc; +} + +/* Lookup and apply a pnet table entry to the given smcd device. + */ +int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) +{ + const char *ib_name = dev_name(&smcddev->dev); + struct smc_pnettable *pnettable; + struct smc_pnetentry *tmp_pe; + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for init namespace */ + sn = net_generic(&init_net, smc_net_id); + pnettable = &sn->pnettable; + + read_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name); + rc = 0; + break; + } + } + read_unlock(&pnettable->lock); + + return rc; +} diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 4564e4d69c2e..811a65986691 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -19,6 +19,7 @@ struct smc_ib_device; struct smcd_dev; struct smc_init_info; +struct smc_link_group; /** * struct smc_pnettable - SMC PNET table anchor @@ -46,5 +47,9 @@ void smc_pnet_exit(void); void smc_pnet_net_exit(struct net *net); void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini); void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini); - +int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port); +int smc_pnetid_by_table_smcd(struct smcd_dev *smcd); +void smc_pnet_find_alt_roce(struct smc_link_group *lgr, + struct smc_init_info *ini, + struct smc_ib_device *known_dev); #endif diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 9f1ade86d70e..54ba0443847e 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -269,22 +269,21 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, int num_sges, struct ib_rdma_wr *rdma_wr) { struct smc_link_group *lgr = conn->lgr; - struct smc_link *link; + struct smc_link *link = conn->lnk; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link); rdma_wr->wr.num_sge = num_sges; rdma_wr->remote_addr = - lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr + + lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr + /* RMBE within RMB */ conn->tx_off + /* offset within RMBE */ peer_rmbe_offset; - rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; + rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) - smc_lgr_terminate_sched(lgr); + smcr_link_down_cond_sched(link); return rc; } @@ -310,8 +309,10 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, size_t dst_off, size_t dst_len, struct smc_rdma_wr *wr_rdma_buf) { + struct smc_link *link = conn->lnk; + dma_addr_t dma_addr = - sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); + sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl); int src_len_sum = src_len, dst_len_sum = dst_len; int sent_count = src_off; int srcchunk, dstchunk; @@ -481,12 +482,13 @@ static int smc_tx_rdma_writes(struct smc_connection *conn, static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; + struct smc_link *link = conn->lnk; struct smc_rdma_wr *wr_rdma_buf; struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; int rc; - rc = smc_cdc_get_free_slot(conn, &wr_buf, &wr_rdma_buf, &pend); + rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend); if (rc < 0) { if (rc == -EBUSY) { struct smc_sock *smc = @@ -504,10 +506,17 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) } spin_lock_bh(&conn->send_lock); + if (link != conn->lnk) { + /* link of connection changed, tx_work will restart */ + smc_wr_tx_put_slot(link, + (struct smc_wr_tx_pend_priv *)pend); + rc = -ENOLINK; + goto out_unlock; + } if (!pflags->urg_data_present) { rc = smc_tx_rdma_writes(conn, wr_rdma_buf); if (rc) { - smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], + smc_wr_tx_put_slot(link, (struct smc_wr_tx_pend_priv *)pend); goto out_unlock; } diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 337ee52ad3d3..7239ba9b99dc 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -44,6 +44,7 @@ struct smc_wr_tx_pend { /* control data for a pending send request */ struct smc_link *link; u32 idx; struct smc_wr_tx_pend_priv priv; + u8 compl_requested; }; /******************************** send queue *********************************/ @@ -61,7 +62,7 @@ static inline bool smc_wr_is_tx_pend(struct smc_link *link) } /* wait till all pending tx work requests on the given link are completed */ -static inline int smc_wr_tx_wait_no_pending_sends(struct smc_link *link) +int smc_wr_tx_wait_no_pending_sends(struct smc_link *link) { if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link), SMC_WR_TX_WAIT_PENDING_TIME)) @@ -103,6 +104,8 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) if (pnd_snd_idx == link->wr_tx_cnt) return; link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status; + if (link->wr_tx_pends[pnd_snd_idx].compl_requested) + complete(&link->wr_tx_compl[pnd_snd_idx]); memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd)); /* clear the full struct smc_wr_tx_pend including .priv */ memset(&link->wr_tx_pends[pnd_snd_idx], 0, @@ -120,8 +123,8 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) sizeof(link->wr_tx_bufs[i])); clear_bit(i, link->wr_tx_mask); } - /* terminate connections of this link group abnormally */ - smc_lgr_terminate_sched(smc_get_lgr(link)); + /* terminate link */ + smcr_link_down_cond_sched(link); } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); @@ -207,13 +210,13 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, } else { rc = wait_event_interruptible_timeout( link->wr_tx_wait, - link->state == SMC_LNK_INACTIVE || + !smc_link_usable(link) || lgr->terminating || (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); if (!rc) { - /* timeout - terminate connections */ - smc_lgr_terminate_sched(lgr); + /* timeout - terminate link */ + smcr_link_down_cond_sched(link); return -EPIPE; } if (idx == link->wr_tx_cnt) @@ -270,11 +273,38 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { smc_wr_tx_put_slot(link, priv); - smc_lgr_terminate_sched(smc_get_lgr(link)); + smcr_link_down_cond_sched(link); } return rc; } +/* Send prepared WR slot via ib_post_send and wait for send completion + * notification. + * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer + */ +int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, + unsigned long timeout) +{ + struct smc_wr_tx_pend *pend; + int rc; + + pend = container_of(priv, struct smc_wr_tx_pend, priv); + pend->compl_requested = 1; + init_completion(&link->wr_tx_compl[pend->idx]); + + rc = smc_wr_tx_send(link, priv); + if (rc) + return rc; + /* wait for completion by smc_wr_tx_process_cqe() */ + rc = wait_for_completion_interruptible_timeout( + &link->wr_tx_compl[pend->idx], timeout); + if (rc <= 0) + rc = -ENODATA; + if (rc > 0) + rc = 0; + return rc; +} + /* Register a memory region and wait for result. */ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) { @@ -294,8 +324,8 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) (link->wr_reg_state != POSTED), SMC_WR_REG_MR_WAIT_TIME); if (!rc) { - /* timeout - terminate connections */ - smc_lgr_terminate_sched(smc_get_lgr(link)); + /* timeout - terminate link */ + smcr_link_down_cond_sched(link); return -EPIPE; } if (rc == -ERESTARTSYS) @@ -393,10 +423,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) case IB_WC_RETRY_EXC_ERR: case IB_WC_RNR_RETRY_EXC_ERR: case IB_WC_WR_FLUSH_ERR: - /* terminate connections of this link group - * abnormally - */ - smc_lgr_terminate_sched(smc_get_lgr(link)); + smcr_link_down_cond_sched(link); break; default: smc_wr_rx_post(link); /* refill WR RX */ @@ -558,6 +585,8 @@ void smc_wr_free_link(struct smc_link *lnk) void smc_wr_free_link_mem(struct smc_link *lnk) { + kfree(lnk->wr_tx_compl); + lnk->wr_tx_compl = NULL; kfree(lnk->wr_tx_pends); lnk->wr_tx_pends = NULL; kfree(lnk->wr_tx_mask); @@ -628,8 +657,15 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_pends) goto no_mem_wr_tx_mask; + link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT, + sizeof(link->wr_tx_compl[0]), + GFP_KERNEL); + if (!link->wr_tx_compl) + goto no_mem_wr_tx_pends; return 0; +no_mem_wr_tx_pends: + kfree(link->wr_tx_pends); no_mem_wr_tx_mask: kfree(link->wr_tx_mask); no_mem_wr_rx_sges: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 3ac99c898418..423b8709f1c9 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -101,11 +101,14 @@ int smc_wr_tx_put_slot(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv); int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv); +int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, + unsigned long timeout); void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type, smc_wr_tx_filter filter, smc_wr_tx_dismisser dismisser, unsigned long data); +int smc_wr_tx_wait_no_pending_sends(struct smc_link *link); int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); int smc_wr_rx_post_init(struct smc_link *link); |