diff options
Diffstat (limited to 'net')
60 files changed, 1294 insertions, 914 deletions
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index d8e376a5f0f1..36a1a739ad68 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -658,14 +658,30 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args) static void p9_virtio_remove(struct virtio_device *vdev) { struct virtio_chan *chan = vdev->priv; - - if (chan->inuse) - p9_virtio_close(chan->client); - vdev->config->del_vqs(vdev); + unsigned long warning_time; mutex_lock(&virtio_9p_lock); + + /* Remove self from list so we don't get new users. */ list_del(&chan->chan_list); + warning_time = jiffies; + + /* Wait for existing users to close. */ + while (chan->inuse) { + mutex_unlock(&virtio_9p_lock); + msleep(250); + if (time_after(jiffies, warning_time + 10 * HZ)) { + dev_emerg(&vdev->dev, + "p9_virtio_remove: waiting for device in use.\n"); + warning_time = jiffies; + } + mutex_lock(&virtio_9p_lock); + } + mutex_unlock(&virtio_9p_lock); + + vdev->config->del_vqs(vdev); + sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); kobject_uevent(&(vdev->dev.kobj), KOBJ_CHANGE); kfree(chan->tag); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index b087d278c679..1849d96b3c91 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -563,6 +563,8 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) */ del_nbp(p); + dev_set_mtu(br->dev, br_min_mtu(br)); + spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); spin_unlock_bh(&br->lock); diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 769b185fefbd..a6e2da0bc718 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -281,7 +281,7 @@ static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock, int copylen; ret = -EOPNOTSUPP; - if (m->msg_flags&MSG_OOB) + if (flags & MSG_OOB) goto read_error; skb = skb_recv_datagram(sk, flags, 0 , &ret); diff --git a/net/can/af_can.c b/net/can/af_can.c index 66e08040ced7..32d710eaf1fc 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -259,6 +259,9 @@ int can_send(struct sk_buff *skb, int loop) goto inval_skb; } + skb->ip_summed = CHECKSUM_UNNECESSARY; + + skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_transport_header(skb); diff --git a/net/compat.c b/net/compat.c index 94d3d5e97883..f7bd286a8280 100644 --- a/net/compat.c +++ b/net/compat.c @@ -49,6 +49,13 @@ ssize_t get_compat_msghdr(struct msghdr *kmsg, __get_user(kmsg->msg_controllen, &umsg->msg_controllen) || __get_user(kmsg->msg_flags, &umsg->msg_flags)) return -EFAULT; + + if (!uaddr) + kmsg->msg_namelen = 0; + + if (kmsg->msg_namelen < 0) + return -EINVAL; + if (kmsg->msg_namelen > sizeof(struct sockaddr_storage)) kmsg->msg_namelen = sizeof(struct sockaddr_storage); kmsg->msg_control = compat_ptr(tmp3); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 25b4b5d23485..ee0608bb3bc0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2166,28 +2166,28 @@ replay: } } err = rtnl_configure_link(dev, ifm); - if (err < 0) { - if (ops->newlink) { - LIST_HEAD(list_kill); - - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - } else { - unregister_netdevice(dev); - } - goto out; - } - + if (err < 0) + goto out_unregister; if (link_net) { err = dev_change_net_namespace(dev, dest_net, ifname); if (err < 0) - unregister_netdevice(dev); + goto out_unregister; } out: if (link_net) put_net(link_net); put_net(dest_net); return err; +out_unregister: + if (ops->newlink) { + LIST_HEAD(list_kill); + + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + } else { + unregister_netdevice(dev); + } + goto out; } } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f80507823531..8e4ac97c8477 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3733,9 +3733,13 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, struct sock *sk, int tstype) { struct sk_buff *skb; - bool tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; + bool tsonly; - if (!sk || !skb_may_tx_timestamp(sk, tsonly)) + if (!sk) + return; + + tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; + if (!skb_may_tx_timestamp(sk, tsonly)) return; if (tsonly) @@ -4173,7 +4177,7 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet) skb->ignore_df = 0; skb_dst_drop(skb); skb->mark = 0; - skb->sender_cpu = 0; + skb_sender_cpu_clear(skb); skb_init_secmark(skb); secpath_reset(skb); nf_reset(skb); diff --git a/net/core/sock.c b/net/core/sock.c index 93c8b20c91e4..78e89eb7eb70 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1655,6 +1655,10 @@ void sock_rfree(struct sk_buff *skb) } EXPORT_SYMBOL(sock_rfree); +/* + * Buffer destructor for skbs that are not used directly in read or write + * path, e.g. for error handler skbs. Automatically called from kfree_skb. + */ void sock_efree(struct sk_buff *skb) { sock_put(skb->sk); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 433424804284..8ce351ffceb1 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -25,6 +25,8 @@ static int zero = 0; static int one = 1; static int ushort_max = USHRT_MAX; +static int min_sndbuf = SOCK_MIN_SNDBUF; +static int min_rcvbuf = SOCK_MIN_RCVBUF; static int net_msg_warn; /* Unused, but still a sysctl */ @@ -237,7 +239,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_sndbuf, }, { .procname = "rmem_max", @@ -245,7 +247,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_rcvbuf, }, { .procname = "wmem_default", @@ -253,7 +255,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_sndbuf, }, { .procname = "rmem_default", @@ -261,7 +263,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &one, + .extra1 = &min_rcvbuf, }, { .procname = "dev_weight", diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 14d02ea905b6..3e44b9b0b78e 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -268,6 +268,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) release_sock(sk); if (reqsk_queue_empty(&icsk->icsk_accept_queue)) timeo = schedule_timeout(timeo); + sched_annotate_sleep(); lock_sock(sk); err = 0; if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 81751f12645f..592aff37366b 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -71,6 +71,20 @@ static inline void inet_diag_unlock_handler( mutex_unlock(&inet_diag_table_mutex); } +static size_t inet_sk_attr_size(void) +{ + return nla_total_size(sizeof(struct tcp_info)) + + nla_total_size(1) /* INET_DIAG_SHUTDOWN */ + + nla_total_size(1) /* INET_DIAG_TOS */ + + nla_total_size(1) /* INET_DIAG_TCLASS */ + + nla_total_size(sizeof(struct inet_diag_meminfo)) + + nla_total_size(sizeof(struct inet_diag_msg)) + + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) + + nla_total_size(TCP_CA_NAME_MAX) + + nla_total_size(sizeof(struct tcpvegas_info)) + + 64; +} + int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct sk_buff *skb, struct inet_diag_req_v2 *req, struct user_namespace *user_ns, @@ -326,9 +340,7 @@ int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *in_s if (err) goto out; - rep = nlmsg_new(sizeof(struct inet_diag_msg) + - sizeof(struct inet_diag_meminfo) + - sizeof(struct tcp_info) + 64, GFP_KERNEL); + rep = nlmsg_new(inet_sk_attr_size(), GFP_KERNEL); if (!rep) { err = -ENOMEM; goto out; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 787b3c294ce6..d9bc28ac5d1b 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -67,6 +67,7 @@ static int ip_forward_finish(struct sk_buff *skb) if (unlikely(opt->optlen)) ip_forward_options(skb); + skb_sender_cpu_clear(skb); return dst_output(skb); } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 2c8d98e728c0..145a50c4d566 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -659,27 +659,30 @@ EXPORT_SYMBOL(ip_defrag); struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user) { struct iphdr iph; + int netoff; u32 len; if (skb->protocol != htons(ETH_P_IP)) return skb; - if (skb_copy_bits(skb, 0, &iph, sizeof(iph)) < 0) + netoff = skb_network_offset(skb); + + if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0) return skb; if (iph.ihl < 5 || iph.version != 4) return skb; len = ntohs(iph.tot_len); - if (skb->len < len || len < (iph.ihl * 4)) + if (skb->len < netoff + len || len < (iph.ihl * 4)) return skb; if (ip_is_fragment(&iph)) { skb = skb_share_check(skb, GFP_ATOMIC); if (skb) { - if (!pskb_may_pull(skb, iph.ihl*4)) + if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) return skb; - if (pskb_trim_rcsum(skb, len)) + if (pskb_trim_rcsum(skb, netoff + len)) return skb; memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); if (ip_defrag(skb, user)) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 31d8c71986b4..5cd99271d3a6 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -432,17 +432,32 @@ void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 inf kfree_skb(skb); } -static bool ipv4_pktinfo_prepare_errqueue(const struct sock *sk, - const struct sk_buff *skb, - int ee_origin) +/* IPv4 supports cmsg on all imcp errors and some timestamps + * + * Timestamp code paths do not initialize the fields expected by cmsg: + * the PKTINFO fields in skb->cb[]. Fill those in here. + */ +static bool ipv4_datagram_support_cmsg(const struct sock *sk, + struct sk_buff *skb, + int ee_origin) { - struct in_pktinfo *info = PKTINFO_SKB_CB(skb); + struct in_pktinfo *info; + + if (ee_origin == SO_EE_ORIGIN_ICMP) + return true; - if ((ee_origin != SO_EE_ORIGIN_TIMESTAMPING) || - (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) || + if (ee_origin == SO_EE_ORIGIN_LOCAL) + return false; + + /* Support IP_PKTINFO on tstamp packets if requested, to correlate + * timestamp with egress dev. Not possible for packets without dev + * or without payload (SOF_TIMESTAMPING_OPT_TSONLY). + */ + if ((!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) || (!skb->dev)) return false; + info = PKTINFO_SKB_CB(skb); info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr; info->ipi_ifindex = skb->dev->ifindex; return true; @@ -483,7 +498,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - if (sin && skb->len) { + if (sin && serr->port) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); @@ -496,9 +511,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) sin = &errhdr.offender; memset(sin, 0, sizeof(*sin)); - if (skb->len && - (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || - ipv4_pktinfo_prepare_errqueue(sk, skb, serr->ee.ee_origin))) { + if (ipv4_datagram_support_cmsg(sk, skb, serr->ee.ee_origin)) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; if (inet_sk(sk)->cmsg_flags) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 99e810f84671..cf5e82f39d3b 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -272,9 +272,9 @@ static void trace_packet(const struct sk_buff *skb, &chainname, &comment, &rulenum) != 0) break; - nf_log_packet(net, AF_INET, hook, skb, in, out, &trace_loginfo, - "TRACE: %s:%s:%s:%u ", - tablename, chainname, comment, rulenum); + nf_log_trace(net, AF_INET, hook, skb, in, out, &trace_loginfo, + "TRACE: %s:%s:%s:%u ", + tablename, chainname, comment, rulenum); } #endif diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index e9f66e1cda50..208d5439e59b 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -259,6 +259,9 @@ int ping_init_sock(struct sock *sk) kgid_t low, high; int ret = 0; + if (sk->sk_family == AF_INET6) + sk->sk_ipv6only = 1; + inet_get_ping_group_range_net(net, &low, &high); if (gid_lte(low, group) && gid_lte(group, high)) return 0; @@ -305,6 +308,11 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, if (addr_len < sizeof(*addr)) return -EINVAL; + if (addr->sin_family != AF_INET && + !(addr->sin_family == AF_UNSPEC && + addr->sin_addr.s_addr == htonl(INADDR_ANY))) + return -EAFNOSUPPORT; + pr_debug("ping_check_bind_addr(sk=%p,addr=%pI4,port=%d)\n", sk, &addr->sin_addr.s_addr, ntohs(addr->sin_port)); @@ -330,7 +338,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, return -EINVAL; if (addr->sin6_family != AF_INET6) - return -EINVAL; + return -EAFNOSUPPORT; pr_debug("ping_check_bind_addr(sk=%p,addr=%pI6c,port=%d)\n", sk, addr->sin6_addr.s6_addr, ntohs(addr->sin6_port)); @@ -716,7 +724,7 @@ static int ping_v4_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) - return -EINVAL; + return -EAFNOSUPPORT; daddr = usin->sin_addr.s_addr; /* no remote port */ } else { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9d72a0fcd928..995a2259bcfc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -835,17 +835,13 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); - u32 new_size_goal, size_goal, hlen; + u32 new_size_goal, size_goal; if (!large_allowed || !sk_can_gso(sk)) return mss_now; - /* Maybe we should/could use sk->sk_prot->max_header here ? */ - hlen = inet_csk(sk)->icsk_af_ops->net_header_len + - inet_csk(sk)->icsk_ext_hdr_len + - tp->tcp_header_len; - - new_size_goal = sk->sk_gso_max_size - 1 - hlen; + /* Note : tcp_tso_autosize() will eventually split this later */ + new_size_goal = sk->sk_gso_max_size - 1 - MAX_TCP_HEADER; new_size_goal = tcp_bound_to_half_wnd(tp, new_size_goal); /* We try hard to avoid divides here */ diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index d694088214cd..62856e185a93 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -378,6 +378,12 @@ EXPORT_SYMBOL_GPL(tcp_slow_start); */ void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) { + /* If credits accumulated at a higher w, apply them gently now. */ + if (tp->snd_cwnd_cnt >= w) { + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd++; + } + tp->snd_cwnd_cnt += acked; if (tp->snd_cwnd_cnt >= w) { u32 delta = tp->snd_cwnd_cnt / w; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 4b276d1ed980..06d3d665a9fd 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -306,8 +306,10 @@ tcp_friendliness: } } - if (ca->cnt == 0) /* cannot be zero */ - ca->cnt = 1; + /* The maximum rate of cwnd increase CUBIC allows is 1 packet per + * 2 packets ACKed, meaning cwnd grows at 1.5x per RTT. + */ + ca->cnt = max(ca->cnt, 2U); } static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a2a796c5536b..1db253e36045 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2773,15 +2773,11 @@ void tcp_send_fin(struct sock *sk) } else { /* Socket is locked, keep trying until memory is available. */ for (;;) { - skb = alloc_skb_fclone(MAX_TCP_HEADER, - sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); if (skb) break; yield(); } - - /* Reserve space for headers and prepare control bits. */ - skb_reserve(skb, MAX_TCP_HEADER); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index d5f6bd9a210a..dab73813cb92 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -63,6 +63,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) return err; IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; + skb->protocol = htons(ETH_P_IP); return x->outer_mode->output2(x, skb); } @@ -71,7 +72,6 @@ EXPORT_SYMBOL(xfrm4_prepare_output); int xfrm4_output_finish(struct sk_buff *skb) { memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - skb->protocol = htons(ETH_P_IP); #ifdef CONFIG_NETFILTER IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index c215be70cac0..ace8daca5c83 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -325,14 +325,34 @@ void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu) kfree_skb(skb); } -static void ip6_datagram_prepare_pktinfo_errqueue(struct sk_buff *skb) +/* IPv6 supports cmsg on all origins aside from SO_EE_ORIGIN_LOCAL. + * + * At one point, excluding local errors was a quick test to identify icmp/icmp6 + * errors. This is no longer true, but the test remained, so the v6 stack, + * unlike v4, also honors cmsg requests on all wifi and timestamp errors. + * + * Timestamp code paths do not initialize the fields expected by cmsg: + * the PKTINFO fields in skb->cb[]. Fill those in here. + */ +static bool ip6_datagram_support_cmsg(struct sk_buff *skb, + struct sock_exterr_skb *serr) { - int ifindex = skb->dev ? skb->dev->ifindex : -1; + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP || + serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) + return true; + + if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) + return false; + + if (!skb->dev) + return false; if (skb->protocol == htons(ETH_P_IPV6)) - IP6CB(skb)->iif = ifindex; + IP6CB(skb)->iif = skb->dev->ifindex; else - PKTINFO_SKB_CB(skb)->ipi_ifindex = ifindex; + PKTINFO_SKB_CB(skb)->ipi_ifindex = skb->dev->ifindex; + + return true; } /* @@ -369,7 +389,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) serr = SKB_EXT_ERR(skb); - if (sin && skb->len) { + if (sin && serr->port) { const unsigned char *nh = skb_network_header(skb); sin->sin6_family = AF_INET6; sin->sin6_flowinfo = 0; @@ -394,14 +414,11 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); sin = &errhdr.offender; memset(sin, 0, sizeof(*sin)); - if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL && skb->len) { + + if (ip6_datagram_support_cmsg(skb, serr)) { sin->sin6_family = AF_INET6; - if (np->rxopt.all) { - if (serr->ee.ee_origin != SO_EE_ORIGIN_ICMP && - serr->ee.ee_origin != SO_EE_ORIGIN_ICMP6) - ip6_datagram_prepare_pktinfo_errqueue(skb); + if (np->rxopt.all) ip6_datagram_recv_common_ctl(sk, msg, skb); - } if (skb->protocol == htons(ETH_P_IPV6)) { sin->sin6_addr = ipv6_hdr(skb)->saddr; if (np->rxopt.all) diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index b4d5e1d97c1b..27ca79682efb 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -104,6 +104,7 @@ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, goto again; flp6->saddr = saddr; } + err = rt->dst.error; goto out; } again: diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0a04a37305d5..7e80b61b51ff 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -318,6 +318,7 @@ static int ip6_forward_proxy_check(struct sk_buff *skb) static inline int ip6_forward_finish(struct sk_buff *skb) { + skb_sender_cpu_clear(skb); return dst_output(skb); } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 266a264ec212..ddd94eca19b3 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -314,7 +314,7 @@ out: * Create tunnel matching given parameters. * * Return: - * created tunnel or NULL + * created tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) @@ -322,7 +322,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) struct net_device *dev; struct ip6_tnl *t; char name[IFNAMSIZ]; - int err; + int err = -ENOMEM; if (p->name[0]) strlcpy(name, p->name, IFNAMSIZ); @@ -348,7 +348,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) failed_free: ip6_dev_free(dev); failed: - return NULL; + return ERR_PTR(err); } /** @@ -362,7 +362,7 @@ failed: * tunnel device is created and registered for use. * * Return: - * matching tunnel or NULL + * matching tunnel or error pointer **/ static struct ip6_tnl *ip6_tnl_locate(struct net *net, @@ -380,13 +380,13 @@ static struct ip6_tnl *ip6_tnl_locate(struct net *net, if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr)) { if (create) - return NULL; + return ERR_PTR(-EEXIST); return t; } } if (!create) - return NULL; + return ERR_PTR(-ENODEV); return ip6_tnl_create(net, p); } @@ -1420,7 +1420,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); - if (t == NULL) + if (IS_ERR(t)) t = netdev_priv(dev); } else { memset(&p, 0, sizeof(p)); @@ -1445,7 +1445,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, cmd == SIOCADDTUNNEL); if (cmd == SIOCCHGTUNNEL) { - if (t != NULL) { + if (!IS_ERR(t)) { if (t->dev != dev) { err = -EEXIST; break; @@ -1457,14 +1457,15 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) else err = ip6_tnl_update(t, &p1); } - if (t) { + if (!IS_ERR(t)) { err = 0; ip6_tnl_parm_to_user(&p, &t->parms); if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) err = -EFAULT; - } else - err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); + } else { + err = PTR_ERR(t); + } break; case SIOCDELTUNNEL: err = -EPERM; @@ -1478,7 +1479,7 @@ ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) err = -ENOENT; ip6_tnl_parm_from_user(&p1, &p); t = ip6_tnl_locate(net, &p1, 0); - if (t == NULL) + if (IS_ERR(t)) break; err = -EPERM; if (t->dev == ip6n->fb_tnl_dev) @@ -1672,12 +1673,13 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct net *net = dev_net(dev); - struct ip6_tnl *nt; + struct ip6_tnl *nt, *t; nt = netdev_priv(dev); ip6_tnl_netlink_parms(data, &nt->parms); - if (ip6_tnl_locate(net, &nt->parms, 0)) + t = ip6_tnl_locate(net, &nt->parms, 0); + if (!IS_ERR(t)) return -EEXIST; return ip6_tnl_create2(dev); @@ -1697,8 +1699,7 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], ip6_tnl_netlink_parms(data, &p); t = ip6_tnl_locate(net, &p, 0); - - if (t) { + if (!IS_ERR(t)) { if (t->dev != dev) return -EEXIST; } else diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index e080fbbbc0e5..bb00c6f2a885 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -298,9 +298,9 @@ static void trace_packet(const struct sk_buff *skb, &chainname, &comment, &rulenum) != 0) break; - nf_log_packet(net, AF_INET6, hook, skb, in, out, &trace_loginfo, - "TRACE: %s:%s:%s:%u ", - tablename, chainname, comment, rulenum); + nf_log_trace(net, AF_INET6, hook, skb, in, out, &trace_loginfo, + "TRACE: %s:%s:%s:%u ", + tablename, chainname, comment, rulenum); } #endif diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index bd46f736f61d..a2dfff6ff227 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -102,9 +102,10 @@ int ping_v6_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_in6 *, u, msg->msg_name); - if (msg->msg_namelen < sizeof(struct sockaddr_in6) || - u->sin6_family != AF_INET6) { + if (msg->msg_namelen < sizeof(*u)) return -EINVAL; + if (u->sin6_family != AF_INET6) { + return -EAFNOSUPPORT; } if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != u->sin6_scope_id) { diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index ab889bb16b3c..be2c0ba82c85 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -112,11 +112,9 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); fptr->nexthdr = nexthdr; fptr->reserved = 0; - if (skb_shinfo(skb)->ip6_frag_id) - fptr->identification = skb_shinfo(skb)->ip6_frag_id; - else - ipv6_select_ident(fptr, - (struct rt6_info *)skb_dst(skb)); + if (!skb_shinfo(skb)->ip6_frag_id) + ipv6_proxy_select_ident(skb); + fptr->identification = skb_shinfo(skb)->ip6_frag_id; /* Fragment the skb. ipv6 header and the remaining fields of the * fragment header are updated in ipv6_gso_segment() diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index ca3f29b98ae5..010f8bd2d577 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -114,6 +114,7 @@ int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) return err; skb->ignore_df = 1; + skb->protocol = htons(ETH_P_IPV6); return x->outer_mode->output2(x, skb); } @@ -122,7 +123,6 @@ EXPORT_SYMBOL(xfrm6_prepare_output); int xfrm6_output_finish(struct sk_buff *skb) { memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); - skb->protocol = htons(ETH_P_IPV6); #ifdef CONFIG_NETFILTER IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 48bf5a06847b..8d2d01b4800a 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -200,6 +200,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPPROTO_MH: + offset += ipv6_optlen(exthdr); if (!onlyproto && pskb_may_pull(skb, nh + offset + 3 - skb->data)) { struct ip6_mh *mh; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 3afe36824703..8d53d65bd2ab 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -58,13 +58,24 @@ struct ieee80211_local; #define IEEE80211_UNSET_POWER_LEVEL INT_MIN /* - * Some APs experience problems when working with U-APSD. Decrease the - * probability of that happening by using legacy mode for all ACs but VO. - * The AP that caused us trouble was a Cisco 4410N. It ignores our - * setting, and always treats non-VO ACs as legacy. + * Some APs experience problems when working with U-APSD. Decreasing the + * probability of that happening by using legacy mode for all ACs but VO isn't + * enough. + * + * Cisco 4410N originally forced us to enable VO by default only because it + * treated non-VO ACs as legacy. + * + * However some APs (notably Netgear R7000) silently reclassify packets to + * different ACs. Since u-APSD ACs require trigger frames for frame retrieval + * clients would never see some frames (e.g. ARP responses) or would fetch them + * accidentally after a long time. + * + * It makes little sense to enable u-APSD queues by default because it needs + * userspace applications to be aware of it to actually take advantage of the + * possible additional powersavings. Implicitly depending on driver autotrigger + * frame support doesn't make much sense. */ -#define IEEE80211_DEFAULT_UAPSD_QUEUES \ - IEEE80211_WMM_IE_STA_QOSINFO_AC_VO +#define IEEE80211_DEFAULT_UAPSD_QUEUES 0 #define IEEE80211_DEFAULT_MAX_SP_LEN \ IEEE80211_WMM_IE_STA_QOSINFO_SP_ALL @@ -453,6 +464,7 @@ struct ieee80211_if_managed { unsigned int flags; bool csa_waiting_bcn; + bool csa_ignored_same_chan; bool beacon_crc_valid; u32 beacon_crc; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 10ac6324c1d0..142f66aece18 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1150,6 +1150,17 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, return; } + if (cfg80211_chandef_identical(&csa_ie.chandef, + &sdata->vif.bss_conf.chandef)) { + if (ifmgd->csa_ignored_same_chan) + return; + sdata_info(sdata, + "AP %pM tries to chanswitch to same channel, ignore\n", + ifmgd->associated->bssid); + ifmgd->csa_ignored_same_chan = true; + return; + } + mutex_lock(&local->mtx); mutex_lock(&local->chanctx_mtx); conf = rcu_dereference_protected(sdata->vif.chanctx_conf, @@ -1210,6 +1221,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, sdata->vif.csa_active = true; sdata->csa_chandef = csa_ie.chandef; sdata->csa_block_tx = csa_ie.mode; + ifmgd->csa_ignored_same_chan = false; if (sdata->csa_block_tx) ieee80211_stop_vif_queues(local, sdata, @@ -2090,6 +2102,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, sdata->vif.csa_active = false; ifmgd->csa_waiting_bcn = false; + ifmgd->csa_ignored_same_chan = false; if (sdata->csa_block_tx) { ieee80211_wake_vif_queues(local, sdata, IEEE80211_QUEUE_STOP_REASON_CSA); @@ -3204,7 +3217,8 @@ static const u64 care_about_ies = (1ULL << WLAN_EID_CHANNEL_SWITCH) | (1ULL << WLAN_EID_PWR_CONSTRAINT) | (1ULL << WLAN_EID_HT_CAPABILITY) | - (1ULL << WLAN_EID_HT_OPERATION); + (1ULL << WLAN_EID_HT_OPERATION) | + (1ULL << WLAN_EID_EXT_CHANSWITCH_ANN); static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 1101563357ea..944bdc04e913 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2214,6 +2214,9 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) hdr = (struct ieee80211_hdr *) skb->data; mesh_hdr = (struct ieee80211s_hdr *) (skb->data + hdrlen); + if (ieee80211_drop_unencrypted(rx, hdr->frame_control)) + return RX_DROP_MONITOR; + /* frame is in RMC, don't forward */ if (ieee80211_is_data(hdr->frame_control) && is_multicast_ether_addr(hdr->addr1) && diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 8428f4a95479..747bdcf72e92 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3178,7 +3178,7 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, wdev_iter = &sdata_iter->wdev; if (sdata_iter == sdata || - rcu_access_pointer(sdata_iter->vif.chanctx_conf) == NULL || + !ieee80211_sdata_running(sdata_iter) || local->hw.wiphy->software_iftypes & BIT(wdev_iter->iftype)) continue; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index c47ffd7a0a70..d93ceeb3ef04 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -896,6 +896,8 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param, IP_VS_DBG(2, "BACKUP, add new conn. failed\n"); return; } + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + kfree(param->pe_data); } if (opt) @@ -1169,6 +1171,7 @@ static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end) (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL) ); #endif + ip_vs_pe_put(param.pe); return 0; /* Error exit */ out: diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 0d8448f19dfe..675d12c69e32 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -212,6 +212,30 @@ void nf_log_packet(struct net *net, } EXPORT_SYMBOL(nf_log_packet); +void nf_log_trace(struct net *net, + u_int8_t pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct nf_loginfo *loginfo, const char *fmt, ...) +{ + va_list args; + char prefix[NF_LOG_PREFIXLEN]; + const struct nf_logger *logger; + + rcu_read_lock(); + logger = rcu_dereference(net->nf.nf_loggers[pf]); + if (logger) { + va_start(args, fmt); + vsnprintf(prefix, sizeof(prefix), fmt, args); + va_end(args); + logger->logfn(net, pf, hooknum, skb, in, out, loginfo, prefix); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(nf_log_trace); + #define S_SIZE (1024 - (sizeof(unsigned int) + 1)) struct nf_log_buf { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 199fd0f27b0e..ac1a9528dbf2 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -227,7 +227,7 @@ nft_rule_deactivate_next(struct net *net, struct nft_rule *rule) static inline void nft_rule_clear(struct net *net, struct nft_rule *rule) { - rule->genmask = 0; + rule->genmask &= ~(1 << gencursor_next(net)); } static int @@ -1225,7 +1225,10 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb, if (nla[NFTA_CHAIN_POLICY]) { if ((chain != NULL && - !(chain->flags & NFT_BASE_CHAIN)) || + !(chain->flags & NFT_BASE_CHAIN))) + return -EOPNOTSUPP; + + if (chain == NULL && nla[NFTA_CHAIN_HOOK] == NULL) return -EOPNOTSUPP; @@ -1711,9 +1714,12 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, } nla_nest_end(skb, list); - if (rule->ulen && - nla_put(skb, NFTA_RULE_USERDATA, rule->ulen, nft_userdata(rule))) - goto nla_put_failure; + if (rule->udata) { + struct nft_userdata *udata = nft_userdata(rule); + if (nla_put(skb, NFTA_RULE_USERDATA, udata->len + 1, + udata->data) < 0) + goto nla_put_failure; + } nlmsg_end(skb, nlh); return 0; @@ -1896,11 +1902,12 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, struct nft_table *table; struct nft_chain *chain; struct nft_rule *rule, *old_rule = NULL; + struct nft_userdata *udata; struct nft_trans *trans = NULL; struct nft_expr *expr; struct nft_ctx ctx; struct nlattr *tmp; - unsigned int size, i, n, ulen = 0; + unsigned int size, i, n, ulen = 0, usize = 0; int err, rem; bool create; u64 handle, pos_handle; @@ -1968,12 +1975,19 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, n++; } } + /* Check for overflow of dlen field */ + err = -EFBIG; + if (size >= 1 << 12) + goto err1; - if (nla[NFTA_RULE_USERDATA]) + if (nla[NFTA_RULE_USERDATA]) { ulen = nla_len(nla[NFTA_RULE_USERDATA]); + if (ulen > 0) + usize = sizeof(struct nft_userdata) + ulen; + } err = -ENOMEM; - rule = kzalloc(sizeof(*rule) + size + ulen, GFP_KERNEL); + rule = kzalloc(sizeof(*rule) + size + usize, GFP_KERNEL); if (rule == NULL) goto err1; @@ -1981,10 +1995,13 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, rule->handle = handle; rule->dlen = size; - rule->ulen = ulen; + rule->udata = ulen ? 1 : 0; - if (ulen) - nla_memcpy(nft_userdata(rule), nla[NFTA_RULE_USERDATA], ulen); + if (ulen) { + udata = nft_userdata(rule); + udata->len = ulen - 1; + nla_memcpy(udata->data, nla[NFTA_RULE_USERDATA], ulen); + } expr = nft_expr_first(rule); for (i = 0; i < n; i++) { @@ -2031,12 +2048,6 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb, err3: list_del_rcu(&rule->list); - if (trans) { - list_del_rcu(&nft_trans_rule(trans)->list); - nft_rule_clear(net, nft_trans_rule(trans)); - nft_trans_destroy(trans); - chain->use++; - } err2: nf_tables_rule_destroy(&ctx, rule); err1: @@ -3612,12 +3623,11 @@ static int nf_tables_commit(struct sk_buff *skb) &te->elem, NFT_MSG_DELSETELEM, 0); te->set->ops->get(te->set, &te->elem); - te->set->ops->remove(te->set, &te->elem); nft_data_uninit(&te->elem.key, NFT_DATA_VALUE); - if (te->elem.flags & NFT_SET_MAP) { - nft_data_uninit(&te->elem.data, - te->set->dtype); - } + if (te->set->flags & NFT_SET_MAP && + !(te->elem.flags & NFT_SET_ELEM_INTERVAL_END)) + nft_data_uninit(&te->elem.data, te->set->dtype); + te->set->ops->remove(te->set, &te->elem); nft_trans_destroy(trans); break; } @@ -3658,7 +3668,7 @@ static int nf_tables_abort(struct sk_buff *skb) { struct net *net = sock_net(skb->sk); struct nft_trans *trans, *next; - struct nft_set *set; + struct nft_trans_elem *te; list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { switch (trans->msg_type) { @@ -3719,9 +3729,13 @@ static int nf_tables_abort(struct sk_buff *skb) break; case NFT_MSG_NEWSETELEM: nft_trans_elem_set(trans)->nelems--; - set = nft_trans_elem_set(trans); - set->ops->get(set, &nft_trans_elem(trans)); - set->ops->remove(set, &nft_trans_elem(trans)); + te = (struct nft_trans_elem *)trans->data; + te->set->ops->get(te->set, &te->elem); + nft_data_uninit(&te->elem.key, NFT_DATA_VALUE); + if (te->set->flags & NFT_SET_MAP && + !(te->elem.flags & NFT_SET_ELEM_INTERVAL_END)) + nft_data_uninit(&te->elem.data, te->set->dtype); + te->set->ops->remove(te->set, &te->elem); nft_trans_destroy(trans); break; case NFT_MSG_DELSETELEM: diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 3b90eb2b2c55..2d298dccb6dd 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -94,10 +94,10 @@ static void nft_trace_packet(const struct nft_pktinfo *pkt, { struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); - nf_log_packet(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in, - pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", - chain->table->name, chain->name, comments[type], - rulenum); + nf_log_trace(net, pkt->xt.family, pkt->ops->hooknum, pkt->skb, pkt->in, + pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", + chain->table->name, chain->name, comments[type], + rulenum); } unsigned int diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index a5599fc51a6f..54330fb5efaf 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -77,6 +77,9 @@ nfnl_cthelper_parse_tuple(struct nf_conntrack_tuple *tuple, if (!tb[NFCTH_TUPLE_L3PROTONUM] || !tb[NFCTH_TUPLE_L4PROTONUM]) return -EINVAL; + /* Not all fields are initialized so first zero the tuple */ + memset(tuple, 0, sizeof(struct nf_conntrack_tuple)); + tuple->src.l3num = ntohs(nla_get_be16(tb[NFCTH_TUPLE_L3PROTONUM])); tuple->dst.protonum = nla_get_u8(tb[NFCTH_TUPLE_L4PROTONUM]); diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 1279cd85663e..65f3e2b6be44 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -123,7 +123,7 @@ static void nft_target_set_tgchk_param(struct xt_tgchk_param *par, const struct nft_ctx *ctx, struct xt_target *target, void *info, - union nft_entry *entry, u8 proto, bool inv) + union nft_entry *entry, u16 proto, bool inv) { par->net = ctx->net; par->table = ctx->table->name; @@ -133,11 +133,14 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; break; case AF_INET6: + if (proto) + entry->e6.ipv6.flags |= IP6T_F_PROTO; + entry->e6.ipv6.proto = proto; entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0; break; case NFPROTO_BRIDGE: - entry->ebt.ethproto = proto; + entry->ebt.ethproto = (__force __be16)proto; entry->ebt.invflags = inv ? EBT_IPROTO : 0; break; } @@ -171,7 +174,7 @@ static const struct nla_policy nft_rule_compat_policy[NFTA_RULE_COMPAT_MAX + 1] [NFTA_RULE_COMPAT_FLAGS] = { .type = NLA_U32 }, }; -static int nft_parse_compat(const struct nlattr *attr, u8 *proto, bool *inv) +static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) { struct nlattr *tb[NFTA_RULE_COMPAT_MAX+1]; u32 flags; @@ -203,7 +206,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, struct xt_target *target = expr->ops->data; struct xt_tgchk_param par; size_t size = XT_ALIGN(nla_len(tb[NFTA_TARGET_INFO])); - u8 proto = 0; + u16 proto = 0; bool inv = false; union nft_entry e = {}; int ret; @@ -334,7 +337,7 @@ static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = { static void nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, struct xt_match *match, void *info, - union nft_entry *entry, u8 proto, bool inv) + union nft_entry *entry, u16 proto, bool inv) { par->net = ctx->net; par->table = ctx->table->name; @@ -344,11 +347,14 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, entry->e4.ip.invflags = inv ? IPT_INV_PROTO : 0; break; case AF_INET6: + if (proto) + entry->e6.ipv6.flags |= IP6T_F_PROTO; + entry->e6.ipv6.proto = proto; entry->e6.ipv6.invflags = inv ? IP6T_INV_PROTO : 0; break; case NFPROTO_BRIDGE: - entry->ebt.ethproto = proto; + entry->ebt.ethproto = (__force __be16)proto; entry->ebt.invflags = inv ? EBT_IPROTO : 0; break; } @@ -385,7 +391,7 @@ nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, struct xt_match *match = expr->ops->data; struct xt_mtchk_param par; size_t size = XT_ALIGN(nla_len(tb[NFTA_MATCH_INFO])); - u8 proto = 0; + u16 proto = 0; bool inv = false; union nft_entry e = {}; int ret; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index c82df0a48fcd..37c15e674884 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -153,6 +153,8 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, iter->err = err; goto out; } + + continue; } if (iter->count < iter->skip) diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index ef8a926752a9..50e1e5aaf4ce 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -513,8 +513,8 @@ static int tproxy_tg6_check(const struct xt_tgchk_param *par) { const struct ip6t_ip6 *i = par->entryinfo; - if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) - && !(i->flags & IP6T_INV_PROTO)) + if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) && + !(i->invflags & IP6T_INV_PROTO)) return 0; pr_info("Can be used only in combination with " diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 5bf1e968a728..f8db7064d81c 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3123,11 +3123,18 @@ static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, return 0; } -static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what) +static void packet_dev_mclist_delete(struct net_device *dev, + struct packet_mclist **mlp) { - for ( ; i; i = i->next) { - if (i->ifindex == dev->ifindex) - packet_dev_mc(dev, i, what); + struct packet_mclist *ml; + + while ((ml = *mlp) != NULL) { + if (ml->ifindex == dev->ifindex) { + packet_dev_mc(dev, ml, -1); + *mlp = ml->next; + kfree(ml); + } else + mlp = &ml->next; } } @@ -3204,12 +3211,11 @@ static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq) packet_dev_mc(dev, ml, -1); kfree(ml); } - rtnl_unlock(); - return 0; + break; } } rtnl_unlock(); - return -EADDRNOTAVAIL; + return 0; } static void packet_flush_mclist(struct sock *sk) @@ -3559,7 +3565,7 @@ static int packet_notifier(struct notifier_block *this, switch (msg) { case NETDEV_UNREGISTER: if (po->mclist) - packet_dev_mclist(dev, po->mclist, -1); + packet_dev_mclist_delete(dev, &po->mclist); /* fallthrough */ case NETDEV_DOWN: diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c index a817705ce2d0..dba8d0864f18 100644 --- a/net/rds/iw_rdma.c +++ b/net/rds/iw_rdma.c @@ -88,7 +88,9 @@ static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool, int *unpinned); static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr); -static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id) +static int rds_iw_get_device(struct sockaddr_in *src, struct sockaddr_in *dst, + struct rds_iw_device **rds_iwdev, + struct rdma_cm_id **cm_id) { struct rds_iw_device *iwdev; struct rds_iw_cm_id *i_cm_id; @@ -112,15 +114,15 @@ static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwd src_addr->sin_port, dst_addr->sin_addr.s_addr, dst_addr->sin_port, - rs->rs_bound_addr, - rs->rs_bound_port, - rs->rs_conn_addr, - rs->rs_conn_port); + src->sin_addr.s_addr, + src->sin_port, + dst->sin_addr.s_addr, + dst->sin_port); #ifdef WORKING_TUPLE_DETECTION - if (src_addr->sin_addr.s_addr == rs->rs_bound_addr && - src_addr->sin_port == rs->rs_bound_port && - dst_addr->sin_addr.s_addr == rs->rs_conn_addr && - dst_addr->sin_port == rs->rs_conn_port) { + if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr && + src_addr->sin_port == src->sin_port && + dst_addr->sin_addr.s_addr == dst->sin_addr.s_addr && + dst_addr->sin_port == dst->sin_port) { #else /* FIXME - needs to compare the local and remote * ipaddr/port tuple, but the ipaddr is the only @@ -128,7 +130,7 @@ static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwd * zero'ed. It doesn't appear to be properly populated * during connection setup... */ - if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) { + if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr) { #endif spin_unlock_irq(&iwdev->spinlock); *rds_iwdev = iwdev; @@ -180,19 +182,13 @@ int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_i { struct sockaddr_in *src_addr, *dst_addr; struct rds_iw_device *rds_iwdev_old; - struct rds_sock rs; struct rdma_cm_id *pcm_id; int rc; src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr; dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr; - rs.rs_bound_addr = src_addr->sin_addr.s_addr; - rs.rs_bound_port = src_addr->sin_port; - rs.rs_conn_addr = dst_addr->sin_addr.s_addr; - rs.rs_conn_port = dst_addr->sin_port; - - rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id); + rc = rds_iw_get_device(src_addr, dst_addr, &rds_iwdev_old, &pcm_id); if (rc) rds_iw_remove_cm_id(rds_iwdev, cm_id); @@ -598,9 +594,17 @@ void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_iw_device *rds_iwdev; struct rds_iw_mr *ibmr = NULL; struct rdma_cm_id *cm_id; + struct sockaddr_in src = { + .sin_addr.s_addr = rs->rs_bound_addr, + .sin_port = rs->rs_bound_port, + }; + struct sockaddr_in dst = { + .sin_addr.s_addr = rs->rs_conn_addr, + .sin_port = rs->rs_conn_port, + }; int ret; - ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id); + ret = rds_iw_get_device(&src, &dst, &rds_iwdev, &cm_id); if (ret || !cm_id) { ret = -ENODEV; goto out; diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c index 5394b6be46ec..0610efa83d72 100644 --- a/net/rxrpc/ar-error.c +++ b/net/rxrpc/ar-error.c @@ -42,7 +42,8 @@ void rxrpc_UDP_error_report(struct sock *sk) _leave("UDP socket errqueue empty"); return; } - if (!skb->len) { + serr = SKB_EXT_ERR(skb); + if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { _leave("UDP empty message"); kfree_skb(skb); return; @@ -50,7 +51,6 @@ void rxrpc_UDP_error_report(struct sock *sk) rxrpc_new_skb(skb); - serr = SKB_EXT_ERR(skb); addr = *(__be32 *)(skb_network_header(skb) + serr->addr_offset); port = serr->port; diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index 4575485ad1b4..19a560626dc4 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -87,7 +87,7 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock, if (!skb) { /* nothing remains on the queue */ if (copied && - (msg->msg_flags & MSG_PEEK || timeo == 0)) + (flags & MSG_PEEK || timeo == 0)) goto out; /* wait for a message to turn up */ diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 82c5d7fc1988..5f6288fa3f12 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -25,21 +25,41 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_bpf *b = a->priv; - int action; - int filter_res; + int action, filter_res; spin_lock(&b->tcf_lock); + b->tcf_tm.lastuse = jiffies; bstats_update(&b->tcf_bstats, skb); - action = b->tcf_action; filter_res = BPF_PROG_RUN(b->filter, skb); - if (filter_res == 0) { - /* Return code 0 from the BPF program - * is being interpreted as a drop here. - */ - action = TC_ACT_SHOT; + + /* A BPF program may overwrite the default action opcode. + * Similarly as in cls_bpf, if filter_res == -1 we use the + * default action specified from tc. + * + * In case a different well-known TC_ACT opcode has been + * returned, it will overwrite the default one. + * + * For everything else that is unkown, TC_ACT_UNSPEC is + * returned. + */ + switch (filter_res) { + case TC_ACT_PIPE: + case TC_ACT_RECLASSIFY: + case TC_ACT_OK: + action = filter_res; + break; + case TC_ACT_SHOT: + action = filter_res; b->tcf_qstats.drops++; + break; + case TC_ACT_UNSPEC: + action = b->tcf_action; + break; + default: + action = TC_ACT_UNSPEC; + break; } spin_unlock(&b->tcf_lock); diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 09487afbfd51..95fdf4e40051 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -78,8 +78,11 @@ struct tc_u_hnode { struct tc_u_common *tp_c; int refcnt; unsigned int divisor; - struct tc_u_knode __rcu *ht[1]; struct rcu_head rcu; + /* The 'ht' field MUST be the last field in structure to allow for + * more entries allocated at end of structure. + */ + struct tc_u_knode __rcu *ht[1]; }; struct tc_u_common { diff --git a/net/socket.c b/net/socket.c index bbedbfcb42c2..245330ca0015 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1702,6 +1702,8 @@ SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, if (len > INT_MAX) len = INT_MAX; + if (unlikely(!access_ok(VERIFY_READ, buff, len))) + return -EFAULT; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; @@ -1760,6 +1762,8 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size, if (size > INT_MAX) size = INT_MAX; + if (unlikely(!access_ok(VERIFY_WRITE, ubuf, size))) + return -EFAULT; sock = sockfd_lookup_light(fd, &err, &fput_needed); if (!sock) goto out; diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index da5136fd5694..579f72bbcf4b 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o -xprtrdma-y := transport.o rpc_rdma.o verbs.o +xprtrdma-y := transport.o rpc_rdma.o verbs.o \ + fmr_ops.o frwr_ops.o physical_ops.o obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c new file mode 100644 index 000000000000..a91ba2c8ef1e --- /dev/null +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -0,0 +1,208 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* Lightweight memory registration using Fast Memory Regions (FMR). + * Referred to sometimes as MTHCAFMR mode. + * + * FMR uses synchronous memory registration and deregistration. + * FMR registration is known to be fast, but FMR deregistration + * can take tens of usecs to complete. + */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +/* Maximum scatter/gather per FMR */ +#define RPCRDMA_MAX_FMR_SGES (64) + +static int +fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + return 0; +} + +/* FMR mode conveys up to 64 pages of payload per chunk segment. + */ +static size_t +fmr_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + rpcrdma_max_segments(r_xprt) * RPCRDMA_MAX_FMR_SGES); +} + +static int +fmr_op_init(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; + struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_FMR_SGES, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + struct rpcrdma_mw *r; + int i, rc; + + INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_all); + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i); + + while (i--) { + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return -ENOMEM; + + r->r.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr); + if (IS_ERR(r->r.fmr)) + goto out_fmr_err; + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + } + return 0; + +out_fmr_err: + rc = PTR_ERR(r->r.fmr); + dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc); + kfree(r); + return rc; +} + +/* Use the ib_map_phys_fmr() verb to register a memory region + * for remote access via RDMA READ or RDMA WRITE. + */ +static int +fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct ib_device *device = ia->ri_id->device; + enum dma_data_direction direction = rpcrdma_data_dir(writing); + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_mw *mw = seg1->rl_mw; + u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; + int len, pageoff, i, rc; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (nsegs > RPCRDMA_MAX_FMR_SGES) + nsegs = RPCRDMA_MAX_FMR_SGES; + for (i = 0; i < nsegs;) { + rpcrdma_map_one(device, seg, direction); + physaddrs[i] = seg->mr_dma; + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + + rc = ib_map_phys_fmr(mw->r.fmr, physaddrs, i, seg1->mr_dma); + if (rc) + goto out_maperr; + + seg1->mr_rkey = mw->r.fmr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + return i; + +out_maperr: + dprintk("RPC: %s: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n", + __func__, len, (unsigned long long)seg1->mr_dma, + pageoff, i, rc); + while (i--) + rpcrdma_unmap_one(device, --seg); + return rc; +} + +/* Use the ib_unmap_fmr() verb to prevent further remote + * access via RDMA READ or RDMA WRITE. + */ +static int +fmr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_mr_seg *seg1 = seg; + struct ib_device *device; + int rc, nsegs = seg->mr_nsegs; + LIST_HEAD(l); + + list_add(&seg1->rl_mw->r.fmr->list, &l); + rc = ib_unmap_fmr(&l); + read_lock(&ia->ri_qplock); + device = ia->ri_id->device; + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(device, seg++); + read_unlock(&ia->ri_qplock); + if (rc) + goto out_err; + return nsegs; + +out_err: + dprintk("RPC: %s: ib_unmap_fmr status %i\n", __func__, rc); + return nsegs; +} + +/* After a disconnect, unmap all FMRs. + * + * This is invoked only in the transport connect worker in order + * to serialize with rpcrdma_register_fmr_external(). + */ +static void +fmr_op_reset(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_mw *r; + LIST_HEAD(list); + int rc; + + list_for_each_entry(r, &buf->rb_all, mw_all) + list_add(&r->r.fmr->list, &list); + + rc = ib_unmap_fmr(&list); + if (rc) + dprintk("RPC: %s: ib_unmap_fmr failed %i\n", + __func__, rc); +} + +static void +fmr_op_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + int rc; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + rc = ib_dealloc_fmr(r->r.fmr); + if (rc) + dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", + __func__, rc); + kfree(r); + } +} + +const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { + .ro_map = fmr_op_map, + .ro_unmap = fmr_op_unmap, + .ro_open = fmr_op_open, + .ro_maxpages = fmr_op_maxpages, + .ro_init = fmr_op_init, + .ro_reset = fmr_op_reset, + .ro_destroy = fmr_op_destroy, + .ro_displayname = "fmr", +}; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c new file mode 100644 index 000000000000..0a7b9df70133 --- /dev/null +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* Lightweight memory registration using Fast Registration Work + * Requests (FRWR). Also referred to sometimes as FRMR mode. + * + * FRWR features ordered asynchronous registration and deregistration + * of arbitrarily sized memory regions. This is the fastest and safest + * but most complex memory registration mode. + */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +static int +__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device, + unsigned int depth) +{ + struct rpcrdma_frmr *f = &r->r.frmr; + int rc; + + f->fr_mr = ib_alloc_fast_reg_mr(pd, depth); + if (IS_ERR(f->fr_mr)) + goto out_mr_err; + f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth); + if (IS_ERR(f->fr_pgl)) + goto out_list_err; + return 0; + +out_mr_err: + rc = PTR_ERR(f->fr_mr); + dprintk("RPC: %s: ib_alloc_fast_reg_mr status %i\n", + __func__, rc); + return rc; + +out_list_err: + rc = PTR_ERR(f->fr_pgl); + dprintk("RPC: %s: ib_alloc_fast_reg_page_list status %i\n", + __func__, rc); + ib_dereg_mr(f->fr_mr); + return rc; +} + +static void +__frwr_release(struct rpcrdma_mw *r) +{ + int rc; + + rc = ib_dereg_mr(r->r.frmr.fr_mr); + if (rc) + dprintk("RPC: %s: ib_dereg_mr status %i\n", + __func__, rc); + ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); +} + +static int +frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + struct ib_device_attr *devattr = &ia->ri_devattr; + int depth, delta; + + ia->ri_max_frmr_depth = + min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + devattr->max_fast_reg_page_list_len); + dprintk("RPC: %s: device's max FR page list len = %u\n", + __func__, ia->ri_max_frmr_depth); + + /* Add room for frmr register and invalidate WRs. + * 1. FRMR reg WR for head + * 2. FRMR invalidate WR for head + * 3. N FRMR reg WRs for pagelist + * 4. N FRMR invalidate WRs for pagelist + * 5. FRMR reg WR for tail + * 6. FRMR invalidate WR for tail + * 7. The RDMA_SEND WR + */ + depth = 7; + + /* Calculate N if the device max FRMR depth is smaller than + * RPCRDMA_MAX_DATA_SEGS. + */ + if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { + delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frmr_depth; + do { + depth += 2; /* FRMR reg + invalidate */ + delta -= ia->ri_max_frmr_depth; + } while (delta > 0); + } + + ep->rep_attr.cap.max_send_wr *= depth; + if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { + cdata->max_requests = devattr->max_qp_wr / depth; + if (!cdata->max_requests) + return -EINVAL; + ep->rep_attr.cap.max_send_wr = cdata->max_requests * + depth; + } + + return 0; +} + +/* FRWR mode conveys a list of pages per chunk segment. The + * maximum length of that list is the FRWR page list depth. + */ +static size_t +frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth); +} + +/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */ +static void +frwr_sendcompletion(struct ib_wc *wc) +{ + struct rpcrdma_mw *r; + + if (likely(wc->status == IB_WC_SUCCESS)) + return; + + /* WARNING: Only wr_id and status are reliable at this point */ + r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; + dprintk("RPC: %s: frmr %p (stale), status %d\n", + __func__, r, wc->status); + r->r.frmr.fr_state = FRMR_IS_STALE; +} + +static int +frwr_op_init(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct ib_device *device = r_xprt->rx_ia.ri_id->device; + unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + int i; + + INIT_LIST_HEAD(&buf->rb_mws); + INIT_LIST_HEAD(&buf->rb_all); + + i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; + dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i); + + while (i--) { + struct rpcrdma_mw *r; + int rc; + + r = kzalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return -ENOMEM; + + rc = __frwr_init(r, pd, device, depth); + if (rc) { + kfree(r); + return rc; + } + + list_add(&r->mw_list, &buf->rb_mws); + list_add(&r->mw_all, &buf->rb_all); + r->mw_sendcompletion = frwr_sendcompletion; + } + + return 0; +} + +/* Post a FAST_REG Work Request to register a memory region + * for remote access via RDMA READ or RDMA WRITE. + */ +static int +frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct ib_device *device = ia->ri_id->device; + enum dma_data_direction direction = rpcrdma_data_dir(writing); + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_mw *mw = seg1->rl_mw; + struct rpcrdma_frmr *frmr = &mw->r.frmr; + struct ib_mr *mr = frmr->fr_mr; + struct ib_send_wr fastreg_wr, *bad_wr; + u8 key; + int len, pageoff; + int i, rc; + int seg_len; + u64 pa; + int page_no; + + pageoff = offset_in_page(seg1->mr_offset); + seg1->mr_offset -= pageoff; /* start of page */ + seg1->mr_len += pageoff; + len = -pageoff; + if (nsegs > ia->ri_max_frmr_depth) + nsegs = ia->ri_max_frmr_depth; + for (page_no = i = 0; i < nsegs;) { + rpcrdma_map_one(device, seg, direction); + pa = seg->mr_dma; + for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { + frmr->fr_pgl->page_list[page_no++] = pa; + pa += PAGE_SIZE; + } + len += seg->mr_len; + ++seg; + ++i; + /* Check for holes */ + if ((i < nsegs && offset_in_page(seg->mr_offset)) || + offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) + break; + } + dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n", + __func__, mw, i, len); + + frmr->fr_state = FRMR_IS_VALID; + + memset(&fastreg_wr, 0, sizeof(fastreg_wr)); + fastreg_wr.wr_id = (unsigned long)(void *)mw; + fastreg_wr.opcode = IB_WR_FAST_REG_MR; + fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff; + fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; + fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; + fastreg_wr.wr.fast_reg.page_list_len = page_no; + fastreg_wr.wr.fast_reg.length = len; + fastreg_wr.wr.fast_reg.access_flags = writing ? + IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : + IB_ACCESS_REMOTE_READ; + key = (u8)(mr->rkey & 0x000000FF); + ib_update_fast_reg_key(mr, ++key); + fastreg_wr.wr.fast_reg.rkey = mr->rkey; + + DECR_CQCOUNT(&r_xprt->rx_ep); + rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); + if (rc) + goto out_senderr; + + seg1->mr_rkey = mr->rkey; + seg1->mr_base = seg1->mr_dma + pageoff; + seg1->mr_nsegs = i; + seg1->mr_len = len; + return i; + +out_senderr: + dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); + ib_update_fast_reg_key(mr, --key); + frmr->fr_state = FRMR_IS_INVALID; + while (i--) + rpcrdma_unmap_one(device, --seg); + return rc; +} + +/* Post a LOCAL_INV Work Request to prevent further remote access + * via RDMA READ or RDMA WRITE. + */ +static int +frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_mr_seg *seg1 = seg; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct ib_send_wr invalidate_wr, *bad_wr; + int rc, nsegs = seg->mr_nsegs; + struct ib_device *device; + + seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; + + memset(&invalidate_wr, 0, sizeof(invalidate_wr)); + invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw; + invalidate_wr.opcode = IB_WR_LOCAL_INV; + invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey; + DECR_CQCOUNT(&r_xprt->rx_ep); + + read_lock(&ia->ri_qplock); + device = ia->ri_id->device; + while (seg1->mr_nsegs--) + rpcrdma_unmap_one(device, seg++); + rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); + read_unlock(&ia->ri_qplock); + if (rc) + goto out_err; + return nsegs; + +out_err: + /* Force rpcrdma_buffer_get() to retry */ + seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE; + dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc); + return nsegs; +} + +/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in + * an unusable state. Find FRMRs in this state and dereg / reg + * each. FRMRs that are VALID and attached to an rpcrdma_req are + * also torn down. + * + * This gives all in-use FRMRs a fresh rkey and leaves them INVALID. + * + * This is invoked only in the transport connect worker in order + * to serialize with rpcrdma_register_frmr_external(). + */ +static void +frwr_op_reset(struct rpcrdma_xprt *r_xprt) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct ib_device *device = r_xprt->rx_ia.ri_id->device; + unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth; + struct ib_pd *pd = r_xprt->rx_ia.ri_pd; + struct rpcrdma_mw *r; + int rc; + + list_for_each_entry(r, &buf->rb_all, mw_all) { + if (r->r.frmr.fr_state == FRMR_IS_INVALID) + continue; + + __frwr_release(r); + rc = __frwr_init(r, pd, device, depth); + if (rc) { + dprintk("RPC: %s: mw %p left %s\n", + __func__, r, + (r->r.frmr.fr_state == FRMR_IS_STALE ? + "stale" : "valid")); + continue; + } + + r->r.frmr.fr_state = FRMR_IS_INVALID; + } +} + +static void +frwr_op_destroy(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_mw *r; + + while (!list_empty(&buf->rb_all)) { + r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); + list_del(&r->mw_all); + __frwr_release(r); + kfree(r); + } +} + +const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { + .ro_map = frwr_op_map, + .ro_unmap = frwr_op_unmap, + .ro_open = frwr_op_open, + .ro_maxpages = frwr_op_maxpages, + .ro_init = frwr_op_init, + .ro_reset = frwr_op_reset, + .ro_destroy = frwr_op_destroy, + .ro_displayname = "frwr", +}; diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c new file mode 100644 index 000000000000..ba518af16787 --- /dev/null +++ b/net/sunrpc/xprtrdma/physical_ops.c @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. + */ + +/* No-op chunk preparation. All client memory is pre-registered. + * Sometimes referred to as ALLPHYSICAL mode. + * + * Physical registration is simple because all client memory is + * pre-registered and never deregistered. This mode is good for + * adapter bring up, but is considered not safe: the server is + * trusted not to abuse its access to client memory not involved + * in RDMA I/O. + */ + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +static int +physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, + struct rpcrdma_create_data_internal *cdata) +{ + return 0; +} + +/* PHYSICAL memory registration conveys one page per chunk segment. + */ +static size_t +physical_op_maxpages(struct rpcrdma_xprt *r_xprt) +{ + return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, + rpcrdma_max_segments(r_xprt)); +} + +static int +physical_op_init(struct rpcrdma_xprt *r_xprt) +{ + return 0; +} + +/* The client's physical memory is already exposed for + * remote access via RDMA READ or RDMA WRITE. + */ +static int +physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, + int nsegs, bool writing) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + rpcrdma_map_one(ia->ri_id->device, seg, + rpcrdma_data_dir(writing)); + seg->mr_rkey = ia->ri_bind_mem->rkey; + seg->mr_base = seg->mr_dma; + seg->mr_nsegs = 1; + return 1; +} + +/* Unmap a memory region, but leave it registered. + */ +static int +physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + read_lock(&ia->ri_qplock); + rpcrdma_unmap_one(ia->ri_id->device, seg); + read_unlock(&ia->ri_qplock); + + return 1; +} + +static void +physical_op_reset(struct rpcrdma_xprt *r_xprt) +{ +} + +static void +physical_op_destroy(struct rpcrdma_buffer *buf) +{ +} + +const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = { + .ro_map = physical_op_map, + .ro_unmap = physical_op_unmap, + .ro_open = physical_op_open, + .ro_maxpages = physical_op_maxpages, + .ro_init = physical_op_init, + .ro_reset = physical_op_reset, + .ro_destroy = physical_op_destroy, + .ro_displayname = "physical", +}; diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 91ffde82fa0c..2c53ea9e1b83 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -53,6 +53,14 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif +enum rpcrdma_chunktype { + rpcrdma_noch = 0, + rpcrdma_readch, + rpcrdma_areadch, + rpcrdma_writech, + rpcrdma_replych +}; + #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) static const char transfertypes[][12] = { "pure inline", /* no chunks */ @@ -179,6 +187,7 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, struct rpcrdma_write_array *warray = NULL; struct rpcrdma_write_chunk *cur_wchunk = NULL; __be32 *iptr = headerp->rm_body.rm_chunks; + int (*map)(struct rpcrdma_xprt *, struct rpcrdma_mr_seg *, int, bool); if (type == rpcrdma_readch || type == rpcrdma_areadch) { /* a read chunk - server will RDMA Read our memory */ @@ -201,9 +210,9 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, if (nsegs < 0) return nsegs; + map = r_xprt->rx_ia.ri_ops->ro_map; do { - n = rpcrdma_register_external(seg, nsegs, - cur_wchunk != NULL, r_xprt); + n = map(r_xprt, seg, nsegs, cur_wchunk != NULL); if (n <= 0) goto out; if (cur_rchunk) { /* read */ @@ -275,34 +284,13 @@ rpcrdma_create_chunks(struct rpc_rqst *rqst, struct xdr_buf *target, return (unsigned char *)iptr - (unsigned char *)headerp; out: - if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_FRMR) { - for (pos = 0; nchunks--;) - pos += rpcrdma_deregister_external( - &req->rl_segments[pos], r_xprt); - } - return n; -} + if (r_xprt->rx_ia.ri_memreg_strategy == RPCRDMA_FRMR) + return n; -/* - * Marshal chunks. This routine returns the header length - * consumed by marshaling. - * - * Returns positive RPC/RDMA header size, or negative errno. - */ - -ssize_t -rpcrdma_marshal_chunks(struct rpc_rqst *rqst, ssize_t result) -{ - struct rpcrdma_req *req = rpcr_to_rdmar(rqst); - struct rpcrdma_msg *headerp = rdmab_to_msg(req->rl_rdmabuf); - - if (req->rl_rtype != rpcrdma_noch) - result = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, - headerp, req->rl_rtype); - else if (req->rl_wtype != rpcrdma_noch) - result = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, - headerp, req->rl_wtype); - return result; + for (pos = 0; nchunks--;) + pos += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, + &req->rl_segments[pos]); + return n; } /* @@ -397,6 +385,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) char *base; size_t rpclen, padlen; ssize_t hdrlen; + enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; /* @@ -433,13 +422,13 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * into pages; otherwise use reply chunks. */ if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst)) - req->rl_wtype = rpcrdma_noch; + wtype = rpcrdma_noch; else if (rqst->rq_rcv_buf.page_len == 0) - req->rl_wtype = rpcrdma_replych; + wtype = rpcrdma_replych; else if (rqst->rq_rcv_buf.flags & XDRBUF_READ) - req->rl_wtype = rpcrdma_writech; + wtype = rpcrdma_writech; else - req->rl_wtype = rpcrdma_replych; + wtype = rpcrdma_replych; /* * Chunks needed for arguments? @@ -456,16 +445,16 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * TBD check NFSv4 setacl */ if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst)) - req->rl_rtype = rpcrdma_noch; + rtype = rpcrdma_noch; else if (rqst->rq_snd_buf.page_len == 0) - req->rl_rtype = rpcrdma_areadch; + rtype = rpcrdma_areadch; else - req->rl_rtype = rpcrdma_readch; + rtype = rpcrdma_readch; /* The following simplification is not true forever */ - if (req->rl_rtype != rpcrdma_noch && req->rl_wtype == rpcrdma_replych) - req->rl_wtype = rpcrdma_noch; - if (req->rl_rtype != rpcrdma_noch && req->rl_wtype != rpcrdma_noch) { + if (rtype != rpcrdma_noch && wtype == rpcrdma_replych) + wtype = rpcrdma_noch; + if (rtype != rpcrdma_noch && wtype != rpcrdma_noch) { dprintk("RPC: %s: cannot marshal multiple chunk lists\n", __func__); return -EIO; @@ -479,7 +468,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * When padding is in use and applies to the transfer, insert * it and change the message type. */ - if (req->rl_rtype == rpcrdma_noch) { + if (rtype == rpcrdma_noch) { padlen = rpcrdma_inline_pullup(rqst, RPCRDMA_INLINE_PAD_VALUE(rqst)); @@ -494,7 +483,7 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero; headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero; hdrlen += 2 * sizeof(u32); /* extra words in padhdr */ - if (req->rl_wtype != rpcrdma_noch) { + if (wtype != rpcrdma_noch) { dprintk("RPC: %s: invalid chunk list\n", __func__); return -EIO; @@ -515,18 +504,26 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) * on receive. Therefore, we request a reply chunk * for non-writes wherever feasible and efficient. */ - if (req->rl_wtype == rpcrdma_noch) - req->rl_wtype = rpcrdma_replych; + if (wtype == rpcrdma_noch) + wtype = rpcrdma_replych; } } - hdrlen = rpcrdma_marshal_chunks(rqst, hdrlen); + if (rtype != rpcrdma_noch) { + hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf, + headerp, rtype); + wtype = rtype; /* simplify dprintk */ + + } else if (wtype != rpcrdma_noch) { + hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_rcv_buf, + headerp, wtype); + } if (hdrlen < 0) return hdrlen; dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd" " headerp 0x%p base 0x%p lkey 0x%x\n", - __func__, transfertypes[req->rl_wtype], hdrlen, rpclen, padlen, + __func__, transfertypes[wtype], hdrlen, rpclen, padlen, headerp, base, rdmab_lkey(req->rl_rdmabuf)); /* diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 2e192baa59f3..54f23b1be986 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -157,12 +157,47 @@ static struct ctl_table sunrpc_table[] = { static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ static void +xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + char buf[20]; + + snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA; +} + +static void +xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap) +{ + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + char buf[40]; + + snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr); + xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); + + xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6; +} + +static void xprt_rdma_format_addresses(struct rpc_xprt *xprt) { struct sockaddr *sap = (struct sockaddr *) &rpcx_to_rdmad(xprt).addr; - struct sockaddr_in *sin = (struct sockaddr_in *)sap; - char buf[64]; + char buf[128]; + + switch (sap->sa_family) { + case AF_INET: + xprt_rdma_format_addresses4(xprt, sap); + break; + case AF_INET6: + xprt_rdma_format_addresses6(xprt, sap); + break; + default: + pr_err("rpcrdma: Unrecognized address family\n"); + return; + } (void)rpc_ntop(sap, buf, sizeof(buf)); xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); @@ -170,16 +205,10 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt) snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap)); xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL); - xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; - - snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr)); - xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL); - snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap)); xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL); - /* netid */ - xprt->address_strings[RPC_DISPLAY_NETID] = "rdma"; + xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; } static void @@ -377,7 +406,10 @@ xprt_setup_rdma(struct xprt_create *args) xprt_rdma_connect_worker); xprt_rdma_format_addresses(xprt); - xprt->max_payload = rpcrdma_max_payload(new_xprt); + xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt); + if (xprt->max_payload == 0) + goto out4; + xprt->max_payload <<= PAGE_SHIFT; dprintk("RPC: %s: transport data payload maximum: %zu bytes\n", __func__, xprt->max_payload); @@ -552,8 +584,8 @@ xprt_rdma_free(void *buffer) for (i = 0; req->rl_nchunks;) { --req->rl_nchunks; - i += rpcrdma_deregister_external( - &req->rl_segments[i], r_xprt); + i += r_xprt->rx_ia.ri_ops->ro_unmap(r_xprt, + &req->rl_segments[i]); } rpcrdma_buffer_put(req); @@ -579,10 +611,7 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); int rc = 0; - if (req->rl_niovs == 0) - rc = rpcrdma_marshal_req(rqst); - else if (r_xprt->rx_ia.ri_memreg_strategy != RPCRDMA_ALLPHYSICAL) - rc = rpcrdma_marshal_chunks(rqst, 0); + rc = rpcrdma_marshal_req(rqst); if (rc < 0) goto failed_marshal; diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 124676c13780..4870d272e006 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -50,6 +50,7 @@ #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/prefetch.h> +#include <linux/sunrpc/addr.h> #include <asm/bitops.h> #include "xprt_rdma.h" @@ -62,9 +63,6 @@ # define RPCDBG_FACILITY RPCDBG_TRANS #endif -static void rpcrdma_reset_frmrs(struct rpcrdma_ia *); -static void rpcrdma_reset_fmrs(struct rpcrdma_ia *); - /* * internal functions */ @@ -188,7 +186,7 @@ static const char * const wc_status[] = { "remote access error", "remote operation error", "transport retry counter exceeded", - "RNR retrycounter exceeded", + "RNR retry counter exceeded", "local RDD violation error", "remove invalid RD request", "operation aborted", @@ -206,21 +204,17 @@ static const char * const wc_status[] = { static void rpcrdma_sendcq_process_wc(struct ib_wc *wc) { - if (likely(wc->status == IB_WC_SUCCESS)) - return; - /* WARNING: Only wr_id and status are reliable at this point */ - if (wc->wr_id == 0ULL) { - if (wc->status != IB_WC_WR_FLUSH_ERR) + if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) { + if (wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR) pr_err("RPC: %s: SEND: %s\n", __func__, COMPLETION_MSG(wc->status)); } else { struct rpcrdma_mw *r; r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; - r->r.frmr.fr_state = FRMR_IS_STALE; - pr_err("RPC: %s: frmr %p (stale): %s\n", - __func__, r, COMPLETION_MSG(wc->status)); + r->mw_sendcompletion(wc); } } @@ -424,7 +418,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) struct rpcrdma_ia *ia = &xprt->rx_ia; struct rpcrdma_ep *ep = &xprt->rx_ep; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) - struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr; + struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr; #endif struct ib_qp_attr *attr = &ia->ri_qp_attr; struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr; @@ -480,9 +474,8 @@ connected: wake_up_all(&ep->rep_connect_wait); /*FALLTHROUGH*/ default: - dprintk("RPC: %s: %pI4:%u (ep 0x%p): %s\n", - __func__, &addr->sin_addr.s_addr, - ntohs(addr->sin_port), ep, + dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n", + __func__, sap, rpc_get_port(sap), ep, CONNECTION_MSG(event->event)); break; } @@ -491,19 +484,16 @@ connected: if (connstate == 1) { int ird = attr->max_dest_rd_atomic; int tird = ep->rep_remote_cma.responder_resources; - printk(KERN_INFO "rpcrdma: connection to %pI4:%u " - "on %s, memreg %d slots %d ird %d%s\n", - &addr->sin_addr.s_addr, - ntohs(addr->sin_port), + + pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n", + sap, rpc_get_port(sap), ia->ri_id->device->name, - ia->ri_memreg_strategy, + ia->ri_ops->ro_displayname, xprt->rx_buf.rb_max_requests, ird, ird < 4 && ird < tird / 2 ? " (low!)" : ""); } else if (connstate < 0) { - printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n", - &addr->sin_addr.s_addr, - ntohs(addr->sin_port), - connstate); + pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n", + sap, rpc_get_port(sap), connstate); } #endif @@ -621,17 +611,13 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) if (memreg == RPCRDMA_FRMR) { /* Requires both frmr reg and local dma lkey */ - if ((devattr->device_cap_flags & + if (((devattr->device_cap_flags & (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) != - (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) { + (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) || + (devattr->max_fast_reg_page_list_len == 0)) { dprintk("RPC: %s: FRMR registration " "not supported by HCA\n", __func__); memreg = RPCRDMA_MTHCAFMR; - } else { - /* Mind the ia limit on FRMR page list depth */ - ia->ri_max_frmr_depth = min_t(unsigned int, - RPCRDMA_MAX_DATA_SEGS, - devattr->max_fast_reg_page_list_len); } } if (memreg == RPCRDMA_MTHCAFMR) { @@ -652,13 +638,16 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) */ switch (memreg) { case RPCRDMA_FRMR: + ia->ri_ops = &rpcrdma_frwr_memreg_ops; break; case RPCRDMA_ALLPHYSICAL: + ia->ri_ops = &rpcrdma_physical_memreg_ops; mem_priv = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; goto register_setup; case RPCRDMA_MTHCAFMR: + ia->ri_ops = &rpcrdma_fmr_memreg_ops; if (ia->ri_have_dma_lkey) break; mem_priv = IB_ACCESS_LOCAL_WRITE; @@ -678,8 +667,8 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) rc = -ENOMEM; goto out3; } - dprintk("RPC: %s: memory registration strategy is %d\n", - __func__, memreg); + dprintk("RPC: %s: memory registration strategy is '%s'\n", + __func__, ia->ri_ops->ro_displayname); /* Else will do memory reg/dereg for each chunk */ ia->ri_memreg_strategy = memreg; @@ -743,49 +732,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; ep->rep_attr.qp_context = ep; - /* send_cq and recv_cq initialized below */ ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: { - int depth = 7; - - /* Add room for frmr register and invalidate WRs. - * 1. FRMR reg WR for head - * 2. FRMR invalidate WR for head - * 3. N FRMR reg WRs for pagelist - * 4. N FRMR invalidate WRs for pagelist - * 5. FRMR reg WR for tail - * 6. FRMR invalidate WR for tail - * 7. The RDMA_SEND WR - */ - - /* Calculate N if the device max FRMR depth is smaller than - * RPCRDMA_MAX_DATA_SEGS. - */ - if (ia->ri_max_frmr_depth < RPCRDMA_MAX_DATA_SEGS) { - int delta = RPCRDMA_MAX_DATA_SEGS - - ia->ri_max_frmr_depth; - - do { - depth += 2; /* FRMR reg + invalidate */ - delta -= ia->ri_max_frmr_depth; - } while (delta > 0); - - } - ep->rep_attr.cap.max_send_wr *= depth; - if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { - cdata->max_requests = devattr->max_qp_wr / depth; - if (!cdata->max_requests) - return -EINVAL; - ep->rep_attr.cap.max_send_wr = cdata->max_requests * - depth; - } - break; - } - default: - break; - } + rc = ia->ri_ops->ro_open(ia, ep, cdata); + if (rc) + return rc; ep->rep_attr.cap.max_recv_wr = cdata->max_requests; ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2); ep->rep_attr.cap.max_recv_sge = 1; @@ -944,21 +895,9 @@ retry: rpcrdma_ep_disconnect(ep, ia); rpcrdma_flush_cqs(ep); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rpcrdma_reset_frmrs(ia); - break; - case RPCRDMA_MTHCAFMR: - rpcrdma_reset_fmrs(ia); - break; - case RPCRDMA_ALLPHYSICAL: - break; - default: - rc = -EIO; - goto out; - } - xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); + ia->ri_ops->ro_reset(xprt); + id = rpcrdma_create_id(xprt, ia, (struct sockaddr *)&xprt->rx_data.addr); if (IS_ERR(id)) { @@ -1123,91 +1062,6 @@ out: return ERR_PTR(rc); } -static int -rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) -{ - int mr_access_flags = IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; - struct ib_fmr_attr fmr_attr = { - .max_pages = RPCRDMA_MAX_DATA_SEGS, - .max_maps = 1, - .page_shift = PAGE_SHIFT - }; - struct rpcrdma_mw *r; - int i, rc; - - i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; - dprintk("RPC: %s: initalizing %d FMRs\n", __func__, i); - - while (i--) { - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (r == NULL) - return -ENOMEM; - - r->r.fmr = ib_alloc_fmr(ia->ri_pd, mr_access_flags, &fmr_attr); - if (IS_ERR(r->r.fmr)) { - rc = PTR_ERR(r->r.fmr); - dprintk("RPC: %s: ib_alloc_fmr failed %i\n", - __func__, rc); - goto out_free; - } - - list_add(&r->mw_list, &buf->rb_mws); - list_add(&r->mw_all, &buf->rb_all); - } - return 0; - -out_free: - kfree(r); - return rc; -} - -static int -rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf) -{ - struct rpcrdma_frmr *f; - struct rpcrdma_mw *r; - int i, rc; - - i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS; - dprintk("RPC: %s: initalizing %d FRMRs\n", __func__, i); - - while (i--) { - r = kzalloc(sizeof(*r), GFP_KERNEL); - if (r == NULL) - return -ENOMEM; - f = &r->r.frmr; - - f->fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - ia->ri_max_frmr_depth); - if (IS_ERR(f->fr_mr)) { - rc = PTR_ERR(f->fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr " - "failed %i\n", __func__, rc); - goto out_free; - } - - f->fr_pgl = ib_alloc_fast_reg_page_list(ia->ri_id->device, - ia->ri_max_frmr_depth); - if (IS_ERR(f->fr_pgl)) { - rc = PTR_ERR(f->fr_pgl); - dprintk("RPC: %s: ib_alloc_fast_reg_page_list " - "failed %i\n", __func__, rc); - - ib_dereg_mr(f->fr_mr); - goto out_free; - } - - list_add(&r->mw_list, &buf->rb_mws); - list_add(&r->mw_all, &buf->rb_all); - } - - return 0; - -out_free: - kfree(r); - return rc; -} - int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) { @@ -1244,22 +1098,9 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) buf->rb_recv_bufs = (struct rpcrdma_rep **) p; p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; - INIT_LIST_HEAD(&buf->rb_mws); - INIT_LIST_HEAD(&buf->rb_all); - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rc = rpcrdma_init_frmrs(ia, buf); - if (rc) - goto out; - break; - case RPCRDMA_MTHCAFMR: - rc = rpcrdma_init_fmrs(ia, buf); - if (rc) - goto out; - break; - default: - break; - } + rc = ia->ri_ops->ro_init(r_xprt); + if (rc) + goto out; for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; @@ -1311,47 +1152,6 @@ rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) kfree(req); } -static void -rpcrdma_destroy_fmrs(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - int rc; - - while (!list_empty(&buf->rb_all)) { - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&r->mw_all); - list_del(&r->mw_list); - - rc = ib_dealloc_fmr(r->r.fmr); - if (rc) - dprintk("RPC: %s: ib_dealloc_fmr failed %i\n", - __func__, rc); - - kfree(r); - } -} - -static void -rpcrdma_destroy_frmrs(struct rpcrdma_buffer *buf) -{ - struct rpcrdma_mw *r; - int rc; - - while (!list_empty(&buf->rb_all)) { - r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all); - list_del(&r->mw_all); - list_del(&r->mw_list); - - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - - kfree(r); - } -} - void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { @@ -1372,104 +1172,11 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]); } - switch (ia->ri_memreg_strategy) { - case RPCRDMA_FRMR: - rpcrdma_destroy_frmrs(buf); - break; - case RPCRDMA_MTHCAFMR: - rpcrdma_destroy_fmrs(buf); - break; - default: - break; - } + ia->ri_ops->ro_destroy(buf); kfree(buf->rb_pool); } -/* After a disconnect, unmap all FMRs. - * - * This is invoked only in the transport connect worker in order - * to serialize with rpcrdma_register_fmr_external(). - */ -static void -rpcrdma_reset_fmrs(struct rpcrdma_ia *ia) -{ - struct rpcrdma_xprt *r_xprt = - container_of(ia, struct rpcrdma_xprt, rx_ia); - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct list_head *pos; - struct rpcrdma_mw *r; - LIST_HEAD(l); - int rc; - - list_for_each(pos, &buf->rb_all) { - r = list_entry(pos, struct rpcrdma_mw, mw_all); - - INIT_LIST_HEAD(&l); - list_add(&r->r.fmr->list, &l); - rc = ib_unmap_fmr(&l); - if (rc) - dprintk("RPC: %s: ib_unmap_fmr failed %i\n", - __func__, rc); - } -} - -/* After a disconnect, a flushed FAST_REG_MR can leave an FRMR in - * an unusable state. Find FRMRs in this state and dereg / reg - * each. FRMRs that are VALID and attached to an rpcrdma_req are - * also torn down. - * - * This gives all in-use FRMRs a fresh rkey and leaves them INVALID. - * - * This is invoked only in the transport connect worker in order - * to serialize with rpcrdma_register_frmr_external(). - */ -static void -rpcrdma_reset_frmrs(struct rpcrdma_ia *ia) -{ - struct rpcrdma_xprt *r_xprt = - container_of(ia, struct rpcrdma_xprt, rx_ia); - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - struct list_head *pos; - struct rpcrdma_mw *r; - int rc; - - list_for_each(pos, &buf->rb_all) { - r = list_entry(pos, struct rpcrdma_mw, mw_all); - - if (r->r.frmr.fr_state == FRMR_IS_INVALID) - continue; - - rc = ib_dereg_mr(r->r.frmr.fr_mr); - if (rc) - dprintk("RPC: %s: ib_dereg_mr failed %i\n", - __func__, rc); - ib_free_fast_reg_page_list(r->r.frmr.fr_pgl); - - r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_mr)) { - rc = PTR_ERR(r->r.frmr.fr_mr); - dprintk("RPC: %s: ib_alloc_fast_reg_mr" - " failed %i\n", __func__, rc); - continue; - } - r->r.frmr.fr_pgl = ib_alloc_fast_reg_page_list( - ia->ri_id->device, - ia->ri_max_frmr_depth); - if (IS_ERR(r->r.frmr.fr_pgl)) { - rc = PTR_ERR(r->r.frmr.fr_pgl); - dprintk("RPC: %s: " - "ib_alloc_fast_reg_page_list " - "failed %i\n", __func__, rc); - - ib_dereg_mr(r->r.frmr.fr_mr); - continue; - } - r->r.frmr.fr_state = FRMR_IS_INVALID; - } -} - /* "*mw" can be NULL when rpcrdma_buffer_get_mrs() fails, leaving * some req segments uninitialized. */ @@ -1509,7 +1216,7 @@ rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) } } -/* rpcrdma_unmap_one() was already done by rpcrdma_deregister_frmr_external(). +/* rpcrdma_unmap_one() was already done during deregistration. * Redo only the ib_post_send(). */ static void @@ -1729,6 +1436,14 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) * Wrappers for internal-use kmalloc memory registration, used by buffer code. */ +void +rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg) +{ + dprintk("RPC: map_one: offset %p iova %llx len %zu\n", + seg->mr_offset, + (unsigned long long)seg->mr_dma, seg->mr_dmalen); +} + static int rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len, struct ib_mr **mrp, struct ib_sge *iov) @@ -1854,287 +1569,6 @@ rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) } /* - * Wrappers for chunk registration, shared by read/write chunk code. - */ - -static void -rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing) -{ - seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; - seg->mr_dmalen = seg->mr_len; - if (seg->mr_page) - seg->mr_dma = ib_dma_map_page(ia->ri_id->device, - seg->mr_page, offset_in_page(seg->mr_offset), - seg->mr_dmalen, seg->mr_dir); - else - seg->mr_dma = ib_dma_map_single(ia->ri_id->device, - seg->mr_offset, - seg->mr_dmalen, seg->mr_dir); - if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) { - dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n", - __func__, - (unsigned long long)seg->mr_dma, - seg->mr_offset, seg->mr_dmalen); - } -} - -static void -rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg) -{ - if (seg->mr_page) - ib_dma_unmap_page(ia->ri_id->device, - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); - else - ib_dma_unmap_single(ia->ri_id->device, - seg->mr_dma, seg->mr_dmalen, seg->mr_dir); -} - -static int -rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg, - int *nsegs, int writing, struct rpcrdma_ia *ia, - struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_mr_seg *seg1 = seg; - struct rpcrdma_mw *mw = seg1->rl_mw; - struct rpcrdma_frmr *frmr = &mw->r.frmr; - struct ib_mr *mr = frmr->fr_mr; - struct ib_send_wr fastreg_wr, *bad_wr; - u8 key; - int len, pageoff; - int i, rc; - int seg_len; - u64 pa; - int page_no; - - pageoff = offset_in_page(seg1->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (*nsegs > ia->ri_max_frmr_depth) - *nsegs = ia->ri_max_frmr_depth; - for (page_no = i = 0; i < *nsegs;) { - rpcrdma_map_one(ia, seg, writing); - pa = seg->mr_dma; - for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) { - frmr->fr_pgl->page_list[page_no++] = pa; - pa += PAGE_SIZE; - } - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < *nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) - break; - } - dprintk("RPC: %s: Using frmr %p to map %d segments\n", - __func__, mw, i); - - frmr->fr_state = FRMR_IS_VALID; - - memset(&fastreg_wr, 0, sizeof(fastreg_wr)); - fastreg_wr.wr_id = (unsigned long)(void *)mw; - fastreg_wr.opcode = IB_WR_FAST_REG_MR; - fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma; - fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl; - fastreg_wr.wr.fast_reg.page_list_len = page_no; - fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT; - fastreg_wr.wr.fast_reg.length = page_no << PAGE_SHIFT; - if (fastreg_wr.wr.fast_reg.length < len) { - rc = -EIO; - goto out_err; - } - - /* Bump the key */ - key = (u8)(mr->rkey & 0x000000FF); - ib_update_fast_reg_key(mr, ++key); - - fastreg_wr.wr.fast_reg.access_flags = (writing ? - IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : - IB_ACCESS_REMOTE_READ); - fastreg_wr.wr.fast_reg.rkey = mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - - rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr); - if (rc) { - dprintk("RPC: %s: failed ib_post_send for register," - " status %i\n", __func__, rc); - ib_update_fast_reg_key(mr, --key); - goto out_err; - } else { - seg1->mr_rkey = mr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = i; - seg1->mr_len = len; - } - *nsegs = i; - return 0; -out_err: - frmr->fr_state = FRMR_IS_INVALID; - while (i--) - rpcrdma_unmap_one(ia, --seg); - return rc; -} - -static int -rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_mr_seg *seg1 = seg; - struct ib_send_wr invalidate_wr, *bad_wr; - int rc; - - seg1->rl_mw->r.frmr.fr_state = FRMR_IS_INVALID; - - memset(&invalidate_wr, 0, sizeof invalidate_wr); - invalidate_wr.wr_id = (unsigned long)(void *)seg1->rl_mw; - invalidate_wr.opcode = IB_WR_LOCAL_INV; - invalidate_wr.ex.invalidate_rkey = seg1->rl_mw->r.frmr.fr_mr->rkey; - DECR_CQCOUNT(&r_xprt->rx_ep); - - read_lock(&ia->ri_qplock); - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr); - read_unlock(&ia->ri_qplock); - if (rc) { - /* Force rpcrdma_buffer_get() to retry */ - seg1->rl_mw->r.frmr.fr_state = FRMR_IS_STALE; - dprintk("RPC: %s: failed ib_post_send for invalidate," - " status %i\n", __func__, rc); - } - return rc; -} - -static int -rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg, - int *nsegs, int writing, struct rpcrdma_ia *ia) -{ - struct rpcrdma_mr_seg *seg1 = seg; - u64 physaddrs[RPCRDMA_MAX_DATA_SEGS]; - int len, pageoff, i, rc; - - pageoff = offset_in_page(seg1->mr_offset); - seg1->mr_offset -= pageoff; /* start of page */ - seg1->mr_len += pageoff; - len = -pageoff; - if (*nsegs > RPCRDMA_MAX_DATA_SEGS) - *nsegs = RPCRDMA_MAX_DATA_SEGS; - for (i = 0; i < *nsegs;) { - rpcrdma_map_one(ia, seg, writing); - physaddrs[i] = seg->mr_dma; - len += seg->mr_len; - ++seg; - ++i; - /* Check for holes */ - if ((i < *nsegs && offset_in_page(seg->mr_offset)) || - offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len)) - break; - } - rc = ib_map_phys_fmr(seg1->rl_mw->r.fmr, physaddrs, i, seg1->mr_dma); - if (rc) { - dprintk("RPC: %s: failed ib_map_phys_fmr " - "%u@0x%llx+%i (%d)... status %i\n", __func__, - len, (unsigned long long)seg1->mr_dma, - pageoff, i, rc); - while (i--) - rpcrdma_unmap_one(ia, --seg); - } else { - seg1->mr_rkey = seg1->rl_mw->r.fmr->rkey; - seg1->mr_base = seg1->mr_dma + pageoff; - seg1->mr_nsegs = i; - seg1->mr_len = len; - } - *nsegs = i; - return rc; -} - -static int -rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_ia *ia) -{ - struct rpcrdma_mr_seg *seg1 = seg; - LIST_HEAD(l); - int rc; - - list_add(&seg1->rl_mw->r.fmr->list, &l); - rc = ib_unmap_fmr(&l); - read_lock(&ia->ri_qplock); - while (seg1->mr_nsegs--) - rpcrdma_unmap_one(ia, seg++); - read_unlock(&ia->ri_qplock); - if (rc) - dprintk("RPC: %s: failed ib_unmap_fmr," - " status %i\n", __func__, rc); - return rc; -} - -int -rpcrdma_register_external(struct rpcrdma_mr_seg *seg, - int nsegs, int writing, struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int rc = 0; - - switch (ia->ri_memreg_strategy) { - - case RPCRDMA_ALLPHYSICAL: - rpcrdma_map_one(ia, seg, writing); - seg->mr_rkey = ia->ri_bind_mem->rkey; - seg->mr_base = seg->mr_dma; - seg->mr_nsegs = 1; - nsegs = 1; - break; - - /* Registration using frmr registration */ - case RPCRDMA_FRMR: - rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt); - break; - - /* Registration using fmr memory registration */ - case RPCRDMA_MTHCAFMR: - rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia); - break; - - default: - return -EIO; - } - if (rc) - return rc; - - return nsegs; -} - -int -rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg, - struct rpcrdma_xprt *r_xprt) -{ - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int nsegs = seg->mr_nsegs, rc; - - switch (ia->ri_memreg_strategy) { - - case RPCRDMA_ALLPHYSICAL: - read_lock(&ia->ri_qplock); - rpcrdma_unmap_one(ia, seg); - read_unlock(&ia->ri_qplock); - break; - - case RPCRDMA_FRMR: - rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt); - break; - - case RPCRDMA_MTHCAFMR: - rc = rpcrdma_deregister_fmr_external(seg, ia); - break; - - default: - break; - } - return nsegs; -} - -/* * Prepost any receive buffer, then post send. * * Receive buffer is donated to hardware, reclaimed upon recv completion. @@ -2156,7 +1590,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia, } send_wr.next = NULL; - send_wr.wr_id = 0ULL; /* no send cookie */ + send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION; send_wr.sg_list = req->rl_send_iov; send_wr.num_sge = req->rl_niovs; send_wr.opcode = IB_WR_SEND; @@ -2215,43 +1649,24 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, return rc; } -/* Physical mapping means one Read/Write list entry per-page. - * All list entries must fit within an inline buffer - * - * NB: The server must return a Write list for NFS READ, - * which has the same constraint. Factor in the inline - * rsize as well. +/* How many chunk list items fit within our inline buffers? */ -static size_t -rpcrdma_physical_max_payload(struct rpcrdma_xprt *r_xprt) +unsigned int +rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - unsigned int inline_size, pages; + int bytes, segments; - inline_size = min_t(unsigned int, - cdata->inline_wsize, cdata->inline_rsize); - inline_size -= RPCRDMA_HDRLEN_MIN; - pages = inline_size / sizeof(struct rpcrdma_segment); - return pages << PAGE_SHIFT; -} - -static size_t -rpcrdma_mr_max_payload(struct rpcrdma_xprt *r_xprt) -{ - return RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT; -} - -size_t -rpcrdma_max_payload(struct rpcrdma_xprt *r_xprt) -{ - size_t result; - - switch (r_xprt->rx_ia.ri_memreg_strategy) { - case RPCRDMA_ALLPHYSICAL: - result = rpcrdma_physical_max_payload(r_xprt); - break; - default: - result = rpcrdma_mr_max_payload(r_xprt); + bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize); + bytes -= RPCRDMA_HDRLEN_MIN; + if (bytes < sizeof(struct rpcrdma_segment) * 2) { + pr_warn("RPC: %s: inline threshold too small\n", + __func__); + return 0; } - return result; + + segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1); + dprintk("RPC: %s: max chunk list size = %d segments\n", + __func__, segments); + return segments; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 0a16fb6f0885..78e0b8beaa36 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -60,6 +60,7 @@ * Interface Adapter -- one per transport instance */ struct rpcrdma_ia { + const struct rpcrdma_memreg_ops *ri_ops; rwlock_t ri_qplock; struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; @@ -105,6 +106,10 @@ struct rpcrdma_ep { #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit) #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount) +/* Force completion handler to ignore the signal + */ +#define RPCRDMA_IGNORE_COMPLETION (0ULL) + /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV * * The below structure appears at the front of a large region of kmalloc'd @@ -143,14 +148,6 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) return (struct rpcrdma_msg *)rb->rg_base; } -enum rpcrdma_chunktype { - rpcrdma_noch = 0, - rpcrdma_readch, - rpcrdma_areadch, - rpcrdma_writech, - rpcrdma_replych -}; - /* * struct rpcrdma_rep -- this structure encapsulates state required to recv * and complete a reply, asychronously. It needs several pieces of @@ -213,6 +210,7 @@ struct rpcrdma_mw { struct ib_fmr *fmr; struct rpcrdma_frmr frmr; } r; + void (*mw_sendcompletion)(struct ib_wc *); struct list_head mw_list; struct list_head mw_all; }; @@ -258,7 +256,6 @@ struct rpcrdma_req { unsigned int rl_niovs; /* 0, 2 or 4 */ unsigned int rl_nchunks; /* non-zero if chunks */ unsigned int rl_connect_cookie; /* retry detection */ - enum rpcrdma_chunktype rl_rtype, rl_wtype; struct rpcrdma_buffer *rl_buffer; /* home base for this structure */ struct rpcrdma_rep *rl_reply;/* holder for reply buffer */ struct ib_sge rl_send_iov[4]; /* for active requests */ @@ -340,6 +337,29 @@ struct rpcrdma_stats { }; /* + * Per-registration mode operations + */ +struct rpcrdma_xprt; +struct rpcrdma_memreg_ops { + int (*ro_map)(struct rpcrdma_xprt *, + struct rpcrdma_mr_seg *, int, bool); + int (*ro_unmap)(struct rpcrdma_xprt *, + struct rpcrdma_mr_seg *); + int (*ro_open)(struct rpcrdma_ia *, + struct rpcrdma_ep *, + struct rpcrdma_create_data_internal *); + size_t (*ro_maxpages)(struct rpcrdma_xprt *); + int (*ro_init)(struct rpcrdma_xprt *); + void (*ro_reset)(struct rpcrdma_xprt *); + void (*ro_destroy)(struct rpcrdma_buffer *); + const char *ro_displayname; +}; + +extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops; +extern const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops; +extern const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops; + +/* * RPCRDMA transport -- encapsulates the structures above for * integration with RPC. * @@ -398,16 +418,56 @@ void rpcrdma_buffer_put(struct rpcrdma_req *); void rpcrdma_recv_buffer_get(struct rpcrdma_req *); void rpcrdma_recv_buffer_put(struct rpcrdma_rep *); -int rpcrdma_register_external(struct rpcrdma_mr_seg *, - int, int, struct rpcrdma_xprt *); -int rpcrdma_deregister_external(struct rpcrdma_mr_seg *, - struct rpcrdma_xprt *); - struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *, size_t, gfp_t); void rpcrdma_free_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); +unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *); + +/* + * Wrappers for chunk registration, shared by read/write chunk code. + */ + +void rpcrdma_mapping_error(struct rpcrdma_mr_seg *); + +static inline enum dma_data_direction +rpcrdma_data_dir(bool writing) +{ + return writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; +} + +static inline void +rpcrdma_map_one(struct ib_device *device, struct rpcrdma_mr_seg *seg, + enum dma_data_direction direction) +{ + seg->mr_dir = direction; + seg->mr_dmalen = seg->mr_len; + + if (seg->mr_page) + seg->mr_dma = ib_dma_map_page(device, + seg->mr_page, offset_in_page(seg->mr_offset), + seg->mr_dmalen, seg->mr_dir); + else + seg->mr_dma = ib_dma_map_single(device, + seg->mr_offset, + seg->mr_dmalen, seg->mr_dir); + + if (ib_dma_mapping_error(device, seg->mr_dma)) + rpcrdma_mapping_error(seg); +} + +static inline void +rpcrdma_unmap_one(struct ib_device *device, struct rpcrdma_mr_seg *seg) +{ + if (seg->mr_page) + ib_dma_unmap_page(device, + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); + else + ib_dma_unmap_single(device, + seg->mr_dma, seg->mr_dmalen, seg->mr_dir); +} + /* * RPC/RDMA connection management calls - xprtrdma/rpc_rdma.c */ @@ -418,9 +478,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *); /* * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c */ -ssize_t rpcrdma_marshal_chunks(struct rpc_rqst *, ssize_t); int rpcrdma_marshal_req(struct rpc_rqst *); -size_t rpcrdma_max_payload(struct rpcrdma_xprt *); /* Temporary NFS request map cache. Created in svc_rdma.c */ extern struct kmem_cache *svc_rdma_map_cachep; diff --git a/net/tipc/link.c b/net/tipc/link.c index a4cf364316de..14f09b3cb87c 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -464,10 +464,11 @@ void tipc_link_reset(struct tipc_link *l_ptr) /* Clean up all queues, except inputq: */ __skb_queue_purge(&l_ptr->outqueue); __skb_queue_purge(&l_ptr->deferred_queue); - skb_queue_splice_init(&l_ptr->wakeupq, &l_ptr->inputq); - if (!skb_queue_empty(&l_ptr->inputq)) + if (!owner->inputq) + owner->inputq = &l_ptr->inputq; + skb_queue_splice_init(&l_ptr->wakeupq, owner->inputq); + if (!skb_queue_empty(owner->inputq)) owner->action_flags |= TIPC_MSG_EVT; - owner->inputq = &l_ptr->inputq; l_ptr->next_out = NULL; l_ptr->unacked_window = 0; l_ptr->checkpoint = 1; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index be2501538011..b6f84f6a2a09 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4400,6 +4400,16 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) if (parse_station_flags(info, dev->ieee80211_ptr->iftype, ¶ms)) return -EINVAL; + /* HT/VHT requires QoS, but if we don't have that just ignore HT/VHT + * as userspace might just pass through the capabilities from the IEs + * directly, rather than enforcing this restriction and returning an + * error in this case. + */ + if (!(params.sta_flags_set & BIT(NL80211_STA_FLAG_WME))) { + params.ht_capa = NULL; + params.vht_capa = NULL; + } + /* When you run into this, adjust the code below for the new flag */ BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index cee479bc655c..638af0655aaf 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2269,11 +2269,9 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, * have the xfrm_state's. We need to wait for KM to * negotiate new SA's or bail out with error.*/ if (net->xfrm.sysctl_larval_drop) { - dst_release(dst); - xfrm_pols_put(pols, drop_pols); XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); - - return ERR_PTR(-EREMOTE); + err = -EREMOTE; + goto error; } err = -EAGAIN; @@ -2324,7 +2322,8 @@ nopol: error: dst_release(dst); dropdst: - dst_release(dst_orig); + if (!(flags & XFRM_LOOKUP_KEEP_DST_REF)) + dst_release(dst_orig); xfrm_pols_put(pols, drop_pols); return ERR_PTR(err); } @@ -2338,7 +2337,8 @@ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, struct sock *sk, int flags) { struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk, - flags | XFRM_LOOKUP_QUEUE); + flags | XFRM_LOOKUP_QUEUE | + XFRM_LOOKUP_KEEP_DST_REF); if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE) return make_blackhole(net, dst_orig->ops->family, dst_orig); |