diff options
author | Linus Torvalds | 2020-06-03 16:27:18 -0700 |
---|---|---|
committer | Linus Torvalds | 2020-06-03 16:27:18 -0700 |
commit | cb8e59cc87201af93dfbb6c3dccc8fcad72a09c2 (patch) | |
tree | a334db9022f89654b777bbce8c4c6632e65b9031 /net | |
parent | 2e63f6ce7ed2c4ff83ba30ad9ccad422289a6c63 (diff) | |
parent | 065fcfd49763ec71ae345bb5c5a74f961031e70e (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from David Miller:
1) Allow setting bluetooth L2CAP modes via socket option, from Luiz
Augusto von Dentz.
2) Add GSO partial support to igc, from Sasha Neftin.
3) Several cleanups and improvements to r8169 from Heiner Kallweit.
4) Add IF_OPER_TESTING link state and use it when ethtool triggers a
device self-test. From Andrew Lunn.
5) Start moving away from custom driver versions, use the globally
defined kernel version instead, from Leon Romanovsky.
6) Support GRO vis gro_cells in DSA layer, from Alexander Lobakin.
7) Allow hard IRQ deferral during NAPI, from Eric Dumazet.
8) Add sriov and vf support to hinic, from Luo bin.
9) Support Media Redundancy Protocol (MRP) in the bridging code, from
Horatiu Vultur.
10) Support netmap in the nft_nat code, from Pablo Neira Ayuso.
11) Allow UDPv6 encapsulation of ESP in the ipsec code, from Sabrina
Dubroca. Also add ipv6 support for espintcp.
12) Lots of ReST conversions of the networking documentation, from Mauro
Carvalho Chehab.
13) Support configuration of ethtool rxnfc flows in bcmgenet driver,
from Doug Berger.
14) Allow to dump cgroup id and filter by it in inet_diag code, from
Dmitry Yakunin.
15) Add infrastructure to export netlink attribute policies to
userspace, from Johannes Berg.
16) Several optimizations to sch_fq scheduler, from Eric Dumazet.
17) Fallback to the default qdisc if qdisc init fails because otherwise
a packet scheduler init failure will make a device inoperative. From
Jesper Dangaard Brouer.
18) Several RISCV bpf jit optimizations, from Luke Nelson.
19) Correct the return type of the ->ndo_start_xmit() method in several
drivers, it's netdev_tx_t but many drivers were using
'int'. From Yunjian Wang.
20) Add an ethtool interface for PHY master/slave config, from Oleksij
Rempel.
21) Add BPF iterators, from Yonghang Song.
22) Add cable test infrastructure, including ethool interfaces, from
Andrew Lunn. Marvell PHY driver is the first to support this
facility.
23) Remove zero-length arrays all over, from Gustavo A. R. Silva.
24) Calculate and maintain an explicit frame size in XDP, from Jesper
Dangaard Brouer.
25) Add CAP_BPF, from Alexei Starovoitov.
26) Support terse dumps in the packet scheduler, from Vlad Buslov.
27) Support XDP_TX bulking in dpaa2 driver, from Ioana Ciornei.
28) Add devm_register_netdev(), from Bartosz Golaszewski.
29) Minimize qdisc resets, from Cong Wang.
30) Get rid of kernel_getsockopt and kernel_setsockopt in order to
eliminate set_fs/get_fs calls. From Christoph Hellwig.
* git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2517 commits)
selftests: net: ip_defrag: ignore EPERM
net_failover: fixed rollback in net_failover_open()
Revert "tipc: Fix potential tipc_aead refcnt leak in tipc_crypto_rcv"
Revert "tipc: Fix potential tipc_node refcnt leak in tipc_rcv"
vmxnet3: allow rx flow hash ops only when rss is enabled
hinic: add set_channels ethtool_ops support
selftests/bpf: Add a default $(CXX) value
tools/bpf: Don't use $(COMPILE.c)
bpf, selftests: Use bpf_probe_read_kernel
s390/bpf: Use bcr 0,%0 as tail call nop filler
s390/bpf: Maintain 8-byte stack alignment
selftests/bpf: Fix verifier test
selftests/bpf: Fix sample_cnt shared between two threads
bpf, selftests: Adapt cls_redirect to call csum_level helper
bpf: Add csum_level helper for fixing up csum levels
bpf: Fix up bpf_skb_adjust_room helper's skb csum setting
sfc: add missing annotation for efx_ef10_try_update_nic_stats_vf()
crypto/chtls: IPv6 support for inline TLS
Crypto/chcr: Fixes a coccinile check error
Crypto/chcr: Fixes compilations warnings
...
Diffstat (limited to 'net')
341 files changed, 17456 insertions, 6812 deletions
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 990b9fde28c6..f00bb57f0f60 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -88,12 +88,11 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, static inline netdev_tx_t vlan_netpoll_send_skb(struct vlan_dev_priv *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER - if (vlan->netpoll) - netpoll_send_skb(vlan->netpoll, skb); + return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); -#endif return NETDEV_TX_OK; +#endif } static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, @@ -489,6 +488,25 @@ static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); } +/* + * vlan network devices have devices nesting below it, and are a special + * "super class" of normal network devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key vlan_netdev_xmit_lock_key; + +static void vlan_dev_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *unused) +{ + lockdep_set_class(&txq->_xmit_lock, &vlan_netdev_xmit_lock_key); +} + +static void vlan_dev_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, NULL); +} + static const struct header_ops vlan_header_ops = { .create = vlan_dev_hard_header, .parse = eth_header_parse, @@ -579,6 +597,8 @@ static int vlan_dev_init(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &vlan_type); + vlan_dev_set_lockdep_class(dev); + vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->vlan_pcpu_stats) return -ENOMEM; diff --git a/net/Kconfig b/net/Kconfig index df8d8c9bd021..5c524c6ee75d 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -86,7 +86,7 @@ config INET "Sysctl support" below, you can change various aspects of the behavior of the TCP/IP code by writing to the (virtual) files in /proc/sys/net/ipv4/*; the options are explained in the file - <file:Documentation/networking/ip-sysctl.txt>. + <file:Documentation/networking/ip-sysctl.rst>. Short answer: say Y. @@ -344,7 +344,7 @@ config NET_PKTGEN what was just said, you don't need it: say N. Documentation on how to use the packet generator can be found - at <file:Documentation/networking/pktgen.txt>. + at <file:Documentation/networking/pktgen.rst>. To compile this code as a module, choose M here: the module will be called pktgen. @@ -455,6 +455,7 @@ config FAILOVER config ETHTOOL_NETLINK bool "Netlink interface for ethtool" default y + depends on PHYLIB=y || PHYLIB=n help An alternative userspace interface for ethtool based on generic netlink. It provides better extensibility and some new features, diff --git a/net/Makefile b/net/Makefile index 07ea48160874..5744bf1997fd 100644 --- a/net/Makefile +++ b/net/Makefile @@ -6,7 +6,7 @@ # Rewritten to use lists instead of if-statements. # -obj-$(CONFIG_NET) := socket.o core/ +obj-$(CONFIG_NET) := devres.o socket.o core/ tmp-$(CONFIG_COMPAT) := compat.o obj-$(CONFIG_NET) += $(tmp-y) diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index b41375d4d295..15787e8c0629 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -57,6 +57,7 @@ #include <net/sock.h> #include <net/tcp_states.h> #include <net/route.h> +#include <net/compat.h> #include <linux/atalk.h> #include <linux/highmem.h> @@ -867,6 +868,24 @@ static int atif_ioctl(int cmd, void __user *arg) return copy_to_user(arg, &atreq, sizeof(atreq)) ? -EFAULT : 0; } +static int atrtr_ioctl_addrt(struct rtentry *rt) +{ + struct net_device *dev = NULL; + + if (rt->rt_dev) { + char name[IFNAMSIZ]; + + if (copy_from_user(name, rt->rt_dev, IFNAMSIZ-1)) + return -EFAULT; + name[IFNAMSIZ-1] = '\0'; + + dev = __dev_get_by_name(&init_net, name); + if (!dev) + return -ENODEV; + } + return atrtr_create(rt, dev); +} + /* Routing ioctl() calls */ static int atrtr_ioctl(unsigned int cmd, void __user *arg) { @@ -882,19 +901,8 @@ static int atrtr_ioctl(unsigned int cmd, void __user *arg) return atrtr_delete(&((struct sockaddr_at *) &rt.rt_dst)->sat_addr); - case SIOCADDRT: { - struct net_device *dev = NULL; - if (rt.rt_dev) { - char name[IFNAMSIZ]; - if (copy_from_user(name, rt.rt_dev, IFNAMSIZ-1)) - return -EFAULT; - name[IFNAMSIZ-1] = '\0'; - dev = __dev_get_by_name(&init_net, name); - if (!dev) - return -ENODEV; - } - return atrtr_create(&rt, dev); - } + case SIOCADDRT: + return atrtr_ioctl_addrt(&rt); } return -EINVAL; } @@ -1832,20 +1840,58 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) #ifdef CONFIG_COMPAT +static int atalk_compat_routing_ioctl(struct sock *sk, unsigned int cmd, + struct compat_rtentry __user *ur) +{ + compat_uptr_t rtdev; + struct rtentry rt; + + if (copy_from_user(&rt.rt_dst, &ur->rt_dst, + 3 * sizeof(struct sockaddr)) || + get_user(rt.rt_flags, &ur->rt_flags) || + get_user(rt.rt_metric, &ur->rt_metric) || + get_user(rt.rt_mtu, &ur->rt_mtu) || + get_user(rt.rt_window, &ur->rt_window) || + get_user(rt.rt_irtt, &ur->rt_irtt) || + get_user(rtdev, &ur->rt_dev)) + return -EFAULT; + + switch (cmd) { + case SIOCDELRT: + if (rt.rt_dst.sa_family != AF_APPLETALK) + return -EINVAL; + return atrtr_delete(&((struct sockaddr_at *) + &rt.rt_dst)->sat_addr); + + case SIOCADDRT: + rt.rt_dev = compat_ptr(rtdev); + return atrtr_ioctl_addrt(&rt); + default: + return -EINVAL; + } +} static int atalk_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { + void __user *argp = compat_ptr(arg); + struct sock *sk = sock->sk; + + switch (cmd) { + case SIOCADDRT: + case SIOCDELRT: + return atalk_compat_routing_ioctl(sk, cmd, argp); /* * SIOCATALKDIFADDR is a SIOCPROTOPRIVATE ioctl number, so we * cannot handle it in common code. The data we access if ifreq * here is compatible, so we can simply call the native * handler. */ - if (cmd == SIOCATALKDIFADDR) - return atalk_ioctl(sock, cmd, (unsigned long)compat_ptr(arg)); - - return -ENOIOCTLCMD; + case SIOCATALKDIFADDR: + return atalk_ioctl(sock, cmd, (unsigned long)argp); + default: + return -ENOIOCTLCMD; + } } -#endif +#endif /* CONFIG_COMPAT */ static const struct net_proto_family atalk_family_ops = { diff --git a/net/atm/Kconfig b/net/atm/Kconfig index 271f682e8438..e61dcc9f85b2 100644 --- a/net/atm/Kconfig +++ b/net/atm/Kconfig @@ -16,7 +16,7 @@ config ATM of your ATM card below. Note that you need a set of user-space programs to actually make use - of ATM. See the file <file:Documentation/networking/atm.txt> for + of ATM. See the file <file:Documentation/networking/atm.rst> for further details. config ATM_CLIP diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c index d955b683aa7c..838ebf0cabbf 100644 --- a/net/atm/ioctl.c +++ b/net/atm/ioctl.c @@ -56,6 +56,8 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd, int error; struct list_head *pos; void __user *argp = (void __user *)arg; + void __user *buf; + int __user *len; vcc = ATM_SD(sock); switch (cmd) { @@ -162,7 +164,49 @@ static int do_vcc_ioctl(struct socket *sock, unsigned int cmd, if (error != -ENOIOCTLCMD) goto done; - error = atm_dev_ioctl(cmd, argp, compat); + if (cmd == ATM_GETNAMES) { + if (IS_ENABLED(CONFIG_COMPAT) && compat) { +#ifdef CONFIG_COMPAT + struct compat_atm_iobuf __user *ciobuf = argp; + compat_uptr_t cbuf; + len = &ciobuf->length; + if (get_user(cbuf, &ciobuf->buffer)) + return -EFAULT; + buf = compat_ptr(cbuf); +#endif + } else { + struct atm_iobuf __user *iobuf = argp; + len = &iobuf->length; + if (get_user(buf, &iobuf->buffer)) + return -EFAULT; + } + error = atm_getnames(buf, len); + } else { + int number; + + if (IS_ENABLED(CONFIG_COMPAT) && compat) { +#ifdef CONFIG_COMPAT + struct compat_atmif_sioc __user *csioc = argp; + compat_uptr_t carg; + + len = &csioc->length; + if (get_user(carg, &csioc->arg)) + return -EFAULT; + buf = compat_ptr(carg); + if (get_user(number, &csioc->number)) + return -EFAULT; +#endif + } else { + struct atmif_sioc __user *sioc = argp; + + len = &sioc->length; + if (get_user(buf, &sioc->arg)) + return -EFAULT; + if (get_user(number, &sioc->number)) + return -EFAULT; + } + error = atm_dev_ioctl(cmd, buf, len, number, compat); + } done: return error; @@ -230,61 +274,25 @@ static struct { static int do_atm_iobuf(struct socket *sock, unsigned int cmd, unsigned long arg) { - struct atm_iobuf __user *iobuf; - struct compat_atm_iobuf __user *iobuf32; + struct compat_atm_iobuf __user *iobuf32 = compat_ptr(arg); u32 data; - void __user *datap; - int len, err; - - iobuf = compat_alloc_user_space(sizeof(*iobuf)); - iobuf32 = compat_ptr(arg); - if (get_user(len, &iobuf32->length) || - get_user(data, &iobuf32->buffer)) + if (get_user(data, &iobuf32->buffer)) return -EFAULT; - datap = compat_ptr(data); - if (put_user(len, &iobuf->length) || - put_user(datap, &iobuf->buffer)) - return -EFAULT; - - err = do_vcc_ioctl(sock, cmd, (unsigned long) iobuf, 0); - if (!err) { - if (copy_in_user(&iobuf32->length, &iobuf->length, - sizeof(int))) - err = -EFAULT; - } - - return err; + return atm_getnames(&iobuf32->length, compat_ptr(data)); } static int do_atmif_sioc(struct socket *sock, unsigned int cmd, unsigned long arg) { - struct atmif_sioc __user *sioc; - struct compat_atmif_sioc __user *sioc32; + struct compat_atmif_sioc __user *sioc32 = compat_ptr(arg); + int number; u32 data; - void __user *datap; - int err; - - sioc = compat_alloc_user_space(sizeof(*sioc)); - sioc32 = compat_ptr(arg); - if (copy_in_user(&sioc->number, &sioc32->number, 2 * sizeof(int)) || - get_user(data, &sioc32->arg)) + if (get_user(data, &sioc32->arg) || get_user(number, &sioc32->number)) return -EFAULT; - datap = compat_ptr(data); - if (put_user(datap, &sioc->arg)) - return -EFAULT; - - err = do_vcc_ioctl(sock, cmd, (unsigned long) sioc, 0); - - if (!err) { - if (copy_in_user(&sioc32->length, &sioc->length, - sizeof(int))) - err = -EFAULT; - } - return err; + return atm_dev_ioctl(cmd, compat_ptr(data), &sioc32->length, number, 0); } static int do_atm_ioctl(struct socket *sock, unsigned int cmd32, diff --git a/net/atm/resources.c b/net/atm/resources.c index 889349c6d90d..94bdc6527ee8 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -193,88 +193,48 @@ static int fetch_stats(struct atm_dev *dev, struct atm_dev_stats __user *arg, return error ? -EFAULT : 0; } -int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat) +int atm_getnames(void __user *buf, int __user *iobuf_len) { - void __user *buf; - int error, len, number, size = 0; + int error, len, size = 0; struct atm_dev *dev; struct list_head *p; int *tmp_buf, *tmp_p; - int __user *sioc_len; - int __user *iobuf_len; - switch (cmd) { - case ATM_GETNAMES: - if (IS_ENABLED(CONFIG_COMPAT) && compat) { -#ifdef CONFIG_COMPAT - struct compat_atm_iobuf __user *ciobuf = arg; - compat_uptr_t cbuf; - iobuf_len = &ciobuf->length; - if (get_user(cbuf, &ciobuf->buffer)) - return -EFAULT; - buf = compat_ptr(cbuf); -#endif - } else { - struct atm_iobuf __user *iobuf = arg; - iobuf_len = &iobuf->length; - if (get_user(buf, &iobuf->buffer)) - return -EFAULT; - } - if (get_user(len, iobuf_len)) - return -EFAULT; - mutex_lock(&atm_dev_mutex); - list_for_each(p, &atm_devs) - size += sizeof(int); - if (size > len) { - mutex_unlock(&atm_dev_mutex); - return -E2BIG; - } - tmp_buf = kmalloc(size, GFP_ATOMIC); - if (!tmp_buf) { - mutex_unlock(&atm_dev_mutex); - return -ENOMEM; - } - tmp_p = tmp_buf; - list_for_each(p, &atm_devs) { - dev = list_entry(p, struct atm_dev, dev_list); - *tmp_p++ = dev->number; - } + if (get_user(len, iobuf_len)) + return -EFAULT; + mutex_lock(&atm_dev_mutex); + list_for_each(p, &atm_devs) + size += sizeof(int); + if (size > len) { mutex_unlock(&atm_dev_mutex); - error = ((copy_to_user(buf, tmp_buf, size)) || - put_user(size, iobuf_len)) - ? -EFAULT : 0; - kfree(tmp_buf); - return error; - default: - break; + return -E2BIG; } - - if (IS_ENABLED(CONFIG_COMPAT) && compat) { -#ifdef CONFIG_COMPAT - struct compat_atmif_sioc __user *csioc = arg; - compat_uptr_t carg; - - sioc_len = &csioc->length; - if (get_user(carg, &csioc->arg)) - return -EFAULT; - buf = compat_ptr(carg); - - if (get_user(len, &csioc->length)) - return -EFAULT; - if (get_user(number, &csioc->number)) - return -EFAULT; -#endif - } else { - struct atmif_sioc __user *sioc = arg; - - sioc_len = &sioc->length; - if (get_user(buf, &sioc->arg)) - return -EFAULT; - if (get_user(len, &sioc->length)) - return -EFAULT; - if (get_user(number, &sioc->number)) - return -EFAULT; + tmp_buf = kmalloc(size, GFP_ATOMIC); + if (!tmp_buf) { + mutex_unlock(&atm_dev_mutex); + return -ENOMEM; + } + tmp_p = tmp_buf; + list_for_each(p, &atm_devs) { + dev = list_entry(p, struct atm_dev, dev_list); + *tmp_p++ = dev->number; } + mutex_unlock(&atm_dev_mutex); + error = ((copy_to_user(buf, tmp_buf, size)) || + put_user(size, iobuf_len)) + ? -EFAULT : 0; + kfree(tmp_buf); + return error; +} + +int atm_dev_ioctl(unsigned int cmd, void __user *buf, int __user *sioc_len, + int number, int compat) +{ + int error, len, size = 0; + struct atm_dev *dev; + + if (get_user(len, sioc_len)) + return -EFAULT; dev = try_then_request_module(atm_dev_lookup(number), "atm-device-%d", number); diff --git a/net/atm/resources.h b/net/atm/resources.h index 048232e4d4c6..4a0839e92ff3 100644 --- a/net/atm/resources.h +++ b/net/atm/resources.h @@ -14,8 +14,9 @@ extern struct list_head atm_devs; extern struct mutex atm_dev_mutex; -int atm_dev_ioctl(unsigned int cmd, void __user *arg, int compat); - +int atm_getnames(void __user *buf, int __user *iobuf_len); +int atm_dev_ioctl(unsigned int cmd, void __user *buf, int __user *sioc_len, + int number, int compat); #ifdef CONFIG_PROC_FS diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig index 043fd5437809..97d686d115c0 100644 --- a/net/ax25/Kconfig +++ b/net/ax25/Kconfig @@ -40,7 +40,7 @@ config AX25 radio as well as information about how to configure an AX.25 port is contained in the AX25-HOWTO, available from <http://www.tldp.org/docs.html#howto>. You might also want to - check out the file <file:Documentation/networking/ax25.txt> in the + check out the file <file:Documentation/networking/ax25.rst> in the kernel source. More information about digital amateur radio in general is on the WWW at <http://www.tapr.org/>. @@ -88,7 +88,7 @@ config NETROM users as well as information about how to configure an AX.25 port is contained in the Linux Ham Wiki, available from <http://www.linux-ax25.org>. You also might want to check out the - file <file:Documentation/networking/ax25.txt>. More information about + file <file:Documentation/networking/ax25.rst>. More information about digital amateur radio in general is on the WWW at <http://www.tapr.org/>. @@ -107,7 +107,7 @@ config ROSE users as well as information about how to configure an AX.25 port is contained in the Linux Ham Wiki, available from <http://www.linux-ax25.org>. You also might want to check out the - file <file:Documentation/networking/ax25.txt>. More information about + file <file:Documentation/networking/ax25.rst>. More information about digital amateur radio in general is on the WWW at <http://www.tapr.org/>. diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index a7c8dd7ae513..e87f19c82e8d 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -280,7 +280,7 @@ batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv) unsigned int msecs; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; - msecs += prandom_u32() % (2 * BATADV_JITTER); + msecs += prandom_u32_max(2 * BATADV_JITTER); return jiffies + msecs_to_jiffies(msecs); } @@ -288,7 +288,7 @@ batadv_iv_ogm_emit_send_time(const struct batadv_priv *bat_priv) /* when do we schedule a ogm packet to be sent */ static unsigned long batadv_iv_ogm_fwd_send_time(void) { - return jiffies + msecs_to_jiffies(prandom_u32() % (BATADV_JITTER / 2)); + return jiffies + msecs_to_jiffies(prandom_u32_max(BATADV_JITTER / 2)); } /* apply hop penalty for a normal link */ diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index 1e3172db7492..0bdefa35da98 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -49,7 +49,7 @@ static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface) unsigned int msecs; msecs = atomic_read(&hard_iface->bat_v.elp_interval) - BATADV_JITTER; - msecs += prandom_u32() % (2 * BATADV_JITTER); + msecs += prandom_u32_max(2 * BATADV_JITTER); queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.elp_wq, msecs_to_jiffies(msecs)); @@ -127,20 +127,7 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) rtnl_lock(); ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings); rtnl_unlock(); - - /* Virtual interface drivers such as tun / tap interfaces, VLAN, etc - * tend to initialize the interface throughput with some value for the - * sake of having a throughput number to export via ethtool. This - * exported throughput leaves batman-adv to conclude the interface - * throughput is genuine (reflecting reality), thus no measurements - * are necessary. - * - * Based on the observation that those interface types also tend to set - * the link auto-negotiation to 'off', batman-adv shall check this - * setting to differentiate between genuine link throughput information - * and placeholders installed by virtual interfaces. - */ - if (ret == 0 && link_settings.base.autoneg == AUTONEG_ENABLE) { + if (ret == 0) { /* link characteristics might change over time */ if (link_settings.base.duplex == DUPLEX_FULL) hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX; diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 80b87b1f4e3a..18028b9f95f0 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -88,7 +88,7 @@ static void batadv_v_ogm_start_queue_timer(struct batadv_hard_iface *hard_iface) unsigned int msecs = BATADV_MAX_AGGREGATION_MS * 1000; /* msecs * [0.9, 1.1] */ - msecs += prandom_u32() % (msecs / 5) - (msecs / 10); + msecs += prandom_u32_max(msecs / 5) - (msecs / 10); queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.aggr_wq, msecs_to_jiffies(msecs / 1000)); } @@ -107,7 +107,7 @@ static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv) return; msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER; - msecs += prandom_u32() % (2 * BATADV_JITTER); + msecs += prandom_u32_max(2 * BATADV_JITTER); queue_delayed_work(batadv_event_workqueue, &bat_priv->bat_v.ogm_wq, msecs_to_jiffies(msecs)); } diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 2bff2f4a325c..4e031661682a 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -163,11 +163,6 @@ static inline void batadv_dat_init_own_addr(struct batadv_priv *bat_priv, { } -static inline void batadv_arp_change_timeout(struct net_device *soft_iface, - const char *name) -{ -} - static inline int batadv_dat_init(struct batadv_priv *bat_priv) { return 0; diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index e22e49289677..a18dcc686dc3 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -146,8 +146,8 @@ static void batadv_gw_select(struct batadv_priv *bat_priv, if (new_gw_node) kref_get(&new_gw_node->refcount); - curr_gw_node = rcu_dereference_protected(bat_priv->gw.curr_gw, 1); - rcu_assign_pointer(bat_priv->gw.curr_gw, new_gw_node); + curr_gw_node = rcu_replace_pointer(bat_priv->gw.curr_gw, new_gw_node, + true); if (curr_gw_node) batadv_gw_node_put(curr_gw_node); diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index c7e98a40dd33..3a256af92784 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -473,8 +473,8 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv, if (new_hard_iface) kref_get(&new_hard_iface->refcount); - curr_hard_iface = rcu_dereference_protected(bat_priv->primary_if, 1); - rcu_assign_pointer(bat_priv->primary_if, new_hard_iface); + curr_hard_iface = rcu_replace_pointer(bat_priv->primary_if, + new_hard_iface, 1); if (!new_hard_iface) goto out; diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index ccb535c77e5d..8bdabc03b0b2 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -135,9 +135,6 @@ static ssize_t batadv_socket_read(struct file *file, char __user *buf, if (!buf || count < sizeof(struct batadv_icmp_packet)) return -EINVAL; - if (!access_ok(buf, count)) - return -EFAULT; - error = wait_event_interruptible(socket_client->queue_wait, socket_client->queue_len); diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 2a234d0ad445..61d8dbe8c954 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2020.1" +#define BATADV_SOURCE_VERSION "2020.2" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 3632bd976c56..d343382e9664 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -71,13 +71,13 @@ static void _batadv_update_route(struct batadv_priv *bat_priv, * the code needs to ensure the curr_router variable contains a pointer * to the replaced best neighbor. */ - curr_router = rcu_dereference_protected(orig_ifinfo->router, true); /* increase refcount of new best neighbor */ if (neigh_node) kref_get(&neigh_node->refcount); - rcu_assign_pointer(orig_ifinfo->router, neigh_node); + curr_router = rcu_replace_pointer(orig_ifinfo->router, neigh_node, + true); spin_unlock_bh(&orig_node->neigh_list_lock); batadv_orig_ifinfo_put(orig_ifinfo); diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 5f05a728f347..0ddd80130ea3 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -22,6 +22,7 @@ #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> +#include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/percpu.h> @@ -739,6 +740,34 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, return 0; } +/* batman-adv network devices have devices nesting below it and are a special + * "super class" of normal network devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key batadv_netdev_xmit_lock_key; + +/** + * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue + * @dev: device which owns the tx queue + * @txq: tx queue to modify + * @_unused: always NULL + */ +static void batadv_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key); +} + +/** + * batadv_set_lockdep_class() - Set txq and addr_list lockdep class + * @dev: network device to modify + */ +static void batadv_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL); +} + /** * batadv_softif_init_late() - late stage initialization of soft interface * @dev: registered network device to modify @@ -752,6 +781,8 @@ static int batadv_softif_init_late(struct net_device *dev) int ret; size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM; + batadv_set_lockdep_class(dev); + bat_priv = netdev_priv(dev); bat_priv->soft_iface = dev; diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index f631b1e01b89..a87547570b4e 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -15,7 +15,6 @@ #include <linux/percpu.h> #include <linux/printk.h> #include <linux/tracepoint.h> -#include <linux/types.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM batadv diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 4a17a66cc572..d152b8e81f61 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1086,7 +1086,7 @@ struct batadv_priv_bla { * struct batadv_priv_debug_log - debug logging data */ struct batadv_priv_debug_log { - /** @log_buff: buffer holding the logs (ring bufer) */ + /** @log_buff: buffer holding the logs (ring buffer) */ char log_buff[BATADV_LOG_BUF_LEN]; /** @log_start: index of next character to read */ diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 4febc82a7c76..bb55d92691b0 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -571,7 +571,15 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) return err < 0 ? NET_XMIT_DROP : err; } +static int bt_dev_init(struct net_device *dev) +{ + netdev_lockdep_set_classes(dev); + + return 0; +} + static const struct net_device_ops netdev_ops = { + .ndo_init = bt_dev_init, .ndo_start_xmit = bt_xmit, }; diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 165148c7c4ce..1d6d243cdde9 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -93,6 +93,21 @@ config BT_LEDS This option selects a few LED triggers for different Bluetooth events. +config BT_MSFTEXT + bool "Enable Microsoft extensions" + depends on BT + help + This options enables support for the Microsoft defined HCI + vendor extensions. + +config BT_DEBUGFS + bool "Export Bluetooth internals in debugfs" + depends on BT && DEBUG_FS + default y + help + Provide extensive information about internal Bluetooth states + in debugfs. + config BT_SELFTEST bool "Bluetooth self testing support" depends on BT && DEBUG_KERNEL @@ -120,12 +135,11 @@ config BT_SELFTEST_SMP Run test cases for SMP cryptographic functionality, including both legacy SMP as well as the Secure Connections features. -config BT_DEBUGFS - bool "Export Bluetooth internals in debugfs" - depends on BT && DEBUG_FS - default y +config BT_FEATURE_DEBUG + bool "Enable runtime option for debugging statements" + depends on BT && !DYNAMIC_DEBUG help - Provide extensive information about internal Bluetooth states - in debugfs. + This provides an option to enable/disable debugging statements + at runtime via the experimental features interface. source "drivers/bluetooth/Kconfig" diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index fda41c0b4781..41dd541a44a5 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -19,5 +19,6 @@ bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ bluetooth-$(CONFIG_BT_BREDR) += sco.o bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o bluetooth-$(CONFIG_BT_LEDS) += leds.o +bluetooth-$(CONFIG_BT_MSFTEXT) += msft.o bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index e245bc155cc2..307800fd18e6 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -122,8 +122,18 @@ static void hci_conn_cleanup(struct hci_conn *conn) hci_conn_hash_del(hdev, conn); - if (hdev->notify) - hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); + if (conn->type == SCO_LINK || conn->type == ESCO_LINK) { + switch (conn->setting & SCO_AIRMODE_MASK) { + case SCO_AIRMODE_CVSD: + case SCO_AIRMODE_TRANSP: + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_DISABLE_SCO); + break; + } + } else { + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); + } hci_conn_del_sysfs(conn); @@ -215,8 +225,6 @@ static void hci_acl_create_connection(struct hci_conn *conn) } memcpy(conn->dev_class, ie->data.dev_class, 3); - if (ie->data.ssp_mode > 0) - set_bit(HCI_CONN_SSP_ENABLED, &conn->flags); } cp.pkt_type = cpu_to_le16(conn->pkt_type); @@ -577,8 +585,15 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, hci_dev_hold(hdev); hci_conn_hash_add(hdev, conn); - if (hdev->notify) - hdev->notify(hdev, HCI_NOTIFY_CONN_ADD); + + /* The SCO and eSCO connections will only be notified when their + * setup has been completed. This is different to ACL links which + * can be notified right away. + */ + if (conn->type != SCO_LINK && conn->type != ESCO_LINK) { + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_CONN_ADD); + } hci_conn_init_sysfs(conn); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 2e7bc2da8371..dbe2d79f233f 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -44,6 +44,7 @@ #include "hci_debugfs.h" #include "smp.h" #include "leds.h" +#include "msft.h" static void hci_rx_work(struct work_struct *work); static void hci_cmd_work(struct work_struct *work); @@ -637,6 +638,14 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) events[0] |= 0x40; /* LE Data Length Change */ + /* If the controller supports LL Privacy feature, enable + * the corresponding event. + */ + if (hdev->le_features[0] & HCI_LE_LL_PRIVACY) + events[1] |= 0x02; /* LE Enhanced Connection + * Complete + */ + /* If the controller supports Extended Scanner Filter * Policies, enable the correspondig event. */ @@ -710,14 +719,6 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt) * Report */ - /* If the controller supports the LE Extended Create Connection - * command, enable the corresponding event. - */ - if (use_ext_conn(hdev)) - events[1] |= 0x02; /* LE Enhanced Connection - * Complete - */ - /* If the controller supports the LE Extended Advertising * command, enable the corresponding event. */ @@ -826,6 +827,10 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt) if (hdev->commands[29] & 0x20) hci_req_add(req, HCI_OP_READ_LOCAL_CODECS, 0, NULL); + /* Read local pairing options if the HCI command is supported */ + if (hdev->commands[41] & 0x08) + hci_req_add(req, HCI_OP_READ_LOCAL_PAIRING_OPTS, 0, NULL); + /* Get MWS transport configuration if the HCI command is supported */ if (hdev->commands[30] & 0x08) hci_req_add(req, HCI_OP_GET_MWS_TRANSPORT_CONFIG, 0, NULL); @@ -1563,6 +1568,8 @@ setup_failed: hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) && hdev->set_diag) ret = hdev->set_diag(hdev, true); + msft_do_open(hdev); + clear_bit(HCI_INIT, &hdev->flags); if (!ret) { @@ -1758,6 +1765,8 @@ int hci_dev_do_close(struct hci_dev *hdev) hci_sock_dev_event(hdev, HCI_DEV_DOWN); + msft_do_close(hdev); + if (hdev->flush) hdev->flush(hdev); @@ -3341,10 +3350,12 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, */ ret = hci_change_suspend_state(hdev, BT_SUSPEND_DISCONNECT); - /* Only configure whitelist if disconnect succeeded */ - if (!ret) + /* Only configure whitelist if disconnect succeeded and wake + * isn't being prevented. + */ + if (!ret && !(hdev->prevent_wake && hdev->prevent_wake(hdev))) ret = hci_change_suspend_state(hdev, - BT_SUSPEND_COMPLETE); + BT_SUSPEND_CONFIGURE_WAKE); } else if (action == PM_POST_SUSPEND) { ret = hci_change_suspend_state(hdev, BT_RUNNING); } @@ -4240,6 +4251,54 @@ static void __check_timeout(struct hci_dev *hdev, unsigned int cnt) } } +/* Schedule SCO */ +static void hci_sched_sco(struct hci_dev *hdev) +{ + struct hci_conn *conn; + struct sk_buff *skb; + int quote; + + BT_DBG("%s", hdev->name); + + if (!hci_conn_num(hdev, SCO_LINK)) + return; + + while (hdev->sco_cnt && (conn = hci_low_sent(hdev, SCO_LINK, "e))) { + while (quote-- && (skb = skb_dequeue(&conn->data_q))) { + BT_DBG("skb %p len %d", skb, skb->len); + hci_send_frame(hdev, skb); + + conn->sent++; + if (conn->sent == ~0) + conn->sent = 0; + } + } +} + +static void hci_sched_esco(struct hci_dev *hdev) +{ + struct hci_conn *conn; + struct sk_buff *skb; + int quote; + + BT_DBG("%s", hdev->name); + + if (!hci_conn_num(hdev, ESCO_LINK)) + return; + + while (hdev->sco_cnt && (conn = hci_low_sent(hdev, ESCO_LINK, + "e))) { + while (quote-- && (skb = skb_dequeue(&conn->data_q))) { + BT_DBG("skb %p len %d", skb, skb->len); + hci_send_frame(hdev, skb); + + conn->sent++; + if (conn->sent == ~0) + conn->sent = 0; + } + } +} + static void hci_sched_acl_pkt(struct hci_dev *hdev) { unsigned int cnt = hdev->acl_cnt; @@ -4271,6 +4330,10 @@ static void hci_sched_acl_pkt(struct hci_dev *hdev) hdev->acl_cnt--; chan->sent++; chan->conn->sent++; + + /* Send pending SCO packets right away */ + hci_sched_sco(hdev); + hci_sched_esco(hdev); } } @@ -4355,54 +4418,6 @@ static void hci_sched_acl(struct hci_dev *hdev) } } -/* Schedule SCO */ -static void hci_sched_sco(struct hci_dev *hdev) -{ - struct hci_conn *conn; - struct sk_buff *skb; - int quote; - - BT_DBG("%s", hdev->name); - - if (!hci_conn_num(hdev, SCO_LINK)) - return; - - while (hdev->sco_cnt && (conn = hci_low_sent(hdev, SCO_LINK, "e))) { - while (quote-- && (skb = skb_dequeue(&conn->data_q))) { - BT_DBG("skb %p len %d", skb, skb->len); - hci_send_frame(hdev, skb); - - conn->sent++; - if (conn->sent == ~0) - conn->sent = 0; - } - } -} - -static void hci_sched_esco(struct hci_dev *hdev) -{ - struct hci_conn *conn; - struct sk_buff *skb; - int quote; - - BT_DBG("%s", hdev->name); - - if (!hci_conn_num(hdev, ESCO_LINK)) - return; - - while (hdev->sco_cnt && (conn = hci_low_sent(hdev, ESCO_LINK, - "e))) { - while (quote-- && (skb = skb_dequeue(&conn->data_q))) { - BT_DBG("skb %p len %d", skb, skb->len); - hci_send_frame(hdev, skb); - - conn->sent++; - if (conn->sent == ~0) - conn->sent = 0; - } - } -} - static void hci_sched_le(struct hci_dev *hdev) { struct hci_chan *chan; @@ -4437,6 +4452,10 @@ static void hci_sched_le(struct hci_dev *hdev) cnt--; chan->sent++; chan->conn->sent++; + + /* Send pending SCO packets right away */ + hci_sched_sco(hdev); + hci_sched_esco(hdev); } } @@ -4459,9 +4478,9 @@ static void hci_tx_work(struct work_struct *work) if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { /* Schedule queues and send stuff to HCI driver */ - hci_sched_acl(hdev); hci_sched_sco(hdev); hci_sched_esco(hdev); + hci_sched_acl(hdev); hci_sched_le(hdev); } diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 6b1314c738b8..5e8af2658e44 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -1075,6 +1075,50 @@ DEFINE_SIMPLE_ATTRIBUTE(auth_payload_timeout_fops, auth_payload_timeout_get, auth_payload_timeout_set, "%llu\n"); +static ssize_t force_no_mitm_read(struct file *file, + char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[3]; + + buf[0] = hci_dev_test_flag(hdev, HCI_FORCE_NO_MITM) ? 'Y' : 'N'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static ssize_t force_no_mitm_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct hci_dev *hdev = file->private_data; + char buf[32]; + size_t buf_size = min(count, (sizeof(buf) - 1)); + bool enable; + + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + if (strtobool(buf, &enable)) + return -EINVAL; + + if (enable == hci_dev_test_flag(hdev, HCI_FORCE_NO_MITM)) + return -EALREADY; + + hci_dev_change_flag(hdev, HCI_FORCE_NO_MITM); + + return count; +} + +static const struct file_operations force_no_mitm_fops = { + .open = simple_open, + .read = force_no_mitm_read, + .write = force_no_mitm_write, + .llseek = default_llseek, +}; + DEFINE_QUIRK_ATTRIBUTE(quirk_strict_duplicate_filter, HCI_QUIRK_STRICT_DUPLICATE_FILTER); DEFINE_QUIRK_ATTRIBUTE(quirk_simultaneous_discovery, @@ -1134,6 +1178,8 @@ void hci_debugfs_create_le(struct hci_dev *hdev) &max_key_size_fops); debugfs_create_file("auth_payload_timeout", 0644, hdev->debugfs, hdev, &auth_payload_timeout_fops); + debugfs_create_file("force_no_mitm", 0644, hdev->debugfs, hdev, + &force_no_mitm_fops); debugfs_create_file("quirk_strict_duplicate_filter", 0644, hdev->debugfs, hdev, diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 0a591be8b0ae..cfeaee347db3 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -35,18 +35,34 @@ #include "a2mp.h" #include "amp.h" #include "smp.h" +#include "msft.h" #define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \ "\x00\x00\x00\x00\x00\x00\x00\x00" /* Handle HCI Event packets */ -static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb) +static void hci_cc_inquiry_cancel(struct hci_dev *hdev, struct sk_buff *skb, + u8 *new_status) { __u8 status = *((__u8 *) skb->data); BT_DBG("%s status 0x%2.2x", hdev->name, status); + /* It is possible that we receive Inquiry Complete event right + * before we receive Inquiry Cancel Command Complete event, in + * which case the latter event should have status of Command + * Disallowed (0x0c). This should not be treated as error, since + * we actually achieve what Inquiry Cancel wants to achieve, + * which is to end the last Inquiry session. + */ + if (status == 0x0c && !test_bit(HCI_INQUIRY, &hdev->flags)) { + bt_dev_warn(hdev, "Ignoring error of Inquiry Cancel command"); + status = 0x00; + } + + *new_status = status; + if (status) return; @@ -746,6 +762,23 @@ static void hci_cc_read_bd_addr(struct hci_dev *hdev, struct sk_buff *skb) bacpy(&hdev->setup_addr, &rp->bdaddr); } +static void hci_cc_read_local_pairing_opts(struct hci_dev *hdev, + struct sk_buff *skb) +{ + struct hci_rp_read_local_pairing_opts *rp = (void *) skb->data; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + + if (rp->status) + return; + + if (hci_dev_test_flag(hdev, HCI_SETUP) || + hci_dev_test_flag(hdev, HCI_CONFIG)) { + hdev->pairing_opts = rp->pairing_opts; + hdev->max_enc_key_size = rp->max_key_size; + } +} + static void hci_cc_read_page_scan_activity(struct hci_dev *hdev, struct sk_buff *skb) { @@ -2607,8 +2640,16 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) if (ev->status) { hci_connect_cfm(conn, ev->status); hci_conn_del(conn); - } else if (ev->link_type != ACL_LINK) + } else if (ev->link_type == SCO_LINK) { + switch (conn->setting & SCO_AIRMODE_MASK) { + case SCO_AIRMODE_CVSD: + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_CVSD); + break; + } + hci_connect_cfm(conn, ev->status); + } unlock: hci_dev_unlock(hdev); @@ -2890,7 +2931,7 @@ static void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) &cp); } else { clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); - hci_encrypt_cfm(conn, ev->status, 0x00); + hci_encrypt_cfm(conn, ev->status); } } @@ -2975,22 +3016,7 @@ static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status, conn->enc_key_size = rp->key_size; } - if (conn->state == BT_CONFIG) { - conn->state = BT_CONNECTED; - hci_connect_cfm(conn, 0); - hci_conn_drop(conn); - } else { - u8 encrypt; - - if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags)) - encrypt = 0x00; - else if (test_bit(HCI_CONN_AES_CCM, &conn->flags)) - encrypt = 0x02; - else - encrypt = 0x01; - - hci_encrypt_cfm(conn, 0, encrypt); - } + hci_encrypt_cfm(conn, 0); unlock: hci_dev_unlock(hdev); @@ -3108,14 +3134,7 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) } notify: - if (conn->state == BT_CONFIG) { - if (!ev->status) - conn->state = BT_CONNECTED; - - hci_connect_cfm(conn, ev->status); - hci_conn_drop(conn); - } else - hci_encrypt_cfm(conn, ev->status, ev->encrypt); + hci_encrypt_cfm(conn, ev->status); unlock: hci_dev_unlock(hdev); @@ -3207,7 +3226,7 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, switch (*opcode) { case HCI_OP_INQUIRY_CANCEL: - hci_cc_inquiry_cancel(hdev, skb); + hci_cc_inquiry_cancel(hdev, skb, status); break; case HCI_OP_PERIODIC_INQ: @@ -3334,6 +3353,10 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_read_bd_addr(hdev, skb); break; + case HCI_OP_READ_LOCAL_PAIRING_OPTS: + hci_cc_read_local_pairing_opts(hdev, skb); + break; + case HCI_OP_READ_PAGE_SCAN_ACTIVITY: hci_cc_read_page_scan_activity(hdev, skb); break; @@ -4292,6 +4315,7 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, case 0x11: /* Unsupported Feature or Parameter Value */ case 0x1c: /* SCO interval rejected */ case 0x1a: /* Unsupported Remote Feature */ + case 0x1e: /* Invalid LMP Parameters */ case 0x1f: /* Unspecified error */ case 0x20: /* Unsupported LMP Parameter value */ if (conn->out) { @@ -4307,6 +4331,19 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, break; } + bt_dev_dbg(hdev, "SCO connected with air mode: %02x", ev->air_mode); + + switch (conn->setting & SCO_AIRMODE_MASK) { + case SCO_AIRMODE_CVSD: + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_CVSD); + break; + case SCO_AIRMODE_TRANSP: + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_ENABLE_SCO_TRANSP); + break; + } + hci_connect_cfm(conn, ev->status); if (ev->status) hci_conn_del(conn); @@ -5245,7 +5282,9 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, /* Most controller will fail if we try to create new connections * while we have an existing one in slave role. */ - if (hdev->conn_hash.le_num_slave > 0) + if (hdev->conn_hash.le_num_slave > 0 && + (!test_bit(HCI_QUIRK_VALID_LE_STATES, &hdev->quirks) || + !(hdev->le_states[3] & 0x10))) return NULL; /* If we're not connectable only connect devices that we have in @@ -5269,7 +5308,7 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, case HCI_AUTO_CONN_ALWAYS: /* Devices advertising with ADV_IND or ADV_DIRECT_IND * are triggering a connection attempt. This means - * that incoming connectioms from slave device are + * that incoming connections from slave device are * accepted and also outgoing connections to slave * devices are established when found. */ @@ -5353,7 +5392,8 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, /* Adjust for actual length */ if (len != real_len) { - bt_dev_err_ratelimited(hdev, "advertising data len corrected"); + bt_dev_err_ratelimited(hdev, "advertising data len corrected %u -> %u", + len, real_len); len = real_len; } @@ -6145,6 +6185,10 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) hci_num_comp_blocks_evt(hdev, skb); break; + case HCI_EV_VENDOR: + msft_vendor_evt(hdev, skb); + break; + default: BT_DBG("%s event 0x%2.2x", hdev->name, event); break; diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index 649e1e5ed446..1fc55685da62 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -35,7 +35,7 @@ #define HCI_REQ_CANCELED 2 #define LE_SUSPEND_SCAN_WINDOW 0x0012 -#define LE_SUSPEND_SCAN_INTERVAL 0x0060 +#define LE_SUSPEND_SCAN_INTERVAL 0x0400 void hci_req_init(struct hci_request *req, struct hci_dev *hdev) { @@ -890,7 +890,7 @@ void hci_req_add_le_passive_scan(struct hci_request *req) struct hci_dev *hdev = req->hdev; u8 own_addr_type; u8 filter_policy; - u8 window, interval; + u16 window, interval; if (hdev->scanning_paused) { bt_dev_dbg(hdev, "Scanning is paused for suspend"); @@ -1090,7 +1090,7 @@ void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next) disconnect_counter); set_bit(SUSPEND_DISCONNECTING, hdev->suspend_tasks); } - } else if (next == BT_SUSPEND_COMPLETE) { + } else if (next == BT_SUSPEND_CONFIGURE_WAKE) { /* Unpause to take care of updating scanning params */ hdev->scanning_paused = false; /* Enable event filter for paired devices */ @@ -1447,7 +1447,7 @@ void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); hdev->scan_rsp_data_len = len; - cp.handle = 0; + cp.handle = instance; cp.length = len; cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; @@ -1591,7 +1591,7 @@ void __hci_req_update_adv_data(struct hci_request *req, u8 instance) hdev->adv_data_len = len; cp.length = len; - cp.handle = 0; + cp.handle = instance; cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; @@ -1876,7 +1876,7 @@ int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) memset(&cp, 0, sizeof(cp)); - cp.handle = 0; + cp.handle = instance; bacpy(&cp.bdaddr, &random_addr); hci_req_add(req, @@ -2723,6 +2723,8 @@ static int active_scan(struct hci_request *req, unsigned long opt) uint16_t interval = opt; struct hci_dev *hdev = req->hdev; u8 own_addr_type; + /* White list is not used for discovery */ + u8 filter_policy = 0x00; int err; BT_DBG("%s", hdev->name); @@ -2744,7 +2746,7 @@ static int active_scan(struct hci_request *req, unsigned long opt) own_addr_type = ADDR_LE_DEV_PUBLIC; hci_req_start_scan(req, LE_SCAN_ACTIVE, interval, DISCOV_LE_SCAN_WIN, - own_addr_type, 0); + own_addr_type, filter_policy); return 0; } diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 9c4a093f8960..caf38a8ea6a8 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1579,11 +1579,13 @@ static int hci_mgmt_cmd(struct hci_mgmt_chan *chan, struct sock *sk, } } - no_hdev = (handler->flags & HCI_MGMT_NO_HDEV); - if (no_hdev != !hdev) { - err = mgmt_cmd_status(sk, index, opcode, - MGMT_STATUS_INVALID_INDEX); - goto done; + if (!(handler->flags & HCI_MGMT_HDEV_OPTIONAL)) { + no_hdev = (handler->flags & HCI_MGMT_NO_HDEV); + if (no_hdev != !hdev) { + err = mgmt_cmd_status(sk, index, opcode, + MGMT_STATUS_INVALID_INDEX); + goto done; + } } var_len = (handler->flags & HCI_MGMT_VAR_LEN); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index fd9d0d08f9c9..fe913a5c754a 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -5927,7 +5927,7 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, if (!enable_ecred) return -EINVAL; - if (cmd_len < sizeof(*req) || cmd_len - sizeof(*req) % sizeof(u16)) { + if (cmd_len < sizeof(*req) || (cmd_len - sizeof(*req)) % sizeof(u16)) { result = L2CAP_CR_LE_INVALID_PARAMS; goto response; } @@ -5964,7 +5964,7 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn, } result = L2CAP_CR_LE_SUCCESS; - cmd_len -= sizeof(req); + cmd_len -= sizeof(*req); num_scid = cmd_len / sizeof(u16); for (i = 0; i < num_scid; i++) { diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 117ba20ea194..a995d2c51fa7 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -395,6 +395,24 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, return sizeof(struct sockaddr_l2); } +static int l2cap_get_mode(struct l2cap_chan *chan) +{ + switch (chan->mode) { + case L2CAP_MODE_BASIC: + return BT_MODE_BASIC; + case L2CAP_MODE_ERTM: + return BT_MODE_ERTM; + case L2CAP_MODE_STREAMING: + return BT_MODE_STREAMING; + case L2CAP_MODE_LE_FLOWCTL: + return BT_MODE_LE_FLOWCTL; + case L2CAP_MODE_EXT_FLOWCTL: + return BT_MODE_EXT_FLOWCTL; + } + + return -EINVAL; +} + static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen) { @@ -424,6 +442,20 @@ static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, break; } + /* Only BR/EDR modes are supported here */ + switch (chan->mode) { + case L2CAP_MODE_BASIC: + case L2CAP_MODE_ERTM: + case L2CAP_MODE_STREAMING: + break; + default: + err = -EINVAL; + break; + } + + if (err < 0) + break; + memset(&opts, 0, sizeof(opts)); opts.imtu = chan->imtu; opts.omtu = chan->omtu; @@ -508,7 +540,7 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, struct bt_security sec; struct bt_power pwr; u32 phys; - int len, err = 0; + int len, mode, err = 0; BT_DBG("sk %p", sk); @@ -624,6 +656,27 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, err = -EFAULT; break; + case BT_MODE: + if (!enable_ecred) { + err = -ENOPROTOOPT; + break; + } + + if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { + err = -EINVAL; + break; + } + + mode = l2cap_get_mode(chan); + if (mode < 0) { + err = mode; + break; + } + + if (put_user(mode, (u8 __user *) optval)) + err = -EFAULT; + break; + default: err = -ENOPROTOOPT; break; @@ -698,10 +751,8 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, break; } - chan->mode = opts.mode; - switch (chan->mode) { - case L2CAP_MODE_LE_FLOWCTL: - break; + /* Only BR/EDR modes are supported here */ + switch (opts.mode) { case L2CAP_MODE_BASIC: clear_bit(CONF_STATE2_DEVICE, &chan->conf_state); break; @@ -715,6 +766,11 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, break; } + if (err < 0) + break; + + chan->mode = opts.mode; + BT_DBG("mode 0x%2.2x", chan->mode); chan->imtu = opts.imtu; @@ -763,6 +819,45 @@ static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, return err; } +static int l2cap_set_mode(struct l2cap_chan *chan, u8 mode) +{ + switch (mode) { + case BT_MODE_BASIC: + if (bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_BASIC; + clear_bit(CONF_STATE2_DEVICE, &chan->conf_state); + break; + case BT_MODE_ERTM: + if (!disable_ertm || bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_ERTM; + break; + case BT_MODE_STREAMING: + if (!disable_ertm || bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_STREAMING; + break; + case BT_MODE_LE_FLOWCTL: + if (!bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_LE_FLOWCTL; + break; + case BT_MODE_EXT_FLOWCTL: + /* TODO: Add support for ECRED PDUs to BR/EDR */ + if (!bdaddr_type_is_le(chan->src_type)) + return -EINVAL; + mode = L2CAP_MODE_EXT_FLOWCTL; + break; + default: + return -EINVAL; + } + + chan->mode = mode; + + return 0; +} + static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { @@ -968,6 +1063,39 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; + case BT_MODE: + if (!enable_ecred) { + err = -ENOPROTOOPT; + break; + } + + BT_DBG("sk->sk_state %u", sk->sk_state); + + if (sk->sk_state != BT_BOUND) { + err = -EINVAL; + break; + } + + if (chan->chan_type != L2CAP_CHAN_CONN_ORIENTED) { + err = -EINVAL; + break; + } + + if (get_user(opt, (u8 __user *) optval)) { + err = -EFAULT; + break; + } + + BT_DBG("opt %u", opt); + + err = l2cap_set_mode(chan, opt); + if (err) + break; + + BT_DBG("mode 0x%2.2x", chan->mode); + + break; + default: err = -ENOPROTOOPT; break; @@ -1143,14 +1271,21 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) struct l2cap_conn *conn; int err = 0; - BT_DBG("sock %p, sk %p", sock, sk); + BT_DBG("sock %p, sk %p, how %d", sock, sk, how); + + /* 'how' parameter is mapped to sk_shutdown as follows: + * SHUT_RD (0) --> RCV_SHUTDOWN (1) + * SHUT_WR (1) --> SEND_SHUTDOWN (2) + * SHUT_RDWR (2) --> SHUTDOWN_MASK (3) + */ + how++; if (!sk) return 0; lock_sock(sk); - if (sk->sk_shutdown) + if ((sk->sk_shutdown & how) == how) goto shutdown_already; BT_DBG("Handling sock shutdown"); @@ -1173,11 +1308,20 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) * has already been actioned to close the L2CAP * link such as by l2cap_disconnection_req(). */ - if (sk->sk_shutdown) - goto has_shutdown; + if ((sk->sk_shutdown & how) == how) + goto shutdown_matched; } - sk->sk_shutdown = SHUTDOWN_MASK; + /* Try setting the RCV_SHUTDOWN bit, return early if SEND_SHUTDOWN + * is already set + */ + if ((how & RCV_SHUTDOWN) && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + sk->sk_shutdown |= RCV_SHUTDOWN; + if ((sk->sk_shutdown & how) == how) + goto shutdown_matched; + } + + sk->sk_shutdown |= SEND_SHUTDOWN; release_sock(sk); l2cap_chan_lock(chan); @@ -1207,7 +1351,7 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); -has_shutdown: +shutdown_matched: l2cap_chan_put(chan); sock_put(sk); @@ -1235,7 +1379,7 @@ static int l2cap_sock_release(struct socket *sock) bt_sock_unlink(&l2cap_sk_list, sk); - err = l2cap_sock_shutdown(sock, 2); + err = l2cap_sock_shutdown(sock, SHUT_RDWR); chan = l2cap_pi(sk)->chan; l2cap_chan_hold(chan); diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c index c09e0a3a0ed9..5326f41a58b7 100644 --- a/net/bluetooth/lib.c +++ b/net/bluetooth/lib.c @@ -183,6 +183,39 @@ void bt_err(const char *format, ...) } EXPORT_SYMBOL(bt_err); +#ifdef CONFIG_BT_FEATURE_DEBUG +static bool debug_enable; + +void bt_dbg_set(bool enable) +{ + debug_enable = enable; +} + +bool bt_dbg_get(void) +{ + return debug_enable; +} + +void bt_dbg(const char *format, ...) +{ + struct va_format vaf; + va_list args; + + if (likely(!debug_enable)) + return; + + va_start(args, format); + + vaf.fmt = format; + vaf.va = &args; + + printk(KERN_DEBUG pr_fmt("%pV"), &vaf); + + va_end(args); +} +EXPORT_SYMBOL(bt_dbg); +#endif + void bt_warn_ratelimited(const char *format, ...) { struct va_format vaf; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 6552003a170e..9e8a3cccc6ca 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -38,7 +38,7 @@ #include "mgmt_util.h" #define MGMT_VERSION 1 -#define MGMT_REVISION 16 +#define MGMT_REVISION 17 static const u16 mgmt_commands[] = { MGMT_OP_READ_INDEX_LIST, @@ -108,6 +108,9 @@ static const u16 mgmt_commands[] = { MGMT_OP_SET_APPEARANCE, MGMT_OP_SET_BLOCKED_KEYS, MGMT_OP_SET_WIDEBAND_SPEECH, + MGMT_OP_READ_SECURITY_INFO, + MGMT_OP_READ_EXP_FEATURES_INFO, + MGMT_OP_SET_EXP_FEATURE, }; static const u16 mgmt_events[] = { @@ -146,6 +149,8 @@ static const u16 mgmt_events[] = { MGMT_EV_ADVERTISING_ADDED, MGMT_EV_ADVERTISING_REMOVED, MGMT_EV_EXT_INFO_CHANGED, + MGMT_EV_PHY_CONFIGURATION_CHANGED, + MGMT_EV_EXP_FEATURE_CHANGED, }; static const u16 mgmt_untrusted_commands[] = { @@ -155,6 +160,8 @@ static const u16 mgmt_untrusted_commands[] = { MGMT_OP_READ_CONFIG_INFO, MGMT_OP_READ_EXT_INDEX_LIST, MGMT_OP_READ_EXT_INFO, + MGMT_OP_READ_SECURITY_INFO, + MGMT_OP_READ_EXP_FEATURES_INFO, }; static const u16 mgmt_untrusted_events[] = { @@ -169,6 +176,7 @@ static const u16 mgmt_untrusted_events[] = { MGMT_EV_EXT_INDEX_ADDED, MGMT_EV_EXT_INDEX_REMOVED, MGMT_EV_EXT_INFO_CHANGED, + MGMT_EV_EXP_FEATURE_CHANGED, }; #define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000) @@ -291,7 +299,7 @@ static int read_version(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_rp_read_version rp; - BT_DBG("sock %p", sk); + bt_dev_dbg(hdev, "sock %p", sk); mgmt_fill_version_info(&rp); @@ -307,7 +315,7 @@ static int read_commands(struct sock *sk, struct hci_dev *hdev, void *data, size_t rp_size; int i, err; - BT_DBG("sock %p", sk); + bt_dev_dbg(hdev, "sock %p", sk); if (hci_sock_test_flag(sk, HCI_SOCK_TRUSTED)) { num_commands = ARRAY_SIZE(mgmt_commands); @@ -360,7 +368,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, u16 count; int err; - BT_DBG("sock %p", sk); + bt_dev_dbg(hdev, "sock %p", sk); read_lock(&hci_dev_list_lock); @@ -394,7 +402,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, if (d->dev_type == HCI_PRIMARY && !hci_dev_test_flag(d, HCI_UNCONFIGURED)) { rp->index[count++] = cpu_to_le16(d->id); - BT_DBG("Added hci%u", d->id); + bt_dev_dbg(hdev, "Added hci%u", d->id); } } @@ -420,7 +428,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, u16 count; int err; - BT_DBG("sock %p", sk); + bt_dev_dbg(hdev, "sock %p", sk); read_lock(&hci_dev_list_lock); @@ -454,7 +462,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, if (d->dev_type == HCI_PRIMARY && hci_dev_test_flag(d, HCI_UNCONFIGURED)) { rp->index[count++] = cpu_to_le16(d->id); - BT_DBG("Added hci%u", d->id); + bt_dev_dbg(hdev, "Added hci%u", d->id); } } @@ -479,7 +487,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, u16 count; int err; - BT_DBG("sock %p", sk); + bt_dev_dbg(hdev, "sock %p", sk); read_lock(&hci_dev_list_lock); @@ -521,7 +529,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, rp->entry[count].bus = d->bus; rp->entry[count++].index = cpu_to_le16(d->id); - BT_DBG("Added hci%u", d->id); + bt_dev_dbg(hdev, "Added hci%u", d->id); } rp->num_controllers = cpu_to_le16(count); @@ -597,7 +605,7 @@ static int read_config_info(struct sock *sk, struct hci_dev *hdev, struct mgmt_rp_read_config_info rp; u32 options = 0; - BT_DBG("sock %p %s", sk, hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -937,7 +945,7 @@ static void rpa_expired(struct work_struct *work) rpa_expired.work); struct hci_request req; - BT_DBG(""); + bt_dev_dbg(hdev, ""); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); @@ -977,7 +985,7 @@ static int read_controller_info(struct sock *sk, struct hci_dev *hdev, { struct mgmt_rp_read_info rp; - BT_DBG("sock %p %s", sk, hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -1033,7 +1041,7 @@ static int read_ext_controller_info(struct sock *sk, struct hci_dev *hdev, struct mgmt_rp_read_ext_info *rp = (void *)buf; u16 eir_len; - BT_DBG("sock %p %s", sk, hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); memset(&buf, 0, sizeof(buf)); @@ -1092,7 +1100,7 @@ static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev) static void clean_up_hci_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - BT_DBG("%s status 0x%02x", hdev->name, status); + bt_dev_dbg(hdev, "status 0x%02x", status); if (hci_conn_count(hdev) == 0) { cancel_delayed_work(&hdev->power_off); @@ -1168,7 +1176,7 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, struct mgmt_pending_cmd *cmd; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (cp->val != 0x00 && cp->val != 0x01) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_POWERED, @@ -1309,7 +1317,7 @@ void mgmt_set_discoverable_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); hci_dev_lock(hdev); @@ -1348,7 +1356,7 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data, u16 timeout; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) && !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) @@ -1474,7 +1482,7 @@ void mgmt_set_connectable_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); hci_dev_lock(hdev); @@ -1534,7 +1542,7 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data, struct mgmt_pending_cmd *cmd; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) && !hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) @@ -1591,7 +1599,7 @@ static int set_bondable(struct sock *sk, struct hci_dev *hdev, void *data, bool changed; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (cp->val != 0x00 && cp->val != 0x01) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BONDABLE, @@ -1635,7 +1643,7 @@ static int set_link_security(struct sock *sk, struct hci_dev *hdev, void *data, u8 val, status; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); status = mgmt_bredr_support(hdev); if (status) @@ -1703,7 +1711,7 @@ static int set_ssp(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) u8 status; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); status = mgmt_bredr_support(hdev); if (status) @@ -1784,7 +1792,7 @@ static int set_hs(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) u8 status; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); status = mgmt_bredr_support(hdev); if (status) @@ -1890,7 +1898,7 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) int err; u8 val, enabled; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_LE, @@ -2051,7 +2059,7 @@ unlock: static void add_uuid_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); mgmt_class_complete(hdev, MGMT_OP_ADD_UUID, status); } @@ -2064,7 +2072,7 @@ static int add_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) struct bt_uuid *uuid; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -2130,7 +2138,7 @@ static bool enable_service_cache(struct hci_dev *hdev) static void remove_uuid_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); mgmt_class_complete(hdev, MGMT_OP_REMOVE_UUID, status); } @@ -2145,7 +2153,7 @@ static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_request req; int err, found; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -2216,7 +2224,7 @@ unlock: static void set_class_complete(struct hci_dev *hdev, u8 status, u16 opcode) { - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); mgmt_class_complete(hdev, MGMT_OP_SET_DEV_CLASS, status); } @@ -2229,7 +2237,7 @@ static int set_dev_class(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_request req; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_bredr_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEV_CLASS, @@ -2302,7 +2310,7 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, bool changed; int i; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_bredr_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, @@ -2328,8 +2336,8 @@ static int load_link_keys(struct sock *sk, struct hci_dev *hdev, void *data, return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LINK_KEYS, MGMT_STATUS_INVALID_PARAMS); - BT_DBG("%s debug_keys %u key_count %u", hdev->name, cp->debug_keys, - key_count); + bt_dev_dbg(hdev, "debug_keys %u key_count %u", cp->debug_keys, + key_count); for (i = 0; i < key_count; i++) { struct mgmt_link_key_info *key = &cp->keys[i]; @@ -2530,7 +2538,7 @@ static int disconnect(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_conn *conn; int err; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); memset(&rp, 0, sizeof(rp)); bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); @@ -2614,7 +2622,7 @@ static int get_connections(struct sock *sk, struct hci_dev *hdev, void *data, int err; u16 i; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -2690,7 +2698,7 @@ static int pin_code_reply(struct sock *sk, struct hci_dev *hdev, void *data, struct mgmt_pending_cmd *cmd; int err; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -2748,7 +2756,7 @@ static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_set_io_capability *cp = data; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); if (cp->io_capability > SMP_IO_KEYBOARD_DISPLAY) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_IO_CAPABILITY, @@ -2758,8 +2766,7 @@ static int set_io_capability(struct sock *sk, struct hci_dev *hdev, void *data, hdev->io_capability = cp->io_capability; - BT_DBG("%s IO capability set to 0x%02x", hdev->name, - hdev->io_capability); + bt_dev_dbg(hdev, "IO capability set to 0x%02x", hdev->io_capability); hci_dev_unlock(hdev); @@ -2871,7 +2878,7 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_conn *conn; int err; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); memset(&rp, 0, sizeof(rp)); bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); @@ -3000,7 +3007,7 @@ static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_conn *conn; int err; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -3111,7 +3118,7 @@ static int pin_code_neg_reply(struct sock *sk, struct hci_dev *hdev, { struct mgmt_cp_pin_code_neg_reply *cp = data; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); return user_pairing_resp(sk, hdev, &cp->addr, MGMT_OP_PIN_CODE_NEG_REPLY, @@ -3123,7 +3130,7 @@ static int user_confirm_reply(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_user_confirm_reply *cp = data; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); if (len != sizeof(*cp)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_USER_CONFIRM_REPLY, @@ -3139,7 +3146,7 @@ static int user_confirm_neg_reply(struct sock *sk, struct hci_dev *hdev, { struct mgmt_cp_user_confirm_neg_reply *cp = data; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); return user_pairing_resp(sk, hdev, &cp->addr, MGMT_OP_USER_CONFIRM_NEG_REPLY, @@ -3151,7 +3158,7 @@ static int user_passkey_reply(struct sock *sk, struct hci_dev *hdev, void *data, { struct mgmt_cp_user_passkey_reply *cp = data; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); return user_pairing_resp(sk, hdev, &cp->addr, MGMT_OP_USER_PASSKEY_REPLY, @@ -3163,7 +3170,7 @@ static int user_passkey_neg_reply(struct sock *sk, struct hci_dev *hdev, { struct mgmt_cp_user_passkey_neg_reply *cp = data; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); return user_pairing_resp(sk, hdev, &cp->addr, MGMT_OP_USER_PASSKEY_NEG_REPLY, @@ -3204,7 +3211,7 @@ static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode) struct mgmt_cp_set_local_name *cp; struct mgmt_pending_cmd *cmd; - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); hci_dev_lock(hdev); @@ -3239,7 +3246,7 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_request req; int err; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -3308,7 +3315,7 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data, u16 appearance; int err; - BT_DBG(""); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_APPEARANCE, @@ -3340,7 +3347,7 @@ static int get_phy_configuration(struct sock *sk, struct hci_dev *hdev, { struct mgmt_rp_get_phy_confguration rp; - BT_DBG("sock %p %s", sk, hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -3373,7 +3380,7 @@ static void set_default_phy_complete(struct hci_dev *hdev, u8 status, { struct mgmt_pending_cmd *cmd; - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); hci_dev_lock(hdev); @@ -3411,7 +3418,7 @@ static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev, bool changed = false; int err; - BT_DBG("sock %p %s", sk, hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); configurable_phys = get_configurable_phys(hdev); supported_phys = get_supported_phys(hdev); @@ -3564,7 +3571,7 @@ static int set_blocked_keys(struct sock *sk, struct hci_dev *hdev, void *data, u16 key_count, expected_len; int i; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); key_count = __le16_to_cpu(keys->key_count); if (key_count > max_key_count) { @@ -3610,7 +3617,7 @@ static int set_wideband_speech(struct sock *sk, struct hci_dev *hdev, int err; bool changed = false; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!test_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks)) return mgmt_cmd_status(sk, hdev->id, @@ -3659,6 +3666,189 @@ unlock: return err; } +static int read_security_info(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + char buf[16]; + struct mgmt_rp_read_security_info *rp = (void *)buf; + u16 sec_len = 0; + u8 flags = 0; + + bt_dev_dbg(hdev, "sock %p", sk); + + memset(&buf, 0, sizeof(buf)); + + hci_dev_lock(hdev); + + /* When the Read Simple Pairing Options command is supported, then + * the remote public key validation is supported. + */ + if (hdev->commands[41] & 0x08) + flags |= 0x01; /* Remote public key validation (BR/EDR) */ + + flags |= 0x02; /* Remote public key validation (LE) */ + + /* When the Read Encryption Key Size command is supported, then the + * encryption key size is enforced. + */ + if (hdev->commands[20] & 0x10) + flags |= 0x04; /* Encryption key size enforcement (BR/EDR) */ + + flags |= 0x08; /* Encryption key size enforcement (LE) */ + + sec_len = eir_append_data(rp->sec, sec_len, 0x01, &flags, 1); + + /* When the Read Simple Pairing Options command is supported, then + * also max encryption key size information is provided. + */ + if (hdev->commands[41] & 0x08) + sec_len = eir_append_le16(rp->sec, sec_len, 0x02, + hdev->max_enc_key_size); + + sec_len = eir_append_le16(rp->sec, sec_len, 0x03, SMP_MAX_ENC_KEY_SIZE); + + rp->sec_len = cpu_to_le16(sec_len); + + hci_dev_unlock(hdev); + + return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_READ_SECURITY_INFO, 0, + rp, sizeof(*rp) + sec_len); +} + +#ifdef CONFIG_BT_FEATURE_DEBUG +/* d4992530-b9ec-469f-ab01-6c481c47da1c */ +static const u8 debug_uuid[16] = { + 0x1c, 0xda, 0x47, 0x1c, 0x48, 0x6c, 0x01, 0xab, + 0x9f, 0x46, 0xec, 0xb9, 0x30, 0x25, 0x99, 0xd4, +}; +#endif + +static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + char buf[42]; + struct mgmt_rp_read_exp_features_info *rp = (void *)buf; + u16 idx = 0; + + bt_dev_dbg(hdev, "sock %p", sk); + + memset(&buf, 0, sizeof(buf)); + +#ifdef CONFIG_BT_FEATURE_DEBUG + if (!hdev) { + u32 flags = bt_dbg_get() ? BIT(0) : 0; + + memcpy(rp->features[idx].uuid, debug_uuid, 16); + rp->features[idx].flags = cpu_to_le32(flags); + idx++; + } +#endif + + rp->feature_count = cpu_to_le16(idx); + + /* After reading the experimental features information, enable + * the events to update client on any future change. + */ + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + + return mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE, + MGMT_OP_READ_EXP_FEATURES_INFO, + 0, rp, sizeof(*rp) + (20 * idx)); +} + +#ifdef CONFIG_BT_FEATURE_DEBUG +static int exp_debug_feature_changed(bool enabled, struct sock *skip) +{ + struct mgmt_ev_exp_feature_changed ev; + + memset(&ev, 0, sizeof(ev)); + memcpy(ev.uuid, debug_uuid, 16); + ev.flags = cpu_to_le32(enabled ? BIT(0) : 0); + + return mgmt_limited_event(MGMT_EV_EXP_FEATURE_CHANGED, NULL, + &ev, sizeof(ev), + HCI_MGMT_EXP_FEATURE_EVENTS, skip); +} +#endif + +static int set_exp_feature(struct sock *sk, struct hci_dev *hdev, + void *data, u16 data_len) +{ + struct mgmt_cp_set_exp_feature *cp = data; + struct mgmt_rp_set_exp_feature rp; + + bt_dev_dbg(hdev, "sock %p", sk); + + if (!memcmp(cp->uuid, ZERO_KEY, 16)) { + memset(rp.uuid, 0, 16); + rp.flags = cpu_to_le32(0); + +#ifdef CONFIG_BT_FEATURE_DEBUG + if (!hdev) { + bool changed = bt_dbg_get(); + + bt_dbg_set(false); + + if (changed) + exp_debug_feature_changed(false, sk); + } +#endif + + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + + return mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); + } + +#ifdef CONFIG_BT_FEATURE_DEBUG + if (!memcmp(cp->uuid, debug_uuid, 16)) { + bool val, changed; + int err; + + /* Command requires to use the non-controller index */ + if (hdev) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_INDEX); + + /* Parameters are limited to a single octet */ + if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + /* Only boolean on/off is supported */ + if (cp->param[0] != 0x00 && cp->param[0] != 0x01) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + val = !!cp->param[0]; + changed = val ? !bt_dbg_get() : bt_dbg_get(); + bt_dbg_set(val); + + memcpy(rp.uuid, debug_uuid, 16); + rp.flags = cpu_to_le32(val ? BIT(0) : 0); + + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + + err = mgmt_cmd_complete(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); + + if (changed) + exp_debug_feature_changed(val, sk); + + return err; + } +#endif + + return mgmt_cmd_status(sk, hdev ? hdev->id : MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_NOT_SUPPORTED); +} + static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb) { @@ -3666,7 +3856,7 @@ static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status, size_t rp_size = sizeof(mgmt_rp); struct mgmt_pending_cmd *cmd; - BT_DBG("%s status %u", hdev->name, status); + bt_dev_dbg(hdev, "status %u", status); cmd = pending_find(MGMT_OP_READ_LOCAL_OOB_DATA, hdev); if (!cmd) @@ -3725,7 +3915,7 @@ static int read_local_oob_data(struct sock *sk, struct hci_dev *hdev, struct hci_request req; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -3775,7 +3965,7 @@ static int add_remote_oob_data(struct sock *sk, struct hci_dev *hdev, struct mgmt_addr_info *addr = data; int err; - BT_DBG("%s ", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!bdaddr_type_is_valid(addr->type)) return mgmt_cmd_complete(sk, hdev->id, @@ -3884,7 +4074,7 @@ static int remove_remote_oob_data(struct sock *sk, struct hci_dev *hdev, u8 status; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (cp->addr.type != BDADDR_BREDR) return mgmt_cmd_complete(sk, hdev->id, @@ -3918,7 +4108,7 @@ void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - BT_DBG("status %d", status); + bt_dev_dbg(hdev, "status %d", status); hci_dev_lock(hdev); @@ -3979,7 +4169,7 @@ static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev, u8 status; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -4071,7 +4261,7 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev, u8 status; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -4166,7 +4356,7 @@ void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status) { struct mgmt_pending_cmd *cmd; - BT_DBG("status %d", status); + bt_dev_dbg(hdev, "status %d", status); hci_dev_lock(hdev); @@ -4192,7 +4382,7 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data, struct mgmt_pending_cmd *cmd; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -4234,7 +4424,7 @@ static int confirm_name(struct sock *sk, struct hci_dev *hdev, void *data, struct inquiry_entry *e; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -4276,7 +4466,7 @@ static int block_device(struct sock *sk, struct hci_dev *hdev, void *data, u8 status; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!bdaddr_type_is_valid(cp->addr.type)) return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_BLOCK_DEVICE, @@ -4312,7 +4502,7 @@ static int unblock_device(struct sock *sk, struct hci_dev *hdev, void *data, u8 status; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!bdaddr_type_is_valid(cp->addr.type)) return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_UNBLOCK_DEVICE, @@ -4349,7 +4539,7 @@ static int set_device_id(struct sock *sk, struct hci_dev *hdev, void *data, int err; __u16 source; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); source = __le16_to_cpu(cp->source); @@ -4379,7 +4569,7 @@ static int set_device_id(struct sock *sk, struct hci_dev *hdev, void *data, static void enable_advertising_instance(struct hci_dev *hdev, u8 status, u16 opcode) { - BT_DBG("status %d", status); + bt_dev_dbg(hdev, "status %d", status); } static void set_advertising_complete(struct hci_dev *hdev, u8 status, @@ -4465,7 +4655,7 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, u8 val, status; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); status = mgmt_le_support(hdev); if (status) @@ -4574,7 +4764,7 @@ static int set_static_address(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_set_static_address *cp = data; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_STATIC_ADDRESS, @@ -4619,7 +4809,7 @@ static int set_scan_params(struct sock *sk, struct hci_dev *hdev, __u16 interval, window; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_SCAN_PARAMS, @@ -4674,7 +4864,7 @@ static void fast_connectable_complete(struct hci_dev *hdev, u8 status, { struct mgmt_pending_cmd *cmd; - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); hci_dev_lock(hdev); @@ -4711,7 +4901,7 @@ static int set_fast_connectable(struct sock *sk, struct hci_dev *hdev, struct hci_request req; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) || hdev->hci_ver < BLUETOOTH_VER_1_2) @@ -4772,7 +4962,7 @@ static void set_bredr_complete(struct hci_dev *hdev, u8 status, u16 opcode) { struct mgmt_pending_cmd *cmd; - BT_DBG("status 0x%02x", status); + bt_dev_dbg(hdev, "status 0x%02x", status); hci_dev_lock(hdev); @@ -4807,7 +4997,7 @@ static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) struct hci_request req; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_bredr_capable(hdev) || !lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_BREDR, @@ -4917,7 +5107,7 @@ static void sc_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode) struct mgmt_pending_cmd *cmd; struct mgmt_mode *cp; - BT_DBG("%s status %u", hdev->name, status); + bt_dev_dbg(hdev, "status %u", status); hci_dev_lock(hdev); @@ -4966,7 +5156,7 @@ static int set_secure_conn(struct sock *sk, struct hci_dev *hdev, u8 val; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_sc_capable(hdev) && !hci_dev_test_flag(hdev, HCI_LE_ENABLED)) @@ -5052,7 +5242,7 @@ static int set_debug_keys(struct sock *sk, struct hci_dev *hdev, bool changed, use_changed; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (cp->val != 0x00 && cp->val != 0x01 && cp->val != 0x02) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_DEBUG_KEYS, @@ -5099,7 +5289,7 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data, bool changed; int err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY, @@ -5174,7 +5364,7 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, u16 irk_count, expected_len; int i, err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_IRKS, @@ -5196,7 +5386,7 @@ static int load_irks(struct sock *sk, struct hci_dev *hdev, void *cp_data, MGMT_STATUS_INVALID_PARAMS); } - BT_DBG("%s irk_count %u", hdev->name, irk_count); + bt_dev_dbg(hdev, "irk_count %u", irk_count); for (i = 0; i < irk_count; i++) { struct mgmt_irk_info *key = &cp->irks[i]; @@ -5264,7 +5454,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, u16 key_count, expected_len; int i, err; - BT_DBG("request for %s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_LOAD_LONG_TERM_KEYS, @@ -5286,7 +5476,7 @@ static int load_long_term_keys(struct sock *sk, struct hci_dev *hdev, MGMT_STATUS_INVALID_PARAMS); } - BT_DBG("%s key_count %u", hdev->name, key_count); + bt_dev_dbg(hdev, "key_count %u", key_count); for (i = 0; i < key_count; i++) { struct mgmt_ltk_info *key = &cp->keys[i]; @@ -5387,7 +5577,7 @@ static void conn_info_refresh_complete(struct hci_dev *hdev, u8 hci_status, u16 handle; u8 status; - BT_DBG("status 0x%02x", hci_status); + bt_dev_dbg(hdev, "status 0x%02x", hci_status); hci_dev_lock(hdev); @@ -5441,7 +5631,7 @@ static int get_conn_info(struct sock *sk, struct hci_dev *hdev, void *data, unsigned long conn_info_age; int err = 0; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); memset(&rp, 0, sizeof(rp)); bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); @@ -5595,7 +5785,7 @@ static void get_clock_info_complete(struct hci_dev *hdev, u8 status, u16 opcode) struct mgmt_pending_cmd *cmd; struct hci_conn *conn; - BT_DBG("%s status %u", hdev->name, status); + bt_dev_dbg(hdev, "status %u", status); hci_dev_lock(hdev); @@ -5632,7 +5822,7 @@ static int get_clock_info(struct sock *sk, struct hci_dev *hdev, void *data, struct hci_conn *conn; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); memset(&rp, 0, sizeof(rp)); bacpy(&rp.addr.bdaddr, &cp->addr.bdaddr); @@ -5753,8 +5943,8 @@ static int hci_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr, params->auto_connect = auto_connect; - BT_DBG("addr %pMR (type %u) auto_connect %u", addr, addr_type, - auto_connect); + bt_dev_dbg(hdev, "addr %pMR (type %u) auto_connect %u", + addr, addr_type, auto_connect); return 0; } @@ -5778,7 +5968,7 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, u8 auto_conn, addr_type; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!bdaddr_type_is_valid(cp->addr.type) || !bacmp(&cp->addr.bdaddr, BDADDR_ANY)) @@ -5876,7 +6066,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_remove_device *cp = data; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -5985,7 +6175,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, kfree(p); } - BT_DBG("All LE connection parameters were removed"); + bt_dev_dbg(hdev, "All LE connection parameters were removed"); hci_update_background_scan(hdev); } @@ -6028,7 +6218,7 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, MGMT_STATUS_INVALID_PARAMS); } - BT_DBG("%s param_count %u", hdev->name, param_count); + bt_dev_dbg(hdev, "param_count %u", param_count); hci_dev_lock(hdev); @@ -6040,8 +6230,8 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, u16 min, max, latency, timeout; u8 addr_type; - BT_DBG("Adding %pMR (type %u)", ¶m->addr.bdaddr, - param->addr.type); + bt_dev_dbg(hdev, "Adding %pMR (type %u)", ¶m->addr.bdaddr, + param->addr.type); if (param->addr.type == BDADDR_LE_PUBLIC) { addr_type = ADDR_LE_DEV_PUBLIC; @@ -6057,8 +6247,8 @@ static int load_conn_param(struct sock *sk, struct hci_dev *hdev, void *data, latency = le16_to_cpu(param->latency); timeout = le16_to_cpu(param->timeout); - BT_DBG("min 0x%04x max 0x%04x latency 0x%04x timeout 0x%04x", - min, max, latency, timeout); + bt_dev_dbg(hdev, "min 0x%04x max 0x%04x latency 0x%04x timeout 0x%04x", + min, max, latency, timeout); if (hci_check_conn_params(min, max, latency, timeout) < 0) { bt_dev_err(hdev, "ignoring invalid connection parameters"); @@ -6091,7 +6281,7 @@ static int set_external_config(struct sock *sk, struct hci_dev *hdev, bool changed; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (hdev_is_powered(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, @@ -6147,7 +6337,7 @@ static int set_public_address(struct sock *sk, struct hci_dev *hdev, bool changed; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (hdev_is_powered(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PUBLIC_ADDRESS, @@ -6202,7 +6392,7 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, u8 status, u16 eir_len; int err; - BT_DBG("%s status %u", hdev->name, status); + bt_dev_dbg(hdev, "status %u", status); cmd = pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev); if (!cmd) @@ -6341,7 +6531,7 @@ static int read_local_oob_ext_data(struct sock *sk, struct hci_dev *hdev, u8 status, flags, role, addr[7], hash[16], rand[16]; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (hdev_is_powered(hdev)) { switch (cp->type) { @@ -6528,7 +6718,7 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, u32 supported_flags; u8 *instance; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_READ_ADV_FEATURES, @@ -6671,7 +6861,7 @@ static void add_advertising_complete(struct hci_dev *hdev, u8 status, struct adv_info *adv_instance, *n; u8 instance; - BT_DBG("status %d", status); + bt_dev_dbg(hdev, "status %d", status); hci_dev_lock(hdev); @@ -6730,7 +6920,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, struct mgmt_pending_cmd *cmd; struct hci_request req; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); status = mgmt_le_support(hdev); if (status) @@ -6867,7 +7057,7 @@ static void remove_advertising_complete(struct hci_dev *hdev, u8 status, struct mgmt_cp_remove_advertising *cp; struct mgmt_rp_remove_advertising rp; - BT_DBG("status %d", status); + bt_dev_dbg(hdev, "status %d", status); hci_dev_lock(hdev); @@ -6899,7 +7089,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev, struct hci_request req; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); hci_dev_lock(hdev); @@ -6971,7 +7161,7 @@ static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev, u32 flags, supported_flags; int err; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "sock %p", sk); if (!lmp_le_capable(hdev)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO, @@ -7099,6 +7289,14 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { set_blocked_keys, MGMT_OP_SET_BLOCKED_KEYS_SIZE, HCI_MGMT_VAR_LEN }, { set_wideband_speech, MGMT_SETTING_SIZE }, + { read_security_info, MGMT_READ_SECURITY_INFO_SIZE, + HCI_MGMT_UNTRUSTED }, + { read_exp_features_info, MGMT_READ_EXP_FEATURES_INFO_SIZE, + HCI_MGMT_UNTRUSTED | + HCI_MGMT_HDEV_OPTIONAL }, + { set_exp_feature, MGMT_SET_EXP_FEATURE_SIZE, + HCI_MGMT_VAR_LEN | + HCI_MGMT_HDEV_OPTIONAL }, }; void mgmt_index_added(struct hci_dev *hdev) @@ -7197,7 +7395,7 @@ void mgmt_power_on(struct hci_dev *hdev, int err) { struct cmd_lookup match = { NULL, hdev }; - BT_DBG("err %d", err); + bt_dev_dbg(hdev, "err %d", err); hci_dev_lock(hdev); @@ -7616,7 +7814,7 @@ int mgmt_user_confirm_request(struct hci_dev *hdev, bdaddr_t *bdaddr, { struct mgmt_ev_user_confirm_request ev; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr); bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); @@ -7632,7 +7830,7 @@ int mgmt_user_passkey_request(struct hci_dev *hdev, bdaddr_t *bdaddr, { struct mgmt_ev_user_passkey_request ev; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr); bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); @@ -7693,7 +7891,7 @@ int mgmt_user_passkey_notify(struct hci_dev *hdev, bdaddr_t *bdaddr, { struct mgmt_ev_passkey_notify ev; - BT_DBG("%s", hdev->name); + bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr); bacpy(&ev.addr.bdaddr, bdaddr); ev.addr.type = link_to_bdaddr(link_type, addr_type); @@ -8112,7 +8310,7 @@ void mgmt_discovering(struct hci_dev *hdev, u8 discovering) { struct mgmt_ev_discovering ev; - BT_DBG("%s discovering %u", hdev->name, discovering); + bt_dev_dbg(hdev, "discovering %u", discovering); memset(&ev, 0, sizeof(ev)); ev.type = hdev->discovery.type; diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c new file mode 100644 index 000000000000..d6c4e6b5ae77 --- /dev/null +++ b/net/bluetooth/msft.c @@ -0,0 +1,141 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google Corporation + */ + +#include <net/bluetooth/bluetooth.h> +#include <net/bluetooth/hci_core.h> + +#include "msft.h" + +#define MSFT_OP_READ_SUPPORTED_FEATURES 0x00 +struct msft_cp_read_supported_features { + __u8 sub_opcode; +} __packed; +struct msft_rp_read_supported_features { + __u8 status; + __u8 sub_opcode; + __le64 features; + __u8 evt_prefix_len; + __u8 evt_prefix[0]; +} __packed; + +struct msft_data { + __u64 features; + __u8 evt_prefix_len; + __u8 *evt_prefix; +}; + +static bool read_supported_features(struct hci_dev *hdev, + struct msft_data *msft) +{ + struct msft_cp_read_supported_features cp; + struct msft_rp_read_supported_features *rp; + struct sk_buff *skb; + + cp.sub_opcode = MSFT_OP_READ_SUPPORTED_FEATURES; + + skb = __hci_cmd_sync(hdev, hdev->msft_opcode, sizeof(cp), &cp, + HCI_CMD_TIMEOUT); + if (IS_ERR(skb)) { + bt_dev_err(hdev, "Failed to read MSFT supported features (%ld)", + PTR_ERR(skb)); + return false; + } + + if (skb->len < sizeof(*rp)) { + bt_dev_err(hdev, "MSFT supported features length mismatch"); + goto failed; + } + + rp = (struct msft_rp_read_supported_features *)skb->data; + + if (rp->sub_opcode != MSFT_OP_READ_SUPPORTED_FEATURES) + goto failed; + + if (rp->evt_prefix_len > 0) { + msft->evt_prefix = kmemdup(rp->evt_prefix, rp->evt_prefix_len, + GFP_KERNEL); + if (!msft->evt_prefix) + goto failed; + } + + msft->evt_prefix_len = rp->evt_prefix_len; + msft->features = __le64_to_cpu(rp->features); + + kfree_skb(skb); + return true; + +failed: + kfree_skb(skb); + return false; +} + +void msft_do_open(struct hci_dev *hdev) +{ + struct msft_data *msft; + + if (hdev->msft_opcode == HCI_OP_NOP) + return; + + bt_dev_dbg(hdev, "Initialize MSFT extension"); + + msft = kzalloc(sizeof(*msft), GFP_KERNEL); + if (!msft) + return; + + if (!read_supported_features(hdev, msft)) { + kfree(msft); + return; + } + + hdev->msft_data = msft; +} + +void msft_do_close(struct hci_dev *hdev) +{ + struct msft_data *msft = hdev->msft_data; + + if (!msft) + return; + + bt_dev_dbg(hdev, "Cleanup of MSFT extension"); + + hdev->msft_data = NULL; + + kfree(msft->evt_prefix); + kfree(msft); +} + +void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct msft_data *msft = hdev->msft_data; + u8 event; + + if (!msft) + return; + + /* When the extension has defined an event prefix, check that it + * matches, and otherwise just return. + */ + if (msft->evt_prefix_len > 0) { + if (skb->len < msft->evt_prefix_len) + return; + + if (memcmp(skb->data, msft->evt_prefix, msft->evt_prefix_len)) + return; + + skb_pull(skb, msft->evt_prefix_len); + } + + /* Every event starts at least with an event code and the rest of + * the data is variable and depends on the event code. + */ + if (skb->len < 1) + return; + + event = *skb->data; + skb_pull(skb, 1); + + bt_dev_dbg(hdev, "MSFT vendor event %u", event); +} diff --git a/net/bluetooth/msft.h b/net/bluetooth/msft.h new file mode 100644 index 000000000000..5aa9130e1f8a --- /dev/null +++ b/net/bluetooth/msft.h @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Google Corporation + */ + +#if IS_ENABLED(CONFIG_BT_MSFTEXT) + +void msft_do_open(struct hci_dev *hdev); +void msft_do_close(struct hci_dev *hdev); +void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb); + +#else + +static inline void msft_do_open(struct hci_dev *hdev) {} +static inline void msft_do_close(struct hci_dev *hdev) {} +static inline void msft_vendor_evt(struct hci_dev *hdev, struct sk_buff *skb) {} + +#endif diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index b4eaf21360ef..df14eebe80da 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -64,15 +64,13 @@ static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb) static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) { struct sock *sk = d->owner, *parent; - unsigned long flags; if (!sk) return; BT_DBG("dlc %p state %ld err %d", d, d->state, err); - local_irq_save(flags); - bh_lock_sock(sk); + spin_lock_bh(&sk->sk_lock.slock); if (err) sk->sk_err = err; @@ -93,8 +91,7 @@ static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) sk->sk_state_change(sk); } - bh_unlock_sock(sk); - local_irq_restore(flags); + spin_unlock_bh(&sk->sk_lock.slock); if (parent && sock_flag(sk, SOCK_ZAPPED)) { /* We have to drop DLC lock here, otherwise diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index d022f126eb02..c2c5ab05fa7e 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -504,7 +504,7 @@ bool smp_irk_matches(struct hci_dev *hdev, const u8 irk[16], if (!chan || !chan->data) return false; - BT_DBG("RPA %pMR IRK %*phN", bdaddr, 16, irk); + bt_dev_dbg(hdev, "RPA %pMR IRK %*phN", bdaddr, 16, irk); err = smp_ah(irk, &bdaddr->b[3], hash); if (err) @@ -530,7 +530,7 @@ int smp_generate_rpa(struct hci_dev *hdev, const u8 irk[16], bdaddr_t *rpa) if (err < 0) return err; - BT_DBG("RPA %pMR", rpa); + bt_dev_dbg(hdev, "RPA %pMR", rpa); return 0; } @@ -547,7 +547,7 @@ int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]) smp = chan->data; if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS)) { - BT_DBG("Using debug keys"); + bt_dev_dbg(hdev, "Using debug keys"); err = set_ecdh_privkey(smp->tfm_ecdh, debug_sk); if (err) return err; @@ -726,6 +726,10 @@ static u8 check_enc_key_size(struct l2cap_conn *conn, __u8 max_key_size) struct hci_dev *hdev = conn->hcon->hdev; struct smp_chan *smp = chan->data; + if (conn->hcon->pending_sec_level == BT_SECURITY_FIPS && + max_key_size != SMP_MAX_ENC_KEY_SIZE) + return SMP_ENC_KEY_SIZE; + if (max_key_size > hdev->le_max_key_size || max_key_size < SMP_MIN_ENC_KEY_SIZE) return SMP_ENC_KEY_SIZE; @@ -850,7 +854,7 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, struct l2cap_chan *chan = conn->smp; struct smp_chan *smp = chan->data; u32 passkey = 0; - int ret = 0; + int ret; /* Initialize key for JUST WORKS */ memset(smp->tk, 0, sizeof(smp->tk)); @@ -879,9 +883,16 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth, hcon->io_capability == HCI_IO_NO_INPUT_OUTPUT) smp->method = JUST_WORKS; - /* If Just Works, Continue with Zero TK */ + /* If Just Works, Continue with Zero TK and ask user-space for + * confirmation */ if (smp->method == JUST_WORKS) { - set_bit(SMP_FLAG_TK_VALID, &smp->flags); + ret = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, + hcon->type, + hcon->dst_type, + passkey, 1); + if (ret) + return ret; + set_bit(SMP_FLAG_WAIT_USER, &smp->flags); return 0; } @@ -1856,7 +1867,7 @@ static u8 sc_send_public_key(struct smp_chan *smp) { struct hci_dev *hdev = smp->conn->hcon->hdev; - BT_DBG(""); + bt_dev_dbg(hdev, ""); if (test_bit(SMP_FLAG_LOCAL_OOB, &smp->flags)) { struct l2cap_chan *chan = hdev->smp_data; @@ -2190,7 +2201,7 @@ mackey_and_ltk: if (err) return SMP_UNSPECIFIED; - if (smp->method == JUST_WORKS || smp->method == REQ_OOB) { + if (smp->method == REQ_OOB) { if (hcon->out) { sc_dhkey_check(smp); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); @@ -2205,6 +2216,9 @@ mackey_and_ltk: confirm_hint = 0; confirm: + if (smp->method == JUST_WORKS) + confirm_hint = 1; + err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, confirm_hint); if (err) @@ -2381,12 +2395,17 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level) authreq |= SMP_AUTH_CT2; } - /* Require MITM if IO Capability allows or the security level - * requires it. + /* Don't attempt to set MITM if setting is overridden by debugfs + * Needed to pass certification test SM/MAS/PKE/BV-01-C */ - if (hcon->io_capability != HCI_IO_NO_INPUT_OUTPUT || - hcon->pending_sec_level > BT_SECURITY_MEDIUM) - authreq |= SMP_AUTH_MITM; + if (!hci_dev_test_flag(hcon->hdev, HCI_FORCE_NO_MITM)) { + /* Require MITM if IO Capability allows or the security level + * requires it. + */ + if (hcon->io_capability != HCI_IO_NO_INPUT_OUTPUT || + hcon->pending_sec_level > BT_SECURITY_MEDIUM) + authreq |= SMP_AUTH_MITM; + } if (hcon->role == HCI_ROLE_MASTER) { struct smp_cmd_pairing cp; diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 29dbdd4c29f6..bfd4ccd80847 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -160,16 +160,20 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 size, u32 headroom, u32 tailroom) { void __user *data_in = u64_to_user_ptr(kattr->test.data_in); + u32 user_size = kattr->test.data_size_in; void *data; if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom) return ERR_PTR(-EINVAL); + if (user_size > size) + return ERR_PTR(-EMSGSIZE); + data = kzalloc(size + headroom + tailroom, GFP_USER); if (!data) return ERR_PTR(-ENOMEM); - if (copy_from_user(data + headroom, data_in, size)) { + if (copy_from_user(data + headroom, data_in, user_size)) { kfree(data); return ERR_PTR(-EFAULT); } @@ -470,25 +474,32 @@ out: int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { + u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + u32 headroom = XDP_PACKET_HEADROOM; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct netdev_rx_queue *rxqueue; struct xdp_buff xdp = {}; u32 retval, duration; + u32 max_data_sz; void *data; int ret; if (kattr->test.ctx_in || kattr->test.ctx_out) return -EINVAL; - data = bpf_test_init(kattr, size, XDP_PACKET_HEADROOM + NET_IP_ALIGN, 0); + /* XDP have extra tailroom as (most) drivers use full page */ + max_data_sz = 4096 - headroom - tailroom; + + data = bpf_test_init(kattr, max_data_sz, headroom, tailroom); if (IS_ERR(data)) return PTR_ERR(data); xdp.data_hard_start = data; - xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN; + xdp.data = data + headroom; xdp.data_meta = xdp.data; xdp.data_end = xdp.data + size; + xdp.frame_sz = headroom + max_data_sz + tailroom; rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); xdp.rxq = &rxqueue->xdp_rxq; @@ -496,8 +507,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); if (ret) goto out; - if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN || - xdp.data_end != xdp.data + size) + if (xdp.data != data + headroom || xdp.data_end != xdp.data + size) size = xdp.data_end - xdp.data; ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); out: diff --git a/net/bridge/Kconfig b/net/bridge/Kconfig index e4fb050e2078..51a6414145d2 100644 --- a/net/bridge/Kconfig +++ b/net/bridge/Kconfig @@ -61,3 +61,15 @@ config BRIDGE_VLAN_FILTERING Say N to exclude this support and reduce the binary size. If unsure, say Y. + +config BRIDGE_MRP + bool "MRP protocol" + depends on BRIDGE + default n + help + If you say Y here, then the Ethernet bridge will be able to run MRP + protocol to detect loops + + Say N to exclude this support and reduce the binary size. + + If unsure, say N. diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 49da7ae6f077..ccb394236fbd 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -25,3 +25,5 @@ bridge-$(CONFIG_BRIDGE_VLAN_FILTERING) += br_vlan.o br_vlan_tunnel.o br_vlan_opt bridge-$(CONFIG_NET_SWITCHDEV) += br_switchdev.o obj-$(CONFIG_NETFILTER) += netfilter/ + +bridge-$(CONFIG_BRIDGE_MRP) += br_mrp_switchdev.o br_mrp.o br_mrp_netlink.o diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c index 37908561a64b..b18cdf03edb3 100644 --- a/net/bridge/br_arp_nd_proxy.c +++ b/net/bridge/br_arp_nd_proxy.c @@ -276,6 +276,10 @@ static void br_nd_send(struct net_bridge *br, struct net_bridge_port *p, ns_olen = request->len - (skb_network_offset(request) + sizeof(struct ipv6hdr)) - sizeof(*ns); for (i = 0; i < ns_olen - 1; i += (ns->opt[i + 1] << 3)) { + if (!ns->opt[i + 1]) { + kfree_skb(reply); + return; + } if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { daddr = ns->opt + i + sizeof(struct nd_opt_hdr); break; diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 0e3dbc5f3c34..8ec1362588af 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -463,6 +463,9 @@ void br_dev_setup(struct net_device *dev) spin_lock_init(&br->lock); INIT_LIST_HEAD(&br->port_list); INIT_HLIST_HEAD(&br->fdb_list); +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + INIT_LIST_HEAD(&br->mrp_list); +#endif spin_lock_init(&br->hash_lock); br->bridge_id.prio[0] = 0x80; diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 4fe30b182ee7..a0e9a7937412 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -333,6 +333,8 @@ static void del_nbp(struct net_bridge_port *p) br_stp_disable_port(p); spin_unlock_bh(&br->lock); + br_mrp_port_del(br, p); + br_ifinfo_notify(RTM_DELLINK, NULL, p); list_del_rcu(&p->list); @@ -561,18 +563,32 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, unsigned br_hr, dev_hr; bool changed_addr; - /* Don't allow bridging non-ethernet like devices, or DSA-enabled - * master network devices since the bridge layer rx_handler prevents - * the DSA fake ethertype handler to be invoked, so we do not strip off - * the DSA switch tag protocol header and the bridge layer just return - * RX_HANDLER_CONSUMED, stopping RX processing for these frames. - */ + /* Don't allow bridging non-ethernet like devices. */ if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN || - !is_valid_ether_addr(dev->dev_addr) || - netdev_uses_dsa(dev)) + !is_valid_ether_addr(dev->dev_addr)) return -EINVAL; + /* Also don't allow bridging of net devices that are DSA masters, since + * the bridge layer rx_handler prevents the DSA fake ethertype handler + * to be invoked, so we don't get the chance to strip off and parse the + * DSA switch tag protocol header (the bridge layer just returns + * RX_HANDLER_CONSUMED, stopping RX processing for these frames). + * The only case where that would not be an issue is when bridging can + * already be offloaded, such as when the DSA master is itself a DSA + * or plain switchdev port, and is bridged only with other ports from + * the same hardware device. + */ + if (netdev_uses_dsa(dev)) { + list_for_each_entry(p, &br->port_list, list) { + if (!netdev_port_same_parent_id(dev, p->dev)) { + NL_SET_ERR_MSG(extack, + "Cannot do software bridging with a DSA master"); + return -EINVAL; + } + } + } + /* No bridging of bridges */ if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) { NL_SET_ERR_MSG(extack, @@ -616,7 +632,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, if (err) goto err3; - err = netdev_rx_handler_register(dev, br_handle_frame, p); + err = netdev_rx_handler_register(dev, br_get_rx_handler(dev), p); if (err) goto err4; diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index fcc260840028..59a318b9f646 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -17,6 +17,7 @@ #endif #include <linux/neighbour.h> #include <net/arp.h> +#include <net/dsa.h> #include <linux/export.h> #include <linux/rculist.h> #include "br_private.h" @@ -257,7 +258,7 @@ frame_finish: * Return NULL if skb is handled * note: already called with rcu_read_lock */ -rx_handler_result_t br_handle_frame(struct sk_buff **pskb) +static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) { struct net_bridge_port *p; struct sk_buff *skb = *pskb; @@ -342,6 +343,9 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) } } + if (unlikely(br_mrp_process(p, skb))) + return RX_HANDLER_PASS; + forward: switch (p->state) { case BR_STATE_FORWARDING: @@ -356,3 +360,23 @@ drop: } return RX_HANDLER_CONSUMED; } + +/* This function has no purpose other than to appease the br_port_get_rcu/rtnl + * helpers which identify bridged ports according to the rx_handler installed + * on them (so there _needs_ to be a bridge rx_handler even if we don't need it + * to do anything useful). This bridge won't support traffic to/from the stack, + * but only hardware bridging. So return RX_HANDLER_PASS so we don't steal + * frames from the ETH_P_XDSA packet_type handler. + */ +static rx_handler_result_t br_handle_frame_dummy(struct sk_buff **pskb) +{ + return RX_HANDLER_PASS; +} + +rx_handler_func_t *br_get_rx_handler(const struct net_device *dev) +{ + if (netdev_uses_dsa(dev)) + return br_handle_frame_dummy; + + return br_handle_frame; +} diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index ae22d784b88a..5e71fc8b826f 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -242,8 +242,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; - br_stp_set_enabled(br, args[1]); - ret = 0; + ret = br_stp_set_enabled(br, args[1], NULL); break; case BRCTL_SET_BRIDGE_PRIORITY: diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c new file mode 100644 index 000000000000..24986ec7d38c --- /dev/null +++ b/net/bridge/br_mrp.c @@ -0,0 +1,677 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <linux/mrp_bridge.h> +#include "br_private_mrp.h" + +static const u8 mrp_test_dmac[ETH_ALEN] = { 0x1, 0x15, 0x4e, 0x0, 0x0, 0x1 }; + +static struct net_bridge_port *br_mrp_get_port(struct net_bridge *br, + u32 ifindex) +{ + struct net_bridge_port *res = NULL; + struct net_bridge_port *port; + + list_for_each_entry(port, &br->port_list, list) { + if (port->dev->ifindex == ifindex) { + res = port; + break; + } + } + + return res; +} + +static struct br_mrp *br_mrp_find_id(struct net_bridge *br, u32 ring_id) +{ + struct br_mrp *res = NULL; + struct br_mrp *mrp; + + list_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { + if (mrp->ring_id == ring_id) { + res = mrp; + break; + } + } + + return res; +} + +static bool br_mrp_unique_ifindex(struct net_bridge *br, u32 ifindex) +{ + struct br_mrp *mrp; + + list_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { + struct net_bridge_port *p; + + p = rtnl_dereference(mrp->p_port); + if (p && p->dev->ifindex == ifindex) + return false; + + p = rtnl_dereference(mrp->s_port); + if (p && p->dev->ifindex == ifindex) + return false; + } + + return true; +} + +static struct br_mrp *br_mrp_find_port(struct net_bridge *br, + struct net_bridge_port *p) +{ + struct br_mrp *res = NULL; + struct br_mrp *mrp; + + list_for_each_entry_rcu(mrp, &br->mrp_list, list, + lockdep_rtnl_is_held()) { + if (rcu_access_pointer(mrp->p_port) == p || + rcu_access_pointer(mrp->s_port) == p) { + res = mrp; + break; + } + } + + return res; +} + +static int br_mrp_next_seq(struct br_mrp *mrp) +{ + mrp->seq_id++; + return mrp->seq_id; +} + +static struct sk_buff *br_mrp_skb_alloc(struct net_bridge_port *p, + const u8 *src, const u8 *dst) +{ + struct ethhdr *eth_hdr; + struct sk_buff *skb; + u16 *version; + + skb = dev_alloc_skb(MRP_MAX_FRAME_LENGTH); + if (!skb) + return NULL; + + skb->dev = p->dev; + skb->protocol = htons(ETH_P_MRP); + skb->priority = MRP_FRAME_PRIO; + skb_reserve(skb, sizeof(*eth_hdr)); + + eth_hdr = skb_push(skb, sizeof(*eth_hdr)); + ether_addr_copy(eth_hdr->h_dest, dst); + ether_addr_copy(eth_hdr->h_source, src); + eth_hdr->h_proto = htons(ETH_P_MRP); + + version = skb_put(skb, sizeof(*version)); + *version = cpu_to_be16(MRP_VERSION); + + return skb; +} + +static void br_mrp_skb_tlv(struct sk_buff *skb, + enum br_mrp_tlv_header_type type, + u8 length) +{ + struct br_mrp_tlv_hdr *hdr; + + hdr = skb_put(skb, sizeof(*hdr)); + hdr->type = type; + hdr->length = length; +} + +static void br_mrp_skb_common(struct sk_buff *skb, struct br_mrp *mrp) +{ + struct br_mrp_common_hdr *hdr; + + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_COMMON, sizeof(*hdr)); + + hdr = skb_put(skb, sizeof(*hdr)); + hdr->seq_id = cpu_to_be16(br_mrp_next_seq(mrp)); + memset(hdr->domain, 0xff, MRP_DOMAIN_UUID_LENGTH); +} + +static struct sk_buff *br_mrp_alloc_test_skb(struct br_mrp *mrp, + struct net_bridge_port *p, + enum br_mrp_port_role_type port_role) +{ + struct br_mrp_ring_test_hdr *hdr = NULL; + struct sk_buff *skb = NULL; + + if (!p) + return NULL; + + skb = br_mrp_skb_alloc(p, p->dev->dev_addr, mrp_test_dmac); + if (!skb) + return NULL; + + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_RING_TEST, sizeof(*hdr)); + hdr = skb_put(skb, sizeof(*hdr)); + + hdr->prio = cpu_to_be16(mrp->prio); + ether_addr_copy(hdr->sa, p->br->dev->dev_addr); + hdr->port_role = cpu_to_be16(port_role); + hdr->state = cpu_to_be16(mrp->ring_state); + hdr->transitions = cpu_to_be16(mrp->ring_transitions); + hdr->timestamp = cpu_to_be32(jiffies_to_msecs(jiffies)); + + br_mrp_skb_common(skb, mrp); + br_mrp_skb_tlv(skb, BR_MRP_TLV_HEADER_END, 0x0); + + return skb; +} + +/* This function is continuously called in the following cases: + * - when node role is MRM, in this case test_monitor is always set to false + * because it needs to notify the userspace that the ring is open and needs to + * send MRP_Test frames + * - when node role is MRA, there are 2 subcases: + * - when MRA behaves as MRM, in this case is similar with MRM role + * - when MRA behaves as MRC, in this case test_monitor is set to true, + * because it needs to detect when it stops seeing MRP_Test frames + * from MRM node but it doesn't need to send MRP_Test frames. + */ +static void br_mrp_test_work_expired(struct work_struct *work) +{ + struct delayed_work *del_work = to_delayed_work(work); + struct br_mrp *mrp = container_of(del_work, struct br_mrp, test_work); + struct net_bridge_port *p; + bool notify_open = false; + struct sk_buff *skb; + + if (time_before_eq(mrp->test_end, jiffies)) + return; + + if (mrp->test_count_miss < mrp->test_max_miss) { + mrp->test_count_miss++; + } else { + /* Notify that the ring is open only if the ring state is + * closed, otherwise it would continue to notify at every + * interval. + * Also notify that the ring is open when the node has the + * role MRA and behaves as MRC. The reason is that the + * userspace needs to know when the MRM stopped sending + * MRP_Test frames so that the current node to try to take + * the role of a MRM. + */ + if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED || + mrp->test_monitor) + notify_open = true; + } + + rcu_read_lock(); + + p = rcu_dereference(mrp->p_port); + if (p) { + if (!mrp->test_monitor) { + skb = br_mrp_alloc_test_skb(mrp, p, + BR_MRP_PORT_ROLE_PRIMARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + } + + if (notify_open && !mrp->ring_role_offloaded) + br_mrp_port_open(p->dev, true); + } + + p = rcu_dereference(mrp->s_port); + if (p) { + if (!mrp->test_monitor) { + skb = br_mrp_alloc_test_skb(mrp, p, + BR_MRP_PORT_ROLE_SECONDARY); + if (!skb) + goto out; + + skb_reset_network_header(skb); + dev_queue_xmit(skb); + } + + if (notify_open && !mrp->ring_role_offloaded) + br_mrp_port_open(p->dev, true); + } + +out: + rcu_read_unlock(); + + queue_delayed_work(system_wq, &mrp->test_work, + usecs_to_jiffies(mrp->test_interval)); +} + +/* Deletes the MRP instance. + * note: called under rtnl_lock + */ +static void br_mrp_del_impl(struct net_bridge *br, struct br_mrp *mrp) +{ + struct net_bridge_port *p; + u8 state; + + /* Stop sending MRP_Test frames */ + cancel_delayed_work_sync(&mrp->test_work); + br_mrp_switchdev_send_ring_test(br, mrp, 0, 0, 0, 0); + + br_mrp_switchdev_del(br, mrp); + + /* Reset the ports */ + p = rtnl_dereference(mrp->p_port); + if (p) { + spin_lock_bh(&br->lock); + state = netif_running(br->dev) ? + BR_STATE_FORWARDING : BR_STATE_DISABLED; + p->state = state; + p->flags &= ~BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + br_mrp_port_switchdev_set_state(p, state); + rcu_assign_pointer(mrp->p_port, NULL); + } + + p = rtnl_dereference(mrp->s_port); + if (p) { + spin_lock_bh(&br->lock); + state = netif_running(br->dev) ? + BR_STATE_FORWARDING : BR_STATE_DISABLED; + p->state = state; + p->flags &= ~BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + br_mrp_port_switchdev_set_state(p, state); + rcu_assign_pointer(mrp->s_port, NULL); + } + + list_del_rcu(&mrp->list); + kfree_rcu(mrp, rcu); +} + +/* Adds a new MRP instance. + * note: called under rtnl_lock + */ +int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance) +{ + struct net_bridge_port *p; + struct br_mrp *mrp; + int err; + + /* If the ring exists, it is not possible to create another one with the + * same ring_id + */ + mrp = br_mrp_find_id(br, instance->ring_id); + if (mrp) + return -EINVAL; + + if (!br_mrp_get_port(br, instance->p_ifindex) || + !br_mrp_get_port(br, instance->s_ifindex)) + return -EINVAL; + + /* It is not possible to have the same port part of multiple rings */ + if (!br_mrp_unique_ifindex(br, instance->p_ifindex) || + !br_mrp_unique_ifindex(br, instance->s_ifindex)) + return -EINVAL; + + mrp = kzalloc(sizeof(*mrp), GFP_KERNEL); + if (!mrp) + return -ENOMEM; + + mrp->ring_id = instance->ring_id; + mrp->prio = instance->prio; + + p = br_mrp_get_port(br, instance->p_ifindex); + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags |= BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + rcu_assign_pointer(mrp->p_port, p); + + p = br_mrp_get_port(br, instance->s_ifindex); + spin_lock_bh(&br->lock); + p->state = BR_STATE_FORWARDING; + p->flags |= BR_MRP_AWARE; + spin_unlock_bh(&br->lock); + rcu_assign_pointer(mrp->s_port, p); + + INIT_DELAYED_WORK(&mrp->test_work, br_mrp_test_work_expired); + list_add_tail_rcu(&mrp->list, &br->mrp_list); + + err = br_mrp_switchdev_add(br, mrp); + if (err) + goto delete_mrp; + + return 0; + +delete_mrp: + br_mrp_del_impl(br, mrp); + + return err; +} + +/* Deletes the MRP instance from which the port is part of + * note: called under rtnl_lock + */ +void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p) +{ + struct br_mrp *mrp = br_mrp_find_port(br, p); + + /* If the port is not part of a MRP instance just bail out */ + if (!mrp) + return; + + br_mrp_del_impl(br, mrp); +} + +/* Deletes existing MRP instance based on ring_id + * note: called under rtnl_lock + */ +int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance) +{ + struct br_mrp *mrp = br_mrp_find_id(br, instance->ring_id); + + if (!mrp) + return -EINVAL; + + br_mrp_del_impl(br, mrp); + + return 0; +} + +/* Set port state, port state can be forwarding, blocked or disabled + * note: already called with rtnl_lock + */ +int br_mrp_set_port_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state) +{ + if (!p || !(p->flags & BR_MRP_AWARE)) + return -EINVAL; + + spin_lock_bh(&p->br->lock); + + if (state == BR_MRP_PORT_STATE_FORWARDING) + p->state = BR_STATE_FORWARDING; + else + p->state = BR_STATE_BLOCKING; + + spin_unlock_bh(&p->br->lock); + + br_mrp_port_switchdev_set_state(p, state); + + return 0; +} + +/* Set port role, port role can be primary or secondary + * note: already called with rtnl_lock + */ +int br_mrp_set_port_role(struct net_bridge_port *p, + enum br_mrp_port_role_type role) +{ + struct br_mrp *mrp; + + if (!p || !(p->flags & BR_MRP_AWARE)) + return -EINVAL; + + mrp = br_mrp_find_port(p->br, p); + + if (!mrp) + return -EINVAL; + + if (role == BR_MRP_PORT_ROLE_PRIMARY) + rcu_assign_pointer(mrp->p_port, p); + else + rcu_assign_pointer(mrp->s_port, p); + + br_mrp_port_switchdev_set_role(p, role); + + return 0; +} + +/* Set ring state, ring state can be only Open or Closed + * note: already called with rtnl_lock + */ +int br_mrp_set_ring_state(struct net_bridge *br, + struct br_mrp_ring_state *state) +{ + struct br_mrp *mrp = br_mrp_find_id(br, state->ring_id); + + if (!mrp) + return -EINVAL; + + if (mrp->ring_state == BR_MRP_RING_STATE_CLOSED && + state->ring_state != BR_MRP_RING_STATE_CLOSED) + mrp->ring_transitions++; + + mrp->ring_state = state->ring_state; + + br_mrp_switchdev_set_ring_state(br, mrp, state->ring_state); + + return 0; +} + +/* Set ring role, ring role can be only MRM(Media Redundancy Manager) or + * MRC(Media Redundancy Client). + * note: already called with rtnl_lock + */ +int br_mrp_set_ring_role(struct net_bridge *br, + struct br_mrp_ring_role *role) +{ + struct br_mrp *mrp = br_mrp_find_id(br, role->ring_id); + int err; + + if (!mrp) + return -EINVAL; + + mrp->ring_role = role->ring_role; + + /* If there is an error just bailed out */ + err = br_mrp_switchdev_set_ring_role(br, mrp, role->ring_role); + if (err && err != -EOPNOTSUPP) + return err; + + /* Now detect if the HW actually applied the role or not. If the HW + * applied the role it means that the SW will not to do those operations + * anymore. For example if the role ir MRM then the HW will notify the + * SW when ring is open, but if the is not pushed to the HW the SW will + * need to detect when the ring is open + */ + mrp->ring_role_offloaded = err == -EOPNOTSUPP ? 0 : 1; + + return 0; +} + +/* Start to generate or monitor MRP test frames, the frames are generated by + * HW and if it fails, they are generated by the SW. + * note: already called with rtnl_lock + */ +int br_mrp_start_test(struct net_bridge *br, + struct br_mrp_start_test *test) +{ + struct br_mrp *mrp = br_mrp_find_id(br, test->ring_id); + + if (!mrp) + return -EINVAL; + + /* Try to push it to the HW and if it fails then continue with SW + * implementation and if that also fails then return error. + */ + if (!br_mrp_switchdev_send_ring_test(br, mrp, test->interval, + test->max_miss, test->period, + test->monitor)) + return 0; + + mrp->test_interval = test->interval; + mrp->test_end = jiffies + usecs_to_jiffies(test->period); + mrp->test_max_miss = test->max_miss; + mrp->test_monitor = test->monitor; + mrp->test_count_miss = 0; + queue_delayed_work(system_wq, &mrp->test_work, + usecs_to_jiffies(test->interval)); + + return 0; +} + +/* Process only MRP Test frame. All the other MRP frames are processed by + * userspace application + * note: already called with rcu_read_lock + */ +static void br_mrp_mrm_process(struct br_mrp *mrp, struct net_bridge_port *port, + struct sk_buff *skb) +{ + const struct br_mrp_tlv_hdr *hdr; + struct br_mrp_tlv_hdr _hdr; + + /* Each MRP header starts with a version field which is 16 bits. + * Therefore skip the version and get directly the TLV header. + */ + hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); + if (!hdr) + return; + + if (hdr->type != BR_MRP_TLV_HEADER_RING_TEST) + return; + + mrp->test_count_miss = 0; + + /* Notify the userspace that the ring is closed only when the ring is + * not closed + */ + if (mrp->ring_state != BR_MRP_RING_STATE_CLOSED) + br_mrp_port_open(port->dev, false); +} + +/* Determin if the test hdr has a better priority than the node */ +static bool br_mrp_test_better_than_own(struct br_mrp *mrp, + struct net_bridge *br, + const struct br_mrp_ring_test_hdr *hdr) +{ + u16 prio = be16_to_cpu(hdr->prio); + + if (prio < mrp->prio || + (prio == mrp->prio && + ether_addr_to_u64(hdr->sa) < ether_addr_to_u64(br->dev->dev_addr))) + return true; + + return false; +} + +/* Process only MRP Test frame. All the other MRP frames are processed by + * userspace application + * note: already called with rcu_read_lock + */ +static void br_mrp_mra_process(struct br_mrp *mrp, struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb) +{ + const struct br_mrp_ring_test_hdr *test_hdr; + struct br_mrp_ring_test_hdr _test_hdr; + const struct br_mrp_tlv_hdr *hdr; + struct br_mrp_tlv_hdr _hdr; + + /* Each MRP header starts with a version field which is 16 bits. + * Therefore skip the version and get directly the TLV header. + */ + hdr = skb_header_pointer(skb, sizeof(uint16_t), sizeof(_hdr), &_hdr); + if (!hdr) + return; + + if (hdr->type != BR_MRP_TLV_HEADER_RING_TEST) + return; + + test_hdr = skb_header_pointer(skb, sizeof(uint16_t) + sizeof(_hdr), + sizeof(_test_hdr), &_test_hdr); + if (!test_hdr) + return; + + /* Only frames that have a better priority than the node will + * clear the miss counter because otherwise the node will need to behave + * as MRM. + */ + if (br_mrp_test_better_than_own(mrp, br, test_hdr)) + mrp->test_count_miss = 0; +} + +/* This will just forward the frame to the other mrp ring port(MRC role) or will + * not do anything. + * note: already called with rcu_read_lock + */ +static int br_mrp_rcv(struct net_bridge_port *p, + struct sk_buff *skb, struct net_device *dev) +{ + struct net_device *s_dev, *p_dev, *d_dev; + struct net_bridge_port *p_port, *s_port; + struct net_bridge *br; + struct sk_buff *nskb; + struct br_mrp *mrp; + + /* If port is disabled don't accept any frames */ + if (p->state == BR_STATE_DISABLED) + return 0; + + br = p->br; + mrp = br_mrp_find_port(br, p); + if (unlikely(!mrp)) + return 0; + + p_port = rcu_dereference(mrp->p_port); + if (!p_port) + return 0; + + s_port = rcu_dereference(mrp->s_port); + if (!s_port) + return 0; + + /* If the role is MRM then don't forward the frames */ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRM) { + br_mrp_mrm_process(mrp, p, skb); + return 1; + } + + /* If the role is MRA then don't forward the frames if it behaves as + * MRM node + */ + if (mrp->ring_role == BR_MRP_RING_ROLE_MRA) { + if (!mrp->test_monitor) { + br_mrp_mrm_process(mrp, p, skb); + return 1; + } + + br_mrp_mra_process(mrp, br, p, skb); + } + + /* Clone the frame and forward it on the other MRP port */ + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + return 0; + + p_dev = p_port->dev; + s_dev = s_port->dev; + + if (p_dev == dev) + d_dev = s_dev; + else + d_dev = p_dev; + + nskb->dev = d_dev; + skb_push(nskb, ETH_HLEN); + dev_queue_xmit(nskb); + + return 1; +} + +/* Check if the frame was received on a port that is part of MRP ring + * and if the frame has MRP eth. In that case process the frame otherwise do + * normal forwarding. + * note: already called with rcu_read_lock + */ +int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) +{ + /* If there is no MRP instance do normal forwarding */ + if (likely(!(p->flags & BR_MRP_AWARE))) + goto out; + + if (unlikely(skb->protocol == htons(ETH_P_MRP))) + return br_mrp_rcv(p, skb, p->dev); + +out: + return 0; +} + +bool br_mrp_enabled(struct net_bridge *br) +{ + return !list_empty(&br->mrp_list); +} diff --git a/net/bridge/br_mrp_netlink.c b/net/bridge/br_mrp_netlink.c new file mode 100644 index 000000000000..34b3a8776991 --- /dev/null +++ b/net/bridge/br_mrp_netlink.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <net/genetlink.h> + +#include <uapi/linux/mrp_bridge.h> +#include "br_private.h" +#include "br_private_mrp.h" + +static const struct nla_policy br_mrp_policy[IFLA_BRIDGE_MRP_MAX + 1] = { + [IFLA_BRIDGE_MRP_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_INSTANCE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_PORT_STATE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_PORT_ROLE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_RING_STATE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_RING_ROLE] = { .type = NLA_NESTED }, + [IFLA_BRIDGE_MRP_START_TEST] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy +br_mrp_instance_policy[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1] = { + [IFLA_BRIDGE_MRP_INSTANCE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_INSTANCE_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_INSTANCE_PRIO] = { .type = NLA_U16 }, +}; + +static int br_mrp_instance_parse(struct net_bridge *br, struct nlattr *attr, + int cmd, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_INSTANCE_MAX + 1]; + struct br_mrp_instance inst; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_INSTANCE_MAX, attr, + br_mrp_instance_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID] || + !tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX] || + !tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or P_IFINDEX or S_IFINDEX"); + return -EINVAL; + } + + memset(&inst, 0, sizeof(inst)); + + inst.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_RING_ID]); + inst.p_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_P_IFINDEX]); + inst.s_ifindex = nla_get_u32(tb[IFLA_BRIDGE_MRP_INSTANCE_S_IFINDEX]); + inst.prio = MRP_DEFAULT_PRIO; + + if (tb[IFLA_BRIDGE_MRP_INSTANCE_PRIO]) + inst.prio = nla_get_u16(tb[IFLA_BRIDGE_MRP_INSTANCE_PRIO]); + + if (cmd == RTM_SETLINK) + return br_mrp_add(br, &inst); + else + return br_mrp_del(br, &inst); + + return 0; +} + +static const struct nla_policy +br_mrp_port_state_policy[IFLA_BRIDGE_MRP_PORT_STATE_MAX + 1] = { + [IFLA_BRIDGE_MRP_PORT_STATE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_PORT_STATE_STATE] = { .type = NLA_U32 }, +}; + +static int br_mrp_port_state_parse(struct net_bridge_port *p, + struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_PORT_STATE_MAX + 1]; + enum br_mrp_port_state_type state; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_PORT_STATE_MAX, attr, + br_mrp_port_state_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_PORT_STATE_STATE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing attribute: STATE"); + return -EINVAL; + } + + state = nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_STATE_STATE]); + + return br_mrp_set_port_state(p, state); +} + +static const struct nla_policy +br_mrp_port_role_policy[IFLA_BRIDGE_MRP_PORT_ROLE_MAX + 1] = { + [IFLA_BRIDGE_MRP_PORT_ROLE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_PORT_ROLE_ROLE] = { .type = NLA_U32 }, +}; + +static int br_mrp_port_role_parse(struct net_bridge_port *p, + struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_PORT_ROLE_MAX + 1]; + enum br_mrp_port_role_type role; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_PORT_ROLE_MAX, attr, + br_mrp_port_role_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_PORT_ROLE_ROLE]) { + NL_SET_ERR_MSG_MOD(extack, "Missing attribute: ROLE"); + return -EINVAL; + } + + role = nla_get_u32(tb[IFLA_BRIDGE_MRP_PORT_ROLE_ROLE]); + + return br_mrp_set_port_role(p, role); +} + +static const struct nla_policy +br_mrp_ring_state_policy[IFLA_BRIDGE_MRP_RING_STATE_MAX + 1] = { + [IFLA_BRIDGE_MRP_RING_STATE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_RING_STATE_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_RING_STATE_STATE] = { .type = NLA_U32 }, +}; + +static int br_mrp_ring_state_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_RING_STATE_MAX + 1]; + struct br_mrp_ring_state state; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_RING_STATE_MAX, attr, + br_mrp_ring_state_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_RING_STATE_RING_ID] || + !tb[IFLA_BRIDGE_MRP_RING_STATE_STATE]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or STATE"); + return -EINVAL; + } + + memset(&state, 0x0, sizeof(state)); + + state.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_STATE_RING_ID]); + state.ring_state = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_STATE_STATE]); + + return br_mrp_set_ring_state(br, &state); +} + +static const struct nla_policy +br_mrp_ring_role_policy[IFLA_BRIDGE_MRP_RING_ROLE_MAX + 1] = { + [IFLA_BRIDGE_MRP_RING_ROLE_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_RING_ROLE_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_RING_ROLE_ROLE] = { .type = NLA_U32 }, +}; + +static int br_mrp_ring_role_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_RING_ROLE_MAX + 1]; + struct br_mrp_ring_role role; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_RING_ROLE_MAX, attr, + br_mrp_ring_role_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_RING_ROLE_RING_ID] || + !tb[IFLA_BRIDGE_MRP_RING_ROLE_ROLE]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or ROLE"); + return -EINVAL; + } + + memset(&role, 0x0, sizeof(role)); + + role.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_ROLE_RING_ID]); + role.ring_role = nla_get_u32(tb[IFLA_BRIDGE_MRP_RING_ROLE_ROLE]); + + return br_mrp_set_ring_role(br, &role); +} + +static const struct nla_policy +br_mrp_start_test_policy[IFLA_BRIDGE_MRP_START_TEST_MAX + 1] = { + [IFLA_BRIDGE_MRP_START_TEST_UNSPEC] = { .type = NLA_REJECT }, + [IFLA_BRIDGE_MRP_START_TEST_RING_ID] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_INTERVAL] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_PERIOD] = { .type = NLA_U32 }, + [IFLA_BRIDGE_MRP_START_TEST_MONITOR] = { .type = NLA_U32 }, +}; + +static int br_mrp_start_test_parse(struct net_bridge *br, struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_START_TEST_MAX + 1]; + struct br_mrp_start_test test; + int err; + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_START_TEST_MAX, attr, + br_mrp_start_test_policy, extack); + if (err) + return err; + + if (!tb[IFLA_BRIDGE_MRP_START_TEST_RING_ID] || + !tb[IFLA_BRIDGE_MRP_START_TEST_INTERVAL] || + !tb[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS] || + !tb[IFLA_BRIDGE_MRP_START_TEST_PERIOD]) { + NL_SET_ERR_MSG_MOD(extack, + "Missing attribute: RING_ID or INTERVAL or MAX_MISS or PERIOD"); + return -EINVAL; + } + + memset(&test, 0x0, sizeof(test)); + + test.ring_id = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_RING_ID]); + test.interval = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_INTERVAL]); + test.max_miss = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_MAX_MISS]); + test.period = nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_PERIOD]); + test.monitor = false; + + if (tb[IFLA_BRIDGE_MRP_START_TEST_MONITOR]) + test.monitor = + nla_get_u32(tb[IFLA_BRIDGE_MRP_START_TEST_MONITOR]); + + return br_mrp_start_test(br, &test); +} + +int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[IFLA_BRIDGE_MRP_MAX + 1]; + int err; + + /* When this function is called for a port then the br pointer is + * invalid, therefor set the br to point correctly + */ + if (p) + br = p->br; + + if (br->stp_enabled != BR_NO_STP) { + NL_SET_ERR_MSG_MOD(extack, "MRP can't be enabled if STP is already enabled"); + return -EINVAL; + } + + err = nla_parse_nested(tb, IFLA_BRIDGE_MRP_MAX, attr, + br_mrp_policy, extack); + if (err) + return err; + + if (tb[IFLA_BRIDGE_MRP_INSTANCE]) { + err = br_mrp_instance_parse(br, tb[IFLA_BRIDGE_MRP_INSTANCE], + cmd, extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_PORT_STATE]) { + err = br_mrp_port_state_parse(p, tb[IFLA_BRIDGE_MRP_PORT_STATE], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_PORT_ROLE]) { + err = br_mrp_port_role_parse(p, tb[IFLA_BRIDGE_MRP_PORT_ROLE], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_RING_STATE]) { + err = br_mrp_ring_state_parse(br, + tb[IFLA_BRIDGE_MRP_RING_STATE], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_RING_ROLE]) { + err = br_mrp_ring_role_parse(br, tb[IFLA_BRIDGE_MRP_RING_ROLE], + extack); + if (err) + return err; + } + + if (tb[IFLA_BRIDGE_MRP_START_TEST]) { + err = br_mrp_start_test_parse(br, + tb[IFLA_BRIDGE_MRP_START_TEST], + extack); + if (err) + return err; + } + + return 0; +} + +int br_mrp_port_open(struct net_device *dev, u8 loc) +{ + struct net_bridge_port *p; + int err = 0; + + p = br_port_get_rcu(dev); + if (!p) { + err = -EINVAL; + goto out; + } + + if (loc) + p->flags |= BR_MRP_LOST_CONT; + else + p->flags &= ~BR_MRP_LOST_CONT; + + br_ifinfo_notify(RTM_NEWLINK, NULL, p); + +out: + return err; +} diff --git a/net/bridge/br_mrp_switchdev.c b/net/bridge/br_mrp_switchdev.c new file mode 100644 index 000000000000..0da68a0da4b5 --- /dev/null +++ b/net/bridge/br_mrp_switchdev.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <net/switchdev.h> + +#include "br_private_mrp.h" + +int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp) +{ + struct switchdev_obj_mrp mrp_obj = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_MRP, + .p_port = rtnl_dereference(mrp->p_port)->dev, + .s_port = rtnl_dereference(mrp->s_port)->dev, + .ring_id = mrp->ring_id, + .prio = mrp->prio, + }; + int err; + + err = switchdev_port_obj_add(br->dev, &mrp_obj.obj, NULL); + + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp) +{ + struct switchdev_obj_mrp mrp_obj = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_MRP, + .p_port = NULL, + .s_port = NULL, + .ring_id = mrp->ring_id, + }; + int err; + + err = switchdev_port_obj_del(br->dev, &mrp_obj.obj); + + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +int br_mrp_switchdev_set_ring_role(struct net_bridge *br, + struct br_mrp *mrp, + enum br_mrp_ring_role_type role) +{ + struct switchdev_obj_ring_role_mrp mrp_role = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_RING_ROLE_MRP, + .ring_role = role, + .ring_id = mrp->ring_id, + }; + int err; + + if (role == BR_MRP_RING_ROLE_DISABLED) + err = switchdev_port_obj_del(br->dev, &mrp_role.obj); + else + err = switchdev_port_obj_add(br->dev, &mrp_role.obj, NULL); + + return err; +} + +int br_mrp_switchdev_send_ring_test(struct net_bridge *br, + struct br_mrp *mrp, u32 interval, + u8 max_miss, u32 period, + bool monitor) +{ + struct switchdev_obj_ring_test_mrp test = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_RING_TEST_MRP, + .interval = interval, + .max_miss = max_miss, + .ring_id = mrp->ring_id, + .period = period, + .monitor = monitor, + }; + int err; + + if (interval == 0) + err = switchdev_port_obj_del(br->dev, &test.obj); + else + err = switchdev_port_obj_add(br->dev, &test.obj, NULL); + + return err; +} + +int br_mrp_switchdev_set_ring_state(struct net_bridge *br, + struct br_mrp *mrp, + enum br_mrp_ring_state_type state) +{ + struct switchdev_obj_ring_state_mrp mrp_state = { + .obj.orig_dev = br->dev, + .obj.id = SWITCHDEV_OBJ_ID_RING_STATE_MRP, + .ring_state = state, + .ring_id = mrp->ring_id, + }; + int err; + + err = switchdev_port_obj_add(br->dev, &mrp_state.obj, NULL); + + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} + +int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_MRP_PORT_STATE, + .u.mrp_port_state = state, + }; + int err; + + err = switchdev_port_attr_set(p->dev, &attr); + if (err && err != -EOPNOTSUPP) + br_warn(p->br, "error setting offload MRP state on port %u(%s)\n", + (unsigned int)p->port_no, p->dev->name); + + return err; +} + +int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, + enum br_mrp_port_role_type role) +{ + struct switchdev_attr attr = { + .orig_dev = p->dev, + .id = SWITCHDEV_ATTR_ID_MRP_PORT_ROLE, + .u.mrp_port_role = role, + }; + int err; + + err = switchdev_port_attr_set(p->dev, &attr); + if (err && err != -EOPNOTSUPP) + return err; + + return 0; +} diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 59980ecfc962..04c3f9a82650 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1027,7 +1027,7 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, #ifdef CONFIG_SYSCTL static int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index a0f5dbee8f9c..240e260e3461 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -151,6 +151,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MULTICAST_ROUTER */ #endif + nla_total_size(sizeof(u16)) /* IFLA_BRPORT_GROUP_FWD_MASK */ + + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_RING_OPEN */ + 0; } @@ -213,6 +214,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, nla_put_u16(skb, IFLA_BRPORT_GROUP_FWD_MASK, p->group_fwd_mask) || nla_put_u8(skb, IFLA_BRPORT_NEIGH_SUPPRESS, !!(p->flags & BR_NEIGH_SUPPRESS)) || + nla_put_u8(skb, IFLA_BRPORT_MRP_RING_OPEN, !!(p->flags & + BR_MRP_LOST_CONT)) || nla_put_u8(skb, IFLA_BRPORT_ISOLATED, !!(p->flags & BR_ISOLATED))) return -EMSGSIZE; @@ -670,6 +673,11 @@ static int br_afspec(struct net_bridge *br, if (err) return err; break; + case IFLA_BRIDGE_MRP: + err = br_mrp_parse(br, p, attr, cmd, extack); + if (err) + return err; + break; } } @@ -1102,7 +1110,9 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (data[IFLA_BR_STP_STATE]) { u32 stp_enabled = nla_get_u32(data[IFLA_BR_STP_STATE]); - br_stp_set_enabled(br, stp_enabled); + err = br_stp_set_enabled(br, stp_enabled, extack); + if (err) + return err; } if (data[IFLA_BR_PRIORITY]) { diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 1f97703a52ff..7501be4eeba0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -428,6 +428,10 @@ struct net_bridge { int offload_fwd_mark; #endif struct hlist_head fdb_list; + +#if IS_ENABLED(CONFIG_BRIDGE_MRP) + struct list_head __rcu mrp_list; +#endif }; struct br_input_skb_cb { @@ -594,10 +598,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev); static inline void br_netpoll_send_skb(const struct net_bridge_port *p, struct sk_buff *skb) { - struct netpoll *np = p->np; - - if (np) - netpoll_send_skb(np, skb); + netpoll_send_skb(p->np, skb); } int br_netpoll_enable(struct net_bridge_port *p); @@ -701,16 +702,16 @@ int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev); /* br_input.c */ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb); -rx_handler_result_t br_handle_frame(struct sk_buff **pskb); +rx_handler_func_t *br_get_rx_handler(const struct net_device *dev); static inline bool br_rx_handler_check_rcu(const struct net_device *dev) { - return rcu_dereference(dev->rx_handler) == br_handle_frame; + return rcu_dereference(dev->rx_handler) == br_get_rx_handler(dev); } static inline bool br_rx_handler_check_rtnl(const struct net_device *dev) { - return rcu_dereference_rtnl(dev->rx_handler) == br_handle_frame; + return rcu_dereference_rtnl(dev->rx_handler) == br_get_rx_handler(dev); } static inline struct net_bridge_port *br_port_get_check_rcu(const struct net_device *dev) @@ -1279,7 +1280,8 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time); /* br_stp_if.c */ void br_stp_enable_bridge(struct net_bridge *br); void br_stp_disable_bridge(struct net_bridge *br); -void br_stp_set_enabled(struct net_bridge *br, unsigned long val); +int br_stp_set_enabled(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack); void br_stp_enable_port(struct net_bridge_port *p); void br_stp_disable_port(struct net_bridge_port *p); bool br_stp_recalculate_bridge_id(struct net_bridge *br); @@ -1304,6 +1306,37 @@ unsigned long br_timer_value(const struct timer_list *timer); extern int (*br_fdb_test_addr_hook)(struct net_device *dev, unsigned char *addr); #endif +/* br_mrp.c */ +#if IS_ENABLED(CONFIG_BRIDGE_MRP) +int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, struct netlink_ext_ack *extack); +int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb); +bool br_mrp_enabled(struct net_bridge *br); +void br_mrp_port_del(struct net_bridge *br, struct net_bridge_port *p); +#else +static inline int br_mrp_parse(struct net_bridge *br, struct net_bridge_port *p, + struct nlattr *attr, int cmd, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + +static inline int br_mrp_process(struct net_bridge_port *p, struct sk_buff *skb) +{ + return 0; +} + +static inline bool br_mrp_enabled(struct net_bridge *br) +{ + return false; +} + +static inline void br_mrp_port_del(struct net_bridge *br, + struct net_bridge_port *p) +{ +} +#endif + /* br_netlink.c */ extern struct rtnl_link_ops br_link_ops; int br_netlink_init(void); diff --git a/net/bridge/br_private_mrp.h b/net/bridge/br_private_mrp.h new file mode 100644 index 000000000000..33b255e38ffe --- /dev/null +++ b/net/bridge/br_private_mrp.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef _BR_PRIVATE_MRP_H_ +#define _BR_PRIVATE_MRP_H_ + +#include "br_private.h" +#include <uapi/linux/mrp_bridge.h> + +struct br_mrp { + /* list of mrp instances */ + struct list_head __rcu list; + + struct net_bridge_port __rcu *p_port; + struct net_bridge_port __rcu *s_port; + + u32 ring_id; + u16 prio; + + enum br_mrp_ring_role_type ring_role; + u8 ring_role_offloaded; + enum br_mrp_ring_state_type ring_state; + u32 ring_transitions; + + struct delayed_work test_work; + u32 test_interval; + unsigned long test_end; + u32 test_count_miss; + u32 test_max_miss; + bool test_monitor; + + u32 seq_id; + + struct rcu_head rcu; +}; + +/* br_mrp.c */ +int br_mrp_add(struct net_bridge *br, struct br_mrp_instance *instance); +int br_mrp_del(struct net_bridge *br, struct br_mrp_instance *instance); +int br_mrp_set_port_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state); +int br_mrp_set_port_role(struct net_bridge_port *p, + enum br_mrp_port_role_type role); +int br_mrp_set_ring_state(struct net_bridge *br, + struct br_mrp_ring_state *state); +int br_mrp_set_ring_role(struct net_bridge *br, struct br_mrp_ring_role *role); +int br_mrp_start_test(struct net_bridge *br, struct br_mrp_start_test *test); + +/* br_mrp_switchdev.c */ +int br_mrp_switchdev_add(struct net_bridge *br, struct br_mrp *mrp); +int br_mrp_switchdev_del(struct net_bridge *br, struct br_mrp *mrp); +int br_mrp_switchdev_set_ring_role(struct net_bridge *br, struct br_mrp *mrp, + enum br_mrp_ring_role_type role); +int br_mrp_switchdev_set_ring_state(struct net_bridge *br, struct br_mrp *mrp, + enum br_mrp_ring_state_type state); +int br_mrp_switchdev_send_ring_test(struct net_bridge *br, struct br_mrp *mrp, + u32 interval, u8 max_miss, u32 period, + bool monitor); +int br_mrp_port_switchdev_set_state(struct net_bridge_port *p, + enum br_mrp_port_state_type state); +int br_mrp_port_switchdev_set_role(struct net_bridge_port *p, + enum br_mrp_port_role_type role); + +/* br_mrp_netlink.c */ +int br_mrp_port_open(struct net_device *dev, u8 loc); + +#endif /* _BR_PRIVATE_MRP_H */ diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 1f14b8455345..3e88be7aa269 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -36,6 +36,12 @@ void br_set_state(struct net_bridge_port *p, unsigned int state) }; int err; + /* Don't change the state of the ports if they are driven by a different + * protocol. + */ + if (p->flags & BR_MRP_AWARE) + return; + p->state = state; err = switchdev_port_attr_set(p->dev, &attr); if (err && err != -EOPNOTSUPP) diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index d174d3a566aa..ba55851fe132 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -196,10 +196,17 @@ static void br_stp_stop(struct net_bridge *br) br->stp_enabled = BR_NO_STP; } -void br_stp_set_enabled(struct net_bridge *br, unsigned long val) +int br_stp_set_enabled(struct net_bridge *br, unsigned long val, + struct netlink_ext_ack *extack) { ASSERT_RTNL(); + if (br_mrp_enabled(br)) { + NL_SET_ERR_MSG_MOD(extack, + "STP can't be enabled if MRP is already enabled"); + return -EINVAL; + } + if (val) { if (br->stp_enabled == BR_NO_STP) br_stp_start(br); @@ -207,6 +214,8 @@ void br_stp_set_enabled(struct net_bridge *br, unsigned long val) if (br->stp_enabled != BR_NO_STP) br_stp_stop(br); } + + return 0; } /* called under bridge lock */ diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 9ab0f00b1081..7db06e3f642a 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -126,9 +126,7 @@ static ssize_t stp_state_show(struct device *d, static int set_stp_state(struct net_bridge *br, unsigned long val) { - br_stp_set_enabled(br, val); - - return 0; + return br_stp_set_enabled(br, val, NULL); } static ssize_t stp_state_store(struct device *d, diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 195d2d67be8a..c10e5a55758d 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -142,7 +142,7 @@ static void caif_flow_cb(struct sk_buff *skb) spin_lock_bh(&caifd->flow_lock); send_xoff = caifd->xoff; - caifd->xoff = 0; + caifd->xoff = false; dtor = caifd->xoff_skb_dtor; if (WARN_ON(caifd->xoff_skb != skb)) @@ -220,7 +220,7 @@ static int transmit(struct cflayer *layer, struct cfpkt *pkt) pr_debug("queue has stopped(%d) or is full (%d > %d)\n", netif_queue_stopped(caifd->netdev), qlen, high); - caifd->xoff = 1; + caifd->xoff = true; caifd->xoff_skb = skb; caifd->xoff_skb_dtor = skb->destructor; skb->destructor = caif_flow_cb; @@ -407,7 +407,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, break; } - caifd->xoff = 0; + caifd->xoff = false; cfcnfg_set_phy_state(cfg, &caifd->layer, true); rcu_read_unlock(); @@ -442,7 +442,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, if (caifd->xoff_skb_dtor != NULL && caifd->xoff_skb != NULL) caifd->xoff_skb->destructor = caifd->xoff_skb_dtor; - caifd->xoff = 0; + caifd->xoff = false; caifd->xoff_skb_dtor = NULL; caifd->xoff_skb = NULL; diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index a56628962852..79b6a04d8eb6 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -211,7 +211,8 @@ static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow, } } -static int chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev) +static netdev_tx_t chnl_net_start_xmit(struct sk_buff *skb, + struct net_device *dev) { struct chnl_net *priv; struct cfpkt *pkt = NULL; diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index 2e8e6f904920..d7bec7adc267 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -39,6 +39,6 @@ config CEPH_LIB_USE_DNS_RESOLVER be resolved using the CONFIG_DNS_RESOLVER facility. For information on how to use CONFIG_DNS_RESOLVER consult - Documentation/networking/dns_resolver.txt + Documentation/networking/dns_resolver.rst If unsure, say N. diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f8ca5edc5f2c..27d6ab11f9ee 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -490,15 +490,8 @@ static int ceph_tcp_connect(struct ceph_connection *con) return ret; } - if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) { - int optval = 1; - - ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char *)&optval, sizeof(optval)); - if (ret) - pr_err("kernel_setsockopt(TCP_NODELAY) failed: %d", - ret); - } + if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) + tcp_sock_set_nodelay(sock->sk); con->sock = sock; return 0; diff --git a/net/compat.c b/net/compat.c index 4bed96e84d9a..5e3041a2c37d 100644 --- a/net/compat.c +++ b/net/compat.c @@ -56,7 +56,8 @@ int __get_compat_msghdr(struct msghdr *kmsg, if (kmsg->msg_namelen > sizeof(struct sockaddr_storage)) kmsg->msg_namelen = sizeof(struct sockaddr_storage); - kmsg->msg_control = compat_ptr(msg.msg_control); + kmsg->msg_control_is_user = true; + kmsg->msg_control_user = compat_ptr(msg.msg_control); kmsg->msg_controllen = msg.msg_controllen; if (save_addr) @@ -121,7 +122,7 @@ int get_compat_msghdr(struct msghdr *kmsg, ((ucmlen) >= sizeof(struct compat_cmsghdr) && \ (ucmlen) <= (unsigned long) \ ((mhdr)->msg_controllen - \ - ((char *)(ucmsg) - (char *)(mhdr)->msg_control))) + ((char __user *)(ucmsg) - (char __user *)(mhdr)->msg_control_user))) static inline struct compat_cmsghdr __user *cmsg_compat_nxthdr(struct msghdr *msg, struct compat_cmsghdr __user *cmsg, int cmsg_len) @@ -182,20 +183,21 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, memset(kcmsg, 0, kcmlen); ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg); while (ucmsg != NULL) { - if (__get_user(ucmlen, &ucmsg->cmsg_len)) + struct compat_cmsghdr cmsg; + if (copy_from_user(&cmsg, ucmsg, sizeof(cmsg))) goto Efault; - if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg)) + if (!CMSG_COMPAT_OK(cmsg.cmsg_len, ucmsg, kmsg)) goto Einval; - tmp = ((ucmlen - sizeof(*ucmsg)) + sizeof(struct cmsghdr)); + tmp = ((cmsg.cmsg_len - sizeof(*ucmsg)) + sizeof(struct cmsghdr)); if ((char *)kcmsg_base + kcmlen - (char *)kcmsg < CMSG_ALIGN(tmp)) goto Einval; kcmsg->cmsg_len = tmp; + kcmsg->cmsg_level = cmsg.cmsg_level; + kcmsg->cmsg_type = cmsg.cmsg_type; tmp = CMSG_ALIGN(tmp); - if (__get_user(kcmsg->cmsg_level, &ucmsg->cmsg_level) || - __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type) || - copy_from_user(CMSG_DATA(kcmsg), + if (copy_from_user(CMSG_DATA(kcmsg), CMSG_COMPAT_DATA(ucmsg), - (ucmlen - sizeof(*ucmsg)))) + (cmsg.cmsg_len - sizeof(*ucmsg)))) goto Efault; /* Advance. */ @@ -447,200 +449,6 @@ COMPAT_SYSCALL_DEFINE5(getsockopt, int, fd, int, level, int, optname, return __compat_sys_getsockopt(fd, level, optname, optval, optlen); } -struct compat_group_req { - __u32 gr_interface; - struct __kernel_sockaddr_storage gr_group - __aligned(4); -} __packed; - -struct compat_group_source_req { - __u32 gsr_interface; - struct __kernel_sockaddr_storage gsr_group - __aligned(4); - struct __kernel_sockaddr_storage gsr_source - __aligned(4); -} __packed; - -struct compat_group_filter { - __u32 gf_interface; - struct __kernel_sockaddr_storage gf_group - __aligned(4); - __u32 gf_fmode; - __u32 gf_numsrc; - struct __kernel_sockaddr_storage gf_slist[1] - __aligned(4); -} __packed; - -#define __COMPAT_GF0_SIZE (sizeof(struct compat_group_filter) - \ - sizeof(struct __kernel_sockaddr_storage)) - - -int compat_mc_setsockopt(struct sock *sock, int level, int optname, - char __user *optval, unsigned int optlen, - int (*setsockopt)(struct sock *, int, int, char __user *, unsigned int)) -{ - char __user *koptval = optval; - int koptlen = optlen; - - switch (optname) { - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - { - struct compat_group_req __user *gr32 = (void __user *)optval; - struct group_req __user *kgr = - compat_alloc_user_space(sizeof(struct group_req)); - u32 interface; - - if (!access_ok(gr32, sizeof(*gr32)) || - !access_ok(kgr, sizeof(struct group_req)) || - __get_user(interface, &gr32->gr_interface) || - __put_user(interface, &kgr->gr_interface) || - copy_in_user(&kgr->gr_group, &gr32->gr_group, - sizeof(kgr->gr_group))) - return -EFAULT; - koptval = (char __user *)kgr; - koptlen = sizeof(struct group_req); - break; - } - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - { - struct compat_group_source_req __user *gsr32 = (void __user *)optval; - struct group_source_req __user *kgsr = compat_alloc_user_space( - sizeof(struct group_source_req)); - u32 interface; - - if (!access_ok(gsr32, sizeof(*gsr32)) || - !access_ok(kgsr, - sizeof(struct group_source_req)) || - __get_user(interface, &gsr32->gsr_interface) || - __put_user(interface, &kgsr->gsr_interface) || - copy_in_user(&kgsr->gsr_group, &gsr32->gsr_group, - sizeof(kgsr->gsr_group)) || - copy_in_user(&kgsr->gsr_source, &gsr32->gsr_source, - sizeof(kgsr->gsr_source))) - return -EFAULT; - koptval = (char __user *)kgsr; - koptlen = sizeof(struct group_source_req); - break; - } - case MCAST_MSFILTER: - { - struct compat_group_filter __user *gf32 = (void __user *)optval; - struct group_filter __user *kgf; - u32 interface, fmode, numsrc; - - if (!access_ok(gf32, __COMPAT_GF0_SIZE) || - __get_user(interface, &gf32->gf_interface) || - __get_user(fmode, &gf32->gf_fmode) || - __get_user(numsrc, &gf32->gf_numsrc)) - return -EFAULT; - koptlen = optlen + sizeof(struct group_filter) - - sizeof(struct compat_group_filter); - if (koptlen < GROUP_FILTER_SIZE(numsrc)) - return -EINVAL; - kgf = compat_alloc_user_space(koptlen); - if (!access_ok(kgf, koptlen) || - __put_user(interface, &kgf->gf_interface) || - __put_user(fmode, &kgf->gf_fmode) || - __put_user(numsrc, &kgf->gf_numsrc) || - copy_in_user(&kgf->gf_group, &gf32->gf_group, - sizeof(kgf->gf_group)) || - (numsrc && copy_in_user(kgf->gf_slist, gf32->gf_slist, - numsrc * sizeof(kgf->gf_slist[0])))) - return -EFAULT; - koptval = (char __user *)kgf; - break; - } - - default: - break; - } - return setsockopt(sock, level, optname, koptval, koptlen); -} -EXPORT_SYMBOL(compat_mc_setsockopt); - -int compat_mc_getsockopt(struct sock *sock, int level, int optname, - char __user *optval, int __user *optlen, - int (*getsockopt)(struct sock *, int, int, char __user *, int __user *)) -{ - struct compat_group_filter __user *gf32 = (void __user *)optval; - struct group_filter __user *kgf; - int __user *koptlen; - u32 interface, fmode, numsrc; - int klen, ulen, err; - - if (optname != MCAST_MSFILTER) - return getsockopt(sock, level, optname, optval, optlen); - - koptlen = compat_alloc_user_space(sizeof(*koptlen)); - if (!access_ok(optlen, sizeof(*optlen)) || - __get_user(ulen, optlen)) - return -EFAULT; - - /* adjust len for pad */ - klen = ulen + sizeof(*kgf) - sizeof(*gf32); - - if (klen < GROUP_FILTER_SIZE(0)) - return -EINVAL; - - if (!access_ok(koptlen, sizeof(*koptlen)) || - __put_user(klen, koptlen)) - return -EFAULT; - - /* have to allow space for previous compat_alloc_user_space, too */ - kgf = compat_alloc_user_space(klen+sizeof(*optlen)); - - if (!access_ok(gf32, __COMPAT_GF0_SIZE) || - __get_user(interface, &gf32->gf_interface) || - __get_user(fmode, &gf32->gf_fmode) || - __get_user(numsrc, &gf32->gf_numsrc) || - __put_user(interface, &kgf->gf_interface) || - __put_user(fmode, &kgf->gf_fmode) || - __put_user(numsrc, &kgf->gf_numsrc) || - copy_in_user(&kgf->gf_group, &gf32->gf_group, sizeof(kgf->gf_group))) - return -EFAULT; - - err = getsockopt(sock, level, optname, (char __user *)kgf, koptlen); - if (err) - return err; - - if (!access_ok(koptlen, sizeof(*koptlen)) || - __get_user(klen, koptlen)) - return -EFAULT; - - ulen = klen - (sizeof(*kgf)-sizeof(*gf32)); - - if (!access_ok(optlen, sizeof(*optlen)) || - __put_user(ulen, optlen)) - return -EFAULT; - - if (!access_ok(kgf, klen) || - !access_ok(gf32, ulen) || - __get_user(interface, &kgf->gf_interface) || - __get_user(fmode, &kgf->gf_fmode) || - __get_user(numsrc, &kgf->gf_numsrc) || - __put_user(interface, &gf32->gf_interface) || - __put_user(fmode, &gf32->gf_fmode) || - __put_user(numsrc, &gf32->gf_numsrc)) - return -EFAULT; - if (numsrc) { - int copylen; - - klen -= GROUP_FILTER_SIZE(0); - copylen = numsrc * sizeof(gf32->gf_slist[0]); - if (copylen > klen) - copylen = klen; - if (copy_in_user(gf32->gf_slist, kgf->gf_slist, copylen)) - return -EFAULT; - } - return err; -} -EXPORT_SYMBOL(compat_mc_getsockopt); - - /* Argument list sizes for compat_sys_socketcall */ #define AL(x) ((x) * sizeof(u32)) static unsigned char nas[21] = { diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 756b63b6f7b3..d2c4d16dadba 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -625,7 +625,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) !attr->btf_key_type_id || !attr->btf_value_type_id) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; if (attr->value_size > MAX_VALUE_SIZE) @@ -978,7 +978,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as * the map_alloc_check() side also does. */ - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); nla_for_each_nested(nla, nla_stgs, rem) { diff --git a/net/core/dev.c b/net/core/dev.c index 2d8aceee4284..10684833f864 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -398,6 +398,74 @@ static RAW_NOTIFIER_HEAD(netdev_chain); DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); EXPORT_PER_CPU_SYMBOL(softnet_data); +#ifdef CONFIG_LOCKDEP +/* + * register_netdevice() inits txq->_xmit_lock and sets lockdep class + * according to dev->type + */ +static const unsigned short netdev_lock_type[] = { + ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, + ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, + ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, + ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, + ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, + ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, + ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, + ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, + ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, + ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, + ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, + ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, + ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, + ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, + ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; + +static const char *const netdev_lock_name[] = { + "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", + "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", + "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", + "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", + "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", + "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", + "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", + "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", + "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", + "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", + "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", + "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", + "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", + "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", + "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; + +static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; + +static inline unsigned short netdev_lock_pos(unsigned short dev_type) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) + if (netdev_lock_type[i] == dev_type) + return i; + /* the last key is used by default */ + return ARRAY_SIZE(netdev_lock_type) - 1; +} + +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ + int i; + + i = netdev_lock_pos(dev_type); + lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], + netdev_lock_name[i]); +} +#else +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ +} +#endif + /******************************************************************************* * * Protocol management and registration routines @@ -4549,6 +4617,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp->data_meta = xdp->data; xdp->data_end = xdp->data + hlen; xdp->data_hard_start = skb->data - skb_headroom(skb); + + /* SKB "head" area always have tailroom for skb_shared_info */ + xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; + xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; @@ -4572,14 +4645,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, skb_reset_network_header(skb); } - /* check if bpf_xdp_adjust_tail was used. it can only "shrink" - * pckt. - */ - off = orig_data_end - xdp->data_end; + /* check if bpf_xdp_adjust_tail was used */ + off = xdp->data_end - orig_data_end; if (off != 0) { skb_set_tail_pointer(skb, xdp->data_end - xdp->data); - skb->len -= off; - + skb->len += off; /* positive on grow, negative on shrink */ } /* check if XDP changed eth hdr such SKB needs update */ @@ -5350,6 +5420,18 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) struct bpf_prog *new = xdp->prog; int ret = 0; + if (new) { + u32 i; + + /* generic XDP does not work with DEVMAPs that can + * have a bpf_prog installed on an entry + */ + for (i = 0; i < new->aux->used_map_cnt; i++) { + if (dev_map_can_have_prog(new->aux->used_maps[i])) + return -EINVAL; + } + } + switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); @@ -6237,7 +6319,8 @@ EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { - unsigned long flags, val, new; + unsigned long flags, val, new, timeout = 0; + bool ret = true; /* * 1) Don't let napi dequeue from the cpu poll list @@ -6249,20 +6332,23 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; + if (work_done) { + if (n->gro_bitmask) + timeout = READ_ONCE(n->dev->gro_flush_timeout); + n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs); + } + if (n->defer_hard_irqs_count > 0) { + n->defer_hard_irqs_count--; + timeout = READ_ONCE(n->dev->gro_flush_timeout); + if (timeout) + ret = false; + } if (n->gro_bitmask) { - unsigned long timeout = 0; - - if (work_done) - timeout = n->dev->gro_flush_timeout; - /* When the NAPI instance uses a timeout and keeps postponing * it, we need to bound somehow the time packets are kept in * the GRO layer */ napi_gro_flush(n, !!timeout); - if (timeout) - hrtimer_start(&n->timer, ns_to_ktime(timeout), - HRTIMER_MODE_REL_PINNED); } gro_normal_list(n); @@ -6294,7 +6380,10 @@ bool napi_complete_done(struct napi_struct *n, int work_done) return false; } - return true; + if (timeout) + hrtimer_start(&n->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + return ret; } EXPORT_SYMBOL(napi_complete_done); @@ -6474,7 +6563,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ - if (napi->gro_bitmask && !napi_disable_pending(napi) && + if (!napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); @@ -7796,6 +7885,28 @@ void netdev_bonding_info_change(struct net_device *dev, } EXPORT_SYMBOL(netdev_bonding_info_change); +/** + * netdev_get_xmit_slave - Get the xmit slave of master device + * @skb: The packet + * @all_slaves: assume all the slaves are active + * + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + * %NULL is returned if no slave is found. + */ + +struct net_device *netdev_get_xmit_slave(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_xmit_slave) + return NULL; + return ops->ndo_get_xmit_slave(dev, skb, all_slaves); +} +EXPORT_SYMBOL(netdev_get_xmit_slave); + static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; @@ -8736,6 +8847,12 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return -EINVAL; } + if (prog->expected_attach_type == BPF_XDP_DEVMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); + bpf_prog_put(prog); + return -EINVAL; + } + /* prog->aux->id may be 0 for orphaned device-bound progs */ if (prog->aux->id && prog->aux->id == prog_id) { bpf_prog_put(prog); @@ -9148,6 +9265,11 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, else netif_dormant_off(dev); + if (rootdev->operstate == IF_OPER_TESTING) + netif_testing_on(dev); + else + netif_testing_off(dev); + if (netif_carrier_ok(rootdev)) netif_carrier_on(dev); else @@ -9208,7 +9330,7 @@ static void netdev_init_one_queue(struct net_device *dev, { /* Initialize queue lock */ spin_lock_init(&queue->_xmit_lock); - lockdep_set_class(&queue->_xmit_lock, &dev->qdisc_xmit_lock_key); + netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; @@ -9255,22 +9377,6 @@ void netif_tx_stop_all_queues(struct net_device *dev) } EXPORT_SYMBOL(netif_tx_stop_all_queues); -static void netdev_register_lockdep_key(struct net_device *dev) -{ - lockdep_register_key(&dev->qdisc_tx_busylock_key); - lockdep_register_key(&dev->qdisc_running_key); - lockdep_register_key(&dev->qdisc_xmit_lock_key); - lockdep_register_key(&dev->addr_list_lock_key); -} - -static void netdev_unregister_lockdep_key(struct net_device *dev) -{ - lockdep_unregister_key(&dev->qdisc_tx_busylock_key); - lockdep_unregister_key(&dev->qdisc_running_key); - lockdep_unregister_key(&dev->qdisc_xmit_lock_key); - lockdep_unregister_key(&dev->addr_list_lock_key); -} - void netdev_update_lockdep_key(struct net_device *dev) { lockdep_unregister_key(&dev->addr_list_lock_key); @@ -9837,7 +9943,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - netdev_register_lockdep_key(dev); + lockdep_register_key(&dev->addr_list_lock_key); dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; @@ -9926,7 +10032,7 @@ void free_netdev(struct net_device *dev) free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; - netdev_unregister_lockdep_key(dev); + lockdep_unregister_key(&dev->addr_list_lock_key); /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { diff --git a/net/core/devlink.c b/net/core/devlink.c index 899edcee7dab..2cafbc808b09 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3716,24 +3716,26 @@ nla_put_failure: return err; } -static void devlink_nl_region_notify(struct devlink_region *region, - struct devlink_snapshot *snapshot, - enum devlink_command cmd) +static struct sk_buff * +devlink_nl_region_notify_build(struct devlink_region *region, + struct devlink_snapshot *snapshot, + enum devlink_command cmd, u32 portid, u32 seq) { struct devlink *devlink = region->devlink; struct sk_buff *msg; void *hdr; int err; - WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) - return; + return ERR_PTR(-ENOMEM); - hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd); - if (!hdr) + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, 0, cmd); + if (!hdr) { + err = -EMSGSIZE; goto out_free_msg; + } err = devlink_nl_put_handle(msg, devlink); if (err) @@ -3757,15 +3759,30 @@ static void devlink_nl_region_notify(struct devlink_region *region, } genlmsg_end(msg, hdr); - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); - - return; + return msg; out_cancel_msg: genlmsg_cancel(msg, hdr); out_free_msg: nlmsg_free(msg); + return ERR_PTR(err); +} + +static void devlink_nl_region_notify(struct devlink_region *region, + struct devlink_snapshot *snapshot, + enum devlink_command cmd) +{ + struct devlink *devlink = region->devlink; + struct sk_buff *msg; + + WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); + + msg = devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0); + if (IS_ERR(msg)) + return; + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } /** @@ -4069,6 +4086,8 @@ static int devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; + struct devlink_snapshot *snapshot; + struct nlattr *snapshot_id_attr; struct devlink_region *region; const char *region_name; u32 snapshot_id; @@ -4080,11 +4099,6 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - if (!info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) { - NL_SET_ERR_MSG_MOD(info->extack, "No snapshot id provided"); - return -EINVAL; - } - region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); region = devlink_region_get_by_name(devlink, region_name); if (!region) { @@ -4102,16 +4116,25 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) return -ENOSPC; } - snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); + snapshot_id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; + if (snapshot_id_attr) { + snapshot_id = nla_get_u32(snapshot_id_attr); - if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { - NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use"); - return -EEXIST; - } + if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { + NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use"); + return -EEXIST; + } - err = __devlink_snapshot_id_insert(devlink, snapshot_id); - if (err) - return err; + err = __devlink_snapshot_id_insert(devlink, snapshot_id); + if (err) + return err; + } else { + err = __devlink_region_snapshot_id_get(devlink, &snapshot_id); + if (err) { + NL_SET_ERR_MSG_MOD(info->extack, "Failed to allocate a new snapshot id"); + return err; + } + } err = region->ops->snapshot(devlink, info->extack, &data); if (err) @@ -4121,6 +4144,27 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) if (err) goto err_snapshot_create; + if (!snapshot_id_attr) { + struct sk_buff *msg; + + snapshot = devlink_region_snapshot_get_by_id(region, + snapshot_id); + if (WARN_ON(!snapshot)) + return -EINVAL; + + msg = devlink_nl_region_notify_build(region, snapshot, + DEVLINK_CMD_REGION_NEW, + info->snd_portid, + info->snd_seq); + err = PTR_ERR_OR_ZERO(msg); + if (err) + goto err_notify; + + err = genlmsg_reply(msg, info); + if (err) + goto err_notify; + } + return 0; err_snapshot_create: @@ -4128,6 +4172,10 @@ err_snapshot_create: err_snapshot_capture: __devlink_snapshot_id_decrement(devlink, snapshot_id); return err; + +err_notify: + devlink_region_snapshot_del(region, snapshot); + return err; } static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, @@ -4167,7 +4215,6 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, struct nlattr **attrs, u64 start_offset, u64 end_offset, - bool dump, u64 *new_offset) { struct devlink_snapshot *snapshot; @@ -4182,9 +4229,6 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, if (!snapshot) return -EINVAL; - if (end_offset > region->size || dump) - end_offset = region->size; - while (curr_offset < end_offset) { u32 data_size; u8 *data; @@ -4212,13 +4256,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); - u64 ret_offset, start_offset, end_offset = 0; + u64 ret_offset, start_offset, end_offset = U64_MAX; struct nlattr **attrs = info->attrs; struct devlink_region *region; struct nlattr *chunks_attr; const char *region_name; struct devlink *devlink; - bool dump = true; void *hdr; int err; @@ -4246,8 +4289,21 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto out_unlock; } + if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && + attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { + if (!start_offset) + start_offset = + nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + + end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); + } + + if (end_offset > region->size) + end_offset = region->size; + /* return 0 if there is no further data to read */ - if (start_offset >= region->size) { + if (start_offset == end_offset) { err = 0; goto out_unlock; } @@ -4274,27 +4330,10 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto nla_put_failure; } - if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && - attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { - if (!start_offset) - start_offset = - nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); - - end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); - end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); - dump = false; - - if (start_offset == end_offset) { - err = 0; - goto nla_put_failure; - } - } - err = devlink_nl_region_read_snapshot_fill(skb, devlink, region, attrs, start_offset, - end_offset, dump, - &ret_offset); + end_offset, &ret_offset); if (err && err != -EMSGSIZE) goto nla_put_failure; @@ -5830,7 +5869,8 @@ devlink_trap_action_get_from_info(struct genl_info *info, val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]); switch (val) { case DEVLINK_TRAP_ACTION_DROP: /* fall-through */ - case DEVLINK_TRAP_ACTION_TRAP: + case DEVLINK_TRAP_ACTION_TRAP: /* fall-through */ + case DEVLINK_TRAP_ACTION_MIRROR: *p_trap_action = val; break; default: @@ -8455,6 +8495,50 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP), DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, DROP), DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP), + DEVLINK_TRAP(STP, CONTROL), + DEVLINK_TRAP(LACP, CONTROL), + DEVLINK_TRAP(LLDP, CONTROL), + DEVLINK_TRAP(IGMP_QUERY, CONTROL), + DEVLINK_TRAP(IGMP_V1_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V2_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V3_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V2_LEAVE, CONTROL), + DEVLINK_TRAP(MLD_QUERY, CONTROL), + DEVLINK_TRAP(MLD_V1_REPORT, CONTROL), + DEVLINK_TRAP(MLD_V2_REPORT, CONTROL), + DEVLINK_TRAP(MLD_V1_DONE, CONTROL), + DEVLINK_TRAP(IPV4_DHCP, CONTROL), + DEVLINK_TRAP(IPV6_DHCP, CONTROL), + DEVLINK_TRAP(ARP_REQUEST, CONTROL), + DEVLINK_TRAP(ARP_RESPONSE, CONTROL), + DEVLINK_TRAP(ARP_OVERLAY, CONTROL), + DEVLINK_TRAP(IPV6_NEIGH_SOLICIT, CONTROL), + DEVLINK_TRAP(IPV6_NEIGH_ADVERT, CONTROL), + DEVLINK_TRAP(IPV4_BFD, CONTROL), + DEVLINK_TRAP(IPV6_BFD, CONTROL), + DEVLINK_TRAP(IPV4_OSPF, CONTROL), + DEVLINK_TRAP(IPV6_OSPF, CONTROL), + DEVLINK_TRAP(IPV4_BGP, CONTROL), + DEVLINK_TRAP(IPV6_BGP, CONTROL), + DEVLINK_TRAP(IPV4_VRRP, CONTROL), + DEVLINK_TRAP(IPV6_VRRP, CONTROL), + DEVLINK_TRAP(IPV4_PIM, CONTROL), + DEVLINK_TRAP(IPV6_PIM, CONTROL), + DEVLINK_TRAP(UC_LB, CONTROL), + DEVLINK_TRAP(LOCAL_ROUTE, CONTROL), + DEVLINK_TRAP(EXTERNAL_ROUTE, CONTROL), + DEVLINK_TRAP(IPV6_UC_DIP_LINK_LOCAL_SCOPE, CONTROL), + DEVLINK_TRAP(IPV6_DIP_ALL_NODES, CONTROL), + DEVLINK_TRAP(IPV6_DIP_ALL_ROUTERS, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_SOLICIT, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_ADVERT, CONTROL), + DEVLINK_TRAP(IPV6_REDIRECT, CONTROL), + DEVLINK_TRAP(IPV4_ROUTER_ALERT, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL), + DEVLINK_TRAP(PTP_EVENT, CONTROL), + DEVLINK_TRAP(PTP_GENERAL, CONTROL), + DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL), + DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8466,9 +8550,28 @@ static const struct devlink_trap devlink_trap_generic[] = { static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(L2_DROPS), DEVLINK_TRAP_GROUP(L3_DROPS), + DEVLINK_TRAP_GROUP(L3_EXCEPTIONS), DEVLINK_TRAP_GROUP(BUFFER_DROPS), DEVLINK_TRAP_GROUP(TUNNEL_DROPS), DEVLINK_TRAP_GROUP(ACL_DROPS), + DEVLINK_TRAP_GROUP(STP), + DEVLINK_TRAP_GROUP(LACP), + DEVLINK_TRAP_GROUP(LLDP), + DEVLINK_TRAP_GROUP(MC_SNOOPING), + DEVLINK_TRAP_GROUP(DHCP), + DEVLINK_TRAP_GROUP(NEIGH_DISCOVERY), + DEVLINK_TRAP_GROUP(BFD), + DEVLINK_TRAP_GROUP(OSPF), + DEVLINK_TRAP_GROUP(BGP), + DEVLINK_TRAP_GROUP(VRRP), + DEVLINK_TRAP_GROUP(PIM), + DEVLINK_TRAP_GROUP(UC_LB), + DEVLINK_TRAP_GROUP(LOCAL_DELIVERY), + DEVLINK_TRAP_GROUP(IPV6), + DEVLINK_TRAP_GROUP(PTP_EVENT), + DEVLINK_TRAP_GROUP(PTP_GENERAL), + DEVLINK_TRAP_GROUP(ACL_SAMPLE), + DEVLINK_TRAP_GROUP(ACL_TRAP), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) @@ -8806,6 +8909,13 @@ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, devlink_trap_stats_update(trap_item->stats, skb->len); devlink_trap_stats_update(trap_item->group_item->stats, skb->len); + /* Control packets were not dropped by the device or encountered an + * exception during forwarding and therefore should not be reported to + * the kernel's drop monitor. + */ + if (trap_item->trap->type == DEVLINK_TRAP_TYPE_CONTROL) + return; + devlink_trap_report_metadata_fill(&hw_metadata, trap_item, in_devlink_port, fa_cookie); net_dm_hw_report(skb, &hw_metadata); diff --git a/net/core/dst.c b/net/core/dst.c index 193af526e908..d6b6ced0d451 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -81,11 +81,11 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, { struct dst_entry *dst; - if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { + if (ops->gc && + !(flags & DST_NOCOUNT) && + dst_entries_get_fast(ops) > ops->gc_thresh) { if (ops->gc(ops)) { - printk_ratelimited(KERN_NOTICE "Route cache is full: " - "consider increasing sysctl " - "net.ipv[4|6].route.max_size.\n"); + pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n"); return NULL; } } diff --git a/net/core/filter.c b/net/core/filter.c index 5cc9276f1023..d01a244b5087 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -256,17 +256,6 @@ BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, offset); } -BPF_CALL_0(bpf_get_raw_cpu_id) -{ - return raw_smp_processor_id(); -} - -static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { - .func = bpf_get_raw_cpu_id, - .gpl_only = false, - .ret_type = RET_INTEGER, -}; - static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { @@ -2026,6 +2015,40 @@ static const struct bpf_func_proto bpf_csum_update_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level) +{ + /* The interface is to be used in combination with bpf_skb_adjust_room() + * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET + * is passed as flags, for example. + */ + switch (level) { + case BPF_CSUM_LEVEL_INC: + __skb_incr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_DEC: + __skb_decr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_RESET: + __skb_reset_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_QUERY: + return skb->ip_summed == CHECKSUM_UNNECESSARY ? + skb->csum_level : -EACCES; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_csum_level_proto = { + .func = bpf_csum_level, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb(dev, skb); @@ -3124,7 +3147,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, { int ret; - if (flags & ~BPF_F_ADJ_ROOM_FIXED_GSO) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { @@ -3174,7 +3198,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32 off; int ret; - if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK)) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; @@ -3202,6 +3227,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : bpf_skb_net_grow(skb, off, len_diff_abs, flags); + if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET)) + __skb_reset_checksum_unnecessary(skb); bpf_compute_data_pointers(skb); return ret; @@ -3422,15 +3449,26 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) { + void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ void *data_end = xdp->data_end + offset; - /* only shrinking is allowed for now. */ - if (unlikely(offset >= 0)) + /* Notice that xdp_data_hard_end have reserved some tailroom */ + if (unlikely(data_end > data_hard_end)) + return -EINVAL; + + /* ALL drivers MUST init xdp->frame_sz, chicken check below */ + if (unlikely(xdp->frame_sz > PAGE_SIZE)) { + WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz); return -EINVAL; + } if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; + /* Clear memory area on grow, can contain uninit kernel memory */ + if (offset > 0) + memset(xdp->data_end, 0, offset); + xdp->data_end = data_end; return 0; @@ -4014,16 +4052,22 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { }; #ifdef CONFIG_SOCK_CGROUP_DATA +static inline u64 __bpf_sk_cgroup_id(struct sock *sk) +{ + struct cgroup *cgrp; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + return cgroup_id(cgrp); +} + BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) { struct sock *sk = skb_to_full_sk(skb); - struct cgroup *cgrp; if (!sk || !sk_fullsock(sk)) return 0; - cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return cgroup_id(cgrp); + return __bpf_sk_cgroup_id(sk); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { @@ -4033,16 +4077,12 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, - ancestor_level) +static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, + int ancestor_level) { - struct sock *sk = skb_to_full_sk(skb); struct cgroup *ancestor; struct cgroup *cgrp; - if (!sk || !sk_fullsock(sk)) - return 0; - cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ancestor = cgroup_ancestor(cgrp, ancestor_level); if (!ancestor) @@ -4051,6 +4091,17 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, return cgroup_id(ancestor); } +BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, + ancestor_level) +{ + struct sock *sk = skb_to_full_sk(skb); + + if (!sk || !sk_fullsock(sk)) + return 0; + + return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); +} + static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .func = bpf_skb_ancestor_cgroup_id, .gpl_only = false, @@ -4058,6 +4109,31 @@ static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; + +BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk) +{ + return __bpf_sk_cgroup_id(sk); +} + +static const struct bpf_func_proto bpf_sk_cgroup_id_proto = { + .func = bpf_sk_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, +}; + +BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) +{ + return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); +} + +static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { + .func = bpf_sk_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, + .arg2_type = ARG_ANYTHING, +}; #endif static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, @@ -4205,38 +4281,24 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags, - void *, data, u64, size) -{ - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) - return -EINVAL; - - return bpf_event_output(map, flags, data, size, NULL, 0, NULL); -} +#define SOCKOPT_CC_REINIT (1 << 0) -static const struct bpf_func_proto bpf_event_output_data_proto = { - .func = bpf_event_output_data, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE_OR_ZERO, -}; - -BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, - int, level, int, optname, char *, optval, int, optlen) +static int _bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen, u32 flags) { - struct sock *sk = bpf_sock->sk; + char devname[IFNAMSIZ]; + struct net *net; + int ifindex; int ret = 0; int val; if (!sk_fullsock(sk)) return -EINVAL; + sock_owned_by_me(sk); + if (level == SOL_SOCKET) { - if (optlen != sizeof(int)) + if (optlen != sizeof(int) && optname != SO_BINDTODEVICE) return -EINVAL; val = *((int *)optval); @@ -4277,6 +4339,29 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk_dst_reset(sk); } break; + case SO_BINDTODEVICE: + ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES + optlen = min_t(long, optlen, IFNAMSIZ - 1); + strncpy(devname, optval, optlen); + devname[optlen] = 0; + + ifindex = 0; + if (devname[0] != '\0') { + struct net_device *dev; + + ret = -ENODEV; + + net = sock_net(sk); + dev = dev_get_by_name(net, devname); + if (!dev) + break; + ifindex = dev->ifindex; + dev_put(dev); + } + ret = sock_bindtoindex(sk, ifindex, false); +#endif + break; default: ret = -EINVAL; } @@ -4329,7 +4414,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; - bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN; + bool reinit = flags & SOCKOPT_CC_REINIT; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); @@ -4376,24 +4461,14 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, return ret; } -static const struct bpf_func_proto bpf_setsockopt_proto = { - .func = bpf_setsockopt, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, -}; - -BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, - int, level, int, optname, char *, optval, int, optlen) +static int _bpf_getsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) { - struct sock *sk = bpf_sock->sk; - if (!sk_fullsock(sk)) goto err_clear; + + sock_owned_by_me(sk); + #ifdef CONFIG_INET if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { struct inet_connection_sock *icsk; @@ -4459,8 +4534,71 @@ err_clear: return -EINVAL; } -static const struct bpf_func_proto bpf_getsockopt_proto = { - .func = bpf_getsockopt, +BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, + int, level, int, optname, char *, optval, int, optlen) +{ + u32 flags = 0; + return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen, + flags); +} + +static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { + .func = bpf_sock_addr_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { + .func = bpf_sock_addr_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + u32 flags = 0; + if (bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) + flags |= SOCKOPT_CC_REINIT; + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen, + flags); +} + +static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { + .func = bpf_sock_ops_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = { + .func = bpf_sock_ops_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -4500,30 +4638,28 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, { #ifdef CONFIG_INET struct sock *sk = ctx->sk; + u32 flags = BIND_FROM_BPF; int err; - /* Binding to port can be expensive so it's prohibited in the helper. - * Only binding to IP is supported. - */ err = -EINVAL; if (addr_len < offsetofend(struct sockaddr, sa_family)) return err; if (addr->sa_family == AF_INET) { if (addr_len < sizeof(struct sockaddr_in)) return err; - if (((struct sockaddr_in *)addr)->sin_port != htons(0)) - return err; - return __inet_bind(sk, addr, addr_len, true, false); + if (((struct sockaddr_in *)addr)->sin_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; + return __inet_bind(sk, addr, addr_len, flags); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) return err; - if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) - return err; + if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ - return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false); + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ @@ -5983,52 +6119,7 @@ bool bpf_helper_changes_pkt_data(void *func) return false; } -const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id) -{ - switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; - case BPF_FUNC_get_prandom_u32: - return &bpf_get_prandom_u32_proto; - case BPF_FUNC_get_smp_processor_id: - return &bpf_get_raw_smp_processor_id_proto; - case BPF_FUNC_get_numa_node_id: - return &bpf_get_numa_node_id_proto; - case BPF_FUNC_tail_call: - return &bpf_tail_call_proto; - case BPF_FUNC_ktime_get_ns: - return &bpf_ktime_get_ns_proto; - default: - break; - } - - if (!capable(CAP_SYS_ADMIN)) - return NULL; - - switch (func_id) { - case BPF_FUNC_spin_lock: - return &bpf_spin_lock_proto; - case BPF_FUNC_spin_unlock: - return &bpf_spin_unlock_proto; - case BPF_FUNC_trace_printk: - return bpf_get_trace_printk_proto(); - case BPF_FUNC_jiffies64: - return &bpf_jiffies64_proto; - default: - return NULL; - } -} +const struct bpf_func_proto bpf_event_output_data_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -6119,6 +6210,22 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_setsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_sock_addr_setsockopt_proto; + default: + return NULL; + } + case BPF_FUNC_getsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_sock_addr_getsockopt_proto; + default: + return NULL; + } default: return bpf_base_func_proto(func_id); } @@ -6163,8 +6270,22 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; + case BPF_FUNC_skb_ancestor_cgroup_id: + return &bpf_skb_ancestor_cgroup_id_proto; + case BPF_FUNC_sk_cgroup_id: + return &bpf_sk_cgroup_id_proto; + case BPF_FUNC_sk_ancestor_cgroup_id: + return &bpf_sk_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_skc_lookup_tcp_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: @@ -6193,6 +6314,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_csum_diff_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -6213,6 +6336,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_adjust_room_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: @@ -6335,9 +6460,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: - return &bpf_setsockopt_proto; + return &bpf_sock_ops_setsockopt_proto; case BPF_FUNC_getsockopt: - return &bpf_getsockopt_proto; + return &bpf_sock_ops_getsockopt_proto; case BPF_FUNC_sock_ops_cb_flags_set: return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: @@ -6384,6 +6509,26 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_push_data_proto; case BPF_FUNC_msg_pop_data: return &bpf_msg_pop_data_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#endif +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif default: return bpf_base_func_proto(func_id); } @@ -6504,6 +6649,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_store_bytes_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -6628,7 +6775,7 @@ static bool cg_skb_is_valid_access(int off, int size, return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return false; break; } @@ -6640,7 +6787,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return false; break; default: @@ -6770,6 +6917,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case offsetof(struct bpf_sock, protocol): case offsetof(struct bpf_sock, dst_port): case offsetof(struct bpf_sock, src_port): + case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): case bpf_ctx_range(struct bpf_sock, dst_ip4): @@ -6935,6 +7083,13 @@ static bool xdp_is_valid_access(int off, int size, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { + if (prog->expected_attach_type != BPF_XDP_DEVMAP) { + switch (off) { + case offsetof(struct xdp_md, egress_ifindex): + return false; + } + } + if (type == BPF_WRITE) { if (bpf_prog_is_dev_bound(prog->aux)) { switch (off) { @@ -6990,6 +7145,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: break; @@ -7001,6 +7158,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP6_RECVMSG: break; @@ -7033,6 +7192,7 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): + case bpf_ctx_range(struct bpf_sock_addr, user_port): if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); @@ -7063,10 +7223,6 @@ static bool sock_addr_is_valid_access(int off, int size, return false; } break; - case bpf_ctx_range(struct bpf_sock_addr, user_port): - if (size != size_default) - return false; - break; case offsetof(struct bpf_sock_addr, sk): if (type != BPF_READ) return false; @@ -7197,6 +7353,11 @@ static bool sk_msg_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; + case offsetof(struct sk_msg_md, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; case bpf_ctx_range(struct sk_msg_md, family): case bpf_ctx_range(struct sk_msg_md, remote_ip4): case bpf_ctx_range(struct sk_msg_md, local_ip4): @@ -7812,6 +7973,23 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, skc_state), target_size)); break; + case offsetof(struct bpf_sock, rx_queue_mapping): +#ifdef CONFIG_XPS + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_rx_queue_mapping, + sizeof_field(struct sock, + sk_rx_queue_mapping), + target_size)); + *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING, + 1); + *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); +#else + *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); + *target_size = 2; +#endif + break; } return insn - insn_buf; @@ -7882,6 +8060,16 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, offsetof(struct xdp_rxq_info, queue_index)); break; + case offsetof(struct xdp_md, egress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, txq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_txq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct net_device, ifindex)); + break; } return insn - insn_buf; @@ -7962,8 +8150,8 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { + int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port); struct bpf_insn *insn = insn_buf; - int off; switch (si->off) { case offsetof(struct bpf_sock_addr, user_family): @@ -7998,9 +8186,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, offsetof(struct sockaddr_in6, sin6_port)); BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) != sizeof_field(struct sockaddr_in6, sin6_port)); - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern, - struct sockaddr_in6, uaddr, - sin6_port, tmp_reg); + /* Account for sin6_port being smaller than user_port. */ + port_size = min(port_size, BPF_LDST_BYTES(si)); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, + sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg); break; case offsetof(struct bpf_sock_addr, family): @@ -8531,6 +8721,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct sk_msg_sg, size)); break; + + case offsetof(struct sk_msg_md, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + break; } return insn - insn_buf; @@ -8786,6 +8982,10 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, reuse = rcu_dereference(selected_sk->sk_reuseport_cb); if (!reuse) { + /* Lookup in sock_map can return TCP ESTABLISHED sockets. */ + if (sk_is_refcounted(selected_sk)) + sock_put(selected_sk); + /* reuseport_array has only sk with non NULL sk_reuseport_cb. * The only (!reuse) case here is - the sk has already been * unhashed (e.g. by close()), so treat it as -ENOENT. diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5dceed467f64..d02df0b6d0d9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -31,8 +31,7 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_labels.h> #endif - -static DEFINE_MUTEX(flow_dissector_mutex); +#include <linux/bpf-netns.h> static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) @@ -70,54 +69,11 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); -int skb_flow_dissector_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); - u32 prog_id, prog_cnt = 0, flags = 0; - struct bpf_prog *attached; - struct net *net; - - if (attr->query.query_flags) - return -EINVAL; - - net = get_net_ns_by_fd(attr->query.target_fd); - if (IS_ERR(net)) - return PTR_ERR(net); - - rcu_read_lock(); - attached = rcu_dereference(net->flow_dissector_prog); - if (attached) { - prog_cnt = 1; - prog_id = attached->aux->id; - } - rcu_read_unlock(); - - put_net(net); - - if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) - return -EFAULT; - if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) - return -EFAULT; - - if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) - return 0; - - if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) - return -EFAULT; - - return 0; -} - -int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog) +#ifdef CONFIG_BPF_SYSCALL +int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog) { + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog *attached; - struct net *net; - int ret = 0; - - net = current->nsproxy->net_ns; - mutex_lock(&flow_dissector_mutex); if (net == &init_net) { /* BPF flow dissector in the root namespace overrides @@ -130,70 +86,29 @@ int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, for_each_net(ns) { if (ns == &init_net) continue; - if (rcu_access_pointer(ns->flow_dissector_prog)) { - ret = -EEXIST; - goto out; - } + if (rcu_access_pointer(ns->bpf.progs[type])) + return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. */ - if (rcu_access_pointer(init_net.flow_dissector_prog)) { - ret = -EEXIST; - goto out; - } + if (rcu_access_pointer(init_net.bpf.progs[type])) + return -EEXIST; } - attached = rcu_dereference_protected(net->flow_dissector_prog, - lockdep_is_held(&flow_dissector_mutex)); - if (attached == prog) { + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (attached == prog) /* The same program cannot be attached twice */ - ret = -EINVAL; - goto out; - } - rcu_assign_pointer(net->flow_dissector_prog, prog); + return -EINVAL; + + rcu_assign_pointer(net->bpf.progs[type], prog); if (attached) bpf_prog_put(attached); -out: - mutex_unlock(&flow_dissector_mutex); - return ret; -} - -static int flow_dissector_bpf_prog_detach(struct net *net) -{ - struct bpf_prog *attached; - - mutex_lock(&flow_dissector_mutex); - attached = rcu_dereference_protected(net->flow_dissector_prog, - lockdep_is_held(&flow_dissector_mutex)); - if (!attached) { - mutex_unlock(&flow_dissector_mutex); - return -ENOENT; - } - RCU_INIT_POINTER(net->flow_dissector_prog, NULL); - bpf_prog_put(attached); - mutex_unlock(&flow_dissector_mutex); return 0; } - -int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) -{ - return flow_dissector_bpf_prog_detach(current->nsproxy->net_ns); -} - -static void __net_exit flow_dissector_pernet_pre_exit(struct net *net) -{ - /* We're not racing with attach/detach because there are no - * references to netns left when pre_exit gets called. - */ - if (rcu_access_pointer(net->flow_dissector_prog)) - flow_dissector_bpf_prog_detach(net); -} - -static struct pernet_operations flow_dissector_pernet_ops __net_initdata = { - .pre_exit = flow_dissector_pernet_pre_exit, -}; +#endif /* CONFIG_BPF_SYSCALL */ /** * __skb_flow_get_ports - extract the upper layer ports and return them @@ -480,47 +395,59 @@ EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int nhoff, int hlen) + void *target_container, void *data, int nhoff, int hlen, + int lse_index, bool *entropy_label) { - struct flow_dissector_key_keyid *key_keyid; - struct mpls_label *hdr, _hdr[2]; - u32 entry, label; + struct mpls_label *hdr, _hdr; + u32 entry, label, bos; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) return FLOW_DISSECT_RET_OUT_GOOD; + if (lse_index >= FLOW_DIS_MPLS_MAX) + return FLOW_DISSECT_RET_OUT_GOOD; + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; - entry = ntohl(hdr[0].entry); + entry = ntohl(hdr->entry); label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; + bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) { struct flow_dissector_key_mpls *key_mpls; + struct flow_dissector_mpls_lse *lse; key_mpls = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS, target_container); - key_mpls->mpls_label = label; - key_mpls->mpls_ttl = (entry & MPLS_LS_TTL_MASK) - >> MPLS_LS_TTL_SHIFT; - key_mpls->mpls_tc = (entry & MPLS_LS_TC_MASK) - >> MPLS_LS_TC_SHIFT; - key_mpls->mpls_bos = (entry & MPLS_LS_S_MASK) - >> MPLS_LS_S_SHIFT; + lse = &key_mpls->ls[lse_index]; + + lse->mpls_ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; + lse->mpls_bos = bos; + lse->mpls_tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; + lse->mpls_label = label; + dissector_set_mpls_lse(key_mpls, lse_index); } - if (label == MPLS_LABEL_ENTROPY) { + if (*entropy_label && + dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { + struct flow_dissector_key_keyid *key_keyid; + key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY, target_container); - key_keyid->keyid = hdr[1].entry & htonl(MPLS_LS_LABEL_MASK); + key_keyid->keyid = cpu_to_be32(label); } - return FLOW_DISSECT_RET_OUT_GOOD; + + *entropy_label = label == MPLS_LABEL_ENTROPY; + + return bos ? FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN; } static enum flow_dissect_ret @@ -979,6 +906,8 @@ bool __skb_flow_dissect(const struct net *net, struct bpf_prog *attached = NULL; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; + bool mpls_el = false; + int mpls_lse = 0; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -1030,11 +959,13 @@ bool __skb_flow_dissect(const struct net *net, WARN_ON_ONCE(!net); if (net) { + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; + rcu_read_lock(); - attached = rcu_dereference(init_net.flow_dissector_prog); + attached = rcu_dereference(init_net.bpf.progs[type]); if (!attached) - attached = rcu_dereference(net->flow_dissector_prog); + attached = rcu_dereference(net->bpf.progs[type]); if (attached) { struct bpf_flow_keys flow_keys; @@ -1278,7 +1209,10 @@ proto_again: case htons(ETH_P_MPLS_MC): fdret = __skb_flow_dissect_mpls(skb, flow_dissector, target_container, data, - nhoff, hlen); + nhoff, hlen, mpls_lse, + &mpls_el); + nhoff += sizeof(struct mpls_label); + mpls_lse++; break; case htons(ETH_P_FCOE): if ((hlen - nhoff) < FCOE_HEADER_LEN) { @@ -1852,7 +1786,6 @@ static int __init init_default_flow_dissectors(void) skb_flow_dissector_init(&flow_keys_basic_dissector, flow_keys_basic_dissector_keys, ARRAY_SIZE(flow_keys_basic_dissector_keys)); - - return register_pernet_subsys(&flow_dissector_pernet_ops); + return 0; } core_initcall(init_default_flow_dissectors); diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index e951b743bed3..0cfc35e6be28 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -8,6 +8,7 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions) { struct flow_rule *rule; + int i; rule = kzalloc(struct_size(rule, action.entries, num_actions), GFP_KERNEL); @@ -15,6 +16,11 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions) return NULL; rule->action.num_entries = num_actions; + /* Pre-fill each action hw_stats with DONT_CARE. + * Caller can override this if it wants stats for a given action. + */ + for (i = 0; i < num_actions; i++) + rule->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE; return rule; } @@ -311,240 +317,159 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f, } EXPORT_SYMBOL(flow_block_cb_setup_simple); -static LIST_HEAD(block_cb_list); +static DEFINE_MUTEX(flow_indr_block_lock); +static LIST_HEAD(flow_block_indr_list); +static LIST_HEAD(flow_block_indr_dev_list); -static struct rhashtable indr_setup_block_ht; - -struct flow_indr_block_cb { - struct list_head list; - void *cb_priv; - flow_indr_block_bind_cb_t *cb; - void *cb_ident; -}; - -struct flow_indr_block_dev { - struct rhash_head ht_node; - struct net_device *dev; - unsigned int refcnt; - struct list_head cb_list; -}; - -static const struct rhashtable_params flow_indr_setup_block_ht_params = { - .key_offset = offsetof(struct flow_indr_block_dev, dev), - .head_offset = offsetof(struct flow_indr_block_dev, ht_node), - .key_len = sizeof(struct net_device *), +struct flow_indr_dev { + struct list_head list; + flow_indr_block_bind_cb_t *cb; + void *cb_priv; + refcount_t refcnt; + struct rcu_head rcu; }; -static struct flow_indr_block_dev * -flow_indr_block_dev_lookup(struct net_device *dev) -{ - return rhashtable_lookup_fast(&indr_setup_block_ht, &dev, - flow_indr_setup_block_ht_params); -} - -static struct flow_indr_block_dev * -flow_indr_block_dev_get(struct net_device *dev) +static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb, + void *cb_priv) { - struct flow_indr_block_dev *indr_dev; + struct flow_indr_dev *indr_dev; - indr_dev = flow_indr_block_dev_lookup(dev); - if (indr_dev) - goto inc_ref; - - indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL); + indr_dev = kmalloc(sizeof(*indr_dev), GFP_KERNEL); if (!indr_dev) return NULL; - INIT_LIST_HEAD(&indr_dev->cb_list); - indr_dev->dev = dev; - if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node, - flow_indr_setup_block_ht_params)) { - kfree(indr_dev); - return NULL; - } + indr_dev->cb = cb; + indr_dev->cb_priv = cb_priv; + refcount_set(&indr_dev->refcnt, 1); -inc_ref: - indr_dev->refcnt++; return indr_dev; } -static void flow_indr_block_dev_put(struct flow_indr_block_dev *indr_dev) -{ - if (--indr_dev->refcnt) - return; - - rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node, - flow_indr_setup_block_ht_params); - kfree(indr_dev); -} - -static struct flow_indr_block_cb * -flow_indr_block_cb_lookup(struct flow_indr_block_dev *indr_dev, - flow_indr_block_bind_cb_t *cb, void *cb_ident) -{ - struct flow_indr_block_cb *indr_block_cb; - - list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) - if (indr_block_cb->cb == cb && - indr_block_cb->cb_ident == cb_ident) - return indr_block_cb; - return NULL; -} - -static struct flow_indr_block_cb * -flow_indr_block_cb_add(struct flow_indr_block_dev *indr_dev, void *cb_priv, - flow_indr_block_bind_cb_t *cb, void *cb_ident) +int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) { - struct flow_indr_block_cb *indr_block_cb; + struct flow_indr_dev *indr_dev; - indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident); - if (indr_block_cb) - return ERR_PTR(-EEXIST); + mutex_lock(&flow_indr_block_lock); + list_for_each_entry(indr_dev, &flow_block_indr_dev_list, list) { + if (indr_dev->cb == cb && + indr_dev->cb_priv == cb_priv) { + refcount_inc(&indr_dev->refcnt); + mutex_unlock(&flow_indr_block_lock); + return 0; + } + } - indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL); - if (!indr_block_cb) - return ERR_PTR(-ENOMEM); + indr_dev = flow_indr_dev_alloc(cb, cb_priv); + if (!indr_dev) { + mutex_unlock(&flow_indr_block_lock); + return -ENOMEM; + } - indr_block_cb->cb_priv = cb_priv; - indr_block_cb->cb = cb; - indr_block_cb->cb_ident = cb_ident; - list_add(&indr_block_cb->list, &indr_dev->cb_list); + list_add(&indr_dev->list, &flow_block_indr_dev_list); + mutex_unlock(&flow_indr_block_lock); - return indr_block_cb; + return 0; } +EXPORT_SYMBOL(flow_indr_dev_register); -static void flow_indr_block_cb_del(struct flow_indr_block_cb *indr_block_cb) +static void __flow_block_indr_cleanup(flow_setup_cb_t *setup_cb, void *cb_priv, + struct list_head *cleanup_list) { - list_del(&indr_block_cb->list); - kfree(indr_block_cb); -} - -static DEFINE_MUTEX(flow_indr_block_cb_lock); + struct flow_block_cb *this, *next; -static void flow_block_cmd(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, void *cb_priv, - enum flow_block_command command) -{ - struct flow_indr_block_entry *entry; - - mutex_lock(&flow_indr_block_cb_lock); - list_for_each_entry(entry, &block_cb_list, list) { - entry->cb(dev, cb, cb_priv, command); + list_for_each_entry_safe(this, next, &flow_block_indr_list, indr.list) { + if (this->cb == setup_cb && + this->cb_priv == cb_priv) { + list_move(&this->indr.list, cleanup_list); + return; + } } - mutex_unlock(&flow_indr_block_cb_lock); } -int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) +static void flow_block_indr_notify(struct list_head *cleanup_list) { - struct flow_indr_block_cb *indr_block_cb; - struct flow_indr_block_dev *indr_dev; - int err; - - indr_dev = flow_indr_block_dev_get(dev); - if (!indr_dev) - return -ENOMEM; - - indr_block_cb = flow_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident); - err = PTR_ERR_OR_ZERO(indr_block_cb); - if (err) - goto err_dev_put; - - flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, - FLOW_BLOCK_BIND); - - return 0; + struct flow_block_cb *this, *next; -err_dev_put: - flow_indr_block_dev_put(indr_dev); - return err; -} -EXPORT_SYMBOL_GPL(__flow_indr_block_cb_register); - -int flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) -{ - int err; - - rtnl_lock(); - err = __flow_indr_block_cb_register(dev, cb_priv, cb, cb_ident); - rtnl_unlock(); - - return err; + list_for_each_entry_safe(this, next, cleanup_list, indr.list) { + list_del(&this->indr.list); + this->indr.cleanup(this); + } } -EXPORT_SYMBOL_GPL(flow_indr_block_cb_register); -void __flow_indr_block_cb_unregister(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) +void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, + flow_setup_cb_t *setup_cb) { - struct flow_indr_block_cb *indr_block_cb; - struct flow_indr_block_dev *indr_dev; + struct flow_indr_dev *this, *next, *indr_dev = NULL; + LIST_HEAD(cleanup_list); - indr_dev = flow_indr_block_dev_lookup(dev); - if (!indr_dev) - return; + mutex_lock(&flow_indr_block_lock); + list_for_each_entry_safe(this, next, &flow_block_indr_dev_list, list) { + if (this->cb == cb && + this->cb_priv == cb_priv && + refcount_dec_and_test(&this->refcnt)) { + indr_dev = this; + list_del(&indr_dev->list); + break; + } + } - indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident); - if (!indr_block_cb) + if (!indr_dev) { + mutex_unlock(&flow_indr_block_lock); return; + } - flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, - FLOW_BLOCK_UNBIND); + __flow_block_indr_cleanup(setup_cb, cb_priv, &cleanup_list); + mutex_unlock(&flow_indr_block_lock); - flow_indr_block_cb_del(indr_block_cb); - flow_indr_block_dev_put(indr_dev); + flow_block_indr_notify(&cleanup_list); + kfree(indr_dev); } -EXPORT_SYMBOL_GPL(__flow_indr_block_cb_unregister); +EXPORT_SYMBOL(flow_indr_dev_unregister); -void flow_indr_block_cb_unregister(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) +static void flow_block_indr_init(struct flow_block_cb *flow_block, + struct flow_block_offload *bo, + struct net_device *dev, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) { - rtnl_lock(); - __flow_indr_block_cb_unregister(dev, cb, cb_ident); - rtnl_unlock(); + flow_block->indr.binder_type = bo->binder_type; + flow_block->indr.data = data; + flow_block->indr.dev = dev; + flow_block->indr.cleanup = cleanup; } -EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister); -void flow_indr_block_call(struct net_device *dev, - struct flow_block_offload *bo, - enum flow_block_command command, - enum tc_setup_type type) +static void __flow_block_indr_binding(struct flow_block_offload *bo, + struct net_device *dev, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) { - struct flow_indr_block_cb *indr_block_cb; - struct flow_indr_block_dev *indr_dev; - - indr_dev = flow_indr_block_dev_lookup(dev); - if (!indr_dev) - return; + struct flow_block_cb *block_cb; - list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) - indr_block_cb->cb(dev, indr_block_cb->cb_priv, type, bo); + list_for_each_entry(block_cb, &bo->cb_list, list) { + switch (bo->command) { + case FLOW_BLOCK_BIND: + flow_block_indr_init(block_cb, bo, dev, data, cleanup); + list_add(&block_cb->indr.list, &flow_block_indr_list); + break; + case FLOW_BLOCK_UNBIND: + list_del(&block_cb->indr.list); + break; + } + } } -EXPORT_SYMBOL_GPL(flow_indr_block_call); -void flow_indr_add_block_cb(struct flow_indr_block_entry *entry) +int flow_indr_dev_setup_offload(struct net_device *dev, + enum tc_setup_type type, void *data, + struct flow_block_offload *bo, + void (*cleanup)(struct flow_block_cb *block_cb)) { - mutex_lock(&flow_indr_block_cb_lock); - list_add_tail(&entry->list, &block_cb_list); - mutex_unlock(&flow_indr_block_cb_lock); -} -EXPORT_SYMBOL_GPL(flow_indr_add_block_cb); + struct flow_indr_dev *this; -void flow_indr_del_block_cb(struct flow_indr_block_entry *entry) -{ - mutex_lock(&flow_indr_block_cb_lock); - list_del(&entry->list); - mutex_unlock(&flow_indr_block_cb_lock); -} -EXPORT_SYMBOL_GPL(flow_indr_del_block_cb); + mutex_lock(&flow_indr_block_lock); + list_for_each_entry(this, &flow_block_indr_dev_list, list) + this->cb(dev, this->cb_priv, type, bo); -static int __init init_flow_indr_rhashtable(void) -{ - return rhashtable_init(&indr_setup_block_ht, - &flow_indr_setup_block_ht_params); + __flow_block_indr_binding(bo, dev, data, cleanup); + mutex_unlock(&flow_indr_block_lock); + + return list_empty(&bo->cb_list) ? -EOPNOTSUPP : 0; } -subsys_initcall(init_flow_indr_rhashtable); +EXPORT_SYMBOL(flow_indr_dev_setup_offload); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 1d653fbfcf52..e491b083b348 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -6,7 +6,7 @@ * Jamal Hadi Salim * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * - * See Documentation/networking/gen_stats.txt + * See Documentation/networking/gen_stats.rst */ #include <linux/types.h> diff --git a/net/core/link_watch.c b/net/core/link_watch.c index f153e0601838..75431ca9300f 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -34,6 +34,9 @@ static DEFINE_SPINLOCK(lweventlist_lock); static unsigned char default_operstate(const struct net_device *dev) { + if (netif_testing(dev)) + return IF_OPER_TESTING; + if (!netif_carrier_ok(dev)) return (dev->ifindex != dev_get_iflink(dev) ? IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN); @@ -55,11 +58,15 @@ static void rfc2863_policy(struct net_device *dev) write_lock_bh(&dev_base_lock); switch(dev->link_mode) { + case IF_LINK_MODE_TESTING: + if (operstate == IF_OPER_UP) + operstate = IF_OPER_TESTING; + break; + case IF_LINK_MODE_DORMANT: if (operstate == IF_OPER_UP) operstate = IF_OPER_DORMANT; break; - case IF_LINK_MODE_DEFAULT: default: break; @@ -74,7 +81,8 @@ static void rfc2863_policy(struct net_device *dev) void linkwatch_init_dev(struct net_device *dev) { /* Handle pre-registration link state changes */ - if (!netif_carrier_ok(dev) || netif_dormant(dev)) + if (!netif_carrier_ok(dev) || netif_dormant(dev) || + netif_testing(dev)) rfc2863_policy(dev); } diff --git a/net/core/neighbour.c b/net/core/neighbour.c index dbe0c6ead773..ef6b5a8f629c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1771,6 +1771,7 @@ static struct neigh_table *neigh_find_table(int family) } const struct nla_policy nda_policy[NDA_MAX+1] = { + [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID }, [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, @@ -1781,6 +1782,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_IFINDEX] = { .type = NLA_U32 }, [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, + [NDA_NH_ID] = { .type = NLA_U32 }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -3379,7 +3381,7 @@ EXPORT_SYMBOL(neigh_app_ns); static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); static int proc_unres_qlen(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int size, ret; struct ctl_table tmp = *ctl; @@ -3443,8 +3445,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) } static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; @@ -3457,8 +3459,8 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, return ret; } -int neigh_proc_dointvec(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -3467,8 +3469,7 @@ int neigh_proc_dointvec(struct ctl_table *ctl, int write, } EXPORT_SYMBOL(neigh_proc_dointvec); -int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, +int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); @@ -3479,8 +3480,8 @@ int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); @@ -3489,8 +3490,7 @@ static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, } int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); @@ -3500,8 +3500,8 @@ int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); @@ -3510,8 +3510,8 @@ static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, } static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { struct neigh_parms *p = ctl->extra2; int ret; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 4773ad6ec111..e353b822bb15 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -243,6 +243,18 @@ static ssize_t duplex_show(struct device *dev, } static DEVICE_ATTR_RO(duplex); +static ssize_t testing_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + if (netif_running(netdev)) + return sprintf(buf, fmt_dec, !!netif_testing(netdev)); + + return -EINVAL; +} +static DEVICE_ATTR_RO(testing); + static ssize_t dormant_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -260,7 +272,7 @@ static const char *const operstates[] = { "notpresent", /* currently unused */ "down", "lowerlayerdown", - "testing", /* currently unused */ + "testing", "dormant", "up" }; @@ -355,7 +367,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) { - dev->gro_flush_timeout = val; + WRITE_ONCE(dev->gro_flush_timeout, val); return 0; } @@ -370,6 +382,23 @@ static ssize_t gro_flush_timeout_store(struct device *dev, } NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); +static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) +{ + WRITE_ONCE(dev->napi_defer_hard_irqs, val); + return 0; +} + +static ssize_t napi_defer_hard_irqs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs); +} +NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec); + static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -524,6 +553,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_speed.attr, &dev_attr_duplex.attr, &dev_attr_dormant.attr, + &dev_attr_testing.attr, &dev_attr_operstate.attr, &dev_attr_carrier_changes.attr, &dev_attr_ifalias.attr, @@ -532,6 +562,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_flags.attr, &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, + &dev_attr_napi_defer_hard_irqs.attr, &dev_attr_phys_port_id.attr, &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, @@ -1774,12 +1805,12 @@ static struct class net_class __ro_after_init = { #ifdef CONFIG_OF_NET static int of_dev_node_match(struct device *dev, const void *data) { - int ret = 0; - - if (dev->parent) - ret = dev->parent->of_node == data; + for (; dev; dev = dev->parent) { + if (dev->of_node == data) + return 1; + } - return ret == 0 ? dev->of_node == data : ret; + return 0; } /* diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 849380a622ef..093e90e52bc2 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -69,10 +69,11 @@ module_param(carrier_timeout, uint, 0644); #define np_notice(np, fmt, ...) \ pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) -static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb, + struct net_device *dev, + struct netdev_queue *txq) { - int status = NETDEV_TX_OK; + netdev_tx_t status = NETDEV_TX_OK; netdev_features_t features; features = netif_skb_features(skb); @@ -304,20 +305,22 @@ static int netpoll_owner_active(struct net_device *dev) } /* call with IRQ disabled */ -void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, - struct net_device *dev) +static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { - int status = NETDEV_TX_BUSY; + netdev_tx_t status = NETDEV_TX_BUSY; + struct net_device *dev; unsigned long tries; /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo; lockdep_assert_irqs_disabled(); - npinfo = rcu_dereference_bh(np->dev->npinfo); + dev = np->dev; + npinfo = rcu_dereference_bh(dev->npinfo); + if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { dev_kfree_skb_irq(skb); - return; + return NET_XMIT_DROP; } /* don't get messages out of order, and no recursion */ @@ -356,8 +359,25 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } + return NETDEV_TX_OK; +} + +netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +{ + unsigned long flags; + netdev_tx_t ret; + + if (unlikely(!np)) { + dev_kfree_skb_irq(skb); + ret = NET_XMIT_DROP; + } else { + local_irq_save(flags); + ret = __netpoll_send_skb(np, skb); + local_irq_restore(flags); + } + return ret; } -EXPORT_SYMBOL(netpoll_send_skb_on_dev); +EXPORT_SYMBOL(netpoll_send_skb); void netpoll_send_udp(struct netpoll *np, const char *msg, int len) { diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 08e2811b5274..b53b6d38c4df 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -56,7 +56,7 @@ * Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br) * * 021124 Finished major redesign and rewrite for new functionality. - * See Documentation/networking/pktgen.txt for how to use this. + * See Documentation/networking/pktgen.rst for how to use this. * * The new operation: * For each CPU one thread/process is created at start. This process checks diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 709ebbf8ab5b..2269199c5891 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -829,11 +829,18 @@ static void set_operstate(struct net_device *dev, unsigned char transition) switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || + operstate == IF_OPER_TESTING || operstate == IF_OPER_UNKNOWN) && - !netif_dormant(dev)) + !netif_dormant(dev) && !netif_testing(dev)) operstate = IF_OPER_UP; break; + case IF_OPER_TESTING: + if (operstate == IF_OPER_UP || + operstate == IF_OPER_UNKNOWN) + operstate = IF_OPER_TESTING; + break; + case IF_OPER_DORMANT: if (operstate == IF_OPER_UP || operstate == IF_OPER_UNKNOWN) @@ -3990,8 +3997,8 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; - int err = -EINVAL; __u8 *addr; + int err; u16 vid; if (!netlink_capable(skb, CAP_NET_ADMIN)) diff --git a/net/core/scm.c b/net/core/scm.c index dc6fed1f221c..875df1c2989d 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -212,16 +212,12 @@ EXPORT_SYMBOL(__scm_send); int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) { - struct cmsghdr __user *cm - = (__force struct cmsghdr __user *)msg->msg_control; - struct cmsghdr cmhdr; int cmlen = CMSG_LEN(len); - int err; - if (MSG_CMSG_COMPAT & msg->msg_flags) + if (msg->msg_flags & MSG_CMSG_COMPAT) return put_cmsg_compat(msg, level, type, len, data); - if (cm==NULL || msg->msg_controllen < sizeof(*cm)) { + if (!msg->msg_control || msg->msg_controllen < sizeof(struct cmsghdr)) { msg->msg_flags |= MSG_CTRUNC; return 0; /* XXX: return error? check spec. */ } @@ -229,23 +225,30 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) msg->msg_flags |= MSG_CTRUNC; cmlen = msg->msg_controllen; } - cmhdr.cmsg_level = level; - cmhdr.cmsg_type = type; - cmhdr.cmsg_len = cmlen; - - err = -EFAULT; - if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) - goto out; - if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr))) - goto out; - cmlen = CMSG_SPACE(len); - if (msg->msg_controllen < cmlen) - cmlen = msg->msg_controllen; + + if (msg->msg_control_is_user) { + struct cmsghdr __user *cm = msg->msg_control_user; + struct cmsghdr cmhdr; + + cmhdr.cmsg_level = level; + cmhdr.cmsg_type = type; + cmhdr.cmsg_len = cmlen; + if (copy_to_user(cm, &cmhdr, sizeof cmhdr) || + copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm))) + return -EFAULT; + } else { + struct cmsghdr *cm = msg->msg_control; + + cm->cmsg_level = level; + cm->cmsg_type = type; + cm->cmsg_len = cmlen; + memcpy(CMSG_DATA(cm), data, cmlen - sizeof(*cm)); + } + + cmlen = min(CMSG_SPACE(len), msg->msg_controllen); msg->msg_control += cmlen; msg->msg_controllen -= cmlen; - err = 0; -out: - return err; + return 0; } EXPORT_SYMBOL(put_cmsg); @@ -277,78 +280,90 @@ void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_inter } EXPORT_SYMBOL(put_cmsg_scm_timestamping); +static int __scm_install_fd(struct file *file, int __user *ufd, int o_flags) +{ + struct socket *sock; + int new_fd; + int error; + + error = security_file_receive(file); + if (error) + return error; + + new_fd = get_unused_fd_flags(o_flags); + if (new_fd < 0) + return new_fd; + + error = put_user(new_fd, ufd); + if (error) { + put_unused_fd(new_fd); + return error; + } + + /* Bump the usage count and install the file. */ + sock = sock_from_file(file, &error); + if (sock) { + sock_update_netprioidx(&sock->sk->sk_cgrp_data); + sock_update_classid(&sock->sk->sk_cgrp_data); + } + fd_install(new_fd, get_file(file)); + return 0; +} + +static int scm_max_fds(struct msghdr *msg) +{ + if (msg->msg_controllen <= sizeof(struct cmsghdr)) + return 0; + return (msg->msg_controllen - sizeof(struct cmsghdr)) / sizeof(int); +} + void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) { struct cmsghdr __user *cm = (__force struct cmsghdr __user*)msg->msg_control; - - int fdmax = 0; - int fdnum = scm->fp->count; - struct file **fp = scm->fp->fp; - int __user *cmfptr; + int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0; + int fdmax = min_t(int, scm_max_fds(msg), scm->fp->count); + int __user *cmsg_data = CMSG_USER_DATA(cm); int err = 0, i; - if (MSG_CMSG_COMPAT & msg->msg_flags) { + if (msg->msg_flags & MSG_CMSG_COMPAT) { scm_detach_fds_compat(msg, scm); return; } - if (msg->msg_controllen > sizeof(struct cmsghdr)) - fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr)) - / sizeof(int)); - - if (fdnum < fdmax) - fdmax = fdnum; + /* no use for FD passing from kernel space callers */ + if (WARN_ON_ONCE(!msg->msg_control_is_user)) + return; - for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax; - i++, cmfptr++) - { - struct socket *sock; - int new_fd; - err = security_file_receive(fp[i]); + for (i = 0; i < fdmax; i++) { + err = __scm_install_fd(scm->fp->fp[i], cmsg_data + i, o_flags); if (err) break; - err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags - ? O_CLOEXEC : 0); - if (err < 0) - break; - new_fd = err; - err = put_user(new_fd, cmfptr); - if (err) { - put_unused_fd(new_fd); - break; - } - /* Bump the usage count and install the file. */ - sock = sock_from_file(fp[i], &err); - if (sock) { - sock_update_netprioidx(&sock->sk->sk_cgrp_data); - sock_update_classid(&sock->sk->sk_cgrp_data); - } - fd_install(new_fd, get_file(fp[i])); } - if (i > 0) - { - int cmlen = CMSG_LEN(i*sizeof(int)); + if (i > 0) { + int cmlen = CMSG_LEN(i * sizeof(int)); + err = put_user(SOL_SOCKET, &cm->cmsg_level); if (!err) err = put_user(SCM_RIGHTS, &cm->cmsg_type); if (!err) err = put_user(cmlen, &cm->cmsg_len); if (!err) { - cmlen = CMSG_SPACE(i*sizeof(int)); + cmlen = CMSG_SPACE(i * sizeof(int)); if (msg->msg_controllen < cmlen) cmlen = msg->msg_controllen; msg->msg_control += cmlen; msg->msg_controllen -= cmlen; } } - if (i < fdnum || (fdnum && fdmax <= 0)) + + if (i < scm->fp->count || (scm->fp->count && fdmax <= 0)) msg->msg_flags |= MSG_CTRUNC; /* - * All of the files that fit in the message have had their - * usage counts incremented, so we just free the list. + * All of the files that fit in the message have had their usage counts + * incremented, so we just free the list. */ __scm_destroy(scm); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7e29590482ce..b8afefe6f6b6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -102,7 +102,7 @@ EXPORT_SYMBOL(sysctl_max_skb_frags); static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, const char msg[]) { - pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n", + pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n", msg, addr, skb->len, sz, skb->head, skb->data, (unsigned long)skb->tail, (unsigned long)skb->end, skb->dev ? skb->dev->name : "<NULL>"); @@ -3727,7 +3727,6 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) return 0; } -EXPORT_SYMBOL_GPL(skb_gro_receive_list); /** * skb_segment - Perform protocol segmentation on skb. @@ -4191,7 +4190,6 @@ done: NAPI_GRO_CB(skb)->same_flow = 1; return 0; } -EXPORT_SYMBOL_GPL(skb_gro_receive); #ifdef CONFIG_SKB_EXTENSIONS #define SKB_EXT_ALIGN_VALUE 8 @@ -6087,13 +6085,15 @@ static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) /** * __skb_ext_alloc - allocate a new skb extensions storage * + * @flags: See kmalloc(). + * * Returns the newly allocated pointer. The pointer can later attached to a * skb via __skb_ext_set(). * Note: caller must handle the skb_ext as an opaque data. */ -struct skb_ext *__skb_ext_alloc(void) +struct skb_ext *__skb_ext_alloc(gfp_t flags) { - struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); if (new) { memset(new->offset, 0, sizeof(new->offset)); @@ -6188,7 +6188,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) } else { newoff = SKB_EXT_CHUNKSIZEOF(*new); - new = __skb_ext_alloc(); + new = __skb_ext_alloc(GFP_ATOMIC); if (!new) return NULL; } diff --git a/net/core/skmsg.c b/net/core/skmsg.c index c479372f2cd2..351afbf6bfba 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -7,6 +7,7 @@ #include <net/sock.h> #include <net/tcp.h> +#include <net/tls.h> static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) { @@ -682,13 +683,75 @@ static struct sk_psock *sk_psock_from_strp(struct strparser *strp) return container_of(parser, struct sk_psock, parser); } -static void sk_psock_verdict_apply(struct sk_psock *psock, - struct sk_buff *skb, int verdict) +static void sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; bool ingress; + sk_other = tcp_skb_bpf_redirect_fetch(skb); + if (unlikely(!sk_other)) { + kfree_skb(skb); + return; + } + psock_other = sk_psock(sk_other); + if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || + !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + kfree_skb(skb); + return; + } + + ingress = tcp_skb_bpf_ingress(skb); + if ((!ingress && sock_writeable(sk_other)) || + (ingress && + atomic_read(&sk_other->sk_rmem_alloc) <= + sk_other->sk_rcvbuf)) { + if (!ingress) + skb_set_owner_w(skb, sk_other); + skb_queue_tail(&psock_other->ingress_skb, skb); + schedule_work(&psock_other->work); + } else { + kfree_skb(skb); + } +} + +static void sk_psock_tls_verdict_apply(struct sk_psock *psock, + struct sk_buff *skb, int verdict) +{ + switch (verdict) { + case __SK_REDIRECT: + sk_psock_skb_redirect(psock, skb); + break; + case __SK_PASS: + case __SK_DROP: + default: + break; + } +} + +int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) +{ + struct bpf_prog *prog; + int ret = __SK_PASS; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.skb_verdict); + if (likely(prog)) { + tcp_skb_bpf_redirect_clear(skb); + ret = sk_psock_bpf_run(psock, prog, skb); + ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + } + rcu_read_unlock(); + sk_psock_tls_verdict_apply(psock, skb, ret); + return ret; +} +EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); + +static void sk_psock_verdict_apply(struct sk_psock *psock, + struct sk_buff *skb, int verdict) +{ + struct sock *sk_other; + switch (verdict) { case __SK_PASS: sk_other = psock->sk; @@ -707,25 +770,8 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, } goto out_free; case __SK_REDIRECT: - sk_other = tcp_skb_bpf_redirect_fetch(skb); - if (unlikely(!sk_other)) - goto out_free; - psock_other = sk_psock(sk_other); - if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || - !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) - goto out_free; - ingress = tcp_skb_bpf_ingress(skb); - if ((!ingress && sock_writeable(sk_other)) || - (ingress && - atomic_read(&sk_other->sk_rmem_alloc) <= - sk_other->sk_rcvbuf)) { - if (!ingress) - skb_set_owner_w(skb, sk_other); - skb_queue_tail(&psock_other->ingress_skb, skb); - schedule_work(&psock_other->work); - break; - } - /* fall-through */ + sk_psock_skb_redirect(psock, skb); + break; case __SK_DROP: /* fall-through */ default: @@ -779,9 +825,13 @@ static void sk_psock_strp_data_ready(struct sock *sk) rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { - write_lock_bh(&sk->sk_callback_lock); - strp_data_ready(&psock->parser.strp); - write_unlock_bh(&sk->sk_callback_lock); + if (tls_sw_has_ctx_rx(sk)) { + psock->parser.saved_data_ready(sk); + } else { + write_lock_bh(&sk->sk_callback_lock); + strp_data_ready(&psock->parser.strp); + write_unlock_bh(&sk->sk_callback_lock); + } } rcu_read_unlock(); } diff --git a/net/core/sock.c b/net/core/sock.c index b714162213ae..6c4acf1f0220 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -566,7 +566,7 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) } EXPORT_SYMBOL(sk_dst_check); -static int sock_setbindtodevice_locked(struct sock *sk, int ifindex) +static int sock_bindtoindex_locked(struct sock *sk, int ifindex) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES @@ -594,6 +594,20 @@ out: return ret; } +int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) +{ + int ret; + + if (lock_sk) + lock_sock(sk); + ret = sock_bindtoindex_locked(sk, ifindex); + if (lock_sk) + release_sock(sk); + + return ret; +} +EXPORT_SYMBOL(sock_bindtoindex); + static int sock_setbindtodevice(struct sock *sk, char __user *optval, int optlen) { @@ -634,10 +648,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, goto out; } - lock_sock(sk); - ret = sock_setbindtodevice_locked(sk, index); - release_sock(sk); - + return sock_bindtoindex(sk, index, true); out: #endif @@ -712,6 +723,111 @@ bool sk_mc_loop(struct sock *sk) } EXPORT_SYMBOL(sk_mc_loop); +void sock_set_reuseaddr(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuse = SK_CAN_REUSE; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseaddr); + +void sock_set_reuseport(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuseport = true; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseport); + +void sock_no_linger(struct sock *sk) +{ + lock_sock(sk); + sk->sk_lingertime = 0; + sock_set_flag(sk, SOCK_LINGER); + release_sock(sk); +} +EXPORT_SYMBOL(sock_no_linger); + +void sock_set_priority(struct sock *sk, u32 priority) +{ + lock_sock(sk); + sk->sk_priority = priority; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_priority); + +void sock_set_sndtimeo(struct sock *sk, s64 secs) +{ + lock_sock(sk); + if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) + sk->sk_sndtimeo = secs * HZ; + else + sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_sndtimeo); + +static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) +{ + if (val) { + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); + sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + } else { + sock_reset_flag(sk, SOCK_RCVTSTAMP); + sock_reset_flag(sk, SOCK_RCVTSTAMPNS); + sock_reset_flag(sk, SOCK_TSTAMP_NEW); + } +} + +void sock_enable_timestamps(struct sock *sk) +{ + lock_sock(sk); + __sock_set_timestamps(sk, true, false, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_enable_timestamps); + +void sock_set_keepalive(struct sock *sk) +{ + lock_sock(sk); + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, true); + sock_valbool_flag(sk, SOCK_KEEPOPEN, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_keepalive); + +static void __sock_set_rcvbuf(struct sock *sk, int val) +{ + /* Ensure val * 2 fits into an int, to prevent max_t() from treating it + * as a negative value. + */ + val = min_t(int, val, INT_MAX / 2); + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + + /* We double it on the way in to account for "struct sk_buff" etc. + * overhead. Applications assume that the SO_RCVBUF setting they make + * will allow that much actual data to be received on that socket. + * + * Applications are unaware that "struct sk_buff" and other overheads + * allocate from the receive buffer during socket buffer allocation. + * + * And after considering the possible alternatives, returning the value + * we actually used in getsockopt is the most desirable behavior. + */ + WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); +} + +void sock_set_rcvbuf(struct sock *sk, int val) +{ + lock_sock(sk); + __sock_set_rcvbuf(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_rcvbuf); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -808,30 +924,7 @@ set_sndbuf: * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - val = min_t(u32, val, sysctl_rmem_max); -set_rcvbuf: - /* Ensure val * 2 fits into an int, to prevent max_t() - * from treating it as a negative value. - */ - val = min_t(int, val, INT_MAX / 2); - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - /* - * We double it on the way in to account for - * "struct sk_buff" etc. overhead. Applications - * assume that the SO_RCVBUF setting they make will - * allow that much actual data to be received on that - * socket. - * - * Applications are unaware that "struct sk_buff" and - * other overheads allocate from the receive buffer - * during socket buffer allocation. - * - * And after considering the possible alternatives, - * returning the value we actually used in getsockopt - * is the most desirable behavior. - */ - WRITE_ONCE(sk->sk_rcvbuf, - max_t(int, val * 2, SOCK_MIN_RCVBUF)); + __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max)); break; case SO_RCVBUFFORCE: @@ -843,9 +936,8 @@ set_rcvbuf: /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ - if (val < 0) - val = 0; - goto set_rcvbuf; + __sock_set_rcvbuf(sk, max(val, 0)); + break; case SO_KEEPALIVE: if (sk->sk_prot->keepalive) @@ -903,28 +995,17 @@ set_rcvbuf: break; case SO_TIMESTAMP_OLD: + __sock_set_timestamps(sk, valbool, false, false); + break; case SO_TIMESTAMP_NEW: + __sock_set_timestamps(sk, valbool, true, false); + break; case SO_TIMESTAMPNS_OLD: + __sock_set_timestamps(sk, valbool, false, true); + break; case SO_TIMESTAMPNS_NEW: - if (valbool) { - if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW) - sock_set_flag(sk, SOCK_TSTAMP_NEW); - else - sock_reset_flag(sk, SOCK_TSTAMP_NEW); - - if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW) - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - else - sock_set_flag(sk, SOCK_RCVTSTAMPNS); - sock_set_flag(sk, SOCK_RCVTSTAMP); - sock_enable_timestamp(sk, SOCK_TIMESTAMP); - } else { - sock_reset_flag(sk, SOCK_RCVTSTAMP); - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - sock_reset_flag(sk, SOCK_TSTAMP_NEW); - } + __sock_set_timestamps(sk, valbool, true, true); break; - case SO_TIMESTAMPING_NEW: sock_set_flag(sk, SOCK_TSTAMP_NEW); /* fall through */ @@ -1152,27 +1233,35 @@ set_rcvbuf: break; case SO_TXTIME: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { - ret = -EPERM; - } else if (optlen != sizeof(struct sock_txtime)) { + if (optlen != sizeof(struct sock_txtime)) { ret = -EINVAL; + break; } else if (copy_from_user(&sk_txtime, optval, sizeof(struct sock_txtime))) { ret = -EFAULT; + break; } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { ret = -EINVAL; - } else { - sock_valbool_flag(sk, SOCK_TXTIME, true); - sk->sk_clockid = sk_txtime.clockid; - sk->sk_txtime_deadline_mode = - !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); - sk->sk_txtime_report_errors = - !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); + break; + } + /* CLOCK_MONOTONIC is only used by sch_fq, and this packet + * scheduler has enough safe guards. + */ + if (sk_txtime.clockid != CLOCK_MONOTONIC && + !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + break; } + sock_valbool_flag(sk, SOCK_TXTIME, true); + sk->sk_clockid = sk_txtime.clockid; + sk->sk_txtime_deadline_mode = + !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); + sk->sk_txtime_report_errors = + !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); break; case SO_BINDTOIFINDEX: - ret = sock_setbindtodevice_locked(sk, val); + ret = sock_bindtoindex_locked(sk, val); break; default: @@ -3625,3 +3714,11 @@ bool sk_busy_loop_end(void *p, unsigned long start_time) } EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ + +int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) +{ + if (!sk->sk_prot->bind_add) + return -EOPNOTSUPP; + return sk->sk_prot->bind_add(sk, addr, addr_len); +} +EXPORT_SYMBOL(sock_bind_add); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index b08dfae10f88..00a26cf2cfe9 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -343,7 +343,14 @@ static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) static void *sock_map_lookup(struct bpf_map *map, void *key) { - return __sock_map_lookup_elem(map, *(u32 *)key); + struct sock *sk; + + sk = __sock_map_lookup_elem(map, *(u32 *)key); + if (!sk || !sk_fullsock(sk)) + return NULL; + if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) + return NULL; + return sk; } static void *sock_map_lookup_sys(struct bpf_map *map, void *key) @@ -1051,7 +1058,14 @@ static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) static void *sock_hash_lookup(struct bpf_map *map, void *key) { - return __sock_hash_lookup_elem(map, key); + struct sock *sk; + + sk = __sock_hash_lookup_elem(map, key); + if (!sk || !sk_fullsock(sk)) + return NULL; + if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) + return NULL; + return sk; } static void sock_hash_release_progs(struct bpf_map *map) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 9f9e00ba3ad7..b109cc8a6dd8 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -23,6 +23,7 @@ #include <net/pkt_sched.h> static int two __maybe_unused = 2; +static int three = 3; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; @@ -39,13 +40,14 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); * IPv6: reset all settings to default * 1 - Both inherit all current settings from init_net * 2 - Both reset all settings to default + * 3 - Both inherit all settings from current netns */ int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; int ret, i; @@ -115,8 +117,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, static DEFINE_MUTEX(flow_limit_update_mutex); static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct sd_flow_limit *cur; struct softnet_data *sd; @@ -180,10 +181,7 @@ write_unlock: } if (len < *lenp) kbuf[len++] = '\n'; - if (copy_to_user(buffer, kbuf, len)) { - ret = -EFAULT; - goto done; - } + memcpy(buffer, kbuf, len); *lenp = len; *ppos += len; } @@ -194,8 +192,7 @@ done: } static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old, *ptr; int ret; @@ -217,7 +214,7 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_NET_SCHED static int set_default_qdisc(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char id[IFNAMSIZ]; struct ctl_table tbl = { @@ -236,7 +233,7 @@ static int set_default_qdisc(struct ctl_table *table, int write, #endif static int proc_do_dev_weight(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -251,7 +248,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write, } static int proc_do_rss_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; char buf[NETDEV_RSS_KEY_LEN * 3]; @@ -264,7 +261,7 @@ static int proc_do_rss_key(struct ctl_table *table, int write, #ifdef CONFIG_BPF_JIT static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret, jit_enable = *(int *)table->data; @@ -291,8 +288,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, # ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -303,8 +299,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, static int proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -560,7 +555,7 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = &three, }, { .procname = "high_order_alloc_disable", diff --git a/net/core/xdp.c b/net/core/xdp.c index 4c7ea85486af..90f44f382115 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -11,11 +11,13 @@ #include <linux/slab.h> #include <linux/idr.h> #include <linux/rhashtable.h> +#include <linux/bug.h> #include <net/page_pool.h> #include <net/xdp.h> #include <net/xdp_priv.h> /* struct xdp_mem_allocator */ #include <trace/events/xdp.h> +#include <net/xdp_sock_drv.h> #define REG_STATE_NEW 0x0 #define REG_STATE_REGISTERED 0x1 @@ -108,27 +110,6 @@ static void mem_allocator_disconnect(void *allocator) mutex_unlock(&mem_id_lock); } -static void mem_id_disconnect(int id) -{ - struct xdp_mem_allocator *xa; - - mutex_lock(&mem_id_lock); - - xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (!xa) { - mutex_unlock(&mem_id_lock); - WARN(1, "Request remove non-existing id(%d), driver bug?", id); - return; - } - - trace_mem_disconnect(xa); - - if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) - call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); - - mutex_unlock(&mem_id_lock); -} - void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; @@ -142,9 +123,6 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) if (id == 0) return; - if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY) - return mem_id_disconnect(id); - if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) { rcu_read_lock(); xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); @@ -300,7 +278,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, xdp_rxq->mem.type = type; if (!allocator) { - if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY) + if (type == MEM_TYPE_PAGE_POOL) return -EINVAL; /* Setup time check page_pool req */ return 0; } @@ -357,10 +335,11 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * scenarios (e.g. queue full), it is possible to return the xdp_frame * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling - * of xdp_frames/pages in those cases. + * of xdp_frames/pages in those cases. This path is never used by the + * MEM_TYPE_XSK_BUFF_POOL memory type, so it's explicitly not part of + * the switch-statement. */ -static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - unsigned long handle) +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) { struct xdp_mem_allocator *xa; struct page *page; @@ -382,36 +361,29 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); break; - case MEM_TYPE_ZERO_COPY: - /* NB! Only valid from an xdp_buff! */ - rcu_read_lock(); - /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ - xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - xa->zc_alloc->free(xa->zc_alloc, handle); - rcu_read_unlock(); default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ + WARN(1, "Incorrect XDP memory type (%d) usage", mem->type); break; } } void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false, 0); + __xdp_return(xdpf->data, &xdpf->mem, false); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true, 0); + __xdp_return(xdpf->data, &xdpf->mem, true); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle); + __xdp_return(xdp->data, &xdp->rxq->mem, true); } -EXPORT_SYMBOL_GPL(xdp_return_buff); /* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ void __xdp_release_frame(void *data, struct xdp_mem_info *mem) @@ -492,7 +464,14 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) xdpf->metasize = metasize; xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; - xdp_return_buff(xdp); + xsk_buff_free(xdp); return xdpf; } EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); + +/* Used by XDP_WARN macro, to avoid inlining WARN() in fast-path */ +void xdp_warn(const char *msg, const char *func, const int line) +{ + WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg); +}; +EXPORT_SYMBOL_GPL(xdp_warn); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 9c3b27c257bb..7dce4f6c7025 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -108,11 +108,6 @@ extern int sysctl_dccp_sync_ratelimit; #define ADD48(a, b) (((a) + (b)) & UINT48_MAX) #define SUB48(a, b) ADD48((a), COMPLEMENT48(b)) -static inline void dccp_set_seqno(u64 *seqno, u64 value) -{ - *seqno = value & UINT48_MAX; -} - static inline void dccp_inc_seqno(u64 *seqno) { *seqno = ADD48(*seqno, 1); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 1e5e08cc0bfc..650187d68851 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1082,6 +1082,7 @@ static const struct proto_ops inet6_dccp_ops = { .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig index 0935453ccfd5..8f98fb2f2ec9 100644 --- a/net/decnet/Kconfig +++ b/net/decnet/Kconfig @@ -15,7 +15,7 @@ config DECNET <http://linux-decnet.sourceforge.net/>. More detailed documentation is available in - <file:Documentation/networking/decnet.txt>. + <file:Documentation/networking/decnet.rst>. Be sure to say Y to "/proc file system support" and "Sysctl support" below when using DECnet, since you will need sysctl support to aid @@ -40,4 +40,4 @@ config DECNET_ROUTER filtering" option will be required for the forthcoming routing daemon to work. - See <file:Documentation/networking/decnet.txt> for more information. + See <file:Documentation/networking/decnet.rst> for more information. diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index cca7ae712995..65abcf1b3210 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -160,8 +160,8 @@ static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MU static int min_priority[1]; static int max_priority[] = { 127 }; /* From DECnet spec */ -static int dn_forwarding_proc(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +static int dn_forwarding_proc(struct ctl_table *, int, void *, size_t *, + loff_t *); static struct dn_dev_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table dn_dev_vars[5]; @@ -245,8 +245,7 @@ static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) } static int dn_forwarding_proc(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { #ifdef CONFIG_DECNET_ROUTER struct net_device *dev = table->extra1; diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index 55bf64a22b59..deae519bdeec 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -134,8 +134,7 @@ static int parse_addr(__le16 *addr, char *str) } static int dn_node_address_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char addr[DN_ASCBUF_LEN]; size_t len; @@ -148,10 +147,7 @@ static int dn_node_address_handler(struct ctl_table *table, int write, if (write) { len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1); - - if (copy_from_user(addr, buffer, len)) - return -EFAULT; - + memcpy(addr, buffer, len); addr[len] = 0; strip_it(addr); @@ -173,11 +169,9 @@ static int dn_node_address_handler(struct ctl_table *table, int write, len = strlen(addr); addr[len++] = '\n'; - if (len > *lenp) len = *lenp; - - if (copy_to_user(buffer, addr, len)) - return -EFAULT; - + if (len > *lenp) + len = *lenp; + memcpy(buffer, addr, len); *lenp = len; *ppos += len; @@ -185,8 +179,7 @@ static int dn_node_address_handler(struct ctl_table *table, int write, } static int dn_def_dev_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { size_t len; struct net_device *dev; @@ -201,9 +194,7 @@ static int dn_def_dev_handler(struct ctl_table *table, int write, if (*lenp > 16) return -E2BIG; - if (copy_from_user(devname, buffer, *lenp)) - return -EFAULT; - + memcpy(devname, buffer, *lenp); devname[*lenp] = 0; strip_it(devname); @@ -238,9 +229,7 @@ static int dn_def_dev_handler(struct ctl_table *table, int write, if (len > *lenp) len = *lenp; - if (copy_to_user(buffer, devname, len)) - return -EFAULT; - + memcpy(buffer, devname, len); *lenp = len; *ppos += len; diff --git a/net/devres.c b/net/devres.c new file mode 100644 index 000000000000..57a6a88d11f6 --- /dev/null +++ b/net/devres.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * This file contains all networking devres helpers. + */ + +#include <linux/device.h> +#include <linux/etherdevice.h> +#include <linux/netdevice.h> + +struct net_device_devres { + struct net_device *ndev; +}; + +static void devm_free_netdev(struct device *dev, void *this) +{ + struct net_device_devres *res = this; + + free_netdev(res->ndev); +} + +struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, + unsigned int txqs, unsigned int rxqs) +{ + struct net_device_devres *dr; + + dr = devres_alloc(devm_free_netdev, sizeof(*dr), GFP_KERNEL); + if (!dr) + return NULL; + + dr->ndev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs); + if (!dr->ndev) { + devres_free(dr); + return NULL; + } + + devres_add(dev, dr); + + return dr->ndev; +} +EXPORT_SYMBOL(devm_alloc_etherdev_mqs); + +static void devm_netdev_release(struct device *dev, void *this) +{ + struct net_device_devres *res = this; + + unregister_netdev(res->ndev); +} + +static int netdev_devres_match(struct device *dev, void *this, void *match_data) +{ + struct net_device_devres *res = this; + struct net_device *ndev = match_data; + + return ndev == res->ndev; +} + +/** + * devm_register_netdev - resource managed variant of register_netdev() + * @dev: managing device for this netdev - usually the parent device + * @ndev: device to register + * + * This is a devres variant of register_netdev() for which the unregister + * function will be call automatically when the managing device is + * detached. Note: the net_device used must also be resource managed by + * the same struct device. + */ +int devm_register_netdev(struct device *dev, struct net_device *ndev) +{ + struct net_device_devres *dr; + int ret; + + /* struct net_device must itself be managed. For now a managed netdev + * can only be allocated by devm_alloc_etherdev_mqs() so the check is + * straightforward. + */ + if (WARN_ON(!devres_find(dev, devm_free_netdev, + netdev_devres_match, ndev))) + return -EINVAL; + + dr = devres_alloc(devm_netdev_release, sizeof(*dr), GFP_KERNEL); + if (!dr) + return -ENOMEM; + + ret = register_netdev(ndev); + if (ret) { + devres_free(dr); + return ret; + } + + dr->ndev = ndev; + devres_add(ndev->dev.parent, dr); + + return 0; +} +EXPORT_SYMBOL(devm_register_netdev); diff --git a/net/dns_resolver/Kconfig b/net/dns_resolver/Kconfig index 0a1c2238b4bd..255df9b6e9e8 100644 --- a/net/dns_resolver/Kconfig +++ b/net/dns_resolver/Kconfig @@ -19,7 +19,7 @@ config DNS_RESOLVER SMB2 later. DNS Resolver is supported by the userspace upcall helper "/sbin/dns.resolver" via /etc/request-key.conf. - See <file:Documentation/networking/dns_resolver.txt> for further + See <file:Documentation/networking/dns_resolver.rst> for further information. To compile this as a module, choose M here: the module will be called diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index ad53eb31d40f..3aced951d5ab 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -1,6 +1,6 @@ /* Key type used to cache DNS lookups made by the kernel * - * See Documentation/networking/dns_resolver.txt + * See Documentation/networking/dns_resolver.rst * * Copyright (c) 2007 Igor Mammedov * Author(s): Igor Mammedov (niallain@gmail.com) diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index cab4e0df924f..82b084cc1cc6 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -1,7 +1,7 @@ /* Upcall routine, designed to work as a key type and working through * /sbin/request-key to contact userspace when handling DNS queries. * - * See Documentation/networking/dns_resolver.txt + * See Documentation/networking/dns_resolver.rst * * Copyright (c) 2007 Igor Mammedov * Author(s): Igor Mammedov (niallain@gmail.com) diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 92663dcb3aa2..739613070d07 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -9,6 +9,7 @@ menuconfig NET_DSA tristate "Distributed Switch Architecture" depends on HAVE_NET_DSA depends on BRIDGE || BRIDGE=n + select GRO_CELLS select NET_SWITCHDEV select PHYLINK select NET_DEVLINK diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index ee2610c4d46a..1ce9ba8cf545 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -234,7 +234,7 @@ static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, if (dsa_skb_defer_rx_timestamp(p, skb)) return 0; - netif_receive_skb(skb); + gro_cells_receive(&p->gcells, skb); return 0; } @@ -412,6 +412,15 @@ void dsa_devlink_resource_occ_get_unregister(struct dsa_switch *ds, } EXPORT_SYMBOL_GPL(dsa_devlink_resource_occ_get_unregister); +struct dsa_port *dsa_port_from_netdev(struct net_device *netdev) +{ + if (!netdev || !dsa_slave_dev_check(netdev)) + return ERR_PTR(-ENODEV); + + return dsa_slave_to_port(netdev); +} +EXPORT_SYMBOL_GPL(dsa_port_from_netdev); + static int __init dsa_init_module(void) { int rc; diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index d90665b465b8..076908fdd29b 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -24,6 +24,27 @@ LIST_HEAD(dsa_tree_list); static const struct devlink_ops dsa_devlink_ops = { }; +struct dsa_switch *dsa_switch_find(int tree_index, int sw_index) +{ + struct dsa_switch_tree *dst; + struct dsa_port *dp; + + list_for_each_entry(dst, &dsa_tree_list, list) { + if (dst->index != tree_index) + continue; + + list_for_each_entry(dp, &dst->ports, list) { + if (dp->ds->index != sw_index) + continue; + + return dp->ds; + } + } + + return NULL; +} +EXPORT_SYMBOL_GPL(dsa_switch_find); + static struct dsa_switch_tree *dsa_tree_find(int index) { struct dsa_switch_tree *dst; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 904cc7c9b882..adecf73bd608 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -11,6 +11,7 @@ #include <linux/netdevice.h> #include <linux/netpoll.h> #include <net/dsa.h> +#include <net/gro_cells.h> enum { DSA_NOTIFIER_AGEING_TIME, @@ -34,6 +35,7 @@ struct dsa_notifier_ageing_time_info { /* DSA_NOTIFIER_BRIDGE_* */ struct dsa_notifier_bridge_info { struct net_device *br; + int tree_index; int sw_index; int port; }; @@ -77,6 +79,8 @@ struct dsa_slave_priv { struct pcpu_sw_netstats *stats64; + struct gro_cells gcells; + /* DSA port data, such as switch, port index, etc. */ struct dsa_port *dp; @@ -134,6 +138,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br); void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br); int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, struct switchdev_trans *trans); +bool dsa_port_skip_vlan_configuration(struct dsa_port *dp); int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, struct switchdev_trans *trans); int dsa_port_mtu_change(struct dsa_port *dp, int new_mtu, diff --git a/net/dsa/port.c b/net/dsa/port.c index a58fdd362574..e23ece229c7e 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -13,6 +13,23 @@ #include "dsa_priv.h" +static int dsa_broadcast(unsigned long e, void *v) +{ + struct dsa_switch_tree *dst; + int err = 0; + + list_for_each_entry(dst, &dsa_tree_list, list) { + struct raw_notifier_head *nh = &dst->nh; + + err = raw_notifier_call_chain(nh, e, v); + err = notifier_to_errno(err); + if (err) + break; + } + + return err; +} + static int dsa_port_notify(const struct dsa_port *dp, unsigned long e, void *v) { struct raw_notifier_head *nh = &dp->ds->dst->nh; @@ -120,6 +137,7 @@ void dsa_port_disable(struct dsa_port *dp) int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) { struct dsa_notifier_bridge_info info = { + .tree_index = dp->ds->dst->index, .sw_index = dp->ds->index, .port = dp->index, .br = br, @@ -136,7 +154,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) */ dp->bridge_dev = br; - err = dsa_port_notify(dp, DSA_NOTIFIER_BRIDGE_JOIN, &info); + err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_JOIN, &info); /* The bridging is rolled back on error */ if (err) { @@ -150,6 +168,7 @@ int dsa_port_bridge_join(struct dsa_port *dp, struct net_device *br) void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) { struct dsa_notifier_bridge_info info = { + .tree_index = dp->ds->dst->index, .sw_index = dp->ds->index, .port = dp->index, .br = br, @@ -161,7 +180,7 @@ void dsa_port_bridge_leave(struct dsa_port *dp, struct net_device *br) */ dp->bridge_dev = NULL; - err = dsa_port_notify(dp, DSA_NOTIFIER_BRIDGE_LEAVE, &info); + err = dsa_broadcast(DSA_NOTIFIER_BRIDGE_LEAVE, &info); if (err) pr_err("DSA: failed to notify DSA_NOTIFIER_BRIDGE_LEAVE\n"); @@ -238,6 +257,20 @@ int dsa_port_vlan_filtering(struct dsa_port *dp, bool vlan_filtering, return 0; } +/* This enforces legacy behavior for switch drivers which assume they can't + * receive VLAN configuration when enslaved to a bridge with vlan_filtering=0 + */ +bool dsa_port_skip_vlan_configuration(struct dsa_port *dp) +{ + struct dsa_switch *ds = dp->ds; + + if (!dp->bridge_dev) + return false; + + return (!ds->configure_vlan_while_not_filtering && + !br_vlan_enabled(dp->bridge_dev)); +} + int dsa_port_ageing_time(struct dsa_port *dp, clock_t ageing_clock, struct switchdev_trans *trans) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index d3bcb9afa795..4c7f086a047b 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -314,7 +314,7 @@ static int dsa_slave_vlan_add(struct net_device *dev, if (obj->orig_dev != dev) return -EOPNOTSUPP; - if (dp->bridge_dev && !br_vlan_enabled(dp->bridge_dev)) + if (dsa_port_skip_vlan_configuration(dp)) return 0; vlan = *SWITCHDEV_OBJ_PORT_VLAN(obj); @@ -381,7 +381,7 @@ static int dsa_slave_vlan_del(struct net_device *dev, if (obj->orig_dev != dev) return -EOPNOTSUPP; - if (dp->bridge_dev && !br_vlan_enabled(dp->bridge_dev)) + if (dsa_port_skip_vlan_configuration(dp)) return 0; /* Do not deprogram the CPU port as it may be shared with other user @@ -445,12 +445,11 @@ static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev, #ifdef CONFIG_NET_POLL_CONTROLLER struct dsa_slave_priv *p = netdev_priv(dev); - if (p->netpoll) - netpoll_send_skb(p->netpoll, skb); + return netpoll_send_skb(p->netpoll, skb); #else BUG(); -#endif return NETDEV_TX_OK; +#endif } static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p, @@ -911,13 +910,13 @@ dsa_slave_add_cls_matchall_police(struct net_device *dev, if (!ds->ops->port_policer_add) { NL_SET_ERR_MSG_MOD(extack, - "Policing offload not implemented\n"); + "Policing offload not implemented"); return -EOPNOTSUPP; } if (!ingress) { NL_SET_ERR_MSG_MOD(extack, - "Only supported on ingress qdisc\n"); + "Only supported on ingress qdisc"); return -EOPNOTSUPP; } @@ -928,7 +927,7 @@ dsa_slave_add_cls_matchall_police(struct net_device *dev, list_for_each_entry(mall_tc_entry, &p->mall_tc_list, list) { if (mall_tc_entry->type == DSA_PORT_MALL_POLICER) { NL_SET_ERR_MSG_MOD(extack, - "Only one port policer allowed\n"); + "Only one port policer allowed"); return -EEXIST; } } @@ -1241,7 +1240,7 @@ static int dsa_slave_vlan_rx_add_vid(struct net_device *dev, __be16 proto, * need to emulate the switchdev prepare + commit phase. */ if (dp->bridge_dev) { - if (!br_vlan_enabled(dp->bridge_dev)) + if (dsa_port_skip_vlan_configuration(dp)) return 0; /* br_vlan_get_info() returns -EINVAL or -ENOENT if the @@ -1275,7 +1274,7 @@ static int dsa_slave_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, * need to emulate the switchdev prepare + commit phase. */ if (dp->bridge_dev) { - if (!br_vlan_enabled(dp->bridge_dev)) + if (dsa_port_skip_vlan_configuration(dp)) return 0; /* br_vlan_get_info() returns -EINVAL or -ENOENT if the @@ -1588,10 +1587,10 @@ void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up) } EXPORT_SYMBOL_GPL(dsa_port_phylink_mac_change); -static void dsa_slave_phylink_fixed_state(struct net_device *dev, +static void dsa_slave_phylink_fixed_state(struct phylink_config *config, struct phylink_link_state *state) { - struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); struct dsa_switch *ds = dp->ds; /* No need to check that this operation is valid, the callback would @@ -1631,6 +1630,15 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) dp->pl_config.dev = &slave_dev->dev; dp->pl_config.type = PHYLINK_NETDEV; + /* The get_fixed_state callback takes precedence over polling the + * link GPIO in PHYLINK (see phylink_get_fixed_state). Only set + * this if the switch provides such a callback. + */ + if (ds->ops->phylink_fixed_state) { + dp->pl_config.get_fixed_state = dsa_slave_phylink_fixed_state; + dp->pl_config.poll_fixed_state = true; + } + dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(port_dn), mode, &dsa_port_phylink_mac_ops); if (IS_ERR(dp->pl)) { @@ -1639,13 +1647,6 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) return PTR_ERR(dp->pl); } - /* Register only if the switch provides such a callback, since this - * callback takes precedence over polling the link GPIO in PHYLINK - * (see phylink_get_fixed_state). - */ - if (ds->ops->phylink_fixed_state) - phylink_fixed_state_cb(dp->pl, dsa_slave_phylink_fixed_state); - if (ds->ops->get_phy_flags) phy_flags = ds->ops->get_phy_flags(ds, dp->index); @@ -1667,6 +1668,15 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) return ret; } +static struct lock_class_key dsa_slave_netdev_xmit_lock_key; +static void dsa_slave_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, + &dsa_slave_netdev_xmit_lock_key); +} + int dsa_slave_suspend(struct net_device *slave_dev) { struct dsa_port *dp = dsa_slave_to_port(slave_dev); @@ -1751,6 +1761,9 @@ int dsa_slave_create(struct dsa_port *port) slave_dev->max_mtu = ETH_MAX_MTU; SET_NETDEV_DEVTYPE(slave_dev, &dsa_type); + netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one, + NULL); + SET_NETDEV_DEV(slave_dev, port->ds->dev); slave_dev->dev.of_node = port->dn; slave_dev->vlan_features = master->vlan_features; @@ -1761,6 +1774,11 @@ int dsa_slave_create(struct dsa_port *port) free_netdev(slave_dev); return -ENOMEM; } + + ret = gro_cells_init(&p->gcells, slave_dev); + if (ret) + goto out_free; + p->dp = port; INIT_LIST_HEAD(&p->mall_tc_list); p->xmit = cpu_dp->tag_ops->xmit; @@ -1778,7 +1796,7 @@ int dsa_slave_create(struct dsa_port *port) ret = dsa_slave_phy_setup(slave_dev); if (ret) { netdev_err(master, "error %d setting up slave phy\n", ret); - goto out_free; + goto out_gcells; } dsa_slave_notify(slave_dev, DSA_PORT_REGISTER); @@ -1797,6 +1815,8 @@ out_phy: phylink_disconnect_phy(p->dp->pl); rtnl_unlock(); phylink_destroy(p->dp->pl); +out_gcells: + gro_cells_destroy(&p->gcells); out_free: free_percpu(p->stats64); free_netdev(slave_dev); @@ -1817,6 +1837,7 @@ void dsa_slave_destroy(struct net_device *slave_dev) dsa_slave_notify(slave_dev, DSA_PORT_UNREGISTER); unregister_netdev(slave_dev); phylink_destroy(dp->pl); + gro_cells_destroy(&p->gcells); free_percpu(p->stats64); free_netdev(slave_dev); } diff --git a/net/dsa/switch.c b/net/dsa/switch.c index f3c32ff552b3..86c8dc5c32a0 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -89,11 +89,16 @@ static int dsa_switch_mtu(struct dsa_switch *ds, static int dsa_switch_bridge_join(struct dsa_switch *ds, struct dsa_notifier_bridge_info *info) { - if (ds->index == info->sw_index && ds->ops->port_bridge_join) + struct dsa_switch_tree *dst = ds->dst; + + if (dst->index == info->tree_index && ds->index == info->sw_index && + ds->ops->port_bridge_join) return ds->ops->port_bridge_join(ds, info->port, info->br); - if (ds->index != info->sw_index && ds->ops->crosschip_bridge_join) - return ds->ops->crosschip_bridge_join(ds, info->sw_index, + if ((dst->index != info->tree_index || ds->index != info->sw_index) && + ds->ops->crosschip_bridge_join) + return ds->ops->crosschip_bridge_join(ds, info->tree_index, + info->sw_index, info->port, info->br); return 0; @@ -103,13 +108,17 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, struct dsa_notifier_bridge_info *info) { bool unset_vlan_filtering = br_vlan_enabled(info->br); + struct dsa_switch_tree *dst = ds->dst; int err, i; - if (ds->index == info->sw_index && ds->ops->port_bridge_leave) + if (dst->index == info->tree_index && ds->index == info->sw_index && + ds->ops->port_bridge_join) ds->ops->port_bridge_leave(ds, info->port, info->br); - if (ds->index != info->sw_index && ds->ops->crosschip_bridge_leave) - ds->ops->crosschip_bridge_leave(ds, info->sw_index, info->port, + if ((dst->index != info->tree_index || ds->index != info->sw_index) && + ds->ops->crosschip_bridge_join) + ds->ops->crosschip_bridge_leave(ds, info->tree_index, + info->sw_index, info->port, info->br); /* If the bridge was vlan_filtering, the bridge core doesn't trigger an diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index b97ad93d1c1a..780b2a15ac9b 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -8,6 +8,7 @@ */ #include <linux/if_bridge.h> #include <linux/if_vlan.h> +#include <linux/dsa/8021q.h> #include "dsa_priv.h" @@ -16,7 +17,7 @@ * * | 11 | 10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | * +-----------+-----+-----------------+-----------+-----------------------+ - * | DIR | RSV | SWITCH_ID | RSV | PORT | + * | DIR | SVL | SWITCH_ID | SUBVLAN | PORT | * +-----------+-----+-----------------+-----------+-----------------------+ * * DIR - VID[11:10]: @@ -26,17 +27,24 @@ * These values make the special VIDs of 0, 1 and 4095 to be left * unused by this coding scheme. * - * RSV - VID[9]: - * To be used for further expansion of SWITCH_ID or for other purposes. - * Must be transmitted as zero and ignored on receive. + * SVL/SUBVLAN - { VID[9], VID[5:4] }: + * Sub-VLAN encoding. Valid only when DIR indicates an RX VLAN. + * * 0 (0b000): Field does not encode a sub-VLAN, either because + * received traffic is untagged, PVID-tagged or because a second + * VLAN tag is present after this tag and not inside of it. + * * 1 (0b001): Received traffic is tagged with a VID value private + * to the host. This field encodes the index in the host's lookup + * table through which the value of the ingress VLAN ID can be + * recovered. + * * 2 (0b010): Field encodes a sub-VLAN. + * ... + * * 7 (0b111): Field encodes a sub-VLAN. + * When DIR indicates a TX VLAN, SUBVLAN must be transmitted as zero + * (by the host) and ignored on receive (by the switch). * * SWITCH_ID - VID[8:6]: * Index of switch within DSA tree. Must be between 0 and 7. * - * RSV - VID[5:4]: - * To be used for further expansion of PORT or for other purposes. - * Must be transmitted as zero and ignored on receive. - * * PORT - VID[3:0]: * Index of switch port. Must be between 0 and 15. */ @@ -53,6 +61,18 @@ #define DSA_8021Q_SWITCH_ID(x) (((x) << DSA_8021Q_SWITCH_ID_SHIFT) & \ DSA_8021Q_SWITCH_ID_MASK) +#define DSA_8021Q_SUBVLAN_HI_SHIFT 9 +#define DSA_8021Q_SUBVLAN_HI_MASK GENMASK(9, 9) +#define DSA_8021Q_SUBVLAN_LO_SHIFT 4 +#define DSA_8021Q_SUBVLAN_LO_MASK GENMASK(4, 3) +#define DSA_8021Q_SUBVLAN_HI(x) (((x) & GENMASK(2, 2)) >> 2) +#define DSA_8021Q_SUBVLAN_LO(x) ((x) & GENMASK(1, 0)) +#define DSA_8021Q_SUBVLAN(x) \ + (((DSA_8021Q_SUBVLAN_LO(x) << DSA_8021Q_SUBVLAN_LO_SHIFT) & \ + DSA_8021Q_SUBVLAN_LO_MASK) | \ + ((DSA_8021Q_SUBVLAN_HI(x) << DSA_8021Q_SUBVLAN_HI_SHIFT) & \ + DSA_8021Q_SUBVLAN_HI_MASK)) + #define DSA_8021Q_PORT_SHIFT 0 #define DSA_8021Q_PORT_MASK GENMASK(3, 0) #define DSA_8021Q_PORT(x) (((x) << DSA_8021Q_PORT_SHIFT) & \ @@ -78,6 +98,13 @@ u16 dsa_8021q_rx_vid(struct dsa_switch *ds, int port) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid); +u16 dsa_8021q_rx_vid_subvlan(struct dsa_switch *ds, int port, u16 subvlan) +{ + return DSA_8021Q_DIR_RX | DSA_8021Q_SWITCH_ID(ds->index) | + DSA_8021Q_PORT(port) | DSA_8021Q_SUBVLAN(subvlan); +} +EXPORT_SYMBOL_GPL(dsa_8021q_rx_vid_subvlan); + /* Returns the decoded switch ID from the RX VID. */ int dsa_8021q_rx_switch_id(u16 vid) { @@ -92,33 +119,26 @@ int dsa_8021q_rx_source_port(u16 vid) } EXPORT_SYMBOL_GPL(dsa_8021q_rx_source_port); -static int dsa_8021q_restore_pvid(struct dsa_switch *ds, int port) +/* Returns the decoded subvlan from the RX VID. */ +u16 dsa_8021q_rx_subvlan(u16 vid) { - struct bridge_vlan_info vinfo; - struct net_device *slave; - u16 pvid; - int err; - - if (!dsa_is_user_port(ds, port)) - return 0; - - slave = dsa_to_port(ds, port)->slave; + u16 svl_hi, svl_lo; - err = br_vlan_get_pvid(slave, &pvid); - if (!pvid || err < 0) - /* There is no pvid on the bridge for this port, which is - * perfectly valid. Nothing to restore, bye-bye! - */ - return 0; + svl_hi = (vid & DSA_8021Q_SUBVLAN_HI_MASK) >> + DSA_8021Q_SUBVLAN_HI_SHIFT; + svl_lo = (vid & DSA_8021Q_SUBVLAN_LO_MASK) >> + DSA_8021Q_SUBVLAN_LO_SHIFT; - err = br_vlan_get_info(slave, pvid, &vinfo); - if (err < 0) { - dev_err(ds->dev, "Couldn't determine PVID attributes\n"); - return err; - } + return (svl_hi << 2) | svl_lo; +} +EXPORT_SYMBOL_GPL(dsa_8021q_rx_subvlan); - return dsa_port_vid_add(dsa_to_port(ds, port), pvid, vinfo.flags); +bool vid_is_dsa_8021q(u16 vid) +{ + return ((vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_RX || + (vid & DSA_8021Q_DIR_MASK) == DSA_8021Q_DIR_TX); } +EXPORT_SYMBOL_GPL(vid_is_dsa_8021q); /* If @enabled is true, installs @vid with @flags into the switch port's HW * filter. @@ -130,39 +150,11 @@ static int dsa_8021q_vid_apply(struct dsa_switch *ds, int port, u16 vid, u16 flags, bool enabled) { struct dsa_port *dp = dsa_to_port(ds, port); - struct bridge_vlan_info vinfo; - int err; if (enabled) return dsa_port_vid_add(dp, vid, flags); - err = dsa_port_vid_del(dp, vid); - if (err < 0) - return err; - - /* Nothing to restore from the bridge for a non-user port. - * The CPU port VLANs are restored implicitly with the user ports, - * similar to how the bridge does in dsa_slave_vlan_add and - * dsa_slave_vlan_del. - */ - if (!dsa_is_user_port(ds, port)) - return 0; - - err = br_vlan_get_info(dp->slave, vid, &vinfo); - /* Couldn't determine bridge attributes for this vid, - * it means the bridge had not configured it. - */ - if (err < 0) - return 0; - - /* Restore the VID from the bridge */ - err = dsa_port_vid_add(dp, vid, vinfo.flags); - if (err < 0) - return err; - - vinfo.flags &= ~BRIDGE_VLAN_INFO_PVID; - - return dsa_port_vid_add(dp->cpu_dp, vid, vinfo.flags); + return dsa_port_vid_del(dp, vid); } /* RX VLAN tagging (left) and TX VLAN tagging (right) setup shown for a single @@ -281,13 +273,149 @@ int dsa_port_setup_8021q_tagging(struct dsa_switch *ds, int port, bool enabled) return err; } - if (!enabled) - err = dsa_8021q_restore_pvid(ds, port); - return err; } EXPORT_SYMBOL_GPL(dsa_port_setup_8021q_tagging); +static int dsa_8021q_crosschip_link_apply(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, bool enabled) +{ + u16 rx_vid = dsa_8021q_rx_vid(ds, port); + + /* @rx_vid of local @ds port @port goes to @other_port of + * @other_ds + */ + return dsa_8021q_vid_apply(other_ds, other_port, rx_vid, + BRIDGE_VLAN_INFO_UNTAGGED, enabled); +} + +static int dsa_8021q_crosschip_link_add(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, + struct list_head *crosschip_links) +{ + struct dsa_8021q_crosschip_link *c; + + list_for_each_entry(c, crosschip_links, list) { + if (c->port == port && c->other_ds == other_ds && + c->other_port == other_port) { + refcount_inc(&c->refcount); + return 0; + } + } + + dev_dbg(ds->dev, "adding crosschip link from port %d to %s port %d\n", + port, dev_name(other_ds->dev), other_port); + + c = kzalloc(sizeof(*c), GFP_KERNEL); + if (!c) + return -ENOMEM; + + c->port = port; + c->other_ds = other_ds; + c->other_port = other_port; + refcount_set(&c->refcount, 1); + + list_add(&c->list, crosschip_links); + + return 0; +} + +static void dsa_8021q_crosschip_link_del(struct dsa_switch *ds, + struct dsa_8021q_crosschip_link *c, + struct list_head *crosschip_links, + bool *keep) +{ + *keep = !refcount_dec_and_test(&c->refcount); + + if (*keep) + return; + + dev_dbg(ds->dev, + "deleting crosschip link from port %d to %s port %d\n", + c->port, dev_name(c->other_ds->dev), c->other_port); + + list_del(&c->list); + kfree(c); +} + +/* Make traffic from local port @port be received by remote port @other_port. + * This means that our @rx_vid needs to be installed on @other_ds's upstream + * and user ports. The user ports should be egress-untagged so that they can + * pop the dsa_8021q VLAN. But the @other_upstream can be either egress-tagged + * or untagged: it doesn't matter, since it should never egress a frame having + * our @rx_vid. + */ +int dsa_8021q_crosschip_bridge_join(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, + struct list_head *crosschip_links) +{ + /* @other_upstream is how @other_ds reaches us. If we are part + * of disjoint trees, then we are probably connected through + * our CPU ports. If we're part of the same tree though, we should + * probably use dsa_towards_port. + */ + int other_upstream = dsa_upstream_port(other_ds, other_port); + int rc; + + rc = dsa_8021q_crosschip_link_add(ds, port, other_ds, + other_port, crosschip_links); + if (rc) + return rc; + + rc = dsa_8021q_crosschip_link_apply(ds, port, other_ds, + other_port, true); + if (rc) + return rc; + + rc = dsa_8021q_crosschip_link_add(ds, port, other_ds, + other_upstream, + crosschip_links); + if (rc) + return rc; + + return dsa_8021q_crosschip_link_apply(ds, port, other_ds, + other_upstream, true); +} +EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_join); + +int dsa_8021q_crosschip_bridge_leave(struct dsa_switch *ds, int port, + struct dsa_switch *other_ds, + int other_port, + struct list_head *crosschip_links) +{ + int other_upstream = dsa_upstream_port(other_ds, other_port); + struct dsa_8021q_crosschip_link *c, *n; + + list_for_each_entry_safe(c, n, crosschip_links, list) { + if (c->port == port && c->other_ds == other_ds && + (c->other_port == other_port || + c->other_port == other_upstream)) { + struct dsa_switch *other_ds = c->other_ds; + int other_port = c->other_port; + bool keep; + int rc; + + dsa_8021q_crosschip_link_del(ds, c, crosschip_links, + &keep); + if (keep) + continue; + + rc = dsa_8021q_crosschip_link_apply(ds, port, + other_ds, + other_port, + false); + if (rc) + return rc; + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_8021q_crosschip_bridge_leave); + struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, u16 tpid, u16 tci) { diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index 59de1315100f..b0c98ee4e13b 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -228,7 +228,7 @@ static struct sk_buff *ocelot_rcv(struct sk_buff *skb, return skb; } -static struct dsa_device_ops ocelot_netdev_ops = { +static const struct dsa_device_ops ocelot_netdev_ops = { .name = "ocelot", .proto = DSA_TAG_PROTO_OCELOT, .xmit = ocelot_xmit, diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index d553bf36bd41..9b4a4d719291 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -69,12 +69,25 @@ static inline bool sja1105_is_meta_frame(const struct sk_buff *skb) return true; } +static bool sja1105_can_use_vlan_as_tags(const struct sk_buff *skb) +{ + struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); + + if (hdr->h_vlan_proto == htons(ETH_P_SJA1105)) + return true; + + if (hdr->h_vlan_proto != htons(ETH_P_8021Q)) + return false; + + return vid_is_dsa_8021q(ntohs(hdr->h_vlan_TCI) & VLAN_VID_MASK); +} + /* This is the first time the tagger sees the frame on RX. * Figure out if we can decode it. */ static bool sja1105_filter(const struct sk_buff *skb, struct net_device *dev) { - if (!dsa_port_is_vlan_filtering(dev->dsa_ptr)) + if (sja1105_can_use_vlan_as_tags(skb)) return true; if (sja1105_is_link_local(skb)) return true; @@ -96,6 +109,11 @@ static struct sk_buff *sja1105_defer_xmit(struct sja1105_port *sp, return NULL; } +static u16 sja1105_xmit_tpid(struct sja1105_port *sp) +{ + return sp->xmit_tpid; +} + static struct sk_buff *sja1105_xmit(struct sk_buff *skb, struct net_device *netdev) { @@ -111,15 +129,7 @@ static struct sk_buff *sja1105_xmit(struct sk_buff *skb, if (unlikely(sja1105_is_link_local(skb))) return sja1105_defer_xmit(dp->priv, skb); - /* If we are under a vlan_filtering bridge, IP termination on - * switch ports based on 802.1Q tags is simply too brittle to - * be passable. So just defer to the dsa_slave_notag_xmit - * implementation. - */ - if (dsa_port_is_vlan_filtering(dp)) - return skb; - - return dsa_8021q_xmit(skb, netdev, ETH_P_SJA1105, + return dsa_8021q_xmit(skb, netdev, sja1105_xmit_tpid(dp->priv), ((pcp << VLAN_PRIO_SHIFT) | tx_vid)); } @@ -244,6 +254,20 @@ static struct sk_buff return skb; } +static void sja1105_decode_subvlan(struct sk_buff *skb, u16 subvlan) +{ + struct dsa_port *dp = dsa_slave_to_port(skb->dev); + struct sja1105_port *sp = dp->priv; + u16 vid = sp->subvlan_map[subvlan]; + u16 vlan_tci; + + if (vid == VLAN_N_VID) + return; + + vlan_tci = (skb->priority << VLAN_PRIO_SHIFT) | vid; + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tci); +} + static struct sk_buff *sja1105_rcv(struct sk_buff *skb, struct net_device *netdev, struct packet_type *pt) @@ -253,12 +277,13 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, struct ethhdr *hdr; u16 tpid, vid, tci; bool is_link_local; + u16 subvlan = 0; bool is_tagged; bool is_meta; hdr = eth_hdr(skb); tpid = ntohs(hdr->h_proto); - is_tagged = (tpid == ETH_P_SJA1105); + is_tagged = (tpid == ETH_P_SJA1105 || tpid == ETH_P_8021Q); is_link_local = sja1105_is_link_local(skb); is_meta = sja1105_is_meta_frame(skb); @@ -276,6 +301,7 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, source_port = dsa_8021q_rx_source_port(vid); switch_id = dsa_8021q_rx_switch_id(vid); skb->priority = (tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT; + subvlan = dsa_8021q_rx_subvlan(vid); } else if (is_link_local) { /* Management traffic path. Switch embeds the switch ID and * port ID into bytes of the destination MAC, courtesy of @@ -300,11 +326,14 @@ static struct sk_buff *sja1105_rcv(struct sk_buff *skb, return NULL; } + if (subvlan) + sja1105_decode_subvlan(skb, subvlan); + return sja1105_rcv_meta_state_machine(skb, &meta, is_link_local, is_meta); } -static struct dsa_device_ops sja1105_netdev_ops = { +static const struct dsa_device_ops sja1105_netdev_ops = { .name = "sja1105", .proto = DSA_TAG_PROTO_SJA1105, .xmit = sja1105_xmit, diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index c8b903302ff2..dac65180c4ef 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -400,34 +400,6 @@ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, } EXPORT_SYMBOL(alloc_etherdev_mqs); -static void devm_free_netdev(struct device *dev, void *res) -{ - free_netdev(*(struct net_device **)res); -} - -struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, - unsigned int txqs, unsigned int rxqs) -{ - struct net_device **dr; - struct net_device *netdev; - - dr = devres_alloc(devm_free_netdev, sizeof(*dr), GFP_KERNEL); - if (!dr) - return NULL; - - netdev = alloc_etherdev_mqs(sizeof_priv, txqs, rxqs); - if (!netdev) { - devres_free(dr); - return NULL; - } - - *dr = netdev; - devres_add(dev, dr); - - return netdev; -} -EXPORT_SYMBOL(devm_alloc_etherdev_mqs); - ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) { return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr); diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 6c360c9c9370..0c2b94f20499 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -6,4 +6,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ - channels.o coalesce.o pause.o eee.o tsinfo.o + channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o diff --git a/net/ethtool/cabletest.c b/net/ethtool/cabletest.c new file mode 100644 index 000000000000..7b7a0456c15c --- /dev/null +++ b/net/ethtool/cabletest.c @@ -0,0 +1,431 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/phy.h> +#include <linux/ethtool_netlink.h> +#include "netlink.h" +#include "common.h" + +/* 802.3 standard allows 100 meters for BaseT cables. However longer + * cables might work, depending on the quality of the cables and the + * PHY. So allow testing for up to 150 meters. + */ +#define MAX_CABLE_LENGTH_CM (150 * 100) + +static const struct nla_policy +cable_test_act_policy[ETHTOOL_A_CABLE_TEST_MAX + 1] = { + [ETHTOOL_A_CABLE_TEST_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_CABLE_TEST_HEADER] = { .type = NLA_NESTED }, +}; + +static int ethnl_cable_test_started(struct phy_device *phydev, u8 cmd) +{ + struct sk_buff *skb; + int err = -ENOMEM; + void *ehdr; + + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + goto out; + + ehdr = ethnl_bcastmsg_put(skb, cmd); + if (!ehdr) { + err = -EMSGSIZE; + goto out; + } + + err = ethnl_fill_reply_header(skb, phydev->attached_dev, + ETHTOOL_A_CABLE_TEST_NTF_HEADER); + if (err) + goto out; + + err = nla_put_u8(skb, ETHTOOL_A_CABLE_TEST_NTF_STATUS, + ETHTOOL_A_CABLE_TEST_NTF_STATUS_STARTED); + if (err) + goto out; + + genlmsg_end(skb, ehdr); + + return ethnl_multicast(skb, phydev->attached_dev); + +out: + nlmsg_free(skb); + phydev_err(phydev, "%s: Error %pe\n", __func__, ERR_PTR(err)); + + return err; +} + +int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_CABLE_TEST_MAX + 1]; + struct ethnl_req_info req_info = {}; + struct net_device *dev; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_CABLE_TEST_MAX, + cable_test_act_policy, info->extack); + if (ret < 0) + return ret; + + ret = ethnl_parse_header_dev_get(&req_info, + tb[ETHTOOL_A_CABLE_TEST_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + dev = req_info.dev; + if (!dev->phydev) { + ret = -EOPNOTSUPP; + goto out_dev_put; + } + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + ret = phy_start_cable_test(dev->phydev, info->extack); + + ethnl_ops_complete(dev); + + if (!ret) + ethnl_cable_test_started(dev->phydev, + ETHTOOL_MSG_CABLE_TEST_NTF); + +out_rtnl: + rtnl_unlock(); +out_dev_put: + dev_put(dev); + return ret; +} + +int ethnl_cable_test_alloc(struct phy_device *phydev, u8 cmd) +{ + int err = -ENOMEM; + + /* One TDR sample occupies 20 bytes. For a 150 meter cable, + * with four pairs, around 12K is needed. + */ + phydev->skb = genlmsg_new(SZ_16K, GFP_KERNEL); + if (!phydev->skb) + goto out; + + phydev->ehdr = ethnl_bcastmsg_put(phydev->skb, cmd); + if (!phydev->ehdr) { + err = -EMSGSIZE; + goto out; + } + + err = ethnl_fill_reply_header(phydev->skb, phydev->attached_dev, + ETHTOOL_A_CABLE_TEST_NTF_HEADER); + if (err) + goto out; + + err = nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_TEST_NTF_STATUS, + ETHTOOL_A_CABLE_TEST_NTF_STATUS_COMPLETED); + if (err) + goto out; + + phydev->nest = nla_nest_start(phydev->skb, + ETHTOOL_A_CABLE_TEST_NTF_NEST); + if (!phydev->nest) { + err = -EMSGSIZE; + goto out; + } + + return 0; + +out: + nlmsg_free(phydev->skb); + phydev->skb = NULL; + return err; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_alloc); + +void ethnl_cable_test_free(struct phy_device *phydev) +{ + nlmsg_free(phydev->skb); + phydev->skb = NULL; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_free); + +void ethnl_cable_test_finished(struct phy_device *phydev) +{ + nla_nest_end(phydev->skb, phydev->nest); + + genlmsg_end(phydev->skb, phydev->ehdr); + + ethnl_multicast(phydev->skb, phydev->attached_dev); +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_finished); + +int ethnl_cable_test_result(struct phy_device *phydev, u8 pair, u8 result) +{ + struct nlattr *nest; + int ret = -EMSGSIZE; + + nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_NEST_RESULT); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_RESULT_PAIR, pair)) + goto err; + if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_RESULT_CODE, result)) + goto err; + + nla_nest_end(phydev->skb, nest); + return 0; + +err: + nla_nest_cancel(phydev->skb, nest); + return ret; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_result); + +int ethnl_cable_test_fault_length(struct phy_device *phydev, u8 pair, u32 cm) +{ + struct nlattr *nest; + int ret = -EMSGSIZE; + + nest = nla_nest_start(phydev->skb, + ETHTOOL_A_CABLE_NEST_FAULT_LENGTH); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_PAIR, pair)) + goto err; + if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_FAULT_LENGTH_CM, cm)) + goto err; + + nla_nest_end(phydev->skb, nest); + return 0; + +err: + nla_nest_cancel(phydev->skb, nest); + return ret; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_fault_length); + +struct cable_test_tdr_req_info { + struct ethnl_req_info base; +}; + +static const struct nla_policy +cable_test_tdr_act_cfg_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX + 1] = { + [ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST] = { .type = NLA_U32 }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST] = { .type = NLA_U32 }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP] = { .type = NLA_U32 }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR] = { .type = NLA_U8 }, +}; + +static const struct nla_policy +cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1] = { + [ETHTOOL_A_CABLE_TEST_TDR_UNSPEC] = { .type = NLA_REJECT }, + [ETHTOOL_A_CABLE_TEST_TDR_HEADER] = { .type = NLA_NESTED }, + [ETHTOOL_A_CABLE_TEST_TDR_CFG] = { .type = NLA_NESTED }, +}; + +/* CABLE_TEST_TDR_ACT */ +static int ethnl_act_cable_test_tdr_cfg(const struct nlattr *nest, + struct genl_info *info, + struct phy_tdr_config *cfg) +{ + struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX + 1]; + int ret; + + ret = nla_parse_nested(tb, ETHTOOL_A_CABLE_TEST_TDR_CFG_MAX, nest, + cable_test_tdr_act_cfg_policy, info->extack); + if (ret < 0) + return ret; + + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST]) + cfg->first = nla_get_u32( + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST]); + else + cfg->first = 100; + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST]) + cfg->last = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST]); + else + cfg->last = MAX_CABLE_LENGTH_CM; + + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP]) + cfg->step = nla_get_u32(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP]); + else + cfg->step = 100; + + if (tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]) { + cfg->pair = nla_get_u8(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR]); + if (cfg->pair > ETHTOOL_A_CABLE_PAIR_D) { + NL_SET_ERR_MSG_ATTR( + info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_PAIR], + "invalid pair parameter"); + return -EINVAL; + } + } else { + cfg->pair = PHY_PAIR_ALL; + } + + if (cfg->first > MAX_CABLE_LENGTH_CM) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_FIRST], + "invalid first parameter"); + return -EINVAL; + } + + if (cfg->last > MAX_CABLE_LENGTH_CM) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_LAST], + "invalid last parameter"); + return -EINVAL; + } + + if (cfg->first > cfg->last) { + NL_SET_ERR_MSG(info->extack, "invalid first/last parameter"); + return -EINVAL; + } + + if (!cfg->step) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP], + "invalid step parameter"); + return -EINVAL; + } + + if (cfg->step > (cfg->last - cfg->first)) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[ETHTOOL_A_CABLE_TEST_TDR_CFG_STEP], + "step parameter too big"); + return -EINVAL; + } + + return 0; +} + +int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *tb[ETHTOOL_A_CABLE_TEST_TDR_MAX + 1]; + struct ethnl_req_info req_info = {}; + struct phy_tdr_config cfg; + struct net_device *dev; + int ret; + + ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, + ETHTOOL_A_CABLE_TEST_TDR_MAX, + cable_test_tdr_act_policy, info->extack); + if (ret < 0) + return ret; + + ret = ethnl_parse_header_dev_get(&req_info, + tb[ETHTOOL_A_CABLE_TEST_TDR_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + dev = req_info.dev; + if (!dev->phydev) { + ret = -EOPNOTSUPP; + goto out_dev_put; + } + + ret = ethnl_act_cable_test_tdr_cfg(tb[ETHTOOL_A_CABLE_TEST_TDR_CFG], + info, &cfg); + if (ret) + goto out_dev_put; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + ret = phy_start_cable_test_tdr(dev->phydev, info->extack, &cfg); + + ethnl_ops_complete(dev); + + if (!ret) + ethnl_cable_test_started(dev->phydev, + ETHTOOL_MSG_CABLE_TEST_TDR_NTF); + +out_rtnl: + rtnl_unlock(); +out_dev_put: + dev_put(dev); + return ret; +} + +int ethnl_cable_test_amplitude(struct phy_device *phydev, + u8 pair, s16 mV) +{ + struct nlattr *nest; + int ret = -EMSGSIZE; + + nest = nla_nest_start(phydev->skb, + ETHTOOL_A_CABLE_TDR_NEST_AMPLITUDE); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u8(phydev->skb, ETHTOOL_A_CABLE_AMPLITUDE_PAIR, pair)) + goto err; + if (nla_put_u16(phydev->skb, ETHTOOL_A_CABLE_AMPLITUDE_mV, mV)) + goto err; + + nla_nest_end(phydev->skb, nest); + return 0; + +err: + nla_nest_cancel(phydev->skb, nest); + return ret; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_amplitude); + +int ethnl_cable_test_pulse(struct phy_device *phydev, u16 mV) +{ + struct nlattr *nest; + int ret = -EMSGSIZE; + + nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TDR_NEST_PULSE); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u16(phydev->skb, ETHTOOL_A_CABLE_PULSE_mV, mV)) + goto err; + + nla_nest_end(phydev->skb, nest); + return 0; + +err: + nla_nest_cancel(phydev->skb, nest); + return ret; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_pulse); + +int ethnl_cable_test_step(struct phy_device *phydev, u32 first, u32 last, + u32 step) +{ + struct nlattr *nest; + int ret = -EMSGSIZE; + + nest = nla_nest_start(phydev->skb, ETHTOOL_A_CABLE_TDR_NEST_STEP); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_FIRST_DISTANCE, + first)) + goto err; + + if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_LAST_DISTANCE, last)) + goto err; + + if (nla_put_u32(phydev->skb, ETHTOOL_A_CABLE_STEP_STEP_DISTANCE, step)) + goto err; + + nla_nest_end(phydev->skb, nest); + return 0; + +err: + nla_nest_cancel(phydev->skb, nest); + return ret; +} +EXPORT_SYMBOL_GPL(ethnl_cable_test_step); diff --git a/net/ethtool/channels.c b/net/ethtool/channels.c index 389924b65d05..9ef54cdcf662 100644 --- a/net/ethtool/channels.c +++ b/net/ethtool/channels.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include "netlink.h" #include "common.h" @@ -129,13 +129,13 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ETHTOOL_A_CHANNELS_MAX + 1]; unsigned int from_channel, old_total, i; + bool mod = false, mod_combined = false; struct ethtool_channels channels = {}; struct ethnl_req_info req_info = {}; const struct nlattr *err_attr; const struct ethtool_ops *ops; struct net_device *dev; u32 max_rx_in_use = 0; - bool mod = false; int ret; ret = nlmsg_parse(info->nlhdr, GENL_HDRLEN, tb, @@ -170,7 +170,8 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) ethnl_update_u32(&channels.other_count, tb[ETHTOOL_A_CHANNELS_OTHER_COUNT], &mod); ethnl_update_u32(&channels.combined_count, - tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT], &mod); + tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT], &mod_combined); + mod |= mod_combined; ret = 0; if (!mod) goto out_ops; @@ -193,6 +194,21 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info) goto out_ops; } + /* ensure there is at least one RX and one TX channel */ + if (!channels.combined_count && !channels.rx_count) + err_attr = tb[ETHTOOL_A_CHANNELS_RX_COUNT]; + else if (!channels.combined_count && !channels.tx_count) + err_attr = tb[ETHTOOL_A_CHANNELS_TX_COUNT]; + else + err_attr = NULL; + if (err_attr) { + if (mod_combined) + err_attr = tb[ETHTOOL_A_CHANNELS_COMBINED_COUNT]; + ret = -EINVAL; + NL_SET_ERR_MSG_ATTR(info->extack, err_attr, "requested channel counts would result in no RX or TX channel being configured"); + goto out_ops; + } + /* ensure the new Rx count fits within the configured Rx flow * indirection table settings */ diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 89d0b1827aaf..b5df90c981c2 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -17,7 +17,6 @@ #include <linux/phy.h> #include <linux/bitops.h> #include <linux/uaccess.h> -#include <linux/vermagic.h> #include <linux/vmalloc.h> #include <linux/sfp.h> #include <linux/slab.h> @@ -25,10 +24,10 @@ #include <linux/sched/signal.h> #include <linux/net.h> #include <net/devlink.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include <net/flow_offload.h> #include <linux/ethtool_netlink.h> - +#include <generated/utsrelease.h> #include "common.h" /* @@ -553,6 +552,8 @@ static int ethtool_get_link_ksettings(struct net_device *dev, link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS; link_ksettings.base.link_mode_masks_nwords = __ETHTOOL_LINK_MODE_MASK_NU32; + link_ksettings.base.master_slave_cfg = MASTER_SLAVE_CFG_UNSUPPORTED; + link_ksettings.base.master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED; return store_link_ksettings_for_user(useraddr, &link_ksettings); } @@ -590,6 +591,10 @@ static int ethtool_set_link_ksettings(struct net_device *dev, != link_ksettings.base.link_mode_masks_nwords) return -EINVAL; + if (link_ksettings.base.master_slave_cfg || + link_ksettings.base.master_slave_state) + return -EINVAL; + err = dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); if (err >= 0) { ethtool_notify(dev, ETHTOOL_MSG_LINKINFO_NTF, NULL); @@ -1505,11 +1510,14 @@ static noinline_for_stack int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) { struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE }; + int ret; if (!dev->ethtool_ops->get_coalesce) return -EOPNOTSUPP; - dev->ethtool_ops->get_coalesce(dev, &coalesce); + ret = dev->ethtool_ops->get_coalesce(dev, &coalesce); + if (ret) + return ret; if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) return -EFAULT; @@ -1664,6 +1672,12 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, dev->ethtool_ops->get_channels(dev, &curr); + if (channels.rx_count == curr.rx_count && + channels.tx_count == curr.tx_count && + channels.combined_count == curr.combined_count && + channels.other_count == curr.other_count) + return 0; + /* ensure new counts are within the maximums */ if (channels.rx_count > curr.max_rx || channels.tx_count > curr.max_tx || @@ -1671,6 +1685,11 @@ static noinline_for_stack int ethtool_set_channels(struct net_device *dev, channels.other_count > curr.max_other) return -EINVAL; + /* ensure there is at least one RX and one TX channel */ + if (!channels.combined_count && + (!channels.rx_count || !channels.tx_count)) + return -EINVAL; + /* ensure the new Rx count fits within the configured Rx flow * indirection table settings */ if (netif_is_rxfh_configured(dev) && @@ -1746,7 +1765,9 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr) if (!data) return -ENOMEM; + netif_testing_on(dev); ops->self_test(dev, &test, data); + netif_testing_off(dev); ret = -EFAULT; if (copy_to_user(useraddr, &test, sizeof(test))) diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 452608c6d856..fd4f3e58c6f6 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -27,6 +27,8 @@ linkmodes_get_policy[ETHTOOL_A_LINKMODES_MAX + 1] = { [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE] = { .type = NLA_REJECT }, }; static int linkmodes_prepare_data(const struct ethnl_req_info *req_base, @@ -63,6 +65,7 @@ static int linkmodes_reply_size(const struct ethnl_req_info *req_base, { const struct linkmodes_reply_data *data = LINKMODES_REPDATA(reply_base); const struct ethtool_link_ksettings *ksettings = &data->ksettings; + const struct ethtool_link_settings *lsettings = &ksettings->base; bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; int len, ret; @@ -86,6 +89,12 @@ static int linkmodes_reply_size(const struct ethnl_req_info *req_base, len += ret; } + if (lsettings->master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED) + len += nla_total_size(sizeof(u8)); + + if (lsettings->master_slave_state != MASTER_SLAVE_STATE_UNSUPPORTED) + len += nla_total_size(sizeof(u8)); + return len; } @@ -122,6 +131,16 @@ static int linkmodes_fill_reply(struct sk_buff *skb, nla_put_u8(skb, ETHTOOL_A_LINKMODES_DUPLEX, lsettings->duplex)) return -EMSGSIZE; + if (lsettings->master_slave_cfg != MASTER_SLAVE_CFG_UNSUPPORTED && + nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG, + lsettings->master_slave_cfg)) + return -EMSGSIZE; + + if (lsettings->master_slave_state != MASTER_SLAVE_STATE_UNSUPPORTED && + nla_put_u8(skb, ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE, + lsettings->master_slave_state)) + return -EMSGSIZE; + return 0; } @@ -249,6 +268,8 @@ linkmodes_set_policy[ETHTOOL_A_LINKMODES_MAX + 1] = { [ETHTOOL_A_LINKMODES_PEER] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKMODES_SPEED] = { .type = NLA_U32 }, [ETHTOOL_A_LINKMODES_DUPLEX] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG] = { .type = NLA_U8 }, + [ETHTOOL_A_LINKMODES_MASTER_SLAVE_STATE] = { .type = NLA_REJECT }, }; /* Set advertised link modes to all supported modes matching requested speed @@ -287,14 +308,45 @@ static bool ethnl_auto_linkmodes(struct ethtool_link_ksettings *ksettings, __ETHTOOL_LINK_MODE_MASK_NBITS); } +static bool ethnl_validate_master_slave_cfg(u8 cfg) +{ + switch (cfg) { + case MASTER_SLAVE_CFG_MASTER_PREFERRED: + case MASTER_SLAVE_CFG_SLAVE_PREFERRED: + case MASTER_SLAVE_CFG_MASTER_FORCE: + case MASTER_SLAVE_CFG_SLAVE_FORCE: + return true; + } + + return false; +} + static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, struct ethtool_link_ksettings *ksettings, bool *mod) { struct ethtool_link_settings *lsettings = &ksettings->base; bool req_speed, req_duplex; + const struct nlattr *master_slave_cfg; int ret; + master_slave_cfg = tb[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG]; + if (master_slave_cfg) { + u8 cfg = nla_get_u8(master_slave_cfg); + + if (lsettings->master_slave_cfg == MASTER_SLAVE_CFG_UNSUPPORTED) { + NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg, + "master/slave configuration not supported by device"); + return -EOPNOTSUPP; + } + + if (!ethnl_validate_master_slave_cfg(cfg)) { + NL_SET_ERR_MSG_ATTR(info->extack, master_slave_cfg, + "master/slave value is invalid"); + return -EOPNOTSUPP; + } + } + *mod = false; req_speed = tb[ETHTOOL_A_LINKMODES_SPEED]; req_duplex = tb[ETHTOOL_A_LINKMODES_DUPLEX]; @@ -311,6 +363,7 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, mod); ethnl_update_u8(&lsettings->duplex, tb[ETHTOOL_A_LINKMODES_DUPLEX], mod); + ethnl_update_u8(&lsettings->master_slave_cfg, master_slave_cfg, mod); if (!tb[ETHTOOL_A_LINKMODES_OURS] && lsettings->autoneg && (req_speed || req_duplex) && diff --git a/net/ethtool/linkstate.c b/net/ethtool/linkstate.c index 2740cde0a182..7f47ba89054e 100644 --- a/net/ethtool/linkstate.c +++ b/net/ethtool/linkstate.c @@ -2,6 +2,7 @@ #include "netlink.h" #include "common.h" +#include <linux/phy.h> struct linkstate_req_info { struct ethnl_req_info base; @@ -10,6 +11,8 @@ struct linkstate_req_info { struct linkstate_reply_data { struct ethnl_reply_data base; int link; + int sqi; + int sqi_max; }; #define LINKSTATE_REPDATA(__reply_base) \ @@ -20,8 +23,46 @@ linkstate_get_policy[ETHTOOL_A_LINKSTATE_MAX + 1] = { [ETHTOOL_A_LINKSTATE_UNSPEC] = { .type = NLA_REJECT }, [ETHTOOL_A_LINKSTATE_HEADER] = { .type = NLA_NESTED }, [ETHTOOL_A_LINKSTATE_LINK] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_SQI] = { .type = NLA_REJECT }, + [ETHTOOL_A_LINKSTATE_SQI_MAX] = { .type = NLA_REJECT }, }; +static int linkstate_get_sqi(struct net_device *dev) +{ + struct phy_device *phydev = dev->phydev; + int ret; + + if (!phydev) + return -EOPNOTSUPP; + + mutex_lock(&phydev->lock); + if (!phydev->drv || !phydev->drv->get_sqi) + ret = -EOPNOTSUPP; + else + ret = phydev->drv->get_sqi(phydev); + mutex_unlock(&phydev->lock); + + return ret; +} + +static int linkstate_get_sqi_max(struct net_device *dev) +{ + struct phy_device *phydev = dev->phydev; + int ret; + + if (!phydev) + return -EOPNOTSUPP; + + mutex_lock(&phydev->lock); + if (!phydev->drv || !phydev->drv->get_sqi_max) + ret = -EOPNOTSUPP; + else + ret = phydev->drv->get_sqi_max(phydev); + mutex_unlock(&phydev->lock); + + return ret; +} + static int linkstate_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, struct genl_info *info) @@ -34,6 +75,19 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, if (ret < 0) return ret; data->link = __ethtool_get_link(dev); + + ret = linkstate_get_sqi(dev); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + + data->sqi = ret; + + ret = linkstate_get_sqi_max(dev); + if (ret < 0 && ret != -EOPNOTSUPP) + return ret; + + data->sqi_max = ret; + ethnl_ops_complete(dev); return 0; @@ -42,8 +96,19 @@ static int linkstate_prepare_data(const struct ethnl_req_info *req_base, static int linkstate_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { - return nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ + struct linkstate_reply_data *data = LINKSTATE_REPDATA(reply_base); + int len; + + len = nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ + 0; + + if (data->sqi != -EOPNOTSUPP) + len += nla_total_size(sizeof(u32)); + + if (data->sqi_max != -EOPNOTSUPP) + len += nla_total_size(sizeof(u32)); + + return len; } static int linkstate_fill_reply(struct sk_buff *skb, @@ -56,6 +121,14 @@ static int linkstate_fill_reply(struct sk_buff *skb, nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!data->link)) return -EMSGSIZE; + if (data->sqi != -EOPNOTSUPP && + nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi)) + return -EMSGSIZE; + + if (data->sqi_max != -EOPNOTSUPP && + nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, data->sqi_max)) + return -EMSGSIZE; + return 0; } diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index ed5357210193..88fd07f47040 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -181,13 +181,13 @@ err: return NULL; } -static void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd) +void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd) { return genlmsg_put(skb, 0, ++ethnl_bcast_seq, ðtool_genl_family, 0, cmd); } -static int ethnl_multicast(struct sk_buff *skb, struct net_device *dev) +int ethnl_multicast(struct sk_buff *skb, struct net_device *dev) { return genlmsg_multicast_netns(ðtool_genl_family, dev_net(dev), skb, 0, ETHNL_MCGRP_MONITOR, GFP_KERNEL); @@ -839,6 +839,16 @@ static const struct genl_ops ethtool_genl_ops[] = { .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, }, + { + .cmd = ETHTOOL_MSG_CABLE_TEST_ACT, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_act_cable_test, + }, + { + .cmd = ETHTOOL_MSG_CABLE_TEST_TDR_ACT, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_act_cable_test_tdr, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 81b8fa020bcb..9a96b6e90dc2 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -19,6 +19,8 @@ int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, u16 hdr_attrtype, struct genl_info *info, void **ehdrp); +void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd); +int ethnl_multicast(struct sk_buff *skb, struct net_device *dev); /** * ethnl_strz_size() - calculate attribute length for fixed size string @@ -357,5 +359,7 @@ int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info); int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info); int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info); int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info); +int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); +int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index fc7027314ad8..cd99f548e440 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -125,13 +125,11 @@ int hsr_get_max_mtu(struct hsr_priv *hsr) static int hsr_dev_change_mtu(struct net_device *dev, int new_mtu) { struct hsr_priv *hsr; - struct hsr_port *master; hsr = netdev_priv(dev); - master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (new_mtu > hsr_get_max_mtu(hsr)) { - netdev_info(master->dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n", + netdev_info(dev, "A HSR master's MTU cannot be greater than the smallest MTU of its slaves minus the HSR Tag length (%d octets).\n", HSR_HLEN); return -EINVAL; } diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index ddd9605bad04..ed13760463de 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -321,7 +321,7 @@ static int hsr_fill_frame_info(struct hsr_frame_info *frame, if (ethhdr->h_proto == htons(ETH_P_8021Q)) { frame->is_vlan = true; /* FIXME: */ - WARN_ONCE(1, "HSR: VLAN not yet supported"); + netdev_warn_once(skb->dev, "VLAN not yet supported"); } if (ethhdr->h_proto == htons(ETH_P_PRP) || ethhdr->h_proto == htons(ETH_P_HSR)) { diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 26d6c39f24e1..e2564de67603 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -15,12 +15,23 @@ #include "hsr_framereg.h" #include "hsr_slave.h" +static bool hsr_slave_empty(struct hsr_priv *hsr) +{ + struct hsr_port *port; + + hsr_for_each_port(hsr, port) + if (port->type != HSR_PT_MASTER) + return false; + return true; +} + static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, void *ptr) { - struct net_device *dev; struct hsr_port *port, *master; + struct net_device *dev; struct hsr_priv *hsr; + LIST_HEAD(list_kill); int mtu_max; int res; @@ -85,8 +96,15 @@ static int hsr_netdev_notify(struct notifier_block *nb, unsigned long event, master->dev->mtu = mtu_max; break; case NETDEV_UNREGISTER: - if (!is_hsr_master(dev)) + if (!is_hsr_master(dev)) { + master = hsr_port_get_hsr(port->hsr, HSR_PT_MASTER); hsr_del_port(port); + if (hsr_slave_empty(master->hsr)) { + unregister_netdevice_queue(master->dev, + &list_kill); + unregister_netdevice_many(&list_kill); + } + } break; case NETDEV_PRE_TYPE_CHANGE: /* HSR works only on Ethernet devices. Refuse slave to change diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 7321cf8d6d2c..f74193465bf5 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -62,15 +62,6 @@ struct hsr_tag { * with the path field in-between, which seems strange. I'm guessing the MAC * address definition is in error. */ -static inline u16 get_hsr_tag_path(struct hsr_tag *ht) -{ - return ntohs(ht->path_and_LSDU_size) >> 12; -} - -static inline u16 get_hsr_tag_LSDU_size(struct hsr_tag *ht) -{ - return ntohs(ht->path_and_LSDU_size) & 0x0FFF; -} static inline void set_hsr_tag_path(struct hsr_tag *ht, u16 path) { @@ -103,16 +94,6 @@ struct hsr_sup_payload { unsigned char macaddress_A[ETH_ALEN]; } __packed; -static inline u16 get_hsr_stag_path(struct hsr_sup_tag *hst) -{ - return get_hsr_tag_path((struct hsr_tag *)hst); -} - -static inline u16 get_hsr_stag_HSR_ver(struct hsr_sup_tag *hst) -{ - return get_hsr_tag_LSDU_size((struct hsr_tag *)hst); -} - static inline void set_hsr_stag_path(struct hsr_sup_tag *hst, u16 path) { set_hsr_tag_path((struct hsr_tag *)hst, path); diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index c0b107cdd715..3297e7fa9945 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -58,6 +58,13 @@ static const struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; +static int lowpan_dev_init(struct net_device *ldev) +{ + netdev_lockdep_set_classes(ldev); + + return 0; +} + static int lowpan_open(struct net_device *dev) { if (!open_count) @@ -89,6 +96,7 @@ static int lowpan_get_iflink(const struct net_device *dev) } static const struct net_device_ops lowpan_netdev_ops = { + .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, diff --git a/net/ieee802154/6lowpan/rx.c b/net/ieee802154/6lowpan/rx.c index ee179380a766..b34d050c9687 100644 --- a/net/ieee802154/6lowpan/rx.c +++ b/net/ieee802154/6lowpan/rx.c @@ -240,7 +240,7 @@ static inline bool lowpan_is_reserved(u8 dispatch) return ((dispatch >= 0x44 && dispatch <= 0x4F) || (dispatch >= 0x51 && dispatch <= 0x5F) || (dispatch >= 0xc8 && dispatch <= 0xdf) || - (dispatch >= 0xe8 && dispatch <= 0xff)); + dispatch >= 0xe8); } /* lowpan_rx_h_check checks on generic 6LoWPAN requirements diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 25a8888826b8..23ba5045e3d3 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -49,7 +49,7 @@ config IP_ADVANCED_ROUTER Note that some distributions enable it in startup scripts. For details about rp_filter strict and loose mode read - <file:Documentation/networking/ip-sysctl.txt>. + <file:Documentation/networking/ip-sysctl.rst>. If unsure, say N here. @@ -384,6 +384,7 @@ config INET_ESPINTCP depends on XFRM && INET_ESP select STREAM_PARSER select NET_SOCK_MSG + select XFRM_ESPINTCP help Support for RFC 8229 encapsulation of ESP and IKE over TCP/IPv4 sockets. diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index cf58e29cf746..02aa5cb3a4fd 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -116,6 +116,7 @@ #include <linux/mroute.h> #endif #include <net/l3mdev.h> +#include <net/compat.h> #include <trace/events/sock.h> @@ -450,12 +451,12 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) return err; - return __inet_bind(sk, uaddr, addr_len, false, true); + return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); } EXPORT_SYMBOL(inet_bind); int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock) + u32 flags) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct inet_sock *inet = inet_sk(sk); @@ -506,7 +507,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, * would be illegal to use them (multicast/broadcast) in * which case the sending device address is used. */ - if (with_lock) + if (flags & BIND_WITH_LOCK) lock_sock(sk); /* Check these errors (active socket, double bind). */ @@ -520,16 +521,18 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Make sure we are allowed to bind here. */ if (snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) { + (flags & BIND_FORCE_ADDRESS_NO_PORT))) { if (sk->sk_prot->get_port(sk, snum)) { inet->inet_saddr = inet->inet_rcv_saddr = 0; err = -EADDRINUSE; goto out_release_sock; } - err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); - if (err) { - inet->inet_saddr = inet->inet_rcv_saddr = 0; - goto out_release_sock; + if (!(flags & BIND_FROM_BPF)) { + err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); + if (err) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + goto out_release_sock; + } } } @@ -543,7 +546,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, sk_dst_reset(sk); err = 0; out_release_sock: - if (with_lock) + if (flags & BIND_WITH_LOCK) release_sock(sk); out: return err; @@ -753,12 +756,11 @@ do_err: } EXPORT_SYMBOL(inet_accept); - /* * This does both peername and sockname. */ int inet_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) + int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); @@ -779,6 +781,11 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; } + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + peer ? BPF_CGROUP_INET4_GETPEERNAME : + BPF_CGROUP_INET4_GETSOCKNAME, + NULL); memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sizeof(*sin); } @@ -968,17 +975,42 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) EXPORT_SYMBOL(inet_ioctl); #ifdef CONFIG_COMPAT +static int inet_compat_routing_ioctl(struct sock *sk, unsigned int cmd, + struct compat_rtentry __user *ur) +{ + compat_uptr_t rtdev; + struct rtentry rt; + + if (copy_from_user(&rt.rt_dst, &ur->rt_dst, + 3 * sizeof(struct sockaddr)) || + get_user(rt.rt_flags, &ur->rt_flags) || + get_user(rt.rt_metric, &ur->rt_metric) || + get_user(rt.rt_mtu, &ur->rt_mtu) || + get_user(rt.rt_window, &ur->rt_window) || + get_user(rt.rt_irtt, &ur->rt_irtt) || + get_user(rtdev, &ur->rt_dev)) + return -EFAULT; + + rt.rt_dev = compat_ptr(rtdev); + return ip_rt_ioctl(sock_net(sk), cmd, &rt); +} + static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { + void __user *argp = compat_ptr(arg); struct sock *sk = sock->sk; - int err = -ENOIOCTLCMD; - - if (sk->sk_prot->compat_ioctl) - err = sk->sk_prot->compat_ioctl(sk, cmd, arg); - return err; + switch (cmd) { + case SIOCADDRT: + case SIOCDELRT: + return inet_compat_routing_ioctl(sk, cmd, argp); + default: + if (!sk->sk_prot->compat_ioctl) + return -ENOIOCTLCMD; + return sk->sk_prot->compat_ioctl(sk, cmd, arg); + } } -#endif +#endif /* CONFIG_COMPAT */ const struct proto_ops inet_stream_ops = { .family = PF_INET, @@ -1835,6 +1867,7 @@ static __net_init int inet_init_net(struct net *net) net->ipv4.sysctl_ip_early_demux = 1; net->ipv4.sysctl_udp_early_demux = 1; net->ipv4.sysctl_tcp_early_demux = 1; + net->ipv4.sysctl_nexthop_compat_mode = 1; #ifdef CONFIG_SYSCTL net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; #endif @@ -1914,7 +1947,7 @@ static int __init inet_init(void) { struct inet_protosw *q; struct list_head *r; - int rc = -EINVAL; + int rc; sock_skb_cb_check_size(sizeof(struct inet_skb_parm)); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5267b6b191eb..123a6d39438f 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2367,8 +2367,7 @@ static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) } static int devinet_conf_proc(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int old_value = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -2420,8 +2419,7 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, } static int devinet_sysctl_forward(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -2464,8 +2462,7 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write, } static int ipv4_doint_and_flush(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -2670,11 +2667,24 @@ static __net_init int devinet_init_net(struct net *net) tbl[0].extra2 = net; #endif - if ((!IS_ENABLED(CONFIG_SYSCTL) || - sysctl_devconf_inherit_init_net != 2) && - !net_eq(net, &init_net)) { - memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf)); - memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); + if (!net_eq(net, &init_net)) { + if (IS_ENABLED(CONFIG_SYSCTL) && + sysctl_devconf_inherit_init_net == 3) { + /* copy from the current netns */ + memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all, + sizeof(ipv4_devconf)); + memcpy(dflt, + current->nsproxy->net_ns->ipv4.devconf_dflt, + sizeof(ipv4_devconf_dflt)); + } else if (!IS_ENABLED(CONFIG_SYSCTL) || + sysctl_devconf_inherit_init_net != 2) { + /* inherit == 0 or 1: copy from init_net */ + memcpy(all, init_net.ipv4.devconf_all, + sizeof(ipv4_devconf)); + memcpy(dflt, init_net.ipv4.devconf_dflt, + sizeof(ipv4_devconf_dflt)); + } + /* else inherit == 2: use compiled values */ } #ifdef CONFIG_SYSCTL diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 55ca2e521828..e53871e4a097 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1780,6 +1780,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; if (nexthop_is_blackhole(fi->nh)) rtm->rtm_type = RTN_BLACKHOLE; + if (!fi->fib_net->ipv4.sysctl_nexthop_compat_mode) + goto offload; } if (nhs == 1) { @@ -1805,6 +1807,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; } +offload: if (fri->offload) rtm->rtm_flags |= RTM_F_OFFLOAD; if (fri->trap) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index fc61f51d87a3..956a806649f7 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -853,7 +853,7 @@ static bool icmp_unreach(struct sk_buff *skb) case ICMP_FRAG_NEEDED: /* for documentation of the ip_no_pmtu_disc * values please see - * Documentation/networking/ip-sysctl.txt + * Documentation/networking/ip-sysctl.rst */ switch (net->ipv4.sysctl_ip_no_pmtu_disc) { default: diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 47f0502b2101..7b272bbed2b4 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2565,9 +2565,9 @@ done: } int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, - struct group_filter __user *optval, int __user *optlen) + struct sockaddr_storage __user *p) { - int err, i, count, copycount; + int i, count, copycount; struct sockaddr_in *psin; __be32 addr; struct ip_mc_socklist *pmc; @@ -2583,37 +2583,29 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, if (!ipv4_is_multicast(addr)) return -EINVAL; - err = -EADDRNOTAVAIL; - for_each_pmc_rtnl(inet, pmc) { if (pmc->multi.imr_multiaddr.s_addr == addr && pmc->multi.imr_ifindex == gsf->gf_interface) break; } if (!pmc) /* must have a prior join */ - goto done; + return -EADDRNOTAVAIL; gsf->gf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); count = psl ? psl->sl_count : 0; copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; - if (put_user(GROUP_FILTER_SIZE(copycount), optlen) || - copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { - return -EFAULT; - } - for (i = 0; i < copycount; i++) { + for (i = 0; i < copycount; i++, p++) { struct sockaddr_storage ss; psin = (struct sockaddr_in *)&ss; memset(&ss, 0, sizeof(ss)); psin->sin_family = AF_INET; psin->sin_addr.s_addr = psl->sl_addr[i]; - if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss))) + if (copy_to_user(p, &ss, sizeof(ss))) return -EFAULT; } return 0; -done: - return err; } /* diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 65c29f2bd89f..f40b1b72f979 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -901,11 +901,7 @@ void inet_csk_prepare_forced_close(struct sock *sk) /* sk_clone_lock locked the socket and set refcnt to 2 */ bh_unlock_sock(sk); sock_put(sk); - - /* The below has to be done to allow calling inet_csk_destroy_sock */ - sock_set_flag(sk, SOCK_DEAD); - percpu_counter_inc(sk->sk_prot->orphan_count); - inet_sk(sk)->inet_num = 0; + inet_csk_prepare_for_destroy_sock(sk); } EXPORT_SYMBOL(inet_csk_prepare_forced_close); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 5d50aad3cdbf..125f4f8a36b4 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -43,6 +43,9 @@ struct inet_diag_entry { u16 userlocks; u32 ifindex; u32 mark; +#ifdef CONFIG_SOCK_CGROUP_DATA + u64 cgroup_id; +#endif }; static DEFINE_MUTEX(inet_diag_table_mutex); @@ -162,6 +165,13 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, goto errout; } +#ifdef CONFIG_SOCK_CGROUP_DATA + if (nla_put_u64_64bit(skb, INET_DIAG_CGROUP_ID, + cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)), + INET_DIAG_PAD)) + goto errout; +#endif + r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); r->idiag_inode = sock_i_ino(sk); @@ -675,6 +685,16 @@ static int inet_diag_bc_run(const struct nlattr *_bc, yes = 0; break; } +#ifdef CONFIG_SOCK_CGROUP_DATA + case INET_DIAG_BC_CGROUP_COND: { + u64 cgroup_id; + + cgroup_id = get_unaligned((const u64 *)(op + 1)); + if (cgroup_id != entry->cgroup_id) + yes = 0; + break; + } +#endif } if (yes) { @@ -725,6 +745,10 @@ int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; else entry.mark = 0; +#ifdef CONFIG_SOCK_CGROUP_DATA + entry.cgroup_id = sk_fullsock(sk) ? + cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0; +#endif return inet_diag_bc_run(bc, &entry); } @@ -814,6 +838,15 @@ static bool valid_markcond(const struct inet_diag_bc_op *op, int len, return len >= *min_len; } +#ifdef CONFIG_SOCK_CGROUP_DATA +static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len, + int *min_len) +{ + *min_len += sizeof(u64); + return len >= *min_len; +} +#endif + static int inet_diag_bc_audit(const struct nlattr *attr, const struct sk_buff *skb) { @@ -856,6 +889,12 @@ static int inet_diag_bc_audit(const struct nlattr *attr, if (!valid_markcond(bc, len, &min_len)) return -EINVAL; break; +#ifdef CONFIG_SOCK_CGROUP_DATA + case INET_DIAG_BC_CGROUP_COND: + if (!valid_cgroupcond(bc, len, &min_len)) + return -EINVAL; + break; +#endif case INET_DIAG_BC_AUTO: case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 029b24eeafba..4e31f23e4117 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -248,6 +248,15 @@ static void gre_err(struct sk_buff *skb, u32 info) ipgre_err(skb, info, &tpi); } +static bool is_erspan_type1(int gre_hdr_len) +{ + /* Both ERSPAN type I (version 0) and type II (version 1) use + * protocol 0x88BE, but the type I has only 4-byte GRE header, + * while type II has 8-byte. + */ + return gre_hdr_len == 4; +} + static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int gre_hdr_len) { @@ -262,17 +271,26 @@ static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, int len; itn = net_generic(net, erspan_net_id); - iph = ip_hdr(skb); - ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); - ver = ershdr->ver; - - tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, - tpi->flags | TUNNEL_KEY, - iph->saddr, iph->daddr, tpi->key); + if (is_erspan_type1(gre_hdr_len)) { + ver = 0; + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, + tpi->flags | TUNNEL_NO_KEY, + iph->saddr, iph->daddr, 0); + } else { + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); + ver = ershdr->ver; + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, + tpi->flags | TUNNEL_KEY, + iph->saddr, iph->daddr, tpi->key); + } if (tunnel) { - len = gre_hdr_len + erspan_hdr_len(ver); + if (is_erspan_type1(gre_hdr_len)) + len = gre_hdr_len; + else + len = gre_hdr_len + erspan_hdr_len(ver); + if (unlikely(!pskb_may_pull(skb, len))) return PACKET_REJECT; @@ -665,7 +683,10 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, } /* Push ERSPAN header */ - if (tunnel->erspan_ver == 1) { + if (tunnel->erspan_ver == 0) { + proto = htons(ETH_P_ERSPAN); + tunnel->parms.o_flags &= ~TUNNEL_SEQ; + } else if (tunnel->erspan_ver == 1) { erspan_build_header(skb, ntohl(tunnel->parms.o_key), tunnel->index, truncate, true); @@ -747,45 +768,37 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) } } -static int ipgre_tunnel_ioctl(struct net_device *dev, - struct ifreq *ifr, int cmd) +static int ipgre_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, + int cmd) { - struct ip_tunnel_parm p; int err; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; - if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { - if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || - p.iph.ihl != 5 || (p.iph.frag_off & htons(~IP_DF)) || - ((p.i_flags | p.o_flags) & (GRE_VERSION | GRE_ROUTING))) + if (p->iph.version != 4 || p->iph.protocol != IPPROTO_GRE || + p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF)) || + ((p->i_flags | p->o_flags) & (GRE_VERSION | GRE_ROUTING))) return -EINVAL; } - p.i_flags = gre_flags_to_tnl_flags(p.i_flags); - p.o_flags = gre_flags_to_tnl_flags(p.o_flags); + p->i_flags = gre_flags_to_tnl_flags(p->i_flags); + p->o_flags = gre_flags_to_tnl_flags(p->o_flags); - err = ip_tunnel_ioctl(dev, &p, cmd); + err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd == SIOCCHGTUNNEL) { struct ip_tunnel *t = netdev_priv(dev); - t->parms.i_flags = p.i_flags; - t->parms.o_flags = p.o_flags; + t->parms.i_flags = p->i_flags; + t->parms.o_flags = p->o_flags; if (strcmp(dev->rtnl_link_ops->kind, "erspan")) ipgre_link_update(dev, true); } - p.i_flags = gre_tnl_flags_to_gre_flags(p.i_flags); - p.o_flags = gre_tnl_flags_to_gre_flags(p.o_flags); - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - return -EFAULT; - + p->i_flags = gre_tnl_flags_to_gre_flags(p->i_flags); + p->o_flags = gre_tnl_flags_to_gre_flags(p->o_flags); return 0; } @@ -903,10 +916,11 @@ static const struct net_device_ops ipgre_netdev_ops = { .ndo_stop = ipgre_close, #endif .ndo_start_xmit = ipgre_xmit, - .ndo_do_ioctl = ipgre_tunnel_ioctl, + .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = ipgre_tunnel_ctl, }; #define GRE_FEATURES (NETIF_F_SG | \ @@ -1066,7 +1080,11 @@ static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], if (ret) return ret; - /* ERSPAN should only have GRE sequence and key flag */ + if (data[IFLA_GRE_ERSPAN_VER] && + nla_get_u8(data[IFLA_GRE_ERSPAN_VER]) == 0) + return 0; + + /* ERSPAN type II/III should only have GRE sequence and key flag */ if (data[IFLA_GRE_OFLAGS]) flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); if (data[IFLA_GRE_IFLAGS]) @@ -1174,7 +1192,7 @@ static int erspan_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_ERSPAN_VER]) { t->erspan_ver = nla_get_u8(data[IFLA_GRE_ERSPAN_VER]); - if (t->erspan_ver != 1 && t->erspan_ver != 2) + if (t->erspan_ver > 2) return -EINVAL; } @@ -1259,7 +1277,11 @@ static int erspan_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - tunnel->tun_hlen = 8; + if (tunnel->erspan_ver == 0) + tunnel->tun_hlen = 4; /* 4-byte GRE hdr. */ + else + tunnel->tun_hlen = 8; /* 8-byte GRE hdr. */ + tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen + erspan_hdr_len(tunnel->erspan_ver); @@ -1456,8 +1478,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) struct ip_tunnel_parm *p = &t->parms; __be16 o_flags = p->o_flags; - if (t->erspan_ver == 1 || t->erspan_ver == 2) { - if (!t->collect_md) + if (t->erspan_ver <= 2) { + if (t->erspan_ver != 0 && !t->collect_md) o_flags |= TUNNEL_KEY; if (nla_put_u8(skb, IFLA_GRE_ERSPAN_VER, t->erspan_ver)) @@ -1466,7 +1488,7 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) if (t->erspan_ver == 1) { if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) goto nla_put_failure; - } else { + } else if (t->erspan_ver == 2) { if (nla_put_u8(skb, IFLA_GRE_ERSPAN_DIR, t->dir)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ERSPAN_HWID, t->hwid)) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index aa3fd61818c4..84ec3703c909 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -560,6 +560,61 @@ out: return err; } +static void __ip_sock_set_tos(struct sock *sk, int val) +{ + if (sk->sk_type == SOCK_STREAM) { + val &= ~INET_ECN_MASK; + val |= inet_sk(sk)->tos & INET_ECN_MASK; + } + if (inet_sk(sk)->tos != val) { + inet_sk(sk)->tos = val; + sk->sk_priority = rt_tos2priority(val); + sk_dst_reset(sk); + } +} + +void ip_sock_set_tos(struct sock *sk, int val) +{ + lock_sock(sk); + __ip_sock_set_tos(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_tos); + +void ip_sock_set_freebind(struct sock *sk) +{ + lock_sock(sk); + inet_sk(sk)->freebind = true; + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_freebind); + +void ip_sock_set_recverr(struct sock *sk) +{ + lock_sock(sk); + inet_sk(sk)->recverr = true; + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_recverr); + +int ip_sock_set_mtu_discover(struct sock *sk, int val) +{ + if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_OMIT) + return -EINVAL; + lock_sock(sk); + inet_sk(sk)->pmtudisc = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(ip_sock_set_mtu_discover); + +void ip_sock_set_pktinfo(struct sock *sk) +{ + lock_sock(sk); + inet_sk(sk)->cmsg_flags |= IP_CMSG_PKTINFO; + release_sock(sk); +} +EXPORT_SYMBOL(ip_sock_set_pktinfo); /* * Socket option code for IP. This is the end of the line after any @@ -587,6 +642,86 @@ static bool setsockopt_needs_rtnl(int optname) return false; } +static int set_mcast_msfilter(struct sock *sk, int ifindex, + int numsrc, int fmode, + struct sockaddr_storage *group, + struct sockaddr_storage *list) +{ + int msize = IP_MSFILTER_SIZE(numsrc); + struct ip_msfilter *msf; + struct sockaddr_in *psin; + int err, i; + + msf = kmalloc(msize, GFP_KERNEL); + if (!msf) + return -ENOBUFS; + + psin = (struct sockaddr_in *)group; + if (psin->sin_family != AF_INET) + goto Eaddrnotavail; + msf->imsf_multiaddr = psin->sin_addr.s_addr; + msf->imsf_interface = 0; + msf->imsf_fmode = fmode; + msf->imsf_numsrc = numsrc; + for (i = 0; i < numsrc; ++i) { + psin = (struct sockaddr_in *)&list[i]; + + if (psin->sin_family != AF_INET) + goto Eaddrnotavail; + msf->imsf_slist[i] = psin->sin_addr.s_addr; + } + err = ip_mc_msfilter(sk, msf, ifindex); + kfree(msf); + return err; + +Eaddrnotavail: + kfree(msf); + return -EADDRNOTAVAIL; +} + +static int do_mcast_group_source(struct sock *sk, int optname, + struct group_source_req *greqs) +{ + struct ip_mreq_source mreqs; + struct sockaddr_in *psin; + int omode, add, err; + + if (greqs->gsr_group.ss_family != AF_INET || + greqs->gsr_source.ss_family != AF_INET) + return -EADDRNOTAVAIL; + + psin = (struct sockaddr_in *)&greqs->gsr_group; + mreqs.imr_multiaddr = psin->sin_addr.s_addr; + psin = (struct sockaddr_in *)&greqs->gsr_source; + mreqs.imr_sourceaddr = psin->sin_addr.s_addr; + mreqs.imr_interface = 0; /* use index for mc_source */ + + if (optname == MCAST_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == MCAST_UNBLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 0; + } else if (optname == MCAST_JOIN_SOURCE_GROUP) { + struct ip_mreqn mreq; + + psin = (struct sockaddr_in *)&greqs->gsr_group; + mreq.imr_multiaddr = psin->sin_addr; + mreq.imr_address.s_addr = 0; + mreq.imr_ifindex = greqs->gsr_interface; + err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); + if (err && err != -EADDRINUSE) + return err; + greqs->gsr_interface = mreq.imr_ifindex; + omode = MCAST_INCLUDE; + add = 1; + } else /* MCAST_LEAVE_SOURCE_GROUP */ { + omode = MCAST_INCLUDE; + add = 0; + } + return ip_mc_source(add, omode, sk, &mreqs, greqs->gsr_interface); +} + static int do_ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { @@ -743,15 +878,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, inet->cmsg_flags &= ~IP_CMSG_RECVFRAGSIZE; break; case IP_TOS: /* This sets both TOS and Precedence */ - if (sk->sk_type == SOCK_STREAM) { - val &= ~INET_ECN_MASK; - val |= inet->tos & INET_ECN_MASK; - } - if (inet->tos != val) { - inet->tos = val; - sk->sk_priority = rt_tos2priority(val); - sk_dst_reset(sk); - } + __ip_sock_set_tos(sk, val); break; case IP_TTL: if (optlen < 1) @@ -1029,9 +1156,6 @@ static int do_ip_setsockopt(struct sock *sk, int level, case MCAST_UNBLOCK_SOURCE: { struct group_source_req greqs; - struct ip_mreq_source mreqs; - struct sockaddr_in *psin; - int omode, add; if (optlen != sizeof(struct group_source_req)) goto e_inval; @@ -1039,50 +1163,12 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = -EFAULT; break; } - if (greqs.gsr_group.ss_family != AF_INET || - greqs.gsr_source.ss_family != AF_INET) { - err = -EADDRNOTAVAIL; - break; - } - psin = (struct sockaddr_in *)&greqs.gsr_group; - mreqs.imr_multiaddr = psin->sin_addr.s_addr; - psin = (struct sockaddr_in *)&greqs.gsr_source; - mreqs.imr_sourceaddr = psin->sin_addr.s_addr; - mreqs.imr_interface = 0; /* use index for mc_source */ - - if (optname == MCAST_BLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 1; - } else if (optname == MCAST_UNBLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 0; - } else if (optname == MCAST_JOIN_SOURCE_GROUP) { - struct ip_mreqn mreq; - - psin = (struct sockaddr_in *)&greqs.gsr_group; - mreq.imr_multiaddr = psin->sin_addr; - mreq.imr_address.s_addr = 0; - mreq.imr_ifindex = greqs.gsr_interface; - err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE); - if (err && err != -EADDRINUSE) - break; - greqs.gsr_interface = mreq.imr_ifindex; - omode = MCAST_INCLUDE; - add = 1; - } else /* MCAST_LEAVE_SOURCE_GROUP */ { - omode = MCAST_INCLUDE; - add = 0; - } - err = ip_mc_source(add, omode, sk, &mreqs, - greqs.gsr_interface); + err = do_mcast_group_source(sk, optname, &greqs); break; } case MCAST_MSFILTER: { - struct sockaddr_in *psin; - struct ip_msfilter *msf = NULL; struct group_filter *gsf = NULL; - int msize, i, ifindex; if (optlen < GROUP_FILTER_SIZE(0)) goto e_inval; @@ -1095,7 +1181,6 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = PTR_ERR(gsf); break; } - /* numsrc >= (4G-140)/128 overflow in 32 bits */ if (gsf->gf_numsrc >= 0x1ffffff || gsf->gf_numsrc > net->ipv4.sysctl_igmp_max_msf) { @@ -1106,36 +1191,10 @@ static int do_ip_setsockopt(struct sock *sk, int level, err = -EINVAL; goto mc_msf_out; } - msize = IP_MSFILTER_SIZE(gsf->gf_numsrc); - msf = kmalloc(msize, GFP_KERNEL); - if (!msf) { - err = -ENOBUFS; - goto mc_msf_out; - } - ifindex = gsf->gf_interface; - psin = (struct sockaddr_in *)&gsf->gf_group; - if (psin->sin_family != AF_INET) { - err = -EADDRNOTAVAIL; - goto mc_msf_out; - } - msf->imsf_multiaddr = psin->sin_addr.s_addr; - msf->imsf_interface = 0; - msf->imsf_fmode = gsf->gf_fmode; - msf->imsf_numsrc = gsf->gf_numsrc; - err = -EADDRNOTAVAIL; - for (i = 0; i < gsf->gf_numsrc; ++i) { - psin = (struct sockaddr_in *)&gsf->gf_slist[i]; - - if (psin->sin_family != AF_INET) - goto mc_msf_out; - msf->imsf_slist[i] = psin->sin_addr.s_addr; - } - kfree(gsf); - gsf = NULL; - - err = ip_mc_msfilter(sk, msf, ifindex); + err = set_mcast_msfilter(sk, gsf->gf_interface, + gsf->gf_numsrc, gsf->gf_fmode, + &gsf->gf_group, gsf->gf_slist); mc_msf_out: - kfree(msf); kfree(gsf); break; } @@ -1272,9 +1331,113 @@ int compat_ip_setsockopt(struct sock *sk, int level, int optname, if (level != SOL_IP) return -ENOPROTOOPT; - if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER) - return compat_mc_setsockopt(sk, level, optname, optval, optlen, - ip_setsockopt); + switch (optname) { + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + { + struct compat_group_req __user *gr32 = (void __user *)optval; + struct group_req greq; + struct sockaddr_in *psin = (struct sockaddr_in *)&greq.gr_group; + struct ip_mreqn mreq; + + if (optlen < sizeof(struct compat_group_req)) + return -EINVAL; + + if (get_user(greq.gr_interface, &gr32->gr_interface) || + copy_from_user(&greq.gr_group, &gr32->gr_group, + sizeof(greq.gr_group))) + return -EFAULT; + + if (psin->sin_family != AF_INET) + return -EINVAL; + + memset(&mreq, 0, sizeof(mreq)); + mreq.imr_multiaddr = psin->sin_addr; + mreq.imr_ifindex = greq.gr_interface; + + rtnl_lock(); + lock_sock(sk); + if (optname == MCAST_JOIN_GROUP) + err = ip_mc_join_group(sk, &mreq); + else + err = ip_mc_leave_group(sk, &mreq); + release_sock(sk); + rtnl_unlock(); + return err; + } + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + { + struct compat_group_source_req __user *gsr32 = (void __user *)optval; + struct group_source_req greqs; + + if (optlen != sizeof(struct compat_group_source_req)) + return -EINVAL; + + if (get_user(greqs.gsr_interface, &gsr32->gsr_interface) || + copy_from_user(&greqs.gsr_group, &gsr32->gsr_group, + sizeof(greqs.gsr_group)) || + copy_from_user(&greqs.gsr_source, &gsr32->gsr_source, + sizeof(greqs.gsr_source))) + return -EFAULT; + + rtnl_lock(); + lock_sock(sk); + err = do_mcast_group_source(sk, optname, &greqs); + release_sock(sk); + rtnl_unlock(); + return err; + } + case MCAST_MSFILTER: + { + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter *gf32; + unsigned int n; + void *p; + + if (optlen < size0) + return -EINVAL; + if (optlen > sysctl_optmem_max - 4) + return -ENOBUFS; + + p = kmalloc(optlen + 4, GFP_KERNEL); + if (!p) + return -ENOMEM; + gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ + if (copy_from_user(gf32, optval, optlen)) { + err = -EFAULT; + goto mc_msf_out; + } + + n = gf32->gf_numsrc; + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + if (n >= 0x1ffffff) { + err = -ENOBUFS; + goto mc_msf_out; + } + if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) { + err = -EINVAL; + goto mc_msf_out; + } + + rtnl_lock(); + lock_sock(sk); + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + if (n > sock_net(sk)->ipv4.sysctl_igmp_max_msf) + err = -ENOBUFS; + else + err = set_mcast_msfilter(sk, gf32->gf_interface, + n, gf32->gf_fmode, + &gf32->gf_group, gf32->gf_slist); + release_sock(sk); + rtnl_unlock(); +mc_msf_out: + kfree(p); + return err; + } + } err = do_ip_setsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER @@ -1465,19 +1628,28 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, } case MCAST_MSFILTER: { + struct group_filter __user *p = (void __user *)optval; struct group_filter gsf; + const int size0 = offsetof(struct group_filter, gf_slist); + int num; - if (len < GROUP_FILTER_SIZE(0)) { + if (len < size0) { err = -EINVAL; goto out; } - if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) { + if (copy_from_user(&gsf, p, size0)) { err = -EFAULT; goto out; } - err = ip_mc_gsfget(sk, &gsf, - (struct group_filter __user *)optval, - optlen); + num = gsf.gf_numsrc; + err = ip_mc_gsfget(sk, &gsf, p->gf_slist); + if (err) + goto out; + if (gsf.gf_numsrc < num) + num = gsf.gf_numsrc; + if (put_user(GROUP_FILTER_SIZE(num), optlen) || + copy_to_user(p, &gsf, size0)) + err = -EFAULT; goto out; } case IP_MULTICAST_ALL: @@ -1492,7 +1664,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; - msg.msg_control = (__force void *) optval; + msg.msg_control_is_user = true; + msg.msg_control_user = optval; msg.msg_controllen = len; msg.msg_flags = flags; @@ -1589,9 +1762,47 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname, { int err; - if (optname == MCAST_MSFILTER) - return compat_mc_getsockopt(sk, level, optname, optval, optlen, - ip_getsockopt); + if (optname == MCAST_MSFILTER) { + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter __user *p = (void __user *)optval; + struct compat_group_filter gf32; + struct group_filter gf; + int ulen, err; + int num; + + if (level != SOL_IP) + return -EOPNOTSUPP; + + if (get_user(ulen, optlen)) + return -EFAULT; + + if (ulen < size0) + return -EINVAL; + + if (copy_from_user(&gf32, p, size0)) + return -EFAULT; + + gf.gf_interface = gf32.gf_interface; + gf.gf_fmode = gf32.gf_fmode; + num = gf.gf_numsrc = gf32.gf_numsrc; + gf.gf_group = gf32.gf_group; + + rtnl_lock(); + lock_sock(sk); + err = ip_mc_gsfget(sk, &gf, p->gf_slist); + release_sock(sk); + rtnl_unlock(); + if (err) + return err; + if (gf.gf_numsrc < num) + num = gf.gf_numsrc; + ulen = GROUP_FILTER_SIZE(num) - (sizeof(gf) - sizeof(gf32)); + if (put_user(ulen, optlen) || + put_user(gf.gf_fmode, &p->gf_fmode) || + put_user(gf.gf_numsrc, &p->gf_numsrc)) + return -EFAULT; + return 0; + } err = do_ip_getsockopt(sk, level, optname, optval, optlen, MSG_CMSG_COMPAT); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index cd4b84310d92..f4f1d11eab50 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -860,7 +860,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, netdev_state_change(dev); } -int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { int err = 0; struct ip_tunnel *t = netdev_priv(dev); @@ -960,6 +960,20 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) done: return err; } +EXPORT_SYMBOL_GPL(ip_tunnel_ctl); + +int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct ip_tunnel_parm p; + int err; + + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd); + if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + return -EFAULT; + return err; +} EXPORT_SYMBOL_GPL(ip_tunnel_ioctl); int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 1dda7c155c48..1d9c8cff5ac3 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -399,38 +399,31 @@ static int vti4_err(struct sk_buff *skb, u32 info) } static int -vti_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { int err = 0; - struct ip_tunnel_parm p; - - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { - if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || - p.iph.ihl != 5) + if (p->iph.version != 4 || p->iph.protocol != IPPROTO_IPIP || + p->iph.ihl != 5) return -EINVAL; } - if (!(p.i_flags & GRE_KEY)) - p.i_key = 0; - if (!(p.o_flags & GRE_KEY)) - p.o_key = 0; + if (!(p->i_flags & GRE_KEY)) + p->i_key = 0; + if (!(p->o_flags & GRE_KEY)) + p->o_key = 0; - p.i_flags = VTI_ISVTI; + p->i_flags = VTI_ISVTI; - err = ip_tunnel_ioctl(dev, &p, cmd); + err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd != SIOCDELTUNNEL) { - p.i_flags |= GRE_KEY; - p.o_flags |= GRE_KEY; + p->i_flags |= GRE_KEY; + p->o_flags |= GRE_KEY; } - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - return -EFAULT; return 0; } @@ -438,10 +431,11 @@ static const struct net_device_ops vti_netdev_ops = { .ndo_init = vti_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = vti_tunnel_xmit, - .ndo_do_ioctl = vti_tunnel_ioctl, + .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = vti_tunnel_ctl, }; static void vti_tunnel_setup(struct net_device *dev) diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 678575adaf3b..40fea52c8277 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -327,41 +327,29 @@ static bool ipip_tunnel_ioctl_verify_protocol(u8 ipproto) } static int -ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { - int err = 0; - struct ip_tunnel_parm p; - - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - return -EFAULT; - if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { - if (p.iph.version != 4 || - !ipip_tunnel_ioctl_verify_protocol(p.iph.protocol) || - p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) + if (p->iph.version != 4 || + !ipip_tunnel_ioctl_verify_protocol(p->iph.protocol) || + p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF))) return -EINVAL; } - p.i_key = p.o_key = 0; - p.i_flags = p.o_flags = 0; - err = ip_tunnel_ioctl(dev, &p, cmd); - if (err) - return err; - - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) - return -EFAULT; - - return 0; + p->i_key = p->o_key = 0; + p->i_flags = p->o_flags = 0; + return ip_tunnel_ctl(dev, p, cmd); } static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = ipip_tunnel_xmit, - .ndo_do_ioctl = ipip_tunnel_ioctl, + .ndo_do_ioctl = ip_tunnel_ioctl, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = ipip_tunnel_ctl, }; #define IPIP_FEATURES (NETIF_F_SG | \ diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index b2363b82b48d..f5c7a58844a4 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -421,37 +421,6 @@ static void ipmr_free_table(struct mr_table *mrt) /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ -static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) -{ - struct net *net = dev_net(dev); - - dev_close(dev); - - dev = __dev_get_by_name(net, "tunl0"); - if (dev) { - const struct net_device_ops *ops = dev->netdev_ops; - struct ifreq ifr; - struct ip_tunnel_parm p; - - memset(&p, 0, sizeof(p)); - p.iph.daddr = v->vifc_rmt_addr.s_addr; - p.iph.saddr = v->vifc_lcl_addr.s_addr; - p.iph.version = 4; - p.iph.ihl = 5; - p.iph.protocol = IPPROTO_IPIP; - sprintf(p.name, "dvmrp%d", v->vifc_vifi); - ifr.ifr_ifru.ifru_data = (__force void __user *)&p; - - if (ops->ndo_do_ioctl) { - mm_segment_t oldfs = get_fs(); - - set_fs(KERNEL_DS); - ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL); - set_fs(oldfs); - } - } -} - /* Initialize ipmr pimreg/tunnel in_device */ static bool ipmr_init_vif_indev(const struct net_device *dev) { @@ -471,51 +440,52 @@ static bool ipmr_init_vif_indev(const struct net_device *dev) static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) { - struct net_device *dev; - - dev = __dev_get_by_name(net, "tunl0"); + struct net_device *tunnel_dev, *new_dev; + struct ip_tunnel_parm p = { }; + int err; - if (dev) { - const struct net_device_ops *ops = dev->netdev_ops; - int err; - struct ifreq ifr; - struct ip_tunnel_parm p; + tunnel_dev = __dev_get_by_name(net, "tunl0"); + if (!tunnel_dev) + goto out; - memset(&p, 0, sizeof(p)); - p.iph.daddr = v->vifc_rmt_addr.s_addr; - p.iph.saddr = v->vifc_lcl_addr.s_addr; - p.iph.version = 4; - p.iph.ihl = 5; - p.iph.protocol = IPPROTO_IPIP; - sprintf(p.name, "dvmrp%d", v->vifc_vifi); - ifr.ifr_ifru.ifru_data = (__force void __user *)&p; + p.iph.daddr = v->vifc_rmt_addr.s_addr; + p.iph.saddr = v->vifc_lcl_addr.s_addr; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPIP; + sprintf(p.name, "dvmrp%d", v->vifc_vifi); - if (ops->ndo_do_ioctl) { - mm_segment_t oldfs = get_fs(); + if (!tunnel_dev->netdev_ops->ndo_tunnel_ctl) + goto out; + err = tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p, + SIOCADDTUNNEL); + if (err) + goto out; - set_fs(KERNEL_DS); - err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); - set_fs(oldfs); - } else { - err = -EOPNOTSUPP; - } - dev = NULL; - - if (err == 0 && - (dev = __dev_get_by_name(net, p.name)) != NULL) { - dev->flags |= IFF_MULTICAST; - if (!ipmr_init_vif_indev(dev)) - goto failure; - if (dev_open(dev, NULL)) - goto failure; - dev_hold(dev); - } - } - return dev; + new_dev = __dev_get_by_name(net, p.name); + if (!new_dev) + goto out; -failure: - unregister_netdevice(dev); - return NULL; + new_dev->flags |= IFF_MULTICAST; + if (!ipmr_init_vif_indev(new_dev)) + goto out_unregister; + if (dev_open(new_dev, NULL)) + goto out_unregister; + dev_hold(new_dev); + err = dev_set_allmulti(new_dev, 1); + if (err) { + dev_close(new_dev); + tunnel_dev->netdev_ops->ndo_tunnel_ctl(tunnel_dev, &p, + SIOCDELTUNNEL); + dev_put(new_dev); + new_dev = ERR_PTR(err); + } + return new_dev; + +out_unregister: + unregister_netdevice(new_dev); +out: + return ERR_PTR(-ENOBUFS); } #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) @@ -867,14 +837,8 @@ static int vif_add(struct net *net, struct mr_table *mrt, break; case VIFF_TUNNEL: dev = ipmr_new_tunnel(net, vifc); - if (!dev) - return -ENOBUFS; - err = dev_set_allmulti(dev, 1); - if (err) { - ipmr_del_tunnel(dev, vifc); - dev_put(dev); - return err; - } + if (IS_ERR(dev)) + return PTR_ERR(dev); break; case VIFF_USE_IFINDEX: case 0: diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 563f71bcb2d7..400a9f89ebdb 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -33,8 +33,20 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = { [NHA_ENCAP] = { .type = NLA_NESTED }, [NHA_GROUPS] = { .type = NLA_FLAG }, [NHA_MASTER] = { .type = NLA_U32 }, + [NHA_FDB] = { .type = NLA_FLAG }, }; +static int call_nexthop_notifiers(struct net *net, + enum nexthop_event_type event_type, + struct nexthop *nh) +{ + int err; + + err = atomic_notifier_call_chain(&net->nexthop.notifier_chain, + event_type, nh); + return notifier_to_errno(err); +} + static unsigned int nh_dev_hashfn(unsigned int val) { unsigned int mask = NH_DEV_HASHSIZE - 1; @@ -114,6 +126,7 @@ static struct nexthop *nexthop_alloc(void) INIT_LIST_HEAD(&nh->fi_list); INIT_LIST_HEAD(&nh->f6i_list); INIT_LIST_HEAD(&nh->grp_list); + INIT_LIST_HEAD(&nh->fdb_list); } return nh; } @@ -234,6 +247,9 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; + if (nh->is_fdb_nh && nla_put_flag(skb, NHA_FDB)) + goto nla_put_failure; + if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); @@ -248,7 +264,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, if (nla_put_flag(skb, NHA_BLACKHOLE)) goto nla_put_failure; goto out; - } else { + } else if (!nh->is_fdb_nh) { const struct net_device *dev; dev = nhi->fib_nhc.nhc_dev; @@ -395,12 +411,35 @@ static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, return true; } +static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, + struct netlink_ext_ack *extack) +{ + struct nh_info *nhi; + + if (!nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops"); + return -EINVAL; + } + + nhi = rtnl_dereference(nh->nh_info); + if (*nh_family == AF_UNSPEC) { + *nh_family = nhi->family; + } else if (*nh_family != nhi->family) { + NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops"); + return -EINVAL; + } + + return 0; +} + static int nh_check_attr_group(struct net *net, struct nlattr *tb[], struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); + u8 nh_family = AF_UNSPEC; struct nexthop_grp *nhg; unsigned int i, j; + u8 nhg_fdb = 0; if (len & (sizeof(struct nexthop_grp) - 1)) { NL_SET_ERR_MSG(extack, @@ -429,6 +468,8 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[], } } + if (tb[NHA_FDB]) + nhg_fdb = 1; nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { struct nexthop *nh; @@ -440,11 +481,20 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[], } if (!valid_group_nh(nh, len, extack)) return -EINVAL; + + if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack)) + return -EINVAL; + + if (!nhg_fdb && nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops"); + return -EINVAL; + } } for (i = NHA_GROUP_TYPE + 1; i < __NHA_MAX; ++i) { if (!tb[i]) continue; - + if (tb[NHA_FDB]) + continue; NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); return -EINVAL; @@ -503,6 +553,9 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) if (hash > atomic_read(&nhge->upper_bound)) continue; + if (nhge->nh->is_fdb_nh) + return nhge->nh; + /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ @@ -572,6 +625,11 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, { struct nh_info *nhi; + if (nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); + return -EINVAL; + } + /* fib6_src is unique to a fib6_info and limits the ability to cache * routes in fib6_nh within a nexthop that is potentially shared * across multiple fib entries. If the config wants to use source @@ -648,6 +706,12 @@ int fib_check_nexthop(struct nexthop *nh, u8 scope, { int err = 0; + if (nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); + err = -EINVAL; + goto out; + } + if (nh->is_group) { struct nh_group *nhg; @@ -787,6 +851,8 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) bool do_flush = false; struct fib_info *fi; + call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh); + list_for_each_entry(fi, &nh->fi_list, nh_list) { fi->fib_flags |= RTNH_F_DEAD; do_flush = true; @@ -798,7 +864,8 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); - ipv6_stub->ip6_del_rt(net, f6i); + ipv6_stub->ip6_del_rt(net, f6i, + !net->ipv4.sysctl_nexthop_compat_mode); } } @@ -1055,7 +1122,7 @@ out: if (!rc) { nh_base_seq_inc(net); nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo); - if (replace_notify) + if (replace_notify && net->ipv4.sysctl_nexthop_compat_mode) nexthop_replace_notify(net, new_nh, &cfg->nlinfo); } @@ -1118,10 +1185,10 @@ static struct nexthop *nexthop_create_group(struct net *net, /* spare group used for removals */ nhg->spare = nexthop_grp_alloc(num_nh); - if (!nhg) { + if (!nhg->spare) { kfree(nhg); kfree(nh); - return NULL; + return ERR_PTR(-ENOMEM); } nhg->spare->spare = nhg; @@ -1148,6 +1215,9 @@ static struct nexthop *nexthop_create_group(struct net *net, nh_group_rebalance(nhg); } + if (cfg->nh_fdb) + nh->is_fdb_nh = 1; + rcu_assign_pointer(nh->nh_grp, nhg); return nh; @@ -1176,7 +1246,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, }; - u32 tb_id = l3mdev_fib_table(cfg->dev); + u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN); int err; err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); @@ -1185,6 +1255,9 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, goto out; } + if (nh->is_fdb_nh) + goto out; + /* sets nh_dev if successful */ err = fib_check_nh(net, fib_nh, tb_id, 0, extack); if (!err) { @@ -1210,6 +1283,7 @@ static int nh_create_ipv6(struct net *net, struct nexthop *nh, .fc_flags = cfg->nh_flags, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, + .fc_is_fdb = cfg->nh_fdb, }; int err; @@ -1251,6 +1325,9 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, nhi->family = cfg->nh_family; nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; + if (cfg->nh_fdb) + nh->is_fdb_nh = 1; + if (cfg->nh_blackhole) { nhi->reject_nh = 1; cfg->nh_ifindex = net->loopback_dev->ifindex; @@ -1272,7 +1349,8 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, } /* add the entry to the device based hash */ - nexthop_devhash_add(net, nhi); + if (!nh->is_fdb_nh) + nexthop_devhash_add(net, nhi); rcu_assign_pointer(nh->nh_info, nhi); @@ -1376,6 +1454,19 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, if (tb[NHA_ID]) cfg->nh_id = nla_get_u32(tb[NHA_ID]); + if (tb[NHA_FDB]) { + if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] || + tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { + NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole"); + goto out; + } + if (nhm->nh_flags) { + NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header"); + goto out; + } + cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]); + } + if (tb[NHA_GROUP]) { if (nhm->nh_family != AF_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid family for group"); @@ -1399,8 +1490,8 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, if (tb[NHA_BLACKHOLE]) { if (tb[NHA_GATEWAY] || tb[NHA_OIF] || - tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { - NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif"); + tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) { + NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb"); goto out; } @@ -1409,26 +1500,28 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, goto out; } - if (!tb[NHA_OIF]) { - NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops"); + if (!cfg->nh_fdb && !tb[NHA_OIF]) { + NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops"); goto out; } - cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); - if (cfg->nh_ifindex) - cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); + if (!cfg->nh_fdb && tb[NHA_OIF]) { + cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); + if (cfg->nh_ifindex) + cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); - if (!cfg->dev) { - NL_SET_ERR_MSG(extack, "Invalid device index"); - goto out; - } else if (!(cfg->dev->flags & IFF_UP)) { - NL_SET_ERR_MSG(extack, "Nexthop device is not up"); - err = -ENETDOWN; - goto out; - } else if (!netif_carrier_ok(cfg->dev)) { - NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); - err = -ENETDOWN; - goto out; + if (!cfg->dev) { + NL_SET_ERR_MSG(extack, "Invalid device index"); + goto out; + } else if (!(cfg->dev->flags & IFF_UP)) { + NL_SET_ERR_MSG(extack, "Nexthop device is not up"); + err = -ENETDOWN; + goto out; + } else if (!netif_carrier_ok(cfg->dev)) { + NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); + err = -ENETDOWN; + goto out; + } } err = -EINVAL; @@ -1657,7 +1750,7 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx, static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, int *master_idx, bool *group_filter, - struct netlink_callback *cb) + bool *fdb_filter, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; struct nlattr *tb[NHA_MAX + 1]; @@ -1694,6 +1787,9 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, case NHA_GROUPS: *group_filter = true; break; + case NHA_FDB: + *fdb_filter = true; + break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request"); return -EINVAL; @@ -1712,17 +1808,17 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx, /* rtnl */ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) { + bool group_filter = false, fdb_filter = false; struct nhmsg *nhm = nlmsg_data(cb->nlh); int dev_filter_idx = 0, master_idx = 0; struct net *net = sock_net(skb->sk); struct rb_root *root = &net->nexthop.rb_root; - bool group_filter = false; struct rb_node *node; int idx = 0, s_idx; int err; err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx, - &group_filter, cb); + &group_filter, &fdb_filter, cb); if (err < 0) return err; @@ -1807,6 +1903,19 @@ static struct notifier_block nh_netdev_notifier = { .notifier_call = nh_netdev_event, }; +int register_nexthop_notifier(struct net *net, struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&net->nexthop.notifier_chain, nb); +} +EXPORT_SYMBOL(register_nexthop_notifier); + +int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&net->nexthop.notifier_chain, + nb); +} +EXPORT_SYMBOL(unregister_nexthop_notifier); + static void __net_exit nexthop_net_exit(struct net *net) { rtnl_lock(); @@ -1823,6 +1932,7 @@ static int __net_init nexthop_net_init(struct net *net) net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); if (!net->nexthop.devhash) return -ENOMEM; + ATOMIC_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); return 0; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b73f540fa19b..1d7076b78e63 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3334,8 +3334,7 @@ static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)__ctl->extra1; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 81b267e990a1..5653e3b011bf 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -71,8 +71,7 @@ static void set_local_port_range(struct net *net, int range[2]) /* Validate changes from /proc interface. */ static int ipv4_local_port_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.ip_local_ports.range); @@ -107,7 +106,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, /* Validate changes from /proc interface. */ static int ipv4_privileged_ports(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_ip_prot_sock); @@ -168,8 +167,7 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig /* Validate changes from /proc interface. */ static int ipv4_ping_group_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct user_namespace *user_ns = current_user_ns(); int ret; @@ -204,8 +202,7 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, } static int ipv4_fwd_update_priority(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; @@ -221,7 +218,7 @@ static int ipv4_fwd_update_priority(struct ctl_table *table, int write, } static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(ctl->data, struct net, ipv4.tcp_congestion_control); @@ -241,9 +238,8 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, } static int proc_tcp_available_congestion_control(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, }; int ret; @@ -258,9 +254,8 @@ static int proc_tcp_available_congestion_control(struct ctl_table *ctl, } static int proc_allowed_congestion_control(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX }; int ret; @@ -296,8 +291,7 @@ static int sscanf_key(char *buf, __le32 *key) } static int proc_tcp_fastopen_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_fastopen); @@ -399,7 +393,7 @@ static void proc_configure_early_demux(int enabled, int protocol) } static int proc_tcp_early_demux(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -415,7 +409,7 @@ static int proc_tcp_early_demux(struct ctl_table *table, int write, } static int proc_udp_early_demux(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -431,8 +425,7 @@ static int proc_udp_early_demux(struct ctl_table *table, int write, } static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, - int write, - void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, @@ -447,8 +440,7 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, } static int proc_tcp_available_ulp(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, + int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, }; @@ -466,7 +458,7 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl, #ifdef CONFIG_IP_ROUTE_MULTIPATH static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, @@ -711,6 +703,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_tcp_early_demux }, { + .procname = "nexthop_compat_mode", + .data = &init_net.ipv4.sysctl_nexthop_compat_mode, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { .procname = "ip_default_ttl", .data = &init_net.ipv4.sysctl_ip_default_ttl, .maxlen = sizeof(int), @@ -1321,6 +1322,13 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_doulongvec_minmax, }, { + .procname = "tcp_comp_sack_slack_ns", + .data = &init_net.ipv4.sysctl_tcp_comp_sack_slack_ns, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { .procname = "tcp_comp_sack_nr", .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index dd401757eea1..15d47d5e7951 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2801,6 +2801,163 @@ static void tcp_enable_tx_delay(void) } } +/* When set indicates to always queue non-full frames. Later the user clears + * this option and we transmit any pending partial frames in the queue. This is + * meant to be used alongside sendfile() to get properly filled frames when the + * user (for example) must write out headers with a write() call first and then + * use sendfile to send out the data parts. + * + * TCP_CORK can be set together with TCP_NODELAY and it is stronger than + * TCP_NODELAY. + */ +static void __tcp_sock_set_cork(struct sock *sk, bool on) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (on) { + tp->nonagle |= TCP_NAGLE_CORK; + } else { + tp->nonagle &= ~TCP_NAGLE_CORK; + if (tp->nonagle & TCP_NAGLE_OFF) + tp->nonagle |= TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk); + } +} + +void tcp_sock_set_cork(struct sock *sk, bool on) +{ + lock_sock(sk); + __tcp_sock_set_cork(sk, on); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_cork); + +/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is + * remembered, but it is not activated until cork is cleared. + * + * However, when TCP_NODELAY is set we make an explicit push, which overrides + * even TCP_CORK for currently queued segments. + */ +static void __tcp_sock_set_nodelay(struct sock *sk, bool on) +{ + if (on) { + tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk); + } else { + tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF; + } +} + +void tcp_sock_set_nodelay(struct sock *sk) +{ + lock_sock(sk); + __tcp_sock_set_nodelay(sk, true); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_nodelay); + +static void __tcp_sock_set_quickack(struct sock *sk, int val) +{ + if (!val) { + inet_csk_enter_pingpong_mode(sk); + return; + } + + inet_csk_exit_pingpong_mode(sk); + if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && + inet_csk_ack_scheduled(sk)) { + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED; + tcp_cleanup_rbuf(sk, 1); + if (!(val & 1)) + inet_csk_enter_pingpong_mode(sk); + } +} + +void tcp_sock_set_quickack(struct sock *sk, int val) +{ + lock_sock(sk); + __tcp_sock_set_quickack(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_quickack); + +int tcp_sock_set_syncnt(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_SYNCNT) + return -EINVAL; + + lock_sock(sk); + inet_csk(sk)->icsk_syn_retries = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_syncnt); + +void tcp_sock_set_user_timeout(struct sock *sk, u32 val) +{ + lock_sock(sk); + inet_csk(sk)->icsk_user_timeout = val; + release_sock(sk); +} +EXPORT_SYMBOL(tcp_sock_set_user_timeout); + +static int __tcp_sock_set_keepidle(struct sock *sk, int val) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (val < 1 || val > MAX_TCP_KEEPIDLE) + return -EINVAL; + + tp->keepalive_time = val * HZ; + if (sock_flag(sk, SOCK_KEEPOPEN) && + !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { + u32 elapsed = keepalive_time_elapsed(tp); + + if (tp->keepalive_time > elapsed) + elapsed = tp->keepalive_time - elapsed; + else + elapsed = 0; + inet_csk_reset_keepalive_timer(sk, elapsed); + } + + return 0; +} + +int tcp_sock_set_keepidle(struct sock *sk, int val) +{ + int err; + + lock_sock(sk); + err = __tcp_sock_set_keepidle(sk, val); + release_sock(sk); + return err; +} +EXPORT_SYMBOL(tcp_sock_set_keepidle); + +int tcp_sock_set_keepintvl(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_KEEPINTVL) + return -EINVAL; + + lock_sock(sk); + tcp_sk(sk)->keepalive_intvl = val * HZ; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_keepintvl); + +int tcp_sock_set_keepcnt(struct sock *sk, int val) +{ + if (val < 1 || val > MAX_TCP_KEEPCNT) + return -EINVAL; + + lock_sock(sk); + tcp_sk(sk)->keepalive_probes = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(tcp_sock_set_keepcnt); + /* * Socket option code for TCP. */ @@ -2898,20 +3055,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_NODELAY: - if (val) { - /* TCP_NODELAY is weaker than TCP_CORK, so that - * this option on corked socket is remembered, but - * it is not activated until cork is cleared. - * - * However, when TCP_NODELAY is set we make - * an explicit push, which overrides even TCP_CORK - * for currently queued segments. - */ - tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk); - } else { - tp->nonagle &= ~TCP_NAGLE_OFF; - } + __tcp_sock_set_nodelay(sk, val); break; case TCP_THIN_LINEAR_TIMEOUTS: @@ -2979,43 +3123,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_CORK: - /* When set indicates to always queue non-full frames. - * Later the user clears this option and we transmit - * any pending partial frames in the queue. This is - * meant to be used alongside sendfile() to get properly - * filled frames when the user (for example) must write - * out headers with a write() call first and then use - * sendfile to send out the data parts. - * - * TCP_CORK can be set together with TCP_NODELAY and it is - * stronger than TCP_NODELAY. - */ - if (val) { - tp->nonagle |= TCP_NAGLE_CORK; - } else { - tp->nonagle &= ~TCP_NAGLE_CORK; - if (tp->nonagle&TCP_NAGLE_OFF) - tp->nonagle |= TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk); - } + __tcp_sock_set_cork(sk, val); break; case TCP_KEEPIDLE: - if (val < 1 || val > MAX_TCP_KEEPIDLE) - err = -EINVAL; - else { - tp->keepalive_time = val * HZ; - if (sock_flag(sk, SOCK_KEEPOPEN) && - !((1 << sk->sk_state) & - (TCPF_CLOSE | TCPF_LISTEN))) { - u32 elapsed = keepalive_time_elapsed(tp); - if (tp->keepalive_time > elapsed) - elapsed = tp->keepalive_time - elapsed; - else - elapsed = 0; - inet_csk_reset_keepalive_timer(sk, elapsed); - } - } + err = __tcp_sock_set_keepidle(sk, val); break; case TCP_KEEPINTVL: if (val < 1 || val > MAX_TCP_KEEPINTVL) @@ -3046,8 +3158,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, case TCP_LINGER2: if (val < 0) tp->linger2 = -1; - else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ) - tp->linger2 = 0; + else if (val > TCP_FIN_TIMEOUT_MAX / HZ) + tp->linger2 = TCP_FIN_TIMEOUT_MAX; else tp->linger2 = val * HZ; break; @@ -3072,19 +3184,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_QUICKACK: - if (!val) { - inet_csk_enter_pingpong_mode(sk); - } else { - inet_csk_exit_pingpong_mode(sk); - if ((1 << sk->sk_state) & - (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && - inet_csk_ack_scheduled(sk)) { - icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; - tcp_cleanup_rbuf(sk, 1); - if (!(val & 1)) - inet_csk_enter_pingpong_mode(sk); - } - } + __tcp_sock_set_quickack(sk, val); break; #ifdef CONFIG_TCP_MD5SIG diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 29c6fc8c7716..83330a6cb242 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -437,7 +437,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* 3. Try to fixup all. It is made immediately after connection enters * established state. */ -void tcp_init_buffer_space(struct sock *sk) +static void tcp_init_buffer_space(struct sock *sk) { int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win; struct tcp_sock *tp = tcp_sk(sk); @@ -2183,8 +2183,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) } /* Detect loss in event "A" above by marking head of queue up as lost. - * For non-SACK(Reno) senders, the first "packets" number of segments - * are considered lost. For RFC3517 SACK, a segment is considered lost if it + * For RFC3517 SACK, a segment is considered lost if it * has at least tp->reordering SACKed seqments above it; "packets" refers to * the maximum SACKed segments to pass before reaching this limit. */ @@ -2192,10 +2191,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int cnt, oldcnt, lost; - unsigned int mss; + int cnt; /* Use SACK to deduce losses of new sequences sent during recovery */ - const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; + const u32 loss_high = tp->snd_nxt; WARN_ON(packets > tp->packets_out); skb = tp->lost_skb_hint; @@ -2218,26 +2216,11 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) break; - oldcnt = cnt; - if (tcp_is_reno(tp) || - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) cnt += tcp_skb_pcount(skb); - if (cnt > packets) { - if (tcp_is_sack(tp) || - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || - (oldcnt >= packets)) - break; - - mss = tcp_skb_mss(skb); - /* If needed, chop off the prefix to mark as lost. */ - lost = (packets - oldcnt) * mss; - if (lost < skb->len && - tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, - lost, mss, GFP_ATOMIC) < 0) - break; - cnt = packets; - } + if (cnt > packets) + break; tcp_skb_mark_lost(tp, skb); @@ -2849,8 +2832,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, if (tcp_try_undo_partial(sk, prior_snd_una)) return; /* Partial ACK arrived. Force fast retransmit. */ - do_lost = tcp_is_reno(tp) || - tcp_force_fast_retransmit(sk); + do_lost = tcp_force_fast_retransmit(sk); } if (tcp_try_undo_dsack(sk)) { tcp_try_keep_open(sk); @@ -3014,7 +2996,7 @@ void tcp_rearm_rto(struct sock *sk) rto = usecs_to_jiffies(max_t(int, delta_us, 1)); } tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, - TCP_RTO_MAX, tcp_rtx_queue_head(sk)); + TCP_RTO_MAX); } } @@ -3291,7 +3273,7 @@ static void tcp_ack_probe(struct sock *sk) unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX); tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - when, TCP_RTO_MAX, NULL); + when, TCP_RTO_MAX); } } @@ -4323,6 +4305,33 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) } } +static void tcp_sack_compress_send_ack(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->compressed_ack) + return; + + if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) + __sock_put(sk); + + /* Since we have to send one ack finally, + * substract one from tp->compressed_ack to keep + * LINUX_MIB_TCPACKCOMPRESSED accurate. + */ + NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, + tp->compressed_ack - 1); + + tp->compressed_ack = 0; + tcp_send_ack(sk); +} + +/* Reasonable amount of sack blocks included in TCP SACK option + * The max is 4, but this becomes 3 if TCP timestamps are there. + * Given that SACK packets might be lost, be conservative and use 2. + */ +#define TCP_SACK_BLOCKS_EXPECTED 2 + static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) { struct tcp_sock *tp = tcp_sk(sk); @@ -4335,6 +4344,8 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) for (this_sack = 0; this_sack < cur_sacks; this_sack++, sp++) { if (tcp_sack_extend(sp, seq, end_seq)) { + if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) + tcp_sack_compress_send_ack(sk); /* Rotate this_sack to the first one. */ for (; this_sack > 0; this_sack--, sp--) swap(*sp, *(sp - 1)); @@ -4344,6 +4355,9 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) } } + if (this_sack >= TCP_SACK_BLOCKS_EXPECTED) + tcp_sack_compress_send_ack(sk); + /* Could not find an adjacent existing SACK, build a new one, * put it at the front, and shift everyone else down. We * always know there is at least one SACK present already here. @@ -4351,8 +4365,6 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) * If the sack array is full, forget about the last one. */ if (this_sack >= TCP_NUM_SACKS) { - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) - tcp_send_ack(sk); this_sack--; tp->rx_opt.num_sacks--; sp--; @@ -5272,15 +5284,13 @@ send_now: if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { tp->compressed_ack_rcv_nxt = tp->rcv_nxt; - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) - NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, - tp->compressed_ack - TCP_FASTRETRANS_THRESH); - tp->compressed_ack = 0; + tp->dup_ack_counter = 0; } - - if (++tp->compressed_ack <= TCP_FASTRETRANS_THRESH) + if (tp->dup_ack_counter < TCP_FASTRETRANS_THRESH) { + tp->dup_ack_counter++; goto send_now; - + } + tp->compressed_ack++; if (hrtimer_is_queued(&tp->compressed_ack_timer)) return; @@ -5293,8 +5303,9 @@ send_now: delay = min_t(unsigned long, sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns, rtt * (NSEC_PER_USEC >> 3)/20); sock_hold(sk); - hrtimer_start(&tp->compressed_ack_timer, ns_to_ktime(delay), - HRTIMER_MODE_REL_PINNED_SOFT); + hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay), + sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns, + HRTIMER_MODE_REL_PINNED_SOFT); } static inline void tcp_ack_snd_check(struct sock *sk) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 83a5d24e13b8..ad6435ba6d72 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -403,6 +403,46 @@ void tcp_req_err(struct sock *sk, u32 seq, bool abort) } EXPORT_SYMBOL(tcp_req_err); +/* TCP-LD (RFC 6069) logic */ +void tcp_ld_RTO_revert(struct sock *sk, u32 seq) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + s32 remaining; + u32 delta_us; + + if (sock_owned_by_user(sk)) + return; + + if (seq != tp->snd_una || !icsk->icsk_retransmits || + !icsk->icsk_backoff) + return; + + skb = tcp_rtx_queue_head(sk); + if (WARN_ON_ONCE(!skb)) + return; + + icsk->icsk_backoff--; + icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT; + icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + + tcp_mstamp_refresh(tp); + delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); + remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us); + + if (remaining > 0) { + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + remaining, TCP_RTO_MAX); + } else { + /* RTO revert clocked out retransmission. + * Will retransmit now. + */ + tcp_retransmit_timer(sk); + } +} +EXPORT_SYMBOL(tcp_ld_RTO_revert); + /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -419,27 +459,23 @@ EXPORT_SYMBOL(tcp_req_err); * */ -int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) +int tcp_v4_err(struct sk_buff *skb, u32 info) { - const struct iphdr *iph = (const struct iphdr *)icmp_skb->data; - struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); - struct inet_connection_sock *icsk; + const struct iphdr *iph = (const struct iphdr *)skb->data; + struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); struct tcp_sock *tp; struct inet_sock *inet; - const int type = icmp_hdr(icmp_skb)->type; - const int code = icmp_hdr(icmp_skb)->code; + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; struct sock *sk; - struct sk_buff *skb; struct request_sock *fastopen; u32 seq, snd_una; - s32 remaining; - u32 delta_us; int err; - struct net *net = dev_net(icmp_skb->dev); + struct net *net = dev_net(skb->dev); sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, ntohs(th->source), - inet_iif(icmp_skb), 0); + inet_iif(skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; @@ -476,7 +512,6 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) goto out; } - icsk = inet_csk(sk); tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = rcu_dereference(tp->fastopen_rsk); @@ -490,7 +525,7 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) switch (type) { case ICMP_REDIRECT: if (!sock_owned_by_user(sk)) - do_redirect(icmp_skb, sk); + do_redirect(skb, sk); goto out; case ICMP_SOURCE_QUENCH: /* Just silently ignore these. */ @@ -521,41 +556,12 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } err = icmp_err_convert[code].errno; - /* check if icmp_skb allows revert of backoff - * (see draft-zimmermann-tcp-lcd) */ - if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) - break; - if (seq != tp->snd_una || !icsk->icsk_retransmits || - !icsk->icsk_backoff || fastopen) - break; - - if (sock_owned_by_user(sk)) - break; - - skb = tcp_rtx_queue_head(sk); - if (WARN_ON_ONCE(!skb)) - break; - - icsk->icsk_backoff--; - icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : - TCP_TIMEOUT_INIT; - icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); - - - tcp_mstamp_refresh(tp); - delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb)); - remaining = icsk->icsk_rto - - usecs_to_jiffies(delta_us); - - if (remaining > 0) { - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - remaining, TCP_RTO_MAX); - } else { - /* RTO revert clocked out retransmission. - * Will retransmit now */ - tcp_retransmit_timer(sk); - } - + /* check if this ICMP message allows revert of backoff. + * (see RFC 6069) + */ + if (!fastopen && + (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH)) + tcp_ld_RTO_revert(sk, seq); break; case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; @@ -573,6 +579,8 @@ int tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (fastopen && !fastopen->sk) break; + ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th); + if (!sock_owned_by_user(sk)) { sk->sk_err = err; @@ -2780,6 +2788,7 @@ static int __net_init tcp_sk_init(struct net *net) sizeof(init_net.ipv4.sysctl_tcp_wmem)); } net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; + net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; net->ipv4.sysctl_tcp_comp_sack_nr = 44; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7e40322cc5ec..495dda2449fe 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -774,7 +774,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!child) goto listen_overflow; - if (own_req && sk_is_mptcp(child) && mptcp_sk_is_subflow(child)) { + if (own_req && rsk_drop_req(req)) { reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req); inet_csk_reqsk_queue_drop_and_put(sk, req); return child; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2f45cde168c4..a50e1990a845 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -184,10 +184,10 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, { struct tcp_sock *tp = tcp_sk(sk); - if (unlikely(tp->compressed_ack > TCP_FASTRETRANS_THRESH)) { + if (unlikely(tp->compressed_ack)) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, - tp->compressed_ack - TCP_FASTRETRANS_THRESH); - tp->compressed_ack = TCP_FASTRETRANS_THRESH; + tp->compressed_ack); + tp->compressed_ack = 0; if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1) __sock_put(sk); } @@ -2593,8 +2593,7 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto) if (rto_delta_us > 0) timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); - tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, - TCP_RTO_MAX, NULL); + tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX); return true; } @@ -2772,8 +2771,12 @@ u32 __tcp_select_window(struct sock *sk) int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); int allowed_space = tcp_full_space(sk); - int full_space = min_t(int, tp->window_clamp, allowed_space); - int window; + int full_space, window; + + if (sk_is_mptcp(sk)) + mptcp_space(sk, &free_space, &allowed_space); + + full_space = min_t(int, tp->window_clamp, allowed_space); if (unlikely(mss > full_space)) { mss = full_space; @@ -3109,6 +3112,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb, *rtx_head, *hole = NULL; struct tcp_sock *tp = tcp_sk(sk); + bool rearm_timer = false; u32 max_segs; int mib_idx; @@ -3131,7 +3135,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) segs = tp->snd_cwnd - tcp_packets_in_flight(tp); if (segs <= 0) - return; + break; sacked = TCP_SKB_CB(skb)->sacked; /* In case tcp_shift_skb_data() have aggregated large skbs, * we need to make sure not sending too bigs TSO packets @@ -3156,10 +3160,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) continue; if (tcp_small_queue_check(sk, skb, 1)) - return; + break; if (tcp_retransmit_skb(sk, skb, segs)) - return; + break; NET_ADD_STATS(sock_net(sk), mib_idx, tcp_skb_pcount(skb)); @@ -3168,11 +3172,13 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (skb == rtx_head && icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT) - tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX, - skb); + rearm_timer = true; + } + if (rearm_timer) + tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); } /* We allow to exceed memory limits for FIN packets to expedite @@ -3903,7 +3909,7 @@ void tcp_send_probe0(struct sock *sk) */ timeout = TCP_RESOURCE_PROBE_INTERVAL; } - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX, NULL); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c3f26dcd6704..ada046f425d2 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -753,8 +753,14 @@ static enum hrtimer_restart tcp_compressed_ack_kick(struct hrtimer *timer) bh_lock_sock(sk); if (!sock_owned_by_user(sk)) { - if (tp->compressed_ack > TCP_FASTRETRANS_THRESH) + if (tp->compressed_ack) { + /* Since we have to send one ack finally, + * substract one from tp->compressed_ack to keep + * LINUX_MIB_TCPACKCOMPRESSED accurate. + */ + tp->compressed_ack--; tcp_send_ack(sk); + } } else { if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &sk->sk_tsq_flags)) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 32564b350823..1b7ebbcae497 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -112,6 +112,9 @@ #include <net/sock_reuseport.h> #include <net/addrconf.h> #include <net/udp_tunnel.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ipv6_stubs.h> +#endif struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); @@ -2563,7 +2566,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, #ifdef CONFIG_XFRM case UDP_ENCAP_ESPINUDP: case UDP_ENCAP_ESPINUDP_NON_IKE: - up->encap_rcv = xfrm4_udp_encap_rcv; +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + up->encap_rcv = ipv6_stub->xfrm6_udp_encap_rcv; + else +#endif + up->encap_rcv = xfrm4_udp_encap_rcv; #endif fallthrough; case UDP_ENCAP_L2TPINUDP: diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 150e6f0fdbf5..3eecba0874aa 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -22,9 +22,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, goto error; if (cfg->bind_ifindex) { - err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX, - (void *)&cfg->bind_ifindex, - sizeof(cfg->bind_ifindex)); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index f8de2482a529..ad2afeef4f10 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -18,11 +18,6 @@ #include <net/ip.h> #include <net/xfrm.h> -int xfrm4_extract_input(struct xfrm_state *x, struct sk_buff *skb) -{ - return xfrm4_extract_header(skb); -} - static int xfrm4_rcv_encap_finish2(struct net *net, struct sock *sk, struct sk_buff *skb) { diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 30ddb9dc9398..3cff51ba72bb 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -14,77 +14,18 @@ #include <net/xfrm.h> #include <net/icmp.h> -static int xfrm4_tunnel_check_size(struct sk_buff *skb) -{ - int mtu, ret = 0; - - if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) - goto out; - - if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df) - goto out; - - mtu = dst_mtu(skb_dst(skb)); - if ((!skb_is_gso(skb) && skb->len > mtu) || - (skb_is_gso(skb) && - !skb_gso_validate_network_len(skb, ip_skb_dst_mtu(skb->sk, skb)))) { - skb->protocol = htons(ETH_P_IP); - - if (skb->sk) - xfrm_local_error(skb, mtu); - else - icmp_send(skb, ICMP_DEST_UNREACH, - ICMP_FRAG_NEEDED, htonl(mtu)); - ret = -EMSGSIZE; - } -out: - return ret; -} - -int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb) -{ - int err; - - err = xfrm4_tunnel_check_size(skb); - if (err) - return err; - - XFRM_MODE_SKB_CB(skb)->protocol = ip_hdr(skb)->protocol; - - return xfrm4_extract_header(skb); -} - -int xfrm4_output_finish(struct sock *sk, struct sk_buff *skb) -{ - memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - - IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; - - return xfrm_output(sk, skb); -} - static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { +#ifdef CONFIG_NETFILTER struct xfrm_state *x = skb_dst(skb)->xfrm; - const struct xfrm_state_afinfo *afinfo; - int ret = -EAFNOSUPPORT; -#ifdef CONFIG_NETFILTER if (!x) { IPCB(skb)->flags |= IPSKB_REROUTED; return dst_output(net, sk, skb); } #endif - rcu_read_lock(); - afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family); - if (likely(afinfo)) - ret = afinfo->output_finish(sk, skb); - else - kfree_skb(skb); - rcu_read_unlock(); - - return ret; + return xfrm_output(sk, skb); } int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index f8ed3c3bb928..87d4db591488 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -8,36 +8,12 @@ * */ -#include <net/ip.h> #include <net/xfrm.h> -#include <linux/pfkeyv2.h> -#include <linux/ipsec.h> -#include <linux/netfilter_ipv4.h> -#include <linux/export.h> - -int xfrm4_extract_header(struct sk_buff *skb) -{ - const struct iphdr *iph = ip_hdr(skb); - - XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); - XFRM_MODE_SKB_CB(skb)->id = iph->id; - XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off; - XFRM_MODE_SKB_CB(skb)->tos = iph->tos; - XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl; - XFRM_MODE_SKB_CB(skb)->optlen = iph->ihl * 4 - sizeof(*iph); - memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0, - sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); - - return 0; -} static struct xfrm_state_afinfo xfrm4_state_afinfo = { .family = AF_INET, .proto = IPPROTO_IPIP, .output = xfrm4_output, - .output_finish = xfrm4_output_finish, - .extract_input = xfrm4_extract_input, - .extract_output = xfrm4_extract_output, .transport_finish = xfrm4_transport_finish, .local_error = xfrm4_local_error, }; diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 2ccaee98fddb..4f03aece2980 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -13,7 +13,7 @@ menuconfig IPV6 For general information about IPv6, see <https://en.wikipedia.org/wiki/IPv6>. For specific information about IPv6 under Linux, see - Documentation/networking/ipv6.txt and read the HOWTO at + Documentation/networking/ipv6.rst and read the HOWTO at <http://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/> To compile this protocol support as a module, choose M here: the @@ -88,6 +88,18 @@ config INET6_ESP_OFFLOAD If unsure, say N. +config INET6_ESPINTCP + bool "IPv6: ESP in TCP encapsulation (RFC 8229)" + depends on XFRM && INET6_ESP + select STREAM_PARSER + select NET_SOCK_MSG + select XFRM_ESPINTCP + help + Support for RFC 8229 encapsulation of ESP and IKE over + TCP/IPv6 sockets. + + If unsure, say N. + config INET6_IPCOMP tristate "IPv6: IPComp transformation" select INET6_XFRM_TUNNEL diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f131cedf5ba6..840bfdb3d7bd 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -135,8 +135,7 @@ static inline void addrconf_sysctl_unregister(struct inet6_dev *idev) } #endif -static void ipv6_regen_rndid(struct inet6_dev *idev); -static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); +static void ipv6_gen_rnd_iid(struct in6_addr *addr); static int ipv6_generate_eui64(u8 *eui, struct net_device *dev); static int ipv6_count_addresses(const struct inet6_dev *idev); @@ -432,8 +431,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) dev->type == ARPHRD_SIT || dev->type == ARPHRD_NONE) { ndev->cnf.use_tempaddr = -1; - } else - ipv6_regen_rndid(ndev); + } ndev->token = in6addr_any; @@ -1238,7 +1236,7 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, ifp->idev->dev, 0, RTF_DEFAULT, true); if (f6i) { if (del_rt) - ip6_del_rt(dev_net(ifp->idev->dev), f6i); + ip6_del_rt(dev_net(ifp->idev->dev), f6i, false); else { if (!(f6i->fib6_flags & RTF_EXPIRES)) fib6_set_expires(f6i, expires); @@ -1306,29 +1304,21 @@ out: in6_ifa_put(ifp); } -static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, - struct inet6_ifaddr *ift, - bool block) +static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, bool block) { struct inet6_dev *idev = ifp->idev; - struct in6_addr addr, *tmpaddr; unsigned long tmp_tstamp, age; unsigned long regen_advance; - struct ifa6_config cfg; - int ret = 0; unsigned long now = jiffies; - long max_desync_factor; s32 cnf_temp_preferred_lft; + struct inet6_ifaddr *ift; + struct ifa6_config cfg; + long max_desync_factor; + struct in6_addr addr; + int ret = 0; write_lock_bh(&idev->lock); - if (ift) { - spin_lock_bh(&ift->lock); - memcpy(&addr.s6_addr[8], &ift->addr.s6_addr[8], 8); - spin_unlock_bh(&ift->lock); - tmpaddr = &addr; - } else { - tmpaddr = NULL; - } + retry: in6_dev_hold(idev); if (idev->cnf.use_tempaddr <= 0) { @@ -1351,8 +1341,8 @@ retry: } in6_ifa_hold(ifp); memcpy(addr.s6_addr, ifp->addr.s6_addr, 8); - ipv6_try_regen_rndid(idev, tmpaddr); - memcpy(&addr.s6_addr[8], idev->rndid, 8); + ipv6_gen_rnd_iid(&addr); + age = (now - ifp->tstamp) / HZ; regen_advance = idev->cnf.regen_max_retry * @@ -1417,7 +1407,6 @@ retry: in6_ifa_put(ifp); in6_dev_put(idev); pr_info("%s: retry temporary address regeneration\n", __func__); - tmpaddr = &addr; write_lock_bh(&idev->lock); goto retry; } @@ -2032,7 +2021,7 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed) if (ifpub) { in6_ifa_hold(ifpub); spin_unlock_bh(&ifp->lock); - ipv6_create_tempaddr(ifpub, ifp, true); + ipv6_create_tempaddr(ifpub, true); in6_ifa_put(ifpub); } else { spin_unlock_bh(&ifp->lock); @@ -2329,40 +2318,38 @@ static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev) return err; } -/* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */ -static void ipv6_regen_rndid(struct inet6_dev *idev) +/* Generation of a randomized Interface Identifier + * draft-ietf-6man-rfc4941bis, Section 3.3.1 + */ + +static void ipv6_gen_rnd_iid(struct in6_addr *addr) { regen: - get_random_bytes(idev->rndid, sizeof(idev->rndid)); - idev->rndid[0] &= ~0x02; + get_random_bytes(&addr->s6_addr[8], 8); - /* - * <draft-ietf-ipngwg-temp-addresses-v2-00.txt>: - * check if generated address is not inappropriate + /* <draft-ietf-6man-rfc4941bis-08.txt>, Section 3.3.1: + * check if generated address is not inappropriate: * - * - Reserved subnet anycast (RFC 2526) - * 11111101 11....11 1xxxxxxx - * - ISATAP (RFC4214) 6.1 - * 00-00-5E-FE-xx-xx-xx-xx - * - value 0 - * - XXX: already assigned to an address on the device + * - Reserved IPv6 Interface Identifers + * - XXX: already assigned to an address on the device */ - if (idev->rndid[0] == 0xfd && - (idev->rndid[1]&idev->rndid[2]&idev->rndid[3]&idev->rndid[4]&idev->rndid[5]&idev->rndid[6]) == 0xff && - (idev->rndid[7]&0x80)) + + /* Subnet-router anycast: 0000:0000:0000:0000 */ + if (!(addr->s6_addr32[2] | addr->s6_addr32[3])) goto regen; - if ((idev->rndid[0]|idev->rndid[1]) == 0) { - if (idev->rndid[2] == 0x5e && idev->rndid[3] == 0xfe) - goto regen; - if ((idev->rndid[2]|idev->rndid[3]|idev->rndid[4]|idev->rndid[5]|idev->rndid[6]|idev->rndid[7]) == 0x00) - goto regen; - } -} -static void ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) -{ - if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0) - ipv6_regen_rndid(idev); + /* IANA Ethernet block: 0200:5EFF:FE00:0000-0200:5EFF:FE00:5212 + * Proxy Mobile IPv6: 0200:5EFF:FE00:5213 + * IANA Ethernet block: 0200:5EFF:FE00:5214-0200:5EFF:FEFF:FFFF + */ + if (ntohl(addr->s6_addr32[2]) == 0x02005eff && + (ntohl(addr->s6_addr32[3]) & 0Xff000000) == 0xfe000000) + goto regen; + + /* Reserved subnet anycast addresses */ + if (ntohl(addr->s6_addr32[2]) == 0xfdffffff && + ntohl(addr->s6_addr32[3]) >= 0Xffffff80) + goto regen; } /* @@ -2544,7 +2531,7 @@ static void manage_tempaddrs(struct inet6_dev *idev, * no temporary address currently exists. */ read_unlock_bh(&idev->lock); - ipv6_create_tempaddr(ifp, NULL, false); + ipv6_create_tempaddr(ifp, false); } else { read_unlock_bh(&idev->lock); } @@ -2564,7 +2551,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, __u32 valid_lft, u32 prefered_lft) { struct inet6_ifaddr *ifp = ipv6_get_ifaddr(net, addr, dev, 1); - int create = 0, update_lft = 0; + int create = 0; if (!ifp && valid_lft) { int max_addresses = in6_dev->cnf.max_addresses; @@ -2608,32 +2595,19 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, unsigned long now; u32 stored_lft; - /* update lifetime (RFC2462 5.5.3 e) */ + /* Update lifetime (RFC4862 5.5.3 e) + * We deviate from RFC4862 by honoring all Valid Lifetimes to + * improve the reaction of SLAAC to renumbering events + * (draft-gont-6man-slaac-renum-06, Section 4.2) + */ spin_lock_bh(&ifp->lock); now = jiffies; if (ifp->valid_lft > (now - ifp->tstamp) / HZ) stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; else stored_lft = 0; - if (!create && stored_lft) { - const u32 minimum_lft = min_t(u32, - stored_lft, MIN_VALID_LIFETIME); - valid_lft = max(valid_lft, minimum_lft); - - /* RFC4862 Section 5.5.3e: - * "Note that the preferred lifetime of the - * corresponding address is always reset to - * the Preferred Lifetime in the received - * Prefix Information option, regardless of - * whether the valid lifetime is also reset or - * ignored." - * - * So we should always update prefered_lft here. - */ - update_lft = 1; - } - if (update_lft) { + if (!create && stored_lft) { ifp->valid_lft = valid_lft; ifp->prefered_lft = prefered_lft; ifp->tstamp = now; @@ -2731,7 +2705,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) if (rt) { /* Autoconf prefix route */ if (valid_lft == 0) { - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); rt = NULL; } else if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ @@ -2809,6 +2783,33 @@ put: in6_dev_put(in6_dev); } +static int addrconf_set_sit_dstaddr(struct net *net, struct net_device *dev, + struct in6_ifreq *ireq) +{ + struct ip_tunnel_parm p = { }; + int err; + + if (!(ipv6_addr_type(&ireq->ifr6_addr) & IPV6_ADDR_COMPATv4)) + return -EADDRNOTAVAIL; + + p.iph.daddr = ireq->ifr6_addr.s6_addr32[3]; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPV6; + p.iph.ttl = 64; + + if (!dev->netdev_ops->ndo_tunnel_ctl) + return -EOPNOTSUPP; + err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, SIOCADDTUNNEL); + if (err) + return err; + + dev = __dev_get_by_name(net, p.name); + if (!dev) + return -ENOBUFS; + return dev_open(dev, NULL); +} + /* * Set destination address. * Special case for SIT interfaces where we create a new "virtual" @@ -2816,61 +2817,19 @@ put: */ int addrconf_set_dstaddr(struct net *net, void __user *arg) { - struct in6_ifreq ireq; struct net_device *dev; - int err = -EINVAL; - - rtnl_lock(); + struct in6_ifreq ireq; + int err = -ENODEV; - err = -EFAULT; + if (!IS_ENABLED(CONFIG_IPV6_SIT)) + return -ENODEV; if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) - goto err_exit; + return -EFAULT; + rtnl_lock(); dev = __dev_get_by_index(net, ireq.ifr6_ifindex); - - err = -ENODEV; - if (!dev) - goto err_exit; - -#if IS_ENABLED(CONFIG_IPV6_SIT) - if (dev->type == ARPHRD_SIT) { - const struct net_device_ops *ops = dev->netdev_ops; - struct ifreq ifr; - struct ip_tunnel_parm p; - - err = -EADDRNOTAVAIL; - if (!(ipv6_addr_type(&ireq.ifr6_addr) & IPV6_ADDR_COMPATv4)) - goto err_exit; - - memset(&p, 0, sizeof(p)); - p.iph.daddr = ireq.ifr6_addr.s6_addr32[3]; - p.iph.saddr = 0; - p.iph.version = 4; - p.iph.ihl = 5; - p.iph.protocol = IPPROTO_IPV6; - p.iph.ttl = 64; - ifr.ifr_ifru.ifru_data = (__force void __user *)&p; - - if (ops->ndo_do_ioctl) { - mm_segment_t oldfs = get_fs(); - - set_fs(KERNEL_DS); - err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); - set_fs(oldfs); - } else - err = -EOPNOTSUPP; - - if (err == 0) { - err = -ENOBUFS; - dev = __dev_get_by_name(net, p.name); - if (!dev) - goto err_exit; - err = dev_open(dev, NULL); - } - } -#endif - -err_exit: + if (dev && dev->type == ARPHRD_SIT) + err = addrconf_set_sit_dstaddr(net, dev, &ireq); rtnl_unlock(); return err; } @@ -3826,7 +3785,7 @@ restart: spin_unlock_bh(&ifa->lock); if (rt) - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); if (state != INET6_IFADDR_STATE_DEAD) { __ipv6_ifa_notify(RTM_DELADDR, ifa); @@ -4544,7 +4503,7 @@ restart: ifpub->regen_count = 0; spin_unlock(&ifpub->lock); rcu_read_unlock_bh(); - ipv6_create_tempaddr(ifpub, ifp, true); + ipv6_create_tempaddr(ifpub, true); in6_ifa_put(ifpub); in6_ifa_put(ifp); rcu_read_lock_bh(); @@ -4665,7 +4624,7 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, prio = ifp->rt_priority ? : IP6_RT_PRIO_ADDRCONF; if (f6i->fib6_metric != prio) { /* delete old one */ - ip6_del_rt(dev_net(ifp->idev->dev), f6i); + ip6_del_rt(dev_net(ifp->idev->dev), f6i, false); /* add new one */ addrconf_prefix_route(modify_peer ? &ifp->peer_addr : &ifp->addr, @@ -6086,10 +6045,10 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) ifp->idev->dev, 0, 0, false); if (rt) - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); } if (ifp->rt) { - ip6_del_rt(net, ifp->rt); + ip6_del_rt(net, ifp->rt, false); ifp->rt = NULL; } rt_genid_bump_ipv6(net); @@ -6108,9 +6067,8 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) #ifdef CONFIG_SYSCTL -static -int addrconf_sysctl_forward(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_forward(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -6134,9 +6092,8 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write, return ret; } -static -int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct inet6_dev *idev = ctl->extra1; int min_mtu = IPV6_MIN_MTU; @@ -6206,9 +6163,8 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) return 0; } -static -int addrconf_sysctl_disable(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_disable(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -6232,9 +6188,8 @@ int addrconf_sysctl_disable(struct ctl_table *ctl, int write, return ret; } -static -int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int ret; @@ -6275,7 +6230,7 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, } static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -6337,7 +6292,7 @@ out: } static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int err; @@ -6404,8 +6359,7 @@ out: static int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, - int write, - void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -6505,10 +6459,8 @@ int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val) return 0; } -static -int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -7024,9 +6976,26 @@ static int __net_init addrconf_init_net(struct net *net) goto err_alloc_dflt; if (IS_ENABLED(CONFIG_SYSCTL) && - sysctl_devconf_inherit_init_net == 1 && !net_eq(net, &init_net)) { - memcpy(all, init_net.ipv6.devconf_all, sizeof(ipv6_devconf)); - memcpy(dflt, init_net.ipv6.devconf_dflt, sizeof(ipv6_devconf_dflt)); + !net_eq(net, &init_net)) { + switch (sysctl_devconf_inherit_init_net) { + case 1: /* copy from init_net */ + memcpy(all, init_net.ipv6.devconf_all, + sizeof(ipv6_devconf)); + memcpy(dflt, init_net.ipv6.devconf_dflt, + sizeof(ipv6_devconf_dflt)); + break; + case 3: /* copy from the current netns */ + memcpy(all, current->nsproxy->net_ns->ipv6.devconf_all, + sizeof(ipv6_devconf)); + memcpy(dflt, + current->nsproxy->net_ns->ipv6.devconf_dflt, + sizeof(ipv6_devconf_dflt)); + break; + case 0: + case 2: + /* use compiled values */ + break; + } } /* these will be inherited by all namespaces */ diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index ea00ce3d4117..9ebf3fe0d2b1 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -185,7 +185,8 @@ static int eafnosupport_fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, return -EAFNOSUPPORT; } -static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt) +static int eafnosupport_ip6_del_rt(struct net *net, struct fib6_info *rt, + bool skip_notify) { return -EAFNOSUPPORT; } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 345baa0a754f..b304b882e031 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -60,6 +60,8 @@ #include <net/calipso.h> #include <net/seg6.h> #include <net/rpl.h> +#include <net/compat.h> +#include <net/xfrm.h> #include <linux/uaccess.h> #include <linux/mroute6.h> @@ -273,7 +275,7 @@ out_rcu_unlock: } static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock) + u32 flags) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; struct inet_sock *inet = inet_sk(sk); @@ -297,7 +299,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; - if (with_lock) + if (flags & BIND_WITH_LOCK) lock_sock(sk); /* Check these errors (active socket, double bind). */ @@ -400,18 +402,20 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Make sure we are allowed to bind here. */ if (snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) { + (flags & BIND_FORCE_ADDRESS_NO_PORT))) { if (sk->sk_prot->get_port(sk, snum)) { sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); err = -EADDRINUSE; goto out; } - err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); - if (err) { - sk->sk_ipv6only = saved_ipv6only; - inet_reset_saddr(sk); - goto out; + if (!(flags & BIND_FROM_BPF)) { + err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); + if (err) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + goto out; + } } } @@ -423,7 +427,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, inet->inet_dport = 0; inet->inet_daddr = 0; out: - if (with_lock) + if (flags & BIND_WITH_LOCK) release_sock(sk); return err; out_unlock: @@ -451,7 +455,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) return err; - return __inet6_bind(sk, uaddr, addr_len, false, true); + return __inet6_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); } EXPORT_SYMBOL(inet6_bind); @@ -502,9 +506,8 @@ EXPORT_SYMBOL_GPL(inet6_destroy_sock); /* * This does both peername and sockname. */ - int inet6_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) + int peer) { struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; @@ -529,9 +532,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; - sin->sin6_port = inet->inet_sport; } + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + peer ? BPF_CGROUP_INET6_GETPEERNAME : + BPF_CGROUP_INET6_GETSOCKNAME, + NULL); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); return sizeof(*sin); @@ -540,21 +547,25 @@ EXPORT_SYMBOL(inet6_getname); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { + void __user *argp = (void __user *)arg; struct sock *sk = sock->sk; struct net *net = sock_net(sk); switch (cmd) { case SIOCADDRT: - case SIOCDELRT: - - return ipv6_route_ioctl(net, cmd, (void __user *)arg); + case SIOCDELRT: { + struct in6_rtmsg rtmsg; + if (copy_from_user(&rtmsg, argp, sizeof(rtmsg))) + return -EFAULT; + return ipv6_route_ioctl(net, cmd, &rtmsg); + } case SIOCSIFADDR: - return addrconf_add_ifaddr(net, (void __user *) arg); + return addrconf_add_ifaddr(net, argp); case SIOCDIFADDR: - return addrconf_del_ifaddr(net, (void __user *) arg); + return addrconf_del_ifaddr(net, argp); case SIOCSIFDSTADDR: - return addrconf_set_dstaddr(net, (void __user *) arg); + return addrconf_set_dstaddr(net, argp); default: if (!sk->sk_prot->ioctl) return -ENOIOCTLCMD; @@ -565,6 +576,56 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } EXPORT_SYMBOL(inet6_ioctl); +#ifdef CONFIG_COMPAT +struct compat_in6_rtmsg { + struct in6_addr rtmsg_dst; + struct in6_addr rtmsg_src; + struct in6_addr rtmsg_gateway; + u32 rtmsg_type; + u16 rtmsg_dst_len; + u16 rtmsg_src_len; + u32 rtmsg_metric; + u32 rtmsg_info; + u32 rtmsg_flags; + s32 rtmsg_ifindex; +}; + +static int inet6_compat_routing_ioctl(struct sock *sk, unsigned int cmd, + struct compat_in6_rtmsg __user *ur) +{ + struct in6_rtmsg rt; + + if (copy_from_user(&rt.rtmsg_dst, &ur->rtmsg_dst, + 3 * sizeof(struct in6_addr)) || + get_user(rt.rtmsg_type, &ur->rtmsg_type) || + get_user(rt.rtmsg_dst_len, &ur->rtmsg_dst_len) || + get_user(rt.rtmsg_src_len, &ur->rtmsg_src_len) || + get_user(rt.rtmsg_metric, &ur->rtmsg_metric) || + get_user(rt.rtmsg_info, &ur->rtmsg_info) || + get_user(rt.rtmsg_flags, &ur->rtmsg_flags) || + get_user(rt.rtmsg_ifindex, &ur->rtmsg_ifindex)) + return -EFAULT; + + + return ipv6_route_ioctl(sock_net(sk), cmd, &rt); +} + +int inet6_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + void __user *argp = compat_ptr(arg); + struct sock *sk = sock->sk; + + switch (cmd) { + case SIOCADDRT: + case SIOCDELRT: + return inet6_compat_routing_ioctl(sk, cmd, argp); + default: + return -ENOIOCTLCMD; + } +} +EXPORT_SYMBOL_GPL(inet6_compat_ioctl); +#endif /* CONFIG_COMPAT */ + INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *, size_t)); int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) @@ -626,6 +687,7 @@ const struct proto_ops inet6_stream_ops = { .read_sock = tcp_read_sock, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif @@ -654,6 +716,7 @@ const struct proto_ops inet6_dgram_ops = { .sendpage = sock_no_sendpage, .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif @@ -961,6 +1024,11 @@ static const struct ipv6_stub ipv6_stub_impl = { .ip6_del_rt = ip6_del_rt, .udpv6_encap_enable = udpv6_encap_enable, .ndisc_send_na = ndisc_send_na, +#if IS_ENABLED(CONFIG_XFRM) + .xfrm6_local_rxpmtu = xfrm6_local_rxpmtu, + .xfrm6_udp_encap_rcv = xfrm6_udp_encap_rcv, + .xfrm6_rcv_encap = xfrm6_rcv_encap, +#endif .nd_tbl = &nd_tbl, }; diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 45e2adc56610..d88d97617f7e 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -767,6 +767,7 @@ static const struct xfrm_type ah6_type = { static struct xfrm6_protocol ah6_protocol = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = ah6_rcv_cb, .err_handler = ah6_err, .priority = 0, diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index fed91ab7ec46..893261230ffc 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -364,7 +364,7 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr) ipv6_del_acaddr_hash(aca); addrconf_leave_solict(idev, &aca->aca_addr); - ip6_del_rt(dev_net(idev->dev), aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); aca_put(aca); return 0; @@ -393,7 +393,7 @@ void ipv6_ac_destroy_dev(struct inet6_dev *idev) addrconf_leave_solict(idev, &aca->aca_addr); - ip6_del_rt(dev_net(idev->dev), aca->aca_rt); + ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false); aca_put(aca); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 11143d039f16..c43592771126 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -26,11 +26,16 @@ #include <linux/random.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <net/ip6_checksum.h> #include <net/ip6_route.h> #include <net/icmp.h> #include <net/ipv6.h> #include <net/protocol.h> +#include <net/udp.h> #include <linux/icmpv6.h> +#include <net/tcp.h> +#include <net/espintcp.h> +#include <net/inet6_hashtables.h> #include <linux/highmem.h> @@ -39,6 +44,11 @@ struct esp_skb_cb { void *tmp; }; +struct esp_output_extra { + __be32 seqhi; + u32 esphoff; +}; + #define ESP_SKB_CB(__skb) ((struct esp_skb_cb *)&((__skb)->cb[0])) /* @@ -72,9 +82,9 @@ static void *esp_alloc_tmp(struct crypto_aead *aead, int nfrags, int seqihlen) return kmalloc(len, GFP_ATOMIC); } -static inline __be32 *esp_tmp_seqhi(void *tmp) +static inline void *esp_tmp_extra(void *tmp) { - return PTR_ALIGN((__be32 *)tmp, __alignof__(__be32)); + return PTR_ALIGN(tmp, __alignof__(struct esp_output_extra)); } static inline u8 *esp_tmp_iv(struct crypto_aead *aead, void *tmp, int seqhilen) @@ -104,16 +114,17 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead, static void esp_ssg_unref(struct xfrm_state *x, void *tmp) { + struct esp_output_extra *extra = esp_tmp_extra(tmp); struct crypto_aead *aead = x->data; - int seqhilen = 0; + int extralen = 0; u8 *iv; struct aead_request *req; struct scatterlist *sg; if (x->props.flags & XFRM_STATE_ESN) - seqhilen += sizeof(__be32); + extralen += sizeof(*extra); - iv = esp_tmp_iv(aead, tmp, seqhilen); + iv = esp_tmp_iv(aead, tmp, extralen); req = esp_tmp_req(aead, iv); /* Unref skb_frag_pages in the src scatterlist if necessary. @@ -124,6 +135,149 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) put_page(sg_page(sg)); } +#ifdef CONFIG_INET6_ESPINTCP +struct esp_tcp_sk { + struct sock *sk; + struct rcu_head rcu; +}; + +static void esp_free_tcp_sk(struct rcu_head *head) +{ + struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); + + sock_put(esk->sk); + kfree(esk); +} + +static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) +{ + struct xfrm_encap_tmpl *encap = x->encap; + struct esp_tcp_sk *esk; + __be16 sport, dport; + struct sock *nsk; + struct sock *sk; + + sk = rcu_dereference(x->encap_sk); + if (sk && sk->sk_state == TCP_ESTABLISHED) + return sk; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (sk && sk == nsk) { + esk = kmalloc(sizeof(*esk), GFP_ATOMIC); + if (!esk) { + spin_unlock_bh(&x->lock); + return ERR_PTR(-ENOMEM); + } + RCU_INIT_POINTER(x->encap_sk, NULL); + esk->sk = sk; + call_rcu(&esk->rcu, esp_free_tcp_sk); + } + spin_unlock_bh(&x->lock); + + sk = __inet6_lookup_established(xs_net(x), &tcp_hashinfo, &x->id.daddr.in6, + dport, &x->props.saddr.in6, ntohs(sport), 0, 0); + if (!sk) + return ERR_PTR(-ENOENT); + + if (!tcp_is_ulp_esp(sk)) { + sock_put(sk); + return ERR_PTR(-EINVAL); + } + + spin_lock_bh(&x->lock); + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (encap->encap_sport != sport || + encap->encap_dport != dport) { + sock_put(sk); + sk = nsk ?: ERR_PTR(-EREMCHG); + } else if (sk == nsk) { + sock_put(sk); + } else { + rcu_assign_pointer(x->encap_sk, sk); + } + spin_unlock_bh(&x->lock); + + return sk; +} + +static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) +{ + struct sock *sk; + int err; + + rcu_read_lock(); + + sk = esp6_find_tcp_sk(x); + err = PTR_ERR_OR_ZERO(sk); + if (err) + goto out; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + err = espintcp_queue_out(sk, skb); + else + err = espintcp_push_skb(sk, skb); + bh_unlock_sock(sk); + +out: + rcu_read_unlock(); + return err; +} + +static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + + return esp_output_tcp_finish(x, skb); +} + +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + + local_bh_disable(); + err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb); + local_bh_enable(); + + /* EINPROGRESS just happens to do the right thing. It + * actually means that the skb has been consumed and + * isn't coming back. + */ + return err ?: -EINPROGRESS; +} +#else +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) +{ + kfree_skb(skb); + + return -EOPNOTSUPP; +} +#endif + +static void esp_output_encap_csum(struct sk_buff *skb) +{ + /* UDP encap with IPv6 requires a valid checksum */ + if (*skb_mac_header(skb) == IPPROTO_UDP) { + struct udphdr *uh = udp_hdr(skb); + struct ipv6hdr *ip6h = ipv6_hdr(skb); + int len = ntohs(uh->len); + unsigned int offset = skb_transport_offset(skb); + __wsum csum = skb_checksum(skb, offset, skb->len - offset, 0); + + uh->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + len, IPPROTO_UDP, csum); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + } +} + static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -143,6 +297,8 @@ static void esp_output_done(struct crypto_async_request *base, int err) esp_ssg_unref(x, tmp); kfree(tmp); + esp_output_encap_csum(skb); + if (xo && (xo->flags & XFRM_DEV_RESUME)) { if (err) { XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); @@ -154,7 +310,11 @@ static void esp_output_done(struct crypto_async_request *base, int err) secpath_reset(skb); xfrm_dev_resume(skb); } else { - xfrm_output_resume(skb, err); + if (!err && + x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) + esp_output_tail_tcp(x, skb); + else + xfrm_output_resume(skb, err); } } @@ -163,7 +323,7 @@ static void esp_restore_header(struct sk_buff *skb, unsigned int offset) { struct ip_esp_hdr *esph = (void *)(skb->data + offset); void *tmp = ESP_SKB_CB(skb)->tmp; - __be32 *seqhi = esp_tmp_seqhi(tmp); + __be32 *seqhi = esp_tmp_extra(tmp); esph->seq_no = esph->spi; esph->spi = *seqhi; @@ -171,27 +331,36 @@ static void esp_restore_header(struct sk_buff *skb, unsigned int offset) static void esp_output_restore_header(struct sk_buff *skb) { - esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32)); + void *tmp = ESP_SKB_CB(skb)->tmp; + struct esp_output_extra *extra = esp_tmp_extra(tmp); + + esp_restore_header(skb, skb_transport_offset(skb) + extra->esphoff - + sizeof(__be32)); } static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb, struct xfrm_state *x, struct ip_esp_hdr *esph, - __be32 *seqhi) + struct esp_output_extra *extra) { /* For ESN we move the header forward by 4 bytes to * accomodate the high bits. We will move it back after * encryption. */ if ((x->props.flags & XFRM_STATE_ESN)) { + __u32 seqhi; struct xfrm_offload *xo = xfrm_offload(skb); - esph = (void *)(skb_transport_header(skb) - sizeof(__be32)); - *seqhi = esph->spi; if (xo) - esph->seq_no = htonl(xo->seq.hi); + seqhi = xo->seq.hi; else - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + seqhi = XFRM_SKB_CB(skb)->seq.output.hi; + + extra->esphoff = (unsigned char *)esph - + skb_transport_header(skb); + esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4); + extra->seqhi = esph->spi; + esph->seq_no = htonl(seqhi); } esph->spi = x->id.spi; @@ -207,15 +376,122 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err) esp_output_done(base, err); } +static struct ip_esp_hdr *esp6_output_udp_encap(struct sk_buff *skb, + int encap_type, + struct esp_info *esp, + __be16 sport, + __be16 dport) +{ + struct udphdr *uh; + __be32 *udpdata32; + unsigned int len; + + len = skb->len + esp->tailen - skb_transport_offset(skb); + if (len > U16_MAX) + return ERR_PTR(-EMSGSIZE); + + uh = (struct udphdr *)esp->esph; + uh->source = sport; + uh->dest = dport; + uh->len = htons(len); + uh->check = 0; + + *skb_mac_header(skb) = IPPROTO_UDP; + + if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) { + udpdata32 = (__be32 *)(uh + 1); + udpdata32[0] = udpdata32[1] = 0; + return (struct ip_esp_hdr *)(udpdata32 + 2); + } + + return (struct ip_esp_hdr *)(uh + 1); +} + +#ifdef CONFIG_INET6_ESPINTCP +static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x, + struct sk_buff *skb, + struct esp_info *esp) +{ + __be16 *lenp = (void *)esp->esph; + struct ip_esp_hdr *esph; + unsigned int len; + struct sock *sk; + + len = skb->len + esp->tailen - skb_transport_offset(skb); + if (len > IP_MAX_MTU) + return ERR_PTR(-EMSGSIZE); + + rcu_read_lock(); + sk = esp6_find_tcp_sk(x); + rcu_read_unlock(); + + if (IS_ERR(sk)) + return ERR_CAST(sk); + + *lenp = htons(len); + esph = (struct ip_esp_hdr *)(lenp + 1); + + return esph; +} +#else +static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x, + struct sk_buff *skb, + struct esp_info *esp) +{ + return ERR_PTR(-EOPNOTSUPP); +} +#endif + +static int esp6_output_encap(struct xfrm_state *x, struct sk_buff *skb, + struct esp_info *esp) +{ + struct xfrm_encap_tmpl *encap = x->encap; + struct ip_esp_hdr *esph; + __be16 sport, dport; + int encap_type; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + encap_type = encap->encap_type; + spin_unlock_bh(&x->lock); + + switch (encap_type) { + default: + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + esph = esp6_output_udp_encap(skb, encap_type, esp, sport, dport); + break; + case TCP_ENCAP_ESPINTCP: + esph = esp6_output_tcp_encap(x, skb, esp); + break; + } + + if (IS_ERR(esph)) + return PTR_ERR(esph); + + esp->esph = esph; + + return 0; +} + int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { u8 *tail; u8 *vaddr; int nfrags; + int esph_offset; struct page *page; struct sk_buff *trailer; int tailen = esp->tailen; + if (x->encap) { + int err = esp6_output_encap(x, skb, esp); + + if (err < 0) + return err; + } + if (!skb_cloned(skb)) { if (tailen <= skb_tailroom(skb)) { nfrags = 1; @@ -274,10 +550,13 @@ int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info } cow: + esph_offset = (unsigned char *)esp->esph - skb_transport_header(skb); + nfrags = skb_cow_data(skb, tailen, &trailer); if (nfrags < 0) goto out; tail = skb_tail_pointer(trailer); + esp->esph = (struct ip_esp_hdr *)(skb_transport_header(skb) + esph_offset); skip_cow: esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); @@ -295,20 +574,20 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info void *tmp; int ivlen; int assoclen; - int seqhilen; - __be32 *seqhi; + int extralen; struct page *page; struct ip_esp_hdr *esph; struct aead_request *req; struct crypto_aead *aead; struct scatterlist *sg, *dsg; + struct esp_output_extra *extra; int err = -ENOMEM; assoclen = sizeof(struct ip_esp_hdr); - seqhilen = 0; + extralen = 0; if (x->props.flags & XFRM_STATE_ESN) { - seqhilen += sizeof(__be32); + extralen += sizeof(*extra); assoclen += sizeof(__be32); } @@ -316,12 +595,12 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info alen = crypto_aead_authsize(aead); ivlen = crypto_aead_ivsize(aead); - tmp = esp_alloc_tmp(aead, esp->nfrags + 2, seqhilen); + tmp = esp_alloc_tmp(aead, esp->nfrags + 2, extralen); if (!tmp) goto error; - seqhi = esp_tmp_seqhi(tmp); - iv = esp_tmp_iv(aead, tmp, seqhilen); + extra = esp_tmp_extra(tmp); + iv = esp_tmp_iv(aead, tmp, extralen); req = esp_tmp_req(aead, iv); sg = esp_req_sg(aead, req); @@ -330,7 +609,8 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info else dsg = &sg[esp->nfrags]; - esph = esp_output_set_esn(skb, x, ip_esp_hdr(skb), seqhi); + esph = esp_output_set_esn(skb, x, esp->esph, extra); + esp->esph = esph; sg_init_table(sg, esp->nfrags); err = skb_to_sgvec(skb, sg, @@ -394,11 +674,15 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info case 0: if ((x->props.flags & XFRM_STATE_ESN)) esp_output_restore_header(skb); + esp_output_encap_csum(skb); } if (sg != dsg) esp_ssg_unref(x, tmp); + if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) + err = esp_output_tail_tcp(x, skb); + error_free: kfree(tmp); error: @@ -438,11 +722,13 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) esp.plen = esp.clen - skb->len - esp.tfclen; esp.tailen = esp.tfclen + esp.plen + alen; + esp.esph = ip_esp_hdr(skb); + esp.nfrags = esp6_output_head(x, skb, &esp); if (esp.nfrags < 0) return esp.nfrags; - esph = ip_esp_hdr(skb); + esph = esp.esph; esph->spi = x->id.spi; esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); @@ -517,6 +803,60 @@ int esp6_input_done2(struct sk_buff *skb, int err) if (unlikely(err < 0)) goto out; + if (x->encap) { + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct xfrm_encap_tmpl *encap = x->encap; + struct udphdr *uh = (void *)(skb_network_header(skb) + hdr_len); + struct tcphdr *th = (void *)(skb_network_header(skb) + hdr_len); + __be16 source; + + switch (x->encap->encap_type) { + case TCP_ENCAP_ESPINTCP: + source = th->source; + break; + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + source = uh->source; + break; + default: + WARN_ON_ONCE(1); + err = -EINVAL; + goto out; + } + + /* + * 1) if the NAT-T peer's IP or port changed then + * advertize the change to the keying daemon. + * This is an inbound SA, so just compare + * SRC ports. + */ + if (!ipv6_addr_equal(&ip6h->saddr, &x->props.saddr.in6) || + source != encap->encap_sport) { + xfrm_address_t ipaddr; + + memcpy(&ipaddr.a6, &ip6h->saddr.s6_addr, sizeof(ipaddr.a6)); + km_new_mapping(x, &ipaddr, source); + + /* XXX: perhaps add an extra + * policy check here, to see + * if we should allow or + * reject a packet from a + * different source + * address/port. + */ + } + + /* + * 2) ignore UDP/TCP checksums in case + * of NAT-T in Transport Mode, or + * perform other post-processing fixes + * as per draft-ietf-ipsec-udp-encaps-06, + * section 3.1.2 + */ + if (x->props.mode == XFRM_MODE_TRANSPORT) + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); skb_pull_rcsum(skb, hlen); @@ -632,7 +972,7 @@ skip_cow: goto out; ESP_SKB_CB(skb)->tmp = tmp; - seqhi = esp_tmp_seqhi(tmp); + seqhi = esp_tmp_extra(tmp); iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_req(aead, iv); sg = esp_req_sg(aead, req); @@ -836,9 +1176,6 @@ static int esp6_init_state(struct xfrm_state *x) u32 align; int err; - if (x->encap) - return -EINVAL; - x->data = NULL; if (x->aead) @@ -867,6 +1204,30 @@ static int esp6_init_state(struct xfrm_state *x) break; } + if (x->encap) { + struct xfrm_encap_tmpl *encap = x->encap; + + switch (encap->encap_type) { + default: + err = -EINVAL; + goto error; + case UDP_ENCAP_ESPINUDP: + x->props.header_len += sizeof(struct udphdr); + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); + break; +#ifdef CONFIG_INET6_ESPINTCP + case TCP_ENCAP_ESPINTCP: + /* only the length field, TCP encap is done by + * the socket + */ + x->props.header_len += 2; + break; +#endif + } + } + align = ALIGN(crypto_aead_blocksize(aead), 4); x->props.trailer_len = align + 1 + crypto_aead_authsize(aead); @@ -893,6 +1254,7 @@ static const struct xfrm_type esp6_type = { static struct xfrm6_protocol esp6_protocol = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = esp6_rcv_cb, .err_handler = esp6_err, .priority = 0, diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index ab0eea336c70..55addea1948f 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -284,7 +284,6 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features int alen; int blksize; struct xfrm_offload *xo; - struct ip_esp_hdr *esph; struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; @@ -325,13 +324,13 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features seq = xo->seq.low; - esph = ip_esp_hdr(skb); - esph->spi = x->id.spi; + esp.esph = ip_esp_hdr(skb); + esp.esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { - esph->seq_no = htonl(seq); + esp.esph->seq_no = htonl(seq); if (!skb_is_gso(skb)) xo->seq.low++; diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h index bb6fc0d54dae..ad5f6f6ba333 100644 --- a/net/ipv6/ila/ila.h +++ b/net/ipv6/ila/ila.h @@ -68,11 +68,6 @@ static inline struct ila_addr *ila_a2i(struct in6_addr *addr) return (struct ila_addr *)addr; } -static inline bool ila_addr_is_ila(struct ila_addr *iaddr) -{ - return (iaddr->ident.type != ILA_ATYPE_IID); -} - struct ila_params { struct ila_locator locator; struct ila_locator locator_match; diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 5fc1f4e0c0cf..a1ac0e3d8c60 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -601,8 +601,6 @@ out_ret: return ret; } -#define ILA_HASH_TABLE_SIZE 1024 - int ila_xlat_init_net(struct net *net) { struct ila_net *ilan = net_generic(net, ila_net_id); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 20314895509c..49ee89bbcba0 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2467,7 +2467,7 @@ void fib6_gc_cleanup(void) } #ifdef CONFIG_PROC_FS -static int ipv6_route_seq_show(struct seq_file *seq, void *v) +static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; @@ -2625,7 +2625,7 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) return w->node && !(w->state == FWS_U && w->node == w->root); } -static void ipv6_route_seq_stop(struct seq_file *seq, void *v) +static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) __releases(RCU_BH) { struct net *net = seq_file_net(seq); @@ -2637,6 +2637,62 @@ static void ipv6_route_seq_stop(struct seq_file *seq, void *v) rcu_read_unlock_bh(); } +#if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) +static int ipv6_route_prog_seq_show(struct bpf_prog *prog, + struct bpf_iter_meta *meta, + void *v) +{ + struct bpf_iter__ipv6_route ctx; + + ctx.meta = meta; + ctx.rt = v; + return bpf_iter_run_prog(prog, &ctx); +} + +static int ipv6_route_seq_show(struct seq_file *seq, void *v) +{ + struct ipv6_route_iter *iter = seq->private; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + if (!prog) + return ipv6_route_native_seq_show(seq, v); + + ret = ipv6_route_prog_seq_show(prog, &meta, v); + iter->w.leaf = NULL; + + return ret; +} + +static void ipv6_route_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)ipv6_route_prog_seq_show(prog, &meta, v); + } + + ipv6_route_native_seq_stop(seq, v); +} +#else +static int ipv6_route_seq_show(struct seq_file *seq, void *v) +{ + return ipv6_route_native_seq_show(seq, v); +} + +static void ipv6_route_seq_stop(struct seq_file *seq, void *v) +{ + ipv6_route_native_seq_stop(seq, v); +} +#endif + const struct seq_operations ipv6_route_seq_ops = { .start = ipv6_route_seq_start, .next = ipv6_route_seq_next, diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 4703b09808d0..821d96c720b9 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -89,6 +89,11 @@ struct ip6_tnl_net { struct ip6_tnl __rcu *collect_md_tun; }; +static inline int ip6_tnl_mpls_supported(void) +{ + return IS_ENABLED(CONFIG_MPLS); +} + static struct net_device_stats *ip6_get_stats(struct net_device *dev) { struct pcpu_sw_netstats tmp, sum = { 0 }; @@ -718,6 +723,20 @@ ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; } +static int +mplsip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + __u32 rel_info = ntohl(info); + int err, rel_msg = 0; + u8 rel_type = type; + u8 rel_code = code; + + err = ip6_tnl_err(skb, IPPROTO_MPLS, opt, &rel_type, &rel_code, + &rel_msg, &rel_info, offset); + return err; +} + static int ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, const struct ipv6hdr *ipv6h, struct sk_buff *skb) @@ -740,6 +759,14 @@ static int ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, return IP6_ECN_decapsulate(ipv6h, skb); } +static inline int mplsip6_dscp_ecn_decapsulate(const struct ip6_tnl *t, + const struct ipv6hdr *ipv6h, + struct sk_buff *skb) +{ + /* ECN is not supported in AF_MPLS */ + return 0; +} + __u32 ip6_tnl_get_cap(struct ip6_tnl *t, const struct in6_addr *laddr, const struct in6_addr *raddr) @@ -901,6 +928,11 @@ static const struct tnl_ptk_info tpi_v4 = { .proto = htons(ETH_P_IP), }; +static const struct tnl_ptk_info tpi_mpls = { + /* no tunnel info required for mplsip6. */ + .proto = htons(ETH_P_MPLS_UC), +}; + static int ipxip6_rcv(struct sk_buff *skb, u8 ipproto, const struct tnl_ptk_info *tpi, int (*dscp_ecn_decapsulate)(const struct ip6_tnl *t, @@ -958,6 +990,12 @@ static int ip6ip6_rcv(struct sk_buff *skb) ip6ip6_dscp_ecn_decapsulate); } +static int mplsip6_rcv(struct sk_buff *skb) +{ + return ipxip6_rcv(skb, IPPROTO_MPLS, &tpi_mpls, + mplsip6_dscp_ecn_decapsulate); +} + struct ipv6_tel_txoption { struct ipv6_txoptions ops; __u8 dst_opt[8]; @@ -1232,6 +1270,8 @@ route_lookup: ipv6_push_frag_opts(skb, &opt.ops, &proto); } + skb_set_inner_ipproto(skb, proto); + skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); @@ -1253,22 +1293,22 @@ tx_err_dst_release: EXPORT_SYMBOL(ip6_tnl_xmit); static inline int -ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +ipxip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, + u8 protocol) { struct ip6_tnl *t = netdev_priv(dev); + struct ipv6hdr *ipv6h; const struct iphdr *iph; int encap_limit = -1; + __u16 offset; struct flowi6 fl6; - __u8 dsfield; + __u8 dsfield, orig_dsfield; __u32 mtu; u8 tproto; int err; - iph = ip_hdr(skb); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - tproto = READ_ONCE(t->parms.proto); - if (tproto != IPPROTO_IPIP && tproto != 0) + if (tproto != protocol && tproto != 0) return -1; if (t->parms.collect_md) { @@ -1281,129 +1321,100 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) return -1; key = &tun_info->key; memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPIP; + fl6.flowi6_proto = protocol; fl6.saddr = key->u.ipv6.src; fl6.daddr = key->u.ipv6.dst; fl6.flowlabel = key->label; dsfield = key->tos; + switch (protocol) { + case IPPROTO_IPIP: + iph = ip_hdr(skb); + orig_dsfield = ipv4_get_dsfield(iph); + break; + case IPPROTO_IPV6: + ipv6h = ipv6_hdr(skb); + orig_dsfield = ipv6_get_dsfield(ipv6h); + break; + default: + orig_dsfield = dsfield; + break; + } } else { if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) encap_limit = t->parms.encap_limit; + if (protocol == IPPROTO_IPV6) { + offset = ip6_tnl_parse_tlv_enc_lim(skb, + skb_network_header(skb)); + /* ip6_tnl_parse_tlv_enc_lim() might have + * reallocated skb->head + */ + if (offset > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; - memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPIP; - - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv4_get_dsfield(iph); - else - dsfield = ip6_tclass(t->parms.flowinfo); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) - fl6.flowi6_mark = skb->mark; - else - fl6.flowi6_mark = t->parms.fwmark; - } - - fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); - dsfield = INET_ECN_encapsulate(dsfield, ipv4_get_dsfield(iph)); - - if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) - return -1; - - skb_set_inner_ipproto(skb, IPPROTO_IPIP); - - err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, - IPPROTO_IPIP); - if (err != 0) { - /* XXX: send ICMP error even if DF is not set. */ - if (err == -EMSGSIZE) - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, - htonl(mtu)); - return -1; - } - - return 0; -} - -static inline int -ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) -{ - struct ip6_tnl *t = netdev_priv(dev); - struct ipv6hdr *ipv6h; - int encap_limit = -1; - __u16 offset; - struct flowi6 fl6; - __u8 dsfield; - __u32 mtu; - u8 tproto; - int err; - - ipv6h = ipv6_hdr(skb); - tproto = READ_ONCE(t->parms.proto); - if ((tproto != IPPROTO_IPV6 && tproto != 0) || - ip6_tnl_addr_conflict(t, ipv6h)) - return -1; - - if (t->parms.collect_md) { - struct ip_tunnel_info *tun_info; - const struct ip_tunnel_key *key; - - tun_info = skb_tunnel_info(skb); - if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || - ip_tunnel_info_af(tun_info) != AF_INET6)) - return -1; - key = &tun_info->key; - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPV6; - fl6.saddr = key->u.ipv6.src; - fl6.daddr = key->u.ipv6.dst; - fl6.flowlabel = key->label; - dsfield = key->tos; - } else { - offset = ip6_tnl_parse_tlv_enc_lim(skb, skb_network_header(skb)); - /* ip6_tnl_parse_tlv_enc_lim() might have reallocated skb->head */ - ipv6h = ipv6_hdr(skb); - if (offset > 0) { - struct ipv6_tlv_tnl_enc_lim *tel; - - tel = (void *)&skb_network_header(skb)[offset]; - if (tel->encap_limit == 0) { - icmpv6_send(skb, ICMPV6_PARAMPROB, - ICMPV6_HDR_FIELD, offset + 2); - return -1; + tel = (void *)&skb_network_header(skb)[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2); + return -1; + } + encap_limit = tel->encap_limit - 1; } - encap_limit = tel->encap_limit - 1; - } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { - encap_limit = t->parms.encap_limit; } memcpy(&fl6, &t->fl.u.ip6, sizeof(fl6)); - fl6.flowi6_proto = IPPROTO_IPV6; + fl6.flowi6_proto = protocol; - if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) - dsfield = ipv6_get_dsfield(ipv6h); - else - dsfield = ip6_tclass(t->parms.flowinfo); - if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) - fl6.flowlabel |= ip6_flowlabel(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; else fl6.flowi6_mark = t->parms.fwmark; + switch (protocol) { + case IPPROTO_IPIP: + iph = ip_hdr(skb); + orig_dsfield = ipv4_get_dsfield(iph); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + dsfield = orig_dsfield; + else + dsfield = ip6_tclass(t->parms.flowinfo); + break; + case IPPROTO_IPV6: + ipv6h = ipv6_hdr(skb); + orig_dsfield = ipv6_get_dsfield(ipv6h); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS) + dsfield = orig_dsfield; + else + dsfield = ip6_tclass(t->parms.flowinfo); + if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL) + fl6.flowlabel |= ip6_flowlabel(ipv6h); + break; + default: + orig_dsfield = dsfield = ip6_tclass(t->parms.flowinfo); + break; + } } fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); - dsfield = INET_ECN_encapsulate(dsfield, ipv6_get_dsfield(ipv6h)); + dsfield = INET_ECN_encapsulate(dsfield, orig_dsfield); if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6)) return -1; - skb_set_inner_ipproto(skb, IPPROTO_IPV6); - err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, - IPPROTO_IPV6); + protocol); if (err != 0) { + /* XXX: send ICMP error even if DF is not set. */ if (err == -EMSGSIZE) - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + switch (protocol) { + case IPPROTO_IPIP: + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_FRAG_NEEDED, htonl(mtu)); + break; + case IPPROTO_IPV6: + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + break; + default: + break; + } return -1; } @@ -1415,6 +1426,7 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net_device_stats *stats = &t->dev->stats; + u8 ipproto; int ret; if (!pskb_inet_may_pull(skb)) @@ -1422,15 +1434,21 @@ ip6_tnl_start_xmit(struct sk_buff *skb, struct net_device *dev) switch (skb->protocol) { case htons(ETH_P_IP): - ret = ip4ip6_tnl_xmit(skb, dev); + ipproto = IPPROTO_IPIP; break; case htons(ETH_P_IPV6): - ret = ip6ip6_tnl_xmit(skb, dev); + if (ip6_tnl_addr_conflict(t, ipv6_hdr(skb))) + goto tx_err; + ipproto = IPPROTO_IPV6; + break; + case htons(ETH_P_MPLS_UC): + ipproto = IPPROTO_MPLS; break; default: goto tx_err; } + ret = ipxip6_tnl_xmit(skb, dev, ipproto); if (ret < 0) goto tx_err; @@ -2218,6 +2236,12 @@ static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .priority = 1, }; +static struct xfrm6_tunnel mplsip6_handler __read_mostly = { + .handler = mplsip6_rcv, + .err_handler = mplsip6_err, + .priority = 1, +}; + static void __net_exit ip6_tnl_destroy_tunnels(struct net *net, struct list_head *list) { struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); @@ -2332,6 +2356,15 @@ static int __init ip6_tunnel_init(void) pr_err("%s: can't register ip6ip6\n", __func__); goto out_ip6ip6; } + + if (ip6_tnl_mpls_supported()) { + err = xfrm6_tunnel_register(&mplsip6_handler, AF_MPLS); + if (err < 0) { + pr_err("%s: can't register mplsip6\n", __func__); + goto out_mplsip6; + } + } + err = rtnl_link_register(&ip6_link_ops); if (err < 0) goto rtnl_link_failed; @@ -2339,6 +2372,9 @@ static int __init ip6_tunnel_init(void) return 0; rtnl_link_failed: + if (ip6_tnl_mpls_supported()) + xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS); +out_mplsip6: xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6); out_ip6ip6: xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET); @@ -2361,6 +2397,9 @@ static void __exit ip6_tunnel_cleanup(void) if (xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6)) pr_info("%s: can't deregister ip6ip6\n", __func__); + if (ip6_tnl_mpls_supported() && + xfrm6_tunnel_deregister(&mplsip6_handler, AF_MPLS)) + pr_info("%s: can't deregister mplsip6\n", __func__); unregister_pernet_device(&ip6_tnl_net_ops); } diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 58956a6b66a2..cdc4d4ee2420 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -25,17 +25,12 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, goto error; if (cfg->ipv6_v6only) { - int val = 1; - - err = kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, - (char *) &val, sizeof(val)); + err = ip6_sock_set_v6only(sock->sk); if (err < 0) goto error; } if (cfg->bind_ifindex) { - err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX, - (void *)&cfg->bind_ifindex, - sizeof(cfg->bind_ifindex)); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index cc6180e08a4f..1147f647b9a0 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -296,7 +296,8 @@ static void vti6_dev_uninit(struct net_device *dev) dev_put(dev); } -static int vti6_rcv(struct sk_buff *skb) +static int vti6_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); @@ -323,7 +324,10 @@ static int vti6_rcv(struct sk_buff *skb) rcu_read_unlock(); - return xfrm6_rcv_tnl(skb, t); + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; + XFRM_SPI_SKB_CB(skb)->family = AF_INET6; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + return xfrm_input(skb, nexthdr, spi, encap_type); } rcu_read_unlock(); return -EINVAL; @@ -332,6 +336,13 @@ discard: return 0; } +static int vti6_rcv(struct sk_buff *skb) +{ + int nexthdr = skb_network_header(skb)[IP6CB(skb)->nhoff]; + + return vti6_input_proto(skb, nexthdr, 0, 0); +} + static int vti6_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; @@ -1185,6 +1196,7 @@ static struct pernet_operations vti6_net_ops = { static struct xfrm6_protocol vti_esp6_protocol __read_mostly = { .handler = vti6_rcv, + .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, @@ -1192,6 +1204,7 @@ static struct xfrm6_protocol vti_esp6_protocol __read_mostly = { static struct xfrm6_protocol vti_ah6_protocol __read_mostly = { .handler = vti6_rcv, + .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, @@ -1199,6 +1212,7 @@ static struct xfrm6_protocol vti_ah6_protocol __read_mostly = { static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = { .handler = vti6_rcv, + .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 3752bd3e92ce..99668bfebd85 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -183,6 +183,7 @@ static const struct xfrm_type ipcomp6_type = { static struct xfrm6_protocol ipcomp6_protocol = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = ipcomp6_rcv_cb, .err_handler = ipcomp6_err, .priority = 0, diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 18d05403d3b5..2c843ff5e3a9 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -136,6 +136,41 @@ static bool setsockopt_needs_rtnl(int optname) return false; } +static int do_ipv6_mcast_group_source(struct sock *sk, int optname, + struct group_source_req *greqs) +{ + int omode, add; + + if (greqs->gsr_group.ss_family != AF_INET6 || + greqs->gsr_source.ss_family != AF_INET6) + return -EADDRNOTAVAIL; + + if (optname == MCAST_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == MCAST_UNBLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 0; + } else if (optname == MCAST_JOIN_SOURCE_GROUP) { + struct sockaddr_in6 *psin6; + int retv; + + psin6 = (struct sockaddr_in6 *)&greqs->gsr_group; + retv = ipv6_sock_mc_join_ssm(sk, greqs->gsr_interface, + &psin6->sin6_addr, + MCAST_INCLUDE); + /* prior join w/ different source is ok */ + if (retv && retv != -EADDRINUSE) + return retv; + omode = MCAST_INCLUDE; + add = 1; + } else /* MCAST_LEAVE_SOURCE_GROUP */ { + omode = MCAST_INCLUDE; + add = 0; + } + return ip6_mc_source(add, omode, sk, greqs); +} + static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { @@ -183,14 +218,15 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = -EBUSY; break; } - } - if (sk->sk_protocol == IPPROTO_TCP && - sk->sk_prot != &tcpv6_prot) { - retv = -EBUSY; + } else if (sk->sk_protocol == IPPROTO_TCP) { + if (sk->sk_prot != &tcpv6_prot) { + retv = -EBUSY; + break; + } + } else { break; } - if (sk->sk_protocol != IPPROTO_TCP) - break; + if (sk->sk_state != TCP_ESTABLISHED) { retv = -ENOTCONN; break; @@ -715,7 +751,6 @@ done: case MCAST_UNBLOCK_SOURCE: { struct group_source_req greqs; - int omode, add; if (optlen < sizeof(struct group_source_req)) goto e_inval; @@ -723,34 +758,7 @@ done: retv = -EFAULT; break; } - if (greqs.gsr_group.ss_family != AF_INET6 || - greqs.gsr_source.ss_family != AF_INET6) { - retv = -EADDRNOTAVAIL; - break; - } - if (optname == MCAST_BLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 1; - } else if (optname == MCAST_UNBLOCK_SOURCE) { - omode = MCAST_EXCLUDE; - add = 0; - } else if (optname == MCAST_JOIN_SOURCE_GROUP) { - struct sockaddr_in6 *psin6; - - psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; - retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface, - &psin6->sin6_addr, - MCAST_INCLUDE); - /* prior join w/ different source is ok */ - if (retv && retv != -EADDRINUSE) - break; - omode = MCAST_INCLUDE; - add = 1; - } else /* MCAST_LEAVE_SOURCE_GROUP */ { - omode = MCAST_INCLUDE; - add = 0; - } - retv = ip6_mc_source(add, omode, sk, &greqs); + retv = do_ipv6_mcast_group_source(sk, optname, &greqs); break; } case MCAST_MSFILTER: @@ -780,7 +788,7 @@ done: retv = -EINVAL; break; } - retv = ip6_mc_msfilter(sk, gsf); + retv = ip6_mc_msfilter(sk, gsf, gsf->gf_slist); kfree(gsf); break; @@ -838,67 +846,10 @@ done: break; case IPV6_ADDR_PREFERENCES: - { - unsigned int pref = 0; - unsigned int prefmask = ~0; - if (optlen < sizeof(int)) goto e_inval; - - retv = -EINVAL; - - /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ - switch (val & (IPV6_PREFER_SRC_PUBLIC| - IPV6_PREFER_SRC_TMP| - IPV6_PREFER_SRC_PUBTMP_DEFAULT)) { - case IPV6_PREFER_SRC_PUBLIC: - pref |= IPV6_PREFER_SRC_PUBLIC; - break; - case IPV6_PREFER_SRC_TMP: - pref |= IPV6_PREFER_SRC_TMP; - break; - case IPV6_PREFER_SRC_PUBTMP_DEFAULT: - break; - case 0: - goto pref_skip_pubtmp; - default: - goto e_inval; - } - - prefmask &= ~(IPV6_PREFER_SRC_PUBLIC| - IPV6_PREFER_SRC_TMP); -pref_skip_pubtmp: - - /* check HOME/COA conflicts */ - switch (val & (IPV6_PREFER_SRC_HOME|IPV6_PREFER_SRC_COA)) { - case IPV6_PREFER_SRC_HOME: - break; - case IPV6_PREFER_SRC_COA: - pref |= IPV6_PREFER_SRC_COA; - case 0: - goto pref_skip_coa; - default: - goto e_inval; - } - - prefmask &= ~IPV6_PREFER_SRC_COA; -pref_skip_coa: - - /* check CGA/NONCGA conflicts */ - switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) { - case IPV6_PREFER_SRC_CGA: - case IPV6_PREFER_SRC_NONCGA: - case 0: - break; - default: - goto e_inval; - } - - np->srcprefs = (np->srcprefs & prefmask) | pref; - retv = 0; - + retv = __ip6_sock_set_addr_preferences(sk, val); break; - } case IPV6_MINHOPCOUNT: if (optlen < sizeof(int)) goto e_inval; @@ -973,9 +924,110 @@ int compat_ipv6_setsockopt(struct sock *sk, int level, int optname, if (level != SOL_IPV6) return -ENOPROTOOPT; - if (optname >= MCAST_JOIN_GROUP && optname <= MCAST_MSFILTER) - return compat_mc_setsockopt(sk, level, optname, optval, optlen, - ipv6_setsockopt); + switch (optname) { + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + { + struct compat_group_req __user *gr32 = (void __user *)optval; + struct group_req greq; + struct sockaddr_in6 *psin6 = (struct sockaddr_in6 *)&greq.gr_group; + + if (optlen < sizeof(struct compat_group_req)) + return -EINVAL; + + if (get_user(greq.gr_interface, &gr32->gr_interface) || + copy_from_user(&greq.gr_group, &gr32->gr_group, + sizeof(greq.gr_group))) + return -EFAULT; + + if (greq.gr_group.ss_family != AF_INET6) + return -EADDRNOTAVAIL; + + rtnl_lock(); + lock_sock(sk); + if (optname == MCAST_JOIN_GROUP) + err = ipv6_sock_mc_join(sk, greq.gr_interface, + &psin6->sin6_addr); + else + err = ipv6_sock_mc_drop(sk, greq.gr_interface, + &psin6->sin6_addr); + release_sock(sk); + rtnl_unlock(); + return err; + } + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + { + struct compat_group_source_req __user *gsr32 = (void __user *)optval; + struct group_source_req greqs; + + if (optlen < sizeof(struct compat_group_source_req)) + return -EINVAL; + + if (get_user(greqs.gsr_interface, &gsr32->gsr_interface) || + copy_from_user(&greqs.gsr_group, &gsr32->gsr_group, + sizeof(greqs.gsr_group)) || + copy_from_user(&greqs.gsr_source, &gsr32->gsr_source, + sizeof(greqs.gsr_source))) + return -EFAULT; + + rtnl_lock(); + lock_sock(sk); + err = do_ipv6_mcast_group_source(sk, optname, &greqs); + release_sock(sk); + rtnl_unlock(); + return err; + } + case MCAST_MSFILTER: + { + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter *gf32; + void *p; + int n; + + if (optlen < size0) + return -EINVAL; + if (optlen > sysctl_optmem_max - 4) + return -ENOBUFS; + + p = kmalloc(optlen + 4, GFP_KERNEL); + if (!p) + return -ENOMEM; + + gf32 = p + 4; /* we want ->gf_group and ->gf_slist aligned */ + if (copy_from_user(gf32, optval, optlen)) { + err = -EFAULT; + goto mc_msf_out; + } + + n = gf32->gf_numsrc; + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + if (n >= 0x1ffffffU || + n > sysctl_mld_max_msf) { + err = -ENOBUFS; + goto mc_msf_out; + } + if (offsetof(struct compat_group_filter, gf_slist[n]) > optlen) { + err = -EINVAL; + goto mc_msf_out; + } + + rtnl_lock(); + lock_sock(sk); + err = ip6_mc_msfilter(sk, &(struct group_filter){ + .gf_interface = gf32->gf_interface, + .gf_group = gf32->gf_group, + .gf_fmode = gf32->gf_fmode, + .gf_numsrc = gf32->gf_numsrc}, gf32->gf_slist); + release_sock(sk); + rtnl_unlock(); +mc_msf_out: + kfree(p); + return err; + } + } err = do_ipv6_setsockopt(sk, level, optname, optval, optlen); #ifdef CONFIG_NETFILTER @@ -1048,18 +1100,28 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case MCAST_MSFILTER: { + struct group_filter __user *p = (void __user *)optval; struct group_filter gsf; + const int size0 = offsetof(struct group_filter, gf_slist); + int num; int err; - if (len < GROUP_FILTER_SIZE(0)) + if (len < size0) return -EINVAL; - if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) + if (copy_from_user(&gsf, p, size0)) return -EFAULT; if (gsf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; + num = gsf.gf_numsrc; lock_sock(sk); - err = ip6_mc_msfget(sk, &gsf, - (struct group_filter __user *)optval, optlen); + err = ip6_mc_msfget(sk, &gsf, p->gf_slist); + if (!err) { + if (num > gsf.gf_numsrc) + num = gsf.gf_numsrc; + if (put_user(GROUP_FILTER_SIZE(num), optlen) || + copy_to_user(p, &gsf, size0)) + err = -EFAULT; + } release_sock(sk); return err; } @@ -1075,6 +1137,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, msg.msg_control = optval; msg.msg_controllen = len; msg.msg_flags = flags; + msg.msg_control_is_user = true; lock_sock(sk); skb = np->pktoptions; @@ -1427,9 +1490,44 @@ int compat_ipv6_getsockopt(struct sock *sk, int level, int optname, if (level != SOL_IPV6) return -ENOPROTOOPT; - if (optname == MCAST_MSFILTER) - return compat_mc_getsockopt(sk, level, optname, optval, optlen, - ipv6_getsockopt); + if (optname == MCAST_MSFILTER) { + const int size0 = offsetof(struct compat_group_filter, gf_slist); + struct compat_group_filter __user *p = (void __user *)optval; + struct compat_group_filter gf32; + struct group_filter gf; + int ulen, err; + int num; + + if (get_user(ulen, optlen)) + return -EFAULT; + + if (ulen < size0) + return -EINVAL; + + if (copy_from_user(&gf32, p, size0)) + return -EFAULT; + + gf.gf_interface = gf32.gf_interface; + gf.gf_fmode = gf32.gf_fmode; + num = gf.gf_numsrc = gf32.gf_numsrc; + gf.gf_group = gf32.gf_group; + + if (gf.gf_group.ss_family != AF_INET6) + return -EADDRNOTAVAIL; + lock_sock(sk); + err = ip6_mc_msfget(sk, &gf, p->gf_slist); + release_sock(sk); + if (err) + return err; + if (num > gf.gf_numsrc) + num = gf.gf_numsrc; + ulen = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32)); + if (put_user(ulen, optlen) || + put_user(gf.gf_fmode, &p->gf_fmode) || + put_user(gf.gf_numsrc, &p->gf_numsrc)) + return -EFAULT; + return 0; + } err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, MSG_CMSG_COMPAT); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index eaa4c2cc2fbb..7e12d2114158 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -457,7 +457,8 @@ done: return err; } -int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) +int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, + struct sockaddr_storage *list) { const struct in6_addr *group; struct ipv6_mc_socklist *pmc; @@ -509,10 +510,10 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) goto done; } newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc; - for (i = 0; i < newpsl->sl_count; ++i) { + for (i = 0; i < newpsl->sl_count; ++i, ++list) { struct sockaddr_in6 *psin6; - psin6 = (struct sockaddr_in6 *)&gsf->gf_slist[i]; + psin6 = (struct sockaddr_in6 *)list; newpsl->sl_addr[i] = psin6->sin6_addr; } err = ip6_mc_add_src(idev, group, gsf->gf_fmode, @@ -547,7 +548,7 @@ done: } int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, - struct group_filter __user *optval, int __user *optlen) + struct sockaddr_storage *p) { int err, i, count, copycount; const struct in6_addr *group; @@ -592,14 +593,10 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; - if (put_user(GROUP_FILTER_SIZE(copycount), optlen) || - copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { - return -EFAULT; - } /* changes to psl require the socket lock, and a write lock * on pmc->sflock. We have the socket lock so reading here is safe. */ - for (i = 0; i < copycount; i++) { + for (i = 0; i < copycount; i++, p++) { struct sockaddr_in6 *psin6; struct sockaddr_storage ss; @@ -607,7 +604,7 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, memset(&ss, 0, sizeof(ss)); psin6->sin6_family = AF_INET6; psin6->sin6_addr = psl->sl_addr[i]; - if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss))) + if (copy_to_user(p, &ss, sizeof(ss))) return -EFAULT; } return 0; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 1ecd4e9b0bdf..27f29b957ee7 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1302,7 +1302,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) } } if (rt && lifetime == 0) { - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); rt = NULL; } @@ -1835,7 +1835,8 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, } } -int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) +int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct net_device *dev = ctl->extra1; struct inet6_dev *idev; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 0028aa1d7869..8ef5a7b30524 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1377,6 +1377,7 @@ const struct proto_ops inet6_sockraw_ops = { .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ff847a324220..82cbb46a2a4f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -984,7 +984,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, gwaddr, dev); if (rt && !lifetime) { - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); rt = NULL; } @@ -1377,7 +1377,7 @@ static struct rt6_info *ip6_rt_pcpu_alloc(const struct fib6_result *res) rcu_read_lock(); dev = ip6_rt_get_dev_rcu(res); - pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags); + pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags | DST_NOCOUNT); rcu_read_unlock(); if (!pcpu_rt) { fib6_info_release(f6i); @@ -3197,6 +3197,9 @@ static int ip6_dst_gc(struct dst_ops *ops) int entries; entries = dst_entries_get_fast(ops); + if (entries > rt_max_size) + entries = dst_entries_get_slow(ops); + if (time_after(rt_last_gc + rt_min_interval, jiffies) && entries <= rt_max_size) goto out; @@ -3418,6 +3421,11 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, #ifdef CONFIG_IPV6_ROUTER_PREF fib6_nh->last_probe = jiffies; #endif + if (cfg->fc_is_fdb) { + fib6_nh->fib_nh_gw6 = cfg->fc_gateway; + fib6_nh->fib_nh_gw_family = AF_INET6; + return 0; + } err = -ENODEV; if (cfg->fc_ifindex) { @@ -3756,9 +3764,12 @@ out: return err; } -int ip6_del_rt(struct net *net, struct fib6_info *rt) +int ip6_del_rt(struct net *net, struct fib6_info *rt, bool skip_notify) { - struct nl_info info = { .nl_net = net }; + struct nl_info info = { + .nl_net = net, + .skip_notify = skip_notify + }; return __ip6_del_rt(rt, &info); } @@ -4279,7 +4290,7 @@ restart: (!idev || idev->cnf.accept_ra != 2) && fib6_info_hold_safe(rt)) { rcu_read_unlock(); - ip6_del_rt(net, rt); + ip6_del_rt(net, rt, false); goto restart; } } @@ -4330,41 +4341,29 @@ static void rtmsg_to_fib6_config(struct net *net, }; } -int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg) +int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) { struct fib6_config cfg; - struct in6_rtmsg rtmsg; int err; - switch (cmd) { - case SIOCADDRT: /* Add a route */ - case SIOCDELRT: /* Delete a route */ - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - return -EPERM; - err = copy_from_user(&rtmsg, arg, - sizeof(struct in6_rtmsg)); - if (err) - return -EFAULT; - - rtmsg_to_fib6_config(net, &rtmsg, &cfg); + if (cmd != SIOCADDRT && cmd != SIOCDELRT) + return -EINVAL; + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; - rtnl_lock(); - switch (cmd) { - case SIOCADDRT: - err = ip6_route_add(&cfg, GFP_KERNEL, NULL); - break; - case SIOCDELRT: - err = ip6_route_del(&cfg, NULL); - break; - default: - err = -EINVAL; - } - rtnl_unlock(); + rtmsg_to_fib6_config(net, rtmsg, &cfg); - return err; + rtnl_lock(); + switch (cmd) { + case SIOCADDRT: + err = ip6_route_add(&cfg, GFP_KERNEL, NULL); + break; + case SIOCDELRT: + err = ip6_route_del(&cfg, NULL); + break; } - - return -EINVAL; + rtnl_unlock(); + return err; } /* @@ -5581,7 +5580,8 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, if (nexthop_is_blackhole(rt->nh)) rtm->rtm_type = RTN_BLACKHOLE; - if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) + if (net->ipv4.sysctl_nexthop_compat_mode && + rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0) goto nla_put_failure; rtm->rtm_flags |= nh_flags; @@ -6115,9 +6115,8 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v) #ifdef CONFIG_SYSCTL -static -int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int delay; @@ -6417,6 +6416,35 @@ void __init ip6_route_init_special_entries(void) #endif } +#if IS_BUILTIN(CONFIG_IPV6) +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *rt) + +static const struct bpf_iter_reg ipv6_route_reg_info = { + .target = "ipv6_route", + .seq_ops = &ipv6_route_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct ipv6_route_iter), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__ipv6_route, rt), + PTR_TO_BTF_ID_OR_NULL }, + }, +}; + +static int __init bpf_iter_register(void) +{ + return bpf_iter_reg_target(&ipv6_route_reg_info); +} + +static void bpf_iter_unregister(void) +{ + bpf_iter_unreg_target(&ipv6_route_reg_info); +} +#endif +#endif + int __init ip6_route_init(void) { int ret; @@ -6479,6 +6507,14 @@ int __init ip6_route_init(void) if (ret) goto out_register_late_subsys; +#if IS_BUILTIN(CONFIG_IPV6) +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + ret = bpf_iter_register(); + if (ret) + goto out_register_late_subsys; +#endif +#endif + for_each_possible_cpu(cpu) { struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu); @@ -6511,6 +6547,11 @@ out_kmem_cache: void ip6_route_cleanup(void) { +#if IS_BUILTIN(CONFIG_IPV6) +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + bpf_iter_unregister(); +#endif +#endif unregister_netdevice_notifier(&ip6_route_dev_notifier); unregister_pernet_subsys(&ip6_route_net_late_ops); fib6_rules_cleanup(); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 98954830c40b..1fbb4dfbb191 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -83,6 +83,13 @@ struct sit_net { struct net_device *fb_tunnel_dev; }; +static inline struct sit_net *dev_to_sit_net(struct net_device *dev) +{ + struct ip_tunnel *t = netdev_priv(dev); + + return net_generic(t->net, sit_net_id); +} + /* * Must be invoked with rcu_read_lock */ @@ -291,14 +298,18 @@ __ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr) } -static int ipip6_tunnel_get_prl(struct ip_tunnel *t, - struct ip_tunnel_prl __user *a) +static int ipip6_tunnel_get_prl(struct net_device *dev, struct ifreq *ifr) { + struct ip_tunnel_prl __user *a = ifr->ifr_ifru.ifru_data; + struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_prl kprl, *kp; struct ip_tunnel_prl_entry *prl; unsigned int cmax, c = 0, ca, len; int ret = 0; + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) + return -EINVAL; + if (copy_from_user(&kprl, a, sizeof(kprl))) return -EFAULT; cmax = kprl.datalen / sizeof(kprl); @@ -441,6 +452,35 @@ out: return err; } +static int ipip6_tunnel_prl_ctl(struct net_device *dev, struct ifreq *ifr, + int cmd) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_prl prl; + int err; + + if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) + return -EINVAL; + + if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl))) + return -EFAULT; + + switch (cmd) { + case SIOCDELPRL: + err = ipip6_tunnel_del_prl(t, &prl); + break; + case SIOCADDPRL: + case SIOCCHGPRL: + err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); + break; + } + dst_cache_reset(&t->dst_cache); + netdev_state_change(dev); + return err; +} + static int isatap_chksrc(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *t) { @@ -1151,7 +1191,53 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, netdev_state_change(t->dev); return 0; } -#endif + +static int +ipip6_tunnel_get6rd(struct net_device *dev, struct ifreq *ifr) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_6rd ip6rd; + struct ip_tunnel_parm p; + + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + return -EFAULT; + t = ipip6_tunnel_locate(t->net, &p, 0); + } + if (!t) + t = netdev_priv(dev); + + ip6rd.prefix = t->ip6rd.prefix; + ip6rd.relay_prefix = t->ip6rd.relay_prefix; + ip6rd.prefixlen = t->ip6rd.prefixlen; + ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; + if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd, sizeof(ip6rd))) + return -EFAULT; + return 0; +} + +static int +ipip6_tunnel_6rdctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct ip_tunnel *t = netdev_priv(dev); + struct ip_tunnel_6rd ip6rd; + int err; + + if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data, sizeof(ip6rd))) + return -EFAULT; + + if (cmd != SIOCDEL6RD) { + err = ipip6_tunnel_update_6rd(t, &ip6rd); + if (err < 0) + return err; + } else + ipip6_tunnel_clone_6rd(dev, dev_to_sit_net(dev)); + return 0; +} + +#endif /* CONFIG_IPV6_SIT_6RD */ static bool ipip6_valid_ip_proto(u8 ipproto) { @@ -1164,185 +1250,145 @@ static bool ipip6_valid_ip_proto(u8 ipproto) } static int -ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +__ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p) +{ + if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if (!ipip6_valid_ip_proto(p->iph.protocol)) + return -EINVAL; + if (p->iph.version != 4 || + p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF))) + return -EINVAL; + + if (p->iph.ttl) + p->iph.frag_off |= htons(IP_DF); + return 0; +} + +static int +ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm *p) { - int err = 0; - struct ip_tunnel_parm p; - struct ip_tunnel_prl prl; struct ip_tunnel *t = netdev_priv(dev); - struct net *net = t->net; - struct sit_net *sitn = net_generic(net, sit_net_id); -#ifdef CONFIG_IPV6_SIT_6RD - struct ip_tunnel_6rd ip6rd; -#endif - switch (cmd) { - case SIOCGETTUNNEL: -#ifdef CONFIG_IPV6_SIT_6RD - case SIOCGET6RD: -#endif - if (dev == sitn->fb_tunnel_dev) { - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { - err = -EFAULT; - break; - } - t = ipip6_tunnel_locate(net, &p, 0); - if (!t) - t = netdev_priv(dev); - } + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) + t = ipip6_tunnel_locate(t->net, p, 0); + if (!t) + t = netdev_priv(dev); + memcpy(p, &t->parms, sizeof(*p)); + return 0; +} - err = -EFAULT; - if (cmd == SIOCGETTUNNEL) { - memcpy(&p, &t->parms, sizeof(p)); - if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, - sizeof(p))) - goto done; -#ifdef CONFIG_IPV6_SIT_6RD +static int +ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm *p) +{ + struct ip_tunnel *t = netdev_priv(dev); + int err; + + err = __ipip6_tunnel_ioctl_validate(t->net, p); + if (err) + return err; + + t = ipip6_tunnel_locate(t->net, p, 1); + if (!t) + return -ENOBUFS; + return 0; +} + +static int +ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm *p) +{ + struct ip_tunnel *t = netdev_priv(dev); + int err; + + err = __ipip6_tunnel_ioctl_validate(t->net, p); + if (err) + return err; + + t = ipip6_tunnel_locate(t->net, p, 0); + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { + if (!t) + return -ENOENT; + } else { + if (t) { + if (t->dev != dev) + return -EEXIST; } else { - ip6rd.prefix = t->ip6rd.prefix; - ip6rd.relay_prefix = t->ip6rd.relay_prefix; - ip6rd.prefixlen = t->ip6rd.prefixlen; - ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; - if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd, - sizeof(ip6rd))) - goto done; -#endif + if (((dev->flags & IFF_POINTOPOINT) && !p->iph.daddr) || + (!(dev->flags & IFF_POINTOPOINT) && p->iph.daddr)) + return -EINVAL; + t = netdev_priv(dev); } - err = 0; - break; - case SIOCADDTUNNEL: - case SIOCCHGTUNNEL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; + ipip6_tunnel_update(t, p, t->fwmark); + } - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; - - err = -EINVAL; - if (!ipip6_valid_ip_proto(p.iph.protocol)) - goto done; - if (p.iph.version != 4 || - p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) - goto done; - if (p.iph.ttl) - p.iph.frag_off |= htons(IP_DF); - - t = ipip6_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL); - - if (dev != sitn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { - if (t) { - if (t->dev != dev) { - err = -EEXIST; - break; - } - } else { - if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || - (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { - err = -EINVAL; - break; - } - t = netdev_priv(dev); - } + return 0; +} - ipip6_tunnel_update(t, &p, t->fwmark); - } +static int +ipip6_tunnel_del(struct net_device *dev, struct ip_tunnel_parm *p) +{ + struct ip_tunnel *t = netdev_priv(dev); - if (t) { - err = 0; - if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) - err = -EFAULT; - } else - err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); - break; + if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) + return -EPERM; + + if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { + t = ipip6_tunnel_locate(t->net, p, 0); + if (!t) + return -ENOENT; + if (t == netdev_priv(dev_to_sit_net(dev)->fb_tunnel_dev)) + return -EPERM; + dev = t->dev; + } + unregister_netdevice(dev); + return 0; +} +static int +ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) +{ + switch (cmd) { + case SIOCGETTUNNEL: + return ipip6_tunnel_get(dev, p); + case SIOCADDTUNNEL: + return ipip6_tunnel_add(dev, p); + case SIOCCHGTUNNEL: + return ipip6_tunnel_change(dev, p); case SIOCDELTUNNEL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - - if (dev == sitn->fb_tunnel_dev) { - err = -EFAULT; - if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) - goto done; - err = -ENOENT; - t = ipip6_tunnel_locate(net, &p, 0); - if (!t) - goto done; - err = -EPERM; - if (t == netdev_priv(sitn->fb_tunnel_dev)) - goto done; - dev = t->dev; - } - unregister_netdevice(dev); - err = 0; - break; + return ipip6_tunnel_del(dev, p); + default: + return -EINVAL; + } +} +static int +ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + switch (cmd) { + case SIOCGETTUNNEL: + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + case SIOCDELTUNNEL: + return ip_tunnel_ioctl(dev, ifr, cmd); case SIOCGETPRL: - err = -EINVAL; - if (dev == sitn->fb_tunnel_dev) - goto done; - err = ipip6_tunnel_get_prl(t, ifr->ifr_ifru.ifru_data); - break; - + return ipip6_tunnel_get_prl(dev, ifr); case SIOCADDPRL: case SIOCDELPRL: case SIOCCHGPRL: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - err = -EINVAL; - if (dev == sitn->fb_tunnel_dev) - goto done; - err = -EFAULT; - if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl))) - goto done; - - switch (cmd) { - case SIOCDELPRL: - err = ipip6_tunnel_del_prl(t, &prl); - break; - case SIOCADDPRL: - case SIOCCHGPRL: - err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); - break; - } - dst_cache_reset(&t->dst_cache); - netdev_state_change(dev); - break; - + return ipip6_tunnel_prl_ctl(dev, ifr, cmd); #ifdef CONFIG_IPV6_SIT_6RD + case SIOCGET6RD: + return ipip6_tunnel_get6rd(dev, ifr); case SIOCADD6RD: case SIOCCHG6RD: case SIOCDEL6RD: - err = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) - goto done; - - err = -EFAULT; - if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data, - sizeof(ip6rd))) - goto done; - - if (cmd != SIOCDEL6RD) { - err = ipip6_tunnel_update_6rd(t, &ip6rd); - if (err < 0) - goto done; - } else - ipip6_tunnel_clone_6rd(dev, sitn); - - err = 0; - break; + return ipip6_tunnel_6rdctl(dev, ifr, cmd); #endif - default: - err = -EINVAL; + return -EINVAL; } - -done: - return err; } static const struct net_device_ops ipip6_netdev_ops = { @@ -1352,6 +1398,7 @@ static const struct net_device_ops ipip6_netdev_ops = { .ndo_do_ioctl = ipip6_tunnel_ioctl, .ndo_get_stats64 = ip_tunnel_get_stats64, .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_tunnel_ctl = ipip6_tunnel_ctl, }; static void ipip6_dev_free(struct net_device *dev) diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 63b657aa8d29..fac2135aa47b 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -26,8 +26,7 @@ static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 413b3425ac66..f67d45ff00b4 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -463,6 +463,8 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (fastopen && !fastopen->sk) break; + ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th); + if (!sock_owned_by_user(sk)) { sk->sk_err = err; sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ @@ -471,6 +473,15 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } else sk->sk_err_soft = err; goto out; + case TCP_LISTEN: + break; + default: + /* check if this ICMP message allows revert of backoff. + * (see RFC 6069) + */ + if (!fastopen && type == ICMPV6_DEST_UNREACH && + code == ICMPV6_NOROUTE) + tcp_ld_RTO_revert(sk, seq); } if (!sock_owned_by_user(sk) && np->recverr) { @@ -2110,6 +2121,7 @@ struct proto tcpv6_prot = { #endif .diag_destroy = tcp_abort, }; +EXPORT_SYMBOL_GPL(tcpv6_prot); /* thinking of making this const? Don't. * early_demux can change based on sysctl. diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index 21e7b95ddbfa..06c02ebe6b9b 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -21,8 +21,14 @@ static struct xfrm6_tunnel __rcu *tunnel6_handlers __read_mostly; static struct xfrm6_tunnel __rcu *tunnel46_handlers __read_mostly; +static struct xfrm6_tunnel __rcu *tunnelmpls6_handlers __read_mostly; static DEFINE_MUTEX(tunnel6_mutex); +static inline int xfrm6_tunnel_mpls_supported(void) +{ + return IS_ENABLED(CONFIG_MPLS); +} + int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) { struct xfrm6_tunnel __rcu **pprev; @@ -32,8 +38,21 @@ int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) mutex_lock(&tunnel6_mutex); - for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; - (t = rcu_dereference_protected(*pprev, + switch (family) { + case AF_INET6: + pprev = &tunnel6_handlers; + break; + case AF_INET: + pprev = &tunnel46_handlers; + break; + case AF_MPLS: + pprev = &tunnelmpls6_handlers; + break; + default: + goto err; + } + + for (; (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel6_mutex))) != NULL; pprev = &t->next) { if (t->priority > priority) @@ -62,8 +81,21 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) mutex_lock(&tunnel6_mutex); - for (pprev = (family == AF_INET6) ? &tunnel6_handlers : &tunnel46_handlers; - (t = rcu_dereference_protected(*pprev, + switch (family) { + case AF_INET6: + pprev = &tunnel6_handlers; + break; + case AF_INET: + pprev = &tunnel46_handlers; + break; + case AF_MPLS: + pprev = &tunnelmpls6_handlers; + break; + default: + goto err; + } + + for (; (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel6_mutex))) != NULL; pprev = &t->next) { if (t == handler) { @@ -73,6 +105,7 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) } } +err: mutex_unlock(&tunnel6_mutex); synchronize_net(); @@ -86,6 +119,24 @@ EXPORT_SYMBOL(xfrm6_tunnel_deregister); handler != NULL; \ handler = rcu_dereference(handler->next)) \ +static int tunnelmpls6_rcv(struct sk_buff *skb) +{ + struct xfrm6_tunnel *handler; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto drop; + + for_each_tunnel_rcu(tunnelmpls6_handlers, handler) + if (!handler->handler(skb)) + return 0; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + +drop: + kfree_skb(skb); + return 0; +} + static int tunnel6_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; @@ -146,6 +197,18 @@ static int tunnel46_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return -ENOENT; } +static int tunnelmpls6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + u8 type, u8 code, int offset, __be32 info) +{ + struct xfrm6_tunnel *handler; + + for_each_tunnel_rcu(tunnelmpls6_handlers, handler) + if (!handler->err_handler(skb, opt, type, code, offset, info)) + return 0; + + return -ENOENT; +} + static const struct inet6_protocol tunnel6_protocol = { .handler = tunnel6_rcv, .err_handler = tunnel6_err, @@ -158,6 +221,12 @@ static const struct inet6_protocol tunnel46_protocol = { .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; +static const struct inet6_protocol tunnelmpls6_protocol = { + .handler = tunnelmpls6_rcv, + .err_handler = tunnelmpls6_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + static int __init tunnel6_init(void) { if (inet6_add_protocol(&tunnel6_protocol, IPPROTO_IPV6)) { @@ -169,6 +238,13 @@ static int __init tunnel6_init(void) inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); return -EAGAIN; } + if (xfrm6_tunnel_mpls_supported() && + inet6_add_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) { + pr_err("%s: can't add protocol\n", __func__); + inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6); + inet6_del_protocol(&tunnel46_protocol, IPPROTO_IPIP); + return -EAGAIN; + } return 0; } @@ -178,6 +254,9 @@ static void __exit tunnel6_fini(void) pr_err("%s: can't remove protocol\n", __func__); if (inet6_del_protocol(&tunnel6_protocol, IPPROTO_IPV6)) pr_err("%s: can't remove protocol\n", __func__); + if (xfrm6_tunnel_mpls_supported() && + inet6_del_protocol(&tunnelmpls6_protocol, IPPROTO_MPLS)) + pr_err("%s: can't remove protocol\n", __func__); } module_init(tunnel6_init); diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index a52cb3fc6df5..04cbeefd8982 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -17,11 +17,6 @@ #include <net/ipv6.h> #include <net/xfrm.h> -int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb) -{ - return xfrm6_extract_header(skb); -} - int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, struct ip6_tnl *t) { @@ -35,9 +30,12 @@ EXPORT_SYMBOL(xfrm6_rcv_spi); static int xfrm6_transport_finish2(struct net *net, struct sock *sk, struct sk_buff *skb) { - if (xfrm_trans_queue(skb, ip6_rcv_finish)) - __kfree_skb(skb); - return -1; + if (xfrm_trans_queue(skb, ip6_rcv_finish)) { + kfree_skb(skb); + return NET_RX_DROP; + } + + return 0; } int xfrm6_transport_finish(struct sk_buff *skb, int async) @@ -60,13 +58,106 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async) if (xo && (xo->flags & XFRM_GRO)) { skb_mac_header_rebuild(skb); skb_reset_transport_header(skb); - return -1; + return 0; } NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, dev_net(skb->dev), NULL, skb, skb->dev, NULL, xfrm6_transport_finish2); - return -1; + return 0; +} + +/* If it's a keepalive packet, then just eat it. + * If it's an encapsulated packet, then pass it to the + * IPsec xfrm input. + * Returns 0 if skb passed to xfrm or was dropped. + * Returns >0 if skb should be passed to UDP. + * Returns <0 if skb should be resubmitted (-ret is protocol) + */ +int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct udp_sock *up = udp_sk(sk); + struct udphdr *uh; + struct ipv6hdr *ip6h; + int len; + int ip6hlen = sizeof(struct ipv6hdr); + + __u8 *udpdata; + __be32 *udpdata32; + __u16 encap_type = up->encap_type; + + /* if this is not encapsulated socket, then just return now */ + if (!encap_type) + return 1; + + /* If this is a paged skb, make sure we pull up + * whatever data we need to look at. */ + len = skb->len - sizeof(struct udphdr); + if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8))) + return 1; + + /* Now we can get the pointers */ + uh = udp_hdr(skb); + udpdata = (__u8 *)uh + sizeof(struct udphdr); + udpdata32 = (__be32 *)udpdata; + + switch (encap_type) { + default: + case UDP_ENCAP_ESPINUDP: + /* Check if this is a keepalive packet. If so, eat it. */ + if (len == 1 && udpdata[0] == 0xff) { + goto drop; + } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) { + /* ESP Packet without Non-ESP header */ + len = sizeof(struct udphdr); + } else + /* Must be an IKE packet.. pass it through */ + return 1; + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + /* Check if this is a keepalive packet. If so, eat it. */ + if (len == 1 && udpdata[0] == 0xff) { + goto drop; + } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) && + udpdata32[0] == 0 && udpdata32[1] == 0) { + + /* ESP Packet with Non-IKE marker */ + len = sizeof(struct udphdr) + 2 * sizeof(u32); + } else + /* Must be an IKE packet.. pass it through */ + return 1; + break; + } + + /* At this point we are sure that this is an ESPinUDP packet, + * so we need to remove 'len' bytes from the packet (the UDP + * header and optional ESP marker bytes) and then modify the + * protocol to ESP, and then call into the transform receiver. + */ + if (skb_unclone(skb, GFP_ATOMIC)) + goto drop; + + /* Now we can update and verify the packet length... */ + ip6h = ipv6_hdr(skb); + ip6h->payload_len = htons(ntohs(ip6h->payload_len) - len); + if (skb->len < ip6hlen + len) { + /* packet is too small!?! */ + goto drop; + } + + /* pull the data buffer up to the ESP header and set the + * transport header to point to ESP. Keep UDP on the stack + * for later. + */ + __skb_pull(skb, len); + skb_reset_transport_header(skb); + + /* process ESP */ + return xfrm6_rcv_encap(skb, IPPROTO_ESP, 0, encap_type); + +drop: + kfree_skb(skb); + return 0; } int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t) diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index e34167f790e6..8b84d534b19d 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -23,24 +23,7 @@ int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, } EXPORT_SYMBOL(xfrm6_find_1stfragopt); -static int xfrm6_local_dontfrag(struct sk_buff *skb) -{ - int proto; - struct sock *sk = skb->sk; - - if (sk) { - if (sk->sk_family != AF_INET6) - return 0; - - proto = sk->sk_protocol; - if (proto == IPPROTO_UDP || proto == IPPROTO_RAW) - return inet6_sk(sk)->dontfrag; - } - - return 0; -} - -static void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu) +void xfrm6_local_rxpmtu(struct sk_buff *skb, u32 mtu) { struct flowi6 fl6; struct sock *sk = skb->sk; @@ -64,80 +47,9 @@ void xfrm6_local_error(struct sk_buff *skb, u32 mtu) ipv6_local_error(sk, EMSGSIZE, &fl6, mtu); } -static int xfrm6_tunnel_check_size(struct sk_buff *skb) -{ - int mtu, ret = 0; - struct dst_entry *dst = skb_dst(skb); - - if (skb->ignore_df) - goto out; - - mtu = dst_mtu(dst); - if (mtu < IPV6_MIN_MTU) - mtu = IPV6_MIN_MTU; - - if ((!skb_is_gso(skb) && skb->len > mtu) || - (skb_is_gso(skb) && - !skb_gso_validate_network_len(skb, ip6_skb_dst_mtu(skb)))) { - skb->dev = dst->dev; - skb->protocol = htons(ETH_P_IPV6); - - if (xfrm6_local_dontfrag(skb)) - xfrm6_local_rxpmtu(skb, mtu); - else if (skb->sk) - xfrm_local_error(skb, mtu); - else - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - ret = -EMSGSIZE; - } -out: - return ret; -} - -int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) -{ - int err; - - err = xfrm6_tunnel_check_size(skb); - if (err) - return err; - - XFRM_MODE_SKB_CB(skb)->protocol = ipv6_hdr(skb)->nexthdr; - - return xfrm6_extract_header(skb); -} - -int xfrm6_output_finish(struct sock *sk, struct sk_buff *skb) -{ - memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); - - IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; - - return xfrm_output(sk, skb); -} - -static int __xfrm6_output_state_finish(struct xfrm_state *x, struct sock *sk, - struct sk_buff *skb) -{ - const struct xfrm_state_afinfo *afinfo; - int ret = -EAFNOSUPPORT; - - rcu_read_lock(); - afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family); - if (likely(afinfo)) - ret = afinfo->output_finish(sk, skb); - else - kfree_skb(skb); - rcu_read_unlock(); - - return ret; -} - static int __xfrm6_output_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct xfrm_state *x = skb_dst(skb)->xfrm; - - return __xfrm6_output_state_finish(x, sk, skb); + return xfrm_output(sk, skb); } static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) @@ -164,7 +76,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) toobig = skb->len > mtu && !skb_is_gso(skb); - if (toobig && xfrm6_local_dontfrag(skb)) { + if (toobig && xfrm6_local_dontfrag(skb->sk)) { xfrm6_local_rxpmtu(skb, mtu); kfree_skb(skb); return -EMSGSIZE; @@ -179,7 +91,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) __xfrm6_output_finish); skip_frag: - return __xfrm6_output_state_finish(x, sk, skb); + return xfrm_output(sk, skb); } int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) diff --git a/net/ipv6/xfrm6_protocol.c b/net/ipv6/xfrm6_protocol.c index 34cb65c7d5a7..ea2f805d3b01 100644 --- a/net/ipv6/xfrm6_protocol.c +++ b/net/ipv6/xfrm6_protocol.c @@ -14,6 +14,7 @@ #include <linux/mutex.h> #include <linux/skbuff.h> #include <linux/icmpv6.h> +#include <net/ip6_route.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/xfrm.h> @@ -58,6 +59,53 @@ static int xfrm6_rcv_cb(struct sk_buff *skb, u8 protocol, int err) return 0; } +int xfrm6_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, + int encap_type) +{ + int ret; + struct xfrm6_protocol *handler; + struct xfrm6_protocol __rcu **head = proto_handlers(nexthdr); + + XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; + XFRM_SPI_SKB_CB(skb)->family = AF_INET6; + XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); + + if (!head) + goto out; + + if (!skb_dst(skb)) { + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + int flags = RT6_LOOKUP_F_HAS_SADDR; + struct dst_entry *dst; + struct flowi6 fl6 = { + .flowi6_iif = skb->dev->ifindex, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_mark = skb->mark, + .flowi6_proto = ip6h->nexthdr, + }; + + dst = ip6_route_input_lookup(dev_net(skb->dev), skb->dev, &fl6, + skb, flags); + if (dst->error) + goto drop; + skb_dst_set(skb, dst); + } + + for_each_protocol_rcu(*head, handler) + if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) + return ret; + +out: + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); + +drop: + kfree_skb(skb); + return 0; +} +EXPORT_SYMBOL(xfrm6_rcv_encap); + static int xfrm6_esp_rcv(struct sk_buff *skb) { int ret; diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index 78daadecbdef..6610b2198fa9 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -13,37 +13,11 @@ */ #include <net/xfrm.h> -#include <linux/pfkeyv2.h> -#include <linux/ipsec.h> -#include <linux/netfilter_ipv6.h> -#include <linux/export.h> -#include <net/dsfield.h> -#include <net/ipv6.h> -#include <net/addrconf.h> - -int xfrm6_extract_header(struct sk_buff *skb) -{ - struct ipv6hdr *iph = ipv6_hdr(skb); - - XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); - XFRM_MODE_SKB_CB(skb)->id = 0; - XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF); - XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph); - XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit; - XFRM_MODE_SKB_CB(skb)->optlen = 0; - memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl, - sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); - - return 0; -} static struct xfrm_state_afinfo xfrm6_state_afinfo = { .family = AF_INET6, .proto = IPPROTO_IPV6, .output = xfrm6_output, - .output_finish = xfrm6_output_finish, - .extract_input = xfrm6_extract_input, - .extract_output = xfrm6_extract_output, .transport_finish = xfrm6_transport_finish, .local_error = xfrm6_local_error, }; diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index c4bdcbc84b07..ee0add15497d 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -16,6 +16,7 @@ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/types.h> +#include <linux/limits.h> #include <linux/list.h> #include <linux/errno.h> #include <linux/kernel.h> @@ -36,8 +37,6 @@ static char iucv_userid[80]; -static const struct proto_ops iucv_sock_ops; - static struct proto iucv_proto = { .name = "AF_IUCV", .owner = THIS_MODULE, @@ -85,14 +84,11 @@ do { \ __ret; \ }) +static struct sock *iucv_accept_dequeue(struct sock *parent, + struct socket *newsock); static void iucv_sock_kill(struct sock *sk); static void iucv_sock_close(struct sock *sk); -static void iucv_sever_path(struct sock *, int); -static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev); -static int afiucv_hs_send(struct iucv_message *imsg, struct sock *sock, - struct sk_buff *skb, u8 flags); static void afiucv_hs_callback_txnotify(struct sk_buff *, enum iucv_tx_notify); /* Call Back functions */ @@ -127,110 +123,6 @@ static inline void low_nmcpy(unsigned char *dst, char *src) memcpy(&dst[8], src, 8); } -static int afiucv_pm_prepare(struct device *dev) -{ -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "afiucv_pm_prepare\n"); -#endif - return 0; -} - -static void afiucv_pm_complete(struct device *dev) -{ -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "afiucv_pm_complete\n"); -#endif -} - -/** - * afiucv_pm_freeze() - Freeze PM callback - * @dev: AFIUCV dummy device - * - * Sever all established IUCV communication pathes - */ -static int afiucv_pm_freeze(struct device *dev) -{ - struct iucv_sock *iucv; - struct sock *sk; - -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "afiucv_pm_freeze\n"); -#endif - read_lock(&iucv_sk_list.lock); - sk_for_each(sk, &iucv_sk_list.head) { - iucv = iucv_sk(sk); - switch (sk->sk_state) { - case IUCV_DISCONN: - case IUCV_CLOSING: - case IUCV_CONNECTED: - iucv_sever_path(sk, 0); - break; - case IUCV_OPEN: - case IUCV_BOUND: - case IUCV_LISTEN: - case IUCV_CLOSED: - default: - break; - } - skb_queue_purge(&iucv->send_skb_q); - skb_queue_purge(&iucv->backlog_skb_q); - } - read_unlock(&iucv_sk_list.lock); - return 0; -} - -/** - * afiucv_pm_restore_thaw() - Thaw and restore PM callback - * @dev: AFIUCV dummy device - * - * socket clean up after freeze - */ -static int afiucv_pm_restore_thaw(struct device *dev) -{ - struct sock *sk; - -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "afiucv_pm_restore_thaw\n"); -#endif - read_lock(&iucv_sk_list.lock); - sk_for_each(sk, &iucv_sk_list.head) { - switch (sk->sk_state) { - case IUCV_CONNECTED: - sk->sk_err = EPIPE; - sk->sk_state = IUCV_DISCONN; - sk->sk_state_change(sk); - break; - case IUCV_DISCONN: - case IUCV_CLOSING: - case IUCV_LISTEN: - case IUCV_BOUND: - case IUCV_OPEN: - default: - break; - } - } - read_unlock(&iucv_sk_list.lock); - return 0; -} - -static const struct dev_pm_ops afiucv_pm_ops = { - .prepare = afiucv_pm_prepare, - .complete = afiucv_pm_complete, - .freeze = afiucv_pm_freeze, - .thaw = afiucv_pm_restore_thaw, - .restore = afiucv_pm_restore_thaw, -}; - -static struct device_driver af_iucv_driver = { - .owner = THIS_MODULE, - .name = "afiucv", - .bus = NULL, - .pm = &afiucv_pm_ops, -}; - -/* dummy device used as trigger for PM functions */ -static struct device *af_iucv_dev; - /** * iucv_msg_length() - Returns the length of an iucv message. * @msg: Pointer to struct iucv_message, MUST NOT be NULL @@ -435,6 +327,20 @@ static void iucv_sock_cleanup_listen(struct sock *parent) parent->sk_state = IUCV_CLOSED; } +static void iucv_sock_link(struct iucv_sock_list *l, struct sock *sk) +{ + write_lock_bh(&l->lock); + sk_add_node(sk, &l->head); + write_unlock_bh(&l->lock); +} + +static void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *sk) +{ + write_lock_bh(&l->lock); + sk_del_node_init(sk); + write_unlock_bh(&l->lock); +} + /* Kill socket (only if zapped and orphaned) */ static void iucv_sock_kill(struct sock *sk) { @@ -607,53 +513,7 @@ static struct sock *iucv_sock_alloc(struct socket *sock, int proto, gfp_t prio, return sk; } -/* Create an IUCV socket */ -static int iucv_sock_create(struct net *net, struct socket *sock, int protocol, - int kern) -{ - struct sock *sk; - - if (protocol && protocol != PF_IUCV) - return -EPROTONOSUPPORT; - - sock->state = SS_UNCONNECTED; - - switch (sock->type) { - case SOCK_STREAM: - sock->ops = &iucv_sock_ops; - break; - case SOCK_SEQPACKET: - /* currently, proto ops can handle both sk types */ - sock->ops = &iucv_sock_ops; - break; - default: - return -ESOCKTNOSUPPORT; - } - - sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL, kern); - if (!sk) - return -ENOMEM; - - iucv_sock_init(sk, NULL); - - return 0; -} - -void iucv_sock_link(struct iucv_sock_list *l, struct sock *sk) -{ - write_lock_bh(&l->lock); - sk_add_node(sk, &l->head); - write_unlock_bh(&l->lock); -} - -void iucv_sock_unlink(struct iucv_sock_list *l, struct sock *sk) -{ - write_lock_bh(&l->lock); - sk_del_node_init(sk); - write_unlock_bh(&l->lock); -} - -void iucv_accept_enqueue(struct sock *parent, struct sock *sk) +static void iucv_accept_enqueue(struct sock *parent, struct sock *sk) { unsigned long flags; struct iucv_sock *par = iucv_sk(parent); @@ -666,7 +526,7 @@ void iucv_accept_enqueue(struct sock *parent, struct sock *sk) sk_acceptq_added(parent); } -void iucv_accept_unlink(struct sock *sk) +static void iucv_accept_unlink(struct sock *sk) { unsigned long flags; struct iucv_sock *par = iucv_sk(iucv_sk(sk)->parent); @@ -679,7 +539,8 @@ void iucv_accept_unlink(struct sock *sk) sock_put(sk); } -struct sock *iucv_accept_dequeue(struct sock *parent, struct socket *newsock) +static struct sock *iucv_accept_dequeue(struct sock *parent, + struct socket *newsock) { struct iucv_sock *isk, *n; struct sock *sk; @@ -1100,7 +961,6 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg, /* initialize defaults */ cmsg_done = 0; /* check for duplicate headers */ - txmsg.class = 0; /* iterate over control messages */ for_each_cmsghdr(cmsg, msg) { @@ -1511,8 +1371,8 @@ static inline __poll_t iucv_accept_poll(struct sock *parent) return 0; } -__poll_t iucv_sock_poll(struct file *file, struct socket *sock, - poll_table *wait) +static __poll_t iucv_sock_poll(struct file *file, struct socket *sock, + poll_table *wait) { struct sock *sk = sock->sk; __poll_t mask = 0; @@ -1664,7 +1524,7 @@ static int iucv_sock_setsockopt(struct socket *sock, int level, int optname, switch (sk->sk_state) { case IUCV_OPEN: case IUCV_BOUND: - if (val < 1 || val > (u16)(~0)) + if (val < 1 || val > U16_MAX) rc = -EINVAL; else iucv->msglimit = val; @@ -2396,6 +2256,35 @@ static const struct proto_ops iucv_sock_ops = { .getsockopt = iucv_sock_getsockopt, }; +static int iucv_sock_create(struct net *net, struct socket *sock, int protocol, + int kern) +{ + struct sock *sk; + + if (protocol && protocol != PF_IUCV) + return -EPROTONOSUPPORT; + + sock->state = SS_UNCONNECTED; + + switch (sock->type) { + case SOCK_STREAM: + case SOCK_SEQPACKET: + /* currently, proto ops can handle both sk types */ + sock->ops = &iucv_sock_ops; + break; + default: + return -ESOCKTNOSUPPORT; + } + + sk = iucv_sock_alloc(sock, protocol, GFP_KERNEL, kern); + if (!sk) + return -ENOMEM; + + iucv_sock_init(sk, NULL); + + return 0; +} + static const struct net_proto_family iucv_sock_family_ops = { .family = AF_IUCV, .owner = THIS_MODULE, @@ -2409,45 +2298,11 @@ static struct packet_type iucv_packet_type = { static int afiucv_iucv_init(void) { - int err; - - err = pr_iucv->iucv_register(&af_iucv_handler, 0); - if (err) - goto out; - /* establish dummy device */ - af_iucv_driver.bus = pr_iucv->bus; - err = driver_register(&af_iucv_driver); - if (err) - goto out_iucv; - af_iucv_dev = kzalloc(sizeof(struct device), GFP_KERNEL); - if (!af_iucv_dev) { - err = -ENOMEM; - goto out_driver; - } - dev_set_name(af_iucv_dev, "af_iucv"); - af_iucv_dev->bus = pr_iucv->bus; - af_iucv_dev->parent = pr_iucv->root; - af_iucv_dev->release = (void (*)(struct device *))kfree; - af_iucv_dev->driver = &af_iucv_driver; - err = device_register(af_iucv_dev); - if (err) - goto out_iucv_dev; - return 0; - -out_iucv_dev: - put_device(af_iucv_dev); -out_driver: - driver_unregister(&af_iucv_driver); -out_iucv: - pr_iucv->iucv_unregister(&af_iucv_handler, 0); -out: - return err; + return pr_iucv->iucv_register(&af_iucv_handler, 0); } static void afiucv_iucv_exit(void) { - device_unregister(af_iucv_dev); - driver_unregister(&af_iucv_driver); pr_iucv->iucv_unregister(&af_iucv_handler, 0); } diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 9a2d023842fe..19250a0c85d3 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -67,32 +67,9 @@ static int iucv_bus_match(struct device *dev, struct device_driver *drv) return 0; } -enum iucv_pm_states { - IUCV_PM_INITIAL = 0, - IUCV_PM_FREEZING = 1, - IUCV_PM_THAWING = 2, - IUCV_PM_RESTORING = 3, -}; -static enum iucv_pm_states iucv_pm_state; - -static int iucv_pm_prepare(struct device *); -static void iucv_pm_complete(struct device *); -static int iucv_pm_freeze(struct device *); -static int iucv_pm_thaw(struct device *); -static int iucv_pm_restore(struct device *); - -static const struct dev_pm_ops iucv_pm_ops = { - .prepare = iucv_pm_prepare, - .complete = iucv_pm_complete, - .freeze = iucv_pm_freeze, - .thaw = iucv_pm_thaw, - .restore = iucv_pm_restore, -}; - struct bus_type iucv_bus = { .name = "iucv", .match = iucv_bus_match, - .pm = &iucv_pm_ops, }; EXPORT_SYMBOL(iucv_bus); @@ -435,31 +412,6 @@ static void iucv_block_cpu(void *data) } /** - * iucv_block_cpu_almost - * @data: unused - * - * Allow connection-severed interrupts only on this cpu. - */ -static void iucv_block_cpu_almost(void *data) -{ - int cpu = smp_processor_id(); - union iucv_param *parm; - - /* Allow iucv control interrupts only */ - parm = iucv_param_irq[cpu]; - memset(parm, 0, sizeof(union iucv_param)); - parm->set_mask.ipmask = 0x08; - iucv_call_b2f0(IUCV_SETMASK, parm); - /* Allow iucv-severed interrupt only */ - memset(parm, 0, sizeof(union iucv_param)); - parm->set_mask.ipmask = 0x20; - iucv_call_b2f0(IUCV_SETCONTROLMASK, parm); - - /* Clear indication that iucv interrupts are allowed for this cpu. */ - cpumask_clear_cpu(cpu, &iucv_irq_cpumask); -} - -/** * iucv_declare_cpu * @data: unused * @@ -1834,146 +1786,6 @@ static void iucv_external_interrupt(struct ext_code ext_code, spin_unlock(&iucv_queue_lock); } -static int iucv_pm_prepare(struct device *dev) -{ - int rc = 0; - -#ifdef CONFIG_PM_DEBUG - printk(KERN_INFO "iucv_pm_prepare\n"); -#endif - if (dev->driver && dev->driver->pm && dev->driver->pm->prepare) - rc = dev->driver->pm->prepare(dev); - return rc; -} - -static void iucv_pm_complete(struct device *dev) -{ -#ifdef CONFIG_PM_DEBUG - printk(KERN_INFO "iucv_pm_complete\n"); -#endif - if (dev->driver && dev->driver->pm && dev->driver->pm->complete) - dev->driver->pm->complete(dev); -} - -/** - * iucv_path_table_empty() - determine if iucv path table is empty - * - * Returns 0 if there are still iucv pathes defined - * 1 if there are no iucv pathes defined - */ -static int iucv_path_table_empty(void) -{ - int i; - - for (i = 0; i < iucv_max_pathid; i++) { - if (iucv_path_table[i]) - return 0; - } - return 1; -} - -/** - * iucv_pm_freeze() - Freeze PM callback - * @dev: iucv-based device - * - * disable iucv interrupts - * invoke callback function of the iucv-based driver - * shut down iucv, if no iucv-pathes are established anymore - */ -static int iucv_pm_freeze(struct device *dev) -{ - int cpu; - struct iucv_irq_list *p, *n; - int rc = 0; - -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "iucv_pm_freeze\n"); -#endif - if (iucv_pm_state != IUCV_PM_FREEZING) { - for_each_cpu(cpu, &iucv_irq_cpumask) - smp_call_function_single(cpu, iucv_block_cpu_almost, - NULL, 1); - cancel_work_sync(&iucv_work); - list_for_each_entry_safe(p, n, &iucv_work_queue, list) { - list_del_init(&p->list); - iucv_sever_pathid(p->data.ippathid, - iucv_error_no_listener); - kfree(p); - } - } - iucv_pm_state = IUCV_PM_FREEZING; - if (dev->driver && dev->driver->pm && dev->driver->pm->freeze) - rc = dev->driver->pm->freeze(dev); - if (iucv_path_table_empty()) - iucv_disable(); - return rc; -} - -/** - * iucv_pm_thaw() - Thaw PM callback - * @dev: iucv-based device - * - * make iucv ready for use again: allocate path table, declare interrupt buffers - * and enable iucv interrupts - * invoke callback function of the iucv-based driver - */ -static int iucv_pm_thaw(struct device *dev) -{ - int rc = 0; - -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "iucv_pm_thaw\n"); -#endif - iucv_pm_state = IUCV_PM_THAWING; - if (!iucv_path_table) { - rc = iucv_enable(); - if (rc) - goto out; - } - if (cpumask_empty(&iucv_irq_cpumask)) { - if (iucv_nonsmp_handler) - /* enable interrupts on one cpu */ - iucv_allow_cpu(NULL); - else - /* enable interrupts on all cpus */ - iucv_setmask_mp(); - } - if (dev->driver && dev->driver->pm && dev->driver->pm->thaw) - rc = dev->driver->pm->thaw(dev); -out: - return rc; -} - -/** - * iucv_pm_restore() - Restore PM callback - * @dev: iucv-based device - * - * make iucv ready for use again: allocate path table, declare interrupt buffers - * and enable iucv interrupts - * invoke callback function of the iucv-based driver - */ -static int iucv_pm_restore(struct device *dev) -{ - int rc = 0; - -#ifdef CONFIG_PM_DEBUG - printk(KERN_WARNING "iucv_pm_restore %p\n", iucv_path_table); -#endif - if ((iucv_pm_state != IUCV_PM_RESTORING) && iucv_path_table) - pr_warn("Suspending Linux did not completely close all IUCV connections\n"); - iucv_pm_state = IUCV_PM_RESTORING; - if (cpumask_empty(&iucv_irq_cpumask)) { - rc = iucv_query_maxconn(); - rc = iucv_enable(); - if (rc) - goto out; - } - if (dev->driver && dev->driver->pm && dev->driver->pm->restore) - rc = dev->driver->pm->restore(dev); -out: - return rc; -} - struct iucv_interface iucv_if = { .message_receive = iucv_message_receive, .__message_receive = __iucv_message_receive, diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index d3b520b9b2c9..fd5ac2788e45 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -56,6 +56,7 @@ static int l2tp_eth_dev_init(struct net_device *dev) { eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); + netdev_lockdep_set_classes(dev); return 0; } diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 0fa694bd3f6a..526ed2c24dd5 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -772,6 +772,7 @@ static const struct proto_ops l2tp_ip6_ops = { .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif diff --git a/net/lapb/Kconfig b/net/lapb/Kconfig index 6acfc999c952..5b50e8d64f26 100644 --- a/net/lapb/Kconfig +++ b/net/lapb/Kconfig @@ -15,7 +15,7 @@ config LAPB currently supports LAPB only over Ethernet connections. If you want to use LAPB connections over Ethernet, say Y here and to "LAPB over Ethernet driver" below. Read - <file:Documentation/networking/lapb-module.txt> for technical + <file:Documentation/networking/lapb-module.rst> for technical details. To compile this driver as a module, choose M here: the diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 4d1c335e06e5..7f245e9f114c 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ /** @@ -292,7 +292,8 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, goto end; } - if (!sta->sta.ht_cap.ht_supported) { + if (!sta->sta.ht_cap.ht_supported && + sta->sdata->vif.bss_conf.chandef.chan->band != NL80211_BAND_6GHZ) { ht_dbg(sta->sdata, "STA %pM erroneously requests BA session on tid %d w/o QoS\n", sta->sta.addr, tid); diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 33da6f738c99..b37c8a983d88 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation */ #include <linux/ieee80211.h> @@ -448,6 +448,45 @@ static void sta_addba_resp_timer_expired(struct timer_list *t) ieee80211_stop_tx_ba_session(&sta->sta, tid); } +static void ieee80211_send_addba_with_timeout(struct sta_info *sta, + struct tid_ampdu_tx *tid_tx) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_local *local = sta->local; + u8 tid = tid_tx->tid; + u16 buf_size; + + /* activate the timer for the recipient's addBA response */ + mod_timer(&tid_tx->addba_resp_timer, jiffies + ADDBA_RESP_INTERVAL); + ht_dbg(sdata, "activated addBA response timer on %pM tid %d\n", + sta->sta.addr, tid); + + spin_lock_bh(&sta->lock); + sta->ampdu_mlme.last_addba_req_time[tid] = jiffies; + sta->ampdu_mlme.addba_req_num[tid]++; + spin_unlock_bh(&sta->lock); + + if (sta->sta.he_cap.has_he) { + buf_size = local->hw.max_tx_aggregation_subframes; + } else { + /* + * We really should use what the driver told us it will + * transmit as the maximum, but certain APs (e.g. the + * LinkSys WRT120N with FW v1.0.07 build 002 Jun 18 2012) + * will crash when we use a lower number. + */ + buf_size = IEEE80211_MAX_AMPDU_BUF_HT; + } + + /* send AddBA request */ + ieee80211_send_addba_request(sdata, sta->sta.addr, tid, + tid_tx->dialog_token, + sta->tid_seq[tid] >> 4, + buf_size, tid_tx->timeout); + + WARN_ON(test_and_set_bit(HT_AGG_STATE_SENT_ADDBA, &tid_tx->state)); +} + void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) { struct tid_ampdu_tx *tid_tx; @@ -462,7 +501,6 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) .timeout = 0, }; int ret; - u16 buf_size; tid_tx = rcu_dereference_protected_tid_tx(sta, tid); @@ -485,7 +523,9 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) params.ssn = sta->tid_seq[tid] >> 4; ret = drv_ampdu_action(local, sdata, ¶ms); - if (ret == IEEE80211_AMPDU_TX_START_IMMEDIATE) { + if (ret == IEEE80211_AMPDU_TX_START_DELAY_ADDBA) { + return; + } else if (ret == IEEE80211_AMPDU_TX_START_IMMEDIATE) { /* * We didn't send the request yet, so don't need to check * here if we already got a response, just mark as driver @@ -508,32 +548,7 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid) return; } - /* activate the timer for the recipient's addBA response */ - mod_timer(&tid_tx->addba_resp_timer, jiffies + ADDBA_RESP_INTERVAL); - ht_dbg(sdata, "activated addBA response timer on %pM tid %d\n", - sta->sta.addr, tid); - - spin_lock_bh(&sta->lock); - sta->ampdu_mlme.last_addba_req_time[tid] = jiffies; - sta->ampdu_mlme.addba_req_num[tid]++; - spin_unlock_bh(&sta->lock); - - if (sta->sta.he_cap.has_he) { - buf_size = local->hw.max_tx_aggregation_subframes; - } else { - /* - * We really should use what the driver told us it will - * transmit as the maximum, but certain APs (e.g. the - * LinkSys WRT120N with FW v1.0.07 build 002 Jun 18 2012) - * will crash when we use a lower number. - */ - buf_size = IEEE80211_MAX_AMPDU_BUF_HT; - } - - /* send AddBA request */ - ieee80211_send_addba_request(sdata, sta->sta.addr, tid, - tid_tx->dialog_token, params.ssn, - buf_size, tid_tx->timeout); + ieee80211_send_addba_with_timeout(sta, tid_tx); } /* @@ -578,7 +593,8 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, "Requested to start BA session on reserved tid=%d", tid)) return -EINVAL; - if (!pubsta->ht_cap.ht_supported) + if (!pubsta->ht_cap.ht_supported && + sta->sdata->vif.bss_conf.chandef.chan->band != NL80211_BAND_6GHZ) return -EINVAL; if (WARN_ON_ONCE(!local->ops->ampdu_action)) @@ -754,6 +770,12 @@ void ieee80211_start_tx_ba_cb(struct sta_info *sta, int tid, if (WARN_ON(test_and_set_bit(HT_AGG_STATE_DRV_READY, &tid_tx->state))) return; + if (!test_bit(HT_AGG_STATE_SENT_ADDBA, &tid_tx->state)) { + ieee80211_send_addba_with_timeout(sta, tid_tx); + /* RESPONSE_RECEIVED state whould trigger the flow again */ + return; + } + if (test_bit(HT_AGG_STATE_RESPONSE_RECEIVED, &tid_tx->state)) ieee80211_agg_tx_operational(local, sta, tid); } diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 0f72813fed53..9b360544ad6f 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -994,7 +994,7 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, BSS_CHANGED_TWT | BSS_CHANGED_HE_OBSS_PD | BSS_CHANGED_HE_BSS_COLOR; - int err; + int i, err; int prev_beacon_int; old = sdata_dereference(sdata->u.ap.beacon, sdata); @@ -1085,6 +1085,17 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |= IEEE80211_P2P_OPPPS_ENABLE_BIT; + sdata->beacon_rate_set = false; + if (wiphy_ext_feature_isset(local->hw.wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { + sdata->beacon_rateidx_mask[i] = + params->beacon_rate.control[i].legacy; + if (sdata->beacon_rateidx_mask[i]) + sdata->beacon_rate_set = true; + } + } + err = ieee80211_assign_beacon(sdata, ¶ms->beacon, NULL); if (err < 0) { ieee80211_vif_release_channel(sdata); @@ -1189,6 +1200,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev) ieee80211_free_keys(sdata, true); sdata->vif.bss_conf.enable_beacon = false; + sdata->beacon_rate_set = false; sdata->vif.bss_conf.ssid_len = 0; clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED); @@ -1508,7 +1520,9 @@ static int sta_apply_parameters(struct ieee80211_local *local, if (params->he_capa) ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, (void *)params->he_capa, - params->he_capa_len, sta); + params->he_capa_len, + (void *)params->he_6ghz_capa, + sta); if (params->opmode_notif_used) { /* returned value is only needed for rc update, but the @@ -1949,6 +1963,7 @@ static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh, const u8 *old_ie; struct ieee80211_sub_if_data *sdata = container_of(ifmsh, struct ieee80211_sub_if_data, u.mesh); + int i; /* allocate information elements */ new_ie = NULL; @@ -1987,6 +2002,17 @@ static int copy_mesh_setup(struct ieee80211_if_mesh *ifmsh, sdata->vif.bss_conf.beacon_int = setup->beacon_interval; sdata->vif.bss_conf.dtim_period = setup->dtim_period; + sdata->beacon_rate_set = false; + if (wiphy_ext_feature_isset(sdata->local->hw.wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_LEGACY)) { + for (i = 0; i < NUM_NL80211_BANDS; i++) { + sdata->beacon_rateidx_mask[i] = + setup->beacon_rate.control[i].legacy; + if (sdata->beacon_rateidx_mask[i]) + sdata->beacon_rate_set = true; + } + } + return 0; } @@ -2172,7 +2198,8 @@ static int ieee80211_change_bss(struct wiphy *wiphy, } if (!sdata->vif.bss_conf.use_short_slot && - sband->band == NL80211_BAND_5GHZ) { + (sband->band == NL80211_BAND_5GHZ || + sband->band == NL80211_BAND_6GHZ)) { sdata->vif.bss_conf.use_short_slot = true; changed |= BSS_CHANGED_ERP_SLOT; } @@ -3287,6 +3314,12 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, goto out; } + if (params->chandef.chan->freq_offset) { + /* this may work, but is untested */ + err = -EOPNOTSUPP; + goto out; + } + chanctx = container_of(conf, struct ieee80211_chanctx, conf); ch_switch.timestamp = 0; @@ -3398,41 +3431,43 @@ int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, return 0; } -static void ieee80211_mgmt_frame_register(struct wiphy *wiphy, +static void +ieee80211_update_mgmt_frame_registrations(struct wiphy *wiphy, struct wireless_dev *wdev, - u16 frame_type, bool reg) + struct mgmt_frame_regs *upd) { struct ieee80211_local *local = wiphy_priv(wiphy); struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + u32 preq_mask = BIT(IEEE80211_STYPE_PROBE_REQ >> 4); + u32 action_mask = BIT(IEEE80211_STYPE_ACTION >> 4); + bool global_change, intf_change; + + global_change = + (local->probe_req_reg != !!(upd->global_stypes & preq_mask)) || + (local->rx_mcast_action_reg != + !!(upd->global_mcast_stypes & action_mask)); + local->probe_req_reg = upd->global_stypes & preq_mask; + local->rx_mcast_action_reg = upd->global_mcast_stypes & action_mask; + + intf_change = (sdata->vif.probe_req_reg != + !!(upd->interface_stypes & preq_mask)) || + (sdata->vif.rx_mcast_action_reg != + !!(upd->interface_mcast_stypes & action_mask)); + sdata->vif.probe_req_reg = upd->interface_stypes & preq_mask; + sdata->vif.rx_mcast_action_reg = + upd->interface_mcast_stypes & action_mask; + + if (!local->open_count) + return; - switch (frame_type) { - case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ: - if (reg) { - local->probe_req_reg++; - sdata->vif.probe_req_reg++; - } else { - if (local->probe_req_reg) - local->probe_req_reg--; - - if (sdata->vif.probe_req_reg) - sdata->vif.probe_req_reg--; - } - - if (!local->open_count) - break; - - if (sdata->vif.probe_req_reg == 1) - drv_config_iface_filter(local, sdata, FIF_PROBE_REQ, - FIF_PROBE_REQ); - else if (sdata->vif.probe_req_reg == 0) - drv_config_iface_filter(local, sdata, 0, - FIF_PROBE_REQ); + if (intf_change && ieee80211_sdata_running(sdata)) + drv_config_iface_filter(local, sdata, + sdata->vif.probe_req_reg ? + FIF_PROBE_REQ : 0, + FIF_PROBE_REQ); + if (global_change) ieee80211_configure_filter(local); - break; - default: - break; - } } static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) @@ -3925,7 +3960,7 @@ static int ieee80211_set_tid_config(struct wiphy *wiphy, static int ieee80211_reset_tid_config(struct wiphy *wiphy, struct net_device *dev, - const u8 *peer, u8 tid) + const u8 *peer, u8 tids) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sta_info *sta; @@ -3935,7 +3970,7 @@ static int ieee80211_reset_tid_config(struct wiphy *wiphy, return -EOPNOTSUPP; if (!peer) - return drv_reset_tid_config(sdata->local, sdata, NULL, tid); + return drv_reset_tid_config(sdata->local, sdata, NULL, tids); mutex_lock(&sdata->local->sta_mtx); sta = sta_info_get_bss(sdata, peer); @@ -3944,7 +3979,7 @@ static int ieee80211_reset_tid_config(struct wiphy *wiphy, return -ENOENT; } - ret = drv_reset_tid_config(sdata->local, sdata, &sta->sta, tid); + ret = drv_reset_tid_config(sdata->local, sdata, &sta->sta, tids); mutex_unlock(&sdata->local->sta_mtx); return ret; @@ -4017,7 +4052,8 @@ const struct cfg80211_ops mac80211_config_ops = { .mgmt_tx_cancel_wait = ieee80211_mgmt_tx_cancel_wait, .set_cqm_rssi_config = ieee80211_set_cqm_rssi_config, .set_cqm_rssi_range_config = ieee80211_set_cqm_rssi_range_config, - .mgmt_frame_register = ieee80211_mgmt_frame_register, + .update_mgmt_frame_registrations = + ieee80211_update_mgmt_frame_registrations, .set_antenna = ieee80211_set_antenna, .get_antenna = ieee80211_get_antenna, .set_rekey_data = ieee80211_set_rekey_data, diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 9c94baaf693c..e6e192f53e4e 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -533,6 +533,7 @@ static void ieee80211_del_chanctx(struct ieee80211_local *local, struct cfg80211_chan_def *chandef = &local->_oper_chandef; chandef->width = NL80211_CHAN_WIDTH_20_NOHT; chandef->center_freq1 = chandef->chan->center_freq; + chandef->freq1_offset = chandef->chan->freq_offset; chandef->center_freq2 = 0; /* NOTE: Disabling radar is only valid here for diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 3dbe7c5cefd1..d7e955127d5c 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -236,7 +236,7 @@ IEEE80211_IF_FILE_R(hw_queues); /* STA attributes */ IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC); -IEEE80211_IF_FILE(aid, u.mgd.aid, DEC); +IEEE80211_IF_FILE(aid, vif.bss_conf.aid, DEC); IEEE80211_IF_FILE(beacon_timeout, u.mgd.beacon_timeout, JIFFIES_TO_MS); static int ieee80211_set_smps(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 3877710e3b48..de69fc9c4f07 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1375,12 +1375,12 @@ static inline int drv_set_tid_config(struct ieee80211_local *local, static inline int drv_reset_tid_config(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - struct ieee80211_sta *sta, u8 tid) + struct ieee80211_sta *sta, u8 tids) { int ret; might_sleep(); - ret = local->ops->reset_tid_config(&local->hw, &sdata->vif, sta, tid); + ret = local->ops->reset_tid_config(&local->hw, &sdata->vif, sta, tids); trace_drv_return_int(local, ret); return ret; diff --git a/net/mac80211/he.c b/net/mac80211/he.c index 1087f715338b..cc26f239838b 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -8,10 +8,55 @@ #include "ieee80211_i.h" +static void +ieee80211_update_from_he_6ghz_capa(const struct ieee80211_he_6ghz_capa *he_6ghz_capa, + struct sta_info *sta) +{ + enum ieee80211_smps_mode smps_mode; + + if (sta->sdata->vif.type == NL80211_IFTYPE_AP || + sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { + switch (le16_get_bits(he_6ghz_capa->capa, + IEEE80211_HE_6GHZ_CAP_SM_PS)) { + case WLAN_HT_CAP_SM_PS_INVALID: + case WLAN_HT_CAP_SM_PS_STATIC: + smps_mode = IEEE80211_SMPS_STATIC; + break; + case WLAN_HT_CAP_SM_PS_DYNAMIC: + smps_mode = IEEE80211_SMPS_DYNAMIC; + break; + case WLAN_HT_CAP_SM_PS_DISABLED: + smps_mode = IEEE80211_SMPS_OFF; + break; + } + + sta->sta.smps_mode = smps_mode; + } else { + sta->sta.smps_mode = IEEE80211_SMPS_OFF; + } + + switch (le16_get_bits(he_6ghz_capa->capa, + IEEE80211_HE_6GHZ_CAP_MAX_MPDU_LEN)) { + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454; + break; + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991: + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_7991; + break; + case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895: + default: + sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_3895; + break; + } + + sta->sta.he_6ghz_capa = *he_6ghz_capa; +} + void ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const u8 *he_cap_ie, u8 he_cap_len, + const struct ieee80211_he_6ghz_capa *he_6ghz_capa, struct sta_info *sta) { struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap; @@ -53,21 +98,21 @@ ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, sta->cur_max_bandwidth = ieee80211_sta_cap_rx_bw(sta); sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta); + + if (sband->band == NL80211_BAND_6GHZ && he_6ghz_capa) + ieee80211_update_from_he_6ghz_capa(he_6ghz_capa, sta); } void ieee80211_he_op_ie_to_bss_conf(struct ieee80211_vif *vif, - const struct ieee80211_he_operation *he_op_ie_elem) + const struct ieee80211_he_operation *he_op_ie) { - struct ieee80211_he_operation *he_operation = - &vif->bss_conf.he_operation; - - if (!he_op_ie_elem) { - memset(he_operation, 0, sizeof(*he_operation)); + memset(&vif->bss_conf.he_oper, 0, sizeof(vif->bss_conf.he_oper)); + if (!he_op_ie) return; - } - vif->bss_conf.he_operation = *he_op_ie_elem; + vif->bss_conf.he_oper.params = __le32_to_cpu(he_op_ie->he_oper_params); + vif->bss_conf.he_oper.nss_set = __le16_to_cpu(he_op_ie->he_mcs_nss_set); } void diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index d40744903fa9..81d26fef41e9 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -9,7 +9,7 @@ * Copyright 2009, Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH - * Copyright(c) 2018-2019 Intel Corporation + * Copyright(c) 2018-2020 Intel Corporation */ #include <linux/delay.h> @@ -781,6 +781,7 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, enum nl80211_channel_type ch_type; int err; u32 sta_flags; + u32 vht_cap_info = 0; sdata_assert_lock(sdata); @@ -798,9 +799,13 @@ ieee80211_ibss_process_chanswitch(struct ieee80211_sub_if_data *sdata, break; } + if (elems->vht_cap_elem) + vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); + memset(¶ms, 0, sizeof(params)); err = ieee80211_parse_ch_switch_ie(sdata, elems, ifibss->chandef.chan->band, + vht_cap_info, sta_flags, ifibss->bssid, &csa_ie); /* can't switch to destination channel, fail */ if (err < 0) @@ -1060,8 +1065,10 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, /* we both use VHT */ struct ieee80211_vht_cap cap_ie; struct ieee80211_sta_vht_cap cap = sta->sta.vht_cap; + u32 vht_cap_info = + le32_to_cpu(elems->vht_cap_elem->vht_cap_info); - ieee80211_chandef_vht_oper(&local->hw, + ieee80211_chandef_vht_oper(&local->hw, vht_cap_info, elems->vht_operation, elems->ht_operation, &chandef); @@ -1758,6 +1765,11 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, int i; int ret; + if (params->chandef.chan->freq_offset) { + /* this may work, but is untested */ + return -EOPNOTSUPP; + } + ret = cfg80211_chandef_dfs_required(local->hw.wiphy, ¶ms->chandef, sdata->wdev.iftype); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index f8ed4f621f7f..ec1a71ac65f2 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -111,6 +111,8 @@ struct ieee80211_bss { size_t supp_rates_len; struct ieee80211_rate *beacon_rate; + u32 vht_cap_info; + /* * During association, we save an ERP value from a probe response so * that we can feed ERP info to the driver when handling the @@ -267,7 +269,7 @@ struct probe_resp { struct rcu_head rcu_head; int len; u16 csa_counter_offsets[IEEE80211_MAX_CSA_COUNTERS_NUM]; - u8 data[0]; + u8 data[]; }; struct ps_data { @@ -450,8 +452,6 @@ struct ieee80211_if_managed { u8 bssid[ETH_ALEN] __aligned(2); - u16 aid; - bool powersave; /* powersave requested for this iface */ bool broken_ap; /* AP is broken -- turn off powersave */ bool have_beacon; @@ -964,6 +964,10 @@ struct ieee80211_sub_if_data { bool rc_has_vht_mcs_mask[NUM_NL80211_BANDS]; u16 rc_rateidx_vht_mcs_mask[NUM_NL80211_BANDS][NL80211_VHT_NSS_MAX]; + /* Beacon frame (non-MCS) rate (as a bitmap) */ + u32 beacon_rateidx_mask[NUM_NL80211_BANDS]; + bool beacon_rate_set; + union { struct ieee80211_if_ap ap; struct ieee80211_if_wds wds; @@ -1169,7 +1173,8 @@ struct ieee80211_local { /* number of interfaces with corresponding FIF_ flags */ int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll, fif_probe_req; - int probe_req_reg; + bool probe_req_reg; + bool rx_mcast_action_reg; unsigned int filter_flags; /* FIF_* */ bool wiphy_ciphers_allocated; @@ -1491,6 +1496,7 @@ struct ieee802_11_elems { const struct ieee80211_he_operation *he_operation; const struct ieee80211_he_spr *he_spr; const struct ieee80211_mu_edca_param_set *mu_edca_param_set; + const struct ieee80211_he_6ghz_capa *he_6ghz_capa; const u8 *uora_element; const u8 *mesh_id; const u8 *peering; @@ -1780,7 +1786,8 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags, - u32 ctrl_flags); + u32 ctrl_flags, + u64 *cookie); void ieee80211_purge_tx_queue(struct ieee80211_hw *hw, struct sk_buff_head *skbs); struct sk_buff * @@ -1797,7 +1804,8 @@ void ieee80211_check_fast_xmit_iface(struct ieee80211_sub_if_data *sdata); void ieee80211_clear_fast_xmit(struct sta_info *sta); int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, - const u8 *dest, __be16 proto, bool unencrypted); + const u8 *dest, __be16 proto, bool unencrypted, + u64 *cookie); int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len); @@ -1891,6 +1899,7 @@ void ieee80211_he_cap_ie_to_sta_he_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const u8 *he_cap_ie, u8 he_cap_len, + const struct ieee80211_he_6ghz_capa *he_6ghz_capa, struct sta_info *sta); void ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif, @@ -1909,6 +1918,7 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, * @sdata: the sdata of the interface which has received the frame * @elems: parsed 802.11 elements received with the frame * @current_band: indicates the current band + * @vht_cap_info: VHT capabilities of the transmitter * @sta_flags: contains information about own capabilities and restrictions * to decide which channel switch announcements can be accepted. Only the * following subset of &enum ieee80211_sta_flags are evaluated: @@ -1923,6 +1933,7 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band current_band, + u32 vht_cap_info, u32 sta_flags, u8 *bssid, struct ieee80211_csa_ie *csa_ie); @@ -2133,7 +2144,7 @@ enum { IEEE80211_PROBE_FLAG_RANDOM_SN = BIT(2), }; -int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, +int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, @@ -2171,7 +2182,9 @@ u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata, u8 iftype); u8 *ieee80211_ie_build_he_cap(u8 *pos, const struct ieee80211_sta_he_cap *he_cap, u8 *end); -u8 *ieee80211_ie_build_he_oper(u8 *pos); +void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); +u8 *ieee80211_ie_build_he_oper(u8 *pos, struct cfg80211_chan_def *chandef); int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates); @@ -2186,10 +2199,13 @@ u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo); /* channel management */ bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, struct cfg80211_chan_def *chandef); -bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, +bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef); +bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, + const struct ieee80211_he_operation *he_oper, + struct cfg80211_chan_def *chandef); u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c); int __must_check diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index d069825705d6..f900c84fb40f 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -644,6 +644,11 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) local->fif_probe_req++; } + if (sdata->vif.probe_req_reg) + drv_config_iface_filter(local, sdata, + FIF_PROBE_REQ, + FIF_PROBE_REQ); + if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && sdata->vif.type != NL80211_IFTYPE_NAN) changed |= ieee80211_reset_erp_info(sdata); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 6423173bb87e..b4a2efe8e83a 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -64,6 +64,9 @@ void ieee80211_configure_filter(struct ieee80211_local *local) if (local->fif_pspoll) new_flags |= FIF_PSPOLL; + if (local->rx_mcast_action_reg) + new_flags |= FIF_MCAST_ACTION; + spin_lock_bh(&local->filter_lock); changed_flags = local->filter_flags ^ new_flags; @@ -104,13 +107,15 @@ static u32 ieee80211_hw_conf_chan(struct ieee80211_local *local) chandef.chan = local->tmp_channel; chandef.width = NL80211_CHAN_WIDTH_20_NOHT; chandef.center_freq1 = chandef.chan->center_freq; + chandef.freq1_offset = chandef.chan->freq_offset; } else chandef = local->_oper_chandef; WARN(!cfg80211_chandef_valid(&chandef), - "control:%d MHz width:%d center: %d/%d MHz", - chandef.chan->center_freq, chandef.width, - chandef.center_freq1, chandef.center_freq2); + "control:%d.%03d MHz width:%d center: %d.%03d/%d MHz", + chandef.chan->center_freq, chandef.chan->freq_offset, + chandef.width, chandef.center_freq1, chandef.freq1_offset, + chandef.center_freq2); if (!cfg80211_chandef_identical(&chandef, &local->_oper_chandef)) local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL; @@ -591,6 +596,10 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211); wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_CONTROL_PORT_NO_PREAUTH); + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211_TX_STATUS); + wiphy_ext_feature_set(wiphy, + NL80211_EXT_FEATURE_SCAN_FREQ_KHZ); if (!ops->hw_scan) { wiphy->features |= NL80211_FEATURE_LOW_PRIORITY_SCAN | diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 36978a0e5000..5f1ca25b6c97 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation * Authors: Luis Carlos Cobo <luisca@cozybit.com> * Javier Cardona <javier@cozybit.com> */ @@ -63,6 +63,7 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, u32 basic_rates = 0; struct cfg80211_chan_def sta_chan_def; struct ieee80211_supported_band *sband; + u32 vht_cap_info = 0; /* * As support for each feature is added, check for matching @@ -96,9 +97,14 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, cfg80211_chandef_create(&sta_chan_def, sdata->vif.bss_conf.chandef.chan, NL80211_CHAN_NO_HT); ieee80211_chandef_ht_oper(ie->ht_operation, &sta_chan_def); - ieee80211_chandef_vht_oper(&sdata->local->hw, + + if (ie->vht_cap_elem) + vht_cap_info = le32_to_cpu(ie->vht_cap_elem->vht_cap_info); + + ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, ie->vht_operation, ie->ht_operation, &sta_chan_def); + ieee80211_chandef_he_6ghz_oper(sdata, ie->he_operation, &sta_chan_def); if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef, &sta_chan_def)) @@ -415,6 +421,10 @@ int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata, if (!sband) return -EINVAL; + /* HT not allowed in 6 GHz */ + if (sband->band == NL80211_BAND_6GHZ) + return 0; + if (!sband->ht_cap.ht_supported || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || @@ -452,6 +462,10 @@ int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata, sband = local->hw.wiphy->bands[channel->band]; ht_cap = &sband->ht_cap; + /* HT not allowed in 6 GHz */ + if (sband->band == NL80211_BAND_6GHZ) + return 0; + if (!ht_cap->ht_supported || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || @@ -479,6 +493,10 @@ int mesh_add_vht_cap_ie(struct ieee80211_sub_if_data *sdata, if (!sband) return -EINVAL; + /* VHT not allowed in 6 GHz */ + if (sband->band == NL80211_BAND_6GHZ) + return 0; + if (!sband->vht_cap.vht_supported || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || @@ -516,6 +534,10 @@ int mesh_add_vht_oper_ie(struct ieee80211_sub_if_data *sdata, sband = local->hw.wiphy->bands[channel->band]; vht_cap = &sband->vht_cap; + /* VHT not allowed in 6 GHz */ + if (sband->band == NL80211_BAND_6GHZ) + return 0; + if (!vht_cap->vht_supported || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || @@ -565,6 +587,7 @@ int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, { const struct ieee80211_sta_he_cap *he_cap; struct ieee80211_supported_band *sband; + u32 len; u8 *pos; sband = ieee80211_get_sband(sdata); @@ -578,12 +601,23 @@ int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_10) return 0; - if (skb_tailroom(skb) < 2 + 1 + sizeof(struct ieee80211_he_operation)) + len = 2 + 1 + sizeof(struct ieee80211_he_operation); + if (sdata->vif.bss_conf.chandef.chan->band == NL80211_BAND_6GHZ) + len += sizeof(struct ieee80211_he_6ghz_oper); + + if (skb_tailroom(skb) < len) return -ENOMEM; - pos = skb_put(skb, 2 + 1 + sizeof(struct ieee80211_he_operation)); - ieee80211_ie_build_he_oper(pos); + pos = skb_put(skb, len); + ieee80211_ie_build_he_oper(pos, &sdata->vif.bss_conf.chandef); + + return 0; +} +int mesh_add_he_6ghz_cap_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + ieee80211_ie_build_he_6ghz_cap(sdata, skb); return 0; } @@ -766,6 +800,8 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) 2 + sizeof(struct ieee80211_vht_operation) + ie_len_he_cap + 2 + 1 + sizeof(struct ieee80211_he_operation) + + sizeof(struct ieee80211_he_6ghz_oper) + + 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + ifmsh->ie_len; bcn = kzalloc(sizeof(*bcn) + head_len + tail_len, GFP_KERNEL); @@ -885,6 +921,7 @@ ieee80211_mesh_build_beacon(struct ieee80211_if_mesh *ifmsh) mesh_add_vht_oper_ie(sdata, skb) || mesh_add_he_cap_ie(sdata, skb, ie_len_he_cap) || mesh_add_he_oper_ie(sdata, skb) || + mesh_add_he_6ghz_cap_ie(sdata, skb) || mesh_add_vendor_ies(sdata, skb)) goto out_free; @@ -994,6 +1031,7 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata) /* stop the beacon */ ifmsh->mesh_id_len = 0; sdata->vif.bss_conf.enable_beacon = false; + sdata->beacon_rate_set = false; clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED); @@ -1044,7 +1082,7 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct ieee80211_supported_band *sband; int err; - u32 sta_flags; + u32 sta_flags, vht_cap_info = 0; sdata_assert_lock(sdata); @@ -1067,8 +1105,13 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, break; } + if (elems->vht_cap_elem) + vht_cap_info = + le32_to_cpu(elems->vht_cap_elem->vht_cap_info); + memset(¶ms, 0, sizeof(params)); err = ieee80211_parse_ch_switch_ie(sdata, elems, sband->band, + vht_cap_info, sta_flags, sdata->vif.addr, &csa_ie); if (err < 0) diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index 953f720754e8..40492d1bd8fd 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -222,6 +222,8 @@ int mesh_add_he_cap_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u8 ie_len); int mesh_add_he_oper_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); +int mesh_add_he_6ghz_cap_ie(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); void mesh_rmc_free(struct ieee80211_sub_if_data *sdata); int mesh_rmc_init(struct ieee80211_sub_if_data *sdata); void ieee80211s_init(void); diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 737c5f4dbf52..798e4b6b383f 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -238,6 +238,8 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, 2 + sizeof(struct ieee80211_vht_operation) + ie_len_he_cap + 2 + 1 + sizeof(struct ieee80211_he_operation) + + sizeof(struct ieee80211_he_6ghz_oper) + + 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + 2 + 8 + /* peering IE */ sdata->u.mesh.ie_len); if (!skb) @@ -328,7 +330,8 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, mesh_add_vht_cap_ie(sdata, skb) || mesh_add_vht_oper_ie(sdata, skb) || mesh_add_he_cap_ie(sdata, skb, ie_len_he_cap) || - mesh_add_he_oper_ie(sdata, skb)) + mesh_add_he_oper_ie(sdata, skb) || + mesh_add_he_6ghz_cap_ie(sdata, skb)) goto free; } @@ -441,7 +444,9 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, elems->vht_cap_elem, sta); ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap, - elems->he_cap_len, sta); + elems->he_cap_len, + elems->he_6ghz_capa, + sta); if (bw != sta->sta.bandwidth) changed |= IEEE80211_RC_BW_CHANGED; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 16d75da0996a..5820ef02a587 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -145,6 +145,7 @@ static u32 ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct ieee80211_channel *channel, + u32 vht_cap_info, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, const struct ieee80211_he_operation *he_oper, @@ -155,13 +156,23 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_ht_cap sta_ht_cap; u32 ht_cfreq, ret; - memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); - ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); - memset(chandef, 0, sizeof(struct cfg80211_chan_def)); chandef->chan = channel; chandef->width = NL80211_CHAN_WIDTH_20_NOHT; chandef->center_freq1 = channel->center_freq; + chandef->freq1_offset = channel->freq_offset; + + if (channel->band == NL80211_BAND_6GHZ) { + if (!ieee80211_chandef_he_6ghz_oper(sdata, he_oper, chandef)) + ret = IEEE80211_STA_DISABLE_HT | + IEEE80211_STA_DISABLE_VHT | + IEEE80211_STA_DISABLE_HE; + vht_chandef = *chandef; + goto out; + } + + memcpy(&sta_ht_cap, &sband->ht_cap, sizeof(sta_ht_cap)); + ieee80211_apply_htcap_overrides(sdata, &sta_ht_cap); if (!ht_oper || !sta_ht_cap.ht_supported) { ret = IEEE80211_STA_DISABLE_HT | @@ -222,7 +233,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, memcpy(&he_oper_vht_cap, he_oper->optional, 3); he_oper_vht_cap.basic_mcs_set = cpu_to_le16(0); - if (!ieee80211_chandef_vht_oper(&sdata->local->hw, + if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_cap_info, &he_oper_vht_cap, ht_oper, &vht_chandef)) { if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HE)) @@ -231,8 +242,10 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata, ret = IEEE80211_STA_DISABLE_HE; goto out; } - } else if (!ieee80211_chandef_vht_oper(&sdata->local->hw, vht_oper, - ht_oper, &vht_chandef)) { + } else if (!ieee80211_chandef_vht_oper(&sdata->local->hw, + vht_cap_info, + vht_oper, ht_oper, + &vht_chandef)) { if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) sdata_info(sdata, "AP VHT information is invalid, disable VHT\n"); @@ -328,6 +341,7 @@ out: static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, const struct ieee80211_ht_cap *ht_cap, + const struct ieee80211_vht_cap *vht_cap, const struct ieee80211_ht_operation *ht_oper, const struct ieee80211_vht_operation *vht_oper, const struct ieee80211_he_operation *he_oper, @@ -342,6 +356,7 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, u16 ht_opmode; u32 flags; enum ieee80211_sta_rx_bandwidth new_sta_bw; + u32 vht_cap_info = 0; int ret; /* if HT was/is disabled, don't track any bandwidth changes */ @@ -370,8 +385,11 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.ht_operation_mode = ht_opmode; } + if (vht_cap) + vht_cap_info = le32_to_cpu(vht_cap->vht_cap_info); + /* calculate new channel (type) based on HT/VHT/HE operation IEs */ - flags = ieee80211_determine_chantype(sdata, sband, chan, + flags = ieee80211_determine_chantype(sdata, sband, chan, vht_cap_info, ht_oper, vht_oper, he_oper, &chandef, true); @@ -396,9 +414,12 @@ static int ieee80211_config_bw(struct ieee80211_sub_if_data *sdata, return 0; sdata_info(sdata, - "AP %pM changed bandwidth, new config is %d MHz, width %d (%d/%d MHz)\n", - ifmgd->bssid, chandef.chan->center_freq, chandef.width, - chandef.center_freq1, chandef.center_freq2); + "AP %pM changed bandwidth, new config is %d.%03d MHz, " + "width %d (%d.%03d/%d MHz)\n", + ifmgd->bssid, chandef.chan->center_freq, + chandef.chan->freq_offset, chandef.width, + chandef.center_freq1, chandef.freq1_offset, + chandef.center_freq2); if (flags != (ifmgd->flags & (IEEE80211_STA_DISABLE_HT | IEEE80211_STA_DISABLE_VHT | @@ -654,6 +675,8 @@ static void ieee80211_add_he_ie(struct ieee80211_sub_if_data *sdata, he_cap->he_cap_elem.phy_cap_info); pos = skb_put(skb, he_cap_size); ieee80211_ie_build_he_cap(pos, he_cap, pos + he_cap_size); + + ieee80211_ie_build_he_6ghz_cap(sdata, skb); } static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) @@ -727,6 +750,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) 2 + 1 + sizeof(struct ieee80211_he_cap_elem) + /* HE */ sizeof(struct ieee80211_he_mcs_nss_supp) + IEEE80211_HE_PPE_THRES_MAX_LEN + + 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa) + assoc_data->ie_len + /* extra IEs */ (assoc_data->fils_kek_len ? 16 /* AES-SIV */ : 0) + 9, /* WMM */ @@ -899,7 +923,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))) ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) + if (sband->band != NL80211_BAND_6GHZ && + !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) ieee80211_add_ht_ie(sdata, skb, assoc_data->ap_ht_param, sband, chan, sdata->smps_mode); @@ -953,7 +978,8 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) offset = noffset; } - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) + if (sband->band != NL80211_BAND_6GHZ && + !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) ieee80211_add_vht_ie(sdata, skb, sband, &assoc_data->ap_vht_cap); @@ -1320,6 +1346,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, enum nl80211_band current_band; struct ieee80211_csa_ie csa_ie; struct ieee80211_channel_switch ch_switch; + struct ieee80211_bss *bss; int res; sdata_assert_lock(sdata); @@ -1331,7 +1358,9 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, return; current_band = cbss->channel->band; + bss = (void *)cbss->priv; res = ieee80211_parse_ch_switch_ie(sdata, elems, current_band, + bss->vht_cap_info, ifmgd->flags, ifmgd->associated->bssid, &csa_ie); @@ -1364,10 +1393,14 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, if (!cfg80211_chandef_usable(local->hw.wiphy, &csa_ie.chandef, IEEE80211_CHAN_DISABLED)) { sdata_info(sdata, - "AP %pM switches to unsupported channel (%d MHz, width:%d, CF1/2: %d/%d MHz), disconnecting\n", + "AP %pM switches to unsupported channel " + "(%d.%03d MHz, width:%d, CF1/2: %d.%03d/%d MHz), " + "disconnecting\n", ifmgd->associated->bssid, csa_ie.chandef.chan->center_freq, + csa_ie.chandef.chan->freq_offset, csa_ie.chandef.width, csa_ie.chandef.center_freq1, + csa_ie.chandef.freq1_offset, csa_ie.chandef.center_freq2); ieee80211_queue_work(&local->hw, &ifmgd->csa_connection_drop_work); @@ -1500,6 +1533,7 @@ ieee80211_find_80211h_pwr_constr(struct ieee80211_sub_if_data *sdata, chan_increment = 1; break; case NL80211_BAND_5GHZ: + case NL80211_BAND_6GHZ: chan_increment = 4; break; } @@ -2137,7 +2171,8 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, } use_short_slot = !!(capab & WLAN_CAPABILITY_SHORT_SLOT_TIME); - if (sband->band == NL80211_BAND_5GHZ) + if (sband->band == NL80211_BAND_5GHZ || + sband->band == NL80211_BAND_6GHZ) use_short_slot = true; if (use_protection != bss_conf->use_cts_prot) { @@ -2948,10 +2983,15 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, } if (status_code != WLAN_STATUS_SUCCESS) { + cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); + + if (auth_alg == WLAN_AUTH_SAE && + status_code == WLAN_STATUS_ANTI_CLOG_REQUIRED) + return; + sdata_info(sdata, "%pM denied authentication (status %d)\n", mgmt->sa, status_code); ieee80211_destroy_auth_data(sdata, false); - cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); event.u.mlme.status = MLME_DENIED; event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); @@ -3149,15 +3189,16 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband, *have_higher_than_11mbit = true; /* - * Skip HT and VHT BSS membership selectors since they're not - * rates. + * Skip HT, VHT and HE BSS membership selectors since they're + * not rates. * * Note: Even though the membership selector and the basic * rate flag share the same bit, they are not exactly * the same. */ if (supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HT_PHY) || - supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_VHT_PHY)) + supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_VHT_PHY) || + supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HE_PHY)) continue; for (j = 0; j < sband->n_bitrates; j++) { @@ -3220,6 +3261,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; const struct cfg80211_bss_ies *bss_ies = NULL; struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data; + bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; u32 changed = 0; int err; bool ret; @@ -3249,7 +3291,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, return false; } - ifmgd->aid = aid; + sdata->vif.bss_conf.aid = aid; ifmgd->tdls_chan_switch_prohibited = elems->ext_capab && elems->ext_capab_len >= 5 && (elems->ext_capab[4] & WLAN_EXT_CAPA5_TDLS_CH_SW_PROHIBITED); @@ -3261,11 +3303,12 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * 2G/3G/4G wifi routers, reported models include the "Onda PN51T", * "Vodafone PocketWiFi 2", "ZTE MF60" and a similar T-Mobile device. */ - if ((assoc_data->wmm && !elems->wmm_param) || - (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && - (!elems->ht_cap_elem || !elems->ht_operation)) || - (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && - (!elems->vht_cap_elem || !elems->vht_operation))) { + if (!is_6ghz && + ((assoc_data->wmm && !elems->wmm_param) || + (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && + (!elems->ht_cap_elem || !elems->ht_operation)) || + (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && + (!elems->vht_cap_elem || !elems->vht_operation)))) { const struct cfg80211_bss_ies *ies; struct ieee802_11_elems bss_elems; @@ -3323,7 +3366,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, * We previously checked these in the beacon/probe response, so * they should be present here. This is just a safety net. */ - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && + if (!is_6ghz && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && (!elems->wmm_param || !elems->ht_cap_elem || !elems->ht_operation)) { sdata_info(sdata, "HT AP is missing WMM params or HT capability/operation\n"); @@ -3331,7 +3374,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, goto out; } - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && + if (!is_6ghz && !(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && (!elems->vht_cap_elem || !elems->vht_operation)) { sdata_info(sdata, "VHT AP is missing VHT capability/operation\n"); @@ -3339,6 +3382,14 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, goto out; } + if (is_6ghz && !(ifmgd->flags & IEEE80211_STA_DISABLE_HE) && + !elems->he_6ghz_capa) { + sdata_info(sdata, + "HE 6 GHz AP is missing HE 6 GHz band capability\n"); + ret = false; + goto out; + } + mutex_lock(&sdata->local->sta_mtx); /* * station info was already allocated and inserted before @@ -3381,13 +3432,23 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, ieee80211_he_cap_ie_to_sta_he_cap(sdata, sband, elems->he_cap, elems->he_cap_len, + elems->he_6ghz_capa, sta); bss_conf->he_support = sta->sta.he_cap.has_he; + if (elems->rsnx && elems->rsnx_len && + (elems->rsnx[0] & WLAN_RSNX_CAPA_PROTECTED_TWT) && + wiphy_ext_feature_isset(local->hw.wiphy, + NL80211_EXT_FEATURE_PROTECTED_TWT)) + bss_conf->twt_protected = true; + else + bss_conf->twt_protected = false; + changed |= ieee80211_recalc_twt_req(sdata, sta, elems); } else { bss_conf->he_support = false; bss_conf->twt_requester = false; + bss_conf->twt_protected = false; } if (bss_conf->he_support) { @@ -3521,9 +3582,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, bss_conf->protected_keep_alive = false; } - /* set AID and assoc capability, + /* set assoc capability (AID was already set earlier), * ieee80211_set_associated() will tell the driver */ - bss_conf->aid = aid; bss_conf->assoc_capability = capab_info; ieee80211_set_associated(sdata, cbss, changed); @@ -3661,7 +3721,8 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, sdata_assert_lock(sdata); - channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq); + channel = ieee80211_get_channel_khz(local->hw.wiphy, + ieee80211_rx_status_to_khz(rx_status)); if (!channel) return; @@ -3877,7 +3938,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, return; } - if (rx_status->freq != chanctx_conf->def.chan->center_freq) { + if (ieee80211_rx_status_to_khz(rx_status) != + ieee80211_channel_to_khz(chanctx_conf->def.chan)) { rcu_read_unlock(); return; } @@ -3948,7 +4010,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, mgmt->bssid, bssid); if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && - ieee80211_check_tim(elems.tim, elems.tim_len, ifmgd->aid)) { + ieee80211_check_tim(elems.tim, elems.tim_len, bss_conf->aid)) { if (local->hw.conf.dynamic_ps_timeout > 0) { if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; @@ -4070,8 +4132,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, changed |= ieee80211_recalc_twt_req(sdata, sta, &elems); - if (ieee80211_config_bw(sdata, sta, - elems.ht_cap_elem, elems.ht_operation, + if (ieee80211_config_bw(sdata, sta, elems.ht_cap_elem, + elems.vht_cap_elem, elems.ht_operation, elems.vht_operation, elems.he_operation, bssid, &changed)) { mutex_unlock(&local->sta_mtx); @@ -4788,6 +4850,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, const struct ieee80211_he_operation *he_oper = NULL; struct ieee80211_supported_band *sband; struct cfg80211_chan_def chandef; + bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; + struct ieee80211_bss *bss = (void *)cbss->priv; int ret; u32 i; bool have_80mhz; @@ -4799,21 +4863,23 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, IEEE80211_STA_DISABLE_160MHZ); /* disable HT/VHT/HE if we don't support them */ - if (!sband->ht_cap.ht_supported) { + if (!sband->ht_cap.ht_supported && !is_6ghz) { ifmgd->flags |= IEEE80211_STA_DISABLE_HT; ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; ifmgd->flags |= IEEE80211_STA_DISABLE_HE; } - if (!sband->vht_cap.vht_supported) + if (!sband->vht_cap.vht_supported && !is_6ghz) { ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + ifmgd->flags |= IEEE80211_STA_DISABLE_HE; + } if (!ieee80211_get_he_sta_cap(sband)) ifmgd->flags |= IEEE80211_STA_DISABLE_HE; rcu_read_lock(); - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT) && !is_6ghz) { const u8 *ht_oper_ie, *ht_cap_ie; ht_oper_ie = ieee80211_bss_get_ie(cbss, WLAN_EID_HT_OPERATION); @@ -4830,7 +4896,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } } - if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT)) { + if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT) && !is_6ghz) { const u8 *vht_oper_ie, *vht_cap; vht_oper_ie = ieee80211_bss_get_ie(cbss, @@ -4886,6 +4952,7 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ifmgd->flags |= ieee80211_determine_chantype(sdata, sband, cbss->channel, + bss->vht_cap_info, ht_oper, vht_oper, he_oper, &chandef, false); @@ -4894,6 +4961,11 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); + if (ifmgd->flags & IEEE80211_STA_DISABLE_HE && is_6ghz) { + sdata_info(sdata, "Rejecting non-HE 6/7 GHz connection"); + return -EINVAL; + } + /* will change later if needed */ sdata->smps_mode = IEEE80211_SMPS_OFF; @@ -5022,8 +5094,16 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, * doesn't happen any more, but keep the workaround so * in case some *other* APs are buggy in different ways * we can connect -- with a warning. + * Allow this workaround only in case the AP provided at least + * one rate. */ - if (!basic_rates && min_rate_index >= 0) { + if (min_rate_index < 0) { + sdata_info(sdata, + "No legacy rates in association response\n"); + + sta_info_free(local, new_sta); + return -EINVAL; + } else if (!basic_rates) { sdata_info(sdata, "No basic rates, using min rate instead\n"); basic_rates = BIT(min_rate_index); @@ -5267,6 +5347,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, struct cfg80211_assoc_request *req) { + bool is_6ghz = req->bss->channel->band == NL80211_BAND_6GHZ; struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_bss *bss = (void *)req->bss->priv; @@ -5409,14 +5490,15 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, if (ht_ie && ht_ie[1] >= sizeof(struct ieee80211_ht_operation)) assoc_data->ap_ht_param = ((struct ieee80211_ht_operation *)(ht_ie + 2))->ht_param; - else + else if (!is_6ghz) ifmgd->flags |= IEEE80211_STA_DISABLE_HT; vht_ie = ieee80211_bss_get_ie(req->bss, WLAN_EID_VHT_CAPABILITY); if (vht_ie && vht_ie[1] >= sizeof(struct ieee80211_vht_cap)) memcpy(&assoc_data->ap_vht_cap, vht_ie + 2, sizeof(struct ieee80211_vht_cap)); - else - ifmgd->flags |= IEEE80211_STA_DISABLE_VHT; + else if (!is_6ghz) + ifmgd->flags |= IEEE80211_STA_DISABLE_VHT | + IEEE80211_STA_DISABLE_HE; rcu_read_unlock(); if (WARN((sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_UAPSD) && @@ -5517,7 +5599,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, assoc_data->timeout_started = true; assoc_data->need_beacon = true; } else if (beacon_ies) { - const u8 *ie; + const struct element *elem; u8 dtim_count = 0; ieee80211_get_dtim(beacon_ies, &dtim_count, @@ -5534,15 +5616,15 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.sync_dtim_count = dtim_count; } - ie = cfg80211_find_ext_ie(WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION, - beacon_ies->data, beacon_ies->len); - if (ie && ie[1] >= 3) - sdata->vif.bss_conf.profile_periodicity = ie[4]; + elem = cfg80211_find_ext_elem(WLAN_EID_EXT_MULTIPLE_BSSID_CONFIGURATION, + beacon_ies->data, beacon_ies->len); + if (elem && elem->datalen >= 3) + sdata->vif.bss_conf.profile_periodicity = elem->data[2]; - ie = cfg80211_find_ie(WLAN_EID_EXT_CAPABILITY, - beacon_ies->data, beacon_ies->len); - if (ie && ie[1] >= 11 && - (ie[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) + elem = cfg80211_find_elem(WLAN_EID_EXT_CAPABILITY, + beacon_ies->data, beacon_ies->len); + if (elem && elem->datalen >= 11 && + (elem->data[10] & WLAN_EXT_CAPA11_EMA_SUPPORT)) sdata->vif.bss_conf.ema_ap = true; } else { assoc_data->timeout = jiffies; diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index c710504ccf1a..db3b8bf75656 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -557,6 +557,10 @@ static int ieee80211_start_roc_work(struct ieee80211_local *local, lockdep_assert_held(&local->mtx); + if (channel->freq_offset) + /* this may work, but is untested */ + return -EOPNOTSUPP; + if (local->use_chanctx && !local->ops->remain_on_channel) return -EOPNOTSUPP; diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 5dc3e5bc4e64..b11a2af55b06 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2010-2013 Felix Fietkau <nbd@openwrt.org> + * Copyright (C) 2019-2020 Intel Corporation */ #include <linux/netdevice.h> #include <linux/types.h> @@ -490,7 +491,7 @@ minstrel_ht_assign_best_tp_rates(struct minstrel_ht_sta *mi, tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_mcs_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); - if (tmp_cck_tp_rate && tmp_cck_tp > tmp_mcs_tp) { + if (tmp_cck_tp > tmp_mcs_tp) { for(i = 0; i < MAX_THR_RATES; i++) { minstrel_ht_sort_best_tp_rates(mi, tmp_cck_tp_rate[i], tmp_mcs_tp_rate); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 91a13aee4378..21854a61a2b7 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -93,13 +93,44 @@ static u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, * This function cleans up the SKB, i.e. it removes all the stuff * only useful for monitoring. */ -static void remove_monitor_info(struct sk_buff *skb, - unsigned int present_fcs_len, - unsigned int rtap_space) +static struct sk_buff *ieee80211_clean_skb(struct sk_buff *skb, + unsigned int present_fcs_len, + unsigned int rtap_space) { + struct ieee80211_hdr *hdr; + unsigned int hdrlen; + __le16 fc; + if (present_fcs_len) __pskb_trim(skb, skb->len - present_fcs_len); __pskb_pull(skb, rtap_space); + + hdr = (void *)skb->data; + fc = hdr->frame_control; + + /* + * Remove the HT-Control field (if present) on management + * frames after we've sent the frame to monitoring. We + * (currently) don't need it, and don't properly parse + * frames with it present, due to the assumption of a + * fixed management header length. + */ + if (likely(!ieee80211_is_mgmt(fc) || !ieee80211_has_order(fc))) + return skb; + + hdrlen = ieee80211_hdrlen(fc); + hdr->frame_control &= ~cpu_to_le16(IEEE80211_FCTL_ORDER); + + if (!pskb_may_pull(skb, hdrlen)) { + dev_kfree_skb(skb); + return NULL; + } + + memmove(skb->data + IEEE80211_HT_CTL_LEN, skb->data, + hdrlen - IEEE80211_HT_CTL_LEN); + __pskb_pull(skb, IEEE80211_HT_CTL_LEN); + + return skb; } static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len, @@ -412,6 +443,7 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, pos++; /* IEEE80211_RADIOTAP_CHANNEL */ + /* TODO: frequency offset in KHz */ put_unaligned_le16(status->freq, pos); pos += 2; if (status->bw == RATE_INFO_BW_10) @@ -826,8 +858,8 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, return NULL; } - remove_monitor_info(origskb, present_fcs_len, rtap_space); - return origskb; + return ieee80211_clean_skb(origskb, present_fcs_len, + rtap_space); } ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_space); @@ -870,8 +902,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, if (!origskb) return NULL; - remove_monitor_info(origskb, present_fcs_len, rtap_space); - return origskb; + return ieee80211_clean_skb(origskb, present_fcs_len, rtap_space); } static void ieee80211_parse_qos(struct ieee80211_rx_data *rx) @@ -1984,8 +2015,12 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (mmie_keyidx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS || mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + - NUM_DEFAULT_BEACON_KEYS) + NUM_DEFAULT_BEACON_KEYS) { + cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, + skb->data, + skb->len); return RX_DROP_MONITOR; /* unexpected BIP keyidx */ + } rx->key = ieee80211_rx_get_bigtk(rx, mmie_keyidx); if (!rx->key) @@ -2131,6 +2166,10 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) /* either the frame has been decrypted or will be dropped */ status->flag |= RX_FLAG_DECRYPTED; + if (unlikely(ieee80211_is_beacon(fc) && result == RX_DROP_UNUSABLE)) + cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, + skb->data, skb->len); + return result; } @@ -2411,8 +2450,12 @@ static int ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx) return -EACCES; } if (unlikely(ieee80211_is_beacon(fc) && rx->key && - ieee80211_get_mmie_keyidx(rx->skb) < 0)) + ieee80211_get_mmie_keyidx(rx->skb) < 0)) { + cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, + rx->skb->data, + rx->skb->len); return -EACCES; + } /* * When using MFP, Action frames are not allowed prior to * having configured keys. @@ -3082,9 +3125,10 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) sig = status->signal; - cfg80211_report_obss_beacon(rx->local->hw.wiphy, - rx->skb->data, rx->skb->len, - status->freq, sig); + cfg80211_report_obss_beacon_khz(rx->local->hw.wiphy, + rx->skb->data, rx->skb->len, + ieee80211_rx_status_to_khz(status), + sig); rx->flags |= IEEE80211_RX_BEACON_REPORTED; } @@ -3340,19 +3384,6 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) } } break; - case WLAN_CATEGORY_SA_QUERY: - if (len < (IEEE80211_MIN_ACTION_SIZE + - sizeof(mgmt->u.action.u.sa_query))) - break; - - switch (mgmt->u.action.u.sa_query.action) { - case WLAN_ACTION_SA_QUERY_REQUEST: - if (sdata->vif.type != NL80211_IFTYPE_STATION) - break; - ieee80211_process_sa_query_req(sdata, mgmt, len); - goto handled; - } - break; case WLAN_CATEGORY_SELF_PROTECTED: if (len < (IEEE80211_MIN_ACTION_SIZE + sizeof(mgmt->u.action.u.self_prot.action_code))) @@ -3430,8 +3461,9 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) !(status->flag & RX_FLAG_NO_SIGNAL_VAL)) sig = status->signal; - if (cfg80211_rx_mgmt(&rx->sdata->wdev, status->freq, sig, - rx->skb->data, rx->skb->len, 0)) { + if (cfg80211_rx_mgmt_khz(&rx->sdata->wdev, + ieee80211_rx_status_to_khz(status), sig, + rx->skb->data, rx->skb->len, 0)) { if (rx->sta) rx->sta->rx_stats.packets++; dev_kfree_skb(rx->skb); @@ -3442,6 +3474,41 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) } static ieee80211_rx_result debug_noinline +ieee80211_rx_h_action_post_userspace(struct ieee80211_rx_data *rx) +{ + struct ieee80211_sub_if_data *sdata = rx->sdata; + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; + int len = rx->skb->len; + + if (!ieee80211_is_action(mgmt->frame_control)) + return RX_CONTINUE; + + switch (mgmt->u.action.category) { + case WLAN_CATEGORY_SA_QUERY: + if (len < (IEEE80211_MIN_ACTION_SIZE + + sizeof(mgmt->u.action.u.sa_query))) + break; + + switch (mgmt->u.action.u.sa_query.action) { + case WLAN_ACTION_SA_QUERY_REQUEST: + if (sdata->vif.type != NL80211_IFTYPE_STATION) + break; + ieee80211_process_sa_query_req(sdata, mgmt, len); + goto handled; + } + break; + } + + return RX_CONTINUE; + + handled: + if (rx->sta) + rx->sta->rx_stats.packets++; + dev_kfree_skb(rx->skb); + return RX_QUEUED; +} + +static ieee80211_rx_result debug_noinline ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) { struct ieee80211_local *local = rx->local; @@ -3721,6 +3788,7 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, CALL_RXH(ieee80211_rx_h_mgmt_check); CALL_RXH(ieee80211_rx_h_action); CALL_RXH(ieee80211_rx_h_userspace_mgmt); + CALL_RXH(ieee80211_rx_h_action_post_userspace); CALL_RXH(ieee80211_rx_h_action_return); CALL_RXH(ieee80211_rx_h_mgmt); diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index fdac8192a519..ad90bbe57457 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -132,6 +132,12 @@ ieee80211_update_bss_from_elems(struct ieee80211_local *local, bss->beacon_rate = &sband->bitrates[rx_status->rate_idx]; } + + if (elems->vht_cap_elem) + bss->vht_cap_info = + le32_to_cpu(elems->vht_cap_elem->vht_cap_info); + else + bss->vht_cap_info = 0; } struct ieee80211_bss * @@ -275,7 +281,8 @@ void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) return; } - channel = ieee80211_get_channel(local->hw.wiphy, rx_status->freq); + channel = ieee80211_get_channel_khz(local->hw.wiphy, + ieee80211_rx_status_to_khz(rx_status)); if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) return; @@ -306,8 +313,9 @@ ieee80211_prepare_scan_chandef(struct cfg80211_chan_def *chandef, } /* return false if no more work */ -static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) +static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_local *local = sdata->local; struct cfg80211_scan_request *req; struct cfg80211_chan_def chandef; u8 bands_used = 0; @@ -354,7 +362,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; - ielen = ieee80211_build_preq_ies(local, + ielen = ieee80211_build_preq_ies(sdata, (u8 *)local->hw_scan_req->req.ie, local->hw_scan_ies_bufsize, &local->hw_scan_req->ies, @@ -394,9 +402,12 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) if (WARN_ON(!local->scan_req)) return; + scan_sdata = rcu_dereference_protected(local->scan_sdata, + lockdep_is_held(&local->mtx)); + if (hw_scan && !aborted && !ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS) && - ieee80211_prep_hw_scan(local)) { + ieee80211_prep_hw_scan(scan_sdata)) { int rc; rc = drv_hw_scan(local, @@ -425,9 +436,6 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) cfg80211_scan_done(scan_req, &local->scan_info); } RCU_INIT_POINTER(local->scan_req, NULL); - - scan_sdata = rcu_dereference_protected(local->scan_sdata, - lockdep_is_held(&local->mtx)); RCU_INIT_POINTER(local->scan_sdata, NULL); local->scanning = 0; @@ -769,7 +777,7 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, ieee80211_recalc_idle(local); if (hw_scan) { - WARN_ON(!ieee80211_prep_hw_scan(local)); + WARN_ON(!ieee80211_prep_hw_scan(sdata)); rc = drv_hw_scan(local, sdata, local->hw_scan_req); } else { rc = ieee80211_start_sw_scan(local, sdata); @@ -896,6 +904,7 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, local->scan_chandef.chan = chan; local->scan_chandef.center_freq1 = chan->center_freq; + local->scan_chandef.freq1_offset = chan->freq_offset; local->scan_chandef.center_freq2 = 0; switch (scan_req->scan_width) { case NL80211_BSS_CHAN_WIDTH_5: @@ -1266,7 +1275,7 @@ int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, ieee80211_prepare_scan_chandef(&chandef, req->scan_width); - ieee80211_build_preq_ies(local, ie, num_bands * iebufsz, + ieee80211_build_preq_ies(sdata, ie, num_bands * iebufsz, &sched_scan_ies, req->ie, req->ie_len, bands_used, rate_masks, &chandef, flags); diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index 5fe2b645912f..ae1cb2c68722 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2008, Intel Corporation * Copyright 2008, Johannes Berg <johannes@sipsolutions.net> - * Copyright (C) 2018 Intel Corporation + * Copyright (C) 2018, 2020 Intel Corporation */ #include <linux/ieee80211.h> @@ -22,6 +22,7 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, enum nl80211_band current_band, + u32 vht_cap_info, u32 sta_flags, u8 *bssid, struct ieee80211_csa_ie *csa_ie) { @@ -150,6 +151,7 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, /* ignore if parsing fails */ if (!ieee80211_chandef_vht_oper(&sdata->local->hw, + vht_cap_info, &vht_oper, &ht_oper, &new_vht_chandef)) new_vht_chandef.chan = NULL; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 36f1abaab9ff..49728047dfad 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -3,6 +3,7 @@ * Copyright 2002-2005, Devicescape Software, Inc. * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2015-2017 Intel Deutschland GmbH + * Copyright(c) 2020 Intel Corporation */ #ifndef STA_INFO_H @@ -68,6 +69,8 @@ * @WLAN_STA_MPSP_RECIPIENT: local STA is recipient of a MPSP. * @WLAN_STA_PS_DELIVER: station woke up, but we're still blocking TX * until pending frames are delivered + * @WLAN_STA_USES_ENCRYPTION: This station was configured for encryption, + * so drop all packets without a key later. * * @NUM_WLAN_STA_FLAGS: number of defined flags */ @@ -116,6 +119,7 @@ enum ieee80211_sta_info_flags { #define HT_AGG_STATE_WANT_STOP 5 #define HT_AGG_STATE_START_CB 6 #define HT_AGG_STATE_STOP_CB 7 +#define HT_AGG_STATE_SENT_ADDBA 8 DECLARE_EWMA(avg_signal, 10, 8) enum ieee80211_agg_stop_reason { diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 22512805eafb..7b1bacac39c6 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -649,10 +649,17 @@ static void ieee80211_report_ack_skb(struct ieee80211_local *local, info->status.ack_signal, info->status.is_valid_ack_signal, GFP_ATOMIC); - else + else if (ieee80211_is_mgmt(hdr->frame_control)) cfg80211_mgmt_tx_status(&sdata->wdev, cookie, skb->data, skb->len, acked, GFP_ATOMIC); + else + cfg80211_control_port_tx_status(&sdata->wdev, + cookie, + skb->data, + skb->len, + acked, + GFP_ATOMIC); } rcu_read_unlock(); diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index fca1f5477396..4b0cff4a07bd 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -226,12 +226,11 @@ static void ieee80211_tdls_add_link_ie(struct ieee80211_sub_if_data *sdata, static void ieee80211_tdls_add_aid(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { - struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; u8 *pos = skb_put(skb, 4); *pos++ = WLAN_EID_AID; *pos++ = 2; /* len */ - put_unaligned_le16(ifmgd->aid, pos); + put_unaligned_le16(sdata->vif.bss_conf.aid, pos); } /* translate numbering in the WMM parameter IE to the mac80211 notation */ @@ -1055,7 +1054,7 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, /* disable bottom halves when entering the Tx path */ local_bh_disable(); - __ieee80211_subif_start_xmit(skb, dev, flags, 0); + __ieee80211_subif_start_xmit(skb, dev, flags, 0, NULL); local_bh_enable(); return ret; @@ -1567,6 +1566,10 @@ ieee80211_tdls_channel_switch(struct wiphy *wiphy, struct net_device *dev, u32 ch_sw_tm_ie; int ret; + if (chandef->chan->freq_offset) + /* this may work, but is untested */ + return -EOPNOTSUPP; + mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, addr); if (!sta) { diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 427f51a0a994..1b4709694d2a 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -37,32 +37,42 @@ #define VIF_PR_ARG __get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : "" #define CHANDEF_ENTRY __field(u32, control_freq) \ + __field(u32, freq_offset) \ __field(u32, chan_width) \ __field(u32, center_freq1) \ + __field(u32, freq1_offset) \ __field(u32, center_freq2) #define CHANDEF_ASSIGN(c) \ __entry->control_freq = (c) ? ((c)->chan ? (c)->chan->center_freq : 0) : 0; \ + __entry->freq_offset = (c) ? ((c)->chan ? (c)->chan->freq_offset : 0) : 0; \ __entry->chan_width = (c) ? (c)->width : 0; \ __entry->center_freq1 = (c) ? (c)->center_freq1 : 0; \ + __entry->freq1_offset = (c) ? (c)->freq1_offset : 0; \ __entry->center_freq2 = (c) ? (c)->center_freq2 : 0; -#define CHANDEF_PR_FMT " control:%d MHz width:%d center: %d/%d MHz" -#define CHANDEF_PR_ARG __entry->control_freq, __entry->chan_width, \ - __entry->center_freq1, __entry->center_freq2 +#define CHANDEF_PR_FMT " control:%d.%03d MHz width:%d center: %d.%03d/%d MHz" +#define CHANDEF_PR_ARG __entry->control_freq, __entry->freq_offset, __entry->chan_width, \ + __entry->center_freq1, __entry->freq1_offset, __entry->center_freq2 #define MIN_CHANDEF_ENTRY \ __field(u32, min_control_freq) \ + __field(u32, min_freq_offset) \ __field(u32, min_chan_width) \ __field(u32, min_center_freq1) \ + __field(u32, min_freq1_offset) \ __field(u32, min_center_freq2) #define MIN_CHANDEF_ASSIGN(c) \ __entry->min_control_freq = (c)->chan ? (c)->chan->center_freq : 0; \ + __entry->min_freq_offset = (c)->chan ? (c)->chan->freq_offset : 0; \ __entry->min_chan_width = (c)->width; \ __entry->min_center_freq1 = (c)->center_freq1; \ + __entry->freq1_offset = (c)->freq1_offset; \ __entry->min_center_freq2 = (c)->center_freq2; -#define MIN_CHANDEF_PR_FMT " min_control:%d MHz min_width:%d min_center: %d/%d MHz" -#define MIN_CHANDEF_PR_ARG __entry->min_control_freq, __entry->min_chan_width, \ - __entry->min_center_freq1, __entry->min_center_freq2 +#define MIN_CHANDEF_PR_FMT " min_control:%d.%03d MHz min_width:%d min_center: %d.%03d/%d MHz" +#define MIN_CHANDEF_PR_ARG __entry->min_control_freq, __entry->min_freq_offset, \ + __entry->min_chan_width, \ + __entry->min_center_freq1, __entry->min_freq1_offset, \ + __entry->min_center_freq2 #define CHANCTX_ENTRY CHANDEF_ENTRY \ MIN_CHANDEF_ENTRY \ @@ -412,6 +422,7 @@ TRACE_EVENT(drv_bss_info_changed, __field(s32, cqm_rssi_hyst) __field(u32, channel_width) __field(u32, channel_cfreq1) + __field(u32, channel_cfreq1_offset) __dynamic_array(u32, arp_addr_list, info->arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ? IEEE80211_BSS_ARP_ADDR_LIST_LEN : @@ -452,6 +463,7 @@ TRACE_EVENT(drv_bss_info_changed, __entry->cqm_rssi_hyst = info->cqm_rssi_hyst; __entry->channel_width = info->chandef.width; __entry->channel_cfreq1 = info->chandef.center_freq1; + __entry->channel_cfreq1_offset = info->chandef.freq1_offset; __entry->arp_addr_cnt = info->arp_addr_cnt; memcpy(__get_dynamic_array(arp_addr_list), info->arp_addr_list, sizeof(u32) * (info->arp_addr_cnt > IEEE80211_BSS_ARP_ADDR_LIST_LEN ? @@ -1223,6 +1235,7 @@ TRACE_EVENT(drv_remain_on_channel, LOCAL_ENTRY VIF_ENTRY __field(int, center_freq) + __field(int, freq_offset) __field(unsigned int, duration) __field(u32, type) ), @@ -1231,14 +1244,16 @@ TRACE_EVENT(drv_remain_on_channel, LOCAL_ASSIGN; VIF_ASSIGN; __entry->center_freq = chan->center_freq; + __entry->freq_offset = chan->freq_offset; __entry->duration = duration; __entry->type = type; ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT " freq:%dMHz duration:%dms type=%d", + LOCAL_PR_FMT VIF_PR_FMT " freq:%d.%03dMHz duration:%dms type=%d", LOCAL_PR_ARG, VIF_PR_ARG, - __entry->center_freq, __entry->duration, __entry->type + __entry->center_freq, __entry->freq_offset, + __entry->duration, __entry->type ) ); @@ -1546,8 +1561,10 @@ struct trace_vif_entry { struct trace_chandef_entry { u32 control_freq; + u32 freq_offset; u32 chan_width; u32 center_freq1; + u32 freq1_offset; u32 center_freq2; } __packed; @@ -1597,18 +1614,26 @@ TRACE_EVENT(drv_switch_vif_chanctx, sizeof(local_vifs[i].vif.vif_name)); SWITCH_ENTRY_ASSIGN(old_chandef.control_freq, old_ctx->def.chan->center_freq); + SWITCH_ENTRY_ASSIGN(old_chandef.freq_offset, + old_ctx->def.chan->freq_offset); SWITCH_ENTRY_ASSIGN(old_chandef.chan_width, old_ctx->def.width); SWITCH_ENTRY_ASSIGN(old_chandef.center_freq1, old_ctx->def.center_freq1); + SWITCH_ENTRY_ASSIGN(old_chandef.freq1_offset, + old_ctx->def.freq1_offset); SWITCH_ENTRY_ASSIGN(old_chandef.center_freq2, old_ctx->def.center_freq2); SWITCH_ENTRY_ASSIGN(new_chandef.control_freq, new_ctx->def.chan->center_freq); + SWITCH_ENTRY_ASSIGN(new_chandef.freq_offset, + new_ctx->def.chan->freq_offset); SWITCH_ENTRY_ASSIGN(new_chandef.chan_width, new_ctx->def.width); SWITCH_ENTRY_ASSIGN(new_chandef.center_freq1, new_ctx->def.center_freq1); + SWITCH_ENTRY_ASSIGN(new_chandef.freq1_offset, + new_ctx->def.freq1_offset); SWITCH_ENTRY_ASSIGN(new_chandef.center_freq2, new_ctx->def.center_freq2); } diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 82846aca86d9..e9ce658141f5 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2144,7 +2144,7 @@ static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local, /* * Please update the file - * Documentation/networking/mac80211-injection.txt + * Documentation/networking/mac80211-injection.rst * when parsing new fields here. */ @@ -2436,13 +2436,19 @@ int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, return 0; } -static int ieee80211_store_ack_skb(struct ieee80211_local *local, +static u16 ieee80211_store_ack_skb(struct ieee80211_local *local, struct sk_buff *skb, - u32 *info_flags) + u32 *info_flags, + u64 *cookie) { - struct sk_buff *ack_skb = skb_clone_sk(skb); + struct sk_buff *ack_skb; u16 info_id = 0; + if (skb->sk) + ack_skb = skb_clone_sk(skb); + else + ack_skb = skb_clone(skb, GFP_ATOMIC); + if (ack_skb) { unsigned long flags; int id; @@ -2455,6 +2461,10 @@ static int ieee80211_store_ack_skb(struct ieee80211_local *local, if (id >= 0) { info_id = id; *info_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + if (cookie) { + *cookie = ieee80211_mgmt_tx_cookie(local); + IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie; + } } else { kfree_skb(ack_skb); } @@ -2484,7 +2494,8 @@ static int ieee80211_store_ack_skb(struct ieee80211_local *local, */ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, u32 info_flags, - struct sta_info *sta, u32 ctrl_flags) + struct sta_info *sta, u32 ctrl_flags, + u64 *cookie) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_info *info; @@ -2755,9 +2766,11 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, goto free; } - if (unlikely(!multicast && skb->sk && - skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) - info_id = ieee80211_store_ack_skb(local, skb, &info_flags); + if (unlikely(!multicast && ((skb->sk && + skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) || + ctrl_flags & IEEE80211_TX_CTL_REQ_TX_STATUS))) + info_id = ieee80211_store_ack_skb(local, skb, &info_flags, + cookie); /* * If the skb is shared we need to obtain our own copy. @@ -3913,7 +3926,8 @@ EXPORT_SYMBOL(ieee80211_txq_schedule_start); void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct net_device *dev, u32 info_flags, - u32 ctrl_flags) + u32 ctrl_flags, + u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; @@ -3983,7 +3997,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, skb_mark_not_on_list(skb); skb = ieee80211_build_hdr(sdata, skb, info_flags, - sta, ctrl_flags); + sta, ctrl_flags, cookie); if (IS_ERR(skb)) { kfree_skb_list(next); goto out; @@ -4125,9 +4139,9 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, __skb_queue_head_init(&queue); ieee80211_convert_to_unicast(skb, dev, &queue); while ((skb = __skb_dequeue(&queue))) - __ieee80211_subif_start_xmit(skb, dev, 0, 0); + __ieee80211_subif_start_xmit(skb, dev, 0, 0, NULL); } else { - __ieee80211_subif_start_xmit(skb, dev, 0, 0); + __ieee80211_subif_start_xmit(skb, dev, 0, 0, NULL); } return NETDEV_TX_OK; @@ -4215,7 +4229,7 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, if (unlikely(!multicast && skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) - ieee80211_store_ack_skb(local, skb, &info->flags); + ieee80211_store_ack_skb(local, skb, &info->flags, NULL); memset(info, 0, sizeof(*info)); @@ -4299,7 +4313,7 @@ ieee80211_build_data_template(struct ieee80211_sub_if_data *sdata, goto out; } - skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, 0); + skb = ieee80211_build_hdr(sdata, skb, info_flags, sta, 0, NULL); if (IS_ERR(skb)) goto out; @@ -4883,7 +4897,10 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, txrc.bss_conf = &sdata->vif.bss_conf; txrc.skb = skb; txrc.reported_rate.idx = -1; - txrc.rate_idx_mask = sdata->rc_rateidx_mask[band]; + if (sdata->beacon_rate_set && sdata->beacon_rateidx_mask[band]) + txrc.rate_idx_mask = sdata->beacon_rateidx_mask[band]; + else + txrc.rate_idx_mask = sdata->rc_rateidx_mask[band]; txrc.bss = true; rate_control_get_rate(sdata, NULL, &txrc); @@ -5006,7 +5023,7 @@ struct sk_buff *ieee80211_pspoll_get(struct ieee80211_hw *hw, pspoll = skb_put_zero(skb, sizeof(*pspoll)); pspoll->frame_control = cpu_to_le16(IEEE80211_FTYPE_CTL | IEEE80211_STYPE_PSPOLL); - pspoll->aid = cpu_to_le16(ifmgd->aid); + pspoll->aid = cpu_to_le16(sdata->vif.bss_conf.aid); /* aid in PS-Poll has its two MSBs each set to 1 */ pspoll->aid |= cpu_to_le16(1 << 15 | 1 << 14); @@ -5336,14 +5353,15 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, const u8 *buf, size_t len, - const u8 *dest, __be16 proto, bool unencrypted) + const u8 *dest, __be16 proto, bool unencrypted, + u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct ethhdr *ehdr; u32 ctrl_flags = 0; - u32 flags; + u32 flags = 0; /* Only accept CONTROL_PORT_PROTOCOL configured in CONNECT/ASSOCIATE * or Pre-Authentication @@ -5356,9 +5374,13 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, ctrl_flags |= IEEE80211_TX_CTRL_PORT_CTRL_PROTO; if (unencrypted) - flags = IEEE80211_TX_INTFL_DONT_ENCRYPT; - else - flags = 0; + flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + + if (cookie) + ctrl_flags |= IEEE80211_TX_CTL_REQ_TX_STATUS; + + flags |= IEEE80211_TX_INTFL_NL80211_FRAME_TX | + IEEE80211_TX_CTL_INJECTED; skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(struct ethhdr) + len); @@ -5379,10 +5401,15 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, skb_reset_network_header(skb); skb_reset_mac_header(skb); + /* mutex lock is only needed for incrementing the cookie counter */ + mutex_lock(&local->mtx); + local_bh_disable(); - __ieee80211_subif_start_xmit(skb, skb->dev, flags, ctrl_flags); + __ieee80211_subif_start_xmit(skb, skb->dev, flags, ctrl_flags, cookie); local_bh_enable(); + mutex_unlock(&local->mtx); + return 0; } @@ -5409,7 +5436,8 @@ int ieee80211_probe_mesh_link(struct wiphy *wiphy, struct net_device *dev, local_bh_disable(); __ieee80211_subif_start_xmit(skb, skb->dev, 0, - IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP); + IEEE80211_TX_CTRL_SKIP_MPATH_LOOKUP, + NULL); local_bh_enable(); return 0; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 20436c86b9bf..21c94094a699 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -936,6 +936,10 @@ static void ieee80211_parse_extension_element(u32 *crc, len >= ieee80211_he_spr_size(data)) elems->he_spr = data; break; + case WLAN_EID_EXT_HE_6GHZ_CAPA: + if (len == sizeof(*elems->he_6ghz_capa)) + elems->he_6ghz_capa = data; + break; } } @@ -1659,7 +1663,20 @@ void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata, } } -static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, +static u8 *ieee80211_write_he_6ghz_cap(u8 *pos, __le16 cap, u8 *end) +{ + if ((end - pos) < 5) + return pos; + + *pos++ = WLAN_EID_EXTENSION; + *pos++ = 1 + sizeof(cap); + *pos++ = WLAN_EID_EXT_HE_6GHZ_CAPA; + memcpy(pos, &cap, sizeof(cap)); + + return pos + 2; +} + +static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, const u8 *ie, size_t ie_len, enum nl80211_band band, @@ -1667,6 +1684,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, struct cfg80211_chan_def *chandef, size_t *offset, u32 flags) { + struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; const struct ieee80211_sta_he_cap *he_cap; u8 *pos = buffer, *end = buffer + buffer_len; @@ -1844,6 +1862,14 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, pos = ieee80211_ie_build_he_cap(pos, he_cap, end); if (!pos) goto out_err; + + if (sband->band == NL80211_BAND_6GHZ) { + enum nl80211_iftype iftype = + ieee80211_vif_type_p2p(&sdata->vif); + __le16 cap = ieee80211_get_he_6ghz_capa(sband, iftype); + + pos = ieee80211_write_he_6ghz_cap(pos, cap, end); + } } /* @@ -1858,7 +1884,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_local *local, return pos - buffer; } -int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, +int ieee80211_build_preq_ies(struct ieee80211_sub_if_data *sdata, u8 *buffer, size_t buffer_len, struct ieee80211_scan_ies *ie_desc, const u8 *ie, size_t ie_len, @@ -1873,7 +1899,7 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, for (i = 0; i < NUM_NL80211_BANDS; i++) { if (bands_used & BIT(i)) { - pos += ieee80211_build_preq_ies_band(local, + pos += ieee80211_build_preq_ies_band(sdata, buffer + pos, buffer_len - pos, ie, ie_len, i, @@ -1935,7 +1961,7 @@ struct sk_buff *ieee80211_build_probe_req(struct ieee80211_sub_if_data *sdata, return NULL; rate_masks[chan->band] = ratemask; - ies_len = ieee80211_build_preq_ies(local, skb_tail_pointer(skb), + ies_len = ieee80211_build_preq_ies(sdata, skb_tail_pointer(skb), skb_tailroom(skb), &dummy_ie_desc, ie, ie_len, BIT(chan->band), rate_masks, &chandef, flags); @@ -2835,6 +2861,50 @@ end: return pos; } +void ieee80211_ie_build_he_6ghz_cap(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + struct ieee80211_supported_band *sband; + const struct ieee80211_sband_iftype_data *iftd; + enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); + u8 *pos; + u16 cap; + + sband = ieee80211_get_sband(sdata); + if (!sband) + return; + + iftd = ieee80211_get_sband_iftype_data(sband, iftype); + if (WARN_ON(!iftd)) + return; + + cap = le16_to_cpu(iftd->he_6ghz_capa.capa); + cap &= ~IEEE80211_HE_6GHZ_CAP_SM_PS; + + switch (sdata->smps_mode) { + case IEEE80211_SMPS_AUTOMATIC: + case IEEE80211_SMPS_NUM_MODES: + WARN_ON(1); + /* fall through */ + case IEEE80211_SMPS_OFF: + cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_DISABLED, + IEEE80211_HE_6GHZ_CAP_SM_PS); + break; + case IEEE80211_SMPS_STATIC: + cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_STATIC, + IEEE80211_HE_6GHZ_CAP_SM_PS); + break; + case IEEE80211_SMPS_DYNAMIC: + cap |= u16_encode_bits(WLAN_HT_CAP_SM_PS_DYNAMIC, + IEEE80211_HE_6GHZ_CAP_SM_PS); + break; + } + + pos = skb_put(skb, 2 + 1 + sizeof(cap)); + ieee80211_write_he_6ghz_cap(pos, cpu_to_le16(cap), + pos + 2 + 1 + sizeof(cap)); +} + u8 *ieee80211_ie_build_ht_oper(u8 *pos, struct ieee80211_sta_ht_cap *ht_cap, const struct cfg80211_chan_def *chandef, u16 prot_mode, bool rifs_mode) @@ -2958,13 +3028,18 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, return pos + sizeof(struct ieee80211_vht_operation); } -u8 *ieee80211_ie_build_he_oper(u8 *pos) +u8 *ieee80211_ie_build_he_oper(u8 *pos, struct cfg80211_chan_def *chandef) { struct ieee80211_he_operation *he_oper; + struct ieee80211_he_6ghz_oper *he_6ghz_op; u32 he_oper_params; + u8 ie_len = 1 + sizeof(struct ieee80211_he_operation); + + if (chandef->chan->band == NL80211_BAND_6GHZ) + ie_len += sizeof(struct ieee80211_he_6ghz_oper); *pos++ = WLAN_EID_EXTENSION; - *pos++ = 1 + sizeof(struct ieee80211_he_operation); + *pos++ = ie_len; *pos++ = WLAN_EID_EXT_HE_OPERATION; he_oper_params = 0; @@ -2974,16 +3049,68 @@ u8 *ieee80211_ie_build_he_oper(u8 *pos) IEEE80211_HE_OPERATION_ER_SU_DISABLE); he_oper_params |= u32_encode_bits(1, IEEE80211_HE_OPERATION_BSS_COLOR_DISABLED); + if (chandef->chan->band == NL80211_BAND_6GHZ) + he_oper_params |= u32_encode_bits(1, + IEEE80211_HE_OPERATION_6GHZ_OP_INFO); he_oper = (struct ieee80211_he_operation *)pos; he_oper->he_oper_params = cpu_to_le32(he_oper_params); /* don't require special HE peer rates */ he_oper->he_mcs_nss_set = cpu_to_le16(0xffff); + pos += sizeof(struct ieee80211_he_operation); - /* TODO add VHT operational and 6GHz operational subelement? */ + if (chandef->chan->band != NL80211_BAND_6GHZ) + goto out; - return pos + sizeof(struct ieee80211_vht_operation); + /* TODO add VHT operational */ + he_6ghz_op = (struct ieee80211_he_6ghz_oper *)pos; + he_6ghz_op->minrate = 6; /* 6 Mbps */ + he_6ghz_op->primary = + ieee80211_frequency_to_channel(chandef->chan->center_freq); + he_6ghz_op->ccfs0 = + ieee80211_frequency_to_channel(chandef->center_freq1); + if (chandef->center_freq2) + he_6ghz_op->ccfs1 = + ieee80211_frequency_to_channel(chandef->center_freq2); + else + he_6ghz_op->ccfs1 = 0; + + switch (chandef->width) { + case NL80211_CHAN_WIDTH_160: + /* Convert 160 MHz channel width to new style as interop + * workaround. + */ + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; + he_6ghz_op->ccfs1 = he_6ghz_op->ccfs0; + if (chandef->chan->center_freq < chandef->center_freq1) + he_6ghz_op->ccfs0 -= 8; + else + he_6ghz_op->ccfs0 += 8; + fallthrough; + case NL80211_CHAN_WIDTH_80P80: + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ; + break; + case NL80211_CHAN_WIDTH_80: + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ; + break; + case NL80211_CHAN_WIDTH_40: + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ; + break; + default: + he_6ghz_op->control = + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ; + break; + } + + pos += sizeof(struct ieee80211_he_6ghz_oper); + +out: + return pos; } bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, @@ -3013,7 +3140,7 @@ bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper, return true; } -bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, +bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap_info, const struct ieee80211_vht_operation *oper, const struct ieee80211_ht_operation *htop, struct cfg80211_chan_def *chandef) @@ -3025,6 +3152,10 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, u32 vht_cap; bool support_80_80 = false; bool support_160 = false; + u8 ext_nss_bw_supp = u32_get_bits(vht_cap_info, + IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); + u8 supp_chwidth = u32_get_bits(vht_cap_info, + IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK); if (!oper || !htop) return false; @@ -3044,11 +3175,48 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, IEEE80211_HT_OP_MODE_CCFS2_MASK) >> IEEE80211_HT_OP_MODE_CCFS2_SHIFT; - /* when parsing (and we know how to) CCFS1 and CCFS2 are equivalent */ ccf0 = ccfs0; - ccf1 = ccfs1; - if (!ccfs1 && ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW)) + + /* if not supported, parse as though we didn't understand it */ + if (!ieee80211_hw_check(hw, SUPPORTS_VHT_EXT_NSS_BW)) + ext_nss_bw_supp = 0; + + /* + * Cf. IEEE 802.11 Table 9-250 + * + * We really just consider that because it's inefficient to connect + * at a higher bandwidth than we'll actually be able to use. + */ + switch ((supp_chwidth << 4) | ext_nss_bw_supp) { + default: + case 0x00: + ccf1 = 0; + support_160 = false; + support_80_80 = false; + break; + case 0x01: + support_80_80 = false; + /* fall through */ + case 0x02: + case 0x03: ccf1 = ccfs2; + break; + case 0x10: + ccf1 = ccfs1; + break; + case 0x11: + case 0x12: + if (!ccfs1) + ccf1 = ccfs2; + else + ccf1 = ccfs1; + break; + case 0x13: + case 0x20: + case 0x23: + ccf1 = ccfs1; + break; + } cf0 = ieee80211_channel_to_frequency(ccf0, chandef->chan->band); cf1 = ieee80211_channel_to_frequency(ccf1, chandef->chan->band); @@ -3096,6 +3264,112 @@ bool ieee80211_chandef_vht_oper(struct ieee80211_hw *hw, return true; } +bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_sub_if_data *sdata, + const struct ieee80211_he_operation *he_oper, + struct cfg80211_chan_def *chandef) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_supported_band *sband; + enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); + const struct ieee80211_sta_he_cap *he_cap; + struct cfg80211_chan_def he_chandef = *chandef; + const struct ieee80211_he_6ghz_oper *he_6ghz_oper; + bool support_80_80, support_160; + u8 he_phy_cap; + u32 freq; + + if (chandef->chan->band != NL80211_BAND_6GHZ) + return true; + + sband = local->hw.wiphy->bands[NL80211_BAND_6GHZ]; + + he_cap = ieee80211_get_he_iftype_cap(sband, iftype); + if (!he_cap) { + sdata_info(sdata, "Missing iftype sband data/HE cap"); + return false; + } + + he_phy_cap = he_cap->he_cap_elem.phy_cap_info[0]; + support_160 = + he_phy_cap & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; + support_80_80 = + he_phy_cap & + IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G; + + if (!he_oper) { + sdata_info(sdata, + "HE is not advertised on (on %d MHz), expect issues\n", + chandef->chan->center_freq); + return false; + } + + he_6ghz_oper = ieee80211_he_6ghz_oper(he_oper); + + if (!he_6ghz_oper) { + sdata_info(sdata, + "HE 6GHz operation missing (on %d MHz), expect issues\n", + chandef->chan->center_freq); + return false; + } + + freq = ieee80211_channel_to_frequency(he_6ghz_oper->primary, + NL80211_BAND_6GHZ); + he_chandef.chan = ieee80211_get_channel(sdata->local->hw.wiphy, freq); + + switch (u8_get_bits(he_6ghz_oper->control, + IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH)) { + case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_20MHZ: + he_chandef.width = NL80211_CHAN_WIDTH_20; + break; + case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_40MHZ: + he_chandef.width = NL80211_CHAN_WIDTH_40; + break; + case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_80MHZ: + he_chandef.width = NL80211_CHAN_WIDTH_80; + break; + case IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ: + he_chandef.width = NL80211_CHAN_WIDTH_80; + if (!he_6ghz_oper->ccfs1) + break; + if (abs(he_6ghz_oper->ccfs1 - he_6ghz_oper->ccfs0) == 8) { + if (support_160) + he_chandef.width = NL80211_CHAN_WIDTH_160; + } else { + if (support_80_80) + he_chandef.width = NL80211_CHAN_WIDTH_80P80; + } + break; + } + + if (he_chandef.width == NL80211_CHAN_WIDTH_160) { + he_chandef.center_freq1 = + ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, + NL80211_BAND_6GHZ); + } else { + he_chandef.center_freq1 = + ieee80211_channel_to_frequency(he_6ghz_oper->ccfs0, + NL80211_BAND_6GHZ); + he_chandef.center_freq2 = + ieee80211_channel_to_frequency(he_6ghz_oper->ccfs1, + NL80211_BAND_6GHZ); + } + + if (!cfg80211_chandef_valid(&he_chandef)) { + sdata_info(sdata, + "HE 6GHz operation resulted in invalid chandef: %d MHz/%d/%d MHz/%d MHz\n", + he_chandef.chan ? he_chandef.chan->center_freq : 0, + he_chandef.width, + he_chandef.center_freq1, + he_chandef.center_freq2); + return false; + } + + *chandef = he_chandef; + + return true; +} + int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef, const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates) diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 632f07401850..9c6045f9c24d 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -4,7 +4,7 @@ * * Portions of this file * Copyright(c) 2015 - 2016 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2020 Intel Corporation */ #include <linux/ieee80211.h> @@ -575,15 +575,21 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) { case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: + /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_20; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: + /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_40; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: - sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; + if (opmode & IEEE80211_OPMODE_NOTIF_BW_160_80P80) + sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; + else + sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: + /* legacy only, no longer used by newer spec */ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; break; } diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 4701edffb1f7..fd30ea61336e 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1362,8 +1362,7 @@ done: (&((struct mpls_dev *)0)->field) static int mpls_conf_proc(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int oval = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -1594,7 +1593,8 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, dev->type == ARPHRD_IPGRE || dev->type == ARPHRD_IP6GRE || dev->type == ARPHRD_SIT || - dev->type == ARPHRD_TUNNEL) { + dev->type == ARPHRD_TUNNEL || + dev->type == ARPHRD_TUNNEL6) { mdev = mpls_add_dev(dev); if (IS_ERR(mdev)) return notifier_from_errno(PTR_ERR(mdev)); @@ -2594,7 +2594,7 @@ nolabels: } static int mpls_platform_labels(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = table->data; int platform_labels = net->mpls.platform_labels; diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 0e9aa94adc07..838cdfc10e47 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -172,17 +172,6 @@ struct mpls_route { /* next hop label forwarding entry */ #define endfor_nexthops(rt) } -static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, unsigned ttl, unsigned tc, bool bos) -{ - struct mpls_shim_hdr result; - result.label_stack_entry = - cpu_to_be32((label << MPLS_LS_LABEL_SHIFT) | - (tc << MPLS_LS_TC_SHIFT) | - (bos ? (1 << MPLS_LS_S_SHIFT) : 0) | - (ttl << MPLS_LS_TTL_SHIFT)); - return result; -} - static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *hdr) { struct mpls_entry_decoded result; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 7793b6011fa7..01f1f4cf4902 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -517,7 +517,16 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, return ret; } - ack_size = TCPOLEN_MPTCP_DSS_ACK64; + if (subflow->use_64bit_ack) { + ack_size = TCPOLEN_MPTCP_DSS_ACK64; + opts->ext_copy.data_ack = msk->ack_seq; + opts->ext_copy.ack64 = 1; + } else { + ack_size = TCPOLEN_MPTCP_DSS_ACK32; + opts->ext_copy.data_ack32 = (uint32_t)(msk->ack_seq); + opts->ext_copy.ack64 = 0; + } + opts->ext_copy.use_ack = 1; /* Add kind/length/subtype/flag overhead if mapping is not populated */ if (dss_size == 0) @@ -525,10 +534,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, dss_size += ack_size; - opts->ext_copy.data_ack = msk->ack_seq; - opts->ext_copy.ack64 = 1; - opts->ext_copy.use_ack = 1; - *size = ALIGN(dss_size, 4); return true; } @@ -987,8 +992,13 @@ mp_capable_done: u8 flags = 0; if (mpext->use_ack) { - len += TCPOLEN_MPTCP_DSS_ACK64; - flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64; + flags = MPTCP_DSS_HAS_ACK; + if (mpext->ack64) { + len += TCPOLEN_MPTCP_DSS_ACK64; + flags |= MPTCP_DSS_ACK64; + } else { + len += TCPOLEN_MPTCP_DSS_ACK32; + } } if (mpext->use_map) { @@ -1005,8 +1015,13 @@ mp_capable_done: *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); if (mpext->use_ack) { - put_unaligned_be64(mpext->data_ack, ptr); - ptr += 2; + if (mpext->ack64) { + put_unaligned_be64(mpext->data_ack, ptr); + ptr += 2; + } else { + put_unaligned_be32(mpext->data_ack32, ptr); + ptr += 1; + } } if (mpext->use_map) { diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 34dd0e278a82..14b253d10ccf 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -144,12 +144,29 @@ static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, unsigned int offset, size_t copy_len) { struct sock *sk = (struct sock *)msk; + struct sk_buff *tail; __skb_unlink(skb, &ssk->sk_receive_queue); - skb_set_owner_r(skb, sk); - __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_ext_reset(skb); + skb_orphan(skb); msk->ack_seq += copy_len; + + tail = skb_peek_tail(&sk->sk_receive_queue); + if (offset == 0 && tail) { + bool fragstolen; + int delta; + + if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { + kfree_skb_partial(skb, fragstolen); + atomic_add(delta, &sk->sk_rmem_alloc); + sk_mem_charge(sk, delta); + return; + } + } + + skb_set_owner_r(skb, sk); + __skb_queue_tail(&sk->sk_receive_queue, skb); MPTCP_SKB_CB(skb)->offset = offset; } @@ -367,8 +384,10 @@ static void mptcp_stop_timer(struct sock *sk) static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) { + const struct sock *sk = (const struct sock *)msk; + if (!msk->cached_ext) - msk->cached_ext = __skb_ext_alloc(); + msk->cached_ext = __skb_ext_alloc(sk->sk_allocation); return !!msk->cached_ext; } @@ -510,20 +529,6 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, * fooled into a warning if we don't init here */ pfrag = sk_page_frag(sk); - while ((!retransmission && !mptcp_page_frag_refill(ssk, pfrag)) || - !mptcp_ext_cache_refill(msk)) { - ret = sk_stream_wait_memory(ssk, timeo); - if (ret) - return ret; - - /* if sk_stream_wait_memory() sleeps snd_una can change - * significantly, refresh the rtx queue - */ - mptcp_clean_una(sk); - - if (unlikely(__mptcp_needs_tcp_fallback(msk))) - return 0; - } if (!retransmission) { write_seq = &msk->write_seq; page = pfrag->page; @@ -590,7 +595,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, * access the skb after the sendpages call */ ret = do_tcp_sendpages(ssk, page, offset, psize, - msg->msg_flags | MSG_SENDPAGE_NOTLAST); + msg->msg_flags | MSG_SENDPAGE_NOTLAST | MSG_DONTWAIT); if (ret <= 0) return ret; @@ -653,6 +658,15 @@ out: return ret; } +static void mptcp_nospace(struct mptcp_sock *msk, struct socket *sock) +{ + clear_bit(MPTCP_SEND_SPACE, &msk->flags); + smp_mb__after_atomic(); /* msk->flags is changed by write_space cb */ + + /* enables sk->write_space() callbacks */ + set_bit(SOCK_NOSPACE, &sock->flags); +} + static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; @@ -660,19 +674,17 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) sock_owned_by_me((const struct sock *)msk); + if (!mptcp_ext_cache_refill(msk)) + return NULL; + mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (!sk_stream_memory_free(ssk)) { struct socket *sock = ssk->sk_socket; - if (sock) { - clear_bit(MPTCP_SEND_SPACE, &msk->flags); - smp_mb__after_atomic(); - - /* enables sk->write_space() callbacks */ - set_bit(SOCK_NOSPACE, &sock->flags); - } + if (sock) + mptcp_nospace(msk, sock); return NULL; } @@ -698,22 +710,19 @@ static void ssk_check_wmem(struct mptcp_sock *msk, struct sock *ssk) return; sock = READ_ONCE(ssk->sk_socket); - - if (sock) { - clear_bit(MPTCP_SEND_SPACE, &msk->flags); - smp_mb__after_atomic(); - /* set NOSPACE only after clearing SEND_SPACE flag */ - set_bit(SOCK_NOSPACE, &sock->flags); - } + if (sock) + mptcp_nospace(msk, sock); } static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { int mss_now = 0, size_goal = 0, ret = 0; struct mptcp_sock *msk = mptcp_sk(sk); + struct page_frag *pfrag; struct socket *ssock; size_t copied = 0; struct sock *ssk; + bool tx_ok; long timeo; if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL)) @@ -738,11 +747,29 @@ fallback: return ret >= 0 ? ret + copied : (copied ? copied : ret); } + pfrag = sk_page_frag(sk); +restart: mptcp_clean_una(sk); +wait_for_sndbuf: __mptcp_flush_join_list(msk); ssk = mptcp_subflow_get_send(msk); - while (!sk_stream_memory_free(sk) || !ssk) { + while (!sk_stream_memory_free(sk) || + !ssk || + !mptcp_page_frag_refill(ssk, pfrag)) { + if (ssk) { + /* make sure retransmit timer is + * running before we wait for memory. + * + * The retransmit timer might be needed + * to make the peer send an up-to-date + * MPTCP Ack. + */ + mptcp_set_timeout(sk, ssk); + if (!mptcp_timer_pending(sk)) + mptcp_reset_timer(sk); + } + ret = sk_stream_wait_memory(sk, &timeo); if (ret) goto out; @@ -759,11 +786,18 @@ fallback: pr_debug("conn_list->subflow=%p", ssk); lock_sock(ssk); - while (msg_data_left(msg)) { + tx_ok = msg_data_left(msg); + while (tx_ok) { ret = mptcp_sendmsg_frag(sk, ssk, msg, NULL, &timeo, &mss_now, &size_goal); - if (ret < 0) + if (ret < 0) { + if (ret == -EAGAIN && timeo > 0) { + mptcp_set_timeout(sk, ssk); + release_sock(ssk); + goto restart; + } break; + } if (ret == 0 && unlikely(__mptcp_needs_tcp_fallback(msk))) { /* Can happen for passive sockets: * 3WHS negotiated MPTCP, but first packet after is @@ -777,6 +811,50 @@ fallback: } copied += ret; + + tx_ok = msg_data_left(msg); + if (!tx_ok) + break; + + if (!sk_stream_memory_free(ssk) || + !mptcp_page_frag_refill(ssk, pfrag) || + !mptcp_ext_cache_refill(msk)) { + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + tcp_push(ssk, msg->msg_flags, mss_now, + tcp_sk(ssk)->nonagle, size_goal); + mptcp_set_timeout(sk, ssk); + release_sock(ssk); + goto restart; + } + + /* memory is charged to mptcp level socket as well, i.e. + * if msg is very large, mptcp socket may run out of buffer + * space. mptcp_clean_una() will release data that has + * been acked at mptcp level in the mean time, so there is + * a good chance we can continue sending data right away. + * + * Normally, when the tcp subflow can accept more data, then + * so can the MPTCP socket. However, we need to cope with + * peers that might lag behind in their MPTCP-level + * acknowledgements, i.e. data might have been acked at + * tcp level only. So, we must also check the MPTCP socket + * limits before we send more data. + */ + if (unlikely(!sk_stream_memory_free(sk))) { + tcp_push(ssk, msg->msg_flags, mss_now, + tcp_sk(ssk)->nonagle, size_goal); + mptcp_clean_una(sk); + if (!sk_stream_memory_free(sk)) { + /* can't send more for now, need to wait for + * MPTCP-level ACKs from peer. + * + * Wakeup will happen via mptcp_clean_una(). + */ + mptcp_set_timeout(sk, ssk); + release_sock(ssk); + goto wait_for_sndbuf; + } + } } mptcp_set_timeout(sk, ssk); @@ -1095,7 +1173,7 @@ static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); struct sock *ssk, *sk = &msk->sk.icsk_inet.sk; - int orig_len, orig_offset, ret, mss_now = 0, size_goal = 0; + int orig_len, orig_offset, mss_now = 0, size_goal = 0; struct mptcp_data_frag *dfrag; u64 orig_write_seq; size_t copied = 0; @@ -1117,6 +1195,9 @@ static void mptcp_worker(struct work_struct *work) if (!dfrag) goto unlock; + if (!mptcp_ext_cache_refill(msk)) + goto reset_unlock; + ssk = mptcp_subflow_get_retrans(msk); if (!ssk) goto reset_unlock; @@ -1128,8 +1209,8 @@ static void mptcp_worker(struct work_struct *work) orig_offset = dfrag->offset; orig_write_seq = dfrag->data_seq; while (dfrag->data_len > 0) { - ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, &mss_now, - &size_goal); + int ret = mptcp_sendmsg_frag(sk, ssk, &msg, dfrag, &timeo, + &mss_now, &size_goal); if (ret < 0) break; @@ -1137,6 +1218,9 @@ static void mptcp_worker(struct work_struct *work) copied += ret; dfrag->data_len -= ret; dfrag->offset += ret; + + if (!mptcp_ext_cache_refill(msk)) + break; } if (copied) tcp_push(ssk, msg.msg_flags, mss_now, tcp_sk(ssk)->nonagle, @@ -1653,13 +1737,6 @@ bool mptcp_finish_join(struct sock *sk) return true; } -bool mptcp_sk_is_subflow(const struct sock *sk) -{ - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - - return subflow->mp_join == 1; -} - static bool mptcp_memory_free(const struct sock *sk, int wake) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -2037,6 +2114,7 @@ static const struct proto_ops mptcp_v6_stream_ops = { .mmap = sock_no_mmap, .sendpage = inet_sendpage, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index d0803dfb8108..809687d3f410 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -289,6 +289,7 @@ struct mptcp_subflow_context { data_avail : 1, rx_eof : 1, data_fin_tx_enable : 1, + use_64bit_ack : 1, /* Set when we received a 64-bit DSN */ can_ack : 1; /* only after processing the remote a key */ u64 data_fin_tx_seq; u32 remote_nonce; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 8968b2c065e7..493b98a0825c 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -413,6 +413,20 @@ static void subflow_ulp_fallback(struct sock *sk, tcp_sk(sk)->is_mptcp = 0; } +static void subflow_drop_ctx(struct sock *ssk) +{ + struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk); + + if (!ctx) + return; + + subflow_ulp_fallback(ssk, ctx); + if (ctx->conn) + sock_put(ctx->conn); + + kfree_rcu(ctx, rcu); +} + static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, @@ -475,18 +489,17 @@ create_child: if (child && *own_req) { struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child); + tcp_rsk(req)->drop_req = false; + /* we need to fallback on ctx allocation failure and on pre-reqs * checking above. In the latter scenario we additionally need * to reset the context to non MPTCP status. */ if (!ctx || fallback) { if (fallback_is_fatal) - goto close_child; + goto dispose_child; - if (ctx) { - subflow_ulp_fallback(child, ctx); - kfree_rcu(ctx, rcu); - } + subflow_drop_ctx(child); goto out; } @@ -510,13 +523,14 @@ create_child: owner = mptcp_token_get_sock(ctx->token); if (!owner) - goto close_child; + goto dispose_child; ctx->conn = (struct sock *)owner; if (!mptcp_finish_join(child)) - goto close_child; + goto dispose_child; SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX); + tcp_rsk(req)->drop_req = true; } } @@ -533,11 +547,15 @@ out: !mptcp_subflow_ctx(child)->conn)); return child; -close_child: +dispose_child: + subflow_drop_ctx(child); + tcp_rsk(req)->drop_req = true; tcp_send_active_reset(child, GFP_ATOMIC); - inet_csk_prepare_forced_close(child); + inet_csk_prepare_for_destroy_sock(child); tcp_done(child); - return NULL; + + /* The last child reference will be released by the caller */ + return child; } static struct inet_connection_sock_af_ops subflow_specific; @@ -666,9 +684,11 @@ static enum mapping_status get_mapping_status(struct sock *ssk) if (!mpext->dsn64) { map_seq = expand_seq(subflow->map_seq, subflow->map_data_len, mpext->data_seq); + subflow->use_64bit_ack = 0; pr_debug("expanded seq=%llu", subflow->map_seq); } else { map_seq = mpext->data_seq; + subflow->use_64bit_ack = 1; } if (subflow->map_valid) { @@ -850,6 +870,24 @@ bool mptcp_subflow_data_available(struct sock *sk) return subflow->data_avail; } +/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy, + * not the ssk one. + * + * In mptcp, rwin is about the mptcp-level connection data. + * + * Data that is still on the ssk rx queue can thus be ignored, + * as far as mptcp peer is concerened that data is still inflight. + * DSS ACK is updated when skb is moved to the mptcp rx queue. + */ +void mptcp_space(const struct sock *ssk, int *space, int *full_space) +{ + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + const struct sock *sk = subflow->conn; + + *space = tcp_space(sk); + *full_space = tcp_full_space(sk); +} + static void subflow_data_ready(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 468fea1aebba..3a3915d2e1ea 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -1043,7 +1043,7 @@ config NETFILTER_XT_TARGET_TPROXY on Netfilter connection tracking and NAT, unlike REDIRECT. For it to work you will have to configure certain iptables rules and use policy routing. For more information on how to set it up - see Documentation/networking/tproxy.txt. + see Documentation/networking/tproxy.rst. To compile it as a module, choose M here. If unsure, say N. diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 8d14a1acbc37..412656c34f20 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1736,7 +1736,7 @@ static int three = 3; static int proc_do_defense_mode(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; @@ -1763,7 +1763,7 @@ proc_do_defense_mode(struct ctl_table *table, int write, static int proc_do_sync_threshold(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val[2]; @@ -1788,7 +1788,7 @@ proc_do_sync_threshold(struct ctl_table *table, int write, static int proc_do_sync_ports(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val = *valp; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index bb72ca5f3999..79cd9dde457b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1974,13 +1974,22 @@ const struct nla_policy nf_ct_port_nla_policy[CTA_PROTO_MAX+1] = { EXPORT_SYMBOL_GPL(nf_ct_port_nla_policy); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) + struct nf_conntrack_tuple *t, + u_int32_t flags) { - if (!tb[CTA_PROTO_SRC_PORT] || !tb[CTA_PROTO_DST_PORT]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) { + if (!tb[CTA_PROTO_SRC_PORT]) + return -EINVAL; + + t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); + } - t->src.u.tcp.port = nla_get_be16(tb[CTA_PROTO_SRC_PORT]); - t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); + if (flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) { + if (!tb[CTA_PROTO_DST_PORT]) + return -EINVAL; + + t->dst.u.tcp.port = nla_get_be16(tb[CTA_PROTO_DST_PORT]); + } return 0; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9ddfcd002d3b..d7bd8b1f27d5 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -54,6 +54,8 @@ #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> +#include "nf_internals.h" + MODULE_LICENSE("GPL"); static int ctnetlink_dump_tuples_proto(struct sk_buff *skb, @@ -544,14 +546,16 @@ static int ctnetlink_dump_info(struct sk_buff *skb, struct nf_conn *ct) static int ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, - struct nf_conn *ct, bool extinfo) + struct nf_conn *ct, bool extinfo, unsigned int flags) { const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; struct nlattr *nest_parms; - unsigned int flags = portid ? NLM_F_MULTI : 0, event; + unsigned int event; + if (portid) + flags |= NLM_F_MULTI; event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) @@ -847,17 +851,70 @@ static int ctnetlink_done(struct netlink_callback *cb) } struct ctnetlink_filter { + u_int32_t cta_flags; u8 family; + + u_int32_t orig_flags; + u_int32_t reply_flags; + + struct nf_conntrack_tuple orig; + struct nf_conntrack_tuple reply; + struct nf_conntrack_zone zone; + struct { u_int32_t val; u_int32_t mask; } mark; }; +static const struct nla_policy cta_filter_nla_policy[CTA_FILTER_MAX + 1] = { + [CTA_FILTER_ORIG_FLAGS] = { .type = NLA_U32 }, + [CTA_FILTER_REPLY_FLAGS] = { .type = NLA_U32 }, +}; + +static int ctnetlink_parse_filter(const struct nlattr *attr, + struct ctnetlink_filter *filter) +{ + struct nlattr *tb[CTA_FILTER_MAX + 1]; + int ret = 0; + + ret = nla_parse_nested(tb, CTA_FILTER_MAX, attr, cta_filter_nla_policy, + NULL); + if (ret) + return ret; + + if (tb[CTA_FILTER_ORIG_FLAGS]) { + filter->orig_flags = nla_get_u32(tb[CTA_FILTER_ORIG_FLAGS]); + if (filter->orig_flags & ~CTA_FILTER_F_ALL) + return -EOPNOTSUPP; + } + + if (tb[CTA_FILTER_REPLY_FLAGS]) { + filter->reply_flags = nla_get_u32(tb[CTA_FILTER_REPLY_FLAGS]); + if (filter->reply_flags & ~CTA_FILTER_F_ALL) + return -EOPNOTSUPP; + } + + return 0; +} + +static int ctnetlink_parse_zone(const struct nlattr *attr, + struct nf_conntrack_zone *zone); +static int ctnetlink_parse_tuple_filter(const struct nlattr * const cda[], + struct nf_conntrack_tuple *tuple, + u32 type, u_int8_t l3num, + struct nf_conntrack_zone *zone, + u_int32_t flags); + +/* applied on filters */ +#define CTA_FILTER_F_CTA_MARK (1 << 0) +#define CTA_FILTER_F_CTA_MARK_MASK (1 << 1) + static struct ctnetlink_filter * ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) { struct ctnetlink_filter *filter; + int err; #ifndef CONFIG_NF_CONNTRACK_MARK if (cda[CTA_MARK] || cda[CTA_MARK_MASK]) @@ -871,14 +928,65 @@ ctnetlink_alloc_filter(const struct nlattr * const cda[], u8 family) filter->family = family; #ifdef CONFIG_NF_CONNTRACK_MARK - if (cda[CTA_MARK] && cda[CTA_MARK_MASK]) { + if (cda[CTA_MARK]) { filter->mark.val = ntohl(nla_get_be32(cda[CTA_MARK])); - filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + filter->cta_flags |= CTA_FILTER_FLAG(CTA_MARK); + + if (cda[CTA_MARK_MASK]) { + filter->mark.mask = ntohl(nla_get_be32(cda[CTA_MARK_MASK])); + filter->cta_flags |= CTA_FILTER_FLAG(CTA_MARK_MASK); + } else { + filter->mark.mask = 0xffffffff; + } + } else if (cda[CTA_MARK_MASK]) { + return ERR_PTR(-EINVAL); } #endif + if (!cda[CTA_FILTER]) + return filter; + + err = ctnetlink_parse_zone(cda[CTA_ZONE], &filter->zone); + if (err < 0) + return ERR_PTR(err); + + err = ctnetlink_parse_filter(cda[CTA_FILTER], filter); + if (err < 0) + return ERR_PTR(err); + + if (filter->orig_flags) { + if (!cda[CTA_TUPLE_ORIG]) + return ERR_PTR(-EINVAL); + + err = ctnetlink_parse_tuple_filter(cda, &filter->orig, + CTA_TUPLE_ORIG, + filter->family, + &filter->zone, + filter->orig_flags); + if (err < 0) + return ERR_PTR(err); + } + + if (filter->reply_flags) { + if (!cda[CTA_TUPLE_REPLY]) + return ERR_PTR(-EINVAL); + + err = ctnetlink_parse_tuple_filter(cda, &filter->reply, + CTA_TUPLE_REPLY, + filter->family, + &filter->zone, + filter->orig_flags); + if (err < 0) + return ERR_PTR(err); + } + return filter; } +static bool ctnetlink_needs_filter(u8 family, const struct nlattr * const *cda) +{ + return family || cda[CTA_MARK] || cda[CTA_FILTER]; +} + static int ctnetlink_start(struct netlink_callback *cb) { const struct nlattr * const *cda = cb->data; @@ -886,7 +994,7 @@ static int ctnetlink_start(struct netlink_callback *cb) struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u8 family = nfmsg->nfgen_family; - if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) { + if (ctnetlink_needs_filter(family, cda)) { filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); @@ -896,9 +1004,79 @@ static int ctnetlink_start(struct netlink_callback *cb) return 0; } +static int ctnetlink_filter_match_tuple(struct nf_conntrack_tuple *filter_tuple, + struct nf_conntrack_tuple *ct_tuple, + u_int32_t flags, int family) +{ + switch (family) { + case NFPROTO_IPV4: + if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) && + filter_tuple->src.u3.ip != ct_tuple->src.u3.ip) + return 0; + + if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) && + filter_tuple->dst.u3.ip != ct_tuple->dst.u3.ip) + return 0; + break; + case NFPROTO_IPV6: + if ((flags & CTA_FILTER_FLAG(CTA_IP_SRC)) && + !ipv6_addr_cmp(&filter_tuple->src.u3.in6, + &ct_tuple->src.u3.in6)) + return 0; + + if ((flags & CTA_FILTER_FLAG(CTA_IP_DST)) && + !ipv6_addr_cmp(&filter_tuple->dst.u3.in6, + &ct_tuple->dst.u3.in6)) + return 0; + break; + } + + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) && + filter_tuple->dst.protonum != ct_tuple->dst.protonum) + return 0; + + switch (ct_tuple->dst.protonum) { + case IPPROTO_TCP: + case IPPROTO_UDP: + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_SRC_PORT)) && + filter_tuple->src.u.tcp.port != ct_tuple->src.u.tcp.port) + return 0; + + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_DST_PORT)) && + filter_tuple->dst.u.tcp.port != ct_tuple->dst.u.tcp.port) + return 0; + break; + case IPPROTO_ICMP: + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) && + filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type) + return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) && + filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code) + return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) && + filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id) + return 0; + break; + case IPPROTO_ICMPV6: + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) && + filter_tuple->dst.u.icmp.type != ct_tuple->dst.u.icmp.type) + return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) && + filter_tuple->dst.u.icmp.code != ct_tuple->dst.u.icmp.code) + return 0; + if ((flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) && + filter_tuple->src.u.icmp.id != ct_tuple->src.u.icmp.id) + return 0; + break; + } + + return 1; +} + static int ctnetlink_filter_match(struct nf_conn *ct, void *data) { struct ctnetlink_filter *filter = data; + struct nf_conntrack_tuple *tuple; if (filter == NULL) goto out; @@ -910,8 +1088,28 @@ static int ctnetlink_filter_match(struct nf_conn *ct, void *data) if (filter->family && nf_ct_l3num(ct) != filter->family) goto ignore_entry; + if (filter->orig_flags) { + tuple = nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL); + if (!ctnetlink_filter_match_tuple(&filter->orig, tuple, + filter->orig_flags, + filter->family)) + goto ignore_entry; + } + + if (filter->reply_flags) { + tuple = nf_ct_tuple(ct, IP_CT_DIR_REPLY); + if (!ctnetlink_filter_match_tuple(&filter->reply, tuple, + filter->reply_flags, + filter->family)) + goto ignore_entry; + } + #ifdef CONFIG_NF_CONNTRACK_MARK - if ((ct->mark & filter->mark.mask) != filter->mark.val) + if ((filter->cta_flags & CTA_FILTER_FLAG(CTA_MARK_MASK)) && + (ct->mark & filter->mark.mask) != filter->mark.val) + goto ignore_entry; + else if ((filter->cta_flags & CTA_FILTER_FLAG(CTA_MARK)) && + ct->mark != filter->mark.val) goto ignore_entry; #endif @@ -925,6 +1123,7 @@ ignore_entry: static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { + unsigned int flags = cb->data ? NLM_F_DUMP_FILTERED : 0; struct net *net = sock_net(skb->sk); struct nf_conn *ct, *last; struct nf_conntrack_tuple_hash *h; @@ -979,7 +1178,7 @@ restart: ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct, true); + ct, true, flags); if (res < 0) { nf_conntrack_get(&ct->ct_general); cb->args[1] = (unsigned long)ct; @@ -1014,31 +1213,50 @@ out: } static int ipv4_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) + struct nf_conntrack_tuple *t, + u_int32_t flags) { - if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { + if (!tb[CTA_IP_V4_SRC]) + return -EINVAL; + + t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); + } + + if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) { + if (!tb[CTA_IP_V4_DST]) + return -EINVAL; - t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]); - t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); + t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]); + } return 0; } static int ipv6_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *t) + struct nf_conntrack_tuple *t, + u_int32_t flags) { - if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { + if (!tb[CTA_IP_V6_SRC]) + return -EINVAL; - t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); - t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); + t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]); + } + + if (flags & CTA_FILTER_FLAG(CTA_IP_DST)) { + if (!tb[CTA_IP_V6_DST]) + return -EINVAL; + + t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]); + } return 0; } static int ctnetlink_parse_tuple_ip(struct nlattr *attr, - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { struct nlattr *tb[CTA_IP_MAX+1]; int ret = 0; @@ -1054,10 +1272,10 @@ static int ctnetlink_parse_tuple_ip(struct nlattr *attr, switch (tuple->src.l3num) { case NFPROTO_IPV4: - ret = ipv4_nlattr_to_tuple(tb, tuple); + ret = ipv4_nlattr_to_tuple(tb, tuple, flags); break; case NFPROTO_IPV6: - ret = ipv6_nlattr_to_tuple(tb, tuple); + ret = ipv6_nlattr_to_tuple(tb, tuple, flags); break; } @@ -1069,7 +1287,8 @@ static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = { }; static int ctnetlink_parse_tuple_proto(struct nlattr *attr, - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { const struct nf_conntrack_l4proto *l4proto; struct nlattr *tb[CTA_PROTO_MAX+1]; @@ -1080,8 +1299,12 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr, if (ret < 0) return ret; + if (!(flags & CTA_FILTER_FLAG(CTA_PROTO_NUM))) + return 0; + if (!tb[CTA_PROTO_NUM]) return -EINVAL; + tuple->dst.protonum = nla_get_u8(tb[CTA_PROTO_NUM]); rcu_read_lock(); @@ -1092,7 +1315,7 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr, l4proto->nla_policy, NULL); if (ret == 0) - ret = l4proto->nlattr_to_tuple(tb, tuple); + ret = l4proto->nlattr_to_tuple(tb, tuple, flags); } rcu_read_unlock(); @@ -1143,10 +1366,21 @@ static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = { [CTA_TUPLE_ZONE] = { .type = NLA_U16 }, }; +#define CTA_FILTER_F_ALL_CTA_PROTO \ + (CTA_FILTER_F_CTA_PROTO_SRC_PORT | \ + CTA_FILTER_F_CTA_PROTO_DST_PORT | \ + CTA_FILTER_F_CTA_PROTO_ICMP_TYPE | \ + CTA_FILTER_F_CTA_PROTO_ICMP_CODE | \ + CTA_FILTER_F_CTA_PROTO_ICMP_ID | \ + CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE | \ + CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE | \ + CTA_FILTER_F_CTA_PROTO_ICMPV6_ID) + static int -ctnetlink_parse_tuple(const struct nlattr * const cda[], - struct nf_conntrack_tuple *tuple, u32 type, - u_int8_t l3num, struct nf_conntrack_zone *zone) +ctnetlink_parse_tuple_filter(const struct nlattr * const cda[], + struct nf_conntrack_tuple *tuple, u32 type, + u_int8_t l3num, struct nf_conntrack_zone *zone, + u_int32_t flags) { struct nlattr *tb[CTA_TUPLE_MAX+1]; int err; @@ -1158,23 +1392,32 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], if (err < 0) return err; - if (!tb[CTA_TUPLE_IP]) - return -EINVAL; tuple->src.l3num = l3num; - err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple); - if (err < 0) - return err; + if (flags & CTA_FILTER_FLAG(CTA_IP_DST) || + flags & CTA_FILTER_FLAG(CTA_IP_SRC)) { + if (!tb[CTA_TUPLE_IP]) + return -EINVAL; - if (!tb[CTA_TUPLE_PROTO]) - return -EINVAL; + err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple, flags); + if (err < 0) + return err; + } - err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple); - if (err < 0) - return err; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_NUM)) { + if (!tb[CTA_TUPLE_PROTO]) + return -EINVAL; - if (tb[CTA_TUPLE_ZONE]) { + err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO], tuple, flags); + if (err < 0) + return err; + } else if (flags & CTA_FILTER_FLAG(ALL_CTA_PROTO)) { + /* Can't manage proto flags without a protonum */ + return -EINVAL; + } + + if ((flags & CTA_FILTER_FLAG(CTA_TUPLE_ZONE)) && tb[CTA_TUPLE_ZONE]) { if (!zone) return -EINVAL; @@ -1193,6 +1436,15 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], return 0; } +static int +ctnetlink_parse_tuple(const struct nlattr * const cda[], + struct nf_conntrack_tuple *tuple, u32 type, + u_int8_t l3num, struct nf_conntrack_zone *zone) +{ + return ctnetlink_parse_tuple_filter(cda, tuple, type, l3num, zone, + CTA_FILTER_FLAG(ALL)); +} + static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { [CTA_HELP_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN - 1 }, @@ -1240,6 +1492,7 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = { .len = NF_CT_LABELS_MAX_SIZE }, [CTA_LABELS_MASK] = { .type = NLA_BINARY, .len = NF_CT_LABELS_MAX_SIZE }, + [CTA_FILTER] = { .type = NLA_NESTED }, }; static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data) @@ -1256,7 +1509,10 @@ static int ctnetlink_flush_conntrack(struct net *net, { struct ctnetlink_filter *filter = NULL; - if (family || (cda[CTA_MARK] && cda[CTA_MARK_MASK])) { + if (ctnetlink_needs_filter(family, cda)) { + if (cda[CTA_FILTER]) + return -EOPNOTSUPP; + filter = ctnetlink_alloc_filter(cda, family); if (IS_ERR(filter)) return PTR_ERR(filter); @@ -1385,7 +1641,7 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, } err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, - NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true); + NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true, 0); nf_ct_put(ct); if (err <= 0) goto free; @@ -1458,7 +1714,7 @@ restart: res = ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), - ct, dying ? true : false); + ct, dying ? true : false, 0); if (res < 0) { if (!atomic_inc_not_zero(&ct->ct_general.use)) continue; diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index c2e3dff773bc..4efd8741c105 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c @@ -20,6 +20,8 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_log.h> +#include "nf_internals.h" + static const unsigned int nf_ct_icmp_timeout = 30*HZ; bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, @@ -271,20 +273,32 @@ static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { }; static int icmp_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { - if (!tb[CTA_PROTO_ICMP_TYPE] || - !tb[CTA_PROTO_ICMP_CODE] || - !tb[CTA_PROTO_ICMP_ID]) - return -EINVAL; - - tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); - tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); - tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); - - if (tuple->dst.u.icmp.type >= sizeof(invmap) || - !invmap[tuple->dst.u.icmp.type]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_TYPE)) { + if (!tb[CTA_PROTO_ICMP_TYPE]) + return -EINVAL; + + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); + if (tuple->dst.u.icmp.type >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type]) + return -EINVAL; + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_CODE)) { + if (!tb[CTA_PROTO_ICMP_CODE]) + return -EINVAL; + + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMP_ID)) { + if (!tb[CTA_PROTO_ICMP_ID]) + return -EINVAL; + + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); + } return 0; } diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 6f9144e1f1c1..facd8c64ec4e 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -24,6 +24,8 @@ #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_log.h> +#include "nf_internals.h" + static const unsigned int nf_ct_icmpv6_timeout = 30*HZ; bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, @@ -193,21 +195,33 @@ static const struct nla_policy icmpv6_nla_policy[CTA_PROTO_MAX+1] = { }; static int icmpv6_nlattr_to_tuple(struct nlattr *tb[], - struct nf_conntrack_tuple *tuple) + struct nf_conntrack_tuple *tuple, + u_int32_t flags) { - if (!tb[CTA_PROTO_ICMPV6_TYPE] || - !tb[CTA_PROTO_ICMPV6_CODE] || - !tb[CTA_PROTO_ICMPV6_ID]) - return -EINVAL; - - tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); - tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); - tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); - - if (tuple->dst.u.icmp.type < 128 || - tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || - !invmap[tuple->dst.u.icmp.type - 128]) - return -EINVAL; + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_TYPE)) { + if (!tb[CTA_PROTO_ICMPV6_TYPE]) + return -EINVAL; + + tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMPV6_TYPE]); + if (tuple->dst.u.icmp.type < 128 || + tuple->dst.u.icmp.type - 128 >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type - 128]) + return -EINVAL; + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_CODE)) { + if (!tb[CTA_PROTO_ICMPV6_CODE]) + return -EINVAL; + + tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMPV6_CODE]); + } + + if (flags & CTA_FILTER_FLAG(CTA_PROTO_ICMPV6_ID)) { + if (!tb[CTA_PROTO_ICMPV6_ID]) + return -EINVAL; + + tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMPV6_ID]); + } return 0; } diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 9b57330c81f8..6a26299cb064 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -348,7 +348,9 @@ static int ct_seq_show(struct seq_file *s, void *v) if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) goto release; - if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) + if (test_bit(IPS_HW_OFFLOAD_BIT, &ct->status)) + seq_puts(s, "[HW_OFFLOAD] "); + else if (test_bit(IPS_OFFLOAD_BIT, &ct->status)) seq_puts(s, "[OFFLOAD] "); else if (test_bit(IPS_ASSURED_BIT, &ct->status)) seq_puts(s, "[ASSURED] "); @@ -517,7 +519,7 @@ static unsigned int nf_conntrack_htable_size_user __read_mostly; static int nf_conntrack_hash_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 42da6e337276..6a3034f84ab6 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -588,8 +588,8 @@ static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data) flow_offload_teardown(flow); } -static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable, - struct net_device *dev) +void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable, + struct net_device *dev) { nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev); flush_delayed_work(&flowtable->gc_work); @@ -602,7 +602,7 @@ void nf_flow_table_cleanup(struct net_device *dev) mutex_lock(&flowtable_lock); list_for_each_entry(flowtable, &flowtables, list) - nf_flow_table_iterate_cleanup(flowtable, dev); + nf_flow_table_gc_cleanup(flowtable, dev); mutex_unlock(&flowtable_lock); } EXPORT_SYMBOL_GPL(nf_flow_table_cleanup); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 2276a73ccba2..62651e6683f6 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -754,12 +754,15 @@ static void flow_offload_work_add(struct flow_offload_work *offload) err = flow_offload_rule_add(offload, flow_rule); if (err < 0) set_bit(NF_FLOW_HW_REFRESH, &offload->flow->flags); + else + set_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); nf_flow_offload_destroy(flow_rule); } static void flow_offload_work_del(struct flow_offload_work *offload) { + clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); @@ -939,6 +942,18 @@ static void nf_flow_table_block_offload_init(struct flow_block_offload *bo, INIT_LIST_HEAD(&bo->cb_list); } +static void nf_flow_table_indr_cleanup(struct flow_block_cb *block_cb) +{ + struct nf_flowtable *flowtable = block_cb->indr.data; + struct net_device *dev = block_cb->indr.dev; + + nf_flow_table_gc_cleanup(flowtable, dev); + down_write(&flowtable->flow_block_lock); + list_del(&block_cb->list); + flow_block_cb_free(block_cb); + up_write(&flowtable->flow_block_lock); +} + static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo, struct nf_flowtable *flowtable, struct net_device *dev, @@ -947,12 +962,9 @@ static int nf_flow_table_indr_offload_cmd(struct flow_block_offload *bo, { nf_flow_table_block_offload_init(bo, dev_net(dev), cmd, flowtable, extack); - flow_indr_block_call(dev, bo, cmd, TC_SETUP_FT); - if (list_empty(&bo->cb_list)) - return -EOPNOTSUPP; - - return 0; + return flow_indr_dev_setup_offload(dev, TC_SETUP_FT, flowtable, bo, + nf_flow_table_indr_cleanup); } static int nf_flow_table_offload_cmd(struct flow_block_offload *bo, @@ -996,69 +1008,6 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, } EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup); -static void nf_flow_table_indr_block_ing_cmd(struct net_device *dev, - struct nf_flowtable *flowtable, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - struct netlink_ext_ack extack = {}; - struct flow_block_offload bo; - - if (!flowtable) - return; - - nf_flow_table_block_offload_init(&bo, dev_net(dev), cmd, flowtable, - &extack); - - cb(dev, cb_priv, TC_SETUP_FT, &bo); - - nf_flow_table_block_setup(flowtable, &bo, cmd); -} - -static void nf_flow_table_indr_block_cb_cmd(struct nf_flowtable *flowtable, - struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - if (!(flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD)) - return; - - nf_flow_table_indr_block_ing_cmd(dev, flowtable, cb, cb_priv, cmd); -} - -static void nf_flow_table_indr_block_cb(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - struct net *net = dev_net(dev); - struct nft_flowtable *nft_ft; - struct nft_table *table; - struct nft_hook *hook; - - mutex_lock(&net->nft.commit_mutex); - list_for_each_entry(table, &net->nft.tables, list) { - list_for_each_entry(nft_ft, &table->flowtables, list) { - list_for_each_entry(hook, &nft_ft->hook_list, list) { - if (hook->ops.dev != dev) - continue; - - nf_flow_table_indr_block_cb_cmd(&nft_ft->data, - dev, cb, - cb_priv, cmd); - } - } - } - mutex_unlock(&net->nft.commit_mutex); -} - -static struct flow_indr_block_entry block_ing_entry = { - .cb = nf_flow_table_indr_block_cb, - .list = LIST_HEAD_INIT(block_ing_entry.list), -}; - int nf_flow_table_offload_init(void) { nf_flow_offload_wq = alloc_workqueue("nf_flow_table_offload", @@ -1066,13 +1015,10 @@ int nf_flow_table_offload_init(void) if (!nf_flow_offload_wq) return -ENOMEM; - flow_indr_add_block_cb(&block_ing_entry); - return 0; } void nf_flow_table_offload_exit(void) { - flow_indr_del_block_cb(&block_ing_entry); destroy_workqueue(nf_flow_offload_wq); } diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index d6c43902ebd7..832ae64179f0 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -6,6 +6,23 @@ #include <linux/skbuff.h> #include <linux/netdevice.h> +/* nf_conntrack_netlink.c: applied on tuple filters */ +#define CTA_FILTER_F_CTA_IP_SRC (1 << 0) +#define CTA_FILTER_F_CTA_IP_DST (1 << 1) +#define CTA_FILTER_F_CTA_TUPLE_ZONE (1 << 2) +#define CTA_FILTER_F_CTA_PROTO_NUM (1 << 3) +#define CTA_FILTER_F_CTA_PROTO_SRC_PORT (1 << 4) +#define CTA_FILTER_F_CTA_PROTO_DST_PORT (1 << 5) +#define CTA_FILTER_F_CTA_PROTO_ICMP_TYPE (1 << 6) +#define CTA_FILTER_F_CTA_PROTO_ICMP_CODE (1 << 7) +#define CTA_FILTER_F_CTA_PROTO_ICMP_ID (1 << 8) +#define CTA_FILTER_F_CTA_PROTO_ICMPV6_TYPE (1 << 9) +#define CTA_FILTER_F_CTA_PROTO_ICMPV6_CODE (1 << 10) +#define CTA_FILTER_F_CTA_PROTO_ICMPV6_ID (1 << 11) +#define CTA_FILTER_F_MAX (1 << 12) +#define CTA_FILTER_F_ALL (CTA_FILTER_F_MAX-1) +#define CTA_FILTER_FLAG(ctattr) CTA_FILTER_F_ ## ctattr + /* nf_queue.c */ void nf_queue_nf_hook_drop(struct net *net); diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index bb25d4c794c7..6cb9f9474b05 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -414,7 +414,7 @@ static struct ctl_table nf_log_sysctl_ftable[] = { }; static int nf_log_proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { const struct nf_logger *logger; char buf[NFLOGGER_NAME_LEN]; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 9780bd93b7e4..073aa1051d43 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1669,6 +1669,7 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, goto err_hook_dev; } hook->ops.dev = dev; + hook->inactive = false; return hook; @@ -1678,17 +1679,17 @@ err_hook_alloc: return ERR_PTR(err); } -static bool nft_hook_list_find(struct list_head *hook_list, - const struct nft_hook *this) +static struct nft_hook *nft_hook_list_find(struct list_head *hook_list, + const struct nft_hook *this) { struct nft_hook *hook; list_for_each_entry(hook, hook_list, list) { if (this->ops.dev == hook->ops.dev) - return true; + return hook; } - return false; + return NULL; } static int nf_tables_parse_netdev_hooks(struct net *net, @@ -1723,8 +1724,6 @@ static int nf_tables_parse_netdev_hooks(struct net *net, goto err_hook; } } - if (!n) - return -EINVAL; return 0; @@ -1761,6 +1760,9 @@ static int nft_chain_parse_netdev(struct net *net, hook_list); if (err < 0) return err; + + if (list_empty(hook_list)) + return -EINVAL; } else { return -EINVAL; } @@ -4669,6 +4671,25 @@ static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set, return 0; } +static int nft_setelem_parse_data(struct nft_ctx *ctx, struct nft_set *set, + struct nft_data_desc *desc, + struct nft_data *data, + struct nlattr *attr) +{ + int err; + + err = nft_data_init(ctx, data, NFT_DATA_VALUE_MAXLEN, desc, attr); + if (err < 0) + return err; + + if (desc->type != NFT_DATA_VERDICT && desc->len != set->dlen) { + nft_data_release(data, desc->type); + return -EINVAL; + } + + return 0; +} + static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { @@ -4946,7 +4967,6 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_expr *expr = NULL; struct nft_userdata *udata; struct nft_data_desc desc; - struct nft_data data; enum nft_registers dreg; struct nft_trans *trans; u32 flags = 0; @@ -5072,15 +5092,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } if (nla[NFTA_SET_ELEM_DATA] != NULL) { - err = nft_data_init(ctx, &data, sizeof(data), &desc, - nla[NFTA_SET_ELEM_DATA]); + err = nft_setelem_parse_data(ctx, set, &desc, &elem.data.val, + nla[NFTA_SET_ELEM_DATA]); if (err < 0) goto err_parse_key_end; - err = -EINVAL; - if (set->dtype != NFT_DATA_VERDICT && desc.len != set->dlen) - goto err_parse_data; - dreg = nft_type_to_reg(set->dtype); list_for_each_entry(binding, &set->bindings, list) { struct nft_ctx bind_ctx = { @@ -5094,14 +5110,14 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, continue; err = nft_validate_register_store(&bind_ctx, dreg, - &data, + &elem.data.val, desc.type, desc.len); if (err < 0) goto err_parse_data; if (desc.type == NFT_DATA_VERDICT && - (data.verdict.code == NFT_GOTO || - data.verdict.code == NFT_JUMP)) + (elem.data.val.verdict.code == NFT_GOTO || + elem.data.val.verdict.code == NFT_JUMP)) nft_validate_state_update(ctx->net, NFT_VALIDATE_NEED); } @@ -5123,7 +5139,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = -ENOMEM; elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, - elem.key_end.val.data, data.data, + elem.key_end.val.data, elem.data.val.data, timeout, expiration, GFP_KERNEL); if (elem.priv == NULL) goto err_parse_data; @@ -5201,7 +5217,7 @@ err_trans: nf_tables_set_elem_destroy(ctx, set, elem.priv); err_parse_data: if (nla[NFTA_SET_ELEM_DATA] != NULL) - nft_data_release(&data, desc.type); + nft_data_release(&elem.data.val, desc.type); err_parse_key_end: nft_data_release(&elem.key_end.val, NFT_DATA_VALUE); err_parse_key: @@ -6164,50 +6180,77 @@ nft_flowtable_lookup_byhandle(const struct nft_table *table, return ERR_PTR(-ENOENT); } +struct nft_flowtable_hook { + u32 num; + int priority; + struct list_head list; +}; + static const struct nla_policy nft_flowtable_hook_policy[NFTA_FLOWTABLE_HOOK_MAX + 1] = { [NFTA_FLOWTABLE_HOOK_NUM] = { .type = NLA_U32 }, [NFTA_FLOWTABLE_HOOK_PRIORITY] = { .type = NLA_U32 }, [NFTA_FLOWTABLE_HOOK_DEVS] = { .type = NLA_NESTED }, }; -static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx, - const struct nlattr *attr, - struct nft_flowtable *flowtable) +static int nft_flowtable_parse_hook(const struct nft_ctx *ctx, + const struct nlattr *attr, + struct nft_flowtable_hook *flowtable_hook, + struct nft_flowtable *flowtable, bool add) { struct nlattr *tb[NFTA_FLOWTABLE_HOOK_MAX + 1]; struct nft_hook *hook; int hooknum, priority; int err; + INIT_LIST_HEAD(&flowtable_hook->list); + err = nla_parse_nested_deprecated(tb, NFTA_FLOWTABLE_HOOK_MAX, attr, nft_flowtable_hook_policy, NULL); if (err < 0) return err; - if (!tb[NFTA_FLOWTABLE_HOOK_NUM] || - !tb[NFTA_FLOWTABLE_HOOK_PRIORITY] || - !tb[NFTA_FLOWTABLE_HOOK_DEVS]) - return -EINVAL; + if (add) { + if (!tb[NFTA_FLOWTABLE_HOOK_NUM] || + !tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) + return -EINVAL; - hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); - if (hooknum != NF_NETDEV_INGRESS) - return -EINVAL; + hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); + if (hooknum != NF_NETDEV_INGRESS) + return -EOPNOTSUPP; - priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); + priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); - err = nf_tables_parse_netdev_hooks(ctx->net, - tb[NFTA_FLOWTABLE_HOOK_DEVS], - &flowtable->hook_list); - if (err < 0) - return err; + flowtable_hook->priority = priority; + flowtable_hook->num = hooknum; + } else { + if (tb[NFTA_FLOWTABLE_HOOK_NUM]) { + hooknum = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_NUM])); + if (hooknum != flowtable->hooknum) + return -EOPNOTSUPP; + } - flowtable->hooknum = hooknum; - flowtable->data.priority = priority; + if (tb[NFTA_FLOWTABLE_HOOK_PRIORITY]) { + priority = ntohl(nla_get_be32(tb[NFTA_FLOWTABLE_HOOK_PRIORITY])); + if (priority != flowtable->data.priority) + return -EOPNOTSUPP; + } - list_for_each_entry(hook, &flowtable->hook_list, list) { + flowtable_hook->priority = flowtable->data.priority; + flowtable_hook->num = flowtable->hooknum; + } + + if (tb[NFTA_FLOWTABLE_HOOK_DEVS]) { + err = nf_tables_parse_netdev_hooks(ctx->net, + tb[NFTA_FLOWTABLE_HOOK_DEVS], + &flowtable_hook->list); + if (err < 0) + return err; + } + + list_for_each_entry(hook, &flowtable_hook->list, list) { hook->ops.pf = NFPROTO_NETDEV; - hook->ops.hooknum = hooknum; - hook->ops.priority = priority; + hook->ops.hooknum = flowtable_hook->num; + hook->ops.priority = flowtable_hook->priority; hook->ops.priv = &flowtable->data; hook->ops.hook = flowtable->data.type->hook; } @@ -6256,23 +6299,24 @@ static void nft_unregister_flowtable_hook(struct net *net, } static void nft_unregister_flowtable_net_hooks(struct net *net, - struct nft_flowtable *flowtable) + struct list_head *hook_list) { struct nft_hook *hook; - list_for_each_entry(hook, &flowtable->hook_list, list) + list_for_each_entry(hook, hook_list, list) nf_unregister_net_hook(net, &hook->ops); } static int nft_register_flowtable_net_hooks(struct net *net, struct nft_table *table, + struct list_head *hook_list, struct nft_flowtable *flowtable) { struct nft_hook *hook, *hook2, *next; struct nft_flowtable *ft; int err, i = 0; - list_for_each_entry(hook, &flowtable->hook_list, list) { + list_for_each_entry(hook, hook_list, list) { list_for_each_entry(ft, &table->flowtables, list) { list_for_each_entry(hook2, &ft->hook_list, list) { if (hook->ops.dev == hook2->ops.dev && @@ -6303,7 +6347,7 @@ static int nft_register_flowtable_net_hooks(struct net *net, return 0; err_unregister_net_hooks: - list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + list_for_each_entry_safe(hook, next, hook_list, list) { if (i-- <= 0) break; @@ -6315,6 +6359,72 @@ err_unregister_net_hooks: return err; } +static void nft_flowtable_hooks_destroy(struct list_head *hook_list) +{ + struct nft_hook *hook, *next; + + list_for_each_entry_safe(hook, next, hook_list, list) { + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } +} + +static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, + struct nft_flowtable *flowtable) +{ + const struct nlattr * const *nla = ctx->nla; + struct nft_flowtable_hook flowtable_hook; + struct nft_hook *hook, *next; + struct nft_trans *trans; + bool unregister = false; + int err; + + err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], + &flowtable_hook, flowtable, false); + if (err < 0) + return err; + + list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { + if (nft_hook_list_find(&flowtable->hook_list, hook)) { + list_del(&hook->list); + kfree(hook); + } + } + + err = nft_register_flowtable_net_hooks(ctx->net, ctx->table, + &flowtable_hook.list, flowtable); + if (err < 0) + goto err_flowtable_update_hook; + + trans = nft_trans_alloc(ctx, NFT_MSG_NEWFLOWTABLE, + sizeof(struct nft_trans_flowtable)); + if (!trans) { + unregister = true; + err = -ENOMEM; + goto err_flowtable_update_hook; + } + + nft_trans_flowtable(trans) = flowtable; + nft_trans_flowtable_update(trans) = true; + INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); + list_splice(&flowtable_hook.list, &nft_trans_flowtable_hooks(trans)); + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; + +err_flowtable_update_hook: + list_for_each_entry_safe(hook, next, &flowtable_hook.list, list) { + if (unregister) + nft_unregister_flowtable_hook(ctx->net, flowtable, hook); + list_del_rcu(&hook->list); + kfree_rcu(hook, rcu); + } + + return err; + +} + static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, @@ -6322,6 +6432,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, struct netlink_ext_ack *extack) { const struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nft_flowtable_hook flowtable_hook; const struct nf_flowtable_type *type; u8 genmask = nft_genmask_next(net); int family = nfmsg->nfgen_family; @@ -6357,7 +6468,9 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, return -EEXIST; } - return 0; + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + + return nft_flowtable_update(&ctx, nlh, flowtable); } nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); @@ -6395,17 +6508,20 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk, if (err < 0) goto err3; - err = nf_tables_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], - flowtable); + err = nft_flowtable_parse_hook(&ctx, nla[NFTA_FLOWTABLE_HOOK], + &flowtable_hook, flowtable, true); if (err < 0) goto err4; - err = nft_register_flowtable_net_hooks(ctx.net, table, flowtable); + list_splice(&flowtable_hook.list, &flowtable->hook_list); + flowtable->data.priority = flowtable_hook.priority; + flowtable->hooknum = flowtable_hook.num; + + err = nft_register_flowtable_net_hooks(ctx.net, table, + &flowtable->hook_list, + flowtable); if (err < 0) { - list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - list_del_rcu(&hook->list); - kfree_rcu(hook, rcu); - } + nft_flowtable_hooks_destroy(&flowtable->hook_list); goto err4; } @@ -6434,6 +6550,51 @@ err1: return err; } +static int nft_delflowtable_hook(struct nft_ctx *ctx, + struct nft_flowtable *flowtable) +{ + const struct nlattr * const *nla = ctx->nla; + struct nft_flowtable_hook flowtable_hook; + struct nft_hook *this, *next, *hook; + struct nft_trans *trans; + int err; + + err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK], + &flowtable_hook, flowtable, false); + if (err < 0) + return err; + + list_for_each_entry_safe(this, next, &flowtable_hook.list, list) { + hook = nft_hook_list_find(&flowtable->hook_list, this); + if (!hook) { + err = -ENOENT; + goto err_flowtable_del_hook; + } + hook->inactive = true; + list_del(&this->list); + kfree(this); + } + + trans = nft_trans_alloc(ctx, NFT_MSG_DELFLOWTABLE, + sizeof(struct nft_trans_flowtable)); + if (!trans) + return -ENOMEM; + + nft_trans_flowtable(trans) = flowtable; + nft_trans_flowtable_update(trans) = true; + INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans)); + + list_add_tail(&trans->list, &ctx->net->nft.commit_list); + + return 0; + +err_flowtable_del_hook: + list_for_each_entry(hook, &flowtable_hook.list, list) + hook->inactive = false; + + return err; +} + static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, @@ -6472,20 +6633,25 @@ static int nf_tables_delflowtable(struct net *net, struct sock *nlsk, NL_SET_BAD_ATTR(extack, attr); return PTR_ERR(flowtable); } + + nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); + + if (nla[NFTA_FLOWTABLE_HOOK]) + return nft_delflowtable_hook(&ctx, flowtable); + if (flowtable->use > 0) { NL_SET_BAD_ATTR(extack, attr); return -EBUSY; } - nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla); - return nft_delflowtable(&ctx, flowtable); } static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq, int event, u32 flags, int family, - struct nft_flowtable *flowtable) + struct nft_flowtable *flowtable, + struct list_head *hook_list) { struct nlattr *nest, *nest_devs; struct nfgenmsg *nfmsg; @@ -6521,7 +6687,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, if (!nest_devs) goto nla_put_failure; - list_for_each_entry_rcu(hook, &flowtable->hook_list, list) { + list_for_each_entry_rcu(hook, hook_list, list) { if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name)) goto nla_put_failure; } @@ -6574,7 +6740,9 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, cb->nlh->nlmsg_seq, NFT_MSG_NEWFLOWTABLE, NLM_F_MULTI | NLM_F_APPEND, - table->family, flowtable) < 0) + table->family, + flowtable, + &flowtable->hook_list) < 0) goto done; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -6671,7 +6839,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, err = nf_tables_fill_flowtable_info(skb2, net, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFT_MSG_NEWFLOWTABLE, 0, family, - flowtable); + flowtable, &flowtable->hook_list); if (err < 0) goto err; @@ -6683,6 +6851,7 @@ err: static void nf_tables_flowtable_notify(struct nft_ctx *ctx, struct nft_flowtable *flowtable, + struct list_head *hook_list, int event) { struct sk_buff *skb; @@ -6698,7 +6867,7 @@ static void nf_tables_flowtable_notify(struct nft_ctx *ctx, err = nf_tables_fill_flowtable_info(skb, ctx->net, ctx->portid, ctx->seq, event, 0, - ctx->family, flowtable); + ctx->family, flowtable, hook_list); if (err < 0) { kfree_skb(skb); goto err; @@ -7084,7 +7253,10 @@ static void nft_commit_release(struct nft_trans *trans) nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); break; case NFT_MSG_DELFLOWTABLE: - nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); + if (nft_trans_flowtable_update(trans)) + nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans)); + else + nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); break; } @@ -7245,6 +7417,17 @@ static void nft_chain_del(struct nft_chain *chain) list_del_rcu(&chain->list); } +static void nft_flowtable_hooks_del(struct nft_flowtable *flowtable, + struct list_head *hook_list) +{ + struct nft_hook *hook, *next; + + list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { + if (hook->inactive) + list_move(&hook->list, hook_list); + } +} + static void nf_tables_module_autoload_cleanup(struct net *net) { struct nft_module_request *req, *next; @@ -7453,19 +7636,41 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) NFT_MSG_DELOBJ); break; case NFT_MSG_NEWFLOWTABLE: - nft_clear(net, nft_trans_flowtable(trans)); - nf_tables_flowtable_notify(&trans->ctx, - nft_trans_flowtable(trans), - NFT_MSG_NEWFLOWTABLE); + if (nft_trans_flowtable_update(trans)) { + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + &nft_trans_flowtable_hooks(trans), + NFT_MSG_NEWFLOWTABLE); + list_splice(&nft_trans_flowtable_hooks(trans), + &nft_trans_flowtable(trans)->hook_list); + } else { + nft_clear(net, nft_trans_flowtable(trans)); + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + &nft_trans_flowtable(trans)->hook_list, + NFT_MSG_NEWFLOWTABLE); + } nft_trans_destroy(trans); break; case NFT_MSG_DELFLOWTABLE: - list_del_rcu(&nft_trans_flowtable(trans)->list); - nf_tables_flowtable_notify(&trans->ctx, - nft_trans_flowtable(trans), - NFT_MSG_DELFLOWTABLE); - nft_unregister_flowtable_net_hooks(net, - nft_trans_flowtable(trans)); + if (nft_trans_flowtable_update(trans)) { + nft_flowtable_hooks_del(nft_trans_flowtable(trans), + &nft_trans_flowtable_hooks(trans)); + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + &nft_trans_flowtable_hooks(trans), + NFT_MSG_DELFLOWTABLE); + nft_unregister_flowtable_net_hooks(net, + &nft_trans_flowtable_hooks(trans)); + } else { + list_del_rcu(&nft_trans_flowtable(trans)->list); + nf_tables_flowtable_notify(&trans->ctx, + nft_trans_flowtable(trans), + &nft_trans_flowtable(trans)->hook_list, + NFT_MSG_DELFLOWTABLE); + nft_unregister_flowtable_net_hooks(net, + &nft_trans_flowtable(trans)->hook_list); + } break; } } @@ -7514,7 +7719,10 @@ static void nf_tables_abort_release(struct nft_trans *trans) nft_obj_destroy(&trans->ctx, nft_trans_obj(trans)); break; case NFT_MSG_NEWFLOWTABLE: - nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); + if (nft_trans_flowtable_update(trans)) + nft_flowtable_hooks_destroy(&nft_trans_flowtable_hooks(trans)); + else + nf_tables_flowtable_destroy(nft_trans_flowtable(trans)); break; } kfree(trans); @@ -7524,6 +7732,7 @@ static int __nf_tables_abort(struct net *net, bool autoload) { struct nft_trans *trans, *next; struct nft_trans_elem *te; + struct nft_hook *hook; list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list, list) { @@ -7621,14 +7830,24 @@ static int __nf_tables_abort(struct net *net, bool autoload) nft_trans_destroy(trans); break; case NFT_MSG_NEWFLOWTABLE: - trans->ctx.table->use--; - list_del_rcu(&nft_trans_flowtable(trans)->list); - nft_unregister_flowtable_net_hooks(net, - nft_trans_flowtable(trans)); + if (nft_trans_flowtable_update(trans)) { + nft_unregister_flowtable_net_hooks(net, + &nft_trans_flowtable_hooks(trans)); + } else { + trans->ctx.table->use--; + list_del_rcu(&nft_trans_flowtable(trans)->list); + nft_unregister_flowtable_net_hooks(net, + &nft_trans_flowtable(trans)->hook_list); + } break; case NFT_MSG_DELFLOWTABLE: - trans->ctx.table->use++; - nft_clear(trans->ctx.net, nft_trans_flowtable(trans)); + if (nft_trans_flowtable_update(trans)) { + list_for_each_entry(hook, &nft_trans_flowtable(trans)->hook_list, list) + hook->inactive = false; + } else { + trans->ctx.table->use++; + nft_clear(trans->ctx.net, nft_trans_flowtable(trans)); + } nft_trans_destroy(trans); break; } diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 954bccb7f32a..185fc82c99aa 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -285,40 +285,41 @@ static int nft_block_offload_cmd(struct nft_base_chain *chain, return nft_block_setup(chain, &bo, cmd); } -static void nft_indr_block_ing_cmd(struct net_device *dev, - struct nft_base_chain *chain, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) +static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) { + struct nft_base_chain *basechain = block_cb->indr.data; + struct net_device *dev = block_cb->indr.dev; struct netlink_ext_ack extack = {}; + struct net *net = dev_net(dev); struct flow_block_offload bo; - if (!chain) - return; - - nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); - - cb(dev, cb_priv, TC_SETUP_BLOCK, &bo); - - nft_block_setup(chain, &bo, cmd); + nft_flow_block_offload_init(&bo, dev_net(dev), FLOW_BLOCK_UNBIND, + basechain, &extack); + mutex_lock(&net->nft.commit_mutex); + list_move(&block_cb->list, &bo.cb_list); + nft_flow_offload_unbind(&bo, basechain); + mutex_unlock(&net->nft.commit_mutex); } -static int nft_indr_block_offload_cmd(struct nft_base_chain *chain, +static int nft_indr_block_offload_cmd(struct nft_base_chain *basechain, struct net_device *dev, enum flow_block_command cmd) { struct netlink_ext_ack extack = {}; struct flow_block_offload bo; + int err; - nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); + nft_flow_block_offload_init(&bo, dev_net(dev), cmd, basechain, &extack); - flow_indr_block_call(dev, &bo, cmd, TC_SETUP_BLOCK); + err = flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, basechain, &bo, + nft_indr_block_cleanup); + if (err < 0) + return err; if (list_empty(&bo.cb_list)) return -EOPNOTSUPP; - return nft_block_setup(chain, &bo, cmd); + return nft_block_setup(basechain, &bo, cmd); } #define FLOW_SETUP_BLOCK TC_SETUP_BLOCK @@ -555,24 +556,6 @@ static struct nft_chain *__nft_offload_get_chain(struct net_device *dev) return NULL; } -static void nft_indr_block_cb(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, void *cb_priv, - enum flow_block_command cmd) -{ - struct net *net = dev_net(dev); - struct nft_chain *chain; - - mutex_lock(&net->nft.commit_mutex); - chain = __nft_offload_get_chain(dev); - if (chain && chain->flags & NFT_CHAIN_HW_OFFLOAD) { - struct nft_base_chain *basechain; - - basechain = nft_base_chain(chain); - nft_indr_block_ing_cmd(dev, basechain, cb, cb_priv, cmd); - } - mutex_unlock(&net->nft.commit_mutex); -} - static int nft_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -594,30 +577,16 @@ static int nft_offload_netdev_event(struct notifier_block *this, return NOTIFY_DONE; } -static struct flow_indr_block_entry block_ing_entry = { - .cb = nft_indr_block_cb, - .list = LIST_HEAD_INIT(block_ing_entry.list), -}; - static struct notifier_block nft_offload_netdev_notifier = { .notifier_call = nft_offload_netdev_event, }; int nft_offload_init(void) { - int err; - - err = register_netdevice_notifier(&nft_offload_netdev_notifier); - if (err < 0) - return err; - - flow_indr_add_block_cb(&block_ing_entry); - - return 0; + return register_netdevice_notifier(&nft_offload_netdev_notifier); } void nft_offload_exit(void) { - flow_indr_del_block_cb(&block_ing_entry); unregister_netdevice_notifier(&nft_offload_netdev_notifier); } diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 8b44a4de5329..23a7bfd10521 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -30,6 +30,76 @@ struct nft_nat { u16 flags; }; +static void nft_nat_setup_addr(struct nf_nat_range2 *range, + const struct nft_regs *regs, + const struct nft_nat *priv) +{ + switch (priv->family) { + case AF_INET: + range->min_addr.ip = (__force __be32) + regs->data[priv->sreg_addr_min]; + range->max_addr.ip = (__force __be32) + regs->data[priv->sreg_addr_max]; + break; + case AF_INET6: + memcpy(range->min_addr.ip6, ®s->data[priv->sreg_addr_min], + sizeof(range->min_addr.ip6)); + memcpy(range->max_addr.ip6, ®s->data[priv->sreg_addr_max], + sizeof(range->max_addr.ip6)); + break; + } +} + +static void nft_nat_setup_proto(struct nf_nat_range2 *range, + const struct nft_regs *regs, + const struct nft_nat *priv) +{ + range->min_proto.all = (__force __be16) + nft_reg_load16(®s->data[priv->sreg_proto_min]); + range->max_proto.all = (__force __be16) + nft_reg_load16(®s->data[priv->sreg_proto_max]); +} + +static void nft_nat_setup_netmap(struct nf_nat_range2 *range, + const struct nft_pktinfo *pkt, + const struct nft_nat *priv) +{ + struct sk_buff *skb = pkt->skb; + union nf_inet_addr new_addr; + __be32 netmask; + int i, len = 0; + + switch (priv->type) { + case NFT_NAT_SNAT: + if (nft_pf(pkt) == NFPROTO_IPV4) { + new_addr.ip = ip_hdr(skb)->saddr; + len = sizeof(struct in_addr); + } else { + new_addr.in6 = ipv6_hdr(skb)->saddr; + len = sizeof(struct in6_addr); + } + break; + case NFT_NAT_DNAT: + if (nft_pf(pkt) == NFPROTO_IPV4) { + new_addr.ip = ip_hdr(skb)->daddr; + len = sizeof(struct in_addr); + } else { + new_addr.in6 = ipv6_hdr(skb)->daddr; + len = sizeof(struct in6_addr); + } + break; + } + + for (i = 0; i < len / sizeof(__be32); i++) { + netmask = ~(range->min_addr.ip6[i] ^ range->max_addr.ip6[i]); + new_addr.ip6[i] &= ~netmask; + new_addr.ip6[i] |= range->min_addr.ip6[i] & netmask; + } + + range->min_addr = new_addr; + range->max_addr = new_addr; +} + static void nft_nat_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -40,33 +110,17 @@ static void nft_nat_eval(const struct nft_expr *expr, struct nf_nat_range2 range; memset(&range, 0, sizeof(range)); - if (priv->sreg_addr_min) { - if (priv->family == AF_INET) { - range.min_addr.ip = (__force __be32) - regs->data[priv->sreg_addr_min]; - range.max_addr.ip = (__force __be32) - regs->data[priv->sreg_addr_max]; - } else { - memcpy(range.min_addr.ip6, - ®s->data[priv->sreg_addr_min], - sizeof(range.min_addr.ip6)); - memcpy(range.max_addr.ip6, - ®s->data[priv->sreg_addr_max], - sizeof(range.max_addr.ip6)); - } - range.flags |= NF_NAT_RANGE_MAP_IPS; + if (priv->sreg_addr_min) { + nft_nat_setup_addr(&range, regs, priv); + if (priv->flags & NF_NAT_RANGE_NETMAP) + nft_nat_setup_netmap(&range, pkt, priv); } - if (priv->sreg_proto_min) { - range.min_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_min]); - range.max_proto.all = (__force __be16)nft_reg_load16( - ®s->data[priv->sreg_proto_max]); - range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; - } + if (priv->sreg_proto_min) + nft_nat_setup_proto(&range, regs, priv); - range.flags |= priv->flags; + range.flags = priv->flags; regs->verdict.code = nf_nat_setup_info(ct, &range, priv->type); } @@ -129,7 +183,7 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, priv->type = NF_NAT_MANIP_DST; break; default: - return -EINVAL; + return -EOPNOTSUPP; } if (tb[NFTA_NAT_FAMILY] == NULL) @@ -169,6 +223,8 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } else { priv->sreg_addr_max = priv->sreg_addr_min; } + + priv->flags |= NF_NAT_RANGE_MAP_IPS; } plen = sizeof_field(struct nf_nat_range, min_addr.all); @@ -191,12 +247,14 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } else { priv->sreg_proto_max = priv->sreg_proto_min; } + + priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[NFTA_NAT_FLAGS]) { - priv->flags = ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); + priv->flags |= ntohl(nla_get_be32(tb[NFTA_NAT_FLAGS])); if (priv->flags & ~NF_NAT_RANGE_MASK) - return -EINVAL; + return -EOPNOTSUPP; } return nf_ct_netns_get(ctx->net, family); diff --git a/net/netlink/Makefile b/net/netlink/Makefile index de42df7f0068..e05202708c90 100644 --- a/net/netlink/Makefile +++ b/net/netlink/Makefile @@ -3,7 +3,7 @@ # Makefile for the netlink driver. # -obj-y := af_netlink.o genetlink.o +obj-y := af_netlink.o genetlink.o policy.o obj-$(CONFIG_NETLINK_DIAG) += netlink_diag.o netlink_diag-y := diag.o diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 5ded01ca8b20..4f2c3b14ddbf 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2596,7 +2596,7 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) return __netlink_seq_next(seq); } -static void netlink_seq_stop(struct seq_file *seq, void *v) +static void netlink_native_seq_stop(struct seq_file *seq, void *v) { struct nl_seq_iter *iter = seq->private; @@ -2607,7 +2607,7 @@ static void netlink_seq_stop(struct seq_file *seq, void *v) } -static int netlink_seq_show(struct seq_file *seq, void *v) +static int netlink_native_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, @@ -2634,6 +2634,68 @@ static int netlink_seq_show(struct seq_file *seq, void *v) return 0; } +#ifdef CONFIG_BPF_SYSCALL +struct bpf_iter__netlink { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct netlink_sock *, sk); +}; + +DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk) + +static int netlink_prog_seq_show(struct bpf_prog *prog, + struct bpf_iter_meta *meta, + void *v) +{ + struct bpf_iter__netlink ctx; + + meta->seq_num--; /* skip SEQ_START_TOKEN */ + ctx.meta = meta; + ctx.sk = nlk_sk((struct sock *)v); + return bpf_iter_run_prog(prog, &ctx); +} + +static int netlink_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, false); + if (!prog) + return netlink_native_seq_show(seq, v); + + if (v != SEQ_START_TOKEN) + return netlink_prog_seq_show(prog, &meta, v); + + return 0; +} + +static void netlink_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + if (!v) { + meta.seq = seq; + prog = bpf_iter_get_info(&meta, true); + if (prog) + (void)netlink_prog_seq_show(prog, &meta, v); + } + + netlink_native_seq_stop(seq, v); +} +#else +static int netlink_seq_show(struct seq_file *seq, void *v) +{ + return netlink_native_seq_show(seq, v); +} + +static void netlink_seq_stop(struct seq_file *seq, void *v) +{ + netlink_native_seq_stop(seq, v); +} +#endif + static const struct seq_operations netlink_seq_ops = { .start = netlink_seq_start, .next = netlink_seq_next, @@ -2740,6 +2802,26 @@ static const struct rhashtable_params netlink_rhashtable_params = { .automatic_shrinking = true, }; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) +static const struct bpf_iter_reg netlink_reg_info = { + .target = "netlink", + .seq_ops = &netlink_seq_ops, + .init_seq_private = bpf_iter_init_seq_net, + .fini_seq_private = bpf_iter_fini_seq_net, + .seq_priv_size = sizeof(struct nl_seq_iter), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__netlink, sk), + PTR_TO_BTF_ID_OR_NULL }, + }, +}; + +static int __init bpf_iter_register(void) +{ + return bpf_iter_reg_target(&netlink_reg_info); +} +#endif + static int __init netlink_proto_init(void) { int i; @@ -2748,6 +2830,12 @@ static int __init netlink_proto_init(void) if (err != 0) goto out; +#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) + err = bpf_iter_register(); + if (err) + goto out; +#endif + BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb)); nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 9f357aa22b94..2f049692e012 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1043,6 +1043,80 @@ static int genl_ctrl_event(int event, const struct genl_family *family, return 0; } +static int ctrl_dumppolicy(struct sk_buff *skb, struct netlink_callback *cb) +{ + const struct genl_family *rt; + unsigned int fam_id = cb->args[0]; + int err; + + if (!fam_id) { + struct nlattr *tb[CTRL_ATTR_MAX + 1]; + + err = genlmsg_parse(cb->nlh, &genl_ctrl, tb, + genl_ctrl.maxattr, + genl_ctrl.policy, cb->extack); + if (err) + return err; + + if (!tb[CTRL_ATTR_FAMILY_ID] && !tb[CTRL_ATTR_FAMILY_NAME]) + return -EINVAL; + + if (tb[CTRL_ATTR_FAMILY_ID]) { + fam_id = nla_get_u16(tb[CTRL_ATTR_FAMILY_ID]); + } else { + rt = genl_family_find_byname( + nla_data(tb[CTRL_ATTR_FAMILY_NAME])); + if (!rt) + return -ENOENT; + fam_id = rt->id; + } + } + + rt = genl_family_find_byid(fam_id); + if (!rt) + return -ENOENT; + + if (!rt->policy) + return -ENODATA; + + err = netlink_policy_dump_start(rt->policy, rt->maxattr, &cb->args[1]); + if (err) + return err; + + while (netlink_policy_dump_loop(&cb->args[1])) { + void *hdr; + struct nlattr *nest; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &genl_ctrl, + NLM_F_MULTI, CTRL_CMD_GETPOLICY); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u16(skb, CTRL_ATTR_FAMILY_ID, rt->id)) + goto nla_put_failure; + + nest = nla_nest_start(skb, CTRL_ATTR_POLICY); + if (!nest) + goto nla_put_failure; + + if (netlink_policy_dump_write(skb, cb->args[1])) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + genlmsg_end(skb, hdr); + continue; + +nla_put_failure: + genlmsg_cancel(skb, hdr); + break; + } + + cb->args[0] = fam_id; + return skb->len; +} + static const struct genl_ops genl_ctrl_ops[] = { { .cmd = CTRL_CMD_GETFAMILY, @@ -1050,6 +1124,10 @@ static const struct genl_ops genl_ctrl_ops[] = { .doit = ctrl_getfamily, .dumpit = ctrl_dumpfamily, }, + { + .cmd = CTRL_CMD_GETPOLICY, + .dumpit = ctrl_dumppolicy, + }, }; static const struct genl_multicast_group genl_ctrl_groups[] = { diff --git a/net/netlink/policy.c b/net/netlink/policy.c new file mode 100644 index 000000000000..f6491853c797 --- /dev/null +++ b/net/netlink/policy.c @@ -0,0 +1,308 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * NETLINK Policy advertisement to userspace + * + * Authors: Johannes Berg <johannes@sipsolutions.net> + * + * Copyright 2019 Intel Corporation + */ + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <net/netlink.h> + +#define INITIAL_POLICIES_ALLOC 10 + +struct nl_policy_dump { + unsigned int policy_idx; + unsigned int attr_idx; + unsigned int n_alloc; + struct { + const struct nla_policy *policy; + unsigned int maxtype; + } policies[]; +}; + +static int add_policy(struct nl_policy_dump **statep, + const struct nla_policy *policy, + unsigned int maxtype) +{ + struct nl_policy_dump *state = *statep; + unsigned int n_alloc, i; + + if (!policy || !maxtype) + return 0; + + for (i = 0; i < state->n_alloc; i++) { + if (state->policies[i].policy == policy) + return 0; + + if (!state->policies[i].policy) { + state->policies[i].policy = policy; + state->policies[i].maxtype = maxtype; + return 0; + } + } + + n_alloc = state->n_alloc + INITIAL_POLICIES_ALLOC; + state = krealloc(state, struct_size(state, policies, n_alloc), + GFP_KERNEL); + if (!state) + return -ENOMEM; + + state->policies[state->n_alloc].policy = policy; + state->policies[state->n_alloc].maxtype = maxtype; + state->n_alloc = n_alloc; + *statep = state; + + return 0; +} + +static unsigned int get_policy_idx(struct nl_policy_dump *state, + const struct nla_policy *policy) +{ + unsigned int i; + + for (i = 0; i < state->n_alloc; i++) { + if (state->policies[i].policy == policy) + return i; + } + + WARN_ON_ONCE(1); + return -1; +} + +int netlink_policy_dump_start(const struct nla_policy *policy, + unsigned int maxtype, + unsigned long *_state) +{ + struct nl_policy_dump *state; + unsigned int policy_idx; + int err; + + /* also returns 0 if "*_state" is our ERR_PTR() end marker */ + if (*_state) + return 0; + + /* + * walk the policies and nested ones first, and build + * a linear list of them. + */ + + state = kzalloc(struct_size(state, policies, INITIAL_POLICIES_ALLOC), + GFP_KERNEL); + if (!state) + return -ENOMEM; + state->n_alloc = INITIAL_POLICIES_ALLOC; + + err = add_policy(&state, policy, maxtype); + if (err) + return err; + + for (policy_idx = 0; + policy_idx < state->n_alloc && state->policies[policy_idx].policy; + policy_idx++) { + const struct nla_policy *policy; + unsigned int type; + + policy = state->policies[policy_idx].policy; + + for (type = 0; + type <= state->policies[policy_idx].maxtype; + type++) { + switch (policy[type].type) { + case NLA_NESTED: + case NLA_NESTED_ARRAY: + err = add_policy(&state, + policy[type].nested_policy, + policy[type].len); + if (err) + return err; + break; + default: + break; + } + } + } + + *_state = (unsigned long)state; + + return 0; +} + +static bool netlink_policy_dump_finished(struct nl_policy_dump *state) +{ + return state->policy_idx >= state->n_alloc || + !state->policies[state->policy_idx].policy; +} + +bool netlink_policy_dump_loop(unsigned long *_state) +{ + struct nl_policy_dump *state = (void *)*_state; + + if (IS_ERR(state)) + return false; + + if (netlink_policy_dump_finished(state)) { + kfree(state); + /* store end marker instead of freed state */ + *_state = (unsigned long)ERR_PTR(-ENOENT); + return false; + } + + return true; +} + +int netlink_policy_dump_write(struct sk_buff *skb, unsigned long _state) +{ + struct nl_policy_dump *state = (void *)_state; + const struct nla_policy *pt; + struct nlattr *policy, *attr; + enum netlink_attribute_type type; + bool again; + +send_attribute: + again = false; + + pt = &state->policies[state->policy_idx].policy[state->attr_idx]; + + policy = nla_nest_start(skb, state->policy_idx); + if (!policy) + return -ENOBUFS; + + attr = nla_nest_start(skb, state->attr_idx); + if (!attr) + goto nla_put_failure; + + switch (pt->type) { + default: + case NLA_UNSPEC: + case NLA_REJECT: + /* skip - use NLA_MIN_LEN to advertise such */ + nla_nest_cancel(skb, policy); + again = true; + goto next; + case NLA_NESTED: + type = NL_ATTR_TYPE_NESTED; + /* fall through */ + case NLA_NESTED_ARRAY: + if (pt->type == NLA_NESTED_ARRAY) + type = NL_ATTR_TYPE_NESTED_ARRAY; + if (pt->nested_policy && pt->len && + (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_POLICY_IDX, + get_policy_idx(state, pt->nested_policy)) || + nla_put_u32(skb, NL_POLICY_TYPE_ATTR_POLICY_MAXTYPE, + pt->len))) + goto nla_put_failure; + break; + case NLA_U8: + case NLA_U16: + case NLA_U32: + case NLA_U64: + case NLA_MSECS: { + struct netlink_range_validation range; + + if (pt->type == NLA_U8) + type = NL_ATTR_TYPE_U8; + else if (pt->type == NLA_U16) + type = NL_ATTR_TYPE_U16; + else if (pt->type == NLA_U32) + type = NL_ATTR_TYPE_U32; + else + type = NL_ATTR_TYPE_U64; + + nla_get_range_unsigned(pt, &range); + + if (nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_U, + range.min, NL_POLICY_TYPE_ATTR_PAD) || + nla_put_u64_64bit(skb, NL_POLICY_TYPE_ATTR_MAX_VALUE_U, + range.max, NL_POLICY_TYPE_ATTR_PAD)) + goto nla_put_failure; + break; + } + case NLA_S8: + case NLA_S16: + case NLA_S32: + case NLA_S64: { + struct netlink_range_validation_signed range; + + if (pt->type == NLA_S8) + type = NL_ATTR_TYPE_S8; + else if (pt->type == NLA_S16) + type = NL_ATTR_TYPE_S16; + else if (pt->type == NLA_S32) + type = NL_ATTR_TYPE_S32; + else + type = NL_ATTR_TYPE_S64; + + nla_get_range_signed(pt, &range); + + if (nla_put_s64(skb, NL_POLICY_TYPE_ATTR_MIN_VALUE_S, + range.min, NL_POLICY_TYPE_ATTR_PAD) || + nla_put_s64(skb, NL_POLICY_TYPE_ATTR_MAX_VALUE_S, + range.max, NL_POLICY_TYPE_ATTR_PAD)) + goto nla_put_failure; + break; + } + case NLA_BITFIELD32: + type = NL_ATTR_TYPE_BITFIELD32; + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_BITFIELD32_MASK, + pt->bitfield32_valid)) + goto nla_put_failure; + break; + case NLA_EXACT_LEN: + type = NL_ATTR_TYPE_BINARY; + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH, pt->len) || + nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH, pt->len)) + goto nla_put_failure; + break; + case NLA_STRING: + case NLA_NUL_STRING: + case NLA_BINARY: + if (pt->type == NLA_STRING) + type = NL_ATTR_TYPE_STRING; + else if (pt->type == NLA_NUL_STRING) + type = NL_ATTR_TYPE_NUL_STRING; + else + type = NL_ATTR_TYPE_BINARY; + if (pt->len && nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MAX_LENGTH, + pt->len)) + goto nla_put_failure; + break; + case NLA_MIN_LEN: + type = NL_ATTR_TYPE_BINARY; + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_MIN_LENGTH, pt->len)) + goto nla_put_failure; + break; + case NLA_FLAG: + type = NL_ATTR_TYPE_FLAG; + break; + } + + if (nla_put_u32(skb, NL_POLICY_TYPE_ATTR_TYPE, type)) + goto nla_put_failure; + + /* finish and move state to next attribute */ + nla_nest_end(skb, attr); + nla_nest_end(skb, policy); + +next: + state->attr_idx += 1; + if (state->attr_idx > state->policies[state->policy_idx].maxtype) { + state->attr_idx = 0; + state->policy_idx++; + } + + if (again) { + if (netlink_policy_dump_finished(state)) + return -ENODATA; + goto send_attribute; + } + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, policy); + return -ENOBUFS; +} diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 7b1a74f74aad..eccc7d366e17 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -64,6 +64,26 @@ static DEFINE_SPINLOCK(nr_list_lock); static const struct proto_ops nr_proto_ops; /* + * NETROM network devices are virtual network devices encapsulating NETROM + * frames into AX.25 which will be sent through an AX.25 device, so form a + * special "super class" of normal net devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key nr_netdev_xmit_lock_key; + +static void nr_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &nr_netdev_xmit_lock_key); +} + +static void nr_set_lockdep_key(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, nr_set_lockdep_one, NULL); +} + +/* * Socket removal during an interrupt is now safe. */ static void nr_remove_socket(struct sock *sk) @@ -1394,6 +1414,7 @@ static int __init nr_proto_init(void) free_netdev(dev); goto fail; } + nr_set_lockdep_key(dev); dev_nr[i] = dev; } diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index e239a46c2f94..2016dd107939 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -82,7 +82,7 @@ struct datapath { u32 max_headroom; /* Switch meters. */ - struct hlist_head *meters; + struct dp_meter_table meter_tbl; }; /** diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 5010d1ddd4bd..3d3d8e094546 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -12,6 +12,7 @@ #include <linux/openvswitch.h> #include <linux/netlink.h> #include <linux/rculist.h> +#include <linux/swap.h> #include <net/netlink.h> #include <net/genetlink.h> @@ -19,8 +20,6 @@ #include "datapath.h" #include "meter.h" -#define METER_HASH_BUCKETS 1024 - static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, @@ -39,6 +38,11 @@ static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, }; +static u32 meter_hash(struct dp_meter_instance *ti, u32 id) +{ + return id % ti->n_meters; +} + static void ovs_meter_free(struct dp_meter *meter) { if (!meter) @@ -47,40 +51,162 @@ static void ovs_meter_free(struct dp_meter *meter) kfree_rcu(meter, rcu); } -static struct hlist_head *meter_hash_bucket(const struct datapath *dp, - u32 meter_id) -{ - return &dp->meters[meter_id & (METER_HASH_BUCKETS - 1)]; -} - /* Call with ovs_mutex or RCU read lock. */ -static struct dp_meter *lookup_meter(const struct datapath *dp, +static struct dp_meter *lookup_meter(const struct dp_meter_table *tbl, u32 meter_id) { + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 hash = meter_hash(ti, meter_id); struct dp_meter *meter; - struct hlist_head *head; - head = meter_hash_bucket(dp, meter_id); - hlist_for_each_entry_rcu(meter, head, dp_hash_node, - lockdep_ovsl_is_held()) { - if (meter->id == meter_id) - return meter; - } + meter = rcu_dereference_ovsl(ti->dp_meters[hash]); + if (meter && likely(meter->id == meter_id)) + return meter; + return NULL; } -static void attach_meter(struct datapath *dp, struct dp_meter *meter) +static struct dp_meter_instance *dp_meter_instance_alloc(const u32 size) +{ + struct dp_meter_instance *ti; + + ti = kvzalloc(sizeof(*ti) + + sizeof(struct dp_meter *) * size, + GFP_KERNEL); + if (!ti) + return NULL; + + ti->n_meters = size; + + return ti; +} + +static void dp_meter_instance_free(struct dp_meter_instance *ti) +{ + kvfree(ti); +} + +static void dp_meter_instance_free_rcu(struct rcu_head *rcu) { - struct hlist_head *head = meter_hash_bucket(dp, meter->id); + struct dp_meter_instance *ti; - hlist_add_head_rcu(&meter->dp_hash_node, head); + ti = container_of(rcu, struct dp_meter_instance, rcu); + kvfree(ti); } -static void detach_meter(struct dp_meter *meter) +static int +dp_meter_instance_realloc(struct dp_meter_table *tbl, u32 size) +{ + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + int n_meters = min(size, ti->n_meters); + struct dp_meter_instance *new_ti; + int i; + + new_ti = dp_meter_instance_alloc(size); + if (!new_ti) + return -ENOMEM; + + for (i = 0; i < n_meters; i++) + if (rcu_dereference_ovsl(ti->dp_meters[i])) + new_ti->dp_meters[i] = ti->dp_meters[i]; + + rcu_assign_pointer(tbl->ti, new_ti); + call_rcu(&ti->rcu, dp_meter_instance_free_rcu); + + return 0; +} + +static void dp_meter_instance_insert(struct dp_meter_instance *ti, + struct dp_meter *meter) +{ + u32 hash; + + hash = meter_hash(ti, meter->id); + rcu_assign_pointer(ti->dp_meters[hash], meter); +} + +static void dp_meter_instance_remove(struct dp_meter_instance *ti, + struct dp_meter *meter) { + u32 hash; + + hash = meter_hash(ti, meter->id); + RCU_INIT_POINTER(ti->dp_meters[hash], NULL); +} + +static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) +{ + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 hash = meter_hash(ti, meter->id); + int err; + + /* In generally, slots selected should be empty, because + * OvS uses id-pool to fetch a available id. + */ + if (unlikely(rcu_dereference_ovsl(ti->dp_meters[hash]))) + return -EBUSY; + + dp_meter_instance_insert(ti, meter); + + /* That function is thread-safe. */ + tbl->count++; + if (tbl->count >= tbl->max_meters_allowed) { + err = -EFBIG; + goto attach_err; + } + + if (tbl->count >= ti->n_meters && + dp_meter_instance_realloc(tbl, ti->n_meters * 2)) { + err = -ENOMEM; + goto attach_err; + } + + return 0; + +attach_err: + dp_meter_instance_remove(ti, meter); + tbl->count--; + return err; +} + +static int detach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) +{ + struct dp_meter_instance *ti; + ASSERT_OVSL(); - if (meter) - hlist_del_rcu(&meter->dp_hash_node); + if (!meter) + return 0; + + ti = rcu_dereference_ovsl(tbl->ti); + dp_meter_instance_remove(ti, meter); + + tbl->count--; + + /* Shrink the meter array if necessary. */ + if (ti->n_meters > DP_METER_ARRAY_SIZE_MIN && + tbl->count <= (ti->n_meters / 4)) { + int half_size = ti->n_meters / 2; + int i; + + /* Avoid hash collision, don't move slots to other place. + * Make sure there are no references of meters in array + * which will be released. + */ + for (i = half_size; i < ti->n_meters; i++) + if (rcu_dereference_ovsl(ti->dp_meters[i])) + goto out; + + if (dp_meter_instance_realloc(tbl, half_size)) + goto shrink_err; + } + +out: + return 0; + +shrink_err: + dp_meter_instance_insert(ti, meter); + tbl->count++; + return -ENOMEM; } static struct sk_buff * @@ -116,12 +242,11 @@ static int ovs_meter_cmd_reply_stats(struct sk_buff *reply, u32 meter_id, if (nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id)) goto error; - if (!meter) - return 0; - if (nla_put(reply, OVS_METER_ATTR_STATS, - sizeof(struct ovs_flow_stats), &meter->stats) || - nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used, + sizeof(struct ovs_flow_stats), &meter->stats)) + goto error; + + if (nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used, OVS_METER_ATTR_PAD)) goto error; @@ -150,18 +275,32 @@ error: static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) { - struct sk_buff *reply; + struct ovs_header *ovs_header = info->userhdr; struct ovs_header *ovs_reply_header; struct nlattr *nla, *band_nla; - int err; + struct sk_buff *reply; + struct datapath *dp; + int err = -EMSGSIZE; reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES, &ovs_reply_header); if (IS_ERR(reply)) return PTR_ERR(reply); - if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, U32_MAX) || - nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, + dp->meter_tbl.max_meters_allowed)) + goto exit_unlock; + + ovs_unlock(); + + if (nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) goto nla_put_failure; nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS); @@ -180,9 +319,10 @@ static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); +exit_unlock: + ovs_unlock(); nla_put_failure: nlmsg_free(reply); - err = -EMSGSIZE; return err; } @@ -252,8 +392,8 @@ static struct dp_meter *dp_meter_create(struct nlattr **a) * * Start with a full bucket. */ - band->bucket = (band->burst_size + band->rate) * 1000; - band_max_delta_t = band->bucket / band->rate; + band->bucket = (band->burst_size + band->rate) * 1000ULL; + band_max_delta_t = div_u64(band->bucket, band->rate); if (band_max_delta_t > meter->max_delta_t) meter->max_delta_t = band_max_delta_t; band++; @@ -273,14 +413,14 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) struct sk_buff *reply; struct ovs_header *ovs_reply_header; struct ovs_header *ovs_header = info->userhdr; + struct dp_meter_table *meter_tbl; struct datapath *dp; int err; u32 meter_id; bool failed; - if (!a[OVS_METER_ATTR_ID]) { - return -ENODEV; - } + if (!a[OVS_METER_ATTR_ID]) + return -EINVAL; meter = dp_meter_create(a); if (IS_ERR_OR_NULL(meter)) @@ -300,12 +440,18 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) goto exit_unlock; } + meter_tbl = &dp->meter_tbl; meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); - /* Cannot fail after this. */ - old_meter = lookup_meter(dp, meter_id); - detach_meter(old_meter); - attach_meter(dp, meter); + old_meter = lookup_meter(meter_tbl, meter_id); + err = detach_meter(meter_tbl, old_meter); + if (err) + goto exit_unlock; + + err = attach_meter(meter_tbl, meter); + if (err) + goto exit_unlock; + ovs_unlock(); /* Build response with the meter_id and stats from @@ -337,14 +483,14 @@ exit_free_meter: static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) { - struct nlattr **a = info->attrs; - u32 meter_id; struct ovs_header *ovs_header = info->userhdr; struct ovs_header *ovs_reply_header; + struct nlattr **a = info->attrs; + struct dp_meter *meter; + struct sk_buff *reply; struct datapath *dp; + u32 meter_id; int err; - struct sk_buff *reply; - struct dp_meter *meter; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; @@ -365,7 +511,7 @@ static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) } /* Locate meter, copy stats. */ - meter = lookup_meter(dp, meter_id); + meter = lookup_meter(&dp->meter_tbl, meter_id); if (!meter) { err = -ENOENT; goto exit_unlock; @@ -390,18 +536,17 @@ exit_unlock: static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) { - struct nlattr **a = info->attrs; - u32 meter_id; struct ovs_header *ovs_header = info->userhdr; struct ovs_header *ovs_reply_header; + struct nlattr **a = info->attrs; + struct dp_meter *old_meter; + struct sk_buff *reply; struct datapath *dp; + u32 meter_id; int err; - struct sk_buff *reply; - struct dp_meter *old_meter; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; - meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_DEL, &ovs_reply_header); @@ -416,14 +561,19 @@ static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) goto exit_unlock; } - old_meter = lookup_meter(dp, meter_id); + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + old_meter = lookup_meter(&dp->meter_tbl, meter_id); if (old_meter) { spin_lock_bh(&old_meter->lock); err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); WARN_ON(err); spin_unlock_bh(&old_meter->lock); - detach_meter(old_meter); + + err = detach_meter(&dp->meter_tbl, old_meter); + if (err) + goto exit_unlock; } + ovs_unlock(); ovs_meter_free(old_meter); genlmsg_end(reply, ovs_reply_header); @@ -443,16 +593,16 @@ exit_unlock: bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, u32 meter_id) { - struct dp_meter *meter; - struct dp_meter_band *band; long long int now_ms = div_u64(ktime_get_ns(), 1000 * 1000); long long int long_delta_ms; - u32 delta_ms; - u32 cost; + struct dp_meter_band *band; + struct dp_meter *meter; int i, band_exceeded_max = -1; u32 band_exceeded_rate = 0; + u32 delta_ms; + u32 cost; - meter = lookup_meter(dp, meter_id); + meter = lookup_meter(&dp->meter_tbl, meter_id); /* Do not drop the packet when there is no meter. */ if (!meter) return false; @@ -570,32 +720,39 @@ struct genl_family dp_meter_genl_family __ro_after_init = { int ovs_meters_init(struct datapath *dp) { - int i; - - dp->meters = kmalloc_array(METER_HASH_BUCKETS, - sizeof(struct hlist_head), GFP_KERNEL); + struct dp_meter_table *tbl = &dp->meter_tbl; + struct dp_meter_instance *ti; + unsigned long free_mem_bytes; - if (!dp->meters) + ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN); + if (!ti) return -ENOMEM; - for (i = 0; i < METER_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&dp->meters[i]); + /* Allow meters in a datapath to use ~3.12% of physical memory. */ + free_mem_bytes = nr_free_buffer_pages() * (PAGE_SIZE >> 5); + tbl->max_meters_allowed = min(free_mem_bytes / sizeof(struct dp_meter), + DP_METER_NUM_MAX); + if (!tbl->max_meters_allowed) + goto out_err; + + rcu_assign_pointer(tbl->ti, ti); + tbl->count = 0; return 0; + +out_err: + dp_meter_instance_free(ti); + return -ENOMEM; } void ovs_meters_exit(struct datapath *dp) { + struct dp_meter_table *tbl = &dp->meter_tbl; + struct dp_meter_instance *ti = rcu_dereference_raw(tbl->ti); int i; - for (i = 0; i < METER_HASH_BUCKETS; i++) { - struct hlist_head *head = &dp->meters[i]; - struct dp_meter *meter; - struct hlist_node *n; - - hlist_for_each_entry_safe(meter, n, head, dp_hash_node) - kfree(meter); - } + for (i = 0; i < ti->n_meters; i++) + ovs_meter_free(rcu_dereference_raw(ti->dp_meters[i])); - kfree(dp->meters); + dp_meter_instance_free(ti); } diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h index f645913870bd..0c33889a8515 100644 --- a/net/openvswitch/meter.h +++ b/net/openvswitch/meter.h @@ -13,26 +13,26 @@ #include <linux/openvswitch.h> #include <linux/genetlink.h> #include <linux/skbuff.h> +#include <linux/bits.h> #include "flow.h" struct datapath; #define DP_MAX_BANDS 1 +#define DP_METER_ARRAY_SIZE_MIN BIT_ULL(10) +#define DP_METER_NUM_MAX (200000UL) struct dp_meter_band { u32 type; u32 rate; u32 burst_size; - u32 bucket; /* 1/1000 packets, or in bits */ + u64 bucket; /* 1/1000 packets, or in bits */ struct ovs_flow_stats stats; }; struct dp_meter { spinlock_t lock; /* Per meter lock */ struct rcu_head rcu; - struct hlist_node dp_hash_node; /*Element in datapath->meters - * hash table. - */ u32 id; u16 kbps:1, keep_stats:1; u16 n_bands; @@ -42,6 +42,18 @@ struct dp_meter { struct dp_meter_band bands[]; }; +struct dp_meter_instance { + struct rcu_head rcu; + u32 n_meters; + struct dp_meter __rcu *dp_meters[]; +}; + +struct dp_meter_table { + struct dp_meter_instance __rcu *ti; + u32 count; + u32 max_meters_allowed; +}; + extern struct genl_family dp_meter_genl_family; int ovs_meters_init(struct datapath *dp); void ovs_meters_exit(struct datapath *dp); diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c index 251e750fd9aa..0d0bf41381c2 100644 --- a/net/phonet/sysctl.c +++ b/net/phonet/sysctl.c @@ -49,8 +49,7 @@ void phonet_get_local_port_range(int *min, int *max) } static int proc_local_port_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int range[2] = {local_port_range[0], local_port_range[1]}; diff --git a/net/psample/psample.c b/net/psample/psample.c index 6f2fbc6b9eb2..a042261a45c5 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -14,6 +14,8 @@ #include <net/genetlink.h> #include <net/psample.h> #include <linux/spinlock.h> +#include <net/ip_tunnels.h> +#include <net/dst_metadata.h> #define PSAMPLE_MAX_PACKET_SIZE 0xffff @@ -207,10 +209,159 @@ void psample_group_put(struct psample_group *group) } EXPORT_SYMBOL_GPL(psample_group_put); +#ifdef CONFIG_INET +static int __psample_ip_tun_to_nlattr(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + unsigned short tun_proto = ip_tunnel_info_af(tun_info); + const void *tun_opts = ip_tunnel_info_opts(tun_info); + const struct ip_tunnel_key *tun_key = &tun_info->key; + int tun_opts_len = tun_info->options_len; + + if (tun_key->tun_flags & TUNNEL_KEY && + nla_put_be64(skb, PSAMPLE_TUNNEL_KEY_ATTR_ID, tun_key->tun_id, + PSAMPLE_TUNNEL_KEY_ATTR_PAD)) + return -EMSGSIZE; + + if (tun_info->mode & IP_TUNNEL_INFO_BRIDGE && + nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE)) + return -EMSGSIZE; + + switch (tun_proto) { + case AF_INET: + if (tun_key->u.ipv4.src && + nla_put_in_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_SRC, + tun_key->u.ipv4.src)) + return -EMSGSIZE; + if (tun_key->u.ipv4.dst && + nla_put_in_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV4_DST, + tun_key->u.ipv4.dst)) + return -EMSGSIZE; + break; + case AF_INET6: + if (!ipv6_addr_any(&tun_key->u.ipv6.src) && + nla_put_in6_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV6_SRC, + &tun_key->u.ipv6.src)) + return -EMSGSIZE; + if (!ipv6_addr_any(&tun_key->u.ipv6.dst) && + nla_put_in6_addr(skb, PSAMPLE_TUNNEL_KEY_ATTR_IPV6_DST, + &tun_key->u.ipv6.dst)) + return -EMSGSIZE; + break; + } + if (tun_key->tos && + nla_put_u8(skb, PSAMPLE_TUNNEL_KEY_ATTR_TOS, tun_key->tos)) + return -EMSGSIZE; + if (nla_put_u8(skb, PSAMPLE_TUNNEL_KEY_ATTR_TTL, tun_key->ttl)) + return -EMSGSIZE; + if ((tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) && + nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) + return -EMSGSIZE; + if ((tun_key->tun_flags & TUNNEL_CSUM) && + nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_CSUM)) + return -EMSGSIZE; + if (tun_key->tp_src && + nla_put_be16(skb, PSAMPLE_TUNNEL_KEY_ATTR_TP_SRC, tun_key->tp_src)) + return -EMSGSIZE; + if (tun_key->tp_dst && + nla_put_be16(skb, PSAMPLE_TUNNEL_KEY_ATTR_TP_DST, tun_key->tp_dst)) + return -EMSGSIZE; + if ((tun_key->tun_flags & TUNNEL_OAM) && + nla_put_flag(skb, PSAMPLE_TUNNEL_KEY_ATTR_OAM)) + return -EMSGSIZE; + if (tun_opts_len) { + if (tun_key->tun_flags & TUNNEL_GENEVE_OPT && + nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_GENEVE_OPTS, + tun_opts_len, tun_opts)) + return -EMSGSIZE; + else if (tun_key->tun_flags & TUNNEL_ERSPAN_OPT && + nla_put(skb, PSAMPLE_TUNNEL_KEY_ATTR_ERSPAN_OPTS, + tun_opts_len, tun_opts)) + return -EMSGSIZE; + } + + return 0; +} + +static int psample_ip_tun_to_nlattr(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + struct nlattr *nla; + int err; + + nla = nla_nest_start_noflag(skb, PSAMPLE_ATTR_TUNNEL); + if (!nla) + return -EMSGSIZE; + + err = __psample_ip_tun_to_nlattr(skb, tun_info); + if (err) { + nla_nest_cancel(skb, nla); + return err; + } + + nla_nest_end(skb, nla); + + return 0; +} + +static int psample_tunnel_meta_len(struct ip_tunnel_info *tun_info) +{ + unsigned short tun_proto = ip_tunnel_info_af(tun_info); + const struct ip_tunnel_key *tun_key = &tun_info->key; + int tun_opts_len = tun_info->options_len; + int sum = 0; + + if (tun_key->tun_flags & TUNNEL_KEY) + sum += nla_total_size(sizeof(u64)); + + if (tun_info->mode & IP_TUNNEL_INFO_BRIDGE) + sum += nla_total_size(0); + + switch (tun_proto) { + case AF_INET: + if (tun_key->u.ipv4.src) + sum += nla_total_size(sizeof(u32)); + if (tun_key->u.ipv4.dst) + sum += nla_total_size(sizeof(u32)); + break; + case AF_INET6: + if (!ipv6_addr_any(&tun_key->u.ipv6.src)) + sum += nla_total_size(sizeof(struct in6_addr)); + if (!ipv6_addr_any(&tun_key->u.ipv6.dst)) + sum += nla_total_size(sizeof(struct in6_addr)); + break; + } + if (tun_key->tos) + sum += nla_total_size(sizeof(u8)); + sum += nla_total_size(sizeof(u8)); /* TTL */ + if (tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) + sum += nla_total_size(0); + if (tun_key->tun_flags & TUNNEL_CSUM) + sum += nla_total_size(0); + if (tun_key->tp_src) + sum += nla_total_size(sizeof(u16)); + if (tun_key->tp_dst) + sum += nla_total_size(sizeof(u16)); + if (tun_key->tun_flags & TUNNEL_OAM) + sum += nla_total_size(0); + if (tun_opts_len) { + if (tun_key->tun_flags & TUNNEL_GENEVE_OPT) + sum += nla_total_size(tun_opts_len); + else if (tun_key->tun_flags & TUNNEL_ERSPAN_OPT) + sum += nla_total_size(tun_opts_len); + } + + return sum; +} +#endif + void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, u32 trunc_size, int in_ifindex, int out_ifindex, u32 sample_rate) { +#ifdef CONFIG_INET + struct ip_tunnel_info *tun_info; +#endif struct sk_buff *nl_skb; int data_len; int meta_len; @@ -224,6 +375,12 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, nla_total_size(sizeof(u32)) + /* group_num */ nla_total_size(sizeof(u32)); /* seq */ +#ifdef CONFIG_INET + tun_info = skb_tunnel_info(skb); + if (tun_info) + meta_len += psample_tunnel_meta_len(tun_info); +#endif + data_len = min(skb->len, trunc_size); if (meta_len + nla_total_size(data_len) > PSAMPLE_MAX_PACKET_SIZE) data_len = PSAMPLE_MAX_PACKET_SIZE - meta_len - NLA_HDRLEN @@ -278,6 +435,14 @@ void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, goto error; } +#ifdef CONFIG_INET + if (tun_info) { + ret = psample_ip_tun_to_nlattr(nl_skb, tun_info); + if (unlikely(ret < 0)) + goto error; + } +#endif + genlmsg_end(nl_skb, data); genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0, PSAMPLE_NL_MCGRP_SAMPLE, GFP_ATOMIC); diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig index 63f89cc6e82c..f362ca316015 100644 --- a/net/qrtr/Kconfig +++ b/net/qrtr/Kconfig @@ -4,7 +4,6 @@ config QRTR tristate "Qualcomm IPC Router support" - depends on ARCH_QCOM || COMPILE_TEST ---help--- Say Y if you intend to use Qualcomm IPC router protocol. The protocol is used to communicate with services provided by other @@ -29,4 +28,11 @@ config QRTR_TUN implement endpoints of QRTR, for purpose of tunneling data to other hosts or testing purposes. +config QRTR_MHI + tristate "MHI IPC Router channels" + depends on MHI_BUS + help + Say Y here to support MHI based ipcrouter channels. MHI is the + transport used for communicating to external modems. + endif # QRTR diff --git a/net/qrtr/Makefile b/net/qrtr/Makefile index 32d4e923925d..1b1411d158a7 100644 --- a/net/qrtr/Makefile +++ b/net/qrtr/Makefile @@ -5,3 +5,5 @@ obj-$(CONFIG_QRTR_SMD) += qrtr-smd.o qrtr-smd-y := smd.o obj-$(CONFIG_QRTR_TUN) += qrtr-tun.o qrtr-tun-y := tun.o +obj-$(CONFIG_QRTR_MHI) += qrtr-mhi.o +qrtr-mhi-y := mhi.o diff --git a/net/qrtr/mhi.c b/net/qrtr/mhi.c new file mode 100644 index 000000000000..ff0c41467fc1 --- /dev/null +++ b/net/qrtr/mhi.c @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018-2020, The Linux Foundation. All rights reserved. + */ + +#include <linux/mhi.h> +#include <linux/mod_devicetable.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <net/sock.h> + +#include "qrtr.h" + +struct qrtr_mhi_dev { + struct qrtr_endpoint ep; + struct mhi_device *mhi_dev; + struct device *dev; +}; + +/* From MHI to QRTR */ +static void qcom_mhi_qrtr_dl_callback(struct mhi_device *mhi_dev, + struct mhi_result *mhi_res) +{ + struct qrtr_mhi_dev *qdev = dev_get_drvdata(&mhi_dev->dev); + int rc; + + if (!qdev || mhi_res->transaction_status) + return; + + rc = qrtr_endpoint_post(&qdev->ep, mhi_res->buf_addr, + mhi_res->bytes_xferd); + if (rc == -EINVAL) + dev_err(qdev->dev, "invalid ipcrouter packet\n"); +} + +/* From QRTR to MHI */ +static void qcom_mhi_qrtr_ul_callback(struct mhi_device *mhi_dev, + struct mhi_result *mhi_res) +{ + struct sk_buff *skb = mhi_res->buf_addr; + + if (skb->sk) + sock_put(skb->sk); + consume_skb(skb); +} + +/* Send data over MHI */ +static int qcom_mhi_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb) +{ + struct qrtr_mhi_dev *qdev = container_of(ep, struct qrtr_mhi_dev, ep); + int rc; + + rc = skb_linearize(skb); + if (rc) + goto free_skb; + + rc = mhi_queue_skb(qdev->mhi_dev, DMA_TO_DEVICE, skb, skb->len, + MHI_EOT); + if (rc) + goto free_skb; + + if (skb->sk) + sock_hold(skb->sk); + + return rc; + +free_skb: + kfree_skb(skb); + + return rc; +} + +static int qcom_mhi_qrtr_probe(struct mhi_device *mhi_dev, + const struct mhi_device_id *id) +{ + struct qrtr_mhi_dev *qdev; + int rc; + + qdev = devm_kzalloc(&mhi_dev->dev, sizeof(*qdev), GFP_KERNEL); + if (!qdev) + return -ENOMEM; + + qdev->mhi_dev = mhi_dev; + qdev->dev = &mhi_dev->dev; + qdev->ep.xmit = qcom_mhi_qrtr_send; + + dev_set_drvdata(&mhi_dev->dev, qdev); + rc = qrtr_endpoint_register(&qdev->ep, QRTR_EP_NID_AUTO); + if (rc) + return rc; + + dev_dbg(qdev->dev, "Qualcomm MHI QRTR driver probed\n"); + + return 0; +} + +static void qcom_mhi_qrtr_remove(struct mhi_device *mhi_dev) +{ + struct qrtr_mhi_dev *qdev = dev_get_drvdata(&mhi_dev->dev); + + qrtr_endpoint_unregister(&qdev->ep); + dev_set_drvdata(&mhi_dev->dev, NULL); +} + +static const struct mhi_device_id qcom_mhi_qrtr_id_table[] = { + { .chan = "IPCR" }, + {} +}; +MODULE_DEVICE_TABLE(mhi, qcom_mhi_qrtr_id_table); + +static struct mhi_driver qcom_mhi_qrtr_driver = { + .probe = qcom_mhi_qrtr_probe, + .remove = qcom_mhi_qrtr_remove, + .dl_xfer_cb = qcom_mhi_qrtr_dl_callback, + .ul_xfer_cb = qcom_mhi_qrtr_ul_callback, + .id_table = qcom_mhi_qrtr_id_table, + .driver = { + .name = "qcom_mhi_qrtr", + }, +}; + +module_mhi_driver(qcom_mhi_qrtr_driver); + +MODULE_AUTHOR("Chris Lew <clew@codeaurora.org>"); +MODULE_AUTHOR("Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>"); +MODULE_DESCRIPTION("Qualcomm IPC-Router MHI interface driver"); +MODULE_LICENSE("GPL v2"); diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index c5b3202a14ca..d8252fdab851 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -12,6 +12,9 @@ #include "qrtr.h" +#define CREATE_TRACE_POINTS +#include <trace/events/qrtr.h> + static RADIX_TREE(nodes, GFP_KERNEL); static struct { @@ -105,8 +108,8 @@ static int service_announce_new(struct sockaddr_qrtr *dest, struct msghdr msg = { }; struct kvec iv; - trace_printk("advertising new server [%d:%x]@[%d:%d]\n", - srv->service, srv->instance, srv->node, srv->port); + trace_qrtr_ns_service_announce_new(srv->service, srv->instance, + srv->node, srv->port); iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); @@ -132,8 +135,8 @@ static int service_announce_del(struct sockaddr_qrtr *dest, struct kvec iv; int ret; - trace_printk("advertising removal of server [%d:%x]@[%d:%d]\n", - srv->service, srv->instance, srv->node, srv->port); + trace_qrtr_ns_service_announce_del(srv->service, srv->instance, + srv->node, srv->port); iv.iov_base = &pkt; iv.iov_len = sizeof(pkt); @@ -244,8 +247,8 @@ static struct qrtr_server *server_add(unsigned int service, radix_tree_insert(&node->servers, port, srv); - trace_printk("add server [%d:%x]@[%d:%d]\n", srv->service, - srv->instance, srv->node, srv->port); + trace_qrtr_ns_server_add(srv->service, srv->instance, + srv->node, srv->port); return srv; @@ -633,9 +636,8 @@ static void qrtr_ns_worker(struct work_struct *work) cmd = le32_to_cpu(pkt->cmd); if (cmd < ARRAY_SIZE(qrtr_ctrl_pkt_strings) && qrtr_ctrl_pkt_strings[cmd]) - trace_printk("%s from %d:%d\n", - qrtr_ctrl_pkt_strings[cmd], sq.sq_node, - sq.sq_port); + trace_qrtr_ns_message(qrtr_ctrl_pkt_strings[cmd], + sq.sq_node, sq.sq_port); ret = 0; switch (cmd) { diff --git a/net/rds/info.c b/net/rds/info.c index 03f6fd56d237..b6b46a8214a0 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -162,7 +162,6 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, struct rds_info_lengths lens; unsigned long nr_pages = 0; unsigned long start; - unsigned long i; rds_info_func func; struct page **pages = NULL; int ret; @@ -193,7 +192,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, ret = -ENOMEM; goto out; } - ret = get_user_pages_fast(start, nr_pages, FOLL_WRITE, pages); + ret = pin_user_pages_fast(start, nr_pages, FOLL_WRITE, pages); if (ret != nr_pages) { if (ret > 0) nr_pages = ret; @@ -235,8 +234,8 @@ call_func: ret = -EFAULT; out: - for (i = 0; pages && i < nr_pages; i++) - put_page(pages[i]); + if (pages) + unpin_user_pages(pages, nr_pages); kfree(pages); return ret; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 66121bc6f34e..43db0eca911f 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -62,8 +62,7 @@ static atomic_t rds_tcp_unloading = ATOMIC_INIT(0); static struct kmem_cache *rds_tcp_conn_slab; static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *fpos); + void *buffer, size_t *lenp, loff_t *fpos); static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; @@ -90,15 +89,6 @@ static struct ctl_table rds_tcp_sysctl_table[] = { { } }; -/* doing it this way avoids calling tcp_sk() */ -void rds_tcp_nonagle(struct socket *sock) -{ - int val = 1; - - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (void *)&val, - sizeof(val)); -} - u32 rds_tcp_write_seq(struct rds_tcp_connection *tc) { /* seq# of the last byte of data in tcp send buffer */ @@ -503,7 +493,7 @@ void rds_tcp_tune(struct socket *sock) struct net *net = sock_net(sk); struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); - rds_tcp_nonagle(sock); + tcp_sock_set_nodelay(sock->sk); lock_sock(sk); if (rtn->sndbuf_size > 0) { sk->sk_sndbuf = rtn->sndbuf_size; @@ -676,8 +666,7 @@ static void rds_tcp_sysctl_reset(struct net *net) } static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *fpos) + void *buffer, size_t *lenp, loff_t *fpos) { struct net *net = current->nsproxy->net_ns; int err; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 3c69361d21c7..bad9cf49d565 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -50,7 +50,6 @@ struct rds_tcp_statistics { /* tcp.c */ void rds_tcp_tune(struct socket *sock); -void rds_tcp_nonagle(struct socket *sock); void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_restore_callbacks(struct socket *sock, @@ -71,9 +70,8 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6); void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); void rds_tcp_listen_data_ready(struct sock *sk); int rds_tcp_accept_one(struct socket *sock); -int rds_tcp_keepalive(struct socket *sock); +void rds_tcp_keepalive(struct socket *sock); void *rds_tcp_listen_sock_def_readable(struct net *net); -void rds_tcp_set_linger(struct socket *sock); /* tcp_recv.c */ int rds_tcp_recv_init(void); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 008f50fb25dd..4e64598176b0 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -207,7 +207,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp) if (sock) { if (rds_destroy_pending(cp->cp_conn)) - rds_tcp_set_linger(sock); + sock_no_linger(sock->sk); sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); lock_sock(sock->sk); rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */ diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 810a3a49e947..101cf14215a0 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -38,36 +38,19 @@ #include "rds.h" #include "tcp.h" -int rds_tcp_keepalive(struct socket *sock) +void rds_tcp_keepalive(struct socket *sock) { /* values below based on xs_udp_default_timeout */ int keepidle = 5; /* send a probe 'keepidle' secs after last data */ int keepcnt = 5; /* number of unack'ed probes before declaring dead */ - int keepalive = 1; - int ret = 0; - - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&keepalive, sizeof(keepalive)); - if (ret < 0) - goto bail; - - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, - (char *)&keepcnt, sizeof(keepcnt)); - if (ret < 0) - goto bail; - - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPIDLE, - (char *)&keepidle, sizeof(keepidle)); - if (ret < 0) - goto bail; + sock_set_keepalive(sock->sk); + tcp_sock_set_keepcnt(sock->sk, keepcnt); + tcp_sock_set_keepidle(sock->sk, keepidle); /* KEEPINTVL is the interval between successive probes. We follow * the model in xs_tcp_finish_connecting() and re-use keepidle. */ - ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPINTVL, - (char *)&keepidle, sizeof(keepidle)); -bail: - return ret; + tcp_sock_set_keepintvl(sock->sk, keepidle); } /* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the @@ -111,17 +94,6 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) return NULL; } -void rds_tcp_set_linger(struct socket *sock) -{ - struct linger no_linger = { - .l_onoff = 1, - .l_linger = 0, - }; - - kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&no_linger, sizeof(no_linger)); -} - int rds_tcp_accept_one(struct socket *sock) { struct socket *new_sock = NULL; @@ -160,10 +132,7 @@ int rds_tcp_accept_one(struct socket *sock) new_sock->ops = sock->ops; __module_get(new_sock->ops->owner); - ret = rds_tcp_keepalive(new_sock); - if (ret < 0) - goto out; - + rds_tcp_keepalive(new_sock); rds_tcp_tune(new_sock); inet = inet_sk(new_sock->sk); @@ -241,7 +210,7 @@ rst_nsk: * be pending on it. By setting linger, we achieve the side-effect * of avoiding TIME_WAIT state on new_sock. */ - rds_tcp_set_linger(new_sock); + sock_no_linger(new_sock->sk); kernel_sock_shutdown(new_sock, SHUT_RDWR); ret = 0; out: @@ -303,7 +272,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6) } sock->sk->sk_reuse = SK_CAN_REUSE; - rds_tcp_nonagle(sock); + tcp_sock_set_nodelay(sock->sk); write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = sock->sk->sk_data_ready; diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 78a2554a4497..8c4d1d6e9249 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -38,23 +38,18 @@ #include "rds.h" #include "tcp.h" -static void rds_tcp_cork(struct socket *sock, int val) -{ - kernel_setsockopt(sock, SOL_TCP, TCP_CORK, (void *)&val, sizeof(val)); -} - void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp) { struct rds_tcp_connection *tc = cp->cp_transport_data; - rds_tcp_cork(tc->t_sock, 1); + tcp_sock_set_cork(tc->t_sock->sk, true); } void rds_tcp_xmit_path_complete(struct rds_conn_path *cp) { struct rds_tcp_connection *tc = cp->cp_transport_data; - rds_tcp_cork(tc->t_sock, 0); + tcp_sock_set_cork(tc->t_sock->sk, false); } /* the core send_sem serializes this with other xmit and shutdown */ diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 1e8eeb044b07..e7a872207b46 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -65,6 +65,26 @@ static const struct proto_ops rose_proto_ops; ax25_address rose_callsign; /* + * ROSE network devices are virtual network devices encapsulating ROSE + * frames into AX.25 which will be sent through an AX.25 device, so form a + * special "super class" of normal net devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key rose_netdev_xmit_lock_key; + +static void rose_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &rose_netdev_xmit_lock_key); +} + +static void rose_set_lockdep_key(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, rose_set_lockdep_one, NULL); +} + +/* * Convert a ROSE address into text. */ char *rose2asc(char *buf, const rose_address *addr) @@ -1511,6 +1531,7 @@ static int __init rose_proto_init(void) free_netdev(dev); goto fail; } + rose_set_lockdep_key(dev); dev_rose[i] = dev; } diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index 57ebb29c26ad..d706bb408365 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -18,7 +18,7 @@ config AF_RXRPC This module at the moment only supports client operations and is currently incomplete. - See Documentation/networking/rxrpc.txt. + See Documentation/networking/rxrpc.rst. config AF_RXRPC_IPV6 bool "IPv6 support for RxRPC" @@ -41,7 +41,7 @@ config AF_RXRPC_DEBUG help Say Y here to make runtime controllable debugging messages appear. - See Documentation/networking/rxrpc.txt. + See Documentation/networking/rxrpc.rst. config RXKAD @@ -56,4 +56,4 @@ config RXKAD Provide kerberos 4 and AFS kaserver security handling for AF_RXRPC through the use of the key retention service. - See Documentation/networking/rxrpc.txt. + See Documentation/networking/rxrpc.rst. diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 15ee92d79581..394189b81849 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -571,6 +571,19 @@ out: return ret; } +int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val) +{ + if (sk->sk_state != RXRPC_UNBOUND) + return -EISCONN; + if (val > RXRPC_SECURITY_MAX) + return -EINVAL; + lock_sock(sk); + rxrpc_sk(sk)->min_sec_level = val; + release_sock(sk); + return 0; +} +EXPORT_SYMBOL(rxrpc_sock_set_min_security_level); + /* * set RxRPC socket options */ diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 01135e54d95d..c8b2097f499c 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -107,7 +107,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) { struct sock *usk; - int ret, opt; + int ret; _enter("%p{%d,%d}", local, local->srx.transport_type, local->srx.transport.family); @@ -157,13 +157,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) switch (local->srx.transport.family) { case AF_INET6: /* we want to receive ICMPv6 errors */ - opt = 1; - ret = kernel_setsockopt(local->socket, SOL_IPV6, IPV6_RECVERR, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + ip6_sock_set_recverr(local->socket->sk); /* Fall through and set IPv4 options too otherwise we don't get * errors from IPv4 packets sent through the IPv6 socket. @@ -171,31 +165,13 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) /* Fall through */ case AF_INET: /* we want to receive ICMP errors */ - opt = 1; - ret = kernel_setsockopt(local->socket, SOL_IP, IP_RECVERR, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + ip_sock_set_recverr(local->socket->sk); /* we want to set the don't fragment bit */ - opt = IP_PMTUDISC_DO; - ret = kernel_setsockopt(local->socket, SOL_IP, IP_MTU_DISCOVER, - (char *) &opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + ip_sock_set_mtu_discover(local->socket->sk, IP_PMTUDISC_DO); /* We want receive timestamps. */ - opt = 1; - ret = kernel_setsockopt(local->socket, SOL_SOCKET, SO_TIMESTAMPNS_OLD, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + sock_enable_timestamps(local->socket->sk); break; default: diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index f8b632a5c619..1ba43c3df4ad 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -321,7 +321,7 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, struct kvec iov[2]; rxrpc_serial_t serial; size_t len; - int ret, opt; + int ret; _enter(",{%d}", skb->len); @@ -473,18 +473,14 @@ send_fragmentable: switch (conn->params.local->srx.transport.family) { case AF_INET6: case AF_INET: - opt = IP_PMTUDISC_DONT; - kernel_setsockopt(conn->params.local->socket, - SOL_IP, IP_MTU_DISCOVER, - (char *)&opt, sizeof(opt)); + ip_sock_set_mtu_discover(conn->params.local->socket->sk, + IP_PMTUDISC_DONT); ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); conn->params.peer->last_tx_at = ktime_get_seconds(); - opt = IP_PMTUDISC_DO; - kernel_setsockopt(conn->params.local->socket, - SOL_IP, IP_MTU_DISCOVER, - (char *)&opt, sizeof(opt)); + ip_sock_set_mtu_discover(conn->params.local->socket->sk, + IP_PMTUDISC_DO); break; default: diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c index 18dade4e6f9a..e91acc95ff28 100644 --- a/net/rxrpc/sysctl.c +++ b/net/rxrpc/sysctl.c @@ -21,7 +21,7 @@ static const unsigned long max_jiffies = MAX_JIFFY_OFFSET; /* * RxRPC operating parameters. * - * See Documentation/networking/rxrpc.txt and the variable definitions for more + * See Documentation/networking/rxrpc.rst and the variable definitions for more * information on the individual parameters. */ static struct ctl_table rxrpc_sysctl_table[] = { diff --git a/net/sched/Kconfig b/net/sched/Kconfig index bfbefb7bff9d..2f20073f4f84 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -981,6 +981,18 @@ config NET_ACT_CT To compile this code as a module, choose M here: the module will be called act_ct. +config NET_ACT_GATE + tristate "Frame gate entry list control tc action" + depends on NET_CLS_ACT + help + Say Y here to allow to control the ingress flow to be passed at + specific time slot and be dropped at other specific time slot by + the gate entry list. + + If unsure, say N. + To compile this code as a module, choose M here: the + module will be called act_gate. + config NET_IFE_SKBMARK tristate "Support to encoding decoding skb mark on IFE action" depends on NET_ACT_IFE diff --git a/net/sched/Makefile b/net/sched/Makefile index 31c367a6cd09..66bbf9a98f9e 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o obj-$(CONFIG_NET_IFE_SKBTCINDEX) += act_meta_skbtcindex.o obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o obj-$(CONFIG_NET_ACT_CT) += act_ct.o +obj-$(CONFIG_NET_ACT_GATE) += act_gate.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index df4560909157..8ac7eb0a8309 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -766,12 +766,10 @@ tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) return a->ops->dump(skb, a, bind, ref); } -int -tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +static int +tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a) { - int err = -EINVAL; unsigned char *b = skb_tail_pointer(skb); - struct nlattr *nest; struct tc_cookie *cookie; if (nla_put_string(skb, TCA_KIND, a->ops->kind)) @@ -789,6 +787,23 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) } rcu_read_unlock(); + return 0; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +int +tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + int err = -EINVAL; + unsigned char *b = skb_tail_pointer(skb); + struct nlattr *nest; + + if (tcf_action_dump_terse(skb, a)) + goto nla_put_failure; + if (a->hw_stats != TCA_ACT_HW_STATS_ANY && nla_put_bitfield32(skb, TCA_ACT_HW_STATS, a->hw_stats, TCA_ACT_HW_STATS_ANY)) @@ -820,7 +835,7 @@ nla_put_failure: EXPORT_SYMBOL(tcf_action_dump_1); int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], - int bind, int ref) + int bind, int ref, bool terse) { struct tc_action *a; int err = -EINVAL, i; @@ -831,7 +846,8 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], nest = nla_nest_start_noflag(skb, i + 1); if (nest == NULL) goto nla_put_failure; - err = tcf_action_dump_1(skb, a, bind, ref); + err = terse ? tcf_action_dump_terse(skb, a) : + tcf_action_dump_1(skb, a, bind, ref); if (err < 0) goto errout; nla_nest_end(skb, nest); @@ -876,19 +892,14 @@ static u8 tcf_action_hw_stats_get(struct nlattr *hw_stats_attr) return hw_stats_bf.value; } -static const u32 tca_act_flags_allowed = TCA_ACT_FLAGS_NO_PERCPU_STATS; -static const u32 tca_act_hw_stats_allowed = TCA_ACT_HW_STATS_ANY; - static const struct nla_policy tcf_action_policy[TCA_ACT_MAX + 1] = { [TCA_ACT_KIND] = { .type = NLA_STRING }, [TCA_ACT_INDEX] = { .type = NLA_U32 }, [TCA_ACT_COOKIE] = { .type = NLA_BINARY, .len = TC_COOKIE_MAX_SIZE }, [TCA_ACT_OPTIONS] = { .type = NLA_NESTED }, - [TCA_ACT_FLAGS] = { .type = NLA_BITFIELD32, - .validation_data = &tca_act_flags_allowed }, - [TCA_ACT_HW_STATS] = { .type = NLA_BITFIELD32, - .validation_data = &tca_act_hw_stats_allowed }, + [TCA_ACT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_ACT_FLAGS_NO_PERCPU_STATS), + [TCA_ACT_HW_STATS] = NLA_POLICY_BITFIELD32(TCA_ACT_HW_STATS_ANY), }; struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, @@ -1138,7 +1149,7 @@ static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], if (!nest) goto out_nlmsg_trim; - if (tcf_action_dump(skb, actions, bind, ref) < 0) + if (tcf_action_dump(skb, actions, bind, ref, false) < 0) goto out_nlmsg_trim; nla_nest_end(skb, nest); @@ -1454,10 +1465,8 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, return ret; } -static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = { - [TCA_ROOT_FLAGS] = { .type = NLA_BITFIELD32, - .validation_data = &tcaa_root_flags_allowed }, + [TCA_ROOT_FLAGS] = NLA_POLICY_BITFIELD32(TCA_FLAG_LARGE_DUMP_ON), [TCA_ROOT_TIME_DELTA] = { .type = NLA_U32 }, }; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 20577355235a..e29f0f45d688 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -30,6 +30,7 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #include <uapi/linux/netfilter/nf_nat.h> @@ -539,6 +540,7 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, flow_offload_refresh(nf_ft, flow); nf_conntrack_get(&ct->ct_general); nf_ct_set(skb, ct, ctinfo); + nf_ct_acct_update(ct, dir, skb->len); return true; } diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c new file mode 100644 index 000000000000..9c628591f452 --- /dev/null +++ b/net/sched/act_gate.c @@ -0,0 +1,639 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Copyright 2020 NXP */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/skbuff.h> +#include <linux/rtnetlink.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <net/act_api.h> +#include <net/netlink.h> +#include <net/pkt_cls.h> +#include <net/tc_act/tc_gate.h> + +static unsigned int gate_net_id; +static struct tc_action_ops act_gate_ops; + +static ktime_t gate_get_time(struct tcf_gate *gact) +{ + ktime_t mono = ktime_get(); + + switch (gact->tk_offset) { + case TK_OFFS_MAX: + return mono; + default: + return ktime_mono_to_any(mono, gact->tk_offset); + } + + return KTIME_MAX; +} + +static int gate_get_start_time(struct tcf_gate *gact, ktime_t *start) +{ + struct tcf_gate_params *param = &gact->param; + ktime_t now, base, cycle; + u64 n; + + base = ns_to_ktime(param->tcfg_basetime); + now = gate_get_time(gact); + + if (ktime_after(base, now)) { + *start = base; + return 0; + } + + cycle = param->tcfg_cycletime; + + /* cycle time should not be zero */ + if (!cycle) + return -EFAULT; + + n = div64_u64(ktime_sub_ns(now, base), cycle); + *start = ktime_add_ns(base, (n + 1) * cycle); + return 0; +} + +static void gate_start_timer(struct tcf_gate *gact, ktime_t start) +{ + ktime_t expires; + + expires = hrtimer_get_expires(&gact->hitimer); + if (expires == 0) + expires = KTIME_MAX; + + start = min_t(ktime_t, start, expires); + + hrtimer_start(&gact->hitimer, start, HRTIMER_MODE_ABS_SOFT); +} + +static enum hrtimer_restart gate_timer_func(struct hrtimer *timer) +{ + struct tcf_gate *gact = container_of(timer, struct tcf_gate, + hitimer); + struct tcf_gate_params *p = &gact->param; + struct tcfg_gate_entry *next; + ktime_t close_time, now; + + spin_lock(&gact->tcf_lock); + + next = gact->next_entry; + + /* cycle start, clear pending bit, clear total octets */ + gact->current_gate_status = next->gate_state ? GATE_ACT_GATE_OPEN : 0; + gact->current_entry_octets = 0; + gact->current_max_octets = next->maxoctets; + + gact->current_close_time = ktime_add_ns(gact->current_close_time, + next->interval); + + close_time = gact->current_close_time; + + if (list_is_last(&next->list, &p->entries)) + next = list_first_entry(&p->entries, + struct tcfg_gate_entry, list); + else + next = list_next_entry(next, list); + + now = gate_get_time(gact); + + if (ktime_after(now, close_time)) { + ktime_t cycle, base; + u64 n; + + cycle = p->tcfg_cycletime; + base = ns_to_ktime(p->tcfg_basetime); + n = div64_u64(ktime_sub_ns(now, base), cycle); + close_time = ktime_add_ns(base, (n + 1) * cycle); + } + + gact->next_entry = next; + + hrtimer_set_expires(&gact->hitimer, close_time); + + spin_unlock(&gact->tcf_lock); + + return HRTIMER_RESTART; +} + +static int tcf_gate_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + struct tcf_gate *gact = to_gate(a); + + spin_lock(&gact->tcf_lock); + + tcf_lastuse_update(&gact->tcf_tm); + bstats_update(&gact->tcf_bstats, skb); + + if (unlikely(gact->current_gate_status & GATE_ACT_PENDING)) { + spin_unlock(&gact->tcf_lock); + return gact->tcf_action; + } + + if (!(gact->current_gate_status & GATE_ACT_GATE_OPEN)) + goto drop; + + if (gact->current_max_octets >= 0) { + gact->current_entry_octets += qdisc_pkt_len(skb); + if (gact->current_entry_octets > gact->current_max_octets) { + gact->tcf_qstats.overlimits++; + goto drop; + } + } + + spin_unlock(&gact->tcf_lock); + + return gact->tcf_action; +drop: + gact->tcf_qstats.drops++; + spin_unlock(&gact->tcf_lock); + + return TC_ACT_SHOT; +} + +static const struct nla_policy entry_policy[TCA_GATE_ENTRY_MAX + 1] = { + [TCA_GATE_ENTRY_INDEX] = { .type = NLA_U32 }, + [TCA_GATE_ENTRY_GATE] = { .type = NLA_FLAG }, + [TCA_GATE_ENTRY_INTERVAL] = { .type = NLA_U32 }, + [TCA_GATE_ENTRY_IPV] = { .type = NLA_S32 }, + [TCA_GATE_ENTRY_MAX_OCTETS] = { .type = NLA_S32 }, +}; + +static const struct nla_policy gate_policy[TCA_GATE_MAX + 1] = { + [TCA_GATE_PARMS] = { .len = sizeof(struct tc_gate), + .type = NLA_EXACT_LEN }, + [TCA_GATE_PRIORITY] = { .type = NLA_S32 }, + [TCA_GATE_ENTRY_LIST] = { .type = NLA_NESTED }, + [TCA_GATE_BASE_TIME] = { .type = NLA_U64 }, + [TCA_GATE_CYCLE_TIME] = { .type = NLA_U64 }, + [TCA_GATE_CYCLE_TIME_EXT] = { .type = NLA_U64 }, + [TCA_GATE_FLAGS] = { .type = NLA_U32 }, + [TCA_GATE_CLOCKID] = { .type = NLA_S32 }, +}; + +static int fill_gate_entry(struct nlattr **tb, struct tcfg_gate_entry *entry, + struct netlink_ext_ack *extack) +{ + u32 interval = 0; + + entry->gate_state = nla_get_flag(tb[TCA_GATE_ENTRY_GATE]); + + if (tb[TCA_GATE_ENTRY_INTERVAL]) + interval = nla_get_u32(tb[TCA_GATE_ENTRY_INTERVAL]); + + if (interval == 0) { + NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); + return -EINVAL; + } + + entry->interval = interval; + + if (tb[TCA_GATE_ENTRY_IPV]) + entry->ipv = nla_get_s32(tb[TCA_GATE_ENTRY_IPV]); + else + entry->ipv = -1; + + if (tb[TCA_GATE_ENTRY_MAX_OCTETS]) + entry->maxoctets = nla_get_s32(tb[TCA_GATE_ENTRY_MAX_OCTETS]); + else + entry->maxoctets = -1; + + return 0; +} + +static int parse_gate_entry(struct nlattr *n, struct tcfg_gate_entry *entry, + int index, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_GATE_ENTRY_MAX + 1] = { }; + int err; + + err = nla_parse_nested(tb, TCA_GATE_ENTRY_MAX, n, entry_policy, extack); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Could not parse nested entry"); + return -EINVAL; + } + + entry->index = index; + + return fill_gate_entry(tb, entry, extack); +} + +static void release_entry_list(struct list_head *entries) +{ + struct tcfg_gate_entry *entry, *e; + + list_for_each_entry_safe(entry, e, entries, list) { + list_del(&entry->list); + kfree(entry); + } +} + +static int parse_gate_list(struct nlattr *list_attr, + struct tcf_gate_params *sched, + struct netlink_ext_ack *extack) +{ + struct tcfg_gate_entry *entry; + struct nlattr *n; + int err, rem; + int i = 0; + + if (!list_attr) + return -EINVAL; + + nla_for_each_nested(n, list_attr, rem) { + if (nla_type(n) != TCA_GATE_ONE_ENTRY) { + NL_SET_ERR_MSG(extack, "Attribute isn't type 'entry'"); + continue; + } + + entry = kzalloc(sizeof(*entry), GFP_ATOMIC); + if (!entry) { + NL_SET_ERR_MSG(extack, "Not enough memory for entry"); + err = -ENOMEM; + goto release_list; + } + + err = parse_gate_entry(n, entry, i, extack); + if (err < 0) { + kfree(entry); + goto release_list; + } + + list_add_tail(&entry->list, &sched->entries); + i++; + } + + sched->num_entries = i; + + return i; + +release_list: + release_entry_list(&sched->entries); + + return err; +} + +static int tcf_gate_init(struct net *net, struct nlattr *nla, + struct nlattr *est, struct tc_action **a, + int ovr, int bind, bool rtnl_held, + struct tcf_proto *tp, u32 flags, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + enum tk_offsets tk_offset = TK_OFFS_TAI; + struct nlattr *tb[TCA_GATE_MAX + 1]; + struct tcf_chain *goto_ch = NULL; + struct tcf_gate_params *p; + s32 clockid = CLOCK_TAI; + struct tcf_gate *gact; + struct tc_gate *parm; + int ret = 0, err; + u64 basetime = 0; + u32 gflags = 0; + s32 prio = -1; + ktime_t start; + u32 index; + + if (!nla) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_GATE_MAX, nla, gate_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_GATE_PARMS]) + return -EINVAL; + + parm = nla_data(tb[TCA_GATE_PARMS]); + index = parm->index; + + err = tcf_idr_check_alloc(tn, &index, a, bind); + if (err < 0) + return err; + + if (err && bind) + return 0; + + if (!err) { + ret = tcf_idr_create(tn, index, est, a, + &act_gate_ops, bind, false, 0); + if (ret) { + tcf_idr_cleanup(tn, index); + return ret; + } + + ret = ACT_P_CREATED; + } else if (!ovr) { + tcf_idr_release(*a, bind); + return -EEXIST; + } + if (ret == ACT_P_CREATED) { + to_gate(*a)->param.tcfg_clockid = -1; + INIT_LIST_HEAD(&(to_gate(*a)->param.entries)); + } + + if (tb[TCA_GATE_PRIORITY]) + prio = nla_get_s32(tb[TCA_GATE_PRIORITY]); + + if (tb[TCA_GATE_BASE_TIME]) + basetime = nla_get_u64(tb[TCA_GATE_BASE_TIME]); + + if (tb[TCA_GATE_FLAGS]) + gflags = nla_get_u32(tb[TCA_GATE_FLAGS]); + + if (tb[TCA_GATE_CLOCKID]) { + clockid = nla_get_s32(tb[TCA_GATE_CLOCKID]); + switch (clockid) { + case CLOCK_REALTIME: + tk_offset = TK_OFFS_REAL; + break; + case CLOCK_MONOTONIC: + tk_offset = TK_OFFS_MAX; + break; + case CLOCK_BOOTTIME: + tk_offset = TK_OFFS_BOOT; + break; + case CLOCK_TAI: + tk_offset = TK_OFFS_TAI; + break; + default: + NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); + goto release_idr; + } + } + + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + + gact = to_gate(*a); + + spin_lock_bh(&gact->tcf_lock); + p = &gact->param; + + if (tb[TCA_GATE_CYCLE_TIME]) { + p->tcfg_cycletime = nla_get_u64(tb[TCA_GATE_CYCLE_TIME]); + if (!p->tcfg_cycletime_ext) + goto chain_put; + } + + if (tb[TCA_GATE_ENTRY_LIST]) { + err = parse_gate_list(tb[TCA_GATE_ENTRY_LIST], p, extack); + if (err < 0) + goto chain_put; + } + + if (!p->tcfg_cycletime) { + struct tcfg_gate_entry *entry; + ktime_t cycle = 0; + + list_for_each_entry(entry, &p->entries, list) + cycle = ktime_add_ns(cycle, entry->interval); + p->tcfg_cycletime = cycle; + } + + if (tb[TCA_GATE_CYCLE_TIME_EXT]) + p->tcfg_cycletime_ext = + nla_get_u64(tb[TCA_GATE_CYCLE_TIME_EXT]); + + p->tcfg_priority = prio; + p->tcfg_basetime = basetime; + p->tcfg_clockid = clockid; + p->tcfg_flags = gflags; + + gact->tk_offset = tk_offset; + hrtimer_init(&gact->hitimer, clockid, HRTIMER_MODE_ABS_SOFT); + gact->hitimer.function = gate_timer_func; + + err = gate_get_start_time(gact, &start); + if (err < 0) { + NL_SET_ERR_MSG(extack, + "Internal error: failed get start time"); + release_entry_list(&p->entries); + goto chain_put; + } + + gact->current_close_time = start; + gact->current_gate_status = GATE_ACT_GATE_OPEN | GATE_ACT_PENDING; + + gact->next_entry = list_first_entry(&p->entries, + struct tcfg_gate_entry, list); + + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); + + gate_start_timer(gact, start); + + spin_unlock_bh(&gact->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + + if (ret == ACT_P_CREATED) + tcf_idr_insert(tn, *a); + + return ret; + +chain_put: + spin_unlock_bh(&gact->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +release_idr: + tcf_idr_release(*a, bind); + return err; +} + +static void tcf_gate_cleanup(struct tc_action *a) +{ + struct tcf_gate *gact = to_gate(a); + struct tcf_gate_params *p; + + p = &gact->param; + if (p->tcfg_clockid != -1) + hrtimer_cancel(&gact->hitimer); + + release_entry_list(&p->entries); +} + +static int dumping_entry(struct sk_buff *skb, + struct tcfg_gate_entry *entry) +{ + struct nlattr *item; + + item = nla_nest_start_noflag(skb, TCA_GATE_ONE_ENTRY); + if (!item) + return -ENOSPC; + + if (nla_put_u32(skb, TCA_GATE_ENTRY_INDEX, entry->index)) + goto nla_put_failure; + + if (entry->gate_state && nla_put_flag(skb, TCA_GATE_ENTRY_GATE)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_GATE_ENTRY_INTERVAL, entry->interval)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_ENTRY_MAX_OCTETS, entry->maxoctets)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_ENTRY_IPV, entry->ipv)) + goto nla_put_failure; + + return nla_nest_end(skb, item); + +nla_put_failure: + nla_nest_cancel(skb, item); + return -1; +} + +static int tcf_gate_dump(struct sk_buff *skb, struct tc_action *a, + int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_gate *gact = to_gate(a); + struct tc_gate opt = { + .index = gact->tcf_index, + .refcnt = refcount_read(&gact->tcf_refcnt) - ref, + .bindcnt = atomic_read(&gact->tcf_bindcnt) - bind, + }; + struct tcfg_gate_entry *entry; + struct tcf_gate_params *p; + struct nlattr *entry_list; + struct tcf_t t; + + spin_lock_bh(&gact->tcf_lock); + opt.action = gact->tcf_action; + + p = &gact->param; + + if (nla_put(skb, TCA_GATE_PARMS, sizeof(opt), &opt)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_GATE_BASE_TIME, + p->tcfg_basetime, TCA_GATE_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME, + p->tcfg_cycletime, TCA_GATE_PAD)) + goto nla_put_failure; + + if (nla_put_u64_64bit(skb, TCA_GATE_CYCLE_TIME_EXT, + p->tcfg_cycletime_ext, TCA_GATE_PAD)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_CLOCKID, p->tcfg_clockid)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_GATE_FLAGS, p->tcfg_flags)) + goto nla_put_failure; + + if (nla_put_s32(skb, TCA_GATE_PRIORITY, p->tcfg_priority)) + goto nla_put_failure; + + entry_list = nla_nest_start_noflag(skb, TCA_GATE_ENTRY_LIST); + if (!entry_list) + goto nla_put_failure; + + list_for_each_entry(entry, &p->entries, list) { + if (dumping_entry(skb, entry) < 0) + goto nla_put_failure; + } + + nla_nest_end(skb, entry_list); + + tcf_tm_dump(&t, &gact->tcf_tm); + if (nla_put_64bit(skb, TCA_GATE_TM, sizeof(t), &t, TCA_GATE_PAD)) + goto nla_put_failure; + spin_unlock_bh(&gact->tcf_lock); + + return skb->len; + +nla_put_failure: + spin_unlock_bh(&gact->tcf_lock); + nlmsg_trim(skb, b); + return -1; +} + +static int tcf_gate_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + + return tcf_generic_walker(tn, skb, cb, type, ops, extack); +} + +static void tcf_gate_stats_update(struct tc_action *a, u64 bytes, u32 packets, + u64 lastuse, bool hw) +{ + struct tcf_gate *gact = to_gate(a); + struct tcf_t *tm = &gact->tcf_tm; + + tcf_action_update_stats(a, bytes, packets, false, hw); + tm->lastuse = max_t(u64, tm->lastuse, lastuse); +} + +static int tcf_gate_search(struct net *net, struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + + return tcf_idr_search(tn, a, index); +} + +static size_t tcf_gate_get_fill_size(const struct tc_action *act) +{ + return nla_total_size(sizeof(struct tc_gate)); +} + +static struct tc_action_ops act_gate_ops = { + .kind = "gate", + .id = TCA_ID_GATE, + .owner = THIS_MODULE, + .act = tcf_gate_act, + .dump = tcf_gate_dump, + .init = tcf_gate_init, + .cleanup = tcf_gate_cleanup, + .walk = tcf_gate_walker, + .stats_update = tcf_gate_stats_update, + .get_fill_size = tcf_gate_get_fill_size, + .lookup = tcf_gate_search, + .size = sizeof(struct tcf_gate), +}; + +static __net_init int gate_init_net(struct net *net) +{ + struct tc_action_net *tn = net_generic(net, gate_net_id); + + return tc_action_net_init(net, tn, &act_gate_ops); +} + +static void __net_exit gate_exit_net(struct list_head *net_list) +{ + tc_action_net_exit(net_list, gate_net_id); +} + +static struct pernet_operations gate_net_ops = { + .init = gate_init_net, + .exit_batch = gate_exit_net, + .id = &gate_net_id, + .size = sizeof(struct tc_action_net), +}; + +static int __init gate_init_module(void) +{ + return tcf_register_action(&act_gate_ops, &gate_net_ops); +} + +static void __exit gate_cleanup_module(void) +{ + tcf_unregister_action(&act_gate_ops, &gate_net_ops); +} + +module_init(gate_init_module); +module_exit(gate_cleanup_module); +MODULE_LICENSE("GPL v2"); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 0a7ecc292bd3..a00a203b2ef5 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -39,6 +39,7 @@ #include <net/tc_act/tc_skbedit.h> #include <net/tc_act/tc_ct.h> #include <net/tc_act/tc_mpls.h> +#include <net/tc_act/tc_gate.h> #include <net/flow_offload.h> extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; @@ -620,96 +621,42 @@ static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held) static int tcf_block_setup(struct tcf_block *block, struct flow_block_offload *bo); -static void tc_indr_block_cmd(struct net_device *dev, struct tcf_block *block, - flow_indr_block_bind_cb_t *cb, void *cb_priv, - enum flow_block_command command, bool ingress) -{ - struct flow_block_offload bo = { - .command = command, - .binder_type = ingress ? - FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS : - FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS, - .net = dev_net(dev), - .block_shared = tcf_block_non_null_shared(block), - }; - INIT_LIST_HEAD(&bo.cb_list); - - if (!block) - return; - - bo.block = &block->flow_block; - - down_write(&block->cb_lock); - cb(dev, cb_priv, TC_SETUP_BLOCK, &bo); - - tcf_block_setup(block, &bo); - up_write(&block->cb_lock); -} - -static struct tcf_block *tc_dev_block(struct net_device *dev, bool ingress) +static void tcf_block_offload_init(struct flow_block_offload *bo, + struct net_device *dev, + enum flow_block_command command, + enum flow_block_binder_type binder_type, + struct flow_block *flow_block, + bool shared, struct netlink_ext_ack *extack) { - const struct Qdisc_class_ops *cops; - const struct Qdisc_ops *ops; - struct Qdisc *qdisc; - - if (!dev_ingress_queue(dev)) - return NULL; - - qdisc = dev_ingress_queue(dev)->qdisc_sleeping; - if (!qdisc) - return NULL; - - ops = qdisc->ops; - if (!ops) - return NULL; - - if (!ingress && !strcmp("ingress", ops->id)) - return NULL; - - cops = ops->cl_ops; - if (!cops) - return NULL; - - if (!cops->tcf_block) - return NULL; - - return cops->tcf_block(qdisc, - ingress ? TC_H_MIN_INGRESS : TC_H_MIN_EGRESS, - NULL); + bo->net = dev_net(dev); + bo->command = command; + bo->binder_type = binder_type; + bo->block = flow_block; + bo->block_shared = shared; + bo->extack = extack; + INIT_LIST_HEAD(&bo->cb_list); } -static void tc_indr_block_get_and_cmd(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command command) -{ - struct tcf_block *block; - - block = tc_dev_block(dev, true); - tc_indr_block_cmd(dev, block, cb, cb_priv, command, true); - - block = tc_dev_block(dev, false); - tc_indr_block_cmd(dev, block, cb, cb_priv, command, false); -} +static void tcf_block_unbind(struct tcf_block *block, + struct flow_block_offload *bo); -static void tc_indr_block_call(struct tcf_block *block, - struct net_device *dev, - struct tcf_block_ext_info *ei, - enum flow_block_command command, - struct netlink_ext_ack *extack) +static void tc_block_indr_cleanup(struct flow_block_cb *block_cb) { - struct flow_block_offload bo = { - .command = command, - .binder_type = ei->binder_type, - .net = dev_net(dev), - .block = &block->flow_block, - .block_shared = tcf_block_shared(block), - .extack = extack, - }; - INIT_LIST_HEAD(&bo.cb_list); + struct tcf_block *block = block_cb->indr.data; + struct net_device *dev = block_cb->indr.dev; + struct netlink_ext_ack extack = {}; + struct flow_block_offload bo; - flow_indr_block_call(dev, &bo, command, TC_SETUP_BLOCK); - tcf_block_setup(block, &bo); + tcf_block_offload_init(&bo, dev, FLOW_BLOCK_UNBIND, + block_cb->indr.binder_type, + &block->flow_block, tcf_block_shared(block), + &extack); + down_write(&block->cb_lock); + list_move(&block_cb->list, &bo.cb_list); + up_write(&block->cb_lock); + rtnl_lock(); + tcf_block_unbind(block, &bo); + rtnl_unlock(); } static bool tcf_block_offload_in_use(struct tcf_block *block) @@ -726,17 +673,21 @@ static int tcf_block_offload_cmd(struct tcf_block *block, struct flow_block_offload bo = {}; int err; - bo.net = dev_net(dev); - bo.command = command; - bo.binder_type = ei->binder_type; - bo.block = &block->flow_block; - bo.block_shared = tcf_block_shared(block); - bo.extack = extack; - INIT_LIST_HEAD(&bo.cb_list); + tcf_block_offload_init(&bo, dev, command, ei->binder_type, + &block->flow_block, tcf_block_shared(block), + extack); - err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); - if (err < 0) + if (dev->netdev_ops->ndo_setup_tc) + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_BLOCK, &bo); + else + err = flow_indr_dev_setup_offload(dev, TC_SETUP_BLOCK, block, + &bo, tc_block_indr_cleanup); + + if (err < 0) { + if (err != -EOPNOTSUPP) + NL_SET_ERR_MSG(extack, "Driver ndo_setup_tc failed"); return err; + } return tcf_block_setup(block, &bo); } @@ -749,13 +700,13 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, int err; down_write(&block->cb_lock); - if (!dev->netdev_ops->ndo_setup_tc) - goto no_offload_dev_inc; /* If tc offload feature is disabled and the block we try to bind * to already has some offloaded filters, forbid to bind. */ - if (!tc_can_offload(dev) && tcf_block_offload_in_use(block)) { + if (dev->netdev_ops->ndo_setup_tc && + !tc_can_offload(dev) && + tcf_block_offload_in_use(block)) { NL_SET_ERR_MSG(extack, "Bind to offloaded block failed as dev has offload disabled"); err = -EOPNOTSUPP; goto err_unlock; @@ -767,18 +718,15 @@ static int tcf_block_offload_bind(struct tcf_block *block, struct Qdisc *q, if (err) goto err_unlock; - tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack); up_write(&block->cb_lock); return 0; no_offload_dev_inc: - if (tcf_block_offload_in_use(block)) { - err = -EOPNOTSUPP; + if (tcf_block_offload_in_use(block)) goto err_unlock; - } + err = 0; block->nooffloaddevcnt++; - tc_indr_block_call(block, dev, ei, FLOW_BLOCK_BIND, extack); err_unlock: up_write(&block->cb_lock); return err; @@ -791,10 +739,6 @@ static void tcf_block_offload_unbind(struct tcf_block *block, struct Qdisc *q, int err; down_write(&block->cb_lock); - tc_indr_block_call(block, dev, ei, FLOW_BLOCK_UNBIND, NULL); - - if (!dev->netdev_ops->ndo_setup_tc) - goto no_offload_dev_dec; err = tcf_block_offload_cmd(block, dev, ei, FLOW_BLOCK_UNBIND, NULL); if (err == -EOPNOTSUPP) goto no_offload_dev_dec; @@ -1847,7 +1791,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, u32 portid, u32 seq, u16 flags, int event, - bool rtnl_held) + bool terse_dump, bool rtnl_held) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1874,6 +1818,14 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, goto nla_put_failure; if (!fh) { tcm->tcm_handle = 0; + } else if (terse_dump) { + if (tp->ops->terse_dump) { + if (tp->ops->terse_dump(net, tp, fh, skb, tcm, + rtnl_held) < 0) + goto nla_put_failure; + } else { + goto cls_op_not_supp; + } } else { if (tp->ops->dump && tp->ops->dump(net, tp, fh, skb, tcm, rtnl_held) < 0) @@ -1884,6 +1836,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, out_nlmsg_trim: nla_put_failure: +cls_op_not_supp: nlmsg_trim(skb, b); return -1; } @@ -1904,7 +1857,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, event, - rtnl_held) <= 0) { + false, rtnl_held) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -1936,7 +1889,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER, - rtnl_held) <= 0) { + false, rtnl_held) <= 0) { NL_SET_ERR_MSG(extack, "Failed to build del event notification"); kfree_skb(skb); return -EINVAL; @@ -2497,6 +2450,7 @@ struct tcf_dump_args { struct tcf_block *block; struct Qdisc *q; u32 parent; + bool terse_dump; }; static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) @@ -2507,12 +2461,12 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER, true); + RTM_NEWTFILTER, a->terse_dump, true); } static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, struct sk_buff *skb, struct netlink_callback *cb, - long index_start, long *p_index) + long index_start, long *p_index, bool terse) { struct net *net = sock_net(skb->sk); struct tcf_block *block = chain->block; @@ -2541,7 +2495,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, if (tcf_fill_node(net, skb, tp, block, q, parent, NULL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER, true) <= 0) + RTM_NEWTFILTER, false, true) <= 0) goto errout; cb->args[1] = 1; } @@ -2557,6 +2511,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, arg.w.skip = cb->args[1] - 1; arg.w.count = 0; arg.w.cookie = cb->args[2]; + arg.terse_dump = terse; tp->ops->walk(tp, &arg.w, true); cb->args[2] = arg.w.cookie; cb->args[1] = arg.w.count + 1; @@ -2570,6 +2525,10 @@ errout: return false; } +static const struct nla_policy tcf_tfilter_dump_policy[TCA_MAX + 1] = { + [TCA_DUMP_FLAGS] = NLA_POLICY_BITFIELD32(TCA_DUMP_FLAGS_TERSE), +}; + /* called with RTNL */ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) { @@ -2579,6 +2538,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) struct Qdisc *q = NULL; struct tcf_block *block; struct tcmsg *tcm = nlmsg_data(cb->nlh); + bool terse_dump = false; long index_start; long index; u32 parent; @@ -2588,10 +2548,17 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; err = nlmsg_parse_deprecated(cb->nlh, sizeof(*tcm), tca, TCA_MAX, - NULL, cb->extack); + tcf_tfilter_dump_policy, cb->extack); if (err) return err; + if (tca[TCA_DUMP_FLAGS]) { + struct nla_bitfield32 flags = + nla_get_bitfield32(tca[TCA_DUMP_FLAGS]); + + terse_dump = flags.value & TCA_DUMP_FLAGS_TERSE; + } + if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) { block = tcf_block_refcnt_get(net, tcm->tcm_block_index); if (!block) @@ -2649,7 +2616,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) nla_get_u32(tca[TCA_CHAIN]) != chain->index) continue; if (!tcf_chain_dump(chain, q, parent, skb, cb, - index_start, &index)) { + index_start, &index, terse_dump)) { tcf_chain_put(chain); err = -EMSGSIZE; break; @@ -3152,7 +3119,8 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts) if (nest == NULL) goto nla_put_failure; - if (tcf_action_dump(skb, exts->actions, 0, 0) < 0) + if (tcf_action_dump(skb, exts->actions, 0, 0, false) + < 0) goto nla_put_failure; nla_nest_end(skb, nest); } else if (exts->police) { @@ -3176,6 +3144,31 @@ nla_put_failure: } EXPORT_SYMBOL(tcf_exts_dump); +int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts) +{ +#ifdef CONFIG_NET_CLS_ACT + struct nlattr *nest; + + if (!exts->action || !tcf_exts_has_actions(exts)) + return 0; + + nest = nla_nest_start_noflag(skb, exts->action); + if (!nest) + goto nla_put_failure; + + if (tcf_action_dump(skb, exts->actions, 0, 0, true) < 0) + goto nla_put_failure; + nla_nest_end(skb, nest); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +#else + return 0; +#endif +} +EXPORT_SYMBOL(tcf_exts_terse_dump); int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts) { @@ -3523,6 +3516,27 @@ static void tcf_sample_get_group(struct flow_action_entry *entry, #endif } +static void tcf_gate_entry_destructor(void *priv) +{ + struct action_gate_entry *oe = priv; + + kfree(oe); +} + +static int tcf_gate_get_entries(struct flow_action_entry *entry, + const struct tc_action *act) +{ + entry->gate.entries = tcf_gate_get_list(act); + + if (!entry->gate.entries) + return -EINVAL; + + entry->destructor = tcf_gate_entry_destructor; + entry->destructor_priv = entry->gate.entries; + + return 0; +} + static enum flow_action_hw_stats tc_act_hw_stats(u8 hw_stats) { if (WARN_ON_ONCE(hw_stats > TCA_ACT_HW_STATS_ANY)) @@ -3679,6 +3693,17 @@ int tc_setup_flow_action(struct flow_action *flow_action, } else if (is_tcf_skbedit_priority(act)) { entry->id = FLOW_ACTION_PRIORITY; entry->priority = tcf_skbedit_priority(act); + } else if (is_tcf_gate(act)) { + entry->id = FLOW_ACTION_GATE; + entry->gate.index = tcf_gate_index(act); + entry->gate.prio = tcf_gate_prio(act); + entry->gate.basetime = tcf_gate_basetime(act); + entry->gate.cycletime = tcf_gate_cycletime(act); + entry->gate.cycletimeext = tcf_gate_cycletimeext(act); + entry->gate.num_entries = tcf_gate_num_entries(act); + err = tcf_gate_get_entries(entry, act); + if (err) + goto err_out; } else { err = -EOPNOTSUPP; goto err_out_locked; @@ -3739,11 +3764,6 @@ static struct pernet_operations tcf_net_ops = { .size = sizeof(struct tcf_net), }; -static struct flow_indr_block_entry block_entry = { - .cb = tc_indr_block_get_and_cmd, - .list = LIST_HEAD_INIT(block_entry.list), -}; - static int __init tc_filter_init(void) { int err; @@ -3756,8 +3776,6 @@ static int __init tc_filter_init(void) if (err) goto err_register_pernet_subsys; - flow_indr_add_block_cb(&block_entry); - rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, RTNL_FLAG_DOIT_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 74a0febcafb8..b2da37286082 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -272,14 +272,16 @@ static struct cls_fl_filter *fl_lookup_range(struct fl_flow_mask *mask, return NULL; } -static struct cls_fl_filter *fl_lookup(struct fl_flow_mask *mask, - struct fl_flow_key *mkey, - struct fl_flow_key *key) +static noinline_for_stack +struct cls_fl_filter *fl_mask_lookup(struct fl_flow_mask *mask, struct fl_flow_key *key) { + struct fl_flow_key mkey; + + fl_set_masked_key(&mkey, key, mask); if ((mask->flags & TCA_FLOWER_MASK_FLAGS_RANGE)) - return fl_lookup_range(mask, mkey, key); + return fl_lookup_range(mask, &mkey, key); - return __fl_lookup(mask, mkey); + return __fl_lookup(mask, &mkey); } static u16 fl_ct_info_to_flower_map[] = { @@ -299,7 +301,6 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); - struct fl_flow_key skb_mkey; struct fl_flow_key skb_key; struct fl_flow_mask *mask; struct cls_fl_filter *f; @@ -319,9 +320,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, ARRAY_SIZE(fl_ct_info_to_flower_map)); skb_flow_dissect(skb, &mask->dissector, &skb_key, 0); - fl_set_masked_key(&skb_mkey, &skb_key, mask); - - f = fl_lookup(mask, &skb_mkey, &skb_key); + f = fl_mask_lookup(mask, &skb_key); if (f && !tc_skip_sw(f->flags)) { *res = f->res; return tcf_exts_exec(skb, &f->exts, res); @@ -668,6 +667,7 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_MPLS_BOS] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_TC] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_LABEL] = { .type = NLA_U32 }, + [TCA_FLOWER_KEY_MPLS_OPTS] = { .type = NLA_NESTED }, [TCA_FLOWER_KEY_TCP_FLAGS] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_TCP_FLAGS_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_IP_TOS] = { .type = NLA_U8 }, @@ -726,6 +726,15 @@ erspan_opt_policy[TCA_FLOWER_KEY_ENC_OPT_ERSPAN_MAX + 1] = { [TCA_FLOWER_KEY_ENC_OPT_ERSPAN_HWID] = { .type = NLA_U8 }, }; +static const struct nla_policy +mpls_stack_entry_policy[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1] = { + [TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_TC] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL] = { .type = NLA_U32 }, +}; + static void fl_set_key_val(struct nlattr **tb, void *val, int val_type, void *mask, int mask_type, int len) @@ -776,14 +785,157 @@ static int fl_set_key_port_range(struct nlattr **tb, struct fl_flow_key *key, return 0; } +static int fl_set_key_mpls_lse(const struct nlattr *nla_lse, + struct flow_dissector_key_mpls *key_val, + struct flow_dissector_key_mpls *key_mask, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX + 1]; + struct flow_dissector_mpls_lse *lse_mask; + struct flow_dissector_mpls_lse *lse_val; + u8 lse_index; + u8 depth; + int err; + + err = nla_parse_nested(tb, TCA_FLOWER_KEY_MPLS_OPT_LSE_MAX, nla_lse, + mpls_stack_entry_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH]) { + NL_SET_ERR_MSG(extack, "Missing MPLS option \"depth\""); + return -EINVAL; + } + + depth = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH]); + + /* LSE depth starts at 1, for consistency with terminology used by + * RFC 3031 (section 3.9), where depth 0 refers to unlabeled packets. + */ + if (depth < 1 || depth > FLOW_DIS_MPLS_MAX) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH], + "Invalid MPLS depth"); + return -EINVAL; + } + lse_index = depth - 1; + + dissector_set_mpls_lse(key_val, lse_index); + dissector_set_mpls_lse(key_mask, lse_index); + + lse_val = &key_val->ls[lse_index]; + lse_mask = &key_mask->ls[lse_index]; + + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL]) { + lse_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL]); + lse_mask->mpls_ttl = MPLS_TTL_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS]) { + u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS]); + + if (bos & ~MPLS_BOS_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS], + "Bottom Of Stack (BOS) must be 0 or 1"); + return -EINVAL; + } + lse_val->mpls_bos = bos; + lse_mask->mpls_bos = MPLS_BOS_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC]) { + u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC]); + + if (tc & ~MPLS_TC_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_TC], + "Traffic Class (TC) must be between 0 and 7"); + return -EINVAL; + } + lse_val->mpls_tc = tc; + lse_mask->mpls_tc = MPLS_TC_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL]) { + u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL]); + + if (label & ~MPLS_LABEL_MASK) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL], + "Label must be between 0 and 1048575"); + return -EINVAL; + } + lse_val->mpls_label = label; + lse_mask->mpls_label = MPLS_LABEL_MASK; + } + + return 0; +} + +static int fl_set_key_mpls_opts(const struct nlattr *nla_mpls_opts, + struct flow_dissector_key_mpls *key_val, + struct flow_dissector_key_mpls *key_mask, + struct netlink_ext_ack *extack) +{ + struct nlattr *nla_lse; + int rem; + int err; + + if (!(nla_mpls_opts->nla_type & NLA_F_NESTED)) { + NL_SET_ERR_MSG_ATTR(extack, nla_mpls_opts, + "NLA_F_NESTED is missing"); + return -EINVAL; + } + + nla_for_each_nested(nla_lse, nla_mpls_opts, rem) { + if (nla_type(nla_lse) != TCA_FLOWER_KEY_MPLS_OPTS_LSE) { + NL_SET_ERR_MSG_ATTR(extack, nla_lse, + "Invalid MPLS option type"); + return -EINVAL; + } + + err = fl_set_key_mpls_lse(nla_lse, key_val, key_mask, extack); + if (err < 0) + return err; + } + if (rem) { + NL_SET_ERR_MSG(extack, + "Bytes leftover after parsing MPLS options"); + return -EINVAL; + } + + return 0; +} + static int fl_set_key_mpls(struct nlattr **tb, struct flow_dissector_key_mpls *key_val, struct flow_dissector_key_mpls *key_mask, struct netlink_ext_ack *extack) { + struct flow_dissector_mpls_lse *lse_mask; + struct flow_dissector_mpls_lse *lse_val; + + if (tb[TCA_FLOWER_KEY_MPLS_OPTS]) { + if (tb[TCA_FLOWER_KEY_MPLS_TTL] || + tb[TCA_FLOWER_KEY_MPLS_BOS] || + tb[TCA_FLOWER_KEY_MPLS_TC] || + tb[TCA_FLOWER_KEY_MPLS_LABEL]) { + NL_SET_ERR_MSG_ATTR(extack, + tb[TCA_FLOWER_KEY_MPLS_OPTS], + "MPLS label, Traffic Class, Bottom Of Stack and Time To Live must be encapsulated in the MPLS options attribute"); + return -EBADMSG; + } + + return fl_set_key_mpls_opts(tb[TCA_FLOWER_KEY_MPLS_OPTS], + key_val, key_mask, extack); + } + + lse_val = &key_val->ls[0]; + lse_mask = &key_mask->ls[0]; + if (tb[TCA_FLOWER_KEY_MPLS_TTL]) { - key_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]); - key_mask->mpls_ttl = MPLS_TTL_MASK; + lse_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]); + lse_mask->mpls_ttl = MPLS_TTL_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } if (tb[TCA_FLOWER_KEY_MPLS_BOS]) { u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_BOS]); @@ -794,8 +946,10 @@ static int fl_set_key_mpls(struct nlattr **tb, "Bottom Of Stack (BOS) must be 0 or 1"); return -EINVAL; } - key_val->mpls_bos = bos; - key_mask->mpls_bos = MPLS_BOS_MASK; + lse_val->mpls_bos = bos; + lse_mask->mpls_bos = MPLS_BOS_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } if (tb[TCA_FLOWER_KEY_MPLS_TC]) { u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TC]); @@ -806,8 +960,10 @@ static int fl_set_key_mpls(struct nlattr **tb, "Traffic Class (TC) must be between 0 and 7"); return -EINVAL; } - key_val->mpls_tc = tc; - key_mask->mpls_tc = MPLS_TC_MASK; + lse_val->mpls_tc = tc; + lse_mask->mpls_tc = MPLS_TC_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } if (tb[TCA_FLOWER_KEY_MPLS_LABEL]) { u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_LABEL]); @@ -818,8 +974,10 @@ static int fl_set_key_mpls(struct nlattr **tb, "Label must be between 0 and 1048575"); return -EINVAL; } - key_val->mpls_label = label; - key_mask->mpls_label = MPLS_LABEL_MASK; + lse_val->mpls_label = label; + lse_mask->mpls_label = MPLS_LABEL_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } return 0; } @@ -2218,35 +2376,132 @@ static int fl_dump_key_port_range(struct sk_buff *skb, struct fl_flow_key *key, return 0; } +static int fl_dump_key_mpls_opt_lse(struct sk_buff *skb, + struct flow_dissector_key_mpls *mpls_key, + struct flow_dissector_key_mpls *mpls_mask, + u8 lse_index) +{ + struct flow_dissector_mpls_lse *lse_mask = &mpls_mask->ls[lse_index]; + struct flow_dissector_mpls_lse *lse_key = &mpls_key->ls[lse_index]; + int err; + + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_DEPTH, + lse_index + 1); + if (err) + return err; + + if (lse_mask->mpls_ttl) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_TTL, + lse_key->mpls_ttl); + if (err) + return err; + } + if (lse_mask->mpls_bos) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_BOS, + lse_key->mpls_bos); + if (err) + return err; + } + if (lse_mask->mpls_tc) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_TC, + lse_key->mpls_tc); + if (err) + return err; + } + if (lse_mask->mpls_label) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_OPT_LSE_LABEL, + lse_key->mpls_label); + if (err) + return err; + } + + return 0; +} + +static int fl_dump_key_mpls_opts(struct sk_buff *skb, + struct flow_dissector_key_mpls *mpls_key, + struct flow_dissector_key_mpls *mpls_mask) +{ + struct nlattr *opts; + struct nlattr *lse; + u8 lse_index; + int err; + + opts = nla_nest_start(skb, TCA_FLOWER_KEY_MPLS_OPTS); + if (!opts) + return -EMSGSIZE; + + for (lse_index = 0; lse_index < FLOW_DIS_MPLS_MAX; lse_index++) { + if (!(mpls_mask->used_lses & 1 << lse_index)) + continue; + + lse = nla_nest_start(skb, TCA_FLOWER_KEY_MPLS_OPTS_LSE); + if (!lse) { + err = -EMSGSIZE; + goto err_opts; + } + + err = fl_dump_key_mpls_opt_lse(skb, mpls_key, mpls_mask, + lse_index); + if (err) + goto err_opts_lse; + nla_nest_end(skb, lse); + } + nla_nest_end(skb, opts); + + return 0; + +err_opts_lse: + nla_nest_cancel(skb, lse); +err_opts: + nla_nest_cancel(skb, opts); + + return err; +} + static int fl_dump_key_mpls(struct sk_buff *skb, struct flow_dissector_key_mpls *mpls_key, struct flow_dissector_key_mpls *mpls_mask) { + struct flow_dissector_mpls_lse *lse_mask; + struct flow_dissector_mpls_lse *lse_key; int err; - if (!memchr_inv(mpls_mask, 0, sizeof(*mpls_mask))) + if (!mpls_mask->used_lses) return 0; - if (mpls_mask->mpls_ttl) { + + lse_mask = &mpls_mask->ls[0]; + lse_key = &mpls_key->ls[0]; + + /* For backward compatibility, don't use the MPLS nested attributes if + * the rule can be expressed using the old attributes. + */ + if (mpls_mask->used_lses & ~1 || + (!lse_mask->mpls_ttl && !lse_mask->mpls_bos && + !lse_mask->mpls_tc && !lse_mask->mpls_label)) + return fl_dump_key_mpls_opts(skb, mpls_key, mpls_mask); + + if (lse_mask->mpls_ttl) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TTL, - mpls_key->mpls_ttl); + lse_key->mpls_ttl); if (err) return err; } - if (mpls_mask->mpls_tc) { + if (lse_mask->mpls_tc) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TC, - mpls_key->mpls_tc); + lse_key->mpls_tc); if (err) return err; } - if (mpls_mask->mpls_label) { + if (lse_mask->mpls_label) { err = nla_put_u32(skb, TCA_FLOWER_KEY_MPLS_LABEL, - mpls_key->mpls_label); + lse_key->mpls_label); if (err) return err; } - if (mpls_mask->mpls_bos) { + if (lse_mask->mpls_bos) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_BOS, - mpls_key->mpls_bos); + lse_key->mpls_bos); if (err) return err; } @@ -2768,6 +3023,48 @@ nla_put_failure: return -1; } +static int fl_terse_dump(struct net *net, struct tcf_proto *tp, void *fh, + struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) +{ + struct cls_fl_filter *f = fh; + struct nlattr *nest; + bool skip_hw; + + if (!f) + return skb->len; + + t->tcm_handle = f->handle; + + nest = nla_nest_start_noflag(skb, TCA_OPTIONS); + if (!nest) + goto nla_put_failure; + + spin_lock(&tp->lock); + + skip_hw = tc_skip_hw(f->flags); + + if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags)) + goto nla_put_failure_locked; + + spin_unlock(&tp->lock); + + if (!skip_hw) + fl_hw_update_stats(tp, f, rtnl_held); + + if (tcf_exts_terse_dump(skb, &f->exts)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + + return skb->len; + +nla_put_failure_locked: + spin_unlock(&tp->lock); +nla_put_failure: + nla_nest_cancel(skb, nest); + return -1; +} + static int fl_tmplt_dump(struct sk_buff *skb, struct net *net, void *tmplt_priv) { struct fl_flow_tmplt *tmplt = tmplt_priv; @@ -2832,6 +3129,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .hw_add = fl_hw_add, .hw_del = fl_hw_del, .dump = fl_dump, + .terse_dump = fl_terse_dump, .bind_class = fl_bind_class, .tmplt_create = fl_tmplt_create, .tmplt_destroy = fl_tmplt_destroy, diff --git a/net/sched/em_ipt.c b/net/sched/em_ipt.c index eecfe072c508..18755d29fd15 100644 --- a/net/sched/em_ipt.c +++ b/net/sched/em_ipt.c @@ -199,7 +199,7 @@ static void em_ipt_destroy(struct tcf_ematch *em) im->match->destroy(&par); } module_put(im->match->me); - kfree((void *)im); + kfree(im); } static int em_ipt_match(struct sk_buff *skb, struct tcf_ematch *em, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 0d99df1e764d..9a3449b56bd6 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -32,6 +32,8 @@ #include <net/pkt_sched.h> #include <net/pkt_cls.h> +#include <trace/events/qdisc.h> + /* Short review. @@ -1283,6 +1285,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, } qdisc_hash_add(sch, false); + trace_qdisc_create(ops, dev, parent); return sch; diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 1496e87cd07b..60f8ae578819 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -584,26 +584,48 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, return drop; } -static void cake_update_flowkeys(struct flow_keys *keys, +static bool cake_update_flowkeys(struct flow_keys *keys, const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct nf_conntrack_tuple tuple = {}; - bool rev = !skb->_nfct; + bool rev = !skb->_nfct, upd = false; + __be32 ip; if (tc_skb_protocol(skb) != htons(ETH_P_IP)) - return; + return false; if (!nf_ct_get_tuple_skb(&tuple, skb)) - return; + return false; - keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; - keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; + ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip; + if (ip != keys->addrs.v4addrs.src) { + keys->addrs.v4addrs.src = ip; + upd = true; + } + ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip; + if (ip != keys->addrs.v4addrs.dst) { + keys->addrs.v4addrs.dst = ip; + upd = true; + } if (keys->ports.ports) { - keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all; - keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all; + __be16 port; + + port = rev ? tuple.dst.u.all : tuple.src.u.all; + if (port != keys->ports.src) { + keys->ports.src = port; + upd = true; + } + port = rev ? tuple.src.u.all : tuple.dst.u.all; + if (port != keys->ports.dst) { + port = keys->ports.dst; + upd = true; + } } + return upd; +#else + return false; #endif } @@ -624,23 +646,36 @@ static bool cake_ddst(int flow_mode) static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, int flow_mode, u16 flow_override, u16 host_override) { + bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS)); + bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS)); + bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG); u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0; u16 reduced_hash, srchost_idx, dsthost_idx; struct flow_keys keys, host_keys; + bool use_skbhash = skb->l4_hash; if (unlikely(flow_mode == CAKE_FLOW_NONE)) return 0; - /* If both overrides are set we can skip packet dissection entirely */ - if ((flow_override || !(flow_mode & CAKE_FLOW_FLOWS)) && - (host_override || !(flow_mode & CAKE_FLOW_HOSTS))) + /* If both overrides are set, or we can use the SKB hash and nat mode is + * disabled, we can skip packet dissection entirely. If nat mode is + * enabled there's another check below after doing the conntrack lookup. + */ + if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts) goto skip_hash; skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); - if (flow_mode & CAKE_FLOW_NAT_FLAG) - cake_update_flowkeys(&keys, skb); + /* Don't use the SKB hash if we change the lookup keys from conntrack */ + if (nat_enabled && cake_update_flowkeys(&keys, skb)) + use_skbhash = false; + + /* If we can still use the SKB hash and don't need the host hash, we can + * skip the rest of the hashing procedure + */ + if (use_skbhash && !hash_hosts) + goto skip_hash; /* flow_hash_from_keys() sorts the addresses by value, so we have * to preserve their order in a separate data structure to treat @@ -679,12 +714,14 @@ static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, /* This *must* be after the above switch, since as a * side-effect it sorts the src and dst addresses. */ - if (flow_mode & CAKE_FLOW_FLOWS) + if (hash_flows && !use_skbhash) flow_hash = flow_hash_from_keys(&keys); skip_hash: if (flow_override) flow_hash = flow_override - 1; + else if (use_skbhash) + flow_hash = skb->hash; if (host_override) { dsthost_hash = host_override - 1; srchost_hash = host_override - 1; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 1bcf8fbfd40e..bd618b00d319 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -131,7 +131,6 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx, } struct choke_skb_cb { - u16 classid; u8 keys_valid; struct flow_keys_digest keys; }; @@ -142,11 +141,6 @@ static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data; } -static inline void choke_set_classid(struct sk_buff *skb, u16 classid) -{ - choke_skb_cb(skb)->classid = classid; -} - /* * Compare flow of two packets * Returns true only if source and destination address and port match. diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 4c060134c736..8f06a808c59a 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -66,22 +66,27 @@ static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb) * in linear list (head,tail), otherwise are placed in a rbtree (t_root). */ struct fq_flow { +/* First cache line : used in fq_gc(), fq_enqueue(), fq_dequeue() */ struct rb_root t_root; struct sk_buff *head; /* list of skbs for this flow : first skb */ union { struct sk_buff *tail; /* last skb in the list */ - unsigned long age; /* jiffies when flow was emptied, for gc */ + unsigned long age; /* (jiffies | 1UL) when flow was emptied, for gc */ }; struct rb_node fq_node; /* anchor in fq_root[] trees */ struct sock *sk; + u32 socket_hash; /* sk_hash */ int qlen; /* number of packets in flow queue */ + +/* Second cache line, used in fq_dequeue() */ int credit; - u32 socket_hash; /* sk_hash */ - struct fq_flow *next; /* next pointer in RR lists, or &detached */ + /* 32bit hole on 64bit arches */ + + struct fq_flow *next; /* next pointer in RR lists */ struct rb_node rate_node; /* anchor in q->delayed tree */ u64 time_next_packet; -}; +} ____cacheline_aligned_in_smp; struct fq_flow_head { struct fq_flow *first; @@ -95,6 +100,7 @@ struct fq_sched_data { struct rb_root delayed; /* for rate limited flows */ u64 time_next_delayed_flow; + u64 ktime_cache; /* copy of last ktime_get_ns() */ unsigned long unthrottle_latency_ns; struct fq_flow internal; /* for non classified or high prio packets */ @@ -104,12 +110,13 @@ struct fq_sched_data { u32 flow_plimit; /* max packets per flow */ unsigned long flow_max_rate; /* optional max rate per flow */ u64 ce_threshold; + u64 horizon; /* horizon in ns */ u32 orphan_mask; /* mask for orphaned skb */ u32 low_rate_threshold; struct rb_root *fq_root; u8 rate_enable; u8 fq_trees_log; - + u8 horizon_drop; u32 flows; u32 inactive_flows; u32 throttled_flows; @@ -118,6 +125,8 @@ struct fq_sched_data { u64 stat_internal_packets; u64 stat_throttled; u64 stat_ce_mark; + u64 stat_horizon_drops; + u64 stat_horizon_caps; u64 stat_flows_plimit; u64 stat_pkts_too_long; u64 stat_allocation_errors; @@ -126,20 +135,25 @@ struct fq_sched_data { struct qdisc_watchdog watchdog; }; -/* special value to mark a detached flow (not on old/new list) */ -static struct fq_flow detached, throttled; - +/* + * f->tail and f->age share the same location. + * We can use the low order bit to differentiate if this location points + * to a sk_buff or contains a jiffies value, if we force this value to be odd. + * This assumes f->tail low order bit must be 0 since alignof(struct sk_buff) >= 2 + */ static void fq_flow_set_detached(struct fq_flow *f) { - f->next = &detached; - f->age = jiffies; + f->age = jiffies | 1UL; } static bool fq_flow_is_detached(const struct fq_flow *f) { - return f->next == &detached; + return !!(f->age & 1UL); } +/* special value to mark a throttled flow (not on old/new list) */ +static struct fq_flow throttled; + static bool fq_flow_is_throttled(const struct fq_flow *f) { return f->next == &throttled; @@ -204,9 +218,10 @@ static void fq_gc(struct fq_sched_data *q, struct rb_root *root, struct sock *sk) { - struct fq_flow *f, *tofree[FQ_GC_MAX]; struct rb_node **p, *parent; - int fcnt = 0; + void *tofree[FQ_GC_MAX]; + struct fq_flow *f; + int i, fcnt = 0; p = &root->rb_node; parent = NULL; @@ -229,15 +244,18 @@ static void fq_gc(struct fq_sched_data *q, p = &parent->rb_left; } + if (!fcnt) + return; + + for (i = fcnt; i > 0; ) { + f = tofree[--i]; + rb_erase(&f->fq_node, root); + } q->flows -= fcnt; q->inactive_flows -= fcnt; q->stat_gc_flows += fcnt; - while (fcnt) { - struct fq_flow *f = tofree[--fcnt]; - rb_erase(&f->fq_node, root); - kmem_cache_free(fq_flow_cachep, f); - } + kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree); } static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) @@ -370,19 +388,17 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow, } } -/* remove one skb from head of flow queue */ -static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) +/* Remove one skb from flow queue. + * This skb must be the return value of prior fq_peek(). + */ +static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow, + struct sk_buff *skb) { - struct sk_buff *skb = fq_peek(flow); - - if (skb) { - fq_erase_head(sch, flow, skb); - skb_mark_not_on_list(skb); - flow->qlen--; - qdisc_qstats_backlog_dec(sch, skb); - sch->q.qlen--; - } - return skb; + fq_erase_head(sch, flow, skb); + skb_mark_not_on_list(skb); + flow->qlen--; + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; } static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) @@ -390,8 +406,6 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) struct rb_node **p, *parent; struct sk_buff *head, *aux; - fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns(); - head = flow->head; if (!head || fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) { @@ -419,6 +433,12 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) rb_insert_color(&skb->rbnode, &flow->t_root); } +static bool fq_packet_beyond_horizon(const struct sk_buff *skb, + const struct fq_sched_data *q) +{ + return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon)); +} + static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -428,6 +448,28 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(sch->q.qlen >= sch->limit)) return qdisc_drop(skb, sch, to_free); + if (!skb->tstamp) { + fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns(); + } else { + /* Check if packet timestamp is too far in the future. + * Try first if our cached value, to avoid ktime_get_ns() + * cost in most cases. + */ + if (fq_packet_beyond_horizon(skb, q)) { + /* Refresh our cache and check another time */ + q->ktime_cache = ktime_get_ns(); + if (fq_packet_beyond_horizon(skb, q)) { + if (q->horizon_drop) { + q->stat_horizon_drops++; + return qdisc_drop(skb, sch, to_free); + } + q->stat_horizon_caps++; + skb->tstamp = q->ktime_cache + q->horizon; + } + } + fq_skb_cb(skb)->time_to_send = skb->tstamp; + } + f = fq_classify(skb, q); if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) { q->stat_flows_plimit++; @@ -494,11 +536,13 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) if (!sch->q.qlen) return NULL; - skb = fq_dequeue_head(sch, &q->internal); - if (skb) + skb = fq_peek(&q->internal); + if (unlikely(skb)) { + fq_dequeue_skb(sch, &q->internal, skb); goto out; + } - now = ktime_get_ns(); + q->ktime_cache = now = ktime_get_ns(); fq_check_throttled(q, now); begin: head = &q->new_flows; @@ -532,14 +576,13 @@ begin: fq_flow_set_throttled(q, f); goto begin; } + prefetch(&skb->end); if ((s64)(now - time_next_packet - q->ce_threshold) > 0) { INET_ECN_set_ce(skb); q->stat_ce_mark++; } - } - - skb = fq_dequeue_head(sch, f); - if (!skb) { + fq_dequeue_skb(sch, f, skb); + } else { head->first = f->next; /* force a pass through old_flows to prevent starvation */ if ((head == &q->new_flows) && q->old_flows.first) { @@ -550,7 +593,6 @@ begin: } goto begin; } - prefetch(&skb->end); plen = qdisc_pkt_len(skb); f->credit -= plen; @@ -753,6 +795,8 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = { [TCA_FQ_LOW_RATE_THRESHOLD] = { .type = NLA_U32 }, [TCA_FQ_CE_THRESHOLD] = { .type = NLA_U32 }, [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 }, + [TCA_FQ_HORIZON] = { .type = NLA_U32 }, + [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 }, }; static int fq_change(struct Qdisc *sch, struct nlattr *opt, @@ -842,7 +886,15 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_TIMER_SLACK]) q->timer_slack = nla_get_u32(tb[TCA_FQ_TIMER_SLACK]); + if (tb[TCA_FQ_HORIZON]) + q->horizon = (u64)NSEC_PER_USEC * + nla_get_u32(tb[TCA_FQ_HORIZON]); + + if (tb[TCA_FQ_HORIZON_DROP]) + q->horizon_drop = nla_get_u8(tb[TCA_FQ_HORIZON_DROP]); + if (!err) { + sch_tree_unlock(sch); err = fq_resize(sch, fq_log); sch_tree_lock(sch); @@ -895,6 +947,9 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt, q->timer_slack = 10 * NSEC_PER_USEC; /* 10 usec of hrtimer slack */ + q->horizon = 10ULL * NSEC_PER_SEC; /* 10 seconds */ + q->horizon_drop = 1; /* by default, drop packets beyond horizon */ + /* Default ce_threshold of 4294 seconds */ q->ce_threshold = (u64)NSEC_PER_USEC * ~0U; @@ -912,6 +967,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct fq_sched_data *q = qdisc_priv(sch); u64 ce_threshold = q->ce_threshold; + u64 horizon = q->horizon; struct nlattr *opts; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); @@ -921,6 +977,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) /* TCA_FQ_FLOW_DEFAULT_RATE is not used anymore */ do_div(ce_threshold, NSEC_PER_USEC); + do_div(horizon, NSEC_PER_USEC); if (nla_put_u32(skb, TCA_FQ_PLIMIT, sch->limit) || nla_put_u32(skb, TCA_FQ_FLOW_PLIMIT, q->flow_plimit) || @@ -936,7 +993,9 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb) q->low_rate_threshold) || nla_put_u32(skb, TCA_FQ_CE_THRESHOLD, (u32)ce_threshold) || nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log) || - nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack)) + nla_put_u32(skb, TCA_FQ_TIMER_SLACK, q->timer_slack) || + nla_put_u32(skb, TCA_FQ_HORIZON, (u32)horizon) || + nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop)) goto nla_put_failure; return nla_nest_end(skb, opts); @@ -967,6 +1026,8 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) st.unthrottle_latency_ns = min_t(unsigned long, q->unthrottle_latency_ns, ~0U); st.ce_mark = q->stat_ce_mark; + st.horizon_drops = q->stat_horizon_drops; + st.horizon_caps = q->stat_horizon_caps; sch_tree_unlock(sch); return gnet_stats_copy_app(d, &st, sizeof(st)); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 2efd5b61acef..b19a0021a0bd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -794,6 +794,9 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { }; EXPORT_SYMBOL(pfifo_fast_ops); +static struct lock_class_key qdisc_tx_busylock; +static struct lock_class_key qdisc_running_key; + struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack) @@ -846,9 +849,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, } spin_lock_init(&sch->busylock); + lockdep_set_class(&sch->busylock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + /* seqlock has the same scope of busylock, for NOLOCK qdisc */ spin_lock_init(&sch->seqlock); + lockdep_set_class(&sch->busylock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + seqcount_init(&sch->running); + lockdep_set_class(&sch->running, + dev->qdisc_running_key ?: &qdisc_running_key); sch->ops = ops; sch->flags = ops->static_flags; @@ -859,12 +870,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, dev_hold(dev); refcount_set(&sch->refcnt, 1); - if (sch != &noop_qdisc) { - lockdep_set_class(&sch->busylock, &dev->qdisc_tx_busylock_key); - lockdep_set_class(&sch->seqlock, &dev->qdisc_tx_busylock_key); - lockdep_set_class(&sch->running, &dev->qdisc_running_key); - } - return sch; errout1: kfree(p); @@ -891,8 +896,10 @@ struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, } sch->parent = parentid; - if (!ops->init || ops->init(sch, NULL, extack) == 0) + if (!ops->init || ops->init(sch, NULL, extack) == 0) { + trace_qdisc_create(ops, dev_queue->dev, parentid); return sch; + } qdisc_put(sch); return NULL; @@ -906,6 +913,8 @@ void qdisc_reset(struct Qdisc *qdisc) const struct Qdisc_ops *ops = qdisc->ops; struct sk_buff *skb, *tmp; + trace_qdisc_reset(qdisc); + if (ops->reset) ops->reset(qdisc); @@ -944,7 +953,6 @@ static void qdisc_free_cb(struct rcu_head *head) static void qdisc_destroy(struct Qdisc *qdisc) { const struct Qdisc_ops *ops = qdisc->ops; - struct sk_buff *skb, *tmp; #ifdef CONFIG_NET_SCHED qdisc_hash_del(qdisc); @@ -952,23 +960,16 @@ static void qdisc_destroy(struct Qdisc *qdisc) qdisc_put_stab(rtnl_dereference(qdisc->stab)); #endif gen_kill_estimator(&qdisc->rate_est); - if (ops->reset) - ops->reset(qdisc); + + qdisc_reset(qdisc); + if (ops->destroy) ops->destroy(qdisc); module_put(ops->owner); dev_put(qdisc_dev(qdisc)); - skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) { - __skb_unlink(skb, &qdisc->gso_skb); - kfree_skb_list(skb); - } - - skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) { - __skb_unlink(skb, &qdisc->skb_bad_txq); - kfree_skb_list(skb); - } + trace_qdisc_destroy(qdisc); call_rcu(&qdisc->rcu, qdisc_free_cb); } @@ -1037,10 +1038,9 @@ static void attach_one_default_qdisc(struct net_device *dev, ops = &pfifo_fast_ops; qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL); - if (!qdisc) { - netdev_info(dev, "activation failed\n"); + if (!qdisc) return; - } + if (!netif_is_multiqueue(dev)) qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; dev_queue->qdisc_sleeping = qdisc; @@ -1065,6 +1065,18 @@ static void attach_default_qdiscs(struct net_device *dev) qdisc->ops->attach(qdisc); } } + + /* Detect default qdisc setup/init failed and fallback to "noqueue" */ + if (dev->qdisc == &noop_qdisc) { + netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n", + default_qdisc_ops->id, noqueue_qdisc_ops.id); + dev->priv_flags |= IFF_NO_QUEUE; + netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); + dev->qdisc = txq->qdisc_sleeping; + qdisc_refcount_inc(dev->qdisc); + dev->priv_flags ^= IFF_NO_QUEUE; + } + #ifdef CONFIG_NET_SCHED if (dev->qdisc != &noop_qdisc) qdisc_hash_add(dev->qdisc, false); @@ -1116,6 +1128,28 @@ void dev_activate(struct net_device *dev) } EXPORT_SYMBOL(dev_activate); +static void qdisc_deactivate(struct Qdisc *qdisc) +{ + bool nolock = qdisc->flags & TCQ_F_NOLOCK; + + if (qdisc->flags & TCQ_F_BUILTIN) + return; + if (test_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state)) + return; + + if (nolock) + spin_lock_bh(&qdisc->seqlock); + spin_lock_bh(qdisc_lock(qdisc)); + + set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); + + qdisc_reset(qdisc); + + spin_unlock_bh(qdisc_lock(qdisc)); + if (nolock) + spin_unlock_bh(&qdisc->seqlock); +} + static void dev_deactivate_queue(struct net_device *dev, struct netdev_queue *dev_queue, void *_qdisc_default) @@ -1125,21 +1159,8 @@ static void dev_deactivate_queue(struct net_device *dev, qdisc = rtnl_dereference(dev_queue->qdisc); if (qdisc) { - bool nolock = qdisc->flags & TCQ_F_NOLOCK; - - if (nolock) - spin_lock_bh(&qdisc->seqlock); - spin_lock_bh(qdisc_lock(qdisc)); - - if (!(qdisc->flags & TCQ_F_BUILTIN)) - set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state); - + qdisc_deactivate(qdisc); rcu_assign_pointer(dev_queue->qdisc, qdisc_default); - qdisc_reset(qdisc); - - spin_unlock_bh(qdisc_lock(qdisc)); - if (nolock) - spin_unlock_bh(&qdisc->seqlock); } } @@ -1170,16 +1191,6 @@ static bool some_qdisc_is_busy(struct net_device *dev) return false; } -static void dev_qdisc_reset(struct net_device *dev, - struct netdev_queue *dev_queue, - void *none) -{ - struct Qdisc *qdisc = dev_queue->qdisc_sleeping; - - if (qdisc) - qdisc_reset(qdisc); -} - /** * dev_deactivate_many - deactivate transmissions on several devices * @head: list of devices to deactivate @@ -1216,12 +1227,6 @@ void dev_deactivate_many(struct list_head *head) */ schedule_timeout_uninterruptible(1); } - /* The new qdisc is assigned at this point so we can safely - * unwind stale skb lists and qdisc statistics - */ - netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL); - if (dev_ingress_queue(dev)) - dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL); } } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index c7de47c942e3..555a1b9e467f 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -48,7 +48,7 @@ struct red_sched_data { struct Qdisc *qdisc; }; -static const u32 red_supported_flags = TC_RED_HISTORIC_FLAGS | TC_RED_NODROP; +#define TC_RED_SUPPORTED_FLAGS (TC_RED_HISTORIC_FLAGS | TC_RED_NODROP) static inline int red_use_ecn(struct red_sched_data *q) { @@ -212,8 +212,7 @@ static const struct nla_policy red_policy[TCA_RED_MAX + 1] = { [TCA_RED_PARMS] = { .len = sizeof(struct tc_red_qopt) }, [TCA_RED_STAB] = { .len = RED_STAB_SIZE }, [TCA_RED_MAX_P] = { .type = NLA_U32 }, - [TCA_RED_FLAGS] = { .type = NLA_BITFIELD32, - .validation_data = &red_supported_flags }, + [TCA_RED_FLAGS] = NLA_POLICY_BITFIELD32(TC_RED_SUPPORTED_FLAGS), }; static int red_change(struct Qdisc *sch, struct nlattr *opt, @@ -248,7 +247,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, return -EINVAL; err = red_get_flags(ctl->flags, TC_RED_HISTORIC_FLAGS, - tb[TCA_RED_FLAGS], red_supported_flags, + tb[TCA_RED_FLAGS], TC_RED_SUPPORTED_FLAGS, &flags_bf, &userbits, extack); if (err) return err; @@ -372,7 +371,7 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) if (nla_put(skb, TCA_RED_PARMS, sizeof(opt), &opt) || nla_put_u32(skb, TCA_RED_MAX_P, q->parms.max_P) || nla_put_bitfield32(skb, TCA_RED_FLAGS, - q->flags, red_supported_flags)) + q->flags, TC_RED_SUPPORTED_FLAGS)) goto nla_put_failure; return nla_nest_end(skb, opts); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 437079a4883d..72315137d7e7 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -432,7 +432,7 @@ void sctp_assoc_set_primary(struct sctp_association *asoc, changeover = 1 ; asoc->peer.primary_path = transport; - sctp_ulpevent_nofity_peer_addr_change(transport, + sctp_ulpevent_notify_peer_addr_change(transport, SCTP_ADDR_MADE_PRIM, 0); /* Set a default msg_name for events. */ @@ -574,7 +574,7 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc, asoc->peer.transport_count--; - sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_REMOVED, 0); + sctp_ulpevent_notify_peer_addr_change(peer, SCTP_ADDR_REMOVED, 0); sctp_transport_free(peer); } @@ -714,7 +714,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list); asoc->peer.transport_count++; - sctp_ulpevent_nofity_peer_addr_change(peer, SCTP_ADDR_ADDED, 0); + sctp_ulpevent_notify_peer_addr_change(peer, SCTP_ADDR_ADDED, 0); /* If we do not yet have a primary path, set one. */ if (!asoc->peer.primary_path) { @@ -840,7 +840,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, * to the user. */ if (ulp_notify) - sctp_ulpevent_nofity_peer_addr_change(transport, + sctp_ulpevent_notify_peer_addr_change(transport, spc_state, error); /* Select new active and retran paths. */ diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index c87af430107a..ccfa0ab3e7f4 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -1032,6 +1032,7 @@ static const struct proto_ops inet6_seqpacket_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT + .compat_ioctl = inet6_compat_ioctl, .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 827a9903ee28..d57e1a002ffc 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -972,23 +972,22 @@ int sctp_asconf_mgmt(struct sctp_sock *sp, struct sctp_sockaddr_entry *addrw) * it. * * sk The sk of the socket - * addrs The pointer to the addresses in user land + * addrs The pointer to the addresses * addrssize Size of the addrs buffer * op Operation to perform (add or remove, see the flags of * sctp_bindx) * * Returns 0 if ok, <0 errno code on error. */ -static int sctp_setsockopt_bindx(struct sock *sk, - struct sockaddr __user *addrs, - int addrs_size, int op) +static int sctp_setsockopt_bindx_kernel(struct sock *sk, + struct sockaddr *addrs, int addrs_size, + int op) { - struct sockaddr *kaddrs; int err; int addrcnt = 0; int walk_size = 0; struct sockaddr *sa_addr; - void *addr_buf; + void *addr_buf = addrs; struct sctp_af *af; pr_debug("%s: sk:%p addrs:%p addrs_size:%d opt:%d\n", @@ -997,17 +996,10 @@ static int sctp_setsockopt_bindx(struct sock *sk, if (unlikely(addrs_size <= 0)) return -EINVAL; - kaddrs = memdup_user(addrs, addrs_size); - if (IS_ERR(kaddrs)) - return PTR_ERR(kaddrs); - /* Walk through the addrs buffer and count the number of addresses. */ - addr_buf = kaddrs; while (walk_size < addrs_size) { - if (walk_size + sizeof(sa_family_t) > addrs_size) { - kfree(kaddrs); + if (walk_size + sizeof(sa_family_t) > addrs_size) return -EINVAL; - } sa_addr = addr_buf; af = sctp_get_af_specific(sa_addr->sa_family); @@ -1015,10 +1007,8 @@ static int sctp_setsockopt_bindx(struct sock *sk, /* If the address family is not supported or if this address * causes the address buffer to overflow return EINVAL. */ - if (!af || (walk_size + af->sockaddr_len) > addrs_size) { - kfree(kaddrs); + if (!af || (walk_size + af->sockaddr_len) > addrs_size) return -EINVAL; - } addrcnt++; addr_buf += af->sockaddr_len; walk_size += af->sockaddr_len; @@ -1029,31 +1019,48 @@ static int sctp_setsockopt_bindx(struct sock *sk, case SCTP_BINDX_ADD_ADDR: /* Allow security module to validate bindx addresses. */ err = security_sctp_bind_connect(sk, SCTP_SOCKOPT_BINDX_ADD, - (struct sockaddr *)kaddrs, - addrs_size); + addrs, addrs_size); if (err) - goto out; - err = sctp_bindx_add(sk, kaddrs, addrcnt); + return err; + err = sctp_bindx_add(sk, addrs, addrcnt); if (err) - goto out; - err = sctp_send_asconf_add_ip(sk, kaddrs, addrcnt); - break; - + return err; + return sctp_send_asconf_add_ip(sk, addrs, addrcnt); case SCTP_BINDX_REM_ADDR: - err = sctp_bindx_rem(sk, kaddrs, addrcnt); + err = sctp_bindx_rem(sk, addrs, addrcnt); if (err) - goto out; - err = sctp_send_asconf_del_ip(sk, kaddrs, addrcnt); - break; + return err; + return sctp_send_asconf_del_ip(sk, addrs, addrcnt); default: - err = -EINVAL; - break; + return -EINVAL; } +} -out: +static int sctp_setsockopt_bindx(struct sock *sk, + struct sockaddr __user *addrs, + int addrs_size, int op) +{ + struct sockaddr *kaddrs; + int err; + + kaddrs = memdup_user(addrs, addrs_size); + if (IS_ERR(kaddrs)) + return PTR_ERR(kaddrs); + err = sctp_setsockopt_bindx_kernel(sk, kaddrs, addrs_size, op); kfree(kaddrs); + return err; +} +static int sctp_bind_add(struct sock *sk, struct sockaddr *addrs, + int addrlen) +{ + int err; + + lock_sock(sk); + err = sctp_setsockopt_bindx_kernel(sk, addrs, addrlen, + SCTP_BINDX_ADD_ADDR); + release_sock(sk); return err; } @@ -9625,6 +9632,7 @@ struct proto sctp_prot = { .sendmsg = sctp_sendmsg, .recvmsg = sctp_recvmsg, .bind = sctp_bind, + .bind_add = sctp_bind_add, .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, @@ -9667,6 +9675,7 @@ struct proto sctpv6_prot = { .sendmsg = sctp_sendmsg, .recvmsg = sctp_recvmsg, .bind = sctp_bind, + .bind_add = sctp_bind_add, .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 4740aa70e652..c16c80963e55 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -43,20 +43,15 @@ static unsigned long max_autoclose_max = ? UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ; static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); +static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos); static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_auth(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table sctp_table[] = { { @@ -343,8 +338,7 @@ static struct ctl_table sctp_net_table[] = { }; static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; struct ctl_table tbl; @@ -389,8 +383,7 @@ static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, } static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; unsigned int min = *(unsigned int *) ctl->extra1; @@ -418,8 +411,7 @@ static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, } static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; unsigned int min = *(unsigned int *) ctl->extra1; @@ -447,8 +439,7 @@ static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, } static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write) pr_warn_once("Changing rto_alpha or rto_beta may lead to " @@ -458,8 +449,7 @@ static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, } static int proc_sctp_do_auth(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; struct ctl_table tbl; diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 77d5c36a8991..0c3d2b4d7321 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -336,7 +336,7 @@ fail: return NULL; } -void sctp_ulpevent_nofity_peer_addr_change(struct sctp_transport *transport, +void sctp_ulpevent_notify_peer_addr_change(struct sctp_transport *transport, int state, int error) { struct sctp_association *asoc = transport->asoc; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6fd44bdb0fc3..903321543838 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -337,50 +337,61 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } -/* register a new rmb, send confirm_rkey msg to register with peer */ -static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc, - bool conf_rkey) -{ - if (!rmb_desc->wr_reg) { - /* register memory region for new rmb */ - if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) { - rmb_desc->regerr = 1; - return -EFAULT; - } - rmb_desc->wr_reg = 1; +/* register the new rmb on all links */ +static int smcr_lgr_reg_rmbs(struct smc_link *link, + struct smc_buf_desc *rmb_desc) +{ + struct smc_link_group *lgr = link->lgr; + int i, rc = 0; + + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (rc) + return rc; + /* protect against parallel smc_llc_cli_rkey_exchange() and + * parallel smcr_link_reg_rmb() + */ + mutex_lock(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_ACTIVE) + continue; + rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc); + if (rc) + goto out; } - if (!conf_rkey) - return 0; + /* exchange confirm_rkey msg with peer */ - if (smc_llc_do_confirm_rkey(link, rmb_desc)) { - rmb_desc->regerr = 1; - return -EFAULT; + rc = smc_llc_do_confirm_rkey(link, rmb_desc); + if (rc) { + rc = -EFAULT; + goto out; } - return 0; + rmb_desc->is_conf_rkey = true; +out: + mutex_unlock(&lgr->llc_conf_mutex); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + return rc; } -static int smc_clnt_conf_first_link(struct smc_sock *smc) +static int smcr_clnt_conf_first_link(struct smc_sock *smc) { - struct net *net = sock_net(smc->clcsock->sk); - struct smc_link_group *lgr = smc->conn.lgr; - struct smc_link *link; - int rest; + struct smc_link *link = smc->conn.lnk; + struct smc_llc_qentry *qentry; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; /* receive CONFIRM LINK request from server over RoCE fabric */ - rest = wait_for_completion_interruptible_timeout( - &link->llc_confirm, - SMC_LLC_WAIT_FIRST_TIME); - if (rest <= 0) { + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } - - if (link->llc_confirm_rc) + smc_llc_save_peer_uid(qentry); + rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ); + smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); + if (rc) return SMC_CLC_DECL_RMBE_EC; rc = smc_ib_modify_qp_rts(link); @@ -389,34 +400,34 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc) smc_wr_remember_qp_attr(link); - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; + /* confirm_rkey is implicit on 1st contact */ + smc->conn.rmb_desc->is_conf_rkey = true; + /* send CONFIRM LINK response over RoCE fabric */ rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP); if (rc < 0) return SMC_CLC_DECL_TIMEOUT_CL; - /* receive ADD LINK request from server over RoCE fabric */ - rest = wait_for_completion_interruptible_timeout(&link->llc_add, - SMC_LLC_WAIT_TIME); - if (rest <= 0) { + smc_llc_link_active(link); + smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); + + /* optional 2nd link, receive ADD LINK request from server */ + qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc; + if (rc == -EAGAIN) + rc = 0; /* no DECLINE received, go with one link */ + return rc; } - - /* send add link reject message, only one link supported for now */ - rc = smc_llc_send_add_link(link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_RESP); - if (rc < 0) - return SMC_CLC_DECL_TIMEOUT_AL; - - smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); - + smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl); + smc_llc_cli_add_link(link, qentry); return 0; } @@ -596,8 +607,8 @@ static int smc_connect_rdma(struct smc_sock *smc, struct smc_clc_msg_accept_confirm *aclc, struct smc_init_info *ini) { + int i, reason_code = 0; struct smc_link *link; - int reason_code = 0; ini->is_smcd = false; ini->ib_lcl = &aclc->lcl; @@ -610,10 +621,28 @@ static int smc_connect_rdma(struct smc_sock *smc, mutex_unlock(&smc_client_lgr_pending); return reason_code; } - link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK]; smc_conn_save_peer_info(smc, aclc); + if (ini->cln_first_contact == SMC_FIRST_CONTACT) { + link = smc->conn.lnk; + } else { + /* set link that was assigned by server */ + link = NULL; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *l = &smc->conn.lgr->lnk[i]; + + if (l->peer_qpn == ntoh24(aclc->qpn)) { + link = l; + break; + } + } + if (!link) + return smc_connect_abort(smc, SMC_CLC_DECL_NOSRVLINK, + ini->cln_first_contact); + smc->conn.lnk = link; + } + /* create send buffer and rmb */ if (smc_buf_create(smc, false)) return smc_connect_abort(smc, SMC_CLC_DECL_MEM, @@ -622,7 +651,7 @@ static int smc_connect_rdma(struct smc_sock *smc, if (ini->cln_first_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, aclc); - if (smc_rmb_rtoken_handling(&smc->conn, aclc)) + if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK, ini->cln_first_contact); @@ -634,7 +663,7 @@ static int smc_connect_rdma(struct smc_sock *smc, return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK, ini->cln_first_contact); } else { - if (smc_reg_rmb(link, smc->conn.rmb_desc, true)) + if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB, ini->cln_first_contact); } @@ -649,7 +678,9 @@ static int smc_connect_rdma(struct smc_sock *smc, if (ini->cln_first_contact == SMC_FIRST_CONTACT) { /* QP confirmation over RoCE fabric */ - reason_code = smc_clnt_conf_first_link(smc); + smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); + reason_code = smcr_clnt_conf_first_link(smc); + smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); if (reason_code) return smc_connect_abort(smc, reason_code, ini->cln_first_contact); @@ -999,17 +1030,13 @@ void smc_close_non_accepted(struct sock *sk) sock_put(sk); /* final sock_put */ } -static int smc_serv_conf_first_link(struct smc_sock *smc) +static int smcr_serv_conf_first_link(struct smc_sock *smc) { - struct net *net = sock_net(smc->clcsock->sk); - struct smc_link_group *lgr = smc->conn.lgr; - struct smc_link *link; - int rest; + struct smc_link *link = smc->conn.lnk; + struct smc_llc_qentry *qentry; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; - - if (smc_reg_rmb(link, smc->conn.rmb_desc, false)) + if (smcr_link_reg_rmb(link, smc->conn.rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; /* send CONFIRM LINK request to client over the RoCE fabric */ @@ -1018,40 +1045,29 @@ static int smc_serv_conf_first_link(struct smc_sock *smc) return SMC_CLC_DECL_TIMEOUT_CL; /* receive CONFIRM LINK response from client over the RoCE fabric */ - rest = wait_for_completion_interruptible_timeout( - &link->llc_confirm_resp, - SMC_LLC_WAIT_FIRST_TIME); - if (rest <= 0) { + qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { struct smc_clc_msg_decline dclc; rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc; } - - if (link->llc_confirm_resp_rc) + smc_llc_save_peer_uid(qentry); + rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP); + smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl); + if (rc) return SMC_CLC_DECL_RMBE_EC; - /* send ADD LINK request to client over the RoCE fabric */ - rc = smc_llc_send_add_link(link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_REQ); - if (rc < 0) - return SMC_CLC_DECL_TIMEOUT_AL; - - /* receive ADD LINK response from client over the RoCE fabric */ - rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp, - SMC_LLC_WAIT_TIME); - if (rest <= 0) { - struct smc_clc_msg_decline dclc; - - rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc), - SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT); - return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc; - } + /* confirm_rkey is implicit on 1st contact */ + smc->conn.rmb_desc->is_conf_rkey = true; - smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time); + smc_llc_link_active(link); + smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE); + /* initial contact - try to establish second link */ + smc_llc_srv_add_link(link); return 0; } @@ -1194,10 +1210,10 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, /* listen worker: register buffers */ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact) { - struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + struct smc_connection *conn = &new_smc->conn; if (local_contact != SMC_FIRST_CONTACT) { - if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true)) + if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc)) return SMC_CLC_DECL_ERR_REGRMB; } smc_rmb_sync_sg_for_device(&new_smc->conn); @@ -1210,13 +1226,13 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, struct smc_clc_msg_accept_confirm *cclc, int local_contact) { - struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = new_smc->conn.lnk; int reason_code = 0; if (local_contact == SMC_FIRST_CONTACT) smc_link_save_peer_info(link, cclc); - if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) { + if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc)) { reason_code = SMC_CLC_DECL_ERR_RTOK; goto decline; } @@ -1227,7 +1243,9 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc, goto decline; } /* QP confirmation over RoCE fabric */ - reason_code = smc_serv_conf_first_link(new_smc); + smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK); + reason_code = smcr_serv_conf_first_link(new_smc); + smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl); if (reason_code) goto decline; } diff --git a/net/smc/smc.h b/net/smc/smc.h index be11ba41190f..6f1c42da7a4c 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -121,6 +121,7 @@ enum smc_urg_state { struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ + struct smc_link *lnk; /* assigned SMC-R link */ u32 alert_token_local; /* unique conn. id */ u8 peer_rmbe_idx; /* from tcp handshake */ int peer_rmbe_size; /* size of peer rx buffer */ @@ -142,6 +143,9 @@ struct smc_connection { * .prod cf. TCP snd_nxt * .cons cf. TCP sends ack */ + union smc_host_cursor local_tx_ctrl_fin; + /* prod crsr - confirmed by peer + */ union smc_host_cursor tx_curs_prep; /* tx - prepared data * snd_max..wmem_alloc */ @@ -153,6 +157,7 @@ struct smc_connection { */ atomic_t sndbuf_space; /* remaining space in sndbuf */ u16 tx_cdc_seq; /* sequence # for CDC send */ + u16 tx_cdc_seq_fin; /* sequence # - tx completed */ spinlock_t send_lock; /* protect wr_sends */ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ @@ -183,12 +188,14 @@ struct smc_connection { spinlock_t acurs_lock; /* protect cursors */ #endif struct work_struct close_work; /* peer sent some closing */ + struct work_struct abort_work; /* abort the connection */ struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */ u8 rx_off; /* receive offset: * 0 for SMC-R, 32 for SMC-D */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ + u8 out_of_sync : 1; /* out of sync with peer */ }; struct smc_sock { /* smc sock container */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 164f1584861b..a47e8855e045 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -47,17 +47,20 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, /* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */ smp_mb__after_atomic(); smc_curs_copy(&conn->tx_curs_fin, &cdcpend->cursor, conn); + smc_curs_copy(&conn->local_tx_ctrl_fin, &cdcpend->p_cursor, + conn); + conn->tx_cdc_seq_fin = cdcpend->ctrl_seq; } smc_tx_sndbuf_nonfull(smc); bh_unlock_sock(&smc->sk); } int smc_cdc_get_free_slot(struct smc_connection *conn, + struct smc_link *link, struct smc_wr_buf **wr_buf, struct smc_rdma_wr **wr_rdma_buf, struct smc_cdc_tx_pend **pend) { - struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; int rc; rc = smc_wr_tx_get_free_slot(link, smc_cdc_tx_handler, wr_buf, @@ -91,12 +94,10 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend) { + struct smc_link *link = conn->lnk; union smc_host_cursor cfed; - struct smc_link *link; int rc; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; - smc_cdc_add_pending_send(conn, pend); conn->tx_cdc_seq++; @@ -106,22 +107,60 @@ int smc_cdc_msg_send(struct smc_connection *conn, if (!rc) { smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn); conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0; + } else { + conn->tx_cdc_seq--; + conn->local_tx_ctrl.seqno = conn->tx_cdc_seq; } return rc; } +/* send a validation msg indicating the move of a conn to an other QP link */ +int smcr_cdc_msg_send_validation(struct smc_connection *conn, + struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf) +{ + struct smc_host_cdc_msg *local = &conn->local_tx_ctrl; + struct smc_link *link = conn->lnk; + struct smc_cdc_msg *peer; + int rc; + + peer = (struct smc_cdc_msg *)wr_buf; + peer->common.type = local->common.type; + peer->len = local->len; + peer->seqno = htons(conn->tx_cdc_seq_fin); /* seqno last compl. tx */ + peer->token = htonl(local->token); + peer->prod_flags.failover_validation = 1; + + rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend); + return rc; +} + static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn) { struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; + struct smc_link *link; + bool again = false; int rc; - rc = smc_cdc_get_free_slot(conn, &wr_buf, NULL, &pend); +again: + link = conn->lnk; + rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend); if (rc) return rc; spin_lock_bh(&conn->send_lock); + if (link != conn->lnk) { + /* link of connection changed, try again one time*/ + spin_unlock_bh(&conn->send_lock); + smc_wr_tx_put_slot(link, + (struct smc_wr_tx_pend_priv *)pend); + if (again) + return -ENOLINK; + again = true; + goto again; + } rc = smc_cdc_msg_send(conn, wr_buf, pend); spin_unlock_bh(&conn->send_lock); return rc; @@ -165,7 +204,7 @@ static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend) void smc_cdc_tx_dismiss_slots(struct smc_connection *conn) { - struct smc_link *link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + struct smc_link *link = conn->lnk; smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE, smc_cdc_tx_filter, smc_cdc_tx_dismisser, @@ -239,6 +278,28 @@ static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc, sk_send_sigurg(&smc->sk); } +static void smc_cdc_msg_validate(struct smc_sock *smc, struct smc_cdc_msg *cdc, + struct smc_link *link) +{ + struct smc_connection *conn = &smc->conn; + u16 recv_seq = ntohs(cdc->seqno); + s16 diff; + + /* check that seqnum was seen before */ + diff = conn->local_rx_ctrl.seqno - recv_seq; + if (diff < 0) { /* diff larger than 0x7fff */ + /* drop connection */ + conn->out_of_sync = 1; /* prevent any further receives */ + spin_lock_bh(&conn->send_lock); + conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; + conn->lnk = link; + spin_unlock_bh(&conn->send_lock); + sock_hold(&smc->sk); /* sock_put in abort_work */ + if (!schedule_work(&conn->abort_work)) + sock_put(&smc->sk); + } +} + static void smc_cdc_msg_recv_action(struct smc_sock *smc, struct smc_cdc_msg *cdc) { @@ -369,16 +430,19 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf) read_lock_bh(&lgr->conns_lock); conn = smc_lgr_find_conn(ntohl(cdc->token), lgr); read_unlock_bh(&lgr->conns_lock); - if (!conn) + if (!conn || conn->out_of_sync) return; smc = container_of(conn, struct smc_sock, conn); - if (!cdc->prod_flags.failover_validation) { - if (smc_cdc_before(ntohs(cdc->seqno), - conn->local_rx_ctrl.seqno)) - /* received seqno is old */ - return; + if (cdc->prod_flags.failover_validation) { + smc_cdc_msg_validate(smc, cdc, link); + return; } + if (smc_cdc_before(ntohs(cdc->seqno), + conn->local_rx_ctrl.seqno)) + /* received seqno is old */ + return; + smc_cdc_msg_recv(smc, cdc); } diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h index 861dc24c588c..0a0a89abd38b 100644 --- a/net/smc/smc_cdc.h +++ b/net/smc/smc_cdc.h @@ -97,23 +97,6 @@ static inline void smc_curs_add(int size, union smc_host_cursor *curs, } } -/* SMC cursors are 8 bytes long and require atomic reading and writing */ -static inline u64 smc_curs_read(union smc_host_cursor *curs, - struct smc_connection *conn) -{ -#ifndef KERNEL_HAS_ATOMIC64 - unsigned long flags; - u64 ret; - - spin_lock_irqsave(&conn->acurs_lock, flags); - ret = curs->acurs; - spin_unlock_irqrestore(&conn->acurs_lock, flags); - return ret; -#else - return atomic64_read(&curs->acurs); -#endif -} - /* Copy cursor src into tgt */ static inline void smc_curs_copy(union smc_host_cursor *tgt, union smc_host_cursor *src, @@ -304,6 +287,7 @@ struct smc_cdc_tx_pend { }; int smc_cdc_get_free_slot(struct smc_connection *conn, + struct smc_link *link, struct smc_wr_buf **wr_buf, struct smc_rdma_wr **wr_rdma_buf, struct smc_cdc_tx_pend **pend); @@ -312,6 +296,9 @@ int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf, struct smc_cdc_tx_pend *pend); int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn); int smcd_cdc_msg_send(struct smc_connection *conn); +int smcr_cdc_msg_send_validation(struct smc_connection *conn, + struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf); int smc_cdc_init(void) __init; void smcd_cdc_rx_init(struct smc_connection *conn); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index ea0068f0173c..d5627df24215 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -496,7 +496,7 @@ int smc_clc_send_confirm(struct smc_sock *smc) sizeof(SMCD_EYECATCHER)); } else { /* SMC-R specific settings */ - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + link = conn->lnk; memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); cclc.hdr.path = SMC_TYPE_R; @@ -508,13 +508,13 @@ int smc_clc_send_confirm(struct smc_sock *smc) ETH_ALEN); hton24(cclc.qpn, link->roce_qp->qp_num); cclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); cclc.rmbe_idx = 1; /* for now: 1 RMB = 1 RMBE */ cclc.rmbe_alert_token = htonl(conn->alert_token_local); cclc.qp_mtu = min(link->path_mtu, link->peer_mtu); cclc.rmbe_size = conn->rmbe_size_short; cclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(cclc.psn, link->psn_initial); memcpy(cclc.smcr_trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); @@ -572,7 +572,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); aclc.hdr.path = SMC_TYPE_R; - link = &conn->lgr->lnk[SMC_SINGLE_LINK]; + link = conn->lnk; memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); memcpy(&aclc.lcl.gid, link->gid, SMC_GID_SIZE); @@ -580,13 +580,13 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact) ETH_ALEN); hton24(aclc.qpn, link->roce_qp->qp_num); aclc.rmb_rkey = - htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(conn->rmb_desc->mr_rx[link->link_idx]->rkey); aclc.rmbe_idx = 1; /* as long as 1 RMB = 1 RMBE */ aclc.rmbe_alert_token = htonl(conn->alert_token_local); aclc.qp_mtu = link->path_mtu; aclc.rmbe_size = conn->rmbe_size_short, aclc.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address - (conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + (conn->rmb_desc->sgt[link->link_idx].sgl)); hton24(aclc.psn, link->psn_initial); memcpy(aclc.smcr_trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index ca209272e5fa..465876701b75 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -44,6 +44,8 @@ #define SMC_CLC_DECL_DIFFPREFIX 0x03070000 /* IP prefix / subnet mismatch */ #define SMC_CLC_DECL_GETVLANERR 0x03080000 /* err to get vlan id of ip device*/ #define SMC_CLC_DECL_ISMVLANERR 0x03090000 /* err to reg vlan id on ism dev */ +#define SMC_CLC_DECL_NOACTLINK 0x030a0000 /* no active smc-r link in lgr */ +#define SMC_CLC_DECL_NOSRVLINK 0x030b0000 /* SMC-R link from srv not found */ #define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ #define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */ #define SMC_CLC_DECL_INTERR 0x09990000 /* internal error */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 824c5211b027..7964a21e5e6f 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -44,10 +44,20 @@ static struct smc_lgr_list smc_lgr_list = { /* established link groups */ static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */ static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted); +struct smc_ib_up_work { + struct work_struct work; + struct smc_link_group *lgr; + struct smc_ib_device *smcibdev; + u8 ibport; +}; + static void smc_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc); static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft); +static void smc_link_up_work(struct work_struct *work); +static void smc_link_down_work(struct work_struct *work); + /* return head of link group list and its lock for a given link group */ static inline struct list_head *smc_lgr_list_head(struct smc_link_group *lgr, spinlock_t **lgr_lock) @@ -111,16 +121,60 @@ static void smc_lgr_add_alert_token(struct smc_connection *conn) rb_insert_color(&conn->alert_node, &conn->lgr->conns_all); } +/* assign an SMC-R link to the connection */ +static int smcr_lgr_conn_assign_link(struct smc_connection *conn, bool first) +{ + enum smc_link_state expected = first ? SMC_LNK_ACTIVATING : + SMC_LNK_ACTIVE; + int i, j; + + /* do link balancing */ + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &conn->lgr->lnk[i]; + + if (lnk->state != expected || lnk->link_is_asym) + continue; + if (conn->lgr->role == SMC_CLNT) { + conn->lnk = lnk; /* temporary, SMC server assigns link*/ + break; + } + if (conn->lgr->conns_num % 2) { + for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) { + struct smc_link *lnk2; + + lnk2 = &conn->lgr->lnk[j]; + if (lnk2->state == expected && + !lnk2->link_is_asym) { + conn->lnk = lnk2; + break; + } + } + } + if (!conn->lnk) + conn->lnk = lnk; + break; + } + if (!conn->lnk) + return SMC_CLC_DECL_NOACTLINK; + return 0; +} + /* Register connection in link group by assigning an alert token * registered in a search tree. * Requires @conns_lock * Note that '0' is a reserved value and not assigned. */ -static void smc_lgr_register_conn(struct smc_connection *conn) +static int smc_lgr_register_conn(struct smc_connection *conn, bool first) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); static atomic_t nexttoken = ATOMIC_INIT(0); + int rc; + if (!conn->lgr->is_smcd) { + rc = smcr_lgr_conn_assign_link(conn, first); + if (rc) + return rc; + } /* find a new alert_token_local value not yet used by some connection * in this link group */ @@ -132,6 +186,7 @@ static void smc_lgr_register_conn(struct smc_connection *conn) } smc_lgr_add_alert_token(conn); conn->lgr->conns_num++; + return 0; } /* Unregister connection and reset the alert token of the given connection< @@ -166,27 +221,33 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn) void smc_lgr_cleanup_early(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; + struct list_head *lgr_list; + spinlock_t *lgr_lock; if (!lgr) return; smc_conn_free(conn); - smc_lgr_forget(lgr); + lgr_list = smc_lgr_list_head(lgr, &lgr_lock); + spin_lock_bh(lgr_lock); + /* do not use this link group for new connections */ + if (!list_empty(lgr_list)) + list_del_init(lgr_list); + spin_unlock_bh(lgr_lock); smc_lgr_schedule_free_work_fast(lgr); } -/* Send delete link, either as client to request the initiation - * of the DELETE LINK sequence from server; or as server to - * initiate the delete processing. See smc_llc_rx_delete_link(). - */ -static int smc_link_send_delete(struct smc_link *lnk, bool orderly) +static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr) { - if (lnk->state == SMC_LNK_ACTIVE && - !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, orderly)) { - smc_llc_link_deleting(lnk); - return 0; + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + + if (smc_link_usable(lnk)) + lnk->state = SMC_LNK_INACTIVE; } - return -ENOTCONN; + wake_up_interruptible_all(&lgr->llc_waiter); } static void smc_lgr_free(struct smc_link_group *lgr); @@ -197,7 +258,6 @@ static void smc_lgr_free_work(struct work_struct *work) struct smc_link_group, free_work); spinlock_t *lgr_lock; - struct smc_link *lnk; bool conns; smc_lgr_list_head(lgr, &lgr_lock); @@ -214,26 +274,17 @@ static void smc_lgr_free_work(struct work_struct *work) return; } list_del_init(&lgr->list); /* remove from smc_lgr_list */ - - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - if (!lgr->is_smcd && !lgr->terminating) { - /* try to send del link msg, on error free lgr immediately */ - if (lnk->state == SMC_LNK_ACTIVE && - !smc_link_send_delete(lnk, true)) { - /* reschedule in case we never receive a response */ - smc_lgr_schedule_free_work(lgr); - spin_unlock_bh(lgr_lock); - return; - } - } lgr->freeing = 1; /* this instance does the freeing, no new schedule */ spin_unlock_bh(lgr_lock); cancel_delayed_work(&lgr->free_work); - if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(lnk); + if (!lgr->is_smcd && !lgr->terminating) + smc_llc_send_link_delete_all(lgr, true, + SMC_LLC_DEL_PROG_INIT_TERM); if (lgr->is_smcd && !lgr->terminating) smc_ism_signal_shutdown(lgr); + if (!lgr->is_smcd) + smcr_lgr_link_deactivate_all(lgr); smc_lgr_free(lgr); } @@ -245,6 +296,89 @@ static void smc_lgr_terminate_work(struct work_struct *work) __smc_lgr_terminate(lgr, true); } +/* return next unique link id for the lgr */ +static u8 smcr_next_link_id(struct smc_link_group *lgr) +{ + u8 link_id; + int i; + + while (1) { + link_id = ++lgr->next_link_id; + if (!link_id) /* skip zero as link_id */ + link_id = ++lgr->next_link_id; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (smc_link_usable(&lgr->lnk[i]) && + lgr->lnk[i].link_id == link_id) + continue; + } + break; + } + return link_id; +} + +int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, + u8 link_idx, struct smc_init_info *ini) +{ + u8 rndvec[3]; + int rc; + + get_device(&ini->ib_dev->ibdev->dev); + atomic_inc(&ini->ib_dev->lnk_cnt); + lnk->state = SMC_LNK_ACTIVATING; + lnk->link_id = smcr_next_link_id(lgr); + lnk->lgr = lgr; + lnk->link_idx = link_idx; + lnk->smcibdev = ini->ib_dev; + lnk->ibport = ini->ib_port; + lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; + smc_llc_link_set_uid(lnk); + INIT_WORK(&lnk->link_down_wrk, smc_link_down_work); + if (!ini->ib_dev->initialized) { + rc = (int)smc_ib_setup_per_ibdev(ini->ib_dev); + if (rc) + goto out; + } + get_random_bytes(rndvec, sizeof(rndvec)); + lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + + (rndvec[2] << 16); + rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, + ini->vlan_id, lnk->gid, &lnk->sgid_index); + if (rc) + goto out; + rc = smc_llc_link_init(lnk); + if (rc) + goto out; + rc = smc_wr_alloc_link_mem(lnk); + if (rc) + goto clear_llc_lnk; + rc = smc_ib_create_protection_domain(lnk); + if (rc) + goto free_link_mem; + rc = smc_ib_create_queue_pair(lnk); + if (rc) + goto dealloc_pd; + rc = smc_wr_create_link(lnk); + if (rc) + goto destroy_qp; + return 0; + +destroy_qp: + smc_ib_destroy_queue_pair(lnk); +dealloc_pd: + smc_ib_dealloc_protection_domain(lnk); +free_link_mem: + smc_wr_free_link_mem(lnk); +clear_llc_lnk: + smc_llc_link_clear(lnk, false); +out: + put_device(&ini->ib_dev->ibdev->dev); + memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; + if (!atomic_dec_return(&ini->ib_dev->lnk_cnt)) + wake_up(&ini->ib_dev->lnks_deleted); + return rc; +} + /* create a new SMC link group */ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) { @@ -252,7 +386,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) struct list_head *lgr_list; struct smc_link *lnk; spinlock_t *lgr_lock; - u8 rndvec[3]; + u8 link_idx; int rc = 0; int i; @@ -274,13 +408,14 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->freefast = 0; lgr->freeing = 0; lgr->vlan_id = ini->vlan_id; - rwlock_init(&lgr->sndbufs_lock); - rwlock_init(&lgr->rmbs_lock); + mutex_init(&lgr->sndbufs_lock); + mutex_init(&lgr->rmbs_lock); rwlock_init(&lgr->conns_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); } + lgr->next_link_id = 0; smc_lgr_list.num += SMC_LGR_NUM_INCR; memcpy(&lgr->id, (u8 *)&smc_lgr_list.num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); @@ -297,48 +432,21 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) atomic_inc(&ini->ism_dev->lgr_cnt); } else { /* SMC-R specific settings */ - get_device(&ini->ib_dev->ibdev->dev); lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer, SMC_SYSTEMID_LEN); + memcpy(lgr->pnet_id, ini->ib_dev->pnetid[ini->ib_port - 1], + SMC_MAX_PNETID_LEN); + smc_llc_lgr_init(lgr, smc); - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - /* initialize link */ - lnk->state = SMC_LNK_ACTIVATING; - lnk->link_id = SMC_SINGLE_LINK; - lnk->smcibdev = ini->ib_dev; - lnk->ibport = ini->ib_port; - lgr_list = &smc_lgr_list.list; - lgr_lock = &smc_lgr_list.lock; - lnk->path_mtu = - ini->ib_dev->pattr[ini->ib_port - 1].active_mtu; - if (!ini->ib_dev->initialized) - smc_ib_setup_per_ibdev(ini->ib_dev); - get_random_bytes(rndvec, sizeof(rndvec)); - lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + - (rndvec[2] << 16); - rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport, - ini->vlan_id, lnk->gid, - &lnk->sgid_index); + link_idx = SMC_SINGLE_LINK; + lnk = &lgr->lnk[link_idx]; + rc = smcr_link_init(lgr, lnk, link_idx, ini); if (rc) goto free_lgr; - rc = smc_llc_link_init(lnk); - if (rc) - goto free_lgr; - rc = smc_wr_alloc_link_mem(lnk); - if (rc) - goto clear_llc_lnk; - rc = smc_ib_create_protection_domain(lnk); - if (rc) - goto free_link_mem; - rc = smc_ib_create_queue_pair(lnk); - if (rc) - goto dealloc_pd; - rc = smc_wr_create_link(lnk); - if (rc) - goto destroy_qp; + lgr_list = &smc_lgr_list.list; + lgr_lock = &smc_lgr_list.lock; atomic_inc(&lgr_cnt); - atomic_inc(&ini->ib_dev->lnk_cnt); } smc->conn.lgr = lgr; spin_lock_bh(lgr_lock); @@ -346,14 +454,6 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) spin_unlock_bh(lgr_lock); return 0; -destroy_qp: - smc_ib_destroy_queue_pair(lnk); -dealloc_pd: - smc_ib_dealloc_protection_domain(lnk); -free_link_mem: - smc_wr_free_link_mem(lnk); -clear_llc_lnk: - smc_llc_link_clear(lnk); free_lgr: kfree(lgr); ism_put_vlan: @@ -369,29 +469,186 @@ out: return rc; } +static int smc_write_space(struct smc_connection *conn) +{ + int buffer_len = conn->peer_rmbe_size; + union smc_host_cursor prod; + union smc_host_cursor cons; + int space; + + smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn); + smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); + /* determine rx_buf space */ + space = buffer_len - smc_curs_diff(buffer_len, &cons, &prod); + return space; +} + +static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend, + struct smc_wr_buf *wr_buf) +{ + struct smc_connection *conn = &smc->conn; + union smc_host_cursor cons, fin; + int rc = 0; + int diff; + + smc_curs_copy(&conn->tx_curs_sent, &conn->tx_curs_fin, conn); + smc_curs_copy(&fin, &conn->local_tx_ctrl_fin, conn); + /* set prod cursor to old state, enforce tx_rdma_writes() */ + smc_curs_copy(&conn->local_tx_ctrl.prod, &fin, conn); + smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn); + + if (smc_curs_comp(conn->peer_rmbe_size, &cons, &fin) < 0) { + /* cons cursor advanced more than fin, and prod was set + * fin above, so now prod is smaller than cons. Fix that. + */ + diff = smc_curs_diff(conn->peer_rmbe_size, &fin, &cons); + smc_curs_add(conn->sndbuf_desc->len, + &conn->tx_curs_sent, diff); + smc_curs_add(conn->sndbuf_desc->len, + &conn->tx_curs_fin, diff); + + smp_mb__before_atomic(); + atomic_add(diff, &conn->sndbuf_space); + smp_mb__after_atomic(); + + smc_curs_add(conn->peer_rmbe_size, + &conn->local_tx_ctrl.prod, diff); + smc_curs_add(conn->peer_rmbe_size, + &conn->local_tx_ctrl_fin, diff); + } + /* recalculate, value is used by tx_rdma_writes() */ + atomic_set(&smc->conn.peer_rmbe_space, smc_write_space(conn)); + + if (smc->sk.sk_state != SMC_INIT && + smc->sk.sk_state != SMC_CLOSED) { + rc = smcr_cdc_msg_send_validation(conn, pend, wr_buf); + if (!rc) { + schedule_delayed_work(&conn->tx_work, 0); + smc->sk.sk_data_ready(&smc->sk); + } + } else { + smc_wr_tx_put_slot(conn->lnk, + (struct smc_wr_tx_pend_priv *)pend); + } + return rc; +} + +struct smc_link *smc_switch_conns(struct smc_link_group *lgr, + struct smc_link *from_lnk, bool is_dev_err) +{ + struct smc_link *to_lnk = NULL; + struct smc_cdc_tx_pend *pend; + struct smc_connection *conn; + struct smc_wr_buf *wr_buf; + struct smc_sock *smc; + struct rb_node *node; + int i, rc = 0; + + /* link is inactive, wake up tx waiters */ + smc_wr_wakeup_tx_wait(from_lnk); + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_ACTIVE || + i == from_lnk->link_idx) + continue; + if (is_dev_err && from_lnk->smcibdev == lgr->lnk[i].smcibdev && + from_lnk->ibport == lgr->lnk[i].ibport) { + continue; + } + to_lnk = &lgr->lnk[i]; + break; + } + if (!to_lnk) { + smc_lgr_terminate_sched(lgr); + return NULL; + } +again: + read_lock_bh(&lgr->conns_lock); + for (node = rb_first(&lgr->conns_all); node; node = rb_next(node)) { + conn = rb_entry(node, struct smc_connection, alert_node); + if (conn->lnk != from_lnk) + continue; + smc = container_of(conn, struct smc_sock, conn); + /* conn->lnk not yet set in SMC_INIT state */ + if (smc->sk.sk_state == SMC_INIT) + continue; + if (smc->sk.sk_state == SMC_CLOSED || + smc->sk.sk_state == SMC_PEERCLOSEWAIT1 || + smc->sk.sk_state == SMC_PEERCLOSEWAIT2 || + smc->sk.sk_state == SMC_APPFINCLOSEWAIT || + smc->sk.sk_state == SMC_APPCLOSEWAIT1 || + smc->sk.sk_state == SMC_APPCLOSEWAIT2 || + smc->sk.sk_state == SMC_PEERFINCLOSEWAIT || + smc->sk.sk_state == SMC_PEERABORTWAIT || + smc->sk.sk_state == SMC_PROCESSABORT) { + spin_lock_bh(&conn->send_lock); + conn->lnk = to_lnk; + spin_unlock_bh(&conn->send_lock); + continue; + } + sock_hold(&smc->sk); + read_unlock_bh(&lgr->conns_lock); + /* pre-fetch buffer outside of send_lock, might sleep */ + rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend); + if (rc) { + smcr_link_down_cond_sched(to_lnk); + return NULL; + } + /* avoid race with smcr_tx_sndbuf_nonempty() */ + spin_lock_bh(&conn->send_lock); + conn->lnk = to_lnk; + rc = smc_switch_cursor(smc, pend, wr_buf); + spin_unlock_bh(&conn->send_lock); + sock_put(&smc->sk); + if (rc) { + smcr_link_down_cond_sched(to_lnk); + return NULL; + } + goto again; + } + read_unlock_bh(&lgr->conns_lock); + return to_lnk; +} + +static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc, + struct smc_link_group *lgr) +{ + int rc; + + if (rmb_desc->is_conf_rkey && !list_empty(&lgr->list)) { + /* unregister rmb with peer */ + rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY); + if (!rc) { + /* protect against smc_llc_cli_rkey_exchange() */ + mutex_lock(&lgr->llc_conf_mutex); + smc_llc_do_delete_rkey(lgr, rmb_desc); + rmb_desc->is_conf_rkey = false; + mutex_unlock(&lgr->llc_conf_mutex); + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); + } + } + + if (rmb_desc->is_reg_err) { + /* buf registration failed, reuse not possible */ + mutex_lock(&lgr->rmbs_lock); + list_del(&rmb_desc->list); + mutex_unlock(&lgr->rmbs_lock); + + smc_buf_free(lgr, true, rmb_desc); + } else { + rmb_desc->used = 0; + } +} + static void smc_buf_unuse(struct smc_connection *conn, struct smc_link_group *lgr) { if (conn->sndbuf_desc) conn->sndbuf_desc->used = 0; - if (conn->rmb_desc) { - if (!conn->rmb_desc->regerr) { - if (!lgr->is_smcd && !list_empty(&lgr->list)) { - /* unregister rmb with peer */ - smc_llc_do_delete_rkey( - &lgr->lnk[SMC_SINGLE_LINK], - conn->rmb_desc); - } - conn->rmb_desc->used = 0; - } else { - /* buf registration failed, reuse not possible */ - write_lock_bh(&lgr->rmbs_lock); - list_del(&conn->rmb_desc->list); - write_unlock_bh(&lgr->rmbs_lock); - - smc_buf_free(lgr, true, conn->rmb_desc); - } - } + if (conn->rmb_desc && lgr->is_smcd) + conn->rmb_desc->used = 0; + else if (conn->rmb_desc) + smcr_buf_unuse(conn->rmb_desc, lgr); } /* remove a finished connection from its link group */ @@ -407,6 +664,8 @@ void smc_conn_free(struct smc_connection *conn) tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_tx_dismiss_slots(conn); + if (current_work() != &conn->abort_work) + cancel_work_sync(&conn->abort_work); } if (!list_empty(&lgr->list)) { smc_lgr_unregister_conn(conn); @@ -417,35 +676,91 @@ void smc_conn_free(struct smc_connection *conn) smc_lgr_schedule_free_work(lgr); } -static void smc_link_clear(struct smc_link *lnk) +/* unregister a link from a buf_desc */ +static void smcr_buf_unmap_link(struct smc_buf_desc *buf_desc, bool is_rmb, + struct smc_link *lnk) +{ + if (is_rmb) + buf_desc->is_reg_mr[lnk->link_idx] = false; + if (!buf_desc->is_map_ib[lnk->link_idx]) + return; + if (is_rmb) { + if (buf_desc->mr_rx[lnk->link_idx]) { + smc_ib_put_memory_region( + buf_desc->mr_rx[lnk->link_idx]); + buf_desc->mr_rx[lnk->link_idx] = NULL; + } + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_FROM_DEVICE); + } else { + smc_ib_buf_unmap_sg(lnk, buf_desc, DMA_TO_DEVICE); + } + sg_free_table(&buf_desc->sgt[lnk->link_idx]); + buf_desc->is_map_ib[lnk->link_idx] = false; +} + +/* unmap all buffers of lgr for a deleted link */ +static void smcr_buf_unmap_lgr(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_buf_desc *buf_desc, *bf; + int i; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + mutex_lock(&lgr->rmbs_lock); + list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) + smcr_buf_unmap_link(buf_desc, true, lnk); + mutex_unlock(&lgr->rmbs_lock); + mutex_lock(&lgr->sndbufs_lock); + list_for_each_entry_safe(buf_desc, bf, &lgr->sndbufs[i], + list) + smcr_buf_unmap_link(buf_desc, false, lnk); + mutex_unlock(&lgr->sndbufs_lock); + } +} + +static void smcr_rtoken_clear_link(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + int i; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + lgr->rtokens[i][lnk->link_idx].rkey = 0; + lgr->rtokens[i][lnk->link_idx].dma_addr = 0; + } +} + +/* must be called under lgr->llc_conf_mutex lock */ +void smcr_link_clear(struct smc_link *lnk, bool log) { + struct smc_ib_device *smcibdev; + + if (!lnk->lgr || lnk->state == SMC_LNK_UNUSED) + return; lnk->peer_qpn = 0; - smc_llc_link_clear(lnk); + smc_llc_link_clear(lnk, log); + smcr_buf_unmap_lgr(lnk); + smcr_rtoken_clear_link(lnk); smc_ib_modify_qp_reset(lnk); smc_wr_free_link(lnk); smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); - if (!atomic_dec_return(&lnk->smcibdev->lnk_cnt)) - wake_up(&lnk->smcibdev->lnks_deleted); + put_device(&lnk->smcibdev->ibdev->dev); + smcibdev = lnk->smcibdev; + memset(lnk, 0, sizeof(struct smc_link)); + lnk->state = SMC_LNK_UNUSED; + if (!atomic_dec_return(&smcibdev->lnk_cnt)) + wake_up(&smcibdev->lnks_deleted); } static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, struct smc_buf_desc *buf_desc) { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + smcr_buf_unmap_link(buf_desc, is_rmb, &lgr->lnk[i]); - if (is_rmb) { - if (buf_desc->mr_rx[SMC_SINGLE_LINK]) - smc_ib_put_memory_region( - buf_desc->mr_rx[SMC_SINGLE_LINK]); - smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, - DMA_FROM_DEVICE); - } else { - smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, - DMA_TO_DEVICE); - } - sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]); if (buf_desc->pages) __free_pages(buf_desc->pages, buf_desc->order); kfree(buf_desc); @@ -503,6 +818,18 @@ static void smc_lgr_free_bufs(struct smc_link_group *lgr) /* remove a link group */ static void smc_lgr_free(struct smc_link_group *lgr) { + int i; + + if (!lgr->is_smcd) { + mutex_lock(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_UNUSED) + smcr_link_clear(&lgr->lnk[i], false); + } + mutex_unlock(&lgr->llc_conf_mutex); + smc_llc_lgr_clear(lgr); + } + smc_lgr_free_bufs(lgr); if (lgr->is_smcd) { if (!lgr->terminating) { @@ -512,27 +839,12 @@ static void smc_lgr_free(struct smc_link_group *lgr) if (!atomic_dec_return(&lgr->smcd->lgr_cnt)) wake_up(&lgr->smcd->lgrs_deleted); } else { - smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); - put_device(&lgr->lnk[SMC_SINGLE_LINK].smcibdev->ibdev->dev); if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } kfree(lgr); } -void smc_lgr_forget(struct smc_link_group *lgr) -{ - struct list_head *lgr_list; - spinlock_t *lgr_lock; - - lgr_list = smc_lgr_list_head(lgr, &lgr_lock); - spin_lock_bh(lgr_lock); - /* do not use this link group for new connections */ - if (!list_empty(lgr_list)) - list_del_init(lgr_list); - spin_unlock_bh(lgr_lock); -} - static void smcd_unregister_all_dmbs(struct smc_link_group *lgr) { int i; @@ -587,10 +899,12 @@ static void smc_lgr_cleanup(struct smc_link_group *lgr) smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); put_device(&lgr->smcd->dev); } else { - struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; + u32 rsn = lgr->llc_termination_rsn; - if (lnk->state != SMC_LNK_INACTIVE) - smc_llc_link_inactive(lnk); + if (!rsn) + rsn = SMC_LLC_DEL_PROG_INIT_TERM; + smc_llc_send_link_delete_all(lgr, false, rsn); + smcr_lgr_link_deactivate_all(lgr); } } @@ -606,11 +920,9 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) if (lgr->terminating) return; /* lgr already terminating */ - if (!soft) - cancel_delayed_work_sync(&lgr->free_work); + /* cancel free_work sync, will terminate when lgr->freeing is set */ + cancel_delayed_work_sync(&lgr->free_work); lgr->terminating = 1; - if (!lgr->is_smcd) - smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]); /* kill remaining link group connections */ read_lock_bh(&lgr->conns_lock); @@ -629,10 +941,7 @@ static void __smc_lgr_terminate(struct smc_link_group *lgr, bool soft) } read_unlock_bh(&lgr->conns_lock); smc_lgr_cleanup(lgr); - if (soft) - smc_lgr_schedule_free_work_fast(lgr); - else - smc_lgr_free(lgr); + smc_lgr_free(lgr); } /* unlink link group and schedule termination */ @@ -647,33 +956,11 @@ void smc_lgr_terminate_sched(struct smc_link_group *lgr) return; /* lgr already terminating */ } list_del_init(&lgr->list); + lgr->freeing = 1; spin_unlock_bh(lgr_lock); schedule_work(&lgr->terminate_work); } -/* Called when IB port is terminated */ -void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport) -{ - struct smc_link_group *lgr, *l; - LIST_HEAD(lgr_free_list); - - spin_lock_bh(&smc_lgr_list.lock); - list_for_each_entry_safe(lgr, l, &smc_lgr_list.list, list) { - if (!lgr->is_smcd && - lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev && - lgr->lnk[SMC_SINGLE_LINK].ibport == ibport) { - list_move(&lgr->list, &lgr_free_list); - lgr->freeing = 1; - } - } - spin_unlock_bh(&smc_lgr_list.lock); - - list_for_each_entry_safe(lgr, l, &lgr_free_list, list) { - list_del_init(&lgr->list); - __smc_lgr_terminate(lgr, false); - } -} - /* Called when peer lgr shutdown (regularly or abnormally) is received */ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) { @@ -688,6 +975,7 @@ void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan) if (peer_gid) /* peer triggered termination */ lgr->peer_shutdown = 1; list_move(&lgr->list, &lgr_free_list); + lgr->freeing = 1; } } spin_unlock_bh(&dev->lgr_lock); @@ -728,6 +1016,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) { struct smc_link_group *lgr, *lg; LIST_HEAD(lgr_free_list); + int i; spin_lock_bh(&smc_lgr_list.lock); if (!smcibdev) { @@ -736,9 +1025,9 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) lgr->freeing = 1; } else { list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) { - if (lgr->lnk[SMC_SINGLE_LINK].smcibdev == smcibdev) { - list_move(&lgr->list, &lgr_free_list); - lgr->freeing = 1; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].smcibdev == smcibdev) + smcr_link_down_cond_sched(&lgr->lnk[i]); } } } @@ -746,6 +1035,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) list_for_each_entry_safe(lgr, lg, &lgr_free_list, list) { list_del_init(&lgr->list); + smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_OP_INIT_TERM); __smc_lgr_terminate(lgr, false); } @@ -759,6 +1049,225 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev) } } +/* set new lgr type and clear all asymmetric link tagging */ +void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type) +{ + char *lgr_type = ""; + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (smc_link_usable(&lgr->lnk[i])) + lgr->lnk[i].link_is_asym = false; + if (lgr->type == new_type) + return; + lgr->type = new_type; + + switch (lgr->type) { + case SMC_LGR_NONE: + lgr_type = "NONE"; + break; + case SMC_LGR_SINGLE: + lgr_type = "SINGLE"; + break; + case SMC_LGR_SYMMETRIC: + lgr_type = "SYMMETRIC"; + break; + case SMC_LGR_ASYMMETRIC_PEER: + lgr_type = "ASYMMETRIC_PEER"; + break; + case SMC_LGR_ASYMMETRIC_LOCAL: + lgr_type = "ASYMMETRIC_LOCAL"; + break; + } + pr_warn_ratelimited("smc: SMC-R lg %*phN state changed: " + "%s, pnetid %.16s\n", SMC_LGR_ID_SIZE, &lgr->id, + lgr_type, lgr->pnet_id); +} + +/* set new lgr type and tag a link as asymmetric */ +void smcr_lgr_set_type_asym(struct smc_link_group *lgr, + enum smc_lgr_type new_type, int asym_lnk_idx) +{ + smcr_lgr_set_type(lgr, new_type); + lgr->lnk[asym_lnk_idx].link_is_asym = true; +} + +/* abort connection, abort_work scheduled from tasklet context */ +static void smc_conn_abort_work(struct work_struct *work) +{ + struct smc_connection *conn = container_of(work, + struct smc_connection, + abort_work); + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + + smc_conn_kill(conn, true); + sock_put(&smc->sk); /* sock_hold done by schedulers of abort_work */ +} + +/* link is up - establish alternate link if applicable */ +static void smcr_link_up(struct smc_link_group *lgr, + struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link *link = NULL; + + if (list_empty(&lgr->list) || + lgr->type == SMC_LGR_SYMMETRIC || + lgr->type == SMC_LGR_ASYMMETRIC_PEER) + return; + + if (lgr->role == SMC_SERV) { + /* trigger local add link processing */ + link = smc_llc_usable_link(lgr); + if (!link) + return; + smc_llc_srv_add_link_local(link); + } else { + /* invite server to start add link processing */ + u8 gid[SMC_GID_SIZE]; + + if (smc_ib_determine_gid(smcibdev, ibport, lgr->vlan_id, gid, + NULL)) + return; + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* some other llc task is ongoing */ + wait_event_interruptible_timeout(lgr->llc_waiter, + (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), + SMC_LLC_WAIT_TIME); + } + if (list_empty(&lgr->list) || + !smc_ib_port_active(smcibdev, ibport)) + return; /* lgr or device no longer active */ + link = smc_llc_usable_link(lgr); + if (!link) + return; + smc_llc_send_add_link(link, smcibdev->mac[ibport - 1], gid, + NULL, SMC_LLC_REQ); + } +} + +void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_ib_up_work *ib_work; + struct smc_link_group *lgr, *n; + + list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { + if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, + SMC_MAX_PNETID_LEN) || + lgr->type == SMC_LGR_SYMMETRIC || + lgr->type == SMC_LGR_ASYMMETRIC_PEER) + continue; + ib_work = kmalloc(sizeof(*ib_work), GFP_KERNEL); + if (!ib_work) + continue; + INIT_WORK(&ib_work->work, smc_link_up_work); + ib_work->lgr = lgr; + ib_work->smcibdev = smcibdev; + ib_work->ibport = ibport; + schedule_work(&ib_work->work); + } +} + +/* link is down - switch connections to alternate link, + * must be called under lgr->llc_conf_mutex lock + */ +static void smcr_link_down(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_link *to_lnk; + int del_link_id; + + if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list)) + return; + + smc_ib_modify_qp_reset(lnk); + to_lnk = smc_switch_conns(lgr, lnk, true); + if (!to_lnk) { /* no backup link available */ + smcr_link_clear(lnk, true); + return; + } + smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); + del_link_id = lnk->link_id; + + if (lgr->role == SMC_SERV) { + /* trigger local delete link processing */ + smc_llc_srv_delete_link_local(to_lnk, del_link_id); + } else { + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* another llc task is ongoing */ + mutex_unlock(&lgr->llc_conf_mutex); + wait_event_interruptible_timeout(lgr->llc_waiter, + (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE), + SMC_LLC_WAIT_TIME); + mutex_lock(&lgr->llc_conf_mutex); + } + smc_llc_send_delete_link(to_lnk, del_link_id, SMC_LLC_REQ, true, + SMC_LLC_DEL_LOST_PATH); + } +} + +/* must be called under lgr->llc_conf_mutex lock */ +void smcr_link_down_cond(struct smc_link *lnk) +{ + if (smc_link_downing(&lnk->state)) + smcr_link_down(lnk); +} + +/* will get the lgr->llc_conf_mutex lock */ +void smcr_link_down_cond_sched(struct smc_link *lnk) +{ + if (smc_link_downing(&lnk->state)) + schedule_work(&lnk->link_down_wrk); +} + +void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport) +{ + struct smc_link_group *lgr, *n; + int i; + + list_for_each_entry_safe(lgr, n, &smc_lgr_list.list, list) { + if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id, + SMC_MAX_PNETID_LEN)) + continue; /* lgr is not affected */ + if (list_empty(&lgr->list)) + continue; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; + + if (smc_link_usable(lnk) && + lnk->smcibdev == smcibdev && lnk->ibport == ibport) + smcr_link_down_cond_sched(lnk); + } + } +} + +static void smc_link_up_work(struct work_struct *work) +{ + struct smc_ib_up_work *ib_work = container_of(work, + struct smc_ib_up_work, + work); + struct smc_link_group *lgr = ib_work->lgr; + + if (list_empty(&lgr->list)) + goto out; + smcr_link_up(lgr, ib_work->smcibdev, ib_work->ibport); +out: + kfree(ib_work); +} + +static void smc_link_down_work(struct work_struct *work) +{ + struct smc_link *link = container_of(work, struct smc_link, + link_down_wrk); + struct smc_link_group *lgr = link->lgr; + + if (list_empty(&lgr->list)) + return; + wake_up_interruptible_all(&lgr->llc_waiter); + mutex_lock(&lgr->llc_conf_mutex); + smcr_link_down(link); + mutex_unlock(&lgr->llc_conf_mutex); +} + /* Determine vlan of internal TCP socket. * @vlan_id: address to store the determined vlan id into */ @@ -810,15 +1319,21 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, struct smc_clc_msg_local *lcl, enum smc_lgr_role role, u32 clcqpn) { - return !memcmp(lgr->peer_systemid, lcl->id_for_peer, - SMC_SYSTEMID_LEN) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, - SMC_GID_SIZE) && - !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, - sizeof(lcl->mac)) && - lgr->role == role && - (lgr->role == SMC_SERV || - lgr->lnk[SMC_SINGLE_LINK].peer_qpn == clcqpn); + int i; + + if (memcmp(lgr->peer_systemid, lcl->id_for_peer, SMC_SYSTEMID_LEN) || + lgr->role != role) + return false; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].state != SMC_LNK_ACTIVE) + continue; + if ((lgr->role == SMC_SERV || lgr->lnk[i].peer_qpn == clcqpn) && + !memcmp(lgr->lnk[i].peer_gid, &lcl->gid, SMC_GID_SIZE) && + !memcmp(lgr->lnk[i].peer_mac, lcl->mac, sizeof(lcl->mac))) + return true; + } + return false; } static bool smcd_lgr_match(struct smc_link_group *lgr, @@ -859,15 +1374,17 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) /* link group found */ ini->cln_first_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; - smc_lgr_register_conn(conn); /* add smc conn to lgr */ - if (delayed_work_pending(&lgr->free_work)) - cancel_delayed_work(&lgr->free_work); + rc = smc_lgr_register_conn(conn, false); write_unlock_bh(&lgr->conns_lock); + if (!rc && delayed_work_pending(&lgr->free_work)) + cancel_delayed_work(&lgr->free_work); break; } write_unlock_bh(&lgr->conns_lock); } spin_unlock_bh(lgr_lock); + if (rc) + return rc; if (role == SMC_CLNT && !ini->srv_first_contact && ini->cln_first_contact == SMC_FIRST_CONTACT) { @@ -885,12 +1402,15 @@ create: goto out; lgr = conn->lgr; write_lock_bh(&lgr->conns_lock); - smc_lgr_register_conn(conn); /* add smc conn to lgr */ + rc = smc_lgr_register_conn(conn, true); write_unlock_bh(&lgr->conns_lock); + if (rc) + goto out; } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = SMC_WR_TX_SIZE; conn->urg_state = SMC_URG_READ; + INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work); if (ini->is_smcd) { conn->rx_off = sizeof(struct smcd_cdc_msg); smcd_cdc_rx_init(conn); /* init tasklet for this conn */ @@ -934,19 +1454,19 @@ int smc_uncompress_bufsize(u8 compressed) * buffer size; if not available, return NULL */ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize, - rwlock_t *lock, + struct mutex *lock, struct list_head *buf_list) { struct smc_buf_desc *buf_slot; - read_lock_bh(lock); + mutex_lock(lock); list_for_each_entry(buf_slot, buf_list, list) { if (cmpxchg(&buf_slot->used, 0, 1) == 0) { - read_unlock_bh(lock); + mutex_unlock(lock); return buf_slot; } } - read_unlock_bh(lock); + mutex_unlock(lock); return NULL; } @@ -959,12 +1479,135 @@ static inline int smc_rmb_wnd_update_limit(int rmbe_size) return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } +/* map an rmb buf to a link */ +static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb, + struct smc_link *lnk) +{ + int rc; + + if (buf_desc->is_map_ib[lnk->link_idx]) + return 0; + + rc = sg_alloc_table(&buf_desc->sgt[lnk->link_idx], 1, GFP_KERNEL); + if (rc) + return rc; + sg_set_buf(buf_desc->sgt[lnk->link_idx].sgl, + buf_desc->cpu_addr, buf_desc->len); + + /* map sg table to DMA address */ + rc = smc_ib_buf_map_sg(lnk, buf_desc, + is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + /* SMC protocol depends on mapping to one DMA address only */ + if (rc != 1) { + rc = -EAGAIN; + goto free_table; + } + + /* create a new memory region for the RMB */ + if (is_rmb) { + rc = smc_ib_get_memory_region(lnk->roce_pd, + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_LOCAL_WRITE, + buf_desc, lnk->link_idx); + if (rc) + goto buf_unmap; + smc_ib_sync_sg_for_device(lnk, buf_desc, DMA_FROM_DEVICE); + } + buf_desc->is_map_ib[lnk->link_idx] = true; + return 0; + +buf_unmap: + smc_ib_buf_unmap_sg(lnk, buf_desc, + is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); +free_table: + sg_free_table(&buf_desc->sgt[lnk->link_idx]); + return rc; +} + +/* register a new rmb on IB device, + * must be called under lgr->llc_conf_mutex lock + */ +int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc) +{ + if (list_empty(&link->lgr->list)) + return -ENOLINK; + if (!rmb_desc->is_reg_mr[link->link_idx]) { + /* register memory region for new rmb */ + if (smc_wr_reg_send(link, rmb_desc->mr_rx[link->link_idx])) { + rmb_desc->is_reg_err = true; + return -EFAULT; + } + rmb_desc->is_reg_mr[link->link_idx] = true; + } + return 0; +} + +static int _smcr_buf_map_lgr(struct smc_link *lnk, struct mutex *lock, + struct list_head *lst, bool is_rmb) +{ + struct smc_buf_desc *buf_desc, *bf; + int rc = 0; + + mutex_lock(lock); + list_for_each_entry_safe(buf_desc, bf, lst, list) { + if (!buf_desc->used) + continue; + rc = smcr_buf_map_link(buf_desc, is_rmb, lnk); + if (rc) + goto out; + } +out: + mutex_unlock(lock); + return rc; +} + +/* map all used buffers of lgr for a new link */ +int smcr_buf_map_lgr(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + int i, rc = 0; + + for (i = 0; i < SMC_RMBE_SIZES; i++) { + rc = _smcr_buf_map_lgr(lnk, &lgr->rmbs_lock, + &lgr->rmbs[i], true); + if (rc) + return rc; + rc = _smcr_buf_map_lgr(lnk, &lgr->sndbufs_lock, + &lgr->sndbufs[i], false); + if (rc) + return rc; + } + return 0; +} + +/* register all used buffers of lgr for a new link, + * must be called under lgr->llc_conf_mutex lock + */ +int smcr_buf_reg_lgr(struct smc_link *lnk) +{ + struct smc_link_group *lgr = lnk->lgr; + struct smc_buf_desc *buf_desc, *bf; + int i, rc = 0; + + mutex_lock(&lgr->rmbs_lock); + for (i = 0; i < SMC_RMBE_SIZES; i++) { + list_for_each_entry_safe(buf_desc, bf, &lgr->rmbs[i], list) { + if (!buf_desc->used) + continue; + rc = smcr_link_reg_rmb(lnk, buf_desc); + if (rc) + goto out; + } + } +out: + mutex_unlock(&lgr->rmbs_lock); + return rc; +} + static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, bool is_rmb, int bufsize) { struct smc_buf_desc *buf_desc; - struct smc_link *lnk; - int rc; /* try to alloc a new buffer */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); @@ -981,41 +1624,33 @@ static struct smc_buf_desc *smcr_new_buf_create(struct smc_link_group *lgr, return ERR_PTR(-EAGAIN); } buf_desc->cpu_addr = (void *)page_address(buf_desc->pages); + buf_desc->len = bufsize; + return buf_desc; +} - /* build the sg table from the pages */ - lnk = &lgr->lnk[SMC_SINGLE_LINK]; - rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1, - GFP_KERNEL); - if (rc) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(rc); - } - sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl, - buf_desc->cpu_addr, bufsize); +/* map buf_desc on all usable links, + * unused buffers stay mapped as long as the link is up + */ +static int smcr_buf_map_usable_links(struct smc_link_group *lgr, + struct smc_buf_desc *buf_desc, bool is_rmb) +{ + int i, rc = 0; - /* map sg table to DMA address */ - rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc, - is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); - /* SMC protocol depends on mapping to one DMA address only */ - if (rc != 1) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(-EAGAIN); - } + /* protect against parallel link reconfiguration */ + mutex_lock(&lgr->llc_conf_mutex); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + struct smc_link *lnk = &lgr->lnk[i]; - /* create a new memory region for the RMB */ - if (is_rmb) { - rc = smc_ib_get_memory_region(lnk->roce_pd, - IB_ACCESS_REMOTE_WRITE | - IB_ACCESS_LOCAL_WRITE, - buf_desc); - if (rc) { - smc_buf_free(lgr, is_rmb, buf_desc); - return ERR_PTR(rc); + if (!smc_link_usable(lnk)) + continue; + if (smcr_buf_map_link(buf_desc, is_rmb, lnk)) { + rc = -ENOMEM; + goto out; } } - - buf_desc->len = bufsize; - return buf_desc; +out: + mutex_unlock(&lgr->llc_conf_mutex); + return rc; } #define SMCD_DMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ @@ -1062,8 +1697,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) struct smc_link_group *lgr = conn->lgr; struct list_head *buf_list; int bufsize, bufsize_short; + struct mutex *lock; /* lock buffer list */ int sk_buf_size; - rwlock_t *lock; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ @@ -1104,15 +1739,22 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) continue; buf_desc->used = 1; - write_lock_bh(lock); + mutex_lock(lock); list_add(&buf_desc->list, buf_list); - write_unlock_bh(lock); + mutex_unlock(lock); break; /* found */ } if (IS_ERR(buf_desc)) return -ENOMEM; + if (!is_smcd) { + if (smcr_buf_map_usable_links(lgr, buf_desc, is_rmb)) { + smcr_buf_unuse(buf_desc, lgr); + return -ENOMEM; + } + } + if (is_rmb) { conn->rmb_desc = buf_desc; conn->rmbe_size_short = bufsize_short; @@ -1132,42 +1774,44 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; - - if (!conn->lgr || conn->lgr->is_smcd) + if (!conn->lgr || conn->lgr->is_smcd || !smc_link_usable(conn->lnk)) return; - smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->sndbuf_desc, DMA_TO_DEVICE); + smc_ib_sync_sg_for_cpu(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; - - if (!conn->lgr || conn->lgr->is_smcd) + if (!conn->lgr || conn->lgr->is_smcd || !smc_link_usable(conn->lnk)) return; - smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->sndbuf_desc, DMA_TO_DEVICE); + smc_ib_sync_sg_for_device(conn->lnk, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; + int i; if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->rmb_desc, DMA_FROM_DEVICE); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&conn->lgr->lnk[i])) + continue; + smc_ib_sync_sg_for_cpu(&conn->lgr->lnk[i], conn->rmb_desc, + DMA_FROM_DEVICE); + } } void smc_rmb_sync_sg_for_device(struct smc_connection *conn) { - struct smc_link_group *lgr = conn->lgr; + int i; if (!conn->lgr || conn->lgr->is_smcd) return; - smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, - conn->rmb_desc, DMA_FROM_DEVICE); + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&conn->lgr->lnk[i])) + continue; + smc_ib_sync_sg_for_device(&conn->lgr->lnk[i], conn->rmb_desc, + DMA_FROM_DEVICE); + } } /* create the send and receive buffer for an SMC socket; @@ -1202,16 +1846,64 @@ static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) return -ENOSPC; } +static int smc_rtoken_find_by_link(struct smc_link_group *lgr, int lnk_idx, + u32 rkey) +{ + int i; + + for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { + if (test_bit(i, lgr->rtokens_used_mask) && + lgr->rtokens[i][lnk_idx].rkey == rkey) + return i; + } + return -ENOENT; +} + +/* set rtoken for a new link to an existing rmb */ +void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, + __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey) +{ + int rtok_idx; + + rtok_idx = smc_rtoken_find_by_link(lgr, link_idx, ntohl(nw_rkey_known)); + if (rtok_idx == -ENOENT) + return; + lgr->rtokens[rtok_idx][link_idx_new].rkey = ntohl(nw_rkey); + lgr->rtokens[rtok_idx][link_idx_new].dma_addr = be64_to_cpu(nw_vaddr); +} + +/* set rtoken for a new link whose link_id is given */ +void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, + __be64 nw_vaddr, __be32 nw_rkey) +{ + u64 dma_addr = be64_to_cpu(nw_vaddr); + u32 rkey = ntohl(nw_rkey); + bool found = false; + int link_idx; + + for (link_idx = 0; link_idx < SMC_LINKS_PER_LGR_MAX; link_idx++) { + if (lgr->lnk[link_idx].link_id == link_id) { + found = true; + break; + } + } + if (!found) + return; + lgr->rtokens[rtok_idx][link_idx].rkey = rkey; + lgr->rtokens[rtok_idx][link_idx].dma_addr = dma_addr; +} + /* add a new rtoken from peer */ -int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey) +int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) { + struct smc_link_group *lgr = smc_get_lgr(lnk); u64 dma_addr = be64_to_cpu(nw_vaddr); u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { - if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) && - (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) && + if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && + lgr->rtokens[i][lnk->link_idx].dma_addr == dma_addr && test_bit(i, lgr->rtokens_used_mask)) { /* already in list */ return i; @@ -1220,23 +1912,25 @@ int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey) i = smc_rmb_reserve_rtoken_idx(lgr); if (i < 0) return i; - lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey; - lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr; + lgr->rtokens[i][lnk->link_idx].rkey = rkey; + lgr->rtokens[i][lnk->link_idx].dma_addr = dma_addr; return i; } -/* delete an rtoken */ -int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey) +/* delete an rtoken from all links */ +int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) { + struct smc_link_group *lgr = smc_get_lgr(lnk); u32 rkey = ntohl(nw_rkey); - int i; + int i, j; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { - if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey && + if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && test_bit(i, lgr->rtokens_used_mask)) { - lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0; - lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0; - + for (j = 0; j < SMC_LINKS_PER_LGR_MAX; j++) { + lgr->rtokens[i][j].rkey = 0; + lgr->rtokens[i][j].dma_addr = 0; + } clear_bit(i, lgr->rtokens_used_mask); return 0; } @@ -1246,9 +1940,10 @@ int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey) /* save rkey and dma_addr received from peer during clc handshake */ int smc_rmb_rtoken_handling(struct smc_connection *conn, + struct smc_link *lnk, struct smc_clc_msg_accept_confirm *clc) { - conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr, + conn->rtoken_idx = smc_rtoken_add(lnk, clc->rmb_dma_addr, clc->rmb_rkey); if (conn->rtoken_idx < 0) return conn->rtoken_idx; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 8041db20c753..86d160f0d187 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -32,10 +32,10 @@ enum smc_lgr_role { /* possible roles of a link group */ }; enum smc_link_state { /* possible states of a link */ + SMC_LNK_UNUSED, /* link is unused */ SMC_LNK_INACTIVE, /* link is inactive */ SMC_LNK_ACTIVATING, /* link is being activated */ SMC_LNK_ACTIVE, /* link is active */ - SMC_LNK_DELETING, /* link is being deleted */ }; #define SMC_WR_BUF_SIZE 48 /* size of work request buffer */ @@ -70,6 +70,8 @@ struct smc_rdma_wr { /* work requests per message struct ib_rdma_wr wr_tx_rdma[SMC_MAX_RDMA_WRITES]; }; +#define SMC_LGR_ID_SIZE 4 + struct smc_link { struct smc_ib_device *smcibdev; /* ib-device */ u8 ibport; /* port - values 1 | 2 */ @@ -85,6 +87,7 @@ struct smc_link { struct smc_rdma_sges *wr_tx_rdma_sges;/*RDMA WRITE gather meta data*/ struct smc_rdma_wr *wr_tx_rdmas; /* WR RDMA WRITE */ struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */ + struct completion *wr_tx_compl; /* WR send CQE completion */ /* above four vectors have wr_tx_cnt elements and use the same index */ dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */ atomic_long_t wr_tx_id; /* seq # of last sent WR */ @@ -115,29 +118,23 @@ struct smc_link { u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ u8 link_id; /* unique # within link group */ + u8 link_uid[SMC_LGR_ID_SIZE]; /* unique lnk id */ + u8 peer_link_uid[SMC_LGR_ID_SIZE]; /* peer uid */ + u8 link_idx; /* index in lgr link array */ + u8 link_is_asym; /* is link asymmetric? */ + struct smc_link_group *lgr; /* parent link group */ + struct work_struct link_down_wrk; /* wrk to bring link down */ enum smc_link_state state; /* state of link */ - struct workqueue_struct *llc_wq; /* single thread work queue */ - struct completion llc_confirm; /* wait for rx of conf link */ - struct completion llc_confirm_resp; /* wait 4 rx of cnf lnk rsp */ - int llc_confirm_rc; /* rc from confirm link msg */ - int llc_confirm_resp_rc; /* rc from conf_resp msg */ - struct completion llc_add; /* wait for rx of add link */ - struct completion llc_add_resp; /* wait for rx of add link rsp*/ struct delayed_work llc_testlink_wrk; /* testlink worker */ struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ - struct completion llc_confirm_rkey; /* wait 4 rx of cnf rkey */ - int llc_confirm_rkey_rc; /* rc from cnf rkey msg */ - struct completion llc_delete_rkey; /* wait 4 rx of del rkey */ - int llc_delete_rkey_rc; /* rc from del rkey msg */ - struct mutex llc_delete_rkey_mutex; /* serialize usage */ }; /* For now we just allow one parallel link per link group. The SMC protocol * allows more (up to 8). */ -#define SMC_LINKS_PER_LGR_MAX 1 +#define SMC_LINKS_PER_LGR_MAX 3 #define SMC_SINGLE_LINK 0 #define SMC_FIRST_CONTACT 1 /* first contact to a peer */ @@ -150,25 +147,32 @@ struct smc_buf_desc { struct page *pages; int len; /* length of buffer */ u32 used; /* currently used / unused */ - u8 wr_reg : 1; /* mem region registered */ - u8 regerr : 1; /* err during registration */ union { struct { /* SMC-R */ - struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; - /* virtual buffer */ - struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; - /* for rmb only: memory region - * incl. rkey provided to peer - */ - u32 order; /* allocation order */ + struct sg_table sgt[SMC_LINKS_PER_LGR_MAX]; + /* virtual buffer */ + struct ib_mr *mr_rx[SMC_LINKS_PER_LGR_MAX]; + /* for rmb only: memory region + * incl. rkey provided to peer + */ + u32 order; /* allocation order */ + + u8 is_conf_rkey; + /* confirm_rkey done */ + u8 is_reg_mr[SMC_LINKS_PER_LGR_MAX]; + /* mem region registered */ + u8 is_map_ib[SMC_LINKS_PER_LGR_MAX]; + /* mem region mapped to lnk */ + u8 is_reg_err; + /* buffer registration err */ }; struct { /* SMC-D */ - unsigned short sba_idx; - /* SBA index number */ - u64 token; - /* DMB token number */ - dma_addr_t dma_addr; - /* DMA address */ + unsigned short sba_idx; + /* SBA index number */ + u64 token; + /* DMB token number */ + dma_addr_t dma_addr; + /* DMA address */ }; }; }; @@ -178,7 +182,6 @@ struct smc_rtoken { /* address/key of remote RMB */ u32 rkey; }; -#define SMC_LGR_ID_SIZE 4 #define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ #define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */ /* theoretically, the RFC states that largest size would be 512K, @@ -188,6 +191,28 @@ struct smc_rtoken { /* address/key of remote RMB */ struct smcd_dev; +enum smc_lgr_type { /* redundancy state of lgr */ + SMC_LGR_NONE, /* no active links, lgr to be deleted */ + SMC_LGR_SINGLE, /* 1 active RNIC on each peer */ + SMC_LGR_SYMMETRIC, /* 2 active RNICs on each peer */ + SMC_LGR_ASYMMETRIC_PEER, /* local has 2, peer 1 active RNICs */ + SMC_LGR_ASYMMETRIC_LOCAL, /* local has 1, peer 2 active RNICs */ +}; + +enum smc_llc_flowtype { + SMC_LLC_FLOW_NONE = 0, + SMC_LLC_FLOW_ADD_LINK = 2, + SMC_LLC_FLOW_DEL_LINK = 4, + SMC_LLC_FLOW_RKEY = 6, +}; + +struct smc_llc_qentry; + +struct smc_llc_flow { + enum smc_llc_flowtype type; + struct smc_llc_qentry *qentry; +}; + struct smc_link_group { struct list_head list; struct rb_root conns_all; /* connection tree */ @@ -196,9 +221,9 @@ struct smc_link_group { unsigned short vlan_id; /* vlan id of link group */ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ - rwlock_t sndbufs_lock; /* protects tx buffers */ + struct mutex sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ - rwlock_t rmbs_lock; /* protects rx buffers */ + struct mutex rmbs_lock; /* protects rx buffers */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ @@ -222,6 +247,35 @@ struct smc_link_group { /* remote addr/key pairs */ DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX); /* used rtoken elements */ + u8 next_link_id; + enum smc_lgr_type type; + /* redundancy state */ + u8 pnet_id[SMC_MAX_PNETID_LEN + 1]; + /* pnet id of this lgr */ + struct list_head llc_event_q; + /* queue for llc events */ + spinlock_t llc_event_q_lock; + /* protects llc_event_q */ + struct mutex llc_conf_mutex; + /* protects lgr reconfig. */ + struct work_struct llc_add_link_work; + struct work_struct llc_del_link_work; + struct work_struct llc_event_work; + /* llc event worker */ + wait_queue_head_t llc_waiter; + /* w4 next llc event */ + struct smc_llc_flow llc_flow_lcl; + /* llc local control field */ + struct smc_llc_flow llc_flow_rmt; + /* llc remote control field */ + struct smc_llc_qentry *delayed_event; + /* arrived when flow active */ + spinlock_t llc_flow_lock; + /* protects llc flow */ + int llc_testlink_time; + /* link keep alive time */ + u32 llc_termination_rsn; + /* rsn code for termination */ }; struct { /* SMC-D */ u64 peer_gid; @@ -285,24 +339,36 @@ static inline struct smc_connection *smc_lgr_find_conn( return res; } +/* returns true if the specified link is usable */ +static inline bool smc_link_usable(struct smc_link *lnk) +{ + if (lnk->state == SMC_LNK_UNUSED || lnk->state == SMC_LNK_INACTIVE) + return false; + return true; +} + struct smc_sock; struct smc_clc_msg_accept_confirm; struct smc_clc_msg_local; -void smc_lgr_forget(struct smc_link_group *lgr); void smc_lgr_cleanup_early(struct smc_connection *conn); void smc_lgr_terminate_sched(struct smc_link_group *lgr); -void smc_port_terminate(struct smc_ib_device *smcibdev, u8 ibport); +void smcr_port_add(struct smc_ib_device *smcibdev, u8 ibport); +void smcr_port_err(struct smc_ib_device *smcibdev, u8 ibport); void smc_smcd_terminate(struct smcd_dev *dev, u64 peer_gid, unsigned short vlan); void smc_smcd_terminate_all(struct smcd_dev *dev); void smc_smcr_terminate_all(struct smc_ib_device *smcibdev); int smc_buf_create(struct smc_sock *smc, bool is_smcd); int smc_uncompress_bufsize(u8 compressed); -int smc_rmb_rtoken_handling(struct smc_connection *conn, +int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link, struct smc_clc_msg_accept_confirm *clc); -int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey); -int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey); +int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey); +int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey); +void smc_rtoken_set(struct smc_link_group *lgr, int link_idx, int link_idx_new, + __be32 nw_rkey_known, __be64 nw_vaddr, __be32 nw_rkey); +void smc_rtoken_set2(struct smc_link_group *lgr, int rtok_idx, int link_id, + __be64 nw_vaddr, __be32 nw_rkey); void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn); void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn); void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn); @@ -315,8 +381,22 @@ void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr); int smc_core_init(void); void smc_core_exit(void); +int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, + u8 link_idx, struct smc_init_info *ini); +void smcr_link_clear(struct smc_link *lnk, bool log); +int smcr_buf_map_lgr(struct smc_link *lnk); +int smcr_buf_reg_lgr(struct smc_link *lnk); +void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); +void smcr_lgr_set_type_asym(struct smc_link_group *lgr, + enum smc_lgr_type new_type, int asym_lnk_idx); +int smcr_link_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc); +struct smc_link *smc_switch_conns(struct smc_link_group *lgr, + struct smc_link *from_lnk, bool is_dev_err); +void smcr_link_down_cond(struct smc_link *lnk); +void smcr_link_down_cond_sched(struct smc_link *lnk); + static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { - return container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]); + return link->lgr; } #endif diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 04b6fefb8bce..f0a5064bf9bd 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -249,9 +249,10 @@ static void smc_ib_port_event_work(struct work_struct *work) clear_bit(port_idx, &smcibdev->port_event_mask); if (!smc_ib_port_active(smcibdev, port_idx + 1)) { set_bit(port_idx, smcibdev->ports_going_away); - smc_port_terminate(smcibdev, port_idx + 1); + smcr_port_err(smcibdev, port_idx + 1); } else { clear_bit(port_idx, smcibdev->ports_going_away); + smcr_port_add(smcibdev, port_idx + 1); } } } @@ -389,15 +390,15 @@ void smc_ib_put_memory_region(struct ib_mr *mr) ib_dereg_mr(mr); } -static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot) +static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot, u8 link_idx) { unsigned int offset = 0; int sg_num; /* map the largest prefix of a dma mapped SG list */ - sg_num = ib_map_mr_sg(buf_slot->mr_rx[SMC_SINGLE_LINK], - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + sg_num = ib_map_mr_sg(buf_slot->mr_rx[link_idx], + buf_slot->sgt[link_idx].sgl, + buf_slot->sgt[link_idx].orig_nents, &offset, PAGE_SIZE); return sg_num; @@ -405,29 +406,29 @@ static int smc_ib_map_mr_sg(struct smc_buf_desc *buf_slot) /* Allocate a memory region and map the dma mapped SG list of buf_slot */ int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct smc_buf_desc *buf_slot) + struct smc_buf_desc *buf_slot, u8 link_idx) { - if (buf_slot->mr_rx[SMC_SINGLE_LINK]) + if (buf_slot->mr_rx[link_idx]) return 0; /* already done */ - buf_slot->mr_rx[SMC_SINGLE_LINK] = + buf_slot->mr_rx[link_idx] = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 1 << buf_slot->order); - if (IS_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK])) { + if (IS_ERR(buf_slot->mr_rx[link_idx])) { int rc; - rc = PTR_ERR(buf_slot->mr_rx[SMC_SINGLE_LINK]); - buf_slot->mr_rx[SMC_SINGLE_LINK] = NULL; + rc = PTR_ERR(buf_slot->mr_rx[link_idx]); + buf_slot->mr_rx[link_idx] = NULL; return rc; } - if (smc_ib_map_mr_sg(buf_slot) != 1) + if (smc_ib_map_mr_sg(buf_slot, link_idx) != 1) return -EINVAL; return 0; } /* synchronize buffer usage for cpu access */ -void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { @@ -435,11 +436,11 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, unsigned int i; /* for now there is just one DMA address */ - for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg, - buf_slot->sgt[SMC_SINGLE_LINK].nents, i) { + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { if (!sg_dma_len(sg)) break; - ib_dma_sync_single_for_cpu(smcibdev->ibdev, + ib_dma_sync_single_for_cpu(lnk->smcibdev->ibdev, sg_dma_address(sg), sg_dma_len(sg), data_direction); @@ -447,7 +448,7 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, } /* synchronize buffer usage for device access */ -void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_device(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { @@ -455,11 +456,11 @@ void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, unsigned int i; /* for now there is just one DMA address */ - for_each_sg(buf_slot->sgt[SMC_SINGLE_LINK].sgl, sg, - buf_slot->sgt[SMC_SINGLE_LINK].nents, i) { + for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, + buf_slot->sgt[lnk->link_idx].nents, i) { if (!sg_dma_len(sg)) break; - ib_dma_sync_single_for_device(smcibdev->ibdev, + ib_dma_sync_single_for_device(lnk->smcibdev->ibdev, sg_dma_address(sg), sg_dma_len(sg), data_direction); @@ -467,15 +468,15 @@ void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, } /* Map a new TX or RX buffer SG-table to DMA */ -int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, +int smc_ib_buf_map_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { int mapped_nents; - mapped_nents = ib_dma_map_sg(smcibdev->ibdev, - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + mapped_nents = ib_dma_map_sg(lnk->smcibdev->ibdev, + buf_slot->sgt[lnk->link_idx].sgl, + buf_slot->sgt[lnk->link_idx].orig_nents, data_direction); if (!mapped_nents) return -ENOMEM; @@ -483,18 +484,18 @@ int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, return mapped_nents; } -void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, +void smc_ib_buf_unmap_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction) { - if (!buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address) + if (!buf_slot->sgt[lnk->link_idx].sgl->dma_address) return; /* already unmapped */ - ib_dma_unmap_sg(smcibdev->ibdev, - buf_slot->sgt[SMC_SINGLE_LINK].sgl, - buf_slot->sgt[SMC_SINGLE_LINK].orig_nents, + ib_dma_unmap_sg(lnk->smcibdev->ibdev, + buf_slot->sgt[lnk->link_idx].sgl, + buf_slot->sgt[lnk->link_idx].orig_nents, data_direction); - buf_slot->sgt[SMC_SINGLE_LINK].sgl->dma_address = 0; + buf_slot->sgt[lnk->link_idx].sgl->dma_address = 0; } long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev) @@ -574,13 +575,23 @@ static void smc_ib_add_dev(struct ib_device *ibdev) /* trigger reading of the port attributes */ port_cnt = smcibdev->ibdev->phys_port_cnt; + pr_warn_ratelimited("smc: adding ib device %s with port count %d\n", + smcibdev->ibdev->name, port_cnt); for (i = 0; i < min_t(size_t, port_cnt, SMC_MAX_PORTS); i++) { set_bit(i, &smcibdev->port_event_mask); /* determine pnetids of the port */ - smc_pnetid_by_dev_port(ibdev->dev.parent, i, - smcibdev->pnetid[i]); + if (smc_pnetid_by_dev_port(ibdev->dev.parent, i, + smcibdev->pnetid[i])) + smc_pnetid_by_table_ib(smcibdev, i + 1); + pr_warn_ratelimited("smc: ib device %s port %d has pnetid " + "%.16s%s\n", + smcibdev->ibdev->name, i + 1, + smcibdev->pnetid[i], + smcibdev->pnetid_by_user[i] ? + " (user defined)" : + ""); } schedule_work(&smcibdev->port_event_work); } @@ -597,6 +608,8 @@ static void smc_ib_remove_dev(struct ib_device *ibdev, void *client_data) spin_lock(&smc_ib_devices.lock); list_del_init(&smcibdev->list); /* remove from smc_ib_devices */ spin_unlock(&smc_ib_devices.lock); + pr_warn_ratelimited("smc: removing ib device %s\n", + smcibdev->ibdev->name); smc_smcr_terminate_all(smcibdev); smc_ib_cleanup_per_ibdev(smcibdev); ib_unregister_event_handler(&smcibdev->event_handler); diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index 5c2b115d36da..e6a696ae15f3 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -59,10 +59,10 @@ struct smc_link; int smc_ib_register_client(void) __init; void smc_ib_unregister_client(void); bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport); -int smc_ib_buf_map_sg(struct smc_ib_device *smcibdev, +int smc_ib_buf_map_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); -void smc_ib_buf_unmap_sg(struct smc_ib_device *smcibdev, +void smc_ib_buf_unmap_sg(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); void smc_ib_dealloc_protection_domain(struct smc_link *lnk); @@ -74,12 +74,12 @@ int smc_ib_modify_qp_rts(struct smc_link *lnk); int smc_ib_modify_qp_reset(struct smc_link *lnk); long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev); int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags, - struct smc_buf_desc *buf_slot); + struct smc_buf_desc *buf_slot, u8 link_idx); void smc_ib_put_memory_region(struct ib_mr *mr); -void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_cpu(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); -void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev, +void smc_ib_sync_sg_for_device(struct smc_link *lnk, struct smc_buf_desc *buf_slot, enum dma_data_direction data_direction); int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport, diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 5c4727d5066e..91f85fc09fb8 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -296,7 +296,8 @@ struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, device_initialize(&smcd->dev); dev_set_name(&smcd->dev, name); smcd->ops = ops; - smc_pnetid_by_dev_port(parent, 0, smcd->pnetid); + if (smc_pnetid_by_dev_port(parent, 0, smcd->pnetid)) + smc_pnetid_by_table_smcd(smcd); spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); @@ -320,12 +321,18 @@ int smcd_register_dev(struct smcd_dev *smcd) list_add_tail(&smcd->list, &smcd_dev_list.list); spin_unlock(&smcd_dev_list.lock); + pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", + dev_name(&smcd->dev), smcd->pnetid, + smcd->pnetid_by_user ? " (user defined)" : ""); + return device_add(&smcd->dev); } EXPORT_SYMBOL_GPL(smcd_register_dev); void smcd_unregister_dev(struct smcd_dev *smcd) { + pr_warn_ratelimited("smc: removing smcd device %s\n", + dev_name(&smcd->dev)); spin_lock(&smcd_dev_list.lock); list_del_init(&smcd->list); spin_unlock(&smcd_dev_list.lock); diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 0e52aab53d97..391237b601fe 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -17,6 +17,7 @@ #include "smc_core.h" #include "smc_clc.h" #include "smc_llc.h" +#include "smc_pnet.h" #define SMC_LLC_DATA_LEN 40 @@ -58,11 +59,34 @@ struct smc_llc_msg_add_link { /* type 0x02 */ u8 sender_gid[SMC_GID_SIZE]; u8 sender_qp_num[3]; u8 link_num; - u8 flags2; /* QP mtu */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 reserved3 : 4, + qp_mtu : 4; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 qp_mtu : 4, + reserved3 : 4; +#endif u8 initial_psn[3]; u8 reserved[8]; }; +struct smc_llc_msg_add_link_cont_rt { + __be32 rmb_key; + __be32 rmb_key_new; + __be64 rmb_vaddr_new; +}; + +#define SMC_LLC_RKEYS_PER_CONT_MSG 2 + +struct smc_llc_msg_add_link_cont { /* type 0x03 */ + struct smc_llc_hdr hd; + u8 link_num; + u8 num_rkeys; + u8 reserved2[2]; + struct smc_llc_msg_add_link_cont_rt rt[SMC_LLC_RKEYS_PER_CONT_MSG]; + u8 reserved[4]; +} __packed; /* format defined in RFC7609 */ + #define SMC_LLC_FLAG_DEL_LINK_ALL 0x40 #define SMC_LLC_FLAG_DEL_LINK_ORDERLY 0x20 @@ -98,13 +122,8 @@ struct smc_llc_msg_confirm_rkey { /* type 0x06 */ u8 reserved; }; -struct smc_llc_msg_confirm_rkey_cont { /* type 0x08 */ - struct smc_llc_hdr hd; - u8 num_rkeys; - struct smc_rmb_rtoken rtoken[SMC_LLC_RKEYS_PER_MSG]; -}; - #define SMC_LLC_DEL_RKEY_MAX 8 +#define SMC_LLC_FLAG_RKEY_RETRY 0x10 #define SMC_LLC_FLAG_RKEY_NEG 0x20 struct smc_llc_msg_delete_rkey { /* type 0x09 */ @@ -119,10 +138,10 @@ struct smc_llc_msg_delete_rkey { /* type 0x09 */ union smc_llc_msg { struct smc_llc_msg_confirm_link confirm_link; struct smc_llc_msg_add_link add_link; + struct smc_llc_msg_add_link_cont add_link_cont; struct smc_llc_msg_del_link delete_link; struct smc_llc_msg_confirm_rkey confirm_rkey; - struct smc_llc_msg_confirm_rkey_cont confirm_rkey_cont; struct smc_llc_msg_delete_rkey delete_rkey; struct smc_llc_msg_test_link test_link; @@ -134,6 +153,162 @@ union smc_llc_msg { #define SMC_LLC_FLAG_RESP 0x80 +struct smc_llc_qentry { + struct list_head list; + struct smc_link *link; + union smc_llc_msg msg; +}; + +static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc); + +struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow) +{ + struct smc_llc_qentry *qentry = flow->qentry; + + flow->qentry = NULL; + return qentry; +} + +void smc_llc_flow_qentry_del(struct smc_llc_flow *flow) +{ + struct smc_llc_qentry *qentry; + + if (flow->qentry) { + qentry = flow->qentry; + flow->qentry = NULL; + kfree(qentry); + } +} + +static inline void smc_llc_flow_qentry_set(struct smc_llc_flow *flow, + struct smc_llc_qentry *qentry) +{ + flow->qentry = qentry; +} + +/* try to start a new llc flow, initiated by an incoming llc msg */ +static bool smc_llc_flow_start(struct smc_llc_flow *flow, + struct smc_llc_qentry *qentry) +{ + struct smc_link_group *lgr = qentry->link->lgr; + + spin_lock_bh(&lgr->llc_flow_lock); + if (flow->type) { + /* a flow is already active */ + if ((qentry->msg.raw.hdr.common.type == SMC_LLC_ADD_LINK || + qentry->msg.raw.hdr.common.type == SMC_LLC_DELETE_LINK) && + !lgr->delayed_event) { + lgr->delayed_event = qentry; + } else { + /* forget this llc request */ + kfree(qentry); + } + spin_unlock_bh(&lgr->llc_flow_lock); + return false; + } + switch (qentry->msg.raw.hdr.common.type) { + case SMC_LLC_ADD_LINK: + flow->type = SMC_LLC_FLOW_ADD_LINK; + break; + case SMC_LLC_DELETE_LINK: + flow->type = SMC_LLC_FLOW_DEL_LINK; + break; + case SMC_LLC_CONFIRM_RKEY: + case SMC_LLC_DELETE_RKEY: + flow->type = SMC_LLC_FLOW_RKEY; + break; + default: + flow->type = SMC_LLC_FLOW_NONE; + } + if (qentry == lgr->delayed_event) + lgr->delayed_event = NULL; + spin_unlock_bh(&lgr->llc_flow_lock); + smc_llc_flow_qentry_set(flow, qentry); + return true; +} + +/* start a new local llc flow, wait till current flow finished */ +int smc_llc_flow_initiate(struct smc_link_group *lgr, + enum smc_llc_flowtype type) +{ + enum smc_llc_flowtype allowed_remote = SMC_LLC_FLOW_NONE; + int rc; + + /* all flows except confirm_rkey and delete_rkey are exclusive, + * confirm/delete rkey flows can run concurrently (local and remote) + */ + if (type == SMC_LLC_FLOW_RKEY) + allowed_remote = SMC_LLC_FLOW_RKEY; +again: + if (list_empty(&lgr->list)) + return -ENODEV; + spin_lock_bh(&lgr->llc_flow_lock); + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE && + (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE || + lgr->llc_flow_rmt.type == allowed_remote)) { + lgr->llc_flow_lcl.type = type; + spin_unlock_bh(&lgr->llc_flow_lock); + return 0; + } + spin_unlock_bh(&lgr->llc_flow_lock); + rc = wait_event_interruptible_timeout(lgr->llc_waiter, + (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_NONE && + (lgr->llc_flow_rmt.type == SMC_LLC_FLOW_NONE || + lgr->llc_flow_rmt.type == allowed_remote)), + SMC_LLC_WAIT_TIME); + if (!rc) + return -ETIMEDOUT; + goto again; +} + +/* finish the current llc flow */ +void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow) +{ + spin_lock_bh(&lgr->llc_flow_lock); + memset(flow, 0, sizeof(*flow)); + flow->type = SMC_LLC_FLOW_NONE; + spin_unlock_bh(&lgr->llc_flow_lock); + if (!list_empty(&lgr->list) && lgr->delayed_event && + flow == &lgr->llc_flow_lcl) + schedule_work(&lgr->llc_event_work); + else + wake_up_interruptible(&lgr->llc_waiter); +} + +/* lnk is optional and used for early wakeup when link goes down, useful in + * cases where we wait for a response on the link after we sent a request + */ +struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, + struct smc_link *lnk, + int time_out, u8 exp_msg) +{ + struct smc_llc_flow *flow = &lgr->llc_flow_lcl; + + wait_event_interruptible_timeout(lgr->llc_waiter, + (flow->qentry || + (lnk && !smc_link_usable(lnk)) || + list_empty(&lgr->list)), + time_out); + if (!flow->qentry || + (lnk && !smc_link_usable(lnk)) || list_empty(&lgr->list)) { + smc_llc_flow_qentry_del(flow); + goto out; + } + if (exp_msg && flow->qentry->msg.raw.hdr.common.type != exp_msg) { + if (exp_msg == SMC_LLC_ADD_LINK && + flow->qentry->msg.raw.hdr.common.type == + SMC_LLC_DELETE_LINK) { + /* flow_start will delay the unexpected msg */ + smc_llc_flow_start(&lgr->llc_flow_lcl, + smc_llc_flow_qentry_clr(flow)); + return NULL; + } + smc_llc_flow_qentry_del(flow); + } +out: + return flow->qentry; +} + /********************************** send *************************************/ struct smc_llc_tx_pend { @@ -186,7 +361,6 @@ static int smc_llc_add_pending_send(struct smc_link *link, int smc_llc_send_confirm_link(struct smc_link *link, enum smc_llc_reqresp reqresp) { - struct smc_link_group *lgr = smc_get_lgr(link); struct smc_llc_msg_confirm_link *confllc; struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; @@ -207,35 +381,52 @@ int smc_llc_send_confirm_link(struct smc_link *link, memcpy(confllc->sender_gid, link->gid, SMC_GID_SIZE); hton24(confllc->sender_qp_num, link->roce_qp->qp_num); confllc->link_num = link->link_id; - memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE); - confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* enforce peer resp. */ + memcpy(confllc->link_uid, link->link_uid, SMC_LGR_ID_SIZE); + confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS; /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; } /* send LLC confirm rkey request */ -static int smc_llc_send_confirm_rkey(struct smc_link *link, +static int smc_llc_send_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc) { struct smc_llc_msg_confirm_rkey *rkeyllc; struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; - int rc; + struct smc_link *link; + int i, rc, rtok_ix; - rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + rc = smc_llc_add_pending_send(send_link, &wr_buf, &pend); if (rc) return rc; rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf; memset(rkeyllc, 0, sizeof(*rkeyllc)); rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY; rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey); + + rtok_ix = 1; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + link = &send_link->lgr->lnk[i]; + if (link->state == SMC_LNK_ACTIVE && link != send_link) { + rkeyllc->rtoken[rtok_ix].link_id = link->link_id; + rkeyllc->rtoken[rtok_ix].rmb_key = + htonl(rmb_desc->mr_rx[link->link_idx]->rkey); + rkeyllc->rtoken[rtok_ix].rmb_vaddr = cpu_to_be64( + (u64)sg_dma_address( + rmb_desc->sgt[link->link_idx].sgl)); + rtok_ix++; + } + } + /* rkey of send_link is in rtoken[0] */ + rkeyllc->rtoken[0].num_rkeys = rtok_ix - 1; rkeyllc->rtoken[0].rmb_key = - htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + htonl(rmb_desc->mr_rx[send_link->link_idx]->rkey); rkeyllc->rtoken[0].rmb_vaddr = cpu_to_be64( - (u64)sg_dma_address(rmb_desc->sgt[SMC_SINGLE_LINK].sgl)); + (u64)sg_dma_address(rmb_desc->sgt[send_link->link_idx].sgl)); /* send llc message */ - rc = smc_wr_tx_send(link, pend); + rc = smc_wr_tx_send(send_link, pend); return rc; } @@ -256,32 +447,15 @@ static int smc_llc_send_delete_rkey(struct smc_link *link, rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY; rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey); rkeyllc->num_rkeys = 1; - rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey); + rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; } -/* prepare an add link message */ -static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc, - struct smc_link *link, u8 mac[], u8 gid[], - enum smc_llc_reqresp reqresp) -{ - memset(addllc, 0, sizeof(*addllc)); - addllc->hd.common.type = SMC_LLC_ADD_LINK; - addllc->hd.length = sizeof(struct smc_llc_msg_add_link); - if (reqresp == SMC_LLC_RESP) { - addllc->hd.flags |= SMC_LLC_FLAG_RESP; - /* always reject more links for now */ - addllc->hd.flags |= SMC_LLC_FLAG_ADD_LNK_REJ; - addllc->hd.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH; - } - memcpy(addllc->sender_mac, mac, ETH_ALEN); - memcpy(addllc->sender_gid, gid, SMC_GID_SIZE); -} - /* send ADD LINK request or response */ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], + struct smc_link *link_new, enum smc_llc_reqresp reqresp) { struct smc_llc_msg_add_link *addllc; @@ -293,32 +467,33 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], if (rc) return rc; addllc = (struct smc_llc_msg_add_link *)wr_buf; - smc_llc_prep_add_link(addllc, link, mac, gid, reqresp); + + memset(addllc, 0, sizeof(*addllc)); + addllc->hd.common.type = SMC_LLC_ADD_LINK; + addllc->hd.length = sizeof(struct smc_llc_msg_add_link); + if (reqresp == SMC_LLC_RESP) + addllc->hd.flags |= SMC_LLC_FLAG_RESP; + memcpy(addllc->sender_mac, mac, ETH_ALEN); + memcpy(addllc->sender_gid, gid, SMC_GID_SIZE); + if (link_new) { + addllc->link_num = link_new->link_id; + hton24(addllc->sender_qp_num, link_new->roce_qp->qp_num); + hton24(addllc->initial_psn, link_new->psn_initial); + if (reqresp == SMC_LLC_REQ) + addllc->qp_mtu = link_new->path_mtu; + else + addllc->qp_mtu = min(link_new->path_mtu, + link_new->peer_mtu); + } /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; } -/* prepare a delete link message */ -static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc, - struct smc_link *link, - enum smc_llc_reqresp reqresp, bool orderly) -{ - memset(delllc, 0, sizeof(*delllc)); - delllc->hd.common.type = SMC_LLC_DELETE_LINK; - delllc->hd.length = sizeof(struct smc_llc_msg_add_link); - if (reqresp == SMC_LLC_RESP) - delllc->hd.flags |= SMC_LLC_FLAG_RESP; - /* DEL_LINK_ALL because only 1 link supported */ - delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; - if (orderly) - delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; - delllc->link_num = link->link_id; -} - /* send DELETE LINK request or response */ -int smc_llc_send_delete_link(struct smc_link *link, - enum smc_llc_reqresp reqresp, bool orderly) +int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, + enum smc_llc_reqresp reqresp, bool orderly, + u32 reason) { struct smc_llc_msg_del_link *delllc; struct smc_wr_tx_pend_priv *pend; @@ -329,7 +504,19 @@ int smc_llc_send_delete_link(struct smc_link *link, if (rc) return rc; delllc = (struct smc_llc_msg_del_link *)wr_buf; - smc_llc_prep_delete_link(delllc, link, reqresp, orderly); + + memset(delllc, 0, sizeof(*delllc)); + delllc->hd.common.type = SMC_LLC_DELETE_LINK; + delllc->hd.length = sizeof(struct smc_llc_msg_del_link); + if (reqresp == SMC_LLC_RESP) + delllc->hd.flags |= SMC_LLC_FLAG_RESP; + if (orderly) + delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + if (link_del_id) + delllc->link_num = link_del_id; + else + delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; + delllc->reason = htonl(reason); /* send llc message */ rc = smc_wr_tx_send(link, pend); return rc; @@ -356,238 +543,1094 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16]) return rc; } -struct smc_llc_send_work { - struct work_struct work; - struct smc_link *link; - int llclen; - union smc_llc_msg llcbuf; -}; - -/* worker that sends a prepared message */ -static void smc_llc_send_message_work(struct work_struct *work) +/* schedule an llc send on link, may wait for buffers */ +static int smc_llc_send_message(struct smc_link *link, void *llcbuf) { - struct smc_llc_send_work *llcwrk = container_of(work, - struct smc_llc_send_work, work); struct smc_wr_tx_pend_priv *pend; struct smc_wr_buf *wr_buf; int rc; - if (llcwrk->link->state == SMC_LNK_INACTIVE) - goto out; - rc = smc_llc_add_pending_send(llcwrk->link, &wr_buf, &pend); + if (!smc_link_usable(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); if (rc) - goto out; - memcpy(wr_buf, &llcwrk->llcbuf, llcwrk->llclen); - smc_wr_tx_send(llcwrk->link, pend); -out: - kfree(llcwrk); + return rc; + memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); + return smc_wr_tx_send(link, pend); } -/* copy llcbuf and schedule an llc send on link */ -static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen) +/* schedule an llc send on link, may wait for buffers, + * and wait for send completion notification. + * @return 0 on success + */ +static int smc_llc_send_message_wait(struct smc_link *link, void *llcbuf) { - struct smc_llc_send_work *wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + int rc; - if (!wrk) - return -ENOMEM; - INIT_WORK(&wrk->work, smc_llc_send_message_work); - wrk->link = link; - wrk->llclen = llclen; - memcpy(&wrk->llcbuf, llcbuf, llclen); - queue_work(link->llc_wq, &wrk->work); - return 0; + if (!smc_link_usable(link)) + return -ENOLINK; + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg)); + return smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME); } /********************************* receive ***********************************/ -static void smc_llc_rx_confirm_link(struct smc_link *link, - struct smc_llc_msg_confirm_link *llc) +static int smc_llc_alloc_alt_link(struct smc_link_group *lgr, + enum smc_lgr_type lgr_new_t) { - struct smc_link_group *lgr = smc_get_lgr(link); - int conf_rc; + int i; + + if (lgr->type == SMC_LGR_SYMMETRIC || + (lgr->type != SMC_LGR_SINGLE && + (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER))) + return -EMLINK; + + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) { + for (i = SMC_LINKS_PER_LGR_MAX - 1; i >= 0; i--) + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + return i; + } else { + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (lgr->lnk[i].state == SMC_LNK_UNUSED) + return i; + } + return -EMLINK; +} - /* RMBE eyecatchers are not supported */ - if (llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC) - conf_rc = 0; - else - conf_rc = ENOTSUPP; +/* return first buffer from any of the next buf lists */ +static struct smc_buf_desc *_smc_llc_get_next_rmb(struct smc_link_group *lgr, + int *buf_lst) +{ + struct smc_buf_desc *buf_pos; + + while (*buf_lst < SMC_RMBE_SIZES) { + buf_pos = list_first_entry_or_null(&lgr->rmbs[*buf_lst], + struct smc_buf_desc, list); + if (buf_pos) + return buf_pos; + (*buf_lst)++; + } + return NULL; +} + +/* return next rmb from buffer lists */ +static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr, + int *buf_lst, + struct smc_buf_desc *buf_pos) +{ + struct smc_buf_desc *buf_next; + + if (!buf_pos || list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) { + (*buf_lst)++; + return _smc_llc_get_next_rmb(lgr, buf_lst); + } + buf_next = list_next_entry(buf_pos, list); + return buf_next; +} + +static struct smc_buf_desc *smc_llc_get_first_rmb(struct smc_link_group *lgr, + int *buf_lst) +{ + *buf_lst = 0; + return smc_llc_get_next_rmb(lgr, buf_lst, NULL); +} + +/* send one add_link_continue msg */ +static int smc_llc_add_link_cont(struct smc_link *link, + struct smc_link *link_new, u8 *num_rkeys_todo, + int *buf_lst, struct smc_buf_desc **buf_pos) +{ + struct smc_llc_msg_add_link_cont *addc_llc; + struct smc_link_group *lgr = link->lgr; + int prim_lnk_idx, lnk_idx, i, rc; + struct smc_wr_tx_pend_priv *pend; + struct smc_wr_buf *wr_buf; + struct smc_buf_desc *rmb; + u8 n; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (lgr->role == SMC_SERV && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_resp_rc = conf_rc; - complete(&link->llc_confirm_resp); + rc = smc_llc_add_pending_send(link, &wr_buf, &pend); + if (rc) + return rc; + addc_llc = (struct smc_llc_msg_add_link_cont *)wr_buf; + memset(addc_llc, 0, sizeof(*addc_llc)); + + prim_lnk_idx = link->link_idx; + lnk_idx = link_new->link_idx; + addc_llc->link_num = link_new->link_id; + addc_llc->num_rkeys = *num_rkeys_todo; + n = *num_rkeys_todo; + for (i = 0; i < min_t(u8, n, SMC_LLC_RKEYS_PER_CONT_MSG); i++) { + if (!*buf_pos) { + addc_llc->num_rkeys = addc_llc->num_rkeys - + *num_rkeys_todo; + *num_rkeys_todo = 0; + break; } - } else { - if (lgr->role == SMC_CLNT && - link->state == SMC_LNK_ACTIVATING) { - link->llc_confirm_rc = conf_rc; - link->link_id = llc->link_num; - complete(&link->llc_confirm); + rmb = *buf_pos; + + addc_llc->rt[i].rmb_key = htonl(rmb->mr_rx[prim_lnk_idx]->rkey); + addc_llc->rt[i].rmb_key_new = htonl(rmb->mr_rx[lnk_idx]->rkey); + addc_llc->rt[i].rmb_vaddr_new = + cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl)); + + (*num_rkeys_todo)--; + *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos); + while (*buf_pos && !(*buf_pos)->used) + *buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos); + } + addc_llc->hd.common.type = SMC_LLC_ADD_LINK_CONT; + addc_llc->hd.length = sizeof(struct smc_llc_msg_add_link_cont); + if (lgr->role == SMC_CLNT) + addc_llc->hd.flags |= SMC_LLC_FLAG_RESP; + return smc_wr_tx_send(link, pend); +} + +static int smc_llc_cli_rkey_exchange(struct smc_link *link, + struct smc_link *link_new) +{ + struct smc_llc_msg_add_link_cont *addc_llc; + struct smc_link_group *lgr = link->lgr; + u8 max, num_rkeys_send, num_rkeys_recv; + struct smc_llc_qentry *qentry; + struct smc_buf_desc *buf_pos; + int buf_lst; + int rc = 0; + int i; + + mutex_lock(&lgr->rmbs_lock); + num_rkeys_send = lgr->conns_num; + buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); + do { + qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK_CONT); + if (!qentry) { + rc = -ETIMEDOUT; + break; + } + addc_llc = &qentry->msg.add_link_cont; + num_rkeys_recv = addc_llc->num_rkeys; + max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG); + for (i = 0; i < max; i++) { + smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, + addc_llc->rt[i].rmb_key, + addc_llc->rt[i].rmb_vaddr_new, + addc_llc->rt[i].rmb_key_new); + num_rkeys_recv--; } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + rc = smc_llc_add_link_cont(link, link_new, &num_rkeys_send, + &buf_lst, &buf_pos); + if (rc) + break; + } while (num_rkeys_send || num_rkeys_recv); + + mutex_unlock(&lgr->rmbs_lock); + return rc; +} + +/* prepare and send an add link reject response */ +static int smc_llc_cli_add_link_reject(struct smc_llc_qentry *qentry) +{ + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_ADD_LNK_REJ; + qentry->msg.raw.hdr.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH; + return smc_llc_send_message(qentry->link, &qentry->msg); +} + +static int smc_llc_cli_conf_link(struct smc_link *link, + struct smc_init_info *ini, + struct smc_link *link_new, + enum smc_lgr_type lgr_new_t) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_llc_qentry *qentry = NULL; + int rc = 0; + + /* receive CONFIRM LINK request over RoCE fabric */ + qentry = smc_llc_wait(lgr, NULL, SMC_LLC_WAIT_FIRST_TIME, 0); + if (!qentry) { + rc = smc_llc_send_delete_link(link, link_new->link_id, + SMC_LLC_REQ, false, + SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + if (qentry->msg.raw.hdr.common.type != SMC_LLC_CONFIRM_LINK) { + /* received DELETE_LINK instead */ + qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return -ENOLINK; + } + smc_llc_save_peer_uid(qentry); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + + rc = smc_ib_modify_qp_rts(link_new); + if (rc) { + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; } + smc_wr_remember_qp_attr(link_new); + + rc = smcr_buf_reg_lgr(link_new); + if (rc) { + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + + /* send CONFIRM LINK response over RoCE fabric */ + rc = smc_llc_send_confirm_link(link_new, SMC_LLC_RESP); + if (rc) { + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; + } + smc_llc_link_active(link_new); + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) + smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx); + else + smcr_lgr_set_type(lgr, lgr_new_t); + return 0; } -static void smc_llc_rx_add_link(struct smc_link *link, - struct smc_llc_msg_add_link *llc) +static void smc_llc_save_add_link_info(struct smc_link *link, + struct smc_llc_msg_add_link *add_llc) { + link->peer_qpn = ntoh24(add_llc->sender_qp_num); + memcpy(link->peer_gid, add_llc->sender_gid, SMC_GID_SIZE); + memcpy(link->peer_mac, add_llc->sender_mac, ETH_ALEN); + link->peer_psn = ntoh24(add_llc->initial_psn); + link->peer_mtu = add_llc->qp_mtu; +} + +/* as an SMC client, process an add link request */ +int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) +{ + struct smc_llc_msg_add_link *llc = &qentry->msg.add_link; + enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; struct smc_link_group *lgr = smc_get_lgr(link); + struct smc_link *lnk_new = NULL; + struct smc_init_info ini; + int lnk_idx, rc = 0; + + ini.vlan_id = lgr->vlan_id; + smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev); + if (!memcmp(llc->sender_gid, link->peer_gid, SMC_GID_SIZE) && + !memcmp(llc->sender_mac, link->peer_mac, ETH_ALEN)) { + if (!ini.ib_dev) + goto out_reject; + lgr_new_t = SMC_LGR_ASYMMETRIC_PEER; + } + if (!ini.ib_dev) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini.ib_dev = link->smcibdev; + ini.ib_port = link->ibport; + } + lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t); + if (lnk_idx < 0) + goto out_reject; + lnk_new = &lgr->lnk[lnk_idx]; + rc = smcr_link_init(lgr, lnk_new, lnk_idx, &ini); + if (rc) + goto out_reject; + smc_llc_save_add_link_info(lnk_new, llc); + lnk_new->link_id = llc->link_num; /* SMC server assigns link id */ + smc_llc_link_set_uid(lnk_new); - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (link->state == SMC_LNK_ACTIVATING) - complete(&link->llc_add_resp); - } else { - if (link->state == SMC_LNK_ACTIVATING) { - complete(&link->llc_add); - return; - } + rc = smc_ib_ready_link(lnk_new); + if (rc) + goto out_clear_lnk; - if (lgr->role == SMC_SERV) { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_REQ); + rc = smcr_buf_map_lgr(lnk_new); + if (rc) + goto out_clear_lnk; - } else { - smc_llc_prep_add_link(llc, link, - link->smcibdev->mac[link->ibport - 1], - link->gid, SMC_LLC_RESP); + rc = smc_llc_send_add_link(link, + lnk_new->smcibdev->mac[ini.ib_port - 1], + lnk_new->gid, lnk_new, SMC_LLC_RESP); + if (rc) + goto out_clear_lnk; + rc = smc_llc_cli_rkey_exchange(link, lnk_new); + if (rc) { + rc = 0; + goto out_clear_lnk; + } + rc = smc_llc_cli_conf_link(link, &ini, lnk_new, lgr_new_t); + if (!rc) + goto out; +out_clear_lnk: + smcr_link_clear(lnk_new, false); +out_reject: + smc_llc_cli_add_link_reject(qentry); +out: + kfree(qentry); + return rc; +} + +static void smc_llc_process_cli_add_link(struct smc_link_group *lgr) +{ + struct smc_llc_qentry *qentry; + + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + + mutex_lock(&lgr->llc_conf_mutex); + smc_llc_cli_add_link(qentry->link, qentry); + mutex_unlock(&lgr->llc_conf_mutex); +} + +static int smc_llc_active_link_count(struct smc_link_group *lgr) +{ + int i, link_count = 0; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&lgr->lnk[i])) + continue; + link_count++; + } + return link_count; +} + +/* find the asymmetric link when 3 links are established */ +static struct smc_link *smc_llc_find_asym_link(struct smc_link_group *lgr) +{ + int asym_idx = -ENOENT; + int i, j, k; + bool found; + + /* determine asymmetric link */ + found = false; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + for (j = i + 1; j < SMC_LINKS_PER_LGR_MAX; j++) { + if (!smc_link_usable(&lgr->lnk[i]) || + !smc_link_usable(&lgr->lnk[j])) + continue; + if (!memcmp(lgr->lnk[i].gid, lgr->lnk[j].gid, + SMC_GID_SIZE)) { + found = true; /* asym_lnk is i or j */ + break; + } } - smc_llc_send_message(link, llc, sizeof(*llc)); + if (found) + break; } + if (!found) + goto out; /* no asymmetric link */ + for (k = 0; k < SMC_LINKS_PER_LGR_MAX; k++) { + if (!smc_link_usable(&lgr->lnk[k])) + continue; + if (k != i && + !memcmp(lgr->lnk[i].peer_gid, lgr->lnk[k].peer_gid, + SMC_GID_SIZE)) { + asym_idx = i; + break; + } + if (k != j && + !memcmp(lgr->lnk[j].peer_gid, lgr->lnk[k].peer_gid, + SMC_GID_SIZE)) { + asym_idx = j; + break; + } + } +out: + return (asym_idx < 0) ? NULL : &lgr->lnk[asym_idx]; } -static void smc_llc_rx_delete_link(struct smc_link *link, - struct smc_llc_msg_del_link *llc) +static void smc_llc_delete_asym_link(struct smc_link_group *lgr) { - struct smc_link_group *lgr = smc_get_lgr(link); + struct smc_link *lnk_new = NULL, *lnk_asym; + struct smc_llc_qentry *qentry; + int rc; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (lgr->role == SMC_SERV) - smc_lgr_schedule_free_work_fast(lgr); - } else { - smc_lgr_forget(lgr); - smc_llc_link_deleting(link); - if (lgr->role == SMC_SERV) { - /* client asks to delete this link, send request */ - smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true); - } else { - /* server requests to delete this link, send response */ - smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true); + lnk_asym = smc_llc_find_asym_link(lgr); + if (!lnk_asym) + return; /* no asymmetric link */ + if (!smc_link_downing(&lnk_asym->state)) + return; + lnk_new = smc_switch_conns(lgr, lnk_asym, false); + smc_wr_tx_wait_no_pending_sends(lnk_asym); + if (!lnk_new) + goto out_free; + /* change flow type from ADD_LINK into DEL_LINK */ + lgr->llc_flow_lcl.type = SMC_LLC_FLOW_DEL_LINK; + rc = smc_llc_send_delete_link(lnk_new, lnk_asym->link_id, SMC_LLC_REQ, + true, SMC_LLC_DEL_NO_ASYM_NEEDED); + if (rc) { + smcr_link_down_cond(lnk_new); + goto out_free; + } + qentry = smc_llc_wait(lgr, lnk_new, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_LINK); + if (!qentry) { + smcr_link_down_cond(lnk_new); + goto out_free; + } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); +out_free: + smcr_link_clear(lnk_asym, true); +} + +static int smc_llc_srv_rkey_exchange(struct smc_link *link, + struct smc_link *link_new) +{ + struct smc_llc_msg_add_link_cont *addc_llc; + struct smc_link_group *lgr = link->lgr; + u8 max, num_rkeys_send, num_rkeys_recv; + struct smc_llc_qentry *qentry = NULL; + struct smc_buf_desc *buf_pos; + int buf_lst; + int rc = 0; + int i; + + mutex_lock(&lgr->rmbs_lock); + num_rkeys_send = lgr->conns_num; + buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst); + do { + smc_llc_add_link_cont(link, link_new, &num_rkeys_send, + &buf_lst, &buf_pos); + qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME, + SMC_LLC_ADD_LINK_CONT); + if (!qentry) { + rc = -ETIMEDOUT; + goto out; } - smc_llc_send_message(link, llc, sizeof(*llc)); - smc_lgr_terminate_sched(lgr); + addc_llc = &qentry->msg.add_link_cont; + num_rkeys_recv = addc_llc->num_rkeys; + max = min_t(u8, num_rkeys_recv, SMC_LLC_RKEYS_PER_CONT_MSG); + for (i = 0; i < max; i++) { + smc_rtoken_set(lgr, link->link_idx, link_new->link_idx, + addc_llc->rt[i].rmb_key, + addc_llc->rt[i].rmb_vaddr_new, + addc_llc->rt[i].rmb_key_new); + num_rkeys_recv--; + } + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + } while (num_rkeys_send || num_rkeys_recv); +out: + mutex_unlock(&lgr->rmbs_lock); + return rc; +} + +static int smc_llc_srv_conf_link(struct smc_link *link, + struct smc_link *link_new, + enum smc_lgr_type lgr_new_t) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_llc_qentry *qentry = NULL; + int rc; + + /* send CONFIRM LINK request over the RoCE fabric */ + rc = smc_llc_send_confirm_link(link_new, SMC_LLC_REQ); + if (rc) + return -ENOLINK; + /* receive CONFIRM LINK response over the RoCE fabric */ + qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_FIRST_TIME, + SMC_LLC_CONFIRM_LINK); + if (!qentry) { + /* send DELETE LINK */ + smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ, + false, SMC_LLC_DEL_LOST_PATH); + return -ENOLINK; } + smc_llc_save_peer_uid(qentry); + smc_llc_link_active(link_new); + if (lgr_new_t == SMC_LGR_ASYMMETRIC_LOCAL || + lgr_new_t == SMC_LGR_ASYMMETRIC_PEER) + smcr_lgr_set_type_asym(lgr, lgr_new_t, link_new->link_idx); + else + smcr_lgr_set_type(lgr, lgr_new_t); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return 0; } -static void smc_llc_rx_test_link(struct smc_link *link, - struct smc_llc_msg_test_link *llc) +int smc_llc_srv_add_link(struct smc_link *link) { - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - if (link->state == SMC_LNK_ACTIVE) - complete(&link->llc_testlink_resp); - } else { - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC; + struct smc_link_group *lgr = link->lgr; + struct smc_llc_msg_add_link *add_llc; + struct smc_llc_qentry *qentry = NULL; + struct smc_link *link_new; + struct smc_init_info ini; + int lnk_idx, rc = 0; + + /* ignore client add link recommendation, start new flow */ + ini.vlan_id = lgr->vlan_id; + smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev); + if (!ini.ib_dev) { + lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL; + ini.ib_dev = link->smcibdev; + ini.ib_port = link->ibport; + } + lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t); + if (lnk_idx < 0) + return 0; + + rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, &ini); + if (rc) + return rc; + link_new = &lgr->lnk[lnk_idx]; + rc = smc_llc_send_add_link(link, + link_new->smcibdev->mac[ini.ib_port - 1], + link_new->gid, link_new, SMC_LLC_REQ); + if (rc) + goto out_err; + /* receive ADD LINK response over the RoCE fabric */ + qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME, SMC_LLC_ADD_LINK); + if (!qentry) { + rc = -ETIMEDOUT; + goto out_err; } + add_llc = &qentry->msg.add_link; + if (add_llc->hd.flags & SMC_LLC_FLAG_ADD_LNK_REJ) { + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + rc = -ENOLINK; + goto out_err; + } + if (lgr->type == SMC_LGR_SINGLE && + (!memcmp(add_llc->sender_gid, link->peer_gid, SMC_GID_SIZE) && + !memcmp(add_llc->sender_mac, link->peer_mac, ETH_ALEN))) { + lgr_new_t = SMC_LGR_ASYMMETRIC_PEER; + } + smc_llc_save_add_link_info(link_new, add_llc); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + + rc = smc_ib_ready_link(link_new); + if (rc) + goto out_err; + rc = smcr_buf_map_lgr(link_new); + if (rc) + goto out_err; + rc = smcr_buf_reg_lgr(link_new); + if (rc) + goto out_err; + rc = smc_llc_srv_rkey_exchange(link, link_new); + if (rc) + goto out_err; + rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t); + if (rc) + goto out_err; + return 0; +out_err: + smcr_link_clear(link_new, false); + return rc; } -static void smc_llc_rx_confirm_rkey(struct smc_link *link, - struct smc_llc_msg_confirm_rkey *llc) +static void smc_llc_process_srv_add_link(struct smc_link_group *lgr) { + struct smc_link *link = lgr->llc_flow_lcl.qentry->link; int rc; - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - link->llc_confirm_rkey_rc = llc->hd.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_confirm_rkey); - } else { - rc = smc_rtoken_add(smc_get_lgr(link), - llc->rtoken[0].rmb_vaddr, - llc->rtoken[0].rmb_key); + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); - /* ignore rtokens for other links, we have only one link */ + mutex_lock(&lgr->llc_conf_mutex); + rc = smc_llc_srv_add_link(link); + if (!rc && lgr->type == SMC_LGR_SYMMETRIC) { + /* delete any asymmetric link */ + smc_llc_delete_asym_link(lgr); + } + mutex_unlock(&lgr->llc_conf_mutex); +} + +/* enqueue a local add_link req to trigger a new add_link flow, only as SERV */ +void smc_llc_srv_add_link_local(struct smc_link *link) +{ + struct smc_llc_msg_add_link add_llc = {0}; + + add_llc.hd.length = sizeof(add_llc); + add_llc.hd.common.type = SMC_LLC_ADD_LINK; + /* no dev and port needed, we as server ignore client data anyway */ + smc_llc_enqueue(link, (union smc_llc_msg *)&add_llc); +} + +/* worker to process an add link message */ +static void smc_llc_add_link_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_add_link_work); - llc->hd.flags |= SMC_LLC_FLAG_RESP; - if (rc < 0) - llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - smc_llc_send_message(link, llc, sizeof(*llc)); + if (list_empty(&lgr->list)) { + /* link group is terminating */ + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + goto out; } + + if (lgr->role == SMC_CLNT) + smc_llc_process_cli_add_link(lgr); + else + smc_llc_process_srv_add_link(lgr); +out: + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); } -static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link, - struct smc_llc_msg_confirm_rkey_cont *llc) +/* enqueue a local del_link msg to trigger a new del_link flow, + * called only for role SMC_SERV + */ +void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id) { - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - /* unused as long as we don't send this type of msg */ - } else { - /* ignore rtokens for other links, we have only one link */ - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + struct smc_llc_msg_del_link del_llc = {0}; + + del_llc.hd.length = sizeof(del_llc); + del_llc.hd.common.type = SMC_LLC_DELETE_LINK; + del_llc.link_num = del_link_id; + del_llc.reason = htonl(SMC_LLC_DEL_LOST_PATH); + del_llc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + smc_llc_enqueue(link, (union smc_llc_msg *)&del_llc); +} + +static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) +{ + struct smc_link *lnk_del = NULL, *lnk_asym, *lnk; + struct smc_llc_msg_del_link *del_llc; + struct smc_llc_qentry *qentry; + int active_links; + int lnk_idx; + + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + lnk = qentry->link; + del_llc = &qentry->msg.delete_link; + + if (del_llc->hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) { + smc_lgr_terminate_sched(lgr); + goto out; + } + mutex_lock(&lgr->llc_conf_mutex); + /* delete single link */ + for (lnk_idx = 0; lnk_idx < SMC_LINKS_PER_LGR_MAX; lnk_idx++) { + if (lgr->lnk[lnk_idx].link_id != del_llc->link_num) + continue; + lnk_del = &lgr->lnk[lnk_idx]; + break; + } + del_llc->hd.flags |= SMC_LLC_FLAG_RESP; + if (!lnk_del) { + /* link was not found */ + del_llc->reason = htonl(SMC_LLC_DEL_NOLNK); + smc_llc_send_message(lnk, &qentry->msg); + goto out_unlock; + } + lnk_asym = smc_llc_find_asym_link(lgr); + + del_llc->reason = 0; + smc_llc_send_message(lnk, &qentry->msg); /* response */ + + if (smc_link_downing(&lnk_del->state)) { + smc_switch_conns(lgr, lnk_del, false); + smc_wr_tx_wait_no_pending_sends(lnk_del); + } + smcr_link_clear(lnk_del, true); + + active_links = smc_llc_active_link_count(lgr); + if (lnk_del == lnk_asym) { + /* expected deletion of asym link, don't change lgr state */ + } else if (active_links == 1) { + smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); + } else if (!active_links) { + smcr_lgr_set_type(lgr, SMC_LGR_NONE); + smc_lgr_terminate_sched(lgr); } +out_unlock: + mutex_unlock(&lgr->llc_conf_mutex); +out: + kfree(qentry); } -static void smc_llc_rx_delete_rkey(struct smc_link *link, - struct smc_llc_msg_delete_rkey *llc) +/* try to send a DELETE LINK ALL request on any active link, + * waiting for send completion + */ +void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn) { - u8 err_mask = 0; - int i, max; + struct smc_llc_msg_del_link delllc = {0}; + int i; + + delllc.hd.common.type = SMC_LLC_DELETE_LINK; + delllc.hd.length = sizeof(delllc); + if (ord) + delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY; + delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL; + delllc.reason = htonl(rsn); + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (!smc_link_usable(&lgr->lnk[i])) + continue; + if (!smc_llc_send_message_wait(&lgr->lnk[i], &delllc)) + break; + } +} - if (llc->hd.flags & SMC_LLC_FLAG_RESP) { - link->llc_delete_rkey_rc = llc->hd.flags & - SMC_LLC_FLAG_RKEY_NEG; - complete(&link->llc_delete_rkey); - } else { - max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); - for (i = 0; i < max; i++) { - if (smc_rtoken_delete(smc_get_lgr(link), llc->rkey[i])) - err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); +static void smc_llc_process_srv_delete_link(struct smc_link_group *lgr) +{ + struct smc_llc_msg_del_link *del_llc; + struct smc_link *lnk, *lnk_del; + struct smc_llc_qentry *qentry; + int active_links; + int i; + + mutex_lock(&lgr->llc_conf_mutex); + qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl); + lnk = qentry->link; + del_llc = &qentry->msg.delete_link; + + if (qentry->msg.delete_link.hd.flags & SMC_LLC_FLAG_DEL_LINK_ALL) { + /* delete entire lgr */ + smc_llc_send_link_delete_all(lgr, true, ntohl( + qentry->msg.delete_link.reason)); + smc_lgr_terminate_sched(lgr); + goto out; + } + /* delete single link */ + lnk_del = NULL; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { + if (lgr->lnk[i].link_id == del_llc->link_num) { + lnk_del = &lgr->lnk[i]; + break; } + } + if (!lnk_del) + goto out; /* asymmetric link already deleted */ - if (err_mask) { - llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; - llc->err_mask = err_mask; + if (smc_link_downing(&lnk_del->state)) { + smc_switch_conns(lgr, lnk_del, false); + smc_wr_tx_wait_no_pending_sends(lnk_del); + } + if (!list_empty(&lgr->list)) { + /* qentry is either a request from peer (send it back to + * initiate the DELETE_LINK processing), or a locally + * enqueued DELETE_LINK request (forward it) + */ + if (!smc_llc_send_message(lnk, &qentry->msg)) { + struct smc_llc_qentry *qentry2; + + qentry2 = smc_llc_wait(lgr, lnk, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_LINK); + if (qentry2) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); } + } + smcr_link_clear(lnk_del, true); - llc->hd.flags |= SMC_LLC_FLAG_RESP; - smc_llc_send_message(link, llc, sizeof(*llc)); + active_links = smc_llc_active_link_count(lgr); + if (active_links == 1) { + smcr_lgr_set_type(lgr, SMC_LGR_SINGLE); + } else if (!active_links) { + smcr_lgr_set_type(lgr, SMC_LGR_NONE); + smc_lgr_terminate_sched(lgr); } + + if (lgr->type == SMC_LGR_SINGLE && !list_empty(&lgr->list)) { + /* trigger setup of asymm alt link */ + smc_llc_srv_add_link_local(lnk); + } +out: + mutex_unlock(&lgr->llc_conf_mutex); + kfree(qentry); } -static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +static void smc_llc_delete_link_work(struct work_struct *work) { - struct smc_link *link = (struct smc_link *)wc->qp->qp_context; - union smc_llc_msg *llc = buf; + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_del_link_work); - if (wc->byte_len < sizeof(*llc)) - return; /* short message */ - if (llc->raw.hdr.length != sizeof(*llc)) - return; /* invalid message */ - if (link->state == SMC_LNK_INACTIVE) - return; /* link not active, drop msg */ + if (list_empty(&lgr->list)) { + /* link group is terminating */ + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + goto out; + } + + if (lgr->role == SMC_CLNT) + smc_llc_process_cli_delete_link(lgr); + else + smc_llc_process_srv_delete_link(lgr); +out: + smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl); +} + +/* process a confirm_rkey request from peer, remote flow */ +static void smc_llc_rmt_conf_rkey(struct smc_link_group *lgr) +{ + struct smc_llc_msg_confirm_rkey *llc; + struct smc_llc_qentry *qentry; + struct smc_link *link; + int num_entries; + int rk_idx; + int i; + + qentry = lgr->llc_flow_rmt.qentry; + llc = &qentry->msg.confirm_rkey; + link = qentry->link; + + num_entries = llc->rtoken[0].num_rkeys; + /* first rkey entry is for receiving link */ + rk_idx = smc_rtoken_add(link, + llc->rtoken[0].rmb_vaddr, + llc->rtoken[0].rmb_key); + if (rk_idx < 0) + goto out_err; + + for (i = 1; i <= min_t(u8, num_entries, SMC_LLC_RKEYS_PER_MSG - 1); i++) + smc_rtoken_set2(lgr, rk_idx, llc->rtoken[i].link_id, + llc->rtoken[i].rmb_vaddr, + llc->rtoken[i].rmb_key); + /* max links is 3 so there is no need to support conf_rkey_cont msgs */ + goto out; +out_err: + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->hd.flags |= SMC_LLC_FLAG_RKEY_RETRY; +out: + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); +} + +/* process a delete_rkey request from peer, remote flow */ +static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr) +{ + struct smc_llc_msg_delete_rkey *llc; + struct smc_llc_qentry *qentry; + struct smc_link *link; + u8 err_mask = 0; + int i, max; + + qentry = lgr->llc_flow_rmt.qentry; + llc = &qentry->msg.delete_rkey; + link = qentry->link; + + max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX); + for (i = 0; i < max; i++) { + if (smc_rtoken_delete(link, llc->rkey[i])) + err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i); + } + if (err_mask) { + llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG; + llc->err_mask = err_mask; + } + llc->hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, &qentry->msg); + smc_llc_flow_qentry_del(&lgr->llc_flow_rmt); +} + +static void smc_llc_protocol_violation(struct smc_link_group *lgr, u8 type) +{ + pr_warn_ratelimited("smc: SMC-R lg %*phN LLC protocol violation: " + "llc_type %d\n", SMC_LGR_ID_SIZE, &lgr->id, type); + smc_llc_set_termination_rsn(lgr, SMC_LLC_DEL_PROT_VIOL); + smc_lgr_terminate_sched(lgr); +} + +/* flush the llc event queue */ +static void smc_llc_event_flush(struct smc_link_group *lgr) +{ + struct smc_llc_qentry *qentry, *q; + + spin_lock_bh(&lgr->llc_event_q_lock); + list_for_each_entry_safe(qentry, q, &lgr->llc_event_q, list) { + list_del_init(&qentry->list); + kfree(qentry); + } + spin_unlock_bh(&lgr->llc_event_q_lock); +} + +static void smc_llc_event_handler(struct smc_llc_qentry *qentry) +{ + union smc_llc_msg *llc = &qentry->msg; + struct smc_link *link = qentry->link; + struct smc_link_group *lgr = link->lgr; + + if (!smc_link_usable(link)) + goto out; switch (llc->raw.hdr.common.type) { case SMC_LLC_TEST_LINK: - smc_llc_rx_test_link(link, &llc->test_link); - break; - case SMC_LLC_CONFIRM_LINK: - smc_llc_rx_confirm_link(link, &llc->confirm_link); + llc->test_link.hd.flags |= SMC_LLC_FLAG_RESP; + smc_llc_send_message(link, llc); break; case SMC_LLC_ADD_LINK: - smc_llc_rx_add_link(link, &llc->add_link); + if (list_empty(&lgr->list)) + goto out; /* lgr is terminating */ + if (lgr->role == SMC_CLNT) { + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK) { + /* a flow is waiting for this message */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + wake_up_interruptible(&lgr->llc_waiter); + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, + qentry)) { + schedule_work(&lgr->llc_add_link_work); + } + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) { + /* as smc server, handle client suggestion */ + schedule_work(&lgr->llc_add_link_work); + } + return; + case SMC_LLC_CONFIRM_LINK: + case SMC_LLC_ADD_LINK_CONT: + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* a flow is waiting for this message */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, qentry); + wake_up_interruptible(&lgr->llc_waiter); + return; + } break; case SMC_LLC_DELETE_LINK: - smc_llc_rx_delete_link(link, &llc->delete_link); - break; + if (lgr->role == SMC_CLNT) { + /* server requests to delete this link, send response */ + if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_NONE) { + /* DEL LINK REQ during ADD LINK SEQ */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + wake_up_interruptible(&lgr->llc_waiter); + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, + qentry)) { + schedule_work(&lgr->llc_del_link_work); + } + } else { + if (lgr->llc_flow_lcl.type == SMC_LLC_FLOW_ADD_LINK && + !lgr->llc_flow_lcl.qentry) { + /* DEL LINK REQ during ADD LINK SEQ */ + smc_llc_flow_qentry_set(&lgr->llc_flow_lcl, + qentry); + wake_up_interruptible(&lgr->llc_waiter); + } else if (smc_llc_flow_start(&lgr->llc_flow_lcl, + qentry)) { + schedule_work(&lgr->llc_del_link_work); + } + } + return; case SMC_LLC_CONFIRM_RKEY: - smc_llc_rx_confirm_rkey(link, &llc->confirm_rkey); - break; + /* new request from remote, assign to remote flow */ + if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) { + /* process here, does not wait for more llc msgs */ + smc_llc_rmt_conf_rkey(lgr); + smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); + } + return; case SMC_LLC_CONFIRM_RKEY_CONT: - smc_llc_rx_confirm_rkey_cont(link, &llc->confirm_rkey_cont); + /* not used because max links is 3, and 3 rkeys fit into + * one CONFIRM_RKEY message + */ break; case SMC_LLC_DELETE_RKEY: - smc_llc_rx_delete_rkey(link, &llc->delete_rkey); + /* new request from remote, assign to remote flow */ + if (smc_llc_flow_start(&lgr->llc_flow_rmt, qentry)) { + /* process here, does not wait for more llc msgs */ + smc_llc_rmt_delete_rkey(lgr); + smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt); + } + return; + default: + smc_llc_protocol_violation(lgr, llc->raw.hdr.common.type); break; } +out: + kfree(qentry); +} + +/* worker to process llc messages on the event queue */ +static void smc_llc_event_work(struct work_struct *work) +{ + struct smc_link_group *lgr = container_of(work, struct smc_link_group, + llc_event_work); + struct smc_llc_qentry *qentry; + + if (!lgr->llc_flow_lcl.type && lgr->delayed_event) { + if (smc_link_usable(lgr->delayed_event->link)) { + smc_llc_event_handler(lgr->delayed_event); + } else { + qentry = lgr->delayed_event; + lgr->delayed_event = NULL; + kfree(qentry); + } + } + +again: + spin_lock_bh(&lgr->llc_event_q_lock); + if (!list_empty(&lgr->llc_event_q)) { + qentry = list_first_entry(&lgr->llc_event_q, + struct smc_llc_qentry, list); + list_del_init(&qentry->list); + spin_unlock_bh(&lgr->llc_event_q_lock); + smc_llc_event_handler(qentry); + goto again; + } + spin_unlock_bh(&lgr->llc_event_q_lock); +} + +/* process llc responses in tasklet context */ +static void smc_llc_rx_response(struct smc_link *link, + struct smc_llc_qentry *qentry) +{ + u8 llc_type = qentry->msg.raw.hdr.common.type; + + switch (llc_type) { + case SMC_LLC_TEST_LINK: + if (link->state == SMC_LNK_ACTIVE) + complete(&link->llc_testlink_resp); + break; + case SMC_LLC_ADD_LINK: + case SMC_LLC_DELETE_LINK: + case SMC_LLC_CONFIRM_LINK: + case SMC_LLC_ADD_LINK_CONT: + case SMC_LLC_CONFIRM_RKEY: + case SMC_LLC_DELETE_RKEY: + /* assign responses to the local flow, we requested them */ + smc_llc_flow_qentry_set(&link->lgr->llc_flow_lcl, qentry); + wake_up_interruptible(&link->lgr->llc_waiter); + return; + case SMC_LLC_CONFIRM_RKEY_CONT: + /* not used because max links is 3 */ + break; + default: + smc_llc_protocol_violation(link->lgr, llc_type); + break; + } + kfree(qentry); +} + +static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc) +{ + struct smc_link_group *lgr = link->lgr; + struct smc_llc_qentry *qentry; + unsigned long flags; + + qentry = kmalloc(sizeof(*qentry), GFP_ATOMIC); + if (!qentry) + return; + qentry->link = link; + INIT_LIST_HEAD(&qentry->list); + memcpy(&qentry->msg, llc, sizeof(union smc_llc_msg)); + + /* process responses immediately */ + if (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) { + smc_llc_rx_response(link, qentry); + return; + } + + /* add requests to event queue */ + spin_lock_irqsave(&lgr->llc_event_q_lock, flags); + list_add_tail(&qentry->list, &lgr->llc_event_q); + spin_unlock_irqrestore(&lgr->llc_event_q_lock, flags); + schedule_work(&link->lgr->llc_event_work); +} + +/* copy received msg and add it to the event queue */ +static void smc_llc_rx_handler(struct ib_wc *wc, void *buf) +{ + struct smc_link *link = (struct smc_link *)wc->qp->qp_context; + union smc_llc_msg *llc = buf; + + if (wc->byte_len < sizeof(*llc)) + return; /* short message */ + if (llc->raw.hdr.length != sizeof(*llc)) + return; /* invalid message */ + + smc_llc_enqueue(link, llc); } /***************************** worker, utils *********************************/ @@ -613,112 +1656,162 @@ static void smc_llc_testlink_work(struct work_struct *work) /* receive TEST LINK response over RoCE fabric */ rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp, SMC_LLC_WAIT_TIME); + if (link->state != SMC_LNK_ACTIVE) + return; /* link state changed */ if (rc <= 0) { - smc_lgr_terminate_sched(smc_get_lgr(link)); + smcr_link_down_cond_sched(link); return; } next_interval = link->llc_testlink_time; out: - queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk, - next_interval); + schedule_delayed_work(&link->llc_testlink_wrk, next_interval); } -int smc_llc_link_init(struct smc_link *link) +void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) { - struct smc_link_group *lgr = smc_get_lgr(link); - link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM, - *((u32 *)lgr->id), - link->link_id); - if (!link->llc_wq) - return -ENOMEM; - init_completion(&link->llc_confirm); - init_completion(&link->llc_confirm_resp); - init_completion(&link->llc_add); - init_completion(&link->llc_add_resp); - init_completion(&link->llc_confirm_rkey); - init_completion(&link->llc_delete_rkey); - mutex_init(&link->llc_delete_rkey_mutex); - init_completion(&link->llc_testlink_resp); - INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); - return 0; + struct net *net = sock_net(smc->clcsock->sk); + + INIT_WORK(&lgr->llc_event_work, smc_llc_event_work); + INIT_WORK(&lgr->llc_add_link_work, smc_llc_add_link_work); + INIT_WORK(&lgr->llc_del_link_work, smc_llc_delete_link_work); + INIT_LIST_HEAD(&lgr->llc_event_q); + spin_lock_init(&lgr->llc_event_q_lock); + spin_lock_init(&lgr->llc_flow_lock); + init_waitqueue_head(&lgr->llc_waiter); + mutex_init(&lgr->llc_conf_mutex); + lgr->llc_testlink_time = net->ipv4.sysctl_tcp_keepalive_time; } -void smc_llc_link_active(struct smc_link *link, int testlink_time) +/* called after lgr was removed from lgr_list */ +void smc_llc_lgr_clear(struct smc_link_group *lgr) { - link->state = SMC_LNK_ACTIVE; - if (testlink_time) { - link->llc_testlink_time = testlink_time * HZ; - queue_delayed_work(link->llc_wq, &link->llc_testlink_wrk, - link->llc_testlink_time); + smc_llc_event_flush(lgr); + wake_up_interruptible_all(&lgr->llc_waiter); + cancel_work_sync(&lgr->llc_event_work); + cancel_work_sync(&lgr->llc_add_link_work); + cancel_work_sync(&lgr->llc_del_link_work); + if (lgr->delayed_event) { + kfree(lgr->delayed_event); + lgr->delayed_event = NULL; } } -void smc_llc_link_deleting(struct smc_link *link) +int smc_llc_link_init(struct smc_link *link) { - link->state = SMC_LNK_DELETING; - smc_wr_wakeup_tx_wait(link); + init_completion(&link->llc_testlink_resp); + INIT_DELAYED_WORK(&link->llc_testlink_wrk, smc_llc_testlink_work); + return 0; } -/* called in tasklet context */ -void smc_llc_link_inactive(struct smc_link *link) +void smc_llc_link_active(struct smc_link *link) { - link->state = SMC_LNK_INACTIVE; - cancel_delayed_work(&link->llc_testlink_wrk); - smc_wr_wakeup_reg_wait(link); - smc_wr_wakeup_tx_wait(link); + pr_warn_ratelimited("smc: SMC-R lg %*phN link added: id %*phN, " + "peerid %*phN, ibdev %s, ibport %d\n", + SMC_LGR_ID_SIZE, &link->lgr->id, + SMC_LGR_ID_SIZE, &link->link_uid, + SMC_LGR_ID_SIZE, &link->peer_link_uid, + link->smcibdev->ibdev->name, link->ibport); + link->state = SMC_LNK_ACTIVE; + if (link->lgr->llc_testlink_time) { + link->llc_testlink_time = link->lgr->llc_testlink_time * HZ; + schedule_delayed_work(&link->llc_testlink_wrk, + link->llc_testlink_time); + } } /* called in worker context */ -void smc_llc_link_clear(struct smc_link *link) +void smc_llc_link_clear(struct smc_link *link, bool log) { - flush_workqueue(link->llc_wq); - destroy_workqueue(link->llc_wq); + if (log) + pr_warn_ratelimited("smc: SMC-R lg %*phN link removed: id %*phN" + ", peerid %*phN, ibdev %s, ibport %d\n", + SMC_LGR_ID_SIZE, &link->lgr->id, + SMC_LGR_ID_SIZE, &link->link_uid, + SMC_LGR_ID_SIZE, &link->peer_link_uid, + link->smcibdev->ibdev->name, link->ibport); + complete(&link->llc_testlink_resp); + cancel_delayed_work_sync(&link->llc_testlink_wrk); + smc_wr_wakeup_reg_wait(link); + smc_wr_wakeup_tx_wait(link); } -/* register a new rtoken at the remote peer */ -int smc_llc_do_confirm_rkey(struct smc_link *link, +/* register a new rtoken at the remote peer (for all links) */ +int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc) { - int rc; + struct smc_link_group *lgr = send_link->lgr; + struct smc_llc_qentry *qentry = NULL; + int rc = 0; - /* protected by mutex smc_create_lgr_pending */ - reinit_completion(&link->llc_confirm_rkey); - rc = smc_llc_send_confirm_rkey(link, rmb_desc); + rc = smc_llc_send_confirm_rkey(send_link, rmb_desc); if (rc) - return rc; + goto out; /* receive CONFIRM RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout(&link->llc_confirm_rkey, - SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_confirm_rkey_rc) - return -EFAULT; - return 0; + qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME, + SMC_LLC_CONFIRM_RKEY); + if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG)) + rc = -EFAULT; +out: + if (qentry) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); + return rc; } /* unregister an rtoken at the remote peer */ -int smc_llc_do_delete_rkey(struct smc_link *link, +int smc_llc_do_delete_rkey(struct smc_link_group *lgr, struct smc_buf_desc *rmb_desc) { + struct smc_llc_qentry *qentry = NULL; + struct smc_link *send_link; int rc = 0; - mutex_lock(&link->llc_delete_rkey_mutex); - if (link->state != SMC_LNK_ACTIVE) - goto out; - reinit_completion(&link->llc_delete_rkey); - rc = smc_llc_send_delete_rkey(link, rmb_desc); + send_link = smc_llc_usable_link(lgr); + if (!send_link) + return -ENOLINK; + + /* protected by llc_flow control */ + rc = smc_llc_send_delete_rkey(send_link, rmb_desc); if (rc) goto out; /* receive DELETE RKEY response from server over RoCE fabric */ - rc = wait_for_completion_interruptible_timeout(&link->llc_delete_rkey, - SMC_LLC_WAIT_TIME); - if (rc <= 0 || link->llc_delete_rkey_rc) + qentry = smc_llc_wait(lgr, send_link, SMC_LLC_WAIT_TIME, + SMC_LLC_DELETE_RKEY); + if (!qentry || (qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_RKEY_NEG)) rc = -EFAULT; - else - rc = 0; out: - mutex_unlock(&link->llc_delete_rkey_mutex); + if (qentry) + smc_llc_flow_qentry_del(&lgr->llc_flow_lcl); return rc; } +void smc_llc_link_set_uid(struct smc_link *link) +{ + __be32 link_uid; + + link_uid = htonl(*((u32 *)link->lgr->id) + link->link_id); + memcpy(link->link_uid, &link_uid, SMC_LGR_ID_SIZE); +} + +/* save peers link user id, used for debug purposes */ +void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry) +{ + memcpy(qentry->link->peer_link_uid, qentry->msg.confirm_link.link_uid, + SMC_LGR_ID_SIZE); +} + +/* evaluate confirm link request or response */ +int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, + enum smc_llc_reqresp type) +{ + if (type == SMC_LLC_REQ) { /* SMC server assigns link_id */ + qentry->link->link_id = qentry->msg.confirm_link.link_num; + smc_llc_link_set_uid(qentry->link); + } + if (!(qentry->msg.raw.hdr.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)) + return -ENOTSUPP; + return 0; +} + /***************************** init, exit, misc ******************************/ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { @@ -736,6 +1829,10 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = { }, { .handler = smc_llc_rx_handler, + .type = SMC_LLC_ADD_LINK_CONT + }, + { + .handler = smc_llc_rx_handler, .type = SMC_LLC_DELETE_LINK }, { diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 461c0c3ef76e..a5d2fe3eea61 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -28,6 +28,7 @@ enum smc_llc_reqresp { enum smc_llc_msg_type { SMC_LLC_CONFIRM_LINK = 0x01, SMC_LLC_ADD_LINK = 0x02, + SMC_LLC_ADD_LINK_CONT = 0x03, SMC_LLC_DELETE_LINK = 0x04, SMC_LLC_CONFIRM_RKEY = 0x06, SMC_LLC_TEST_LINK = 0x07, @@ -35,22 +36,74 @@ enum smc_llc_msg_type { SMC_LLC_DELETE_RKEY = 0x09, }; +#define smc_link_downing(state) \ + (cmpxchg(state, SMC_LNK_ACTIVE, SMC_LNK_INACTIVE) == SMC_LNK_ACTIVE) + +/* LLC DELETE LINK Request Reason Codes */ +#define SMC_LLC_DEL_LOST_PATH 0x00010000 +#define SMC_LLC_DEL_OP_INIT_TERM 0x00020000 +#define SMC_LLC_DEL_PROG_INIT_TERM 0x00030000 +#define SMC_LLC_DEL_PROT_VIOL 0x00040000 +#define SMC_LLC_DEL_NO_ASYM_NEEDED 0x00050000 +/* LLC DELETE LINK Response Reason Codes */ +#define SMC_LLC_DEL_NOLNK 0x00100000 /* Unknown Link ID (no link) */ +#define SMC_LLC_DEL_NOLGR 0x00200000 /* Unknown Link Group */ + +/* returns a usable link of the link group, or NULL */ +static inline struct smc_link *smc_llc_usable_link(struct smc_link_group *lgr) +{ + int i; + + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) + if (smc_link_usable(&lgr->lnk[i])) + return &lgr->lnk[i]; + return NULL; +} + +/* set the termination reason code for the link group */ +static inline void smc_llc_set_termination_rsn(struct smc_link_group *lgr, + u32 rsn) +{ + if (!lgr->llc_termination_rsn) + lgr->llc_termination_rsn = rsn; +} + /* transmit */ int smc_llc_send_confirm_link(struct smc_link *lnk, enum smc_llc_reqresp reqresp); int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[], + struct smc_link *link_new, enum smc_llc_reqresp reqresp); -int smc_llc_send_delete_link(struct smc_link *link, - enum smc_llc_reqresp reqresp, bool orderly); +int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id, + enum smc_llc_reqresp reqresp, bool orderly, + u32 reason); +void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id); +void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc); +void smc_llc_lgr_clear(struct smc_link_group *lgr); int smc_llc_link_init(struct smc_link *link); -void smc_llc_link_active(struct smc_link *link, int testlink_time); -void smc_llc_link_deleting(struct smc_link *link); -void smc_llc_link_inactive(struct smc_link *link); -void smc_llc_link_clear(struct smc_link *link); -int smc_llc_do_confirm_rkey(struct smc_link *link, +void smc_llc_link_active(struct smc_link *link); +void smc_llc_link_clear(struct smc_link *link, bool log); +int smc_llc_do_confirm_rkey(struct smc_link *send_link, struct smc_buf_desc *rmb_desc); -int smc_llc_do_delete_rkey(struct smc_link *link, +int smc_llc_do_delete_rkey(struct smc_link_group *lgr, struct smc_buf_desc *rmb_desc); +int smc_llc_flow_initiate(struct smc_link_group *lgr, + enum smc_llc_flowtype type); +void smc_llc_flow_stop(struct smc_link_group *lgr, struct smc_llc_flow *flow); +int smc_llc_eval_conf_link(struct smc_llc_qentry *qentry, + enum smc_llc_reqresp type); +void smc_llc_link_set_uid(struct smc_link *link); +void smc_llc_save_peer_uid(struct smc_llc_qentry *qentry); +struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr, + struct smc_link *lnk, + int time_out, u8 exp_msg); +struct smc_llc_qentry *smc_llc_flow_qentry_clr(struct smc_llc_flow *flow); +void smc_llc_flow_qentry_del(struct smc_llc_flow *flow); +void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, + u32 rsn); +int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry); +int smc_llc_srv_add_link(struct smc_link *link); +void smc_llc_srv_add_link_local(struct smc_link *link); int smc_llc_init(void) __init; #endif /* SMC_LLC_H */ diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 2a5ed47c3e08..014d91b9778e 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -32,7 +32,7 @@ static struct net_device *pnet_find_base_ndev(struct net_device *ndev); -static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { +static const struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { [SMC_PNETID_NAME] = { .type = NLA_NUL_STRING, .len = SMC_MAX_PNETID_LEN @@ -50,29 +50,26 @@ static struct nla_policy smc_pnet_policy[SMC_PNETID_MAX + 1] = { static struct genl_family smc_pnet_nl_family; -/** - * struct smc_user_pnetentry - pnet identifier name entry for/from user - * @list: List node. - * @pnet_name: Pnet identifier name - * @ndev: pointer to network device. - * @smcibdev: Pointer to IB device. - * @ib_port: Port of IB device. - * @smcd_dev: Pointer to smcd device. - */ -struct smc_user_pnetentry { - struct list_head list; - char pnet_name[SMC_MAX_PNETID_LEN + 1]; - struct net_device *ndev; - struct smc_ib_device *smcibdev; - u8 ib_port; - struct smcd_dev *smcd_dev; +enum smc_pnet_nametype { + SMC_PNET_ETH = 1, + SMC_PNET_IB = 2, }; /* pnet entry stored in pnet table */ struct smc_pnetentry { struct list_head list; char pnet_name[SMC_MAX_PNETID_LEN + 1]; - struct net_device *ndev; + enum smc_pnet_nametype type; + union { + struct { + char eth_name[IFNAMSIZ + 1]; + struct net_device *ndev; + }; + struct { + char ib_name[IB_DEVICE_NAME_MAX + 1]; + u8 ib_port; + }; + }; }; /* Check if two given pnetids match */ @@ -106,14 +103,21 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - /* remove netdevices */ + /* remove table entry */ write_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { if (!pnet_name || smc_pnet_match(pnetelem->pnet_name, pnet_name)) { list_del(&pnetelem->list); - dev_put(pnetelem->ndev); + if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev) { + dev_put(pnetelem->ndev); + pr_warn_ratelimited("smc: net device %s " + "erased user defined " + "pnetid %.16s\n", + pnetelem->eth_name, + pnetelem->pnet_name); + } kfree(pnetelem); rc = 0; } @@ -132,6 +136,12 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) (!pnet_name || smc_pnet_match(pnet_name, ibdev->pnetid[ibport]))) { + pr_warn_ratelimited("smc: ib device %s ibport " + "%d erased user defined " + "pnetid %.16s\n", + ibdev->ibdev->name, + ibport + 1, + ibdev->pnetid[ibport]); memset(ibdev->pnetid[ibport], 0, SMC_MAX_PNETID_LEN); ibdev->pnetid_by_user[ibport] = false; @@ -146,6 +156,10 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) if (smcd_dev->pnetid_by_user && (!pnet_name || smc_pnet_match(pnet_name, smcd_dev->pnetid))) { + pr_warn_ratelimited("smc: smcd device %s " + "erased user defined pnetid " + "%.16s\n", dev_name(&smcd_dev->dev), + smcd_dev->pnetid); memset(smcd_dev->pnetid, 0, SMC_MAX_PNETID_LEN); smcd_dev->pnetid_by_user = false; rc = 0; @@ -155,9 +169,9 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) return rc; } -/* Remove a pnet entry mentioning a given network device from the pnet table. +/* Add the reference to a given network device to the pnet table. */ -static int smc_pnet_remove_by_ndev(struct net_device *ndev) +static int smc_pnet_add_by_ndev(struct net_device *ndev) { struct smc_pnetentry *pnetelem, *tmp_pe; struct smc_pnettable *pnettable; @@ -171,11 +185,15 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) write_lock(&pnettable->lock); list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { - if (pnetelem->ndev == ndev) { - list_del(&pnetelem->list); - dev_put(pnetelem->ndev); - kfree(pnetelem); + if (pnetelem->type == SMC_PNET_ETH && !pnetelem->ndev && + !strncmp(pnetelem->eth_name, ndev->name, IFNAMSIZ)) { + dev_hold(ndev); + pnetelem->ndev = ndev; rc = 0; + pr_warn_ratelimited("smc: adding net device %s with " + "user defined pnetid %.16s\n", + pnetelem->eth_name, + pnetelem->pnet_name); break; } } @@ -183,80 +201,71 @@ static int smc_pnet_remove_by_ndev(struct net_device *ndev) return rc; } -/* Append a pnetid to the end of the pnet table if not already on this list. +/* Remove the reference to a given network device from the pnet table. */ -static int smc_pnet_enter(struct smc_pnettable *pnettable, - struct smc_user_pnetentry *new_pnetelem) +static int smc_pnet_remove_by_ndev(struct net_device *ndev) { - u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; - u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; - struct smc_pnetentry *tmp_pnetelem; - struct smc_pnetentry *pnetelem; - bool new_smcddev = false; - struct net_device *ndev; - bool new_netdev = true; - bool new_ibdev = false; - - if (new_pnetelem->smcibdev) { - struct smc_ib_device *ib_dev = new_pnetelem->smcibdev; - int ib_port = new_pnetelem->ib_port; + struct smc_pnetentry *pnetelem, *tmp_pe; + struct smc_pnettable *pnettable; + struct net *net = dev_net(ndev); + struct smc_net *sn; + int rc = -ENOENT; - spin_lock(&smc_ib_devices.lock); - if (smc_pnet_match(ib_dev->pnetid[ib_port - 1], pnet_null)) { - memcpy(ib_dev->pnetid[ib_port - 1], - new_pnetelem->pnet_name, SMC_MAX_PNETID_LEN); - ib_dev->pnetid_by_user[ib_port - 1] = true; - new_ibdev = true; - } - spin_unlock(&smc_ib_devices.lock); - } - if (new_pnetelem->smcd_dev) { - struct smcd_dev *smcd_dev = new_pnetelem->smcd_dev; + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; - spin_lock(&smcd_dev_list.lock); - if (smc_pnet_match(smcd_dev->pnetid, pnet_null)) { - memcpy(smcd_dev->pnetid, new_pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - smcd_dev->pnetid_by_user = true; - new_smcddev = true; + write_lock(&pnettable->lock); + list_for_each_entry_safe(pnetelem, tmp_pe, &pnettable->pnetlist, list) { + if (pnetelem->type == SMC_PNET_ETH && pnetelem->ndev == ndev) { + dev_put(pnetelem->ndev); + pnetelem->ndev = NULL; + rc = 0; + pr_warn_ratelimited("smc: removing net device %s with " + "user defined pnetid %.16s\n", + pnetelem->eth_name, + pnetelem->pnet_name); + break; } - spin_unlock(&smcd_dev_list.lock); } + write_unlock(&pnettable->lock); + return rc; +} - if (!new_pnetelem->ndev) - return (new_ibdev || new_smcddev) ? 0 : -EEXIST; +/* Apply pnetid to ib device when no pnetid is set. + */ +static bool smc_pnet_apply_ib(struct smc_ib_device *ib_dev, u8 ib_port, + char *pnet_name) +{ + u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; + bool applied = false; - /* check if (base) netdev already has a pnetid. If there is one, we do - * not want to add a pnet table entry - */ - ndev = pnet_find_base_ndev(new_pnetelem->ndev); - if (!smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, - ndev_pnetid)) - return (new_ibdev || new_smcddev) ? 0 : -EEXIST; + spin_lock(&smc_ib_devices.lock); + if (smc_pnet_match(ib_dev->pnetid[ib_port - 1], pnet_null)) { + memcpy(ib_dev->pnetid[ib_port - 1], pnet_name, + SMC_MAX_PNETID_LEN); + ib_dev->pnetid_by_user[ib_port - 1] = true; + applied = true; + } + spin_unlock(&smc_ib_devices.lock); + return applied; +} - /* add a new netdev entry to the pnet table if there isn't one */ - tmp_pnetelem = kzalloc(sizeof(*pnetelem), GFP_KERNEL); - if (!tmp_pnetelem) - return -ENOMEM; - memcpy(tmp_pnetelem->pnet_name, new_pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - tmp_pnetelem->ndev = new_pnetelem->ndev; +/* Apply pnetid to smcd device when no pnetid is set. + */ +static bool smc_pnet_apply_smcd(struct smcd_dev *smcd_dev, char *pnet_name) +{ + u8 pnet_null[SMC_MAX_PNETID_LEN] = {0}; + bool applied = false; - write_lock(&pnettable->lock); - list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { - if (pnetelem->ndev == new_pnetelem->ndev) - new_netdev = false; - } - if (new_netdev) { - dev_hold(tmp_pnetelem->ndev); - list_add_tail(&tmp_pnetelem->list, &pnettable->pnetlist); - write_unlock(&pnettable->lock); - } else { - write_unlock(&pnettable->lock); - kfree(tmp_pnetelem); + spin_lock(&smcd_dev_list.lock); + if (smc_pnet_match(smcd_dev->pnetid, pnet_null)) { + memcpy(smcd_dev->pnetid, pnet_name, SMC_MAX_PNETID_LEN); + smcd_dev->pnetid_by_user = true; + applied = true; } - - return (new_netdev || new_ibdev || new_smcddev) ? 0 : -EEXIST; + spin_unlock(&smcd_dev_list.lock); + return applied; } /* The limit for pnetid is 16 characters. @@ -323,57 +332,184 @@ out: return smcd_dev; } -/* Parse the supplied netlink attributes and fill a pnetentry structure. - * For ethernet and infiniband device names verify that the devices exist. +static int smc_pnet_add_eth(struct smc_pnettable *pnettable, struct net *net, + char *eth_name, char *pnet_name) +{ + struct smc_pnetentry *tmp_pe, *new_pe; + struct net_device *ndev, *base_ndev; + u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + bool new_netdev; + int rc; + + /* check if (base) netdev already has a pnetid. If there is one, we do + * not want to add a pnet table entry + */ + rc = -EEXIST; + ndev = dev_get_by_name(net, eth_name); /* dev_hold() */ + if (ndev) { + base_ndev = pnet_find_base_ndev(ndev); + if (!smc_pnetid_by_dev_port(base_ndev->dev.parent, + base_ndev->dev_port, ndev_pnetid)) + goto out_put; + } + + /* add a new netdev entry to the pnet table if there isn't one */ + rc = -ENOMEM; + new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); + if (!new_pe) + goto out_put; + new_pe->type = SMC_PNET_ETH; + memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); + strncpy(new_pe->eth_name, eth_name, IFNAMSIZ); + new_pe->ndev = ndev; + + rc = -EEXIST; + new_netdev = true; + write_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_ETH && + !strncmp(tmp_pe->eth_name, eth_name, IFNAMSIZ)) { + new_netdev = false; + break; + } + } + if (new_netdev) { + list_add_tail(&new_pe->list, &pnettable->pnetlist); + write_unlock(&pnettable->lock); + } else { + write_unlock(&pnettable->lock); + kfree(new_pe); + goto out_put; + } + if (ndev) + pr_warn_ratelimited("smc: net device %s " + "applied user defined pnetid %.16s\n", + new_pe->eth_name, new_pe->pnet_name); + return 0; + +out_put: + if (ndev) + dev_put(ndev); + return rc; +} + +static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, + u8 ib_port, char *pnet_name) +{ + struct smc_pnetentry *tmp_pe, *new_pe; + struct smc_ib_device *ib_dev; + bool smcddev_applied = true; + bool ibdev_applied = true; + struct smcd_dev *smcd_dev; + bool new_ibdev; + + /* try to apply the pnetid to active devices */ + ib_dev = smc_pnet_find_ib(ib_name); + if (ib_dev) { + ibdev_applied = smc_pnet_apply_ib(ib_dev, ib_port, pnet_name); + if (ibdev_applied) + pr_warn_ratelimited("smc: ib device %s ibport %d " + "applied user defined pnetid " + "%.16s\n", ib_dev->ibdev->name, + ib_port, + ib_dev->pnetid[ib_port - 1]); + } + smcd_dev = smc_pnet_find_smcd(ib_name); + if (smcd_dev) { + smcddev_applied = smc_pnet_apply_smcd(smcd_dev, pnet_name); + if (smcddev_applied) + pr_warn_ratelimited("smc: smcd device %s " + "applied user defined pnetid " + "%.16s\n", dev_name(&smcd_dev->dev), + smcd_dev->pnetid); + } + /* Apply fails when a device has a hardware-defined pnetid set, do not + * add a pnet table entry in that case. + */ + if (!ibdev_applied || !smcddev_applied) + return -EEXIST; + + /* add a new ib entry to the pnet table if there isn't one */ + new_pe = kzalloc(sizeof(*new_pe), GFP_KERNEL); + if (!new_pe) + return -ENOMEM; + new_pe->type = SMC_PNET_IB; + memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); + strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX); + new_pe->ib_port = ib_port; + + new_ibdev = true; + write_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + new_ibdev = false; + break; + } + } + if (new_ibdev) { + list_add_tail(&new_pe->list, &pnettable->pnetlist); + write_unlock(&pnettable->lock); + } else { + write_unlock(&pnettable->lock); + kfree(new_pe); + } + return (new_ibdev) ? 0 : -EEXIST; +} + +/* Append a pnetid to the end of the pnet table if not already on this list. */ -static int smc_pnet_fill_entry(struct net *net, - struct smc_user_pnetentry *pnetelem, - struct nlattr *tb[]) +static int smc_pnet_enter(struct net *net, struct nlattr *tb[]) { - char *string, *ibname; + char pnet_name[SMC_MAX_PNETID_LEN + 1]; + struct smc_pnettable *pnettable; + bool new_netdev = false; + bool new_ibdev = false; + struct smc_net *sn; + u8 ibport = 1; + char *string; int rc; - memset(pnetelem, 0, sizeof(*pnetelem)); - INIT_LIST_HEAD(&pnetelem->list); + /* get pnettable for namespace */ + sn = net_generic(net, smc_net_id); + pnettable = &sn->pnettable; rc = -EINVAL; if (!tb[SMC_PNETID_NAME]) goto error; string = (char *)nla_data(tb[SMC_PNETID_NAME]); - if (!smc_pnetid_valid(string, pnetelem->pnet_name)) + if (!smc_pnetid_valid(string, pnet_name)) goto error; - rc = -EINVAL; if (tb[SMC_PNETID_ETHNAME]) { string = (char *)nla_data(tb[SMC_PNETID_ETHNAME]); - pnetelem->ndev = dev_get_by_name(net, string); - if (!pnetelem->ndev) + rc = smc_pnet_add_eth(pnettable, net, string, pnet_name); + if (!rc) + new_netdev = true; + else if (rc != -EEXIST) goto error; } /* if this is not the initial namespace, stop here */ if (net != &init_net) - return 0; + return new_netdev ? 0 : -EEXIST; rc = -EINVAL; if (tb[SMC_PNETID_IBNAME]) { - ibname = (char *)nla_data(tb[SMC_PNETID_IBNAME]); - ibname = strim(ibname); - pnetelem->smcibdev = smc_pnet_find_ib(ibname); - pnetelem->smcd_dev = smc_pnet_find_smcd(ibname); - if (!pnetelem->smcibdev && !pnetelem->smcd_dev) - goto error; - if (pnetelem->smcibdev) { - if (!tb[SMC_PNETID_IBPORT]) - goto error; - pnetelem->ib_port = nla_get_u8(tb[SMC_PNETID_IBPORT]); - if (pnetelem->ib_port < 1 || - pnetelem->ib_port > SMC_MAX_PORTS) + string = (char *)nla_data(tb[SMC_PNETID_IBNAME]); + string = strim(string); + if (tb[SMC_PNETID_IBPORT]) { + ibport = nla_get_u8(tb[SMC_PNETID_IBPORT]); + if (ibport < 1 || ibport > SMC_MAX_PORTS) goto error; } + rc = smc_pnet_add_ib(pnettable, string, ibport, pnet_name); + if (!rc) + new_ibdev = true; + else if (rc != -EEXIST) + goto error; } - - return 0; + return (new_netdev || new_ibdev) ? 0 : -EEXIST; error: return rc; @@ -381,28 +517,22 @@ error: /* Convert an smc_pnetentry to a netlink attribute sequence */ static int smc_pnet_set_nla(struct sk_buff *msg, - struct smc_user_pnetentry *pnetelem) + struct smc_pnetentry *pnetelem) { if (nla_put_string(msg, SMC_PNETID_NAME, pnetelem->pnet_name)) return -1; - if (pnetelem->ndev) { + if (pnetelem->type == SMC_PNET_ETH) { if (nla_put_string(msg, SMC_PNETID_ETHNAME, - pnetelem->ndev->name)) + pnetelem->eth_name)) return -1; } else { if (nla_put_string(msg, SMC_PNETID_ETHNAME, "n/a")) return -1; } - if (pnetelem->smcibdev) { - if (nla_put_string(msg, SMC_PNETID_IBNAME, - dev_name(pnetelem->smcibdev->ibdev->dev.parent)) || + if (pnetelem->type == SMC_PNET_IB) { + if (nla_put_string(msg, SMC_PNETID_IBNAME, pnetelem->ib_name) || nla_put_u8(msg, SMC_PNETID_IBPORT, pnetelem->ib_port)) return -1; - } else if (pnetelem->smcd_dev) { - if (nla_put_string(msg, SMC_PNETID_IBNAME, - dev_name(&pnetelem->smcd_dev->dev)) || - nla_put_u8(msg, SMC_PNETID_IBPORT, 1)) - return -1; } else { if (nla_put_string(msg, SMC_PNETID_IBNAME, "n/a") || nla_put_u8(msg, SMC_PNETID_IBPORT, 0xff)) @@ -415,21 +545,8 @@ static int smc_pnet_set_nla(struct sk_buff *msg, static int smc_pnet_add(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); - struct smc_user_pnetentry pnetelem; - struct smc_pnettable *pnettable; - struct smc_net *sn; - int rc; - - /* get pnettable for namespace */ - sn = net_generic(net, smc_net_id); - pnettable = &sn->pnettable; - rc = smc_pnet_fill_entry(net, &pnetelem, info->attrs); - if (!rc) - rc = smc_pnet_enter(pnettable, &pnetelem); - if (pnetelem.ndev) - dev_put(pnetelem.ndev); - return rc; + return smc_pnet_enter(net, info->attrs); } static int smc_pnet_del(struct sk_buff *skb, struct genl_info *info) @@ -450,7 +567,7 @@ static int smc_pnet_dump_start(struct netlink_callback *cb) static int smc_pnet_dumpinfo(struct sk_buff *skb, u32 portid, u32 seq, u32 flags, - struct smc_user_pnetentry *pnetelem) + struct smc_pnetentry *pnetelem) { void *hdr; @@ -469,91 +586,32 @@ static int smc_pnet_dumpinfo(struct sk_buff *skb, static int _smc_pnet_dump(struct net *net, struct sk_buff *skb, u32 portid, u32 seq, u8 *pnetid, int start_idx) { - struct smc_user_pnetentry tmp_entry; struct smc_pnettable *pnettable; struct smc_pnetentry *pnetelem; - struct smc_ib_device *ibdev; - struct smcd_dev *smcd_dev; struct smc_net *sn; int idx = 0; - int ibport; /* get pnettable for namespace */ sn = net_generic(net, smc_net_id); pnettable = &sn->pnettable; - /* dump netdevices */ + /* dump pnettable entries */ read_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { if (pnetid && !smc_pnet_match(pnetelem->pnet_name, pnetid)) continue; if (idx++ < start_idx) continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, pnetelem->pnet_name, - SMC_MAX_PNETID_LEN); - tmp_entry.ndev = pnetelem->ndev; + /* if this is not the initial namespace, dump only netdev */ + if (net != &init_net && pnetelem->type != SMC_PNET_ETH) + continue; if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, - &tmp_entry)) { + pnetelem)) { --idx; break; } } read_unlock(&pnettable->lock); - - /* if this is not the initial namespace, stop here */ - if (net != &init_net) - return idx; - - /* dump ib devices */ - spin_lock(&smc_ib_devices.lock); - list_for_each_entry(ibdev, &smc_ib_devices.list, list) { - for (ibport = 0; ibport < SMC_MAX_PORTS; ibport++) { - if (ibdev->pnetid_by_user[ibport]) { - if (pnetid && - !smc_pnet_match(ibdev->pnetid[ibport], - pnetid)) - continue; - if (idx++ < start_idx) - continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, - ibdev->pnetid[ibport], - SMC_MAX_PNETID_LEN); - tmp_entry.smcibdev = ibdev; - tmp_entry.ib_port = ibport + 1; - if (smc_pnet_dumpinfo(skb, portid, seq, - NLM_F_MULTI, - &tmp_entry)) { - --idx; - break; - } - } - } - } - spin_unlock(&smc_ib_devices.lock); - - /* dump smcd devices */ - spin_lock(&smcd_dev_list.lock); - list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (smcd_dev->pnetid_by_user) { - if (pnetid && !smc_pnet_match(smcd_dev->pnetid, pnetid)) - continue; - if (idx++ < start_idx) - continue; - memset(&tmp_entry, 0, sizeof(tmp_entry)); - memcpy(&tmp_entry.pnet_name, smcd_dev->pnetid, - SMC_MAX_PNETID_LEN); - tmp_entry.smcd_dev = smcd_dev; - if (smc_pnet_dumpinfo(skb, portid, seq, NLM_F_MULTI, - &tmp_entry)) { - --idx; - break; - } - } - } - spin_unlock(&smcd_dev_list.lock); - return idx; } @@ -659,6 +717,9 @@ static int smc_pnet_netdev_event(struct notifier_block *this, case NETDEV_UNREGISTER: smc_pnet_remove_by_ndev(event_dev); return NOTIFY_OK; + case NETDEV_REGISTER: + smc_pnet_add_by_ndev(event_dev); + return NOTIFY_OK; default: return NOTIFY_DONE; } @@ -744,7 +805,7 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, read_lock(&pnettable->lock); list_for_each_entry(pnetelem, &pnettable->pnetlist, list) { - if (ndev == pnetelem->ndev) { + if (pnetelem->type == SMC_PNET_ETH && ndev == pnetelem->ndev) { /* get pnetid of netdev device */ memcpy(pnetid, pnetelem->pnet_name, SMC_MAX_PNETID_LEN); rc = 0; @@ -755,6 +816,45 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev, return rc; } +/* find a roce device for the given pnetid */ +static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id, + struct smc_init_info *ini, + struct smc_ib_device *known_dev) +{ + struct smc_ib_device *ibdev; + int i; + + ini->ib_dev = NULL; + spin_lock(&smc_ib_devices.lock); + list_for_each_entry(ibdev, &smc_ib_devices.list, list) { + if (ibdev == known_dev) + continue; + for (i = 1; i <= SMC_MAX_PORTS; i++) { + if (!rdma_is_port_valid(ibdev->ibdev, i)) + continue; + if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) && + smc_ib_port_active(ibdev, i) && + !test_bit(i - 1, ibdev->ports_going_away) && + !smc_ib_determine_gid(ibdev, i, ini->vlan_id, + ini->ib_gid, NULL)) { + ini->ib_dev = ibdev; + ini->ib_port = i; + goto out; + } + } + } +out: + spin_unlock(&smc_ib_devices.lock); +} + +/* find alternate roce device with same pnet_id and vlan_id */ +void smc_pnet_find_alt_roce(struct smc_link_group *lgr, + struct smc_init_info *ini, + struct smc_ib_device *known_dev) +{ + _smc_pnet_find_roce_by_pnetid(lgr->pnet_id, ini, known_dev); +} + /* if handshake network device belongs to a roce device, return its * IB device and port */ @@ -801,8 +901,6 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; - struct smc_ib_device *ibdev; - int i; ndev = pnet_find_base_ndev(ndev); if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, @@ -811,25 +909,7 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, smc_pnet_find_rdma_dev(ndev, ini); return; /* pnetid could not be determined */ } - - spin_lock(&smc_ib_devices.lock); - list_for_each_entry(ibdev, &smc_ib_devices.list, list) { - for (i = 1; i <= SMC_MAX_PORTS; i++) { - if (!rdma_is_port_valid(ibdev->ibdev, i)) - continue; - if (smc_pnet_match(ibdev->pnetid[i - 1], ndev_pnetid) && - smc_ib_port_active(ibdev, i) && - !test_bit(i - 1, ibdev->ports_going_away) && - !smc_ib_determine_gid(ibdev, i, ini->vlan_id, - ini->ib_gid, NULL)) { - ini->ib_dev = ibdev; - ini->ib_port = i; - goto out; - } - } - } -out: - spin_unlock(&smc_ib_devices.lock); + _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL); } static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, @@ -895,3 +975,60 @@ out_rel: out: return; } + +/* Lookup and apply a pnet table entry to the given ib device. + */ +int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) +{ + char *ib_name = smcibdev->ibdev->name; + struct smc_pnettable *pnettable; + struct smc_pnetentry *tmp_pe; + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for init namespace */ + sn = net_generic(&init_net, smc_net_id); + pnettable = &sn->pnettable; + + read_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX) && + tmp_pe->ib_port == ib_port) { + smc_pnet_apply_ib(smcibdev, ib_port, tmp_pe->pnet_name); + rc = 0; + break; + } + } + read_unlock(&pnettable->lock); + + return rc; +} + +/* Lookup and apply a pnet table entry to the given smcd device. + */ +int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) +{ + const char *ib_name = dev_name(&smcddev->dev); + struct smc_pnettable *pnettable; + struct smc_pnetentry *tmp_pe; + struct smc_net *sn; + int rc = -ENOENT; + + /* get pnettable for init namespace */ + sn = net_generic(&init_net, smc_net_id); + pnettable = &sn->pnettable; + + read_lock(&pnettable->lock); + list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { + if (tmp_pe->type == SMC_PNET_IB && + !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name); + rc = 0; + break; + } + } + read_unlock(&pnettable->lock); + + return rc; +} diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 4564e4d69c2e..811a65986691 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -19,6 +19,7 @@ struct smc_ib_device; struct smcd_dev; struct smc_init_info; +struct smc_link_group; /** * struct smc_pnettable - SMC PNET table anchor @@ -46,5 +47,9 @@ void smc_pnet_exit(void); void smc_pnet_net_exit(struct net *net); void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini); void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini); - +int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port); +int smc_pnetid_by_table_smcd(struct smcd_dev *smcd); +void smc_pnet_find_alt_roce(struct smc_link_group *lgr, + struct smc_init_info *ini, + struct smc_ib_device *known_dev); #endif diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 9f1ade86d70e..54ba0443847e 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -269,22 +269,21 @@ static int smc_tx_rdma_write(struct smc_connection *conn, int peer_rmbe_offset, int num_sges, struct ib_rdma_wr *rdma_wr) { struct smc_link_group *lgr = conn->lgr; - struct smc_link *link; + struct smc_link *link = conn->lnk; int rc; - link = &lgr->lnk[SMC_SINGLE_LINK]; rdma_wr->wr.wr_id = smc_wr_tx_get_next_wr_id(link); rdma_wr->wr.num_sge = num_sges; rdma_wr->remote_addr = - lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].dma_addr + + lgr->rtokens[conn->rtoken_idx][link->link_idx].dma_addr + /* RMBE within RMB */ conn->tx_off + /* offset within RMBE */ peer_rmbe_offset; - rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][SMC_SINGLE_LINK].rkey; + rdma_wr->rkey = lgr->rtokens[conn->rtoken_idx][link->link_idx].rkey; rc = ib_post_send(link->roce_qp, &rdma_wr->wr, NULL); if (rc) - smc_lgr_terminate_sched(lgr); + smcr_link_down_cond_sched(link); return rc; } @@ -310,8 +309,10 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len, size_t dst_off, size_t dst_len, struct smc_rdma_wr *wr_rdma_buf) { + struct smc_link *link = conn->lnk; + dma_addr_t dma_addr = - sg_dma_address(conn->sndbuf_desc->sgt[SMC_SINGLE_LINK].sgl); + sg_dma_address(conn->sndbuf_desc->sgt[link->link_idx].sgl); int src_len_sum = src_len, dst_len_sum = dst_len; int sent_count = src_off; int srcchunk, dstchunk; @@ -481,12 +482,13 @@ static int smc_tx_rdma_writes(struct smc_connection *conn, static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; + struct smc_link *link = conn->lnk; struct smc_rdma_wr *wr_rdma_buf; struct smc_cdc_tx_pend *pend; struct smc_wr_buf *wr_buf; int rc; - rc = smc_cdc_get_free_slot(conn, &wr_buf, &wr_rdma_buf, &pend); + rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend); if (rc < 0) { if (rc == -EBUSY) { struct smc_sock *smc = @@ -504,10 +506,17 @@ static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) } spin_lock_bh(&conn->send_lock); + if (link != conn->lnk) { + /* link of connection changed, tx_work will restart */ + smc_wr_tx_put_slot(link, + (struct smc_wr_tx_pend_priv *)pend); + rc = -ENOLINK; + goto out_unlock; + } if (!pflags->urg_data_present) { rc = smc_tx_rdma_writes(conn, wr_rdma_buf); if (rc) { - smc_wr_tx_put_slot(&conn->lgr->lnk[SMC_SINGLE_LINK], + smc_wr_tx_put_slot(link, (struct smc_wr_tx_pend_priv *)pend); goto out_unlock; } diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 337ee52ad3d3..7239ba9b99dc 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -44,6 +44,7 @@ struct smc_wr_tx_pend { /* control data for a pending send request */ struct smc_link *link; u32 idx; struct smc_wr_tx_pend_priv priv; + u8 compl_requested; }; /******************************** send queue *********************************/ @@ -61,7 +62,7 @@ static inline bool smc_wr_is_tx_pend(struct smc_link *link) } /* wait till all pending tx work requests on the given link are completed */ -static inline int smc_wr_tx_wait_no_pending_sends(struct smc_link *link) +int smc_wr_tx_wait_no_pending_sends(struct smc_link *link) { if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link), SMC_WR_TX_WAIT_PENDING_TIME)) @@ -103,6 +104,8 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) if (pnd_snd_idx == link->wr_tx_cnt) return; link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status; + if (link->wr_tx_pends[pnd_snd_idx].compl_requested) + complete(&link->wr_tx_compl[pnd_snd_idx]); memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd)); /* clear the full struct smc_wr_tx_pend including .priv */ memset(&link->wr_tx_pends[pnd_snd_idx], 0, @@ -120,8 +123,8 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) sizeof(link->wr_tx_bufs[i])); clear_bit(i, link->wr_tx_mask); } - /* terminate connections of this link group abnormally */ - smc_lgr_terminate_sched(smc_get_lgr(link)); + /* terminate link */ + smcr_link_down_cond_sched(link); } if (pnd_snd.handler) pnd_snd.handler(&pnd_snd.priv, link, wc->status); @@ -207,13 +210,13 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, } else { rc = wait_event_interruptible_timeout( link->wr_tx_wait, - link->state == SMC_LNK_INACTIVE || + !smc_link_usable(link) || lgr->terminating || (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), SMC_WR_TX_WAIT_FREE_SLOT_TIME); if (!rc) { - /* timeout - terminate connections */ - smc_lgr_terminate_sched(lgr); + /* timeout - terminate link */ + smcr_link_down_cond_sched(link); return -EPIPE; } if (idx == link->wr_tx_cnt) @@ -270,11 +273,38 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx], NULL); if (rc) { smc_wr_tx_put_slot(link, priv); - smc_lgr_terminate_sched(smc_get_lgr(link)); + smcr_link_down_cond_sched(link); } return rc; } +/* Send prepared WR slot via ib_post_send and wait for send completion + * notification. + * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer + */ +int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, + unsigned long timeout) +{ + struct smc_wr_tx_pend *pend; + int rc; + + pend = container_of(priv, struct smc_wr_tx_pend, priv); + pend->compl_requested = 1; + init_completion(&link->wr_tx_compl[pend->idx]); + + rc = smc_wr_tx_send(link, priv); + if (rc) + return rc; + /* wait for completion by smc_wr_tx_process_cqe() */ + rc = wait_for_completion_interruptible_timeout( + &link->wr_tx_compl[pend->idx], timeout); + if (rc <= 0) + rc = -ENODATA; + if (rc > 0) + rc = 0; + return rc; +} + /* Register a memory region and wait for result. */ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) { @@ -294,8 +324,8 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) (link->wr_reg_state != POSTED), SMC_WR_REG_MR_WAIT_TIME); if (!rc) { - /* timeout - terminate connections */ - smc_lgr_terminate_sched(smc_get_lgr(link)); + /* timeout - terminate link */ + smcr_link_down_cond_sched(link); return -EPIPE; } if (rc == -ERESTARTSYS) @@ -393,10 +423,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) case IB_WC_RETRY_EXC_ERR: case IB_WC_RNR_RETRY_EXC_ERR: case IB_WC_WR_FLUSH_ERR: - /* terminate connections of this link group - * abnormally - */ - smc_lgr_terminate_sched(smc_get_lgr(link)); + smcr_link_down_cond_sched(link); break; default: smc_wr_rx_post(link); /* refill WR RX */ @@ -558,6 +585,8 @@ void smc_wr_free_link(struct smc_link *lnk) void smc_wr_free_link_mem(struct smc_link *lnk) { + kfree(lnk->wr_tx_compl); + lnk->wr_tx_compl = NULL; kfree(lnk->wr_tx_pends); lnk->wr_tx_pends = NULL; kfree(lnk->wr_tx_mask); @@ -628,8 +657,15 @@ int smc_wr_alloc_link_mem(struct smc_link *link) GFP_KERNEL); if (!link->wr_tx_pends) goto no_mem_wr_tx_mask; + link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT, + sizeof(link->wr_tx_compl[0]), + GFP_KERNEL); + if (!link->wr_tx_compl) + goto no_mem_wr_tx_pends; return 0; +no_mem_wr_tx_pends: + kfree(link->wr_tx_pends); no_mem_wr_tx_mask: kfree(link->wr_tx_mask); no_mem_wr_rx_sges: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index 3ac99c898418..423b8709f1c9 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -101,11 +101,14 @@ int smc_wr_tx_put_slot(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv); int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *wr_pend_priv); +int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, + unsigned long timeout); void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context); void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type, smc_wr_tx_filter filter, smc_wr_tx_dismisser dismisser, unsigned long data); +int smc_wr_tx_wait_no_pending_sends(struct smc_link *link); int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler); int smc_wr_rx_post_init(struct smc_link *link); diff --git a/net/socket.c b/net/socket.c index 2dd739fba866..976426d03f09 100644 --- a/net/socket.c +++ b/net/socket.c @@ -924,14 +924,9 @@ EXPORT_SYMBOL(sock_recvmsg); int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size, int flags) { - mm_segment_t oldfs = get_fs(); - int result; - + msg->msg_control_is_user = false; iov_iter_kvec(&msg->msg_iter, READ, vec, num, size); - set_fs(KERNEL_DS); - result = sock_recvmsg(sock, msg, flags); - set_fs(oldfs); - return result; + return sock_recvmsg(sock, msg, flags); } EXPORT_SYMBOL(kernel_recvmsg); @@ -2239,7 +2234,8 @@ int __copy_msghdr_from_user(struct msghdr *kmsg, if (copy_from_user(&msg, umsg, sizeof(*umsg))) return -EFAULT; - kmsg->msg_control = (void __force *)msg.msg_control; + kmsg->msg_control_is_user = true; + kmsg->msg_control_user = msg.msg_control; kmsg->msg_controllen = msg.msg_controllen; kmsg->msg_flags = msg.msg_flags; @@ -2331,16 +2327,10 @@ static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, goto out; } err = -EFAULT; - /* - * Careful! Before this, msg_sys->msg_control contains a user pointer. - * Afterwards, it will be a kernel pointer. Thus the compiler-assisted - * checking falls down on this. - */ - if (copy_from_user(ctl_buf, - (void __user __force *)msg_sys->msg_control, - ctl_len)) + if (copy_from_user(ctl_buf, msg_sys->msg_control_user, ctl_len)) goto out_freectl; msg_sys->msg_control = ctl_buf; + msg_sys->msg_control_is_user = false; } msg_sys->msg_flags = flags; @@ -3376,94 +3366,6 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd, return err; } -struct rtentry32 { - u32 rt_pad1; - struct sockaddr rt_dst; /* target address */ - struct sockaddr rt_gateway; /* gateway addr (RTF_GATEWAY) */ - struct sockaddr rt_genmask; /* target network mask (IP) */ - unsigned short rt_flags; - short rt_pad2; - u32 rt_pad3; - unsigned char rt_tos; - unsigned char rt_class; - short rt_pad4; - short rt_metric; /* +1 for binary compatibility! */ - /* char * */ u32 rt_dev; /* forcing the device at add */ - u32 rt_mtu; /* per route MTU/Window */ - u32 rt_window; /* Window clamping */ - unsigned short rt_irtt; /* Initial RTT */ -}; - -struct in6_rtmsg32 { - struct in6_addr rtmsg_dst; - struct in6_addr rtmsg_src; - struct in6_addr rtmsg_gateway; - u32 rtmsg_type; - u16 rtmsg_dst_len; - u16 rtmsg_src_len; - u32 rtmsg_metric; - u32 rtmsg_info; - u32 rtmsg_flags; - s32 rtmsg_ifindex; -}; - -static int routing_ioctl(struct net *net, struct socket *sock, - unsigned int cmd, void __user *argp) -{ - int ret; - void *r = NULL; - struct in6_rtmsg r6; - struct rtentry r4; - char devname[16]; - u32 rtdev; - mm_segment_t old_fs = get_fs(); - - if (sock && sock->sk && sock->sk->sk_family == AF_INET6) { /* ipv6 */ - struct in6_rtmsg32 __user *ur6 = argp; - ret = copy_from_user(&r6.rtmsg_dst, &(ur6->rtmsg_dst), - 3 * sizeof(struct in6_addr)); - ret |= get_user(r6.rtmsg_type, &(ur6->rtmsg_type)); - ret |= get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len)); - ret |= get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len)); - ret |= get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric)); - ret |= get_user(r6.rtmsg_info, &(ur6->rtmsg_info)); - ret |= get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags)); - ret |= get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex)); - - r = (void *) &r6; - } else { /* ipv4 */ - struct rtentry32 __user *ur4 = argp; - ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst), - 3 * sizeof(struct sockaddr)); - ret |= get_user(r4.rt_flags, &(ur4->rt_flags)); - ret |= get_user(r4.rt_metric, &(ur4->rt_metric)); - ret |= get_user(r4.rt_mtu, &(ur4->rt_mtu)); - ret |= get_user(r4.rt_window, &(ur4->rt_window)); - ret |= get_user(r4.rt_irtt, &(ur4->rt_irtt)); - ret |= get_user(rtdev, &(ur4->rt_dev)); - if (rtdev) { - ret |= copy_from_user(devname, compat_ptr(rtdev), 15); - r4.rt_dev = (char __user __force *)devname; - devname[15] = 0; - } else - r4.rt_dev = NULL; - - r = (void *) &r4; - } - - if (ret) { - ret = -EFAULT; - goto out; - } - - set_fs(KERNEL_DS); - ret = sock_do_ioctl(net, sock, cmd, (unsigned long) r); - set_fs(old_fs); - -out: - return ret; -} - /* Since old style bridge ioctl's endup using SIOCDEVPRIVATE * for some operations; this forces use of the newer bridge-utils that * use compatible ioctls @@ -3502,9 +3404,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock, case SIOCGIFMAP: case SIOCSIFMAP: return compat_sioc_ifmap(net, cmd, argp); - case SIOCADDRT: - case SIOCDELRT: - return routing_ioctl(net, sock, cmd, argp); case SIOCGSTAMP_OLD: case SIOCGSTAMPNS_OLD: if (!sock->ops->gettstamp) @@ -3726,71 +3625,6 @@ int kernel_getpeername(struct socket *sock, struct sockaddr *addr) EXPORT_SYMBOL(kernel_getpeername); /** - * kernel_getsockopt - get a socket option (kernel space) - * @sock: socket - * @level: API level (SOL_SOCKET, ...) - * @optname: option tag - * @optval: option value - * @optlen: option length - * - * Assigns the option length to @optlen. - * Returns 0 or an error. - */ - -int kernel_getsockopt(struct socket *sock, int level, int optname, - char *optval, int *optlen) -{ - mm_segment_t oldfs = get_fs(); - char __user *uoptval; - int __user *uoptlen; - int err; - - uoptval = (char __user __force *) optval; - uoptlen = (int __user __force *) optlen; - - set_fs(KERNEL_DS); - if (level == SOL_SOCKET) - err = sock_getsockopt(sock, level, optname, uoptval, uoptlen); - else - err = sock->ops->getsockopt(sock, level, optname, uoptval, - uoptlen); - set_fs(oldfs); - return err; -} -EXPORT_SYMBOL(kernel_getsockopt); - -/** - * kernel_setsockopt - set a socket option (kernel space) - * @sock: socket - * @level: API level (SOL_SOCKET, ...) - * @optname: option tag - * @optval: option value - * @optlen: option length - * - * Returns 0 or an error. - */ - -int kernel_setsockopt(struct socket *sock, int level, int optname, - char *optval, unsigned int optlen) -{ - mm_segment_t oldfs = get_fs(); - char __user *uoptval; - int err; - - uoptval = (char __user __force *) optval; - - set_fs(KERNEL_DS); - if (level == SOL_SOCKET) - err = sock_setsockopt(sock, level, optname, uoptval, optlen); - else - err = sock->ops->setsockopt(sock, level, optname, uoptval, - optlen); - set_fs(oldfs); - return err; -} -EXPORT_SYMBOL(kernel_setsockopt); - -/** * kernel_sendpage - send a &page through a socket (kernel space) * @sock: socket * @page: page diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 023514e392b3..e7a0037d9b56 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -323,17 +323,9 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt) static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt) { - struct svc_sock *svsk; - struct socket *sock; - struct linger no_linger = { - .l_onoff = 1, - .l_linger = 0, - }; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - svsk = container_of(xprt, struct svc_sock, sk_xprt); - sock = svsk->sk_sock; - kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&no_linger, sizeof(no_linger)); + sock_no_linger(svsk->sk_sock->sk); } /* @@ -603,8 +595,6 @@ static struct svc_xprt_class svc_udp_class = { static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) { - int err, level, optname, one = 1; - svc_xprt_init(sock_net(svsk->sk_sock->sk), &svc_udp_class, &svsk->sk_xprt, serv); clear_bit(XPT_CACHE_AUTH, &svsk->sk_xprt.xpt_flags); @@ -624,19 +614,14 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) /* make sure we get destination address info */ switch (svsk->sk_sk->sk_family) { case AF_INET: - level = SOL_IP; - optname = IP_PKTINFO; + ip_sock_set_pktinfo(svsk->sk_sock->sk); break; case AF_INET6: - level = SOL_IPV6; - optname = IPV6_RECVPKTINFO; + ip6_sock_set_recvpktinfo(svsk->sk_sock->sk); break; default: BUG(); } - err = kernel_setsockopt(svsk->sk_sock, level, optname, - (char *)&one, sizeof(one)); - dprintk("svc: kernel_setsockopt returned %d\n", err); } /* @@ -1337,7 +1322,6 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, struct sockaddr *newsin = (struct sockaddr *)&addr; int newlen; int family; - int val; RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk("svc: svc_create_socket(%s, %d, %s)\n", @@ -1373,11 +1357,8 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, * getting requests from IPv4 remotes. Those should * be shunted to a PF_INET listener via rpcbind. */ - val = 1; if (family == PF_INET6) - kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, - (char *)&val, sizeof(val)); - + ip6_sock_set_v6only(sock->sk); if (type == SOCK_STREAM) sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */ error = kernel_bind(sock, sin, len); diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index d75f17b56f0e..999eee1ed61c 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -60,7 +60,7 @@ rpc_unregister_sysctl(void) } static int proc_do_xprt(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[256]; size_t len; @@ -70,15 +70,15 @@ static int proc_do_xprt(struct ctl_table *table, int write, return 0; } len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); - return simple_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); + return memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); } static int -proc_dodebug(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +proc_dodebug(struct ctl_table *table, int write, void *buffer, size_t *lenp, + loff_t *ppos) { - char tmpbuf[20], c, *s = NULL; - char __user *p; + char tmpbuf[20], *s = NULL; + char *p; unsigned int value; size_t left, len; @@ -90,18 +90,17 @@ proc_dodebug(struct ctl_table *table, int write, left = *lenp; if (write) { - if (!access_ok(buffer, left)) - return -EFAULT; p = buffer; - while (left && __get_user(c, p) >= 0 && isspace(c)) - left--, p++; + while (left && isspace(*p)) { + left--; + p++; + } if (!left) goto done; if (left > sizeof(tmpbuf) - 1) return -EINVAL; - if (copy_from_user(tmpbuf, p, left)) - return -EFAULT; + memcpy(tmpbuf, p, left); tmpbuf[left] = '\0'; value = simple_strtol(tmpbuf, &s, 0); @@ -121,11 +120,9 @@ proc_dodebug(struct ctl_table *table, int write, len = sprintf(tmpbuf, "0x%04x", *(unsigned int *) table->data); if (len > left) len = left; - if (copy_to_user(buffer, tmpbuf, len)) - return -EFAULT; + memcpy(buffer, tmpbuf, len); if ((left -= len) > 0) { - if (put_user('\n', (char __user *)buffer + len)) - return -EFAULT; + *((char *)buffer + len) = '\n'; left--; } } diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 97bca509a391..526da5d4710b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -80,8 +80,7 @@ atomic_t rdma_stat_sq_prod; * current value. */ static int read_reset_stat(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { atomic_t *stat = (atomic_t *)table->data; @@ -103,8 +102,8 @@ static int read_reset_stat(struct ctl_table *table, int write, len -= *ppos; if (len > *lenp) len = *lenp; - if (len && copy_to_user(buffer, str_buf, len)) - return -EFAULT; + if (len) + memcpy(buffer, str_buf, len); *lenp = len; *ppos += len; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 845d0be805ec..3a143e250b9a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1594,21 +1594,6 @@ static int xs_get_random_port(void) return rand + min; } -/** - * xs_set_reuseaddr_port - set the socket's port and address reuse options - * @sock: socket - * - * Note that this function has to be called on all sockets that share the - * same port, and it must be called before binding. - */ -static void xs_sock_set_reuseport(struct socket *sock) -{ - int opt = 1; - - kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, - (char *)&opt, sizeof(opt)); -} - static unsigned short xs_sock_getport(struct socket *sock) { struct sockaddr_storage buf; @@ -1801,7 +1786,7 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt, xs_reclassify_socket(family, sock); if (reuseport) - xs_sock_set_reuseport(sock); + sock_set_reuseport(sock->sk); err = xs_bind(transport, sock); if (err) { @@ -2110,7 +2095,6 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); unsigned int keepidle; unsigned int keepcnt; - unsigned int opt_on = 1; unsigned int timeo; spin_lock(&xprt->transport_lock); @@ -2122,18 +2106,13 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, spin_unlock(&xprt->transport_lock); /* TCP Keepalive options */ - kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&opt_on, sizeof(opt_on)); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, - (char *)&keepidle, sizeof(keepidle)); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, - (char *)&keepidle, sizeof(keepidle)); - kernel_setsockopt(sock, SOL_TCP, TCP_KEEPCNT, - (char *)&keepcnt, sizeof(keepcnt)); + sock_set_keepalive(sock->sk); + tcp_sock_set_keepidle(sock->sk, keepidle); + tcp_sock_set_keepintvl(sock->sk, keepidle); + tcp_sock_set_keepcnt(sock->sk, keepcnt); /* TCP user timeout (see RFC5482) */ - kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, - (char *)&timeo, sizeof(timeo)); + tcp_sock_set_user_timeout(sock->sk, timeo); } static void xs_tcp_set_connect_timeout(struct rpc_xprt *xprt, @@ -2171,7 +2150,6 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) if (!transport->inet) { struct sock *sk = sock->sk; - unsigned int addr_pref = IPV6_PREFER_SRC_PUBLIC; /* Avoid temporary address, they are bad for long-lived * connections such as NFS mounts. @@ -2180,8 +2158,10 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) * knowledge about the normal duration of connections, * MAY override this as appropriate. */ - kernel_setsockopt(sock, SOL_IPV6, IPV6_ADDR_PREFERENCES, - (char *)&addr_pref, sizeof(addr_pref)); + if (xs_addr(xprt)->sa_family == PF_INET6) { + ip6_sock_set_addr_preferences(sk, + IPV6_PREFER_SRC_PUBLIC); + } xs_tcp_set_socket_timeouts(xprt, sock); diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 4c20be08b9c4..383f87bc1061 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -46,6 +46,7 @@ #define BCLINK_WIN_MIN 32 /* bcast minimum link window size */ const char tipc_bclink_name[] = "broadcast-link"; +unsigned long sysctl_tipc_bc_retruni __read_mostly; /** * struct tipc_bc_base - base structure for keeping broadcast send state @@ -474,7 +475,7 @@ void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, __skb_queue_head_init(&xmitq); tipc_bcast_lock(net); - tipc_link_bc_ack_rcv(l, acked, &xmitq); + tipc_link_bc_ack_rcv(l, acked, 0, NULL, &xmitq, NULL); tipc_bcast_unlock(net); tipc_bcbase_xmit(net, &xmitq); @@ -489,9 +490,11 @@ void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, * RCU is locked, no other locks set */ int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, - struct tipc_msg *hdr) + struct tipc_msg *hdr, + struct sk_buff_head *retrq) { struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq; + struct tipc_gap_ack_blks *ga; struct sk_buff_head xmitq; int rc = 0; @@ -501,8 +504,13 @@ int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, if (msg_type(hdr) != STATE_MSG) { tipc_link_bc_init_rcv(l, hdr); } else if (!msg_bc_ack_invalid(hdr)) { - tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr), &xmitq); - rc = tipc_link_bc_sync_rcv(l, hdr, &xmitq); + tipc_get_gap_ack_blks(&ga, l, hdr, false); + if (!sysctl_tipc_bc_retruni) + retrq = &xmitq; + rc = tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr), + msg_bc_gap(hdr), ga, &xmitq, + retrq); + rc |= tipc_link_bc_sync_rcv(l, hdr, &xmitq); } tipc_bcast_unlock(net); @@ -555,10 +563,8 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_l) tipc_sk_rcv(net, inputq); } -int tipc_bclink_reset_stats(struct net *net) +int tipc_bclink_reset_stats(struct net *net, struct tipc_link *l) { - struct tipc_link *l = tipc_bc_sndlink(net); - if (!l) return -ENOPROTOOPT; @@ -686,7 +692,7 @@ int tipc_bcast_init(struct net *net) tn->bcbase = bb; spin_lock_init(&tipc_net(net)->bclock); - if (!tipc_link_bc_create(net, 0, 0, + if (!tipc_link_bc_create(net, 0, 0, NULL, FB_MTU, BCLINK_WIN_DEFAULT, BCLINK_WIN_DEFAULT, diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 9e847d9617d3..4240c95188b1 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -45,6 +45,7 @@ struct tipc_nl_msg; struct tipc_nlist; struct tipc_nitem; extern const char tipc_bclink_name[]; +extern unsigned long sysctl_tipc_bc_retruni; #define TIPC_METHOD_EXPIRE msecs_to_jiffies(5000) @@ -93,10 +94,12 @@ int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb); void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, struct tipc_msg *hdr); int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l, - struct tipc_msg *hdr); -int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg); + struct tipc_msg *hdr, + struct sk_buff_head *retrq); +int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg, + struct tipc_link *bcl); int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]); -int tipc_bclink_reset_stats(struct net *net); +int tipc_bclink_reset_stats(struct net *net, struct tipc_link *l); u32 tipc_bcast_get_broadcast_mode(struct net *net); u32 tipc_bcast_get_broadcast_ratio(struct net *net); diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 8c47ded2edb6..c8c47fc72653 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -1712,7 +1712,6 @@ exit: case -EBUSY: this_cpu_inc(stats->stat[STAT_ASYNC]); *skb = NULL; - tipc_aead_put(aead); return rc; default: this_cpu_inc(stats->stat[STAT_NOK]); diff --git a/net/tipc/link.c b/net/tipc/link.c index d4675e922a8f..ee3b8d0576b8 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -188,6 +188,8 @@ struct tipc_link { /* Broadcast */ u16 ackers; u16 acked; + u16 last_gap; + struct tipc_gap_ack_blks *last_ga; struct tipc_link *bc_rcvlink; struct tipc_link *bc_sndlink; u8 nack_state; @@ -249,11 +251,14 @@ static int tipc_link_build_nack_msg(struct tipc_link *l, struct sk_buff_head *xmitq); static void tipc_link_build_bc_init_msg(struct tipc_link *l, struct sk_buff_head *xmitq); -static int tipc_link_release_pkts(struct tipc_link *l, u16 to); -static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data, u16 gap); -static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, +static u8 __tipc_build_gap_ack_blks(struct tipc_gap_ack_blks *ga, + struct tipc_link *l, u8 start_index); +static u16 tipc_build_gap_ack_blks(struct tipc_link *l, struct tipc_msg *hdr); +static int tipc_link_advance_transmq(struct tipc_link *l, struct tipc_link *r, + u16 acked, u16 gap, struct tipc_gap_ack_blks *ga, - struct sk_buff_head *xmitq); + struct sk_buff_head *xmitq, + bool *retransmitted, int *rc); static void tipc_link_update_cwin(struct tipc_link *l, int released, bool retransmitted); /* @@ -370,7 +375,7 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l, snd_l->ackers--; rcv_l->bc_peer_is_up = true; rcv_l->state = LINK_ESTABLISHED; - tipc_link_bc_ack_rcv(rcv_l, ack, xmitq); + tipc_link_bc_ack_rcv(rcv_l, ack, 0, NULL, xmitq, NULL); trace_tipc_link_reset(rcv_l, TIPC_DUMP_ALL, "bclink removed!"); tipc_link_reset(rcv_l); rcv_l->state = LINK_RESET; @@ -534,7 +539,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, * * Returns true if link was created, otherwise false */ -bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, +bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, u8 *peer_id, int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, @@ -549,7 +554,18 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, return false; l = *link; - strcpy(l->name, tipc_bclink_name); + if (peer_id) { + char peer_str[NODE_ID_STR_LEN] = {0,}; + + tipc_nodeid2string(peer_str, peer_id); + if (strlen(peer_str) > 16) + sprintf(peer_str, "%x", peer); + /* Broadcast receiver link name: "broadcast-link:<peer>" */ + snprintf(l->name, sizeof(l->name), "%s:%s", tipc_bclink_name, + peer_str); + } else { + strcpy(l->name, tipc_bclink_name); + } trace_tipc_link_reset(l, TIPC_DUMP_ALL, "bclink created!"); tipc_link_reset(l); l->state = LINK_RESET; @@ -784,8 +800,6 @@ bool tipc_link_too_silent(struct tipc_link *l) return (l->silent_intv_cnt + 2 > l->abort_limit); } -static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, - u16 from, u16 to, struct sk_buff_head *xmitq); /* tipc_link_timeout - perform periodic task as instructed from node timeout */ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) @@ -948,6 +962,9 @@ void tipc_link_reset(struct tipc_link *l) l->snd_nxt_state = 1; l->rcv_nxt_state = 1; l->acked = 0; + l->last_gap = 0; + kfree(l->last_ga); + l->last_ga = NULL; l->silent_intv_cnt = 0; l->rst_cnt = 0; l->bc_peer_is_up = false; @@ -1183,68 +1200,14 @@ static bool link_retransmit_failure(struct tipc_link *l, struct tipc_link *r, if (link_is_bc_sndlink(l)) { r->state = LINK_RESET; - *rc = TIPC_LINK_DOWN_EVT; + *rc |= TIPC_LINK_DOWN_EVT; } else { - *rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); + *rc |= tipc_link_fsm_evt(l, LINK_FAILURE_EVT); } return true; } -/* tipc_link_bc_retrans() - retransmit zero or more packets - * @l: the link to transmit on - * @r: the receiving link ordering the retransmit. Same as l if unicast - * @from: retransmit from (inclusive) this sequence number - * @to: retransmit to (inclusive) this sequence number - * xmitq: queue for accumulating the retransmitted packets - */ -static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, - u16 from, u16 to, struct sk_buff_head *xmitq) -{ - struct sk_buff *_skb, *skb = skb_peek(&l->transmq); - u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; - u16 ack = l->rcv_nxt - 1; - int retransmitted = 0; - struct tipc_msg *hdr; - int rc = 0; - - if (!skb) - return 0; - if (less(to, from)) - return 0; - - trace_tipc_link_retrans(r, from, to, &l->transmq); - - if (link_retransmit_failure(l, r, &rc)) - return rc; - - skb_queue_walk(&l->transmq, skb) { - hdr = buf_msg(skb); - if (less(msg_seqno(hdr), from)) - continue; - if (more(msg_seqno(hdr), to)) - break; - if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) - continue; - TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; - _skb = pskb_copy(skb, GFP_ATOMIC); - if (!_skb) - return 0; - hdr = buf_msg(_skb); - msg_set_ack(hdr, ack); - msg_set_bcast_ack(hdr, bc_ack); - _skb->priority = TC_PRIO_CONTROL; - __skb_queue_tail(xmitq, _skb); - l->stats.retransmitted++; - retransmitted++; - /* Increase actual retrans counter & mark first time */ - if (!TIPC_SKB_CB(skb)->retr_cnt++) - TIPC_SKB_CB(skb)->retr_stamp = jiffies; - } - tipc_link_update_cwin(l, 0, retransmitted); - return 0; -} - /* tipc_data_input - deliver data and name distr msgs to upper layer * * Consumes buffer if message is of right type @@ -1402,46 +1365,68 @@ static int tipc_link_tnl_rcv(struct tipc_link *l, struct sk_buff *skb, return rc; } -static int tipc_link_release_pkts(struct tipc_link *l, u16 acked) -{ - int released = 0; - struct sk_buff *skb, *tmp; - - skb_queue_walk_safe(&l->transmq, skb, tmp) { - if (more(buf_seqno(skb), acked)) - break; - __skb_unlink(skb, &l->transmq); - kfree_skb(skb); - released++; +/** + * tipc_get_gap_ack_blks - get Gap ACK blocks from PROTOCOL/STATE_MSG + * @ga: returned pointer to the Gap ACK blocks if any + * @l: the tipc link + * @hdr: the PROTOCOL/STATE_MSG header + * @uc: desired Gap ACK blocks type, i.e. unicast (= 1) or broadcast (= 0) + * + * Return: the total Gap ACK blocks size + */ +u16 tipc_get_gap_ack_blks(struct tipc_gap_ack_blks **ga, struct tipc_link *l, + struct tipc_msg *hdr, bool uc) +{ + struct tipc_gap_ack_blks *p; + u16 sz = 0; + + /* Does peer support the Gap ACK blocks feature? */ + if (l->peer_caps & TIPC_GAP_ACK_BLOCK) { + p = (struct tipc_gap_ack_blks *)msg_data(hdr); + sz = ntohs(p->len); + /* Sanity check */ + if (sz == tipc_gap_ack_blks_sz(p->ugack_cnt + p->bgack_cnt)) { + /* Good, check if the desired type exists */ + if ((uc && p->ugack_cnt) || (!uc && p->bgack_cnt)) + goto ok; + /* Backward compatible: peer might not support bc, but uc? */ + } else if (uc && sz == tipc_gap_ack_blks_sz(p->ugack_cnt)) { + if (p->ugack_cnt) { + p->bgack_cnt = 0; + goto ok; + } + } } - return released; + /* Other cases: ignore! */ + p = NULL; + +ok: + *ga = p; + return sz; } -/* tipc_build_gap_ack_blks - build Gap ACK blocks - * @l: tipc link that data have come with gaps in sequence if any - * @data: data buffer to store the Gap ACK blocks after built - * - * returns the actual allocated memory size - */ -static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data, u16 gap) +static u8 __tipc_build_gap_ack_blks(struct tipc_gap_ack_blks *ga, + struct tipc_link *l, u8 start_index) { + struct tipc_gap_ack *gacks = &ga->gacks[start_index]; struct sk_buff *skb = skb_peek(&l->deferdq); - struct tipc_gap_ack_blks *ga = data; - u16 len, expect, seqno = 0; + u16 expect, seqno = 0; u8 n = 0; - if (!skb || !gap) - goto exit; + if (!skb) + return 0; expect = buf_seqno(skb); skb_queue_walk(&l->deferdq, skb) { seqno = buf_seqno(skb); if (unlikely(more(seqno, expect))) { - ga->gacks[n].ack = htons(expect - 1); - ga->gacks[n].gap = htons(seqno - expect); - if (++n >= MAX_GAP_ACK_BLKS) { - pr_info_ratelimited("Too few Gap ACK blocks!\n"); - goto exit; + gacks[n].ack = htons(expect - 1); + gacks[n].gap = htons(seqno - expect); + if (++n >= MAX_GAP_ACK_BLKS / 2) { + pr_info_ratelimited("Gacks on %s: %d, ql: %d!\n", + l->name, n, + skb_queue_len(&l->deferdq)); + return n; } } else if (unlikely(less(seqno, expect))) { pr_warn("Unexpected skb in deferdq!\n"); @@ -1451,14 +1436,44 @@ static u16 tipc_build_gap_ack_blks(struct tipc_link *l, void *data, u16 gap) } /* last block */ - ga->gacks[n].ack = htons(seqno); - ga->gacks[n].gap = 0; + gacks[n].ack = htons(seqno); + gacks[n].gap = 0; n++; + return n; +} -exit: - len = tipc_gap_ack_blks_sz(n); +/* tipc_build_gap_ack_blks - build Gap ACK blocks + * @l: tipc unicast link + * @hdr: the tipc message buffer to store the Gap ACK blocks after built + * + * The function builds Gap ACK blocks for both the unicast & broadcast receiver + * links of a certain peer, the buffer after built has the network data format + * as found at the struct tipc_gap_ack_blks definition. + * + * returns the actual allocated memory size + */ +static u16 tipc_build_gap_ack_blks(struct tipc_link *l, struct tipc_msg *hdr) +{ + struct tipc_link *bcl = l->bc_rcvlink; + struct tipc_gap_ack_blks *ga; + u16 len; + + ga = (struct tipc_gap_ack_blks *)msg_data(hdr); + + /* Start with broadcast link first */ + tipc_bcast_lock(bcl->net); + msg_set_bcast_ack(hdr, bcl->rcv_nxt - 1); + msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl)); + ga->bgack_cnt = __tipc_build_gap_ack_blks(ga, bcl, 0); + tipc_bcast_unlock(bcl->net); + + /* Now for unicast link, but an explicit NACK only (???) */ + ga->ugack_cnt = (msg_seq_gap(hdr)) ? + __tipc_build_gap_ack_blks(ga, l, ga->bgack_cnt) : 0; + + /* Total len */ + len = tipc_gap_ack_blks_sz(ga->bgack_cnt + ga->ugack_cnt); ga->len = htons(len); - ga->gack_cnt = n; return len; } @@ -1466,47 +1481,111 @@ exit: * acked packets, also doing retransmissions if * gaps found * @l: tipc link with transmq queue to be advanced + * @r: tipc link "receiver" i.e. in case of broadcast (= "l" if unicast) * @acked: seqno of last packet acked by peer without any gaps before * @gap: # of gap packets * @ga: buffer pointer to Gap ACK blocks from peer * @xmitq: queue for accumulating the retransmitted packets if any + * @retransmitted: returned boolean value if a retransmission is really issued + * @rc: returned code e.g. TIPC_LINK_DOWN_EVT if a repeated retransmit failures + * happens (- unlikely case) * - * In case of a repeated retransmit failures, the call will return shortly - * with a returned code (e.g. TIPC_LINK_DOWN_EVT) + * Return: the number of packets released from the link transmq */ -static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, +static int tipc_link_advance_transmq(struct tipc_link *l, struct tipc_link *r, + u16 acked, u16 gap, struct tipc_gap_ack_blks *ga, - struct sk_buff_head *xmitq) + struct sk_buff_head *xmitq, + bool *retransmitted, int *rc) { + struct tipc_gap_ack_blks *last_ga = r->last_ga, *this_ga = NULL; + struct tipc_gap_ack *gacks = NULL; struct sk_buff *skb, *_skb, *tmp; struct tipc_msg *hdr; + u32 qlen = skb_queue_len(&l->transmq); + u16 nacked = acked, ngap = gap, gack_cnt = 0; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; - bool retransmitted = false; u16 ack = l->rcv_nxt - 1; - bool passed = false; - u16 released = 0; u16 seqno, n = 0; - int rc = 0; + u16 end = r->acked, start = end, offset = r->last_gap; + u16 si = (last_ga) ? last_ga->start_index : 0; + bool is_uc = !link_is_bc_sndlink(l); + bool bc_has_acked = false; + + trace_tipc_link_retrans(r, acked + 1, acked + gap, &l->transmq); + + /* Determine Gap ACK blocks if any for the particular link */ + if (ga && is_uc) { + /* Get the Gap ACKs, uc part */ + gack_cnt = ga->ugack_cnt; + gacks = &ga->gacks[ga->bgack_cnt]; + } else if (ga) { + /* Copy the Gap ACKs, bc part, for later renewal if needed */ + this_ga = kmemdup(ga, tipc_gap_ack_blks_sz(ga->bgack_cnt), + GFP_ATOMIC); + if (likely(this_ga)) { + this_ga->start_index = 0; + /* Start with the bc Gap ACKs */ + gack_cnt = this_ga->bgack_cnt; + gacks = &this_ga->gacks[0]; + } else { + /* Hmm, we can get in trouble..., simply ignore it */ + pr_warn_ratelimited("Ignoring bc Gap ACKs, no memory\n"); + } + } + /* Advance the link transmq */ skb_queue_walk_safe(&l->transmq, skb, tmp) { seqno = buf_seqno(skb); next_gap_ack: - if (less_eq(seqno, acked)) { + if (less_eq(seqno, nacked)) { + if (is_uc) + goto release; + /* Skip packets peer has already acked */ + if (!more(seqno, r->acked)) + continue; + /* Get the next of last Gap ACK blocks */ + while (more(seqno, end)) { + if (!last_ga || si >= last_ga->bgack_cnt) + break; + start = end + offset + 1; + end = ntohs(last_ga->gacks[si].ack); + offset = ntohs(last_ga->gacks[si].gap); + si++; + WARN_ONCE(more(start, end) || + (!offset && + si < last_ga->bgack_cnt) || + si > MAX_GAP_ACK_BLKS, + "Corrupted Gap ACK: %d %d %d %d %d\n", + start, end, offset, si, + last_ga->bgack_cnt); + } + /* Check against the last Gap ACK block */ + if (in_range(seqno, start, end)) + continue; + /* Update/release the packet peer is acking */ + bc_has_acked = true; + if (--TIPC_SKB_CB(skb)->ackers) + continue; +release: /* release skb */ __skb_unlink(skb, &l->transmq); kfree_skb(skb); - released++; - } else if (less_eq(seqno, acked + gap)) { - /* First, check if repeated retrans failures occurs? */ - if (!passed && link_retransmit_failure(l, l, &rc)) - return rc; - passed = true; - + } else if (less_eq(seqno, nacked + ngap)) { + /* First gap: check if repeated retrans failures? */ + if (unlikely(seqno == acked + 1 && + link_retransmit_failure(l, r, rc))) { + /* Ignore this bc Gap ACKs if any */ + kfree(this_ga); + this_ga = NULL; + break; + } /* retransmit skb if unrestricted*/ if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; - TIPC_SKB_CB(skb)->nxt_retr = TIPC_UC_RETR_TIME; + TIPC_SKB_CB(skb)->nxt_retr = (is_uc) ? + TIPC_UC_RETR_TIME : TIPC_BC_RETR_LIM; _skb = pskb_copy(skb, GFP_ATOMIC); if (!_skb) continue; @@ -1516,25 +1595,53 @@ next_gap_ack: _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; - retransmitted = true; + if (!is_uc) + r->stats.retransmitted++; + *retransmitted = true; /* Increase actual retrans counter & mark first time */ if (!TIPC_SKB_CB(skb)->retr_cnt++) TIPC_SKB_CB(skb)->retr_stamp = jiffies; } else { /* retry with Gap ACK blocks if any */ - if (!ga || n >= ga->gack_cnt) + if (n >= gack_cnt) break; - acked = ntohs(ga->gacks[n].ack); - gap = ntohs(ga->gacks[n].gap); + nacked = ntohs(gacks[n].ack); + ngap = ntohs(gacks[n].gap); n++; goto next_gap_ack; } } - if (released || retransmitted) - tipc_link_update_cwin(l, released, retransmitted); - if (released) - tipc_link_advance_backlog(l, xmitq); - return 0; + + /* Renew last Gap ACK blocks for bc if needed */ + if (bc_has_acked) { + if (this_ga) { + kfree(last_ga); + r->last_ga = this_ga; + r->last_gap = gap; + } else if (last_ga) { + if (less(acked, start)) { + si--; + offset = start - acked - 1; + } else if (less(acked, end)) { + acked = end; + } + if (si < last_ga->bgack_cnt) { + last_ga->start_index = si; + r->last_gap = offset; + } else { + kfree(last_ga); + r->last_ga = NULL; + r->last_gap = 0; + } + } else { + r->last_gap = 0; + } + r->acked = acked; + } else { + kfree(this_ga); + } + + return qlen - skb_queue_len(&l->transmq); } /* tipc_link_build_state_msg: prepare link state message for transmission @@ -1651,11 +1758,13 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, kfree_skb(skb); break; } - released += tipc_link_release_pkts(l, msg_ack(hdr)); + released += tipc_link_advance_transmq(l, l, msg_ack(hdr), 0, + NULL, NULL, NULL, NULL); /* Defer delivery if sequence gap */ if (unlikely(seqno != rcv_nxt)) { - __tipc_skb_queue_sorted(defq, seqno, skb); + if (!__tipc_skb_queue_sorted(defq, seqno, skb)) + l->stats.duplicates++; rc |= tipc_link_build_nack_msg(l, xmitq); break; } @@ -1689,15 +1798,15 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, int tolerance, int priority, struct sk_buff_head *xmitq) { + struct tipc_mon_state *mstate = &l->mon_state; + struct sk_buff_head *dfq = &l->deferdq; struct tipc_link *bcl = l->bc_rcvlink; - struct sk_buff *skb; struct tipc_msg *hdr; - struct sk_buff_head *dfq = &l->deferdq; + struct sk_buff *skb; bool node_up = link_is_up(bcl); - struct tipc_mon_state *mstate = &l->mon_state; + u16 glen = 0, bc_rcvgap = 0; int dlen = 0; void *data; - u16 glen = 0; /* Don't send protocol message during reset or link failover */ if (tipc_link_is_blocked(l)) @@ -1735,11 +1844,12 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, if (l->peer_caps & TIPC_LINK_PROTO_SEQNO) msg_set_seqno(hdr, l->snd_nxt_state++); msg_set_seq_gap(hdr, rcvgap); - msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl)); + bc_rcvgap = link_bc_rcv_gap(bcl); + msg_set_bc_gap(hdr, bc_rcvgap); msg_set_probe(hdr, probe); msg_set_is_keepalive(hdr, probe || probe_reply); if (l->peer_caps & TIPC_GAP_ACK_BLOCK) - glen = tipc_build_gap_ack_blks(l, data, rcvgap); + glen = tipc_build_gap_ack_blks(l, hdr); tipc_mon_prep(l->net, data + glen, &dlen, mstate, l->bearer_id); msg_set_size(hdr, INT_H_SIZE + glen + dlen); skb_trim(skb, INT_H_SIZE + glen + dlen); @@ -1760,6 +1870,8 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, l->stats.sent_probes++; if (rcvgap) l->stats.sent_nacks++; + if (bc_rcvgap) + bcl->stats.sent_nacks++; skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, skb); trace_tipc_proto_build(skb, false, l->name); @@ -2027,20 +2139,19 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, { struct tipc_msg *hdr = buf_msg(skb); struct tipc_gap_ack_blks *ga = NULL; - u16 rcvgap = 0; - u16 ack = msg_ack(hdr); - u16 gap = msg_seq_gap(hdr); + bool reply = msg_probe(hdr), retransmitted = false; + u16 dlen = msg_data_sz(hdr), glen = 0; u16 peers_snd_nxt = msg_next_sent(hdr); u16 peers_tol = msg_link_tolerance(hdr); u16 peers_prio = msg_linkprio(hdr); + u16 gap = msg_seq_gap(hdr); + u16 ack = msg_ack(hdr); u16 rcv_nxt = l->rcv_nxt; - u16 dlen = msg_data_sz(hdr); + u16 rcvgap = 0; int mtyp = msg_type(hdr); - bool reply = msg_probe(hdr); - u16 glen = 0; - void *data; + int rc = 0, released; char *if_name; - int rc = 0; + void *data; trace_tipc_proto_rcv(skb, false, l->name); if (tipc_link_is_blocked(l) || !xmitq) @@ -2137,13 +2248,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, } /* Receive Gap ACK blocks from peer if any */ - if (l->peer_caps & TIPC_GAP_ACK_BLOCK) { - ga = (struct tipc_gap_ack_blks *)data; - glen = ntohs(ga->len); - /* sanity check: if failed, ignore Gap ACK blocks */ - if (glen != tipc_gap_ack_blks_sz(ga->gack_cnt)) - ga = NULL; - } + glen = tipc_get_gap_ack_blks(&ga, l, hdr, true); tipc_mon_rcv(l->net, data + glen, dlen - glen, l->addr, &l->mon_state, l->bearer_id); @@ -2158,9 +2263,14 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, rcvgap, 0, 0, xmitq); - rc |= tipc_link_advance_transmq(l, ack, gap, ga, xmitq); + released = tipc_link_advance_transmq(l, l, ack, gap, ga, xmitq, + &retransmitted, &rc); if (gap) l->stats.recv_nacks++; + if (released || retransmitted) + tipc_link_update_cwin(l, released, retransmitted); + if (released) + tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) link_prepare_wakeup(l); } @@ -2246,10 +2356,7 @@ void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr) int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, struct sk_buff_head *xmitq) { - struct tipc_link *snd_l = l->bc_sndlink; u16 peers_snd_nxt = msg_bc_snd_nxt(hdr); - u16 from = msg_bcast_ack(hdr) + 1; - u16 to = from + msg_bc_gap(hdr) - 1; int rc = 0; if (!link_is_up(l)) @@ -2265,14 +2372,10 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, if (!l->bc_peer_is_up) return rc; - l->stats.recv_nacks++; - /* Ignore if peers_snd_nxt goes beyond receive window */ if (more(peers_snd_nxt, l->rcv_nxt + l->window)) return rc; - rc = tipc_link_bc_retrans(snd_l, l, from, to, xmitq); - l->snd_nxt = peers_snd_nxt; if (link_bc_rcv_gap(l)) rc |= TIPC_LINK_SND_STATE; @@ -2307,38 +2410,34 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, return 0; } -void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, - struct sk_buff_head *xmitq) +int tipc_link_bc_ack_rcv(struct tipc_link *r, u16 acked, u16 gap, + struct tipc_gap_ack_blks *ga, + struct sk_buff_head *xmitq, + struct sk_buff_head *retrq) { - struct sk_buff *skb, *tmp; - struct tipc_link *snd_l = l->bc_sndlink; - - if (!link_is_up(l) || !l->bc_peer_is_up) - return; + struct tipc_link *l = r->bc_sndlink; + bool unused = false; + int rc = 0; - if (!more(acked, l->acked)) - return; + if (!link_is_up(r) || !r->bc_peer_is_up) + return 0; - trace_tipc_link_bc_ack(l, l->acked, acked, &snd_l->transmq); - /* Skip over packets peer has already acked */ - skb_queue_walk(&snd_l->transmq, skb) { - if (more(buf_seqno(skb), l->acked)) - break; + if (gap) { + l->stats.recv_nacks++; + r->stats.recv_nacks++; } - /* Update/release the packets peer is acking now */ - skb_queue_walk_from_safe(&snd_l->transmq, skb, tmp) { - if (more(buf_seqno(skb), acked)) - break; - if (!--TIPC_SKB_CB(skb)->ackers) { - __skb_unlink(skb, &snd_l->transmq); - kfree_skb(skb); - } - } - l->acked = acked; - tipc_link_advance_backlog(snd_l, xmitq); - if (unlikely(!skb_queue_empty(&snd_l->wakeupq))) - link_prepare_wakeup(snd_l); + if (less(acked, r->acked) || (acked == r->acked && !gap && !ga)) + return 0; + + trace_tipc_link_bc_ack(r, acked, gap, &l->transmq); + tipc_link_advance_transmq(l, r, acked, gap, ga, retrq, &unused, &rc); + + tipc_link_advance_backlog(l, xmitq); + if (unlikely(!skb_queue_empty(&l->wakeupq))) + link_prepare_wakeup(l); + + return rc; } /* tipc_link_bc_nack_rcv(): receive broadcast nack message @@ -2366,8 +2465,8 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, return 0; if (dnode == tipc_own_addr(l->net)) { - tipc_link_bc_ack_rcv(l, acked, xmitq); - rc = tipc_link_bc_retrans(l->bc_sndlink, l, from, to, xmitq); + rc = tipc_link_bc_ack_rcv(l, acked, to - acked, NULL, xmitq, + xmitq); l->stats.recv_nacks++; return rc; } @@ -2639,16 +2738,15 @@ msg_full: return -EMSGSIZE; } -int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg) +int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg, + struct tipc_link *bcl) { int err; void *hdr; struct nlattr *attrs; struct nlattr *prop; - struct tipc_net *tn = net_generic(net, tipc_net_id); u32 bc_mode = tipc_bcast_get_broadcast_mode(net); u32 bc_ratio = tipc_bcast_get_broadcast_ratio(net); - struct tipc_link *bcl = tn->bcl; if (!bcl) return 0; @@ -2735,21 +2833,6 @@ void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit) l->abort_limit = limit; } -char *tipc_link_name_ext(struct tipc_link *l, char *buf) -{ - if (!l) - scnprintf(buf, TIPC_MAX_LINK_NAME, "null"); - else if (link_is_bc_sndlink(l)) - scnprintf(buf, TIPC_MAX_LINK_NAME, "broadcast-sender"); - else if (link_is_bc_rcvlink(l)) - scnprintf(buf, TIPC_MAX_LINK_NAME, - "broadcast-receiver, peer %x", l->addr); - else - memcpy(buf, l->name, TIPC_MAX_LINK_NAME); - - return buf; -} - /** * tipc_link_dump - dump TIPC link data * @l: tipc link to be dumped diff --git a/net/tipc/link.h b/net/tipc/link.h index d3c1c3fc1659..fc07232c9a12 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -80,7 +80,7 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link **link); -bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, +bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, u8 *peer_id, int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, @@ -111,7 +111,6 @@ u16 tipc_link_rcv_nxt(struct tipc_link *l); u16 tipc_link_acked(struct tipc_link *l); u32 tipc_link_id(struct tipc_link *l); char *tipc_link_name(struct tipc_link *l); -char *tipc_link_name_ext(struct tipc_link *l, char *buf); u32 tipc_link_state(struct tipc_link *l); char tipc_link_plane(struct tipc_link *l); int tipc_link_prio(struct tipc_link *l); @@ -143,8 +142,12 @@ int tipc_link_bc_peers(struct tipc_link *l); void tipc_link_set_mtu(struct tipc_link *l, int mtu); int tipc_link_mtu(struct tipc_link *l); int tipc_link_mss(struct tipc_link *l); -void tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, - struct sk_buff_head *xmitq); +u16 tipc_get_gap_ack_blks(struct tipc_gap_ack_blks **ga, struct tipc_link *l, + struct tipc_msg *hdr, bool uc); +int tipc_link_bc_ack_rcv(struct tipc_link *l, u16 acked, u16 gap, + struct tipc_gap_ack_blks *ga, + struct sk_buff_head *xmitq, + struct sk_buff_head *retrq); void tipc_link_build_bc_sync_msg(struct tipc_link *l, struct sk_buff_head *xmitq); void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr); diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 4d0e0bdd997b..c0afcd627c5e 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -212,7 +212,7 @@ err: int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, int mss, struct sk_buff_head *txq) { - struct sk_buff *skb, *prev; + struct sk_buff *skb; int accounted, total, curr; int mlen, cpy, rem = dlen; struct tipc_msg *hdr; @@ -223,7 +223,6 @@ int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, while (rem) { if (!skb || skb->len >= mss) { - prev = skb; skb = tipc_buf_acquire(mss, GFP_KERNEL); if (unlikely(!skb)) return -ENOMEM; @@ -235,9 +234,6 @@ int tipc_msg_append(struct tipc_msg *_hdr, struct msghdr *m, int dlen, msg_set_size(hdr, MIN_H_SIZE); __skb_queue_tail(txq, skb); total += 1; - if (prev) - msg_set_ack_required(buf_msg(prev), 0); - msg_set_ack_required(hdr, 1); } hdr = buf_msg(skb); curr = msg_blocks(hdr); @@ -825,19 +821,19 @@ bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, * @seqno: sequence number of buffer to add * @skb: buffer to add */ -void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, +bool __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, struct sk_buff *skb) { struct sk_buff *_skb, *tmp; if (skb_queue_empty(list) || less(seqno, buf_seqno(skb_peek(list)))) { __skb_queue_head(list, skb); - return; + return true; } if (more(seqno, buf_seqno(skb_peek_tail(list)))) { __skb_queue_tail(list, skb); - return; + return true; } skb_queue_walk_safe(list, _skb, tmp) { @@ -846,9 +842,10 @@ void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, if (seqno == buf_seqno(_skb)) break; __skb_queue_before(list, _skb, skb); - return; + return true; } kfree_skb(skb); + return false; } void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 871feadbbc19..58660d56bc83 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -160,20 +160,39 @@ struct tipc_gap_ack { /* struct tipc_gap_ack_blks * @len: actual length of the record - * @gack_cnt: number of Gap ACK blocks in the record + * @ugack_cnt: number of Gap ACK blocks for unicast (following the broadcast + * ones) + * @start_index: starting index for "valid" broadcast Gap ACK blocks + * @bgack_cnt: number of Gap ACK blocks for broadcast in the record * @gacks: array of Gap ACK blocks + * + * 31 16 15 0 + * +-------------+-------------+-------------+-------------+ + * | bgack_cnt | ugack_cnt | len | + * +-------------+-------------+-------------+-------------+ - + * | gap | ack | | + * +-------------+-------------+-------------+-------------+ > bc gacks + * : : : | + * +-------------+-------------+-------------+-------------+ - + * | gap | ack | | + * +-------------+-------------+-------------+-------------+ > uc gacks + * : : : | + * +-------------+-------------+-------------+-------------+ - */ struct tipc_gap_ack_blks { __be16 len; - u8 gack_cnt; - u8 reserved; + union { + u8 ugack_cnt; + u8 start_index; + }; + u8 bgack_cnt; struct tipc_gap_ack gacks[]; }; #define tipc_gap_ack_blks_sz(n) (sizeof(struct tipc_gap_ack_blks) + \ sizeof(struct tipc_gap_ack) * (n)) -#define MAX_GAP_ACK_BLKS 32 +#define MAX_GAP_ACK_BLKS 128 #define MAX_GAP_ACK_BLKS_SZ tipc_gap_ack_blks_sz(MAX_GAP_ACK_BLKS) static inline struct tipc_msg *buf_msg(struct sk_buff *skb) @@ -321,9 +340,19 @@ static inline int msg_ack_required(struct tipc_msg *m) return msg_bits(m, 0, 18, 1); } -static inline void msg_set_ack_required(struct tipc_msg *m, u32 d) +static inline void msg_set_ack_required(struct tipc_msg *m) { - msg_set_bits(m, 0, 18, 1, d); + msg_set_bits(m, 0, 18, 1, 1); +} + +static inline int msg_nagle_ack(struct tipc_msg *m) +{ + return msg_bits(m, 0, 18, 1); +} + +static inline void msg_set_nagle_ack(struct tipc_msg *m) +{ + msg_set_bits(m, 0, 18, 1, 1); } static inline bool msg_is_rcast(struct tipc_msg *m) @@ -1126,7 +1155,7 @@ bool tipc_msg_assemble(struct sk_buff_head *list); bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, struct sk_buff_head *cpy); -void __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, +bool __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, struct sk_buff *skb); bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy); diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index bb9862410e68..c4aee6247d55 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -188,7 +188,7 @@ static const struct genl_ops tipc_genl_v2_ops[] = { }, { .cmd = TIPC_NL_LINK_GET, - .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .validate = GENL_DONT_VALIDATE_STRICT, .doit = tipc_nl_node_get_link, .dumpit = tipc_nl_node_dump_link, }, diff --git a/net/tipc/node.c b/net/tipc/node.c index 803a3a6d0f50..a4c2816c3746 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1138,7 +1138,7 @@ void tipc_node_check_dest(struct net *net, u32 addr, if (unlikely(!n->bc_entry.link)) { snd_l = tipc_bc_sndlink(net); if (!tipc_link_bc_create(net, tipc_own_addr(net), - addr, U16_MAX, + addr, peer_id, U16_MAX, tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), n->capabilities, @@ -1772,7 +1772,7 @@ static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr, struct tipc_link *ucl; int rc; - rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr); + rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr, xmitq); if (rc & TIPC_LINK_DOWN_EVT) { tipc_node_reset_links(n); @@ -2038,7 +2038,6 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) n = tipc_node_find_by_id(net, ehdr->id); } tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b); - tipc_node_put(n); if (!skb) return; @@ -2071,10 +2070,16 @@ rcv: le = &n->links[bearer_id]; /* Ensure broadcast reception is in synch with peer's send state */ - if (unlikely(usr == LINK_PROTOCOL)) + if (unlikely(usr == LINK_PROTOCOL)) { + if (unlikely(skb_linearize(skb))) { + tipc_node_put(n); + goto discard; + } + hdr = buf_msg(skb); tipc_node_bc_sync_rcv(n, hdr, bearer_id, &xmitq); - else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack)) + } else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack)) { tipc_bcast_ack_rcv(net, n->bc_entry.link, hdr); + } /* Receive packet directly if conditions permit */ tipc_node_read_lock(n); @@ -2429,7 +2434,7 @@ int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; if (strcmp(name, tipc_bclink_name) == 0) { - err = tipc_nl_add_bc_link(net, &msg); + err = tipc_nl_add_bc_link(net, &msg, tipc_net(net)->bcl); if (err) goto err_free; } else { @@ -2473,6 +2478,7 @@ int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) struct tipc_node *node; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); + struct tipc_net *tn = tipc_net(net); struct tipc_link_entry *le; if (!info->attrs[TIPC_NLA_LINK]) @@ -2489,11 +2495,26 @@ int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) link_name = nla_data(attrs[TIPC_NLA_LINK_NAME]); - if (strcmp(link_name, tipc_bclink_name) == 0) { - err = tipc_bclink_reset_stats(net); + err = -EINVAL; + if (!strcmp(link_name, tipc_bclink_name)) { + err = tipc_bclink_reset_stats(net, tipc_bc_sndlink(net)); if (err) return err; return 0; + } else if (strstr(link_name, tipc_bclink_name)) { + rcu_read_lock(); + list_for_each_entry_rcu(node, &tn->node_list, list) { + tipc_node_read_lock(node); + link = node->bc_entry.link; + if (link && !strcmp(link_name, tipc_link_name(link))) { + err = tipc_bclink_reset_stats(net, link); + tipc_node_read_unlock(node); + break; + } + tipc_node_read_unlock(node); + } + rcu_read_unlock(); + return err; } node = tipc_node_find_by_name(net, link_name, &bearer_id); @@ -2517,7 +2538,8 @@ int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) /* Caller should hold node lock */ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, - struct tipc_node *node, u32 *prev_link) + struct tipc_node *node, u32 *prev_link, + bool bc_link) { u32 i; int err; @@ -2533,6 +2555,14 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, if (err) return err; } + + if (bc_link) { + *prev_link = i; + err = tipc_nl_add_bc_link(net, msg, node->bc_entry.link); + if (err) + return err; + } + *prev_link = 0; return 0; @@ -2541,17 +2571,36 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); + struct nlattr **attrs = genl_dumpit_info(cb)->attrs; + struct nlattr *link[TIPC_NLA_LINK_MAX + 1]; struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *node; struct tipc_nl_msg msg; u32 prev_node = cb->args[0]; u32 prev_link = cb->args[1]; int done = cb->args[2]; + bool bc_link = cb->args[3]; int err; if (done) return 0; + if (!prev_node) { + /* Check if broadcast-receiver links dumping is needed */ + if (attrs && attrs[TIPC_NLA_LINK]) { + err = nla_parse_nested_deprecated(link, + TIPC_NLA_LINK_MAX, + attrs[TIPC_NLA_LINK], + tipc_nl_link_policy, + NULL); + if (unlikely(err)) + return err; + if (unlikely(!link[TIPC_NLA_LINK_BROADCAST])) + return -EINVAL; + bc_link = true; + } + } + msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; @@ -2575,7 +2624,7 @@ int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, - &prev_link); + &prev_link, bc_link); tipc_node_read_unlock(node); if (err) goto out; @@ -2583,14 +2632,14 @@ int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) prev_node = node->addr; } } else { - err = tipc_nl_add_bc_link(net, &msg); + err = tipc_nl_add_bc_link(net, &msg, tn->bcl); if (err) goto out; list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, - &prev_link); + &prev_link, bc_link); tipc_node_read_unlock(node); if (err) goto out; @@ -2605,6 +2654,7 @@ out: cb->args[0] = prev_node; cb->args[1] = prev_link; cb->args[2] = done; + cb->args[3] = bc_link; return skb->len; } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index e370ad0edd76..26123f4177fd 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -48,6 +48,8 @@ #include "group.h" #include "trace.h" +#define NAGLE_START_INIT 4 +#define NAGLE_START_MAX 1024 #define CONN_TIMEOUT_DEFAULT 8000 /* default connect timeout = 8s */ #define CONN_PROBING_INTV msecs_to_jiffies(3600000) /* [ms] => 1 h */ #define TIPC_FWD_MSG 1 @@ -119,7 +121,10 @@ struct tipc_sock { struct rcu_head rcu; struct tipc_group *group; u32 oneway; + u32 nagle_start; u16 snd_backlog; + u16 msg_acc; + u16 pkt_cnt; bool expect_ack; bool nodelay; bool group_is_open; @@ -143,7 +148,7 @@ static int tipc_sk_insert(struct tipc_sock *tsk); static void tipc_sk_remove(struct tipc_sock *tsk); static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dsz); static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz); -static void tipc_sk_push_backlog(struct tipc_sock *tsk); +static void tipc_sk_push_backlog(struct tipc_sock *tsk, bool nagle_ack); static const struct proto_ops packet_ops; static const struct proto_ops stream_ops; @@ -191,17 +196,17 @@ static int tsk_importance(struct tipc_sock *tsk) return msg_importance(&tsk->phdr); } -static int tsk_set_importance(struct tipc_sock *tsk, int imp) +static struct tipc_sock *tipc_sk(const struct sock *sk) { - if (imp > TIPC_CRITICAL_IMPORTANCE) - return -EINVAL; - msg_set_importance(&tsk->phdr, (u32)imp); - return 0; + return container_of(sk, struct tipc_sock, sk); } -static struct tipc_sock *tipc_sk(const struct sock *sk) +int tsk_set_importance(struct sock *sk, int imp) { - return container_of(sk, struct tipc_sock, sk); + if (imp > TIPC_CRITICAL_IMPORTANCE) + return -EINVAL; + msg_set_importance(&tipc_sk(sk)->phdr, (u32)imp); + return 0; } static bool tsk_conn_cong(struct tipc_sock *tsk) @@ -474,6 +479,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock, tsk = tipc_sk(sk); tsk->max_pkt = MAX_PKT_DEFAULT; tsk->maxnagle = 0; + tsk->nagle_start = NAGLE_START_INIT; INIT_LIST_HEAD(&tsk->publications); INIT_LIST_HEAD(&tsk->cong_links); msg = &tsk->phdr; @@ -541,7 +547,7 @@ static void __tipc_shutdown(struct socket *sock, int error) !tsk_conn_cong(tsk))); /* Push out delayed messages if in Nagle mode */ - tipc_sk_push_backlog(tsk); + tipc_sk_push_backlog(tsk, false); /* Remove pending SYN */ __skb_queue_purge(&sk->sk_write_queue); @@ -1252,14 +1258,37 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, /* tipc_sk_push_backlog(): send accumulated buffers in socket write queue * when socket is in Nagle mode */ -static void tipc_sk_push_backlog(struct tipc_sock *tsk) +static void tipc_sk_push_backlog(struct tipc_sock *tsk, bool nagle_ack) { struct sk_buff_head *txq = &tsk->sk.sk_write_queue; + struct sk_buff *skb = skb_peek_tail(txq); struct net *net = sock_net(&tsk->sk); u32 dnode = tsk_peer_node(tsk); - struct sk_buff *skb = skb_peek(txq); int rc; + if (nagle_ack) { + tsk->pkt_cnt += skb_queue_len(txq); + if (!tsk->pkt_cnt || tsk->msg_acc / tsk->pkt_cnt < 2) { + tsk->oneway = 0; + if (tsk->nagle_start < NAGLE_START_MAX) + tsk->nagle_start *= 2; + tsk->expect_ack = false; + pr_debug("tsk %10u: bad nagle %u -> %u, next start %u!\n", + tsk->portid, tsk->msg_acc, tsk->pkt_cnt, + tsk->nagle_start); + } else { + tsk->nagle_start = NAGLE_START_INIT; + if (skb) { + msg_set_ack_required(buf_msg(skb)); + tsk->expect_ack = true; + } else { + tsk->expect_ack = false; + } + } + tsk->msg_acc = 0; + tsk->pkt_cnt = 0; + } + if (!skb || tsk->cong_link_cnt) return; @@ -1267,9 +1296,10 @@ static void tipc_sk_push_backlog(struct tipc_sock *tsk) if (msg_is_syn(buf_msg(skb))) return; + if (tsk->msg_acc) + tsk->pkt_cnt += skb_queue_len(txq); tsk->snt_unacked += tsk->snd_backlog; tsk->snd_backlog = 0; - tsk->expect_ack = true; rc = tipc_node_xmit(net, txq, dnode, tsk->portid); if (rc == -ELINKCONG) tsk->cong_link_cnt = 1; @@ -1322,8 +1352,7 @@ static void tipc_sk_conn_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, return; } else if (mtyp == CONN_ACK) { was_cong = tsk_conn_cong(tsk); - tsk->expect_ack = false; - tipc_sk_push_backlog(tsk); + tipc_sk_push_backlog(tsk, msg_nagle_ack(hdr)); tsk->snt_unacked -= msg_conn_ack(hdr); if (tsk->peer_caps & TIPC_BLOCK_FLOWCTL) tsk->snd_win = msg_adv_win(hdr); @@ -1516,6 +1545,7 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) struct tipc_sock *tsk = tipc_sk(sk); struct tipc_msg *hdr = &tsk->phdr; struct net *net = sock_net(sk); + struct sk_buff *skb; u32 dnode = tsk_peer_node(tsk); int maxnagle = tsk->maxnagle; int maxpkt = tsk->max_pkt; @@ -1544,17 +1574,29 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) break; send = min_t(size_t, dlen - sent, TIPC_MAX_USER_MSG_SIZE); blocks = tsk->snd_backlog; - if (tsk->oneway++ >= 4 && send <= maxnagle) { + if (tsk->oneway++ >= tsk->nagle_start && send <= maxnagle) { rc = tipc_msg_append(hdr, m, send, maxnagle, txq); if (unlikely(rc < 0)) break; blocks += rc; + tsk->msg_acc++; if (blocks <= 64 && tsk->expect_ack) { tsk->snd_backlog = blocks; sent += send; break; + } else if (blocks > 64) { + tsk->pkt_cnt += skb_queue_len(txq); + } else { + skb = skb_peek_tail(txq); + if (skb) { + msg_set_ack_required(buf_msg(skb)); + tsk->expect_ack = true; + } else { + tsk->expect_ack = false; + } + tsk->msg_acc = 0; + tsk->pkt_cnt = 0; } - tsk->expect_ack = true; } else { rc = tipc_msg_build(hdr, m, sent, send, maxpkt, txq); if (unlikely(rc != send)) @@ -2091,7 +2133,7 @@ static void tipc_sk_proto_rcv(struct sock *sk, smp_wmb(); tsk->cong_link_cnt--; wakeup = true; - tipc_sk_push_backlog(tsk); + tipc_sk_push_backlog(tsk, false); break; case GROUP_PROTOCOL: tipc_group_proto_rcv(grp, &wakeup, hdr, inputq, xmitq); @@ -2180,7 +2222,7 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb, return false; case TIPC_ESTABLISHED: if (!skb_queue_empty(&sk->sk_write_queue)) - tipc_sk_push_backlog(tsk); + tipc_sk_push_backlog(tsk, false); /* Accept only connection-based messages sent by peer */ if (likely(con_msg && !err && pport == oport && pnode == onode)) { @@ -2188,8 +2230,10 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb, struct sk_buff *skb; skb = tipc_sk_build_ack(tsk); - if (skb) + if (skb) { + msg_set_nagle_ack(buf_msg(skb)); __skb_queue_tail(xmitq, skb); + } } return true; } @@ -2681,7 +2725,7 @@ static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags, /* Connect new socket to it's peer */ tipc_sk_finish_conn(new_tsock, msg_origport(msg), msg_orignode(msg)); - tsk_set_importance(new_tsock, msg_importance(msg)); + tsk_set_importance(new_sk, msg_importance(msg)); if (msg_named(msg)) { new_tsock->conn_type = msg_nametype(msg); new_tsock->conn_instance = msg_nameinst(msg); @@ -3099,7 +3143,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, switch (opt) { case TIPC_IMPORTANCE: - res = tsk_set_importance(tsk, value); + res = tsk_set_importance(sk, value); break; case TIPC_SRC_DROPPABLE: if (sock->type != SOCK_STREAM) diff --git a/net/tipc/socket.h b/net/tipc/socket.h index 235b9679acee..b11575afc66f 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -75,4 +75,6 @@ u32 tipc_sock_get_portid(struct sock *sk); bool tipc_sk_overlimit1(struct sock *sk, struct sk_buff *skb); bool tipc_sk_overlimit2(struct sock *sk, struct sk_buff *skb); +int tsk_set_importance(struct sock *sk, int imp); + #endif diff --git a/net/tipc/sysctl.c b/net/tipc/sysctl.c index 58ab3d6dcdce..97a6264a2993 100644 --- a/net/tipc/sysctl.c +++ b/net/tipc/sysctl.c @@ -36,7 +36,7 @@ #include "core.h" #include "trace.h" #include "crypto.h" - +#include "bcast.h" #include <linux/sysctl.h> static struct ctl_table_header *tipc_ctl_hdr; @@ -75,6 +75,13 @@ static struct ctl_table tipc_table[] = { .extra1 = SYSCTL_ONE, }, #endif + { + .procname = "bc_retruni", + .data = &sysctl_tipc_bc_retruni, + .maxlen = sizeof(sysctl_tipc_bc_retruni), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, {} }; diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 446af7bbd13e..1489cfb941d8 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -497,7 +497,6 @@ static void tipc_topsrv_listener_data_ready(struct sock *sk) static int tipc_topsrv_create_listener(struct tipc_topsrv *srv) { - int imp = TIPC_CRITICAL_IMPORTANCE; struct socket *lsock = NULL; struct sockaddr_tipc saddr; struct sock *sk; @@ -514,8 +513,9 @@ static int tipc_topsrv_create_listener(struct tipc_topsrv *srv) sk->sk_user_data = srv; write_unlock_bh(&sk->sk_callback_lock); - rc = kernel_setsockopt(lsock, SOL_TIPC, TIPC_IMPORTANCE, - (char *)&imp, sizeof(imp)); + lock_sock(sk); + rc = tsk_set_importance(sk, TIPC_CRITICAL_IMPORTANCE); + release_sock(sk); if (rc < 0) goto err; diff --git a/net/tipc/trace.h b/net/tipc/trace.h index 4d8e00483afc..04af83f0500c 100644 --- a/net/tipc/trace.h +++ b/net/tipc/trace.h @@ -255,7 +255,7 @@ DECLARE_EVENT_CLASS(tipc_link_class, TP_fast_assign( __assign_str(header, header); - tipc_link_name_ext(l, __entry->name); + memcpy(__entry->name, tipc_link_name(l), TIPC_MAX_LINK_NAME); tipc_link_dump(l, dqueues, __get_str(buf)); ), @@ -295,12 +295,14 @@ DECLARE_EVENT_CLASS(tipc_link_transmq_class, ), TP_fast_assign( - tipc_link_name_ext(r, __entry->name); + memcpy(__entry->name, tipc_link_name(r), TIPC_MAX_LINK_NAME); __entry->from = f; __entry->to = t; __entry->len = skb_queue_len(tq); - __entry->fseqno = msg_seqno(buf_msg(skb_peek(tq))); - __entry->lseqno = msg_seqno(buf_msg(skb_peek_tail(tq))); + __entry->fseqno = __entry->len ? + msg_seqno(buf_msg(skb_peek(tq))) : 0; + __entry->lseqno = __entry->len ? + msg_seqno(buf_msg(skb_peek_tail(tq))) : 0; ), TP_printk("<%s> retrans req: [%u-%u] transmq: %u [%u-%u]\n", @@ -308,15 +310,16 @@ DECLARE_EVENT_CLASS(tipc_link_transmq_class, __entry->len, __entry->fseqno, __entry->lseqno) ); -DEFINE_EVENT(tipc_link_transmq_class, tipc_link_retrans, +DEFINE_EVENT_CONDITION(tipc_link_transmq_class, tipc_link_retrans, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), - TP_ARGS(r, f, t, tq) + TP_ARGS(r, f, t, tq), + TP_CONDITION(less_eq(f, t)) ); DEFINE_EVENT_PRINT(tipc_link_transmq_class, tipc_link_bc_ack, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), TP_ARGS(r, f, t, tq), - TP_printk("<%s> acked: [%u-%u] transmq: %u [%u-%u]\n", + TP_printk("<%s> acked: %u gap: %u transmq: %u [%u-%u]\n", __entry->name, __entry->from, __entry->to, __entry->len, __entry->fseqno, __entry->lseqno) ); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index a562ebaaa33c..0e55f8365ce2 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -694,10 +694,11 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_offload_context_rx *rx_ctx; + bool is_req_pending, is_force_resync; u8 rcd_sn[TLS_MAX_REC_SEQ_SIZE]; - u32 sock_data, is_req_pending; struct tls_prot_info *prot; s64 resync_req; + u32 sock_data; u32 req_seq; if (tls_ctx->rx_conf != TLS_HW) @@ -712,9 +713,11 @@ void tls_device_rx_resync_new_rec(struct sock *sk, u32 rcd_len, u32 seq) resync_req = atomic64_read(&rx_ctx->resync_req); req_seq = resync_req >> 32; seq += TLS_HEADER_SIZE - 1; - is_req_pending = resync_req; + is_req_pending = resync_req & RESYNC_REQ; + is_force_resync = resync_req & RESYNC_REQ_FORCE; - if (likely(!is_req_pending) || req_seq != seq || + if (likely(!is_req_pending) || + (!is_force_resync && req_seq != seq) || !atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0)) return; break; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 8c2763eb6aae..24f64bc0de18 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1742,6 +1742,7 @@ int tls_sw_recvmsg(struct sock *sk, long timeo; bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); bool is_peek = flags & MSG_PEEK; + bool bpf_strp_enabled; int num_async = 0; int pending; @@ -1752,6 +1753,7 @@ int tls_sw_recvmsg(struct sock *sk, psock = sk_psock_get(sk); lock_sock(sk); + bpf_strp_enabled = sk_psock_strp_enabled(psock); /* Process pending decrypted records. It must be non-zero-copy */ err = process_rx_list(ctx, msg, &control, &cmsg, 0, len, false, @@ -1805,11 +1807,12 @@ int tls_sw_recvmsg(struct sock *sk, if (to_decrypt <= len && !is_kvec && !is_peek && ctx->control == TLS_RECORD_TYPE_DATA && - prot->version != TLS_1_3_VERSION) + prot->version != TLS_1_3_VERSION && + !bpf_strp_enabled) zc = true; /* Do not use async mode if record is non-data */ - if (ctx->control == TLS_RECORD_TYPE_DATA) + if (ctx->control == TLS_RECORD_TYPE_DATA && !bpf_strp_enabled) async_capable = ctx->async_capable; else async_capable = false; @@ -1859,6 +1862,19 @@ int tls_sw_recvmsg(struct sock *sk, goto pick_next_record; if (!zc) { + if (bpf_strp_enabled) { + err = sk_psock_tls_strp_read(psock, skb); + if (err != __SK_PASS) { + rxm->offset = rxm->offset + rxm->full_len; + rxm->full_len = 0; + if (err == __SK_DROP) + consume_skb(skb); + ctx->recv_pkt = NULL; + __strp_unpause(&ctx->strp); + continue; + } + } + if (rxm->full_len > len) { retain_skb = true; chunk = len; diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 63cf7131f601..813e93644ae7 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -181,8 +181,8 @@ config CFG80211_CRDA_SUPPORT default y help You should enable this option unless you know for sure you have no - need for it, for example when using internal regdb (above) or the - database loaded as a firmware file. + need for it, for example when using the regulatory database loaded as + a firmware file. If unsure, say Y. diff --git a/net/wireless/chan.c b/net/wireless/chan.c index fcac5c6366e1..cddf92c5d09e 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -6,7 +6,7 @@ * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2018 Intel Corporation + * Copyright 2018-2020 Intel Corporation */ #include <linux/export.h> @@ -27,6 +27,7 @@ void cfg80211_chandef_create(struct cfg80211_chan_def *chandef, return; chandef->chan = chan; + chandef->freq1_offset = chan->freq_offset; chandef->center_freq2 = 0; chandef->edmg.bw_config = 0; chandef->edmg.channels = 0; @@ -146,6 +147,9 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) if (!chandef->chan) return false; + if (chandef->freq1_offset >= 1000) + return false; + control_freq = chandef->chan->center_freq; switch (chandef->width) { @@ -153,7 +157,8 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) case NL80211_CHAN_WIDTH_10: case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_20_NOHT: - if (chandef->center_freq1 != control_freq) + if (ieee80211_chandef_to_khz(chandef) != + ieee80211_channel_to_khz(chandef->chan)) return false; if (chandef->center_freq2) return false; @@ -386,10 +391,11 @@ static u32 cfg80211_get_start_freq(u32 center_freq, { u32 start_freq; - if (bandwidth <= 20) + bandwidth = MHZ_TO_KHZ(bandwidth); + if (bandwidth <= MHZ_TO_KHZ(20)) start_freq = center_freq; else - start_freq = center_freq - bandwidth/2 + 10; + start_freq = center_freq - bandwidth / 2 + MHZ_TO_KHZ(10); return start_freq; } @@ -399,10 +405,11 @@ static u32 cfg80211_get_end_freq(u32 center_freq, { u32 end_freq; - if (bandwidth <= 20) + bandwidth = MHZ_TO_KHZ(bandwidth); + if (bandwidth <= MHZ_TO_KHZ(20)) end_freq = center_freq; else - end_freq = center_freq + bandwidth/2 - 10; + end_freq = center_freq + bandwidth / 2 - MHZ_TO_KHZ(10); return end_freq; } @@ -417,8 +424,8 @@ static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy, start_freq = cfg80211_get_start_freq(center_freq, bandwidth); end_freq = cfg80211_get_end_freq(center_freq, bandwidth); - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c) return -EINVAL; @@ -449,8 +456,8 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy, return -EINVAL; ret = cfg80211_get_chans_dfs_required(wiphy, - chandef->center_freq1, - width); + ieee80211_chandef_to_khz(chandef), + width); if (ret < 0) return ret; else if (ret > 0) @@ -460,8 +467,8 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy, return 0; ret = cfg80211_get_chans_dfs_required(wiphy, - chandef->center_freq2, - width); + MHZ_TO_KHZ(chandef->center_freq2), + width); if (ret < 0) return ret; else if (ret > 0) @@ -503,8 +510,8 @@ static int cfg80211_get_chans_dfs_usable(struct wiphy *wiphy, * DFS_AVAILABLE). Return number of usable channels * (require CAC). Allow DFS and non-DFS channel mix. */ - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c) return -EINVAL; @@ -536,8 +543,9 @@ bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy, if (width < 0) return false; - r1 = cfg80211_get_chans_dfs_usable(wiphy, chandef->center_freq1, - width); + r1 = cfg80211_get_chans_dfs_usable(wiphy, + MHZ_TO_KHZ(chandef->center_freq1), + width); if (r1 < 0) return false; @@ -546,8 +554,8 @@ bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy, case NL80211_CHAN_WIDTH_80P80: WARN_ON(!chandef->center_freq2); r2 = cfg80211_get_chans_dfs_usable(wiphy, - chandef->center_freq2, - width); + MHZ_TO_KHZ(chandef->center_freq2), + width); if (r2 < 0) return false; break; @@ -694,8 +702,8 @@ static bool cfg80211_get_chans_dfs_available(struct wiphy *wiphy, * If any channel in between is disabled or has not * had gone through CAC return false */ - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c) return false; @@ -724,7 +732,8 @@ static bool cfg80211_chandef_dfs_available(struct wiphy *wiphy, if (width < 0) return false; - r = cfg80211_get_chans_dfs_available(wiphy, chandef->center_freq1, + r = cfg80211_get_chans_dfs_available(wiphy, + MHZ_TO_KHZ(chandef->center_freq1), width); /* If any of channels unavailable for cf1 just return */ @@ -735,8 +744,8 @@ static bool cfg80211_chandef_dfs_available(struct wiphy *wiphy, case NL80211_CHAN_WIDTH_80P80: WARN_ON(!chandef->center_freq2); r = cfg80211_get_chans_dfs_available(wiphy, - chandef->center_freq2, - width); + MHZ_TO_KHZ(chandef->center_freq2), + width); break; default: WARN_ON(chandef->center_freq2); @@ -757,8 +766,8 @@ static unsigned int cfg80211_get_chans_dfs_cac_time(struct wiphy *wiphy, start_freq = cfg80211_get_start_freq(center_freq, bandwidth); end_freq = cfg80211_get_end_freq(center_freq, bandwidth); - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c) return 0; @@ -790,14 +799,14 @@ cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy, return 0; t1 = cfg80211_get_chans_dfs_cac_time(wiphy, - chandef->center_freq1, + MHZ_TO_KHZ(chandef->center_freq1), width); if (!chandef->center_freq2) return t1; t2 = cfg80211_get_chans_dfs_cac_time(wiphy, - chandef->center_freq2, + MHZ_TO_KHZ(chandef->center_freq2), width); return max(t1, t2); @@ -813,8 +822,8 @@ static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy, start_freq = cfg80211_get_start_freq(center_freq, bandwidth); end_freq = cfg80211_get_end_freq(center_freq, bandwidth); - for (freq = start_freq; freq <= end_freq; freq += 20) { - c = ieee80211_get_channel(wiphy, freq); + for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) { + c = ieee80211_get_channel_khz(wiphy, freq); if (!c || c->flags & prohibited_flags) return false; } @@ -910,7 +919,8 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, width = 10; break; case NL80211_CHAN_WIDTH_20: - if (!ht_cap->ht_supported) + if (!ht_cap->ht_supported && + chandef->chan->band != NL80211_BAND_6GHZ) return false; /* fall through */ case NL80211_CHAN_WIDTH_20_NOHT: @@ -919,6 +929,8 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, break; case NL80211_CHAN_WIDTH_40: width = 40; + if (chandef->chan->band == NL80211_BAND_6GHZ) + break; if (!ht_cap->ht_supported) return false; if (!(ht_cap->cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) || @@ -933,24 +945,29 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, break; case NL80211_CHAN_WIDTH_80P80: cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; - if (cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) + if (chandef->chan->band != NL80211_BAND_6GHZ && + cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) return false; /* fall through */ case NL80211_CHAN_WIDTH_80: - if (!vht_cap->vht_supported) - return false; prohibited_flags |= IEEE80211_CHAN_NO_80MHZ; width = 80; + if (chandef->chan->band == NL80211_BAND_6GHZ) + break; + if (!vht_cap->vht_supported) + return false; break; case NL80211_CHAN_WIDTH_160: + prohibited_flags |= IEEE80211_CHAN_NO_160MHZ; + width = 160; + if (chandef->chan->band == NL80211_BAND_6GHZ) + break; if (!vht_cap->vht_supported) return false; cap = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ && cap != IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) return false; - prohibited_flags |= IEEE80211_CHAN_NO_160MHZ; - width = 160; break; default: WARN_ON_ONCE(1); @@ -976,13 +993,15 @@ bool cfg80211_chandef_usable(struct wiphy *wiphy, prohibited_flags |= IEEE80211_CHAN_NO_OFDM; - if (!cfg80211_secondary_chans_ok(wiphy, chandef->center_freq1, + if (!cfg80211_secondary_chans_ok(wiphy, + ieee80211_chandef_to_khz(chandef), width, prohibited_flags)) return false; if (!chandef->center_freq2) return true; - return cfg80211_secondary_chans_ok(wiphy, chandef->center_freq2, + return cfg80211_secondary_chans_ok(wiphy, + MHZ_TO_KHZ(chandef->center_freq2), width, prohibited_flags); } EXPORT_SYMBOL(cfg80211_chandef_usable); diff --git a/net/wireless/core.c b/net/wireless/core.c index ce024440fa51..f0226ae9561c 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -480,9 +480,6 @@ use_default_name: INIT_LIST_HEAD(&rdev->bss_list); INIT_LIST_HEAD(&rdev->sched_scan_req_list); INIT_WORK(&rdev->scan_done_wk, __cfg80211_scan_done); - INIT_LIST_HEAD(&rdev->mlme_unreg); - spin_lock_init(&rdev->mlme_unreg_lock); - INIT_WORK(&rdev->mlme_unreg_wk, cfg80211_mlme_unreg_wk); INIT_DELAYED_WORK(&rdev->dfs_update_channels_wk, cfg80211_dfs_channels_update_work); #ifdef CONFIG_CFG80211_WEXT @@ -794,6 +791,7 @@ int wiphy_register(struct wiphy *wiphy) /* sanity check supported bands/channels */ for (band = 0; band < NUM_NL80211_BANDS; band++) { u16 types = 0; + bool have_he = false; sband = wiphy->bands[band]; if (!sband) @@ -810,6 +808,11 @@ int wiphy_register(struct wiphy *wiphy) !sband->n_bitrates)) return -EINVAL; + if (WARN_ON(band == NL80211_BAND_6GHZ && + (sband->ht_cap.ht_supported || + sband->vht_cap.vht_supported))) + return -EINVAL; + /* * Since cfg80211_disable_40mhz_24ghz is global, we can * modify the sband's ht data even if the driver uses a @@ -837,6 +840,9 @@ int wiphy_register(struct wiphy *wiphy) sband->channels[i].orig_mpwr = sband->channels[i].max_power; sband->channels[i].band = band; + + if (WARN_ON(sband->channels[i].freq_offset >= 1000)) + return -EINVAL; } for (i = 0; i < sband->n_iftype_data; i++) { @@ -854,8 +860,17 @@ int wiphy_register(struct wiphy *wiphy) return -EINVAL; types |= iftd->types_mask; + + if (i == 0) + have_he = iftd->he_cap.has_he; + else + have_he = have_he && + iftd->he_cap.has_he; } + if (WARN_ON(!have_he && band == NL80211_BAND_6GHZ)) + return -EINVAL; + have_band = true; } @@ -1030,7 +1045,6 @@ void wiphy_unregister(struct wiphy *wiphy) cancel_delayed_work_sync(&rdev->dfs_update_channels_wk); flush_work(&rdev->destroy_work); flush_work(&rdev->sched_scan_stop_wk); - flush_work(&rdev->mlme_unreg_wk); flush_work(&rdev->propagate_radar_detect_wk); flush_work(&rdev->propagate_cac_done_wk); @@ -1094,6 +1108,7 @@ static void __cfg80211_unregister_wdev(struct wireless_dev *wdev, bool sync) rdev->devlist_generation++; cfg80211_mlme_purge_registrations(wdev); + flush_work(&wdev->mgmt_registrations_update_wk); switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: @@ -1238,6 +1253,8 @@ void cfg80211_init_wdev(struct cfg80211_registered_device *rdev, spin_lock_init(&wdev->event_lock); INIT_LIST_HEAD(&wdev->mgmt_registrations); spin_lock_init(&wdev->mgmt_registrations_lock); + INIT_WORK(&wdev->mgmt_registrations_update_wk, + cfg80211_mgmt_registrations_update_wk); INIT_LIST_HEAD(&wdev->pmsr_list); spin_lock_init(&wdev->pmsr_lock); INIT_WORK(&wdev->pmsr_free_wk, cfg80211_pmsr_free_wk); diff --git a/net/wireless/core.h b/net/wireless/core.h index bb897a803ffe..e0e5b3ee9699 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -60,10 +60,6 @@ struct cfg80211_registered_device { struct list_head beacon_registrations; spinlock_t beacon_registrations_lock; - struct list_head mlme_unreg; - spinlock_t mlme_unreg_lock; - struct work_struct mlme_unreg_wk; - /* protected by RTNL only */ int num_running_ifaces; int num_running_monitor_ifaces; @@ -290,7 +286,7 @@ struct cfg80211_cqm_config { u32 rssi_hyst; s32 last_rssi_event_value; int n_rssi_thresholds; - s32 rssi_thresholds[0]; + s32 rssi_thresholds[]; }; void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev); @@ -385,8 +381,9 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, struct net_device *dev); int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, u16 frame_type, const u8 *match_data, - int match_len, struct netlink_ext_ack *extack); -void cfg80211_mlme_unreg_wk(struct work_struct *wk); + int match_len, bool multicast_rx, + struct netlink_ext_ack *extack); +void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk); void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid); void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev); int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index e4805a3bd310..189334314cba 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -426,58 +426,62 @@ struct cfg80211_mgmt_registration { __le16 frame_type; + bool multicast_rx; + u8 match[]; }; -static void -cfg80211_process_mlme_unregistrations(struct cfg80211_registered_device *rdev) +static void cfg80211_mgmt_registrations_update(struct wireless_dev *wdev) { + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct wireless_dev *tmp; struct cfg80211_mgmt_registration *reg; + struct mgmt_frame_regs upd = {}; ASSERT_RTNL(); - spin_lock_bh(&rdev->mlme_unreg_lock); - while ((reg = list_first_entry_or_null(&rdev->mlme_unreg, - struct cfg80211_mgmt_registration, - list))) { - list_del(®->list); - spin_unlock_bh(&rdev->mlme_unreg_lock); + rcu_read_lock(); + list_for_each_entry_rcu(tmp, &rdev->wiphy.wdev_list, list) { + list_for_each_entry_rcu(reg, &tmp->mgmt_registrations, list) { + u32 mask = BIT(le16_to_cpu(reg->frame_type) >> 4); + u32 mcast_mask = 0; - if (rdev->ops->mgmt_frame_register) { - u16 frame_type = le16_to_cpu(reg->frame_type); + if (reg->multicast_rx) + mcast_mask = mask; - rdev_mgmt_frame_register(rdev, reg->wdev, - frame_type, false); - } + upd.global_stypes |= mask; + upd.global_mcast_stypes |= mcast_mask; - kfree(reg); - - spin_lock_bh(&rdev->mlme_unreg_lock); + if (tmp == wdev) { + upd.interface_stypes |= mask; + upd.interface_mcast_stypes |= mcast_mask; + } + } } - spin_unlock_bh(&rdev->mlme_unreg_lock); + rcu_read_unlock(); + + rdev_update_mgmt_frame_registrations(rdev, wdev, &upd); } -void cfg80211_mlme_unreg_wk(struct work_struct *wk) +void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk) { - struct cfg80211_registered_device *rdev; - - rdev = container_of(wk, struct cfg80211_registered_device, - mlme_unreg_wk); + struct wireless_dev *wdev = container_of(wk, struct wireless_dev, + mgmt_registrations_update_wk); rtnl_lock(); - cfg80211_process_mlme_unregistrations(rdev); + cfg80211_mgmt_registrations_update(wdev); rtnl_unlock(); } int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, u16 frame_type, const u8 *match_data, - int match_len, struct netlink_ext_ack *extack) + int match_len, bool multicast_rx, + struct netlink_ext_ack *extack) { - struct wiphy *wiphy = wdev->wiphy; - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; u16 mgmt_type; + bool update_multicast = false; if (!wdev->wiphy->mgmt_stypes) return -EOPNOTSUPP; @@ -528,34 +532,39 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, continue; if (memcmp(reg->match, match_data, mlen) == 0) { + if (reg->multicast_rx != multicast_rx) { + update_multicast = true; + reg->multicast_rx = multicast_rx; + break; + } NL_SET_ERR_MSG(extack, "Match already configured"); err = -EALREADY; break; } } - if (err) { - kfree(nreg); + if (err) goto out; - } - memcpy(nreg->match, match_data, match_len); - nreg->match_len = match_len; - nreg->nlportid = snd_portid; - nreg->frame_type = cpu_to_le16(frame_type); - nreg->wdev = wdev; - list_add(&nreg->list, &wdev->mgmt_registrations); + if (update_multicast) { + kfree(nreg); + } else { + memcpy(nreg->match, match_data, match_len); + nreg->match_len = match_len; + nreg->nlportid = snd_portid; + nreg->frame_type = cpu_to_le16(frame_type); + nreg->wdev = wdev; + nreg->multicast_rx = multicast_rx; + list_add(&nreg->list, &wdev->mgmt_registrations); + } spin_unlock_bh(&wdev->mgmt_registrations_lock); - /* process all unregistrations to avoid driver confusion */ - cfg80211_process_mlme_unregistrations(rdev); - - if (rdev->ops->mgmt_frame_register) - rdev_mgmt_frame_register(rdev, wdev, frame_type, true); + cfg80211_mgmt_registrations_update(wdev); return 0; out: + kfree(nreg); spin_unlock_bh(&wdev->mgmt_registrations_lock); return err; @@ -574,11 +583,9 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) continue; list_del(®->list); - spin_lock(&rdev->mlme_unreg_lock); - list_add_tail(®->list, &rdev->mlme_unreg); - spin_unlock(&rdev->mlme_unreg_lock); + kfree(reg); - schedule_work(&rdev->mlme_unreg_wk); + schedule_work(&wdev->mgmt_registrations_update_wk); } spin_unlock_bh(&wdev->mgmt_registrations_lock); @@ -594,15 +601,16 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev) { - struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); + struct cfg80211_mgmt_registration *reg, *tmp; spin_lock_bh(&wdev->mgmt_registrations_lock); - spin_lock(&rdev->mlme_unreg_lock); - list_splice_tail_init(&wdev->mgmt_registrations, &rdev->mlme_unreg); - spin_unlock(&rdev->mlme_unreg_lock); + list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { + list_del(®->list); + kfree(reg); + } spin_unlock_bh(&wdev->mgmt_registrations_lock); - cfg80211_process_mlme_unregistrations(rdev); + cfg80211_mgmt_registrations_update(wdev); } int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, @@ -721,8 +729,8 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, return rdev_mgmt_tx(rdev, wdev, params, cookie); } -bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, - const u8 *buf, size_t len, u32 flags) +bool cfg80211_rx_mgmt_khz(struct wireless_dev *wdev, int freq, int sig_dbm, + const u8 *buf, size_t len, u32 flags) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -777,7 +785,7 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_dbm, trace_cfg80211_return_bool(result); return result; } -EXPORT_SYMBOL(cfg80211_rx_mgmt); +EXPORT_SYMBOL(cfg80211_rx_mgmt_khz); void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev) { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 692bcd35f809..263ae395ad44 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -253,6 +253,8 @@ static int validate_ie_attr(const struct nlattr *attr, } /* policy for the attributes */ +static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; + static const struct nla_policy nl80211_ftm_responder_policy[NL80211_FTM_RESP_ATTR_MAX + 1] = { [NL80211_FTM_RESP_ATTR_ENABLED] = { .type = NLA_FLAG, }, @@ -296,11 +298,7 @@ nl80211_pmsr_req_attr_policy[NL80211_PMSR_REQ_ATTR_MAX + 1] = { static const struct nla_policy nl80211_psmr_peer_attr_policy[NL80211_PMSR_PEER_ATTR_MAX + 1] = { [NL80211_PMSR_PEER_ATTR_ADDR] = NLA_POLICY_ETH_ADDR, - /* - * we could specify this again to be the top-level policy, - * but that would open us up to recursion problems ... - */ - [NL80211_PMSR_PEER_ATTR_CHAN] = { .type = NLA_NESTED }, + [NL80211_PMSR_PEER_ATTR_CHAN] = NLA_POLICY_NESTED(nl80211_policy), [NL80211_PMSR_PEER_ATTR_REQ] = NLA_POLICY_NESTED(nl80211_pmsr_req_attr_policy), [NL80211_PMSR_PEER_ATTR_RESP] = { .type = NLA_REJECT }, @@ -331,6 +329,15 @@ he_bss_color_policy[NL80211_HE_BSS_COLOR_ATTR_MAX + 1] = { [NL80211_HE_BSS_COLOR_ATTR_PARTIAL] = { .type = NLA_FLAG }, }; +static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { + [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_RATES }, + [NL80211_TXRATE_HT] = { .type = NLA_BINARY, + .len = NL80211_MAX_SUPP_HT_RATES }, + [NL80211_TXRATE_VHT] = NLA_POLICY_EXACT_LEN_WARN(sizeof(struct nl80211_txrate_vht)), + [NL80211_TXRATE_GI] = { .type = NLA_U8 }, +}; + static const struct nla_policy nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { [NL80211_TID_CONFIG_ATTR_VIF_SUPP] = { .type = NLA_U64 }, @@ -345,9 +352,15 @@ nl80211_tid_config_attr_policy[NL80211_TID_CONFIG_ATTR_MAX + 1] = { NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), [NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL] = NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), + [NL80211_TID_CONFIG_ATTR_AMSDU_CTRL] = + NLA_POLICY_MAX(NLA_U8, NL80211_TID_CONFIG_DISABLE), + [NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE] = + NLA_POLICY_MAX(NLA_U8, NL80211_TX_RATE_FIXED), + [NL80211_TID_CONFIG_ATTR_TX_RATE] = + NLA_POLICY_NESTED(nl80211_txattr_policy), }; -const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { +static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [0] = { .strict_start_type = NL80211_ATTR_HE_OBSS_PD }, [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING, @@ -365,6 +378,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CHANNEL_WIDTH] = { .type = NLA_U32 }, [NL80211_ATTR_CENTER_FREQ1] = { .type = NLA_U32 }, + [NL80211_ATTR_CENTER_FREQ1_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999), [NL80211_ATTR_CENTER_FREQ2] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_RETRY_SHORT] = NLA_POLICY_MIN(NLA_U8, 1), @@ -378,11 +392,8 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_IFINDEX] = { .type = NLA_U32 }, [NL80211_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ-1 }, - [NL80211_ATTR_MAC] = { .type = NLA_EXACT_LEN_WARN, .len = ETH_ALEN }, - [NL80211_ATTR_PREV_BSSID] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_ATTR_MAC] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), + [NL80211_ATTR_PREV_BSSID] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_KEY] = { .type = NLA_NESTED, }, [NL80211_ATTR_KEY_DATA] = { .type = NLA_BINARY, @@ -434,10 +445,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MESH_CONFIG] = { .type = NLA_NESTED }, [NL80211_ATTR_SUPPORT_MESH_AUTH] = { .type = NLA_FLAG }, - [NL80211_ATTR_HT_CAPABILITY] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_HT_CAPABILITY_LEN - }, + [NL80211_ATTR_HT_CAPABILITY] = NLA_POLICY_EXACT_LEN_WARN(NL80211_HT_CAPABILITY_LEN), [NL80211_ATTR_MGMT_SUBTYPE] = { .type = NLA_U8 }, [NL80211_ATTR_IE] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, @@ -468,10 +476,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 }, [NL80211_ATTR_PID] = { .type = NLA_U32 }, [NL80211_ATTR_4ADDR] = { .type = NLA_U8 }, - [NL80211_ATTR_PMKID] = { - .type = NLA_EXACT_LEN_WARN, - .len = WLAN_PMKID_LEN - }, + [NL80211_ATTR_PMKID] = NLA_POLICY_EXACT_LEN_WARN(WLAN_PMKID_LEN), [NL80211_ATTR_DURATION] = { .type = NLA_U32 }, [NL80211_ATTR_COOKIE] = { .type = NLA_U64 }, [NL80211_ATTR_TX_RATES] = { .type = NLA_NESTED }, @@ -535,10 +540,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WDEV] = { .type = NLA_U64 }, [NL80211_ATTR_USER_REG_HINT_TYPE] = { .type = NLA_U32 }, [NL80211_ATTR_AUTH_DATA] = { .type = NLA_BINARY, }, - [NL80211_ATTR_VHT_CAPABILITY] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_VHT_CAPABILITY_LEN - }, + [NL80211_ATTR_VHT_CAPABILITY] = NLA_POLICY_EXACT_LEN_WARN(NL80211_VHT_CAPABILITY_LEN), [NL80211_ATTR_SCAN_FLAGS] = { .type = NLA_U32 }, [NL80211_ATTR_P2P_CTWINDOW] = NLA_POLICY_MAX(NLA_U8, 127), [NL80211_ATTR_P2P_OPPPS] = NLA_POLICY_MAX(NLA_U8, 1), @@ -576,10 +578,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, [NL80211_ATTR_QOS_MAP] = { .type = NLA_BINARY, .len = IEEE80211_QOS_MAP_LEN_MAX }, - [NL80211_ATTR_MAC_HINT] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_ATTR_MAC_HINT] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_WIPHY_FREQ_HINT] = { .type = NLA_U32 }, [NL80211_ATTR_TDLS_PEER_CAPABILITY] = { .type = NLA_U32 }, [NL80211_ATTR_SOCKET_OWNER] = { .type = NLA_FLAG }, @@ -591,10 +590,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_ADMITTED_TIME] = { .type = NLA_U16 }, [NL80211_ATTR_SMPS_MODE] = { .type = NLA_U8 }, [NL80211_ATTR_OPER_CLASS] = { .type = NLA_U8 }, - [NL80211_ATTR_MAC_MASK] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_ATTR_MAC_MASK] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_WIPHY_SELF_MANAGED_REG] = { .type = NLA_FLAG }, [NL80211_ATTR_NETNS_FD] = { .type = NLA_U32 }, [NL80211_ATTR_SCHED_SCAN_DELAY] = { .type = NLA_U32 }, @@ -606,21 +602,15 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MU_MIMO_GROUP_DATA] = { .len = VHT_MUMIMO_GROUPS_DATA_LEN }, - [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_NAN_MASTER_PREF] = NLA_POLICY_MIN(NLA_U8, 1), [NL80211_ATTR_BANDS] = { .type = NLA_U32 }, [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY, .len = FILS_MAX_KEK_LEN }, - [NL80211_ATTR_FILS_NONCES] = { - .type = NLA_EXACT_LEN_WARN, - .len = 2 * FILS_NONCE_LEN - }, + [NL80211_ATTR_FILS_NONCES] = NLA_POLICY_EXACT_LEN_WARN(2 * FILS_NONCE_LEN), [NL80211_ATTR_MULTICAST_TO_UNICAST_ENABLED] = { .type = NLA_FLAG, }, - [NL80211_ATTR_BSSID] = { .type = NLA_EXACT_LEN_WARN, .len = ETH_ALEN }, + [NL80211_ATTR_BSSID] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI] = { .type = NLA_S8 }, [NL80211_ATTR_SCHED_SCAN_RSSI_ADJUST] = { .len = sizeof(struct nl80211_bss_select_rssi_adjust) @@ -633,7 +623,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] = { .type = NLA_U16 }, [NL80211_ATTR_FILS_ERP_RRK] = { .type = NLA_BINARY, .len = FILS_ERP_MAX_RRK_LEN }, - [NL80211_ATTR_FILS_CACHE_ID] = { .type = NLA_EXACT_LEN_WARN, .len = 2 }, + [NL80211_ATTR_FILS_CACHE_ID] = NLA_POLICY_EXACT_LEN_WARN(2), [NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN }, [NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG }, [NL80211_ATTR_EXTERNAL_AUTH_SUPPORT] = { .type = NLA_FLAG }, @@ -661,6 +651,13 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_CONTROL_PORT_NO_PREAUTH] = { .type = NLA_FLAG }, [NL80211_ATTR_PMK_LIFETIME] = NLA_POLICY_MIN(NLA_U32, 1), [NL80211_ATTR_PMK_REAUTH_THRESHOLD] = NLA_POLICY_RANGE(NLA_U8, 1, 100), + [NL80211_ATTR_RECEIVE_MULTICAST] = { .type = NLA_FLAG }, + [NL80211_ATTR_WIPHY_FREQ_OFFSET] = NLA_POLICY_RANGE(NLA_U32, 0, 999), + [NL80211_ATTR_SCAN_FREQ_KHZ] = { .type = NLA_NESTED }, + [NL80211_ATTR_HE_6GHZ_CAPABILITY] = { + .type = NLA_EXACT_LEN, + .len = sizeof(struct ieee80211_he_6ghz_capa), + }, }; /* policy for the key attributes */ @@ -703,10 +700,7 @@ static const struct nla_policy nl80211_wowlan_tcp_policy[NUM_NL80211_WOWLAN_TCP] = { [NL80211_WOWLAN_TCP_SRC_IPV4] = { .type = NLA_U32 }, [NL80211_WOWLAN_TCP_DST_IPV4] = { .type = NLA_U32 }, - [NL80211_WOWLAN_TCP_DST_MAC] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_WOWLAN_TCP_DST_MAC] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_WOWLAN_TCP_SRC_PORT] = { .type = NLA_U16 }, [NL80211_WOWLAN_TCP_DST_PORT] = { .type = NLA_U16 }, [NL80211_WOWLAN_TCP_DATA_PAYLOAD] = { .type = NLA_MIN_LEN, .len = 1 }, @@ -737,17 +731,15 @@ nl80211_coalesce_policy[NUM_NL80211_ATTR_COALESCE_RULE] = { static const struct nla_policy nl80211_rekey_policy[NUM_NL80211_REKEY_DATA] = { [NL80211_REKEY_DATA_KEK] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_KEK_LEN, + .type = NLA_BINARY, + .len = NL80211_KEK_EXT_LEN }, [NL80211_REKEY_DATA_KCK] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_KCK_LEN, - }, - [NL80211_REKEY_DATA_REPLAY_CTR] = { - .type = NLA_EXACT_LEN_WARN, - .len = NL80211_REPLAY_CTR_LEN + .type = NLA_BINARY, + .len = NL80211_KCK_EXT_LEN }, + [NL80211_REKEY_DATA_REPLAY_CTR] = NLA_POLICY_EXACT_LEN_WARN(NL80211_REPLAY_CTR_LEN), + [NL80211_REKEY_DATA_AKM] = { .type = NLA_U32 }, }; static const struct nla_policy @@ -762,10 +754,7 @@ static const struct nla_policy nl80211_match_policy[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1] = { [NL80211_SCHED_SCAN_MATCH_ATTR_SSID] = { .type = NLA_BINARY, .len = IEEE80211_MAX_SSID_LEN }, - [NL80211_SCHED_SCAN_MATCH_ATTR_BSSID] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_SCHED_SCAN_MATCH_ATTR_BSSID] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_SCHED_SCAN_MATCH_ATTR_RSSI] = { .type = NLA_U32 }, [NL80211_SCHED_SCAN_MATCH_PER_BAND_RSSI] = NLA_POLICY_NESTED(nl80211_match_band_rssi_policy), @@ -797,10 +786,7 @@ nl80211_nan_func_policy[NL80211_NAN_FUNC_ATTR_MAX + 1] = { [NL80211_NAN_FUNC_SUBSCRIBE_ACTIVE] = { .type = NLA_FLAG }, [NL80211_NAN_FUNC_FOLLOW_UP_ID] = { .type = NLA_U8 }, [NL80211_NAN_FUNC_FOLLOW_UP_REQ_ID] = { .type = NLA_U8 }, - [NL80211_NAN_FUNC_FOLLOW_UP_DEST] = { - .type = NLA_EXACT_LEN_WARN, - .len = ETH_ALEN - }, + [NL80211_NAN_FUNC_FOLLOW_UP_DEST] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_NAN_FUNC_CLOSE_RANGE] = { .type = NLA_FLAG }, [NL80211_NAN_FUNC_TTL] = { .type = NLA_U32 }, [NL80211_NAN_FUNC_SERVICE_INFO] = { .type = NLA_BINARY, @@ -945,6 +931,9 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, chan->center_freq)) goto nla_put_failure; + if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_OFFSET, chan->freq_offset)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_DISABLED) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_DISABLED)) goto nla_put_failure; @@ -1350,13 +1339,11 @@ static int nl80211_key_allowed(struct wireless_dev *wdev) } static struct ieee80211_channel *nl80211_get_valid_chan(struct wiphy *wiphy, - struct nlattr *tb) + u32 freq) { struct ieee80211_channel *chan; - if (tb == NULL) - return NULL; - chan = ieee80211_get_channel(wiphy, nla_get_u32(tb)); + chan = ieee80211_get_channel_khz(wiphy, freq); if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) return NULL; return chan; @@ -1582,6 +1569,7 @@ static int nl80211_send_coalesce(struct sk_buff *msg, static int nl80211_send_iftype_data(struct sk_buff *msg, + const struct ieee80211_supported_band *sband, const struct ieee80211_sband_iftype_data *iftdata) { const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap; @@ -1605,6 +1593,12 @@ nl80211_send_iftype_data(struct sk_buff *msg, return -ENOBUFS; } + if (sband->band == NL80211_BAND_6GHZ && + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA, + sizeof(iftdata->he_6ghz_capa), + &iftdata->he_6ghz_capa)) + return -ENOBUFS; + return 0; } @@ -1653,7 +1647,7 @@ static int nl80211_send_band_rateinfo(struct sk_buff *msg, if (!iftdata) return -ENOBUFS; - err = nl80211_send_iftype_data(msg, + err = nl80211_send_iftype_data(msg, sband, &sband->iftype_data[i]); if (err) return err; @@ -2811,13 +2805,17 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, if (!attrs[NL80211_ATTR_WIPHY_FREQ]) return -EINVAL; - control_freq = nla_get_u32(attrs[NL80211_ATTR_WIPHY_FREQ]); + control_freq = MHZ_TO_KHZ( + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + control_freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); memset(chandef, 0, sizeof(*chandef)); - - chandef->chan = ieee80211_get_channel(&rdev->wiphy, control_freq); + chandef->chan = ieee80211_get_channel_khz(&rdev->wiphy, control_freq); chandef->width = NL80211_CHAN_WIDTH_20_NOHT; - chandef->center_freq1 = control_freq; + chandef->center_freq1 = KHZ_TO_MHZ(control_freq); + chandef->freq1_offset = control_freq % 1000; chandef->center_freq2 = 0; /* Primary channel not allowed */ @@ -2865,9 +2863,15 @@ int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, } else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) { chandef->width = nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); - if (attrs[NL80211_ATTR_CENTER_FREQ1]) + if (attrs[NL80211_ATTR_CENTER_FREQ1]) { chandef->center_freq1 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]); + if (attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET]) + chandef->freq1_offset = nla_get_u32( + attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET]); + else + chandef->freq1_offset = 0; + } if (attrs[NL80211_ATTR_CENTER_FREQ2]) chandef->center_freq2 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]); @@ -3300,6 +3304,9 @@ static int nl80211_send_chandef(struct sk_buff *msg, if (nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, chandef->chan->center_freq)) return -ENOBUFS; + if (nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, + chandef->chan->freq_offset)) + return -ENOBUFS; switch (chandef->width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: @@ -3904,14 +3911,25 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) }; void *hdr; struct sk_buff *msg; + bool bigtk_support = false; + + if (wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION)) + bigtk_support = true; + + if ((dev->ieee80211_ptr->iftype == NL80211_IFTYPE_STATION || + dev->ieee80211_ptr->iftype == NL80211_IFTYPE_P2P_CLIENT) && + wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) + bigtk_support = true; if (info->attrs[NL80211_ATTR_KEY_IDX]) { key_idx = nla_get_u8(info->attrs[NL80211_ATTR_KEY_IDX]); - if (key_idx > 5 && - !wiphy_ext_feature_isset( - &rdev->wiphy, - NL80211_EXT_FEATURE_BEACON_PROTECTION)) + + if (key_idx >= 6 && key_idx <= 7 && !bigtk_support) { + GENL_SET_ERR_MSG(info, "BIGTK not supported"); return -EINVAL; + } } if (info->attrs[NL80211_ATTR_MAC]) @@ -4401,19 +4419,9 @@ static bool vht_set_mcs_mask(struct ieee80211_supported_band *sband, return true; } -static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { - [NL80211_TXRATE_LEGACY] = { .type = NLA_BINARY, - .len = NL80211_MAX_SUPP_RATES }, - [NL80211_TXRATE_HT] = { .type = NLA_BINARY, - .len = NL80211_MAX_SUPP_HT_RATES }, - [NL80211_TXRATE_VHT] = { - .type = NLA_EXACT_LEN_WARN, - .len = sizeof(struct nl80211_txrate_vht), - }, - [NL80211_TXRATE_GI] = { .type = NLA_U8 }, -}; - static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, + struct nlattr *attrs[], + enum nl80211_attrs attr, struct cfg80211_bitrate_mask *mask) { struct nlattr *tb[NL80211_TXRATE_MAX + 1]; @@ -4444,14 +4452,14 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, } /* if no rates are given set it back to the defaults */ - if (!info->attrs[NL80211_ATTR_TX_RATES]) + if (!attrs[attr]) goto out; /* The nested attribute uses enum nl80211_band as the index. This maps * directly to the enum nl80211_band values used in cfg80211. */ BUILD_BUG_ON(NL80211_MAX_SUPP_HT_RATES > IEEE80211_HT_MCS_MASK_LEN * 8); - nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) { + nla_for_each_nested(tx_rates, attrs[attr], rem) { enum nl80211_band band = nla_type(tx_rates); int err; @@ -4726,6 +4734,8 @@ static void nl80211_check_ap_rate_selectors(struct cfg80211_ap_settings *params, params->ht_required = true; if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_VHT_PHY) params->vht_required = true; + if (rates[2 + i] == BSS_MEMBERSHIP_SELECTOR_HE_PHY) + params->he_required = true; } } @@ -4954,7 +4964,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) return -EINVAL; if (info->attrs[NL80211_ATTR_TX_RATES]) { - err = nl80211_parse_tx_bitrate_mask(info, ¶ms.beacon_rate); + err = nl80211_parse_tx_bitrate_mask(info, info->attrs, + NL80211_ATTR_TX_RATES, + ¶ms.beacon_rate); if (err) return err; @@ -5995,6 +6007,10 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]); } + if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]) + params.he_6ghz_capa = + nla_data(info->attrs[NL80211_ATTR_HE_CAPABILITY]); + if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]) params.airtime_weight = nla_get_u16(info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]); @@ -6129,6 +6145,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } + if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]) + params.he_6ghz_capa = + nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]); + if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { params.opmode_notif_used = true; params.opmode_notif = @@ -6173,10 +6193,14 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.vht_capa = NULL; /* HE requires WME */ - if (params.he_capa_len) + if (params.he_capa_len || params.he_6ghz_capa) return -EINVAL; } + /* Ensure that HT/VHT capabilities are not set for 6 GHz HE STA */ + if (params.he_6ghz_capa && (params.ht_capa || params.vht_capa)) + return -EINVAL; + /* When you run into this, adjust the code below for the new flag */ BUILD_BUG_ON(NL80211_STA_FLAG_MAX != 7); @@ -7734,6 +7758,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct wireless_dev *wdev = info->user_ptr[1]; struct cfg80211_scan_request *request; + struct nlattr *scan_freqs = NULL; + bool scan_freqs_khz = false; struct nlattr *attr; struct wiphy *wiphy; int err, tmp, n_ssids = 0, n_channels, i; @@ -7752,9 +7778,17 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) goto unlock; } - if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { - n_channels = validate_scan_freqs( - info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]); + if (info->attrs[NL80211_ATTR_SCAN_FREQ_KHZ]) { + if (!wiphy_ext_feature_isset(wiphy, + NL80211_EXT_FEATURE_SCAN_FREQ_KHZ)) + return -EOPNOTSUPP; + scan_freqs = info->attrs[NL80211_ATTR_SCAN_FREQ_KHZ]; + scan_freqs_khz = true; + } else if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) + scan_freqs = info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]; + + if (scan_freqs) { + n_channels = validate_scan_freqs(scan_freqs); if (!n_channels) { err = -EINVAL; goto unlock; @@ -7802,13 +7836,16 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) } i = 0; - if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { + if (scan_freqs) { /* user specified, bail out if channel not found */ - nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_FREQUENCIES], tmp) { + nla_for_each_nested(attr, scan_freqs, tmp) { struct ieee80211_channel *chan; + int freq = nla_get_u32(attr); - chan = ieee80211_get_channel(wiphy, nla_get_u32(attr)); + if (!scan_freqs_khz) + freq = MHZ_TO_KHZ(freq); + chan = ieee80211_get_channel_khz(wiphy, freq); if (!chan) { err = -EINVAL; goto out_free; @@ -8904,6 +8941,8 @@ static int nl80211_send_bss(struct sk_buff *msg, struct netlink_callback *cb, goto nla_put_failure; if (nla_put_u16(msg, NL80211_BSS_CAPABILITY, res->capability) || nla_put_u32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq) || + nla_put_u32(msg, NL80211_BSS_FREQUENCY_OFFSET, + res->channel->freq_offset) || nla_put_u32(msg, NL80211_BSS_CHAN_WIDTH, res->scan_width) || nla_put_u32(msg, NL80211_BSS_SEEN_MS_AGO, jiffies_to_msecs(jiffies - intbss->ts))) @@ -9172,6 +9211,7 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) enum nl80211_auth_type auth_type; struct key_parse key; bool local_state_change; + u32 freq; if (!info->attrs[NL80211_ATTR_MAC]) return -EINVAL; @@ -9228,8 +9268,12 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) return -EOPNOTSUPP; bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); - chan = nl80211_get_valid_chan(&rdev->wiphy, - info->attrs[NL80211_ATTR_WIPHY_FREQ]); + freq = MHZ_TO_KHZ(nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); + + chan = nl80211_get_valid_chan(&rdev->wiphy, freq); if (!chan) return -EINVAL; @@ -9419,6 +9463,7 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) struct cfg80211_assoc_request req = {}; const u8 *bssid, *ssid; int err, ssid_len = 0; + u32 freq; if (dev->ieee80211_ptr->conn_owner_nlportid && dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid) @@ -9438,8 +9483,11 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); - chan = nl80211_get_valid_chan(&rdev->wiphy, - info->attrs[NL80211_ATTR_WIPHY_FREQ]); + freq = MHZ_TO_KHZ(nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); + chan = nl80211_get_valid_chan(&rdev->wiphy, freq); if (!chan) return -EINVAL; @@ -10119,6 +10167,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) struct cfg80211_connect_params connect; struct wiphy *wiphy; struct cfg80211_cached_keys *connkeys = NULL; + u32 freq = 0; int err; memset(&connect, 0, sizeof(connect)); @@ -10189,14 +10238,21 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) connect.prev_bssid = nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]); - if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) { - connect.channel = nl80211_get_valid_chan( - wiphy, info->attrs[NL80211_ATTR_WIPHY_FREQ]); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ]) + freq = MHZ_TO_KHZ(nla_get_u32( + info->attrs[NL80211_ATTR_WIPHY_FREQ])); + if (info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]) + freq += + nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_OFFSET]); + + if (freq) { + connect.channel = nl80211_get_valid_chan(wiphy, freq); if (!connect.channel) return -EINVAL; } else if (info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]) { - connect.channel_hint = nl80211_get_valid_chan( - wiphy, info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]); + freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ_HINT]); + freq = MHZ_TO_KHZ(freq); + connect.channel_hint = nl80211_get_valid_chan(wiphy, freq); if (!connect.channel_hint) return -EINVAL; } @@ -10735,7 +10791,8 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, if (!rdev->ops->set_bitrate_mask) return -EOPNOTSUPP; - err = nl80211_parse_tx_bitrate_mask(info, &mask); + err = nl80211_parse_tx_bitrate_mask(info, info->attrs, + NL80211_ATTR_TX_RATES, &mask); if (err) return err; @@ -10773,9 +10830,18 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->mgmt_tx) return -EOPNOTSUPP; + if (info->attrs[NL80211_ATTR_RECEIVE_MULTICAST] && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_MULTICAST_REGISTRATIONS)) { + GENL_SET_ERR_MSG(info, + "multicast RX registrations are not supported"); + return -EOPNOTSUPP; + } + return cfg80211_mlme_register_mgmt(wdev, info->snd_portid, frame_type, nla_data(info->attrs[NL80211_ATTR_FRAME_MATCH]), nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH]), + info->attrs[NL80211_ATTR_RECEIVE_MULTICAST], info->extack); } @@ -11332,7 +11398,9 @@ static int nl80211_join_mesh(struct sk_buff *skb, struct genl_info *info) } if (info->attrs[NL80211_ATTR_TX_RATES]) { - err = nl80211_parse_tx_bitrate_mask(info, &setup.beacon_rate); + err = nl80211_parse_tx_bitrate_mask(info, info->attrs, + NL80211_ATTR_TX_RATES, + &setup.beacon_rate); if (err) return err; @@ -12286,14 +12354,22 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) return -EINVAL; if (nla_len(tb[NL80211_REKEY_DATA_REPLAY_CTR]) != NL80211_REPLAY_CTR_LEN) return -ERANGE; - if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN) + if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN && + !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK && + nla_len(tb[NL80211_REKEY_DATA_KEK]) == NL80211_KEK_EXT_LEN)) return -ERANGE; - if (nla_len(tb[NL80211_REKEY_DATA_KCK]) != NL80211_KCK_LEN) + if (nla_len(tb[NL80211_REKEY_DATA_KCK]) != NL80211_KCK_LEN && + !(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_EXT_KEK_KCK && + nla_len(tb[NL80211_REKEY_DATA_KEK]) == NL80211_KCK_EXT_LEN)) return -ERANGE; rekey_data.kek = nla_data(tb[NL80211_REKEY_DATA_KEK]); rekey_data.kck = nla_data(tb[NL80211_REKEY_DATA_KCK]); rekey_data.replay_ctr = nla_data(tb[NL80211_REKEY_DATA_REPLAY_CTR]); + rekey_data.kek_len = nla_len(tb[NL80211_REKEY_DATA_KEK]); + rekey_data.kck_len = nla_len(tb[NL80211_REKEY_DATA_KCK]); + if (tb[NL80211_REKEY_DATA_AKM]) + rekey_data.akm = nla_get_u32(tb[NL80211_REKEY_DATA_AKM]); wdev_lock(wdev); if (!wdev->current_bss) { @@ -13839,6 +13915,7 @@ static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info) static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) { + bool dont_wait_for_ack = info->attrs[NL80211_ATTR_DONT_WAIT_FOR_ACK]; struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; @@ -13847,6 +13924,7 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) u8 *dest; u16 proto; bool noencrypt; + u64 cookie = 0; int err; if (!wiphy_ext_feature_isset(&rdev->wiphy, @@ -13891,9 +13969,12 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) noencrypt = nla_get_flag(info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT]); - return rdev_tx_control_port(rdev, dev, buf, len, - dest, cpu_to_be16(proto), noencrypt); - + err = rdev_tx_control_port(rdev, dev, buf, len, + dest, cpu_to_be16(proto), noencrypt, + dont_wait_for_ack ? NULL : &cookie); + if (!err && !dont_wait_for_ack) + nl_set_extack_cookie_u64(info->extack, cookie); + return err; out: wdev_unlock(wdev); return err; @@ -14058,10 +14139,7 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, if (rdev->ops->reset_tid_config) { err = rdev_reset_tid_config(rdev, dev, peer, tid_conf->tids); - /* If peer is there no other configuration will be - * allowed - */ - if (err || peer) + if (err) return err; } else { return -EINVAL; @@ -14104,6 +14182,29 @@ static int parse_tid_conf(struct cfg80211_registered_device *rdev, nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_RTSCTS_CTRL]); } + if (attrs[NL80211_TID_CONFIG_ATTR_AMSDU_CTRL]) { + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_AMSDU_CTRL); + tid_conf->amsdu = + nla_get_u8(attrs[NL80211_TID_CONFIG_ATTR_AMSDU_CTRL]); + } + + if (attrs[NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE]) { + u32 idx = NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE, attr; + + tid_conf->txrate_type = nla_get_u8(attrs[idx]); + + if (tid_conf->txrate_type != NL80211_TX_RATE_AUTOMATIC) { + attr = NL80211_TID_CONFIG_ATTR_TX_RATE; + err = nl80211_parse_tx_bitrate_mask(info, attrs, attr, + &tid_conf->txrate_mask); + if (err) + return err; + + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_TX_RATE); + } + tid_conf->mask |= BIT(NL80211_TID_CONFIG_ATTR_TX_RATE_TYPE); + } + if (peer) mask = rdev->wiphy.tid_config_support.peer; else @@ -15215,14 +15316,27 @@ static int nl80211_add_scan_req(struct sk_buff *msg, } nla_nest_end(msg, nest); - nest = nla_nest_start_noflag(msg, NL80211_ATTR_SCAN_FREQUENCIES); - if (!nest) - goto nla_put_failure; - for (i = 0; i < req->n_channels; i++) { - if (nla_put_u32(msg, i, req->channels[i]->center_freq)) + if (req->flags & NL80211_SCAN_FLAG_FREQ_KHZ) { + nest = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQ_KHZ); + if (!nest) goto nla_put_failure; + for (i = 0; i < req->n_channels; i++) { + if (nla_put_u32(msg, i, + ieee80211_channel_to_khz(req->channels[i]))) + goto nla_put_failure; + } + nla_nest_end(msg, nest); + } else { + nest = nla_nest_start_noflag(msg, + NL80211_ATTR_SCAN_FREQUENCIES); + if (!nest) + goto nla_put_failure; + for (i = 0; i < req->n_channels; i++) { + if (nla_put_u32(msg, i, req->channels[i]->center_freq)) + goto nla_put_failure; + } + nla_nest_end(msg, nest); } - nla_nest_end(msg, nest); if (req->ie && nla_put(msg, NL80211_ATTR_IE, req->ie_len, req->ie)) @@ -15542,10 +15656,19 @@ void cfg80211_rx_unprot_mlme_mgmt(struct net_device *dev, const u8 *buf, if (WARN_ON(len < 2)) return; - if (ieee80211_is_deauth(mgmt->frame_control)) + if (ieee80211_is_deauth(mgmt->frame_control)) { cmd = NL80211_CMD_UNPROT_DEAUTHENTICATE; - else + } else if (ieee80211_is_disassoc(mgmt->frame_control)) { cmd = NL80211_CMD_UNPROT_DISASSOCIATE; + } else if (ieee80211_is_beacon(mgmt->frame_control)) { + if (wdev->unprot_beacon_reported && + elapsed_jiffies_msecs(wdev->unprot_beacon_reported) < 10000) + return; + cmd = NL80211_CMD_UNPROT_BEACON; + wdev->unprot_beacon_reported = jiffies; + } else { + return; + } trace_cfg80211_rx_unprot_mlme_mgmt(dev, buf, len); nl80211_send_mlme_event(rdev, dev, buf, len, cmd, GFP_ATOMIC, -1, @@ -16224,7 +16347,8 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, netdev->ifindex)) || nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), NL80211_ATTR_PAD) || - nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, KHZ_TO_MHZ(freq)) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, freq % 1000) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || nla_put(msg, NL80211_ATTR_FRAME, len, buf) || @@ -16241,8 +16365,9 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, return -ENOBUFS; } -void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, - const u8 *buf, size_t len, bool ack, gfp_t gfp) +static void nl80211_frame_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp, enum nl80211_commands command) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); @@ -16250,13 +16375,16 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, struct sk_buff *msg; void *hdr; - trace_cfg80211_mgmt_tx_status(wdev, cookie, ack); + if (command == NL80211_CMD_FRAME_TX_STATUS) + trace_cfg80211_mgmt_tx_status(wdev, cookie, ack); + else + trace_cfg80211_control_port_tx_status(wdev, cookie, ack); msg = nlmsg_new(100 + len, gfp); if (!msg) return; - hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME_TX_STATUS); + hdr = nl80211hdr_put(msg, 0, 0, 0, command); if (!hdr) { nlmsg_free(msg); return; @@ -16279,9 +16407,25 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, NL80211_MCGRP_MLME, gfp); return; - nla_put_failure: +nla_put_failure: nlmsg_free(msg); } + +void cfg80211_control_port_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp) +{ + nl80211_frame_tx_status(wdev, cookie, buf, len, ack, gfp, + NL80211_CMD_CONTROL_PORT_FRAME_TX_STATUS); +} +EXPORT_SYMBOL(cfg80211_control_port_tx_status); + +void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie, + const u8 *buf, size_t len, bool ack, gfp_t gfp) +{ + nl80211_frame_tx_status(wdev, cookie, buf, len, ack, gfp, + NL80211_CMD_FRAME_TX_STATUS); +} EXPORT_SYMBOL(cfg80211_mgmt_tx_status); static int __nl80211_rx_control_port(struct net_device *dev, @@ -16850,9 +16994,8 @@ void cfg80211_probe_status(struct net_device *dev, const u8 *addr, } EXPORT_SYMBOL(cfg80211_probe_status); -void cfg80211_report_obss_beacon(struct wiphy *wiphy, - const u8 *frame, size_t len, - int freq, int sig_dbm) +void cfg80211_report_obss_beacon_khz(struct wiphy *wiphy, const u8 *frame, + size_t len, int freq, int sig_dbm) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct sk_buff *msg; @@ -16875,7 +17018,10 @@ void cfg80211_report_obss_beacon(struct wiphy *wiphy, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || (freq && - nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq)) || + (nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, + KHZ_TO_MHZ(freq)) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ_OFFSET, + freq % 1000))) || (sig_dbm && nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || nla_put(msg, NL80211_ATTR_FRAME, len, frame)) @@ -16892,7 +17038,7 @@ void cfg80211_report_obss_beacon(struct wiphy *wiphy, spin_unlock_bh(&rdev->beacon_registrations_lock); nlmsg_free(msg); } -EXPORT_SYMBOL(cfg80211_report_obss_beacon); +EXPORT_SYMBOL(cfg80211_report_obss_beacon_khz); #ifdef CONFIG_PM static int cfg80211_net_detect_results(struct sk_buff *msg, diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index a41e94a49a89..d3e8e426c486 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -11,8 +11,6 @@ int nl80211_init(void); void nl80211_exit(void); -extern const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; - void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq, int flags, u8 cmd); bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index 63dc8023447f..a95c79d18349 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -187,10 +187,9 @@ static int pmsr_parse_peer(struct cfg80211_registered_device *rdev, /* reuse info->attrs */ memset(info->attrs, 0, sizeof(*info->attrs) * (NL80211_ATTR_MAX + 1)); - /* need to validate here, we don't want to have validation recursion */ err = nla_parse_nested_deprecated(info->attrs, NL80211_ATTR_MAX, tb[NL80211_PMSR_PEER_ATTR_CHAN], - nl80211_policy, info->extack); + NULL, info->extack); if (err) return err; diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c index 6582d155e2fc..d5e28239e030 100644 --- a/net/wireless/radiotap.c +++ b/net/wireless/radiotap.c @@ -90,7 +90,7 @@ static const struct ieee80211_radiotap_namespace radiotap_ns = { * iterator.this_arg for type "type" safely on all arches. * * Example code: - * See Documentation/networking/radiotap-headers.txt + * See Documentation/networking/radiotap-headers.rst */ int ieee80211_radiotap_iterator_init( diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 99462f0c4e08..950d57494168 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -748,14 +748,17 @@ static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev, struct net_device *dev, const void *buf, size_t len, const u8 *dest, __be16 proto, - const bool noencrypt) + const bool noencrypt, u64 *cookie) { int ret; trace_rdev_tx_control_port(&rdev->wiphy, dev, buf, len, dest, proto, noencrypt); ret = rdev->ops->tx_control_port(&rdev->wiphy, dev, buf, len, - dest, proto, noencrypt); - trace_rdev_return_int(&rdev->wiphy, ret); + dest, proto, noencrypt, cookie); + if (cookie) + trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); + else + trace_rdev_return_int(&rdev->wiphy, ret); return ret; } @@ -819,13 +822,16 @@ rdev_set_cqm_txe_config(struct cfg80211_registered_device *rdev, } static inline void -rdev_mgmt_frame_register(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, u16 frame_type, bool reg) +rdev_update_mgmt_frame_registrations(struct cfg80211_registered_device *rdev, + struct wireless_dev *wdev, + struct mgmt_frame_regs *upd) { might_sleep(); - trace_rdev_mgmt_frame_register(&rdev->wiphy, wdev , frame_type, reg); - rdev->ops->mgmt_frame_register(&rdev->wiphy, wdev , frame_type, reg); + trace_rdev_update_mgmt_frame_registrations(&rdev->wiphy, wdev, upd); + if (rdev->ops->update_mgmt_frame_registrations) + rdev->ops->update_mgmt_frame_registrations(&rdev->wiphy, wdev, + upd); trace_rdev_return_void(&rdev->wiphy); } diff --git a/net/wireless/reg.c b/net/wireless/reg.c index d476d4da0d09..0d74a31ef0ab 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1658,22 +1658,23 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd const struct ieee80211_channel *chan) { const struct ieee80211_freq_range *freq_range = NULL; - u32 max_bandwidth_khz, bw_flags = 0; + u32 max_bandwidth_khz, center_freq_khz, bw_flags = 0; freq_range = ®_rule->freq_range; max_bandwidth_khz = freq_range->max_bandwidth_khz; + center_freq_khz = ieee80211_channel_to_khz(chan); /* Check if auto calculation requested */ if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); /* If we get a reg_rule we can assume that at least 5Mhz fit */ if (!cfg80211_does_bw_fit_range(freq_range, - MHZ_TO_KHZ(chan->center_freq), + center_freq_khz, MHZ_TO_KHZ(10))) bw_flags |= IEEE80211_CHAN_NO_10MHZ; if (!cfg80211_does_bw_fit_range(freq_range, - MHZ_TO_KHZ(chan->center_freq), + center_freq_khz, MHZ_TO_KHZ(20))) bw_flags |= IEEE80211_CHAN_NO_20MHZ; @@ -1710,7 +1711,7 @@ static void handle_channel(struct wiphy *wiphy, flags = chan->orig_flags; - reg_rule = freq_reg_info(wiphy, MHZ_TO_KHZ(chan->center_freq)); + reg_rule = freq_reg_info(wiphy, ieee80211_channel_to_khz(chan)); if (IS_ERR(reg_rule)) { /* * We will disable all channels that do not match our @@ -1729,13 +1730,13 @@ static void handle_channel(struct wiphy *wiphy, if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER && request_wiphy && request_wiphy == wiphy && request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) { - pr_debug("Disabling freq %d MHz for good\n", - chan->center_freq); + pr_debug("Disabling freq %d.%03d MHz for good\n", + chan->center_freq, chan->freq_offset); chan->orig_flags |= IEEE80211_CHAN_DISABLED; chan->flags = chan->orig_flags; } else { - pr_debug("Disabling freq %d MHz\n", - chan->center_freq); + pr_debug("Disabling freq %d.%03d MHz\n", + chan->center_freq, chan->freq_offset); chan->flags |= IEEE80211_CHAN_DISABLED; } return; @@ -1936,7 +1937,7 @@ static void handle_reg_beacon(struct wiphy *wiphy, unsigned int chan_idx, sband = wiphy->bands[reg_beacon->chan.band]; chan = &sband->channels[chan_idx]; - if (likely(chan->center_freq != reg_beacon->chan.center_freq)) + if (likely(!ieee80211_channel_equal(chan, ®_beacon->chan))) return; if (chan->beacon_found) @@ -2269,18 +2270,18 @@ static void handle_channel_custom(struct wiphy *wiphy, u32 bw_flags = 0; const struct ieee80211_reg_rule *reg_rule = NULL; const struct ieee80211_power_rule *power_rule = NULL; - u32 bw; + u32 bw, center_freq_khz; + center_freq_khz = ieee80211_channel_to_khz(chan); for (bw = MHZ_TO_KHZ(20); bw >= min_bw; bw = bw / 2) { - reg_rule = freq_reg_info_regd(MHZ_TO_KHZ(chan->center_freq), - regd, bw); + reg_rule = freq_reg_info_regd(center_freq_khz, regd, bw); if (!IS_ERR(reg_rule)) break; } if (IS_ERR_OR_NULL(reg_rule)) { - pr_debug("Disabling freq %d MHz as custom regd has no rule that fits it\n", - chan->center_freq); + pr_debug("Disabling freq %d.%03d MHz as custom regd has no rule that fits it\n", + chan->center_freq, chan->freq_offset); if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) { chan->flags |= IEEE80211_CHAN_DISABLED; } else { @@ -3337,8 +3338,8 @@ static bool pending_reg_beacon(struct ieee80211_channel *beacon_chan) struct reg_beacon *pending_beacon; list_for_each_entry(pending_beacon, ®_pending_beacons, list) - if (beacon_chan->center_freq == - pending_beacon->chan.center_freq) + if (ieee80211_channel_equal(beacon_chan, + &pending_beacon->chan)) return true; return false; } @@ -3367,9 +3368,10 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy, if (!reg_beacon) return -ENOMEM; - pr_debug("Found new beacon on frequency: %d MHz (Ch %d) on %s\n", - beacon_chan->center_freq, - ieee80211_frequency_to_channel(beacon_chan->center_freq), + pr_debug("Found new beacon on frequency: %d.%03d MHz (Ch %d) on %s\n", + beacon_chan->center_freq, beacon_chan->freq_offset, + ieee80211_freq_khz_to_channel( + ieee80211_channel_to_khz(beacon_chan)), wiphy_name(wiphy)); memcpy(®_beacon->chan, beacon_chan, diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 4000382aef48..74ea4cfb39fb 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1322,8 +1322,8 @@ cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, return channel; } - freq = ieee80211_channel_to_frequency(channel_number, channel->band); - alt_channel = ieee80211_get_channel(wiphy, freq); + freq = ieee80211_channel_to_freq_khz(channel_number, channel->band); + alt_channel = ieee80211_get_channel_khz(wiphy, freq); if (!alt_channel) { if (channel->band == NL80211_BAND_2GHZ) { /* diff --git a/net/wireless/sme.c b/net/wireless/sme.c index ac3e60aa1fc8..15595cf401de 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -5,7 +5,7 @@ * (for nl80211's connect() and wext) * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> - * Copyright (C) 2009 Intel Corporation. All rights reserved. + * Copyright (C) 2009, 2020 Intel Corporation. All rights reserved. * Copyright 2017 Intel Deutschland GmbH */ @@ -694,6 +694,7 @@ void __cfg80211_connect_result(struct net_device *dev, return; } + wdev->unprot_beacon_reported = 0; nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, cr, GFP_KERNEL); @@ -921,6 +922,7 @@ void __cfg80211_roamed(struct wireless_dev *wdev, cfg80211_hold_bss(bss_from_pub(info->bss)); wdev->current_bss = bss_from_pub(info->bss); + wdev->unprot_beacon_reported = 0; nl80211_send_roamed(wiphy_to_rdev(wdev->wiphy), wdev->netdev, info, GFP_KERNEL); @@ -1116,7 +1118,10 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, if (wiphy_ext_feature_isset( wdev->wiphy, - NL80211_EXT_FEATURE_BEACON_PROTECTION)) + NL80211_EXT_FEATURE_BEACON_PROTECTION) || + wiphy_ext_feature_isset( + wdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; for (i = 0; i <= max_key_idx; i++) rdev_del_key(rdev, dev, i, false, NULL); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 839df54cee21..b23cab016521 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -112,24 +112,29 @@ } while (0) #define CHAN_ENTRY __field(enum nl80211_band, band) \ - __field(u32, center_freq) + __field(u32, center_freq) \ + __field(u16, freq_offset) #define CHAN_ASSIGN(chan) \ do { \ if (chan) { \ __entry->band = chan->band; \ __entry->center_freq = chan->center_freq; \ + __entry->freq_offset = chan->freq_offset; \ } else { \ __entry->band = 0; \ __entry->center_freq = 0; \ + __entry->freq_offset = 0; \ } \ } while (0) -#define CHAN_PR_FMT "band: %d, freq: %u" -#define CHAN_PR_ARG __entry->band, __entry->center_freq +#define CHAN_PR_FMT "band: %d, freq: %u.%03u" +#define CHAN_PR_ARG __entry->band, __entry->center_freq, __entry->freq_offset #define CHAN_DEF_ENTRY __field(enum nl80211_band, band) \ __field(u32, control_freq) \ + __field(u32, freq_offset) \ __field(u32, width) \ __field(u32, center_freq1) \ + __field(u32, freq1_offset) \ __field(u32, center_freq2) #define CHAN_DEF_ASSIGN(chandef) \ do { \ @@ -137,21 +142,27 @@ __entry->band = (chandef)->chan->band; \ __entry->control_freq = \ (chandef)->chan->center_freq; \ + __entry->freq_offset = \ + (chandef)->chan->freq_offset; \ __entry->width = (chandef)->width; \ __entry->center_freq1 = (chandef)->center_freq1;\ + __entry->freq1_offset = (chandef)->freq1_offset;\ __entry->center_freq2 = (chandef)->center_freq2;\ } else { \ __entry->band = 0; \ __entry->control_freq = 0; \ + __entry->freq_offset = 0; \ __entry->width = 0; \ __entry->center_freq1 = 0; \ + __entry->freq1_offset = 0; \ __entry->center_freq2 = 0; \ } \ } while (0) #define CHAN_DEF_PR_FMT \ - "band: %d, control freq: %u, width: %d, cf1: %u, cf2: %u" + "band: %d, control freq: %u.%03u, width: %d, cf1: %u.%03u, cf2: %u" #define CHAN_DEF_PR_ARG __entry->band, __entry->control_freq, \ - __entry->width, __entry->center_freq1, \ + __entry->freq_offset, __entry->width, \ + __entry->center_freq1, __entry->freq1_offset, \ __entry->center_freq2 #define SINFO_ENTRY __field(int, generation) \ @@ -1582,25 +1593,25 @@ TRACE_EVENT(rdev_set_bitrate_mask, WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(peer)) ); -TRACE_EVENT(rdev_mgmt_frame_register, +TRACE_EVENT(rdev_update_mgmt_frame_registrations, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, - u16 frame_type, bool reg), - TP_ARGS(wiphy, wdev, frame_type, reg), + struct mgmt_frame_regs *upd), + TP_ARGS(wiphy, wdev, upd), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY - __field(u16, frame_type) - __field(bool, reg) + __field(u16, global_stypes) + __field(u16, interface_stypes) ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; - __entry->frame_type = frame_type; - __entry->reg = reg; + __entry->global_stypes = upd->global_stypes; + __entry->interface_stypes = upd->interface_stypes; ), - TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", frame_type: 0x%.2x, reg: %s ", - WIPHY_PR_ARG, WDEV_PR_ARG, __entry->frame_type, - __entry->reg ? "true" : "false") + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", global: 0x%.2x, intf: 0x%.2x", + WIPHY_PR_ARG, WDEV_PR_ARG, + __entry->global_stypes, __entry->interface_stypes) ); TRACE_EVENT(rdev_return_int_tx_rx, @@ -2829,8 +2840,8 @@ TRACE_EVENT(cfg80211_rx_mgmt, __entry->freq = freq; __entry->sig_dbm = sig_dbm; ), - TP_printk(WDEV_PR_FMT ", freq: %d, sig dbm: %d", - WDEV_PR_ARG, __entry->freq, __entry->sig_dbm) + TP_printk(WDEV_PR_FMT ", freq: "KHZ_F", sig dbm: %d", + WDEV_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm) ); TRACE_EVENT(cfg80211_mgmt_tx_status, @@ -2850,6 +2861,23 @@ TRACE_EVENT(cfg80211_mgmt_tx_status, WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack)) ); +TRACE_EVENT(cfg80211_control_port_tx_status, + TP_PROTO(struct wireless_dev *wdev, u64 cookie, bool ack), + TP_ARGS(wdev, cookie, ack), + TP_STRUCT__entry( + WDEV_ENTRY + __field(u64, cookie) + __field(bool, ack) + ), + TP_fast_assign( + WDEV_ASSIGN; + __entry->cookie = cookie; + __entry->ack = ack; + ), + TP_printk(WDEV_PR_FMT", cookie: %llu, ack: %s", + WDEV_PR_ARG, __entry->cookie, BOOL_TO_STR(__entry->ack)) +); + TRACE_EVENT(cfg80211_rx_control_port, TP_PROTO(struct net_device *netdev, struct sk_buff *skb, bool unencrypted), @@ -3110,8 +3138,8 @@ TRACE_EVENT(cfg80211_report_obss_beacon, __entry->freq = freq; __entry->sig_dbm = sig_dbm; ), - TP_printk(WIPHY_PR_FMT ", freq: %d, sig_dbm: %d", - WIPHY_PR_ARG, __entry->freq, __entry->sig_dbm) + TP_printk(WIPHY_PR_FMT ", freq: "KHZ_F", sig_dbm: %d", + WIPHY_PR_ARG, PR_KHZ(__entry->freq), __entry->sig_dbm) ); TRACE_EVENT(cfg80211_tdls_oper_request, diff --git a/net/wireless/util.c b/net/wireless/util.c index 6590efbbcbb9..4d3b76f94f55 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -5,7 +5,7 @@ * Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2019 Intel Corporation + * Copyright (C) 2018-2020 Intel Corporation */ #include <linux/export.h> #include <linux/bitops.h> @@ -72,7 +72,7 @@ u32 ieee80211_mandatory_rates(struct ieee80211_supported_band *sband, } EXPORT_SYMBOL(ieee80211_mandatory_rates); -int ieee80211_channel_to_frequency(int chan, enum nl80211_band band) +u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) { /* see 802.11 17.3.8.3.2 and Annex J * there are overlapping channel numbers in 5GHz and 2GHz bands */ @@ -81,34 +81,39 @@ int ieee80211_channel_to_frequency(int chan, enum nl80211_band band) switch (band) { case NL80211_BAND_2GHZ: if (chan == 14) - return 2484; + return MHZ_TO_KHZ(2484); else if (chan < 14) - return 2407 + chan * 5; + return MHZ_TO_KHZ(2407 + chan * 5); break; case NL80211_BAND_5GHZ: if (chan >= 182 && chan <= 196) - return 4000 + chan * 5; + return MHZ_TO_KHZ(4000 + chan * 5); else - return 5000 + chan * 5; + return MHZ_TO_KHZ(5000 + chan * 5); break; case NL80211_BAND_6GHZ: - /* see 802.11ax D4.1 27.3.22.2 */ + /* see 802.11ax D6.1 27.3.23.2 */ + if (chan == 2) + return MHZ_TO_KHZ(5935); if (chan <= 253) - return 5940 + chan * 5; + return MHZ_TO_KHZ(5950 + chan * 5); break; case NL80211_BAND_60GHZ: if (chan < 7) - return 56160 + chan * 2160; + return MHZ_TO_KHZ(56160 + chan * 2160); break; default: ; } return 0; /* not supported */ } -EXPORT_SYMBOL(ieee80211_channel_to_frequency); +EXPORT_SYMBOL(ieee80211_channel_to_freq_khz); -int ieee80211_frequency_to_channel(int freq) +int ieee80211_freq_khz_to_channel(u32 freq) { + /* TODO: just handle MHz for now */ + freq = KHZ_TO_MHZ(freq); + /* see 802.11 17.3.8.3.2 and Annex J */ if (freq == 2484) return 14; @@ -126,9 +131,10 @@ int ieee80211_frequency_to_channel(int freq) else return 0; } -EXPORT_SYMBOL(ieee80211_frequency_to_channel); +EXPORT_SYMBOL(ieee80211_freq_khz_to_channel); -struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq) +struct ieee80211_channel *ieee80211_get_channel_khz(struct wiphy *wiphy, + u32 freq) { enum nl80211_band band; struct ieee80211_supported_band *sband; @@ -141,14 +147,16 @@ struct ieee80211_channel *ieee80211_get_channel(struct wiphy *wiphy, int freq) continue; for (i = 0; i < sband->n_channels; i++) { - if (sband->channels[i].center_freq == freq) - return &sband->channels[i]; + struct ieee80211_channel *chan = &sband->channels[i]; + + if (ieee80211_channel_to_khz(chan) == freq) + return chan; } } return NULL; } -EXPORT_SYMBOL(ieee80211_get_channel); +EXPORT_SYMBOL(ieee80211_get_channel_khz); static void set_mandatory_flags_band(struct ieee80211_supported_band *sband) { @@ -234,7 +242,9 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, int max_key_idx = 5; if (wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_BEACON_PROTECTION)) + NL80211_EXT_FEATURE_BEACON_PROTECTION) || + wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; if (key_idx < 0 || key_idx > max_key_idx) return -EINVAL; @@ -2030,10 +2040,10 @@ EXPORT_SYMBOL(cfg80211_send_layer2_update); int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, enum ieee80211_vht_chanwidth bw, - int mcs, bool ext_nss_bw_capable) + int mcs, bool ext_nss_bw_capable, + unsigned int max_vht_nss) { u16 map = le16_to_cpu(cap->supp_mcs.rx_mcs_map); - int max_vht_nss = 0; int ext_nss_bw; int supp_width; int i, mcs_encoding; @@ -2041,7 +2051,7 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, if (map == 0xffff) return 0; - if (WARN_ON(mcs > 9)) + if (WARN_ON(mcs > 9 || max_vht_nss > 8)) return 0; if (mcs <= 7) mcs_encoding = 0; @@ -2050,16 +2060,18 @@ int ieee80211_get_vht_max_nss(struct ieee80211_vht_cap *cap, else mcs_encoding = 2; - /* find max_vht_nss for the given MCS */ - for (i = 7; i >= 0; i--) { - int supp = (map >> (2 * i)) & 3; + if (!max_vht_nss) { + /* find max_vht_nss for the given MCS */ + for (i = 7; i >= 0; i--) { + int supp = (map >> (2 * i)) & 3; - if (supp == 3) - continue; + if (supp == 3) + continue; - if (supp >= mcs_encoding) { - max_vht_nss = i + 1; - break; + if (supp >= mcs_encoding) { + max_vht_nss = i + 1; + break; + } } } diff --git a/net/x25/Kconfig b/net/x25/Kconfig index 2ecb2e5e241e..9f0d58b0b90b 100644 --- a/net/x25/Kconfig +++ b/net/x25/Kconfig @@ -20,8 +20,8 @@ config X25 You can read more about X.25 at <http://www.sangoma.com/tutorials/x25/> and <http://docwiki.cisco.com/wiki/X.25>. Information about X.25 for Linux is contained in the files - <file:Documentation/networking/x25.txt> and - <file:Documentation/networking/x25-iface.txt>. + <file:Documentation/networking/x25.rst> and + <file:Documentation/networking/x25-iface.rst>. One connects to an X.25 network either with a dedicated network card using the X.21 protocol (not yet supported by Linux) or one can do diff --git a/net/xdp/Makefile b/net/xdp/Makefile index 71e2bdafb2ce..30cdc4315f42 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o +obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o +obj-$(CONFIG_XDP_SOCKETS) += xsk_buff_pool.o obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 3889bd9aec46..1bbaf1747e4f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -30,9 +30,9 @@ void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) if (!xs->tx) return; - spin_lock_irqsave(&umem->xsk_list_lock, flags); - list_add_rcu(&xs->list, &umem->xsk_list); - spin_unlock_irqrestore(&umem->xsk_list_lock, flags); + spin_lock_irqsave(&umem->xsk_tx_list_lock, flags); + list_add_rcu(&xs->list, &umem->xsk_tx_list); + spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags); } void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) @@ -42,9 +42,9 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) if (!xs->tx) return; - spin_lock_irqsave(&umem->xsk_list_lock, flags); + spin_lock_irqsave(&umem->xsk_tx_list_lock, flags); list_del_rcu(&xs->list); - spin_unlock_irqrestore(&umem->xsk_list_lock, flags); + spin_unlock_irqrestore(&umem->xsk_tx_list_lock, flags); } /* The umem is stored both in the _rx struct and the _tx struct as we do @@ -179,37 +179,6 @@ void xdp_umem_clear_dev(struct xdp_umem *umem) umem->zc = false; } -static void xdp_umem_unmap_pages(struct xdp_umem *umem) -{ - unsigned int i; - - for (i = 0; i < umem->npgs; i++) - if (PageHighMem(umem->pgs[i])) - vunmap(umem->pages[i].addr); -} - -static int xdp_umem_map_pages(struct xdp_umem *umem) -{ - unsigned int i; - void *addr; - - for (i = 0; i < umem->npgs; i++) { - if (PageHighMem(umem->pgs[i])) - addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL); - else - addr = page_address(umem->pgs[i]); - - if (!addr) { - xdp_umem_unmap_pages(umem); - return -ENOMEM; - } - - umem->pages[i].addr = addr; - } - - return 0; -} - static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true); @@ -244,14 +213,9 @@ static void xdp_umem_release(struct xdp_umem *umem) umem->cq = NULL; } - xsk_reuseq_destroy(umem); - - xdp_umem_unmap_pages(umem); + xp_destroy(umem->pool); xdp_umem_unpin_pages(umem); - kvfree(umem->pages); - umem->pages = NULL; - xdp_umem_unaccount_pages(umem); kfree(umem); } @@ -279,7 +243,7 @@ void xdp_put_umem(struct xdp_umem *umem) } } -static int xdp_umem_pin_pages(struct xdp_umem *umem) +static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) { unsigned int gup_flags = FOLL_WRITE; long npgs; @@ -291,7 +255,7 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem) return -ENOMEM; down_read(¤t->mm->mmap_sem); - npgs = pin_user_pages(umem->address, umem->npgs, + npgs = pin_user_pages(address, umem->npgs, gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL); up_read(¤t->mm->mmap_sem); @@ -389,18 +353,15 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (headroom >= chunk_size - XDP_PACKET_HEADROOM) return -EINVAL; - umem->address = (unsigned long)addr; - umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK - : ~((u64)chunk_size - 1); umem->size = size; umem->headroom = headroom; - umem->chunk_size_nohr = chunk_size - headroom; + umem->chunk_size = chunk_size; umem->npgs = (u32)npgs; umem->pgs = NULL; umem->user = NULL; umem->flags = mr->flags; - INIT_LIST_HEAD(&umem->xsk_list); - spin_lock_init(&umem->xsk_list_lock); + INIT_LIST_HEAD(&umem->xsk_tx_list); + spin_lock_init(&umem->xsk_tx_list_lock); refcount_set(&umem->users, 1); @@ -408,22 +369,17 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (err) return err; - err = xdp_umem_pin_pages(umem); + err = xdp_umem_pin_pages(umem, (unsigned long)addr); if (err) goto out_account; - umem->pages = kvcalloc(umem->npgs, sizeof(*umem->pages), - GFP_KERNEL_ACCOUNT); - if (!umem->pages) { + umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size, + headroom, size, unaligned_chunks); + if (!umem->pool) { err = -ENOMEM; goto out_pin; } - - err = xdp_umem_map_pages(umem); - if (!err) - return 0; - - kvfree(umem->pages); + return 0; out_pin: xdp_umem_unpin_pages(umem); diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index a63a9fb251f5..32067fe98f65 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -6,7 +6,7 @@ #ifndef XDP_UMEM_H_ #define XDP_UMEM_H_ -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u16 queue_id, u16 flags); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index c350108aa38d..b6c0f08bd80d 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -22,7 +22,7 @@ #include <linux/net.h> #include <linux/netdevice.h> #include <linux/rculist.h> -#include <net/xdp_sock.h> +#include <net/xdp_sock_drv.h> #include <net/xdp.h> #include "xsk_queue.h" @@ -39,24 +39,6 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) READ_ONCE(xs->umem->fq); } -bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) -{ - return xskq_cons_has_entries(umem->fq, cnt); -} -EXPORT_SYMBOL(xsk_umem_has_addrs); - -bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) -{ - return xskq_cons_peek_addr(umem->fq, addr, umem); -} -EXPORT_SYMBOL(xsk_umem_peek_addr); - -void xsk_umem_release_addr(struct xdp_umem *umem) -{ - xskq_cons_release(umem->fq); -} -EXPORT_SYMBOL(xsk_umem_release_addr); - void xsk_set_rx_need_wakeup(struct xdp_umem *umem) { if (umem->need_wakeup & XDP_WAKEUP_RX) @@ -75,7 +57,7 @@ void xsk_set_tx_need_wakeup(struct xdp_umem *umem) return; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { xs->tx->ring->flags |= XDP_RING_NEED_WAKEUP; } rcu_read_unlock(); @@ -102,7 +84,7 @@ void xsk_clear_tx_need_wakeup(struct xdp_umem *umem) return; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { xs->tx->ring->flags &= ~XDP_RING_NEED_WAKEUP; } rcu_read_unlock(); @@ -117,76 +99,82 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) } EXPORT_SYMBOL(xsk_umem_uses_need_wakeup); -/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for - * each page. This is only required in copy mode. - */ -static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf, - u32 len, u32 metalen) +void xp_release(struct xdp_buff_xsk *xskb) { - void *to_buf = xdp_umem_get_data(umem, addr); - - addr = xsk_umem_add_offset_to_addr(addr); - if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) { - void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr; - u64 page_start = addr & ~(PAGE_SIZE - 1); - u64 first_len = PAGE_SIZE - (addr - page_start); - - memcpy(to_buf, from_buf, first_len); - memcpy(next_pg_addr, from_buf + first_len, - len + metalen - first_len); + xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; +} - return; - } +static u64 xp_get_handle(struct xdp_buff_xsk *xskb) +{ + u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; - memcpy(to_buf, from_buf, len + metalen); + offset += xskb->pool->headroom; + if (!xskb->pool->unaligned) + return xskb->orig_addr + offset; + return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - u64 offset = xs->umem->headroom; - u64 addr, memcpy_addr; - void *from_buf; - u32 metalen; + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + u64 addr; int err; - if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || - len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { + addr = xp_get_handle(xskb); + err = xskq_prod_reserve_desc(xs->rx, addr, len); + if (err) { xs->rx_dropped++; - return -ENOSPC; + return err; } - if (unlikely(xdp_data_meta_unsupported(xdp))) { - from_buf = xdp->data; - metalen = 0; - } else { - from_buf = xdp->data_meta; - metalen = xdp->data - xdp->data_meta; - } + xp_release(xskb); + return 0; +} - memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen); +static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) +{ + void *from_buf, *to_buf; + u32 metalen; - offset += metalen; - addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - err = xskq_prod_reserve_desc(xs->rx, addr, len); - if (!err) { - xskq_cons_release(xs->umem->fq); - xdp_return_buff(xdp); - return 0; + if (unlikely(xdp_data_meta_unsupported(from))) { + from_buf = from->data; + to_buf = to->data; + metalen = 0; + } else { + from_buf = from->data_meta; + metalen = from->data - from->data_meta; + to_buf = to->data - metalen; } - xs->rx_dropped++; - return err; + memcpy(to_buf, from_buf, len + metalen); } -static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, + bool explicit_free) { - int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len); + struct xdp_buff *xsk_xdp; + int err; - if (err) + if (len > xsk_umem_get_rx_frame_size(xs->umem)) { xs->rx_dropped++; + return -ENOSPC; + } - return err; + xsk_xdp = xsk_buff_alloc(xs->umem); + if (!xsk_xdp) { + xs->rx_dropped++; + return -ENOSPC; + } + + xsk_copy_xdp(xsk_xdp, xdp, len); + err = __xsk_rcv_zc(xs, xsk_xdp, len); + if (err) { + xsk_buff_free(xsk_xdp); + return err; + } + if (explicit_free) + xdp_return_buff(xdp); + return 0; } static bool xsk_is_bound(struct xdp_sock *xs) @@ -199,7 +187,8 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, + bool explicit_free) { u32 len; @@ -211,8 +200,9 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) len = xdp->data_end - xdp->data; - return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ? - __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len); + return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? + __xsk_rcv_zc(xs, xdp, len) : + __xsk_rcv(xs, xdp, len, explicit_free); } static void xsk_flush(struct xdp_sock *xs) @@ -224,46 +214,11 @@ static void xsk_flush(struct xdp_sock *xs) int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { - u32 metalen = xdp->data - xdp->data_meta; - u32 len = xdp->data_end - xdp->data; - u64 offset = xs->umem->headroom; - void *buffer; - u64 addr; int err; spin_lock_bh(&xs->rx_lock); - - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) { - err = -EINVAL; - goto out_unlock; - } - - if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || - len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { - err = -ENOSPC; - goto out_drop; - } - - addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - buffer = xdp_umem_get_data(xs->umem, addr); - memcpy(buffer, xdp->data_meta, len + metalen); - - addr = xsk_umem_adjust_offset(xs->umem, addr, metalen); - err = xskq_prod_reserve_desc(xs->rx, addr, len); - if (err) - goto out_drop; - - xskq_cons_release(xs->umem->fq); - xskq_prod_submit(xs->rx); - - spin_unlock_bh(&xs->rx_lock); - - xs->sk.sk_data_ready(&xs->sk); - return 0; - -out_drop: - xs->rx_dropped++; -out_unlock: + err = xsk_rcv(xs, xdp, false); + xsk_flush(xs); spin_unlock_bh(&xs->rx_lock); return err; } @@ -273,7 +228,7 @@ int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); int err; - err = xsk_rcv(xs, xdp); + err = xsk_rcv(xs, xdp, true); if (err) return err; @@ -305,7 +260,7 @@ void xsk_umem_consume_tx_done(struct xdp_umem *umem) struct xdp_sock *xs; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { __xskq_cons_release(xs->tx); xs->sk.sk_write_space(&xs->sk); } @@ -318,11 +273,11 @@ bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc) struct xdp_sock *xs; rcu_read_lock(); - list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + list_for_each_entry_rcu(xs, &umem->xsk_tx_list, list) { if (!xskq_cons_peek_desc(xs->tx, desc, umem)) continue; - /* This is the backpreassure mechanism for the Tx path. + /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. @@ -404,9 +359,9 @@ static int xsk_generic_xmit(struct sock *sk) skb_put(skb, len); addr = desc.addr; - buffer = xdp_umem_get_data(xs->umem, addr); + buffer = xsk_buff_raw_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); - /* This is the backpreassure mechanism for the Tx path. + /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed * if there is space in it. This avoids having to implement * any buffering in the Tx path. @@ -629,24 +584,6 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd) return sock; } -/* Check if umem pages are contiguous. - * If zero-copy mode, use the DMA address to do the page contiguity check - * For all other modes we use addr (kernel virtual address) - * Store the result in the low bits of addr. - */ -static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags) -{ - struct xdp_umem_page *pgs = umem->pages; - int i, is_contig; - - for (i = 0; i < umem->npgs - 1; i++) { - is_contig = (flags & XDP_ZEROCOPY) ? - (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) : - (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr); - pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT; - } -} - static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; @@ -729,23 +666,14 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } else { /* This xsk has its own umem. */ - xskq_set_umem(xs->umem->fq, xs->umem->size, - xs->umem->chunk_mask); - xskq_set_umem(xs->umem->cq, xs->umem->size, - xs->umem->chunk_mask); - err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); if (err) goto out_unlock; - - xsk_check_page_contiguity(xs->umem, flags); } xs->dev = dev; xs->zc = xs->umem->zc; xs->queue_id = qid; - xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask); - xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask); xdp_add_sk_umem(xs->umem, xs); out_unlock: @@ -860,6 +788,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq : &xs->umem->cq; err = xsk_init_queue(entries, q, true); + if (optname == XDP_UMEM_FILL_RING) + xp_set_fq(xs->umem->pool, *q); mutex_unlock(&xs->mutex); return err; } diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h index 4cfd106bdb53..455ddd480f3d 100644 --- a/net/xdp/xsk.h +++ b/net/xdp/xsk.h @@ -4,6 +4,20 @@ #ifndef XSK_H_ #define XSK_H_ +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. + */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) + +/* Flags for the umem flags field. + * + * The NEED_WAKEUP flag is 1 due to the reuse of the flags field for public + * flags. See inlude/uapi/include/linux/if_xdp.h. + */ +#define XDP_UMEM_USES_NEED_WAKEUP BIT(1) + struct xdp_ring_offset_v1 { __u64 producer; __u64 consumer; @@ -17,9 +31,25 @@ struct xdp_mmap_offsets_v1 { struct xdp_ring_offset_v1 cr; }; +/* Nodes are linked in the struct xdp_sock map_list field, and used to + * track which maps a certain socket reside in. + */ + +struct xsk_map_node { + struct list_head node; + struct xsk_map *map; + struct xdp_sock **map_entry; +}; + static inline struct xdp_sock *xdp_sk(struct sock *sk) { return (struct xdp_sock *)sk; } +bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); +void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, + struct xdp_sock **map_entry); +int xsk_map_inc(struct xsk_map *map); +void xsk_map_put(struct xsk_map *map); + #endif /* XSK_H_ */ diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c new file mode 100644 index 000000000000..540ed75e4482 --- /dev/null +++ b/net/xdp/xsk_buff_pool.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <net/xsk_buff_pool.h> +#include <net/xdp_sock.h> +#include <linux/dma-direct.h> +#include <linux/dma-noncoherent.h> +#include <linux/swiotlb.h> + +#include "xsk_queue.h" + +static void xp_addr_unmap(struct xsk_buff_pool *pool) +{ + vunmap(pool->addrs); +} + +static int xp_addr_map(struct xsk_buff_pool *pool, + struct page **pages, u32 nr_pages) +{ + pool->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (!pool->addrs) + return -ENOMEM; + return 0; +} + +void xp_destroy(struct xsk_buff_pool *pool) +{ + if (!pool) + return; + + xp_addr_unmap(pool); + kvfree(pool->heads); + kvfree(pool); +} + +struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, + u32 chunk_size, u32 headroom, u64 size, + bool unaligned) +{ + struct xsk_buff_pool *pool; + struct xdp_buff_xsk *xskb; + int err; + u32 i; + + pool = kvzalloc(struct_size(pool, free_heads, chunks), GFP_KERNEL); + if (!pool) + goto out; + + pool->heads = kvcalloc(chunks, sizeof(*pool->heads), GFP_KERNEL); + if (!pool->heads) + goto out; + + pool->chunk_mask = ~((u64)chunk_size - 1); + pool->addrs_cnt = size; + pool->heads_cnt = chunks; + pool->free_heads_cnt = chunks; + pool->headroom = headroom; + pool->chunk_size = chunk_size; + pool->cheap_dma = true; + pool->unaligned = unaligned; + pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM; + INIT_LIST_HEAD(&pool->free_list); + + for (i = 0; i < pool->free_heads_cnt; i++) { + xskb = &pool->heads[i]; + xskb->pool = pool; + xskb->xdp.frame_sz = chunk_size - headroom; + pool->free_heads[i] = xskb; + } + + err = xp_addr_map(pool, pages, nr_pages); + if (!err) + return pool; + +out: + xp_destroy(pool); + return NULL; +} + +void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq) +{ + pool->fq = fq; +} + +void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) +{ + u32 i; + + for (i = 0; i < pool->heads_cnt; i++) + pool->heads[i].xdp.rxq = rxq; +} +EXPORT_SYMBOL(xp_set_rxq_info); + +void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) +{ + dma_addr_t *dma; + u32 i; + + if (pool->dma_pages_cnt == 0) + return; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + dma = &pool->dma_pages[i]; + if (*dma) { + dma_unmap_page_attrs(pool->dev, *dma, PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); + *dma = 0; + } + } + + kvfree(pool->dma_pages); + pool->dma_pages_cnt = 0; + pool->dev = NULL; +} +EXPORT_SYMBOL(xp_dma_unmap); + +static void xp_check_dma_contiguity(struct xsk_buff_pool *pool) +{ + u32 i; + + for (i = 0; i < pool->dma_pages_cnt - 1; i++) { + if (pool->dma_pages[i] + PAGE_SIZE == pool->dma_pages[i + 1]) + pool->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK; + else + pool->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK; + } +} + +static bool __maybe_unused xp_check_swiotlb_dma(struct xsk_buff_pool *pool) +{ +#if defined(CONFIG_SWIOTLB) + phys_addr_t paddr; + u32 i; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + paddr = dma_to_phys(pool->dev, pool->dma_pages[i]); + if (is_swiotlb_buffer(paddr)) + return false; + } +#endif + return true; +} + +static bool xp_check_cheap_dma(struct xsk_buff_pool *pool) +{ +#if defined(CONFIG_HAS_DMA) + const struct dma_map_ops *ops = get_dma_ops(pool->dev); + + if (ops) { + return !ops->sync_single_for_cpu && + !ops->sync_single_for_device; + } + + if (!dma_is_direct(ops)) + return false; + + if (!xp_check_swiotlb_dma(pool)) + return false; + + if (!dev_is_dma_coherent(pool->dev)) { +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) + return false; +#endif + } +#endif + return true; +} + +int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, + unsigned long attrs, struct page **pages, u32 nr_pages) +{ + dma_addr_t dma; + u32 i; + + pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages), + GFP_KERNEL); + if (!pool->dma_pages) + return -ENOMEM; + + pool->dev = dev; + pool->dma_pages_cnt = nr_pages; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); + if (dma_mapping_error(dev, dma)) { + xp_dma_unmap(pool, attrs); + return -ENOMEM; + } + pool->dma_pages[i] = dma; + } + + if (pool->unaligned) + xp_check_dma_contiguity(pool); + + pool->dev = dev; + pool->cheap_dma = xp_check_cheap_dma(pool); + return 0; +} +EXPORT_SYMBOL(xp_dma_map); + +static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool, + u64 addr) +{ + return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size); +} + +static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr) +{ + *addr = xp_unaligned_extract_addr(*addr); + if (*addr >= pool->addrs_cnt || + *addr + pool->chunk_size > pool->addrs_cnt || + xp_addr_crosses_non_contig_pg(pool, *addr)) + return false; + return true; +} + +static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr) +{ + *addr = xp_aligned_extract_addr(pool, *addr); + return *addr < pool->addrs_cnt; +} + +static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) +{ + struct xdp_buff_xsk *xskb; + u64 addr; + bool ok; + + if (pool->free_heads_cnt == 0) + return NULL; + + xskb = pool->free_heads[--pool->free_heads_cnt]; + + for (;;) { + if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { + xp_release(xskb); + return NULL; + } + + ok = pool->unaligned ? xp_check_unaligned(pool, &addr) : + xp_check_aligned(pool, &addr); + if (!ok) { + pool->fq->invalid_descs++; + xskq_cons_release(pool->fq); + continue; + } + break; + } + xskq_cons_release(pool->fq); + + xskb->orig_addr = addr; + xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; + if (pool->dma_pages_cnt) { + xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] & + ~XSK_NEXT_PG_CONTIG_MASK) + + (addr & ~PAGE_MASK); + xskb->dma = xskb->frame_dma + pool->headroom + + XDP_PACKET_HEADROOM; + } + return xskb; +} + +struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) +{ + struct xdp_buff_xsk *xskb; + + if (!pool->free_list_cnt) { + xskb = __xp_alloc(pool); + if (!xskb) + return NULL; + } else { + pool->free_list_cnt--; + xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, + free_list_node); + list_del(&xskb->free_list_node); + } + + xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; + xskb->xdp.data_meta = xskb->xdp.data; + + if (!pool->cheap_dma) { + dma_sync_single_range_for_device(pool->dev, xskb->dma, 0, + pool->frame_len, + DMA_BIDIRECTIONAL); + } + return &xskb->xdp; +} +EXPORT_SYMBOL(xp_alloc); + +bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count) +{ + if (pool->free_list_cnt >= count) + return true; + return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt); +} +EXPORT_SYMBOL(xp_can_alloc); + +void xp_free(struct xdp_buff_xsk *xskb) +{ + xskb->pool->free_list_cnt++; + list_add(&xskb->free_list_node, &xskb->pool->free_list); +} +EXPORT_SYMBOL(xp_free); + +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) +{ + addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; + return pool->addrs + addr; +} +EXPORT_SYMBOL(xp_raw_get_data); + +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) +{ + addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; + return (pool->dma_pages[addr >> PAGE_SHIFT] & + ~XSK_NEXT_PG_CONTIG_MASK) + + (addr & ~PAGE_MASK); +} +EXPORT_SYMBOL(xp_raw_get_dma); + +void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb) +{ + dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0, + xskb->pool->frame_len, DMA_BIDIRECTIONAL); +} +EXPORT_SYMBOL(xp_dma_sync_for_cpu_slow); + +void xp_dma_sync_for_device_slow(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size) +{ + dma_sync_single_range_for_device(pool->dev, dma, 0, + size, DMA_BIDIRECTIONAL); +} +EXPORT_SYMBOL(xp_dma_sync_for_device_slow); diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index f59791ba43a0..0163b26aaf63 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) du.id = umem->id; du.size = umem->size; du.num_pages = umem->npgs; - du.chunk_size = umem->chunk_size_nohr + umem->headroom; + du.chunk_size = umem->chunk_size; du.headroom = umem->headroom; du.ifindex = umem->dev ? umem->dev->ifindex : 0; du.queue_id = umem->queue_id; diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index c90e9c1e3c63..6cf9586e5027 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -6,18 +6,10 @@ #include <linux/log2.h> #include <linux/slab.h> #include <linux/overflow.h> +#include <net/xdp_sock_drv.h> #include "xsk_queue.h" -void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask) -{ - if (!q) - return; - - q->size = size; - q->chunk_mask = chunk_mask; -} - static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue) { struct xdp_umem_ring *umem_ring; @@ -63,56 +55,3 @@ void xskq_destroy(struct xsk_queue *q) page_frag_free(q->ring); kfree(q); } - -struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) -{ - struct xdp_umem_fq_reuse *newq; - - /* Check for overflow */ - if (nentries > (u32)roundup_pow_of_two(nentries)) - return NULL; - nentries = roundup_pow_of_two(nentries); - - newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL); - if (!newq) - return NULL; - memset(newq, 0, offsetof(typeof(*newq), handles)); - - newq->nentries = nentries; - return newq; -} -EXPORT_SYMBOL_GPL(xsk_reuseq_prepare); - -struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq) -{ - struct xdp_umem_fq_reuse *oldq = umem->fq_reuse; - - if (!oldq) { - umem->fq_reuse = newq; - return NULL; - } - - if (newq->nentries < oldq->length) - return newq; - - memcpy(newq->handles, oldq->handles, - array_size(oldq->length, sizeof(u64))); - newq->length = oldq->length; - - umem->fq_reuse = newq; - return oldq; -} -EXPORT_SYMBOL_GPL(xsk_reuseq_swap); - -void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) -{ - kvfree(rq); -} -EXPORT_SYMBOL_GPL(xsk_reuseq_free); - -void xsk_reuseq_destroy(struct xdp_umem *umem) -{ - xsk_reuseq_free(umem->fq_reuse); - umem->fq_reuse = NULL; -} diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index b50bb5c76da5..5b5d24d2dd37 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -9,6 +9,9 @@ #include <linux/types.h> #include <linux/if_xdp.h> #include <net/xdp_sock.h> +#include <net/xsk_buff_pool.h> + +#include "xsk.h" struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; @@ -29,8 +32,6 @@ struct xdp_umem_ring { }; struct xsk_queue { - u64 chunk_mask; - u64 size; u32 ring_mask; u32 nentries; u32 cached_prod; @@ -103,98 +104,73 @@ struct xsk_queue { /* Functions that read and validate content from consumer rings. */ -static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem, - u64 addr, - u64 length) +static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) { - bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE; - bool next_pg_contig = - (unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr & - XSK_NEXT_PG_CONTIG_MASK; - - return cross_pg && !next_pg_contig; -} + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; -static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, - u64 addr, - u64 length, - struct xdp_umem *umem) -{ - u64 base_addr = xsk_umem_extract_addr(addr); + if (q->cached_cons != q->cached_prod) { + u32 idx = q->cached_cons & q->ring_mask; - addr = xsk_umem_add_offset_to_addr(addr); - if (base_addr >= q->size || addr >= q->size || - xskq_cons_crosses_non_contig_pg(umem, addr, length)) { - q->invalid_descs++; - return false; + *addr = ring->desc[idx]; + return true; } - return true; + return false; } -static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) +static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) { - if (addr >= q->size) { - q->invalid_descs++; + u64 chunk, chunk_end; + + chunk = xp_aligned_extract_addr(pool, desc->addr); + chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len); + if (chunk != chunk_end) + return false; + + if (chunk >= pool->addrs_cnt) return false; - } + if (desc->options) + return false; return true; } -static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) +static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) { - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - - while (q->cached_cons != q->cached_prod) { - u32 idx = q->cached_cons & q->ring_mask; + u64 addr, base_addr; - *addr = ring->desc[idx] & q->chunk_mask; + base_addr = xp_unaligned_extract_addr(desc->addr); + addr = xp_unaligned_add_offset_to_addr(desc->addr); - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (xskq_cons_is_valid_unaligned(q, *addr, - umem->chunk_size_nohr, - umem)) - return true; - goto out; - } + if (desc->len > pool->chunk_size) + return false; - if (xskq_cons_is_valid_addr(q, *addr)) - return true; + if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt || + xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) + return false; -out: - q->cached_cons++; - } + if (desc->options) + return false; + return true; +} - return false; +static inline bool xp_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) : + xp_aligned_validate_desc(pool, desc); } static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, struct xdp_umem *umem) { - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (!xskq_cons_is_valid_unaligned(q, d->addr, d->len, umem)) - return false; - - if (d->len > umem->chunk_size_nohr || d->options) { - q->invalid_descs++; - return false; - } - - return true; - } - - if (!xskq_cons_is_valid_addr(q, d->addr)) - return false; - - if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) || - d->options) { + if (!xp_validate_desc(umem->pool, d)) { q->invalid_descs++; return false; } - return true; } @@ -250,12 +226,11 @@ static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) return entries >= cnt; } -static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) +static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) { if (q->cached_prod == q->cached_cons) xskq_cons_get_entries(q); - return xskq_cons_read_addr(q, addr, umem); + return xskq_cons_read_addr_unchecked(q, addr); } static inline bool xskq_cons_peek_desc(struct xsk_queue *q, @@ -379,11 +354,7 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } -void xskq_set_umem(struct xsk_queue *q, u64 size, u64 chunk_mask); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); -/* Executed by the core when the entire UMEM gets freed */ -void xsk_reuseq_destroy(struct xdp_umem *umem); - #endif /* _LINUX_XSK_QUEUE_H */ diff --git a/net/xdp/xskmap.c b/net/xdp/xskmap.c new file mode 100644 index 000000000000..1dc7208c71ba --- /dev/null +++ b/net/xdp/xskmap.c @@ -0,0 +1,267 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XSKMAP used for AF_XDP sockets + * Copyright(c) 2018 Intel Corporation. + */ + +#include <linux/bpf.h> +#include <linux/capability.h> +#include <net/xdp_sock.h> +#include <linux/slab.h> +#include <linux/sched.h> + +#include "xsk.h" + +int xsk_map_inc(struct xsk_map *map) +{ + bpf_map_inc(&map->map); + return 0; +} + +void xsk_map_put(struct xsk_map *map) +{ + bpf_map_put(&map->map); +} + +static struct xsk_map_node *xsk_map_node_alloc(struct xsk_map *map, + struct xdp_sock **map_entry) +{ + struct xsk_map_node *node; + int err; + + node = kzalloc(sizeof(*node), GFP_ATOMIC | __GFP_NOWARN); + if (!node) + return ERR_PTR(-ENOMEM); + + err = xsk_map_inc(map); + if (err) { + kfree(node); + return ERR_PTR(err); + } + + node->map = map; + node->map_entry = map_entry; + return node; +} + +static void xsk_map_node_free(struct xsk_map_node *node) +{ + xsk_map_put(node->map); + kfree(node); +} + +static void xsk_map_sock_add(struct xdp_sock *xs, struct xsk_map_node *node) +{ + spin_lock_bh(&xs->map_list_lock); + list_add_tail(&node->node, &xs->map_list); + spin_unlock_bh(&xs->map_list_lock); +} + +static void xsk_map_sock_delete(struct xdp_sock *xs, + struct xdp_sock **map_entry) +{ + struct xsk_map_node *n, *tmp; + + spin_lock_bh(&xs->map_list_lock); + list_for_each_entry_safe(n, tmp, &xs->map_list, node) { + if (map_entry == n->map_entry) { + list_del(&n->node); + xsk_map_node_free(n); + } + } + spin_unlock_bh(&xs->map_list_lock); +} + +static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) +{ + struct bpf_map_memory mem; + int err, numa_node; + struct xsk_map *m; + u64 size; + + if (!capable(CAP_NET_ADMIN)) + return ERR_PTR(-EPERM); + + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || + attr->map_flags & ~(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)) + return ERR_PTR(-EINVAL); + + numa_node = bpf_map_attr_numa_node(attr); + size = struct_size(m, xsk_map, attr->max_entries); + + err = bpf_map_charge_init(&mem, size); + if (err < 0) + return ERR_PTR(err); + + m = bpf_map_area_alloc(size, numa_node); + if (!m) { + bpf_map_charge_finish(&mem); + return ERR_PTR(-ENOMEM); + } + + bpf_map_init_from_attr(&m->map, attr); + bpf_map_charge_move(&m->map.memory, &mem); + spin_lock_init(&m->lock); + + return &m->map; +} + +static void xsk_map_free(struct bpf_map *map) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + + bpf_clear_redirect_map(map); + synchronize_net(); + bpf_map_area_free(m); +} + +static int xsk_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = next_key; + + if (index >= m->map.max_entries) { + *next = 0; + return 0; + } + + if (index == m->map.max_entries - 1) + return -ENOENT; + *next = index + 1; + return 0; +} + +static u32 xsk_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + const int ret = BPF_REG_0, mp = BPF_REG_1, index = BPF_REG_2; + struct bpf_insn *insn = insn_buf; + + *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(sizeof(struct xsk_sock *))); + *insn++ = BPF_ALU64_IMM(BPF_ADD, mp, offsetof(struct xsk_map, xsk_map)); + *insn++ = BPF_ALU64_REG(BPF_ADD, ret, mp); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(struct xsk_sock *), ret, ret, 0); + *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *insn++ = BPF_MOV64_IMM(ret, 0); + return insn - insn_buf; +} + +static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return __xsk_map_lookup_elem(map, *(u32 *)key); +} + +static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static int xsk_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *xs, *old_xs, **map_entry; + u32 i = *(u32 *)key, fd = *(u32 *)value; + struct xsk_map_node *node; + struct socket *sock; + int err; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + if (unlikely(i >= m->map.max_entries)) + return -E2BIG; + + sock = sockfd_lookup(fd, &err); + if (!sock) + return err; + + if (sock->sk->sk_family != PF_XDP) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + xs = (struct xdp_sock *)sock->sk; + + if (!xsk_is_setup_for_bpf_map(xs)) { + sockfd_put(sock); + return -EOPNOTSUPP; + } + + map_entry = &m->xsk_map[i]; + node = xsk_map_node_alloc(m, map_entry); + if (IS_ERR(node)) { + sockfd_put(sock); + return PTR_ERR(node); + } + + spin_lock_bh(&m->lock); + old_xs = READ_ONCE(*map_entry); + if (old_xs == xs) { + err = 0; + goto out; + } else if (old_xs && map_flags == BPF_NOEXIST) { + err = -EEXIST; + goto out; + } else if (!old_xs && map_flags == BPF_EXIST) { + err = -ENOENT; + goto out; + } + xsk_map_sock_add(xs, node); + WRITE_ONCE(*map_entry, xs); + if (old_xs) + xsk_map_sock_delete(old_xs, map_entry); + spin_unlock_bh(&m->lock); + sockfd_put(sock); + return 0; + +out: + spin_unlock_bh(&m->lock); + sockfd_put(sock); + xsk_map_node_free(node); + return err; +} + +static int xsk_map_delete_elem(struct bpf_map *map, void *key) +{ + struct xsk_map *m = container_of(map, struct xsk_map, map); + struct xdp_sock *old_xs, **map_entry; + int k = *(u32 *)key; + + if (k >= map->max_entries) + return -EINVAL; + + spin_lock_bh(&m->lock); + map_entry = &m->xsk_map[k]; + old_xs = xchg(map_entry, NULL); + if (old_xs) + xsk_map_sock_delete(old_xs, map_entry); + spin_unlock_bh(&m->lock); + + return 0; +} + +void xsk_map_try_sock_delete(struct xsk_map *map, struct xdp_sock *xs, + struct xdp_sock **map_entry) +{ + spin_lock_bh(&map->lock); + if (READ_ONCE(*map_entry) == xs) { + WRITE_ONCE(*map_entry, NULL); + xsk_map_sock_delete(xs, map_entry); + } + spin_unlock_bh(&map->lock); +} + +const struct bpf_map_ops xsk_map_ops = { + .map_alloc = xsk_map_alloc, + .map_free = xsk_map_free, + .map_get_next_key = xsk_map_get_next_key, + .map_lookup_elem = xsk_map_lookup_elem, + .map_gen_lookup = xsk_map_gen_lookup, + .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, + .map_update_elem = xsk_map_update_elem, + .map_delete_elem = xsk_map_delete_elem, + .map_check_btf = map_check_no_btf, +}; diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index 6921a18201a0..b7fd9c838416 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -99,4 +99,7 @@ config NET_KEY_MIGRATE If unsure, say N. +config XFRM_ESPINTCP + bool + endif # INET diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 212a4fcb4a88..2d4bb4b9f75e 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -11,4 +11,4 @@ obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o -obj-$(CONFIG_INET_ESPINTCP) += espintcp.o +obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index 5a0ff665b71a..100e29682b48 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -6,6 +6,9 @@ #include <net/espintcp.h> #include <linux/skmsg.h> #include <net/inet_common.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ipv6_stubs.h> +#endif static void handle_nonesp(struct espintcp_ctx *ctx, struct sk_buff *skb, struct sock *sk) @@ -31,7 +34,12 @@ static void handle_esp(struct sk_buff *skb, struct sock *sk) rcu_read_lock(); skb->dev = dev_get_by_index_rcu(sock_net(sk), skb->skb_iif); local_bh_disable(); - xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, TCP_ENCAP_ESPINTCP); +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) + ipv6_stub->xfrm6_rcv_encap(skb, IPPROTO_ESP, 0, TCP_ENCAP_ESPINTCP); + else +#endif + xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, TCP_ENCAP_ESPINTCP); local_bh_enable(); rcu_read_unlock(); } @@ -347,6 +355,9 @@ unlock: static struct proto espintcp_prot __ro_after_init; static struct proto_ops espintcp_ops __ro_after_init; +static struct proto espintcp6_prot; +static struct proto_ops espintcp6_ops; +static DEFINE_MUTEX(tcpv6_prot_mutex); static void espintcp_data_ready(struct sock *sk) { @@ -385,10 +396,14 @@ static void espintcp_destruct(struct sock *sk) bool tcp_is_ulp_esp(struct sock *sk) { - return sk->sk_prot == &espintcp_prot; + return sk->sk_prot == &espintcp_prot || sk->sk_prot == &espintcp6_prot; } EXPORT_SYMBOL_GPL(tcp_is_ulp_esp); +static void build_protos(struct proto *espintcp_prot, + struct proto_ops *espintcp_ops, + const struct proto *orig_prot, + const struct proto_ops *orig_ops); static int espintcp_init_sk(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -416,8 +431,19 @@ static int espintcp_init_sk(struct sock *sk) strp_check_rcv(&ctx->strp); skb_queue_head_init(&ctx->ike_queue); skb_queue_head_init(&ctx->out_queue); - sk->sk_prot = &espintcp_prot; - sk->sk_socket->ops = &espintcp_ops; + + if (sk->sk_family == AF_INET) { + sk->sk_prot = &espintcp_prot; + sk->sk_socket->ops = &espintcp_ops; + } else { + mutex_lock(&tcpv6_prot_mutex); + if (!espintcp6_prot.recvmsg) + build_protos(&espintcp6_prot, &espintcp6_ops, sk->sk_prot, sk->sk_socket->ops); + mutex_unlock(&tcpv6_prot_mutex); + + sk->sk_prot = &espintcp6_prot; + sk->sk_socket->ops = &espintcp6_ops; + } ctx->saved_data_ready = sk->sk_data_ready; ctx->saved_write_space = sk->sk_write_space; ctx->saved_destruct = sk->sk_destruct; @@ -491,6 +517,20 @@ static __poll_t espintcp_poll(struct file *file, struct socket *sock, return mask; } +static void build_protos(struct proto *espintcp_prot, + struct proto_ops *espintcp_ops, + const struct proto *orig_prot, + const struct proto_ops *orig_ops) +{ + memcpy(espintcp_prot, orig_prot, sizeof(struct proto)); + memcpy(espintcp_ops, orig_ops, sizeof(struct proto_ops)); + espintcp_prot->sendmsg = espintcp_sendmsg; + espintcp_prot->recvmsg = espintcp_recvmsg; + espintcp_prot->close = espintcp_close; + espintcp_prot->release_cb = espintcp_release; + espintcp_ops->poll = espintcp_poll; +} + static struct tcp_ulp_ops espintcp_ulp __read_mostly = { .name = "espintcp", .owner = THIS_MODULE, @@ -499,13 +539,7 @@ static struct tcp_ulp_ops espintcp_ulp __read_mostly = { void __init espintcp_init(void) { - memcpy(&espintcp_prot, &tcp_prot, sizeof(tcp_prot)); - memcpy(&espintcp_ops, &inet_stream_ops, sizeof(inet_stream_ops)); - espintcp_prot.sendmsg = espintcp_sendmsg; - espintcp_prot.recvmsg = espintcp_recvmsg; - espintcp_prot.close = espintcp_close; - espintcp_prot.release_cb = espintcp_release; - espintcp_ops.poll = espintcp_poll; + build_protos(&espintcp_prot, &espintcp_ops, &tcp_prot, &inet_stream_ops); tcp_register_ulp(&espintcp_ulp); } diff --git a/net/xfrm/xfrm_inout.h b/net/xfrm/xfrm_inout.h index c7b0318938e2..efc5e6b2e87b 100644 --- a/net/xfrm/xfrm_inout.h +++ b/net/xfrm/xfrm_inout.h @@ -6,6 +6,38 @@ #ifndef XFRM_INOUT_H #define XFRM_INOUT_H 1 +static inline void xfrm4_extract_header(struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + + XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); + XFRM_MODE_SKB_CB(skb)->id = iph->id; + XFRM_MODE_SKB_CB(skb)->frag_off = iph->frag_off; + XFRM_MODE_SKB_CB(skb)->tos = iph->tos; + XFRM_MODE_SKB_CB(skb)->ttl = iph->ttl; + XFRM_MODE_SKB_CB(skb)->optlen = iph->ihl * 4 - sizeof(*iph); + memset(XFRM_MODE_SKB_CB(skb)->flow_lbl, 0, + sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); +} + +static inline void xfrm6_extract_header(struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6hdr *iph = ipv6_hdr(skb); + + XFRM_MODE_SKB_CB(skb)->ihl = sizeof(*iph); + XFRM_MODE_SKB_CB(skb)->id = 0; + XFRM_MODE_SKB_CB(skb)->frag_off = htons(IP_DF); + XFRM_MODE_SKB_CB(skb)->tos = ipv6_get_dsfield(iph); + XFRM_MODE_SKB_CB(skb)->ttl = iph->hop_limit; + XFRM_MODE_SKB_CB(skb)->optlen = 0; + memcpy(XFRM_MODE_SKB_CB(skb)->flow_lbl, iph->flow_lbl, + sizeof(XFRM_MODE_SKB_CB(skb)->flow_lbl)); +#else + WARN_ON_ONCE(1); +#endif +} + static inline void xfrm6_beet_make_header(struct sk_buff *skb) { struct ipv6hdr *iph = ipv6_hdr(skb); diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 8a202c44f89a..bd984ff17c2d 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -353,17 +353,18 @@ xfrm_inner_mode_encap_remove(struct xfrm_state *x, static int xfrm_prepare_input(struct xfrm_state *x, struct sk_buff *skb) { const struct xfrm_mode *inner_mode = &x->inner_mode; - const struct xfrm_state_afinfo *afinfo; - int err = -EAFNOSUPPORT; - - rcu_read_lock(); - afinfo = xfrm_state_afinfo_get_rcu(x->outer_mode.family); - if (likely(afinfo)) - err = afinfo->extract_input(x, skb); - rcu_read_unlock(); - if (err) - return err; + switch (x->outer_mode.family) { + case AF_INET: + xfrm4_extract_header(skb); + break; + case AF_INET6: + xfrm6_extract_header(skb); + break; + default: + WARN_ON_ONCE(1); + return -EAFNOSUPPORT; + } if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 1e115cbf21d3..c407ecbc5d46 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -145,7 +145,6 @@ static int xfrmi_create(struct net_device *dev) if (err < 0) goto out; - dev_hold(dev); xfrmi_link(xfrmn, xi); return 0; @@ -175,7 +174,6 @@ static void xfrmi_dev_uninit(struct net_device *dev) struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); xfrmi_unlink(xfrmn, xi); - dev_put(dev); } static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) @@ -778,6 +776,7 @@ static struct pernet_operations xfrmi_net_ops = { static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, @@ -785,6 +784,7 @@ static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, @@ -792,6 +792,7 @@ static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { .handler = xfrm6_rcv, + .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 69c4900db817..e4c23f69f69f 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -13,9 +13,15 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <net/dst.h> +#include <net/icmp.h> #include <net/inet_ecn.h> #include <net/xfrm.h> +#if IS_ENABLED(CONFIG_IPV6) +#include <net/ip6_route.h> +#include <net/ipv6_stubs.h> +#endif + #include "xfrm_inout.h" static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb); @@ -565,6 +571,22 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) struct xfrm_state *x = skb_dst(skb)->xfrm; int err; + switch (x->outer_mode.family) { + case AF_INET: + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); +#ifdef CONFIG_NETFILTER + IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED; +#endif + break; + case AF_INET6: + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); + +#ifdef CONFIG_NETFILTER + IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; +#endif + break; + } + secpath_reset(skb); if (xfrm_dev_offload_ok(skb, x)) { @@ -611,11 +633,101 @@ out: } EXPORT_SYMBOL_GPL(xfrm_output); +static int xfrm4_tunnel_check_size(struct sk_buff *skb) +{ + int mtu, ret = 0; + + if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) + goto out; + + if (!(ip_hdr(skb)->frag_off & htons(IP_DF)) || skb->ignore_df) + goto out; + + mtu = dst_mtu(skb_dst(skb)); + if ((!skb_is_gso(skb) && skb->len > mtu) || + (skb_is_gso(skb) && + !skb_gso_validate_network_len(skb, ip_skb_dst_mtu(skb->sk, skb)))) { + skb->protocol = htons(ETH_P_IP); + + if (skb->sk) + xfrm_local_error(skb, mtu); + else + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_FRAG_NEEDED, htonl(mtu)); + ret = -EMSGSIZE; + } +out: + return ret; +} + +static int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + + err = xfrm4_tunnel_check_size(skb); + if (err) + return err; + + XFRM_MODE_SKB_CB(skb)->protocol = ip_hdr(skb)->protocol; + + xfrm4_extract_header(skb); + return 0; +} + +#if IS_ENABLED(CONFIG_IPV6) +static int xfrm6_tunnel_check_size(struct sk_buff *skb) +{ + int mtu, ret = 0; + struct dst_entry *dst = skb_dst(skb); + + if (skb->ignore_df) + goto out; + + mtu = dst_mtu(dst); + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + if ((!skb_is_gso(skb) && skb->len > mtu) || + (skb_is_gso(skb) && + !skb_gso_validate_network_len(skb, ip6_skb_dst_mtu(skb)))) { + skb->dev = dst->dev; + skb->protocol = htons(ETH_P_IPV6); + + if (xfrm6_local_dontfrag(skb->sk)) + ipv6_stub->xfrm6_local_rxpmtu(skb, mtu); + else if (skb->sk) + xfrm_local_error(skb, mtu); + else + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + ret = -EMSGSIZE; + } +out: + return ret; +} +#endif + +static int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb) +{ +#if IS_ENABLED(CONFIG_IPV6) + int err; + + err = xfrm6_tunnel_check_size(skb); + if (err) + return err; + + XFRM_MODE_SKB_CB(skb)->protocol = ipv6_hdr(skb)->nexthdr; + + xfrm6_extract_header(skb); + return 0; +#else + WARN_ON_ONCE(1); + return -EAFNOSUPPORT; +#endif +} + static int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) { - const struct xfrm_state_afinfo *afinfo; const struct xfrm_mode *inner_mode; - int err = -EAFNOSUPPORT; if (x->sel.family == AF_UNSPEC) inner_mode = xfrm_ip2inner_mode(x, @@ -626,13 +738,14 @@ static int xfrm_inner_extract_output(struct xfrm_state *x, struct sk_buff *skb) if (inner_mode == NULL) return -EAFNOSUPPORT; - rcu_read_lock(); - afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); - if (likely(afinfo)) - err = afinfo->extract_output(x, skb); - rcu_read_unlock(); + switch (inner_mode->family) { + case AF_INET: + return xfrm4_extract_output(x, skb); + case AF_INET6: + return xfrm6_extract_output(x, skb); + } - return err; + return -EAFNOSUPPORT; } void xfrm_local_error(struct sk_buff *skb, int mtu) |