diff options
author | Linus Torvalds | 2020-01-28 16:02:33 -0800 |
---|---|---|
committer | Linus Torvalds | 2020-01-28 16:02:33 -0800 |
commit | bd2463ac7d7ec51d432f23bf0e893fb371a908cd (patch) | |
tree | 3da32c23be83adb9d9bda7e51b51fa39f69f2447 /net/ipv4 | |
parent | a78208e2436963d0b2c7d186277d6e1a9755029a (diff) | |
parent | f76e4c167ea2212e23c15ee7e601a865e822c291 (diff) |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from David Miller:
1) Add WireGuard
2) Add HE and TWT support to ath11k driver, from John Crispin.
3) Add ESP in TCP encapsulation support, from Sabrina Dubroca.
4) Add variable window congestion control to TIPC, from Jon Maloy.
5) Add BCM84881 PHY driver, from Russell King.
6) Start adding netlink support for ethtool operations, from Michal
Kubecek.
7) Add XDP drop and TX action support to ena driver, from Sameeh
Jubran.
8) Add new ipv4 route notifications so that mlxsw driver does not have
to handle identical routes itself. From Ido Schimmel.
9) Add BPF dynamic program extensions, from Alexei Starovoitov.
10) Support RX and TX timestamping in igc, from Vinicius Costa Gomes.
11) Add support for macsec HW offloading, from Antoine Tenart.
12) Add initial support for MPTCP protocol, from Christoph Paasch,
Matthieu Baerts, Florian Westphal, Peter Krystad, and many others.
13) Add Octeontx2 PF support, from Sunil Goutham, Geetha sowjanya, Linu
Cherian, and others.
* git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1469 commits)
net: phy: add default ARCH_BCM_IPROC for MDIO_BCM_IPROC
udp: segment looped gso packets correctly
netem: change mailing list
qed: FW 8.42.2.0 debug features
qed: rt init valid initialization changed
qed: Debug feature: ilt and mdump
qed: FW 8.42.2.0 Add fw overlay feature
qed: FW 8.42.2.0 HSI changes
qed: FW 8.42.2.0 iscsi/fcoe changes
qed: Add abstraction for different hsi values per chip
qed: FW 8.42.2.0 Additional ll2 type
qed: Use dmae to write to widebus registers in fw_funcs
qed: FW 8.42.2.0 Parser offsets modified
qed: FW 8.42.2.0 Queue Manager changes
qed: FW 8.42.2.0 Expose new registers and change windows
qed: FW 8.42.2.0 Internal ram offsets modifications
MAINTAINERS: Add entry for Marvell OcteonTX2 Physical Function driver
Documentation: net: octeontx2: Add RVU HW and drivers overview
octeontx2-pf: ethtool RSS config support
octeontx2-pf: Add basic ethtool support
...
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/Kconfig | 11 | ||||
-rw-r--r-- | net/ipv4/Makefile | 4 | ||||
-rw-r--r-- | net/ipv4/bpf_tcp_ca.c | 252 | ||||
-rw-r--r-- | net/ipv4/esp4.c | 264 | ||||
-rw-r--r-- | net/ipv4/fib_lookup.h | 8 | ||||
-rw-r--r-- | net/ipv4/fib_semantics.c | 33 | ||||
-rw-r--r-- | net/ipv4/fib_trie.c | 194 | ||||
-rw-r--r-- | net/ipv4/gre_offload.c | 2 | ||||
-rw-r--r-- | net/ipv4/inet_connection_sock.c | 20 | ||||
-rw-r--r-- | net/ipv4/ip_output.c | 8 | ||||
-rw-r--r-- | net/ipv4/nexthop.c | 4 | ||||
-rw-r--r-- | net/ipv4/proc.c | 2 | ||||
-rw-r--r-- | net/ipv4/route.c | 31 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 9 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 14 | ||||
-rw-r--r-- | net/ipv4/tcp_cong.c | 16 | ||||
-rw-r--r-- | net/ipv4/tcp_cubic.c | 83 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 36 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 135 | ||||
-rw-r--r-- | net/ipv4/tcp_metrics.c | 13 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 4 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 73 | ||||
-rw-r--r-- | net/ipv4/tcp_timer.c | 6 | ||||
-rw-r--r-- | net/ipv4/udp.c | 6 | ||||
-rw-r--r-- | net/ipv4/udp_offload.c | 106 | ||||
-rw-r--r-- | net/ipv4/xfrm4_protocol.c | 9 |
26 files changed, 1120 insertions, 223 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index fc816b187170..f96bd489b362 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -378,6 +378,17 @@ config INET_ESP_OFFLOAD If unsure, say N. +config INET_ESPINTCP + bool "IP: ESP in TCP encapsulation (RFC 8229)" + depends on XFRM && INET_ESP + select STREAM_PARSER + select NET_SOCK_MSG + help + Support for RFC 8229 encapsulation of ESP and IKE over + TCP/IPv4 sockets. + + If unsure, say N. + config INET_IPCOMP tristate "IP: IPComp transformation" select INET_XFRM_TUNNEL diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index d57ecfaf89d4..9d97bace13c8 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -65,3 +65,7 @@ obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o + +ifeq ($(CONFIG_BPF_JIT),y) +obj-$(CONFIG_BPF_SYSCALL) += bpf_tcp_ca.o +endif diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c new file mode 100644 index 000000000000..574972bc7299 --- /dev/null +++ b/net/ipv4/bpf_tcp_ca.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2019 Facebook */ + +#include <linux/types.h> +#include <linux/bpf_verifier.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/filter.h> +#include <net/tcp.h> + +static u32 optional_ops[] = { + offsetof(struct tcp_congestion_ops, init), + offsetof(struct tcp_congestion_ops, release), + offsetof(struct tcp_congestion_ops, set_state), + offsetof(struct tcp_congestion_ops, cwnd_event), + offsetof(struct tcp_congestion_ops, in_ack_event), + offsetof(struct tcp_congestion_ops, pkts_acked), + offsetof(struct tcp_congestion_ops, min_tso_segs), + offsetof(struct tcp_congestion_ops, sndbuf_expand), + offsetof(struct tcp_congestion_ops, cong_control), +}; + +static u32 unsupported_ops[] = { + offsetof(struct tcp_congestion_ops, get_info), +}; + +static const struct btf_type *tcp_sock_type; +static u32 tcp_sock_id, sock_id; + +static int bpf_tcp_ca_init(struct btf *btf) +{ + s32 type_id; + + type_id = btf_find_by_name_kind(btf, "sock", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + sock_id = type_id; + + type_id = btf_find_by_name_kind(btf, "tcp_sock", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + tcp_sock_id = type_id; + tcp_sock_type = btf_type_by_id(btf, tcp_sock_id); + + return 0; +} + +static bool is_optional(u32 member_offset) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(optional_ops); i++) { + if (member_offset == optional_ops[i]) + return true; + } + + return false; +} + +static bool is_unsupported(u32 member_offset) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(unsupported_ops); i++) { + if (member_offset == unsupported_ops[i]) + return true; + } + + return false; +} + +extern struct btf *btf_vmlinux; + +static bool bpf_tcp_ca_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + + if (!btf_ctx_access(off, size, type, prog, info)) + return false; + + if (info->reg_type == PTR_TO_BTF_ID && info->btf_id == sock_id) + /* promote it to tcp_sock */ + info->btf_id = tcp_sock_id; + + return true; +} + +static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id) +{ + size_t end; + + if (atype == BPF_READ) + return btf_struct_access(log, t, off, size, atype, next_btf_id); + + if (t != tcp_sock_type) { + bpf_log(log, "only read is supported\n"); + return -EACCES; + } + + switch (off) { + case bpf_ctx_range(struct inet_connection_sock, icsk_ca_priv): + end = offsetofend(struct inet_connection_sock, icsk_ca_priv); + break; + case offsetof(struct inet_connection_sock, icsk_ack.pending): + end = offsetofend(struct inet_connection_sock, + icsk_ack.pending); + break; + case offsetof(struct tcp_sock, snd_cwnd): + end = offsetofend(struct tcp_sock, snd_cwnd); + break; + case offsetof(struct tcp_sock, snd_cwnd_cnt): + end = offsetofend(struct tcp_sock, snd_cwnd_cnt); + break; + case offsetof(struct tcp_sock, snd_ssthresh): + end = offsetofend(struct tcp_sock, snd_ssthresh); + break; + case offsetof(struct tcp_sock, ecn_flags): + end = offsetofend(struct tcp_sock, ecn_flags); + break; + default: + bpf_log(log, "no write support to tcp_sock at off %d\n", off); + return -EACCES; + } + + if (off + size > end) { + bpf_log(log, + "write access at off %d with size %d beyond the member of tcp_sock ended at %zu\n", + off, size, end); + return -EACCES; + } + + return NOT_INIT; +} + +BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt) +{ + /* bpf_tcp_ca prog cannot have NULL tp */ + __tcp_send_ack((struct sock *)tp, rcv_nxt); + return 0; +} + +static const struct bpf_func_proto bpf_tcp_send_ack_proto = { + .func = bpf_tcp_send_ack, + .gpl_only = false, + /* In case we want to report error later */ + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg2_type = ARG_ANYTHING, + .btf_id = &tcp_sock_id, +}; + +static const struct bpf_func_proto * +bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_tcp_send_ack: + return &bpf_tcp_send_ack_proto; + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = { + .get_func_proto = bpf_tcp_ca_get_func_proto, + .is_valid_access = bpf_tcp_ca_is_valid_access, + .btf_struct_access = bpf_tcp_ca_btf_struct_access, +}; + +static int bpf_tcp_ca_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct tcp_congestion_ops *utcp_ca; + struct tcp_congestion_ops *tcp_ca; + size_t tcp_ca_name_len; + int prog_fd; + u32 moff; + + utcp_ca = (const struct tcp_congestion_ops *)udata; + tcp_ca = (struct tcp_congestion_ops *)kdata; + + moff = btf_member_bit_offset(t, member) / 8; + switch (moff) { + case offsetof(struct tcp_congestion_ops, flags): + if (utcp_ca->flags & ~TCP_CONG_MASK) + return -EINVAL; + tcp_ca->flags = utcp_ca->flags; + return 1; + case offsetof(struct tcp_congestion_ops, name): + tcp_ca_name_len = strnlen(utcp_ca->name, sizeof(utcp_ca->name)); + if (!tcp_ca_name_len || + tcp_ca_name_len == sizeof(utcp_ca->name)) + return -EINVAL; + if (tcp_ca_find(utcp_ca->name)) + return -EEXIST; + memcpy(tcp_ca->name, utcp_ca->name, sizeof(tcp_ca->name)); + return 1; + } + + if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type, NULL)) + return 0; + + /* Ensure bpf_prog is provided for compulsory func ptr */ + prog_fd = (int)(*(unsigned long *)(udata + moff)); + if (!prog_fd && !is_optional(moff) && !is_unsupported(moff)) + return -EINVAL; + + return 0; +} + +static int bpf_tcp_ca_check_member(const struct btf_type *t, + const struct btf_member *member) +{ + if (is_unsupported(btf_member_bit_offset(t, member) / 8)) + return -ENOTSUPP; + return 0; +} + +static int bpf_tcp_ca_reg(void *kdata) +{ + return tcp_register_congestion_control(kdata); +} + +static void bpf_tcp_ca_unreg(void *kdata) +{ + tcp_unregister_congestion_control(kdata); +} + +/* Avoid sparse warning. It is only used in bpf_struct_ops.c. */ +extern struct bpf_struct_ops bpf_tcp_congestion_ops; + +struct bpf_struct_ops bpf_tcp_congestion_ops = { + .verifier_ops = &bpf_tcp_ca_verifier_ops, + .reg = bpf_tcp_ca_reg, + .unreg = bpf_tcp_ca_unreg, + .check_member = bpf_tcp_ca_check_member, + .init_member = bpf_tcp_ca_init_member, + .init = bpf_tcp_ca_init, + .name = "tcp_congestion_ops", +}; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 5c967764041f..103c7d599a3c 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -18,6 +18,8 @@ #include <net/icmp.h> #include <net/protocol.h> #include <net/udp.h> +#include <net/tcp.h> +#include <net/espintcp.h> #include <linux/highmem.h> @@ -117,6 +119,132 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) put_page(sg_page(sg)); } +#ifdef CONFIG_INET_ESPINTCP +struct esp_tcp_sk { + struct sock *sk; + struct rcu_head rcu; +}; + +static void esp_free_tcp_sk(struct rcu_head *head) +{ + struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); + + sock_put(esk->sk); + kfree(esk); +} + +static struct sock *esp_find_tcp_sk(struct xfrm_state *x) +{ + struct xfrm_encap_tmpl *encap = x->encap; + struct esp_tcp_sk *esk; + __be16 sport, dport; + struct sock *nsk; + struct sock *sk; + + sk = rcu_dereference(x->encap_sk); + if (sk && sk->sk_state == TCP_ESTABLISHED) + return sk; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (sk && sk == nsk) { + esk = kmalloc(sizeof(*esk), GFP_ATOMIC); + if (!esk) { + spin_unlock_bh(&x->lock); + return ERR_PTR(-ENOMEM); + } + RCU_INIT_POINTER(x->encap_sk, NULL); + esk->sk = sk; + call_rcu(&esk->rcu, esp_free_tcp_sk); + } + spin_unlock_bh(&x->lock); + + sk = inet_lookup_established(xs_net(x), &tcp_hashinfo, x->id.daddr.a4, + dport, x->props.saddr.a4, sport, 0); + if (!sk) + return ERR_PTR(-ENOENT); + + if (!tcp_is_ulp_esp(sk)) { + sock_put(sk); + return ERR_PTR(-EINVAL); + } + + spin_lock_bh(&x->lock); + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (encap->encap_sport != sport || + encap->encap_dport != dport) { + sock_put(sk); + sk = nsk ?: ERR_PTR(-EREMCHG); + } else if (sk == nsk) { + sock_put(sk); + } else { + rcu_assign_pointer(x->encap_sk, sk); + } + spin_unlock_bh(&x->lock); + + return sk; +} + +static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) +{ + struct sock *sk; + int err; + + rcu_read_lock(); + + sk = esp_find_tcp_sk(x); + err = PTR_ERR_OR_ZERO(sk); + if (err) + goto out; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + err = espintcp_queue_out(sk, skb); + else + err = espintcp_push_skb(sk, skb); + bh_unlock_sock(sk); + +out: + rcu_read_unlock(); + return err; +} + +static int esp_output_tcp_encap_cb(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + + return esp_output_tcp_finish(x, skb); +} + +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + + local_bh_disable(); + err = xfrm_trans_queue_net(xs_net(x), skb, esp_output_tcp_encap_cb); + local_bh_enable(); + + /* EINPROGRESS just happens to do the right thing. It + * actually means that the skb has been consumed and + * isn't coming back. + */ + return err ?: -EINPROGRESS; +} +#else +static int esp_output_tail_tcp(struct xfrm_state *x, struct sk_buff *skb) +{ + kfree_skb(skb); + + return -EOPNOTSUPP; +} +#endif + static void esp_output_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; @@ -147,7 +275,11 @@ static void esp_output_done(struct crypto_async_request *base, int err) secpath_reset(skb); xfrm_dev_resume(skb); } else { - xfrm_output_resume(skb, err); + if (!err && + x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) + esp_output_tail_tcp(x, skb); + else + xfrm_output_resume(skb, err); } } @@ -225,45 +357,100 @@ static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) tail[plen - 1] = proto; } -static int esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) +static struct ip_esp_hdr *esp_output_udp_encap(struct sk_buff *skb, + int encap_type, + struct esp_info *esp, + __be16 sport, + __be16 dport) { - int encap_type; struct udphdr *uh; __be32 *udpdata32; - __be16 sport, dport; - struct xfrm_encap_tmpl *encap = x->encap; - struct ip_esp_hdr *esph = esp->esph; unsigned int len; - spin_lock_bh(&x->lock); - sport = encap->encap_sport; - dport = encap->encap_dport; - encap_type = encap->encap_type; - spin_unlock_bh(&x->lock); - len = skb->len + esp->tailen - skb_transport_offset(skb); - if (len + sizeof(struct iphdr) >= IP_MAX_MTU) - return -EMSGSIZE; + if (len + sizeof(struct iphdr) > IP_MAX_MTU) + return ERR_PTR(-EMSGSIZE); - uh = (struct udphdr *)esph; + uh = (struct udphdr *)esp->esph; uh->source = sport; uh->dest = dport; uh->len = htons(len); uh->check = 0; + *skb_mac_header(skb) = IPPROTO_UDP; + + if (encap_type == UDP_ENCAP_ESPINUDP_NON_IKE) { + udpdata32 = (__be32 *)(uh + 1); + udpdata32[0] = udpdata32[1] = 0; + return (struct ip_esp_hdr *)(udpdata32 + 2); + } + + return (struct ip_esp_hdr *)(uh + 1); +} + +#ifdef CONFIG_INET_ESPINTCP +static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x, + struct sk_buff *skb, + struct esp_info *esp) +{ + __be16 *lenp = (void *)esp->esph; + struct ip_esp_hdr *esph; + unsigned int len; + struct sock *sk; + + len = skb->len + esp->tailen - skb_transport_offset(skb); + if (len > IP_MAX_MTU) + return ERR_PTR(-EMSGSIZE); + + rcu_read_lock(); + sk = esp_find_tcp_sk(x); + rcu_read_unlock(); + + if (IS_ERR(sk)) + return ERR_CAST(sk); + + *lenp = htons(len); + esph = (struct ip_esp_hdr *)(lenp + 1); + + return esph; +} +#else +static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x, + struct sk_buff *skb, + struct esp_info *esp) +{ + return ERR_PTR(-EOPNOTSUPP); +} +#endif + +static int esp_output_encap(struct xfrm_state *x, struct sk_buff *skb, + struct esp_info *esp) +{ + struct xfrm_encap_tmpl *encap = x->encap; + struct ip_esp_hdr *esph; + __be16 sport, dport; + int encap_type; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + encap_type = encap->encap_type; + spin_unlock_bh(&x->lock); + switch (encap_type) { default: case UDP_ENCAP_ESPINUDP: - esph = (struct ip_esp_hdr *)(uh + 1); - break; case UDP_ENCAP_ESPINUDP_NON_IKE: - udpdata32 = (__be32 *)(uh + 1); - udpdata32[0] = udpdata32[1] = 0; - esph = (struct ip_esp_hdr *)(udpdata32 + 2); + esph = esp_output_udp_encap(skb, encap_type, esp, sport, dport); + break; + case TCP_ENCAP_ESPINTCP: + esph = esp_output_tcp_encap(x, skb, esp); break; } - *skb_mac_header(skb) = IPPROTO_UDP; + if (IS_ERR(esph)) + return PTR_ERR(esph); + esp->esph = esph; return 0; @@ -279,9 +466,9 @@ int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * struct sk_buff *trailer; int tailen = esp->tailen; - /* this is non-NULL only with UDP Encapsulation */ + /* this is non-NULL only with TCP/UDP Encapsulation */ if (x->encap) { - int err = esp_output_udp_encap(x, skb, esp); + int err = esp_output_encap(x, skb, esp); if (err < 0) return err; @@ -474,6 +661,9 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info * if (sg != dsg) esp_ssg_unref(x, tmp); + if (!err && x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP) + err = esp_output_tail_tcp(x, skb); + error_free: kfree(tmp); error: @@ -600,7 +790,23 @@ int esp_input_done2(struct sk_buff *skb, int err) if (x->encap) { struct xfrm_encap_tmpl *encap = x->encap; + struct tcphdr *th = (void *)(skb_network_header(skb) + ihl); struct udphdr *uh = (void *)(skb_network_header(skb) + ihl); + __be16 source; + + switch (x->encap->encap_type) { + case TCP_ENCAP_ESPINTCP: + source = th->source; + break; + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + source = uh->source; + break; + default: + WARN_ON_ONCE(1); + err = -EINVAL; + goto out; + } /* * 1) if the NAT-T peer's IP or port changed then @@ -609,11 +815,11 @@ int esp_input_done2(struct sk_buff *skb, int err) * SRC ports. */ if (iph->saddr != x->props.saddr.a4 || - uh->source != encap->encap_sport) { + source != encap->encap_sport) { xfrm_address_t ipaddr; ipaddr.a4 = iph->saddr; - km_new_mapping(x, &ipaddr, uh->source); + km_new_mapping(x, &ipaddr, source); /* XXX: perhaps add an extra * policy check here, to see @@ -988,6 +1194,14 @@ static int esp_init_state(struct xfrm_state *x) case UDP_ENCAP_ESPINUDP_NON_IKE: x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); break; +#ifdef CONFIG_INET_ESPINTCP + case TCP_ENCAP_ESPINTCP: + /* only the length field, TCP encap is done by + * the socket + */ + x->props.header_len += 2; + break; +#endif } } diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index a68b5e21ec51..c092e9a55790 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -16,6 +16,9 @@ struct fib_alias { u8 fa_slen; u32 tb_id; s16 fa_default; + u8 offload:1, + trap:1, + unused:6; struct rcu_head rcu; }; @@ -35,9 +38,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg, int fib_nh_match(struct fib_config *cfg, struct fib_info *fi, struct netlink_ext_ack *extack); bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi); -int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u32 tb_id, - u8 type, __be32 dst, int dst_len, u8 tos, struct fib_info *fi, - unsigned int); +int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, + struct fib_rt_info *fri, unsigned int flags); void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f1888c683426..a803cdd9400a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -504,6 +504,7 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len, u32 tb_id, const struct nl_info *info, unsigned int nlm_flags) { + struct fib_rt_info fri; struct sk_buff *skb; u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0; int err = -ENOBUFS; @@ -512,9 +513,15 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, if (!skb) goto errout; - err = fib_dump_info(skb, info->portid, seq, event, tb_id, - fa->fa_type, key, dst_len, - fa->fa_tos, fa->fa_info, nlm_flags); + fri.fi = fa->fa_info; + fri.tb_id = tb_id; + fri.dst = key; + fri.dst_len = dst_len; + fri.tos = fa->fa_tos; + fri.type = fa->fa_type; + fri.offload = fa->offload; + fri.trap = fa->trap; + err = fib_dump_info(skb, info->portid, seq, event, &fri, nlm_flags); if (err < 0) { /* -EMSGSIZE implies BUG in fib_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -1725,10 +1732,11 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi) #endif int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, - u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos, - struct fib_info *fi, unsigned int flags) + struct fib_rt_info *fri, unsigned int flags) { - unsigned int nhs = fib_info_num_path(fi); + unsigned int nhs = fib_info_num_path(fri->fi); + struct fib_info *fi = fri->fi; + u32 tb_id = fri->tb_id; struct nlmsghdr *nlh; struct rtmsg *rtm; @@ -1738,22 +1746,22 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, rtm = nlmsg_data(nlh); rtm->rtm_family = AF_INET; - rtm->rtm_dst_len = dst_len; + rtm->rtm_dst_len = fri->dst_len; rtm->rtm_src_len = 0; - rtm->rtm_tos = tos; + rtm->rtm_tos = fri->tos; if (tb_id < 256) rtm->rtm_table = tb_id; else rtm->rtm_table = RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, tb_id)) goto nla_put_failure; - rtm->rtm_type = type; + rtm->rtm_type = fri->type; rtm->rtm_flags = fi->fib_flags; rtm->rtm_scope = fi->fib_scope; rtm->rtm_protocol = fi->fib_protocol; if (rtm->rtm_dst_len && - nla_put_in_addr(skb, RTA_DST, dst)) + nla_put_in_addr(skb, RTA_DST, fri->dst)) goto nla_put_failure; if (fi->fib_priority && nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority)) @@ -1795,6 +1803,11 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, goto nla_put_failure; } + if (fri->offload) + rtm->rtm_flags |= RTM_F_OFFLOAD; + if (fri->trap) + rtm->rtm_flags |= RTM_F_TRAP; + nlmsg_end(skb, nlh); return 0; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 195469a13371..ff0c24371e33 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -980,9 +980,12 @@ static struct key_vector *fib_find_node(struct trie *t, /* Return the first fib alias matching TOS with * priority less than or equal to PRIO. + * If 'find_first' is set, return the first matching + * fib alias, regardless of TOS and priority. */ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, - u8 tos, u32 prio, u32 tb_id) + u8 tos, u32 prio, u32 tb_id, + bool find_first) { struct fib_alias *fa; @@ -998,6 +1001,8 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, continue; if (fa->tb_id != tb_id) break; + if (find_first) + return fa; if (fa->fa_tos > tos) continue; if (fa->fa_info->fib_priority >= prio || fa->fa_tos < tos) @@ -1007,6 +1012,52 @@ static struct fib_alias *fib_find_alias(struct hlist_head *fah, u8 slen, return NULL; } +static struct fib_alias * +fib_find_matching_alias(struct net *net, const struct fib_rt_info *fri) +{ + u8 slen = KEYLENGTH - fri->dst_len; + struct key_vector *l, *tp; + struct fib_table *tb; + struct fib_alias *fa; + struct trie *t; + + tb = fib_get_table(net, fri->tb_id); + if (!tb) + return NULL; + + t = (struct trie *)tb->tb_data; + l = fib_find_node(t, &tp, be32_to_cpu(fri->dst)); + if (!l) + return NULL; + + hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { + if (fa->fa_slen == slen && fa->tb_id == fri->tb_id && + fa->fa_tos == fri->tos && fa->fa_info == fri->fi && + fa->fa_type == fri->type) + return fa; + } + + return NULL; +} + +void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri) +{ + struct fib_alias *fa_match; + + rcu_read_lock(); + + fa_match = fib_find_matching_alias(net, fri); + if (!fa_match) + goto out; + + fa_match->offload = fri->offload; + fa_match->trap = fri->trap; + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(fib_alias_hw_flags_set); + static void trie_rebalance(struct trie *t, struct key_vector *tn) { while (!IS_TRIE(tn)) @@ -1063,9 +1114,6 @@ noleaf: return -ENOMEM; } -/* fib notifier for ADD is sent before calling fib_insert_alias with - * the expectation that the only possible failure ENOMEM - */ static int fib_insert_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *new, struct fib_alias *fa, t_key key) @@ -1118,11 +1166,13 @@ static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) return true; } +static void fib_remove_alias(struct trie *t, struct key_vector *tp, + struct key_vector *l, struct fib_alias *old); + /* Caller must hold RTNL. */ int fib_table_insert(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) { - enum fib_event_type event = FIB_EVENT_ENTRY_ADD; struct trie *t = (struct trie *)tb->tb_data; struct fib_alias *fa, *new_fa; struct key_vector *l, *tp; @@ -1149,7 +1199,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, l = fib_find_node(t, &tp, key); fa = l ? fib_find_alias(&l->leaf, slen, tos, fi->fib_priority, - tb->tb_id) : NULL; + tb->tb_id, false) : NULL; /* Now fa, if non-NULL, points to the first fib alias * with the same keys [prefix,tos,priority], if such key already @@ -1216,19 +1266,29 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_slen = fa->fa_slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; + new_fa->offload = 0; + new_fa->trap = 0; - err = call_fib_entry_notifiers(net, - FIB_EVENT_ENTRY_REPLACE, - key, plen, new_fa, - extack); - if (err) - goto out_free_new_fa; + hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); + + if (fib_find_alias(&l->leaf, fa->fa_slen, 0, 0, + tb->tb_id, true) == new_fa) { + enum fib_event_type fib_event; + + fib_event = FIB_EVENT_ENTRY_REPLACE; + err = call_fib_entry_notifiers(net, fib_event, + key, plen, + new_fa, extack); + if (err) { + hlist_replace_rcu(&new_fa->fa_list, + &fa->fa_list); + goto out_free_new_fa; + } + } rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, &cfg->fc_nlinfo, nlflags); - hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); - alias_free_mem_rcu(fa); fib_release_info(fi_drop); @@ -1244,12 +1304,10 @@ int fib_table_insert(struct net *net, struct fib_table *tb, if (fa_match) goto out; - if (cfg->fc_nlflags & NLM_F_APPEND) { - event = FIB_EVENT_ENTRY_APPEND; + if (cfg->fc_nlflags & NLM_F_APPEND) nlflags |= NLM_F_APPEND; - } else { + else fa = fa_first; - } } err = -ENOENT; if (!(cfg->fc_nlflags & NLM_F_CREATE)) @@ -1268,15 +1326,29 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_slen = slen; new_fa->tb_id = tb->tb_id; new_fa->fa_default = -1; - - err = call_fib_entry_notifiers(net, event, key, plen, new_fa, extack); - if (err) - goto out_free_new_fa; + new_fa->offload = 0; + new_fa->trap = 0; /* Insert new entry to the list. */ err = fib_insert_alias(t, tp, l, new_fa, fa, key); if (err) - goto out_fib_notif; + goto out_free_new_fa; + + /* The alias was already inserted, so the node must exist. */ + l = l ? l : fib_find_node(t, &tp, key); + if (WARN_ON_ONCE(!l)) + goto out_free_new_fa; + + if (fib_find_alias(&l->leaf, new_fa->fa_slen, 0, 0, tb->tb_id, true) == + new_fa) { + enum fib_event_type fib_event; + + fib_event = FIB_EVENT_ENTRY_REPLACE; + err = call_fib_entry_notifiers(net, fib_event, key, plen, + new_fa, extack); + if (err) + goto out_remove_new_fa; + } if (!plen) tb->tb_num_default++; @@ -1287,14 +1359,8 @@ int fib_table_insert(struct net *net, struct fib_table *tb, succeeded: return 0; -out_fib_notif: - /* notifier was sent that entry would be added to trie, but - * the add failed and need to recover. Only failure for - * fib_insert_alias is ENOMEM. - */ - NL_SET_ERR_MSG(extack, "Failed to insert route into trie"); - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, - plen, new_fa, NULL); +out_remove_new_fa: + fib_remove_alias(t, tp, l, new_fa); out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: @@ -1545,6 +1611,36 @@ static void fib_remove_alias(struct trie *t, struct key_vector *tp, node_pull_suffix(tp, fa->fa_slen); } +static void fib_notify_alias_delete(struct net *net, u32 key, + struct hlist_head *fah, + struct fib_alias *fa_to_delete, + struct netlink_ext_ack *extack) +{ + struct fib_alias *fa_next, *fa_to_notify; + u32 tb_id = fa_to_delete->tb_id; + u8 slen = fa_to_delete->fa_slen; + enum fib_event_type fib_event; + + /* Do not notify if we do not care about the route. */ + if (fib_find_alias(fah, slen, 0, 0, tb_id, true) != fa_to_delete) + return; + + /* Determine if the route should be replaced by the next route in the + * list. + */ + fa_next = hlist_entry_safe(fa_to_delete->fa_list.next, + struct fib_alias, fa_list); + if (fa_next && fa_next->fa_slen == slen && fa_next->tb_id == tb_id) { + fib_event = FIB_EVENT_ENTRY_REPLACE; + fa_to_notify = fa_next; + } else { + fib_event = FIB_EVENT_ENTRY_DEL; + fa_to_notify = fa_to_delete; + } + call_fib_entry_notifiers(net, fib_event, key, KEYLENGTH - slen, + fa_to_notify, extack); +} + /* Caller must hold RTNL. */ int fib_table_delete(struct net *net, struct fib_table *tb, struct fib_config *cfg, struct netlink_ext_ack *extack) @@ -1566,7 +1662,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, if (!l) return -ESRCH; - fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id); + fa = fib_find_alias(&l->leaf, slen, tos, 0, tb->tb_id, false); if (!fa) return -ESRCH; @@ -1598,8 +1694,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, if (!fa_to_delete) return -ESRCH; - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, key, plen, - fa_to_delete, extack); + fib_notify_alias_delete(net, key, &l->leaf, fa_to_delete, extack); rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, &cfg->fc_nlinfo, 0); @@ -1923,10 +2018,8 @@ int fib_table_flush(struct net *net, struct fib_table *tb, bool flush_all) continue; } - call_fib_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, - n->key, - KEYLENGTH - fa->fa_slen, fa, - NULL); + fib_notify_alias_delete(net, n->key, &n->leaf, fa, + NULL); hlist_del_rcu(&fa->fa_list); fib_release_info(fa->fa_info); alias_free_mem_rcu(fa); @@ -2022,6 +2115,7 @@ static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, struct netlink_ext_ack *extack) { struct fib_alias *fa; + int last_slen = -1; int err; hlist_for_each_entry_rcu(fa, &l->leaf, fa_list) { @@ -2036,8 +2130,12 @@ static int fib_leaf_notify(struct key_vector *l, struct fib_table *tb, if (tb->tb_id != fa->tb_id) continue; - err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_ADD, l->key, - KEYLENGTH - fa->fa_slen, + if (fa->fa_slen == last_slen) + continue; + + last_slen = fa->fa_slen; + err = call_fib_entry_notifier(nb, FIB_EVENT_ENTRY_REPLACE, + l->key, KEYLENGTH - fa->fa_slen, fa, extack); if (err) return err; @@ -2146,14 +2244,20 @@ static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, if (filter->dump_routes) { if (!s_fa) { + struct fib_rt_info fri; + + fri.fi = fi; + fri.tb_id = tb->tb_id; + fri.dst = xkey; + fri.dst_len = KEYLENGTH - fa->fa_slen; + fri.tos = fa->fa_tos; + fri.type = fa->fa_type; + fri.offload = fa->offload; + fri.trap = fa->trap; err = fib_dump_info(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - RTM_NEWROUTE, - tb->tb_id, fa->fa_type, - xkey, - KEYLENGTH - fa->fa_slen, - fa->fa_tos, fi, flags); + RTM_NEWROUTE, &fri, flags); if (err < 0) goto stop; } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 4de7e962d3da..2e6d1b7a7bc9 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -174,7 +174,7 @@ static struct sk_buff *gre_gro_receive(struct list_head *head, if (skb_gro_checksum_simple_validate(skb)) goto out_unlock; - skb_gro_checksum_try_convert(skb, IPPROTO_GRE, 0, + skb_gro_checksum_try_convert(skb, IPPROTO_GRE, null_compute_pseudo); } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 18c0d5bffe12..a4db79b1b643 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -610,12 +610,6 @@ no_route: } EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); -#if IS_ENABLED(CONFIG_IPV6) -#define AF_INET_FAMILY(fam) ((fam) == AF_INET) -#else -#define AF_INET_FAMILY(fam) true -#endif - /* Decide when to expire the request and when to resend SYN-ACK */ static inline void syn_ack_recalc(struct request_sock *req, const int thresh, const int max_retries, @@ -770,6 +764,18 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, } EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); +static void inet_clone_ulp(const struct request_sock *req, struct sock *newsk, + const gfp_t priority) +{ + struct inet_connection_sock *icsk = inet_csk(newsk); + + if (!icsk->icsk_ulp_ops) + return; + + if (icsk->icsk_ulp_ops->clone) + icsk->icsk_ulp_ops->clone(req, newsk, priority); +} + /** * inet_csk_clone_lock - clone an inet socket, and lock its clone * @sk: the socket to clone @@ -810,6 +816,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, /* Deinitialize accept_queue to trap illegal accesses. */ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); + inet_clone_ulp(req, newsk, priority); + security_inet_csk_clone(newsk, req); } return newsk; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 14db1e0b8a6e..d84819893db9 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -240,8 +240,8 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s static int ip_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu) { + struct sk_buff *segs, *nskb; netdev_features_t features; - struct sk_buff *segs; int ret = 0; /* common case: seglen is <= mtu @@ -272,8 +272,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, consume_skb(skb); - do { - struct sk_buff *nskb = segs->next; + skb_list_walk_safe(segs, segs, nskb) { int err; skb_mark_not_on_list(segs); @@ -281,8 +280,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, if (err && ret == 0) ret = err; - segs = nskb; - } while (segs); + } return ret; } diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 511eaa94e2d1..d072c326dd64 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -321,7 +321,9 @@ static size_t nh_nlmsg_size_single(struct nexthop *nh) static size_t nh_nlmsg_size(struct nexthop *nh) { - size_t sz = nla_total_size(4); /* NHA_ID */ + size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg)); + + sz += nla_total_size(4); /* NHA_ID */ if (nh->is_group) sz += nh_nlmsg_size_grp(nh); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index cc90243ccf76..2580303249e2 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -289,6 +289,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPRcvQDrop", LINUX_MIB_TCPRCVQDROP), SNMP_MIB_ITEM("TCPWqueueTooBig", LINUX_MIB_TCPWQUEUETOOBIG), SNMP_MIB_ITEM("TCPFastOpenPassiveAltKey", LINUX_MIB_TCPFASTOPENPASSIVEALTKEY), + SNMP_MIB_ITEM("TcpTimeoutRehash", LINUX_MIB_TCPTIMEOUTREHASH), + SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e356ea779227..d5c57b3f77d5 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3224,16 +3224,41 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, skb_reset_mac_header(skb); if (rtm->rtm_flags & RTM_F_FIB_MATCH) { + struct fib_rt_info fri; + if (!res.fi) { err = fib_props[res.type].error; if (!err) err = -EHOSTUNREACH; goto errout_rcu; } + fri.fi = res.fi; + fri.tb_id = table_id; + fri.dst = res.prefix; + fri.dst_len = res.prefixlen; + fri.tos = fl4.flowi4_tos; + fri.type = rt->rt_type; + fri.offload = 0; + fri.trap = 0; + if (res.fa_head) { + struct fib_alias *fa; + + hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) { + u8 slen = 32 - fri.dst_len; + + if (fa->fa_slen == slen && + fa->tb_id == fri.tb_id && + fa->fa_tos == fri.tos && + fa->fa_info == res.fi && + fa->fa_type == fri.type) { + fri.offload = fa->offload; + fri.trap = fa->trap; + break; + } + } + } err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, - nlh->nlmsg_seq, RTM_NEWROUTE, table_id, - rt->rt_type, res.prefix, res.prefixlen, - fl4.flowi4_tos, res.fi, 0); + nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0); } else { err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb, NETLINK_CB(in_skb).portid, diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index fcb2cd167f64..9684af02e0a5 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1193,6 +1193,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "tcp_no_ssthresh_metrics_save", + .data = &init_net.ipv4.sysctl_tcp_no_ssthresh_metrics_save, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { .procname = "tcp_moderate_rcvbuf", .data = &init_net.ipv4.sysctl_tcp_moderate_rcvbuf, .maxlen = sizeof(int), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a7d766e6390e..484485ae74c2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -271,6 +271,7 @@ #include <net/icmp.h> #include <net/inet_common.h> #include <net/tcp.h> +#include <net/mptcp.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/sock.h> @@ -443,8 +444,6 @@ void tcp_init_sock(struct sock *sk) tp->tsoffset = 0; tp->rack.reo_wnd_steps = 1; - sk->sk_state = TCP_CLOSE; - sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); @@ -692,8 +691,8 @@ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, refcount_read(&sk->sk_wmem_alloc) > skb->truesize; } -static void tcp_push(struct sock *sk, int flags, int mss_now, - int nonagle, int size_goal) +void tcp_push(struct sock *sk, int flags, int mss_now, + int nonagle, int size_goal) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -927,7 +926,7 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, return max(size_goal, mss_now); } -static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) +int tcp_send_mss(struct sock *sk, int *size_goal, int flags) { int mss_now; @@ -1778,6 +1777,8 @@ static int tcp_zerocopy_receive(struct sock *sk, while (length + PAGE_SIZE <= zc->length) { if (zc->recv_skip_hint < PAGE_SIZE) { if (skb) { + if (zc->recv_skip_hint > 0) + break; skb = skb->next; offset = seq - TCP_SKB_CB(skb)->seq; } else { @@ -3336,6 +3337,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ + nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ 0; } @@ -3390,6 +3392,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); + nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); return stats; } @@ -4021,4 +4024,5 @@ void __init tcp_init(void) tcp_metrics_init(); BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); tcp_tasklet_init(); + mptcp_init(); } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 3737ec096650..3172e31987be 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -21,7 +21,7 @@ static DEFINE_SPINLOCK(tcp_cong_list_lock); static LIST_HEAD(tcp_cong_list); /* Simple linear search, don't expect many entries! */ -static struct tcp_congestion_ops *tcp_ca_find(const char *name) +struct tcp_congestion_ops *tcp_ca_find(const char *name) { struct tcp_congestion_ops *e; @@ -162,7 +162,7 @@ void tcp_assign_congestion_control(struct sock *sk) rcu_read_lock(); ca = rcu_dereference(net->ipv4.tcp_congestion_control); - if (unlikely(!try_module_get(ca->owner))) + if (unlikely(!bpf_try_module_get(ca, ca->owner))) ca = &tcp_reno; icsk->icsk_ca_ops = ca; rcu_read_unlock(); @@ -208,7 +208,7 @@ void tcp_cleanup_congestion_control(struct sock *sk) if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); - module_put(icsk->icsk_ca_ops->owner); + bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); } /* Used by sysctl to change default congestion control */ @@ -222,12 +222,12 @@ int tcp_set_default_congestion_control(struct net *net, const char *name) ca = tcp_ca_find_autoload(net, name); if (!ca) { ret = -ENOENT; - } else if (!try_module_get(ca->owner)) { + } else if (!bpf_try_module_get(ca, ca->owner)) { ret = -EBUSY; } else { prev = xchg(&net->ipv4.tcp_congestion_control, ca); if (prev) - module_put(prev->owner); + bpf_module_put(prev, prev->owner); ca->flags |= TCP_CONG_NON_RESTRICTED; ret = 0; @@ -366,19 +366,19 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, } else if (!load) { const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops; - if (try_module_get(ca->owner)) { + if (bpf_try_module_get(ca, ca->owner)) { if (reinit) { tcp_reinit_congestion_control(sk, ca); } else { icsk->icsk_ca_ops = ca; - module_put(old_ca->owner); + bpf_module_put(old_ca, old_ca->owner); } } else { err = -EBUSY; } } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || cap_net_admin)) { err = -EPERM; - } else if (!try_module_get(ca->owner)) { + } else if (!bpf_try_module_get(ca, ca->owner)) { err = -EBUSY; } else { tcp_reinit_congestion_control(sk, ca); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 1b3d032a4df2..8f8eefd3a3ce 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -40,8 +40,8 @@ /* Number of delay samples for detecting the increase of delay */ #define HYSTART_MIN_SAMPLES 8 -#define HYSTART_DELAY_MIN (4U<<3) -#define HYSTART_DELAY_MAX (16U<<3) +#define HYSTART_DELAY_MIN (4000U) /* 4 ms */ +#define HYSTART_DELAY_MAX (16000U) /* 16 ms */ #define HYSTART_DELAY_THRESH(x) clamp(x, HYSTART_DELAY_MIN, HYSTART_DELAY_MAX) static int fast_convergence __read_mostly = 1; @@ -53,7 +53,7 @@ static int tcp_friendliness __read_mostly = 1; static int hystart __read_mostly = 1; static int hystart_detect __read_mostly = HYSTART_ACK_TRAIN | HYSTART_DELAY; static int hystart_low_window __read_mostly = 16; -static int hystart_ack_delta __read_mostly = 2; +static int hystart_ack_delta_us __read_mostly = 2000; static u32 cube_rtt_scale __read_mostly; static u32 beta_scale __read_mostly; @@ -77,8 +77,8 @@ MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms" " 1: packet-train 2: delay 3: both packet-train and delay"); module_param(hystart_low_window, int, 0644); MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); -module_param(hystart_ack_delta, int, 0644); -MODULE_PARM_DESC(hystart_ack_delta, "spacing between ack's indicating train (msecs)"); +module_param(hystart_ack_delta_us, int, 0644); +MODULE_PARM_DESC(hystart_ack_delta_us, "spacing between ack's indicating train (usecs)"); /* BIC TCP Parameters */ struct bictcp { @@ -89,7 +89,7 @@ struct bictcp { u32 bic_origin_point;/* origin point of bic function */ u32 bic_K; /* time to origin point from the beginning of the current epoch */ - u32 delay_min; /* min delay (msec << 3) */ + u32 delay_min; /* min delay (usec) */ u32 epoch_start; /* beginning of an epoch */ u32 ack_cnt; /* number of acks */ u32 tcp_cwnd; /* estimated tcp cwnd */ @@ -117,13 +117,9 @@ static inline void bictcp_reset(struct bictcp *ca) ca->found = 0; } -static inline u32 bictcp_clock(void) +static inline u32 bictcp_clock_us(const struct sock *sk) { -#if HZ < 1000 - return ktime_to_ms(ktime_get_real()); -#else - return jiffies_to_msecs(jiffies); -#endif + return tcp_sk(sk)->tcp_mstamp; } static inline void bictcp_hystart_reset(struct sock *sk) @@ -131,9 +127,9 @@ static inline void bictcp_hystart_reset(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - ca->round_start = ca->last_ack = bictcp_clock(); + ca->round_start = ca->last_ack = bictcp_clock_us(sk); ca->end_seq = tp->snd_nxt; - ca->curr_rtt = 0; + ca->curr_rtt = ~0U; ca->sample_cnt = 0; } @@ -276,7 +272,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd, u32 acked) */ t = (s32)(tcp_jiffies32 - ca->epoch_start); - t += msecs_to_jiffies(ca->delay_min >> 3); + t += usecs_to_jiffies(ca->delay_min); /* change the unit from HZ to bictcp_HZ */ t <<= BICTCP_HZ; do_div(t, HZ); @@ -376,22 +372,54 @@ static void bictcp_state(struct sock *sk, u8 new_state) } } +/* Account for TSO/GRO delays. + * Otherwise short RTT flows could get too small ssthresh, since during + * slow start we begin with small TSO packets and ca->delay_min would + * not account for long aggregation delay when TSO packets get bigger. + * Ideally even with a very small RTT we would like to have at least one + * TSO packet being sent and received by GRO, and another one in qdisc layer. + * We apply another 100% factor because @rate is doubled at this point. + * We cap the cushion to 1ms. + */ +static u32 hystart_ack_delay(struct sock *sk) +{ + unsigned long rate; + + rate = READ_ONCE(sk->sk_pacing_rate); + if (!rate) + return 0; + return min_t(u64, USEC_PER_MSEC, + div64_ul((u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate)); +} + static void hystart_update(struct sock *sk, u32 delay) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - - if (ca->found & hystart_detect) - return; + u32 threshold; if (hystart_detect & HYSTART_ACK_TRAIN) { - u32 now = bictcp_clock(); + u32 now = bictcp_clock_us(sk); /* first detection parameter - ack-train detection */ - if ((s32)(now - ca->last_ack) <= hystart_ack_delta) { + if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) { ca->last_ack = now; - if ((s32)(now - ca->round_start) > ca->delay_min >> 4) { - ca->found |= HYSTART_ACK_TRAIN; + + threshold = ca->delay_min + hystart_ack_delay(sk); + + /* Hystart ack train triggers if we get ack past + * ca->delay_min/2. + * Pacing might have delayed packets up to RTT/2 + * during slow start. + */ + if (sk->sk_pacing_status == SK_PACING_NONE) + threshold >>= 1; + + if ((s32)(now - ca->round_start) > threshold) { + ca->found = 1; + pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n", + now - ca->round_start, threshold, + ca->delay_min, hystart_ack_delay(sk), tp->snd_cwnd); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTTRAINDETECT); NET_ADD_STATS(sock_net(sk), @@ -405,14 +433,14 @@ static void hystart_update(struct sock *sk, u32 delay) if (hystart_detect & HYSTART_DELAY) { /* obtain the minimum delay of more than sampling packets */ if (ca->sample_cnt < HYSTART_MIN_SAMPLES) { - if (ca->curr_rtt == 0 || ca->curr_rtt > delay) + if (ca->curr_rtt > delay) ca->curr_rtt = delay; ca->sample_cnt++; } else { if (ca->curr_rtt > ca->delay_min + HYSTART_DELAY_THRESH(ca->delay_min >> 3)) { - ca->found |= HYSTART_DELAY; + ca->found = 1; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTDELAYDETECT); NET_ADD_STATS(sock_net(sk), @@ -424,9 +452,6 @@ static void hystart_update(struct sock *sk, u32 delay) } } -/* Track delayed acknowledgment ratio using sliding window - * ratio = (15*ratio + sample) / 16 - */ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) { const struct tcp_sock *tp = tcp_sk(sk); @@ -441,7 +466,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) if (ca->epoch_start && (s32)(tcp_jiffies32 - ca->epoch_start) < HZ) return; - delay = (sample->rtt_us << 3) / USEC_PER_MSEC; + delay = sample->rtt_us; if (delay == 0) delay = 1; @@ -450,7 +475,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) ca->delay_min = delay; /* hystart triggers when cwnd is larger than some threshold */ - if (hystart && tcp_in_slow_start(tp) && + if (!ca->found && tcp_in_slow_start(tp) && hystart && tp->snd_cwnd >= hystart_low_window) hystart_update(sk, delay); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2a976f57f7e7..e8b840a4767e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -79,6 +79,7 @@ #include <trace/events/tcp.h> #include <linux/jump_label_ratelimit.h> #include <net/busy_poll.h> +#include <net/mptcp.h> int sysctl_tcp_max_orphans __read_mostly = NR_FILE; @@ -1423,7 +1424,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, if ((TCP_SKB_CB(prev)->sacked & TCPCB_TAGBITS) != TCPCB_SACKED_ACKED) goto fallback; - if (!tcp_skb_can_collapse_to(prev)) + if (!tcp_skb_can_collapse(prev, skb)) goto fallback; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && @@ -3555,7 +3556,7 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) if (rexmit == REXMIT_NONE || sk->sk_state == TCP_SYN_SENT) return; - if (unlikely(rexmit == 2)) { + if (unlikely(rexmit == REXMIT_NEW)) { __tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF); if (after(tp->snd_nxt, tp->high_seq)) @@ -3925,6 +3926,10 @@ void tcp_parse_options(const struct net *net, */ break; #endif + case TCPOPT_MPTCP: + mptcp_parse_option(skb, ptr, opsize, opt_rx); + break; + case TCPOPT_FASTOPEN: tcp_parse_fastopen_option( opsize - TCPOLEN_FASTOPEN_BASE, @@ -4266,8 +4271,10 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) * The receiver remembers and reflects via DSACKs. Leverage the * DSACK state and change the txhash to re-route speculatively. */ - if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) + if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) { sk_rethink_txhash(sk); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); + } } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) @@ -4425,6 +4432,9 @@ static bool tcp_try_coalesce(struct sock *sk, if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) return false; + if (!mptcp_skb_can_collapse(to, from)) + return false; + #ifdef CONFIG_TLS_DEVICE if (from->decrypted != to->decrypted) return false; @@ -4763,6 +4773,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) bool fragstolen; int eaten; + if (sk_is_mptcp(sk)) + mptcp_incoming_options(sk, skb, &tp->rx_opt); + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { __kfree_skb(skb); return; @@ -4934,7 +4947,7 @@ restart: /* The first skb to collapse is: * - not SYN/FIN and * - bloated or contains data before "start" or - * overlaps to the next one. + * overlaps to the next one and mptcp allow collapsing. */ if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && (tcp_win_from_space(sk, skb->truesize) > skb->len || @@ -4943,7 +4956,7 @@ restart: break; } - if (n && n != tail && + if (n && n != tail && mptcp_skb_can_collapse(skb, n) && TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) { end_of_skbs = false; break; @@ -4976,6 +4989,7 @@ restart: else __skb_queue_tail(&tmp, nskb); /* defer rbtree insertion */ skb_set_owner_r(nskb, sk); + mptcp_skb_ext_move(nskb, skb); /* Copy data, releasing collapsed skbs. */ while (copy > 0) { @@ -4995,6 +5009,7 @@ restart: skb = tcp_collapse_one(sk, skb, list, root); if (!skb || skb == tail || + !mptcp_skb_can_collapse(nskb, skb) || (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) goto end; #ifdef CONFIG_TLS_DEVICE @@ -5969,6 +5984,9 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); + if (sk_is_mptcp(sk)) + mptcp_rcv_synsent(sk); + /* Remember, tcp_poll() does not lock socket! * Change state from SYN-SENT only after copied_seq * is initialized. */ @@ -6334,8 +6352,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_CLOSE_WAIT: case TCP_CLOSING: case TCP_LAST_ACK: - if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + if (sk_is_mptcp(sk)) + mptcp_incoming_options(sk, skb, &tp->rx_opt); break; + } /* fall through */ case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: @@ -6591,6 +6612,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_rsk(req)->af_specific = af_ops; tcp_rsk(req)->ts_off = 0; +#if IS_ENABLED(CONFIG_MPTCP) + tcp_rsk(req)->is_mptcp = 0; +#endif tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = af_ops->mss_clamp; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1c7326e04f9b..df1166b76126 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -701,9 +701,21 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) rcu_read_lock(); hash_location = tcp_parse_md5sig_option(th); if (sk && sk_fullsock(sk)) { - key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *) - &ip_hdr(skb)->saddr, AF_INET); + const union tcp_md5_addr *addr; + int l3index; + + /* sdif set, means packet ingressed via a device + * in an L3 domain and inet_iif is set to it. + */ + l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; + addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; + key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); } else if (hash_location) { + const union tcp_md5_addr *addr; + int sdif = tcp_v4_sdif(skb); + int dif = inet_iif(skb); + int l3index; + /* * active side is lost. Try to find listening socket through * source port, and then find md5 key through listening socket. @@ -714,14 +726,17 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, - ntohs(th->source), inet_iif(skb), - tcp_v4_sdif(skb)); + ntohs(th->source), dif, sdif); /* don't send rst if it can't find key */ if (!sk1) goto out; - key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *) - &ip_hdr(skb)->saddr, AF_INET); + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to it. + */ + l3index = sdif ? dif : 0; + addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; + key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET); if (!key) goto out; @@ -905,6 +920,9 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { + const union tcp_md5_addr *addr; + int l3index; + /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ @@ -916,14 +934,15 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, * exception of <SYN> segments, MUST be right-shifted by * Rcv.Wind.Shift bits: */ + addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr; + l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0; tcp_v4_send_ack(sk, skb, seq, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, req->ts_recent, 0, - tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr, - AF_INET), + tcp_md5_do_lookup(sk, l3index, addr, AF_INET), inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, ip_hdr(skb)->tos); } @@ -983,7 +1002,7 @@ DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); EXPORT_SYMBOL(tcp_md5_needed); /* Find the Key structure for an address. */ -struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, +struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { @@ -1003,7 +1022,8 @@ struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, hlist_for_each_entry_rcu(key, &md5sig->head, node) { if (key->family != family) continue; - + if (key->l3index && key->l3index != l3index) + continue; if (family == AF_INET) { mask = inet_make_mask(key->prefixlen); match = (key->addr.a4.s_addr & mask) == @@ -1027,7 +1047,8 @@ EXPORT_SYMBOL(__tcp_md5_do_lookup); static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen) + int family, u8 prefixlen, + int l3index) { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; @@ -1046,6 +1067,8 @@ static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk, hlist_for_each_entry_rcu(key, &md5sig->head, node) { if (key->family != family) continue; + if (key->l3index && key->l3index != l3index) + continue; if (!memcmp(&key->addr, addr, size) && key->prefixlen == prefixlen) return key; @@ -1057,23 +1080,26 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { const union tcp_md5_addr *addr; + int l3index; + l3index = l3mdev_master_ifindex_by_index(sock_net(sk), + addr_sk->sk_bound_dev_if); addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; - return tcp_md5_do_lookup(sk, addr, AF_INET); + return tcp_md5_do_lookup(sk, l3index, addr, AF_INET); } EXPORT_SYMBOL(tcp_v4_md5_lookup); /* This can be called on a newly created socket, from other files */ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen, const u8 *newkey, u8 newkeylen, - gfp_t gfp) + int family, u8 prefixlen, int l3index, + const u8 *newkey, u8 newkeylen, gfp_t gfp) { /* Add Key to the list */ struct tcp_md5sig_key *key; struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *md5sig; - key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); + key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); if (key) { /* Pre-existing entry - just update that one. */ memcpy(key->key, newkey, newkeylen); @@ -1105,6 +1131,7 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, key->keylen = newkeylen; key->family = family; key->prefixlen = prefixlen; + key->l3index = l3index; memcpy(&key->addr, addr, (family == AF_INET6) ? sizeof(struct in6_addr) : sizeof(struct in_addr)); @@ -1114,11 +1141,11 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, EXPORT_SYMBOL(tcp_md5_do_add); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, - u8 prefixlen) + u8 prefixlen, int l3index) { struct tcp_md5sig_key *key; - key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen); + key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index); if (!key) return -ENOENT; hlist_del_rcu(&key->node); @@ -1149,7 +1176,9 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, { struct tcp_md5sig cmd; struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr; + const union tcp_md5_addr *addr; u8 prefixlen = 32; + int l3index = 0; if (optlen < sizeof(cmd)) return -EINVAL; @@ -1167,16 +1196,34 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, return -EINVAL; } + if (optname == TCP_MD5SIG_EXT && + cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); + if (dev && netif_is_l3_master(dev)) + l3index = dev->ifindex; + + rcu_read_unlock(); + + /* ok to reference set/not set outside of rcu; + * right now device MUST be an L3 master + */ + if (!dev || !l3index) + return -EINVAL; + } + + addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr; + if (!cmd.tcpm_keylen) - return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, - AF_INET, prefixlen); + return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index); if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; - return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr, - AF_INET, prefixlen, cmd.tcpm_key, cmd.tcpm_keylen, - GFP_KERNEL); + return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, + cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); } static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, @@ -1286,7 +1333,8 @@ EXPORT_SYMBOL(tcp_v4_md5_hash_skb); /* Called with rcu_read_lock() */ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, - const struct sk_buff *skb) + const struct sk_buff *skb, + int dif, int sdif) { #ifdef CONFIG_TCP_MD5SIG /* @@ -1301,11 +1349,17 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, struct tcp_md5sig_key *hash_expected; const struct iphdr *iph = ip_hdr(skb); const struct tcphdr *th = tcp_hdr(skb); - int genhash; + const union tcp_md5_addr *addr; unsigned char newhash[16]; + int genhash, l3index; - hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr, - AF_INET); + /* sdif set, means packet ingressed via a device + * in an L3 domain and dif is set to the l3mdev + */ + l3index = sdif ? dif : 0; + + addr = (union tcp_md5_addr *)&iph->saddr; + hash_expected = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); hash_location = tcp_parse_md5sig_option(th); /* We've parsed the options - do we have a hash? */ @@ -1331,11 +1385,11 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk, if (genhash || memcmp(hash_location, newhash, 16) != 0) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); - net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", + net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s L3 index %d\n", &iph->saddr, ntohs(th->source), &iph->daddr, ntohs(th->dest), genhash ? " tcp_v4_calc_md5_hash failed" - : ""); + : "", l3index); return true; } return false; @@ -1372,7 +1426,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { .syn_ack_timeout = tcp_syn_ack_timeout, }; -static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { +const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .mss_clamp = TCP_MSS_DEFAULT, #ifdef CONFIG_TCP_MD5SIG .req_md5_lookup = tcp_v4_md5_lookup, @@ -1419,7 +1473,9 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG + const union tcp_md5_addr *addr; struct tcp_md5sig_key *key; + int l3index; #endif struct ip_options_rcu *inet_opt; @@ -1467,9 +1523,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, tcp_initialize_rcv_mss(newsk); #ifdef CONFIG_TCP_MD5SIG + l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); /* Copy over the MD5 key from the original socket */ - key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr, - AF_INET); + addr = (union tcp_md5_addr *)&newinet->inet_daddr; + key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET); if (key) { /* * We're using one, so create a matching key @@ -1477,8 +1534,8 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, * memory, then we end up not copying the key * across. Shucks. */ - tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr, - AF_INET, 32, key->key, key->keylen, GFP_ATOMIC); + tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, + key->key, key->keylen, GFP_ATOMIC); sk_nocaps_add(newsk, NETIF_F_GSO_MASK); } #endif @@ -1808,6 +1865,7 @@ int tcp_v4_rcv(struct sk_buff *skb) struct net *net = dev_net(skb->dev); struct sk_buff *skb_to_free; int sdif = inet_sdif(skb); + int dif = inet_iif(skb); const struct iphdr *iph; const struct tcphdr *th; bool refcounted; @@ -1856,7 +1914,7 @@ process: struct sock *nsk; sk = req->rsk_listener; - if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) { + if (unlikely(tcp_v4_inbound_md5_hash(sk, skb, dif, sdif))) { sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; @@ -1914,7 +1972,7 @@ process: if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - if (tcp_v4_inbound_md5_hash(sk, skb)) + if (tcp_v4_inbound_md5_hash(sk, skb, dif, sdif)) goto discard_and_relse; nf_reset_ct(skb); @@ -2620,7 +2678,8 @@ static void __net_exit tcp_sk_exit(struct net *net) int cpu; if (net->ipv4.tcp_congestion_control) - module_put(net->ipv4.tcp_congestion_control->owner); + bpf_module_put(net->ipv4.tcp_congestion_control, + net->ipv4.tcp_congestion_control->owner); for_each_possible_cpu(cpu) inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); @@ -2675,6 +2734,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX; net->ipv4.sysctl_tcp_tw_reuse = 2; + net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; cnt = tcp_hashinfo.ehash_mask + 1; net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; @@ -2726,7 +2786,8 @@ static int __net_init tcp_sk_init(struct net *net) /* Reno is always built in */ if (!net_eq(net, &init_net) && - try_module_get(init_net.ipv4.tcp_congestion_control->owner)) + bpf_try_module_get(init_net.ipv4.tcp_congestion_control, + init_net.ipv4.tcp_congestion_control->owner)) net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control; else net->ipv4.tcp_congestion_control = &tcp_reno; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index c4848e7a0aad..279db8822439 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -385,7 +385,8 @@ void tcp_update_metrics(struct sock *sk) if (tcp_in_initial_slowstart(tp)) { /* Slow start still did not finish. */ - if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { + if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save && + !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val && (tp->snd_cwnd >> 1) > val) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, @@ -400,7 +401,8 @@ void tcp_update_metrics(struct sock *sk) } else if (!tcp_in_slow_start(tp) && icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ - if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) + if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save && + !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { @@ -416,7 +418,8 @@ void tcp_update_metrics(struct sock *sk) tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_ssthresh) >> 1); } - if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { + if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save && + !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val && tp->snd_ssthresh > val) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, @@ -441,6 +444,7 @@ void tcp_init_metrics(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct tcp_metrics_block *tm; u32 val, crtt = 0; /* cached RTT scaled by 8 */ @@ -458,7 +462,8 @@ void tcp_init_metrics(struct sock *sk) if (tcp_metric_locked(tm, TCP_METRIC_CWND)) tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND); - val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); + val = net->ipv4.sysctl_tcp_no_ssthresh_metrics_save ? + 0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val) { tp->snd_ssthresh = val; if (tp->snd_ssthresh > tp->snd_cwnd_clamp) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index c802bc80c400..ad3b56d9fa71 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -414,7 +414,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) rcu_read_lock(); ca = tcp_ca_find_key(ca_key); - if (likely(ca && try_module_get(ca->owner))) { + if (likely(ca && bpf_try_module_get(ca, ca->owner))) { icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; ca_got_dst = true; @@ -425,7 +425,7 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) /* If no valid choice made yet, assign current system default ca. */ if (!ca_got_dst && (!icsk->icsk_ca_setsockopt || - !try_module_get(icsk->icsk_ca_ops->owner))) + !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner))) tcp_assign_congestion_control(sk); tcp_set_ca_state(sk, TCP_CA_Open); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b62b59b18db9..306e25d743e8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -38,6 +38,7 @@ #define pr_fmt(fmt) "TCP: " fmt #include <net/tcp.h> +#include <net/mptcp.h> #include <linux/compiler.h> #include <linux/gfp.h> @@ -414,6 +415,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_WSCALE (1 << 3) #define OPTION_FAST_OPEN_COOKIE (1 << 8) #define OPTION_SMC (1 << 9) +#define OPTION_MPTCP (1 << 10) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -439,8 +441,17 @@ struct tcp_out_options { __u8 *hash_location; /* temporary pointer, overloaded */ __u32 tsval, tsecr; /* need to include OPTION_TS */ struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ + struct mptcp_out_options mptcp; }; +static void mptcp_options_write(__be32 *ptr, struct tcp_out_options *opts) +{ +#if IS_ENABLED(CONFIG_MPTCP) + if (unlikely(OPTION_MPTCP & opts->options)) + mptcp_write_options(ptr, &opts->mptcp); +#endif +} + /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -549,6 +560,8 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, } smc_options_write(ptr, &options); + + mptcp_options_write(ptr, opts); } static void smc_set_option(const struct tcp_sock *tp, @@ -584,6 +597,22 @@ static void smc_set_option_cond(const struct tcp_sock *tp, #endif } +static void mptcp_set_option_cond(const struct request_sock *req, + struct tcp_out_options *opts, + unsigned int *remaining) +{ + if (rsk_is_mptcp(req)) { + unsigned int size; + + if (mptcp_synack_options(req, &size, &opts->mptcp)) { + if (*remaining >= size) { + opts->options |= OPTION_MPTCP; + *remaining -= size; + } + } + } +} + /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ @@ -653,6 +682,15 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, smc_set_option(tp, opts, &remaining); + if (sk_is_mptcp(sk)) { + unsigned int size; + + if (mptcp_syn_options(sk, skb, &size, &opts->mptcp)) { + opts->options |= OPTION_MPTCP; + remaining -= size; + } + } + return MAX_TCP_OPTION_SPACE - remaining; } @@ -714,6 +752,8 @@ static unsigned int tcp_synack_options(const struct sock *sk, } } + mptcp_set_option_cond(req, opts, &remaining); + smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; @@ -751,16 +791,37 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb size += TCPOLEN_TSTAMP_ALIGNED; } + /* MPTCP options have precedence over SACK for the limited TCP + * option space because a MPTCP connection would be forced to + * fall back to regular TCP if a required multipath option is + * missing. SACK still gets a chance to use whatever space is + * left. + */ + if (sk_is_mptcp(sk)) { + unsigned int remaining = MAX_TCP_OPTION_SPACE - size; + unsigned int opt_size = 0; + + if (mptcp_established_options(sk, skb, &opt_size, remaining, + &opts->mptcp)) { + opts->options |= OPTION_MPTCP; + size += opt_size; + } + } + eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; + if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED + + TCPOLEN_SACK_PERBLOCK)) + return size; + opts->num_sack_blocks = min_t(unsigned int, eff_sacks, (remaining - TCPOLEN_SACK_BASE_ALIGNED) / TCPOLEN_SACK_PERBLOCK); - if (likely(opts->num_sack_blocks)) - size += TCPOLEN_SACK_BASE_ALIGNED + - opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; + + size += TCPOLEN_SACK_BASE_ALIGNED + + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; } return size; @@ -2865,7 +2926,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, if (!tcp_can_collapse(sk, skb)) break; - if (!tcp_skb_can_collapse_to(to)) + if (!tcp_skb_can_collapse(to, skb)) break; space -= skb->len; @@ -3369,8 +3430,8 @@ static void tcp_ca_dst_init(struct sock *sk, const struct dst_entry *dst) rcu_read_lock(); ca = tcp_ca_find_key(ca_key); - if (likely(ca && try_module_get(ca->owner))) { - module_put(icsk->icsk_ca_ops->owner); + if (likely(ca && bpf_try_module_get(ca, ca->owner))) { + bpf_module_put(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner); icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 1097b438befe..c3f26dcd6704 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -223,6 +223,9 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); } else { sk_rethink_txhash(sk); + tp->timeout_rehash++; + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPTIMEOUTREHASH); } retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; expired = icsk->icsk_retransmits >= retry_until; @@ -234,6 +237,9 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); } else { sk_rethink_txhash(sk); + tp->timeout_rehash++; + __NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPTIMEOUTREHASH); } retry_until = net->ipv4.sysctl_tcp_retries2; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 030d43c7c957..db76b9609299 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1709,7 +1709,8 @@ busy_check: /* sk_queue is empty, reader_queue may contain peeked packets */ } while (timeo && - !__skb_wait_for_more_packets(sk, &error, &timeo, + !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, + &error, &timeo, (struct sk_buff *)sk_queue)); *err = error; @@ -2105,8 +2106,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_SGO_CB_OFFSET); __skb_push(skb, -skb_mac_offset(skb)); segs = udp_rcv_segment(sk, skb, true); - for (skb = segs; skb; skb = next) { - next = skb->next; + skb_list_walk_safe(segs, skb, next) { __skb_pull(skb, skb_transport_offset(skb)); ret = udp_queue_rcv_one_skb(sk, skb); if (ret > 0) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index a3908e55ed89..1a98583a79f4 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -184,6 +184,20 @@ out_unlock: } EXPORT_SYMBOL(skb_udp_tunnel_segment); +static struct sk_buff *__udp_gso_segment_list(struct sk_buff *skb, + netdev_features_t features) +{ + unsigned int mss = skb_shinfo(skb)->gso_size; + + skb = skb_segment_list(skb, features, skb_mac_header_len(skb)); + if (IS_ERR(skb)) + return skb; + + udp_hdr(skb)->len = htons(sizeof(struct udphdr) + mss); + + return skb; +} + struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features) { @@ -196,6 +210,9 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, __sum16 check; __be16 newlen; + if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) + return __udp_gso_segment_list(gso_skb, features); + mss = skb_shinfo(gso_skb)->gso_size; if (gso_skb->len <= sizeof(*uh) + mss) return ERR_PTR(-EINVAL); @@ -354,6 +371,7 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, struct udphdr *uh2; struct sk_buff *p; unsigned int ulen; + int ret = 0; /* requires non zero csum, for symmetry with GSO */ if (!uh->check) { @@ -369,7 +387,6 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, } /* pull encapsulating udp header */ skb_gro_pull(skb, sizeof(struct udphdr)); - skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); list_for_each_entry(p, head, list) { if (!NAPI_GRO_CB(p)->same_flow) @@ -383,14 +400,40 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, continue; } + if (NAPI_GRO_CB(skb)->is_flist != NAPI_GRO_CB(p)->is_flist) { + NAPI_GRO_CB(skb)->flush = 1; + return p; + } + /* Terminate the flow on len mismatch or if it grow "too much". * Under small packet flood GRO count could elsewhere grow a lot * leading to excessive truesize values. * On len mismatch merge the first packet shorter than gso_size, * otherwise complete the GRO packet. */ - if (ulen > ntohs(uh2->len) || skb_gro_receive(p, skb) || - ulen != ntohs(uh2->len) || + if (ulen > ntohs(uh2->len)) { + pp = p; + } else { + if (NAPI_GRO_CB(skb)->is_flist) { + if (!pskb_may_pull(skb, skb_gro_offset(skb))) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + if ((skb->ip_summed != p->ip_summed) || + (skb->csum_level != p->csum_level)) { + NAPI_GRO_CB(skb)->flush = 1; + return NULL; + } + ret = skb_gro_receive_list(p, skb); + } else { + skb_gro_postpull_rcsum(skb, uh, + sizeof(struct udphdr)); + + ret = skb_gro_receive(p, skb); + } + } + + if (ret || ulen != ntohs(uh2->len) || NAPI_GRO_CB(p)->count >= UDP_GRO_CNT_MAX) pp = p; @@ -401,36 +444,29 @@ static struct sk_buff *udp_gro_receive_segment(struct list_head *head, return NULL; } -INDIRECT_CALLABLE_DECLARE(struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, - __be16 sport, __be16 dport)); struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, - struct udphdr *uh, udp_lookup_t lookup) + struct udphdr *uh, struct sock *sk) { struct sk_buff *pp = NULL; struct sk_buff *p; struct udphdr *uh2; unsigned int off = skb_gro_offset(skb); int flush = 1; - struct sock *sk; - rcu_read_lock(); - sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb, - udp4_lib_lookup_skb, skb, uh->source, uh->dest); - if (!sk) - goto out_unlock; + if (skb->dev->features & NETIF_F_GRO_FRAGLIST) + NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled: 1; - if (udp_sk(sk)->gro_enabled) { + if ((sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) { pp = call_gro_receive(udp_gro_receive_segment, head, skb); - rcu_read_unlock(); return pp; } - if (NAPI_GRO_CB(skb)->encap_mark || + if (!sk || NAPI_GRO_CB(skb)->encap_mark || (skb->ip_summed != CHECKSUM_PARTIAL && NAPI_GRO_CB(skb)->csum_cnt == 0 && !NAPI_GRO_CB(skb)->csum_valid) || !udp_sk(sk)->gro_receive) - goto out_unlock; + goto out; /* mark that this skb passed once through the tunnel gro layer */ NAPI_GRO_CB(skb)->encap_mark = 1; @@ -457,8 +493,7 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, skb_gro_postpull_rcsum(skb, uh, sizeof(struct udphdr)); pp = call_gro_receive_sk(udp_sk(sk)->gro_receive, sk, head, skb); -out_unlock: - rcu_read_unlock(); +out: skb_gro_flush_final(skb, pp, flush); return pp; } @@ -468,8 +503,10 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); + struct sk_buff *pp; + struct sock *sk; - if (unlikely(!uh) || !static_branch_unlikely(&udp_encap_needed_key)) + if (unlikely(!uh)) goto flush; /* Don't bother verifying checksum if we're going to flush anyway. */ @@ -480,11 +517,15 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) inet_gro_compute_pseudo)) goto flush; else if (uh->check) - skb_gro_checksum_try_convert(skb, IPPROTO_UDP, uh->check, + skb_gro_checksum_try_convert(skb, IPPROTO_UDP, inet_gro_compute_pseudo); skip: NAPI_GRO_CB(skb)->is_ipv6 = 0; - return udp_gro_receive(head, skb, uh, udp4_lib_lookup_skb); + rcu_read_lock(); + sk = static_branch_unlikely(&udp_encap_needed_key) ? udp4_lib_lookup_skb(skb, uh->source, uh->dest) : NULL; + pp = udp_gro_receive(head, skb, uh, sk); + rcu_read_unlock(); + return pp; flush: NAPI_GRO_CB(skb)->flush = 1; @@ -517,9 +558,7 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, rcu_read_lock(); sk = INDIRECT_CALL_INET(lookup, udp6_lib_lookup_skb, udp4_lib_lookup_skb, skb, uh->source, uh->dest); - if (sk && udp_sk(sk)->gro_enabled) { - err = udp_gro_complete_segment(skb); - } else if (sk && udp_sk(sk)->gro_complete) { + if (sk && udp_sk(sk)->gro_complete) { skb_shinfo(skb)->gso_type = uh->check ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; @@ -529,6 +568,8 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff, skb->encapsulation = 1; err = udp_sk(sk)->gro_complete(sk, skb, nhoff + sizeof(struct udphdr)); + } else { + err = udp_gro_complete_segment(skb); } rcu_read_unlock(); @@ -544,6 +585,23 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_complete(struct sk_buff *skb, int nhoff) const struct iphdr *iph = ip_hdr(skb); struct udphdr *uh = (struct udphdr *)(skb->data + nhoff); + if (NAPI_GRO_CB(skb)->is_flist) { + uh->len = htons(skb->len - nhoff); + + skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (skb->csum_level < SKB_MAX_CSUM_LEVEL) + skb->csum_level++; + } else { + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->csum_level = 0; + } + + return 0; + } + if (uh->check) uh->check = ~udp_v4_check(skb->len - nhoff, iph->saddr, iph->daddr, 0); diff --git a/net/ipv4/xfrm4_protocol.c b/net/ipv4/xfrm4_protocol.c index 8a4285712808..ea595c8549c7 100644 --- a/net/ipv4/xfrm4_protocol.c +++ b/net/ipv4/xfrm4_protocol.c @@ -72,6 +72,14 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, if (!head) goto out; + if (!skb_dst(skb)) { + const struct iphdr *iph = ip_hdr(skb); + + if (ip_route_input_noref(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev)) + goto drop; + } + for_each_protocol_rcu(*head, handler) if ((ret = handler->input_handler(skb, nexthdr, spi, encap_type)) != -EINVAL) return ret; @@ -79,6 +87,7 @@ int xfrm4_rcv_encap(struct sk_buff *skb, int nexthdr, __be32 spi, out: icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); +drop: kfree_skb(skb); return 0; } |