From 40171248bb8934537fec8fbaf718e57c8add187c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Oct 2021 05:36:01 -0400 Subject: sctp: allow IP fragmentation when PLPMTUD enters Error state Currently, when PLPMTUD enters the Error state, the transport pathmtu is set to MIN_PLPMTU(512) while probing continues with BASE_PLPMTU(1200). This causes pathmtu to stay at a very small value, even if the real pmtu is some value like 1000. RFC8899 doesn't clearly say how to set the value in the Error state, but one possibility is to keep using BASE_PLPMTU for the real pmtu while allowing IP fragmentation when it's in the Error state. As it says in rfc8899#section-5.4: Some paths could be unable to sustain packets of the BASE_PLPMTU size. The Error State could be implemented to provide robustness to such paths. This allows fallback to a smaller than desired PLPMTU rather than suffer connectivity failure. This could utilize methods such as endpoint IP fragmentation to enable the PL sender to communicate using packets smaller than the BASE_PLPMTU. This patch sets pmtu to BASE_PLPMTU instead of MIN_PLPMTU for the Error state in sctp_transport_pl_send/toobig(), and sets packet ipfragok for non-probe packets when it's in the Error state. Fixes: 1dc68c194571 ("sctp: do state transition when PROBE_COUNT == MAX_PROBES on HB send path") Reported-by: Ying Xu Signed-off-by: Xin Long Signed-off-by: David S. 
Miller --- net/sctp/output.c | 13 ++++++++----- net/sctp/transport.c | 4 ++-- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sctp/output.c b/net/sctp/output.c index 4dfb5ea82b05..cdfdbd353c67 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -581,13 +581,16 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) chunk = list_entry(packet->chunk_list.next, struct sctp_chunk, list); sk = chunk->skb->sk; - /* check gso */ if (packet->size > tp->pathmtu && !packet->ipfragok && !chunk->pmtu_probe) { - if (!sk_can_gso(sk)) { - pr_err_once("Trying to GSO but underlying device doesn't support it."); - goto out; + if (tp->pl.state == SCTP_PL_ERROR) { /* do IP fragmentation if in Error state */ + packet->ipfragok = 1; + } else { + if (!sk_can_gso(sk)) { /* check gso */ + pr_err_once("Trying to GSO but underlying device doesn't support it."); + goto out; + } + gso = 1; } - gso = 1; } /* alloc head skb */ diff --git a/net/sctp/transport.c b/net/sctp/transport.c index a3d3ca6dd63d..1f2dfad768d5 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -269,7 +269,7 @@ bool sctp_transport_pl_send(struct sctp_transport *t) if (t->pl.probe_size == SCTP_BASE_PLPMTU) { /* BASE_PLPMTU Confirmation Failed */ t->pl.state = SCTP_PL_ERROR; /* Base -> Error */ - t->pl.pmtu = SCTP_MIN_PLPMTU; + t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); sctp_assoc_sync_pmtu(t->asoc); } @@ -366,7 +366,7 @@ static bool sctp_transport_pl_toobig(struct sctp_transport *t, u32 pmtu) if (pmtu >= SCTP_MIN_PLPMTU && pmtu < SCTP_BASE_PLPMTU) { t->pl.state = SCTP_PL_ERROR; /* Base -> Error */ - t->pl.pmtu = SCTP_MIN_PLPMTU; + t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); } } else if (t->pl.state == SCTP_PL_SEARCH) { -- cgit v1.2.3 From 75cf662c64dd8543f56c329c69eba18141c8fd9f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 28 Oct 2021 05:36:04 -0400 Subject: sctp: return 
true only for pathmtu update in sctp_transport_pl_toobig sctp_transport_pl_toobig() is supposed to return true only if there's a pathmtu update, so that sctp_icmp_frag_needed() would then call sctp_assoc_sync_pmtu() and sctp_retransmit(). This patch fixes the return values in sctp_transport_pl_toobig() accordingly. Fixes: 836964083177 ("sctp: do state transition when receiving an icmp TOOBIG packet") Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/sctp/transport.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 1f2dfad768d5..133f1719bf1b 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -368,6 +368,7 @@ static bool sctp_transport_pl_toobig(struct sctp_transport *t, u32 pmtu) t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + return true; } } else if (t->pl.state == SCTP_PL_SEARCH) { if (pmtu >= SCTP_BASE_PLPMTU && pmtu < t->pl.pmtu) { @@ -378,11 +379,10 @@ static bool sctp_transport_pl_toobig(struct sctp_transport *t, u32 pmtu) t->pl.probe_high = 0; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + return true; } else if (pmtu > t->pl.pmtu && pmtu < t->pl.probe_size) { t->pl.probe_size = pmtu; t->pl.probe_count = 0; - - return false; } } else if (t->pl.state == SCTP_PL_COMPLETE) { if (pmtu >= SCTP_BASE_PLPMTU && pmtu < t->pl.pmtu) { @@ -393,10 +393,11 @@ static bool sctp_transport_pl_toobig(struct sctp_transport *t, u32 pmtu) t->pl.probe_high = 0; t->pl.pmtu = SCTP_BASE_PLPMTU; t->pathmtu = t->pl.pmtu + sctp_transport_pl_hlen(t); + return true; } } - return true; + return false; } bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) -- cgit v1.2.3 From 829e050eea69c7442441b714b6f5b339b5b8c367 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Thu, 28 Oct 2021 17:58:35 +0200 Subject: net: bridge: fix uninitialized variables when BRIDGE_CFM is disabled Function 
br_get_link_af_size_filtered() calls br_cfm_{,peer}_mep_count() that return a count. When BRIDGE_CFM is not enabled these functions simply return -EOPNOTSUPP but do not modify count parameter and calling function then works with uninitialized variables. Modify these inline functions to return zero in count parameter. Fixes: b6d0425b816e ("bridge: cfm: Netlink Notifications.") Cc: Henrik Bjoernlund Signed-off-by: Ivan Vecera Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_private.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 37ca76406f1e..fd5e7e74573c 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -1911,11 +1911,13 @@ static inline int br_cfm_status_fill_info(struct sk_buff *skb, static inline int br_cfm_mep_count(struct net_bridge *br, u32 *count) { + *count = 0; return -EOPNOTSUPP; } static inline int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count) { + *count = 0; return -EOPNOTSUPP; } #endif -- cgit v1.2.3 From 6de6e46d27ef386feecdbea56b3bfd6c3b3bc1f9 Mon Sep 17 00:00:00 2001 From: Yoshiki Komachi Date: Fri, 29 Oct 2021 09:21:41 +0000 Subject: cls_flower: Fix inability to match GRE/IPIP packets When a packet of a new flow arrives in openvswitch kernel module, it dissects the packet and passes the extracted flow key to ovs-vswitchd daemon. If hw-offload configuration is enabled, the daemon creates a new TC flower entry to bypass openvswitch kernel module for the flow (TC flower can also offload flows to NICs but this time that does not matter). In this processing flow, I found the following issue in cases of GRE/IPIP packets. When ovs_flow_key_extract() in openvswitch module parses a packet of a new GRE (or IPIP) flow received on non-tunneling vports, it extracts information of the outer IP header for ip_proto/src_ip/dst_ip match keys. 
This means ovs-vswitchd creates a TC flower entry with IP protocol/addresses match keys whose values are those of the outer IP header. OTOH, TC flower, which uses flow_dissector (different parser from openvswitch module), extracts information of the inner IP header. The following flow is an example to describe the issue in more detail. <----------- Outer IP -----------------> <---------- Inner IP ----------> +----------+--------------+--------------+----------+----------+----------+ | ip_proto | src_ip | dst_ip | ip_proto | src_ip | dst_ip | | 47 (GRE) | 192.168.10.1 | 192.168.10.2 | 6 (TCP) | 10.0.0.1 | 10.0.0.2 | +----------+--------------+--------------+----------+----------+----------+ In this case, TC flower entry and extracted information are shown as below: - ovs-vswitchd creates TC flower entry with: - ip_proto: 47 - src_ip: 192.168.10.1 - dst_ip: 192.168.10.2 - TC flower extracts below for IP header matches: - ip_proto: 6 - src_ip: 10.0.0.1 - dst_ip: 10.0.0.2 Thus, GRE or IPIP packets never match the TC flower entry, as each dissector behaves differently. IMHO, the behavior of TC flower (flow dissector) does not look correct, as ip_proto/src_ip/dst_ip in TC flower match means the outermost IP header information except for GRE/IPIP cases. This patch adds a new flow_dissector flag FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP which skips dissection of the encapsulated inner GRE/IPIP header in TC flower classifier. Signed-off-by: Yoshiki Komachi Signed-off-by: David S. 
Miller --- include/net/flow_dissector.h | 1 + net/core/flow_dissector.c | 15 +++++++++++++++ net/sched/cls_flower.c | 3 ++- 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index ffd386ea0dbb..aa33e1092e2c 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -287,6 +287,7 @@ enum flow_dissector_key_id { #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG BIT(0) #define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL BIT(1) #define FLOW_DISSECTOR_F_STOP_AT_ENCAP BIT(2) +#define FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP BIT(3) struct flow_dissector_key { enum flow_dissector_key_id key_id; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index bac0184cf3de..0d4bbf534c7d 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1307,6 +1307,11 @@ ip_proto_again: switch (ip_proto) { case IPPROTO_GRE: + if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + fdret = __skb_flow_dissect_gre(skb, key_control, flow_dissector, target_container, data, &proto, &nhoff, &hlen, flags); @@ -1364,6 +1369,11 @@ ip_proto_again: break; } case IPPROTO_IPIP: + if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + proto = htons(ETH_P_IP); key_control->flags |= FLOW_DIS_ENCAPSULATION; @@ -1376,6 +1386,11 @@ ip_proto_again: break; case IPPROTO_IPV6: + if (flags & FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP) { + fdret = FLOW_DISSECT_RET_OUT_GOOD; + break; + } + proto = htons(ETH_P_IPV6); key_control->flags |= FLOW_DIS_ENCAPSULATION; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index eb6345a027e1..aab13ba11767 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -329,7 +329,8 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, ARRAY_SIZE(fl_ct_info_to_flower_map), post_ct); skb_flow_dissect_hash(skb, &mask->dissector, &skb_key); - 
skb_flow_dissect(skb, &mask->dissector, &skb_key, 0); + skb_flow_dissect(skb, &mask->dissector, &skb_key, + FLOW_DISSECTOR_F_STOP_BEFORE_ENCAP); f = fl_mask_lookup(mask, &skb_key); if (f && !tc_skip_sw(f->flags)) { -- cgit v1.2.3 From 42dcfd850e514b229d616a53dec06d0f2533217c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 29 Oct 2021 08:51:34 -0700 Subject: udp6: allow SO_MARK ctrl msg to affect routing Commit c6af0c227a22 ("ip: support SO_MARK cmsg") added propagation of SO_MARK from cmsg to skb->mark. For IPv4 and raw sockets the mark also affects route lookup, but in case of IPv6 the flow info is initialized before cmsg is parsed. Fixes: c6af0c227a22 ("ip: support SO_MARK cmsg") Reported-and-tested-by: Xintong Hu Signed-off-by: Jakub Kicinski Reviewed-by: David Ahern Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv6/udp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 8d785232b479..be6dc64ece29 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1435,7 +1435,6 @@ do_udp_sendmsg: if (!fl6.flowi6_oif) fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; - fl6.flowi6_mark = ipc6.sockc.mark; fl6.flowi6_uid = sk->sk_uid; if (msg->msg_controllen) { @@ -1471,6 +1470,7 @@ do_udp_sendmsg: ipc6.opt = opt; fl6.flowi6_proto = sk->sk_protocol; + fl6.flowi6_mark = ipc6.sockc.mark; fl6.daddr = *daddr; if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) fl6.saddr = np->saddr; -- cgit v1.2.3