From c98d8836b817d11fdff4ca7749cbbe04ff7f0c64 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 31 Jan 2024 16:49:10 +0100 Subject: wifi: mac80211: reload info pointer in ieee80211_tx_dequeue() This pointer can change here since the SKB can change, so we actually later open-coded IEEE80211_SKB_CB() again. Reload the pointer where needed, so the monitor-mode case using it gets fixed, and then use info-> later as well. Cc: stable@vger.kernel.org Fixes: 531682159092 ("mac80211: fix VLAN handling with TXQs") Link: https://msgid.link/20240131164910.b54c28d583bc.I29450cec84ea6773cff5d9c16ff92b836c331471@changeid Signed-off-by: Johannes Berg <johannes.berg@intel.com> --- net/mac80211/tx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index e448ab338448..6fbb15b65902 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * Transmit and frame generation functions. */ @@ -3927,6 +3927,7 @@ begin: goto begin; skb = __skb_dequeue(&tx.skbs); + info = IEEE80211_SKB_CB(skb); if (!skb_queue_empty(&tx.skbs)) { spin_lock_bh(&fq->lock); @@ -3971,7 +3972,7 @@ begin: } encap_out: - IEEE80211_SKB_CB(skb)->control.vif = vif; + info->control.vif = vif; if (tx.sta && wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) { -- cgit v1.2.3 From 4e1d71cabb19ec2586827adfc60d68689c68c194 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 6 Feb 2024 14:16:31 -0500 Subject: net/handshake: Fix handshake_req_destroy_test1 Recently, handshake_req_destroy_test1 started failing: Expected handshake_req_destroy_test == req, but handshake_req_destroy_test == 0000000000000000 req == 0000000060f99b40 not ok 11 req_destroy works This is because "sock_release(sock)" was replaced with "fput(filp)" to address a memory leak. Note that sock_release() is synchronous but fput() usually delays the final close and clean-up. The delay is not consequential in the other cases that were changed but handshake_req_destroy_test1 is testing that handshake_req_cancel() followed by closing the file actually does call the ->hp_destroy method. Thus the PTR_EQ test at the end has to be sure that the final close is complete before it checks the pointer. We cannot use a completion here because if ->hp_destroy is never called (ie, there is an API bug) then the test will hang. Reported by: Guenter Roeck <linux@roeck-us.net> Closes: https://lore.kernel.org/netdev/ZcKDd1to4MPANCrn@tissot.1015granger.net/T/#mac5c6299f86799f1c71776f3a07f9c566c7c3c40 Fixes: 4a0f07d71b04 ("net/handshake: Fix memory leak in __sock_create() and sock_alloc_file()") Signed-off-by: Chuck Lever <chuck.lever@oracle.com> Reviewed-by: Hannes Reinecke <hare@suse.de> Link: https://lore.kernel.org/r/170724699027.91401.7839730697326806733.stgit@oracle-102.nfsv4bat.org Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/handshake/handshake-test.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/handshake/handshake-test.c b/net/handshake/handshake-test.c index 16ed7bfd29e4..34fd1d9b2db8 100644 --- a/net/handshake/handshake-test.c +++ b/net/handshake/handshake-test.c @@ -471,7 +471,10 @@ static void handshake_req_destroy_test1(struct kunit *test) handshake_req_cancel(sock->sk); /* Act */ - fput(filp); + /* Ensure the close/release/put process has run to + * completion before checking the result. + */ + __fput_sync(filp); /* Assert */ KUNIT_EXPECT_PTR_EQ(test, handshake_req_destroy_test, req); -- cgit v1.2.3 From 6e2f90d31fe09f2b852de25125ca875aabd81367 Mon Sep 17 00:00:00 2001 From: Aaron Conole Date: Wed, 7 Feb 2024 08:24:15 -0500 Subject: net: openvswitch: limit the number of recursions from action sets The ovs module allows for some actions to recursively contain an action list for complex scenarios, such as sampling, checking lengths, etc. When these actions are copied into the internal flow table, they are evaluated to validate that such actions make sense, and these calls happen recursively. The ovs-vswitchd userspace won't emit more than 16 recursion levels deep. However, the module has no such limit and will happily accept limits larger than 16 levels nested. Prevent this by tracking the number of recursions happening and manually limiting it to 16 levels nested. The initial implementation of the sample action would track this depth and prevent more than 3 levels of recursion, but this was removed to support the clone use case, rather than limited at the current userspace limit. Fixes: 798c166173ff ("openvswitch: Optimize sample action for the clone use cases") Signed-off-by: Aaron Conole <aconole@redhat.com> Reviewed-by: Simon Horman <horms@kernel.org> Link: https://lore.kernel.org/r/20240207132416.1488485-2-aconole@redhat.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/openvswitch/flow_netlink.c | 49 ++++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 88965e2068ac..ebc5728aab4e 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -48,6 +48,7 @@ struct ovs_len_tbl { #define OVS_ATTR_NESTED -1 #define OVS_ATTR_VARIABLE -2 +#define OVS_COPY_ACTIONS_MAX_DEPTH 16 static bool actions_may_change_flow(const struct nlattr *actions) { @@ -2545,13 +2546,15 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, - u32 mpls_label_count, bool log); + u32 mpls_label_count, bool log, + u32 depth); static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, - u32 mpls_label_count, bool log, bool last) + u32 mpls_label_count, bool log, bool last, + u32 depth) { const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; const struct nlattr *probability, *actions; @@ -2602,7 +2605,8 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, return err; err = __ovs_nla_copy_actions(net, actions, key, sfa, - eth_type, vlan_tci, mpls_label_count, log); + eth_type, vlan_tci, mpls_label_count, log, + depth + 1); if (err) return err; @@ -2617,7 +2621,8 @@ static int validate_and_copy_dec_ttl(struct net *net, const struct sw_flow_key *key, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, - u32 mpls_label_count, bool log) + u32 mpls_label_count, bool log, + u32 depth) { const struct nlattr *attrs[OVS_DEC_TTL_ATTR_MAX + 1]; int start, action_start, err, rem; @@ -2660,7 +2665,8 @@ static int validate_and_copy_dec_ttl(struct net *net, return action_start; err = __ovs_nla_copy_actions(net, actions, key, sfa, eth_type, - vlan_tci, mpls_label_count, log); + vlan_tci, mpls_label_count, log, + depth + 1); if (err) return err; @@ -2674,7 +2680,8 @@ static int validate_and_copy_clone(struct net *net, const struct sw_flow_key *key, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, - u32 mpls_label_count, bool log, bool last) + u32 mpls_label_count, bool log, bool last, + u32 depth) { int start, err; u32 exec; @@ -2694,7 +2701,8 @@ static int validate_and_copy_clone(struct net *net, return err; err = __ovs_nla_copy_actions(net, attr, key, sfa, - eth_type, vlan_tci, mpls_label_count, log); + eth_type, vlan_tci, mpls_label_count, log, + depth + 1); if (err) return err; @@ -3063,7 +3071,7 @@ static int validate_and_copy_check_pkt_len(struct net *net, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, u32 mpls_label_count, - bool log, bool last) + bool log, bool last, u32 depth) { const struct nlattr *acts_if_greater, *acts_if_lesser_eq; struct nlattr *a[OVS_CHECK_PKT_LEN_ATTR_MAX + 1]; @@ -3111,7 +3119,8 @@ static int validate_and_copy_check_pkt_len(struct net *net, return nested_acts_start; err = __ovs_nla_copy_actions(net, acts_if_lesser_eq, key, sfa, - eth_type, vlan_tci, mpls_label_count, log); + eth_type, vlan_tci, mpls_label_count, log, + depth + 1); if (err) return err; @@ -3124,7 +3133,8 @@ static int validate_and_copy_check_pkt_len(struct net *net, return nested_acts_start; err = __ovs_nla_copy_actions(net, acts_if_greater, key, sfa, - eth_type, vlan_tci, mpls_label_count, log); + eth_type, vlan_tci, mpls_label_count, log, + depth + 1); if (err) return err; @@ -3152,12 +3162,16 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, - u32 mpls_label_count, bool log) + u32 mpls_label_count, bool log, + u32 depth) { u8 mac_proto = ovs_key_mac_proto(key); const struct nlattr *a; int rem, err; + if (depth > OVS_COPY_ACTIONS_MAX_DEPTH) + return -EOVERFLOW; + nla_for_each_nested(a, attr, rem) { /* Expected argument lengths, (u32)-1 for variable length. */ static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { @@ -3355,7 +3369,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, err = validate_and_copy_sample(net, a, key, sfa, eth_type, vlan_tci, mpls_label_count, - log, last); + log, last, depth); if (err) return err; skip_copy = true; @@ -3426,7 +3440,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, err = validate_and_copy_clone(net, a, key, sfa, eth_type, vlan_tci, mpls_label_count, - log, last); + log, last, depth); if (err) return err; skip_copy = true; @@ -3440,7 +3454,8 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, eth_type, vlan_tci, mpls_label_count, - log, last); + log, last, + depth); if (err) return err; skip_copy = true; @@ -3450,7 +3465,8 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, case OVS_ACTION_ATTR_DEC_TTL: err = validate_and_copy_dec_ttl(net, a, key, sfa, eth_type, vlan_tci, - mpls_label_count, log); + mpls_label_count, log, + depth); if (err) return err; skip_copy = true; @@ -3495,7 +3511,8 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, (*sfa)->orig_len = nla_len(attr); err = __ovs_nla_copy_actions(net, attr, key, sfa, key->eth.type, - key->eth.vlan.tci, mpls_label_count, log); + key->eth.vlan.tci, mpls_label_count, log, + 0); if (err) ovs_nla_free_flow_actions(*sfa); -- cgit v1.2.3 From aae09a6c7783e28d1bcafee85e172fe411923b22 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Wed, 7 Feb 2024 19:29:02 -0300 Subject: net/sched: act_mirred: Don't zero blockid when net device is being deleted While testing tdc with parallel tests for mirred to block we caught an intermittent bug. The blockid was being zeroed out when a net device was deleted and, thus, giving us an incorrect blockid value whenever we tried to dump the mirred action. Since we don't increment the block refcount in the control path (and only use the ID), we don't need to zero the blockid field whenever a net device is going down. Fixes: 42f39036cda8 ("net/sched: act_mirred: Allow mirred to block") Signed-off-by: Victor Nogueira <victor@mojatatu.com> Reviewed-by: Simon Horman <horms@kernel.org> Reviewed-by: Eric Dumazet <edumazet@google.com> Acked-by: Jamal Hadi Salim <jhs@mojatatu.com> Link: https://lore.kernel.org/r/20240207222902.1469398-1-victor@mojatatu.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/sched/act_mirred.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 12386f590b0f..0a1a9e40f237 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -533,8 +533,6 @@ static int mirred_device_event(struct notifier_block *unused, * net_device are already rcu protected. */ RCU_INIT_POINTER(m->tcfm_dev, NULL); - } else if (m->tcfm_blockid) { - m->tcfm_blockid = 0; } spin_unlock_bh(&m->tcf_lock); } -- cgit v1.2.3 From 2599bb5e0c742ba3de1af2abb56b8a103a671a22 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 8 Feb 2024 08:42:36 -0800 Subject: net: fill in MODULE_DESCRIPTION()s for xfrm W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the XFRM interface drivers. Signed-off-by: Breno Leitao <leitao@debian.org> Reviewed-by: Simon Horman <horms@kernel.org> Link: https://lore.kernel.org/r/20240208164244.3818498-2-leitao@debian.org Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/xfrm/xfrm_algo.c | 1 + net/xfrm/xfrm_user.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index 41533c631431..e6da7e8495c9 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -858,4 +858,5 @@ int xfrm_count_pfkey_enc_supported(void) } EXPORT_SYMBOL_GPL(xfrm_count_pfkey_enc_supported); +MODULE_DESCRIPTION("XFRM Algorithm interface"); MODULE_LICENSE("GPL"); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index ad01997c3aa9..f037be190bae 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -3888,5 +3888,6 @@ static void __exit xfrm_user_exit(void) module_init(xfrm_user_init); module_exit(xfrm_user_exit); +MODULE_DESCRIPTION("XFRM User interface"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM); -- cgit v1.2.3 From f73f55b0fcff575fef1854c66d18767a341ebbe2 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 8 Feb 2024 08:42:37 -0800 Subject: net: fill in MODULE_DESCRIPTION()s for mpoa W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the Multi-Protocol Over ATM (MPOA) driver. Signed-off-by: Breno Leitao <leitao@debian.org> Reviewed-by: Simon Horman <horms@kernel.org> Link: https://lore.kernel.org/r/20240208164244.3818498-3-leitao@debian.org Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/atm/mpc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/atm/mpc.c b/net/atm/mpc.c index 033871e718a3..324e3ab96bb3 100644 --- a/net/atm/mpc.c +++ b/net/atm/mpc.c @@ -1532,4 +1532,5 @@ static void __exit atm_mpoa_cleanup(void) module_init(atm_mpoa_init); module_exit(atm_mpoa_cleanup); +MODULE_DESCRIPTION("Multi-Protocol Over ATM (MPOA) driver"); MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 6e2cf0eb6926a5c51bba0aca819e91d7265c849c Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 8 Feb 2024 08:42:38 -0800 Subject: net: fill in MODULE_DESCRIPTION()s for af_key W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the PF_KEY socket helpers. Signed-off-by: Breno Leitao <leitao@debian.org> Reviewed-by: Simon Horman <horms@kernel.org> Link: https://lore.kernel.org/r/20240208164244.3818498-4-leitao@debian.org Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/key/af_key.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/key/af_key.c b/net/key/af_key.c index d68d01804dc7..f79fb99271ed 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3924,5 +3924,6 @@ out_unregister_key_proto: module_init(ipsec_pfkey_init); module_exit(ipsec_pfkey_exit); +MODULE_DESCRIPTION("PF_KEY socket helpers"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_KEY); -- cgit v1.2.3 From 2898f3075e6a0b0584781272aac88377e5ced0a0 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 8 Feb 2024 08:42:39 -0800 Subject: net: fill in MODULE_DESCRIPTION()s for 6LoWPAN W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to IPv6 over Low power Wireless Personal Area Network. Signed-off-by: Breno Leitao <leitao@debian.org> Acked-by: Alexander Aring <aahringo@redhat.com> Reviewed-by: Simon Horman <horms@kernel.org> Link: https://lore.kernel.org/r/20240208164244.3818498-5-leitao@debian.org Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/6lowpan/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index 7b3341cef926..850d4a185f55 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -179,4 +179,5 @@ static void __exit lowpan_module_exit(void) module_init(lowpan_module_init); module_exit(lowpan_module_exit); +MODULE_DESCRIPTION("IPv6 over Low-Power Wireless Personal Area Network core module"); MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 92ab08eb63bbf54caebb425ed8908758c98ae8f2 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 8 Feb 2024 08:42:40 -0800 Subject: net: fill in MODULE_DESCRIPTION()s for ipv6 modules W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the IPv6 modules. Signed-off-by: Breno Leitao <leitao@debian.org> Reviewed-by: Simon Horman <horms@kernel.org> Link: https://lore.kernel.org/r/20240208164244.3818498-6-leitao@debian.org Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/ipv6/ah6.c | 1 + net/ipv6/esp6.c | 1 + net/ipv6/ip6_udp_tunnel.c | 1 + net/ipv6/mip6.c | 1 + net/ipv6/sit.c | 1 + net/ipv6/tunnel6.c | 1 + net/ipv6/xfrm6_tunnel.c | 1 + 7 files changed, 7 insertions(+) (limited to 'net') diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 2016e90e6e1d..eb474f0987ae 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -800,5 +800,6 @@ static void __exit ah6_fini(void) module_init(ah6_init); module_exit(ah6_fini); +MODULE_DESCRIPTION("IPv6 AH transformation helpers"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_AH); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 2cc1a45742d8..6e6efe026cdc 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -1301,5 +1301,6 @@ static void __exit esp6_fini(void) module_init(esp6_init); module_exit(esp6_fini); +MODULE_DESCRIPTION("IPv6 ESP transformation helpers"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ESP); diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index a7bf0327b380..c99053189ea8 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -182,4 +182,5 @@ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb, } EXPORT_SYMBOL_GPL(udp_tunnel6_dst_lookup); +MODULE_DESCRIPTION("IPv6 Foo over UDP tunnel driver"); MODULE_LICENSE("GPL"); diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index 83d2a8be263f..6a16a5bd0d91 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -405,6 +405,7 @@ static void __exit mip6_fini(void) module_init(mip6_init); module_exit(mip6_fini); +MODULE_DESCRIPTION("IPv6 Mobility driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_DSTOPTS); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_ROUTING); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index cc24cefdb85c..5e9f625b76e3 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1956,6 +1956,7 @@ xfrm_tunnel_failed: module_init(sit_init); module_exit(sit_cleanup); +MODULE_DESCRIPTION("IPv6-in-IPv4 tunnel SIT driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("sit"); MODULE_ALIAS_NETDEV("sit0"); diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index 00e8d8b1c9a7..dc4ea9b11794 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -302,4 +302,5 @@ static void __exit tunnel6_fini(void) module_init(tunnel6_init); module_exit(tunnel6_fini); +MODULE_DESCRIPTION("IP-in-IPv6 tunnel driver"); MODULE_LICENSE("GPL"); diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 1323f2f6928e..f6cb94f82cc3 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -401,5 +401,6 @@ static void __exit xfrm6_tunnel_fini(void) module_init(xfrm6_tunnel_init); module_exit(xfrm6_tunnel_fini); +MODULE_DESCRIPTION("IPv6 XFRM tunnel driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_IPV6); -- cgit v1.2.3 From b058a5d25d921af2be83d70844d389ecfd4a0497 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 8 Feb 2024 08:42:41 -0800 Subject: net: fill in MODULE_DESCRIPTION()s for ipv4 modules W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the IPv4 modules. Signed-off-by: Breno Leitao <leitao@debian.org> Reviewed-by: Simon Horman <horms@kernel.org> Link: https://lore.kernel.org/r/20240208164244.3818498-7-leitao@debian.org Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/ipv4/ah4.c | 1 + net/ipv4/esp4.c | 1 + net/ipv4/ip_gre.c | 1 + net/ipv4/ip_tunnel.c | 1 + net/ipv4/ip_vti.c | 1 + net/ipv4/ipip.c | 1 + net/ipv4/tunnel4.c | 1 + net/ipv4/udp_tunnel_core.c | 1 + net/ipv4/xfrm4_tunnel.c | 1 + 9 files changed, 9 insertions(+) (limited to 'net') diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index a2e6e1fdf82b..64aec3dff8ec 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -597,5 +597,6 @@ static void __exit ah4_fini(void) module_init(ah4_init); module_exit(ah4_fini); +MODULE_DESCRIPTION("IPv4 AH transformation library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_AH); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 4ccfc104f13a..4dd9e5040672 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -1247,5 +1247,6 @@ static void __exit esp4_fini(void) module_init(esp4_init); module_exit(esp4_fini); +MODULE_DESCRIPTION("IPv4 ESP transformation library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_ESP); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 5169c3c72cff..6b9cf5a24c19 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1793,6 +1793,7 @@ static void __exit ipgre_fini(void) module_init(ipgre_init); module_exit(ipgre_fini); +MODULE_DESCRIPTION("IPv4 GRE tunnels over IP library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("gre"); MODULE_ALIAS_RTNL_LINK("gretap"); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index beeae624c412..a4513ffb66cb 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -1298,4 +1298,5 @@ void ip_tunnel_setup(struct net_device *dev, unsigned int net_id) } EXPORT_SYMBOL_GPL(ip_tunnel_setup); +MODULE_DESCRIPTION("IPv4 tunnel implementation library"); MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 9ab9b3ebe0cd..d1d6bb28ed6e 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -721,6 +721,7 @@ static void __exit vti_fini(void) module_init(vti_init); module_exit(vti_fini); +MODULE_DESCRIPTION("Virtual (secure) IP tunneling library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("vti"); MODULE_ALIAS_NETDEV("ip_vti0"); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 27b8f83c6ea2..03afa3871efc 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -658,6 +658,7 @@ static void __exit ipip_fini(void) module_init(ipip_init); module_exit(ipip_fini); +MODULE_DESCRIPTION("IP/IP protocol decoder library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("ipip"); MODULE_ALIAS_NETDEV("tunl0"); diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index 5048c47c79b2..4c1f836aae38 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -294,4 +294,5 @@ static void __exit tunnel4_fini(void) module_init(tunnel4_init); module_exit(tunnel4_fini); +MODULE_DESCRIPTION("IPv4 XFRM tunnel library"); MODULE_LICENSE("GPL"); diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index a87defb2b167..860aff5f8599 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -253,4 +253,5 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb, } EXPORT_SYMBOL_GPL(udp_tunnel_dst_lookup); +MODULE_DESCRIPTION("IPv4 Foo over UDP tunnel driver"); MODULE_LICENSE("GPL"); diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 8489fa106583..8cb266af1393 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -114,5 +114,6 @@ static void __exit ipip_fini(void) module_init(ipip_init); module_exit(ipip_fini); +MODULE_DESCRIPTION("IPv4 XFRM tunnel driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_IPIP); -- cgit v1.2.3 From a46c31bf2744b9807ba5e3ac8fdae2368d8bb3fa Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 8 Feb 2024 08:42:42 -0800 Subject: net: fill in MODULE_DESCRIPTION()s for net/sched W=1 builds now warn if module is built without a MODULE_DESCRIPTION(). Add descriptions to the network schedulers. Suggested-by: Jamal Hadi Salim <jhs@mojatatu.com> Signed-off-by: Breno Leitao <leitao@debian.org> Reviewed-by: Jamal Hadi Salim <jhs@mojatatu.com> Link: https://lore.kernel.org/r/20240208164244.3818498-8-leitao@debian.org Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/sched/em_canid.c | 1 + net/sched/em_cmp.c | 1 + net/sched/em_meta.c | 1 + net/sched/em_nbyte.c | 1 + net/sched/em_text.c | 1 + net/sched/em_u32.c | 1 + 6 files changed, 6 insertions(+) (limited to 'net') diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c index 5ea84decec19..5337bc462755 100644 --- a/net/sched/em_canid.c +++ b/net/sched/em_canid.c @@ -222,6 +222,7 @@ static void __exit exit_em_canid(void) tcf_em_unregister(&em_canid_ops); } +MODULE_DESCRIPTION("ematch classifier to match CAN IDs embedded in skb CAN frames"); MODULE_LICENSE("GPL"); module_init(init_em_canid); diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c index f17b049ea530..c90ad7ea26b4 100644 --- a/net/sched/em_cmp.c +++ b/net/sched/em_cmp.c @@ -87,6 +87,7 @@ static void __exit exit_em_cmp(void) tcf_em_unregister(&em_cmp_ops); } +MODULE_DESCRIPTION("ematch classifier for basic data types(8/16/32 bit) against skb data"); MODULE_LICENSE("GPL"); module_init(init_em_cmp); diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 09d8afd04a2a..8996c73c9779 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -1006,6 +1006,7 @@ static void __exit exit_em_meta(void) tcf_em_unregister(&em_meta_ops); } +MODULE_DESCRIPTION("ematch classifier for various internal kernel metadata, skb metadata and sk metadata"); MODULE_LICENSE("GPL"); module_init(init_em_meta); diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index a83b237cbeb0..4f9f21a05d5e 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -68,6 +68,7 @@ static void __exit exit_em_nbyte(void) tcf_em_unregister(&em_nbyte_ops); } +MODULE_DESCRIPTION("ematch classifier for arbitrary skb multi-bytes"); MODULE_LICENSE("GPL"); module_init(init_em_nbyte); diff --git a/net/sched/em_text.c b/net/sched/em_text.c index f176afb70559..420c66203b17 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -147,6 +147,7 @@ static void __exit exit_em_text(void) tcf_em_unregister(&em_text_ops); } +MODULE_DESCRIPTION("ematch classifier for embedded text in skbs"); MODULE_LICENSE("GPL"); module_init(init_em_text); diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c index 71b070da0437..fdec4db5ec89 100644 --- a/net/sched/em_u32.c +++ b/net/sched/em_u32.c @@ -52,6 +52,7 @@ static void __exit exit_em_u32(void) tcf_em_unregister(&em_u32_ops); } +MODULE_DESCRIPTION("ematch skb classifier using 32 bit chunks of data"); MODULE_LICENSE("GPL"); module_init(init_em_u32); -- cgit v1.2.3 From c57ca512f3b68ddcd62bda9cc24a8f5584ab01b1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 6 Feb 2024 17:18:18 -0800 Subject: net: tls: factor out tls_*crypt_async_wait() Factor out waiting for async encrypt and decrypt to finish. There are already multiple copies and a subsequent fix will need more. No functional changes. Note that crypto_wait_req() returns wait->err Signed-off-by: Jakub Kicinski <kuba@kernel.org> Reviewed-by: Simon Horman <horms@kernel.org> Reviewed-by: Sabrina Dubroca <sd@queasysnail.net> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tls/tls_sw.c | 96 ++++++++++++++++++++++++++------------------------------ 1 file changed, 45 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 31e8a94dfc11..6a73714f34cc 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -230,6 +230,20 @@ static void tls_decrypt_done(void *data, int err) spin_unlock_bh(&ctx->decrypt_compl_lock); } +static int tls_decrypt_async_wait(struct tls_sw_context_rx *ctx) +{ + int pending; + + spin_lock_bh(&ctx->decrypt_compl_lock); + reinit_completion(&ctx->async_wait.completion); + pending = atomic_read(&ctx->decrypt_pending); + spin_unlock_bh(&ctx->decrypt_compl_lock); + if (pending) + crypto_wait_req(-EINPROGRESS, &ctx->async_wait); + + return ctx->async_wait.err; +} + static int tls_do_decryption(struct sock *sk, struct scatterlist *sgin, struct scatterlist *sgout, @@ -495,6 +509,28 @@ static void tls_encrypt_done(void *data, int err) schedule_delayed_work(&ctx->tx_work.work, 1); } +static int tls_encrypt_async_wait(struct tls_sw_context_tx *ctx) +{ + int pending; + + spin_lock_bh(&ctx->encrypt_compl_lock); + ctx->async_notify = true; + + pending = atomic_read(&ctx->encrypt_pending); + spin_unlock_bh(&ctx->encrypt_compl_lock); + if (pending) + crypto_wait_req(-EINPROGRESS, &ctx->async_wait); + else + reinit_completion(&ctx->async_wait.completion); + + /* There can be no concurrent accesses, since we have no + * pending encrypt operations + */ + WRITE_ONCE(ctx->async_notify, false); + + return ctx->async_wait.err; +} + static int tls_do_encryption(struct sock *sk, struct tls_context *tls_ctx, struct tls_sw_context_tx *ctx, @@ -984,7 +1020,6 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, int num_zc = 0; int orig_size; int ret = 0; - int pending; if (!eor && (msg->msg_flags & MSG_EOR)) return -EINVAL; @@ -1163,24 +1198,12 @@ trim_sgl: if (!num_async) { goto send_end; } else if (num_zc) { - /* Wait for pending encryptions to get completed */ - spin_lock_bh(&ctx->encrypt_compl_lock); - ctx->async_notify = true; - - pending = atomic_read(&ctx->encrypt_pending); - spin_unlock_bh(&ctx->encrypt_compl_lock); - if (pending) - crypto_wait_req(-EINPROGRESS, &ctx->async_wait); - else - reinit_completion(&ctx->async_wait.completion); - - /* There can be no concurrent accesses, since we have no - * pending encrypt operations - */ - WRITE_ONCE(ctx->async_notify, false); + int err; - if (ctx->async_wait.err) { - ret = ctx->async_wait.err; + /* Wait for pending encryptions to get completed */ + err = tls_encrypt_async_wait(ctx); + if (err) { + ret = err; copied = 0; } } @@ -1229,7 +1252,6 @@ void tls_sw_splice_eof(struct socket *sock) ssize_t copied = 0; bool retrying = false; int ret = 0; - int pending; if (!ctx->open_rec) return; @@ -1264,22 +1286,7 @@ retry: } /* Wait for pending encryptions to get completed */ - spin_lock_bh(&ctx->encrypt_compl_lock); - ctx->async_notify = true; - - pending = atomic_read(&ctx->encrypt_pending); - spin_unlock_bh(&ctx->encrypt_compl_lock); - if (pending) - crypto_wait_req(-EINPROGRESS, &ctx->async_wait); - else - reinit_completion(&ctx->async_wait.completion); - - /* There can be no concurrent accesses, since we have no pending - * encrypt operations - */ - WRITE_ONCE(ctx->async_notify, false); - - if (ctx->async_wait.err) + if (tls_encrypt_async_wait(ctx)) goto unlock; /* Transmit if any encryptions have completed */ @@ -2109,16 +2116,10 @@ put_on_rx_list: recv_end: if (async) { - int ret, pending; + int ret; /* Wait for all previously submitted records to be decrypted */ - spin_lock_bh(&ctx->decrypt_compl_lock); - reinit_completion(&ctx->async_wait.completion); - pending = atomic_read(&ctx->decrypt_pending); - spin_unlock_bh(&ctx->decrypt_compl_lock); - ret = 0; - if (pending) - ret = crypto_wait_req(-EINPROGRESS, &ctx->async_wait); + ret = tls_decrypt_async_wait(ctx); __skb_queue_purge(&ctx->async_hold); if (ret) { @@ -2435,16 +2436,9 @@ void tls_sw_release_resources_tx(struct sock *sk) struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx); struct tls_rec *rec, *tmp; - int pending; /* Wait for any pending async encryptions to complete */ - spin_lock_bh(&ctx->encrypt_compl_lock); - ctx->async_notify = true; - pending = atomic_read(&ctx->encrypt_pending); - spin_unlock_bh(&ctx->encrypt_compl_lock); - - if (pending) - crypto_wait_req(-EINPROGRESS, &ctx->async_wait); + tls_encrypt_async_wait(ctx); tls_tx_records(sk, -1); -- cgit v1.2.3 From aec7961916f3f9e88766e2688992da6980f11b8d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 6 Feb 2024 17:18:19 -0800 Subject: tls: fix race between async notify and socket close The submitting thread (one which called recvmsg/sendmsg) may exit as soon as the async crypto handler calls complete() so any code past that point risks touching already freed data. Try to avoid the locking and extra flags altogether. Have the main thread hold an extra reference, this way we can depend solely on the atomic ref counter for synchronization. Don't futz with reiniting the completion, either, we are now tightly controlling when completion fires. Reported-by: valis <sec@valis.email> Fixes: 0cada33241d9 ("net/tls: fix race condition causing kernel panic") Signed-off-by: Jakub Kicinski <kuba@kernel.org> Reviewed-by: Simon Horman <horms@kernel.org> Reviewed-by: Eric Dumazet <edumazet@google.com> Reviewed-by: Sabrina Dubroca <sd@queasysnail.net> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/net/tls.h | 5 ----- net/tls/tls_sw.c | 43 ++++++++++--------------------------------- 2 files changed, 10 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/include/net/tls.h b/include/net/tls.h index 962f0c501111..340ad43971e4 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -97,9 +97,6 @@ struct tls_sw_context_tx { struct tls_rec *open_rec; struct list_head tx_list; atomic_t encrypt_pending; - /* protect crypto_wait with encrypt_pending */ - spinlock_t encrypt_compl_lock; - int async_notify; u8 async_capable:1; #define BIT_TX_SCHEDULED 0 @@ -136,8 +133,6 @@ struct tls_sw_context_rx { struct tls_strparser strp; atomic_t decrypt_pending; - /* protect crypto_wait with decrypt_pending*/ - spinlock_t decrypt_compl_lock; struct sk_buff_head async_hold; struct wait_queue_head wq; }; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 6a73714f34cc..635305bebfef 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -224,22 +224,15 @@ static void tls_decrypt_done(void *data, int err) kfree(aead_req); - spin_lock_bh(&ctx->decrypt_compl_lock); - if (!atomic_dec_return(&ctx->decrypt_pending)) + if (atomic_dec_and_test(&ctx->decrypt_pending)) complete(&ctx->async_wait.completion); - spin_unlock_bh(&ctx->decrypt_compl_lock); } static int tls_decrypt_async_wait(struct tls_sw_context_rx *ctx) { - int pending; - - spin_lock_bh(&ctx->decrypt_compl_lock); - reinit_completion(&ctx->async_wait.completion); - pending = atomic_read(&ctx->decrypt_pending); - spin_unlock_bh(&ctx->decrypt_compl_lock); - if (pending) + if (!atomic_dec_and_test(&ctx->decrypt_pending)) crypto_wait_req(-EINPROGRESS, &ctx->async_wait); + atomic_inc(&ctx->decrypt_pending); return ctx->async_wait.err; } @@ -267,6 +260,7 @@ static int tls_do_decryption(struct sock *sk, aead_request_set_callback(aead_req, CRYPTO_TFM_REQ_MAY_BACKLOG, tls_decrypt_done, aead_req); + DEBUG_NET_WARN_ON_ONCE(atomic_read(&ctx->decrypt_pending) < 1); atomic_inc(&ctx->decrypt_pending); } else { aead_request_set_callback(aead_req, @@ -455,7 +449,6 @@ static void tls_encrypt_done(void *data, int err) struct sk_msg *msg_en; bool ready = false; struct sock *sk; - int pending; msg_en = &rec->msg_encrypted; @@ -494,12 +487,8 @@ static void tls_encrypt_done(void *data, int err) ready = true; } - spin_lock_bh(&ctx->encrypt_compl_lock); - pending = atomic_dec_return(&ctx->encrypt_pending); - - if (!pending && ctx->async_notify) + if (atomic_dec_and_test(&ctx->encrypt_pending)) complete(&ctx->async_wait.completion); - spin_unlock_bh(&ctx->encrypt_compl_lock); if (!ready) return; @@ -511,22 +500,9 @@ static void tls_encrypt_done(void *data, int err) static int tls_encrypt_async_wait(struct tls_sw_context_tx *ctx) { - int pending; - - spin_lock_bh(&ctx->encrypt_compl_lock); - ctx->async_notify = true; - - pending = atomic_read(&ctx->encrypt_pending); - spin_unlock_bh(&ctx->encrypt_compl_lock); - if (pending) + if (!atomic_dec_and_test(&ctx->encrypt_pending)) crypto_wait_req(-EINPROGRESS, &ctx->async_wait); - else - reinit_completion(&ctx->async_wait.completion); - - /* There can be no concurrent accesses, since we have no - * pending encrypt operations - */ - WRITE_ONCE(ctx->async_notify, false); + atomic_inc(&ctx->encrypt_pending); return ctx->async_wait.err; } @@ -577,6 +553,7 @@ static int tls_do_encryption(struct sock *sk, /* Add the record in tx_list */ list_add_tail((struct list_head *)&rec->list, &ctx->tx_list); + DEBUG_NET_WARN_ON_ONCE(atomic_read(&ctx->encrypt_pending) < 1); atomic_inc(&ctx->encrypt_pending); rc = crypto_aead_encrypt(aead_req); @@ -2601,7 +2578,7 @@ static struct tls_sw_context_tx *init_ctx_tx(struct tls_context *ctx, struct soc } crypto_init_wait(&sw_ctx_tx->async_wait); - spin_lock_init(&sw_ctx_tx->encrypt_compl_lock); + atomic_set(&sw_ctx_tx->encrypt_pending, 1); INIT_LIST_HEAD(&sw_ctx_tx->tx_list); INIT_DELAYED_WORK(&sw_ctx_tx->tx_work.work, tx_work_handler); sw_ctx_tx->tx_work.sk = sk; @@ -2622,7 +2599,7 @@ static struct tls_sw_context_rx *init_ctx_rx(struct tls_context *ctx) } crypto_init_wait(&sw_ctx_rx->async_wait); - spin_lock_init(&sw_ctx_rx->decrypt_compl_lock); + atomic_set(&sw_ctx_rx->decrypt_pending, 1); init_waitqueue_head(&sw_ctx_rx->wq); skb_queue_head_init(&sw_ctx_rx->rx_list); skb_queue_head_init(&sw_ctx_rx->async_hold); -- cgit v1.2.3 From e01e3934a1b2d122919f73bc6ddbe1cdafc4bbdb Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 6 Feb 2024 17:18:20 -0800 Subject: tls: fix race between tx work scheduling and socket close Similarly to previous commit, the submitting thread (recvmsg/sendmsg) may exit as soon as the async crypto handler calls complete(). Reorder scheduling the work before calling complete(). This seems more logical in the first place, as it's the inverse order of what the submitting thread will do. Reported-by: valis <sec@valis.email> Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption of records for performance") Signed-off-by: Jakub Kicinski <kuba@kernel.org> Reviewed-by: Simon Horman <horms@kernel.org> Reviewed-by: Sabrina Dubroca <sd@queasysnail.net> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tls/tls_sw.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 635305bebfef..9374a61cef00 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -447,7 +447,6 @@ static void tls_encrypt_done(void *data, int err) struct tls_rec *rec = data; struct scatterlist *sge; struct sk_msg *msg_en; - bool ready = false; struct sock *sk; msg_en = &rec->msg_encrypted; @@ -483,19 +482,16 @@ static void tls_encrypt_done(void *data, int err) /* If received record is at head of tx_list, schedule tx */ first_rec = list_first_entry(&ctx->tx_list, struct tls_rec, list); - if (rec == first_rec) - ready = true; + if (rec == first_rec) { + /* Schedule the transmission */ + if (!test_and_set_bit(BIT_TX_SCHEDULED, + &ctx->tx_bitmask)) + schedule_delayed_work(&ctx->tx_work.work, 1); + } } if (atomic_dec_and_test(&ctx->encrypt_pending)) complete(&ctx->async_wait.completion); - - if (!ready) - return; - - /* Schedule the transmission */ - if (!test_and_set_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) - schedule_delayed_work(&ctx->tx_work.work, 1); } static int tls_encrypt_async_wait(struct tls_sw_context_tx *ctx) -- cgit v1.2.3 From 8590541473188741055d27b955db0777569438e3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 6 Feb 2024 17:18:21 -0800 Subject: net: tls: handle backlogging of crypto requests Since we're setting the CRYPTO_TFM_REQ_MAY_BACKLOG flag on our requests to the crypto API, crypto_aead_{encrypt,decrypt} can return -EBUSY instead of -EINPROGRESS in valid situations. For example, when the cryptd queue for AESNI is full (easy to trigger with an artificially low cryptd.cryptd_max_cpu_qlen), requests will be enqueued to the backlog but still processed. In that case, the async callback will also be called twice: first with err == -EINPROGRESS, which it seems we can just ignore, then with err == 0. Compared to Sabrina's original patch this version uses the new tls_*crypt_async_wait() helpers and converts the EBUSY to EINPROGRESS to avoid having to modify all the error handling paths. The handling is identical. Fixes: a54667f6728c ("tls: Add support for encryption using async offload accelerator") Fixes: 94524d8fc965 ("net/tls: Add support for async decryption of tls records") Co-developed-by: Sabrina Dubroca <sd@queasysnail.net> Signed-off-by: Sabrina Dubroca <sd@queasysnail.net> Link: https://lore.kernel.org/netdev/9681d1febfec295449a62300938ed2ae66983f28.1694018970.git.sd@queasysnail.net/ Signed-off-by: Jakub Kicinski <kuba@kernel.org> Reviewed-by: Simon Horman <horms@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tls/tls_sw.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 9374a61cef00..63bef5666e36 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -196,6 +196,17 @@ static void tls_decrypt_done(void *data, int err) struct sock *sk; int aead_size; + /* If requests get too backlogged crypto API returns -EBUSY and calls + * ->complete(-EINPROGRESS) immediately followed by ->complete(0) + * to make waiting for backlog to flush with crypto_wait_req() easier. + * First wait converts -EBUSY -> -EINPROGRESS, and the second one + * -EINPROGRESS -> 0. + * We have a single struct crypto_async_request per direction, this + * scheme doesn't help us, so just ignore the first ->complete(). + */ + if (err == -EINPROGRESS) + return; + aead_size = sizeof(*aead_req) + crypto_aead_reqsize(aead); aead_size = ALIGN(aead_size, __alignof__(*dctx)); dctx = (void *)((u8 *)aead_req + aead_size); @@ -269,6 +280,10 @@ static int tls_do_decryption(struct sock *sk, } ret = crypto_aead_decrypt(aead_req); + if (ret == -EBUSY) { + ret = tls_decrypt_async_wait(ctx); + ret = ret ?: -EINPROGRESS; + } if (ret == -EINPROGRESS) { if (darg->async) return 0; @@ -449,6 +464,9 @@ static void tls_encrypt_done(void *data, int err) struct sk_msg *msg_en; struct sock *sk; + if (err == -EINPROGRESS) /* see the comment in tls_decrypt_done() */ + return; + msg_en = &rec->msg_encrypted; sk = rec->sk; @@ -553,6 +571,10 @@ static int tls_do_encryption(struct sock *sk, atomic_inc(&ctx->encrypt_pending); rc = crypto_aead_encrypt(aead_req); + if (rc == -EBUSY) { + rc = tls_encrypt_async_wait(ctx); + rc = rc ?: -EINPROGRESS; + } if (!rc || rc != -EINPROGRESS) { atomic_dec(&ctx->encrypt_pending); sge->offset -= prot->prepend_size; -- cgit v1.2.3 From 32b55c5ff9103b8508c1e04bfa5a08c64e7a925f Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Tue, 6 Feb 2024 17:18:22 -0800 Subject: net: tls: fix use-after-free with partial reads and async decrypt tls_decrypt_sg doesn't take a reference on the pages from clear_skb, so the put_page() in tls_decrypt_done releases them, and we trigger a use-after-free in process_rx_list when we try to read from the partially-read skb. Fixes: fd31f3996af2 ("tls: rx: decrypt into a fresh skb") Signed-off-by: Sabrina Dubroca <sd@queasysnail.net> Signed-off-by: Jakub Kicinski <kuba@kernel.org> Reviewed-by: Simon Horman <horms@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tls/tls_sw.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 63bef5666e36..a6eff21ade23 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -63,6 +63,7 @@ struct tls_decrypt_ctx { u8 iv[TLS_MAX_IV_SIZE]; u8 aad[TLS_MAX_AAD_SIZE]; u8 tail; + bool free_sgout; struct scatterlist sg[]; }; @@ -187,7 +188,6 @@ static void tls_decrypt_done(void *data, int err) struct aead_request *aead_req = data; struct crypto_aead *aead = crypto_aead_reqtfm(aead_req); struct scatterlist *sgout = aead_req->dst; - struct scatterlist *sgin = aead_req->src; struct tls_sw_context_rx *ctx; struct tls_decrypt_ctx *dctx; struct tls_context *tls_ctx; @@ -224,7 +224,7 @@ static void tls_decrypt_done(void *data, int err) } /* Free the destination pages if skb was not decrypted inplace */ - if (sgout != sgin) { + if (dctx->free_sgout) { /* Skip the first S/G entry as it points to AAD */ for_each_sg(sg_next(sgout), sg, UINT_MAX, pages) { if (!sg) @@ -1583,6 +1583,7 @@ static int tls_decrypt_sg(struct sock *sk, struct iov_iter *out_iov, } else if (out_sg) { memcpy(sgout, out_sg, n_sgout * sizeof(*sgout)); } + dctx->free_sgout = !!pages; /* Prepare and submit AEAD request */ err = tls_do_decryption(sk, sgin, sgout, dctx->iv, -- cgit v1.2.3 From ac437a51ce662364062f704e321227f6728e6adc Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 6 Feb 2024 17:18:24 -0800 Subject: net: tls: fix returned read length with async decrypt We double count async, non-zc rx data. The previous fix was lucky because if we fully zc async_copy_bytes is 0 so we add 0. Decrypted already has all the bytes we handled, in all cases. We don't have to adjust anything, delete the erroneous line. Fixes: 4d42cd6bc2ac ("tls: rx: fix return value for async crypto") Co-developed-by: Sabrina Dubroca <sd@queasysnail.net> Signed-off-by: Sabrina Dubroca <sd@queasysnail.net> Signed-off-by: Jakub Kicinski <kuba@kernel.org> Reviewed-by: Simon Horman <horms@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tls/tls_sw.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index a6eff21ade23..9fbc70200cd0 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2132,7 +2132,6 @@ recv_end: else err = process_rx_list(ctx, msg, &control, 0, async_copy_bytes, is_peek); - decrypted += max(err, 0); } copied += decrypted; -- cgit v1.2.3 From 119ff04864a24470b1e531bb53e5c141aa8fefb0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Feb 2024 14:43:21 +0000 Subject: tcp: move tp->scaling_ratio to tcp_sock_read_txrx group tp->scaling_ratio is a read mostly field, used in rx and tx fast paths. Fixes: d5fed5addb2b ("tcp: reorganize tcp_sock fast path variables") Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Coco Li <lixiaoyan@google.com> Cc: Wei Wang <weiwan@google.com> Reviewed-by: Simon Horman <horms@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- Documentation/networking/net_cachelines/tcp_sock.rst | 2 +- include/linux/tcp.h | 2 +- net/ipv4/tcp.c | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index 97d7a5c8e01c..803912291479 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -38,7 +38,7 @@ u32 max_window read_mostly - u32 mss_cache read_mostly read_mostly tcp_rate_check_app_limited,tcp_current_mss,tcp_sync_mss,tcp_sndbuf_expand,tcp_tso_should_defer(tx);tcp_update_pacing_rate,tcp_clean_rtx_queue(rx) u32 window_clamp read_mostly read_write tcp_rcv_space_adjust,__tcp_select_window u32 rcv_ssthresh read_mostly - __tcp_select_window -u82 scaling_ratio +u8 scaling_ratio read_mostly read_mostly tcp_win_from_space struct tcp_rack u16 advmss - read_mostly tcp_rcv_space_adjust u8 compressed_ack diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 89b290d8c8dc..168f5dca6609 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -221,6 +221,7 @@ struct tcp_sock { u32 lost_out; /* Lost packets */ u32 sacked_out; /* SACK'd packets */ u16 tcp_header_len; /* Bytes of tcp header to send */ + u8 scaling_ratio; /* see tcp_win_from_space() */ u8 chrono_type : 2, /* current chronograph type */ repair : 1, is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ @@ -352,7 +353,6 @@ struct tcp_sock { u32 compressed_ack_rcv_nxt; struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ - u8 scaling_ratio; /* see tcp_win_from_space() */ /* Information of the most recently (s)acked skb */ struct tcp_rack { u64 mstamp; /* (Re)sent time of the skb */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7e2481b9eae1..c82dc42f57c6 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4615,7 +4615,8 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, prr_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_txrx, 31); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, scaling_ratio); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_txrx, 32); /* RX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq); -- cgit v1.2.3 From c353c7b7ffb7ae6ed8f3339906fe33c8be6cf344 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 8 Feb 2024 14:43:23 +0000 Subject: net-device: move lstats in net_device_read_txrx dev->lstats is notably used from loopback ndo_start_xmit() and other virtual drivers. Per cpu stats updates are dirtying per-cpu data, but the pointer itself is read-only. Fixes: 43a71cd66b9c ("net-device: reorganize net_device fast path variables") Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Coco Li <lixiaoyan@google.com> Cc: Simon Horman <horms@kernel.org> Reviewed-by: Simon Horman <horms@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- Documentation/networking/net_cachelines/net_device.rst | 4 ++-- include/linux/netdevice.h | 10 +++++----- net/core/dev.c | 3 ++- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index e75a53593bb9..dceb49d56a91 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -136,8 +136,8 @@ struct_netpoll_info* npinfo - possible_net_t nd_net - read_mostly (dev_net)napi_busy_loop,tcp_v(4/6)_rcv,ip(v6)_rcv,ip(6)_input,ip(6)_input_finish void* ml_priv enum_netdev_ml_priv_type ml_priv_type -struct_pcpu_lstats__percpu* lstats -struct_pcpu_sw_netstats__percpu* tstats +struct_pcpu_lstats__percpu* lstats read_mostly dev_lstats_add() +struct_pcpu_sw_netstats__percpu* tstats read_mostly dev_sw_netstats_tx_add() struct_pcpu_dstats__percpu* dstats struct_garp_port* garp_port struct_mrp_port* mrp_port diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 118c40258d07..ef7bfbb98497 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2141,6 +2141,11 @@ struct net_device { /* TXRX read-mostly hotpath */ __cacheline_group_begin(net_device_read_txrx); + union { + struct pcpu_lstats __percpu *lstats; + struct pcpu_sw_netstats __percpu *tstats; + struct pcpu_dstats __percpu *dstats; + }; unsigned int flags; unsigned short hard_header_len; netdev_features_t features; @@ -2395,11 +2400,6 @@ struct net_device { enum netdev_ml_priv_type ml_priv_type; enum netdev_stat_type pcpu_stat_type:8; - union { - struct pcpu_lstats __percpu *lstats; - struct pcpu_sw_netstats __percpu *tstats; - struct pcpu_dstats __percpu *dstats; - }; #if IS_ENABLED(CONFIG_GARP) struct garp_port __rcu *garp_port; diff --git a/net/core/dev.c b/net/core/dev.c index cb2dab0feee0..9bb792cecc16 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11652,11 +11652,12 @@ static void __init net_dev_struct_check(void) CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_tx, 160); /* TXRX read-mostly hotpath */ + CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, lstats); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, flags); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, hard_header_len); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, features); CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_txrx, ip6_ptr); - CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 30); + CACHELINE_ASSERT_GROUP_SIZE(struct net_device, net_device_read_txrx, 38); /* RX read-mostly hotpath */ CACHELINE_ASSERT_GROUP_MEMBER(struct net_device, net_device_read_rx, ptype_specific); -- cgit v1.2.3 From bdd70eb68913c960acb895b00a8c62eb64715b1f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 8 Feb 2024 19:03:49 +0100 Subject: mptcp: drop the push_pending field Such field is there to avoid acquiring the data lock in a few spots, but it adds complexity to the already non trivial locking schema. All the relevant call sites (mptcp-level re-injection, set socket options), are slow-path, drop such field in favor of 'cb_flags', adding the relevant locking. This patch could be seen as an improvement, instead of a fix. But it simplifies the next patch. The 'Fixes' tag has been added to help having this series backported to stable. Fixes: e9d09baca676 ("mptcp: avoid atomic bit manipulation when possible") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni <pabeni@redhat.com> Reviewed-by: Mat Martineau <martineau@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/mptcp/protocol.c | 12 ++++++------ net/mptcp/protocol.h | 1 - 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 028e8b473626..2111819016af 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1505,8 +1505,11 @@ static void mptcp_update_post_push(struct mptcp_sock *msk, void mptcp_check_and_set_pending(struct sock *sk) { - if (mptcp_send_head(sk)) - mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING); + if (mptcp_send_head(sk)) { + mptcp_data_lock(sk); + mptcp_sk(sk)->cb_flags |= BIT(MPTCP_PUSH_PENDING); + mptcp_data_unlock(sk); + } } static int __subflow_push_pending(struct sock *sk, struct sock *ssk, @@ -3142,7 +3145,6 @@ static int mptcp_disconnect(struct sock *sk, int flags) mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); WRITE_ONCE(msk->flags, 0); msk->cb_flags = 0; - msk->push_pending = 0; msk->recovery = false; msk->can_ack = false; msk->fully_established = false; @@ -3330,8 +3332,7 @@ static void mptcp_release_cb(struct sock *sk) struct mptcp_sock *msk = mptcp_sk(sk); for (;;) { - unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) | - msk->push_pending; + unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED); struct list_head join_list; if (!flags) @@ -3347,7 +3348,6 @@ static void mptcp_release_cb(struct sock *sk) * datapath acquires the msk socket spinlock while helding * the subflow socket lock */ - msk->push_pending = 0; msk->cb_flags &= ~flags; spin_unlock_bh(&sk->sk_lock.slock); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 3517f2d24a22..b905f1868298 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -286,7 +286,6 @@ struct mptcp_sock { int rmem_released; unsigned long flags; unsigned long cb_flags; - unsigned long push_pending; bool recovery; /* closing subflow write queue reinjected */ bool can_ack; bool fully_established; -- cgit v1.2.3 From 013e3179dbd2bc756ce1dd90354abac62f65b739 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 8 Feb 2024 19:03:50 +0100 Subject: mptcp: fix rcv space initialization mptcp_rcv_space_init() is supposed to happen under the msk socket lock, but active msk socket does that without such protection. Leverage the existing mptcp_propagate_state() helper to that extent. We need to ensure mptcp_rcv_space_init will happen before mptcp_rcv_space_adjust(), and the release_cb does not assure that: explicitly check for such condition. While at it, move the wnd_end initialization out of mptcp_rcv_space_init(), it never belonged there. Note that the race does not produce ill effect in practice, but change allows cleaning-up and defying better the locking model. Fixes: a6b118febbab ("mptcp: add receive buffer auto-tuning") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni <pabeni@redhat.com> Reviewed-by: Mat Martineau <martineau@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/mptcp/protocol.c | 10 ++++++---- net/mptcp/protocol.h | 3 ++- net/mptcp/subflow.c | 4 ++-- 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 2111819016af..7632eafb683b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1963,6 +1963,9 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) if (copied <= 0) return; + if (!msk->rcvspace_init) + mptcp_rcv_space_init(msk, msk->first); + msk->rcvq_space.copied += copied; mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC); @@ -3160,6 +3163,7 @@ static int mptcp_disconnect(struct sock *sk, int flags) msk->bytes_received = 0; msk->bytes_sent = 0; msk->bytes_retrans = 0; + msk->rcvspace_init = 0; WRITE_ONCE(sk->sk_shutdown, 0); sk_error_report(sk); @@ -3247,6 +3251,7 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) { const struct tcp_sock *tp = tcp_sk(ssk); + msk->rcvspace_init = 1; msk->rcvq_space.copied = 0; msk->rcvq_space.rtt_us = 0; @@ -3257,8 +3262,6 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) TCP_INIT_CWND * tp->advmss); if (msk->rcvq_space.space == 0) msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; - - WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); } void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) @@ -3478,10 +3481,9 @@ void mptcp_finish_connect(struct sock *ssk) WRITE_ONCE(msk->write_seq, subflow->idsn + 1); WRITE_ONCE(msk->snd_nxt, msk->write_seq); WRITE_ONCE(msk->snd_una, msk->write_seq); + WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); mptcp_pm_new_connection(msk, ssk, 0); - - mptcp_rcv_space_init(msk, ssk); } void mptcp_sock_graft(struct sock *sk, struct socket *parent) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index b905f1868298..9f5ee82e3473 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -304,7 +304,8 @@ struct mptcp_sock { nodelay:1, fastopening:1, in_accept_queue:1, - free_first:1; + free_first:1, + rcvspace_init:1; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 0dcb721c89d1..56b2ac2f2f22 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -424,6 +424,8 @@ void __mptcp_sync_state(struct sock *sk, int state) struct mptcp_sock *msk = mptcp_sk(sk); __mptcp_propagate_sndbuf(sk, msk->first); + if (!msk->rcvspace_init) + mptcp_rcv_space_init(msk, msk->first); if (sk->sk_state == TCP_SYN_SENT) { mptcp_set_state(sk, state); sk->sk_state_change(sk); @@ -545,7 +547,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) } } else if (mptcp_check_fallback(sk)) { fallback: - mptcp_rcv_space_init(msk, sk); mptcp_propagate_state(parent, sk); } return; @@ -1744,7 +1745,6 @@ static void subflow_state_change(struct sock *sk) msk = mptcp_sk(parent); if (subflow_simultaneous_connect(sk)) { mptcp_do_fallback(sk); - mptcp_rcv_space_init(msk, sk); pr_fallback(msk); subflow->conn_finished = 1; mptcp_propagate_state(parent, sk); -- cgit v1.2.3 From 3f83d8a77eeeb47011b990fd766a421ee64f1d73 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 8 Feb 2024 19:03:51 +0100 Subject: mptcp: fix more tx path fields initialization The 'msk->write_seq' and 'msk->snd_nxt' are always updated under the msk socket lock, except at MPC handshake completiont time. Builds-up on the previous commit to move such init under the relevant lock. There are no known problems caused by the potential race, the primary goal is consistency. Fixes: 6d0060f600ad ("mptcp: Write MPTCP DSS headers to outgoing data packets") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni <pabeni@redhat.com> Reviewed-by: Mat Martineau <martineau@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/mptcp/protocol.c | 6 ++---- net/mptcp/subflow.c | 13 +++++++++++-- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 7632eafb683b..8cb6a873dae9 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3478,10 +3478,8 @@ void mptcp_finish_connect(struct sock *ssk) * accessing the field below */ WRITE_ONCE(msk->local_key, subflow->local_key); - WRITE_ONCE(msk->write_seq, subflow->idsn + 1); - WRITE_ONCE(msk->snd_nxt, msk->write_seq); - WRITE_ONCE(msk->snd_una, msk->write_seq); - WRITE_ONCE(msk->wnd_end, msk->snd_nxt + tcp_sk(ssk)->snd_wnd); + WRITE_ONCE(msk->snd_una, subflow->idsn + 1); + WRITE_ONCE(msk->wnd_end, subflow->idsn + 1 + tcp_sk(ssk)->snd_wnd); mptcp_pm_new_connection(msk, ssk, 0); } diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 56b2ac2f2f22..c2df34ebcf28 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -421,12 +421,21 @@ static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct soc void __mptcp_sync_state(struct sock *sk, int state) { + struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); + struct sock *ssk = msk->first; - __mptcp_propagate_sndbuf(sk, msk->first); + subflow = mptcp_subflow_ctx(ssk); + __mptcp_propagate_sndbuf(sk, ssk); if (!msk->rcvspace_init) - mptcp_rcv_space_init(msk, msk->first); + mptcp_rcv_space_init(msk, ssk); + if (sk->sk_state == TCP_SYN_SENT) { + /* subflow->idsn is always available is TCP_SYN_SENT state, + * even for the FASTOPEN scenarios + */ + WRITE_ONCE(msk->write_seq, subflow->idsn + 1); + WRITE_ONCE(msk->snd_nxt, msk->write_seq); mptcp_set_state(sk, state); sk->sk_state_change(sk); } -- cgit v1.2.3 From e4a0fa47e816e186f6b4c0055d07eeec42d11871 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 8 Feb 2024 19:03:52 +0100 Subject: mptcp: corner case locking for rx path fields initialization Most MPTCP-level related fields are under the mptcp data lock protection, but are written one-off without such lock at MPC complete time, both for the client and the server Leverage the mptcp_propagate_state() infrastructure to move such initialization under the proper lock client-wise. The server side critical init steps are done by mptcp_subflow_fully_established(): ensure the caller properly held the relevant lock, and avoid acquiring the same lock in the nested scopes. There are no real potential races, as write access to such fields is implicitly serialized by the MPTCP state machine; the primary goal is consistency. Fixes: d22f4988ffec ("mptcp: process MP_CAPABLE data option") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni <pabeni@redhat.com> Reviewed-by: Mat Martineau <martineau@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/mptcp/fastopen.c | 6 ++---- net/mptcp/options.c | 9 +++++---- net/mptcp/protocol.c | 9 ++++++--- net/mptcp/protocol.h | 9 +++++---- net/mptcp/subflow.c | 56 ++++++++++++++++++++++++++++++---------------------- 5 files changed, 50 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c index 74698582a285..ad28da655f8b 100644 --- a/net/mptcp/fastopen.c +++ b/net/mptcp/fastopen.c @@ -59,13 +59,12 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf mptcp_data_unlock(sk); } -void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, - const struct mptcp_options_received *mp_opt) +void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, + const struct mptcp_options_received *mp_opt) { struct sock *sk = (struct sock *)msk; struct sk_buff *skb; - mptcp_data_lock(sk); skb = skb_peek_tail(&sk->sk_receive_queue); if (skb) { WARN_ON_ONCE(MPTCP_SKB_CB(skb)->end_seq); @@ -77,5 +76,4 @@ void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_ } pr_debug("msk=%p ack_seq=%llx", msk, msk->ack_seq); - mptcp_data_unlock(sk); } diff --git a/net/mptcp/options.c b/net/mptcp/options.c index d2527d189a79..e3e96a49f922 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -962,9 +962,7 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, /* subflows are fully established as soon as we get any * additional ack, including ADD_ADDR. */ - subflow->fully_established = 1; - WRITE_ONCE(msk->fully_established, true); - goto check_notify; + goto set_fully_established; } /* If the first established packet does not contain MP_CAPABLE + data @@ -986,7 +984,10 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, set_fully_established: if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); - mptcp_subflow_fully_established(subflow, mp_opt); + + mptcp_data_lock((struct sock *)msk); + __mptcp_subflow_fully_established(msk, subflow, mp_opt); + mptcp_data_unlock((struct sock *)msk); check_notify: /* if the subflow is not already linked into the conn_list, we can't diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8cb6a873dae9..8ef2927ebca2 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3186,6 +3186,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, { struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); struct sock *nsk = sk_clone_lock(sk, GFP_ATOMIC); + struct mptcp_subflow_context *subflow; struct mptcp_sock *msk; if (!nsk) @@ -3226,7 +3227,8 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, /* The msk maintain a ref to each subflow in the connections list */ WRITE_ONCE(msk->first, ssk); - list_add(&mptcp_subflow_ctx(ssk)->node, &msk->conn_list); + subflow = mptcp_subflow_ctx(ssk); + list_add(&subflow->node, &msk->conn_list); sock_hold(ssk); /* new mpc subflow takes ownership of the newly @@ -3241,6 +3243,9 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, __mptcp_propagate_sndbuf(nsk, ssk); mptcp_rcv_space_init(msk, ssk); + + if (mp_opt->suboptions & OPTION_MPTCP_MPC_ACK) + __mptcp_subflow_fully_established(msk, subflow, mp_opt); bh_unlock_sock(nsk); /* note: the newly allocated socket refcount is 2 now */ @@ -3478,8 +3483,6 @@ void mptcp_finish_connect(struct sock *ssk) * accessing the field below */ WRITE_ONCE(msk->local_key, subflow->local_key); - WRITE_ONCE(msk->snd_una, subflow->idsn + 1); - WRITE_ONCE(msk->wnd_end, subflow->idsn + 1 + tcp_sk(ssk)->snd_wnd); mptcp_pm_new_connection(msk, ssk, 0); } diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 9f5ee82e3473..fefcbf585411 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -622,8 +622,9 @@ unsigned int mptcp_stale_loss_cnt(const struct net *net); unsigned int mptcp_close_timeout(const struct sock *sk); int mptcp_get_pm_type(const struct net *net); const char *mptcp_get_scheduler(const struct net *net); -void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, - const struct mptcp_options_received *mp_opt); +void __mptcp_subflow_fully_established(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow, + const struct mptcp_options_received *mp_opt); bool __mptcp_retransmit_pending_data(struct sock *sk); void mptcp_check_and_set_pending(struct sock *sk); void __mptcp_push_pending(struct sock *sk, unsigned int flags); @@ -952,8 +953,8 @@ void mptcp_event_pm_listener(const struct sock *ssk, enum mptcp_event_type event); bool mptcp_userspace_pm_active(const struct mptcp_sock *msk); -void mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, - const struct mptcp_options_received *mp_opt); +void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, + const struct mptcp_options_received *mp_opt); void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, struct request_sock *req); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index c2df34ebcf28..c34ecadee120 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -441,20 +441,6 @@ void __mptcp_sync_state(struct sock *sk, int state) } } -static void mptcp_propagate_state(struct sock *sk, struct sock *ssk) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - - mptcp_data_lock(sk); - if (!sock_owned_by_user(sk)) { - __mptcp_sync_state(sk, ssk->sk_state); - } else { - msk->pending_state = ssk->sk_state; - __set_bit(MPTCP_SYNC_STATE, &msk->cb_flags); - } - mptcp_data_unlock(sk); -} - static void subflow_set_remote_key(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, const struct mptcp_options_received *mp_opt) @@ -476,6 +462,31 @@ static void subflow_set_remote_key(struct mptcp_sock *msk, atomic64_set(&msk->rcv_wnd_sent, subflow->iasn); } +static void mptcp_propagate_state(struct sock *sk, struct sock *ssk, + struct mptcp_subflow_context *subflow, + const struct mptcp_options_received *mp_opt) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + + mptcp_data_lock(sk); + if (mp_opt) { + /* Options are available only in the non fallback cases + * avoid updating rx path fields otherwise + */ + WRITE_ONCE(msk->snd_una, subflow->idsn + 1); + WRITE_ONCE(msk->wnd_end, subflow->idsn + 1 + tcp_sk(ssk)->snd_wnd); + subflow_set_remote_key(msk, subflow, mp_opt); + } + + if (!sock_owned_by_user(sk)) { + __mptcp_sync_state(sk, ssk->sk_state); + } else { + msk->pending_state = ssk->sk_state; + __set_bit(MPTCP_SYNC_STATE, &msk->cb_flags); + } + mptcp_data_unlock(sk); +} + static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -510,10 +521,9 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) if (mp_opt.deny_join_id0) WRITE_ONCE(msk->pm.remote_deny_join_id0, true); subflow->mp_capable = 1; - subflow_set_remote_key(msk, subflow, &mp_opt); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK); mptcp_finish_connect(sk); - mptcp_propagate_state(parent, sk); + mptcp_propagate_state(parent, sk, subflow, &mp_opt); } else if (subflow->request_join) { u8 hmac[SHA256_DIGEST_SIZE]; @@ -556,7 +566,7 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) } } else if (mptcp_check_fallback(sk)) { fallback: - mptcp_propagate_state(parent, sk); + mptcp_propagate_state(parent, sk, subflow, NULL); } return; @@ -741,17 +751,16 @@ void mptcp_subflow_drop_ctx(struct sock *ssk) kfree_rcu(ctx, rcu); } -void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, - const struct mptcp_options_received *mp_opt) +void __mptcp_subflow_fully_established(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow, + const struct mptcp_options_received *mp_opt) { - struct mptcp_sock *msk = mptcp_sk(subflow->conn); - subflow_set_remote_key(msk, subflow, mp_opt); subflow->fully_established = 1; WRITE_ONCE(msk->fully_established, true); if (subflow->is_mptfo) - mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt); + __mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt); } static struct sock *subflow_syn_recv_sock(const struct sock *sk, @@ -844,7 +853,6 @@ create_child: * mpc option */ if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) { - mptcp_subflow_fully_established(ctx, &mp_opt); mptcp_pm_fully_established(owner, child); ctx->pm_notified = 1; } @@ -1756,7 +1764,7 @@ static void subflow_state_change(struct sock *sk) mptcp_do_fallback(sk); pr_fallback(msk); subflow->conn_finished = 1; - mptcp_propagate_state(parent, sk); + mptcp_propagate_state(parent, sk, subflow, NULL); } /* as recvmsg() does not acquire the subflow socket for ssk selection -- cgit v1.2.3 From f012d796a6de662692159c539689e47e662853a8 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Thu, 8 Feb 2024 19:03:53 +0100 Subject: mptcp: check addrs list in userspace_pm_get_local_id Before adding a new entry in mptcp_userspace_pm_get_local_id(), it's better to check whether this address is already in userspace pm local address list. If it's in the list, no need to add a new entry, just return it's address ID and use this address. Fixes: 8b20137012d9 ("mptcp: read attributes of addr entries managed by userspace PMs") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang <geliang.tang@linux.dev> Reviewed-by: Mat Martineau <martineau@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/mptcp/pm_userspace.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index efecbe3cf415..4f3901d5b8ef 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -130,10 +130,21 @@ int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc) { - struct mptcp_pm_addr_entry new_entry; + struct mptcp_pm_addr_entry *entry = NULL, *e, new_entry; __be16 msk_sport = ((struct inet_sock *) inet_sk((struct sock *)msk))->inet_sport; + spin_lock_bh(&msk->pm.lock); + list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) { + if (mptcp_addresses_equal(&e->addr, skc, false)) { + entry = e; + break; + } + } + spin_unlock_bh(&msk->pm.lock); + if (entry) + return entry->addr.id; + memset(&new_entry, 0, sizeof(struct mptcp_pm_addr_entry)); new_entry.addr = *skc; new_entry.addr.id = 0; -- cgit v1.2.3 From 337cebbd850f94147cee05252778f8f78b8c337f Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 8 Feb 2024 19:03:54 +0100 Subject: mptcp: really cope with fastopen race Fastopen and PM-trigger subflow shutdown can race, as reported by syzkaller. In my first attempt to close such race, I missed the fact that the subflow status can change again before the subflow_state_change callback is invoked. Address the issue additionally copying with all the states directly reachable from TCP_FIN_WAIT1. Fixes: 1e777f39b4d7 ("mptcp: add MSG_FASTOPEN sendmsg flag support") Fixes: 4fd19a307016 ("mptcp: fix inconsistent state on fastopen race") Cc: stable@vger.kernel.org Reported-by: syzbot+c53d4d3ddb327e80bc51@syzkaller.appspotmail.com Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/458 Signed-off-by: Paolo Abeni <pabeni@redhat.com> Reviewed-by: Mat Martineau <martineau@kernel.org> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/mptcp/protocol.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index fefcbf585411..ed50f2015dc3 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1129,7 +1129,8 @@ static inline bool subflow_simultaneous_connect(struct sock *sk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); - return (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1) && + return (1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING) && is_active_ssk(subflow) && !subflow->conn_finished; } -- cgit v1.2.3 From 9f30831390ede02d9fcd54fd9ea5a585ab649f4a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 9 Feb 2024 18:12:48 +0000 Subject: net: add rcu safety to rtnl_prop_list_size() rtnl_prop_list_size() can be called while alternative names are added or removed concurrently. if_nlmsg_size() / rtnl_calcit() can indeed be called without RTNL held. Use explicit RCU protection to avoid UAF. Fixes: 88f4fb0c7496 ("net: rtnetlink: put alternative names to getlink message") Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Jiri Pirko <jiri@nvidia.com> Link: https://lore.kernel.org/r/20240209181248.96637-1-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/core/dev.c | 2 +- net/core/rtnetlink.c | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 9bb792cecc16..73a021973007 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -336,7 +336,7 @@ int netdev_name_node_alt_create(struct net_device *dev, const char *name) return -ENOMEM; netdev_name_node_add(net, name_node); /* The node that holds dev->name acts as a head of per-device list. */ - list_add_tail(&name_node->list, &dev->name_node->list); + list_add_tail_rcu(&name_node->list, &dev->name_node->list); return 0; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f6f29eb03ec2..9c4f427f3a50 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1020,14 +1020,17 @@ static size_t rtnl_xdp_size(void) static size_t rtnl_prop_list_size(const struct net_device *dev) { struct netdev_name_node *name_node; - size_t size; + unsigned int cnt = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(name_node, &dev->name_node->list, list) + cnt++; + rcu_read_unlock(); - if (list_empty(&dev->name_node->list)) + if (!cnt) return 0; - size = nla_total_size(0); - list_for_each_entry(name_node, &dev->name_node->list, list) - size += nla_total_size(ALTIFNAMSIZ); - return size; + + return nla_total_size(0) + cnt * nla_total_size(ALTIFNAMSIZ); } static size_t rtnl_proto_down_size(const struct net_device *dev) -- cgit v1.2.3 From f1acf1ac84d2ae97b7889b87223c1064df850069 Mon Sep 17 00:00:00 2001 From: Allison Henderson Date: Thu, 8 Feb 2024 19:28:54 -0700 Subject: net:rds: Fix possible deadlock in rds_message_put Functions rds_still_queued and rds_clear_recv_queue lock a given socket in order to safely iterate over the incoming rds messages. However calling rds_inc_put while under this lock creates a potential deadlock. rds_inc_put may eventually call rds_message_purge, which will lock m_rs_lock. This is the incorrect locking order since m_rs_lock is meant to be locked before the socket. To fix this, we move the message item to a local list or variable that wont need rs_recv_lock protection. Then we can safely call rds_inc_put on any item stored locally after rs_recv_lock is released. Fixes: bdbe6fbc6a2f ("RDS: recv.c") Reported-by: syzbot+f9db6ff27b9bfdcfeca0@syzkaller.appspotmail.com Reported-by: syzbot+dcd73ff9291e6d34b3ab@syzkaller.appspotmail.com Signed-off-by: Allison Henderson <allison.henderson@oracle.com> Link: https://lore.kernel.org/r/20240209022854.200292-1-allison.henderson@oracle.com Signed-off-by: Paolo Abeni <pabeni@redhat.com> --- net/rds/recv.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/recv.c b/net/rds/recv.c index c71b923764fd..5627f80013f8 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -425,6 +425,7 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, struct sock *sk = rds_rs_to_sk(rs); int ret = 0; unsigned long flags; + struct rds_incoming *to_drop = NULL; write_lock_irqsave(&rs->rs_recv_lock, flags); if (!list_empty(&inc->i_item)) { @@ -435,11 +436,14 @@ static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc, -be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); list_del_init(&inc->i_item); - rds_inc_put(inc); + to_drop = inc; } } write_unlock_irqrestore(&rs->rs_recv_lock, flags); + if (to_drop) + rds_inc_put(to_drop); + rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop); return ret; } @@ -758,16 +762,21 @@ void rds_clear_recv_queue(struct rds_sock *rs) struct sock *sk = rds_rs_to_sk(rs); struct rds_incoming *inc, *tmp; unsigned long flags; + LIST_HEAD(to_drop); write_lock_irqsave(&rs->rs_recv_lock, flags); list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) { rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong, -be32_to_cpu(inc->i_hdr.h_len), inc->i_hdr.h_dport); + list_move(&inc->i_item, &to_drop); + } + write_unlock_irqrestore(&rs->rs_recv_lock, flags); + + list_for_each_entry_safe(inc, tmp, &to_drop, i_item) { list_del_init(&inc->i_item); rds_inc_put(inc); } - write_unlock_irqrestore(&rs->rs_recv_lock, flags); } /* -- cgit v1.2.3 From 25236c91b5ab4a26a56ba2e79b8060cf4e047839 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 9 Feb 2024 14:04:53 -0800 Subject: af_unix: Fix task hung while purging oob_skb in GC. syzbot reported a task hung; at the same time, GC was looping infinitely in list_for_each_entry_safe() for OOB skb. [0] syzbot demonstrated that the list_for_each_entry_safe() was not actually safe in this case. A single skb could have references for multiple sockets. If we free such a skb in the list_for_each_entry_safe(), the current and next sockets could be unlinked in a single iteration. unix_notinflight() uses list_del_init() to unlink the socket, so the prefetched next socket forms a loop itself and list_for_each_entry_safe() never stops. Here, we must use while() and make sure we always fetch the first socket. [0]: Sending NMI from CPU 0 to CPUs 1: NMI backtrace for cpu 1 CPU: 1 PID: 5065 Comm: syz-executor236 Not tainted 6.8.0-rc3-syzkaller-00136-g1f719a2f3fa6 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024 RIP: 0010:preempt_count arch/x86/include/asm/preempt.h:26 [inline] RIP: 0010:check_kcov_mode kernel/kcov.c:173 [inline] RIP: 0010:__sanitizer_cov_trace_pc+0xd/0x60 kernel/kcov.c:207 Code: cc cc cc cc 66 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 65 48 8b 14 25 40 c2 03 00 <65> 8b 05 b4 7c 78 7e a9 00 01 ff 00 48 8b 34 24 74 0f f6 c4 01 74 RSP: 0018:ffffc900033efa58 EFLAGS: 00000283 RAX: ffff88807b077800 RBX: ffff88807b077800 RCX: 1ffffffff27b1189 RDX: ffff88802a5a3b80 RSI: ffffffff8968488d RDI: ffff88807b077f70 RBP: ffffc900033efbb0 R08: 0000000000000001 R09: fffffbfff27a900c R10: ffffffff93d48067 R11: ffffffff8ae000eb R12: ffff88807b077800 R13: dffffc0000000000 R14: ffff88807b077e40 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff8880b9500000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000564f4fc1e3a8 CR3: 000000000d57a000 CR4: 00000000003506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <NMI> </NMI> <TASK> unix_gc+0x563/0x13b0 net/unix/garbage.c:319 unix_release_sock+0xa93/0xf80 net/unix/af_unix.c:683 unix_release+0x91/0xf0 net/unix/af_unix.c:1064 __sock_release+0xb0/0x270 net/socket.c:659 sock_close+0x1c/0x30 net/socket.c:1421 __fput+0x270/0xb80 fs/file_table.c:376 task_work_run+0x14f/0x250 kernel/task_work.c:180 exit_task_work include/linux/task_work.h:38 [inline] do_exit+0xa8a/0x2ad0 kernel/exit.c:871 do_group_exit+0xd4/0x2a0 kernel/exit.c:1020 __do_sys_exit_group kernel/exit.c:1031 [inline] __se_sys_exit_group kernel/exit.c:1029 [inline] __x64_sys_exit_group+0x3e/0x50 kernel/exit.c:1029 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xd5/0x270 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x6f/0x77 RIP: 0033:0x7f9d6cbdac09 Code: Unable to access opcode bytes at 0x7f9d6cbdabdf. RSP: 002b:00007fff5952feb8 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9d6cbdac09 RDX: 000000000000003c RSI: 00000000000000e7 RDI: 0000000000000000 RBP: 00007f9d6cc552b0 R08: ffffffffffffffb8 R09: 0000000000000006 R10: 0000000000000006 R11: 0000000000000246 R12: 00007f9d6cc552b0 R13: 0000000000000000 R14: 00007f9d6cc55d00 R15: 00007f9d6cbabe70 </TASK> Reported-by: syzbot+4fa4a2d1f5a5ee06f006@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=4fa4a2d1f5a5ee06f006 Fixes: 1279f9d9dec2 ("af_unix: Call kfree_skb() for dead unix_(sk)->oob_skb in GC.") Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com> Link: https://lore.kernel.org/r/20240209220453.96053-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni <pabeni@redhat.com> --- net/unix/garbage.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 8f63f0b4bf01..2ff7ddbaa782 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -315,10 +315,11 @@ void unix_gc(void) __skb_queue_purge(&hitlist); #if IS_ENABLED(CONFIG_AF_UNIX_OOB) - list_for_each_entry_safe(u, next, &gc_candidates, link) { - struct sk_buff *skb = u->oob_skb; + while (!list_empty(&gc_candidates)) { + u = list_entry(gc_candidates.next, struct unix_sock, link); + if (u->oob_skb) { + struct sk_buff *skb = u->oob_skb; - if (skb) { u->oob_skb = NULL; kfree_skb(skb); } -- cgit v1.2.3 From 6cf9ff463317217d95732a6cce6fbdd12508921a Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Mon, 12 Feb 2024 17:34:02 +0300 Subject: net: smc: fix spurious error message from __sock_release() Commit 67f562e3e147 ("net/smc: transfer fasync_list in case of fallback") leaves the socket's fasync list pointer within a container socket as well. When the latter is destroyed, '__sock_release()' warns about its non-empty fasync list, which is a dangling pointer to previously freed fasync list of an underlying TCP socket. Fix this spurious warning by nullifying fasync list of a container socket. Fixes: 67f562e3e147 ("net/smc: transfer fasync_list in case of fallback") Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/smc/af_smc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index a2cb30af46cb..0f53a5c6fd9d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -924,6 +924,7 @@ static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code) smc->clcsock->file->private_data = smc->clcsock; smc->clcsock->wq.fasync_list = smc->sk.sk_socket->wq.fasync_list; + smc->sk.sk_socket->wq.fasync_list = NULL; /* There might be some wait entries remaining * in smc sk->sk_wq and they should be woken up -- cgit v1.2.3 From 6cdedc18ba7b9dacc36466e27e3267d201948c8d Mon Sep 17 00:00:00 2001 From: Ziqi Zhao Date: Fri, 21 Jul 2023 09:22:26 -0700 Subject: can: j1939: prevent deadlock by changing j1939_socks_lock to rwlock The following 3 locks would race against each other, causing the deadlock situation in the Syzbot bug report: - j1939_socks_lock - active_session_list_lock - sk_session_queue_lock A reasonable fix is to change j1939_socks_lock to an rwlock, since in the rare situations where a write lock is required for the linked list that j1939_socks_lock is protecting, the code does not attempt to acquire any more locks. This would break the circular lock dependency, where, for example, the current thread already locks j1939_socks_lock and attempts to acquire sk_session_queue_lock, and at the same time, another thread attempts to acquire j1939_socks_lock while holding sk_session_queue_lock. NOTE: This patch along does not fix the unregister_netdevice bug reported by Syzbot; instead, it solves a deadlock situation to prepare for one or more further patches to actually fix the Syzbot bug, which appears to be a reference counting problem within the j1939 codebase. Reported-by: <syzbot+1591462f226d9cbf0564@syzkaller.appspotmail.com> Signed-off-by: Ziqi Zhao <astrajoan@yahoo.com> Reviewed-by: Oleksij Rempel <o.rempel@pengutronix.de> Acked-by: Oleksij Rempel <o.rempel@pengutronix.de> Link: https://lore.kernel.org/all/20230721162226.8639-1-astrajoan@yahoo.com [mkl: remove unrelated newline change] Cc: stable@vger.kernel.org Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de> --- net/can/j1939/j1939-priv.h | 2 +- net/can/j1939/main.c | 2 +- net/can/j1939/socket.c | 24 ++++++++++++------------ 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h index 16af1a7f80f6..74f15592d170 100644 --- a/net/can/j1939/j1939-priv.h +++ b/net/can/j1939/j1939-priv.h @@ -86,7 +86,7 @@ struct j1939_priv { unsigned int tp_max_packet_size; /* lock for j1939_socks list */ - spinlock_t j1939_socks_lock; + rwlock_t j1939_socks_lock; struct list_head j1939_socks; struct kref rx_kref; diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index ecff1c947d68..a6fb89fa6278 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -274,7 +274,7 @@ struct j1939_priv *j1939_netdev_start(struct net_device *ndev) return ERR_PTR(-ENOMEM); j1939_tp_init(priv); - spin_lock_init(&priv->j1939_socks_lock); + rwlock_init(&priv->j1939_socks_lock); INIT_LIST_HEAD(&priv->j1939_socks); mutex_lock(&j1939_netdev_lock); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 14c431663233..94cfc2315e54 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -80,16 +80,16 @@ static void j1939_jsk_add(struct j1939_priv *priv, struct j1939_sock *jsk) jsk->state |= J1939_SOCK_BOUND; j1939_priv_get(priv); - spin_lock_bh(&priv->j1939_socks_lock); + write_lock_bh(&priv->j1939_socks_lock); list_add_tail(&jsk->list, &priv->j1939_socks); - spin_unlock_bh(&priv->j1939_socks_lock); + write_unlock_bh(&priv->j1939_socks_lock); } static void j1939_jsk_del(struct j1939_priv *priv, struct j1939_sock *jsk) { - spin_lock_bh(&priv->j1939_socks_lock); + write_lock_bh(&priv->j1939_socks_lock); list_del_init(&jsk->list); - spin_unlock_bh(&priv->j1939_socks_lock); + write_unlock_bh(&priv->j1939_socks_lock); j1939_priv_put(priv); jsk->state &= ~J1939_SOCK_BOUND; @@ -329,13 +329,13 @@ bool j1939_sk_recv_match(struct j1939_priv *priv, struct j1939_sk_buff_cb *skcb) struct j1939_sock *jsk; bool match = false; - spin_lock_bh(&priv->j1939_socks_lock); + read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { match = j1939_sk_recv_match_one(jsk, skcb); if (match) break; } - spin_unlock_bh(&priv->j1939_socks_lock); + read_unlock_bh(&priv->j1939_socks_lock); return match; } @@ -344,11 +344,11 @@ void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sock *jsk; - spin_lock_bh(&priv->j1939_socks_lock); + read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { j1939_sk_recv_one(jsk, skb); } - spin_unlock_bh(&priv->j1939_socks_lock); + read_unlock_bh(&priv->j1939_socks_lock); } static void j1939_sk_sock_destruct(struct sock *sk) @@ -1080,12 +1080,12 @@ void j1939_sk_errqueue(struct j1939_session *session, } /* spread RX notifications to all sockets subscribed to this session */ - spin_lock_bh(&priv->j1939_socks_lock); + read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { if (j1939_sk_recv_match_one(jsk, &session->skcb)) __j1939_sk_errqueue(session, &jsk->sk, type); } - spin_unlock_bh(&priv->j1939_socks_lock); + read_unlock_bh(&priv->j1939_socks_lock); }; void j1939_sk_send_loop_abort(struct sock *sk, int err) @@ -1273,7 +1273,7 @@ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) struct j1939_sock *jsk; int error_code = ENETDOWN; - spin_lock_bh(&priv->j1939_socks_lock); + read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { jsk->sk.sk_err = error_code; if (!sock_flag(&jsk->sk, SOCK_DEAD)) @@ -1281,7 +1281,7 @@ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) j1939_sk_queue_drop_all(priv, jsk, error_code); } - spin_unlock_bh(&priv->j1939_socks_lock); + read_unlock_bh(&priv->j1939_socks_lock); } static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd, -- cgit v1.2.3 From efe7cf828039aedb297c1f9920b638fffee6aabc Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Fri, 20 Oct 2023 15:38:14 +0200 Subject: can: j1939: Fix UAF in j1939_sk_match_filter during setsockopt(SO_J1939_FILTER) Lock jsk->sk to prevent UAF when setsockopt(..., SO_J1939_FILTER, ...) modifies jsk->filters while receiving packets. Following trace was seen on affected system: ================================================================== BUG: KASAN: slab-use-after-free in j1939_sk_recv_match_one+0x1af/0x2d0 [can_j1939] Read of size 4 at addr ffff888012144014 by task j1939/350 CPU: 0 PID: 350 Comm: j1939 Tainted: G W OE 6.5.0-rc5 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 Call Trace: print_report+0xd3/0x620 ? kasan_complete_mode_report_info+0x7d/0x200 ? j1939_sk_recv_match_one+0x1af/0x2d0 [can_j1939] kasan_report+0xc2/0x100 ? j1939_sk_recv_match_one+0x1af/0x2d0 [can_j1939] __asan_load4+0x84/0xb0 j1939_sk_recv_match_one+0x1af/0x2d0 [can_j1939] j1939_sk_recv+0x20b/0x320 [can_j1939] ? __kasan_check_write+0x18/0x20 ? __pfx_j1939_sk_recv+0x10/0x10 [can_j1939] ? j1939_simple_recv+0x69/0x280 [can_j1939] ? j1939_ac_recv+0x5e/0x310 [can_j1939] j1939_can_recv+0x43f/0x580 [can_j1939] ? __pfx_j1939_can_recv+0x10/0x10 [can_j1939] ? raw_rcv+0x42/0x3c0 [can_raw] ? __pfx_j1939_can_recv+0x10/0x10 [can_j1939] can_rcv_filter+0x11f/0x350 [can] can_receive+0x12f/0x190 [can] ? __pfx_can_rcv+0x10/0x10 [can] can_rcv+0xdd/0x130 [can] ? __pfx_can_rcv+0x10/0x10 [can] __netif_receive_skb_one_core+0x13d/0x150 ? __pfx___netif_receive_skb_one_core+0x10/0x10 ? __kasan_check_write+0x18/0x20 ? _raw_spin_lock_irq+0x8c/0xe0 __netif_receive_skb+0x23/0xb0 process_backlog+0x107/0x260 __napi_poll+0x69/0x310 net_rx_action+0x2a1/0x580 ? __pfx_net_rx_action+0x10/0x10 ? __pfx__raw_spin_lock+0x10/0x10 ? handle_irq_event+0x7d/0xa0 __do_softirq+0xf3/0x3f8 do_softirq+0x53/0x80 </IRQ> <TASK> __local_bh_enable_ip+0x6e/0x70 netif_rx+0x16b/0x180 can_send+0x32b/0x520 [can] ? __pfx_can_send+0x10/0x10 [can] ? __check_object_size+0x299/0x410 raw_sendmsg+0x572/0x6d0 [can_raw] ? __pfx_raw_sendmsg+0x10/0x10 [can_raw] ? apparmor_socket_sendmsg+0x2f/0x40 ? __pfx_raw_sendmsg+0x10/0x10 [can_raw] sock_sendmsg+0xef/0x100 sock_write_iter+0x162/0x220 ? __pfx_sock_write_iter+0x10/0x10 ? __rtnl_unlock+0x47/0x80 ? security_file_permission+0x54/0x320 vfs_write+0x6ba/0x750 ? __pfx_vfs_write+0x10/0x10 ? __fget_light+0x1ca/0x1f0 ? __rcu_read_unlock+0x5b/0x280 ksys_write+0x143/0x170 ? __pfx_ksys_write+0x10/0x10 ? __kasan_check_read+0x15/0x20 ? fpregs_assert_state_consistent+0x62/0x70 __x64_sys_write+0x47/0x60 do_syscall_64+0x60/0x90 ? do_syscall_64+0x6d/0x90 ? irqentry_exit+0x3f/0x50 ? exc_page_fault+0x79/0xf0 entry_SYSCALL_64_after_hwframe+0x6e/0xd8 Allocated by task 348: kasan_save_stack+0x2a/0x50 kasan_set_track+0x29/0x40 kasan_save_alloc_info+0x1f/0x30 __kasan_kmalloc+0xb5/0xc0 __kmalloc_node_track_caller+0x67/0x160 j1939_sk_setsockopt+0x284/0x450 [can_j1939] __sys_setsockopt+0x15c/0x2f0 __x64_sys_setsockopt+0x6b/0x80 do_syscall_64+0x60/0x90 entry_SYSCALL_64_after_hwframe+0x6e/0xd8 Freed by task 349: kasan_save_stack+0x2a/0x50 kasan_set_track+0x29/0x40 kasan_save_free_info+0x2f/0x50 __kasan_slab_free+0x12e/0x1c0 __kmem_cache_free+0x1b9/0x380 kfree+0x7a/0x120 j1939_sk_setsockopt+0x3b2/0x450 [can_j1939] __sys_setsockopt+0x15c/0x2f0 __x64_sys_setsockopt+0x6b/0x80 do_syscall_64+0x60/0x90 entry_SYSCALL_64_after_hwframe+0x6e/0xd8 Fixes: 9d71dd0c70099 ("can: add support of SAE J1939 protocol") Reported-by: Sili Luo <rootlab@huawei.com> Suggested-by: Sili Luo <rootlab@huawei.com> Acked-by: Oleksij Rempel <o.rempel@pengutronix.de> Cc: stable@vger.kernel.org Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de> Link: https://lore.kernel.org/all/20231020133814.383996-1-o.rempel@pengutronix.de Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de> --- net/can/j1939/j1939-priv.h | 1 + net/can/j1939/socket.c | 22 ++++++++++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h index 74f15592d170..31a93cae5111 100644 --- a/net/can/j1939/j1939-priv.h +++ b/net/can/j1939/j1939-priv.h @@ -301,6 +301,7 @@ struct j1939_sock { int ifindex; struct j1939_addr addr; + spinlock_t filters_lock; struct j1939_filter *filters; int nfilters; pgn_t pgn_rx_filter; diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 94cfc2315e54..305dd72c844c 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -262,12 +262,17 @@ static bool j1939_sk_match_dst(struct j1939_sock *jsk, static bool j1939_sk_match_filter(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { - const struct j1939_filter *f = jsk->filters; - int nfilter = jsk->nfilters; + const struct j1939_filter *f; + int nfilter; + + spin_lock_bh(&jsk->filters_lock); + + f = jsk->filters; + nfilter = jsk->nfilters; if (!nfilter) /* receive all when no filters are assigned */ - return true; + goto filter_match_found; for (; nfilter; ++f, --nfilter) { if ((skcb->addr.pgn & f->pgn_mask) != f->pgn) @@ -276,9 +281,15 @@ static bool j1939_sk_match_filter(struct j1939_sock *jsk, continue; if ((skcb->addr.src_name & f->name_mask) != f->name) continue; - return true; + goto filter_match_found; } + + spin_unlock_bh(&jsk->filters_lock); return false; + +filter_match_found: + spin_unlock_bh(&jsk->filters_lock); + return true; } static bool j1939_sk_recv_match_one(struct j1939_sock *jsk, @@ -401,6 +412,7 @@ static int j1939_sk_init(struct sock *sk) atomic_set(&jsk->skb_pending, 0); spin_lock_init(&jsk->sk_session_queue_lock); INIT_LIST_HEAD(&jsk->sk_session_queue); + spin_lock_init(&jsk->filters_lock); /* j1939_sk_sock_destruct() depends on SOCK_RCU_FREE flag */ sock_set_flag(sk, SOCK_RCU_FREE); @@ -703,9 +715,11 @@ static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, } lock_sock(&jsk->sk); + spin_lock_bh(&jsk->filters_lock); ofilters = jsk->filters; jsk->filters = filters; jsk->nfilters = count; + spin_unlock_bh(&jsk->filters_lock); release_sock(&jsk->sk); kfree(ofilters); return 0; -- cgit v1.2.3 From f6374a82fc85bf911d033e2fa791372ce3356270 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 8 Feb 2024 15:46:03 +0100 Subject: netfilter: nft_set_pipapo: fix missing : in kdoc Add missing : in kdoc field names. Fixes: 8683f4b9950d ("nft_set_pipapo: Prepare for vectorised implementation: helpers") Reported-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nft_set_pipapo.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index f59a0cd81105..3842c7341a9f 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -144,10 +144,10 @@ struct nft_pipapo_scratch { /** * struct nft_pipapo_match - Data used for lookup and matching - * @field_count Amount of fields in set + * @field_count: Amount of fields in set * @scratch: Preallocated per-CPU maps for partial matching results * @bsize_max: Maximum lookup table bucket size of all fields, in longs - * @rcu Matching data is swapped on commits + * @rcu: Matching data is swapped on commits * @f: Fields, with lookup and mapping tables */ struct nft_pipapo_match { -- cgit v1.2.3 From 0f1ae2821fa4b13ab0f5ad7ff89fa57efcb04fe0 Mon Sep 17 00:00:00 2001 From: Kyle Swenson Date: Thu, 8 Feb 2024 23:56:31 +0000 Subject: netfilter: nat: restore default DNAT behavior When a DNAT rule is configured via iptables with different port ranges, iptables -t nat -A PREROUTING -p tcp -d 10.0.0.2 -m tcp --dport 32000:32010 -j DNAT --to-destination 192.168.0.10:21000-21010 we seem to be DNATing to some random port on the LAN side. While this is expected if --random is passed to the iptables command, it is not expected without passing --random. The expected behavior (and the observed behavior prior to the commit in the "Fixes" tag) is the traffic will be DNAT'd to 192.168.0.10:21000 unless there is a tuple collision with that destination. In that case, we expect the traffic to be instead DNAT'd to 192.168.0.10:21001, so on so forth until the end of the range. This patch intends to restore the behavior observed prior to the "Fixes" tag. Fixes: 6ed5943f8735 ("netfilter: nat: remove l4 protocol port rovers") Signed-off-by: Kyle Swenson <kyle.swenson@est.tech> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nf_nat_core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index c3d7ecbc777c..016c816d91cb 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -551,8 +551,11 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, find_free_id: if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) off = (ntohs(*keyptr) - ntohs(range->base_proto.all)); - else + else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) || + maniptype != NF_NAT_MANIP_DST) off = get_random_u16(); + else + off = 0; attempts = range_size; if (attempts > NF_NAT_MAX_ATTEMPTS) -- cgit v1.2.3 From 84443741faab9045d53f022a9ac6a6633067a481 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 14 Feb 2024 15:42:35 +0100 Subject: netfilter: nf_tables: fix bidirectional offload regression Commit 8f84780b84d6 ("netfilter: flowtable: allow unidirectional rules") made unidirectional flow offload possible, while completely ignoring (and breaking) bidirectional flow offload for nftables. Add the missing flag that was left out as an exercise for the reader :) Cc: Vlad Buslov <vladbu@nvidia.com> Fixes: 8f84780b84d6 ("netfilter: flowtable: allow unidirectional rules") Reported-by: Daniel Golle <daniel@makrotopia.org> Signed-off-by: Felix Fietkau <nbd@nbd.name> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nft_flow_offload.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 397351fa4d5f..ab9576098701 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -361,6 +361,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; } + __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); ret = flow_offload_add(flowtable, flow); if (ret < 0) goto err_flow_add; -- cgit v1.2.3 From 488b6d91b07112eaaaa4454332c1480894d4e06e Mon Sep 17 00:00:00 2001 From: Vadim Fedorenko Date: Tue, 13 Feb 2024 03:04:28 -0800 Subject: net-timestamp: make sk_tskey more predictable in error path When SOF_TIMESTAMPING_OPT_ID is used to ambiguate timestamped datagrams, the sk_tskey can become unpredictable in case of any error happened during sendmsg(). Move increment later in the code and make decrement of sk_tskey in error path. This solution is still racy in case of multiple threads doing snedmsg() over the very same socket in parallel, but still makes error path much more predictable. Fixes: 09c2d251b707 ("net-timestamp: add key to disambiguate concurrent datagrams") Reported-by: Andy Lutomirski <luto@amacapital.net> Signed-off-by: Vadim Fedorenko <vadfed@meta.com> Reviewed-by: Willem de Bruijn <willemb@google.com> Link: https://lore.kernel.org/r/20240213110428.1681540-1-vadfed@meta.com Signed-off-by: Paolo Abeni <pabeni@redhat.com> --- net/ipv4/ip_output.c | 13 ++++++++----- net/ipv6/ip6_output.c | 13 ++++++++----- 2 files changed, 16 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 41537d18eecf..67d846622365 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -972,8 +972,8 @@ static int __ip_append_data(struct sock *sk, unsigned int maxfraglen, fragheaderlen, maxnonfragsize; int csummode = CHECKSUM_NONE; struct rtable *rt = (struct rtable *)cork->dst; + bool paged, hold_tskey, extra_uref = false; unsigned int wmem_alloc_delta = 0; - bool paged, extra_uref = false; u32 tskey = 0; skb = skb_peek_tail(queue); @@ -982,10 +982,6 @@ static int __ip_append_data(struct sock *sk, mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize; paged = !!cork->gso_size; - if (cork->tx_flags & SKBTX_ANY_TSTAMP && - READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) - tskey = atomic_inc_return(&sk->sk_tskey) - 1; - hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); @@ -1052,6 +1048,11 @@ static int __ip_append_data(struct sock *sk, cork->length += length; + hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP && + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID; + if (hold_tskey) + tskey = atomic_inc_return(&sk->sk_tskey) - 1; + /* So, what's going on in the loop below? * * We use calculated fragment length to generate chained skb, @@ -1274,6 +1275,8 @@ error: cork->length -= length; IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); + if (hold_tskey) + atomic_dec(&sk->sk_tskey); return err; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a722a43dd668..31b86fe661aa 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1424,11 +1424,11 @@ static int __ip6_append_data(struct sock *sk, bool zc = false; u32 tskey = 0; struct rt6_info *rt = (struct rt6_info *)cork->dst; + bool paged, hold_tskey, extra_uref = false; struct ipv6_txoptions *opt = v6_cork->opt; int csummode = CHECKSUM_NONE; unsigned int maxnonfragsize, headersize; unsigned int wmem_alloc_delta = 0; - bool paged, extra_uref = false; skb = skb_peek_tail(queue); if (!skb) { @@ -1440,10 +1440,6 @@ static int __ip6_append_data(struct sock *sk, mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; orig_mtu = mtu; - if (cork->tx_flags & SKBTX_ANY_TSTAMP && - READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) - tskey = atomic_inc_return(&sk->sk_tskey) - 1; - hh_len = LL_RESERVED_SPACE(rt->dst.dev); fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + @@ -1538,6 +1534,11 @@ emsgsize: flags &= ~MSG_SPLICE_PAGES; } + hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP && + READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID; + if (hold_tskey) + tskey = atomic_inc_return(&sk->sk_tskey) - 1; + /* * Let's try using as much space as possible. * Use MTU if total length of the message fits into the MTU. @@ -1794,6 +1795,8 @@ error: cork->length -= length; IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); + if (hold_tskey) + atomic_dec(&sk->sk_tskey); return err; } -- cgit v1.2.3 From 4e45170d9acc2d5ae8f545bf3f2f67504a361338 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Wed, 14 Feb 2024 11:22:24 +0300 Subject: net: sctp: fix skb leak in sctp_inq_free() In case of GSO, 'chunk->skb' pointer may point to an entry from fraglist created in 'sctp_packet_gso_append()'. To avoid freeing random fraglist entry (and so undefined behavior and/or memory leak), introduce 'sctp_inq_chunk_free()' helper to ensure that 'chunk->skb' is set to 'chunk->head_skb' (i.e. fraglist head) before calling 'sctp_chunk_free()', and use the aforementioned helper in 'sctp_inq_pop()' as well. Reported-by: syzbot+8bb053b5d63595ab47db@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?id=0d8351bbe54fd04a492c2daab0164138db008042 Fixes: 90017accff61 ("sctp: Add GSO support") Suggested-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru> Acked-by: Xin Long <lucien.xin@gmail.com> Link: https://lore.kernel.org/r/20240214082224.10168-1-dmantipov@yandex.ru Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/sctp/inqueue.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 7182c5a450fb..5c1652181805 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -38,6 +38,14 @@ void sctp_inq_init(struct sctp_inq *queue) INIT_WORK(&queue->immediate, NULL); } +/* Properly release the chunk which is being worked on. */ +static inline void sctp_inq_chunk_free(struct sctp_chunk *chunk) +{ + if (chunk->head_skb) + chunk->skb = chunk->head_skb; + sctp_chunk_free(chunk); +} + /* Release the memory associated with an SCTP inqueue. */ void sctp_inq_free(struct sctp_inq *queue) { @@ -53,7 +61,7 @@ void sctp_inq_free(struct sctp_inq *queue) * free it as well. */ if (queue->in_progress) { - sctp_chunk_free(queue->in_progress); + sctp_inq_chunk_free(queue->in_progress); queue->in_progress = NULL; } } @@ -130,9 +138,7 @@ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) goto new_skb; } - if (chunk->head_skb) - chunk->skb = chunk->head_skb; - sctp_chunk_free(chunk); + sctp_inq_chunk_free(chunk); chunk = queue->in_progress = NULL; } else { /* Nothing to do. Next chunk in the packet, please. */ -- cgit v1.2.3