From ba062ebb2cd561d404e0fba8ee4b3f5ebce7cbfc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 13 Jun 2018 09:13:39 -0700 Subject: netfilter: nf_queue: augment nfqa_cfg_policy Three attributes are currently not verified, thus can trigger KMSAN warnings such as : BUG: KMSAN: uninit-value in __arch_swab32 arch/x86/include/uapi/asm/swab.h:10 [inline] BUG: KMSAN: uninit-value in __fswab32 include/uapi/linux/swab.h:59 [inline] BUG: KMSAN: uninit-value in nfqnl_recv_config+0x939/0x17d0 net/netfilter/nfnetlink_queue.c:1268 CPU: 1 PID: 4521 Comm: syz-executor120 Not tainted 4.17.0+ #5 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x185/0x1d0 lib/dump_stack.c:113 kmsan_report+0x188/0x2a0 mm/kmsan/kmsan.c:1117 __msan_warning_32+0x70/0xc0 mm/kmsan/kmsan_instr.c:620 __arch_swab32 arch/x86/include/uapi/asm/swab.h:10 [inline] __fswab32 include/uapi/linux/swab.h:59 [inline] nfqnl_recv_config+0x939/0x17d0 net/netfilter/nfnetlink_queue.c:1268 nfnetlink_rcv_msg+0xb2e/0xc80 net/netfilter/nfnetlink.c:212 netlink_rcv_skb+0x37e/0x600 net/netlink/af_netlink.c:2448 nfnetlink_rcv+0x2fe/0x680 net/netfilter/nfnetlink.c:513 netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline] netlink_unicast+0x1680/0x1750 net/netlink/af_netlink.c:1336 netlink_sendmsg+0x104f/0x1350 net/netlink/af_netlink.c:1901 sock_sendmsg_nosec net/socket.c:629 [inline] sock_sendmsg net/socket.c:639 [inline] ___sys_sendmsg+0xec8/0x1320 net/socket.c:2117 __sys_sendmsg net/socket.c:2155 [inline] __do_sys_sendmsg net/socket.c:2164 [inline] __se_sys_sendmsg net/socket.c:2162 [inline] __x64_sys_sendmsg+0x331/0x460 net/socket.c:2162 do_syscall_64+0x15b/0x230 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x43fd59 RSP: 002b:00007ffde0e30d28 EFLAGS: 00000213 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000043fd59 RDX: 0000000000000000 RSI: 0000000020000080 RDI: 0000000000000003 RBP: 00000000006ca018 R08: 00000000004002c8 R09: 00000000004002c8 R10: 00000000004002c8 R11: 0000000000000213 R12: 0000000000401680 R13: 0000000000401710 R14: 0000000000000000 R15: 0000000000000000 Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:279 [inline] kmsan_internal_poison_shadow+0xb8/0x1b0 mm/kmsan/kmsan.c:189 kmsan_kmalloc+0x94/0x100 mm/kmsan/kmsan.c:315 kmsan_slab_alloc+0x10/0x20 mm/kmsan/kmsan.c:322 slab_post_alloc_hook mm/slab.h:446 [inline] slab_alloc_node mm/slub.c:2753 [inline] __kmalloc_node_track_caller+0xb35/0x11b0 mm/slub.c:4395 __kmalloc_reserve net/core/skbuff.c:138 [inline] __alloc_skb+0x2cb/0x9e0 net/core/skbuff.c:206 alloc_skb include/linux/skbuff.h:988 [inline] netlink_alloc_large_skb net/netlink/af_netlink.c:1182 [inline] netlink_sendmsg+0x76e/0x1350 net/netlink/af_netlink.c:1876 sock_sendmsg_nosec net/socket.c:629 [inline] sock_sendmsg net/socket.c:639 [inline] ___sys_sendmsg+0xec8/0x1320 net/socket.c:2117 __sys_sendmsg net/socket.c:2155 [inline] __do_sys_sendmsg net/socket.c:2164 [inline] __se_sys_sendmsg net/socket.c:2162 [inline] __x64_sys_sendmsg+0x331/0x460 net/socket.c:2162 do_syscall_64+0x15b/0x230 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: fdb694a01f1f ("netfilter: Add fail-open support") Fixes: 829e17a1a602 ("[NETFILTER]: nfnetlink_queue: allow changing queue length through netlink") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nfnetlink_queue.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 4ccd2988f9db..ea4ba551abb2 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -1243,6 +1243,9 @@ static int nfqnl_recv_unsupp(struct net *net, struct sock *ctnl, static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) }, [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) }, + [NFQA_CFG_QUEUE_MAXLEN] = { .type = NLA_U32 }, + [NFQA_CFG_MASK] = { .type = NLA_U32 }, + [NFQA_CFG_FLAGS] = { .type = NLA_U32 }, }; static const struct nf_queue_handler nfqh = { -- cgit v1.2.3 From 9ce7bc036ae4cfe3393232c86e9e1fea2153c237 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 13 Jun 2018 10:11:56 -0700 Subject: netfilter: ipv6: nf_defrag: reduce struct net memory waste It is a waste of memory to use a full "struct netns_sysctl_ipv6" while only one pointer is really used, considering netns_sysctl_ipv6 keeps growing. Also, since "struct netns_frags" has cache line alignment, it is better to move the frags_hdr pointer outside, otherwise we spend a full cache line for this pointer. This saves 192 bytes of memory per netns. Fixes: c038a767cd69 ("ipv6: add a new namespace for nf_conntrack_reasm") Signed-off-by: Eric Dumazet Signed-off-by: Pablo Neira Ayuso --- include/net/net_namespace.h | 1 + include/net/netns/ipv6.h | 1 - net/ipv6/netfilter/nf_conntrack_reasm.c | 6 +++--- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 47e35cce3b64..a71264d75d7f 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -128,6 +128,7 @@ struct net { #endif #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) struct netns_nf_frag nf_frag; + struct ctl_table_header *nf_frag_frags_hdr; #endif struct sock *nfnl; struct sock *nfnl_stash; diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index c978a31b0f84..762ac9931b62 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -109,7 +109,6 @@ struct netns_ipv6 { #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) struct netns_nf_frag { - struct netns_sysctl_ipv6 sysctl; struct netns_frags frags; }; #endif diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 5e0332014c17..a452d99c9f52 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -107,7 +107,7 @@ static int nf_ct_frag6_sysctl_register(struct net *net) if (hdr == NULL) goto err_reg; - net->nf_frag.sysctl.frags_hdr = hdr; + net->nf_frag_frags_hdr = hdr; return 0; err_reg: @@ -121,8 +121,8 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) { struct ctl_table *table; - table = net->nf_frag.sysctl.frags_hdr->ctl_table_arg; - unregister_net_sysctl_table(net->nf_frag.sysctl.frags_hdr); + table = net->nf_frag_frags_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->nf_frag_frags_hdr); if (!net_eq(net, &init_net)) kfree(table); } -- cgit v1.2.3 From ad9852af97587b8abe8102f9ddcb05c9769656f6 Mon Sep 17 00:00:00 2001 From: Gao Feng Date: Wed, 13 Jun 2018 12:26:13 +0800 Subject: netfilter: nf_ct_helper: Fix possible panic after nf_conntrack_helper_unregister The helper module would be unloaded after nf_conntrack_helper_unregister, so it may cause a possible panic caused by race. nf_ct_iterate_destroy(unhelp, me) reset the helper of conntrack as NULL, but maybe someone has gotten the helper pointer during this period. Then it would panic, when it accesses the helper and the module was unloaded. Take an example as following: CPU0 CPU1 ctnetlink_dump_helpinfo helper = rcu_dereference(help->helper); unhelp set helper as NULL unload helper module helper->to_nlattr(skb, ct); As above, the cpu0 tries to access the helper and its module is unloaded, then the panic happens. Signed-off-by: Gao Feng Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_helper.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 551a1eddf0fa..a75b11c39312 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -465,6 +465,11 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) nf_ct_expect_iterate_destroy(expect_iter_me, NULL); nf_ct_iterate_destroy(unhelp, me); + + /* Maybe someone has gotten the helper already when unhelp above. + * So need to wait it. + */ + synchronize_rcu(); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); -- cgit v1.2.3 From dffd22aed2aa1e804bccf19b30a421e89ee2ae61 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 20 Jun 2018 18:33:45 +0200 Subject: netfilter: nf_log: fix uninit read in nf_log_proc_dostring When proc_dostring() is called with a non-zero offset in strict mode, it doesn't just write to the ->data buffer, it also reads. Make sure it doesn't read uninitialized data. Fixes: c6ac37d8d884 ("netfilter: nf_log: fix error on write NONE to [...]") Signed-off-by: Jann Horn Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_log.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 426457047578..2c47f9ec3511 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -424,6 +424,10 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write, if (write) { struct ctl_table tmp = *table; + /* proc_dostring() can append to existing strings, so we need to + * initialize it as an empty string. + */ + buf[0] = '\0'; tmp.data = buf; r = proc_dostring(&tmp, write, buffer, lenp, ppos); if (r) -- cgit v1.2.3 From ce00bf07cc95a57cd20b208e02b3c2604e532ae8 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 25 Jun 2018 17:22:00 +0200 Subject: netfilter: nf_log: don't hold nf_log_mutex during user access The old code would indefinitely block other users of nf_log_mutex if a userspace access in proc_dostring() blocked e.g. due to a userfaultfd region. Fix it by moving proc_dostring() out of the locked region. This is a followup to commit 266d07cb1c9a ("netfilter: nf_log: fix sleeping function called from invalid context"), which changed this code from using rcu_read_lock() to taking nf_log_mutex. Fixes: 266d07cb1c9a ("netfilter: nf_log: fix sleeping function calle[...]") Signed-off-by: Jann Horn Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_log.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 2c47f9ec3511..a61d6df6e5f6 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -446,14 +446,17 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write, rcu_assign_pointer(net->nf.nf_loggers[tindex], logger); mutex_unlock(&nf_log_mutex); } else { + struct ctl_table tmp = *table; + + tmp.data = buf; mutex_lock(&nf_log_mutex); logger = nft_log_dereference(net->nf.nf_loggers[tindex]); if (!logger) - table->data = "NONE"; + strlcpy(buf, "NONE", sizeof(buf)); else - table->data = logger->name; - r = proc_dostring(table, write, buffer, lenp, ppos); + strlcpy(buf, logger->name, sizeof(buf)); mutex_unlock(&nf_log_mutex); + r = proc_dostring(&tmp, write, buffer, lenp, ppos); } return r; -- cgit v1.2.3 From b36e4523d4d56e2595e28f16f6ccf1cd6a9fc452 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 20 Jun 2018 23:32:26 +0200 Subject: netfilter: nf_conncount: fix garbage collection confirm race Yi-Hung Wei and Justin Pettit found a race in the garbage collection scheme used by nf_conncount. When doing list walk, we lookup the tuple in the conntrack table. If the lookup fails we remove this tuple from our list because the conntrack entry is gone. This is the common cause, but turns out its not the only one. The list entry could have been created just before by another cpu, i.e. the conntrack entry might not yet have been inserted into the global hash. The avoid this, we introduce a timestamp and the owning cpu. If the entry appears to be stale, evict only if: 1. The current cpu is the one that added the entry, or, 2. The timestamp is older than two jiffies The second constraint allows GC to be taken over by other cpu too (e.g. because a cpu was offlined or napi got moved to another cpu). We can't pretend the 'doubtful' entry wasn't in our list. Instead, when we don't find an entry indicate via IS_ERR that entry was removed ('did not exist' or withheld ('might-be-unconfirmed'). This most likely also fixes a xt_connlimit imbalance earlier reported by Dmitry Andrianov. Cc: Dmitry Andrianov Reported-by: Justin Pettit Reported-by: Yi-Hung Wei Signed-off-by: Florian Westphal Acked-by: Yi-Hung Wei Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conncount.c | 52 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index d8383609fe28..510039862aa9 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -47,6 +47,8 @@ struct nf_conncount_tuple { struct hlist_node node; struct nf_conntrack_tuple tuple; struct nf_conntrack_zone zone; + int cpu; + u32 jiffies32; }; struct nf_conncount_rb { @@ -91,11 +93,42 @@ bool nf_conncount_add(struct hlist_head *head, return false; conn->tuple = *tuple; conn->zone = *zone; + conn->cpu = raw_smp_processor_id(); + conn->jiffies32 = (u32)jiffies; hlist_add_head(&conn->node, head); return true; } EXPORT_SYMBOL_GPL(nf_conncount_add); +static const struct nf_conntrack_tuple_hash * +find_or_evict(struct net *net, struct nf_conncount_tuple *conn) +{ + const struct nf_conntrack_tuple_hash *found; + unsigned long a, b; + int cpu = raw_smp_processor_id(); + __s32 age; + + found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple); + if (found) + return found; + b = conn->jiffies32; + a = (u32)jiffies; + + /* conn might have been added just before by another cpu and + * might still be unconfirmed. In this case, nf_conntrack_find() + * returns no result. Thus only evict if this cpu added the + * stale entry or if the entry is older than two jiffies. + */ + age = a - b; + if (conn->cpu == cpu || age >= 2) { + hlist_del(&conn->node); + kmem_cache_free(conncount_conn_cachep, conn); + return ERR_PTR(-ENOENT); + } + + return ERR_PTR(-EAGAIN); +} + unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_zone *zone, @@ -103,18 +136,27 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head, { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn; - struct hlist_node *n; struct nf_conn *found_ct; + struct hlist_node *n; unsigned int length = 0; *addit = tuple ? true : false; /* check the saved connections */ hlist_for_each_entry_safe(conn, n, head, node) { - found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple); - if (found == NULL) { - hlist_del(&conn->node); - kmem_cache_free(conncount_conn_cachep, conn); + found = find_or_evict(net, conn); + if (IS_ERR(found)) { + /* Not found, but might be about to be confirmed */ + if (PTR_ERR(found) == -EAGAIN) { + length++; + if (!tuple) + continue; + + if (nf_ct_tuple_equal(&conn->tuple, tuple) && + nf_ct_zone_id(&conn->zone, conn->zone.dir) == + nf_ct_zone_id(zone, zone->dir)) + *addit = false; + } continue; } -- cgit v1.2.3 From c809195f5523dd4d09403bbb1c9732d548aa0d1e Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Mon, 25 Jun 2018 06:41:25 -0700 Subject: rds: clean up loopback rds_connections on netns deletion The RDS core module creates rds_connections based on callbacks from rds_loop_transport when sending/receiving packets to local addresses. These connections will need to be cleaned up when they are created from a netns that is not init_net, and that netns is deleted. Add the changes aligned with the changes from commit ebeeb1ad9b8a ("rds: tcp: use rds_destroy_pending() to synchronize netns/module teardown and rds connection/workq management") for rds_loop_transport Reported-and-tested-by: syzbot+4c20b3866171ce8441d2@syzkaller.appspotmail.com Acked-by: Santosh Shilimkar Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/connection.c | 11 ++++++++++- net/rds/loop.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/rds/loop.h | 2 ++ 3 files changed, 68 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index abef75da89a7..cfb05953b0e5 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -659,11 +659,19 @@ static void rds_conn_info(struct socket *sock, unsigned int len, int rds_conn_init(void) { + int ret; + + ret = rds_loop_net_init(); /* register pernet callback */ + if (ret) + return ret; + rds_conn_slab = kmem_cache_create("rds_connection", sizeof(struct rds_connection), 0, 0, NULL); - if (!rds_conn_slab) + if (!rds_conn_slab) { + rds_loop_net_exit(); return -ENOMEM; + } rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); rds_info_register_func(RDS_INFO_SEND_MESSAGES, @@ -676,6 +684,7 @@ int rds_conn_init(void) void rds_conn_exit(void) { + rds_loop_net_exit(); /* unregister pernet callback */ rds_loop_exit(); WARN_ON(!hlist_empty(rds_conn_hash)); diff --git a/net/rds/loop.c b/net/rds/loop.c index dac6218a460e..feea1f96ee2a 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -33,6 +33,8 @@ #include #include #include +#include +#include #include "rds_single_path.h" #include "rds.h" @@ -40,6 +42,17 @@ static DEFINE_SPINLOCK(loop_conns_lock); static LIST_HEAD(loop_conns); +static atomic_t rds_loop_unloading = ATOMIC_INIT(0); + +static void rds_loop_set_unloading(void) +{ + atomic_set(&rds_loop_unloading, 1); +} + +static bool rds_loop_is_unloading(struct rds_connection *conn) +{ + return atomic_read(&rds_loop_unloading) != 0; +} /* * This 'loopback' transport is a special case for flows that originate @@ -165,6 +178,8 @@ void rds_loop_exit(void) struct rds_loop_connection *lc, *_lc; LIST_HEAD(tmp_list); + rds_loop_set_unloading(); + synchronize_rcu(); /* avoid calling conn_destroy with irqs off */ spin_lock_irq(&loop_conns_lock); list_splice(&loop_conns, &tmp_list); @@ -177,6 +192,46 @@ void rds_loop_exit(void) } } +static void rds_loop_kill_conns(struct net *net) +{ + struct rds_loop_connection *lc, *_lc; + LIST_HEAD(tmp_list); + + spin_lock_irq(&loop_conns_lock); + list_for_each_entry_safe(lc, _lc, &loop_conns, loop_node) { + struct net *c_net = read_pnet(&lc->conn->c_net); + + if (net != c_net) + continue; + list_move_tail(&lc->loop_node, &tmp_list); + } + spin_unlock_irq(&loop_conns_lock); + + list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) { + WARN_ON(lc->conn->c_passive); + rds_conn_destroy(lc->conn); + } +} + +static void __net_exit rds_loop_exit_net(struct net *net) +{ + rds_loop_kill_conns(net); +} + +static struct pernet_operations rds_loop_net_ops = { + .exit = rds_loop_exit_net, +}; + +int rds_loop_net_init(void) +{ + return register_pernet_device(&rds_loop_net_ops); +} + +void rds_loop_net_exit(void) +{ + unregister_pernet_device(&rds_loop_net_ops); +} + /* * This is missing .xmit_* because loop doesn't go through generic * rds_send_xmit() and doesn't call rds_recv_incoming(). .listen_stop and @@ -194,4 +249,5 @@ struct rds_transport rds_loop_transport = { .inc_free = rds_loop_inc_free, .t_name = "loopback", .t_type = RDS_TRANS_LOOP, + .t_unloading = rds_loop_is_unloading, }; diff --git a/net/rds/loop.h b/net/rds/loop.h index 469fa4b2da4f..bbc8cdd030df 100644 --- a/net/rds/loop.h +++ b/net/rds/loop.h @@ -5,6 +5,8 @@ /* loop.c */ extern struct rds_transport rds_loop_transport; +int rds_loop_net_init(void); +void rds_loop_net_exit(void); void rds_loop_exit(void); #endif -- cgit v1.2.3 From 7c8f4e6dc30996bff806285730a0bb4e714d3d52 Mon Sep 17 00:00:00 2001 From: Jason A. Donenfeld Date: Tue, 26 Jun 2018 01:39:32 +0200 Subject: fib_rules: match rules based on suppress_* properties too Two rules with different values of suppress_prefix or suppress_ifgroup are not the same. This fixes an -EEXIST when running: $ ip -4 rule add table main suppress_prefixlength 0 Signed-off-by: Jason A. Donenfeld Fixes: f9d4b0c1e969 ("fib_rules: move common handling of newrule delrule msgs into fib_nl2rule") Signed-off-by: David S. Miller --- net/core/fib_rules.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 126ffc5bc630..bc8425d81022 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -416,6 +416,14 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops, if (rule->mark && r->mark != rule->mark) continue; + if (rule->suppress_ifgroup != -1 && + r->suppress_ifgroup != rule->suppress_ifgroup) + continue; + + if (rule->suppress_prefixlen != -1 && + r->suppress_prefixlen != rule->suppress_prefixlen) + continue; + if (rule->mark_mask && r->mark_mask != rule->mark_mask) continue; -- cgit v1.2.3 From 88e85a7daf8e21f2d6cb054374d480c540725cde Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 26 Jun 2018 12:55:35 +0900 Subject: bpfilter: check compiler capability in Kconfig With the brand-new syntax extension of Kconfig, we can directly check the compiler capability in the configuration phase. If the cc-can-link.sh fails, the BPFILTER_UMH is automatically hidden by the dependency. I also deleted 'default n', which is no-op. Signed-off-by: Masahiro Yamada Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Makefile | 5 ----- net/Makefile | 4 ---- net/bpfilter/Kconfig | 2 +- scripts/cc-can-link.sh | 2 +- 4 files changed, 2 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/Makefile b/Makefile index c9132594860b..e1bd4c3627bc 100644 --- a/Makefile +++ b/Makefile @@ -507,11 +507,6 @@ ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC) $(KBUILD_CFLA KBUILD_AFLAGS += -DCC_HAVE_ASM_GOTO endif -ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/cc-can-link.sh $(CC)), y) - CC_CAN_LINK := y - export CC_CAN_LINK -endif - # The expansion should be delayed until arch/$(SRCARCH)/Makefile is included. # Some architectures define CROSS_COMPILE in arch/$(SRCARCH)/Makefile. # CC_VERSION_TEXT is referenced from Kconfig (so it needs export), diff --git a/net/Makefile b/net/Makefile index 13ec0d5415c7..bdaf53925acd 100644 --- a/net/Makefile +++ b/net/Makefile @@ -20,11 +20,7 @@ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ obj-$(CONFIG_NET) += ipv6/ -ifneq ($(CC_CAN_LINK),y) -$(warning CC cannot link executables. Skipping bpfilter.) -else obj-$(CONFIG_BPFILTER) += bpfilter/ -endif obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ obj-$(CONFIG_BRIDGE) += bridge/ diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig index a948b072c28f..76deb6615883 100644 --- a/net/bpfilter/Kconfig +++ b/net/bpfilter/Kconfig @@ -1,6 +1,5 @@ menuconfig BPFILTER bool "BPF based packet filtering framework (BPFILTER)" - default n depends on NET && BPF && INET help This builds experimental bpfilter framework that is aiming to @@ -9,6 +8,7 @@ menuconfig BPFILTER if BPFILTER config BPFILTER_UMH tristate "bpfilter kernel module with user mode helper" + depends on $(success,$(srctree)/scripts/cc-can-link.sh $(CC)) default m help This builds bpfilter kernel module with embedded user mode helper diff --git a/scripts/cc-can-link.sh b/scripts/cc-can-link.sh index 208eb2825dab..6efcead31989 100755 --- a/scripts/cc-can-link.sh +++ b/scripts/cc-can-link.sh @@ -1,7 +1,7 @@ #!/bin/sh # SPDX-License-Identifier: GPL-2.0 -cat << "END" | $@ -x c - -o /dev/null >/dev/null 2>&1 && echo "y" +cat << "END" | $@ -x c - -o /dev/null >/dev/null 2>&1 #include int main(void) { -- cgit v1.2.3 From 977c7114ebda2e746a114840d3a875e0cdb826fb Mon Sep 17 00:00:00 2001 From: Doron Roberts-Kedes Date: Tue, 26 Jun 2018 18:33:33 -0700 Subject: strparser: Remove early eaten to fix full tcp receive buffer stall On receving an incomplete message, the existing code stores the remaining length of the cloned skb in the early_eaten field instead of incrementing the value returned by __strp_recv. This defers invocation of sock_rfree for the current skb until the next invocation of __strp_recv, which returns early_eaten if early_eaten is non-zero. This behavior causes a stall when the current message occupies the very tail end of a massive skb, and strp_peek/need_bytes indicates that the remainder of the current message has yet to arrive on the socket. The TCP receive buffer is totally full, causing the TCP window to go to zero, so the remainder of the message will never arrive. Incrementing the value returned by __strp_recv by the amount otherwise stored in early_eaten prevents stalls of this nature. Signed-off-by: Doron Roberts-Kedes Signed-off-by: David S. Miller --- net/strparser/strparser.c | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) (limited to 'net') diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 373836615c57..625acb27efcc 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -35,7 +35,6 @@ struct _strp_msg { */ struct strp_msg strp; int accum_len; - int early_eaten; }; static inline struct _strp_msg *_strp_msg(struct sk_buff *skb) @@ -115,20 +114,6 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, head = strp->skb_head; if (head) { /* Message already in progress */ - - stm = _strp_msg(head); - if (unlikely(stm->early_eaten)) { - /* Already some number of bytes on the receive sock - * data saved in skb_head, just indicate they - * are consumed. - */ - eaten = orig_len <= stm->early_eaten ? - orig_len : stm->early_eaten; - stm->early_eaten -= eaten; - - return eaten; - } - if (unlikely(orig_offset)) { /* Getting data with a non-zero offset when a message is * in progress is not expected. If it does happen, we @@ -297,9 +282,9 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, } stm->accum_len += cand_len; + eaten += cand_len; strp->need_bytes = stm->strp.full_len - stm->accum_len; - stm->early_eaten = cand_len; STRP_STATS_ADD(strp->stats.bytes, cand_len); desc->count = 0; /* Stop reading socket */ break; -- cgit v1.2.3 From 8e75887d321d102200abf3a9fa621e2c10ff4cc5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Tue, 26 Jun 2018 20:13:48 -0700 Subject: bpfilter: include bpfilter_umh in assembly instead of using objcopy What we want here is to embed a user-space program into the kernel. Instead of the complex ELF magic, let's simply wrap it in the assembly with the '.incbin' directive. Signed-off-by: Masahiro Yamada Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/bpfilter/Makefile | 17 ++--------------- net/bpfilter/bpfilter_kern.c | 11 +++++------ net/bpfilter/bpfilter_umh_blob.S | 7 +++++++ 3 files changed, 14 insertions(+), 21 deletions(-) create mode 100644 net/bpfilter/bpfilter_umh_blob.S (limited to 'net') diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile index 051dc18b8ccb..39c6980b5d99 100644 --- a/net/bpfilter/Makefile +++ b/net/bpfilter/Makefile @@ -15,20 +15,7 @@ ifeq ($(CONFIG_BPFILTER_UMH), y) HOSTLDFLAGS += -static endif -# a bit of elf magic to convert bpfilter_umh binary into a binary blob -# inside bpfilter_umh.o elf file referenced by -# _binary_net_bpfilter_bpfilter_umh_start symbol -# which bpfilter_kern.c passes further into umh blob loader at run-time -quiet_cmd_copy_umh = GEN $@ - cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \ - $(OBJCOPY) -I binary \ - `LC_ALL=C $(OBJDUMP) -f net/bpfilter/bpfilter_umh \ - |awk -F' |,' '/file format/{print "-O",$$NF} \ - /^architecture:/{print "-B",$$2}'` \ - --rename-section .data=.init.rodata $< $@ - -$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh - $(call cmd,copy_umh) +$(obj)/bpfilter_umh_blob.o: $(obj)/bpfilter_umh obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o -bpfilter-objs += bpfilter_kern.o bpfilter_umh.o +bpfilter-objs += bpfilter_kern.o bpfilter_umh_blob.o diff --git a/net/bpfilter/bpfilter_kern.c b/net/bpfilter/bpfilter_kern.c index 09522573f611..f0fc182d3db7 100644 --- a/net/bpfilter/bpfilter_kern.c +++ b/net/bpfilter/bpfilter_kern.c @@ -10,11 +10,8 @@ #include #include "msgfmt.h" -#define UMH_start _binary_net_bpfilter_bpfilter_umh_start -#define UMH_end _binary_net_bpfilter_bpfilter_umh_end - -extern char UMH_start; -extern char UMH_end; +extern char bpfilter_umh_start; +extern char bpfilter_umh_end; static struct umh_info info; /* since ip_getsockopt() can run in parallel, serialize access to umh */ @@ -93,7 +90,9 @@ static int __init load_umh(void) int err; /* fork usermode process */ - err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info); + err = fork_usermode_blob(&bpfilter_umh_start, + &bpfilter_umh_end - &bpfilter_umh_start, + &info); if (err) return err; pr_info("Loaded bpfilter_umh pid %d\n", info.pid); diff --git a/net/bpfilter/bpfilter_umh_blob.S b/net/bpfilter/bpfilter_umh_blob.S new file mode 100644 index 000000000000..40311d10d2f2 --- /dev/null +++ b/net/bpfilter/bpfilter_umh_blob.S @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + .section .init.rodata, "a" + .global bpfilter_umh_start +bpfilter_umh_start: + .incbin "net/bpfilter/bpfilter_umh" + .global bpfilter_umh_end +bpfilter_umh_end: -- cgit v1.2.3 From 15ecbe94a45ef88491ca459b26efdd02f91edb6d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 27 Jun 2018 08:47:21 -0700 Subject: tcp: add one more quick ack after after ECN events Larry Brakmo proposal ( https://patchwork.ozlabs.org/patch/935233/ tcp: force cwnd at least 2 in tcp_cwnd_reduction) made us rethink about our recent patch removing ~16 quick acks after ECN events. tcp_enter_quickack_mode(sk, 1) makes sure one immediate ack is sent, but in the case the sender cwnd was lowered to 1, we do not want to have a delayed ack for the next packet we will receive. Fixes: 522040ea5fdd ("tcp: do not aggressively quick ack after ECN events") Signed-off-by: Eric Dumazet Reported-by: Neal Cardwell Cc: Lawrence Brakmo Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 355d3dffd021..045d930d01a9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -265,7 +265,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) * it is probably a retransmit. */ if (tp->ecn_flags & TCP_ECN_SEEN) - tcp_enter_quickack_mode(sk, 1); + tcp_enter_quickack_mode(sk, 2); break; case INET_ECN_CE: if (tcp_ca_needs_ecn(sk)) @@ -273,7 +273,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { /* Better not delay acks, sender can have a very low cwnd */ - tcp_enter_quickack_mode(sk, 1); + tcp_enter_quickack_mode(sk, 2); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } tp->ecn_flags |= TCP_ECN_SEEN; -- cgit v1.2.3 From 24ac3a08e65845a098140ff270229dec4a897404 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 27 Jun 2018 17:59:50 +0200 Subject: net/smc: rebuild nonblocking connect The recent poll change may lead to stalls for non-blocking connecting SMC sockets, since sock_poll_wait is no longer performed on the internal CLC socket, but on the outer SMC socket. kernel_connect() on the internal CLC socket returns with -EINPROGRESS, but the wake up logic does not work in all cases. If the internal CLC socket is still in state TCP_SYN_SENT when polled, sock_poll_wait() from sock_poll() does not sleep. It is supposed to sleep till the state of the internal CLC socket switches to TCP_ESTABLISHED. This problem triggered a redesign of the SMC nonblocking connect logic. This patch introduces a connect worker covering all connect steps followed by a wake up of socket waiters. It allows to get rid of all delays and locks in smc_poll(). Fixes: c0129a061442 ("smc: convert to ->poll_mask") Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- net/smc/af_smc.c | 91 +++++++++++++++++++++++++++++++++++++++----------------- net/smc/smc.h | 8 +++++ 2 files changed, 71 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index da7f02edcd37..4d1d4ddb2f49 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -45,6 +45,7 @@ static DEFINE_MUTEX(smc_create_lgr_pending); /* serialize link group */ static void smc_tcp_listen_work(struct work_struct *); +static void smc_connect_work(struct work_struct *); static void smc_set_keepalive(struct sock *sk, int val) { @@ -122,6 +123,12 @@ static int smc_release(struct socket *sock) goto out; smc = smc_sk(sk); + + /* cleanup for a dangling non-blocking connect */ + flush_work(&smc->connect_work); + kfree(smc->connect_info); + smc->connect_info = NULL; + if (sk->sk_state == SMC_LISTEN) /* smc_close_non_accepted() is called and acquires * sock lock for child sockets again @@ -186,6 +193,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_protocol = protocol; smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); + INIT_WORK(&smc->connect_work, smc_connect_work); INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work); INIT_LIST_HEAD(&smc->accept_q); spin_lock_init(&smc->accept_q_lock); @@ -576,6 +584,35 @@ static int __smc_connect(struct smc_sock *smc) return 0; } +static void smc_connect_work(struct work_struct *work) +{ + struct smc_sock *smc = container_of(work, struct smc_sock, + connect_work); + int rc; + + lock_sock(&smc->sk); + rc = kernel_connect(smc->clcsock, &smc->connect_info->addr, + smc->connect_info->alen, smc->connect_info->flags); + if (smc->clcsock->sk->sk_err) { + smc->sk.sk_err = smc->clcsock->sk->sk_err; + goto out; + } + if (rc < 0) { + smc->sk.sk_err = -rc; + goto out; + } + + rc = __smc_connect(smc); + if (rc < 0) + smc->sk.sk_err = -rc; + +out: + smc->sk.sk_state_change(&smc->sk); + kfree(smc->connect_info); + smc->connect_info = NULL; + release_sock(&smc->sk); +} + static int smc_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { @@ -605,15 +642,32 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, smc_copy_sock_settings_to_clc(smc); tcp_sk(smc->clcsock->sk)->syn_smc = 1; - rc = kernel_connect(smc->clcsock, addr, alen, flags); - if (rc) - goto out; + if (flags & O_NONBLOCK) { + if (smc->connect_info) { + rc = -EALREADY; + goto out; + } + smc->connect_info = kzalloc(alen + 2 * sizeof(int), GFP_KERNEL); + if (!smc->connect_info) { + rc = -ENOMEM; + goto out; + } + smc->connect_info->alen = alen; + smc->connect_info->flags = flags ^ O_NONBLOCK; + memcpy(&smc->connect_info->addr, addr, alen); + schedule_work(&smc->connect_work); + rc = -EINPROGRESS; + } else { + rc = kernel_connect(smc->clcsock, addr, alen, flags); + if (rc) + goto out; - rc = __smc_connect(smc); - if (rc < 0) - goto out; - else - rc = 0; /* success cases including fallback */ + rc = __smc_connect(smc); + if (rc < 0) + goto out; + else + rc = 0; /* success cases including fallback */ + } out: release_sock(sk); @@ -1278,34 +1332,17 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events) struct sock *sk = sock->sk; __poll_t mask = 0; struct smc_sock *smc; - int rc; if (!sk) return EPOLLNVAL; smc = smc_sk(sock->sk); - sock_hold(sk); - lock_sock(sk); if ((sk->sk_state == SMC_INIT) || smc->use_fallback) { /* delegate to CLC child sock */ - release_sock(sk); mask = smc->clcsock->ops->poll_mask(smc->clcsock, events); - lock_sock(sk); sk->sk_err = smc->clcsock->sk->sk_err; - if (sk->sk_err) { + if (sk->sk_err) mask |= EPOLLERR; - } else { - /* if non-blocking connect finished ... */ - if (sk->sk_state == SMC_INIT && - mask & EPOLLOUT && - smc->clcsock->sk->sk_state != TCP_CLOSE) { - rc = __smc_connect(smc); - if (rc < 0) - mask |= EPOLLERR; - /* success cases including fallback */ - mask |= EPOLLOUT | EPOLLWRNORM; - } - } } else { if (sk->sk_err) mask |= EPOLLERR; @@ -1334,8 +1371,6 @@ static __poll_t smc_poll_mask(struct socket *sock, __poll_t events) mask |= EPOLLPRI; } - release_sock(sk); - sock_put(sk); return mask; } diff --git a/net/smc/smc.h b/net/smc/smc.h index 51ae1f10d81a..d7ca26570482 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -187,11 +187,19 @@ struct smc_connection { struct work_struct close_work; /* peer sent some closing */ }; +struct smc_connect_info { + int flags; + int alen; + struct sockaddr addr; +}; + struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ + struct smc_connect_info *connect_info; /* connect address & flags */ + struct work_struct connect_work; /* handle non-blocking connect*/ struct work_struct tcp_listen_work;/* handle tcp socket accepts */ struct work_struct smc_listen_work;/* prepare new accept socket */ struct list_head accept_q; /* sockets to be accepted */ -- cgit v1.2.3 From 4c79579b44b1876444f4d04de31c1a37098a0350 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 26 Jun 2018 16:21:18 -0700 Subject: bpf: Change bpf_fib_lookup to return lookup status For ACLs implemented using either FIB rules or FIB entries, the BPF program needs the FIB lookup status to be able to drop the packet. Since the bpf_fib_lookup API has not reached a released kernel yet, change the return code to contain an encoding of the FIB lookup result and return the nexthop device index in the params struct. In addition, inform the BPF program of any post FIB lookup reason as to why the packet needs to go up the stack. The fib result for unicast routes must have an egress device, so remove the check that it is non-NULL. Signed-off-by: David Ahern Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 28 ++++++++++++--- net/core/filter.c | 86 +++++++++++++++++++++++++++++----------------- samples/bpf/xdp_fwd_kern.c | 8 ++--- 3 files changed, 81 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 59b19b6a40d7..b7db3261c62d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1857,7 +1857,8 @@ union bpf_attr { * is resolved), the nexthop address is returned in ipv4_dst * or ipv6_dst based on family, smac is set to mac address of * egress device, dmac is set to nexthop mac address, rt_metric - * is set to metric from route (IPv4/IPv6 only). + * is set to metric from route (IPv4/IPv6 only), and ifindex + * is set to the device index of the nexthop from the FIB lookup. * * *plen* argument is the size of the passed in struct. * *flags* argument can be a combination of one or more of the @@ -1873,9 +1874,10 @@ union bpf_attr { * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. * Return - * Egress device index on success, 0 if packet needs to continue - * up the stack for further processing or a negative error in case - * of failure. + * * < 0 if any input argument is invalid + * * 0 on success (packet is forwarded, nexthop neighbor exists) + * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the + * * packet is not forwarded or needs assist from full stack * * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) * Description @@ -2612,6 +2614,18 @@ struct bpf_raw_tracepoint_args { #define BPF_FIB_LOOKUP_DIRECT BIT(0) #define BPF_FIB_LOOKUP_OUTPUT BIT(1) +enum { + BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ + BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed; can be dropped */ + BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable; can be dropped */ + BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */ + BPF_FIB_LKUP_RET_NOT_FWDED, /* packet is not forwarded */ + BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */ + BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ + BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ + BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ +}; + struct bpf_fib_lookup { /* input: network family for lookup (AF_INET, AF_INET6) * output: network family of egress nexthop @@ -2625,7 +2639,11 @@ struct bpf_fib_lookup { /* total length of packet from network header - used for MTU check */ __u16 tot_len; - __u32 ifindex; /* L3 device index for lookup */ + + /* input: L3 device index for lookup + * output: device index from FIB lookup + */ + __u32 ifindex; union { /* inputs to lookup */ diff --git a/net/core/filter.c b/net/core/filter.c index e7f12e9f598c..0ca6907d7efe 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4073,8 +4073,9 @@ static int bpf_fib_set_fwd_params(struct bpf_fib_lookup *params, memcpy(params->smac, dev->dev_addr, ETH_ALEN); params->h_vlan_TCI = 0; params->h_vlan_proto = 0; + params->ifindex = dev->ifindex; - return dev->ifindex; + return 0; } #endif @@ -4098,7 +4099,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, /* verify forwarding is enabled on this interface */ in_dev = __in_dev_get_rcu(dev); if (unlikely(!in_dev || !IN_DEV_FORWARD(in_dev))) - return 0; + return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl4.flowi4_iif = 1; @@ -4123,7 +4124,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, tb = fib_get_table(net, tbid); if (unlikely(!tb)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; err = fib_table_lookup(tb, &fl4, &res, FIB_LOOKUP_NOREF); } else { @@ -4135,8 +4136,20 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_NOREF); } - if (err || res.type != RTN_UNICAST) - return 0; + if (err) { + /* map fib lookup errors to RTN_ type */ + if (err == -EINVAL) + return BPF_FIB_LKUP_RET_BLACKHOLE; + if (err == -EHOSTUNREACH) + return BPF_FIB_LKUP_RET_UNREACHABLE; + if (err == -EACCES) + return BPF_FIB_LKUP_RET_PROHIBIT; + + return BPF_FIB_LKUP_RET_NOT_FWDED; + } + + if (res.type != RTN_UNICAST) + return BPF_FIB_LKUP_RET_NOT_FWDED; if (res.fi->fib_nhs > 1) fib_select_path(net, &res, &fl4, NULL); @@ -4144,19 +4157,16 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ip_mtu_from_fib_result(&res, params->ipv4_dst); if (params->tot_len > mtu) - return 0; + return BPF_FIB_LKUP_RET_FRAG_NEEDED; } nh = &res.fi->fib_nh[res.nh_sel]; /* do not handle lwt encaps right now */ if (nh->nh_lwtstate) - return 0; + return BPF_FIB_LKUP_RET_UNSUPP_LWT; dev = nh->nh_dev; - if (unlikely(!dev)) - return 0; - if (nh->nh_gw) params->ipv4_dst = nh->nh_gw; @@ -4166,10 +4176,10 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, * rcu_read_lock_bh is not needed here */ neigh = __ipv4_neigh_lookup_noref(dev, (__force u32)params->ipv4_dst); - if (neigh) - return bpf_fib_set_fwd_params(params, neigh, dev); + if (!neigh) + return BPF_FIB_LKUP_RET_NO_NEIGH; - return 0; + return bpf_fib_set_fwd_params(params, neigh, dev); } #endif @@ -4190,7 +4200,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, /* link local addresses are never forwarded */ if (rt6_need_strict(dst) || rt6_need_strict(src)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; dev = dev_get_by_index_rcu(net, params->ifindex); if (unlikely(!dev)) @@ -4198,7 +4208,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, idev = __in6_dev_get_safely(dev); if (unlikely(!idev || !net->ipv6.devconf_all->forwarding)) - return 0; + return BPF_FIB_LKUP_RET_FWD_DISABLED; if (flags & BPF_FIB_LOOKUP_OUTPUT) { fl6.flowi6_iif = 1; @@ -4225,7 +4235,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, tb = ipv6_stub->fib6_get_table(net, tbid); if (unlikely(!tb)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; f6i = ipv6_stub->fib6_table_lookup(net, tb, oif, &fl6, strict); } else { @@ -4238,11 +4248,23 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, } if (unlikely(IS_ERR_OR_NULL(f6i) || f6i == net->ipv6.fib6_null_entry)) - return 0; + return BPF_FIB_LKUP_RET_NOT_FWDED; + + if (unlikely(f6i->fib6_flags & RTF_REJECT)) { + switch (f6i->fib6_type) { + case RTN_BLACKHOLE: + return BPF_FIB_LKUP_RET_BLACKHOLE; + case RTN_UNREACHABLE: + return BPF_FIB_LKUP_RET_UNREACHABLE; + case RTN_PROHIBIT: + return BPF_FIB_LKUP_RET_PROHIBIT; + default: + return BPF_FIB_LKUP_RET_NOT_FWDED; + } + } - if (unlikely(f6i->fib6_flags & RTF_REJECT || - f6i->fib6_type != RTN_UNICAST)) - return 0; + if (f6i->fib6_type != RTN_UNICAST) + return BPF_FIB_LKUP_RET_NOT_FWDED; if (f6i->fib6_nsiblings && fl6.flowi6_oif == 0) f6i = ipv6_stub->fib6_multipath_select(net, f6i, &fl6, @@ -4252,11 +4274,11 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, if (check_mtu) { mtu = ipv6_stub->ip6_mtu_from_fib6(f6i, dst, src); if (params->tot_len > mtu) - return 0; + return BPF_FIB_LKUP_RET_FRAG_NEEDED; } if (f6i->fib6_nh.nh_lwtstate) - return 0; + return BPF_FIB_LKUP_RET_UNSUPP_LWT; if (f6i->fib6_flags & RTF_GATEWAY) *dst = f6i->fib6_nh.nh_gw; @@ -4270,10 +4292,10 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params, */ neigh = ___neigh_lookup_noref(ipv6_stub->nd_tbl, neigh_key_eq128, ndisc_hashfn, dst, dev); - if (neigh) - return bpf_fib_set_fwd_params(params, neigh, dev); + if (!neigh) + return BPF_FIB_LKUP_RET_NO_NEIGH; - return 0; + return bpf_fib_set_fwd_params(params, neigh, dev); } #endif @@ -4315,7 +4337,7 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, struct bpf_fib_lookup *, params, int, plen, u32, flags) { struct net *net = dev_net(skb->dev); - int index = -EAFNOSUPPORT; + int rc = -EAFNOSUPPORT; if (plen < sizeof(*params)) return -EINVAL; @@ -4326,25 +4348,25 @@ BPF_CALL_4(bpf_skb_fib_lookup, struct sk_buff *, skb, switch (params->family) { #if IS_ENABLED(CONFIG_INET) case AF_INET: - index = bpf_ipv4_fib_lookup(net, params, flags, false); + rc = bpf_ipv4_fib_lookup(net, params, flags, false); break; #endif #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - index = bpf_ipv6_fib_lookup(net, params, flags, false); + rc = bpf_ipv6_fib_lookup(net, params, flags, false); break; #endif } - if (index > 0) { + if (!rc) { struct net_device *dev; - dev = dev_get_by_index_rcu(net, index); + dev = dev_get_by_index_rcu(net, params->ifindex); if (!is_skb_forwardable(dev, skb)) - index = 0; + rc = BPF_FIB_LKUP_RET_FRAG_NEEDED; } - return index; + return rc; } static const struct bpf_func_proto bpf_skb_fib_lookup_proto = { diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c index 6673cdb9f55c..a7e94e7ff87d 100644 --- a/samples/bpf/xdp_fwd_kern.c +++ b/samples/bpf/xdp_fwd_kern.c @@ -48,9 +48,9 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) struct ethhdr *eth = data; struct ipv6hdr *ip6h; struct iphdr *iph; - int out_index; u16 h_proto; u64 nh_off; + int rc; nh_off = sizeof(*eth); if (data + nh_off > data_end) @@ -101,7 +101,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) fib_params.ifindex = ctx->ingress_ifindex; - out_index = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags); + rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags); /* verify egress index has xdp support * TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with @@ -109,7 +109,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) * NOTE: without verification that egress index supports XDP * forwarding packets are dropped. */ - if (out_index > 0) { + if (rc == 0) { if (h_proto == htons(ETH_P_IP)) ip_decrease_ttl(iph); else if (h_proto == htons(ETH_P_IPV6)) @@ -117,7 +117,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags) memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN); memcpy(eth->h_source, fib_params.smac, ETH_ALEN); - return bpf_redirect_map(&tx_port, out_index, 0); + return bpf_redirect_map(&tx_port, fib_params.ifindex, 0); } return XDP_PASS; -- cgit v1.2.3 From e7441c9274a6a5453e06f4c2b8b5f72eca0a3f17 Mon Sep 17 00:00:00 2001 From: Denis Kenzior Date: Tue, 19 Jun 2018 10:39:50 -0500 Subject: mac80211: disable BHs/preemption in ieee80211_tx_control_port() On pre-emption enabled kernels the following print was being seen due to missing local_bh_disable/local_bh_enable calls. mac80211 assumes that pre-emption is disabled in the data path. BUG: using smp_processor_id() in preemptible [00000000] code: iwd/517 caller is __ieee80211_subif_start_xmit+0x144/0x210 [mac80211] [...] Call Trace: dump_stack+0x5c/0x80 check_preemption_disabled.cold.0+0x46/0x51 __ieee80211_subif_start_xmit+0x144/0x210 [mac80211] Fixes: 911806491425 ("mac80211: Add support for tx_control_port") Signed-off-by: Denis Kenzior [commit message rewrite, fixes tag] Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 44b5dfe8727d..fa1f1e63a264 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -4845,7 +4845,9 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, skb_reset_network_header(skb); skb_reset_mac_header(skb); + local_bh_disable(); __ieee80211_subif_start_xmit(skb, skb->dev, flags); + local_bh_enable(); return 0; } -- cgit v1.2.3 From 188f60ab8e787fcbb5ac9d64ede23a0070231f09 Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Sun, 24 Jun 2018 21:10:49 -0400 Subject: nl80211: relax ht operation checks for mesh Commit 9757235f451c, "nl80211: correct checks for NL80211_MESHCONF_HT_OPMODE value") relaxed the range for the HT operation field in meshconf, while also adding checks requiring the non-greenfield and non-ht-sta bits to be set in certain circumstances. The latter bit is actually reserved for mesh BSSes according to Table 9-168 in 802.11-2016, so in fact it should not be set. wpa_supplicant sets these bits because the mesh and AP code share the same implementation, but authsae does not. As a result, some meshconf updates from authsae which set only the NONHT_MIXED protection bits were being rejected. In order to avoid breaking userspace by changing the rules again, simply accept the values with or without the bits set, and mask off the reserved bit to match the spec. While in here, update the 802.11-2012 reference to 802.11-2016. Fixes: 9757235f451c ("nl80211: correct checks for NL80211_MESHCONF_HT_OPMODE value") Cc: Masashi Honma Signed-off-by: Bob Copeland Reviewed-by: Masashi Honma Reviewed-by: Masashi Honma Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index c7bbe5f0aae8..351eeaf16abe 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -6231,7 +6231,7 @@ do { \ nl80211_check_s32); /* * Check HT operation mode based on - * IEEE 802.11 2012 8.4.2.59 HT Operation element. + * IEEE 802.11-2016 9.4.2.57 HT Operation element. */ if (tb[NL80211_MESHCONF_HT_OPMODE]) { ht_opmode = nla_get_u16(tb[NL80211_MESHCONF_HT_OPMODE]); @@ -6241,22 +6241,9 @@ do { \ IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT)) return -EINVAL; - if ((ht_opmode & IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT) && - (ht_opmode & IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT)) - return -EINVAL; + /* NON_HT_STA bit is reserved, but some programs set it */ + ht_opmode &= ~IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT; - switch (ht_opmode & IEEE80211_HT_OP_MODE_PROTECTION) { - case IEEE80211_HT_OP_MODE_PROTECTION_NONE: - case IEEE80211_HT_OP_MODE_PROTECTION_20MHZ: - if (ht_opmode & IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT) - return -EINVAL; - break; - case IEEE80211_HT_OP_MODE_PROTECTION_NONMEMBER: - case IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED: - if (!(ht_opmode & IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT)) - return -EINVAL; - break; - } cfg->ht_opmode = ht_opmode; mask |= (1 << (NL80211_MESHCONF_HT_OPMODE - 1)); } -- cgit v1.2.3 From 95bca62fb723a121954fc7ae5473bb2c1f0d5986 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 29 Jun 2018 09:33:39 +0200 Subject: nl80211: check nla_parse_nested() return values At the very least we should check the return value if nla_parse_nested() is called with a non-NULL policy. Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 351eeaf16abe..4eece06be1e7 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10949,9 +10949,12 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) rem) { u8 *mask_pat; - nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, - nl80211_packet_pattern_policy, - info->extack); + err = nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, + nl80211_packet_pattern_policy, + info->extack); + if (err) + goto error; + err = -EINVAL; if (!pat_tb[NL80211_PKTPAT_MASK] || !pat_tb[NL80211_PKTPAT_PATTERN]) @@ -11200,8 +11203,11 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev, rem) { u8 *mask_pat; - nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, - nl80211_packet_pattern_policy, NULL); + err = nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, + nl80211_packet_pattern_policy, NULL); + if (err) + return err; + if (!pat_tb[NL80211_PKTPAT_MASK] || !pat_tb[NL80211_PKTPAT_PATTERN]) return -EINVAL; -- cgit v1.2.3 From e699e2c6a654ff8d7303f5297ab5dd83da7b23e0 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 27 Jun 2018 15:16:42 -0700 Subject: net, mm: account sock objects to kmemcg Currently the kernel accounts the memory for network traffic through mem_cgroup_[un]charge_skmem() interface. However the memory accounted only includes the truesize of sk_buff which does not include the size of sock objects. In our production environment, with opt-out kmem accounting, the sock kmem caches (TCP[v6], UDP[v6], RAW[v6], UNIX) are among the top most charged kmem caches and consume a significant amount of memory which can not be left as system overhead. So, this patch converts the kmem caches of all sock objects to SLAB_ACCOUNT. Signed-off-by: Shakeel Butt Suggested-by: Eric Dumazet Reviewed-by: Kirill Tkhai Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/sock.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index bcc41829a16d..9e8f65585b81 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3243,7 +3243,8 @@ static int req_prot_init(const struct proto *prot) rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, rsk_prot->obj_size, 0, - prot->slab_flags, NULL); + SLAB_ACCOUNT | prot->slab_flags, + NULL); if (!rsk_prot->slab) { pr_crit("%s: Can't create request sock SLAB cache!\n", @@ -3258,7 +3259,8 @@ int proto_register(struct proto *prot, int alloc_slab) if (alloc_slab) { prot->slab = kmem_cache_create_usercopy(prot->name, prot->obj_size, 0, - SLAB_HWCACHE_ALIGN | prot->slab_flags, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | + prot->slab_flags, prot->useroffset, prot->usersize, NULL); @@ -3281,6 +3283,7 @@ int proto_register(struct proto *prot, int alloc_slab) kmem_cache_create(prot->twsk_prot->twsk_slab_name, prot->twsk_prot->twsk_obj_size, 0, + SLAB_ACCOUNT | prot->slab_flags, NULL); if (prot->twsk_prot->twsk_slab == NULL) -- cgit v1.2.3 From c860e997e9170a6d68f9d1e6e2cf61f572191aaf Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 27 Jun 2018 16:04:48 -0700 Subject: tcp: fix Fast Open key endianness Fast Open key could be stored in different endian based on the CPU. Previously hosts in different endianness in a server farm using the same key config (sysctl value) would produce different cookies. This patch fixes it by always storing it as little endian to keep same API for LE hosts. Reported-by: Daniele Iamartino Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d06247ba08b2..af0a857d8352 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -265,8 +265,9 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, ipv4.sysctl_tcp_fastopen); struct ctl_table tbl = { .maxlen = (TCP_FASTOPEN_KEY_LENGTH * 2 + 10) }; struct tcp_fastopen_context *ctxt; - int ret; u32 user_key[4]; /* 16 bytes, matching TCP_FASTOPEN_KEY_LENGTH */ + __le32 key[4]; + int ret, i; tbl.data = kmalloc(tbl.maxlen, GFP_KERNEL); if (!tbl.data) @@ -275,11 +276,14 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, rcu_read_lock(); ctxt = rcu_dereference(net->ipv4.tcp_fastopen_ctx); if (ctxt) - memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); + memcpy(key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); else - memset(user_key, 0, sizeof(user_key)); + memset(key, 0, sizeof(key)); rcu_read_unlock(); + for (i = 0; i < ARRAY_SIZE(key); i++) + user_key[i] = le32_to_cpu(key[i]); + snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x", user_key[0], user_key[1], user_key[2], user_key[3]); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); @@ -290,13 +294,17 @@ static int proc_tcp_fastopen_key(struct ctl_table *table, int write, ret = -EINVAL; goto bad_key; } - tcp_fastopen_reset_cipher(net, NULL, user_key, + + for (i = 0; i < ARRAY_SIZE(user_key); i++) + key[i] = cpu_to_le32(user_key[i]); + + tcp_fastopen_reset_cipher(net, NULL, key, TCP_FASTOPEN_KEY_LENGTH); } bad_key: pr_debug("proc FO key set 0x%x-%x-%x-%x <- 0x%s: %u\n", - user_key[0], user_key[1], user_key[2], user_key[3], + user_key[0], user_key[1], user_key[2], user_key[3], (char *)tbl.data, ret); kfree(tbl.data); return ret; -- cgit v1.2.3 From d14b56f508ad70eca3e659545aab3c45200f258c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 28 Jun 2018 17:53:06 +0200 Subject: net: cleanup gfp mask in alloc_skb_with_frags alloc_skb_with_frags uses __GFP_NORETRY for non-sleeping allocations which is just a noop and a little bit confusing. __GFP_NORETRY was added by ed98df3361f0 ("net: use __GFP_NORETRY for high order allocations") to prevent from the OOM killer. Yet this was not enough because fb05e7a89f50 ("net: don't wait for order-3 page allocation") didn't want an excessive reclaim for non-costly orders so it made it completely NOWAIT while it preserved __GFP_NORETRY in place which is now redundant. Drop the pointless __GFP_NORETRY because this function is used as copy&paste source for other places. Reviewed-by: Eric Dumazet Signed-off-by: Michal Hocko Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c642304f178c..eba8dae22c25 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -5276,8 +5276,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, if (npages >= 1 << order) { page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | __GFP_COMP | - __GFP_NOWARN | - __GFP_NORETRY, + __GFP_NOWARN, order); if (page) goto fill_page; -- cgit v1.2.3 From e7c7faa936680a2f0e78fc6d9683a5728421a150 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 28 Jun 2018 13:36:55 -0700 Subject: net/ipv6: Fix updates to prefix route Sowmini reported that a recent commit broke prefix routes for linklocal addresses. The newly added modify_prefix_route is attempting to add a new prefix route when the ifp priority does not match the route metric however the check needs to account for the default priority. In addition, the route add fails because the route already exists, and then the delete removes the one that exists. Flip the order to do the delete first. Fixes: 8308f3ff1753 ("net/ipv6: Add support for specifying metric of connected routes") Reported-by: Sowmini Varadhan Tested-by: Sowmini Varadhan Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c134286d6a41..91580c62bb86 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4528,6 +4528,7 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, u32 flags) { struct fib6_info *f6i; + u32 prio; f6i = addrconf_get_prefix_route(&ifp->addr, ifp->prefix_len, @@ -4536,13 +4537,15 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, if (!f6i) return -ENOENT; - if (f6i->fib6_metric != ifp->rt_priority) { + prio = ifp->rt_priority ? : IP6_RT_PRIO_ADDRCONF; + if (f6i->fib6_metric != prio) { + /* delete old one */ + ip6_del_rt(dev_net(ifp->idev->dev), f6i); + /* add new one */ addrconf_prefix_route(&ifp->addr, ifp->prefix_len, ifp->rt_priority, ifp->idev->dev, expires, flags, GFP_KERNEL); - /* delete old one */ - ip6_del_rt(dev_net(ifp->idev->dev), f6i); } else { if (!expires) fib6_clean_expires(f6i); -- cgit v1.2.3 From 3f76df198288ceec92fc9eddecad1e73c52769b0 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 29 Jun 2018 13:42:48 -0700 Subject: net: use dev_change_tx_queue_len() for SIOCSIFTXQLEN As noticed by Eric, we need to switch to the helper dev_change_tx_queue_len() for SIOCSIFTXQLEN call path too, otheriwse still miss dev_qdisc_change_tx_queue_len(). Fixes: 6a643ddb5624 ("net: introduce helper dev_change_tx_queue_len()") Reported-by: Eric Dumazet Signed-off-by: Cong Wang Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev_ioctl.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index a04e1e88bf3a..50537ff961a7 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -285,16 +285,9 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd) if (ifr->ifr_qlen < 0) return -EINVAL; if (dev->tx_queue_len ^ ifr->ifr_qlen) { - unsigned int orig_len = dev->tx_queue_len; - - dev->tx_queue_len = ifr->ifr_qlen; - err = call_netdevice_notifiers( - NETDEV_CHANGE_TX_QUEUE_LEN, dev); - err = notifier_to_errno(err); - if (err) { - dev->tx_queue_len = orig_len; + err = dev_change_tx_queue_len(dev, ifr->ifr_qlen); + if (err) return err; - } } return 0; -- cgit v1.2.3 From 35e8c7ba0863acadd4b9badd09740af7a8331125 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Fri, 29 Jun 2018 14:32:15 -0700 Subject: net: fib_rules: bring back rule_exists to match rule during add After commit f9d4b0c1e969 ("fib_rules: move common handling of newrule delrule msgs into fib_nl2rule"), rule_exists got replaced by rule_find for existing rule lookup in both the add and del paths. While this is good for the delete path, it solves a few problems but opens up a few invalid key matches in the add path. $ip -4 rule add table main tos 10 fwmark 1 $ip -4 rule add table main tos 10 RTNETLINK answers: File exists The problem here is rule_find does not check if the key masks in the new and old rule are the same and hence ends up matching a more secific rule. Rule key masks cannot be easily compared today without an elaborate if-else block. Its best to introduce key masks for easier and accurate rule comparison in the future. Until then, due to fear of regressions this patch re-introduces older loose rule_exists during add. Also fixes both rule_exists and rule_find to cover missing attributes. Fixes: f9d4b0c1e969 ("fib_rules: move common handling of newrule delrule msgs into fib_nl2rule") Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/core/fib_rules.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index bc8425d81022..f64aa13811ea 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -444,6 +444,9 @@ static struct fib_rule *rule_find(struct fib_rules_ops *ops, if (rule->ip_proto && r->ip_proto != rule->ip_proto) continue; + if (rule->proto && r->proto != rule->proto) + continue; + if (fib_rule_port_range_set(&rule->sport_range) && !fib_rule_port_range_compare(&r->sport_range, &rule->sport_range)) @@ -653,6 +656,73 @@ errout: return err; } +static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, + struct nlattr **tb, struct fib_rule *rule) +{ + struct fib_rule *r; + + list_for_each_entry(r, &ops->rules_list, list) { + if (r->action != rule->action) + continue; + + if (r->table != rule->table) + continue; + + if (r->pref != rule->pref) + continue; + + if (memcmp(r->iifname, rule->iifname, IFNAMSIZ)) + continue; + + if (memcmp(r->oifname, rule->oifname, IFNAMSIZ)) + continue; + + if (r->mark != rule->mark) + continue; + + if (r->suppress_ifgroup != rule->suppress_ifgroup) + continue; + + if (r->suppress_prefixlen != rule->suppress_prefixlen) + continue; + + if (r->mark_mask != rule->mark_mask) + continue; + + if (r->tun_id != rule->tun_id) + continue; + + if (r->fr_net != rule->fr_net) + continue; + + if (r->l3mdev != rule->l3mdev) + continue; + + if (!uid_eq(r->uid_range.start, rule->uid_range.start) || + !uid_eq(r->uid_range.end, rule->uid_range.end)) + continue; + + if (r->ip_proto != rule->ip_proto) + continue; + + if (r->proto != rule->proto) + continue; + + if (!fib_rule_port_range_compare(&r->sport_range, + &rule->sport_range)) + continue; + + if (!fib_rule_port_range_compare(&r->dport_range, + &rule->dport_range)) + continue; + + if (!ops->compare(r, frh, tb)) + continue; + return 1; + } + return 0; +} + int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { @@ -687,7 +757,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; if ((nlh->nlmsg_flags & NLM_F_EXCL) && - rule_find(ops, frh, tb, rule, user_priority)) { + rule_exists(ops, frh, tb, rule)) { err = -EEXIST; goto errout_free; } -- cgit v1.2.3 From 1236f22fbae15df3736ab4a984c64c0c6ee6254c Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Fri, 29 Jun 2018 13:07:53 +0300 Subject: tcp: prevent bogus FRTO undos with non-SACK flows If SACK is not enabled and the first cumulative ACK after the RTO retransmission covers more than the retransmitted skb, a spurious FRTO undo will trigger (assuming FRTO is enabled for that RTO). The reason is that any non-retransmitted segment acknowledged will set FLAG_ORIG_SACK_ACKED in tcp_clean_rtx_queue even if there is no indication that it would have been delivered for real (the scoreboard is not kept with TCPCB_SACKED_ACKED bits in the non-SACK case so the check for that bit won't help like it does with SACK). Having FLAG_ORIG_SACK_ACKED set results in the spurious FRTO undo in tcp_process_loss. We need to use more strict condition for non-SACK case and check that none of the cumulatively ACKed segments were retransmitted to prove that progress is due to original transmissions. Only then keep FLAG_ORIG_SACK_ACKED set, allowing FRTO undo to proceed in non-SACK case. (FLAG_ORIG_SACK_ACKED is planned to be renamed to FLAG_ORIG_PROGRESS to better indicate its purpose but to keep this change minimal, it will be done in another patch). Besides burstiness and congestion control violations, this problem can result in RTO loop: When the loss recovery is prematurely undoed, only new data will be transmitted (if available) and the next retransmission can occur only after a new RTO which in case of multiple losses (that are not for consecutive packets) requires one RTO per loss to recover. Signed-off-by: Ilpo Järvinen Tested-by: Neal Cardwell Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 045d930d01a9..8e5522c6833a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3181,6 +3181,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (tcp_is_reno(tp)) { tcp_remove_reno_sacks(sk, pkts_acked); + + /* If any of the cumulatively ACKed segments was + * retransmitted, non-SACK case cannot confirm that + * progress was due to original transmission due to + * lack of TCPCB_SACKED_ACKED bits even if some of + * the packets may have been never retransmitted. + */ + if (flag & FLAG_RETRANS_DATA_ACKED) + flag &= ~FLAG_ORIG_SACK_ACKED; } else { int delta; -- cgit v1.2.3 From 603d4cf8fe095b1ee78f423d514427be507fb513 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Sat, 30 Jun 2018 17:38:55 +0200 Subject: net: fix use-after-free in GRO with ESP Since the addition of GRO for ESP, gro_receive can consume the skb and return -EINPROGRESS. In that case, the lower layer GRO handler cannot touch the skb anymore. Commit 5f114163f2f5 ("net: Add a skb_gro_flush_final helper.") converted some of the gro_receive handlers that can lead to ESP's gro_receive so that they wouldn't access the skb when -EINPROGRESS is returned, but missed other spots, mainly in tunneling protocols. This patch finishes the conversion to using skb_gro_flush_final(), and adds a new helper, skb_gro_flush_final_remcsum(), used in VXLAN and GUE. Fixes: 5f114163f2f5 ("net: Add a skb_gro_flush_final helper.") Signed-off-by: Sabrina Dubroca Reviewed-by: Stefano Brivio Signed-off-by: David S. Miller --- drivers/net/geneve.c | 2 +- drivers/net/vxlan.c | 4 +--- include/linux/netdevice.h | 20 ++++++++++++++++++++ net/8021q/vlan.c | 2 +- net/ipv4/fou.c | 4 +--- net/ipv4/gre_offload.c | 2 +- net/ipv4/udp_offload.c | 2 +- 7 files changed, 26 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 750eaa53bf0c..ada33c2d9ac2 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -476,7 +476,7 @@ static struct sk_buff **geneve_gro_receive(struct sock *sk, out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index aee0e60471f1..f6bb1d54d4bd 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -623,9 +623,7 @@ static struct sk_buff **vxlan_gro_receive(struct sock *sk, flush = 0; out: - skb_gro_remcsum_cleanup(skb, &grc); - skb->remcsum_offload = 0; - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 3ec9850c7936..3d0cc0b5cec2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2789,11 +2789,31 @@ static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, if (PTR_ERR(pp) != -EINPROGRESS) NAPI_GRO_CB(skb)->flush |= flush; } +static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, + struct sk_buff **pp, + int flush, + struct gro_remcsum *grc) +{ + if (PTR_ERR(pp) != -EINPROGRESS) { + NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_remcsum_cleanup(skb, grc); + skb->remcsum_offload = 0; + } +} #else static inline void skb_gro_flush_final(struct sk_buff *skb, struct sk_buff **pp, int flush) { NAPI_GRO_CB(skb)->flush |= flush; } +static inline void skb_gro_flush_final_remcsum(struct sk_buff *skb, + struct sk_buff **pp, + int flush, + struct gro_remcsum *grc) +{ + NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_remcsum_cleanup(skb, grc); + skb->remcsum_offload = 0; +} #endif static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 73a65789271b..8ccee3d01822 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -693,7 +693,7 @@ static struct sk_buff **vlan_gro_receive(struct sk_buff **head, out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 1540db65241a..c9ec1603666b 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -448,9 +448,7 @@ next_proto: out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; - skb_gro_remcsum_cleanup(skb, &grc); - skb->remcsum_offload = 0; + skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c index 1859c473b21a..6a7d980105f6 100644 --- a/net/ipv4/gre_offload.c +++ b/net/ipv4/gre_offload.c @@ -223,7 +223,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head, out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 92dc9e5a7ff3..69c54540d5b4 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -394,7 +394,7 @@ unflush: out_unlock: rcu_read_unlock(); out: - NAPI_GRO_CB(skb)->flush |= flush; + skb_gro_flush_final(skb, pp, flush); return pp; } EXPORT_SYMBOL(udp_gro_receive); -- cgit v1.2.3 From fc9c2029e37c3ae9efc28bf47045e0b87e09660c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sat, 30 Jun 2018 15:26:56 -0700 Subject: ipv6: sr: fix passing wrong flags to crypto_alloc_shash() The 'mask' argument to crypto_alloc_shash() uses the CRYPTO_ALG_* flags, not 'gfp_t'. So don't pass GFP_KERNEL to it. Fixes: bf355b8d2c30 ("ipv6: sr: add core files for SR HMAC support") Signed-off-by: Eric Biggers Signed-off-by: David S. Miller --- net/ipv6/seg6_hmac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 33fb35cbfac1..558fe8cc6d43 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -373,7 +373,7 @@ static int seg6_hmac_init_algo(void) return -ENOMEM; for_each_possible_cpu(cpu) { - tfm = crypto_alloc_shash(algo->name, 0, GFP_KERNEL); + tfm = crypto_alloc_shash(algo->name, 0, 0); if (IS_ERR(tfm)) return PTR_ERR(tfm); p_tfm = per_cpu_ptr(algo->tfms, cpu); -- cgit v1.2.3