aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid S. Miller2022-05-16 10:10:37 +0100
committerDavid S. Miller2022-05-16 10:10:37 +0100
commit1a01a0751731c807c04e81d3c19c5b782d205af7 (patch)
tree3646fe82bbefb7085b3138baf17ecb4af5e557a1
parentd9713088158b23973266e07fdc85ff7d68791a8c (diff)
parent4f9bd53084d18c2f9f1ec68fa56587b99a2cef00 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next
Pablo Neira Ayuso says: ==================== Netfilter updates for net-next This is v2 including deadlock fix in conntrack ecache rework reported by Jakub Kicinski. The following patchset contains Netfilter updates for net-next, mostly updates to conntrack from Florian Westphal. 1) Add a dedicated list for conntrack event redelivery. 2) Include event redelivery list in conntrack dumps of dying type. 3) Remove per-cpu dying list for event redelivery, not used anymore. 4) Add netns .pre_exit to cttimeout to zap timeout objects before synchronize_rcu() call. 5) Remove nf_ct_unconfirmed_destroy. 6) Add generation id for conntrack extensions for conntrack timeout and helpers. 7) Detach timeout policy from conntrack on cttimeout module removal. 8) Remove __nf_ct_unconfirmed_destroy. 9) Remove unconfirmed list. 10) Remove unconditional local_bh_disable in init_conntrack(). 11) Consolidate conntrack iterator nf_ct_iterate_cleanup(). 12) Detect if ctnetlink listeners exist to short-circuit event path early. 13) Un-inline nf_ct_ecache_ext_add(). 14) Add nf_conntrack_events autodetect ctnetlink listener mode and make it default. 15) Add nf_ct_ecache_exist() to check for event cache extension. 16) Extend flowtable reverse route lookup to include source, iif, tos and mark, from Sven Auhagen. 17) Do not verify zero checksum UDP packets in nf_reject, from Kevin Mitchell. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--Documentation/networking/nf_conntrack-sysctl.rst5
-rw-r--r--include/net/netfilter/nf_conntrack.h17
-rw-r--r--include/net/netfilter/nf_conntrack_core.h2
-rw-r--r--include/net/netfilter/nf_conntrack_ecache.h53
-rw-r--r--include/net/netfilter/nf_conntrack_extend.h31
-rw-r--r--include/net/netfilter/nf_conntrack_labels.h10
-rw-r--r--include/net/netfilter/nf_conntrack_timeout.h8
-rw-r--r--include/net/netfilter/nf_reject.h21
-rw-r--r--include/net/netns/conntrack.h8
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c10
-rw-r--r--net/ipv6/netfilter/nf_reject_ipv6.c4
-rw-r--r--net/netfilter/nf_conntrack_core.c304
-rw-r--r--net/netfilter/nf_conntrack_ecache.c165
-rw-r--r--net/netfilter/nf_conntrack_extend.c32
-rw-r--r--net/netfilter/nf_conntrack_helper.c5
-rw-r--r--net/netfilter/nf_conntrack_netlink.c86
-rw-r--r--net/netfilter/nf_conntrack_proto.c10
-rw-r--r--net/netfilter/nf_conntrack_standalone.c2
-rw-r--r--net/netfilter/nf_conntrack_timeout.c7
-rw-r--r--net/netfilter/nf_nat_masquerade.c5
-rw-r--r--net/netfilter/nfnetlink.c40
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c47
-rw-r--r--net/netfilter/nft_flow_offload.c8
23 files changed, 494 insertions, 386 deletions
diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
index 311128abb768..834945ebc4cd 100644
--- a/Documentation/networking/nf_conntrack-sysctl.rst
+++ b/Documentation/networking/nf_conntrack-sysctl.rst
@@ -34,10 +34,13 @@ nf_conntrack_count - INTEGER (read-only)
nf_conntrack_events - BOOLEAN
- 0 - disabled
- - not 0 - enabled (default)
+ - 1 - enabled
+ - 2 - auto (default)
If this option is enabled, the connection tracking code will
provide userspace with connection tracking events via ctnetlink.
+ The default allocates the extension if a userspace program is
+ listening to ctnetlink events.
nf_conntrack_expect_max - INTEGER
Maximum size of expectation table. Default value is
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 69e6c6a218be..a32be8aa7ed2 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -45,7 +45,8 @@ union nf_conntrack_expect_proto {
struct nf_conntrack_net_ecache {
struct delayed_work dwork;
- struct netns_ct *ct_net;
+ spinlock_t dying_lock;
+ struct hlist_nulls_head dying_list;
};
struct nf_conntrack_net {
@@ -100,7 +101,6 @@ struct nf_conn {
/* Have we seen traffic both ways yet? (bitset) */
unsigned long status;
- u16 cpu;
possible_net_t ct_net;
#if IS_ENABLED(CONFIG_NF_NAT)
@@ -236,13 +236,16 @@ static inline bool nf_ct_kill(struct nf_conn *ct)
return nf_ct_delete(ct, 0, 0);
}
-/* Set all unconfirmed conntrack as dying */
-void nf_ct_unconfirmed_destroy(struct net *);
+struct nf_ct_iter_data {
+ struct net *net;
+ void *data;
+ u32 portid;
+ int report;
+};
/* Iterate over all conntracks: if iter returns true, it's deleted. */
-void nf_ct_iterate_cleanup_net(struct net *net,
- int (*iter)(struct nf_conn *i, void *data),
- void *data, u32 portid, int report);
+void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
+ const struct nf_ct_iter_data *iter_data);
/* also set unconfirmed conntracks as dying. Only use in module exit path. */
void nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data),
diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 13807ea94cd2..6406cfee34c2 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -60,7 +60,7 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb)
if (ct) {
if (!nf_ct_is_confirmed(ct))
ret = __nf_conntrack_confirm(skb);
- if (likely(ret == NF_ACCEPT))
+ if (ret == NF_ACCEPT && nf_ct_ecache_exist(ct))
nf_ct_deliver_cached_events(ct);
}
return ret;
diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index 6c4c490a3e34..0c1dac318e02 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -14,7 +14,6 @@
#include <net/netfilter/nf_conntrack_extend.h>
enum nf_ct_ecache_state {
- NFCT_ECACHE_UNKNOWN, /* destroy event not sent */
NFCT_ECACHE_DESTROY_FAIL, /* tried but failed to send destroy event */
NFCT_ECACHE_DESTROY_SENT, /* sent destroy event after failure */
};
@@ -23,7 +22,6 @@ struct nf_conntrack_ecache {
unsigned long cache; /* bitops want long */
u16 ctmask; /* bitmask of ct events to be delivered */
u16 expmask; /* bitmask of expect events to be delivered */
- enum nf_ct_ecache_state state:8;/* ecache state */
u32 missed; /* missed events */
u32 portid; /* netlink portid of destroyer */
};
@@ -38,28 +36,12 @@ nf_ct_ecache_find(const struct nf_conn *ct)
#endif
}
-static inline struct nf_conntrack_ecache *
-nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
+static inline bool nf_ct_ecache_exist(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
- struct net *net = nf_ct_net(ct);
- struct nf_conntrack_ecache *e;
-
- if (!ctmask && !expmask && net->ct.sysctl_events) {
- ctmask = ~0;
- expmask = ~0;
- }
- if (!ctmask && !expmask)
- return NULL;
-
- e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp);
- if (e) {
- e->ctmask = ctmask;
- e->expmask = expmask;
- }
- return e;
+ return nf_ct_ext_exist(ct, NF_CT_EXT_ECACHE);
#else
- return NULL;
+ return false;
#endif
}
@@ -91,6 +73,7 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct);
int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct,
u32 portid, int report);
+bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp);
#else
static inline void nf_ct_deliver_cached_events(const struct nf_conn *ct)
@@ -105,6 +88,10 @@ static inline int nf_conntrack_eventmask_report(unsigned int eventmask,
return 0;
}
+static inline bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
+{
+ return false;
+}
#endif
static inline void
@@ -130,30 +117,20 @@ nf_conntrack_event_report(enum ip_conntrack_events event, struct nf_conn *ct,
u32 portid, int report)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
- const struct net *net = nf_ct_net(ct);
-
- if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb))
- return 0;
-
- return nf_conntrack_eventmask_report(1 << event, ct, portid, report);
-#else
- return 0;
+ if (nf_ct_ecache_exist(ct))
+ return nf_conntrack_eventmask_report(1 << event, ct, portid, report);
#endif
+ return 0;
}
static inline int
nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_EVENTS
- const struct net *net = nf_ct_net(ct);
-
- if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb))
- return 0;
-
- return nf_conntrack_eventmask_report(1 << event, ct, 0, 0);
-#else
- return 0;
+ if (nf_ct_ecache_exist(ct))
+ return nf_conntrack_eventmask_report(1 << event, ct, 0, 0);
#endif
+ return 0;
}
#ifdef CONFIG_NF_CONNTRACK_EVENTS
@@ -166,6 +143,8 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state);
void nf_conntrack_ecache_pernet_init(struct net *net);
void nf_conntrack_ecache_pernet_fini(struct net *net);
+struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net);
+
static inline bool nf_conntrack_ecache_dwork_pending(const struct net *net)
{
return net->ct.ecache_dwork_pending;
diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index 96635ad2acc7..0b247248b032 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -34,21 +34,11 @@ enum nf_ct_ext_id {
NF_CT_EXT_NUM,
};
-#define NF_CT_EXT_HELPER_TYPE struct nf_conn_help
-#define NF_CT_EXT_NAT_TYPE struct nf_conn_nat
-#define NF_CT_EXT_SEQADJ_TYPE struct nf_conn_seqadj
-#define NF_CT_EXT_ACCT_TYPE struct nf_conn_acct
-#define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache
-#define NF_CT_EXT_TSTAMP_TYPE struct nf_conn_tstamp
-#define NF_CT_EXT_TIMEOUT_TYPE struct nf_conn_timeout
-#define NF_CT_EXT_LABELS_TYPE struct nf_conn_labels
-#define NF_CT_EXT_SYNPROXY_TYPE struct nf_conn_synproxy
-#define NF_CT_EXT_ACT_CT_TYPE struct nf_conn_act_ct_ext
-
/* Extensions: optional stuff which isn't permanently in struct. */
struct nf_ct_ext {
u8 offset[NF_CT_EXT_NUM];
u8 len;
+ unsigned int gen_id;
char data[] __aligned(8);
};
@@ -62,17 +52,28 @@ static inline bool nf_ct_ext_exist(const struct nf_conn *ct, u8 id)
return (ct->ext && __nf_ct_ext_exist(ct->ext, id));
}
-static inline void *__nf_ct_ext_find(const struct nf_conn *ct, u8 id)
+void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id);
+
+static inline void *nf_ct_ext_find(const struct nf_conn *ct, u8 id)
{
- if (!nf_ct_ext_exist(ct, id))
+ struct nf_ct_ext *ext = ct->ext;
+
+ if (!ext || !__nf_ct_ext_exist(ext, id))
return NULL;
+ if (unlikely(ext->gen_id))
+ return __nf_ct_ext_find(ext, id);
+
return (void *)ct->ext + ct->ext->offset[id];
}
-#define nf_ct_ext_find(ext, id) \
- ((id##_TYPE *)__nf_ct_ext_find((ext), (id)))
/* Add this type, returns pointer to data or NULL. */
void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp);
+/* ext genid. if ext->id != ext_genid, extensions cannot be used
+ * anymore unless conntrack has CONFIRMED bit set.
+ */
+extern atomic_t nf_conntrack_ext_genid;
+void nf_ct_ext_bump_genid(void);
+
#endif /* _NF_CONNTRACK_EXTEND_H */
diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h
index 3c23298e68ca..66bab6c60d12 100644
--- a/include/net/netfilter/nf_conntrack_labels.h
+++ b/include/net/netfilter/nf_conntrack_labels.h
@@ -17,10 +17,18 @@ struct nf_conn_labels {
unsigned long bits[NF_CT_LABELS_MAX_SIZE / sizeof(long)];
};
+/* Can't use nf_ct_ext_find(), flow dissector cannot use symbols
+ * exported by nf_conntrack module.
+ */
static inline struct nf_conn_labels *nf_ct_labels_find(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_LABELS
- return nf_ct_ext_find(ct, NF_CT_EXT_LABELS);
+ struct nf_ct_ext *ext = ct->ext;
+
+ if (!ext || !__nf_ct_ext_exist(ext, NF_CT_EXT_LABELS))
+ return NULL;
+
+ return (void *)ct->ext + ct->ext->offset[NF_CT_EXT_LABELS];
#else
return NULL;
#endif
diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h
index 3ea94f6f3844..fea258983d23 100644
--- a/include/net/netfilter/nf_conntrack_timeout.h
+++ b/include/net/netfilter/nf_conntrack_timeout.h
@@ -17,14 +17,6 @@ struct nf_ct_timeout {
char data[];
};
-struct ctnl_timeout {
- struct list_head head;
- struct rcu_head rcu_head;
- refcount_t refcnt;
- char name[CTNL_TIMEOUT_NAME_MAX];
- struct nf_ct_timeout timeout;
-};
-
struct nf_conn_timeout {
struct nf_ct_timeout __rcu *timeout;
};
diff --git a/include/net/netfilter/nf_reject.h b/include/net/netfilter/nf_reject.h
index 9051c3a0c8e7..7c669792fb9c 100644
--- a/include/net/netfilter/nf_reject.h
+++ b/include/net/netfilter/nf_reject.h
@@ -5,12 +5,28 @@
#include <linux/types.h>
#include <uapi/linux/in.h>
-static inline bool nf_reject_verify_csum(__u8 proto)
+static inline bool nf_reject_verify_csum(struct sk_buff *skb, int dataoff,
+ __u8 proto)
{
/* Skip protocols that don't use 16-bit one's complement checksum
* of the entire payload.
*/
switch (proto) {
+ /* Protocols with optional checksums. */
+ case IPPROTO_UDP: {
+ const struct udphdr *udp_hdr;
+ struct udphdr _udp_hdr;
+
+ udp_hdr = skb_header_pointer(skb, dataoff,
+ sizeof(_udp_hdr),
+ &_udp_hdr);
+ if (!udp_hdr || udp_hdr->check)
+ return true;
+
+ return false;
+ }
+ case IPPROTO_GRE:
+
/* Protocols with other integrity checks. */
case IPPROTO_AH:
case IPPROTO_ESP:
@@ -19,9 +35,6 @@ static inline bool nf_reject_verify_csum(__u8 proto)
/* Protocols with partial checksums. */
case IPPROTO_UDPLITE:
case IPPROTO_DCCP:
-
- /* Protocols with optional checksums. */
- case IPPROTO_GRE:
return false;
}
return true;
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 0294f3d473af..0677cd3de034 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -93,14 +93,9 @@ struct nf_ip_net {
#endif
};
-struct ct_pcpu {
- spinlock_t lock;
- struct hlist_nulls_head unconfirmed;
- struct hlist_nulls_head dying;
-};
-
struct netns_ct {
#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ bool ctnetlink_has_listener;
bool ecache_dwork_pending;
#endif
u8 sysctl_log_invalid; /* Log invalid packets */
@@ -110,7 +105,6 @@ struct netns_ct {
u8 sysctl_tstamp;
u8 sysctl_checksum;
- struct ct_pcpu __percpu *pcpu_lists;
struct ip_conntrack_stat __percpu *stat;
struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
struct nf_ip_net nf_ct_proto;
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 4eed5afca392..918c61fda0f3 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -80,6 +80,7 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
struct iphdr *niph;
struct icmphdr *icmph;
unsigned int len;
+ int dataoff;
__wsum csum;
u8 proto;
@@ -99,10 +100,11 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
if (pskb_trim_rcsum(oldskb, ntohs(ip_hdr(oldskb)->tot_len)))
return NULL;
+ dataoff = ip_hdrlen(oldskb);
proto = ip_hdr(oldskb)->protocol;
if (!skb_csum_unnecessary(oldskb) &&
- nf_reject_verify_csum(proto) &&
+ nf_reject_verify_csum(oldskb, dataoff, proto) &&
nf_ip_checksum(oldskb, hook, ip_hdrlen(oldskb), proto))
return NULL;
@@ -311,6 +313,7 @@ EXPORT_SYMBOL_GPL(nf_send_reset);
void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
{
struct iphdr *iph = ip_hdr(skb_in);
+ int dataoff = ip_hdrlen(skb_in);
u8 proto = iph->protocol;
if (iph->frag_off & htons(IP_OFFSET))
@@ -320,12 +323,13 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook)
nf_reject_fill_skb_dst(skb_in) < 0)
return;
- if (skb_csum_unnecessary(skb_in) || !nf_reject_verify_csum(proto)) {
+ if (skb_csum_unnecessary(skb_in) ||
+ !nf_reject_verify_csum(skb_in, dataoff, proto)) {
icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
return;
}
- if (nf_ip_checksum(skb_in, hook, ip_hdrlen(skb_in), proto) == 0)
+ if (nf_ip_checksum(skb_in, hook, dataoff, proto) == 0)
icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
}
EXPORT_SYMBOL_GPL(nf_send_unreach);
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index dffeaaaadcde..f61d4f18e1cf 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -31,7 +31,7 @@ static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook)
if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
return false;
- if (!nf_reject_verify_csum(proto))
+ if (!nf_reject_verify_csum(skb, thoff, proto))
return true;
return nf_ip6_checksum(skb, hook, thoff, proto) == 0;
@@ -388,7 +388,7 @@ static bool reject6_csum_ok(struct sk_buff *skb, int hook)
if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
return false;
- if (!nf_reject_verify_csum(proto))
+ if (!nf_reject_verify_csum(skb, thoff, proto))
return true;
return nf_ip6_checksum(skb, hook, thoff, proto) == 0;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 0164e5f522e8..082a2fd8d85b 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -525,50 +525,6 @@ clean_from_lists(struct nf_conn *ct)
nf_ct_remove_expectations(ct);
}
-/* must be called with local_bh_disable */
-static void nf_ct_add_to_dying_list(struct nf_conn *ct)
-{
- struct ct_pcpu *pcpu;
-
- /* add this conntrack to the (per cpu) dying list */
- ct->cpu = smp_processor_id();
- pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
-
- spin_lock(&pcpu->lock);
- hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
- &pcpu->dying);
- spin_unlock(&pcpu->lock);
-}
-
-/* must be called with local_bh_disable */
-static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
-{
- struct ct_pcpu *pcpu;
-
- /* add this conntrack to the (per cpu) unconfirmed list */
- ct->cpu = smp_processor_id();
- pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
-
- spin_lock(&pcpu->lock);
- hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
- &pcpu->unconfirmed);
- spin_unlock(&pcpu->lock);
-}
-
-/* must be called with local_bh_disable */
-static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
-{
- struct ct_pcpu *pcpu;
-
- /* We overload first tuple to link into unconfirmed or dying list.*/
- pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
-
- spin_lock(&pcpu->lock);
- BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
- hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
- spin_unlock(&pcpu->lock);
-}
-
#define NFCT_ALIGN(len) (((len) + NFCT_INFOMASK) & ~NFCT_INFOMASK)
/* Released via nf_ct_destroy() */
@@ -640,7 +596,6 @@ void nf_ct_destroy(struct nf_conntrack *nfct)
if (unlikely(nf_ct_protonum(ct) == IPPROTO_GRE))
destroy_gre_conntrack(ct);
- local_bh_disable();
/* Expectations will have been removed in clean_from_lists,
* except TFTP can create an expectation on the first packet,
* before connection is in the list, so we need to clean here,
@@ -648,10 +603,6 @@ void nf_ct_destroy(struct nf_conntrack *nfct)
*/
nf_ct_remove_expectations(ct);
- nf_ct_del_from_dying_or_unconfirmed_list(ct);
-
- local_bh_enable();
-
if (ct->master)
nf_ct_put(ct->master);
@@ -660,15 +611,12 @@ void nf_ct_destroy(struct nf_conntrack *nfct)
}
EXPORT_SYMBOL(nf_ct_destroy);
-static void nf_ct_delete_from_lists(struct nf_conn *ct)
+static void __nf_ct_delete_from_lists(struct nf_conn *ct)
{
struct net *net = nf_ct_net(ct);
unsigned int hash, reply_hash;
unsigned int sequence;
- nf_ct_helper_destroy(ct);
-
- local_bh_disable();
do {
sequence = read_seqcount_begin(&nf_conntrack_generation);
hash = hash_conntrack(net,
@@ -681,12 +629,30 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct)
clean_from_lists(ct);
nf_conntrack_double_unlock(hash, reply_hash);
+}
+
+static void nf_ct_delete_from_lists(struct nf_conn *ct)
+{
+ nf_ct_helper_destroy(ct);
+ local_bh_disable();
- nf_ct_add_to_dying_list(ct);
+ __nf_ct_delete_from_lists(ct);
local_bh_enable();
}
+static void nf_ct_add_to_ecache_list(struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ struct nf_conntrack_net *cnet = nf_ct_pernet(nf_ct_net(ct));
+
+ spin_lock(&cnet->ecache.dying_lock);
+ hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &cnet->ecache.dying_list);
+ spin_unlock(&cnet->ecache.dying_lock);
+#endif
+}
+
bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
{
struct nf_conn_tstamp *tstamp;
@@ -709,7 +675,12 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
/* destroy event was not delivered. nf_ct_put will
* be done by event cache worker on redelivery.
*/
- nf_ct_delete_from_lists(ct);
+ nf_ct_helper_destroy(ct);
+ local_bh_disable();
+ __nf_ct_delete_from_lists(ct);
+ nf_ct_add_to_ecache_list(ct);
+ local_bh_enable();
+
nf_conntrack_ecache_work(nf_ct_net(ct), NFCT_ECACHE_DESTROY_FAIL);
return false;
}
@@ -870,6 +841,33 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct,
&nf_conntrack_hash[reply_hash]);
}
+static bool nf_ct_ext_valid_pre(const struct nf_ct_ext *ext)
+{
+ /* if ext->gen_id is not equal to nf_conntrack_ext_genid, some extensions
+ * may contain stale pointers to e.g. helper that has been removed.
+ *
+ * The helper can't clear this because the nf_conn object isn't in
+ * any hash and synchronize_rcu() isn't enough because associated skb
+ * might sit in a queue.
+ */
+ return !ext || ext->gen_id == atomic_read(&nf_conntrack_ext_genid);
+}
+
+static bool nf_ct_ext_valid_post(struct nf_ct_ext *ext)
+{
+ if (!ext)
+ return true;
+
+ if (ext->gen_id != atomic_read(&nf_conntrack_ext_genid))
+ return false;
+
+ /* inserted into conntrack table, nf_ct_iterate_cleanup()
+ * will find it. Disable nf_ct_ext_find() id check.
+ */
+ WRITE_ONCE(ext->gen_id, 0);
+ return true;
+}
+
int
nf_conntrack_hash_check_insert(struct nf_conn *ct)
{
@@ -885,6 +883,11 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
zone = nf_ct_zone(ct);
+ if (!nf_ct_ext_valid_pre(ct->ext)) {
+ NF_CT_STAT_INC(net, insert_failed);
+ return -ETIMEDOUT;
+ }
+
local_bh_disable();
do {
sequence = read_seqcount_begin(&nf_conntrack_generation);
@@ -925,6 +928,13 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
nf_conntrack_double_unlock(hash, reply_hash);
NF_CT_STAT_INC(net, insert);
local_bh_enable();
+
+ if (!nf_ct_ext_valid_post(ct->ext)) {
+ nf_ct_kill(ct);
+ NF_CT_STAT_INC(net, drop);
+ return -ETIMEDOUT;
+ }
+
return 0;
chaintoolong:
NF_CT_STAT_INC(net, chaintoolong);
@@ -972,7 +982,6 @@ static void __nf_conntrack_insert_prepare(struct nf_conn *ct)
struct nf_conn_tstamp *tstamp;
refcount_inc(&ct->ct_general.use);
- ct->status |= IPS_CONFIRMED;
/* set conntrack timestamp, if enabled. */
tstamp = nf_conn_tstamp_find(ct);
@@ -1001,7 +1010,6 @@ static int __nf_ct_resolve_clash(struct sk_buff *skb,
nf_conntrack_get(&ct->ct_general);
nf_ct_acct_merge(ct, ctinfo, loser_ct);
- nf_ct_add_to_dying_list(loser_ct);
nf_ct_put(loser_ct);
nf_ct_set(skb, ct, ctinfo);
@@ -1134,7 +1142,6 @@ nf_ct_resolve_clash(struct sk_buff *skb, struct nf_conntrack_tuple_hash *h,
return ret;
drop:
- nf_ct_add_to_dying_list(loser_ct);
NF_CT_STAT_INC(net, drop);
NF_CT_STAT_INC(net, insert_failed);
return NF_DROP;
@@ -1195,16 +1202,20 @@ __nf_conntrack_confirm(struct sk_buff *skb)
return NF_DROP;
}
+ if (!nf_ct_ext_valid_pre(ct->ext)) {
+ NF_CT_STAT_INC(net, insert_failed);
+ goto dying;
+ }
+
pr_debug("Confirming conntrack %p\n", ct);
/* We have to check the DYING flag after unlink to prevent
* a race against nf_ct_get_next_corpse() possibly called from
* user context, else we insert an already 'dead' hash, blocking
* further use of that particular connection -JM.
*/
- nf_ct_del_from_dying_or_unconfirmed_list(ct);
+ ct->status |= IPS_CONFIRMED;
if (unlikely(nf_ct_is_dying(ct))) {
- nf_ct_add_to_dying_list(ct);
NF_CT_STAT_INC(net, insert_failed);
goto dying;
}
@@ -1228,7 +1239,6 @@ __nf_conntrack_confirm(struct sk_buff *skb)
goto out;
if (chainlen++ > max_chainlen) {
chaintoolong:
- nf_ct_add_to_dying_list(ct);
NF_CT_STAT_INC(net, chaintoolong);
NF_CT_STAT_INC(net, insert_failed);
ret = NF_DROP;
@@ -1252,6 +1262,16 @@ chaintoolong:
nf_conntrack_double_unlock(hash, reply_hash);
local_bh_enable();
+ /* ext area is still valid (rcu read lock is held,
+ * but will go out of scope soon, we need to remove
+ * this conntrack again.
+ */
+ if (!nf_ct_ext_valid_post(ct->ext)) {
+ nf_ct_kill(ct);
+ NF_CT_STAT_INC(net, drop);
+ return NF_DROP;
+ }
+
help = nfct_help(ct);
if (help && help->helper)
nf_conntrack_event_cache(IPCT_HELPER, ct);
@@ -1678,7 +1698,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
struct nf_conn *ct;
struct nf_conn_help *help;
struct nf_conntrack_tuple repl_tuple;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
struct nf_conntrack_ecache *ecache;
+#endif
struct nf_conntrack_expect *exp = NULL;
const struct nf_conntrack_zone *zone;
struct nf_conn_timeout *timeout_ext;
@@ -1711,15 +1733,21 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
nf_ct_labels_ext_add(ct);
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
- nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
- ecache ? ecache->expmask : 0,
- GFP_ATOMIC);
- local_bh_disable();
+ if ((ecache || net->ct.sysctl_events) &&
+ !nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
+ ecache ? ecache->expmask : 0,
+ GFP_ATOMIC)) {
+ nf_conntrack_free(ct);
+ return ERR_PTR(-ENOMEM);
+ }
+#endif
+
cnet = nf_ct_pernet(net);
if (cnet->expect_count) {
- spin_lock(&nf_conntrack_expect_lock);
+ spin_lock_bh(&nf_conntrack_expect_lock);
exp = nf_ct_find_expectation(net, zone, tuple);
if (exp) {
pr_debug("expectation arrives ct=%p exp=%p\n",
@@ -1742,16 +1770,13 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
#endif
NF_CT_STAT_INC(net, expect_new);
}
- spin_unlock(&nf_conntrack_expect_lock);
+ spin_unlock_bh(&nf_conntrack_expect_lock);
}
if (!exp)
__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
- /* Now it is inserted into the unconfirmed list, set refcount to 1. */
+ /* Now it is going to be associated with an sk_buff, set refcount to 1. */
refcount_set(&ct->ct_general.use, 1);
- nf_ct_add_to_unconfirmed_list(ct);
-
- local_bh_enable();
if (exp) {
if (exp->expectfn)
@@ -2319,7 +2344,7 @@ static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
/* Bring out ya dead! */
static struct nf_conn *
get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
- void *data, unsigned int *bucket)
+ const struct nf_ct_iter_data *iter_data, unsigned int *bucket)
{
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
@@ -2350,7 +2375,12 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
* tuple while iterating.
*/
ct = nf_ct_tuplehash_to_ctrack(h);
- if (iter(ct, data))
+
+ if (iter_data->net &&
+ !net_eq(iter_data->net, nf_ct_net(ct)))
+ continue;
+
+ if (iter(ct, iter_data->data))
goto found;
}
spin_unlock(lockp);
@@ -2367,7 +2397,7 @@ found:
}
static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
- void *data, u32 portid, int report)
+ const struct nf_ct_iter_data *iter_data)
{
unsigned int bucket = 0;
struct nf_conn *ct;
@@ -2375,91 +2405,28 @@ static void nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data),
might_sleep();
mutex_lock(&nf_conntrack_mutex);
- while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
+ while ((ct = get_next_corpse(iter, iter_data, &bucket)) != NULL) {
/* Time to push up daises... */
- nf_ct_delete(ct, portid, report);
+ nf_ct_delete(ct, iter_data->portid, iter_data->report);
nf_ct_put(ct);
cond_resched();
}
mutex_unlock(&nf_conntrack_mutex);
}
-struct iter_data {
- int (*iter)(struct nf_conn *i, void *data);
- void *data;
- struct net *net;
-};
-
-static int iter_net_only(struct nf_conn *i, void *data)
-{
- struct iter_data *d = data;
-
- if (!net_eq(d->net, nf_ct_net(i)))
- return 0;
-
- return d->iter(i, d->data);
-}
-
-static void
-__nf_ct_unconfirmed_destroy(struct net *net)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct nf_conntrack_tuple_hash *h;
- struct hlist_nulls_node *n;
- struct ct_pcpu *pcpu;
-
- pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
-
- spin_lock_bh(&pcpu->lock);
- hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
- struct nf_conn *ct;
-
- ct = nf_ct_tuplehash_to_ctrack(h);
-
- /* we cannot call iter() on unconfirmed list, the
- * owning cpu can reallocate ct->ext at any time.
- */
- set_bit(IPS_DYING_BIT, &ct->status);
- }
- spin_unlock_bh(&pcpu->lock);
- cond_resched();
- }
-}
-
-void nf_ct_unconfirmed_destroy(struct net *net)
+void nf_ct_iterate_cleanup_net(int (*iter)(struct nf_conn *i, void *data),
+ const struct nf_ct_iter_data *iter_data)
{
+ struct net *net = iter_data->net;
struct nf_conntrack_net *cnet = nf_ct_pernet(net);
might_sleep();
- if (atomic_read(&cnet->count) > 0) {
- __nf_ct_unconfirmed_destroy(net);
- nf_queue_nf_hook_drop(net);
- synchronize_net();
- }
-}
-EXPORT_SYMBOL_GPL(nf_ct_unconfirmed_destroy);
-
-void nf_ct_iterate_cleanup_net(struct net *net,
- int (*iter)(struct nf_conn *i, void *data),
- void *data, u32 portid, int report)
-{
- struct nf_conntrack_net *cnet = nf_ct_pernet(net);
- struct iter_data d;
-
- might_sleep();
-
if (atomic_read(&cnet->count) == 0)
return;
- d.iter = iter;
- d.data = data;
- d.net = net;
-
- nf_ct_iterate_cleanup(iter_net_only, &d, portid, report);
+ nf_ct_iterate_cleanup(iter, iter_data);
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
@@ -2477,6 +2444,7 @@ EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup_net);
void
nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
{
+ struct nf_ct_iter_data iter_data = {};
struct net *net;
down_read(&net_rwsem);
@@ -2485,31 +2453,41 @@ nf_ct_iterate_destroy(int (*iter)(struct nf_conn *i, void *data), void *data)
if (atomic_read(&cnet->count) == 0)
continue;
- __nf_ct_unconfirmed_destroy(net);
nf_queue_nf_hook_drop(net);
}
up_read(&net_rwsem);
/* Need to wait for netns cleanup worker to finish, if its
* running -- it might have deleted a net namespace from
- * the global list, so our __nf_ct_unconfirmed_destroy() might
- * not have affected all namespaces.
+ * the global list, so hook drop above might not have
+ * affected all namespaces.
*/
net_ns_barrier();
- /* a conntrack could have been unlinked from unconfirmed list
- * before we grabbed pcpu lock in __nf_ct_unconfirmed_destroy().
+ /* a skb w. unconfirmed conntrack could have been reinjected just
+ * before we called nf_queue_nf_hook_drop().
+ *
* This makes sure its inserted into conntrack table.
*/
synchronize_net();
- nf_ct_iterate_cleanup(iter, data, 0, 0);
+ nf_ct_ext_bump_genid();
+ iter_data.data = data;
+ nf_ct_iterate_cleanup(iter, &iter_data);
+
+ /* Another cpu might be in a rcu read section with
+ * rcu protected pointer cleared in iter callback
+ * or hidden via nf_ct_ext_bump_genid() above.
+ *
+ * Wait until those are done.
+ */
+ synchronize_rcu();
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_destroy);
static int kill_all(struct nf_conn *i, void *data)
{
- return net_eq(nf_ct_net(i), data);
+ return 1;
}
void nf_conntrack_cleanup_start(void)
@@ -2544,8 +2522,9 @@ void nf_conntrack_cleanup_net(struct net *net)
void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list)
{
- int busy;
+ struct nf_ct_iter_data iter_data = {};
struct net *net;
+ int busy;
/*
* This makes sure all current packets have passed through
@@ -2558,7 +2537,8 @@ i_see_dead_people:
list_for_each_entry(net, net_exit_list, exit_list) {
struct nf_conntrack_net *cnet = nf_ct_pernet(net);
- nf_ct_iterate_cleanup(kill_all, net, 0, 0);
+ iter_data.net = net;
+ nf_ct_iterate_cleanup_net(kill_all, &iter_data);
if (atomic_read(&cnet->count) != 0)
busy = 1;
}
@@ -2571,7 +2551,6 @@ i_see_dead_people:
nf_conntrack_ecache_pernet_fini(net);
nf_conntrack_expect_pernet_fini(net);
free_percpu(net->ct.stat);
- free_percpu(net->ct.pcpu_lists);
}
}
@@ -2777,33 +2756,19 @@ void nf_conntrack_init_end(void)
* We need to use special "null" values, not used in hash table
*/
#define UNCONFIRMED_NULLS_VAL ((1<<30)+0)
-#define DYING_NULLS_VAL ((1<<30)+1)
int nf_conntrack_init_net(struct net *net)
{
struct nf_conntrack_net *cnet = nf_ct_pernet(net);
int ret = -ENOMEM;
- int cpu;
BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER);
BUILD_BUG_ON_NOT_POWER_OF_2(CONNTRACK_LOCKS);
atomic_set(&cnet->count, 0);
- net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
- if (!net->ct.pcpu_lists)
- goto err_stat;
-
- for_each_possible_cpu(cpu) {
- struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
-
- spin_lock_init(&pcpu->lock);
- INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
- INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
- }
-
net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
if (!net->ct.stat)
- goto err_pcpu_lists;
+ return ret;
ret = nf_conntrack_expect_pernet_init(net);
if (ret < 0)
@@ -2819,8 +2784,5 @@ int nf_conntrack_init_net(struct net *net)
err_expect:
free_percpu(net->ct.stat);
-err_pcpu_lists:
- free_percpu(net->ct.pcpu_lists);
-err_stat:
return ret;
}
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 0cb2da0a759a..8698b3424646 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -16,7 +16,6 @@
#include <linux/vmalloc.h>
#include <linux/stddef.h>
#include <linux/err.h>
-#include <linux/percpu.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/slab.h>
@@ -29,8 +28,9 @@
static DEFINE_MUTEX(nf_ct_ecache_mutex);
-#define ECACHE_RETRY_WAIT (HZ/10)
-#define ECACHE_STACK_ALLOC (256 / sizeof(void *))
+#define DYING_NULLS_VAL ((1 << 30) + 1)
+#define ECACHE_MAX_JIFFIES msecs_to_jiffies(10)
+#define ECACHE_RETRY_JIFFIES msecs_to_jiffies(10)
enum retry_state {
STATE_CONGESTED,
@@ -38,58 +38,67 @@ enum retry_state {
STATE_DONE,
};
-static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
+struct nf_conntrack_net_ecache *nf_conn_pernet_ecache(const struct net *net)
{
- struct nf_conn *refs[ECACHE_STACK_ALLOC];
+ struct nf_conntrack_net *cnet = nf_ct_pernet(net);
+
+ return &cnet->ecache;
+}
+#if IS_MODULE(CONFIG_NF_CT_NETLINK)
+EXPORT_SYMBOL_GPL(nf_conn_pernet_ecache);
+#endif
+
+static enum retry_state ecache_work_evict_list(struct nf_conntrack_net *cnet)
+{
+ unsigned long stop = jiffies + ECACHE_MAX_JIFFIES;
+ struct hlist_nulls_head evicted_list;
enum retry_state ret = STATE_DONE;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
- unsigned int evicted = 0;
+ unsigned int sent;
- spin_lock(&pcpu->lock);
+ INIT_HLIST_NULLS_HEAD(&evicted_list, DYING_NULLS_VAL);
- hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
+next:
+ sent = 0;
+ spin_lock_bh(&cnet->ecache.dying_lock);
+
+ hlist_nulls_for_each_entry_safe(h, n, &cnet->ecache.dying_list, hnnode) {
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
- struct nf_conntrack_ecache *e;
-
- if (!nf_ct_is_confirmed(ct))
- continue;
-
- /* This ecache access is safe because the ct is on the
- * pcpu dying list and we hold the spinlock -- the entry
- * cannot be free'd until after the lock is released.
- *
- * This is true even if ct has a refcount of 0: the
- * cpu that is about to free the entry must remove it
- * from the dying list and needs the lock to do so.
- */
- e = nf_ct_ecache_find(ct);
- if (!e || e->state != NFCT_ECACHE_DESTROY_FAIL)
- continue;
- /* ct is in NFCT_ECACHE_DESTROY_FAIL state, this means
- * the worker owns this entry: the ct will remain valid
- * until the worker puts its ct reference.
+ /* The worker owns all entries, ct remains valid until nf_ct_put
+ * in the loop below.
*/
if (nf_conntrack_event(IPCT_DESTROY, ct)) {
ret = STATE_CONGESTED;
break;
}
- e->state = NFCT_ECACHE_DESTROY_SENT;
- refs[evicted] = ct;
+ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+ hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &evicted_list);
- if (++evicted >= ARRAY_SIZE(refs)) {
+ if (time_after(stop, jiffies)) {
ret = STATE_RESTART;
break;
}
+
+ if (sent++ > 16) {
+ spin_unlock_bh(&cnet->ecache.dying_lock);
+ cond_resched();
+ goto next;
+ }
}
- spin_unlock(&pcpu->lock);
+ spin_unlock_bh(&cnet->ecache.dying_lock);
- /* can't _put while holding lock */
- while (evicted)
- nf_ct_put(refs[--evicted]);
+ hlist_nulls_for_each_entry_safe(h, n, &evicted_list, hnnode) {
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+ hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode);
+ nf_ct_put(ct);
+
+ cond_resched();
+ }
return ret;
}
@@ -97,35 +106,20 @@ static enum retry_state ecache_work_evict_list(struct ct_pcpu *pcpu)
static void ecache_work(struct work_struct *work)
{
struct nf_conntrack_net *cnet = container_of(work, struct nf_conntrack_net, ecache.dwork.work);
- struct netns_ct *ctnet = cnet->ecache.ct_net;
- int cpu, delay = -1;
- struct ct_pcpu *pcpu;
-
- local_bh_disable();
-
- for_each_possible_cpu(cpu) {
- enum retry_state ret;
-
- pcpu = per_cpu_ptr(ctnet->pcpu_lists, cpu);
-
- ret = ecache_work_evict_list(pcpu);
-
- switch (ret) {
- case STATE_CONGESTED:
- delay = ECACHE_RETRY_WAIT;
- goto out;
- case STATE_RESTART:
- delay = 0;
- break;
- case STATE_DONE:
- break;
- }
+ int ret, delay = -1;
+
+ ret = ecache_work_evict_list(cnet);
+ switch (ret) {
+ case STATE_CONGESTED:
+ delay = ECACHE_RETRY_JIFFIES;
+ break;
+ case STATE_RESTART:
+ delay = 0;
+ break;
+ case STATE_DONE:
+ break;
}
- out:
- local_bh_enable();
-
- ctnet->ecache_dwork_pending = delay > 0;
if (delay >= 0)
schedule_delayed_work(&cnet->ecache.dwork, delay);
}
@@ -199,7 +193,6 @@ int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
*/
if (e->portid == 0 && portid != 0)
e->portid = portid;
- e->state = NFCT_ECACHE_DESTROY_FAIL;
}
return ret;
@@ -297,12 +290,51 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
schedule_delayed_work(&cnet->ecache.dwork, HZ);
net->ct.ecache_dwork_pending = true;
} else if (state == NFCT_ECACHE_DESTROY_SENT) {
- net->ct.ecache_dwork_pending = false;
- mod_delayed_work(system_wq, &cnet->ecache.dwork, 0);
+ if (!hlist_nulls_empty(&cnet->ecache.dying_list))
+ mod_delayed_work(system_wq, &cnet->ecache.dwork, 0);
+ else
+ net->ct.ecache_dwork_pending = false;
}
}
-#define NF_CT_EVENTS_DEFAULT 1
+bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
+{
+ struct net *net = nf_ct_net(ct);
+ struct nf_conntrack_ecache *e;
+
+ switch (net->ct.sysctl_events) {
+ case 0:
+ /* assignment via template / ruleset? ignore sysctl. */
+ if (ctmask || expmask)
+ break;
+ return true;
+ case 2: /* autodetect: no event listener, don't allocate extension. */
+ if (!READ_ONCE(net->ct.ctnetlink_has_listener))
+ return true;
+ fallthrough;
+ case 1:
+ /* always allocate an extension. */
+ if (!ctmask && !expmask) {
+ ctmask = ~0;
+ expmask = ~0;
+ }
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ return true;
+ }
+
+ e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp);
+ if (e) {
+ e->ctmask = ctmask;
+ e->expmask = expmask;
+ }
+
+ return e != NULL;
+}
+EXPORT_SYMBOL_GPL(nf_ct_ecache_ext_add);
+
+#define NF_CT_EVENTS_DEFAULT 2
static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
void nf_conntrack_ecache_pernet_init(struct net *net)
@@ -311,8 +343,9 @@ void nf_conntrack_ecache_pernet_init(struct net *net)
net->ct.sysctl_events = nf_ct_events;
- cnet->ecache.ct_net = &net->ct;
INIT_DELAYED_WORK(&cnet->ecache.dwork, ecache_work);
+ INIT_HLIST_NULLS_HEAD(&cnet->ecache.dying_list, DYING_NULLS_VAL);
+ spin_lock_init(&cnet->ecache.dying_lock);
BUILD_BUG_ON(__IPCT_MAX >= 16); /* e->ctmask is u16 */
}
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 1296fda54ac6..0b513f7bf9f3 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -27,6 +27,8 @@
#define NF_CT_EXT_PREALLOC 128u /* conntrack events are on by default */
+atomic_t nf_conntrack_ext_genid __read_mostly = ATOMIC_INIT(1);
+
static const u8 nf_ct_ext_type_len[NF_CT_EXT_NUM] = {
[NF_CT_EXT_HELPER] = sizeof(struct nf_conn_help),
#if IS_ENABLED(CONFIG_NF_NAT)
@@ -116,8 +118,10 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
if (!new)
return NULL;
- if (!ct->ext)
+ if (!ct->ext) {
memset(new->offset, 0, sizeof(new->offset));
+ new->gen_id = atomic_read(&nf_conntrack_ext_genid);
+ }
new->offset[id] = newoff;
new->len = newlen;
@@ -127,3 +131,29 @@ void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
return (void *)new + newoff;
}
EXPORT_SYMBOL(nf_ct_ext_add);
+
+/* Use nf_ct_ext_find wrapper. This is only useful for unconfirmed entries. */
+void *__nf_ct_ext_find(const struct nf_ct_ext *ext, u8 id)
+{
+ unsigned int gen_id = atomic_read(&nf_conntrack_ext_genid);
+ unsigned int this_id = READ_ONCE(ext->gen_id);
+
+ if (!__nf_ct_ext_exist(ext, id))
+ return NULL;
+
+ if (this_id == 0 || ext->gen_id == gen_id)
+ return (void *)ext + ext->offset[id];
+
+ return NULL;
+}
+EXPORT_SYMBOL(__nf_ct_ext_find);
+
+void nf_ct_ext_bump_genid(void)
+{
+ unsigned int value = atomic_inc_return(&nf_conntrack_ext_genid);
+
+ if (value == UINT_MAX)
+ atomic_set(&nf_conntrack_ext_genid, 1);
+
+ msleep(HZ);
+}
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 8dec42ec603e..c12a87ebc3ee 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -468,11 +468,6 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
nf_ct_expect_iterate_destroy(expect_iter_me, NULL);
nf_ct_iterate_destroy(unhelp, me);
-
- /* Maybe someone has gotten the helper already when unhelp above.
- * So need to wait it.
- */
- synchronize_rcu();
}
EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 924d766e6c53..e768f59741a6 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1559,6 +1559,11 @@ static int ctnetlink_flush_conntrack(struct net *net,
u32 portid, int report, u8 family)
{
struct ctnetlink_filter *filter = NULL;
+ struct nf_ct_iter_data iter = {
+ .net = net,
+ .portid = portid,
+ .report = report,
+ };
if (ctnetlink_needs_filter(family, cda)) {
if (cda[CTA_FILTER])
@@ -1567,10 +1572,11 @@ static int ctnetlink_flush_conntrack(struct net *net,
filter = ctnetlink_alloc_filter(cda, family);
if (IS_ERR(filter))
return PTR_ERR(filter);
+
+ iter.data = filter;
}
- nf_ct_iterate_cleanup_net(net, ctnetlink_flush_iterate, filter,
- portid, report);
+ nf_ct_iterate_cleanup_net(ctnetlink_flush_iterate, &iter);
kfree(filter);
return 0;
@@ -1750,61 +1756,59 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb,
}
static int
-ctnetlink_dump_list(struct sk_buff *skb, struct netlink_callback *cb, bool dying)
+ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ return 0;
+}
+
+static int
+ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
{
struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
- struct nf_conn *ct, *last;
+ struct nf_conn *last = ctx->last;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ const struct net *net = sock_net(skb->sk);
+ struct nf_conntrack_net_ecache *ecache_net;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
- struct hlist_nulls_head *list;
- struct net *net = sock_net(skb->sk);
- int res, cpu;
+#endif
if (ctx->done)
return 0;
- last = ctx->last;
+ ctx->last = NULL;
- for (cpu = ctx->cpu; cpu < nr_cpu_ids; cpu++) {
- struct ct_pcpu *pcpu;
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ ecache_net = nf_conn_pernet_ecache(net);
+ spin_lock_bh(&ecache_net->dying_lock);
- if (!cpu_possible(cpu))
- continue;
+ hlist_nulls_for_each_entry(h, n, &ecache_net->dying_list, hnnode) {
+ struct nf_conn *ct;
+ int res;
- pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
- spin_lock_bh(&pcpu->lock);
- list = dying ? &pcpu->dying : &pcpu->unconfirmed;
-restart:
- hlist_nulls_for_each_entry(h, n, list, hnnode) {
- ct = nf_ct_tuplehash_to_ctrack(h);
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ if (last && last != ct)
+ continue;
- res = ctnetlink_dump_one_entry(skb, cb, ct, dying);
- if (res < 0) {
- ctx->cpu = cpu;
- spin_unlock_bh(&pcpu->lock);
- goto out;
- }
- }
- if (ctx->last) {
- ctx->last = NULL;
- goto restart;
+ res = ctnetlink_dump_one_entry(skb, cb, ct, true);
+ if (res < 0) {
+ spin_unlock_bh(&ecache_net->dying_lock);
+ nf_ct_put(last);
+ return skb->len;
}
- spin_unlock_bh(&pcpu->lock);
+
+ nf_ct_put(last);
+ last = NULL;
}
+
+ spin_unlock_bh(&ecache_net->dying_lock);
+#endif
ctx->done = true;
-out:
- if (last)
- nf_ct_put(last);
+ nf_ct_put(last);
return skb->len;
}
-static int
-ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
-{
- return ctnetlink_dump_list(skb, cb, true);
-}
-
static int ctnetlink_get_ct_dying(struct sk_buff *skb,
const struct nfnl_info *info,
const struct nlattr * const cda[])
@@ -1820,12 +1824,6 @@ static int ctnetlink_get_ct_dying(struct sk_buff *skb,
return -EOPNOTSUPP;
}
-static int
-ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
-{
- return ctnetlink_dump_list(skb, cb, false);
-}
-
static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb,
const struct nfnl_info *info,
const struct nlattr * const cda[])
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index d1f2d3c8d2b1..895b09cbd7cf 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -538,9 +538,13 @@ retry:
out_unlock:
mutex_unlock(&nf_ct_proto_mutex);
- if (fixup_needed)
- nf_ct_iterate_cleanup_net(net, nf_ct_tcp_fixup,
- (void *)(unsigned long)nfproto, 0, 0);
+ if (fixup_needed) {
+ struct nf_ct_iter_data iter_data = {
+ .net = net,
+ .data = (void *)(unsigned long)nfproto,
+ };
+ nf_ct_iterate_cleanup_net(nf_ct_tcp_fixup, &iter_data);
+ }
return err;
}
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 55aa55b252b2..6ad7bbc90d38 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -693,7 +693,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_ONE,
+ .extra2 = SYSCTL_TWO,
},
#endif
#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c
index cec166ecba77..0f828d05ea60 100644
--- a/net/netfilter/nf_conntrack_timeout.c
+++ b/net/netfilter/nf_conntrack_timeout.c
@@ -38,7 +38,12 @@ static int untimeout(struct nf_conn *ct, void *timeout)
void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout)
{
- nf_ct_iterate_cleanup_net(net, untimeout, timeout, 0, 0);
+ struct nf_ct_iter_data iter_data = {
+ .net = net,
+ .data = timeout,
+ };
+
+ nf_ct_iterate_cleanup_net(untimeout, &iter_data);
}
EXPORT_SYMBOL_GPL(nf_ct_untimeout);
diff --git a/net/netfilter/nf_nat_masquerade.c b/net/netfilter/nf_nat_masquerade.c
index e32fac374608..1a506b0c6511 100644
--- a/net/netfilter/nf_nat_masquerade.c
+++ b/net/netfilter/nf_nat_masquerade.c
@@ -77,11 +77,14 @@ EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4);
static void iterate_cleanup_work(struct work_struct *work)
{
+ struct nf_ct_iter_data iter_data = {};
struct masq_dev_work *w;
w = container_of(work, struct masq_dev_work, work);
- nf_ct_iterate_cleanup_net(w->net, w->iter, (void *)w, 0, 0);
+ iter_data.net = w->net;
+ iter_data.data = (void *)w;
+ nf_ct_iterate_cleanup_net(w->iter, &iter_data);
put_net_track(w->net, &w->ns_tracker);
kfree(w);
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 7e2c8dd01408..ad3bbe34ca88 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -45,6 +45,7 @@ MODULE_DESCRIPTION("Netfilter messages via netlink socket");
static unsigned int nfnetlink_pernet_id __read_mostly;
struct nfnl_net {
+ unsigned int ctnetlink_listeners;
struct sock *nfnl;
};
@@ -654,7 +655,6 @@ static void nfnetlink_rcv(struct sk_buff *skb)
netlink_rcv_skb(skb, nfnetlink_rcv_msg);
}
-#ifdef CONFIG_MODULES
static int nfnetlink_bind(struct net *net, int group)
{
const struct nfnetlink_subsystem *ss;
@@ -670,9 +670,44 @@ static int nfnetlink_bind(struct net *net, int group)
rcu_read_unlock();
if (!ss)
request_module_nowait("nfnetlink-subsys-%d", type);
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ if (type == NFNL_SUBSYS_CTNETLINK) {
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
+
+ nfnl_lock(NFNL_SUBSYS_CTNETLINK);
+
+ if (WARN_ON_ONCE(nfnlnet->ctnetlink_listeners == UINT_MAX)) {
+ nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
+ return -EOVERFLOW;
+ }
+
+ nfnlnet->ctnetlink_listeners++;
+ if (nfnlnet->ctnetlink_listeners == 1)
+ WRITE_ONCE(net->ct.ctnetlink_has_listener, true);
+ nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
+ }
+#endif
return 0;
}
+
+static void nfnetlink_unbind(struct net *net, int group)
+{
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+ int type = nfnl_group2type[group];
+
+ if (type == NFNL_SUBSYS_CTNETLINK) {
+ struct nfnl_net *nfnlnet = nfnl_pernet(net);
+
+ nfnl_lock(NFNL_SUBSYS_CTNETLINK);
+ WARN_ON_ONCE(nfnlnet->ctnetlink_listeners == 0);
+ nfnlnet->ctnetlink_listeners--;
+ if (nfnlnet->ctnetlink_listeners == 0)
+ WRITE_ONCE(net->ct.ctnetlink_has_listener, false);
+ nfnl_unlock(NFNL_SUBSYS_CTNETLINK);
+ }
#endif
+}
static int __net_init nfnetlink_net_init(struct net *net)
{
@@ -680,9 +715,8 @@ static int __net_init nfnetlink_net_init(struct net *net)
struct netlink_kernel_cfg cfg = {
.groups = NFNLGRP_MAX,
.input = nfnetlink_rcv,
-#ifdef CONFIG_MODULES
.bind = nfnetlink_bind,
-#endif
+ .unbind = nfnetlink_unbind,
};
nfnlnet->nfnl = netlink_kernel_create(net, NETLINK_NETFILTER, &cfg);
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index eea486f32971..f069c24c6146 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -33,8 +33,19 @@
static unsigned int nfct_timeout_id __read_mostly;
+struct ctnl_timeout {
+ struct list_head head;
+ struct rcu_head rcu_head;
+ refcount_t refcnt;
+ char name[CTNL_TIMEOUT_NAME_MAX];
+ struct nf_ct_timeout timeout;
+
+ struct list_head free_head;
+};
+
struct nfct_timeout_pernet {
struct list_head nfct_timeout_list;
+ struct list_head nfct_timeout_freelist;
};
MODULE_LICENSE("GPL");
@@ -574,20 +585,36 @@ static int __net_init cttimeout_net_init(struct net *net)
struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
INIT_LIST_HEAD(&pernet->nfct_timeout_list);
+ INIT_LIST_HEAD(&pernet->nfct_timeout_freelist);
return 0;
}
+static void __net_exit cttimeout_net_pre_exit(struct net *net)
+{
+ struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
+ struct ctnl_timeout *cur, *tmp;
+
+ list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) {
+ list_del_rcu(&cur->head);
+ list_add(&cur->free_head, &pernet->nfct_timeout_freelist);
+ }
+
+ /* core calls synchronize_rcu() after this */
+}
+
static void __net_exit cttimeout_net_exit(struct net *net)
{
struct nfct_timeout_pernet *pernet = nfct_timeout_pernet(net);
struct ctnl_timeout *cur, *tmp;
- nf_ct_unconfirmed_destroy(net);
+ if (list_empty(&pernet->nfct_timeout_freelist))
+ return;
+
nf_ct_untimeout(net, NULL);
- list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_list, head) {
- list_del_rcu(&cur->head);
+ list_for_each_entry_safe(cur, tmp, &pernet->nfct_timeout_freelist, head) {
+ list_del(&cur->free_head);
if (refcount_dec_and_test(&cur->refcnt))
kfree_rcu(cur, rcu_head);
@@ -596,6 +623,7 @@ static void __net_exit cttimeout_net_exit(struct net *net)
static struct pernet_operations cttimeout_ops = {
.init = cttimeout_net_init,
+ .pre_exit = cttimeout_net_pre_exit,
.exit = cttimeout_net_exit,
.id = &nfct_timeout_id,
.size = sizeof(struct nfct_timeout_pernet),
@@ -628,13 +656,24 @@ err_out:
return ret;
}
+static int untimeout(struct nf_conn *ct, void *timeout)
+{
+ struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct);
+
+ if (timeout_ext)
+ RCU_INIT_POINTER(timeout_ext->timeout, NULL);
+
+ return 0;
+}
+
static void __exit cttimeout_exit(void)
{
nfnetlink_subsys_unregister(&cttimeout_subsys);
unregister_pernet_subsys(&cttimeout_ops);
RCU_INIT_POINTER(nf_ct_timeout_hook, NULL);
- synchronize_rcu();
+
+ nf_ct_iterate_destroy(untimeout, NULL);
}
module_init(cttimeout_init);
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 900d48c810a1..50991ab1e2d2 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -227,11 +227,19 @@ static int nft_flow_route(const struct nft_pktinfo *pkt,
switch (nft_pf(pkt)) {
case NFPROTO_IPV4:
fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip;
+ fl.u.ip4.saddr = ct->tuplehash[dir].tuple.dst.u3.ip;
fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
+ fl.u.ip4.flowi4_iif = this_dst->dev->ifindex;
+ fl.u.ip4.flowi4_tos = RT_TOS(ip_hdr(pkt->skb)->tos);
+ fl.u.ip4.flowi4_mark = pkt->skb->mark;
break;
case NFPROTO_IPV6:
fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6;
+ fl.u.ip6.saddr = ct->tuplehash[dir].tuple.dst.u3.in6;
fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex;
+ fl.u.ip6.flowi6_iif = this_dst->dev->ifindex;
+ fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb));
+ fl.u.ip6.flowi6_mark = pkt->skb->mark;
break;
}