aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJakub Kicinski2024-05-13 13:12:34 -0700
committerJakub Kicinski2024-05-13 13:12:35 -0700
commitc85e41bfe7af41c71c438c6011b298398c185fa8 (patch)
tree6e7bee1a64f50af926efaa05439010c20fc00547
parentcddd2dc6390b90e62cec2768424d1d90f6d04161 (diff)
parentfa23e0d4b756d25829e124d6b670a4c6bbd4bf7e (diff)
Merge tag 'nf-next-24-05-12' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next
Pablo Neira Ayuso says: ==================== Netfilter updates for net-next The following patchset contains Netfilter updates for net-next: Patch #1 skips transaction if object type provides no .update interface. Patch #2 skips NETDEV_CHANGENAME which is unused. Patch #3 enables conntrack to handle Multicast Router Advertisements and Multicast Router Solicitations from the Multicast Router Discovery protocol (RFC4286) as untracked opposed to invalid packets. From Linus Luessing. Patch #4 updates DCCP conntracker to mark invalid as invalid, instead of dropping them, from Jason Xing. Patch #5 uses NF_DROP instead of -NF_DROP since NF_DROP is 0, also from Jason. Patch #6 removes reference in netfilter's sysctl documentation on pickup entries which were already removed by Florian Westphal. Patch #7 removes check for IPS_OFFLOAD flag to disable early drop which allows to evict entries from the conntrack table, also from Florian. Patches #8 to #16 updates nf_tables pipapo set backend to allocate the datastructure copy on-demand from preparation phase, to better deal with OOM situations where .commit step is too late to fail. Series from Florian Westphal. Patch #17 adds a selftest with packetdrill to cover conntrack TCP state transitions, also from Florian. Patch #18 use GFP_KERNEL to clone elements from control plane to avoid quick atomic reserves exhaustion with large sets, reporter refers to million entries magnitude. * tag 'nf-next-24-05-12' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next: netfilter: nf_tables: allow clone callbacks to sleep selftests: netfilter: add packetdrill based conntrack tests netfilter: nft_set_pipapo: remove dirty flag netfilter: nft_set_pipapo: move cloning of match info to insert/removal path netfilter: nft_set_pipapo: prepare pipapo_get helper for on-demand clone netfilter: nft_set_pipapo: merge deactivate helper into caller netfilter: nft_set_pipapo: prepare walk function for on-demand clone netfilter: nft_set_pipapo: prepare destroy function for on-demand clone netfilter: nft_set_pipapo: make pipapo_clone helper return NULL netfilter: nft_set_pipapo: move prove_locking helper around netfilter: conntrack: remove flowtable early-drop test netfilter: conntrack: documentation: remove reference to non-existent sysctl netfilter: use NF_DROP instead of -NF_DROP netfilter: conntrack: dccp: try not to drop skb in conntrack netfilter: conntrack: fix ct-state for ICMPv6 Multicast Router Discovery netfilter: nf_tables: remove NETDEV_CHANGENAME from netdev chain event handler netfilter: nf_tables: skip transaction if update object is not implemented ==================== Link: https://lore.kernel.org/r/20240512161436.168973-1-pablo@netfilter.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
-rw-r--r--Documentation/networking/nf_conntrack-sysctl.rst4
-rw-r--r--include/net/netfilter/nf_tables.h4
-rw-r--r--include/uapi/linux/icmpv6.h1
-rw-r--r--net/ipv4/netfilter/iptable_filter.c2
-rw-r--r--net/ipv6/netfilter/ip6table_filter.c2
-rw-r--r--net/netfilter/nf_conntrack_core.c4
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c4
-rw-r--r--net/netfilter/nf_conntrack_proto_icmpv6.c4
-rw-r--r--net/netfilter/nf_tables_api.c16
-rw-r--r--net/netfilter/nft_chain_filter.c6
-rw-r--r--net/netfilter/nft_connlimit.c4
-rw-r--r--net/netfilter/nft_counter.c4
-rw-r--r--net/netfilter/nft_dynset.c2
-rw-r--r--net/netfilter/nft_last.c4
-rw-r--r--net/netfilter/nft_limit.c14
-rw-r--r--net/netfilter/nft_quota.c4
-rw-r--r--net/netfilter/nft_set_pipapo.c258
-rw-r--r--net/netfilter/nft_set_pipapo.h2
-rw-r--r--tools/testing/selftests/net/netfilter/Makefile2
-rw-r--r--tools/testing/selftests/net/netfilter/config1
-rwxr-xr-xtools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh71
-rwxr-xr-xtools/testing/selftests/net/netfilter/packetdrill/common.sh33
-rw-r--r--tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt118
-rw-r--r--tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt62
-rw-r--r--tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt59
-rw-r--r--tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt44
-rw-r--r--tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt51
-rw-r--r--tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt34
28 files changed, 639 insertions, 175 deletions
diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
index c383a394c665..238b66d0e059 100644
--- a/Documentation/networking/nf_conntrack-sysctl.rst
+++ b/Documentation/networking/nf_conntrack-sysctl.rst
@@ -222,11 +222,11 @@ nf_flowtable_tcp_timeout - INTEGER (seconds)
Control offload timeout for tcp connections.
TCP connections may be offloaded from nf conntrack to nf flow table.
- Once aged, the connection is returned to nf conntrack with tcp pickup timeout.
+ Once aged, the connection is returned to nf conntrack.
nf_flowtable_udp_timeout - INTEGER (seconds)
default 30
Control offload timeout for udp connections.
UDP connections may be offloaded from nf conntrack to nf flow table.
- Once aged, the connection is returned to nf conntrack with udp pickup timeout.
+ Once aged, the connection is returned to nf conntrack.
diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 3f1ed467f951..2796153b03da 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -416,7 +416,7 @@ struct nft_expr_info;
int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla,
struct nft_expr_info *info);
-int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src);
+int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp);
void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr);
int nft_expr_dump(struct sk_buff *skb, unsigned int attr,
const struct nft_expr *expr, bool reset);
@@ -935,7 +935,7 @@ struct nft_expr_ops {
struct nft_regs *regs,
const struct nft_pktinfo *pkt);
int (*clone)(struct nft_expr *dst,
- const struct nft_expr *src);
+ const struct nft_expr *src, gfp_t gfp);
unsigned int size;
int (*init)(const struct nft_ctx *ctx,
diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h
index ecaece3af38d..4eaab89e2856 100644
--- a/include/uapi/linux/icmpv6.h
+++ b/include/uapi/linux/icmpv6.h
@@ -112,6 +112,7 @@ struct icmp6hdr {
#define ICMPV6_MOBILE_PREFIX_ADV 147
#define ICMPV6_MRDISC_ADV 151
+#define ICMPV6_MRDISC_SOL 152
#define ICMPV6_MSG_MAX 255
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index b9062f4552ac..3ab908b74795 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -44,7 +44,7 @@ static int iptable_filter_table_init(struct net *net)
return -ENOMEM;
/* Entry 1 is the FORWARD hook */
((struct ipt_standard *)repl->entries)[1].target.verdict =
- forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
+ forward ? -NF_ACCEPT - 1 : NF_DROP - 1;
err = ipt_register_table(net, &packet_filter, repl, filter_ops);
kfree(repl);
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index df785ebda0ca..e8992693e14a 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -43,7 +43,7 @@ static int ip6table_filter_table_init(struct net *net)
return -ENOMEM;
/* Entry 1 is the FORWARD hook */
((struct ip6t_standard *)repl->entries)[1].target.verdict =
- forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
+ forward ? -NF_ACCEPT - 1 : NF_DROP - 1;
err = ip6t_register_table(net, &packet_filter, repl, filter_ops);
kfree(repl);
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index c63868666bd9..7ac20750c127 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1440,8 +1440,6 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
const struct nf_conntrack_l4proto *l4proto;
u8 protonum = nf_ct_protonum(ct);
- if (test_bit(IPS_OFFLOAD_BIT, &ct->status) && protonum != IPPROTO_UDP)
- return false;
if (!test_bit(IPS_ASSURED_BIT, &ct->status))
return true;
@@ -2024,7 +2022,7 @@ repeat:
goto repeat;
NF_CT_STAT_INC_ATOMIC(state->net, invalid);
- if (ret == -NF_DROP)
+ if (ret == NF_DROP)
NF_CT_STAT_INC_ATOMIC(state->net, drop);
ret = -ret;
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index e2db1f4ec2df..ebc4f733bb2e 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -525,7 +525,7 @@ int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb,
dh = skb_header_pointer(skb, dataoff, sizeof(*dh), &_dh.dh);
if (!dh)
- return NF_DROP;
+ return -NF_ACCEPT;
if (dccp_error(dh, skb, dataoff, state))
return -NF_ACCEPT;
@@ -533,7 +533,7 @@ int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb,
/* pull again, including possible 48 bit sequences and subtype header */
dh = dccp_header_pointer(skb, dataoff, dh, &_dh);
if (!dh)
- return NF_DROP;
+ return -NF_ACCEPT;
type = dh->dccph_type;
if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh, state))
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index 1020d67600a9..327b8059025d 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -62,7 +62,9 @@ static const u_int8_t noct_valid_new[] = {
[NDISC_ROUTER_ADVERTISEMENT - 130] = 1,
[NDISC_NEIGHBOUR_SOLICITATION - 130] = 1,
[NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1,
- [ICMPV6_MLD2_REPORT - 130] = 1
+ [ICMPV6_MLD2_REPORT - 130] = 1,
+ [ICMPV6_MRDISC_ADV - 130] = 1,
+ [ICMPV6_MRDISC_SOL - 130] = 1
};
bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple,
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 167074283ea9..be3b4c90d2ed 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3333,7 +3333,7 @@ err_expr_parse:
return ERR_PTR(err);
}
-int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src)
+int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp)
{
int err;
@@ -3341,7 +3341,7 @@ int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src)
return -EINVAL;
dst->ops = src->ops;
- err = src->ops->clone(dst, src);
+ err = src->ops->clone(dst, src, gfp);
if (err < 0)
return err;
@@ -6525,7 +6525,7 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set,
if (!expr)
goto err_expr;
- err = nft_expr_clone(expr, set->exprs[i]);
+ err = nft_expr_clone(expr, set->exprs[i], GFP_KERNEL_ACCOUNT);
if (err < 0) {
kfree(expr);
goto err_expr;
@@ -6564,7 +6564,7 @@ static int nft_set_elem_expr_setup(struct nft_ctx *ctx,
for (i = 0; i < num_exprs; i++) {
expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
- err = nft_expr_clone(expr, expr_array[i]);
+ err = nft_expr_clone(expr, expr_array[i], GFP_KERNEL_ACCOUNT);
if (err < 0)
goto err_elem_expr_setup;
@@ -7776,6 +7776,9 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info,
if (WARN_ON_ONCE(!type))
return -ENOENT;
+ if (!obj->ops->update)
+ return 0;
+
nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj);
@@ -9467,9 +9470,10 @@ static void nft_obj_commit_update(struct nft_trans *trans)
obj = nft_trans_obj(trans);
newobj = nft_trans_obj_newobj(trans);
- if (obj->ops->update)
- obj->ops->update(obj, newobj);
+ if (WARN_ON_ONCE(!obj->ops->update))
+ return;
+ obj->ops->update(obj, newobj);
nft_obj_destroy(&trans->ctx, newobj);
}
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index d170758a1eb5..7010541fcca6 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -325,9 +325,6 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev,
struct nft_hook *hook, *found = NULL;
int n = 0;
- if (event != NETDEV_UNREGISTER)
- return;
-
list_for_each_entry(hook, &basechain->hook_list, list) {
if (hook->ops.dev == dev)
found = hook;
@@ -367,8 +364,7 @@ static int nf_tables_netdev_event(struct notifier_block *this,
.net = dev_net(dev),
};
- if (event != NETDEV_UNREGISTER &&
- event != NETDEV_CHANGENAME)
+ if (event != NETDEV_UNREGISTER)
return NOTIFY_DONE;
nft_net = nft_pernet(ctx.net);
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index de9d1980df69..92b984fa8175 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -210,12 +210,12 @@ static void nft_connlimit_destroy(const struct nft_ctx *ctx,
nft_connlimit_do_destroy(ctx, priv);
}
-static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
{
struct nft_connlimit *priv_dst = nft_expr_priv(dst);
struct nft_connlimit *priv_src = nft_expr_priv(src);
- priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC);
+ priv_dst->list = kmalloc(sizeof(*priv_dst->list), gfp);
if (!priv_dst->list)
return -ENOMEM;
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index dccc68a5135a..291ed2026367 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -226,7 +226,7 @@ static void nft_counter_destroy(const struct nft_ctx *ctx,
nft_counter_do_destroy(priv);
}
-static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
{
struct nft_counter_percpu_priv *priv = nft_expr_priv(src);
struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst);
@@ -236,7 +236,7 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
nft_counter_fetch(priv, &total);
- cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC);
+ cpu_stats = alloc_percpu_gfp(struct nft_counter, gfp);
if (cpu_stats == NULL)
return -ENOMEM;
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index c09dba57354c..b4ada3ab2167 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -35,7 +35,7 @@ static int nft_dynset_expr_setup(const struct nft_dynset *priv,
for (i = 0; i < priv->num_exprs; i++) {
expr = nft_setelem_expr_at(elem_expr, elem_expr->size);
- if (nft_expr_clone(expr, priv->expr_array[i]) < 0)
+ if (nft_expr_clone(expr, priv->expr_array[i], GFP_ATOMIC) < 0)
return -1;
elem_expr->size += priv->expr_array[i]->ops->size;
diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c
index 8e6d7eaf9dc8..de1b6066bfa8 100644
--- a/net/netfilter/nft_last.c
+++ b/net/netfilter/nft_last.c
@@ -102,12 +102,12 @@ static void nft_last_destroy(const struct nft_ctx *ctx,
kfree(priv->last);
}
-static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
{
struct nft_last_priv *priv_dst = nft_expr_priv(dst);
struct nft_last_priv *priv_src = nft_expr_priv(src);
- priv_dst->last = kzalloc(sizeof(*priv_dst->last), GFP_ATOMIC);
+ priv_dst->last = kzalloc(sizeof(*priv_dst->last), gfp);
if (!priv_dst->last)
return -ENOMEM;
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index cefa25e0dbb0..21d26b79b460 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -150,7 +150,7 @@ static void nft_limit_destroy(const struct nft_ctx *ctx,
}
static int nft_limit_clone(struct nft_limit_priv *priv_dst,
- const struct nft_limit_priv *priv_src)
+ const struct nft_limit_priv *priv_src, gfp_t gfp)
{
priv_dst->tokens_max = priv_src->tokens_max;
priv_dst->rate = priv_src->rate;
@@ -158,7 +158,7 @@ static int nft_limit_clone(struct nft_limit_priv *priv_dst,
priv_dst->burst = priv_src->burst;
priv_dst->invert = priv_src->invert;
- priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), GFP_ATOMIC);
+ priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), gfp);
if (!priv_dst->limit)
return -ENOMEM;
@@ -223,14 +223,15 @@ static void nft_limit_pkts_destroy(const struct nft_ctx *ctx,
nft_limit_destroy(ctx, &priv->limit);
}
-static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src,
+ gfp_t gfp)
{
struct nft_limit_priv_pkts *priv_dst = nft_expr_priv(dst);
struct nft_limit_priv_pkts *priv_src = nft_expr_priv(src);
priv_dst->cost = priv_src->cost;
- return nft_limit_clone(&priv_dst->limit, &priv_src->limit);
+ return nft_limit_clone(&priv_dst->limit, &priv_src->limit, gfp);
}
static struct nft_expr_type nft_limit_type;
@@ -281,12 +282,13 @@ static void nft_limit_bytes_destroy(const struct nft_ctx *ctx,
nft_limit_destroy(ctx, priv);
}
-static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src,
+ gfp_t gfp)
{
struct nft_limit_priv *priv_dst = nft_expr_priv(dst);
struct nft_limit_priv *priv_src = nft_expr_priv(src);
- return nft_limit_clone(priv_dst, priv_src);
+ return nft_limit_clone(priv_dst, priv_src, gfp);
}
static const struct nft_expr_ops nft_limit_bytes_ops = {
diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c
index 3ba12a7471b0..9b2d7463d3d3 100644
--- a/net/netfilter/nft_quota.c
+++ b/net/netfilter/nft_quota.c
@@ -233,7 +233,7 @@ static void nft_quota_destroy(const struct nft_ctx *ctx,
return nft_quota_do_destroy(ctx, priv);
}
-static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src)
+static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp)
{
struct nft_quota *priv_dst = nft_expr_priv(dst);
struct nft_quota *priv_src = nft_expr_priv(src);
@@ -241,7 +241,7 @@ static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src)
priv_dst->quota = priv_src->quota;
priv_dst->flags = priv_src->flags;
- priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), GFP_ATOMIC);
+ priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), gfp);
if (!priv_dst->consumed)
return -ENOMEM;
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 187138afac45..15a236bebb46 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -504,6 +504,7 @@ out:
* pipapo_get() - Get matching element reference given key data
* @net: Network namespace
* @set: nftables API set representation
+ * @m: storage containing active/existing elements
* @data: Key data to be matched against existing elements
* @genmask: If set, check that element is active in given genmask
* @tstamp: timestamp to check for expired elements
@@ -517,17 +518,15 @@ out:
*/
static struct nft_pipapo_elem *pipapo_get(const struct net *net,
const struct nft_set *set,
+ const struct nft_pipapo_match *m,
const u8 *data, u8 genmask,
u64 tstamp, gfp_t gfp)
{
struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT);
- struct nft_pipapo *priv = nft_set_priv(set);
unsigned long *res_map, *fill_map = NULL;
- const struct nft_pipapo_match *m;
const struct nft_pipapo_field *f;
int i;
- m = priv->clone;
if (m->bsize_max == 0)
return ret;
@@ -612,9 +611,11 @@ static struct nft_elem_priv *
nft_pipapo_get(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem, unsigned int flags)
{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *m = rcu_dereference(priv->match);
struct nft_pipapo_elem *e;
- e = pipapo_get(net, set, (const u8 *)elem->key.val.data,
+ e = pipapo_get(net, set, m, (const u8 *)elem->key.val.data,
nft_genmask_cur(net), get_jiffies_64(),
GFP_ATOMIC);
if (IS_ERR(e))
@@ -1247,6 +1248,40 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
return 0;
}
+static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ const struct net *net = read_pnet(&set->net);
+
+ return lockdep_is_held(&nft_pernet(net)->commit_mutex);
+#else
+ return true;
+#endif
+}
+
+static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old);
+
+/**
+ * pipapo_maybe_clone() - Build clone for pending data changes, if not existing
+ * @set: nftables API set representation
+ *
+ * Return: newly created or existing clone, if any. NULL on allocation failure
+ */
+static struct nft_pipapo_match *pipapo_maybe_clone(const struct nft_set *set)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ struct nft_pipapo_match *m;
+
+ if (priv->clone)
+ return priv->clone;
+
+ m = rcu_dereference_protected(priv->match,
+ nft_pipapo_transaction_mutex_held(set));
+ priv->clone = pipapo_clone(m);
+
+ return priv->clone;
+}
+
/**
* nft_pipapo_insert() - Validate and insert ranged elements
* @net: Network namespace
@@ -1263,8 +1298,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
const u8 *start = (const u8 *)elem->key.val.data, *end;
- struct nft_pipapo *priv = nft_set_priv(set);
- struct nft_pipapo_match *m = priv->clone;
+ struct nft_pipapo_match *m = pipapo_maybe_clone(set);
u8 genmask = nft_genmask_next(net);
struct nft_pipapo_elem *e, *dup;
u64 tstamp = nft_net_tstamp(net);
@@ -1272,12 +1306,15 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
const u8 *start_p, *end_p;
int i, bsize_max, err = 0;
+ if (!m)
+ return -ENOMEM;
+
if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
end = (const u8 *)nft_set_ext_key_end(ext)->data;
else
end = start;
- dup = pipapo_get(net, set, start, genmask, tstamp, GFP_KERNEL);
+ dup = pipapo_get(net, set, m, start, genmask, tstamp, GFP_KERNEL);
if (!IS_ERR(dup)) {
/* Check if we already have the same exact entry */
const struct nft_data *dup_key, *dup_end;
@@ -1299,7 +1336,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
if (PTR_ERR(dup) == -ENOENT) {
/* Look for partially overlapping entries */
- dup = pipapo_get(net, set, end, nft_genmask_next(net), tstamp,
+ dup = pipapo_get(net, set, m, end, nft_genmask_next(net), tstamp,
GFP_KERNEL);
}
@@ -1332,8 +1369,6 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
}
/* Insert */
- priv->dirty = true;
-
bsize_max = m->bsize_max;
nft_pipapo_for_each_field(f, i, m) {
@@ -1384,7 +1419,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
* pipapo_clone() - Clone matching data to create new working copy
* @old: Existing matching data
*
- * Return: copy of matching data passed as 'old', error pointer on failure
+ * Return: copy of matching data passed as 'old' or NULL.
*/
static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
{
@@ -1394,7 +1429,7 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old)
new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL);
if (!new)
- return ERR_PTR(-ENOMEM);
+ return NULL;
new->field_count = old->field_count;
new->bsize_max = old->bsize_max;
@@ -1466,7 +1501,7 @@ out_scratch:
free_percpu(new->scratch);
kfree(new);
- return ERR_PTR(-ENOMEM);
+ return NULL;
}
/**
@@ -1698,8 +1733,6 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
* NFT_SET_ELEM_DEAD_BIT.
*/
if (__nft_set_elem_expired(&e->ext, tstamp)) {
- priv->dirty = true;
-
gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
if (!gc)
return;
@@ -1777,57 +1810,30 @@ static void pipapo_reclaim_match(struct rcu_head *rcu)
static void nft_pipapo_commit(struct nft_set *set)
{
struct nft_pipapo *priv = nft_set_priv(set);
- struct nft_pipapo_match *new_clone, *old;
-
- if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
- pipapo_gc(set, priv->clone);
+ struct nft_pipapo_match *old;
- if (!priv->dirty)
+ if (!priv->clone)
return;
- new_clone = pipapo_clone(priv->clone);
- if (IS_ERR(new_clone))
- return;
+ if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
+ pipapo_gc(set, priv->clone);
- priv->dirty = false;
+ old = rcu_replace_pointer(priv->match, priv->clone,
+ nft_pipapo_transaction_mutex_held(set));
+ priv->clone = NULL;
- old = rcu_access_pointer(priv->match);
- rcu_assign_pointer(priv->match, priv->clone);
if (old)
call_rcu(&old->rcu, pipapo_reclaim_match);
-
- priv->clone = new_clone;
-}
-
-static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set)
-{
-#ifdef CONFIG_PROVE_LOCKING
- const struct net *net = read_pnet(&set->net);
-
- return lockdep_is_held(&nft_pernet(net)->commit_mutex);
-#else
- return true;
-#endif
}
static void nft_pipapo_abort(const struct nft_set *set)
{
struct nft_pipapo *priv = nft_set_priv(set);
- struct nft_pipapo_match *new_clone, *m;
-
- if (!priv->dirty)
- return;
-
- m = rcu_dereference_protected(priv->match, nft_pipapo_transaction_mutex_held(set));
- new_clone = pipapo_clone(m);
- if (IS_ERR(new_clone))
+ if (!priv->clone)
return;
-
- priv->dirty = false;
-
pipapo_free_match(priv->clone);
- priv->clone = new_clone;
+ priv->clone = NULL;
}
/**
@@ -1851,52 +1857,38 @@ static void nft_pipapo_activate(const struct net *net,
}
/**
- * pipapo_deactivate() - Check that element is in set, mark as inactive
+ * nft_pipapo_deactivate() - Search for element and make it inactive
* @net: Network namespace
* @set: nftables API set representation
- * @data: Input key data
- * @ext: nftables API extension pointer, used to check for end element
- *
- * This is a convenience function that can be called from both
- * nft_pipapo_deactivate() and nft_pipapo_flush(), as they are in fact the same
- * operation.
+ * @elem: nftables API element representation containing key data
*
* Return: deactivated element if found, NULL otherwise.
*/
-static void *pipapo_deactivate(const struct net *net, const struct nft_set *set,
- const u8 *data, const struct nft_set_ext *ext)
+static struct nft_elem_priv *
+nft_pipapo_deactivate(const struct net *net, const struct nft_set *set,
+ const struct nft_set_elem *elem)
{
+ struct nft_pipapo_match *m = pipapo_maybe_clone(set);
struct nft_pipapo_elem *e;
- e = pipapo_get(net, set, data, nft_genmask_next(net),
- nft_net_tstamp(net), GFP_KERNEL);
+ /* removal must occur on priv->clone, if we are low on memory
+ * we have no choice and must fail the removal request.
+ */
+ if (!m)
+ return NULL;
+
+ e = pipapo_get(net, set, m, (const u8 *)elem->key.val.data,
+ nft_genmask_next(net), nft_net_tstamp(net), GFP_KERNEL);
if (IS_ERR(e))
return NULL;
nft_set_elem_change_active(net, set, &e->ext);
- return e;
-}
-
-/**
- * nft_pipapo_deactivate() - Call pipapo_deactivate() to make element inactive
- * @net: Network namespace
- * @set: nftables API set representation
- * @elem: nftables API element representation containing key data
- *
- * Return: deactivated element if found, NULL otherwise.
- */
-static struct nft_elem_priv *
-nft_pipapo_deactivate(const struct net *net, const struct nft_set *set,
- const struct nft_set_elem *elem)
-{
- const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
-
- return pipapo_deactivate(net, set, (const u8 *)elem->key.val.data, ext);
+ return &e->priv;
}
/**
- * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive
+ * nft_pipapo_flush() - make element inactive
* @net: Network namespace
* @set: nftables API set representation
* @elem_priv: nftables API element representation containing key data
@@ -2093,7 +2085,6 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
if (last && f->mt[rulemap[i].to].e == e) {
- priv->dirty = true;
pipapo_drop(m, rulemap);
return;
}
@@ -2106,35 +2097,23 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
}
/**
- * nft_pipapo_walk() - Walk over elements
+ * nft_pipapo_do_walk() - Walk over elements in m
* @ctx: nftables API context
* @set: nftables API set representation
+ * @m: matching data pointing to key mapping array
* @iter: Iterator
*
* As elements are referenced in the mapping array for the last field, directly
* scan that array: there's no need to follow rule mappings from the first
- * field.
+ * field. @m is protected either by RCU read lock or by transaction mutex.
*/
-static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
- struct nft_set_iter *iter)
+static void nft_pipapo_do_walk(const struct nft_ctx *ctx, struct nft_set *set,
+ const struct nft_pipapo_match *m,
+ struct nft_set_iter *iter)
{
- struct nft_pipapo *priv = nft_set_priv(set);
- const struct nft_pipapo_match *m;
const struct nft_pipapo_field *f;
unsigned int i, r;
- WARN_ON_ONCE(iter->type != NFT_ITER_READ &&
- iter->type != NFT_ITER_UPDATE);
-
- rcu_read_lock();
- if (iter->type == NFT_ITER_READ)
- m = rcu_dereference(priv->match);
- else
- m = priv->clone;
-
- if (unlikely(!m))
- goto out;
-
for (i = 0, f = m->f; i < m->field_count - 1; i++, f++)
;
@@ -2151,14 +2130,49 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
iter->err = iter->fn(ctx, set, iter, &e->priv);
if (iter->err < 0)
- goto out;
+ return;
cont:
iter->count++;
}
+}
-out:
- rcu_read_unlock();
+/**
+ * nft_pipapo_walk() - Walk over elements
+ * @ctx: nftables API context
+ * @set: nftables API set representation
+ * @iter: Iterator
+ *
+ * Test if destructive action is needed or not, clone active backend if needed
+ * and call the real function to work on the data.
+ */
+static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ const struct nft_pipapo_match *m;
+
+ switch (iter->type) {
+ case NFT_ITER_UPDATE:
+ m = pipapo_maybe_clone(set);
+ if (!m) {
+ iter->err = -ENOMEM;
+ return;
+ }
+
+ nft_pipapo_do_walk(ctx, set, m, iter);
+ break;
+ case NFT_ITER_READ:
+ rcu_read_lock();
+ m = rcu_dereference(priv->match);
+ nft_pipapo_do_walk(ctx, set, m, iter);
+ rcu_read_unlock();
+ break;
+ default:
+ iter->err = -EINVAL;
+ WARN_ON_ONCE(1);
+ break;
+ }
}
/**
@@ -2267,21 +2281,10 @@ static int nft_pipapo_init(const struct nft_set *set,
f->mt = NULL;
}
- /* Create an initial clone of matching data for next insertion */
- priv->clone = pipapo_clone(m);
- if (IS_ERR(priv->clone)) {
- err = PTR_ERR(priv->clone);
- goto out_free;
- }
-
- priv->dirty = false;
-
rcu_assign_pointer(priv->match, m);
return 0;
-out_free:
- free_percpu(m->scratch);
out_scratch:
kfree(m);
@@ -2326,33 +2329,18 @@ static void nft_pipapo_destroy(const struct nft_ctx *ctx,
{
struct nft_pipapo *priv = nft_set_priv(set);
struct nft_pipapo_match *m;
- int cpu;
m = rcu_dereference_protected(priv->match, true);
- if (m) {
- rcu_barrier();
-
- for_each_possible_cpu(cpu)
- pipapo_free_scratch(m, cpu);
- free_percpu(m->scratch);
- pipapo_free_fields(m);
- kfree(m);
- priv->match = NULL;
- }
if (priv->clone) {
- m = priv->clone;
-
- nft_set_pipapo_match_destroy(ctx, set, m);
-
- for_each_possible_cpu(cpu)
- pipapo_free_scratch(priv->clone, cpu);
- free_percpu(priv->clone->scratch);
-
- pipapo_free_fields(priv->clone);
- kfree(priv->clone);
+ nft_set_pipapo_match_destroy(ctx, set, priv->clone);
+ pipapo_free_match(priv->clone);
priv->clone = NULL;
+ } else {
+ nft_set_pipapo_match_destroy(ctx, set, m);
}
+
+ pipapo_free_match(m);
}
/**
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
index 24cd1ff73f98..0d2e40e10f7f 100644
--- a/net/netfilter/nft_set_pipapo.h
+++ b/net/netfilter/nft_set_pipapo.h
@@ -155,14 +155,12 @@ struct nft_pipapo_match {
* @match: Currently in-use matching data
* @clone: Copy where pending insertions and deletions are kept
* @width: Total bytes to be matched for one packet, including padding
- * @dirty: Working copy has pending insertions or deletions
* @last_gc: Timestamp of last garbage collection run, jiffies
*/
struct nft_pipapo {
struct nft_pipapo_match __rcu *match;
struct nft_pipapo_match *clone;
int width;
- bool dirty;
unsigned long last_gc;
};
diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile
index e9a6c702b8c9..47945b2b3f92 100644
--- a/tools/testing/selftests/net/netfilter/Makefile
+++ b/tools/testing/selftests/net/netfilter/Makefile
@@ -13,6 +13,7 @@ TEST_PROGS += conntrack_tcp_unreplied.sh
TEST_PROGS += conntrack_sctp_collision.sh
TEST_PROGS += conntrack_vrf.sh
TEST_PROGS += ipvs.sh
+TEST_PROGS += nf_conntrack_packetdrill.sh
TEST_PROGS += nf_nat_edemux.sh
TEST_PROGS += nft_audit.sh
TEST_PROGS += nft_concat_range.sh
@@ -45,6 +46,7 @@ $(OUTPUT)/conntrack_dump_flush: CFLAGS += $(MNL_CFLAGS)
$(OUTPUT)/conntrack_dump_flush: LDLIBS += $(MNL_LDLIBS)
TEST_FILES := lib.sh
+TEST_FILES += packetdrill
TEST_INCLUDES := \
../lib.sh
diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config
index 5b5b764f6cd0..63ef80ef47a4 100644
--- a/tools/testing/selftests/net/netfilter/config
+++ b/tools/testing/selftests/net/netfilter/config
@@ -86,3 +86,4 @@ CONFIG_VLAN_8021Q=m
CONFIG_XFRM_USER=m
CONFIG_XFRM_STATISTICS=y
CONFIG_NET_PKTGEN=m
+CONFIG_TUN=m
diff --git a/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh b/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh
new file mode 100755
index 000000000000..c6fdd2079f4d
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh
@@ -0,0 +1,71 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+source lib.sh
+
+checktool "conntrack --version" "run test without conntrack"
+checktool "iptables --version" "run test without iptables"
+checktool "ip6tables --version" "run test without ip6tables"
+
+modprobe -q tun
+modprobe -q nf_conntrack
+# echo 1 > /proc/sys/net/netfilter/nf_log_all_netns
+
+PDRILL_TIMEOUT=10
+
+files="
+conntrack_ack_loss_stall.pkt
+conntrack_inexact_rst.pkt
+conntrack_syn_challenge_ack.pkt
+conntrack_synack_old.pkt
+conntrack_synack_reuse.pkt
+conntrack_rst_invalid.pkt
+"
+
+if ! packetdrill --dry_run --verbose "packetdrill/conntrack_ack_loss_stall.pkt";then
+ echo "SKIP: packetdrill not installed"
+ exit ${ksft_skip}
+fi
+
+ret=0
+
+run_packetdrill()
+{
+ filename="$1"
+ ipver="$2"
+ local mtu=1500
+
+ export NFCT_IP_VERSION="$ipver"
+
+ if [ "$ipver" = "ipv4" ];then
+ export xtables="iptables"
+ elif [ "$ipver" = "ipv6" ];then
+ export xtables="ip6tables"
+ mtu=1520
+ fi
+
+ timeout "$PDRILL_TIMEOUT" unshare -n packetdrill --ip_version="$ipver" --mtu=$mtu \
+ --tolerance_usecs=1000000 --non_fatal packet "$filename"
+}
+
+run_one_test_file()
+{
+ filename="$1"
+
+ for v in ipv4 ipv6;do
+ printf "%-50s(%s)%-20s" "$filename" "$v" ""
+ if run_packetdrill packetdrill/"$f" "$v";then
+ echo OK
+ else
+ echo FAIL
+ ret=1
+ fi
+ done
+}
+
+echo "Replaying packetdrill test cases:"
+for f in $files;do
+ run_one_test_file packetdrill/"$f"
+done
+
+exit $ret
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/common.sh b/tools/testing/selftests/net/netfilter/packetdrill/common.sh
new file mode 100755
index 000000000000..ed36d535196d
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/common.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+
+# for debugging set net.netfilter.nf_log_all_netns=1 in init_net
+# or do not use net namespaces.
+modprobe -q nf_conntrack
+sysctl -q net.netfilter.nf_conntrack_log_invalid=6
+
+# Flush old cached data (fastopen cookies).
+ip tcp_metrics flush all > /dev/null 2>&1
+
+# TCP min, default, and max receive and send buffer sizes.
+sysctl -q net.ipv4.tcp_rmem="4096 540000 $((15*1024*1024))"
+sysctl -q net.ipv4.tcp_wmem="4096 $((256*1024)) 4194304"
+
+# TCP congestion control.
+sysctl -q net.ipv4.tcp_congestion_control=cubic
+
+# TCP slow start after idle.
+sysctl -q net.ipv4.tcp_slow_start_after_idle=0
+
+# TCP Explicit Congestion Notification (ECN)
+sysctl -q net.ipv4.tcp_ecn=0
+
+sysctl -q net.ipv4.tcp_notsent_lowat=4294967295 > /dev/null 2>&1
+
+# Override the default qdisc on the tun device.
+# Many tests fail with timing errors if the default
+# is FQ and that paces their flows.
+tc qdisc add dev tun0 root pfifo
+
+# Enable conntrack
+$xtables -A INPUT -m conntrack --ctstate NEW -p tcp --syn
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt
new file mode 100644
index 000000000000..d755bd64c54f
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt
@@ -0,0 +1,118 @@
+// check that already-acked (retransmitted) packet is let through rather
+// than tagged as INVALID.
+
+`packetdrill/common.sh`
+
+// should set -P DROP but it disconnects VM w.o. extra netns
++0 `$xtables -A INPUT -m conntrack --ctstate INVALID -j DROP`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 10) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000>
++0 > S. 0:0(0) ack 1 <mss 1460>
++.01 < . 1:1(0) ack 1 win 65535
++0 accept(3, ..., ...) = 4
+
++0.0001 < P. 1:1461(1460) ack 1 win 257
++.0 > . 1:1(0) ack 1461 win 65535
++0.0001 < P. 1461:2921(1460) ack 1 win 257
++.0 > . 1:1(0) ack 2921 win 65535
++0.0001 < P. 2921:4381(1460) ack 1 win 257
++.0 > . 1:1(0) ack 4381 win 65535
++0.0001 < P. 4381:5841(1460) ack 1 win 257
++.0 > . 1:1(0) ack 5841 win 65535
++0.0001 < P. 5841:7301(1460) ack 1 win 257
++.0 > . 1:1(0) ack 7301 win 65535
++0.0001 < P. 7301:8761(1460) ack 1 win 257
++.0 > . 1:1(0) ack 8761 win 65535
++0.0001 < P. 8761:10221(1460) ack 1 win 257
++.0 > . 1:1(0) ack 10221 win 65535
++0.0001 < P. 10221:11681(1460) ack 1 win 257
++.0 > . 1:1(0) ack 11681 win 65535
++0.0001 < P. 11681:13141(1460) ack 1 win 257
++.0 > . 1:1(0) ack 13141 win 65535
++0.0001 < P. 13141:14601(1460) ack 1 win 257
++.0 > . 1:1(0) ack 14601 win 65535
++0.0001 < P. 14601:16061(1460) ack 1 win 257
++.0 > . 1:1(0) ack 16061 win 65535
++0.0001 < P. 16061:17521(1460) ack 1 win 257
++.0 > . 1:1(0) ack 17521 win 65535
++0.0001 < P. 17521:18981(1460) ack 1 win 257
++.0 > . 1:1(0) ack 18981 win 65535
++0.0001 < P. 18981:20441(1460) ack 1 win 257
++.0 > . 1:1(0) ack 20441 win 65535
++0.0001 < P. 20441:21901(1460) ack 1 win 257
++.0 > . 1:1(0) ack 21901 win 65535
++0.0001 < P. 21901:23361(1460) ack 1 win 257
++.0 > . 1:1(0) ack 23361 win 65535
++0.0001 < P. 23361:24821(1460) ack 1 win 257
+0.055 > . 1:1(0) ack 24821 win 65535
++0.0001 < P. 24821:26281(1460) ack 1 win 257
++.0 > . 1:1(0) ack 26281 win 65535
++0.0001 < P. 26281:27741(1460) ack 1 win 257
++.0 > . 1:1(0) ack 27741 win 65535
++0.0001 < P. 27741:29201(1460) ack 1 win 257
++.0 > . 1:1(0) ack 29201 win 65535
++0.0001 < P. 29201:30661(1460) ack 1 win 257
++.0 > . 1:1(0) ack 30661 win 65535
++0.0001 < P. 30661:32121(1460) ack 1 win 257
++.0 > . 1:1(0) ack 32121 win 65535
++0.0001 < P. 32121:33581(1460) ack 1 win 257
++.0 > . 1:1(0) ack 33581 win 65535
++0.0001 < P. 33581:35041(1460) ack 1 win 257
++.0 > . 1:1(0) ack 35041 win 65535
++0.0001 < P. 35041:36501(1460) ack 1 win 257
++.0 > . 1:1(0) ack 36501 win 65535
++0.0001 < P. 36501:37961(1460) ack 1 win 257
++.0 > . 1:1(0) ack 37961 win 65535
++0.0001 < P. 37961:39421(1460) ack 1 win 257
++.0 > . 1:1(0) ack 39421 win 65535
++0.0001 < P. 39421:40881(1460) ack 1 win 257
++.0 > . 1:1(0) ack 40881 win 65535
++0.0001 < P. 40881:42341(1460) ack 1 win 257
++.0 > . 1:1(0) ack 42341 win 65535
++0.0001 < P. 42341:43801(1460) ack 1 win 257
++.0 > . 1:1(0) ack 43801 win 65535
++0.0001 < P. 43801:45261(1460) ack 1 win 257
++.0 > . 1:1(0) ack 45261 win 65535
++0.0001 < P. 45261:46721(1460) ack 1 win 257
++.0 > . 1:1(0) ack 46721 win 65535
++0.0001 < P. 46721:48181(1460) ack 1 win 257
++.0 > . 1:1(0) ack 48181 win 65535
++0.0001 < P. 48181:49641(1460) ack 1 win 257
++.0 > . 1:1(0) ack 49641 win 65535
++0.0001 < P. 49641:51101(1460) ack 1 win 257
++.0 > . 1:1(0) ack 51101 win 65535
++0.0001 < P. 51101:52561(1460) ack 1 win 257
++.0 > . 1:1(0) ack 52561 win 65535
++0.0001 < P. 52561:54021(1460) ack 1 win 257
++.0 > . 1:1(0) ack 54021 win 65535
++0.0001 < P. 54021:55481(1460) ack 1 win 257
++.0 > . 1:1(0) ack 55481 win 65535
++0.0001 < P. 55481:56941(1460) ack 1 win 257
++.0 > . 1:1(0) ack 56941 win 65535
++0.0001 < P. 56941:58401(1460) ack 1 win 257
++.0 > . 1:1(0) ack 58401 win 65535
++0.0001 < P. 58401:59861(1460) ack 1 win 257
++.0 > . 1:1(0) ack 59861 win 65535
++0.0001 < P. 59861:61321(1460) ack 1 win 257
++.0 > . 1:1(0) ack 61321 win 65535
++0.0001 < P. 61321:62781(1460) ack 1 win 257
++.0 > . 1:1(0) ack 62781 win 65535
++0.0001 < P. 62781:64241(1460) ack 1 win 257
++.0 > . 1:1(0) ack 64241 win 65535
++0.0001 < P. 64241:65701(1460) ack 1 win 257
++.0 > . 1:1(0) ack 65701 win 65535
++0.0001 < P. 65701:67161(1460) ack 1 win 257
++.0 > . 1:1(0) ack 67161 win 65535
+
+// nf_ct_proto_6: SEQ is under the lower bound (already ACKed data retransmitted) IN=tun0 OUT= MAC= SRC=192.0.2.1 DST=192.168.24.72 LEN=1500 TOS=0x00 PREC=0x00 TTL=255 ID=0 PROTO=TCP SPT=34375 DPT=8080 SEQ=1 ACK=4162510439 WINDOW=257 RES=0x00 ACK PSH URGP=0
++0.0001 < P. 1:1461(1460) ack 1 win 257
+
+// only sent if above packet isn't flagged as invalid
++.0 > . 1:1(0) ack 67161 win 65535
+
++0 `$xtables -D INPUT -m conntrack --ctstate INVALID -j DROP`
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt
new file mode 100644
index 000000000000..dccdd4c009c6
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt
@@ -0,0 +1,62 @@
+// check RST packet that doesn't exactly match expected next sequence
+// number still transitions conntrack state to CLOSE iff its already in
+// FIN/CLOSE_WAIT.
+
+`packetdrill/common.sh`
+
+// 5.771921 server_ip > client_ip TLSv1.2 337 [Packet size limited during capture]
+// 5.771994 server_ip > client_ip TLSv1.2 337 [Packet size limited during capture]
+// 5.772212 client_ip > server_ip TCP 66 45020 > 443 [ACK] Seq=1905874048 Ack=781810658 Win=36352 Len=0 TSval=3317842872 TSecr=675936334
+// 5.787924 server_ip > client_ip TLSv1.2 1300 [Packet size limited during capture]
+// 5.788126 server_ip > client_ip TLSv1.2 90 Application Data
+// 5.788207 server_ip > client_ip TCP 66 443 > 45020 [FIN, ACK] Seq=781811916 Ack=1905874048 Win=31104 Len=0 TSval=675936350 TSecr=3317842872
+// 5.788447 client_ip > server_ip TLSv1.2 90 Application Data
+// 5.788479 client_ip > server_ip TCP 66 45020 > 443 [RST, ACK] Seq=1905874072 Ack=781811917 Win=39040 Len=0 TSval=3317842889 TSecr=675936350
+// 5.788581 server_ip > client_ip TCP 54 8443 > 45020 [RST] Seq=781811892 Win=0 Len=0
+
++0 `iptables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
++0 `iptables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+
+0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8>
+
++0.1 < S. 1:1(0) ack 1 win 65535 <mss 1460>
+
++0 > . 1:1(0) ack 1 win 65535
++0 < . 1:1001(1000) ack 1 win 65535
++0 < . 1001:2001(1000) ack 1 win 65535
++0 < . 2001:3001(1000) ack 1 win 65535
+
++0 > . 1:1(0) ack 1001 win 65535
++0 > . 1:1(0) ack 2001 win 65535
++0 > . 1:1(0) ack 3001 win 65535
+
++0 write(3, ..., 1000) = 1000
+
++0.0 > P. 1:1001(1000) ack 3001 win 65535
+
++0.1 read(3, ..., 1000) = 1000
+
+// Conntrack should move to FIN_WAIT, then CLOSE_WAIT.
++0 < F. 3001:3001(0) ack 1001 win 65535
++0 > . 1001:1001(0) ack 3002 win 65535
+
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE_WAIT`
+
++1 close(3) = 0
+// RST: unread data. FIN was seen, hence ack + 1
++0 > R. 1001:1001(0) ack 3002 win 65535
+// ... and then, CLOSE.
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE\ `
+
+// Spurious RST from peer -- no sk state. Should NOT get
+// marked INVALID, because conntrack is already closing.
++0.1 < R 2001:2001(0) win 0
+
+// No packets should have been marked INVALID
++0 `iptables -v -S INPUT | grep INVALID | grep -q -- "-c 0 0"`
++0 `iptables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"`
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt
new file mode 100644
index 000000000000..686f18a3d9ef
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt
@@ -0,0 +1,59 @@
+// check that out of window resets are marked as INVALID and conntrack remains
+// in ESTABLISHED state.
+
+`packetdrill/common.sh`
+
++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+
+0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8>
+
++0.1 < S. 1:1(0) ack 1 win 65535 <mss 1460>
+
++0 > . 1:1(0) ack 1 win 65535
++0 < . 1:1001(1000) ack 1 win 65535
++0 < . 1001:2001(1000) ack 1 win 65535
++0 < . 2001:3001(1000) ack 1 win 65535
+
++0 > . 1:1(0) ack 1001 win 65535
++0 > . 1:1(0) ack 2001 win 65535
++0 > . 1:1(0) ack 3001 win 65535
+
++0 write(3, ..., 1000) = 1000
+
+// out of window
++0.0 < R 0:0(0) win 0
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED`
+
+// out of window
++0.0 < R 1000000:1000000(0) win 0
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED`
+
+// in-window but not exact match
++0.0 < R 42:42(0) win 0
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED`
+
++0.0 > P. 1:1001(1000) ack 3001 win 65535
+
++0.1 read(3, ..., 1000) = 1000
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED`
+
++0 < . 3001:3001(0) ack 1001 win 65535
+
++0.0 < R. 3000:3000(0) ack 1001 win 0
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED`
+
+// exact next sequence
++0.0 < R. 3001:3001(0) ack 1001 win 0
+// Conntrack should move to CLOSE
+
+// Expect four invalid RSTs
++0 `$xtables -v -S INPUT | grep INVALID | grep -q -- "-c 4 "`
++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"`
+
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE\ `
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt
new file mode 100644
index 000000000000..3442cd29bc93
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt
@@ -0,0 +1,44 @@
+// Check connection re-use, i.e. peer that receives the SYN answers with
+// a challenge-ACK.
+// Check that conntrack lets all packets pass, including the challenge ack,
+// and that a new connection is established.
+
+`packetdrill/common.sh`
+
+// S >
+// . < (challnge-ack)
+// R. >
+// S >
+// S. <
+// Expected outcome: established connection.
+
++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8>
+
+// Challenge ACK, old incarnation.
+0.1 < . 145824453:145824453(0) ack 643160523 win 240 <mss 1460,nop,nop,TS val 1 ecr 1,nop,wscale 0>
+
++0.01 > R 643160523:643160523(0) win 0
+
++0.01 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep UNREPLIED | grep -q SYN_SENT`
+
+// Must go through.
++0.01 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8>
+
+// correct synack
++0.1 < S. 0:0(0) ack 1 win 250 <mss 1460,nop,nop,TS val 1 ecr 1,nop,wscale 0>
+
+// 3whs completes.
++0.01 > . 1:1(0) ack 1 win 256 <nop,nop,TS val 1 ecr 1>
+
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ESTABLISHED | grep -q ASSURED`
+
+// No packets should have been marked INVALID
++0 `$xtables -v -S INPUT | grep INVALID | grep -q -- "-c 0 0"`
++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"`
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt
new file mode 100644
index 000000000000..3047160c4bf3
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt
@@ -0,0 +1,51 @@
+// Check conntrack copes with syn/ack reply for a previous, old incarnation.
+
+// tcpdump with buggy sequence
+// 10.176.25.8.829 > 10.192.171.30.2049: Flags [S], seq 2375731741, win 29200, options [mss 1460,sackOK,TS val 2083107423 ecr 0,nop,wscale 7], length 0
+// OLD synack, for old/previous S
+// 10.192.171.30.2049 > 10.176.25.8.829: Flags [S.], seq 145824453, ack 643160523, win 65535, options [mss 8952,nop,wscale 5,TS val 3215437785 ecr 2082921663,nop,nop], length 0
+// This reset never makes it to the endpoint, elided in the packetdrill script
+// 10.192.171.30.2049 > 10.176.25.8.829: Flags [R.], seq 1, ack 1, win 65535, options [mss 8952,nop,wscale 5,TS val 3215443451 ecr 2082921663,nop,nop], length 0
+// Syn retransmit, no change
+// 10.176.25.8.829 > 10.192.171.30.2049: Flags [S], seq 2375731741, win 29200, options [mss 1460,sackOK,TS val 2083115583 ecr 0,nop,wscale 7], length 0
+// CORRECT synack, should be accepted, but conntrack classified this as INVALID:
+// SEQ is over the upper bound (over the window of the receiver) IN=tun0 OUT= MAC= SRC=192.0.2.1 DST=192.168.37.78 LEN=40 TOS=0x00 PREC=0x00 TTL=255 ID=0 PROTO=TCP SPT=8080 DPT=34500 SEQ=162602411 ACK=2124350315 ..
+// 10.192.171.30.2049 > 10.176.25.8.829: Flags [S.], seq 162602410, ack 2375731742, win 65535, options [mss 8952,nop,wscale 5,TS val 3215445754 ecr 2083115583,nop,nop], length 0
+
+`packetdrill/common.sh`
+
++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0
+
+0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress)
+0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8>
+
+// bogus/outdated synack, invalid ack value
+0.1 < S. 145824453:145824453(0) ack 643160523 win 240 <mss 1440,nop,nop,TS val 1 ecr 1,nop,wscale 0>
+
+// syn retransmitted
+1.01 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1015 ecr 0,nop,wscale 8>
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep UNREPLIED | grep -q SYN_SENT`
+
+// correct synack
++0 < S. 145758918:145758918(0) ack 1 win 250 <mss 1460,nop,nop,TS val 1 ecr 1,nop,wscale 0>
++0 write(3, ..., 1) = 1
+
+// with buggy conntrack above packet is dropped, so SYN rtx is seen:
+// script packet: 1.054007 . 1:1(0) ack 16777958 win 256 <nop,nop,TS val 1033 ecr 1>
+// actual packet: 3.010000 S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1015 ecr 0,nop,wscale 8>
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ESTABLISHED | grep -q ASSURED`
+
++0 > P. 1:2(1) ack 4294901762 win 256 <nop,nop,TS val 1067 ecr 1>
+
++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ASSURED | grep -q ESTABLISHED`
+
+// No packets should have been marked INVALID in OUTPUT direction, 1 in INPUT
++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"`
++0 `$xtables -v -S INPUT | grep INVALID | grep -q -- "-c 1 "`
+
++0 `$xtables -D INPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
++0 `$xtables -D OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP`
diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt
new file mode 100644
index 000000000000..21e1bb6395e4
--- /dev/null
+++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt
@@ -0,0 +1,34 @@
+// Check reception of another SYN while we have an established conntrack state.
+// Challenge ACK is supposed to pass through, RST reply should clear conntrack
+// state and SYN retransmit should give us new 'SYN_RECV' connection state.
+
+`packetdrill/common.sh`
+
+// should show a match if bug is present:
++0 `iptables -A INPUT -m conntrack --ctstate INVALID -p tcp --tcp-flags SYN,ACK SYN,ACK`
+
++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
++0 bind(3, ..., ...) = 0
++0 listen(3, 10) = 0
+
++0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7, TS val 1 ecr 0,nop,nop>
++0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,TS val 100 ecr 1,nop,wscale 8>
++.01 < . 1:1(0) ack 1 win 257 <TS val 1 ecr 100,nop,nop>
++0 accept(3, ..., ...) = 4
+
++0 < P. 1:101(100) ack 1 win 257 <TS val 2 ecr 100,nop,nop>
++.001 > . 1:1(0) ack 101 win 256 <nop,nop,TS val 110 ecr 2>
++0 read(4, ..., 101) = 100
+
+1.0 < S 2000:2000(0) win 32792 <mss 1000,nop,wscale 7, TS val 233 ecr 0,nop,nop>
+// Won't expect this: challenge ack.
+
++0 > . 1:1(0) ack 101 win 256 <nop,nop,TS val 112 ecr 2>
++0 < R. 101:101(0) ack 1 win 257
++0 close(4) = 0
+
+1.5 < S 2000:2000(0) win 32792 <mss 1000,nop,wscale 0, TS val 233 ecr 0,nop,nop>
+
++0 `conntrack -L -p tcp --dport 8080 2>/dev/null | grep -q SYN_RECV`
++0 `iptables -v -S INPUT | grep INVALID | grep -q -- "-c 0 0"`