diff options
author | Jakub Kicinski | 2024-05-13 13:12:34 -0700 |
---|---|---|
committer | Jakub Kicinski | 2024-05-13 13:12:35 -0700 |
commit | c85e41bfe7af41c71c438c6011b298398c185fa8 (patch) | |
tree | 6e7bee1a64f50af926efaa05439010c20fc00547 | |
parent | cddd2dc6390b90e62cec2768424d1d90f6d04161 (diff) | |
parent | fa23e0d4b756d25829e124d6b670a4c6bbd4bf7e (diff) |
Merge tag 'nf-next-24-05-12' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next
Pablo Neira Ayuso says:
====================
Netfilter updates for net-next
The following patchset contains Netfilter updates for net-next:
Patch #1 skips transaction if object type provides no .update interface.
Patch #2 skips NETDEV_CHANGENAME which is unused.
Patch #3 enables conntrack to handle Multicast Router Advertisements and
Multicast Router Solicitations from the Multicast Router Discovery
protocol (RFC4286) as untracked, as opposed to invalid, packets.
From Linus Luessing.
Patch #4 updates the DCCP conntracker to mark invalid packets as invalid,
instead of dropping them, from Jason Xing.
Patch #5 uses NF_DROP instead of -NF_DROP since NF_DROP is 0,
also from Jason.
Patch #6 removes reference in netfilter's sysctl documentation on pickup
entries which were already removed by Florian Westphal.
Patch #7 removes the IPS_OFFLOAD flag check that disabled early drop, which
allows evicting entries from the conntrack table,
also from Florian.
Patches #8 to #16 update the nf_tables pipapo set backend to allocate
the data structure copy on demand from the preparation phase,
to better deal with OOM situations where the .commit step is too late
to fail. Series from Florian Westphal.
Patch #17 adds a selftest with packetdrill to cover conntrack TCP state
transitions, also from Florian.
Patch #18 uses GFP_KERNEL to clone elements from the control plane to avoid
quickly exhausting atomic reserves with large sets; the reporter refers
to sets on the order of a million entries.
* tag 'nf-next-24-05-12' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next:
netfilter: nf_tables: allow clone callbacks to sleep
selftests: netfilter: add packetdrill based conntrack tests
netfilter: nft_set_pipapo: remove dirty flag
netfilter: nft_set_pipapo: move cloning of match info to insert/removal path
netfilter: nft_set_pipapo: prepare pipapo_get helper for on-demand clone
netfilter: nft_set_pipapo: merge deactivate helper into caller
netfilter: nft_set_pipapo: prepare walk function for on-demand clone
netfilter: nft_set_pipapo: prepare destroy function for on-demand clone
netfilter: nft_set_pipapo: make pipapo_clone helper return NULL
netfilter: nft_set_pipapo: move prove_locking helper around
netfilter: conntrack: remove flowtable early-drop test
netfilter: conntrack: documentation: remove reference to non-existent sysctl
netfilter: use NF_DROP instead of -NF_DROP
netfilter: conntrack: dccp: try not to drop skb in conntrack
netfilter: conntrack: fix ct-state for ICMPv6 Multicast Router Discovery
netfilter: nf_tables: remove NETDEV_CHANGENAME from netdev chain event handler
netfilter: nf_tables: skip transaction if update object is not implemented
====================
Link: https://lore.kernel.org/r/20240512161436.168973-1-pablo@netfilter.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
28 files changed, 639 insertions, 175 deletions
diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst index c383a394c665..238b66d0e059 100644 --- a/Documentation/networking/nf_conntrack-sysctl.rst +++ b/Documentation/networking/nf_conntrack-sysctl.rst @@ -222,11 +222,11 @@ nf_flowtable_tcp_timeout - INTEGER (seconds) Control offload timeout for tcp connections. TCP connections may be offloaded from nf conntrack to nf flow table. - Once aged, the connection is returned to nf conntrack with tcp pickup timeout. + Once aged, the connection is returned to nf conntrack. nf_flowtable_udp_timeout - INTEGER (seconds) default 30 Control offload timeout for udp connections. UDP connections may be offloaded from nf conntrack to nf flow table. - Once aged, the connection is returned to nf conntrack with udp pickup timeout. + Once aged, the connection is returned to nf conntrack. diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3f1ed467f951..2796153b03da 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -416,7 +416,7 @@ struct nft_expr_info; int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, struct nft_expr_info *info); -int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); +int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, const struct nft_expr *expr, bool reset); @@ -935,7 +935,7 @@ struct nft_expr_ops { struct nft_regs *regs, const struct nft_pktinfo *pkt); int (*clone)(struct nft_expr *dst, - const struct nft_expr *src); + const struct nft_expr *src, gfp_t gfp); unsigned int size; int (*init)(const struct nft_ctx *ctx, diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h index ecaece3af38d..4eaab89e2856 100644 --- a/include/uapi/linux/icmpv6.h +++ b/include/uapi/linux/icmpv6.h 
@@ -112,6 +112,7 @@ struct icmp6hdr { #define ICMPV6_MOBILE_PREFIX_ADV 147 #define ICMPV6_MRDISC_ADV 151 +#define ICMPV6_MRDISC_SOL 152 #define ICMPV6_MSG_MAX 255 diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index b9062f4552ac..3ab908b74795 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -44,7 +44,7 @@ static int iptable_filter_table_init(struct net *net) return -ENOMEM; /* Entry 1 is the FORWARD hook */ ((struct ipt_standard *)repl->entries)[1].target.verdict = - forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; + forward ? -NF_ACCEPT - 1 : NF_DROP - 1; err = ipt_register_table(net, &packet_filter, repl, filter_ops); kfree(repl); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c index df785ebda0ca..e8992693e14a 100644 --- a/net/ipv6/netfilter/ip6table_filter.c +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -43,7 +43,7 @@ static int ip6table_filter_table_init(struct net *net) return -ENOMEM; /* Entry 1 is the FORWARD hook */ ((struct ip6t_standard *)repl->entries)[1].target.verdict = - forward ? -NF_ACCEPT - 1 : -NF_DROP - 1; + forward ? 
-NF_ACCEPT - 1 : NF_DROP - 1; err = ip6t_register_table(net, &packet_filter, repl, filter_ops); kfree(repl); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index c63868666bd9..7ac20750c127 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1440,8 +1440,6 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct) const struct nf_conntrack_l4proto *l4proto; u8 protonum = nf_ct_protonum(ct); - if (test_bit(IPS_OFFLOAD_BIT, &ct->status) && protonum != IPPROTO_UDP) - return false; if (!test_bit(IPS_ASSURED_BIT, &ct->status)) return true; @@ -2024,7 +2022,7 @@ repeat: goto repeat; NF_CT_STAT_INC_ATOMIC(state->net, invalid); - if (ret == -NF_DROP) + if (ret == NF_DROP) NF_CT_STAT_INC_ATOMIC(state->net, drop); ret = -ret; diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index e2db1f4ec2df..ebc4f733bb2e 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -525,7 +525,7 @@ int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, dh = skb_header_pointer(skb, dataoff, sizeof(*dh), &_dh.dh); if (!dh) - return NF_DROP; + return -NF_ACCEPT; if (dccp_error(dh, skb, dataoff, state)) return -NF_ACCEPT; @@ -533,7 +533,7 @@ int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, /* pull again, including possible 48 bit sequences and subtype header */ dh = dccp_header_pointer(skb, dataoff, dh, &_dh); if (!dh) - return NF_DROP; + return -NF_ACCEPT; type = dh->dccph_type; if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh, state)) diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 1020d67600a9..327b8059025d 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -62,7 +62,9 @@ static const u_int8_t noct_valid_new[] = { [NDISC_ROUTER_ADVERTISEMENT - 130] = 1, 
[NDISC_NEIGHBOUR_SOLICITATION - 130] = 1, [NDISC_NEIGHBOUR_ADVERTISEMENT - 130] = 1, - [ICMPV6_MLD2_REPORT - 130] = 1 + [ICMPV6_MLD2_REPORT - 130] = 1, + [ICMPV6_MRDISC_ADV - 130] = 1, + [ICMPV6_MRDISC_SOL - 130] = 1 }; bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 167074283ea9..be3b4c90d2ed 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3333,7 +3333,7 @@ err_expr_parse: return ERR_PTR(err); } -int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) +int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src, gfp_t gfp) { int err; @@ -3341,7 +3341,7 @@ int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) return -EINVAL; dst->ops = src->ops; - err = src->ops->clone(dst, src); + err = src->ops->clone(dst, src, gfp); if (err < 0) return err; @@ -6525,7 +6525,7 @@ int nft_set_elem_expr_clone(const struct nft_ctx *ctx, struct nft_set *set, if (!expr) goto err_expr; - err = nft_expr_clone(expr, set->exprs[i]); + err = nft_expr_clone(expr, set->exprs[i], GFP_KERNEL_ACCOUNT); if (err < 0) { kfree(expr); goto err_expr; @@ -6564,7 +6564,7 @@ static int nft_set_elem_expr_setup(struct nft_ctx *ctx, for (i = 0; i < num_exprs; i++) { expr = nft_setelem_expr_at(elem_expr, elem_expr->size); - err = nft_expr_clone(expr, expr_array[i]); + err = nft_expr_clone(expr, expr_array[i], GFP_KERNEL_ACCOUNT); if (err < 0) goto err_elem_expr_setup; @@ -7776,6 +7776,9 @@ static int nf_tables_newobj(struct sk_buff *skb, const struct nfnl_info *info, if (WARN_ON_ONCE(!type)) return -ENOENT; + if (!obj->ops->update) + return 0; + nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); return nf_tables_updobj(&ctx, type, nla[NFTA_OBJ_DATA], obj); @@ -9467,9 +9470,10 @@ static void nft_obj_commit_update(struct nft_trans *trans) obj = nft_trans_obj(trans); newobj = nft_trans_obj_newobj(trans); - if (obj->ops->update) - 
obj->ops->update(obj, newobj); + if (WARN_ON_ONCE(!obj->ops->update)) + return; + obj->ops->update(obj, newobj); nft_obj_destroy(&trans->ctx, newobj); } diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index d170758a1eb5..7010541fcca6 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -325,9 +325,6 @@ static void nft_netdev_event(unsigned long event, struct net_device *dev, struct nft_hook *hook, *found = NULL; int n = 0; - if (event != NETDEV_UNREGISTER) - return; - list_for_each_entry(hook, &basechain->hook_list, list) { if (hook->ops.dev == dev) found = hook; @@ -367,8 +364,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, .net = dev_net(dev), }; - if (event != NETDEV_UNREGISTER && - event != NETDEV_CHANGENAME) + if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; nft_net = nft_pernet(ctx.net); diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index de9d1980df69..92b984fa8175 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -210,12 +210,12 @@ static void nft_connlimit_destroy(const struct nft_ctx *ctx, nft_connlimit_do_destroy(ctx, priv); } -static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_connlimit *priv_dst = nft_expr_priv(dst); struct nft_connlimit *priv_src = nft_expr_priv(src); - priv_dst->list = kmalloc(sizeof(*priv_dst->list), GFP_ATOMIC); + priv_dst->list = kmalloc(sizeof(*priv_dst->list), gfp); if (!priv_dst->list) return -ENOMEM; diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index dccc68a5135a..291ed2026367 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -226,7 +226,7 @@ static void nft_counter_destroy(const struct nft_ctx *ctx, nft_counter_do_destroy(priv); } -static int nft_counter_clone(struct nft_expr *dst, const 
struct nft_expr *src) +static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_counter_percpu_priv *priv = nft_expr_priv(src); struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst); @@ -236,7 +236,7 @@ static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src) nft_counter_fetch(priv, &total); - cpu_stats = alloc_percpu_gfp(struct nft_counter, GFP_ATOMIC); + cpu_stats = alloc_percpu_gfp(struct nft_counter, gfp); if (cpu_stats == NULL) return -ENOMEM; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index c09dba57354c..b4ada3ab2167 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -35,7 +35,7 @@ static int nft_dynset_expr_setup(const struct nft_dynset *priv, for (i = 0; i < priv->num_exprs; i++) { expr = nft_setelem_expr_at(elem_expr, elem_expr->size); - if (nft_expr_clone(expr, priv->expr_array[i]) < 0) + if (nft_expr_clone(expr, priv->expr_array[i], GFP_ATOMIC) < 0) return -1; elem_expr->size += priv->expr_array[i]->ops->size; diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c index 8e6d7eaf9dc8..de1b6066bfa8 100644 --- a/net/netfilter/nft_last.c +++ b/net/netfilter/nft_last.c @@ -102,12 +102,12 @@ static void nft_last_destroy(const struct nft_ctx *ctx, kfree(priv->last); } -static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_last_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_last_priv *priv_dst = nft_expr_priv(dst); struct nft_last_priv *priv_src = nft_expr_priv(src); - priv_dst->last = kzalloc(sizeof(*priv_dst->last), GFP_ATOMIC); + priv_dst->last = kzalloc(sizeof(*priv_dst->last), gfp); if (!priv_dst->last) return -ENOMEM; diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index cefa25e0dbb0..21d26b79b460 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -150,7 +150,7 @@ static void nft_limit_destroy(const struct 
nft_ctx *ctx, } static int nft_limit_clone(struct nft_limit_priv *priv_dst, - const struct nft_limit_priv *priv_src) + const struct nft_limit_priv *priv_src, gfp_t gfp) { priv_dst->tokens_max = priv_src->tokens_max; priv_dst->rate = priv_src->rate; @@ -158,7 +158,7 @@ static int nft_limit_clone(struct nft_limit_priv *priv_dst, priv_dst->burst = priv_src->burst; priv_dst->invert = priv_src->invert; - priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), GFP_ATOMIC); + priv_dst->limit = kmalloc(sizeof(*priv_dst->limit), gfp); if (!priv_dst->limit) return -ENOMEM; @@ -223,14 +223,15 @@ static void nft_limit_pkts_destroy(const struct nft_ctx *ctx, nft_limit_destroy(ctx, &priv->limit); } -static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_limit_pkts_clone(struct nft_expr *dst, const struct nft_expr *src, + gfp_t gfp) { struct nft_limit_priv_pkts *priv_dst = nft_expr_priv(dst); struct nft_limit_priv_pkts *priv_src = nft_expr_priv(src); priv_dst->cost = priv_src->cost; - return nft_limit_clone(&priv_dst->limit, &priv_src->limit); + return nft_limit_clone(&priv_dst->limit, &priv_src->limit, gfp); } static struct nft_expr_type nft_limit_type; @@ -281,12 +282,13 @@ static void nft_limit_bytes_destroy(const struct nft_ctx *ctx, nft_limit_destroy(ctx, priv); } -static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_limit_bytes_clone(struct nft_expr *dst, const struct nft_expr *src, + gfp_t gfp) { struct nft_limit_priv *priv_dst = nft_expr_priv(dst); struct nft_limit_priv *priv_src = nft_expr_priv(src); - return nft_limit_clone(priv_dst, priv_src); + return nft_limit_clone(priv_dst, priv_src, gfp); } static const struct nft_expr_ops nft_limit_bytes_ops = { diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 3ba12a7471b0..9b2d7463d3d3 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -233,7 +233,7 @@ static void nft_quota_destroy(const struct 
nft_ctx *ctx, return nft_quota_do_destroy(ctx, priv); } -static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src) +static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src, gfp_t gfp) { struct nft_quota *priv_dst = nft_expr_priv(dst); struct nft_quota *priv_src = nft_expr_priv(src); @@ -241,7 +241,7 @@ static int nft_quota_clone(struct nft_expr *dst, const struct nft_expr *src) priv_dst->quota = priv_src->quota; priv_dst->flags = priv_src->flags; - priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), GFP_ATOMIC); + priv_dst->consumed = kmalloc(sizeof(*priv_dst->consumed), gfp); if (!priv_dst->consumed) return -ENOMEM; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 187138afac45..15a236bebb46 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -504,6 +504,7 @@ out: * pipapo_get() - Get matching element reference given key data * @net: Network namespace * @set: nftables API set representation + * @m: storage containing active/existing elements * @data: Key data to be matched against existing elements * @genmask: If set, check that element is active in given genmask * @tstamp: timestamp to check for expired elements @@ -517,17 +518,15 @@ out: */ static struct nft_pipapo_elem *pipapo_get(const struct net *net, const struct nft_set *set, + const struct nft_pipapo_match *m, const u8 *data, u8 genmask, u64 tstamp, gfp_t gfp) { struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT); - struct nft_pipapo *priv = nft_set_priv(set); unsigned long *res_map, *fill_map = NULL; - const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; int i; - m = priv->clone; if (m->bsize_max == 0) return ret; @@ -612,9 +611,11 @@ static struct nft_elem_priv * nft_pipapo_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m = rcu_dereference(priv->match); 
struct nft_pipapo_elem *e; - e = pipapo_get(net, set, (const u8 *)elem->key.val.data, + e = pipapo_get(net, set, m, (const u8 *)elem->key.val.data, nft_genmask_cur(net), get_jiffies_64(), GFP_ATOMIC); if (IS_ERR(e)) @@ -1247,6 +1248,40 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, return 0; } +static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set) +{ +#ifdef CONFIG_PROVE_LOCKING + const struct net *net = read_pnet(&set->net); + + return lockdep_is_held(&nft_pernet(net)->commit_mutex); +#else + return true; +#endif +} + +static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old); + +/** + * pipapo_maybe_clone() - Build clone for pending data changes, if not existing + * @set: nftables API set representation + * + * Return: newly created or existing clone, if any. NULL on allocation failure + */ +static struct nft_pipapo_match *pipapo_maybe_clone(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + + if (priv->clone) + return priv->clone; + + m = rcu_dereference_protected(priv->match, + nft_pipapo_transaction_mutex_held(set)); + priv->clone = pipapo_clone(m); + + return priv->clone; +} + /** * nft_pipapo_insert() - Validate and insert ranged elements * @net: Network namespace @@ -1263,8 +1298,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; const u8 *start = (const u8 *)elem->key.val.data, *end; - struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *m = priv->clone; + struct nft_pipapo_match *m = pipapo_maybe_clone(set); u8 genmask = nft_genmask_next(net); struct nft_pipapo_elem *e, *dup; u64 tstamp = nft_net_tstamp(net); @@ -1272,12 +1306,15 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, const u8 *start_p, *end_p; int i, bsize_max, err = 0; + 
if (!m) + return -ENOMEM; + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) end = (const u8 *)nft_set_ext_key_end(ext)->data; else end = start; - dup = pipapo_get(net, set, start, genmask, tstamp, GFP_KERNEL); + dup = pipapo_get(net, set, m, start, genmask, tstamp, GFP_KERNEL); if (!IS_ERR(dup)) { /* Check if we already have the same exact entry */ const struct nft_data *dup_key, *dup_end; @@ -1299,7 +1336,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, if (PTR_ERR(dup) == -ENOENT) { /* Look for partially overlapping entries */ - dup = pipapo_get(net, set, end, nft_genmask_next(net), tstamp, + dup = pipapo_get(net, set, m, end, nft_genmask_next(net), tstamp, GFP_KERNEL); } @@ -1332,8 +1369,6 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, } /* Insert */ - priv->dirty = true; - bsize_max = m->bsize_max; nft_pipapo_for_each_field(f, i, m) { @@ -1384,7 +1419,7 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, * pipapo_clone() - Clone matching data to create new working copy * @old: Existing matching data * - * Return: copy of matching data passed as 'old', error pointer on failure + * Return: copy of matching data passed as 'old' or NULL. */ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) { @@ -1394,7 +1429,7 @@ static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL); if (!new) - return ERR_PTR(-ENOMEM); + return NULL; new->field_count = old->field_count; new->bsize_max = old->bsize_max; @@ -1466,7 +1501,7 @@ out_scratch: free_percpu(new->scratch); kfree(new); - return ERR_PTR(-ENOMEM); + return NULL; } /** @@ -1698,8 +1733,6 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m) * NFT_SET_ELEM_DEAD_BIT. 
*/ if (__nft_set_elem_expired(&e->ext, tstamp)) { - priv->dirty = true; - gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); if (!gc) return; @@ -1777,57 +1810,30 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) static void nft_pipapo_commit(struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *new_clone, *old; - - if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) - pipapo_gc(set, priv->clone); + struct nft_pipapo_match *old; - if (!priv->dirty) + if (!priv->clone) return; - new_clone = pipapo_clone(priv->clone); - if (IS_ERR(new_clone)) - return; + if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) + pipapo_gc(set, priv->clone); - priv->dirty = false; + old = rcu_replace_pointer(priv->match, priv->clone, + nft_pipapo_transaction_mutex_held(set)); + priv->clone = NULL; - old = rcu_access_pointer(priv->match); - rcu_assign_pointer(priv->match, priv->clone); if (old) call_rcu(&old->rcu, pipapo_reclaim_match); - - priv->clone = new_clone; -} - -static bool nft_pipapo_transaction_mutex_held(const struct nft_set *set) -{ -#ifdef CONFIG_PROVE_LOCKING - const struct net *net = read_pnet(&set->net); - - return lockdep_is_held(&nft_pernet(net)->commit_mutex); -#else - return true; -#endif } static void nft_pipapo_abort(const struct nft_set *set) { struct nft_pipapo *priv = nft_set_priv(set); - struct nft_pipapo_match *new_clone, *m; - - if (!priv->dirty) - return; - - m = rcu_dereference_protected(priv->match, nft_pipapo_transaction_mutex_held(set)); - new_clone = pipapo_clone(m); - if (IS_ERR(new_clone)) + if (!priv->clone) return; - - priv->dirty = false; - pipapo_free_match(priv->clone); - priv->clone = new_clone; + priv->clone = NULL; } /** @@ -1851,52 +1857,38 @@ static void nft_pipapo_activate(const struct net *net, } /** - * pipapo_deactivate() - Check that element is in set, mark as inactive + * nft_pipapo_deactivate() - Search for element and make it inactive * @net: Network 
namespace * @set: nftables API set representation - * @data: Input key data - * @ext: nftables API extension pointer, used to check for end element - * - * This is a convenience function that can be called from both - * nft_pipapo_deactivate() and nft_pipapo_flush(), as they are in fact the same - * operation. + * @elem: nftables API element representation containing key data * * Return: deactivated element if found, NULL otherwise. */ -static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, - const u8 *data, const struct nft_set_ext *ext) +static struct nft_elem_priv * +nft_pipapo_deactivate(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) { + struct nft_pipapo_match *m = pipapo_maybe_clone(set); struct nft_pipapo_elem *e; - e = pipapo_get(net, set, data, nft_genmask_next(net), - nft_net_tstamp(net), GFP_KERNEL); + /* removal must occur on priv->clone, if we are low on memory + * we have no choice and must fail the removal request. + */ + if (!m) + return NULL; + + e = pipapo_get(net, set, m, (const u8 *)elem->key.val.data, + nft_genmask_next(net), nft_net_tstamp(net), GFP_KERNEL); if (IS_ERR(e)) return NULL; nft_set_elem_change_active(net, set, &e->ext); - return e; -} - -/** - * nft_pipapo_deactivate() - Call pipapo_deactivate() to make element inactive - * @net: Network namespace - * @set: nftables API set representation - * @elem: nftables API element representation containing key data - * - * Return: deactivated element if found, NULL otherwise. 
- */ -static struct nft_elem_priv * -nft_pipapo_deactivate(const struct net *net, const struct nft_set *set, - const struct nft_set_elem *elem) -{ - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); - - return pipapo_deactivate(net, set, (const u8 *)elem->key.val.data, ext); + return &e->priv; } /** - * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive + * nft_pipapo_flush() - make element inactive * @net: Network namespace * @set: nftables API set representation * @elem_priv: nftables API element representation containing key data @@ -2093,7 +2085,6 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); if (last && f->mt[rulemap[i].to].e == e) { - priv->dirty = true; pipapo_drop(m, rulemap); return; } @@ -2106,35 +2097,23 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, } /** - * nft_pipapo_walk() - Walk over elements + * nft_pipapo_do_walk() - Walk over elements in m * @ctx: nftables API context * @set: nftables API set representation + * @m: matching data pointing to key mapping array * @iter: Iterator * * As elements are referenced in the mapping array for the last field, directly * scan that array: there's no need to follow rule mappings from the first - * field. + * field. @m is protected either by RCU read lock or by transaction mutex. 
*/ -static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_iter *iter) +static void nft_pipapo_do_walk(const struct nft_ctx *ctx, struct nft_set *set, + const struct nft_pipapo_match *m, + struct nft_set_iter *iter) { - struct nft_pipapo *priv = nft_set_priv(set); - const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; unsigned int i, r; - WARN_ON_ONCE(iter->type != NFT_ITER_READ && - iter->type != NFT_ITER_UPDATE); - - rcu_read_lock(); - if (iter->type == NFT_ITER_READ) - m = rcu_dereference(priv->match); - else - m = priv->clone; - - if (unlikely(!m)) - goto out; - for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) ; @@ -2151,14 +2130,49 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, iter->err = iter->fn(ctx, set, iter, &e->priv); if (iter->err < 0) - goto out; + return; cont: iter->count++; } +} -out: - rcu_read_unlock(); +/** + * nft_pipapo_walk() - Walk over elements + * @ctx: nftables API context + * @set: nftables API set representation + * @iter: Iterator + * + * Test if destructive action is needed or not, clone active backend if needed + * and call the real function to work on the data. 
+ */ +static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_pipapo *priv = nft_set_priv(set); + const struct nft_pipapo_match *m; + + switch (iter->type) { + case NFT_ITER_UPDATE: + m = pipapo_maybe_clone(set); + if (!m) { + iter->err = -ENOMEM; + return; + } + + nft_pipapo_do_walk(ctx, set, m, iter); + break; + case NFT_ITER_READ: + rcu_read_lock(); + m = rcu_dereference(priv->match); + nft_pipapo_do_walk(ctx, set, m, iter); + rcu_read_unlock(); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } } /** @@ -2267,21 +2281,10 @@ static int nft_pipapo_init(const struct nft_set *set, f->mt = NULL; } - /* Create an initial clone of matching data for next insertion */ - priv->clone = pipapo_clone(m); - if (IS_ERR(priv->clone)) { - err = PTR_ERR(priv->clone); - goto out_free; - } - - priv->dirty = false; - rcu_assign_pointer(priv->match, m); return 0; -out_free: - free_percpu(m->scratch); out_scratch: kfree(m); @@ -2326,33 +2329,18 @@ static void nft_pipapo_destroy(const struct nft_ctx *ctx, { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_match *m; - int cpu; m = rcu_dereference_protected(priv->match, true); - if (m) { - rcu_barrier(); - - for_each_possible_cpu(cpu) - pipapo_free_scratch(m, cpu); - free_percpu(m->scratch); - pipapo_free_fields(m); - kfree(m); - priv->match = NULL; - } if (priv->clone) { - m = priv->clone; - - nft_set_pipapo_match_destroy(ctx, set, m); - - for_each_possible_cpu(cpu) - pipapo_free_scratch(priv->clone, cpu); - free_percpu(priv->clone->scratch); - - pipapo_free_fields(priv->clone); - kfree(priv->clone); + nft_set_pipapo_match_destroy(ctx, set, priv->clone); + pipapo_free_match(priv->clone); priv->clone = NULL; + } else { + nft_set_pipapo_match_destroy(ctx, set, m); } + + pipapo_free_match(m); } /** diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index 24cd1ff73f98..0d2e40e10f7f 100644 --- 
a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -155,14 +155,12 @@ struct nft_pipapo_match { * @match: Currently in-use matching data * @clone: Copy where pending insertions and deletions are kept * @width: Total bytes to be matched for one packet, including padding - * @dirty: Working copy has pending insertions or deletions * @last_gc: Timestamp of last garbage collection run, jiffies */ struct nft_pipapo { struct nft_pipapo_match __rcu *match; struct nft_pipapo_match *clone; int width; - bool dirty; unsigned long last_gc; }; diff --git a/tools/testing/selftests/net/netfilter/Makefile b/tools/testing/selftests/net/netfilter/Makefile index e9a6c702b8c9..47945b2b3f92 100644 --- a/tools/testing/selftests/net/netfilter/Makefile +++ b/tools/testing/selftests/net/netfilter/Makefile @@ -13,6 +13,7 @@ TEST_PROGS += conntrack_tcp_unreplied.sh TEST_PROGS += conntrack_sctp_collision.sh TEST_PROGS += conntrack_vrf.sh TEST_PROGS += ipvs.sh +TEST_PROGS += nf_conntrack_packetdrill.sh TEST_PROGS += nf_nat_edemux.sh TEST_PROGS += nft_audit.sh TEST_PROGS += nft_concat_range.sh @@ -45,6 +46,7 @@ $(OUTPUT)/conntrack_dump_flush: CFLAGS += $(MNL_CFLAGS) $(OUTPUT)/conntrack_dump_flush: LDLIBS += $(MNL_LDLIBS) TEST_FILES := lib.sh +TEST_FILES += packetdrill TEST_INCLUDES := \ ../lib.sh diff --git a/tools/testing/selftests/net/netfilter/config b/tools/testing/selftests/net/netfilter/config index 5b5b764f6cd0..63ef80ef47a4 100644 --- a/tools/testing/selftests/net/netfilter/config +++ b/tools/testing/selftests/net/netfilter/config @@ -86,3 +86,4 @@ CONFIG_VLAN_8021Q=m CONFIG_XFRM_USER=m CONFIG_XFRM_STATISTICS=y CONFIG_NET_PKTGEN=m +CONFIG_TUN=m diff --git a/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh b/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh new file mode 100755 index 000000000000..c6fdd2079f4d --- /dev/null +++ b/tools/testing/selftests/net/netfilter/nf_conntrack_packetdrill.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# 
SPDX-License-Identifier: GPL-2.0 + +source lib.sh + +checktool "conntrack --version" "run test without conntrack" +checktool "iptables --version" "run test without iptables" +checktool "ip6tables --version" "run test without ip6tables" + +modprobe -q tun +modprobe -q nf_conntrack +# echo 1 > /proc/sys/net/netfilter/nf_log_all_netns + +PDRILL_TIMEOUT=10 + +files=" +conntrack_ack_loss_stall.pkt +conntrack_inexact_rst.pkt +conntrack_syn_challenge_ack.pkt +conntrack_synack_old.pkt +conntrack_synack_reuse.pkt +conntrack_rst_invalid.pkt +" + +if ! packetdrill --dry_run --verbose "packetdrill/conntrack_ack_loss_stall.pkt";then + echo "SKIP: packetdrill not installed" + exit ${ksft_skip} +fi + +ret=0 + +run_packetdrill() +{ + filename="$1" + ipver="$2" + local mtu=1500 + + export NFCT_IP_VERSION="$ipver" + + if [ "$ipver" = "ipv4" ];then + export xtables="iptables" + elif [ "$ipver" = "ipv6" ];then + export xtables="ip6tables" + mtu=1520 + fi + + timeout "$PDRILL_TIMEOUT" unshare -n packetdrill --ip_version="$ipver" --mtu=$mtu \ + --tolerance_usecs=1000000 --non_fatal packet "$filename" +} + +run_one_test_file() +{ + filename="$1" + + for v in ipv4 ipv6;do + printf "%-50s(%s)%-20s" "$filename" "$v" "" + if run_packetdrill "$filename" "$v";then + echo OK + else + echo FAIL + ret=1 + fi + done +} + +echo "Replaying packetdrill test cases:" +for f in $files;do + run_one_test_file packetdrill/"$f" +done + +exit $ret diff --git a/tools/testing/selftests/net/netfilter/packetdrill/common.sh b/tools/testing/selftests/net/netfilter/packetdrill/common.sh new file mode 100755 index 000000000000..ed36d535196d --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/common.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# for debugging set net.netfilter.nf_log_all_netns=1 in init_net +# or do not use net namespaces. +modprobe -q nf_conntrack +sysctl -q net.netfilter.nf_conntrack_log_invalid=6 + +# Flush old cached data (fastopen cookies). 
+ip tcp_metrics flush all > /dev/null 2>&1 + +# TCP min, default, and max receive and send buffer sizes. +sysctl -q net.ipv4.tcp_rmem="4096 540000 $((15*1024*1024))" +sysctl -q net.ipv4.tcp_wmem="4096 $((256*1024)) 4194304" + +# TCP congestion control. +sysctl -q net.ipv4.tcp_congestion_control=cubic + +# TCP slow start after idle. +sysctl -q net.ipv4.tcp_slow_start_after_idle=0 + +# TCP Explicit Congestion Notification (ECN) +sysctl -q net.ipv4.tcp_ecn=0 + +sysctl -q net.ipv4.tcp_notsent_lowat=4294967295 > /dev/null 2>&1 + +# Override the default qdisc on the tun device. +# Many tests fail with timing errors if the default +# is FQ and that paces their flows. +tc qdisc add dev tun0 root pfifo + +# Enable conntrack +$xtables -A INPUT -m conntrack --ctstate NEW -p tcp --syn diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt new file mode 100644 index 000000000000..d755bd64c54f --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_ack_loss_stall.pkt @@ -0,0 +1,118 @@ +// check that already-acked (retransmitted) packet is let through rather +// than tagged as INVALID. + +`packetdrill/common.sh` + +// should set -P DROP but it disconnects VM w.o. extra netns ++0 `$xtables -A INPUT -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 ++0 bind(3, ..., ...) = 0 ++0 listen(3, 10) = 0 + ++0 < S 0:0(0) win 32792 <mss 1000> ++0 > S. 0:0(0) ack 1 <mss 1460> ++.01 < . 1:1(0) ack 1 win 65535 ++0 accept(3, ..., ...) = 4 + ++0.0001 < P. 1:1461(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 1461 win 65535 ++0.0001 < P. 1461:2921(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 2921 win 65535 ++0.0001 < P. 2921:4381(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 4381 win 65535 ++0.0001 < P. 4381:5841(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 5841 win 65535 ++0.0001 < P. 
5841:7301(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 7301 win 65535 ++0.0001 < P. 7301:8761(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 8761 win 65535 ++0.0001 < P. 8761:10221(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 10221 win 65535 ++0.0001 < P. 10221:11681(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 11681 win 65535 ++0.0001 < P. 11681:13141(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 13141 win 65535 ++0.0001 < P. 13141:14601(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 14601 win 65535 ++0.0001 < P. 14601:16061(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 16061 win 65535 ++0.0001 < P. 16061:17521(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 17521 win 65535 ++0.0001 < P. 17521:18981(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 18981 win 65535 ++0.0001 < P. 18981:20441(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 20441 win 65535 ++0.0001 < P. 20441:21901(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 21901 win 65535 ++0.0001 < P. 21901:23361(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 23361 win 65535 ++0.0001 < P. 23361:24821(1460) ack 1 win 257 +0.055 > . 1:1(0) ack 24821 win 65535 ++0.0001 < P. 24821:26281(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 26281 win 65535 ++0.0001 < P. 26281:27741(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 27741 win 65535 ++0.0001 < P. 27741:29201(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 29201 win 65535 ++0.0001 < P. 29201:30661(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 30661 win 65535 ++0.0001 < P. 30661:32121(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 32121 win 65535 ++0.0001 < P. 32121:33581(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 33581 win 65535 ++0.0001 < P. 33581:35041(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 35041 win 65535 ++0.0001 < P. 35041:36501(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 36501 win 65535 ++0.0001 < P. 36501:37961(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 37961 win 65535 ++0.0001 < P. 37961:39421(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 39421 win 65535 ++0.0001 < P. 39421:40881(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 40881 win 65535 ++0.0001 < P. 
40881:42341(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 42341 win 65535 ++0.0001 < P. 42341:43801(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 43801 win 65535 ++0.0001 < P. 43801:45261(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 45261 win 65535 ++0.0001 < P. 45261:46721(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 46721 win 65535 ++0.0001 < P. 46721:48181(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 48181 win 65535 ++0.0001 < P. 48181:49641(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 49641 win 65535 ++0.0001 < P. 49641:51101(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 51101 win 65535 ++0.0001 < P. 51101:52561(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 52561 win 65535 ++0.0001 < P. 52561:54021(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 54021 win 65535 ++0.0001 < P. 54021:55481(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 55481 win 65535 ++0.0001 < P. 55481:56941(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 56941 win 65535 ++0.0001 < P. 56941:58401(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 58401 win 65535 ++0.0001 < P. 58401:59861(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 59861 win 65535 ++0.0001 < P. 59861:61321(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 61321 win 65535 ++0.0001 < P. 61321:62781(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 62781 win 65535 ++0.0001 < P. 62781:64241(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 64241 win 65535 ++0.0001 < P. 64241:65701(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 65701 win 65535 ++0.0001 < P. 65701:67161(1460) ack 1 win 257 ++.0 > . 1:1(0) ack 67161 win 65535 + +// nf_ct_proto_6: SEQ is under the lower bound (already ACKed data retransmitted) IN=tun0 OUT= MAC= SRC=192.0.2.1 DST=192.168.24.72 LEN=1500 TOS=0x00 PREC=0x00 TTL=255 ID=0 PROTO=TCP SPT=34375 DPT=8080 SEQ=1 ACK=4162510439 WINDOW=257 RES=0x00 ACK PSH URGP=0 ++0.0001 < P. 1:1461(1460) ack 1 win 257 + +// only sent if above packet isn't flagged as invalid ++.0 > . 
1:1(0) ack 67161 win 65535 + ++0 `$xtables -D INPUT -m conntrack --ctstate INVALID -j DROP` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt new file mode 100644 index 000000000000..dccdd4c009c6 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_inexact_rst.pkt @@ -0,0 +1,62 @@ +// check RST packet that doesn't exactly match expected next sequence +// number still transitions conntrack state to CLOSE iff its already in +// FIN/CLOSE_WAIT. + +`packetdrill/common.sh` + +// 5.771921 server_ip > client_ip TLSv1.2 337 [Packet size limited during capture] +// 5.771994 server_ip > client_ip TLSv1.2 337 [Packet size limited during capture] +// 5.772212 client_ip > server_ip TCP 66 45020 > 443 [ACK] Seq=1905874048 Ack=781810658 Win=36352 Len=0 TSval=3317842872 TSecr=675936334 +// 5.787924 server_ip > client_ip TLSv1.2 1300 [Packet size limited during capture] +// 5.788126 server_ip > client_ip TLSv1.2 90 Application Data +// 5.788207 server_ip > client_ip TCP 66 443 > 45020 [FIN, ACK] Seq=781811916 Ack=1905874048 Win=31104 Len=0 TSval=675936350 TSecr=3317842872 +// 5.788447 client_ip > server_ip TLSv1.2 90 Application Data +// 5.788479 client_ip > server_ip TCP 66 45020 > 443 [RST, ACK] Seq=1905874072 Ack=781811917 Win=39040 Len=0 TSval=3317842889 TSecr=675936350 +// 5.788581 server_ip > client_ip TCP 54 8443 > 45020 [RST] Seq=781811892 Win=0 Len=0 + ++0 `iptables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `iptables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8> + ++0.1 < S. 1:1(0) ack 1 win 65535 <mss 1460> + ++0 > . 1:1(0) ack 1 win 65535 ++0 < . 
1:1001(1000) ack 1 win 65535 ++0 < . 1001:2001(1000) ack 1 win 65535 ++0 < . 2001:3001(1000) ack 1 win 65535 + ++0 > . 1:1(0) ack 1001 win 65535 ++0 > . 1:1(0) ack 2001 win 65535 ++0 > . 1:1(0) ack 3001 win 65535 + ++0 write(3, ..., 1000) = 1000 + ++0.0 > P. 1:1001(1000) ack 3001 win 65535 + ++0.1 read(3, ..., 1000) = 1000 + +// Conntrack should move to FIN_WAIT, then CLOSE_WAIT. ++0 < F. 3001:3001(0) ack 1001 win 65535 ++0 > . 1001:1001(0) ack 3002 win 65535 + ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE_WAIT` + ++1 close(3) = 0 +// RST: unread data. FIN was seen, hence ack + 1 ++0 > R. 1001:1001(0) ack 3002 win 65535 +// ... and then, CLOSE. ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE\ ` + +// Spurious RST from peer -- no sk state. Should NOT get +// marked INVALID, because conntrack is already closing. ++0.1 < R 2001:2001(0) win 0 + +// No packets should have been marked INVALID ++0 `iptables -v -S INPUT | grep INVALID | grep -q -- "-c 0 0"` ++0 `iptables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt new file mode 100644 index 000000000000..686f18a3d9ef --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_rst_invalid.pkt @@ -0,0 +1,59 @@ +// check that out of window resets are marked as INVALID and conntrack remains +// in ESTABLISHED state. + +`packetdrill/common.sh` + ++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) + +0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8> + ++0.1 < S. 
1:1(0) ack 1 win 65535 <mss 1460> + ++0 > . 1:1(0) ack 1 win 65535 ++0 < . 1:1001(1000) ack 1 win 65535 ++0 < . 1001:2001(1000) ack 1 win 65535 ++0 < . 2001:3001(1000) ack 1 win 65535 + ++0 > . 1:1(0) ack 1001 win 65535 ++0 > . 1:1(0) ack 2001 win 65535 ++0 > . 1:1(0) ack 3001 win 65535 + ++0 write(3, ..., 1000) = 1000 + +// out of window ++0.0 < R 0:0(0) win 0 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + +// out of window ++0.0 < R 1000000:1000000(0) win 0 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + +// in-window but not exact match ++0.0 < R 42:42(0) win 0 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + ++0.0 > P. 1:1001(1000) ack 3001 win 65535 + ++0.1 read(3, ..., 1000) = 1000 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + ++0 < . 3001:3001(0) ack 1001 win 65535 + ++0.0 < R. 3000:3000(0) ack 1001 win 0 ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q ESTABLISHED` + +// exact next sequence ++0.0 < R. 3001:3001(0) ack 1001 win 0 +// Conntrack should move to CLOSE + +// Expect four invalid RSTs ++0 `$xtables -v -S INPUT | grep INVALID | grep -q -- "-c 4 "` ++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"` + ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null |grep -q CLOSE\ ` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt new file mode 100644 index 000000000000..3442cd29bc93 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_syn_challenge_ack.pkt @@ -0,0 +1,44 @@ +// Check connection re-use, i.e. peer that receives the SYN answers with +// a challenge-ACK. 
+// Check that conntrack lets all packets pass, including the challenge ack, +// and that a new connection is established. + +`packetdrill/common.sh` + +// S > +// . < (challenge-ack) +// R. > +// S > +// S. < +// Expected outcome: established connection. + ++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) +0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8> + +// Challenge ACK, old incarnation. +0.1 < . 145824453:145824453(0) ack 643160523 win 240 <mss 1460,nop,nop,TS val 1 ecr 1,nop,wscale 0> + ++0.01 > R 643160523:643160523(0) win 0 + ++0.01 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep UNREPLIED | grep -q SYN_SENT` + +// Must go through. ++0.01 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8> + +// correct synack ++0.1 < S. 0:0(0) ack 1 win 250 <mss 1460,nop,nop,TS val 1 ecr 1,nop,wscale 0> + +// 3whs completes. ++0.01 > . 1:1(0) ack 1 win 256 <nop,nop,TS val 1 ecr 1> + ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ESTABLISHED | grep -q ASSURED` + +// No packets should have been marked INVALID ++0 `$xtables -v -S INPUT | grep INVALID | grep -q -- "-c 0 0"` ++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt new file mode 100644 index 000000000000..3047160c4bf3 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_old.pkt @@ -0,0 +1,51 @@ +// Check conntrack copes with syn/ack reply for a previous, old incarnation. 
+ +// tcpdump with buggy sequence +// 10.176.25.8.829 > 10.192.171.30.2049: Flags [S], seq 2375731741, win 29200, options [mss 1460,sackOK,TS val 2083107423 ecr 0,nop,wscale 7], length 0 +// OLD synack, for old/previous S +// 10.192.171.30.2049 > 10.176.25.8.829: Flags [S.], seq 145824453, ack 643160523, win 65535, options [mss 8952,nop,wscale 5,TS val 3215437785 ecr 2082921663,nop,nop], length 0 +// This reset never makes it to the endpoint, elided in the packetdrill script +// 10.192.171.30.2049 > 10.176.25.8.829: Flags [R.], seq 1, ack 1, win 65535, options [mss 8952,nop,wscale 5,TS val 3215443451 ecr 2082921663,nop,nop], length 0 +// Syn retransmit, no change +// 10.176.25.8.829 > 10.192.171.30.2049: Flags [S], seq 2375731741, win 29200, options [mss 1460,sackOK,TS val 2083115583 ecr 0,nop,wscale 7], length 0 +// CORRECT synack, should be accepted, but conntrack classified this as INVALID: +// SEQ is over the upper bound (over the window of the receiver) IN=tun0 OUT= MAC= SRC=192.0.2.1 DST=192.168.37.78 LEN=40 TOS=0x00 PREC=0x00 TTL=255 ID=0 PROTO=TCP SPT=8080 DPT=34500 SEQ=162602411 ACK=2124350315 .. +// 10.192.171.30.2049 > 10.176.25.8.829: Flags [S.], seq 162602410, ack 2375731742, win 65535, options [mss 8952,nop,wscale 5,TS val 3215445754 ecr 2083115583,nop,nop], length 0 + +`packetdrill/common.sh` + ++0 `$xtables -A INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `$xtables -A OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 + +0.1 connect(3, ..., ...) = -1 EINPROGRESS (Operation now in progress) +0.1 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1 ecr 0,nop,wscale 8> + +// bogus/outdated synack, invalid ack value +0.1 < S. 
145824453:145824453(0) ack 643160523 win 240 <mss 1440,nop,nop,TS val 1 ecr 1,nop,wscale 0> + +// syn retransmitted +1.01 > S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1015 ecr 0,nop,wscale 8> ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep UNREPLIED | grep -q SYN_SENT` + +// correct synack ++0 < S. 145758918:145758918(0) ack 1 win 250 <mss 1460,nop,nop,TS val 1 ecr 1,nop,wscale 0> ++0 write(3, ..., 1) = 1 + +// with buggy conntrack above packet is dropped, so SYN rtx is seen: +// script packet: 1.054007 . 1:1(0) ack 16777958 win 256 <nop,nop,TS val 1033 ecr 1> +// actual packet: 3.010000 S 0:0(0) win 65535 <mss 1460,sackOK,TS val 1015 ecr 0,nop,wscale 8> ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ESTABLISHED | grep -q ASSURED` + ++0 > P. 1:2(1) ack 4294901762 win 256 <nop,nop,TS val 1067 ecr 1> + ++0 `conntrack -f $NFCT_IP_VERSION -L -p tcp --dport 8080 2>/dev/null | grep ASSURED | grep -q ESTABLISHED` + +// No packets should have been marked INVALID in OUTPUT direction, 1 in INPUT ++0 `$xtables -v -S OUTPUT | grep INVALID | grep -q -- "-c 0 0"` ++0 `$xtables -v -S INPUT | grep INVALID | grep -q -- "-c 1 "` + ++0 `$xtables -D INPUT -p tcp -m conntrack --ctstate INVALID -j DROP` ++0 `$xtables -D OUTPUT -p tcp -m conntrack --ctstate INVALID -j DROP` diff --git a/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt new file mode 100644 index 000000000000..21e1bb6395e4 --- /dev/null +++ b/tools/testing/selftests/net/netfilter/packetdrill/conntrack_synack_reuse.pkt @@ -0,0 +1,34 @@ +// Check reception of another SYN while we have an established conntrack state. +// Challenge ACK is supposed to pass through, RST reply should clear conntrack +// state and SYN retransmit should give us new 'SYN_RECV' connection state. 
+ +`packetdrill/common.sh` + +// should show a match if bug is present: ++0 `iptables -A INPUT -m conntrack --ctstate INVALID -p tcp --tcp-flags SYN,ACK SYN,ACK` + ++0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 ++0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 ++0 bind(3, ..., ...) = 0 ++0 listen(3, 10) = 0 + ++0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7, TS val 1 ecr 0,nop,nop> ++0 > S. 0:0(0) ack 1 <mss 1460,nop,nop,TS val 100 ecr 1,nop,wscale 8> ++.01 < . 1:1(0) ack 1 win 257 <TS val 1 ecr 100,nop,nop> ++0 accept(3, ..., ...) = 4 + ++0 < P. 1:101(100) ack 1 win 257 <TS val 2 ecr 100,nop,nop> ++.001 > . 1:1(0) ack 101 win 256 <nop,nop,TS val 110 ecr 2> ++0 read(4, ..., 101) = 100 + +1.0 < S 2000:2000(0) win 32792 <mss 1000,nop,wscale 7, TS val 233 ecr 0,nop,nop> +// Won't expect this: challenge ack. + ++0 > . 1:1(0) ack 101 win 256 <nop,nop,TS val 112 ecr 2> ++0 < R. 101:101(0) ack 1 win 257 ++0 close(4) = 0 + +1.5 < S 2000:2000(0) win 32792 <mss 1000,nop,wscale 0, TS val 233 ecr 0,nop,nop> + ++0 `conntrack -L -p tcp --dport 8080 2>/dev/null | grep -q SYN_RECV` ++0 `iptables -v -S INPUT | grep INVALID | grep -q -- "-c 0 0"` |