diff options
author | Martin KaFai Lau | 2022-05-11 17:06:05 -0700 |
---|---|---|
committer | Jakub Kicinski | 2022-05-12 16:52:18 -0700 |
commit | cae3873c5b3a4fcd9706fb461ff4e91bdf1f0120 (patch) | |
tree | 54650bd157a612c462c1fb7856c1800babd1790b /net | |
parent | e8d0059000b20c4745c5b6a713f6adb269cff8ff (diff) |
net: inet: Retire port only listening_hash
The listen sk is currently stored in two hash tables,
listening_hash (hashed by port) and lhash2 (hashed by port and address).
After commit 0ee58dad5b06 ("net: tcp6: prefer listeners bound to an address")
and commit d9fbc7f6431f ("net: tcp: prefer listeners bound to an address"),
the TCP-SYN lookup fast path does not use listening_hash.
The commit 05c0b35709c5 ("tcp: seq_file: Replace listening_hash with lhash2")
also moved the seq_file (/proc/net/tcp) iteration usage from
listening_hash to lhash2.
There are still a few listening_hash usages left.
One of them is inet_reuseport_add_sock() which uses the listening_hash
to search a listen sk during the listen() system call. This turns
out to be very slow on use cases that listen on many different
VIPs at a popular port (e.g. 443). [ On top of the slowness in
adding to the tail in the IPv6 case ]. The latter patch has a
selftest to demonstrate this case.
This patch takes this chance to move all remaining listening_hash
usages to lhash2 and then retire listening_hash.
Since most changes need to be done together, it is hard to cut
the listening_hash to lhash2 switch into small patches. The
changes in this patch is highlighted here for the review
purpose.
1. Because of the listening_hash removal, lhash2 can use the
sk->sk_nulls_node instead of the icsk->icsk_listen_portaddr_node.
This will also keep the sk_unhashed() check to work as is
after stop adding sk to listening_hash.
The union is removed from inet_listen_hashbucket because
only nulls_head is needed.
2. icsk->icsk_listen_portaddr_node and its helpers are removed.
3. The current lhash2 users needs to iterate with sk_nulls_node
instead of icsk_listen_portaddr_node.
One case is in the inet[6]_lhash2_lookup().
Another case is the seq_file iterator in tcp_ipv4.c.
One thing to note is sk_nulls_next() is needed
because the old inet_lhash2_for_each_icsk_continue()
does a "next" first before iterating.
4. Move the remaining listening_hash usage to lhash2
inet_reuseport_add_sock() which this series is
trying to improve.
inet_diag.c and mptcp_diag.c are the final two
remaining use cases and is moved to lhash2 now also.
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net')
-rw-r--r-- | net/dccp/proto.c | 1 | ||||
-rw-r--r-- | net/ipv4/inet_diag.c | 5 | ||||
-rw-r--r-- | net/ipv4/inet_hashtables.c | 47 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 1 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 21 | ||||
-rw-r--r-- | net/ipv6/inet6_hashtables.c | 5 | ||||
-rw-r--r-- | net/mptcp/mptcp_diag.c | 4 |
7 files changed, 25 insertions, 59 deletions
diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 58421f94427e..eb8e128e43e8 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1110,7 +1110,6 @@ static int __init dccp_init(void) BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > sizeof_field(struct sk_buff, cb)); - inet_hashinfo_init(&dccp_hashinfo); rc = inet_hashinfo2_init_mod(&dccp_hashinfo); if (rc) goto out_fail; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 581b5b2d72a5..b812eb36f0e3 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -1028,12 +1028,13 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport) goto skip_listen_ht; - for (i = s_i; i < INET_LHTABLE_SIZE; i++) { + for (i = s_i; i <= hashinfo->lhash2_mask; i++) { struct inet_listen_hashbucket *ilb; struct hlist_nulls_node *node; num = 0; - ilb = &hashinfo->listening_hash[i]; + ilb = &hashinfo->lhash2[i]; + spin_lock(&ilb->lock); sk_nulls_for_each(sk, node, &ilb->nulls_head) { struct inet_sock *inet = inet_sk(sk); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 4b00fcb9088d..0b8235fbd440 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -246,12 +246,11 @@ static struct sock *inet_lhash2_lookup(struct net *net, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { - struct inet_connection_sock *icsk; struct sock *sk, *result = NULL; + struct hlist_nulls_node *node; int score, hiscore = 0; - inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { - sk = (struct sock *)icsk; + sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = lookup_reuseport(net, sk, skb, doff, @@ -598,7 +597,6 @@ int __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb2; - struct inet_listen_hashbucket *ilb; int err = 0; if (sk->sk_state != TCP_LISTEN) { @@ -608,31 +606,23 @@ int __inet_hash(struct sock *sk, struct sock *osk) return 0; } WARN_ON(!sk_unhashed(sk)); - ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); - spin_lock(&ilb->lock); spin_lock(&ilb2->lock); if (sk->sk_reuseport) { - err = inet_reuseport_add_sock(sk, ilb); + err = inet_reuseport_add_sock(sk, ilb2); if (err) goto unlock; } if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && - sk->sk_family == AF_INET6) { - hlist_add_tail_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, - &ilb2->head); - __sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head); - } else { - hlist_add_head_rcu(&inet_csk(sk)->icsk_listen_portaddr_node, - &ilb2->head); - __sk_nulls_add_node_rcu(sk, &ilb->nulls_head); - } + sk->sk_family == AF_INET6) + __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head); + else + __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head); sock_set_flag(sk, SOCK_RCU_FREE); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); unlock: spin_unlock(&ilb2->lock); - spin_unlock(&ilb->lock); return err; } @@ -658,29 +648,23 @@ void inet_unhash(struct sock *sk) if (sk->sk_state == TCP_LISTEN) { struct inet_listen_hashbucket *ilb2; - struct inet_listen_hashbucket *ilb; - ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; ilb2 = inet_lhash2_bucket_sk(hashinfo, sk); /* Don't disable bottom halves while acquiring the lock to * avoid circular locking dependency on PREEMPT_RT. */ - spin_lock(&ilb->lock); spin_lock(&ilb2->lock); if (sk_unhashed(sk)) { spin_unlock(&ilb2->lock); - spin_unlock(&ilb->lock); return; } if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_stop_listen_sock(sk); - hlist_del_init_rcu(&inet_csk(sk)->icsk_listen_portaddr_node); __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock(&ilb2->lock); - spin_unlock(&ilb->lock); } else { spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); @@ -848,27 +832,14 @@ int inet_hash_connect(struct inet_timewait_death_row *death_row, } EXPORT_SYMBOL_GPL(inet_hash_connect); -void inet_hashinfo_init(struct inet_hashinfo *h) -{ - int i; - - for (i = 0; i < INET_LHTABLE_SIZE; i++) { - spin_lock_init(&h->listening_hash[i].lock); - INIT_HLIST_NULLS_HEAD(&h->listening_hash[i].nulls_head, - i + LISTENING_NULLS_BASE); - } - - h->lhash2 = NULL; -} -EXPORT_SYMBOL_GPL(inet_hashinfo_init); - static void init_hashinfo_lhash2(struct inet_hashinfo *h) { int i; for (i = 0; i <= h->lhash2_mask; i++) { spin_lock_init(&h->lhash2[i].lock); - INIT_HLIST_HEAD(&h->lhash2[i].head); + INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head, + i + LISTENING_NULLS_BASE); } } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b44fde435bd1..028513d3e2a2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4595,7 +4595,6 @@ void __init tcp_init(void) timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE); mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); - inet_hashinfo_init(&tcp_hashinfo); inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", thash_entries, 21, /* one slot per 2 MB*/ 0, 64 * 1024); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 918816ec5dd4..218ad871c0e4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2283,16 +2283,15 @@ static void *listening_get_first(struct seq_file *seq) st->offset = 0; for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) { struct inet_listen_hashbucket *ilb2; - struct inet_connection_sock *icsk; + struct hlist_nulls_node *node; struct sock *sk; ilb2 = &tcp_hashinfo.lhash2[st->bucket]; - if (hlist_empty(&ilb2->head)) + if (hlist_nulls_empty(&ilb2->nulls_head)) continue; spin_lock(&ilb2->lock); - inet_lhash2_for_each_icsk(icsk, &ilb2->head) { - sk = (struct sock *)icsk; + sk_nulls_for_each(sk, node, &ilb2->nulls_head) { if (seq_sk_match(seq, sk)) return sk; } @@ -2311,15 +2310,14 @@ static void *listening_get_next(struct seq_file *seq, void *cur) { struct tcp_iter_state *st = seq->private; struct inet_listen_hashbucket *ilb2; - struct inet_connection_sock *icsk; + struct hlist_nulls_node *node; struct sock *sk = cur; ++st->num; ++st->offset; - icsk = inet_csk(sk); - inet_lhash2_for_each_icsk_continue(icsk) { - sk = (struct sock *)icsk; + sk = sk_nulls_next(sk); + sk_nulls_for_each_from(sk, node) { if (seq_sk_match(seq, sk)) return sk; } @@ -2728,16 +2726,15 @@ static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, { struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; - struct inet_connection_sock *icsk; + struct hlist_nulls_node *node; unsigned int expected = 1; struct sock *sk; sock_hold(start_sk); iter->batch[iter->end_sk++] = start_sk; - icsk = inet_csk(start_sk); - inet_lhash2_for_each_icsk_continue(icsk) { - sk = (struct sock *)icsk; + sk = sk_nulls_next(start_sk); + sk_nulls_for_each_from(sk, node) { if (seq_sk_match(seq, sk)) { if (iter->end_sk < iter->max_sk) { sock_hold(sk); diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 32ccac10bd62..a758f2ab7b51 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -138,12 +138,11 @@ static struct sock *inet6_lhash2_lookup(struct net *net, const __be16 sport, const struct in6_addr *daddr, const unsigned short hnum, const int dif, const int sdif) { - struct inet_connection_sock *icsk; struct sock *sk, *result = NULL; + struct hlist_nulls_node *node; int score, hiscore = 0; - inet_lhash2_for_each_icsk_rcu(icsk, &ilb2->head) { - sk = (struct sock *)icsk; + sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) { score = compute_score(sk, net, hnum, daddr, dif, sdif); if (score > hiscore) { result = lookup_reuseport(net, sk, skb, doff, diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index dbb6d876a203..7f9a71780437 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -83,13 +83,13 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba struct net *net = sock_net(skb->sk); int i; - for (i = diag_ctx->l_slot; i < INET_LHTABLE_SIZE; i++) { + for (i = diag_ctx->l_slot; i <= tcp_hashinfo.lhash2_mask; i++) { struct inet_listen_hashbucket *ilb; struct hlist_nulls_node *node; struct sock *sk; int num = 0; - ilb = &tcp_hashinfo.listening_hash[i]; + ilb = &tcp_hashinfo.lhash2[i]; rcu_read_lock(); spin_lock(&ilb->lock); |