diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/arp.c | 2 | ||||
-rw-r--r-- | net/ipv4/fib_frontend.c | 12 | ||||
-rw-r--r-- | net/ipv4/fib_semantics.c | 50 | ||||
-rw-r--r-- | net/ipv4/route.c | 7 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 10 | ||||
-rw-r--r-- | net/ipv4/tcp_bbr.c | 10 | ||||
-rw-r--r-- | net/ipv4/tcp_cdg.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_dctcp.c | 55 | ||||
-rw-r--r-- | net/ipv4/tcp_dctcp.h | 40 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 72 | ||||
-rw-r--r-- | net/ipv4/tcp_timer.c | 2 | ||||
-rw-r--r-- | net/ipv4/udp.c | 2 |
12 files changed, 164 insertions, 100 deletions
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index e90c89ef8c08..850a6f13a082 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1255,6 +1255,8 @@ static int arp_netdev_event(struct notifier_block *this, unsigned long event, change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&arp_tbl, dev); + if (!netif_carrier_ok(dev)) + neigh_carrier_down(&arp_tbl, dev); break; default: break; diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 038f511c73fa..0f1beceb47d5 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1291,7 +1291,8 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct netdev_notifier_changeupper_info *info; + struct netdev_notifier_changeupper_info *upper_info = ptr; + struct netdev_notifier_info_ext *info_ext = ptr; struct in_device *in_dev; struct net *net = dev_net(dev); unsigned int flags; @@ -1326,16 +1327,19 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo fib_sync_up(dev, RTNH_F_LINKDOWN); else fib_sync_down_dev(dev, event, false); - /* fall through */ + rt_cache_flush(net); + break; case NETDEV_CHANGEMTU: + fib_sync_mtu(dev, info_ext->ext.mtu); rt_cache_flush(net); break; case NETDEV_CHANGEUPPER: - info = ptr; + upper_info = ptr; /* flush all routes if dev is linked to or unlinked from * an L3 master device (e.g., VRF) */ - if (info->upper_dev && netif_is_l3_master(info->upper_dev)) + if (upper_info->upper_dev && + netif_is_l3_master(upper_info->upper_dev)) fib_disable_ip(dev, NETDEV_DOWN, true); break; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index f8c7ec8171a8..b5c3937ca6ec 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1457,6 +1457,56 @@ static int call_fib_nh_notifiers(struct fib_nh *fib_nh, return NOTIFY_DONE; } +/* Update the PMTU of exceptions when: + * - the new MTU of the first hop becomes smaller than the PMTU + * - the old MTU was the same as the PMTU, and it limited discovery of + * larger MTUs on the path. With that limit raised, we can now + * discover larger MTUs + * A special case is locked exceptions, for which the PMTU is smaller + * than the minimal accepted PMTU: + * - if the new MTU is greater than the PMTU, don't make any change + * - otherwise, unlock and set PMTU + */ +static void nh_update_mtu(struct fib_nh *nh, u32 new, u32 orig) +{ + struct fnhe_hash_bucket *bucket; + int i; + + bucket = rcu_dereference_protected(nh->nh_exceptions, 1); + if (!bucket) + return; + + for (i = 0; i < FNHE_HASH_SIZE; i++) { + struct fib_nh_exception *fnhe; + + for (fnhe = rcu_dereference_protected(bucket[i].chain, 1); + fnhe; + fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1)) { + if (fnhe->fnhe_mtu_locked) { + if (new <= fnhe->fnhe_pmtu) { + fnhe->fnhe_pmtu = new; + fnhe->fnhe_mtu_locked = false; + } + } else if (new < fnhe->fnhe_pmtu || + orig == fnhe->fnhe_pmtu) { + fnhe->fnhe_pmtu = new; + } + } + } +} + +void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) +{ + unsigned int hash = fib_devindex_hashfn(dev->ifindex); + struct hlist_head *head = &fib_info_devhash[hash]; + struct fib_nh *nh; + + hlist_for_each_entry(nh, head, nh_hash) { + if (nh->nh_dev == dev) + nh_update_mtu(nh, dev->mtu, orig_mtu); + } +} + /* Event force Flags Description * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f71d2395c428..c0a9d26c06ce 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1001,21 +1001,22 @@ out: kfree_skb(skb); static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; + u32 old_mtu = ipv4_mtu(dst); struct fib_result res; bool lock = false; if (ip_mtu_locked(dst)) return; - if (ipv4_mtu(dst) < mtu) + if (old_mtu < mtu) return; if (mtu < ip_rt_min_pmtu) { lock = true; - mtu = ip_rt_min_pmtu; + mtu = min(old_mtu, ip_rt_min_pmtu); } - if (rt->rt_pmtu == mtu && + if (rt->rt_pmtu == mtu && !lock && time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2)) return; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 43ef83b2330e..b8ba8fa34eff 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3111,10 +3111,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); + unsigned long rate; u32 now; u64 rate64; bool slow; - u32 rate; memset(info, 0, sizeof(*info)); if (sk->sk_type != SOCK_STREAM) @@ -3124,11 +3124,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) /* Report meaningful fields for all TCP states, including listeners */ rate = READ_ONCE(sk->sk_pacing_rate); - rate64 = rate != ~0U ? rate : ~0ULL; + rate64 = (rate != ~0UL) ? rate : ~0ULL; info->tcpi_pacing_rate = rate64; rate = READ_ONCE(sk->sk_max_pacing_rate); - rate64 = rate != ~0U ? rate : ~0ULL; + rate64 = (rate != ~0UL) ? rate : ~0ULL; info->tcpi_max_pacing_rate = rate64; info->tcpi_reordering = tp->reordering; @@ -3254,8 +3254,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; struct tcp_info info; + unsigned long rate; u64 rate64; - u32 rate; stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC); if (!stats) @@ -3274,7 +3274,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) tp->total_retrans, TCP_NLA_PAD); rate = READ_ONCE(sk->sk_pacing_rate); - rate64 = rate != ~0U ? rate : ~0ULL; + rate64 = (rate != ~0UL) ? rate : ~0ULL; nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); rate64 = tcp_compute_delivery_rate(tp); diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index a5786e3e2c16..b88081285fd1 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -129,7 +129,7 @@ static const u32 bbr_probe_rtt_mode_ms = 200; static const int bbr_min_tso_rate = 1200000; /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */ -static const int bbr_pacing_marging_percent = 1; +static const int bbr_pacing_margin_percent = 1; /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain * that will allow a smoothly increasing pacing rate that will double each RTT @@ -214,12 +214,12 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) rate *= mss; rate *= gain; rate >>= BBR_SCALE; - rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_marging_percent); + rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); return rate >> BW_SCALE; } /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ -static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) +static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) { u64 rate = bw; @@ -258,7 +258,7 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); + unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain); if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) bbr_init_pacing_rate_from_rtt(sk); @@ -280,7 +280,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk) /* Sort of tcp_tso_autosize() but ignoring * driver provided sk_gso_max_size. */ - bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift, + bytes = min_t(unsigned long, sk->sk_pacing_rate >> sk->sk_pacing_shift, GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 06fbe102a425..37eebd910396 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -146,7 +146,7 @@ static void tcp_cdg_hystart_update(struct sock *sk) return; if (hystart_detect & HYSTART_ACK_TRAIN) { - u32 now_us = div_u64(local_clock(), NSEC_PER_USEC); + u32 now_us = tp->tcp_mstamp; if (ca->last_ack == 0 || !tcp_is_cwnd_limited(sk)) { ca->last_ack = now_us; diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index ca61e2a659e7..cd4814f7e962 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -44,6 +44,7 @@ #include <linux/mm.h> #include <net/tcp.h> #include <linux/inet_diag.h> +#include "tcp_dctcp.h" #define DCTCP_MAX_ALPHA 1024U @@ -118,54 +119,6 @@ static u32 dctcp_ssthresh(struct sock *sk) return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); } -/* Minimal DCTP CE state machine: - * - * S: 0 <- last pkt was non-CE - * 1 <- last pkt was CE - */ - -static void dctcp_ce_state_0_to_1(struct sock *sk) -{ - struct dctcp *ca = inet_csk_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - - if (!ca->ce_state) { - /* State has changed from CE=0 to CE=1, force an immediate - * ACK to reflect the new CE state. If an ACK was delayed, - * send that first to reflect the prior CE state. - */ - if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) - __tcp_send_ack(sk, ca->prior_rcv_nxt); - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; - } - - ca->prior_rcv_nxt = tp->rcv_nxt; - ca->ce_state = 1; - - tp->ecn_flags |= TCP_ECN_DEMAND_CWR; -} - -static void dctcp_ce_state_1_to_0(struct sock *sk) -{ - struct dctcp *ca = inet_csk_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - - if (ca->ce_state) { - /* State has changed from CE=1 to CE=0, force an immediate - * ACK to reflect the new CE state. If an ACK was delayed, - * send that first to reflect the prior CE state. - */ - if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) - __tcp_send_ack(sk, ca->prior_rcv_nxt); - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; - } - - ca->prior_rcv_nxt = tp->rcv_nxt; - ca->ce_state = 0; - - tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; -} - static void dctcp_update_alpha(struct sock *sk, u32 flags) { const struct tcp_sock *tp = tcp_sk(sk); @@ -230,12 +183,12 @@ static void dctcp_state(struct sock *sk, u8 new_state) static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) { + struct dctcp *ca = inet_csk_ca(sk); + switch (ev) { case CA_EVENT_ECN_IS_CE: - dctcp_ce_state_0_to_1(sk); - break; case CA_EVENT_ECN_NO_CE: - dctcp_ce_state_1_to_0(sk); + dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state); break; default: /* Don't care for the rest. */ diff --git a/net/ipv4/tcp_dctcp.h b/net/ipv4/tcp_dctcp.h new file mode 100644 index 000000000000..d69a77cbd0c7 --- /dev/null +++ b/net/ipv4/tcp_dctcp.h @@ -0,0 +1,40 @@ +#ifndef _TCP_DCTCP_H +#define _TCP_DCTCP_H + +static inline void dctcp_ece_ack_cwr(struct sock *sk, u32 ce_state) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (ce_state == 1) + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + else + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +} + +/* Minimal DCTP CE state machine: + * + * S: 0 <- last pkt was non-CE + * 1 <- last pkt was CE + */ +static inline void dctcp_ece_ack_update(struct sock *sk, enum tcp_ca_event evt, + u32 *prior_rcv_nxt, u32 *ce_state) +{ + u32 new_ce_state = (evt == CA_EVENT_ECN_IS_CE) ? 1 : 0; + + if (*ce_state != new_ce_state) { + /* CE state has changed, force an immediate ACK to + * reflect the new CE state. If an ACK was delayed, + * send that first to reflect the prior CE state. + */ + if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { + dctcp_ece_ack_cwr(sk, *ce_state); + __tcp_send_ack(sk, *prior_rcv_nxt); + } + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + } + *prior_rcv_nxt = tcp_sk(sk)->rcv_nxt; + *ce_state = new_ce_state; + dctcp_ece_ack_cwr(sk, new_ce_state); +} + +#endif diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 059b67af28b1..d212e4cbc689 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -52,9 +52,8 @@ void tcp_mstamp_refresh(struct tcp_sock *tp) { u64 val = tcp_clock_ns(); - /* departure time for next data packet */ - if (val > tp->tcp_wstamp_ns) - tp->tcp_wstamp_ns = val; + if (val > tp->tcp_clock_cache) + tp->tcp_clock_cache = val; val = div_u64(val, NSEC_PER_USEC); if (val > tp->tcp_mstamp) @@ -976,32 +975,26 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void tcp_internal_pacing(struct sock *sk) -{ - if (!tcp_needs_internal_pacing(sk)) - return; - hrtimer_start(&tcp_sk(sk)->pacing_timer, - ns_to_ktime(tcp_sk(sk)->tcp_wstamp_ns), - HRTIMER_MODE_ABS_PINNED_SOFT); - sock_hold(sk); -} - -static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb) +static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, + u64 prior_wstamp) { struct tcp_sock *tp = tcp_sk(sk); skb->skb_mstamp_ns = tp->tcp_wstamp_ns; if (sk->sk_pacing_status != SK_PACING_NONE) { - u32 rate = sk->sk_pacing_rate; + unsigned long rate = sk->sk_pacing_rate; /* Original sch_fq does not pace first 10 MSS * Note that tp->data_segs_out overflows after 2^32 packets, * this is a minor annoyance. */ - if (rate != ~0U && rate && tp->data_segs_out >= 10) { - tp->tcp_wstamp_ns += div_u64((u64)skb->len * NSEC_PER_SEC, rate); + if (rate != ~0UL && rate && tp->data_segs_out >= 10) { + u64 len_ns = div64_ul((u64)skb->len * NSEC_PER_SEC, rate); + u64 credit = tp->tcp_wstamp_ns - prior_wstamp; - tcp_internal_pacing(sk); + /* take into account OS jitter */ + len_ns -= min_t(u64, len_ns / 2, credit); + tp->tcp_wstamp_ns += len_ns; } } list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); @@ -1030,6 +1023,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, struct sk_buff *oskb = NULL; struct tcp_md5sig_key *md5; struct tcphdr *th; + u64 prior_wstamp; int err; BUG_ON(!skb || !tcp_skb_pcount(skb)); @@ -1050,6 +1044,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, if (unlikely(!skb)) return -ENOBUFS; } + + prior_wstamp = tp->tcp_wstamp_ns; + tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); + skb->skb_mstamp_ns = tp->tcp_wstamp_ns; inet = inet_sk(sk); @@ -1166,7 +1164,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, err = net_xmit_eval(err); } if (!err && oskb) { - tcp_update_skb_after_send(sk, oskb); + tcp_update_skb_after_send(sk, oskb, prior_wstamp); tcp_rate_skb_sent(sk, oskb); } return err; @@ -1701,8 +1699,9 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, { u32 bytes, segs; - bytes = min(sk->sk_pacing_rate >> sk->sk_pacing_shift, - sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); + bytes = min_t(unsigned long, + sk->sk_pacing_rate >> sk->sk_pacing_shift, + sk->sk_gso_max_size - 1 - MAX_TCP_HEADER); /* Goal is to send at least one packet per ms, * not one big TSO packet every 100 ms. @@ -2175,10 +2174,23 @@ static int tcp_mtu_probe(struct sock *sk) return -1; } -static bool tcp_pacing_check(const struct sock *sk) +static bool tcp_pacing_check(struct sock *sk) { - return tcp_needs_internal_pacing(sk) && - hrtimer_is_queued(&tcp_sk(sk)->pacing_timer); + struct tcp_sock *tp = tcp_sk(sk); + + if (!tcp_needs_internal_pacing(sk)) + return false; + + if (tp->tcp_wstamp_ns <= tp->tcp_clock_cache) + return false; + + if (!hrtimer_is_queued(&tp->pacing_timer)) { + hrtimer_start(&tp->pacing_timer, + ns_to_ktime(tp->tcp_wstamp_ns), + HRTIMER_MODE_ABS_PINNED_SOFT); + sock_hold(sk); + } + return true; } /* TCP Small Queues : @@ -2195,10 +2207,12 @@ static bool tcp_pacing_check(const struct sock *sk) static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, unsigned int factor) { - unsigned int limit; + unsigned long limit; - limit = max(2 * skb->truesize, sk->sk_pacing_rate >> sk->sk_pacing_shift); - limit = min_t(u32, limit, + limit = max_t(unsigned long, + 2 * skb->truesize, + sk->sk_pacing_rate >> sk->sk_pacing_shift); + limit = min_t(unsigned long, limit, sock_net(sk)->ipv4.sysctl_tcp_limit_output_bytes); limit <<= factor; @@ -2315,7 +2329,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { /* "skb_mstamp" is used as a start point for the retransmit timer */ - tcp_update_skb_after_send(sk, skb); + tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns); goto repair; /* Skip network transmission */ } @@ -2890,7 +2904,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) } tcp_skb_tsorted_restore(skb); if (!err) { - tcp_update_skb_after_send(sk, skb); + tcp_update_skb_after_send(sk, skb, tp->tcp_wstamp_ns); tcp_rate_skb_sent(sk, skb); } } else { diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 61023d50cd60..676020663ce8 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -360,7 +360,7 @@ static void tcp_probe_timer(struct sock *sk) */ start_ts = tcp_skb_timestamp(skb); if (!start_ts) - skb->skb_mstamp_ns = tp->tcp_wstamp_ns; + skb->skb_mstamp_ns = tp->tcp_clock_cache; else if (icsk->icsk_user_timeout && (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) goto abort; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1bec2203d558..cf8252d05a01 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1627,7 +1627,7 @@ busy_check: *err = error; return NULL; } -EXPORT_SYMBOL_GPL(__skb_recv_udp); +EXPORT_SYMBOL(__skb_recv_udp); /* * This should be easy, if there is something there we |