diff options
Diffstat (limited to 'net/ipv4/tcp_bbr.c')
-rw-r--r-- | net/ipv4/tcp_bbr.c | 132 |
1 files changed, 96 insertions, 36 deletions
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 13d34427ca3d..9277abdd822a 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -95,11 +95,10 @@ struct bbr { u32 mode:3, /* current bbr_mode in state machine */ prev_ca_state:3, /* CA state on previous ACK */ packet_conservation:1, /* use packet conservation? */ - restore_cwnd:1, /* decided to revert cwnd to old value */ round_start:1, /* start of packet-timed tx->ack round? */ idle_restart:1, /* restarting after idle? */ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ - unused:12, + unused:13, lt_is_sampling:1, /* taking long-term ("LT") samples now? */ lt_rtt_cnt:7, /* round trips in long-term interval */ lt_use_bw:1; /* use lt_bw as our bw estimate? */ @@ -129,6 +128,9 @@ static const u32 bbr_probe_rtt_mode_ms = 200; /* Skip TSO below the following bandwidth (bits/sec): */ static const int bbr_min_tso_rate = 1200000; +/* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. */ +static const int bbr_pacing_margin_percent = 1; + /* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain * that will allow a smoothly increasing pacing rate that will double each RTT * and send the same number of packets per RTT that an un-paced, slow-starting @@ -175,6 +177,8 @@ static const u32 bbr_lt_bw_diff = 4000 / 8; /* If we estimate we're policed, use lt_bw for this many round trips: */ static const u32 bbr_lt_bw_max_rtts = 48; +static void bbr_check_probe_rtt_done(struct sock *sk); + /* Do we estimate that STARTUP filled the pipe? */ static bool bbr_full_bw_reached(const struct sock *sk) { @@ -207,17 +211,15 @@ static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) { unsigned int mss = tcp_sk(sk)->mss_cache; - if (!tcp_needs_internal_pacing(sk)) - mss = tcp_mss_to_mtu(sk, mss); rate *= mss; rate *= gain; rate >>= BBR_SCALE; - rate *= USEC_PER_SEC; + rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); return rate >> BW_SCALE; } /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ -static u32 bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) +static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) { u64 rate = bw; @@ -256,7 +258,7 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - u32 rate = bbr_bw_to_pacing_rate(sk, bw, gain); + unsigned long rate = bbr_bw_to_pacing_rate(sk, bw, gain); if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) bbr_init_pacing_rate_from_rtt(sk); @@ -278,7 +280,7 @@ static u32 bbr_tso_segs_goal(struct sock *sk) /* Sort of tcp_tso_autosize() but ignoring * driver provided sk_gso_max_size. */ - bytes = min_t(u32, sk->sk_pacing_rate >> sk->sk_pacing_shift, + bytes = min_t(unsigned long, sk->sk_pacing_rate >> sk->sk_pacing_shift, GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); @@ -309,6 +311,8 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) */ if (bbr->mode == BBR_PROBE_BW) bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); } } @@ -365,6 +369,39 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain) return cwnd; } +/* With pacing at lower layers, there's often less data "in the network" than + * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq), + * we often have several skbs queued in the pacing layer with a pre-scheduled + * earliest departure time (EDT). BBR adapts its pacing rate based on the + * inflight level that it estimates has already been "baked in" by previous + * departure time decisions. We calculate a rough estimate of the number of our + * packets that might be in the network at the earliest departure time for the + * next skb scheduled: + * in_network_at_edt = inflight_at_edt - (EDT - now) * bw + * If we're increasing inflight, then we want to know if the transmit of the + * EDT skb will push inflight above the target, so inflight_at_edt includes + * bbr_tso_segs_goal() from the skb departing at EDT. If decreasing inflight, + * then estimate if inflight will sink too low just before the EDT transmit. + */ +static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u64 now_ns, edt_ns, interval_us; + u32 interval_delivered, inflight_at_edt; + + now_ns = tp->tcp_clock_cache; + edt_ns = max(tp->tcp_wstamp_ns, now_ns); + interval_us = div_u64(edt_ns - now_ns, NSEC_PER_USEC); + interval_delivered = (u64)bbr_bw(sk) * interval_us >> BW_SCALE; + inflight_at_edt = inflight_now; + if (bbr->pacing_gain > BBR_UNIT) /* increasing inflight */ + inflight_at_edt += bbr_tso_segs_goal(sk); /* include EDT skb */ + if (interval_delivered >= inflight_at_edt) + return 0; + return inflight_at_edt - interval_delivered; +} + /* An optimization in BBR to reduce losses: On the first round of recovery, we * follow the packet conservation principle: send P packets per P packets acked. * After that, we slow-start and send at most 2*P packets per P packets acked. @@ -396,17 +433,11 @@ static bool bbr_set_cwnd_to_recover_or_restore( cwnd = tcp_packets_in_flight(tp) + acked; } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { /* Exiting loss recovery; restore cwnd saved before recovery. */ - bbr->restore_cwnd = 1; + cwnd = max(cwnd, bbr->prior_cwnd); bbr->packet_conservation = 0; } bbr->prev_ca_state = state; - if (bbr->restore_cwnd) { - /* Restore cwnd after exiting loss recovery or PROBE_RTT. */ - cwnd = max(cwnd, bbr->prior_cwnd); - bbr->restore_cwnd = 0; - } - if (bbr->packet_conservation) { *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); return true; /* yes, using packet conservation */ @@ -423,10 +454,10 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - u32 cwnd = 0, target_cwnd = 0; + u32 cwnd = tp->snd_cwnd, target_cwnd = 0; if (!acked) - return; + goto done; /* no packet fully ACKed; just apply caps */ if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) goto done; @@ -462,7 +493,7 @@ static bool bbr_is_next_cycle_phase(struct sock *sk, if (bbr->pacing_gain == BBR_UNIT) return is_full_length; /* just use wall clock time */ - inflight = rs->prior_in_flight; /* what was in-flight before ACK? */ + inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); bw = bbr_max_bw(sk); /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at @@ -490,8 +521,6 @@ static void bbr_advance_cycle_phase(struct sock *sk) bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); bbr->cycle_mstamp = tp->delivered_mstamp; - bbr->pacing_gain = bbr->lt_use_bw ? BBR_UNIT : - bbr_pacing_gain[bbr->cycle_idx]; } /* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ @@ -509,8 +538,6 @@ static void bbr_reset_startup_mode(struct sock *sk) struct bbr *bbr = inet_csk_ca(sk); bbr->mode = BBR_STARTUP; - bbr->pacing_gain = bbr_high_gain; - bbr->cwnd_gain = bbr_high_gain; } static void bbr_reset_probe_bw_mode(struct sock *sk) @@ -518,8 +545,6 @@ static void bbr_reset_probe_bw_mode(struct sock *sk) struct bbr *bbr = inet_csk_ca(sk); bbr->mode = BBR_PROBE_BW; - bbr->pacing_gain = BBR_UNIT; - bbr->cwnd_gain = bbr_cwnd_gain; bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ } @@ -737,17 +762,29 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { bbr->mode = BBR_DRAIN; /* drain queue we created */ - bbr->pacing_gain = bbr_drain_gain; /* pace slow to drain */ - bbr->cwnd_gain = bbr_high_gain; /* maintain cwnd */ tcp_sk(sk)->snd_ssthresh = bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT); } /* fall through to check if in-flight is already small: */ if (bbr->mode == BBR_DRAIN && - tcp_packets_in_flight(tcp_sk(sk)) <= + bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT)) bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ } +static void bbr_check_probe_rtt_done(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (!(bbr->probe_rtt_done_stamp && + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + + bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ + tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); + bbr_reset_mode(sk); +} + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and * periodically drain the bottleneck queue, to converge to measure the true * min_rtt (unloaded propagation delay). This allows the flows to keep queues @@ -786,8 +823,6 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) if (bbr_probe_rtt_mode_ms > 0 && filter_expired && !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ - bbr->pacing_gain = BBR_UNIT; - bbr->cwnd_gain = BBR_UNIT; bbr_save_cwnd(sk); /* note cwnd so we can restore it */ bbr->probe_rtt_done_stamp = 0; } @@ -806,12 +841,8 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) } else if (bbr->probe_rtt_done_stamp) { if (bbr->round_start) bbr->probe_rtt_round_done = 1; - if (bbr->probe_rtt_round_done && - after(tcp_jiffies32, bbr->probe_rtt_done_stamp)) { - bbr->min_rtt_stamp = tcp_jiffies32; - bbr->restore_cwnd = 1; /* snap to prior_cwnd */ - bbr_reset_mode(sk); - } + if (bbr->probe_rtt_round_done) + bbr_check_probe_rtt_done(sk); } } /* Restart after idle ends only once we process a new S/ACK for data */ @@ -819,6 +850,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) bbr->idle_restart = 0; } +static void bbr_update_gains(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + switch (bbr->mode) { + case BBR_STARTUP: + bbr->pacing_gain = bbr_high_gain; + bbr->cwnd_gain = bbr_high_gain; + break; + case BBR_DRAIN: + bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ + bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ + break; + case BBR_PROBE_BW: + bbr->pacing_gain = (bbr->lt_use_bw ? + BBR_UNIT : + bbr_pacing_gain[bbr->cycle_idx]); + bbr->cwnd_gain = bbr_cwnd_gain; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; + bbr->cwnd_gain = BBR_UNIT; + break; + default: + WARN_ONCE(1, "BBR bad mode: %u\n", bbr->mode); + break; + } +} + static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) { bbr_update_bw(sk, rs); @@ -826,6 +886,7 @@ static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) bbr_check_full_bw_reached(sk, rs); bbr_check_drain(sk, rs); bbr_update_min_rtt(sk, rs); + bbr_update_gains(sk); } static void bbr_main(struct sock *sk, const struct rate_sample *rs) @@ -862,7 +923,6 @@ static void bbr_init(struct sock *sk) bbr->has_seen_rtt = 0; bbr_init_pacing_rate_from_rtt(sk); - bbr->restore_cwnd = 0; bbr->round_start = 0; bbr->idle_restart = 0; bbr->full_bw_reached = 0; |