| author    | David S. Miller | 2019-05-07 12:09:25 -0700 |
|-----------|-----------------|---------------------------|
| committer | David S. Miller | 2019-05-07 12:09:25 -0700 |
| commit    | a97f4fe6e11c4fb465babe80574307f74135160c (patch) | |
| tree      | 0fe6429a54468c277bae91eaa708ad31c70d2be1 | |
| parent    | a55a385d8c846953994d9c140bead67820811b74 (diff) | |
| parent    | 37c0aead7902b1ddf1b668e1ab74c80b9a7fd183 (diff) | |
Merge branch 'fq-quic-pacing'
Eric Dumazet says:
====================
net_sched: sch_fq: enable in-kernel pacing for QUIC servers
Willem added GSO support to the UDP stack, greatly improving the
performance of QUIC servers.
We also want to enable in-kernel pacing, which the EDT model makes
possible, since each sendmsg() can provide a timestamp for its skbs.
We have to change sch_fq so that packets can be fed to it in arbitrary
EDT order, and make sure that packet classification does not trust
unconnected sockets.
Note that this patch series is also a prerequisite for a future TCP
change enabling per-flow delays/reorders/losses, to implement
high-performance TCP emulators.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
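The per-packet timestamps mentioned in the cover letter reach sch_fq as skb->tstamp, which userspace can set on an unconnected UDP socket with SO_TXTIME plus a per-sendmsg() SCM_TXTIME control message. The following is a minimal sketch of that transmit path, not part of the patch set itself: the destination, port, payload size and 1 ms spacing are illustrative assumptions, and it presumes an fq qdisc is installed on the egress device.

```c
/* Hedged sketch: pace UDP datagrams from userspace with EDT timestamps.
 * Assumes a kernel with SO_TXTIME (4.19+) and sch_fq on the egress device.
 * Destination, port and the 1 ms spacing are made-up example values.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/net_tstamp.h>	/* struct sock_txtime */

#ifndef SO_TXTIME
#define SO_TXTIME 61		/* value on most architectures */
#define SCM_TXTIME SO_TXTIME
#endif

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

int main(void)
{
	struct sock_txtime txtime_cfg = {
		.clockid = CLOCK_MONOTONIC,	/* sch_fq compares against ktime_get_ns() */
		.flags   = 0,
	};
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port   = htons(4433),	/* hypothetical QUIC port */
		.sin_addr   = { htonl(INADDR_LOOPBACK) },
	};
	char payload[1200] = "quic-like payload";
	int fd, i;

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0 || setsockopt(fd, SOL_SOCKET, SO_TXTIME,
				 &txtime_cfg, sizeof(txtime_cfg)) < 0) {
		perror("socket/SO_TXTIME");
		return 1;
	}

	for (i = 0; i < 10; i++) {
		/* Request transmission i milliseconds from now; this becomes
		 * skb->tstamp, i.e. fq_skb_cb(skb)->time_to_send in the patch.
		 */
		uint64_t txtime = now_ns() + (uint64_t)i * 1000000ULL;
		char cbuf[CMSG_SPACE(sizeof(txtime))];
		struct iovec iov = { .iov_base = payload, .iov_len = sizeof(payload) };
		struct msghdr msg = {
			.msg_name       = &dst,
			.msg_namelen    = sizeof(dst),
			.msg_iov        = &iov,
			.msg_iovlen     = 1,
			.msg_control    = cbuf,
			.msg_controllen = sizeof(cbuf),
		};
		struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

		cm->cmsg_level = SOL_SOCKET;
		cm->cmsg_type  = SCM_TXTIME;
		cm->cmsg_len   = CMSG_LEN(sizeof(txtime));
		memcpy(CMSG_DATA(cm), &txtime, sizeof(txtime));

		if (sendmsg(fd, &msg, 0) < 0)
			perror("sendmsg");
	}
	close(fd);
	return 0;
}
```

With the TCP_CLOSE handling added in this series, each such datagram from an unconnected socket is classified by its flow hash rather than by the socket, so packets to different destinations land in different fq flows.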
-rw-r--r-- | net/sched/sch_fq.c | 110 |
1 file changed, 96 insertions, 14 deletions
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index d107c74767cd..26a94e5cd5df 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -54,10 +54,23 @@
 #include <net/tcp_states.h>
 #include <net/tcp.h>
 
+struct fq_skb_cb {
+	u64	time_to_send;
+};
+
+static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
+{
+	qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
+	return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
 /*
- * Per flow structure, dynamically allocated
+ * Per flow structure, dynamically allocated.
+ * If packets have monotically increasing time_to_send, they are placed in O(1)
+ * in linear list (head,tail), otherwise are placed in a rbtree (t_root).
  */
 struct fq_flow {
+	struct rb_root	t_root;
 	struct sk_buff	*head;		/* list of skbs for this flow : first skb */
 	union {
 		struct sk_buff *tail;	/* last skb in the list */
@@ -257,6 +270,17 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 		 */
 		sk = (struct sock *)((hash << 1) | 1UL);
 		skb_orphan(skb);
+	} else if (sk->sk_state == TCP_CLOSE) {
+		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
+		/*
+		 * Sockets in TCP_CLOSE are non connected.
+		 * Typical use case is UDP sockets, they can send packets
+		 * with sendto() to many different destinations.
+		 * We probably could use a generic bit advertising
+		 * non connected sockets, instead of sk_state == TCP_CLOSE,
+		 * if we care enough.
+		 */
+		sk = (struct sock *)((hash << 1) | 1UL);
 	}
 
 	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
@@ -277,7 +301,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 			 * It not, we need to refill credit with
 			 * initial quantum
 			 */
-			if (unlikely(skb->sk &&
+			if (unlikely(skb->sk == sk &&
 				     f->socket_hash != sk->sk_hash)) {
 				f->credit = q->initial_quantum;
 				f->socket_hash = sk->sk_hash;
@@ -298,9 +322,11 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 			q->stat_allocation_errors++;
 			return &q->internal;
 		}
+		/* f->t_root is already zeroed after kmem_cache_zalloc() */
+
 		fq_flow_set_detached(f);
 		f->sk = sk;
-		if (skb->sk)
+		if (skb->sk == sk)
 			f->socket_hash = sk->sk_hash;
 		f->credit = q->initial_quantum;
 
@@ -312,14 +338,40 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 	return f;
 }
 
+static struct sk_buff *fq_peek(struct fq_flow *flow)
+{
+	struct sk_buff *skb = skb_rb_first(&flow->t_root);
+	struct sk_buff *head = flow->head;
+
+	if (!skb)
+		return head;
+
+	if (!head)
+		return skb;
+
+	if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
+		return skb;
+	return head;
+}
+
+static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
+			  struct sk_buff *skb)
+{
+	if (skb == flow->head) {
+		flow->head = skb->next;
+	} else {
+		rb_erase(&skb->rbnode, &flow->t_root);
+		skb->dev = qdisc_dev(sch);
+	}
+}
 
 /* remove one skb from head of flow queue */
 static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
 {
-	struct sk_buff *skb = flow->head;
+	struct sk_buff *skb = fq_peek(flow);
 
 	if (skb) {
-		flow->head = skb->next;
+		fq_erase_head(sch, flow, skb);
 		skb_mark_not_on_list(skb);
 		flow->qlen--;
 		qdisc_qstats_backlog_dec(sch, skb);
@@ -330,15 +382,36 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
 
 static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
 {
-	struct sk_buff *head = flow->head;
+	struct rb_node **p, *parent;
+	struct sk_buff *head, *aux;
 
-	skb->next = NULL;
-	if (!head)
-		flow->head = skb;
-	else
-		flow->tail->next = skb;
+	fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns();
+
+	head = flow->head;
+	if (!head ||
+	    fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
+		if (!head)
+			flow->head = skb;
+		else
+			flow->tail->next = skb;
+		flow->tail = skb;
+		skb->next = NULL;
+		return;
+	}
+
+	p = &flow->t_root.rb_node;
+	parent = NULL;
 
-	flow->tail = skb;
+	while (*p) {
+		parent = *p;
+		aux = rb_to_skb(parent);
+		if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+	rb_link_node(&skb->rbnode, parent, p);
+	rb_insert_color(&skb->rbnode, &flow->t_root);
 }
 
 static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -450,9 +523,9 @@ begin:
 		goto begin;
 	}
 
-	skb = f->head;
+	skb = fq_peek(f);
 	if (skb) {
-		u64 time_next_packet = max_t(u64, ktime_to_ns(skb->tstamp),
+		u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
 					     f->time_next_packet);
 
 		if (now < time_next_packet) {
@@ -533,6 +606,15 @@ out:
 
 static void fq_flow_purge(struct fq_flow *flow)
 {
+	struct rb_node *p = rb_first(&flow->t_root);
+
+	while (p) {
+		struct sk_buff *skb = rb_to_skb(p);
+
+		p = rb_next(p);
+		rb_erase(&skb->rbnode, &flow->t_root);
+		rtnl_kfree_skbs(skb, skb);
+	}
 	rtnl_kfree_skbs(flow->head, flow->tail);
 	flow->head = NULL;
 	flow->qlen = 0;
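Outside the kernel tree, the policy implemented by flow_queue_add() and fq_peek() above can be modelled compactly: packets whose time_to_send is not older than the FIFO tail are appended in O(1), out-of-order packets go into a time-sorted structure, and dequeue takes the earlier of the two heads. The sketch below is only a model under simplifying assumptions: it uses a sorted singly linked list where the kernel uses an rbtree, and plain heap allocation instead of sk_buff handling.

```c
/* Toy model of sch_fq's split queue: a FIFO for monotonically increasing
 * time_to_send, plus a time-sorted list standing in for the kernel rbtree.
 * Not kernel code; it only illustrates the fq_peek()/flow_queue_add() policy.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct pkt {
	uint64_t time_to_send;
	struct pkt *next;
};

struct flow {
	struct pkt *head, *tail;	/* O(1) FIFO for in-order arrivals */
	struct pkt *sorted;		/* stand-in for the t_root rbtree */
};

static void flow_queue_add(struct flow *f, struct pkt *p)
{
	struct pkt **pos;

	if (!f->head || p->time_to_send >= f->tail->time_to_send) {
		/* Common case: timestamp not older than the FIFO tail. */
		p->next = NULL;
		if (f->head)
			f->tail->next = p;
		else
			f->head = p;
		f->tail = p;
		return;
	}
	/* Out of order: insert by time_to_send (kernel: rb_link_node()). */
	for (pos = &f->sorted;
	     *pos && (*pos)->time_to_send <= p->time_to_send;
	     pos = &(*pos)->next)
		;
	p->next = *pos;
	*pos = p;
}

static struct pkt *flow_dequeue(struct flow *f)
{
	struct pkt *p;

	/* fq_peek(): earliest of the FIFO head and the sorted minimum. */
	if (f->sorted &&
	    (!f->head || f->sorted->time_to_send < f->head->time_to_send)) {
		p = f->sorted;
		f->sorted = p->next;
	} else {
		p = f->head;
		if (p)
			f->head = p->next;
	}
	return p;
}

int main(void)
{
	struct flow f = { 0 };
	uint64_t stamps[] = { 10, 20, 15, 30 };	/* 15 arrives out of order */
	struct pkt *p;
	size_t i;

	for (i = 0; i < sizeof(stamps) / sizeof(stamps[0]); i++) {
		p = calloc(1, sizeof(*p));
		p->time_to_send = stamps[i];
		flow_queue_add(&f, p);
	}
	while ((p = flow_dequeue(&f))) {	/* prints 10 15 20 30 */
		printf("%llu\n", (unsigned long long)p->time_to_send);
		free(p);
	}
	return 0;
}
```

The common case of monotonically increasing timestamps, which TCP and well-behaved pacing produce, never touches the sorted structure; that is why the split keeps the fast path O(1) while still allowing arbitrary EDT order.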