author     David S. Miller    2019-05-07 12:09:25 -0700
committer  David S. Miller    2019-05-07 12:09:25 -0700
commit     a97f4fe6e11c4fb465babe80574307f74135160c (patch)
tree       0fe6429a54468c277bae91eaa708ad31c70d2be1
parent     a55a385d8c846953994d9c140bead67820811b74 (diff)
parent     37c0aead7902b1ddf1b668e1ab74c80b9a7fd183 (diff)
Merge branch 'fq-quic-pacing'
Eric Dumazet says:

====================
net_sched: sch_fq: enable in-kernel pacing for QUIC servers

Willem added GSO support to the UDP stack, greatly improving the performance
of QUIC servers.

We also want to enable in-kernel pacing, which is possible thanks to the EDT
model, since each sendmsg() can provide a timestamp for the skbs.

We have to change sch_fq so that packets can be fed in arbitrary EDT order,
and make sure that packet classification does not trust unconnected sockets.

Note that this patch series is also a prerequisite for a future TCP change
enabling per-flow delays/reorders/losses to implement high performance TCP
emulators.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--  net/sched/sch_fq.c | 110
1 file changed, 96 insertions, 14 deletions
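The EDT (Earliest Departure Time) model referenced in the cover letter means the sender attaches a departure timestamp to every skb and sch_fq simply holds each packet until that time. For UDP/QUIC servers the timestamp is supplied from userspace through the SO_TXTIME socket option and an SCM_TXTIME control message on sendmsg(). The sketch below illustrates that userspace side; it is not part of the patch. send_paced() is a made-up helper name and error handling is omitted, but the SO_TXTIME/SCM_TXTIME usage itself follows the existing API (fq compares timestamps against ktime_get_ns(), so CLOCK_MONOTONIC is assumed).

/* Illustrative userspace sketch (not part of this patch): hand each UDP
 * datagram an Earliest Departure Time so sch_fq paces it in the kernel.
 */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <time.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <linux/net_tstamp.h>	/* struct sock_txtime */

#ifndef SO_TXTIME
#define SO_TXTIME	61
#define SCM_TXTIME	SO_TXTIME
#endif

static ssize_t send_paced(int fd, const struct sockaddr_in *dst,
			  const void *buf, size_t len, uint64_t txtime_ns)
{
	char cbuf[CMSG_SPACE(sizeof(uint64_t))] = { 0 };
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_name	= (void *)dst,
		.msg_namelen	= sizeof(*dst),
		.msg_iov	= &iov,
		.msg_iovlen	= 1,
		.msg_control	= cbuf,
		.msg_controllen	= sizeof(cbuf),
	};
	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);

	/* One SCM_TXTIME cmsg carrying the absolute departure time in ns. */
	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type  = SCM_TXTIME;
	cm->cmsg_len   = CMSG_LEN(sizeof(uint64_t));
	memcpy(CMSG_DATA(cm), &txtime_ns, sizeof(txtime_ns));

	return sendmsg(fd, &msg, 0);
}

int main(void)
{
	/* fq compares skb->tstamp against ktime_get_ns(), i.e. CLOCK_MONOTONIC */
	struct sock_txtime cfg = { .clockid = CLOCK_MONOTONIC, .flags = 0 };
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	setsockopt(fd, SOL_SOCKET, SO_TXTIME, &cfg, sizeof(cfg));
	/* ... fill in a destination, then call send_paced() with per-packet EDTs ... */
	close(fd);
	return 0;
}

The pacing only takes effect when a qdisc that honours the timestamp (sch_fq here) is installed on the egress device, for example as the root qdisc via tc.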
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index d107c74767cd..26a94e5cd5df 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -54,10 +54,23 @@
#include <net/tcp_states.h>
#include <net/tcp.h>
+struct fq_skb_cb {
+ u64 time_to_send;
+};
+
+static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
+{
+ qdisc_cb_private_validate(skb, sizeof(struct fq_skb_cb));
+ return (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;
+}
+
/*
- * Per flow structure, dynamically allocated
+ * Per flow structure, dynamically allocated.
+ * If packets have monotonically increasing time_to_send, they are placed in O(1)
+ * in a linear list (head,tail), otherwise they are placed in an rbtree (t_root).
*/
struct fq_flow {
+ struct rb_root t_root;
struct sk_buff *head; /* list of skbs for this flow : first skb */
union {
struct sk_buff *tail; /* last skb in the list */
@@ -257,6 +270,17 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
*/
sk = (struct sock *)((hash << 1) | 1UL);
skb_orphan(skb);
+ } else if (sk->sk_state == TCP_CLOSE) {
+ unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
+ /*
+ * Sockets in TCP_CLOSE are not connected.
+ * The typical use case is UDP sockets: they can send packets
+ * with sendto() to many different destinations.
+ * We probably could use a generic bit advertising
+ * unconnected sockets, instead of sk_state == TCP_CLOSE,
+ * if we care enough.
+ */
+ sk = (struct sock *)((hash << 1) | 1UL);
}
root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
@@ -277,7 +301,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
* If not, we need to refill credit with
* initial quantum
*/
- if (unlikely(skb->sk &&
+ if (unlikely(skb->sk == sk &&
f->socket_hash != sk->sk_hash)) {
f->credit = q->initial_quantum;
f->socket_hash = sk->sk_hash;
@@ -298,9 +322,11 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
q->stat_allocation_errors++;
return &q->internal;
}
+ /* f->t_root is already zeroed after kmem_cache_zalloc() */
+
fq_flow_set_detached(f);
f->sk = sk;
- if (skb->sk)
+ if (skb->sk == sk)
f->socket_hash = sk->sk_hash;
f->credit = q->initial_quantum;
@@ -312,14 +338,40 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
return f;
}
+static struct sk_buff *fq_peek(struct fq_flow *flow)
+{
+ struct sk_buff *skb = skb_rb_first(&flow->t_root);
+ struct sk_buff *head = flow->head;
+
+ if (!skb)
+ return head;
+
+ if (!head)
+ return skb;
+
+ if (fq_skb_cb(skb)->time_to_send < fq_skb_cb(head)->time_to_send)
+ return skb;
+ return head;
+}
+
+static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
+ struct sk_buff *skb)
+{
+ if (skb == flow->head) {
+ flow->head = skb->next;
+ } else {
+ rb_erase(&skb->rbnode, &flow->t_root);
+ skb->dev = qdisc_dev(sch);
+ }
+}
/* remove one skb from head of flow queue */
static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
{
- struct sk_buff *skb = flow->head;
+ struct sk_buff *skb = fq_peek(flow);
if (skb) {
- flow->head = skb->next;
+ fq_erase_head(sch, flow, skb);
skb_mark_not_on_list(skb);
flow->qlen--;
qdisc_qstats_backlog_dec(sch, skb);
@@ -330,15 +382,36 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{
- struct sk_buff *head = flow->head;
+ struct rb_node **p, *parent;
+ struct sk_buff *head, *aux;
- skb->next = NULL;
- if (!head)
- flow->head = skb;
- else
- flow->tail->next = skb;
+ fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns();
+
+ head = flow->head;
+ if (!head ||
+ fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
+ if (!head)
+ flow->head = skb;
+ else
+ flow->tail->next = skb;
+ flow->tail = skb;
+ skb->next = NULL;
+ return;
+ }
+
+ p = &flow->t_root.rb_node;
+ parent = NULL;
- flow->tail = skb;
+ while (*p) {
+ parent = *p;
+ aux = rb_to_skb(parent);
+ if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
+ p = &parent->rb_right;
+ else
+ p = &parent->rb_left;
+ }
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, &flow->t_root);
}
static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -450,9 +523,9 @@ begin:
goto begin;
}
- skb = f->head;
+ skb = fq_peek(f);
if (skb) {
- u64 time_next_packet = max_t(u64, ktime_to_ns(skb->tstamp),
+ u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
f->time_next_packet);
if (now < time_next_packet) {
@@ -533,6 +606,15 @@ out:
static void fq_flow_purge(struct fq_flow *flow)
{
+ struct rb_node *p = rb_first(&flow->t_root);
+
+ while (p) {
+ struct sk_buff *skb = rb_to_skb(p);
+
+ p = rb_next(p);
+ rb_erase(&skb->rbnode, &flow->t_root);
+ rtnl_kfree_skbs(skb, skb);
+ }
rtnl_kfree_skbs(flow->head, flow->tail);
flow->head = NULL;
flow->qlen = 0;
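As a hedged continuation of the sketch shown after the diffstat (same assumptions, made-up helper names), successive departure times for fixed-rate pacing are just arithmetic on the previous EDT. Since flow_queue_add() now falls back to the t_root rbtree for out-of-order timestamps, the EDTs handed to sendmsg() do not even have to be monotonically increasing.

/* Hypothetical continuation of the earlier sketch: compute per-packet EDTs
 * for fixed-rate pacing, to be passed to sendmsg() via SCM_TXTIME.
 */
#include <stddef.h>
#include <stdint.h>
#include <time.h>

/* Current CLOCK_MONOTONIC time in ns, the clock sch_fq compares against. */
static uint64_t now_mono_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Next departure time: previous EDT plus the serialization delay of
 * pkt_len bytes at rate_bps.  For example, 1200-byte datagrams at
 * 100 Mbit/s are spaced 96 us apart.
 */
static uint64_t next_txtime(uint64_t prev_ns, size_t pkt_len, uint64_t rate_bps)
{
	return prev_ns + (uint64_t)pkt_len * 8ull * 1000000000ull / rate_bps;
}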