From 291a00d1a70f96b393da9ac90c58a82bc7949fc8 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 1 Jul 2015 14:11:14 -0700 Subject: tcp: reduce cwnd if retransmit is lost in CA_Loss If the retransmission in CA_Loss is lost again, we should not continue to slow start or raise cwnd in congestion avoidance mode. Instead we should enter fast recovery and use PRR to reduce cwnd, following the principle in RFC5681: "... or the loss of a retransmission, should be taken as two indications of congestion and, therefore, cwnd (and ssthresh) MUST be lowered twice in this case." This is especially important to reduce loss when the CA_Loss state was caused by a traffic policer dropping the entire inflight. The CA_Loss state has a problem where a loss of L packets causes the sender to send a burst of L packets. So a policer that's dropping most packets in a given RTT can cause a huge retransmit storm. By contrast, PRR includes logic to bound the number of outbound packets that result from a given ACK. So switching to CA_Recovery on lost retransmits in CA_Loss avoids this retransmit storm problem when in CA_Loss. Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 684f095d196e..923e0e568bfa 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -109,6 +109,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ #define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_ECE 0x40 /* ECE in this ACK */ +#define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ @@ -1037,7 +1038,7 @@ static bool tcp_is_sackblock_valid(struct tcp_sock *tp, bool is_dsack, * highest SACK block). Also calculate the lowest snd_nxt among the remaining * retransmitted skbs to avoid some costly processing per ACKs. */ -static void tcp_mark_lost_retrans(struct sock *sk) +static void tcp_mark_lost_retrans(struct sock *sk, int *flag) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -1078,7 +1079,7 @@ static void tcp_mark_lost_retrans(struct sock *sk) if (after(received_upto, ack_seq)) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); - + *flag |= FLAG_LOST_RETRANS; tcp_skb_mark_lost_uncond_verify(tp, skb); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT); } else { @@ -1818,7 +1819,7 @@ advance_sp: ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) tcp_update_reordering(sk, tp->fackets_out - state->reord, 0); - tcp_mark_lost_retrans(sk); + tcp_mark_lost_retrans(sk, &state->flag); tcp_verify_left_out(tp); out: @@ -2676,7 +2677,7 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) tp->prior_ssthresh = 0; tcp_init_undo(tp); - if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + if (!tcp_in_cwnd_reduction(sk)) { if (!ece_ack) tp->prior_ssthresh = tcp_current_ssthresh(sk); tcp_init_cwnd_reduction(sk); @@ -2852,9 +2853,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack); - if (icsk->icsk_ca_state != TCP_CA_Open) + if (icsk->icsk_ca_state != TCP_CA_Open && + !(flag & FLAG_LOST_RETRANS)) return; - /* Fall through to processing in Open state. */ + /* Change state if cwnd is undone or retransmits are lost */ default: if (tcp_is_reno(tp)) { if (flag & FLAG_SND_UNA_ADVANCED) -- cgit v1.2.3 From 3759824da87b30ce7a35b4873b62b0ba38905ef5 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 1 Jul 2015 14:11:15 -0700 Subject: tcp: PRR uses CRB mode by default and SS mode conditionally PRR slow start is often too aggressive especially when drops are caused by traffic policers. The policers mainly use token bucket to enforce the rate so sending (twice) faster than the delivery rate causes excessive drops. This patch changes PRR to the conservative reduction bound (CRB) mode in RFC 6937 by default. CRB follows the packet conservation rule to send at most the delivery rate by default. But if many packets are lost and the pipe is empty, CRB may take N round trips to repair N losses. We conditionally turn on slow start mode if all these conditions are made to speed up the recovery: 1) on the second round or later in recovery 2) retransmission sent in the previous round is delivered on this ACK 3) no retransmission is marked lost on this ACK By using packet conservation by default, this change reduces the loss retransmits signicantly on networks that deploy traffic policers, up to 20% reduction of overall loss rate. Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 923e0e568bfa..ad1482dd215e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2476,15 +2476,14 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) return false; } -/* The cwnd reduction in CWR and Recovery use the PRR algorithm - * https://datatracker.ietf.org/doc/draft-ietf-tcpm-proportional-rate-reduction/ +/* The cwnd reduction in CWR and Recovery uses the PRR algorithm in RFC 6937. * It computes the number of packets to send (sndcnt) based on packets newly * delivered: * 1) If the packets in flight is larger than ssthresh, PRR spreads the * cwnd reductions across a full RTT. - * 2) If packets in flight is lower than ssthresh (such as due to excess - * losses and/or application stalls), do not perform any further cwnd - * reductions, but instead slow start up to ssthresh. + * 2) Otherwise PRR uses packet conservation to send as much as delivered. + * But when the retransmits are acked without further losses, PRR + * slow starts cwnd up to ssthresh to speed up the recovery. */ static void tcp_init_cwnd_reduction(struct sock *sk) { @@ -2501,7 +2500,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) } static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, - int fast_rexmit) + int fast_rexmit, int flag) { struct tcp_sock *tp = tcp_sk(sk); int sndcnt = 0; @@ -2510,16 +2509,18 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, (tp->packets_out - tp->sacked_out); tp->prr_delivered += newly_acked_sacked; - if (tcp_packets_in_flight(tp) > tp->snd_ssthresh) { + if (delta < 0) { u64 dividend = (u64)tp->snd_ssthresh * tp->prr_delivered + tp->prior_cwnd - 1; sndcnt = div_u64(dividend, tp->prior_cwnd) - tp->prr_out; - } else { + } else if ((flag & FLAG_RETRANS_DATA_ACKED) && + !(flag & FLAG_LOST_RETRANS)) { sndcnt = min_t(int, delta, max_t(int, tp->prr_delivered - tp->prr_out, newly_acked_sacked) + 1); + } else { + sndcnt = min(delta, newly_acked_sacked); } - sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0)); tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; } @@ -2580,7 +2581,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); } else { - tcp_cwnd_reduction(sk, prior_unsacked, 0); + tcp_cwnd_reduction(sk, prior_unsacked, 0, flag); } } @@ -2737,7 +2738,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) /* Undo during fast recovery after partial ACK. */ static bool tcp_try_undo_partial(struct sock *sk, const int acked, - const int prior_unsacked) + const int prior_unsacked, int flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -2753,7 +2754,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked, * mark more packets lost or retransmit more. */ if (tp->retrans_out) { - tcp_cwnd_reduction(sk, prior_unsacked, 0); + tcp_cwnd_reduction(sk, prior_unsacked, 0, flag); return true; } @@ -2840,7 +2841,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); } else { - if (tcp_try_undo_partial(sk, acked, prior_unsacked)) + if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag)) return; /* Partial ACK arrived. Force fast retransmit. */ do_lost = tcp_is_reno(tp) || @@ -2891,7 +2892,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (do_lost) tcp_update_scoreboard(sk, fast_rexmit); - tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit); + tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag); tcp_xmit_retransmit_queue(sk); } -- cgit v1.2.3 From 24ea591d2201c3257d666466e8fac50a6cf3c52f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Jul 2015 05:18:03 -0700 Subject: net: sched: extend percpu stats helpers qdisc_bstats_update_cpu() and other helpers were added to support percpu stats for qdisc. We want to add percpu stats for tc action, so this patch add common helpers. qdisc_bstats_update_cpu() is renamed to qdisc_bstats_cpu_update() qdisc_qstats_drop_cpu() is renamed to qdisc_qstats_cpu_drop() Signed-off-by: Eric Dumazet Cc: Alexei Starovoitov Acked-by: Jamal Hadi Salim Acked-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/sch_generic.h | 31 +++++++++++++++++++++---------- net/core/dev.c | 4 ++-- 2 files changed, 23 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 2738f6f87908..2eab08c38e32 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -513,17 +513,20 @@ static inline void bstats_update(struct gnet_stats_basic_packed *bstats, bstats->packets += skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1; } -static inline void qdisc_bstats_update_cpu(struct Qdisc *sch, - const struct sk_buff *skb) +static inline void bstats_cpu_update(struct gnet_stats_basic_cpu *bstats, + const struct sk_buff *skb) { - struct gnet_stats_basic_cpu *bstats = - this_cpu_ptr(sch->cpu_bstats); - u64_stats_update_begin(&bstats->syncp); bstats_update(&bstats->bstats, skb); u64_stats_update_end(&bstats->syncp); } +static inline void qdisc_bstats_cpu_update(struct Qdisc *sch, + const struct sk_buff *skb) +{ + bstats_cpu_update(this_cpu_ptr(sch->cpu_bstats), skb); +} + static inline void qdisc_bstats_update(struct Qdisc *sch, const struct sk_buff *skb) { @@ -547,16 +550,24 @@ static inline void __qdisc_qstats_drop(struct Qdisc *sch, int count) sch->qstats.drops += count; } -static inline void qdisc_qstats_drop(struct Qdisc *sch) +static inline void qstats_drop_inc(struct gnet_stats_queue *qstats) { - sch->qstats.drops++; + qstats->drops++; } -static inline void qdisc_qstats_drop_cpu(struct Qdisc *sch) +static inline void qstats_overlimit_inc(struct gnet_stats_queue *qstats) { - struct gnet_stats_queue *qstats = this_cpu_ptr(sch->cpu_qstats); + qstats->overlimits++; +} - qstats->drops++; +static inline void qdisc_qstats_drop(struct Qdisc *sch) +{ + qstats_drop_inc(&sch->qstats); +} + +static inline void qdisc_qstats_cpu_drop(struct Qdisc *sch) +{ + qstats_drop_inc(this_cpu_ptr(sch->cpu_qstats)); } static inline void qdisc_qstats_overlimit(struct Qdisc *sch) diff --git a/net/core/dev.c b/net/core/dev.c index 6778a9999d52..e0d270143fc7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3646,7 +3646,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, qdisc_skb_cb(skb)->pkt_len = skb->len; skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - qdisc_bstats_update_cpu(cl->q, skb); + qdisc_bstats_cpu_update(cl->q, skb); switch (tc_classify(skb, cl, &cl_res)) { case TC_ACT_OK: @@ -3654,7 +3654,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, skb->tc_index = TC_H_MIN(cl_res.classid); break; case TC_ACT_SHOT: - qdisc_qstats_drop_cpu(cl->q); + qdisc_qstats_cpu_drop(cl->q); case TC_ACT_STOLEN: case TC_ACT_QUEUED: kfree_skb(skb); -- cgit v1.2.3 From 519c818e8fb646eef1e8bfedd18519bec47bc9a9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Jul 2015 05:18:04 -0700 Subject: net: sched: add percpu stats to actions Reuse existing percpu infrastructure John Fastabend added for qdisc. This patch adds a new cpustats parameter to tcf_hash_create() and all actions pass false, meaning this patch should have no effect yet. Signed-off-by: Eric Dumazet Cc: Alexei Starovoitov Acked-by: Jamal Hadi Salim Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/net/act_api.h | 4 +++- net/sched/act_api.c | 44 ++++++++++++++++++++++++++++++++++---------- net/sched/act_bpf.c | 2 +- net/sched/act_connmark.c | 3 ++- net/sched/act_csum.c | 3 ++- net/sched/act_gact.c | 3 ++- net/sched/act_ipt.c | 2 +- net/sched/act_mirred.c | 3 ++- net/sched/act_nat.c | 3 ++- net/sched/act_pedit.c | 3 ++- net/sched/act_simple.c | 3 ++- net/sched/act_skbedit.c | 3 ++- net/sched/act_vlan.c | 3 ++- 13 files changed, 57 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 3ee4c92afd1b..db2063ffd181 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -21,6 +21,8 @@ struct tcf_common { struct gnet_stats_rate_est64 tcfc_rate_est; spinlock_t tcfc_lock; struct rcu_head tcfc_rcu; + struct gnet_stats_basic_cpu __percpu *cpu_bstats; + struct gnet_stats_queue __percpu *cpu_qstats; }; #define tcf_head common.tcfc_head #define tcf_index common.tcfc_index @@ -103,7 +105,7 @@ int tcf_hash_release(struct tc_action *a, int bind); u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo); int tcf_hash_check(u32 index, struct tc_action *a, int bind); int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, - int size, int bind); + int size, int bind, bool cpustats); void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est); void tcf_hash_insert(struct tc_action *a); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index af427a3dbcba..074a32f466f8 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -27,6 +27,15 @@ #include #include +static void free_tcf(struct rcu_head *head) +{ + struct tcf_common *p = container_of(head, struct tcf_common, tcfc_rcu); + + free_percpu(p->cpu_bstats); + free_percpu(p->cpu_qstats); + kfree(p); +} + void tcf_hash_destroy(struct tc_action *a) { struct tcf_common *p = a->priv; @@ -41,7 +50,7 @@ void tcf_hash_destroy(struct tc_action *a) * gen_estimator est_timer() might access p->tcfc_lock * or bstats, wait a RCU grace period before freeing p */ - kfree_rcu(p, tcfc_rcu); + call_rcu(&p->tcfc_rcu, free_tcf); } EXPORT_SYMBOL(tcf_hash_destroy); @@ -230,15 +239,16 @@ void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est) if (est) gen_kill_estimator(&pc->tcfc_bstats, &pc->tcfc_rate_est); - kfree_rcu(pc, tcfc_rcu); + call_rcu(&pc->tcfc_rcu, free_tcf); } EXPORT_SYMBOL(tcf_hash_cleanup); int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, - int size, int bind) + int size, int bind, bool cpustats) { struct tcf_hashinfo *hinfo = a->ops->hinfo; struct tcf_common *p = kzalloc(size, GFP_KERNEL); + int err = -ENOMEM; if (unlikely(!p)) return -ENOMEM; @@ -246,18 +256,32 @@ int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, if (bind) p->tcfc_bindcnt = 1; + if (cpustats) { + p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); + if (!p->cpu_bstats) { +err1: + kfree(p); + return err; + } + p->cpu_qstats = alloc_percpu(struct gnet_stats_queue); + if (!p->cpu_qstats) { +err2: + free_percpu(p->cpu_bstats); + goto err1; + } + } spin_lock_init(&p->tcfc_lock); INIT_HLIST_NODE(&p->tcfc_head); p->tcfc_index = index ? index : tcf_hash_new_index(hinfo); p->tcfc_tm.install = jiffies; p->tcfc_tm.lastuse = jiffies; if (est) { - int err = gen_new_estimator(&p->tcfc_bstats, NULL, - &p->tcfc_rate_est, - &p->tcfc_lock, est); + err = gen_new_estimator(&p->tcfc_bstats, p->cpu_bstats, + &p->tcfc_rate_est, + &p->tcfc_lock, est); if (err) { - kfree(p); - return err; + free_percpu(p->cpu_qstats); + goto err2; } } @@ -615,10 +639,10 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, if (err < 0) goto errout; - if (gnet_stats_copy_basic(&d, NULL, &p->tcfc_bstats) < 0 || + if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfc_bstats) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfc_bstats, &p->tcfc_rate_est) < 0 || - gnet_stats_copy_queue(&d, NULL, + gnet_stats_copy_queue(&d, p->cpu_qstats, &p->tcfc_qstats, p->tcfc_qstats.qlen) < 0) goto errout; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 1d56903fd4c7..99aa271633e9 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -281,7 +281,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(parm->index, act, bind)) { ret = tcf_hash_create(parm->index, est, act, - sizeof(*prog), bind); + sizeof(*prog), bind, false); if (ret < 0) goto destroy_fp; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 295d14bd6c67..f2b540220ad0 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -108,7 +108,8 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_CONNMARK_PARMS]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*ci), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*ci), + bind, false); if (ret) return ret; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 4cd5cf1aedf8..b07c535ba8e7 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -62,7 +62,8 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est, parm = nla_data(tb[TCA_CSUM_PARMS]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), + bind, false); if (ret) return ret; ret = ACT_P_CREATED; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 7fffc2272701..a4f8af29ee30 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -85,7 +85,8 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, #endif if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), + bind, false); if (ret) return ret; ret = ACT_P_CREATED; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index cbc8dd7dd48a..99c9cc1c7af9 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -114,7 +114,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, index = nla_get_u32(tb[TCA_IPT_INDEX]); if (!tcf_hash_check(index, a, bind) ) { - ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind); + ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind, false); if (ret) return ret; ret = ACT_P_CREATED; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index a42a3b257226..002cd6c83dc6 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -93,7 +93,8 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(parm->index, a, bind)) { if (dev == NULL) return -EINVAL; - ret = tcf_hash_create(parm->index, est, a, sizeof(*m), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*m), + bind, false); if (ret) return ret; ret = ACT_P_CREATED; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 270a030d5fd0..5be0b3c1c5b0 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -55,7 +55,8 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, parm = nla_data(tb[TCA_NAT_PARMS]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), + bind, false); if (ret) return ret; ret = ACT_P_CREATED; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 17e6d6669c7f..ce8676ad892f 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -57,7 +57,8 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(parm->index, a, bind)) { if (!parm->nkeys) return -EINVAL; - ret = tcf_hash_create(parm->index, est, a, sizeof(*p), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*p), + bind, false); if (ret) return ret; p = to_pedit(a); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 6a8d9488613a..d6b708d6afdf 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -103,7 +103,8 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, defdata = nla_data(tb[TCA_DEF_DATA]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*d), + bind, false); if (ret) return ret; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index fcfeeaf838be..6751b5f8c046 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -99,7 +99,8 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, parm = nla_data(tb[TCA_SKBEDIT_PARMS]); if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*d), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*d), + bind, false); if (ret) return ret; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index d735ecf0b1a7..796785e0bf96 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -116,7 +116,8 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, action = parm->v_action; if (!tcf_hash_check(parm->index, a, bind)) { - ret = tcf_hash_create(parm->index, est, a, sizeof(*v), bind); + ret = tcf_hash_create(parm->index, est, a, sizeof(*v), + bind, false); if (ret) return ret; -- cgit v1.2.3 From cef5ecf96b28dc91c4e9f398a336c578fb9e1a0c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Jul 2015 05:18:05 -0700 Subject: net_sched: act_gact: make tcfg_pval non zero First step for gact RCU operation : Instead of testing if tcfg_pval is zero or not, just make it 1. No change in behavior, but slightly faster code. The smp_rmb()/smp_wmb() barriers, while not strictly needed at this stage are added for upcoming spinlock removal. Signed-off-by: Eric Dumazet Acked-by: Alexei Starovoitov Acked-by: Jamal Hadi Salim Acked-by: John Fastabend Signed-off-by: David S. Miller --- net/sched/act_gact.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index a4f8af29ee30..22a3a61aa090 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -28,14 +28,16 @@ #ifdef CONFIG_GACT_PROB static int gact_net_rand(struct tcf_gact *gact) { - if (!gact->tcfg_pval || prandom_u32() % gact->tcfg_pval) + smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */ + if (prandom_u32() % gact->tcfg_pval) return gact->tcf_action; return gact->tcfg_paction; } static int gact_determ(struct tcf_gact *gact) { - if (!gact->tcfg_pval || gact->tcf_bstats.packets % gact->tcfg_pval) + smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */ + if (gact->tcf_bstats.packets % gact->tcfg_pval) return gact->tcf_action; return gact->tcfg_paction; } @@ -105,7 +107,11 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, #ifdef CONFIG_GACT_PROB if (p_parm) { gact->tcfg_paction = p_parm->paction; - gact->tcfg_pval = p_parm->pval; + gact->tcfg_pval = max_t(u16, 1, p_parm->pval); + /* Make sure tcfg_pval is written before tcfg_ptype + * coupled with smp_rmb() in gact_net_rand() & gact_determ() + */ + smp_wmb(); gact->tcfg_ptype = p_parm->ptype; } #endif -- cgit v1.2.3 From cc6510a9504fd3c03d76bd68d99653148342eecc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Jul 2015 05:18:06 -0700 Subject: net_sched: act_gact: use a separate packet counters for gact_determ() Second step for gact RCU operation : We want to get rid of the spinlock protecting gact operations. Stats (packets/bytes) will soon be per cpu. gact_determ() would not work without a central packet counter, so lets add it for this mode. Signed-off-by: Eric Dumazet Cc: Alexei Starovoitov Acked-by: Jamal Hadi Salim Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/net/tc_act/tc_gact.h | 7 ++++--- net/sched/act_gact.c | 4 +++- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h index 9fc9b578908a..592a6bc02b0b 100644 --- a/include/net/tc_act/tc_gact.h +++ b/include/net/tc_act/tc_gact.h @@ -6,9 +6,10 @@ struct tcf_gact { struct tcf_common common; #ifdef CONFIG_GACT_PROB - u16 tcfg_ptype; - u16 tcfg_pval; - int tcfg_paction; + u16 tcfg_ptype; + u16 tcfg_pval; + int tcfg_paction; + atomic_t packets; #endif }; #define to_gact(a) \ diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 22a3a61aa090..2f9bec584b3f 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -36,8 +36,10 @@ static int gact_net_rand(struct tcf_gact *gact) static int gact_determ(struct tcf_gact *gact) { + u32 pack = atomic_inc_return(&gact->packets); + smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */ - if (gact->tcf_bstats.packets % gact->tcfg_pval) + if (pack % gact->tcfg_pval) return gact->tcf_action; return gact->tcfg_paction; } -- cgit v1.2.3 From 8f2ae965b7ef4f4ddab6110f06388e270723d694 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Jul 2015 05:18:07 -0700 Subject: net_sched: act_gact: read tcfg_ptype once Third step for gact RCU operation : Following patch will get rid of spinlock protection, so we need to read tcfg_ptype once. Signed-off-by: Eric Dumazet Cc: Alexei Starovoitov Acked-by: Jamal Hadi Salim Acked-by: John Fastabend Signed-off-by: David S. Miller --- net/sched/act_gact.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 2f9bec584b3f..e4eb88d3d8dc 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -127,16 +127,16 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_gact *gact = a->priv; - int action = TC_ACT_SHOT; + int action = gact->tcf_action; spin_lock(&gact->tcf_lock); #ifdef CONFIG_GACT_PROB - if (gact->tcfg_ptype) - action = gact_rand[gact->tcfg_ptype](gact); - else - action = gact->tcf_action; -#else - action = gact->tcf_action; + { + u32 ptype = READ_ONCE(gact->tcfg_ptype); + + if (ptype) + action = gact_rand[ptype](gact); + } #endif gact->tcf_bstats.bytes += qdisc_pkt_len(skb); gact->tcf_bstats.packets++; -- cgit v1.2.3 From 56e5d1ca183d8616fab377d7d466c244b4dbb3b9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Jul 2015 05:18:08 -0700 Subject: net_sched: act_gact: remove spinlock in fast path Final step for gact RCU operation : 1) Use percpu stats 2) update lastuse only every clock tick to avoid false sharing 3) Remove spinlock acquisition, as it is no longer needed. Since this is the last contended lock in packet RX when tc gact is used, this gives impressive gain. My host with 8 RX queues was handling 5 Mpps before the patch, and more than 11 Mpps after patch. Tested: On receiver : dev=eth0 tc qdisc del dev $dev ingress 2>/dev/null tc qdisc add dev $dev ingress tc filter del dev $dev root pref 10 2>/dev/null tc filter del dev $dev pref 10 2>/dev/null tc filter add dev $dev est 1sec 4sec parent ffff: protocol ip prio 1 \ u32 match ip src 7.0.0.0/8 flowid 1:15 action drop Sender sends packets flood from 7/8 network Signed-off-by: Eric Dumazet Acked-by: Alexei Starovoitov Acked-by: Jamal Hadi Salim Acked-by: John Fastabend Signed-off-by: David S. Miller --- include/net/act_api.h | 11 +++++++++++ net/sched/act_gact.c | 17 +++++++---------- 2 files changed, 18 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index db2063ffd181..8d2a707a9e87 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -70,6 +70,17 @@ static inline void tcf_hashinfo_destroy(struct tcf_hashinfo *hf) kfree(hf->htab); } +/* Update lastuse only if needed, to avoid dirtying a cache line. + * We use a temp variable to avoid fetching jiffies twice. + */ +static inline void tcf_lastuse_update(struct tcf_t *tm) +{ + unsigned long now = jiffies; + + if (tm->lastuse != now) + tm->lastuse = now; +} + #ifdef CONFIG_NET_CLS_ACT #define ACT_P_CREATED 1 diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index e4eb88d3d8dc..5c1b05170736 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -90,7 +90,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(parm->index, a, bind)) { ret = tcf_hash_create(parm->index, est, a, sizeof(*gact), - bind, false); + bind, true); if (ret) return ret; ret = ACT_P_CREATED; @@ -104,7 +104,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, gact = to_gact(a); - spin_lock_bh(&gact->tcf_lock); + ASSERT_RTNL(); gact->tcf_action = parm->action; #ifdef CONFIG_GACT_PROB if (p_parm) { @@ -117,7 +117,6 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, gact->tcfg_ptype = p_parm->ptype; } #endif - spin_unlock_bh(&gact->tcf_lock); if (ret == ACT_P_CREATED) tcf_hash_insert(a); return ret; @@ -127,9 +126,8 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_gact *gact = a->priv; - int action = gact->tcf_action; + int action = READ_ONCE(gact->tcf_action); - spin_lock(&gact->tcf_lock); #ifdef CONFIG_GACT_PROB { u32 ptype = READ_ONCE(gact->tcfg_ptype); @@ -138,12 +136,11 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a, action = gact_rand[ptype](gact); } #endif - gact->tcf_bstats.bytes += qdisc_pkt_len(skb); - gact->tcf_bstats.packets++; + bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats), skb); if (action == TC_ACT_SHOT) - gact->tcf_qstats.drops++; - gact->tcf_tm.lastuse = jiffies; - spin_unlock(&gact->tcf_lock); + qstats_drop_inc(this_cpu_ptr(gact->common.cpu_qstats)); + + tcf_lastuse_update(&gact->tcf_tm); return action; } -- cgit v1.2.3 From 2ee22a90c7afac265bb6f7abea610b938195e2b8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Jul 2015 05:18:09 -0700 Subject: net_sched: act_mirred: remove spinlock in fast path Like act_gact, act_mirred can be lockless in packet processing 1) Use percpu stats 2) update lastuse only every clock tick to avoid false sharing 3) use rcu to protect tcfm_dev 4) Remove spinlock usage, as it is no longer needed. Next step : add multi queue capability to ifb device Signed-off-by: Eric Dumazet Cc: Alexei Starovoitov Cc: Jamal Hadi Salim Cc: John Fastabend Acked-by: Jamal Hadi Salim Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/tc_act/tc_mirred.h | 2 +- net/sched/act_mirred.c | 57 ++++++++++++++++++++++-------------------- 2 files changed, 31 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index 4dd77a1c106b..dae96bae1c19 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -8,7 +8,7 @@ struct tcf_mirred { int tcfm_eaction; int tcfm_ifindex; int tcfm_ok_push; - struct net_device *tcfm_dev; + struct net_device __rcu *tcfm_dev; struct list_head tcfm_list; }; #define to_mirred(a) \ diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 002cd6c83dc6..19cd8904efa0 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -35,9 +35,11 @@ static LIST_HEAD(mirred_list); static void tcf_mirred_release(struct tc_action *a, int bind) { struct tcf_mirred *m = to_mirred(a); + struct net_device *dev = rcu_dereference_protected(m->tcfm_dev, 1); + list_del(&m->tcfm_list); - if (m->tcfm_dev) - dev_put(m->tcfm_dev); + if (dev) + dev_put(dev); } static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { @@ -94,7 +96,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (dev == NULL) return -EINVAL; ret = tcf_hash_create(parm->index, est, a, sizeof(*m), - bind, false); + bind, true); if (ret) return ret; ret = ACT_P_CREATED; @@ -106,18 +108,18 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, } m = to_mirred(a); - spin_lock_bh(&m->tcf_lock); + ASSERT_RTNL(); m->tcf_action = parm->action; m->tcfm_eaction = parm->eaction; if (dev != NULL) { m->tcfm_ifindex = parm->ifindex; if (ret != ACT_P_CREATED) - dev_put(m->tcfm_dev); + dev_put(rcu_dereference_protected(m->tcfm_dev, 1)); dev_hold(dev); - m->tcfm_dev = dev; + rcu_assign_pointer(m->tcfm_dev, dev); m->tcfm_ok_push = ok_push; } - spin_unlock_bh(&m->tcf_lock); + if (ret == ACT_P_CREATED) { list_add(&m->tcfm_list, &mirred_list); tcf_hash_insert(a); @@ -132,20 +134,22 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, struct tcf_mirred *m = a->priv; struct net_device *dev; struct sk_buff *skb2; + int retval, err; u32 at; - int retval, err = 1; - spin_lock(&m->tcf_lock); - m->tcf_tm.lastuse = jiffies; - bstats_update(&m->tcf_bstats, skb); + tcf_lastuse_update(&m->tcf_tm); + + bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); - dev = m->tcfm_dev; - if (!dev) { - printk_once(KERN_NOTICE "tc mirred: target device is gone\n"); + rcu_read_lock(); + retval = READ_ONCE(m->tcf_action); + dev = rcu_dereference(m->tcfm_dev); + if (unlikely(!dev)) { + pr_notice_once("tc mirred: target device is gone\n"); goto out; } - if (!(dev->flags & IFF_UP)) { + if (unlikely(!(dev->flags & IFF_UP))) { net_notice_ratelimited("tc mirred to Houston: device %s is down\n", dev->name); goto out; @@ -153,7 +157,7 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, at = G_TC_AT(skb->tc_verd); skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2 == NULL) + if (!skb2) goto out; if (!(at & AT_EGRESS)) { @@ -169,16 +173,13 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a, skb2->dev = dev; err = dev_queue_xmit(skb2); -out: if (err) { - m->tcf_qstats.overlimits++; +out: + qstats_overlimit_inc(this_cpu_ptr(m->common.cpu_qstats)); if (m->tcfm_eaction != TCA_EGRESS_MIRROR) retval = TC_ACT_SHOT; - else - retval = m->tcf_action; - } else - retval = m->tcf_action; - spin_unlock(&m->tcf_lock); + } + rcu_read_unlock(); return retval; } @@ -217,14 +218,16 @@ static int mirred_device_event(struct notifier_block *unused, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct tcf_mirred *m; + ASSERT_RTNL(); if (event == NETDEV_UNREGISTER) list_for_each_entry(m, &mirred_list, tcfm_list) { - spin_lock_bh(&m->tcf_lock); - if (m->tcfm_dev == dev) { + if (rcu_access_pointer(m->tcfm_dev) == dev) { dev_put(dev); - m->tcfm_dev = NULL; + /* Note : no rcu grace period necessary, as + * net_device are already rcu protected. + */ + RCU_INIT_POINTER(m->tcfm_dev, NULL); } - spin_unlock_bh(&m->tcf_lock); } return NOTIFY_DONE; -- cgit v1.2.3 From 2251ae46af72d013a6a537e7681e94b18b63e332 Mon Sep 17 00:00:00 2001 From: Jon Maxwell Date: Wed, 8 Jul 2015 10:12:28 +1000 Subject: tcp: v1 always send a quick ack when quickacks are enabled V1 of this patch contains Eric Dumazet's suggestion to move the per dst RTAX_QUICKACK check into tcp_in_quickack_mode(). Thanks Eric. I ran some tests and after setting the "ip route change quickack 1" knob there were still many delayed ACKs sent. This occured because when icsk_ack.quick=0 the !icsk_ack.pingpong value is subsequently ignored as tcp_in_quickack_mode() checks both these values. The condition for a quick ack to trigger requires that both icsk_ack.quick != 0 and icsk_ack.pingpong=0. Currently only icsk_ack.pingpong is controlled by the knob. But the icsk_ack.quick value changes dynamically depending on heuristics. The crux of the matter is that delayed acks still cannot be entirely disabled even with the RTAX_QUICKACK per dst knob enabled. This patch ensures that a quick ack is always sent when the RTAX_QUICKACK per dst knob is turned on. The "ip route change quickack 1" knob was recently added to enable quickacks. It was modeled around the TCP_QUICKACK setsockopt() option. This issue is that even with "ip route change quickack 1" enabled we still see delayed ACKs under some conditions. It would be nice to be able to completely disable delayed ACKs. Here is an example: # netstat -s|grep dela 3 delayed acks sent For all routes enable the knob # ip route change quickack 1 Generate some traffic across a slow link and we still see the delayed acks. # netstat -s|grep dela 106 delayed acks sent 1 delayed acks further delayed because of locked socket The issue is that both the "ip route change quickack 1" knob and the TCP_QUICKACK option set the icsk_ack.pingpong variable to 0. However at the business end in the __tcp_ack_snd_check() routine, tcp_in_quickack_mode() checks that both icsk_ack.quick != 0 and icsk_ack.pingpong=0 in order to trigger a quickack. As icsk_ack.quick is determined by heuristics it can be 0. When that occurs the icsk_ack.pingpong value is ignored and a delayed ACK is sent regardless. This patch moves the RTAX_QUICKACK per dst check into the tcp_in_quickack_mode() routine which ensures that a quickack is always sent when the quickack knob is enabled for that dst. Signed-off-by: Jon Maxwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 11 +++++------ net/ipv4/tcp_output.c | 6 ++---- 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ad1482dd215e..7f4a8d5f6eb0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -197,11 +197,13 @@ static void tcp_enter_quickack_mode(struct sock *sk) * and the session is not interactive. */ -static inline bool tcp_in_quickack_mode(const struct sock *sk) +static bool tcp_in_quickack_mode(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); + const struct dst_entry *dst = __sk_dst_get(sk); - return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; + return (dst && dst_metric(dst, RTAX_QUICKACK)) || + (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong); } static void tcp_ecn_queue_cwr(struct tcp_sock *tp) @@ -3951,7 +3953,6 @@ void tcp_reset(struct sock *sk) static void tcp_fin(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - const struct dst_entry *dst; inet_csk_schedule_ack(sk); @@ -3963,9 +3964,7 @@ static void tcp_fin(struct sock *sk) case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - dst = __sk_dst_get(sk); - if (!dst || !dst_metric(dst, RTAX_QUICKACK)) - inet_csk(sk)->icsk_ack.pingpong = 1; + inet_csk(sk)->icsk_ack.pingpong = 1; break; case TCP_CLOSE_WAIT: diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b1c218df2c85..71057849593a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -163,7 +163,6 @@ static void tcp_event_data_sent(struct tcp_sock *tp, { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; - const struct dst_entry *dst = __sk_dst_get(sk); if (sysctl_tcp_slow_start_after_idle && (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) @@ -174,9 +173,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp, /* If it is a reply for ato after last received * packet, enter pingpong mode. */ - if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato && - (!dst || !dst_metric(dst, RTAX_QUICKACK))) - icsk->icsk_ack.pingpong = 1; + if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) + icsk->icsk_ack.pingpong = 1; } /* Account for an ACK we sent. */ -- cgit v1.2.3 From 1007f59dce53a22cba164f854d7bdc171c85dc79 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 9 Jul 2015 11:02:52 -0700 Subject: net: skb_defer_rx_timestamp should check for phydev before setting up classify This change makes it so that the call skb_defer_rx_timestamp will first check for a phydev before going in and manipulating the skb->data and skb->len values. By doing this we can avoid unnecessary work on network devices that don't support phydev. As a result we reduce the total instruction count needed to process this on most devices. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/timestamping.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/timestamping.c b/net/core/timestamping.c index 43d3dd62fcc8..42689d5c468c 100644 --- a/net/core/timestamping.c +++ b/net/core/timestamping.c @@ -60,11 +60,15 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb) struct phy_device *phydev; unsigned int type; + if (!skb->dev || !skb->dev->phydev || !skb->dev->phydev->drv) + return false; + if (skb_headroom(skb) < ETH_HLEN) return false; + __skb_push(skb, ETH_HLEN); - type = classify(skb); + type = ptp_classify_raw(skb); __skb_pull(skb, ETH_HLEN); -- cgit v1.2.3 From 071d5080e33d6f24139e4213c2d9f97a2c21b602 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 9 Jul 2015 13:16:29 -0700 Subject: tcp: add tcp_in_slow_start helper Add a helper to test the slow start condition in various congestion control modules and other places. This is to prepare a slight improvement in policy as to exactly when to slow start. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Nandita Dukkipati Signed-off-by: David S. Miller --- include/net/tcp.h | 7 ++++++- net/ipv4/tcp_bic.c | 2 +- net/ipv4/tcp_cong.c | 2 +- net/ipv4/tcp_cubic.c | 4 ++-- net/ipv4/tcp_highspeed.c | 2 +- net/ipv4/tcp_htcp.c | 2 +- net/ipv4/tcp_illinois.c | 2 +- net/ipv4/tcp_metrics.c | 2 +- net/ipv4/tcp_scalable.c | 2 +- net/ipv4/tcp_vegas.c | 6 +++--- net/ipv4/tcp_veno.c | 2 +- 11 files changed, 19 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 950cfecaad3c..dba22fc1b065 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -989,6 +989,11 @@ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) #define TCP_INFINITE_SSTHRESH 0x7fffffff +static inline bool tcp_in_slow_start(const struct tcp_sock *tp) +{ + return tp->snd_cwnd <= tp->snd_ssthresh; +} + static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp) { return tp->snd_ssthresh >= TCP_INFINITE_SSTHRESH; @@ -1065,7 +1070,7 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); /* If in slow start, ensure cwnd grows to twice what was ACKed. */ - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) return tp->snd_cwnd < 2 * tp->max_packets_out; return tp->is_cwnd_limited; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index c037644eafb7..fd1405d37c14 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -146,7 +146,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else { bictcp_update(ca, tp->snd_cwnd); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 84be008c945c..654729a8cb23 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -413,7 +413,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) return; /* In "safe" area, increase. */ - if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); if (!acked) return; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 06d3d665a9fd..28011fb1f4a2 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -320,7 +320,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (tcp_in_slow_start(tp)) { if (hystart && after(ack, ca->end_seq)) bictcp_hystart_reset(sk); acked = tcp_slow_start(tp, acked); @@ -439,7 +439,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) ca->delay_min = delay; /* hystart triggers when cwnd is larger than some threshold */ - if (hystart && tp->snd_cwnd <= tp->snd_ssthresh && + if (hystart && tcp_in_slow_start(tp) && tp->snd_cwnd >= hystart_low_window) hystart_update(sk, delay); } diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 882c08aae2f5..db7842495a64 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -116,7 +116,7 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else { /* Update AIMD parameters. diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 58469fff6c18..82f0d9ed60f5 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -236,7 +236,7 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else { /* In dangerous area, increase slowly. diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index f71002e4db0b..2ab9bbb6faff 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -268,7 +268,7 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked) return; /* In slow start */ - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else { diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index a51d63a43e33..b3d64f61d922 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -461,7 +461,7 @@ void tcp_update_metrics(struct sock *sk) tcp_metric_set(tm, TCP_METRIC_CWND, tp->snd_cwnd); } - } else if (tp->snd_cwnd > tp->snd_ssthresh && + } else if (!tcp_in_slow_start(tp) && icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 333bcb2415ff..bf5ea9e9bbc1 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -22,7 +22,7 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!tcp_is_cwnd_limited(sk)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) + if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); else tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT), diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index a6cea1d5e20d..13951c4087d4 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -225,7 +225,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) */ diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT; - if (diff > gamma && tp->snd_cwnd <= tp->snd_ssthresh) { + if (diff > gamma && tcp_in_slow_start(tp)) { /* Going too fast. Time to slow down * and switch to congestion avoidance. */ @@ -240,7 +240,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1); tp->snd_ssthresh = tcp_vegas_ssthresh(tp); - } else if (tp->snd_cwnd <= tp->snd_ssthresh) { + } else if (tcp_in_slow_start(tp)) { /* Slow start. */ tcp_slow_start(tp, acked); } else { @@ -281,7 +281,7 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) vegas->minRTT = 0x7fffffff; } /* Use normal slow start */ - else if (tp->snd_cwnd <= tp->snd_ssthresh) + else if (tcp_in_slow_start(tp)) tcp_slow_start(tp, acked); } diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 112151eeee45..0d094b995cd9 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -150,7 +150,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd; - if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (tcp_in_slow_start(tp)) { /* Slow start. */ tcp_slow_start(tp, acked); } else { -- cgit v1.2.3 From 76174004a0f19785a328f40388e87e982bbf69b9 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 9 Jul 2015 13:16:30 -0700 Subject: tcp: do not slow start when cwnd equals ssthresh In the original design slow start is only used to raise cwnd when cwnd is stricly below ssthresh. It makes little sense to slow start when cwnd == ssthresh: especially when hystart has set ssthresh in the initial ramp, or after recovery when cwnd resets to ssthresh. Not doing so will also help reduce the buffer bloat slightly. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Nandita Dukkipati Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/tcp_cdg.c | 2 +- net/ipv4/tcp_cong.c | 4 +--- net/ipv4/tcp_hybla.c | 2 +- 4 files changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index dba22fc1b065..364426a2be5a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -991,7 +991,7 @@ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) static inline bool tcp_in_slow_start(const struct tcp_sock *tp) { - return tp->snd_cwnd <= tp->snd_ssthresh; + return tp->snd_cwnd < tp->snd_ssthresh; } static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp) diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 8c6fd3d5e40f..167b6a3e1b98 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -264,7 +264,7 @@ static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked) u32 prior_snd_cwnd; u32 incr; - if (tp->snd_cwnd < tp->snd_ssthresh && hystart_detect) + if (tcp_in_slow_start(tp) && hystart_detect) tcp_cdg_hystart_update(sk); if (after(ack, ca->rtt_seq) && ca->rtt.v64) { diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 654729a8cb23..a2ed23c595cf 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -365,10 +365,8 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) */ u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) { - u32 cwnd = tp->snd_cwnd + acked; + u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh); - if (cwnd > tp->snd_ssthresh) - cwnd = tp->snd_ssthresh + 1; acked -= cwnd - tp->snd_cwnd; tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index f963b274f2b0..083831e359df 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -112,7 +112,7 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked) rho_fractions = ca->rho_3ls - (ca->rho << 3); - if (tp->snd_cwnd < tp->snd_ssthresh) { + if (tcp_in_slow_start(tp)) { /* * slow start * INC = 2^RHO - 1 -- cgit v1.2.3 From b20a3fa30a281b52b2576b509efbe5cd47a5a79b Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 9 Jul 2015 13:16:31 -0700 Subject: tcp: update congestion state first before raising cwnd The congestion state and cwnd can be updated in the wrong order. For example, upon receiving a dubious ACK, we incorrectly raise the cwnd first (tcp_may_raise_cwnd()/tcp_cong_avoid()) because the state is still Open, then enter recovery state to reduce cwnd. For another example, if the ACK indicates spurious timeout or retransmits, we first revert the cwnd reduction and congestion state back to Open state. But we don't raise the cwnd even though the ACK does not indicate any congestion. To fix this problem we should first call tcp_fastretrans_alert() to process the dubious ACK and update the congestion state, then call tcp_may_raise_cwnd() that raises cwnd based on the current state. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Nandita Dukkipati Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7f4a8d5f6eb0..1578fc2a6f39 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3568,10 +3568,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) &sack_state); acked -= tp->packets_out; - /* Advance cwnd if state allows */ - if (tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, acked); - if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); tcp_fastretrans_alert(sk, acked, prior_unsacked, @@ -3580,6 +3576,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); + /* Advance cwnd if state allows */ + if (tcp_may_raise_cwnd(sk, flag)) + tcp_cong_avoid(sk, ack, acked); + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { struct dst_entry *dst = __sk_dst_get(sk); if (dst) -- cgit v1.2.3 From 8b58a39846568dcd7d0c98b2fadc25018e59dedf Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 8 Jul 2015 23:32:12 +0200 Subject: ipv6: use flag instead of u16 for hop in inet6_skb_parm Hop was always either 0 or sizeof(struct ipv6hdr). Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/ipv6.h | 2 +- net/ipv6/af_inet6.c | 4 ++-- net/ipv6/datagram.c | 8 ++++---- net/ipv6/exthdrs.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 82806c60aa42..1319a6bb6b82 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -94,7 +94,6 @@ static inline struct ipv6hdr *ipipv6_hdr(const struct sk_buff *skb) struct inet6_skb_parm { int iif; __be16 ra; - __u16 hop; __u16 dst0; __u16 srcrt; __u16 dst1; @@ -111,6 +110,7 @@ struct inet6_skb_parm { #define IP6SKB_REROUTED 4 #define IP6SKB_ROUTERALERT 8 #define IP6SKB_FRAGMENTED 16 +#define IP6SKB_HOPBYHOP 32 }; #define IP6CB(skb) ((struct inet6_skb_parm*)((skb)->cb)) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 7de52b65173f..39e670a91596 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -679,8 +679,8 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, const struct ipv6_pinfo *np = inet6_sk(sk); if (np->rxopt.all) { - if ((opt->hop && (np->rxopt.bits.hopopts || - np->rxopt.bits.ohopopts)) || + if (((opt->flags & IP6SKB_HOPBYHOP) && + (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) || (ip6_flowinfo((struct ipv6hdr *) skb_network_header(skb)) && np->rxopt.bits.rxflow) || (opt->srcrt && (np->rxopt.bits.srcrt || diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 62d908e64eeb..50115522e80f 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -558,8 +558,8 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, } /* HbH is allowed only once */ - if (np->rxopt.bits.hopopts && opt->hop) { - u8 *ptr = nh + opt->hop; + if (np->rxopt.bits.hopopts && (opt->flags & IP6SKB_HOPBYHOP)) { + u8 *ptr = nh + sizeof(struct ipv6hdr); put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr); } @@ -620,8 +620,8 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg, int hlim = ipv6_hdr(skb)->hop_limit; put_cmsg(msg, SOL_IPV6, IPV6_2292HOPLIMIT, sizeof(hlim), &hlim); } - if (np->rxopt.bits.ohopopts && opt->hop) { - u8 *ptr = nh + opt->hop; + if (np->rxopt.bits.ohopopts && (opt->flags & IP6SKB_HOPBYHOP)) { + u8 *ptr = nh + sizeof(struct ipv6hdr); put_cmsg(msg, SOL_IPV6, IPV6_2292HOPOPTS, (ptr[1]+1)<<3, ptr); } if (np->rxopt.bits.odstopts && opt->dst0) { diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index a7bbbe45570b..ce203b0402be 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -632,7 +632,7 @@ int ipv6_parse_hopopts(struct sk_buff *skb) return -1; } - opt->hop = sizeof(struct ipv6hdr); + opt->flags |= IP6SKB_HOPBYHOP; if (ip6_parse_tlv(tlvprochopopt_lst, skb)) { skb->transport_header += (skb_transport_header(skb)[1] + 1) << 3; opt = IP6CB(skb); -- cgit v1.2.3 From fc01538f9fb75572c969ca9988176ffc2a8741d6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Jul 2015 14:28:29 -0700 Subject: inet: simplify timewait refcounting timewait sockets have a complex refcounting logic. Once we realize it should be similar to established and syn_recv sockets, we can use sk_nulls_del_node_init_rcu() and remove inet_twsk_unhash() In particular, deferred inet_twsk_put() added in commit 13475a30b66cd ("tcp: connect() race with timewait reuse") looks unecessary : When removing a timewait socket from ehash or bhash, caller must own a reference on the socket anyway. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 4 ++-- include/net/inet_timewait_sock.h | 6 ++---- net/ipv4/inet_hashtables.c | 31 ++++++++++------------------- net/ipv4/inet_timewait_sock.c | 42 ++++++---------------------------------- net/ipv6/inet6_hashtables.c | 6 +----- 5 files changed, 21 insertions(+), 68 deletions(-) (limited to 'net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index b73c88a19dd4..b07d126694a7 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -205,8 +205,8 @@ void inet_put_port(struct sock *sk); void inet_hashinfo_init(struct inet_hashinfo *h); -int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw); -int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw); +void __inet_hash_nolisten(struct sock *sk, struct sock *osk); +void __inet_hash(struct sock *sk, struct sock *osk); void inet_hash(struct sock *sk); void inet_unhash(struct sock *sk); diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 360c4802288d..96f52a4711c8 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -100,10 +100,8 @@ static inline struct inet_timewait_sock *inet_twsk(const struct sock *sk) void inet_twsk_free(struct inet_timewait_sock *tw); void inet_twsk_put(struct inet_timewait_sock *tw); -int inet_twsk_unhash(struct inet_timewait_sock *tw); - -int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, - struct inet_hashinfo *hashinfo); +void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, + struct inet_hashinfo *hashinfo); struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, struct inet_timewait_death_row *dr, diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 5f9b063bbe8a..e58840330da7 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -343,7 +343,6 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk2; const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; - int twrefcnt = 0; spin_lock(lock); @@ -371,12 +370,10 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { - twrefcnt = inet_twsk_unhash(tw); + sk_nulls_del_node_init_rcu((struct sock *)tw); NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); - if (twrefcnt) - inet_twsk_put(tw); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { @@ -384,7 +381,6 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, } else if (tw) { /* Silly. Should hash-dance instead... */ inet_twsk_deschedule(tw); - inet_twsk_put(tw); } return 0; @@ -403,13 +399,12 @@ static u32 inet_sk_port_offset(const struct sock *sk) inet->inet_dport); } -int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw) +void __inet_hash_nolisten(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; struct inet_ehash_bucket *head; spinlock_t *lock; - int twrefcnt = 0; WARN_ON(!sk_unhashed(sk)); @@ -420,23 +415,22 @@ int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw) spin_lock(lock); __sk_nulls_add_node_rcu(sk, list); - if (tw) { - WARN_ON(sk->sk_hash != tw->tw_hash); - twrefcnt = inet_twsk_unhash(tw); + if (osk) { + WARN_ON(sk->sk_hash != osk->sk_hash); + sk_nulls_del_node_init_rcu(osk); } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - return twrefcnt; } EXPORT_SYMBOL_GPL(__inet_hash_nolisten); -int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw) +void __inet_hash(struct sock *sk, struct sock *osk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_listen_hashbucket *ilb; if (sk->sk_state != TCP_LISTEN) - return __inet_hash_nolisten(sk, tw); + return __inet_hash_nolisten(sk, osk); WARN_ON(!sk_unhashed(sk)); ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; @@ -445,7 +439,6 @@ int __inet_hash(struct sock *sk, struct inet_timewait_sock *tw) __sk_nulls_add_node_rcu(sk, &ilb->head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); spin_unlock(&ilb->lock); - return 0; } EXPORT_SYMBOL(__inet_hash); @@ -492,7 +485,6 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct inet_bind_bucket *tb; int ret; struct net *net = sock_net(sk); - int twrefcnt = 1; if (!snum) { int i, remaining, low, high, port; @@ -560,18 +552,15 @@ ok: inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); - twrefcnt += __inet_hash_nolisten(sk, tw); + __inet_hash_nolisten(sk, (struct sock *)tw); } if (tw) - twrefcnt += inet_twsk_bind_unhash(tw, hinfo); + inet_twsk_bind_unhash(tw, hinfo); spin_unlock(&head->lock); if (tw) { inet_twsk_deschedule(tw); - while (twrefcnt) { - twrefcnt--; - inet_twsk_put(tw); - } + inet_twsk_put(tw); } ret = 0; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 2ffbd16b79e0..92cd4d50404e 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -17,28 +17,6 @@ #include -/** - * inet_twsk_unhash - unhash a timewait socket from established hash - * @tw: timewait socket - * - * unhash a timewait socket from established hash, if hashed. - * ehash lock must be held by caller. - * Returns 1 if caller should call inet_twsk_put() after lock release. - */ -int inet_twsk_unhash(struct inet_timewait_sock *tw) -{ - if (hlist_nulls_unhashed(&tw->tw_node)) - return 0; - - hlist_nulls_del_rcu(&tw->tw_node); - sk_nulls_node_init(&tw->tw_node); - /* - * We cannot call inet_twsk_put() ourself under lock, - * caller must call it for us. - */ - return 1; -} - /** * inet_twsk_bind_unhash - unhash a timewait socket from bind hash * @tw: timewait socket @@ -48,35 +26,29 @@ int inet_twsk_unhash(struct inet_timewait_sock *tw) * bind hash lock must be held by caller. * Returns 1 if caller should call inet_twsk_put() after lock release. */ -int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, +void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) { struct inet_bind_bucket *tb = tw->tw_tb; if (!tb) - return 0; + return; __hlist_del(&tw->tw_bind_node); tw->tw_tb = NULL; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); - /* - * We cannot call inet_twsk_put() ourself under lock, - * caller must call it for us. - */ - return 1; + __sock_put((struct sock *)tw); } /* Must be called with locally disabled BHs. */ static void inet_twsk_kill(struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; - struct inet_bind_hashbucket *bhead; - int refcnt; - /* Unlink from established hashes. */ spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); + struct inet_bind_hashbucket *bhead; spin_lock(lock); - refcnt = inet_twsk_unhash(tw); + sk_nulls_del_node_init_rcu((struct sock *)tw); spin_unlock(lock); /* Disassociate with bind bucket. */ @@ -84,11 +56,9 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) hashinfo->bhash_size)]; spin_lock(&bhead->lock); - refcnt += inet_twsk_bind_unhash(tw, hashinfo); + inet_twsk_bind_unhash(tw, hashinfo); spin_unlock(&bhead->lock); - BUG_ON(refcnt >= atomic_read(&tw->tw_refcnt)); - atomic_sub(refcnt, &tw->tw_refcnt); atomic_dec(&tw->tw_dr->tw_count); inet_twsk_put(tw); } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index b4fd96de97e6..a237398aa2b4 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -207,7 +207,6 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk2; const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; - int twrefcnt = 0; spin_lock(lock); @@ -234,12 +233,10 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); if (tw) { - twrefcnt = inet_twsk_unhash(tw); + sk_nulls_del_node_init_rcu((struct sock *)tw); NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); } spin_unlock(lock); - if (twrefcnt) - inet_twsk_put(tw); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { @@ -247,7 +244,6 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, } else if (tw) { /* Silly. Should hash-dance instead... */ inet_twsk_deschedule(tw); - inet_twsk_put(tw); } return 0; -- cgit v1.2.3 From dbe7faa4045ea83a37b691b12bb02a8f86c2d2e9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Jul 2015 14:28:30 -0700 Subject: inet: inet_twsk_deschedule factorization inet_twsk_deschedule() calls are followed by inet_twsk_put(). Only particular case is in inet_twsk_purge() but there is no point to defer the inet_twsk_put() after re-enabling BH. Lets rename inet_twsk_deschedule() to inet_twsk_deschedule_put() and move the inet_twsk_put() inside. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_timewait_sock.h | 2 +- net/ipv4/inet_hashtables.c | 9 +++------ net/ipv4/inet_timewait_sock.c | 13 ++++++++----- net/ipv4/tcp_ipv4.c | 3 +-- net/ipv4/tcp_minisocks.c | 6 ++---- net/ipv6/inet6_hashtables.c | 3 +-- net/ipv6/tcp_ipv6.c | 3 +-- net/netfilter/xt_TPROXY.c | 6 ++---- 8 files changed, 19 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 96f52a4711c8..879d6e5a973b 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -111,7 +111,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, struct inet_hashinfo *hashinfo); void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo); -void inet_twsk_deschedule(struct inet_timewait_sock *tw); +void inet_twsk_deschedule_put(struct inet_timewait_sock *tw); void inet_twsk_purge(struct inet_hashinfo *hashinfo, struct inet_timewait_death_row *twdr, int family); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index e58840330da7..f8b3701a6c3c 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -380,8 +380,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw); - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); } return 0; @@ -558,10 +557,8 @@ ok: inet_twsk_bind_unhash(tw, hinfo); spin_unlock(&head->lock); - if (tw) { - inet_twsk_deschedule(tw); - inet_twsk_put(tw); - } + if (tw) + inet_twsk_deschedule_put(tw); ret = 0; goto out; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 92cd4d50404e..ae22cc24fbe8 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -205,13 +205,17 @@ EXPORT_SYMBOL_GPL(inet_twsk_alloc); * tcp_input.c to verify this. */ -/* This is for handling early-kills of TIME_WAIT sockets. */ -void inet_twsk_deschedule(struct inet_timewait_sock *tw) +/* This is for handling early-kills of TIME_WAIT sockets. + * Warning : consume reference. + * Caller should not access tw anymore. + */ +void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) { if (del_timer_sync(&tw->tw_timer)) inet_twsk_kill(tw); + inet_twsk_put(tw); } -EXPORT_SYMBOL(inet_twsk_deschedule); +EXPORT_SYMBOL(inet_twsk_deschedule_put); void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) { @@ -281,9 +285,8 @@ restart: rcu_read_unlock(); local_bh_disable(); - inet_twsk_deschedule(tw); + inet_twsk_deschedule_put(tw); local_bh_enable(); - inet_twsk_put(tw); goto restart_rcu; } /* If the nulls value we got at the end of this lookup is diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d7d4c2b79cf2..486ba96ae91a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1683,8 +1683,7 @@ do_time_wait: iph->daddr, th->dest, inet_iif(skb)); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk)); - inet_twsk_put(inet_twsk(sk)); + inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; goto process; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4bc00cb79e60..6d8795b066ac 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -147,8 +147,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (!th->fin || TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { kill_with_rst: - inet_twsk_deschedule(tw); - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); return TCP_TW_RST; } @@ -198,8 +197,7 @@ kill_with_rst: */ if (sysctl_tcp_rfc1337 == 0) { kill: - inet_twsk_deschedule(tw); - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; } } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index a237398aa2b4..6ac8dad0138a 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -243,8 +243,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, *twp = tw; } else if (tw) { /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw); - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); } return 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 6748c4277aff..d540846a1a79 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1481,8 +1481,7 @@ do_time_wait: ntohs(th->dest), tcp_v6_iif(skb)); if (sk2) { struct inet_timewait_sock *tw = inet_twsk(sk); - inet_twsk_deschedule(tw); - inet_twsk_put(tw); + inet_twsk_deschedule_put(tw); sk = sk2; tcp_v6_restore_cb(skb); goto process; diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index cca96cec1b68..d0c96c5ae29a 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -272,8 +272,7 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, hp->source, lport ? lport : hp->dest, skb->dev, NFT_LOOKUP_LISTENER); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk)); - inet_twsk_put(inet_twsk(sk)); + inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; } } @@ -437,8 +436,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, tgi->lport ? tgi->lport : hp->dest, skb->dev, NFT_LOOKUP_LISTENER); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk)); - inet_twsk_put(inet_twsk(sk)); + inet_twsk_deschedule_put(inet_twsk(sk)); sk = sk2; } } -- cgit v1.2.3 From 35a256fee52c7c207796302681fa95189c85b408 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 8 Jul 2015 16:58:22 -0700 Subject: ipv6: Nonlocal bind Add support to allow non-local binds similar to how this was done for IPv4. Non-local binds are very useful in emulating the Internet in a box, etc. This add the ip_nonlocal_bind sysctl under ipv6. Testing: Set up nonlocal binding and receive routing on a host, e.g.: ip -6 rule add from ::/0 iif eth0 lookup 200 ip -6 route add local 2001:0:0:1::/64 dev lo proto kernel scope host table 200 sysctl -w net.ipv6.ip_nonlocal_bind=1 Set up routing to 2001:0:0:1::/64 on peer to go to first host ping6 -I 2001:0:0:1::1 peer-address -- to verify Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 5 +++++ include/net/netns/ipv6.h | 1 + net/ipv4/ping.c | 3 ++- net/ipv6/af_inet6.c | 3 ++- net/ipv6/raw.c | 3 ++- net/ipv6/sysctl_net_ipv6.c | 8 ++++++++ 6 files changed, 20 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 5fae7704daab..f63aeefd2c24 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1435,6 +1435,11 @@ mtu - INTEGER Default Maximum Transfer Unit Default: 1280 (IPv6 required minimum) +ip_nonlocal_bind - BOOLEAN + If set, allows processes to bind() to non-local IPv6 addresses, + which can be quite useful - but may break some applications. + Default: 0 + router_probe_interval - INTEGER Minimum interval (in seconds) between Router Probing described in RFC4191. diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 8d93544a2d2b..c0368db6df54 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -31,6 +31,7 @@ struct netns_sysctl_ipv6 { int auto_flowlabels; int icmpv6_time; int anycast_src_echo_reply; + int ip_nonlocal_bind; int fwmark_reflect; int idgen_retries; int idgen_delay; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 05ff44b758df..e89094ab5ddb 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -363,7 +363,8 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, scoped); rcu_read_unlock(); - if (!(isk->freebind || isk->transparent || has_addr || + if (!(net->ipv6.sysctl.ip_nonlocal_bind || + isk->freebind || isk->transparent || has_addr || addr_type == IPV6_ADDR_ANY)) return -EADDRNOTAVAIL; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 39e670a91596..7bc92ea4ae8f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -342,7 +342,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST)) { - if (!(inet->freebind || inet->transparent) && + if (!net->ipv6.sysctl.ip_nonlocal_bind && + !(inet->freebind || inet->transparent) && !ipv6_chk_addr(net, &addr->sin6_addr, dev, 0)) { err = -EADDRNOTAVAIL; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ca4700cb26c4..fdbada1569a3 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -295,7 +295,8 @@ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) * unspecified and mapped address have a v4 equivalent. */ v4addr = LOOPBACK4_IPV6; - if (!(addr_type & IPV6_ADDR_MULTICAST)) { + if (!(addr_type & IPV6_ADDR_MULTICAST) && + !sock_net(sk)->ipv6.sysctl.ip_nonlocal_bind) { err = -EADDRNOTAVAIL; if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr, dev, 0)) { diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 4e705add4f18..db48aebd9c47 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -75,6 +75,13 @@ static struct ctl_table ipv6_table_template[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "ip_nonlocal_bind", + .data = &init_net.ipv6.sysctl.ip_nonlocal_bind, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; @@ -117,6 +124,7 @@ static int __net_init ipv6_sysctl_net_init(struct net *net) ipv6_table[5].data = &net->ipv6.sysctl.idgen_retries; ipv6_table[6].data = &net->ipv6.sysctl.idgen_delay; ipv6_table[7].data = &net->ipv6.sysctl.flowlabel_state_ranges; + ipv6_table[8].data = &net->ipv6.sysctl.ip_nonlocal_bind; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) -- cgit v1.2.3 From cb1c61680d29a054b91a23c7a504cea8a72bdcff Mon Sep 17 00:00:00 2001 From: Masatake YAMATO Date: Thu, 9 Jul 2015 12:46:35 +0900 Subject: route: remove unsed variable in __mkroute_input flags local variable in __mkroute_input is not used as a variable. Signed-off-by: Masatake YAMATO Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d0362a2de3d3..04c83de4f79e 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1546,7 +1546,6 @@ static int __mkroute_input(struct sk_buff *skb, struct rtable *rth; int err; struct in_device *out_dev; - unsigned int flags = 0; bool do_cache; u32 itag = 0; @@ -1610,7 +1609,7 @@ static int __mkroute_input(struct sk_buff *skb, } rth->rt_genid = rt_genid_ipv4(dev_net(rth->dst.dev)); - rth->rt_flags = flags; + rth->rt_flags = 0; rth->rt_type = res->type; rth->rt_is_input = 1; rth->rt_iif = 0; -- cgit v1.2.3 From 09cf0211f970311383fdb453bbd5b3beeb294324 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 9 Jul 2015 03:11:10 -0700 Subject: bridge: mdb: fill state in br_mdb_notify Fill also the port group state when sending notifications. Signed-off-by: Satish Ashok Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_mdb.c | 5 +++-- net/bridge/br_multicast.c | 5 +++-- net/bridge/br_private.h | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index e29ad70b3000..1fb7d076f15c 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -230,7 +230,7 @@ errout: } void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, - struct br_ip *group, int type) + struct br_ip *group, int type, u8 state) { struct br_mdb_entry entry; @@ -241,6 +241,7 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, #if IS_ENABLED(CONFIG_IPV6) entry.addr.u.ip6 = group->u.ip6; #endif + entry.state = state; __br_mdb_notify(dev, &entry, type); } @@ -348,7 +349,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, return -ENOMEM; rcu_assign_pointer(*pp, p); - br_mdb_notify(br->dev, port, group, RTM_NEWMDB); + br_mdb_notify(br->dev, port, group, RTM_NEWMDB, state); return 0; } diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 742a6c27d7a2..5a44cd9473f2 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -694,7 +694,7 @@ static int br_multicast_add_group(struct net_bridge *br, if (unlikely(!p)) goto err; rcu_assign_pointer(*pp, p); - br_mdb_notify(br->dev, port, group, RTM_NEWMDB); + br_mdb_notify(br->dev, port, group, RTM_NEWMDB, MDB_TEMPORARY); found: mod_timer(&p->timer, now + br->multicast_membership_interval); @@ -1439,8 +1439,9 @@ br_multicast_leave_group(struct net_bridge *br, rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); + br_mdb_notify(br->dev, port, group, RTM_DELMDB, + p->state); call_rcu_bh(&p->rcu, br_multicast_free_pg); - br_mdb_notify(br->dev, port, group, RTM_DELMDB); if (!mp->ports && !mp->mglist && netif_running(br->dev)) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 8b21146b24a0..c73fd785654d 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -488,7 +488,7 @@ br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group, void br_mdb_init(void); void br_mdb_uninit(void); void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, - struct br_ip *group, int type); + struct br_ip *group, int type, u8 state); #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) -- cgit v1.2.3 From a4e2405cc5d20ed6d58c4874325856e80e76a7f8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Jul 2015 18:01:40 +0200 Subject: tcp: do not export tcp_init_xmit_timers() After commit 900f65d361d3 ("tcp: move duplicate code from tcp_v4_init_sock()/tcp_v6_init_sock()"), we no longer need to export tcp_init_xmit_timers() Signed-off-by: Eric Dumazet Cc: Neal Cardwell Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 5b752f58a900..7149ebc820c7 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -649,4 +649,3 @@ void tcp_init_xmit_timers(struct sock *sk) inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, &tcp_keepalive_timer); } -EXPORT_SYMBOL(tcp_init_xmit_timers); -- cgit v1.2.3 From 9131f3de24db4dc12199aede7d931e6703e97f3b Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki/吉藤英明 Date: Fri, 10 Jul 2015 16:58:31 +0900 Subject: ipv6: Do not iterate over all interfaces when finding source address on specific interface. If outgoing interface is specified and the candidate address is restricted to the outgoing interface, it is enough to iterate over that given interface only. Signed-off-by: YOSHIFUJI Hideaki Acked-by: Erik Kline Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 197 ++++++++++++++++++++++++++++------------------------ 1 file changed, 107 insertions(+), 90 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 21c2c818df3b..4ab74d56f65a 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1358,15 +1358,94 @@ out: return ret; } +static void __ipv6_dev_get_saddr(struct net *net, + struct ipv6_saddr_dst *dst, + unsigned int prefs, + const struct in6_addr *saddr, + struct inet6_dev *idev, + struct ipv6_saddr_score *scores) +{ + struct ipv6_saddr_score *score = &scores[0], *hiscore = &scores[1]; + + read_lock_bh(&idev->lock); + list_for_each_entry(score->ifa, &idev->addr_list, if_list) { + int i; + + /* + * - Tentative Address (RFC2462 section 5.4) + * - A tentative address is not considered + * "assigned to an interface" in the traditional + * sense, unless it is also flagged as optimistic. + * - Candidate Source Address (section 4) + * - In any case, anycast addresses, multicast + * addresses, and the unspecified address MUST + * NOT be included in a candidate set. + */ + if ((score->ifa->flags & IFA_F_TENTATIVE) && + (!(score->ifa->flags & IFA_F_OPTIMISTIC))) + continue; + + score->addr_type = __ipv6_addr_type(&score->ifa->addr); + + if (unlikely(score->addr_type == IPV6_ADDR_ANY || + score->addr_type & IPV6_ADDR_MULTICAST)) { + net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s", + idev->dev->name); + continue; + } + + score->rule = -1; + bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX); + + for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) { + int minihiscore, miniscore; + + minihiscore = ipv6_get_saddr_eval(net, hiscore, dst, i); + miniscore = ipv6_get_saddr_eval(net, score, dst, i); + + if (minihiscore > miniscore) { + if (i == IPV6_SADDR_RULE_SCOPE && + score->scopedist > 0) { + /* + * special case: + * each remaining entry + * has too small (not enough) + * scope, because ifa entries + * are sorted by their scope + * values. + */ + goto out; + } + break; + } else if (minihiscore < miniscore) { + if (hiscore->ifa) + in6_ifa_put(hiscore->ifa); + + in6_ifa_hold(score->ifa); + + swap(hiscore, score); + + /* restore our iterator */ + score->ifa = hiscore->ifa; + + break; + } + } + } +out: + read_unlock_bh(&idev->lock); +} + int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) { - struct ipv6_saddr_score scores[2], - *score = &scores[0], *hiscore = &scores[1]; + struct ipv6_saddr_score scores[2], *hiscore = &scores[1]; struct ipv6_saddr_dst dst; + struct inet6_dev *idev; struct net_device *dev; int dst_type; + bool use_oif_addr = false; dst_type = __ipv6_addr_type(daddr); dst.addr = daddr; @@ -1380,97 +1459,35 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, rcu_read_lock(); - for_each_netdev_rcu(net, dev) { - struct inet6_dev *idev; - - /* Candidate Source Address (section 4) - * - multicast and link-local destination address, - * the set of candidate source address MUST only - * include addresses assigned to interfaces - * belonging to the same link as the outgoing - * interface. - * (- For site-local destination addresses, the - * set of candidate source addresses MUST only - * include addresses assigned to interfaces - * belonging to the same site as the outgoing - * interface.) - */ - if (((dst_type & IPV6_ADDR_MULTICAST) || - dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) && - dst.ifindex && dev->ifindex != dst.ifindex) - continue; - - idev = __in6_dev_get(dev); - if (!idev) - continue; - - read_lock_bh(&idev->lock); - list_for_each_entry(score->ifa, &idev->addr_list, if_list) { - int i; - - /* - * - Tentative Address (RFC2462 section 5.4) - * - A tentative address is not considered - * "assigned to an interface" in the traditional - * sense, unless it is also flagged as optimistic. - * - Candidate Source Address (section 4) - * - In any case, anycast addresses, multicast - * addresses, and the unspecified address MUST - * NOT be included in a candidate set. - */ - if ((score->ifa->flags & IFA_F_TENTATIVE) && - (!(score->ifa->flags & IFA_F_OPTIMISTIC))) - continue; - - score->addr_type = __ipv6_addr_type(&score->ifa->addr); + /* Candidate Source Address (section 4) + * - multicast and link-local destination address, + * the set of candidate source address MUST only + * include addresses assigned to interfaces + * belonging to the same link as the outgoing + * interface. + * (- For site-local destination addresses, the + * set of candidate source addresses MUST only + * include addresses assigned to interfaces + * belonging to the same site as the outgoing + * interface.) + */ + if (dst_dev) { + if ((dst_type & IPV6_ADDR_MULTICAST) || + dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) { + idev = __in6_dev_get(dst_dev); + use_oif_addr = true; + } + } - if (unlikely(score->addr_type == IPV6_ADDR_ANY || - score->addr_type & IPV6_ADDR_MULTICAST)) { - net_dbg_ratelimited("ADDRCONF: unspecified / multicast address assigned as unicast address on %s", - dev->name); + if (use_oif_addr) { + __ipv6_dev_get_saddr(net, &dst, prefs, saddr, idev, scores); + } else { + for_each_netdev_rcu(net, dev) { + idev = __in6_dev_get(dev); + if (!idev) continue; - } - - score->rule = -1; - bitmap_zero(score->scorebits, IPV6_SADDR_RULE_MAX); - - for (i = 0; i < IPV6_SADDR_RULE_MAX; i++) { - int minihiscore, miniscore; - - minihiscore = ipv6_get_saddr_eval(net, hiscore, &dst, i); - miniscore = ipv6_get_saddr_eval(net, score, &dst, i); - - if (minihiscore > miniscore) { - if (i == IPV6_SADDR_RULE_SCOPE && - score->scopedist > 0) { - /* - * special case: - * each remaining entry - * has too small (not enough) - * scope, because ifa entries - * are sorted by their scope - * values. - */ - goto try_nextdev; - } - break; - } else if (minihiscore < miniscore) { - if (hiscore->ifa) - in6_ifa_put(hiscore->ifa); - - in6_ifa_hold(score->ifa); - - swap(hiscore, score); - - /* restore our iterator */ - score->ifa = hiscore->ifa; - - break; - } - } + __ipv6_dev_get_saddr(net, &dst, prefs, saddr, idev, scores); } -try_nextdev: - read_unlock_bh(&idev->lock); } rcu_read_unlock(); -- cgit v1.2.3 From 14fe22e334623e451b5592193415c644005461ea Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 11 Jul 2015 01:37:36 +0200 Subject: Revert "ipv4: use skb coalescing in defragmentation" This reverts commit 3cc4949269e01f39443d0fcfffb5bc6b47878d45. There is nothing wrong with coalescing during defragmentation, it reduces truesize overhead and simplifies things for the receiving socket (no fraglist walk needed). However, it also destroys geometry of the original fragments. While that doesn't cause any breakage (we make sure to not exceed largest original size) ip_do_fragment contains a 'fastpath' that takes advantage of a present frag list and results in fragments that (in most cases) match what was received. In case its needed the coalescing could be done later, when we're sure the skb is not forwarded. But discussion during NFWS resulted in 'lets just remove this for now'. Cc: Eric Dumazet Signed-off-by: Florian Westphal Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index a50dc6d408d1..4d3fffafbe24 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -522,7 +522,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, int len; int ihlen; int err; - int sum_truesize; u8 ecn; ipq_kill(qp); @@ -590,32 +589,19 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, add_frag_mem_limit(&qp->q, clone->truesize); } + skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - sum_truesize = head->truesize; - for (fp = head->next; fp;) { - bool headstolen; - int delta; - struct sk_buff *next = fp->next; - - sum_truesize += fp->truesize; + for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; + head->len += fp->len; if (head->ip_summed != fp->ip_summed) head->ip_summed = CHECKSUM_NONE; else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); - - if (skb_try_coalesce(head, fp, &headstolen, &delta)) { - kfree_skb_partial(fp, headstolen); - } else { - if (!skb_shinfo(head)->frag_list) - skb_shinfo(head)->frag_list = fp; - head->data_len += fp->len; - head->len += fp->len; - head->truesize += fp->truesize; - } - fp = next; + head->truesize += fp->truesize; } - sub_frag_mem_limit(&qp->q, sum_truesize); + sub_frag_mem_limit(&qp->q, head->truesize); head->next = NULL; head->dev = dev; -- cgit v1.2.3 From de551f2eb22a77a498cea9686f39e79f25329109 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 13 Jul 2015 08:48:00 -0700 Subject: net: Build IPv6 into kernel by default This patch makes the default to build IPv6 into the kernel. IPv6 now has significant traction and any remaining vestiges of IPv6 not being provided parity with IPv4 should be swept away. IPv6 is now core to the Internet and kernel. Points on IPv6 adoption: - Per Google statistics, IPv6 usage has reached 7% on the Internet and continues to exhibit an exponential growth rate https://www.google.com/intl/en/ipv6/statistics.html - Just a few days ago ARIN officially depleted its IPv4 pool - IPv6 only data centers are being successfully built (e.g. at Facebook) This patch changes the IPv6 Kconfig for IPV6. Default for CONFIG_IPV6 is set to "y" and the text has been updated to reflect the maturity of IPv6. Impact: Under some circumstances building modules in to kernel might have a performance advantage. In my testing, I did notice a very slight improvement. This will obviously increase the size of the kernel image. In my configuration I see: IPv6 as module: text data bss dec hex filename 9703666 1899288 933888 12536842 bf4c0a vmlinux IPv6 built into kernel text data bss dec hex filename 9436490 1879600 913408 12229498 ba9b7a vmlinux Which increases text size by ~270K (2.8% increase in size for me). If image size is an issue, presumably for a device which does not do IP networking (IMO we should be discouraging IPv4-only devices), IPV6 can be disabled or still built as a module. Acked-by: YOSHIFUJI Hideaki Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/Kconfig | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 438a73aa777c..643f61339e7b 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -5,16 +5,15 @@ # IPv6 as module will cause a CRASH if you try to unload it menuconfig IPV6 tristate "The IPv6 protocol" - default m + default y ---help--- - This is complemental support for the IP version 6. - You will still be able to do traditional IPv4 networking as well. + Support for IP version 6 (IPv6). For general information about IPv6, see . - For Linux IPv6 development information, see . - For specific information about IPv6 under Linux, read the HOWTO at - . + For specific information about IPv6 under Linux, see + Documentation/networking/ipv6.txt and read the HOWTO at + To compile this protocol support as a module, choose M here: the module will be called ipv6. -- cgit v1.2.3 From 74fe61f17e999a458d5f64ca2aa9a0282ca32198 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 10 Jul 2015 08:02:08 -0700 Subject: bridge: mdb: add vlan support for user entries Until now all user mdb entries were added in vlan 0, this patch adds support to allow the user to specify the vlan for the entry. About the uapi change a hole in struct br_mdb_entry is used so the size and offsets are kept the same (verified with pahole and tested with older iproute2). Example: $ bridge mdb dev br0 port eth1 grp 239.0.0.1 permanent vlan 2000 dev br0 port eth1 grp 239.0.0.1 permanent vlan 200 dev br0 port eth1 grp 239.0.0.1 permanent Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_bridge.h | 1 + net/bridge/br_mdb.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/if_bridge.h b/include/uapi/linux/if_bridge.h index eaaea6208b42..3635b7797508 100644 --- a/include/uapi/linux/if_bridge.h +++ b/include/uapi/linux/if_bridge.h @@ -182,6 +182,7 @@ struct br_mdb_entry { #define MDB_TEMPORARY 0 #define MDB_PERMANENT 1 __u8 state; + __u16 vid; struct { union { __be32 ip4; diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 1fb7d076f15c..a8d0e93d43f2 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -85,6 +85,7 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb, memset(&e, 0, sizeof(e)); e.ifindex = port->dev->ifindex; e.state = p->state; + e.vid = p->addr.vid; if (p->addr.proto == htons(ETH_P_IP)) e.addr.u.ip4 = p->addr.u.ip4; #if IS_ENABLED(CONFIG_IPV6) @@ -242,6 +243,7 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, entry.addr.u.ip6 = group->u.ip6; #endif entry.state = state; + entry.vid = group->vid; __br_mdb_notify(dev, &entry, type); } @@ -264,6 +266,8 @@ static bool is_valid_mdb_entry(struct br_mdb_entry *entry) return false; if (entry->state != MDB_PERMANENT && entry->state != MDB_TEMPORARY) return false; + if (entry->vid >= VLAN_VID_MASK) + return false; return true; } @@ -372,6 +376,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, if (!p || p->br != br || p->state == BR_STATE_DISABLED) return -EINVAL; + ip.vid = entry->vid; ip.proto = entry->addr.proto; if (ip.proto == htons(ETH_P_IP)) ip.u.ip4 = entry->addr.u.ip4; @@ -418,6 +423,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) if (!netif_running(br->dev) || br->multicast_disabled) return -EINVAL; + ip.vid = entry->vid; ip.proto = entry->addr.proto; if (ip.proto == htons(ETH_P_IP)) { if (timer_pending(&br->ip4_other_query.timer)) -- cgit v1.2.3 From 0d6ef0688d8744454646298b85336407be05e309 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Fri, 10 Jul 2015 15:42:49 +0900 Subject: ipvs: Delete an unnecessary check before the function call "module_put" The module_put() function tests whether its argument is NULL and then returns immediately. Thus the test around the call is not needed. This issue was detected by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index 199760c71f39..e50221b9d95f 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -137,7 +137,7 @@ struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) { - if (scheduler && scheduler->module) + if (scheduler) module_put(scheduler->module); } -- cgit v1.2.3 From 4c0911566dec7755d15cb89239fb2db4447f7a62 Mon Sep 17 00:00:00 2001 From: Eric W. Biederman Date: Fri, 10 Jul 2015 18:13:58 -0500 Subject: netfilter: Simply the tests for enabling and disabling the ingress queue hook Replace an overcomplicated switch statement with a simple if statement. This also removes the ingress queue enable outside of nf_hook_mutex as the protection provided by the mutex is not necessary and the code is clearer having both of the static key increments together. Signed-off-by: "Eric W. Biederman" Signed-off-by: Pablo Neira Ayuso --- net/netfilter/core.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index a0e54974e2c9..c4c3b85a2538 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -74,7 +74,6 @@ int nf_register_hook(struct nf_hook_ops *reg) if (reg->hooknum == NF_NETDEV_INGRESS) { BUG_ON(reg->dev == NULL); nf_hook_list = ®->dev->nf_hooks_ingress; - net_inc_ingress_queue(); break; } #endif @@ -90,6 +89,10 @@ int nf_register_hook(struct nf_hook_ops *reg) } list_add_rcu(®->list, elem->list.prev); mutex_unlock(&nf_hook_mutex); +#ifdef CONFIG_NETFILTER_INGRESS + if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) + net_inc_ingress_queue(); +#endif #ifdef HAVE_JUMP_LABEL static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif @@ -102,18 +105,10 @@ void nf_unregister_hook(struct nf_hook_ops *reg) mutex_lock(&nf_hook_mutex); list_del_rcu(®->list); mutex_unlock(&nf_hook_mutex); - switch (reg->pf) { - case NFPROTO_NETDEV: #ifdef CONFIG_NETFILTER_INGRESS - if (reg->hooknum == NF_NETDEV_INGRESS) { - net_dec_ingress_queue(); - break; - } - break; + if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) + net_dec_ingress_queue(); #endif - default: - break; - } #ifdef HAVE_JUMP_LABEL static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif -- cgit v1.2.3 From 0edcf282b0a6f38168294264837cf7d52a2f5255 Mon Sep 17 00:00:00 2001 From: Eric W. Biederman Date: Fri, 10 Jul 2015 18:14:30 -0500 Subject: netfilter: Factor out the hook list selection from nf_register_hook - Add a new function find_nf_hook_list to select the nf_hook_list - Fail nf_register_hook when asked for a per netdevice hook list when support for per netdevice hook lists is not built into the kernel. - Move the hook list head selection outside of nf_hook_mutex as nothing in the selection requires the hook list, and error handling is simpler if a mutex is not held. Signed-off-by: "Eric W. Biederman" Signed-off-by: Pablo Neira Ayuso --- net/netfilter/core.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index c4c3b85a2538..fa4d3c111d3f 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -62,27 +62,31 @@ EXPORT_SYMBOL(nf_hooks_needed); static DEFINE_MUTEX(nf_hook_mutex); -int nf_register_hook(struct nf_hook_ops *reg) +static struct list_head *find_nf_hook_list(const struct nf_hook_ops *reg) { - struct list_head *nf_hook_list; - struct nf_hook_ops *elem; + struct list_head *nf_hook_list = NULL; - mutex_lock(&nf_hook_mutex); - switch (reg->pf) { - case NFPROTO_NETDEV: + if (reg->pf != NFPROTO_NETDEV) + nf_hook_list = &nf_hooks[reg->pf][reg->hooknum]; + else if (reg->hooknum == NF_NETDEV_INGRESS) { #ifdef CONFIG_NETFILTER_INGRESS - if (reg->hooknum == NF_NETDEV_INGRESS) { - BUG_ON(reg->dev == NULL); + if (reg->dev) nf_hook_list = ®->dev->nf_hooks_ingress; - break; - } #endif - /* Fall through. */ - default: - nf_hook_list = &nf_hooks[reg->pf][reg->hooknum]; - break; } + return nf_hook_list; +} + +int nf_register_hook(struct nf_hook_ops *reg) +{ + struct list_head *nf_hook_list; + struct nf_hook_ops *elem; + nf_hook_list = find_nf_hook_list(reg); + if (!nf_hook_list) + return -ENOENT; + + mutex_lock(&nf_hook_mutex); list_for_each_entry(elem, nf_hook_list, list) { if (reg->priority < elem->priority) break; -- cgit v1.2.3 From 085db2c04557d31db61541f361bd8b4de92c9939 Mon Sep 17 00:00:00 2001 From: Eric W. Biederman Date: Fri, 10 Jul 2015 18:15:06 -0500 Subject: netfilter: Per network namespace netfilter hooks. - Add a new set of functions for registering and unregistering per network namespace hooks. - Modify the old global namespace hook functions to use the per network namespace hooks in their implementation, so their remains a single list that needs to be walked for any hook (this is important for keeping the hook priority working and for keeping the code walking the hooks simple). - Only allow registering the per netdevice hooks in the network namespace where the network device lives. - Dynamically allocate the structures in the per network namespace hook list in nf_register_net_hook, and unregister them in nf_unregister_net_hook. Dynamic allocate is required somewhere as the number of network namespaces are not fixed so we might as well allocate them in the registration function. The chain of registered hooks on any list is expected to be small so the cost of walking that list to find the entry we are unregistering should also be small. Performing the management of the dynamically allocated list entries in the registration and unregistration functions keeps the complexity from spreading. Signed-off-by: "Eric W. Biederman" --- include/linux/netfilter.h | 14 +++- include/net/netns/netfilter.h | 1 + net/netfilter/core.c | 182 +++++++++++++++++++++++++++++++++++++----- 3 files changed, 173 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 60e89348a91d..9bbd110ec81b 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -11,6 +11,8 @@ #include #include #include +#include +#include #ifdef CONFIG_NETFILTER static inline int NF_DROP_GETERR(int verdict) @@ -118,6 +120,13 @@ struct nf_sockopt_ops { }; /* Function to register/unregister hook points. */ +int nf_register_net_hook(struct net *net, const struct nf_hook_ops *ops); +void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *ops); +int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, + unsigned int n); +void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, + unsigned int n); + int nf_register_hook(struct nf_hook_ops *reg); void nf_unregister_hook(struct nf_hook_ops *reg); int nf_register_hooks(struct nf_hook_ops *reg, unsigned int n); @@ -128,8 +137,6 @@ void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n); int nf_register_sockopt(struct nf_sockopt_ops *reg); void nf_unregister_sockopt(struct nf_sockopt_ops *reg); -extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; - #ifdef HAVE_JUMP_LABEL extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; @@ -167,7 +174,8 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, int (*okfn)(struct sock *, struct sk_buff *), int thresh) { - struct list_head *nf_hook_list = &nf_hooks[pf][hook]; + struct net *net = dev_net(indev ? indev : outdev); + struct list_head *nf_hook_list = &net->nf.hooks[pf][hook]; if (nf_hook_list_active(nf_hook_list, pf, hook)) { struct nf_hook_state state; diff --git a/include/net/netns/netfilter.h b/include/net/netns/netfilter.h index 532e4ba64f49..38aa4983e2a9 100644 --- a/include/net/netns/netfilter.h +++ b/include/net/netns/netfilter.h @@ -14,5 +14,6 @@ struct netns_nf { #ifdef CONFIG_SYSCTL struct ctl_table_header *nf_log_dir_header; #endif + struct list_head hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; }; #endif diff --git a/net/netfilter/core.c b/net/netfilter/core.c index fa4d3c111d3f..56ead1a1711c 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -52,9 +52,6 @@ void nf_unregister_afinfo(const struct nf_afinfo *afinfo) } EXPORT_SYMBOL_GPL(nf_unregister_afinfo); -struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; -EXPORT_SYMBOL(nf_hooks); - #ifdef HAVE_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); @@ -62,27 +59,40 @@ EXPORT_SYMBOL(nf_hooks_needed); static DEFINE_MUTEX(nf_hook_mutex); -static struct list_head *find_nf_hook_list(const struct nf_hook_ops *reg) +static struct list_head *find_nf_hook_list(struct net *net, + const struct nf_hook_ops *reg) { struct list_head *nf_hook_list = NULL; if (reg->pf != NFPROTO_NETDEV) - nf_hook_list = &nf_hooks[reg->pf][reg->hooknum]; + nf_hook_list = &net->nf.hooks[reg->pf][reg->hooknum]; else if (reg->hooknum == NF_NETDEV_INGRESS) { #ifdef CONFIG_NETFILTER_INGRESS - if (reg->dev) + if (reg->dev && dev_net(reg->dev) == net) nf_hook_list = ®->dev->nf_hooks_ingress; #endif } return nf_hook_list; } -int nf_register_hook(struct nf_hook_ops *reg) +int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { struct list_head *nf_hook_list; - struct nf_hook_ops *elem; + struct nf_hook_ops *elem, *new; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return -ENOMEM; - nf_hook_list = find_nf_hook_list(reg); + new->hook = reg->hook; + new->dev = reg->dev; + new->owner = reg->owner; + new->priv = reg->priv; + new->pf = reg->pf; + new->hooknum = reg->hooknum; + new->priority = reg->priority; + + nf_hook_list = find_nf_hook_list(net, reg); if (!nf_hook_list) return -ENOENT; @@ -91,7 +101,7 @@ int nf_register_hook(struct nf_hook_ops *reg) if (reg->priority < elem->priority) break; } - list_add_rcu(®->list, elem->list.prev); + list_add_rcu(&new->list, elem->list.prev); mutex_unlock(&nf_hook_mutex); #ifdef CONFIG_NETFILTER_INGRESS if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) @@ -102,13 +112,35 @@ int nf_register_hook(struct nf_hook_ops *reg) #endif return 0; } -EXPORT_SYMBOL(nf_register_hook); +EXPORT_SYMBOL(nf_register_net_hook); -void nf_unregister_hook(struct nf_hook_ops *reg) +void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { + struct list_head *nf_hook_list; + struct nf_hook_ops *elem; + + nf_hook_list = find_nf_hook_list(net, reg); + if (!nf_hook_list) + return; + mutex_lock(&nf_hook_mutex); - list_del_rcu(®->list); + list_for_each_entry(elem, nf_hook_list, list) { + if ((reg->hook == elem->hook) && + (reg->dev == elem->dev) && + (reg->owner == elem->owner) && + (reg->priv == elem->priv) && + (reg->pf == elem->pf) && + (reg->hooknum == elem->hooknum) && + (reg->priority == elem->priority)) { + list_del_rcu(&elem->list); + break; + } + } mutex_unlock(&nf_hook_mutex); + if (&elem->list == nf_hook_list) { + WARN(1, "nf_unregister_net_hook: hook not found!\n"); + return; + } #ifdef CONFIG_NETFILTER_INGRESS if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) net_dec_ingress_queue(); @@ -117,7 +149,77 @@ void nf_unregister_hook(struct nf_hook_ops *reg) static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); - nf_queue_nf_hook_drop(reg); + nf_queue_nf_hook_drop(elem); + kfree(elem); +} +EXPORT_SYMBOL(nf_unregister_net_hook); + +int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, + unsigned int n) +{ + unsigned int i; + int err = 0; + + for (i = 0; i < n; i++) { + err = nf_register_net_hook(net, ®[i]); + if (err) + goto err; + } + return err; + +err: + if (i > 0) + nf_unregister_net_hooks(net, reg, i); + return err; +} +EXPORT_SYMBOL(nf_register_net_hooks); + +void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, + unsigned int n) +{ + while (n-- > 0) + nf_unregister_net_hook(net, ®[n]); +} +EXPORT_SYMBOL(nf_unregister_net_hooks); + +static LIST_HEAD(nf_hook_list); + +int nf_register_hook(struct nf_hook_ops *reg) +{ + struct net *net, *last; + int ret; + + rtnl_lock(); + for_each_net(net) { + ret = nf_register_net_hook(net, reg); + if (ret && ret != -ENOENT) + goto rollback; + } + list_add_tail(®->list, &nf_hook_list); + rtnl_unlock(); + + return 0; +rollback: + last = net; + for_each_net(net) { + if (net == last) + break; + nf_unregister_net_hook(net, reg); + } + rtnl_unlock(); + return ret; +} +EXPORT_SYMBOL(nf_register_hook); + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ + struct net *net; + + rtnl_lock(); + list_del(®->list); + for_each_net(net) + nf_unregister_net_hook(net, reg); + rtnl_unlock(); } EXPORT_SYMBOL(nf_unregister_hook); @@ -294,8 +396,46 @@ void (*nf_nat_decode_session_hook)(struct sk_buff *, struct flowi *); EXPORT_SYMBOL(nf_nat_decode_session_hook); #endif +static int nf_register_hook_list(struct net *net) +{ + struct nf_hook_ops *elem; + int ret; + + rtnl_lock(); + list_for_each_entry(elem, &nf_hook_list, list) { + ret = nf_register_net_hook(net, elem); + if (ret && ret != -ENOENT) + goto out_undo; + } + rtnl_unlock(); + return 0; + +out_undo: + list_for_each_entry_continue_reverse(elem, &nf_hook_list, list) + nf_unregister_net_hook(net, elem); + rtnl_unlock(); + return ret; +} + +static void nf_unregister_hook_list(struct net *net) +{ + struct nf_hook_ops *elem; + + rtnl_lock(); + list_for_each_entry(elem, &nf_hook_list, list) + nf_unregister_net_hook(net, elem); + rtnl_unlock(); +} + static int __net_init netfilter_net_init(struct net *net) { + int i, h, ret; + + for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) { + for (h = 0; h < NF_MAX_HOOKS; h++) + INIT_LIST_HEAD(&net->nf.hooks[i][h]); + } + #ifdef CONFIG_PROC_FS net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", net->proc_net); @@ -306,11 +446,16 @@ static int __net_init netfilter_net_init(struct net *net) return -ENOMEM; } #endif - return 0; + ret = nf_register_hook_list(net); + if (ret) + remove_proc_entry("netfilter", net->proc_net); + + return ret; } static void __net_exit netfilter_net_exit(struct net *net) { + nf_unregister_hook_list(net); remove_proc_entry("netfilter", net->proc_net); } @@ -321,12 +466,7 @@ static struct pernet_operations netfilter_net_ops = { int __init netfilter_init(void) { - int i, h, ret; - - for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) { - for (h = 0; h < NF_MAX_HOOKS; h++) - INIT_LIST_HEAD(&nf_hooks[i][h]); - } + int ret; ret = register_pernet_subsys(&netfilter_net_ops); if (ret < 0) -- cgit v1.2.3 From fd2ecda0341960d0ce361d648cf4dd98187afb06 Mon Sep 17 00:00:00 2001 From: Eric W. Biederman Date: Fri, 10 Jul 2015 18:15:44 -0500 Subject: netfilter: nftables: Only run the nftables chains in the proper netns - Register the nftables chains in the network namespace that they need to run in. - Remove the hacks that stopped chains running in the wrong network namespace. Signed-off-by: "Eric W. Biederman" Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 8 ++++++-- net/netfilter/nf_tables_core.c | 5 ----- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index cfe636808541..4a41eb92bcc0 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -130,20 +130,24 @@ static void nft_trans_destroy(struct nft_trans *trans) int nft_register_basechain(struct nft_base_chain *basechain, unsigned int hook_nops) { + struct net *net = read_pnet(&basechain->pnet); + if (basechain->flags & NFT_BASECHAIN_DISABLED) return 0; - return nf_register_hooks(basechain->ops, hook_nops); + return nf_register_net_hooks(net, basechain->ops, hook_nops); } EXPORT_SYMBOL_GPL(nft_register_basechain); void nft_unregister_basechain(struct nft_base_chain *basechain, unsigned int hook_nops) { + struct net *net = read_pnet(&basechain->pnet); + if (basechain->flags & NFT_BASECHAIN_DISABLED) return; - nf_unregister_hooks(basechain->ops, hook_nops); + nf_unregister_net_hooks(net, basechain->ops, hook_nops); } EXPORT_SYMBOL_GPL(nft_unregister_basechain); diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index f77bad46ac68..05d0b03530f6 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -114,7 +114,6 @@ unsigned int nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) { const struct nft_chain *chain = ops->priv, *basechain = chain; - const struct net *chain_net = read_pnet(&nft_base_chain(basechain)->pnet); const struct net *net = dev_net(pkt->in ? pkt->in : pkt->out); const struct nft_rule *rule; const struct nft_expr *expr, *last; @@ -125,10 +124,6 @@ nft_do_chain(struct nft_pktinfo *pkt, const struct nf_hook_ops *ops) int rulenum; unsigned int gencursor = nft_genmask_cur(net); - /* Ignore chains that are not for the current network namespace */ - if (!net_eq(net, chain_net)) - return NF_ACCEPT; - do_chain: rulenum = 0; rule = list_entry(&chain->rules, struct nft_rule, list); -- cgit v1.2.3 From 98d1bd802cdbc8f56868fae51edec13e86b59515 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 14 Jul 2015 17:51:06 +0200 Subject: netfilter: xtables: compute exact size needed for jumpstack The {arp,ip,ip6tables} jump stack is currently sized based on the number of user chains. However, its rather unlikely that every user defined chain jumps to the next, so lets use the existing loop detection logic to also track the chain depths. The stacksize is then set to the largest chain depth seen. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 19 ++++++++++++------- net/ipv4/netfilter/ip_tables.c | 28 ++++++++++++++++++---------- net/ipv6/netfilter/ip6_tables.c | 23 +++++++++++++++-------- net/netfilter/x_tables.c | 4 ++++ 4 files changed, 49 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 92305a1a021a..ae6d0a124213 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -372,10 +372,13 @@ static inline bool unconditional(const struct arpt_arp *arp) /* Figures out from what hook each rule can be called: returns 0 if * there are loops. Puts hook bitmask in comefrom. + * + * Keeps track of largest call depth seen and stores it in newinfo->stacksize. */ -static int mark_source_chains(const struct xt_table_info *newinfo, +static int mark_source_chains(struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0) { + unsigned int calldepth, max_calldepth = 0; unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset @@ -391,6 +394,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, /* Set initial back pointer. */ e->counters.pcnt = pos; + calldepth = 0; for (;;) { const struct xt_standard_target *t @@ -445,6 +449,8 @@ static int mark_source_chains(const struct xt_table_info *newinfo, (entry0 + pos + size); e->counters.pcnt = pos; pos += size; + if (calldepth > 0) + --calldepth; } else { int newpos = t->verdict; @@ -459,6 +465,10 @@ static int mark_source_chains(const struct xt_table_info *newinfo, return 0; } + if (entry0 + newpos != arpt_next_entry(e) && + ++calldepth > max_calldepth) + max_calldepth = calldepth; + /* This a jump; chase it. */ duprintf("Jump rule %u -> %u\n", pos, newpos); @@ -475,6 +485,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, next: duprintf("Finished chain %u\n", hook); } + newinfo->stacksize = max_calldepth; return 1; } @@ -664,9 +675,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, if (ret != 0) break; ++i; - if (strcmp(arpt_get_target(iter)->u.user.name, - XT_ERROR_TARGET) == 0) - ++newinfo->stacksize; } duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); if (ret != 0) @@ -1439,9 +1447,6 @@ static int translate_compat_table(const char *name, break; } ++i; - if (strcmp(arpt_get_target(iter1)->u.user.name, - XT_ERROR_TARGET) == 0) - ++newinfo->stacksize; } if (ret) { /* diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 6c72fbb7b49e..5e44b35a8de8 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -439,11 +439,15 @@ ipt_do_table(struct sk_buff *skb, } /* Figures out from what hook each rule can be called: returns 0 if - there are loops. Puts hook bitmask in comefrom. */ + * there are loops. Puts hook bitmask in comefrom. + * + * Keeps track of largest call depth seen and stores it in newinfo->stacksize. + */ static int -mark_source_chains(const struct xt_table_info *newinfo, +mark_source_chains(struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0) { + unsigned int calldepth, max_calldepth = 0; unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset @@ -457,6 +461,7 @@ mark_source_chains(const struct xt_table_info *newinfo, /* Set initial back pointer. */ e->counters.pcnt = pos; + calldepth = 0; for (;;) { const struct xt_standard_target *t @@ -518,6 +523,9 @@ mark_source_chains(const struct xt_table_info *newinfo, (entry0 + pos + size); e->counters.pcnt = pos; pos += size; + WARN_ON_ONCE(calldepth == 0); + if (calldepth > 0) + --calldepth; } else { int newpos = t->verdict; @@ -531,9 +539,14 @@ mark_source_chains(const struct xt_table_info *newinfo, newpos); return 0; } + if (entry0 + newpos != ipt_next_entry(e) && + !(e->ip.flags & IPT_F_GOTO) && + ++calldepth > max_calldepth) + max_calldepth = calldepth; + /* This a jump; chase it. */ - duprintf("Jump rule %u -> %u\n", - pos, newpos); + duprintf("Jump rule %u -> %u, calldepth %d\n", + pos, newpos, calldepth); } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -547,6 +560,7 @@ mark_source_chains(const struct xt_table_info *newinfo, next: duprintf("Finished chain %u\n", hook); } + newinfo->stacksize = max_calldepth; return 1; } @@ -826,9 +840,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (ret != 0) return ret; ++i; - if (strcmp(ipt_get_target(iter)->u.user.name, - XT_ERROR_TARGET) == 0) - ++newinfo->stacksize; } if (i != repl->num_entries) { @@ -1744,9 +1755,6 @@ translate_compat_table(struct net *net, if (ret != 0) break; ++i; - if (strcmp(ipt_get_target(iter1)->u.user.name, - XT_ERROR_TARGET) == 0) - ++newinfo->stacksize; } if (ret) { /* diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 3c35ced39b42..baf032179918 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -452,11 +452,15 @@ ip6t_do_table(struct sk_buff *skb, } /* Figures out from what hook each rule can be called: returns 0 if - there are loops. Puts hook bitmask in comefrom. */ + * there are loops. Puts hook bitmask in comefrom. + * + * Keeps track of largest call depth seen and stores it in newinfo->stacksize. + */ static int -mark_source_chains(const struct xt_table_info *newinfo, +mark_source_chains(struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0) { + unsigned int calldepth, max_calldepth = 0; unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset @@ -470,6 +474,7 @@ mark_source_chains(const struct xt_table_info *newinfo, /* Set initial back pointer. */ e->counters.pcnt = pos; + calldepth = 0; for (;;) { const struct xt_standard_target *t @@ -531,6 +536,8 @@ mark_source_chains(const struct xt_table_info *newinfo, (entry0 + pos + size); e->counters.pcnt = pos; pos += size; + if (calldepth > 0) + --calldepth; } else { int newpos = t->verdict; @@ -544,6 +551,11 @@ mark_source_chains(const struct xt_table_info *newinfo, newpos); return 0; } + if (entry0 + newpos != ip6t_next_entry(e) && + !(e->ipv6.flags & IP6T_F_GOTO) && + ++calldepth > max_calldepth) + max_calldepth = calldepth; + /* This a jump; chase it. */ duprintf("Jump rule %u -> %u\n", pos, newpos); @@ -560,6 +572,7 @@ mark_source_chains(const struct xt_table_info *newinfo, next: duprintf("Finished chain %u\n", hook); } + newinfo->stacksize = max_calldepth; return 1; } @@ -839,9 +852,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (ret != 0) return ret; ++i; - if (strcmp(ip6t_get_target(iter)->u.user.name, - XT_ERROR_TARGET) == 0) - ++newinfo->stacksize; } if (i != repl->num_entries) { @@ -1754,9 +1764,6 @@ translate_compat_table(struct net *net, if (ret != 0) break; ++i; - if (strcmp(ip6t_get_target(iter1)->u.user.name, - XT_ERROR_TARGET) == 0) - ++newinfo->stacksize; } if (ret) { /* diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index d324fe71260c..4db7d60d42fa 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -749,6 +749,10 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) if (i->jumpstack == NULL) return -ENOMEM; + /* ruleset without jumps -- no stack needed */ + if (i->stacksize == 0) + return 0; + i->stacksize *= xt_jumpstack_multiplier; size = sizeof(void *) * i->stacksize; for_each_possible_cpu(cpu) { -- cgit v1.2.3 From e7c8899f3e6f2830136cf6e115c4a55ce7a3920a Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 14 Jul 2015 17:51:07 +0200 Subject: netfilter: move tee_active to core This prepares for a TEE like expression in nftables. We want to ensure only one duplicate is sent, so both will use the same percpu variable to detect duplication. The other use case is detection of recursive call to xtables, but since we don't want dependency from nft to xtables core its put into core.c instead of the x_tables core. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter.h | 11 +++++++++++ net/netfilter/core.c | 3 +++ net/netfilter/xt_TEE.c | 13 ++++++------- 3 files changed, 20 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 9bbd110ec81b..e01da73ee6c4 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -390,4 +390,15 @@ extern struct nfq_ct_hook __rcu *nfq_ct_hook; static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {} #endif +/** + * nf_skb_duplicated - TEE target has sent a packet + * + * When a xtables target sends a packet, the OUTPUT and POSTROUTING + * hooks are traversed again, i.e. nft and xtables are invoked recursively. + * + * This is used by xtables TEE target to prevent the duplicated skb from + * being duplicated again. + */ +DECLARE_PER_CPU(bool, nf_skb_duplicated); + #endif /*__LINUX_NETFILTER_H*/ diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 56ead1a1711c..6896cee8b733 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -34,6 +34,9 @@ EXPORT_SYMBOL(nf_afinfo); const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_ipv6_ops); +DEFINE_PER_CPU(bool, nf_skb_duplicated); +EXPORT_SYMBOL_GPL(nf_skb_duplicated); + int nf_register_afinfo(const struct nf_afinfo *afinfo) { mutex_lock(&afinfo_mutex); diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index a747eb475b68..8950e79c4dc9 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -37,7 +37,6 @@ struct xt_tee_priv { }; static const union nf_inet_addr tee_zero_address; -static DEFINE_PER_CPU(bool, tee_active); static struct net *pick_net(struct sk_buff *skb) { @@ -88,7 +87,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) const struct xt_tee_tginfo *info = par->targinfo; struct iphdr *iph; - if (__this_cpu_read(tee_active)) + if (__this_cpu_read(nf_skb_duplicated)) return XT_CONTINUE; /* * Copy the skb, and route the copy. Will later return %XT_CONTINUE for @@ -125,9 +124,9 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) ip_send_check(iph); if (tee_tg_route4(skb, info)) { - __this_cpu_write(tee_active, true); + __this_cpu_write(nf_skb_duplicated, true); ip_local_out(skb); - __this_cpu_write(tee_active, false); + __this_cpu_write(nf_skb_duplicated, false); } else { kfree_skb(skb); } @@ -170,7 +169,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; - if (__this_cpu_read(tee_active)) + if (__this_cpu_read(nf_skb_duplicated)) return XT_CONTINUE; skb = pskb_copy(skb, GFP_ATOMIC); if (skb == NULL) @@ -188,9 +187,9 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) --iph->hop_limit; } if (tee_tg_route6(skb, info)) { - __this_cpu_write(tee_active, true); + __this_cpu_write(nf_skb_duplicated, true); ip6_local_out(skb); - __this_cpu_write(tee_active, false); + __this_cpu_write(nf_skb_duplicated, false); } else { kfree_skb(skb); } -- cgit v1.2.3 From 7814b6ec6d0d63444abdb49554166c8cfcbd063e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 14 Jul 2015 17:51:08 +0200 Subject: netfilter: xtables: don't save/restore jumpstack offset In most cases there is no reentrancy into ip/ip6tables. For skbs sent by REJECT or SYNPROXY targets, there is one level of reentrancy, but its not relevant as those targets issue an absolute verdict, i.e. the jumpstack can be clobbered since its not used after the target issues absolute verdict (ACCEPT, DROP, STOLEN, etc). So the only special case where it is relevant is the TEE target, which returns XT_CONTINUE. This patch changes ip(6)_do_table to always use the jump stack starting from 0. When we detect we're operating on an skb sent via TEE (percpu nf_skb_duplicated is 1) we switch to an alternate stack to leave the original one alone. Since there is no TEE support for arptables, it doesn't need to test if tee is active. The jump stack overflow tests are no longer needed as well -- since ->stacksize is the largest call depth we cannot exceed it. A much better alternative to the external jumpstack would be to just declare a jumps[32] stack on the local stack frame, but that would mean we'd have to reject iptables rulesets that used to work before. Another alternative would be to start rejecting rulesets with a larger call depth, e.g. 1000 -- in this case it would be feasible to allocate the entire stack in the percpu area which would avoid one dereference. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 1 - net/ipv4/netfilter/arp_tables.c | 11 +++-------- net/ipv4/netfilter/ip_tables.c | 37 ++++++++++++++++++++----------------- net/ipv6/netfilter/ip6_tables.c | 26 ++++++++++++++------------ net/netfilter/x_tables.c | 22 +++++++++++----------- 5 files changed, 48 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 286098a5667f..149284557ca7 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -222,7 +222,6 @@ struct xt_table_info { * @stacksize jumps (number of user chains) can possibly be made. */ unsigned int stacksize; - unsigned int __percpu *stackptr; void ***jumpstack; unsigned char entries[0] __aligned(8); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index ae6d0a124213..969fdbe6fbb5 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -280,6 +280,9 @@ unsigned int arpt_do_table(struct sk_buff *skb, table_base = private->entries; jumpstack = (struct arpt_entry **)private->jumpstack[cpu]; + /* No TEE support for arptables, so no need to switch to alternate + * stack. All targets that reenter must return absolute verdicts. + */ e = get_entry(table_base, private->hook_entry[hook]); acpar.in = state->in; @@ -325,11 +328,6 @@ unsigned int arpt_do_table(struct sk_buff *skb, } if (table_base + v != arpt_next_entry(e)) { - - if (stackidx >= private->stacksize) { - verdict = NF_DROP; - break; - } jumpstack[stackidx++] = e; } @@ -337,9 +335,6 @@ unsigned int arpt_do_table(struct sk_buff *skb, continue; } - /* Targets which reenter must return - * abs. verdicts - */ acpar.target = t->u.kernel.target; acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 5e44b35a8de8..a2e4b018a254 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -296,12 +296,13 @@ ipt_do_table(struct sk_buff *skb, const char *indev, *outdev; const void *table_base; struct ipt_entry *e, **jumpstack; - unsigned int *stackptr, origptr, cpu; + unsigned int stackidx, cpu; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; /* Initialization */ + stackidx = 0; ip = ip_hdr(skb); indev = state->in ? state->in->name : nulldevname; outdev = state->out ? state->out->name : nulldevname; @@ -331,13 +332,20 @@ ipt_do_table(struct sk_buff *skb, smp_read_barrier_depends(); table_base = private->entries; jumpstack = (struct ipt_entry **)private->jumpstack[cpu]; - stackptr = per_cpu_ptr(private->stackptr, cpu); - origptr = *stackptr; + + /* Switch to alternate jumpstack if we're being invoked via TEE. + * TEE issues XT_CONTINUE verdict on original skb so we must not + * clobber the jumpstack. + * + * For recursion via REJECT or SYNPROXY the stack will be clobbered + * but it is no problem since absolute verdict is issued by these. + */ + jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); - pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n", - table->name, hook, origptr, + pr_debug("Entering %s(hook %u), UF %p\n", + table->name, hook, get_entry(table_base, private->underflow[hook])); do { @@ -383,28 +391,24 @@ ipt_do_table(struct sk_buff *skb, verdict = (unsigned int)(-v) - 1; break; } - if (*stackptr <= origptr) { + if (stackidx == 0) { e = get_entry(table_base, private->underflow[hook]); pr_debug("Underflow (this is normal) " "to %p\n", e); } else { - e = jumpstack[--*stackptr]; + e = jumpstack[--stackidx]; pr_debug("Pulled %p out from pos %u\n", - e, *stackptr); + e, stackidx); e = ipt_next_entry(e); } continue; } if (table_base + v != ipt_next_entry(e) && !(e->ip.flags & IPT_F_GOTO)) { - if (*stackptr >= private->stacksize) { - verdict = NF_DROP; - break; - } - jumpstack[(*stackptr)++] = e; + jumpstack[stackidx++] = e; pr_debug("Pushed %p into pos %u\n", - e, *stackptr - 1); + e, stackidx - 1); } e = get_entry(table_base, v); @@ -423,9 +427,8 @@ ipt_do_table(struct sk_buff *skb, /* Verdict */ break; } while (!acpar.hotdrop); - pr_debug("Exiting %s; resetting sp from %u to %u\n", - __func__, *stackptr, origptr); - *stackptr = origptr; + pr_debug("Exiting %s; sp at %u\n", __func__, stackidx); + xt_write_recseq_end(addend); local_bh_enable(); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index baf032179918..531281f0ff86 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -324,12 +324,13 @@ ip6t_do_table(struct sk_buff *skb, const char *indev, *outdev; const void *table_base; struct ip6t_entry *e, **jumpstack; - unsigned int *stackptr, origptr, cpu; + unsigned int stackidx, cpu; const struct xt_table_info *private; struct xt_action_param acpar; unsigned int addend; /* Initialization */ + stackidx = 0; indev = state->in ? state->in->name : nulldevname; outdev = state->out ? state->out->name : nulldevname; /* We handle fragments by dealing with the first fragment as @@ -357,8 +358,15 @@ ip6t_do_table(struct sk_buff *skb, cpu = smp_processor_id(); table_base = private->entries; jumpstack = (struct ip6t_entry **)private->jumpstack[cpu]; - stackptr = per_cpu_ptr(private->stackptr, cpu); - origptr = *stackptr; + + /* Switch to alternate jumpstack if we're being invoked via TEE. + * TEE issues XT_CONTINUE verdict on original skb so we must not + * clobber the jumpstack. + * + * For recursion via REJECT or SYNPROXY the stack will be clobbered + * but it is no problem since absolute verdict is issued by these. + */ + jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); @@ -406,20 +414,16 @@ ip6t_do_table(struct sk_buff *skb, verdict = (unsigned int)(-v) - 1; break; } - if (*stackptr <= origptr) + if (stackidx == 0) e = get_entry(table_base, private->underflow[hook]); else - e = ip6t_next_entry(jumpstack[--*stackptr]); + e = ip6t_next_entry(jumpstack[--stackidx]); continue; } if (table_base + v != ip6t_next_entry(e) && !(e->ipv6.flags & IP6T_F_GOTO)) { - if (*stackptr >= private->stacksize) { - verdict = NF_DROP; - break; - } - jumpstack[(*stackptr)++] = e; + jumpstack[stackidx++] = e; } e = get_entry(table_base, v); @@ -437,8 +441,6 @@ ip6t_do_table(struct sk_buff *skb, break; } while (!acpar.hotdrop); - *stackptr = origptr; - xt_write_recseq_end(addend); local_bh_enable(); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 4db7d60d42fa..154447e519ab 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -67,9 +67,6 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = { [NFPROTO_IPV6] = "ip6", }; -/* Allow this many total (re)entries. */ -static const unsigned int xt_jumpstack_multiplier = 2; - /* Registration hooks for targets. */ int xt_register_target(struct xt_target *target) { @@ -688,8 +685,6 @@ void xt_free_table_info(struct xt_table_info *info) kvfree(info->jumpstack); } - free_percpu(info->stackptr); - kvfree(info); } EXPORT_SYMBOL(xt_free_table_info); @@ -737,10 +732,6 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) unsigned int size; int cpu; - i->stackptr = alloc_percpu(unsigned int); - if (i->stackptr == NULL) - return -ENOMEM; - size = sizeof(void **) * nr_cpu_ids; if (size > PAGE_SIZE) i->jumpstack = vzalloc(size); @@ -753,8 +744,17 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) if (i->stacksize == 0) return 0; - i->stacksize *= xt_jumpstack_multiplier; - size = sizeof(void *) * i->stacksize; + /* Jumpstack needs to be able to record two full callchains, one + * from the first rule set traversal, plus one table reentrancy + * via -j TEE without clobbering the callchain that brought us to + * TEE target. + * + * This is done by allocating two jumpstacks per cpu, on reentry + * the upper half of the stack is used. + * + * see the jumpstack setup in ipt_do_table() for more details. + */ + size = sizeof(void *) * i->stacksize * 2u; for_each_possible_cpu(cpu) { if (size > PAGE_SIZE) i->jumpstack[cpu] = vmalloc_node(size, -- cgit v1.2.3 From dcebd3153e0a7749bb054ab73fa4e1ca33e9d3f9 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 14 Jul 2015 17:51:09 +0200 Subject: netfilter: add and use jump label for xt_tee Don't bother testing if we need to switch to alternate stack unless TEE target is used. Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/x_tables.h | 7 +++++++ net/ipv4/netfilter/ip_tables.c | 3 ++- net/ipv6/netfilter/ip6_tables.c | 3 ++- net/netfilter/x_tables.c | 3 +++ net/netfilter/xt_TEE.c | 2 ++ 5 files changed, 16 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h index 149284557ca7..b006b719183f 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -3,6 +3,7 @@ #include +#include #include /** @@ -280,6 +281,12 @@ void xt_free_table_info(struct xt_table_info *info); */ DECLARE_PER_CPU(seqcount_t, xt_recseq); +/* xt_tee_enabled - true if x_tables needs to handle reentrancy + * + * Enabled if current ip(6)tables ruleset has at least one -j TEE rule. + */ +extern struct static_key xt_tee_enabled; + /** * xt_write_recseq_begin - start of a write section * diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index a2e4b018a254..ff585bdbf850 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -340,7 +340,8 @@ ipt_do_table(struct sk_buff *skb, * For recursion via REJECT or SYNPROXY the stack will be clobbered * but it is no problem since absolute verdict is issued by these. */ - jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); + if (static_key_false(&xt_tee_enabled)) + jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 531281f0ff86..ea6d105063c2 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -366,7 +366,8 @@ ip6t_do_table(struct sk_buff *skb, * For recursion via REJECT or SYNPROXY the stack will be clobbered * but it is no problem since absolute verdict is issued by these. */ - jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); + if (static_key_false(&xt_tee_enabled)) + jumpstack += private->stacksize * __this_cpu_read(nf_skb_duplicated); e = get_entry(table_base, private->hook_entry[hook]); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 154447e519ab..9b42b5ea6dcd 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -727,6 +727,9 @@ EXPORT_SYMBOL_GPL(xt_compat_unlock); DEFINE_PER_CPU(seqcount_t, xt_recseq); EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq); +struct static_key xt_tee_enabled __read_mostly; +EXPORT_SYMBOL_GPL(xt_tee_enabled); + static int xt_jumpstack_alloc(struct xt_table_info *i) { unsigned int size; diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 8950e79c4dc9..c5d6556dbc5e 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -251,6 +251,7 @@ static int tee_tg_check(const struct xt_tgchk_param *par) } else info->priv = NULL; + static_key_slow_inc(&xt_tee_enabled); return 0; } @@ -262,6 +263,7 @@ static void tee_tg_destroy(const struct xt_tgdtor_param *par) unregister_netdevice_notifier(&info->priv->notifier); kfree(info->priv); } + static_key_slow_dec(&xt_tee_enabled); } static struct xt_target tee_tg_reg[] __read_mostly = { -- cgit v1.2.3 From 6c7941dee9c41d6ab5a8be06ec44aa579a6123e1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 14 Jul 2015 17:51:10 +0200 Subject: netfilter: xtables: remove __pure annotation sparse complains: ip_tables.c:361:27: warning: incorrect type in assignment (different modifiers) ip_tables.c:361:27: expected struct ipt_entry *[assigned] e ip_tables.c:361:27: got struct ipt_entry [pure] * doesn't change generated code. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 2 +- net/ipv4/netfilter/ip_tables.c | 2 +- net/ipv6/netfilter/ip6_tables.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 969fdbe6fbb5..c416cb355cb0 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -240,7 +240,7 @@ get_entry(const void *base, unsigned int offset) return (struct arpt_entry *)(base + offset); } -static inline __pure +static inline struct arpt_entry *arpt_next_entry(const struct arpt_entry *entry) { return (void *)entry + entry->next_offset; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index ff585bdbf850..787f99ed55e2 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -276,7 +276,7 @@ static void trace_packet(const struct sk_buff *skb, } #endif -static inline __pure +static inline struct ipt_entry *ipt_next_entry(const struct ipt_entry *entry) { return (void *)entry + entry->next_offset; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index ea6d105063c2..4e21f80228be 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -305,7 +305,7 @@ static void trace_packet(const struct sk_buff *skb, } #endif -static inline __pure struct ip6t_entry * +static inline struct ip6t_entry * ip6t_next_entry(const struct ip6t_entry *entry) { return (void *)entry + entry->next_offset; -- cgit v1.2.3 From 40bdc5360d0919b537e27d7c3feb78fd784c31f5 Mon Sep 17 00:00:00 2001 From: Andrea Parri Date: Tue, 14 Jul 2015 00:12:05 +0200 Subject: pkt_sched: sch_qfq: remove unused member of struct qfq_sched The member (u32) "num_active_agg" of struct qfq_sched has been unused since its introduction in 462dbc9101acd38e92eda93c0726857517a24bbd "pkt_sched: QFQ Plus: fair-queueing service at DRR cost" and (AFAICT) there is no active plan to use it; this removes the member. Signed-off-by: Andrea Parri Acked-by: Paolo Valente Signed-off-by: David S. Miller --- net/sched/sch_qfq.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index b8d73bca683c..ffaeea63d473 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -186,7 +186,6 @@ struct qfq_sched { u64 oldV, V; /* Precise virtual times. */ struct qfq_aggregate *in_serv_agg; /* Aggregate being served. */ - u32 num_active_agg; /* Num. of active aggregates */ u32 wsum; /* weight sum */ u32 iwsum; /* inverse weight sum */ -- cgit v1.2.3 From c0b8da1e7613a11ce7e44560aae507b62da1f846 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki/吉藤英明 Date: Mon, 13 Jul 2015 23:28:10 +0900 Subject: ipv6: Fix finding best source address in ipv6_dev_get_saddr(). Commit 9131f3de2 ("ipv6: Do not iterate over all interfaces when finding source address on specific interface.") did not properly update best source address available. Plus, it introduced possible NULL pointer dereference. Bug was reported by Erik Kline . Based on patch proposed by Hajime Tazaki . Fixes: 9131f3de24db4dc12199aede7d931e6703e97f3b ("ipv6: Do not iterate over all interfaces when finding source address on specific interface.") Signed-off-by: YOSHIFUJI Hideaki Acked-by: Hajime Tazaki Acked-by: Erik Kline Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4ab74d56f65a..4c9a0246ef48 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1358,14 +1358,15 @@ out: return ret; } -static void __ipv6_dev_get_saddr(struct net *net, - struct ipv6_saddr_dst *dst, - unsigned int prefs, - const struct in6_addr *saddr, - struct inet6_dev *idev, - struct ipv6_saddr_score *scores) +static int __ipv6_dev_get_saddr(struct net *net, + struct ipv6_saddr_dst *dst, + unsigned int prefs, + const struct in6_addr *saddr, + struct inet6_dev *idev, + struct ipv6_saddr_score *scores, + int hiscore_idx) { - struct ipv6_saddr_score *score = &scores[0], *hiscore = &scores[1]; + struct ipv6_saddr_score *score = &scores[1 - hiscore_idx], *hiscore = &scores[hiscore_idx]; read_lock_bh(&idev->lock); list_for_each_entry(score->ifa, &idev->addr_list, if_list) { @@ -1424,6 +1425,7 @@ static void __ipv6_dev_get_saddr(struct net *net, in6_ifa_hold(score->ifa); swap(hiscore, score); + hiscore_idx = 1 - hiscore_idx; /* restore our iterator */ score->ifa = hiscore->ifa; @@ -1434,18 +1436,20 @@ static void __ipv6_dev_get_saddr(struct net *net, } out: read_unlock_bh(&idev->lock); + return hiscore_idx; } int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) { - struct ipv6_saddr_score scores[2], *hiscore = &scores[1]; + struct ipv6_saddr_score scores[2], *hiscore; struct ipv6_saddr_dst dst; struct inet6_dev *idev; struct net_device *dev; int dst_type; bool use_oif_addr = false; + int hiscore_idx = 0; dst_type = __ipv6_addr_type(daddr); dst.addr = daddr; @@ -1454,8 +1458,8 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, dst.label = ipv6_addr_label(net, daddr, dst_type, dst.ifindex); dst.prefs = prefs; - hiscore->rule = -1; - hiscore->ifa = NULL; + scores[hiscore_idx].rule = -1; + scores[hiscore_idx].ifa = NULL; rcu_read_lock(); @@ -1480,17 +1484,19 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, } if (use_oif_addr) { - __ipv6_dev_get_saddr(net, &dst, prefs, saddr, idev, scores); + if (idev) + hiscore_idx = __ipv6_dev_get_saddr(net, &dst, prefs, saddr, idev, scores, hiscore_idx); } else { for_each_netdev_rcu(net, dev) { idev = __in6_dev_get(dev); if (!idev) continue; - __ipv6_dev_get_saddr(net, &dst, prefs, saddr, idev, scores); + hiscore_idx = __ipv6_dev_get_saddr(net, &dst, prefs, saddr, idev, scores, hiscore_idx); } } rcu_read_unlock(); + hiscore = &scores[hiscore_idx]; if (!hiscore->ifa) return -EADDRNOTAVAIL; -- cgit v1.2.3 From d746d707a8b1421a4ba46b497cb5d59e20161645 Mon Sep 17 00:00:00 2001 From: Anuradha Karuppiah Date: Tue, 14 Jul 2015 13:43:19 -0700 Subject: net core: Add protodown support. This patch introduces the proto_down flag that can be used by user space applications to notify switch drivers that errors have been detected on the device. The switch driver can react to protodown notification by doing a phys down on the associated switch port. Signed-off-by: Anuradha Karuppiah Signed-off-by: Andy Gospodarek Signed-off-by: Roopa Prabhu Signed-off-by: Wilson Kok Signed-off-by: David S. Miller --- include/linux/netdevice.h | 14 ++++++++++++++ net/core/dev.c | 20 ++++++++++++++++++++ net/core/net-sysfs.c | 14 ++++++++++++++ 3 files changed, 48 insertions(+) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e20979dfd6a9..45cfd797eb77 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1041,6 +1041,12 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, * TX queue. * int (*ndo_get_iflink)(const struct net_device *dev); * Called to get the iflink value of this device. + * void (*ndo_change_proto_down)(struct net_device *dev, + * bool proto_down); + * This function is used to pass protocol port error state information + * to the switch driver. The switch driver can react to the proto_down + * by doing a phys down on the associated switch port. + * */ struct net_device_ops { int (*ndo_init)(struct net_device *dev); @@ -1211,6 +1217,8 @@ struct net_device_ops { int queue_index, u32 maxrate); int (*ndo_get_iflink)(const struct net_device *dev); + int (*ndo_change_proto_down)(struct net_device *dev, + bool proto_down); }; /** @@ -1502,6 +1510,10 @@ enum netdev_priv_flags { * * @qdisc_tx_busylock: XXX: need comments on this one * + * @proto_down: protocol port state information can be sent to the + * switch driver and used to set the phys state of the + * switch port. + * * FIXME: cleanup struct net_device such that network protocol info * moves out. */ @@ -1762,6 +1774,7 @@ struct net_device { #endif struct phy_device *phydev; struct lock_class_key *qdisc_tx_busylock; + bool proto_down; }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -2982,6 +2995,7 @@ int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid); int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len); +int dev_change_proto_down(struct net_device *dev, bool proto_down); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); diff --git a/net/core/dev.c b/net/core/dev.c index 69445a33ace6..8810b6bbebfe 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6074,6 +6074,26 @@ int dev_get_phys_port_name(struct net_device *dev, } EXPORT_SYMBOL(dev_get_phys_port_name); +/** + * dev_change_proto_down - update protocol port state information + * @dev: device + * @proto_down: new value + * + * This info can be used by switch drivers to set the phys state of the + * port. + */ +int dev_change_proto_down(struct net_device *dev, bool proto_down) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_change_proto_down) + return -EOPNOTSUPP; + if (!netif_device_present(dev)) + return -ENODEV; + return ops->ndo_change_proto_down(dev, proto_down); +} +EXPORT_SYMBOL(dev_change_proto_down); + /** * dev_new_index - allocate an ifindex * @net: the applicable net namespace diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 18b34d771ed4..194c1d03b2b3 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -404,6 +404,19 @@ static ssize_t group_store(struct device *dev, struct device_attribute *attr, NETDEVICE_SHOW(group, fmt_dec); static DEVICE_ATTR(netdev_group, S_IRUGO | S_IWUSR, group_show, group_store); +static int change_proto_down(struct net_device *dev, unsigned long proto_down) +{ + return dev_change_proto_down(dev, (bool) proto_down); +} + +static ssize_t proto_down_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, change_proto_down); +} +NETDEVICE_SHOW_RW(proto_down, fmt_dec); + static ssize_t phys_port_id_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -501,6 +514,7 @@ static struct attribute *net_class_attrs[] = { &dev_attr_phys_port_id.attr, &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, + &dev_attr_proto_down.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); -- cgit v1.2.3 From 88d6378bd6c096cb8440face3ae3f33d55a2e6e4 Mon Sep 17 00:00:00 2001 From: Anuradha Karuppiah Date: Tue, 14 Jul 2015 13:43:20 -0700 Subject: netlink: changes for setting and clearing protodown via netlink. Signed-off-by: Anuradha Karuppiah Signed-off-by: Andy Gospodarek Signed-off-by: Roopa Prabhu Signed-off-by: Wilson Kok Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/core/rtnetlink.c | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 2c7e8e3d3981..24d68b797c59 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -148,6 +148,7 @@ enum { IFLA_PHYS_SWITCH_ID, IFLA_LINK_NETNSID, IFLA_PHYS_PORT_NAME, + IFLA_PROTO_DOWN, __IFLA_MAX }; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9e433d58d265..03d61b54aac0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -896,7 +896,9 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev) /* IFLA_AF_SPEC */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ - + nla_total_size(MAX_PHYS_ITEM_ID_LEN); /* IFLA_PHYS_SWITCH_ID */ + + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + + nla_total_size(1); /* IFLA_PROTO_DOWN */ + } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -1082,7 +1084,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, (dev->ifalias && nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, - atomic_read(&dev->carrier_changes))) + atomic_read(&dev->carrier_changes)) || + nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; if (1) { @@ -1319,6 +1322,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, + [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1858,6 +1862,14 @@ static int do_setlink(const struct sk_buff *skb, } err = 0; + if (tb[IFLA_PROTO_DOWN]) { + err = dev_change_proto_down(dev, + nla_get_u8(tb[IFLA_PROTO_DOWN])); + if (err) + goto errout; + status |= DO_SETLINK_NOTIFY; + } + errout: if (status & DO_SETLINK_MODIFIED) { if (status & DO_SETLINK_NOTIFY) -- cgit v1.2.3 From c15df306fc79c672573f1cc2ebdfcb32d7e68780 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Thu, 16 Jul 2015 16:51:30 +0900 Subject: ipv6: Remove unused arguments for __ipv6_dev_get_saddr(). Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 4c9a0246ef48..32153c248959 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1360,8 +1360,6 @@ out: static int __ipv6_dev_get_saddr(struct net *net, struct ipv6_saddr_dst *dst, - unsigned int prefs, - const struct in6_addr *saddr, struct inet6_dev *idev, struct ipv6_saddr_score *scores, int hiscore_idx) @@ -1485,13 +1483,13 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, if (use_oif_addr) { if (idev) - hiscore_idx = __ipv6_dev_get_saddr(net, &dst, prefs, saddr, idev, scores, hiscore_idx); + hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx); } else { for_each_netdev_rcu(net, dev) { idev = __in6_dev_get(dev); if (!idev) continue; - hiscore_idx = __ipv6_dev_get_saddr(net, &dst, prefs, saddr, idev, scores, hiscore_idx); + hiscore_idx = __ipv6_dev_get_saddr(net, &dst, idev, scores, hiscore_idx); } } rcu_read_unlock(); -- cgit v1.2.3 From 5c48f1201744233d4f235c7dd916d5196ed20716 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 17 Jun 2015 09:58:06 +0200 Subject: mac80211: remove exposing 'mfp' to drivers There's no driver using this, so remove it. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 -- net/mac80211/cfg.c | 1 - net/mac80211/mlme.c | 6 +----- 3 files changed, 1 insertion(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 6b1077c2a63f..43dbddfa06c0 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1675,7 +1675,6 @@ struct ieee80211_sta_rates { * @tdls: indicates whether the STA is a TDLS peer * @tdls_initiator: indicates the STA is an initiator of the TDLS link. Only * valid if the STA is a TDLS peer in the first place. - * @mfp: indicates whether the STA uses management frame protection or not. * @txq: per-TID data TX queues (if driver uses the TXQ abstraction) */ struct ieee80211_sta { @@ -1693,7 +1692,6 @@ struct ieee80211_sta { struct ieee80211_sta_rates __rcu *rates; bool tdls; bool tdls_initiator; - bool mfp; struct ieee80211_txq *txq[IEEE80211_NUM_TIDS]; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index bf7023f6c327..5fc7788e2ff2 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1076,7 +1076,6 @@ static int sta_apply_parameters(struct ieee80211_local *local, } if (mask & BIT(NL80211_STA_FLAG_MFP)) { - sta->sta.mfp = !!(set & BIT(NL80211_STA_FLAG_MFP)); if (set & BIT(NL80211_STA_FLAG_MFP)) set_sta_flag(sta, WLAN_STA_MFP); else diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 9b2cc278ac2a..ae5d6c48272d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3034,12 +3034,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, rate_control_rate_init(sta); - if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) { + if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) set_sta_flag(sta, WLAN_STA_MFP); - sta->sta.mfp = true; - } else { - sta->sta.mfp = false; - } sta->sta.wme = elems.wmm_param && local->hw.queues >= IEEE80211_NUM_ACS; -- cgit v1.2.3 From cf47161ad26c293dd5f98186c0cc45d125da952c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 16 Jun 2015 16:16:38 +0200 Subject: mac80211: rename 'sta_inf' variable to more common 'sta' We typically use 'sta' for the station info struct, and if needed 'pubsta' for the public (driver-visible) portion thereof. Do this in the ieee80211_sta_ps_transition() function. Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 5dae166cb7f5..9fb8ce982c2d 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1240,22 +1240,22 @@ static void sta_ps_end(struct sta_info *sta) ieee80211_sta_ps_deliver_wakeup(sta); } -int ieee80211_sta_ps_transition(struct ieee80211_sta *sta, bool start) +int ieee80211_sta_ps_transition(struct ieee80211_sta *pubsta, bool start) { - struct sta_info *sta_inf = container_of(sta, struct sta_info, sta); + struct sta_info *sta = container_of(pubsta, struct sta_info, sta); bool in_ps; - WARN_ON(!ieee80211_hw_check(&sta_inf->local->hw, AP_LINK_PS)); + WARN_ON(!ieee80211_hw_check(&sta->local->hw, AP_LINK_PS)); /* Don't let the same PS state be set twice */ - in_ps = test_sta_flag(sta_inf, WLAN_STA_PS_STA); + in_ps = test_sta_flag(sta, WLAN_STA_PS_STA); if ((start && in_ps) || (!start && !in_ps)) return -EINVAL; if (start) - sta_ps_start(sta_inf); + sta_ps_start(sta); else - sta_ps_end(sta_inf); + sta_ps_end(sta); return 0; } -- cgit v1.2.3 From 16bf948081d0ea0f6cdef54b79a0250d4b099970 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 16 Jun 2015 16:10:30 +0200 Subject: mac80211: remove sta_info.gtk_idx This struct member is only assigned, never used otherwise; remove it. Signed-off-by: Johannes Berg --- net/mac80211/key.c | 1 - net/mac80211/sta_info.h | 2 -- 2 files changed, 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/key.c b/net/mac80211/key.c index b22df3a79a41..44388d6a1d8e 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -336,7 +336,6 @@ static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, ieee80211_check_fast_xmit(sta); } else { rcu_assign_pointer(sta->gtk[idx], new); - sta->gtk_idx = idx; } } else { defunikey = old && diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 226f8ca47ad6..147464dbc455 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -283,7 +283,6 @@ struct ieee80211_fast_tx { * @ptk: peer keys negotiated with this station, if any * @ptk_idx: last installed peer key index * @gtk: group keys negotiated with this station, if any - * @gtk_idx: last installed group key index * @rate_ctrl: rate control algorithm reference * @rate_ctrl_lock: spinlock used to protect rate control data * (data inside the algorithm, so serializes calls there) @@ -381,7 +380,6 @@ struct sta_info { struct ieee80211_sub_if_data *sdata; struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS]; struct ieee80211_key __rcu *ptk[NUM_DEFAULT_KEYS]; - u8 gtk_idx; u8 ptk_idx; struct rate_control_ref *rate_ctrl; void *rate_ctrl_priv; -- cgit v1.2.3 From 9ad8b21b742503030d543cd272de6a4eb3e3cc27 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 16 Jun 2015 15:05:57 +0200 Subject: mac80211: remove short frame test and counter Short frames less than 16 octets are already blocked in the monitor code by the should_drop_frame() function, and cannot get into the regular RX path. Therefore, this check can never trigger and the counter invariably stays zero. Remove the useless code. Signed-off-by: Johannes Berg --- net/mac80211/debugfs.c | 1 - net/mac80211/ieee80211_i.h | 1 - net/mac80211/rx.c | 5 ----- 3 files changed, 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 3ea8b7de9633..2c79d777f0e4 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -277,7 +277,6 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_STATS_ADD(rx_handlers_queued); DEBUGFS_STATS_ADD(rx_handlers_drop_nullfunc); DEBUGFS_STATS_ADD(rx_handlers_drop_defrag); - DEBUGFS_STATS_ADD(rx_handlers_drop_short); DEBUGFS_STATS_ADD(tx_expand_skb_head); DEBUGFS_STATS_ADD(tx_expand_skb_head_cloned); DEBUGFS_STATS_ADD(rx_expand_skb_head_defrag); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b12f61507f9f..eb91102e4809 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1286,7 +1286,6 @@ struct ieee80211_local { unsigned int rx_handlers_queued; unsigned int rx_handlers_drop_nullfunc; unsigned int rx_handlers_drop_defrag; - unsigned int rx_handlers_drop_short; unsigned int tx_expand_skb_head; unsigned int tx_expand_skb_head_cloned; unsigned int rx_expand_skb_head_defrag; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 9fb8ce982c2d..aa57a2ab8245 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1093,11 +1093,6 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; - if (unlikely(rx->skb->len < 16)) { - I802_DEBUG_INC(rx->local->rx_handlers_drop_short); - return RX_DROP_MONITOR; - } - /* Drop disallowed frame classes based on STA auth/assoc state; * IEEE 802.11, Chap 5.5. * -- cgit v1.2.3 From a682849329ad5df5fd13a7b1ab08cbc39df5484b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 16 Jun 2015 15:17:15 +0200 Subject: mac80211: move ieee80211_get_bssid into RX file This function is only used in the RX code, so moving it into that file gives the compiler better optimisation possibilities and also allows us to remove the check for short frames (which in the RX path cannot happen, but as a generic utility needed to be checked.) Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 -- net/mac80211/rx.c | 45 ++++++++++++++++++++++++++++++++++++++++++ net/mac80211/util.c | 49 ---------------------------------------------- 3 files changed, 45 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index eb91102e4809..361bb3ca335c 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1762,8 +1762,6 @@ static inline int __ieee80211_resume(struct ieee80211_hw *hw) /* utility functions/constants */ extern const void *const mac80211_wiphy_privid; /* for wiphy privid */ -u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, - enum nl80211_iftype type); int ieee80211_frame_duration(enum ieee80211_band band, size_t len, int rate, int erp, int short_preamble, int shift); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index aa57a2ab8245..dd6bb2a54d45 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -42,6 +42,51 @@ static inline void ieee80211_rx_stats(struct net_device *dev, u32 len) u64_stats_update_end(&tstats->syncp); } +static u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, + enum nl80211_iftype type) +{ + __le16 fc = hdr->frame_control; + + if (ieee80211_is_data(fc)) { + if (len < 24) /* drop incorrect hdr len (data) */ + return NULL; + + if (ieee80211_has_a4(fc)) + return NULL; + if (ieee80211_has_tods(fc)) + return hdr->addr1; + if (ieee80211_has_fromds(fc)) + return hdr->addr2; + + return hdr->addr3; + } + + if (ieee80211_is_mgmt(fc)) { + if (len < 24) /* drop incorrect hdr len (mgmt) */ + return NULL; + return hdr->addr3; + } + + if (ieee80211_is_ctl(fc)) { + if (ieee80211_is_pspoll(fc)) + return hdr->addr1; + + if (ieee80211_is_back_req(fc)) { + switch (type) { + case NL80211_IFTYPE_STATION: + return hdr->addr2; + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: + return hdr->addr1; + default: + break; /* fall through to the return */ + } + } + } + + return NULL; +} + /* * monitor mode reception * diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 43e5aadd7a89..7fb2c7bacc8c 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -47,55 +47,6 @@ struct ieee80211_hw *wiphy_to_ieee80211_hw(struct wiphy *wiphy) } EXPORT_SYMBOL(wiphy_to_ieee80211_hw); -u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, - enum nl80211_iftype type) -{ - __le16 fc = hdr->frame_control; - - /* drop ACK/CTS frames and incorrect hdr len (ctrl) */ - if (len < 16) - return NULL; - - if (ieee80211_is_data(fc)) { - if (len < 24) /* drop incorrect hdr len (data) */ - return NULL; - - if (ieee80211_has_a4(fc)) - return NULL; - if (ieee80211_has_tods(fc)) - return hdr->addr1; - if (ieee80211_has_fromds(fc)) - return hdr->addr2; - - return hdr->addr3; - } - - if (ieee80211_is_mgmt(fc)) { - if (len < 24) /* drop incorrect hdr len (mgmt) */ - return NULL; - return hdr->addr3; - } - - if (ieee80211_is_ctl(fc)) { - if (ieee80211_is_pspoll(fc)) - return hdr->addr1; - - if (ieee80211_is_back_req(fc)) { - switch (type) { - case NL80211_IFTYPE_STATION: - return hdr->addr2; - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_AP_VLAN: - return hdr->addr1; - default: - break; /* fall through to the return */ - } - } - } - - return NULL; -} - void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx) { struct sk_buff *skb; -- cgit v1.2.3 From 798a457dfb232535ebc9670082b8dfccdab684ff Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 16 Jun 2015 15:58:22 +0200 Subject: mac80211: fix comment referring to RX queue There are no RX queues in mac80211 (yet), the comment should refer to the TID (including one slot for non-QoS) rather than 'RX queue'. Signed-off-by: Johannes Berg --- net/mac80211/sta_info.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 147464dbc455..db76103b4445 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -317,7 +317,8 @@ struct ieee80211_fast_tx { * @last_signal: signal of last received frame from this STA * @avg_signal: moving average of signal of received frames from this STA * @last_ack_signal: signal of last received Ack frame from this STA - * @last_seq_ctrl: last received seq/frag number from this STA (per RX queue) + * @last_seq_ctrl: last received seq/frag number from this STA (per TID + * plus one for non-QoS frames) * @tx_filtered_count: number of frames the hardware filtered for this STA * @tx_retry_failed: number of frames that failed retry * @tx_retry_count: total number of retries for frames to this STA -- cgit v1.2.3 From af9f9b22beee70aae58651cdbb9d6375e6e51797 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 11 Jun 2015 16:02:32 +0200 Subject: mac80211: don't store napi struct When introducing multiple RX queues, a single NAPI struct will not be sufficient. Instead of trying to store multiple, simply change the API to have the NAPI struct passed to the RX function. This of course means that drivers using rx_irqsafe() cannot use NAPI, but that seems a reasonable trade-off, particularly since only two of all drivers are currently using it at all. While at it, we can now remove the IEEE80211_RX_REORDER_TIMER flag again since this code path cannot have a napi struct anyway. Signed-off-by: Johannes Berg --- drivers/net/wireless/iwlwifi/dvm/dev.h | 2 ++ drivers/net/wireless/iwlwifi/dvm/main.c | 3 ++- drivers/net/wireless/iwlwifi/dvm/rx.c | 2 +- drivers/net/wireless/iwlwifi/mvm/mvm.h | 1 + drivers/net/wireless/iwlwifi/mvm/ops.c | 3 ++- drivers/net/wireless/iwlwifi/mvm/rx.c | 2 +- include/net/mac80211.h | 37 +++++++++++++++++++++------------ net/mac80211/ieee80211_i.h | 6 +----- net/mac80211/main.c | 12 ----------- net/mac80211/rx.c | 18 +++++++++------- 10 files changed, 44 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/drivers/net/wireless/iwlwifi/dvm/dev.h b/drivers/net/wireless/iwlwifi/dvm/dev.h index 3811878ab9cd..074977ede343 100644 --- a/drivers/net/wireless/iwlwifi/dvm/dev.h +++ b/drivers/net/wireless/iwlwifi/dvm/dev.h @@ -669,6 +669,8 @@ struct iwl_priv { /* ieee device used by generic ieee processing code */ struct ieee80211_hw *hw; + struct napi_struct *napi; + struct list_head calib_results; struct workqueue_struct *workqueue; diff --git a/drivers/net/wireless/iwlwifi/dvm/main.c b/drivers/net/wireless/iwlwifi/dvm/main.c index 234e30f498b2..644819563cf0 100644 --- a/drivers/net/wireless/iwlwifi/dvm/main.c +++ b/drivers/net/wireless/iwlwifi/dvm/main.c @@ -2037,7 +2037,8 @@ static void iwl_napi_add(struct iwl_op_mode *op_mode, { struct iwl_priv *priv = IWL_OP_MODE_GET_DVM(op_mode); - ieee80211_napi_add(priv->hw, napi, napi_dev, poll, weight); + netif_napi_add(napi_dev, napi, poll, weight); + priv->napi = napi; } static const struct iwl_op_mode_ops iwl_dvm_ops = { diff --git a/drivers/net/wireless/iwlwifi/dvm/rx.c b/drivers/net/wireless/iwlwifi/dvm/rx.c index debec963c610..5a91f5d6b1dc 100644 --- a/drivers/net/wireless/iwlwifi/dvm/rx.c +++ b/drivers/net/wireless/iwlwifi/dvm/rx.c @@ -786,7 +786,7 @@ static void iwlagn_pass_packet_to_mac80211(struct iwl_priv *priv, memcpy(IEEE80211_SKB_RXCB(skb), stats, sizeof(*stats)); - ieee80211_rx(priv->hw, skb); + ieee80211_rx_napi(priv->hw, skb, priv->napi); } static u32 iwlagn_translate_rx_status(struct iwl_priv *priv, u32 decrypt_in) diff --git a/drivers/net/wireless/iwlwifi/mvm/mvm.h b/drivers/net/wireless/iwlwifi/mvm/mvm.h index 2d4bad5fe825..605f57a2c6be 100644 --- a/drivers/net/wireless/iwlwifi/mvm/mvm.h +++ b/drivers/net/wireless/iwlwifi/mvm/mvm.h @@ -559,6 +559,7 @@ struct iwl_mvm { const struct iwl_cfg *cfg; struct iwl_phy_db *phy_db; struct ieee80211_hw *hw; + struct napi_struct *napi; /* for protecting access to iwl_mvm */ struct mutex mutex; diff --git a/drivers/net/wireless/iwlwifi/mvm/ops.c b/drivers/net/wireless/iwlwifi/mvm/ops.c index e4fa50075ffd..3967df63e0f3 100644 --- a/drivers/net/wireless/iwlwifi/mvm/ops.c +++ b/drivers/net/wireless/iwlwifi/mvm/ops.c @@ -1316,7 +1316,8 @@ static void iwl_mvm_napi_add(struct iwl_op_mode *op_mode, { struct iwl_mvm *mvm = IWL_OP_MODE_GET_MVM(op_mode); - ieee80211_napi_add(mvm->hw, napi, napi_dev, poll, weight); + netif_napi_add(napi_dev, napi, poll, weight); + mvm->napi = napi; } static const struct iwl_op_mode_ops iwl_mvm_ops = { diff --git a/drivers/net/wireless/iwlwifi/mvm/rx.c b/drivers/net/wireless/iwlwifi/mvm/rx.c index 8f1d93b7a13a..9ff0b4321df3 100644 --- a/drivers/net/wireless/iwlwifi/mvm/rx.c +++ b/drivers/net/wireless/iwlwifi/mvm/rx.c @@ -129,7 +129,7 @@ static void iwl_mvm_pass_packet_to_mac80211(struct iwl_mvm *mvm, fraglen, rxb->truesize); } - ieee80211_rx(mvm->hw, skb); + ieee80211_rx_napi(mvm->hw, skb, mvm->napi); } /* diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 43dbddfa06c0..ff68b8c4ab35 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3694,20 +3694,28 @@ void ieee80211_free_hw(struct ieee80211_hw *hw); void ieee80211_restart_hw(struct ieee80211_hw *hw); /** - * ieee80211_napi_add - initialize mac80211 NAPI context - * @hw: the hardware to initialize the NAPI context on - * @napi: the NAPI context to initialize - * @napi_dev: dummy NAPI netdevice, here to not waste the space if the - * driver doesn't use NAPI - * @poll: poll function - * @weight: default weight + * ieee80211_rx_napi - receive frame from NAPI context * - * See also netif_napi_add(). + * Use this function to hand received frames to mac80211. The receive + * buffer in @skb must start with an IEEE 802.11 header. In case of a + * paged @skb is used, the driver is recommended to put the ieee80211 + * header of the frame on the linear part of the @skb to avoid memory + * allocation and/or memcpy by the stack. + * + * This function may not be called in IRQ context. Calls to this function + * for a single hardware must be synchronized against each other. Calls to + * this function, ieee80211_rx_ni() and ieee80211_rx_irqsafe() may not be + * mixed for a single hardware. Must not run concurrently with + * ieee80211_tx_status() or ieee80211_tx_status_ni(). + * + * This function must be called with BHs disabled. + * + * @hw: the hardware this frame came in on + * @skb: the buffer to receive, owned by mac80211 after this call + * @napi: the NAPI context */ -void ieee80211_napi_add(struct ieee80211_hw *hw, struct napi_struct *napi, - struct net_device *napi_dev, - int (*poll)(struct napi_struct *, int), - int weight); +void ieee80211_rx_napi(struct ieee80211_hw *hw, struct sk_buff *skb, + struct napi_struct *napi); /** * ieee80211_rx - receive frame @@ -3729,7 +3737,10 @@ void ieee80211_napi_add(struct ieee80211_hw *hw, struct napi_struct *napi, * @hw: the hardware this frame came in on * @skb: the buffer to receive, owned by mac80211 after this call */ -void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb); +static inline void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) +{ + ieee80211_rx_napi(hw, skb, NULL); +} /** * ieee80211_rx_irqsafe - receive frame diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 361bb3ca335c..7d75f93bac7d 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -202,8 +202,6 @@ enum ieee80211_packet_rx_flags { * @IEEE80211_RX_CMNTR: received on cooked monitor already * @IEEE80211_RX_BEACON_REPORTED: This frame was already reported * to cfg80211_report_obss_beacon(). - * @IEEE80211_RX_REORDER_TIMER: this frame is released by the - * reorder buffer timeout timer, not the normal RX path * * These flags are used across handling multiple interfaces * for a single frame. @@ -211,10 +209,10 @@ enum ieee80211_packet_rx_flags { enum ieee80211_rx_flags { IEEE80211_RX_CMNTR = BIT(0), IEEE80211_RX_BEACON_REPORTED = BIT(1), - IEEE80211_RX_REORDER_TIMER = BIT(2), }; struct ieee80211_rx_data { + struct napi_struct *napi; struct sk_buff *skb; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; @@ -1347,8 +1345,6 @@ struct ieee80211_local { struct ieee80211_sub_if_data __rcu *p2p_sdata; - struct napi_struct *napi; - /* virtual monitor interface */ struct ieee80211_sub_if_data __rcu *monitor_sdata; struct cfg80211_chan_def monitor_chandef; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 3c63468b4dfb..dba0a86dee18 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1132,18 +1132,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) } EXPORT_SYMBOL(ieee80211_register_hw); -void ieee80211_napi_add(struct ieee80211_hw *hw, struct napi_struct *napi, - struct net_device *napi_dev, - int (*poll)(struct napi_struct *, int), - int weight) -{ - struct ieee80211_local *local = hw_to_local(hw); - - netif_napi_add(napi_dev, napi, poll, weight); - local->napi = napi; -} -EXPORT_SYMBOL_GPL(ieee80211_napi_add); - void ieee80211_unregister_hw(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index dd6bb2a54d45..817bf22dad5a 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2148,9 +2148,8 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) /* deliver to local stack */ skb->protocol = eth_type_trans(skb, dev); memset(skb->cb, 0, sizeof(skb->cb)); - if (!(rx->flags & IEEE80211_RX_REORDER_TIMER) && - rx->local->napi) - napi_gro_receive(rx->local->napi, skb); + if (rx->napi) + napi_gro_receive(rx->napi, skb); else netif_receive_skb(skb); } @@ -3256,7 +3255,7 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) /* This is OK -- must be QoS data frame */ .security_idx = tid, .seqno_idx = tid, - .flags = IEEE80211_RX_REORDER_TIMER, + .napi = NULL, /* must be NULL to not have races */ }; struct tid_ampdu_rx *tid_agg_rx; @@ -3433,7 +3432,8 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, * be called with rcu_read_lock protection. */ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, - struct sk_buff *skb) + struct sk_buff *skb, + struct napi_struct *napi) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata; @@ -3449,6 +3449,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, memset(&rx, 0, sizeof(rx)); rx.skb = skb; rx.local = local; + rx.napi = napi; if (ieee80211_is_data(fc) || ieee80211_is_mgmt(fc)) I802_DEBUG_INC(local->dot11ReceivedFragmentCount); @@ -3550,7 +3551,8 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, * This is the receive path handler. It is called by a low level driver when an * 802.11 MPDU is received from the hardware. */ -void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) +void ieee80211_rx_napi(struct ieee80211_hw *hw, struct sk_buff *skb, + struct napi_struct *napi) { struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_rate *rate = NULL; @@ -3649,7 +3651,7 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) ieee80211_tpt_led_trig_rx(local, ((struct ieee80211_hdr *)skb->data)->frame_control, skb->len); - __ieee80211_rx_handle_packet(hw, skb); + __ieee80211_rx_handle_packet(hw, skb, napi); rcu_read_unlock(); @@ -3657,7 +3659,7 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) drop: kfree_skb(skb); } -EXPORT_SYMBOL(ieee80211_rx); +EXPORT_SYMBOL(ieee80211_rx_napi); /* This is a version of the rx handler that can be called from hard irq * context. Post the skb on the queue and schedule the tasklet */ -- cgit v1.2.3 From 0c028b5fd1bd10d5777756e571c6c1971f04062b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 12 Jun 2015 14:33:54 +0200 Subject: mac80211: remove zero-length A-MPDU subframe reporting As there's no driver using this capability and reporting zero-length A-MPDU subframes for radiotap monitoring, remove the capability to free up two RX flags. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 6 +----- net/mac80211/rx.c | 7 +------ 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index ff68b8c4ab35..7417fee18185 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -997,9 +997,6 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * @RX_FLAG_AMPDU_DETAILS: A-MPDU details are known, in particular the reference * number (@ampdu_reference) must be populated and be a distinct number for * each A-MPDU - * @RX_FLAG_AMPDU_REPORT_ZEROLEN: driver reports 0-length subframes - * @RX_FLAG_AMPDU_IS_ZEROLEN: This is a zero-length subframe, for - * monitoring purposes only * @RX_FLAG_AMPDU_LAST_KNOWN: last subframe is known, should be set on all * subframes of a single A-MPDU * @RX_FLAG_AMPDU_IS_LAST: this subframe is the last subframe of the A-MPDU @@ -1039,8 +1036,7 @@ enum mac80211_rx_flags { RX_FLAG_NO_SIGNAL_VAL = BIT(12), RX_FLAG_HT_GF = BIT(13), RX_FLAG_AMPDU_DETAILS = BIT(14), - RX_FLAG_AMPDU_REPORT_ZEROLEN = BIT(15), - RX_FLAG_AMPDU_IS_ZEROLEN = BIT(16), + /* bits 15/16 free */ RX_FLAG_AMPDU_LAST_KNOWN = BIT(17), RX_FLAG_AMPDU_IS_LAST = BIT(18), RX_FLAG_AMPDU_DELIM_CRC_ERROR = BIT(19), diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 817bf22dad5a..9d95cb8e8e95 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -122,8 +122,7 @@ static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len, hdr = (void *)(skb->data + rtap_vendor_space); if (status->flag & (RX_FLAG_FAILED_FCS_CRC | - RX_FLAG_FAILED_PLCP_CRC | - RX_FLAG_AMPDU_IS_ZEROLEN)) + RX_FLAG_FAILED_PLCP_CRC)) return true; if (unlikely(skb->len < 16 + present_fcs_len + rtap_vendor_space)) @@ -391,10 +390,6 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, cpu_to_le32(1 << IEEE80211_RADIOTAP_AMPDU_STATUS); put_unaligned_le32(status->ampdu_reference, pos); pos += 4; - if (status->flag & RX_FLAG_AMPDU_REPORT_ZEROLEN) - flags |= IEEE80211_RADIOTAP_AMPDU_REPORT_ZEROLEN; - if (status->flag & RX_FLAG_AMPDU_IS_ZEROLEN) - flags |= IEEE80211_RADIOTAP_AMPDU_IS_ZEROLEN; if (status->flag & RX_FLAG_AMPDU_LAST_KNOWN) flags |= IEEE80211_RADIOTAP_AMPDU_LAST_KNOWN; if (status->flag & RX_FLAG_AMPDU_IS_LAST) -- cgit v1.2.3 From 77c96404a4cf16ffa0720a3fbf45839cc16018ac Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 12 Jun 2015 14:40:45 +0200 Subject: mac80211: remove key TX/RX counter This counter is inherently racy (since it can be incremented by RX as well as by concurrent TX) and only available in debugfs. Instead of fixing it to be per-CPU or similar, remove it for now. If needed it should be added without races and with proper nl80211, perhaps even addressing the threshold reporting TODO item that's been there since the code was originally added. Signed-off-by: Johannes Berg --- net/mac80211/debugfs_key.c | 2 -- net/mac80211/key.h | 3 --- net/mac80211/rx.c | 1 - net/mac80211/tx.c | 1 - 4 files changed, 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index e82bf1e9d7a8..702ca122c498 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -57,7 +57,6 @@ KEY_CONF_FILE(keylen, D); KEY_CONF_FILE(keyidx, D); KEY_CONF_FILE(hw_key_idx, D); KEY_FILE(flags, X); -KEY_FILE(tx_rx_count, D); KEY_READ(ifindex, sdata->name, "%s\n"); KEY_OPS(ifindex); @@ -310,7 +309,6 @@ void ieee80211_debugfs_key_add(struct ieee80211_key *key) DEBUGFS_ADD(flags); DEBUGFS_ADD(keyidx); DEBUGFS_ADD(hw_key_idx); - DEBUGFS_ADD(tx_rx_count); DEBUGFS_ADD(algorithm); DEBUGFS_ADD(tx_spec); DEBUGFS_ADD(rx_spec); diff --git a/net/mac80211/key.h b/net/mac80211/key.h index 3f4f9eaac140..9951ef06323e 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -115,9 +115,6 @@ struct ieee80211_key { } gen; } u; - /* number of times this key has been used */ - int tx_rx_count; - #ifdef CONFIG_MAC80211_DEBUGFS struct { struct dentry *stalink; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 9d95cb8e8e95..3037bd152ffa 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1682,7 +1682,6 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (unlikely(rx->key->flags & KEY_FLAG_TAINTED)) return RX_DROP_MONITOR; - rx->key->tx_rx_count++; /* TODO: add threshold stuff again */ } else { return RX_DROP_MONITOR; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 8410bb3bf5e8..87b9b4e27d22 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -610,7 +610,6 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) if (tx->key) { bool skip_hw = false; - tx->key->tx_rx_count++; /* TODO: add threshold stuff again */ switch (tx->key->conf.cipher) { -- cgit v1.2.3 From 981d94a80174e4f33bd5015fb49051bfc2eb00d2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 12 Jun 2015 14:39:02 +0200 Subject: mac80211: support device/driver PN check for CCMP/GCMP When there are multiple RX queues, the PN checks in mac80211 cannot be used since packets might be processed out of order on different CPUs. Allow the driver to report that the PN has been checked, drivers that will use multi-queue RX will have to set this flag. For now, the flag is only valid when the frame has been decrypted, in theory that restriction doesn't have to be there, but in practice the hardware will have decrypted the frame already. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 7 ++++- net/mac80211/wpa.c | 83 +++++++++++++++++++++++++++----------------------- 2 files changed, 51 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 7417fee18185..4d3d2686f278 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -973,6 +973,10 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * @RX_FLAG_IV_STRIPPED: The IV/ICV are stripped from this frame. * If this flag is set, the stack cannot do any replay detection * hence the driver or hardware will have to do that. + * @RX_FLAG_PN_VALIDATED: Currently only valid for CCMP/GCMP frames, this + * flag indicates that the PN was verified for replay protection. + * Note that this flag is also currently only supported when a frame + * is also decrypted (ie. @RX_FLAG_DECRYPTED must be set) * @RX_FLAG_FAILED_FCS_CRC: Set this flag if the FCS check failed on * the frame. * @RX_FLAG_FAILED_PLCP_CRC: Set this flag if the PCLP check failed on @@ -1036,7 +1040,8 @@ enum mac80211_rx_flags { RX_FLAG_NO_SIGNAL_VAL = BIT(12), RX_FLAG_HT_GF = BIT(13), RX_FLAG_AMPDU_DETAILS = BIT(14), - /* bits 15/16 free */ + RX_FLAG_PN_VALIDATED = BIT(15), + /* bit 16 free */ RX_FLAG_AMPDU_LAST_KNOWN = BIT(17), RX_FLAG_AMPDU_IS_LAST = BIT(18), RX_FLAG_AMPDU_DELIM_CRC_ERROR = BIT(19), diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 943f7606527e..feb547dc8643 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -516,31 +516,34 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, return RX_DROP_UNUSABLE; } - ccmp_hdr2pn(pn, skb->data + hdrlen); + if (!(status->flag & RX_FLAG_PN_VALIDATED)) { + ccmp_hdr2pn(pn, skb->data + hdrlen); - queue = rx->security_idx; + queue = rx->security_idx; - if (memcmp(pn, key->u.ccmp.rx_pn[queue], IEEE80211_CCMP_PN_LEN) <= 0) { - key->u.ccmp.replays++; - return RX_DROP_UNUSABLE; - } + if (memcmp(pn, key->u.ccmp.rx_pn[queue], + IEEE80211_CCMP_PN_LEN) <= 0) { + key->u.ccmp.replays++; + return RX_DROP_UNUSABLE; + } - if (!(status->flag & RX_FLAG_DECRYPTED)) { - u8 aad[2 * AES_BLOCK_SIZE]; - u8 b_0[AES_BLOCK_SIZE]; - /* hardware didn't decrypt/verify MIC */ - ccmp_special_blocks(skb, pn, b_0, aad); + if (!(status->flag & RX_FLAG_DECRYPTED)) { + u8 aad[2 * AES_BLOCK_SIZE]; + u8 b_0[AES_BLOCK_SIZE]; + /* hardware didn't decrypt/verify MIC */ + ccmp_special_blocks(skb, pn, b_0, aad); + + if (ieee80211_aes_ccm_decrypt( + key->u.ccmp.tfm, b_0, aad, + skb->data + hdrlen + IEEE80211_CCMP_HDR_LEN, + data_len, + skb->data + skb->len - mic_len, mic_len)) + return RX_DROP_UNUSABLE; + } - if (ieee80211_aes_ccm_decrypt( - key->u.ccmp.tfm, b_0, aad, - skb->data + hdrlen + IEEE80211_CCMP_HDR_LEN, - data_len, - skb->data + skb->len - mic_len, mic_len)) - return RX_DROP_UNUSABLE; + memcpy(key->u.ccmp.rx_pn[queue], pn, IEEE80211_CCMP_PN_LEN); } - memcpy(key->u.ccmp.rx_pn[queue], pn, IEEE80211_CCMP_PN_LEN); - /* Remove CCMP header and MIC */ if (pskb_trim(skb, skb->len - mic_len)) return RX_DROP_UNUSABLE; @@ -739,31 +742,35 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx) return RX_DROP_UNUSABLE; } - gcmp_hdr2pn(pn, skb->data + hdrlen); + if (!(status->flag & RX_FLAG_PN_VALIDATED)) { + gcmp_hdr2pn(pn, skb->data + hdrlen); - queue = rx->security_idx; + queue = rx->security_idx; - if (memcmp(pn, key->u.gcmp.rx_pn[queue], IEEE80211_GCMP_PN_LEN) <= 0) { - key->u.gcmp.replays++; - return RX_DROP_UNUSABLE; - } + if (memcmp(pn, key->u.gcmp.rx_pn[queue], + IEEE80211_GCMP_PN_LEN) <= 0) { + key->u.gcmp.replays++; + return RX_DROP_UNUSABLE; + } - if (!(status->flag & RX_FLAG_DECRYPTED)) { - u8 aad[2 * AES_BLOCK_SIZE]; - u8 j_0[AES_BLOCK_SIZE]; - /* hardware didn't decrypt/verify MIC */ - gcmp_special_blocks(skb, pn, j_0, aad); + if (!(status->flag & RX_FLAG_DECRYPTED)) { + u8 aad[2 * AES_BLOCK_SIZE]; + u8 j_0[AES_BLOCK_SIZE]; + /* hardware didn't decrypt/verify MIC */ + gcmp_special_blocks(skb, pn, j_0, aad); + + if (ieee80211_aes_gcm_decrypt( + key->u.gcmp.tfm, j_0, aad, + skb->data + hdrlen + IEEE80211_GCMP_HDR_LEN, + data_len, + skb->data + skb->len - + IEEE80211_GCMP_MIC_LEN)) + return RX_DROP_UNUSABLE; + } - if (ieee80211_aes_gcm_decrypt( - key->u.gcmp.tfm, j_0, aad, - skb->data + hdrlen + IEEE80211_GCMP_HDR_LEN, - data_len, - skb->data + skb->len - IEEE80211_GCMP_MIC_LEN)) - return RX_DROP_UNUSABLE; + memcpy(key->u.gcmp.rx_pn[queue], pn, IEEE80211_GCMP_PN_LEN); } - memcpy(key->u.gcmp.rx_pn[queue], pn, IEEE80211_GCMP_PN_LEN); - /* Remove GCMP header and MIC */ if (pskb_trim(skb, skb->len - IEEE80211_GCMP_MIC_LEN)) return RX_DROP_UNUSABLE; -- cgit v1.2.3 From ac100ce52a2d3b6261a06939d22e4382d9aa0bb2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 16 Jun 2015 16:22:12 +0200 Subject: mac80211: duplicate station's MAC address for hash table Currently, the station hash table lookup (or iteration) must access two cachelines for each station - the one with the hash table node, and the one with the MAC address. Duplicate the MAC address next to the hash node to get rid of this. Since the MAC address is static there's no consistency problem introduced by this. Signed-off-by: Johannes Berg --- net/mac80211/sta_info.c | 3 ++- net/mac80211/sta_info.h | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 666ddac3c87c..d573a499750e 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -68,7 +68,7 @@ static const struct rhashtable_params sta_rht_params = { .nelem_hint = 3, /* start small */ .automatic_shrinking = true, .head_offset = offsetof(struct sta_info, hash_node), - .key_offset = offsetof(struct sta_info, sta.addr), + .key_offset = offsetof(struct sta_info, addr), .key_len = ETH_ALEN, .hashfn = sta_addr_hash, .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, @@ -320,6 +320,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->nonpeer_pm = NL80211_MESH_POWER_ACTIVE; #endif + memcpy(sta->addr, addr, ETH_ALEN); memcpy(sta->sta.addr, addr, ETH_ALEN); sta->local = local; sta->sdata = sdata; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index db76103b4445..422984986263 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -278,6 +278,8 @@ struct ieee80211_fast_tx { * @list: global linked list entry * @free_list: list entry for keeping track of stations to free * @hash_node: hash node for rhashtable + * @addr: station's MAC address - duplicated from public part to + * let the hash table work with just a single cacheline * @local: pointer to the global information * @sdata: virtual interface this station belongs to * @ptk: peer keys negotiated with this station, if any @@ -377,6 +379,7 @@ struct sta_info { struct list_head list, free_list; struct rcu_head rcu_head; struct rhash_head hash_node; + u8 addr[ETH_ALEN]; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS]; @@ -607,7 +610,7 @@ u32 sta_addr_hash(const void *key, u32 length, u32 seed); _sta_bucket_idx(tbl, _addr), \ hash_node) \ /* compare address and run code only if it matches */ \ - if (ether_addr_equal(_sta->sta.addr, (_addr))) + if (ether_addr_equal(_sta->addr, (_addr))) /* * Get STA info by index, BROKEN! -- cgit v1.2.3 From e414eea77d1ae1201d5252964406a22adfa9f3c2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 16 Jun 2015 12:53:10 +0200 Subject: mac80211: remove IEEE80211_RX_FRAGMENTED There's a long-standing TODO item to use this flag in the cooked monitor RX, but clearly it was never needed and now this hasn't been used by userspace for a long time, so no userspace changes could require it now. Remove the unused flag. Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 -- net/mac80211/rx.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 7d75f93bac7d..68b091a0cae1 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -181,7 +181,6 @@ typedef unsigned __bitwise__ ieee80211_rx_result; /** * enum ieee80211_packet_rx_flags - packet RX flags - * @IEEE80211_RX_FRAGMENTED: fragmented frame * @IEEE80211_RX_AMSDU: a-MSDU packet * @IEEE80211_RX_MALFORMED_ACTION_FRM: action frame is malformed * @IEEE80211_RX_DEFERRED_RELEASE: frame was subjected to receive reordering @@ -190,7 +189,6 @@ typedef unsigned __bitwise__ ieee80211_rx_result; * @rx_flags field of &struct ieee80211_rx_status. */ enum ieee80211_packet_rx_flags { - IEEE80211_RX_FRAGMENTED = BIT(2), IEEE80211_RX_AMSDU = BIT(3), IEEE80211_RX_MALFORMED_ACTION_FRM = BIT(4), IEEE80211_RX_DEFERRED_RELEASE = BIT(5), diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3037bd152ffa..3a1462810c8e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1917,7 +1917,6 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) /* Complete frame has been reassembled - process it now */ status = IEEE80211_SKB_RXCB(rx->skb); - status->rx_flags |= IEEE80211_RX_FRAGMENTED; out: ieee80211_led_rx(rx->local); @@ -3037,7 +3036,6 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) return RX_QUEUED; } -/* TODO: use IEEE80211_RX_FRAGMENTED */ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, struct ieee80211_rate *rate) { -- cgit v1.2.3 From 433f5bc1c0efc67a86433e47a14b115510fc1409 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 17 Jun 2015 10:31:00 +0200 Subject: mac80211: move mesh related station fields to own struct There are now a fairly large number of mesh fields that really aren't needed in any other modes; move those into their own structure and allocate them separately. Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 8 +-- net/mac80211/mesh.c | 2 +- net/mac80211/mesh_hwmp.c | 2 +- net/mac80211/mesh_plink.c | 177 +++++++++++++++++++++++----------------------- net/mac80211/mesh_ps.c | 42 +++++------ net/mac80211/mesh_sync.c | 16 ++--- net/mac80211/sta_info.c | 37 ++++++---- net/mac80211/sta_info.h | 88 ++++++++++++----------- 8 files changed, 196 insertions(+), 176 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 5fc7788e2ff2..c9f8f34ac728 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1150,10 +1150,10 @@ static int sta_apply_parameters(struct ieee80211_local *local, if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE) { switch (params->plink_state) { case NL80211_PLINK_ESTAB: - if (sta->plink_state != NL80211_PLINK_ESTAB) + if (sta->mesh->plink_state != NL80211_PLINK_ESTAB) changed = mesh_plink_inc_estab_count( sdata); - sta->plink_state = params->plink_state; + sta->mesh->plink_state = params->plink_state; ieee80211_mps_sta_status_update(sta); changed |= ieee80211_mps_set_sta_local_pm(sta, @@ -1165,10 +1165,10 @@ static int sta_apply_parameters(struct ieee80211_local *local, case NL80211_PLINK_OPN_RCVD: case NL80211_PLINK_CNF_RCVD: case NL80211_PLINK_HOLDING: - if (sta->plink_state == NL80211_PLINK_ESTAB) + if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) changed = mesh_plink_dec_estab_count( sdata); - sta->plink_state = params->plink_state; + sta->mesh->plink_state = params->plink_state; ieee80211_mps_sta_status_update(sta); changed |= ieee80211_mps_set_sta_local_pm(sta, diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 817098add1d6..e06a5ca7c9a9 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -158,7 +158,7 @@ void mesh_sta_cleanup(struct sta_info *sta) changed = mesh_accept_plinks_update(sdata); if (!sdata->u.mesh.user_mpm) { changed |= mesh_plink_deactivate(sta); - del_timer_sync(&sta->plink_timer); + del_timer_sync(&sta->mesh->plink_timer); } if (changed) diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 085edc1d056b..cd02810038cb 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -862,7 +862,7 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, rcu_read_lock(); sta = sta_info_get(sdata, mgmt->sa); - if (!sta || sta->plink_state != NL80211_PLINK_ESTAB) { + if (!sta || sta->mesh->plink_state != NL80211_PLINK_ESTAB) { rcu_read_unlock(); return; } diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 5438d13e2f00..1a7d98398626 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -16,7 +16,7 @@ #define PLINK_GET_LLID(p) (p + 2) #define PLINK_GET_PLID(p) (p + 4) -#define mod_plink_timer(s, t) (mod_timer(&s->plink_timer, \ +#define mod_plink_timer(s, t) (mod_timer(&s->mesh->plink_timer, \ jiffies + msecs_to_jiffies(t))) enum plink_event { @@ -72,14 +72,14 @@ static bool rssi_threshold_check(struct ieee80211_sub_if_data *sdata, * * @sta: mesh peer link to restart * - * Locking: this function must be called holding sta->plink_lock + * Locking: this function must be called holding sta->mesh->plink_lock */ static inline void mesh_plink_fsm_restart(struct sta_info *sta) { - lockdep_assert_held(&sta->plink_lock); - sta->plink_state = NL80211_PLINK_LISTEN; - sta->llid = sta->plid = sta->reason = 0; - sta->plink_retries = 0; + lockdep_assert_held(&sta->mesh->plink_lock); + sta->mesh->plink_state = NL80211_PLINK_LISTEN; + sta->mesh->llid = sta->mesh->plid = sta->mesh->reason = 0; + sta->mesh->plink_retries = 0; } /* @@ -119,7 +119,7 @@ static u32 mesh_set_short_slot_time(struct ieee80211_sub_if_data *sdata) rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata || - sta->plink_state != NL80211_PLINK_ESTAB) + sta->mesh->plink_state != NL80211_PLINK_ESTAB) continue; short_slot = false; @@ -169,7 +169,7 @@ static u32 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata) rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata || - sta->plink_state != NL80211_PLINK_ESTAB) + sta->mesh->plink_state != NL80211_PLINK_ESTAB) continue; if (sta->sta.bandwidth > IEEE80211_STA_RX_BW_20) @@ -212,18 +212,18 @@ static u32 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata) * All mesh paths with this peer as next hop will be flushed * Returns beacon changed flag if the beacon content changed. * - * Locking: the caller must hold sta->plink_lock + * Locking: the caller must hold sta->mesh->plink_lock */ static u32 __mesh_plink_deactivate(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; u32 changed = 0; - lockdep_assert_held(&sta->plink_lock); + lockdep_assert_held(&sta->mesh->plink_lock); - if (sta->plink_state == NL80211_PLINK_ESTAB) + if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) changed = mesh_plink_dec_estab_count(sdata); - sta->plink_state = NL80211_PLINK_BLOCKED; + sta->mesh->plink_state = NL80211_PLINK_BLOCKED; mesh_path_flush_by_nexthop(sta); ieee80211_mps_sta_status_update(sta); @@ -245,13 +245,13 @@ u32 mesh_plink_deactivate(struct sta_info *sta) struct ieee80211_sub_if_data *sdata = sta->sdata; u32 changed; - spin_lock_bh(&sta->plink_lock); + spin_lock_bh(&sta->mesh->plink_lock); changed = __mesh_plink_deactivate(sta); - sta->reason = WLAN_REASON_MESH_PEER_CANCELED; + sta->mesh->reason = WLAN_REASON_MESH_PEER_CANCELED; mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_CLOSE, - sta->sta.addr, sta->llid, sta->plid, - sta->reason); - spin_unlock_bh(&sta->plink_lock); + sta->sta.addr, sta->mesh->llid, sta->mesh->plid, + sta->mesh->reason); + spin_unlock_bh(&sta->mesh->plink_lock); return changed; } @@ -388,13 +388,14 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, sband = local->hw.wiphy->bands[band]; rates = ieee80211_sta_get_rates(sdata, elems, band, &basic_rates); - spin_lock_bh(&sta->plink_lock); + spin_lock_bh(&sta->mesh->plink_lock); sta->last_rx = jiffies; /* rates and capabilities don't change during peering */ - if (sta->plink_state == NL80211_PLINK_ESTAB && sta->processed_beacon) + if (sta->mesh->plink_state == NL80211_PLINK_ESTAB && + sta->mesh->processed_beacon) goto out; - sta->processed_beacon = true; + sta->mesh->processed_beacon = true; if (sta->sta.supp_rates[band] != rates) changed |= IEEE80211_RC_SUPP_RATES_CHANGED; @@ -421,7 +422,7 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, else rate_control_rate_update(local, sband, sta, changed); out: - spin_unlock_bh(&sta->plink_lock); + spin_unlock_bh(&sta->mesh->plink_lock); } static struct sta_info * @@ -436,7 +437,7 @@ __mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *hw_addr) if (!sta) return NULL; - sta->plink_state = NL80211_PLINK_LISTEN; + sta->mesh->plink_state = NL80211_PLINK_LISTEN; sta->sta.wme = true; sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); @@ -524,7 +525,7 @@ void mesh_neighbour_update(struct ieee80211_sub_if_data *sdata, goto out; if (mesh_peer_accepts_plinks(elems) && - sta->plink_state == NL80211_PLINK_LISTEN && + sta->mesh->plink_state == NL80211_PLINK_LISTEN && sdata->u.mesh.accepting_plinks && sdata->u.mesh.mshcfg.auto_open_plinks && rssi_threshold_check(sdata, sta)) @@ -554,52 +555,52 @@ static void mesh_plink_timer(unsigned long data) if (sta->sdata->local->quiescing) return; - spin_lock_bh(&sta->plink_lock); + spin_lock_bh(&sta->mesh->plink_lock); /* If a timer fires just before a state transition on another CPU, * we may have already extended the timeout and changed state by the * time we've acquired the lock and arrived here. In that case, * skip this timer and wait for the new one. */ - if (time_before(jiffies, sta->plink_timer.expires)) { + if (time_before(jiffies, sta->mesh->plink_timer.expires)) { mpl_dbg(sta->sdata, "Ignoring timer for %pM in state %s (timer adjusted)", - sta->sta.addr, mplstates[sta->plink_state]); - spin_unlock_bh(&sta->plink_lock); + sta->sta.addr, mplstates[sta->mesh->plink_state]); + spin_unlock_bh(&sta->mesh->plink_lock); return; } /* del_timer() and handler may race when entering these states */ - if (sta->plink_state == NL80211_PLINK_LISTEN || - sta->plink_state == NL80211_PLINK_ESTAB) { + if (sta->mesh->plink_state == NL80211_PLINK_LISTEN || + sta->mesh->plink_state == NL80211_PLINK_ESTAB) { mpl_dbg(sta->sdata, "Ignoring timer for %pM in state %s (timer deleted)", - sta->sta.addr, mplstates[sta->plink_state]); - spin_unlock_bh(&sta->plink_lock); + sta->sta.addr, mplstates[sta->mesh->plink_state]); + spin_unlock_bh(&sta->mesh->plink_lock); return; } mpl_dbg(sta->sdata, "Mesh plink timer for %pM fired on state %s\n", - sta->sta.addr, mplstates[sta->plink_state]); + sta->sta.addr, mplstates[sta->mesh->plink_state]); sdata = sta->sdata; mshcfg = &sdata->u.mesh.mshcfg; - switch (sta->plink_state) { + switch (sta->mesh->plink_state) { case NL80211_PLINK_OPN_RCVD: case NL80211_PLINK_OPN_SNT: /* retry timer */ - if (sta->plink_retries < mshcfg->dot11MeshMaxRetries) { + if (sta->mesh->plink_retries < mshcfg->dot11MeshMaxRetries) { u32 rand; mpl_dbg(sta->sdata, "Mesh plink for %pM (retry, timeout): %d %d\n", - sta->sta.addr, sta->plink_retries, - sta->plink_timeout); + sta->sta.addr, sta->mesh->plink_retries, + sta->mesh->plink_timeout); get_random_bytes(&rand, sizeof(u32)); - sta->plink_timeout = sta->plink_timeout + - rand % sta->plink_timeout; - ++sta->plink_retries; - mod_plink_timer(sta, sta->plink_timeout); + sta->mesh->plink_timeout = sta->mesh->plink_timeout + + rand % sta->mesh->plink_timeout; + ++sta->mesh->plink_retries; + mod_plink_timer(sta, sta->mesh->plink_timeout); action = WLAN_SP_MESH_PEERING_OPEN; break; } @@ -609,31 +610,31 @@ static void mesh_plink_timer(unsigned long data) /* confirm timer */ if (!reason) reason = WLAN_REASON_MESH_CONFIRM_TIMEOUT; - sta->plink_state = NL80211_PLINK_HOLDING; + sta->mesh->plink_state = NL80211_PLINK_HOLDING; mod_plink_timer(sta, mshcfg->dot11MeshHoldingTimeout); action = WLAN_SP_MESH_PEERING_CLOSE; break; case NL80211_PLINK_HOLDING: /* holding timer */ - del_timer(&sta->plink_timer); + del_timer(&sta->mesh->plink_timer); mesh_plink_fsm_restart(sta); break; default: break; } - spin_unlock_bh(&sta->plink_lock); + spin_unlock_bh(&sta->mesh->plink_lock); if (action) mesh_plink_frame_tx(sdata, action, sta->sta.addr, - sta->llid, sta->plid, reason); + sta->mesh->llid, sta->mesh->plid, reason); } static inline void mesh_plink_timer_set(struct sta_info *sta, u32 timeout) { - sta->plink_timer.expires = jiffies + msecs_to_jiffies(timeout); - sta->plink_timer.data = (unsigned long) sta; - sta->plink_timer.function = mesh_plink_timer; - sta->plink_timeout = timeout; - add_timer(&sta->plink_timer); + sta->mesh->plink_timer.expires = jiffies + msecs_to_jiffies(timeout); + sta->mesh->plink_timer.data = (unsigned long) sta; + sta->mesh->plink_timer.function = mesh_plink_timer; + sta->mesh->plink_timeout = timeout; + add_timer(&sta->mesh->plink_timer); } static bool llid_in_use(struct ieee80211_sub_if_data *sdata, @@ -645,7 +646,7 @@ static bool llid_in_use(struct ieee80211_sub_if_data *sdata, rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { - if (!memcmp(&sta->llid, &llid, sizeof(llid))) { + if (!memcmp(&sta->mesh->llid, &llid, sizeof(llid))) { in_use = true; break; } @@ -676,16 +677,16 @@ u32 mesh_plink_open(struct sta_info *sta) if (!test_sta_flag(sta, WLAN_STA_AUTH)) return 0; - spin_lock_bh(&sta->plink_lock); - sta->llid = mesh_get_new_llid(sdata); - if (sta->plink_state != NL80211_PLINK_LISTEN && - sta->plink_state != NL80211_PLINK_BLOCKED) { - spin_unlock_bh(&sta->plink_lock); + spin_lock_bh(&sta->mesh->plink_lock); + sta->mesh->llid = mesh_get_new_llid(sdata); + if (sta->mesh->plink_state != NL80211_PLINK_LISTEN && + sta->mesh->plink_state != NL80211_PLINK_BLOCKED) { + spin_unlock_bh(&sta->mesh->plink_lock); return 0; } - sta->plink_state = NL80211_PLINK_OPN_SNT; + sta->mesh->plink_state = NL80211_PLINK_OPN_SNT; mesh_plink_timer_set(sta, sdata->u.mesh.mshcfg.dot11MeshRetryTimeout); - spin_unlock_bh(&sta->plink_lock); + spin_unlock_bh(&sta->mesh->plink_lock); mpl_dbg(sdata, "Mesh plink: starting establishment with %pM\n", sta->sta.addr); @@ -694,7 +695,7 @@ u32 mesh_plink_open(struct sta_info *sta) changed = ieee80211_mps_local_status_update(sdata); mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_OPEN, - sta->sta.addr, sta->llid, 0, 0); + sta->sta.addr, sta->mesh->llid, 0, 0); return changed; } @@ -702,10 +703,10 @@ u32 mesh_plink_block(struct sta_info *sta) { u32 changed; - spin_lock_bh(&sta->plink_lock); + spin_lock_bh(&sta->mesh->plink_lock); changed = __mesh_plink_deactivate(sta); - sta->plink_state = NL80211_PLINK_BLOCKED; - spin_unlock_bh(&sta->plink_lock); + sta->mesh->plink_state = NL80211_PLINK_BLOCKED; + spin_unlock_bh(&sta->mesh->plink_lock); return changed; } @@ -715,12 +716,11 @@ static void mesh_plink_close(struct ieee80211_sub_if_data *sdata, enum plink_event event) { struct mesh_config *mshcfg = &sdata->u.mesh.mshcfg; - u16 reason = (event == CLS_ACPT) ? WLAN_REASON_MESH_CLOSE : WLAN_REASON_MESH_CONFIG; - sta->reason = reason; - sta->plink_state = NL80211_PLINK_HOLDING; + sta->mesh->reason = reason; + sta->mesh->plink_state = NL80211_PLINK_HOLDING; mod_plink_timer(sta, mshcfg->dot11MeshHoldingTimeout); } @@ -730,8 +730,8 @@ static u32 mesh_plink_establish(struct ieee80211_sub_if_data *sdata, struct mesh_config *mshcfg = &sdata->u.mesh.mshcfg; u32 changed = 0; - del_timer(&sta->plink_timer); - sta->plink_state = NL80211_PLINK_ESTAB; + del_timer(&sta->mesh->plink_timer); + sta->mesh->plink_state = NL80211_PLINK_ESTAB; changed |= mesh_plink_inc_estab_count(sdata); changed |= mesh_set_ht_prot_mode(sdata); changed |= mesh_set_short_slot_time(sdata); @@ -758,18 +758,18 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, u32 changed = 0; mpl_dbg(sdata, "peer %pM in state %s got event %s\n", sta->sta.addr, - mplstates[sta->plink_state], mplevents[event]); + mplstates[sta->mesh->plink_state], mplevents[event]); - spin_lock_bh(&sta->plink_lock); - switch (sta->plink_state) { + spin_lock_bh(&sta->mesh->plink_lock); + switch (sta->mesh->plink_state) { case NL80211_PLINK_LISTEN: switch (event) { case CLS_ACPT: mesh_plink_fsm_restart(sta); break; case OPN_ACPT: - sta->plink_state = NL80211_PLINK_OPN_RCVD; - sta->llid = mesh_get_new_llid(sdata); + sta->mesh->plink_state = NL80211_PLINK_OPN_RCVD; + sta->mesh->llid = mesh_get_new_llid(sdata); mesh_plink_timer_set(sta, mshcfg->dot11MeshRetryTimeout); @@ -791,11 +791,11 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, break; case OPN_ACPT: /* retry timer is left untouched */ - sta->plink_state = NL80211_PLINK_OPN_RCVD; + sta->mesh->plink_state = NL80211_PLINK_OPN_RCVD; action = WLAN_SP_MESH_PEERING_CONFIRM; break; case CNF_ACPT: - sta->plink_state = NL80211_PLINK_CNF_RCVD; + sta->mesh->plink_state = NL80211_PLINK_CNF_RCVD; mod_plink_timer(sta, mshcfg->dot11MeshConfirmTimeout); break; default: @@ -855,7 +855,7 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, case NL80211_PLINK_HOLDING: switch (event) { case CLS_ACPT: - del_timer(&sta->plink_timer); + del_timer(&sta->mesh->plink_timer); mesh_plink_fsm_restart(sta); break; case OPN_ACPT: @@ -874,17 +874,18 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, */ break; } - spin_unlock_bh(&sta->plink_lock); + spin_unlock_bh(&sta->mesh->plink_lock); if (action) { mesh_plink_frame_tx(sdata, action, sta->sta.addr, - sta->llid, sta->plid, sta->reason); + sta->mesh->llid, sta->mesh->plid, + sta->mesh->reason); /* also send confirm in open case */ if (action == WLAN_SP_MESH_PEERING_OPEN) { mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_CONFIRM, - sta->sta.addr, sta->llid, - sta->plid, 0); + sta->sta.addr, sta->mesh->llid, + sta->mesh->plid, 0); } } @@ -939,7 +940,7 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata, mpl_dbg(sdata, "Mesh plink: Action frame from non-authed peer\n"); goto out; } - if (sta->plink_state == NL80211_PLINK_BLOCKED) + if (sta->mesh->plink_state == NL80211_PLINK_BLOCKED) goto out; } @@ -954,7 +955,7 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata, if (!matches_local) event = OPN_RJCT; if (!mesh_plink_free_count(sdata) || - (sta->plid && sta->plid != plid)) + (sta->mesh->plid && sta->mesh->plid != plid)) event = OPN_IGNR; else event = OPN_ACPT; @@ -963,14 +964,14 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata, if (!matches_local) event = CNF_RJCT; if (!mesh_plink_free_count(sdata) || - sta->llid != llid || - (sta->plid && sta->plid != plid)) + sta->mesh->llid != llid || + (sta->mesh->plid && sta->mesh->plid != plid)) event = CNF_IGNR; else event = CNF_ACPT; break; case WLAN_SP_MESH_PEERING_CLOSE: - if (sta->plink_state == NL80211_PLINK_ESTAB) + if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) /* Do not check for llid or plid. This does not * follow the standard but since multiple plinks * per sta are not supported, it is necessary in @@ -981,9 +982,9 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata, * restarted. */ event = CLS_ACPT; - else if (sta->plid != plid) + else if (sta->mesh->plid != plid) event = CLS_IGNR; - else if (ie_len == 8 && sta->llid != llid) + else if (ie_len == 8 && sta->mesh->llid != llid) event = CLS_IGNR; else event = CLS_ACPT; @@ -1070,7 +1071,7 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, mpl_dbg(sdata, "Mesh plink: failed to init peer!\n"); goto unlock_rcu; } - sta->plid = plid; + sta->mesh->plid = plid; } else if (!sta && event == OPN_RJCT) { mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_CLOSE, mgmt->sa, 0, plid, @@ -1082,8 +1083,8 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, } /* 802.11-2012 13.3.7.2 - update plid on CNF if not set */ - if (!sta->plid && event == CNF_ACPT) - sta->plid = plid; + if (!sta->mesh->plid && event == CNF_ACPT) + sta->mesh->plid = plid; changed |= mesh_plink_fsm(sdata, sta, event); diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c index ad8b377b4b9f..29747f92b9b0 100644 --- a/net/mac80211/mesh_ps.c +++ b/net/mac80211/mesh_ps.c @@ -92,16 +92,16 @@ u32 ieee80211_mps_local_status_update(struct ieee80211_sub_if_data *sdata) if (sdata != sta->sdata) continue; - switch (sta->plink_state) { + switch (sta->mesh->plink_state) { case NL80211_PLINK_OPN_SNT: case NL80211_PLINK_OPN_RCVD: case NL80211_PLINK_CNF_RCVD: peering = true; break; case NL80211_PLINK_ESTAB: - if (sta->local_pm == NL80211_MESH_POWER_LIGHT_SLEEP) + if (sta->mesh->local_pm == NL80211_MESH_POWER_LIGHT_SLEEP) light_sleep_cnt++; - else if (sta->local_pm == NL80211_MESH_POWER_DEEP_SLEEP) + else if (sta->mesh->local_pm == NL80211_MESH_POWER_DEEP_SLEEP) deep_sleep_cnt++; break; default: @@ -153,19 +153,19 @@ u32 ieee80211_mps_set_sta_local_pm(struct sta_info *sta, { struct ieee80211_sub_if_data *sdata = sta->sdata; - if (sta->local_pm == pm) + if (sta->mesh->local_pm == pm) return 0; mps_dbg(sdata, "local STA operates in mode %d with %pM\n", pm, sta->sta.addr); - sta->local_pm = pm; + sta->mesh->local_pm = pm; /* * announce peer-specific power mode transition * (see IEEE802.11-2012 13.14.3.2 and 13.14.3.3) */ - if (sta->plink_state == NL80211_PLINK_ESTAB) + if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) mps_qos_null_tx(sta); return ieee80211_mps_local_status_update(sdata); @@ -197,8 +197,8 @@ void ieee80211_mps_set_frame_flags(struct ieee80211_sub_if_data *sdata, if (is_unicast_ether_addr(hdr->addr1) && ieee80211_is_data_qos(hdr->frame_control) && - sta->plink_state == NL80211_PLINK_ESTAB) - pm = sta->local_pm; + sta->mesh->plink_state == NL80211_PLINK_ESTAB) + pm = sta->mesh->local_pm; else pm = sdata->u.mesh.nonpeer_pm; @@ -241,16 +241,16 @@ void ieee80211_mps_sta_status_update(struct sta_info *sta) * use peer-specific power mode if peering is established and the * peer's power mode is known */ - if (sta->plink_state == NL80211_PLINK_ESTAB && - sta->peer_pm != NL80211_MESH_POWER_UNKNOWN) - pm = sta->peer_pm; + if (sta->mesh->plink_state == NL80211_PLINK_ESTAB && + sta->mesh->peer_pm != NL80211_MESH_POWER_UNKNOWN) + pm = sta->mesh->peer_pm; else - pm = sta->nonpeer_pm; + pm = sta->mesh->nonpeer_pm; do_buffer = (pm != NL80211_MESH_POWER_ACTIVE); /* clear the MPSP flags for non-peers or active STA */ - if (sta->plink_state != NL80211_PLINK_ESTAB) { + if (sta->mesh->plink_state != NL80211_PLINK_ESTAB) { clear_sta_flag(sta, WLAN_STA_MPSP_OWNER); clear_sta_flag(sta, WLAN_STA_MPSP_RECIPIENT); } else if (!do_buffer) { @@ -296,13 +296,13 @@ static void mps_set_sta_peer_pm(struct sta_info *sta, pm = NL80211_MESH_POWER_ACTIVE; } - if (sta->peer_pm == pm) + if (sta->mesh->peer_pm == pm) return; mps_dbg(sta->sdata, "STA %pM enters mode %d\n", sta->sta.addr, pm); - sta->peer_pm = pm; + sta->mesh->peer_pm = pm; ieee80211_mps_sta_status_update(sta); } @@ -317,13 +317,13 @@ static void mps_set_sta_nonpeer_pm(struct sta_info *sta, else pm = NL80211_MESH_POWER_ACTIVE; - if (sta->nonpeer_pm == pm) + if (sta->mesh->nonpeer_pm == pm) return; mps_dbg(sta->sdata, "STA %pM sets non-peer mode to %d\n", sta->sta.addr, pm); - sta->nonpeer_pm = pm; + sta->mesh->nonpeer_pm = pm; ieee80211_mps_sta_status_update(sta); } @@ -552,7 +552,7 @@ void ieee80211_mpsp_trigger_process(u8 *qc, struct sta_info *sta, } else { if (eosp) clear_sta_flag(sta, WLAN_STA_MPSP_RECIPIENT); - else if (sta->local_pm != NL80211_MESH_POWER_ACTIVE) + else if (sta->mesh->local_pm != NL80211_MESH_POWER_ACTIVE) set_sta_flag(sta, WLAN_STA_MPSP_RECIPIENT); if (rspi && !test_and_set_sta_flag(sta, WLAN_STA_MPSP_OWNER)) @@ -577,9 +577,9 @@ void ieee80211_mps_frame_release(struct sta_info *sta, int ac, buffer_local = 0; bool has_buffered = false; - if (sta->plink_state == NL80211_PLINK_ESTAB) + if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) has_buffered = ieee80211_check_tim(elems->tim, elems->tim_len, - sta->llid); + sta->mesh->llid); if (has_buffered) mps_dbg(sta->sdata, "%pM indicates buffered frames\n", @@ -598,7 +598,7 @@ void ieee80211_mps_frame_release(struct sta_info *sta, if (!has_buffered && !buffer_local) return; - if (sta->plink_state == NL80211_PLINK_ESTAB) + if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) mpsp_trigger_send(sta, has_buffered, !buffer_local); else mps_frame_deliver(sta, 1); diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c index 09625d6205c3..64bc22ad9496 100644 --- a/net/mac80211/mesh_sync.c +++ b/net/mac80211/mesh_sync.c @@ -127,14 +127,14 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, /* Timing offset calculation (see 13.13.2.2.2) */ t_t = le64_to_cpu(mgmt->u.beacon.timestamp); - sta->t_offset = t_t - t_r; + sta->mesh->t_offset = t_t - t_r; if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { - s64 t_clockdrift = sta->t_offset_setpoint - sta->t_offset; + s64 t_clockdrift = sta->mesh->t_offset_setpoint - sta->mesh->t_offset; msync_dbg(sdata, - "STA %pM : sta->t_offset=%lld, sta->t_offset_setpoint=%lld, t_clockdrift=%lld\n", - sta->sta.addr, (long long) sta->t_offset, - (long long) sta->t_offset_setpoint, + "STA %pM : t_offset=%lld, t_offset_setpoint=%lld, t_clockdrift=%lld\n", + sta->sta.addr, (long long) sta->mesh->t_offset, + (long long) sta->mesh->t_offset_setpoint, (long long) t_clockdrift); if (t_clockdrift > TOFFSET_MAXIMUM_ADJUSTMENT || @@ -152,12 +152,12 @@ static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, ifmsh->sync_offset_clockdrift_max = t_clockdrift; spin_unlock_bh(&ifmsh->sync_offset_lock); } else { - sta->t_offset_setpoint = sta->t_offset - TOFFSET_SET_MARGIN; + sta->mesh->t_offset_setpoint = sta->mesh->t_offset - TOFFSET_SET_MARGIN; set_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN); msync_dbg(sdata, - "STA %pM : offset was invalid, sta->t_offset=%lld\n", + "STA %pM : offset was invalid, t_offset=%lld\n", sta->sta.addr, - (long long) sta->t_offset); + (long long) sta->mesh->t_offset); } no_sync: diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index d573a499750e..9da7d2bc271a 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -249,6 +249,9 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) if (sta->sta.txq[0]) kfree(to_txq_info(sta->sta.txq[0])); kfree(rcu_dereference_raw(sta->sta.rates)); +#ifdef CONFIG_MAC80211_MESH + kfree(sta->mesh); +#endif kfree(sta); } @@ -313,11 +316,16 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, INIT_WORK(&sta->ampdu_mlme.work, ieee80211_ba_session_work); mutex_init(&sta->ampdu_mlme.mtx); #ifdef CONFIG_MAC80211_MESH - spin_lock_init(&sta->plink_lock); - if (ieee80211_vif_is_mesh(&sdata->vif) && - !sdata->u.mesh.user_mpm) - init_timer(&sta->plink_timer); - sta->nonpeer_pm = NL80211_MESH_POWER_ACTIVE; + if (ieee80211_vif_is_mesh(&sdata->vif)) { + sta->mesh = kzalloc(sizeof(*sta->mesh), gfp); + if (!sta->mesh) + goto free; + spin_lock_init(&sta->mesh->plink_lock); + if (ieee80211_vif_is_mesh(&sdata->vif) && + !sdata->u.mesh.user_mpm) + init_timer(&sta->mesh->plink_timer); + sta->mesh->nonpeer_pm = NL80211_MESH_POWER_ACTIVE; + } #endif memcpy(sta->addr, addr, ETH_ALEN); @@ -406,6 +414,9 @@ free_txq: if (sta->sta.txq[0]) kfree(to_txq_info(sta->sta.txq[0])); free: +#ifdef CONFIG_MAC80211_MESH + kfree(sta->mesh); +#endif kfree(sta); return NULL; } @@ -637,7 +648,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) } else if (ieee80211_vif_is_mesh(&sta->sdata->vif)) { ps = &sta->sdata->u.mesh.ps; /* TIM map only for 1 <= PLID <= IEEE80211_MAX_AID */ - id = sta->plid % (IEEE80211_MAX_AID + 1); + id = sta->mesh->plid % (IEEE80211_MAX_AID + 1); #endif } else { return; @@ -1957,16 +1968,16 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) BIT(NL80211_STA_INFO_PEER_PM) | BIT(NL80211_STA_INFO_NONPEER_PM); - sinfo->llid = sta->llid; - sinfo->plid = sta->plid; - sinfo->plink_state = sta->plink_state; + sinfo->llid = sta->mesh->llid; + sinfo->plid = sta->mesh->plid; + sinfo->plink_state = sta->mesh->plink_state; if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { sinfo->filled |= BIT(NL80211_STA_INFO_T_OFFSET); - sinfo->t_offset = sta->t_offset; + sinfo->t_offset = sta->mesh->t_offset; } - sinfo->local_pm = sta->local_pm; - sinfo->peer_pm = sta->peer_pm; - sinfo->nonpeer_pm = sta->nonpeer_pm; + sinfo->local_pm = sta->mesh->local_pm; + sinfo->peer_pm = sta->mesh->peer_pm; + sinfo->nonpeer_pm = sta->mesh->nonpeer_pm; #endif } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 422984986263..9e568927c080 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -269,6 +269,48 @@ struct ieee80211_fast_tx { struct rcu_head rcu_head; }; +/** + * struct mesh_sta - mesh STA information + * @plink_lock: serialize access to plink fields + * @llid: Local link ID + * @plid: Peer link ID + * @reason: Cancel reason on PLINK_HOLDING state + * @plink_retries: Retries in establishment + * @plink_state: peer link state + * @plink_timeout: timeout of peer link + * @plink_timer: peer link watch timer + * @t_offset: timing offset relative to this host + * @t_offset_setpoint: reference timing offset of this sta to be used when + * calculating clockdrift + * @local_pm: local link-specific power save mode + * @peer_pm: peer-specific power save mode towards local STA + * @nonpeer_pm: STA power save mode towards non-peer neighbors + * @processed_beacon: set to true after peer rates and capabilities are + * processed + */ +struct mesh_sta { + struct timer_list plink_timer; + + s64 t_offset; + s64 t_offset_setpoint; + + spinlock_t plink_lock; + u16 llid; + u16 plid; + u16 reason; + u8 plink_retries; + + bool processed_beacon; + + enum nl80211_plink_state plink_state; + u32 plink_timeout; + + /* mesh power save */ + enum nl80211_mesh_power_mode local_pm; + enum nl80211_mesh_power_mode peer_pm; + enum nl80211_mesh_power_mode nonpeer_pm; +}; + /** * struct sta_info - STA information * @@ -330,20 +372,7 @@ struct ieee80211_fast_tx { * @tid_seq: per-TID sequence numbers for sending to this STA * @ampdu_mlme: A-MPDU state machine state * @timer_to_tid: identity mapping to ID timers - * @plink_lock: serialize access to plink fields - * @llid: Local link ID - * @plid: Peer link ID - * @reason: Cancel reason on PLINK_HOLDING state - * @plink_retries: Retries in establishment - * @plink_state: peer link state - * @plink_timeout: timeout of peer link - * @plink_timer: peer link watch timer - * @t_offset: timing offset relative to this host - * @t_offset_setpoint: reference timing offset of this sta to be used when - * calculating clockdrift - * @local_pm: local link-specific power save mode - * @peer_pm: peer-specific power save mode towards local STA - * @nonpeer_pm: STA power save mode towards non-peer neighbors + * @mesh: mesh STA information * @debugfs: debug filesystem info * @dead: set to true when sta is unlinked * @uploaded: set to true when sta is uploaded to the driver @@ -371,8 +400,6 @@ struct ieee80211_fast_tx { * @rx_msdu: MSDUs received from this station, using IEEE80211_NUM_TID * entry for non-QoS frames * @fast_tx: TX fastpath information - * @processed_beacon: set to true after peer rates and capabilities are - * processed */ struct sta_info { /* General information, mostly static */ @@ -392,6 +419,10 @@ struct sta_info { struct ieee80211_fast_tx __rcu *fast_tx; +#ifdef CONFIG_MAC80211_MESH + struct mesh_sta *mesh; +#endif + struct work_struct drv_deliver_wk; u16 listen_interval; @@ -457,29 +488,6 @@ struct sta_info { struct sta_ampdu_mlme ampdu_mlme; u8 timer_to_tid[IEEE80211_NUM_TIDS]; -#ifdef CONFIG_MAC80211_MESH - /* - * Mesh peer link attributes, protected by plink_lock. - * TODO: move to a sub-structure that is referenced with pointer? - */ - spinlock_t plink_lock; - u16 llid; - u16 plid; - u16 reason; - u8 plink_retries; - enum nl80211_plink_state plink_state; - u32 plink_timeout; - struct timer_list plink_timer; - - s64 t_offset; - s64 t_offset_setpoint; - /* mesh power save */ - enum nl80211_mesh_power_mode local_pm; - enum nl80211_mesh_power_mode peer_pm; - enum nl80211_mesh_power_mode nonpeer_pm; - bool processed_beacon; -#endif - #ifdef CONFIG_MAC80211_DEBUGFS struct sta_info_debugfsdentries { struct dentry *dir; @@ -507,7 +515,7 @@ struct sta_info { static inline enum nl80211_plink_state sta_plink_state(struct sta_info *sta) { #ifdef CONFIG_MAC80211_MESH - return sta->plink_state; + return sta->mesh->plink_state; #endif return NL80211_PLINK_LISTEN; } -- cgit v1.2.3 From 1365770248c122dd155351e714b44fe77036292c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 17 Jun 2015 10:34:54 +0200 Subject: mac80211: move mesh STA parameters code to own function The code was always a bit awkward due to the 80-col restriction and got worse in the previous patch. Refactor it a bit into its own function to make it read nicer. Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 117 ++++++++++++++++++++++++++++------------------------- 1 file changed, 61 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index c9f8f34ac728..b145942a7624 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1019,6 +1019,65 @@ static int sta_apply_auth_flags(struct ieee80211_local *local, return 0; } +static void sta_apply_mesh_params(struct ieee80211_local *local, + struct sta_info *sta, + struct station_parameters *params) +{ +#ifdef CONFIG_MAC80211_MESH + struct ieee80211_sub_if_data *sdata = sta->sdata; + u32 changed = 0; + + if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE) { + switch (params->plink_state) { + case NL80211_PLINK_ESTAB: + if (sta->mesh->plink_state != NL80211_PLINK_ESTAB) + changed = mesh_plink_inc_estab_count(sdata); + sta->mesh->plink_state = params->plink_state; + + ieee80211_mps_sta_status_update(sta); + changed |= ieee80211_mps_set_sta_local_pm(sta, + sdata->u.mesh.mshcfg.power_mode); + break; + case NL80211_PLINK_LISTEN: + case NL80211_PLINK_BLOCKED: + case NL80211_PLINK_OPN_SNT: + case NL80211_PLINK_OPN_RCVD: + case NL80211_PLINK_CNF_RCVD: + case NL80211_PLINK_HOLDING: + if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) + changed = mesh_plink_dec_estab_count(sdata); + sta->mesh->plink_state = params->plink_state; + + ieee80211_mps_sta_status_update(sta); + changed |= ieee80211_mps_set_sta_local_pm(sta, + NL80211_MESH_POWER_UNKNOWN); + break; + default: + /* nothing */ + break; + } + } + + switch (params->plink_action) { + case NL80211_PLINK_ACTION_NO_ACTION: + /* nothing */ + break; + case NL80211_PLINK_ACTION_OPEN: + changed |= mesh_plink_open(sta); + break; + case NL80211_PLINK_ACTION_BLOCK: + changed |= mesh_plink_block(sta); + break; + } + + if (params->local_pm) + changed |= ieee80211_mps_set_sta_local_pm(sta, + params->local_pm); + + ieee80211_mbss_info_change_notify(sdata, changed); +#endif +} + static int sta_apply_parameters(struct ieee80211_local *local, struct sta_info *sta, struct station_parameters *params) @@ -1143,62 +1202,8 @@ static int sta_apply_parameters(struct ieee80211_local *local, band, false); } - if (ieee80211_vif_is_mesh(&sdata->vif)) { -#ifdef CONFIG_MAC80211_MESH - u32 changed = 0; - - if (params->sta_modify_mask & STATION_PARAM_APPLY_PLINK_STATE) { - switch (params->plink_state) { - case NL80211_PLINK_ESTAB: - if (sta->mesh->plink_state != NL80211_PLINK_ESTAB) - changed = mesh_plink_inc_estab_count( - sdata); - sta->mesh->plink_state = params->plink_state; - - ieee80211_mps_sta_status_update(sta); - changed |= ieee80211_mps_set_sta_local_pm(sta, - sdata->u.mesh.mshcfg.power_mode); - break; - case NL80211_PLINK_LISTEN: - case NL80211_PLINK_BLOCKED: - case NL80211_PLINK_OPN_SNT: - case NL80211_PLINK_OPN_RCVD: - case NL80211_PLINK_CNF_RCVD: - case NL80211_PLINK_HOLDING: - if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) - changed = mesh_plink_dec_estab_count( - sdata); - sta->mesh->plink_state = params->plink_state; - - ieee80211_mps_sta_status_update(sta); - changed |= ieee80211_mps_set_sta_local_pm(sta, - NL80211_MESH_POWER_UNKNOWN); - break; - default: - /* nothing */ - break; - } - } - - switch (params->plink_action) { - case NL80211_PLINK_ACTION_NO_ACTION: - /* nothing */ - break; - case NL80211_PLINK_ACTION_OPEN: - changed |= mesh_plink_open(sta); - break; - case NL80211_PLINK_ACTION_BLOCK: - changed |= mesh_plink_block(sta); - break; - } - - if (params->local_pm) - changed |= - ieee80211_mps_set_sta_local_pm(sta, - params->local_pm); - ieee80211_mbss_info_change_notify(sdata, changed); -#endif - } + if (ieee80211_vif_is_mesh(&sdata->vif)) + sta_apply_mesh_params(local, sta, params); /* set the STA state after all sta info from usermode has been set */ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) { -- cgit v1.2.3 From e996ec2a4ddb53164262990d33304c429709f687 Mon Sep 17 00:00:00 2001 From: Wojciech Dubowik Date: Wed, 10 Jun 2015 13:06:53 +0200 Subject: mac80211: avoid unnecessary beacon deref on CSA counter update The beacon struct is already available in many contexts that are also already in an RCU read-locked section. Avoid that by using the existing beacon struct pointer directly. Signed-off-by: Wojciech Dubowik [rewrite subject/add commit message] Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 87b9b4e27d22..7c6832f91dc3 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3210,6 +3210,16 @@ static void ieee80211_set_csa(struct ieee80211_sub_if_data *sdata, rcu_read_unlock(); } +static u8 __ieee80211_csa_update_counter(struct beacon_data *beacon) +{ + beacon->csa_current_counter--; + + /* the counter should never reach 0 */ + WARN_ON_ONCE(!beacon->csa_current_counter); + + return beacon->csa_current_counter; +} + u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); @@ -3228,11 +3238,7 @@ u8 ieee80211_csa_update_counter(struct ieee80211_vif *vif) if (!beacon) goto unlock; - beacon->csa_current_counter--; - - /* the counter should never reach 0 */ - WARN_ON_ONCE(!beacon->csa_current_counter); - count = beacon->csa_current_counter; + count = __ieee80211_csa_update_counter(beacon); unlock: rcu_read_unlock(); @@ -3332,7 +3338,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, if (beacon) { if (beacon->csa_counter_offsets[0]) { if (!is_template) - ieee80211_csa_update_counter(vif); + __ieee80211_csa_update_counter(beacon); ieee80211_set_csa(sdata, beacon); } @@ -3378,7 +3384,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, if (beacon->csa_counter_offsets[0]) { if (!is_template) - ieee80211_csa_update_counter(vif); + __ieee80211_csa_update_counter(beacon); ieee80211_set_csa(sdata, beacon); } @@ -3408,7 +3414,7 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, * for now we leave it consistent with overall * mac80211's behavior. */ - ieee80211_csa_update_counter(vif); + __ieee80211_csa_update_counter(beacon); ieee80211_set_csa(sdata, beacon); } -- cgit v1.2.3 From 6513e98e05ccb8eca77adaf93beae44aa7bf4a45 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 10 Jun 2015 20:18:55 +0300 Subject: mac80211: allow passing NULL to ieee80211_vif_to_wdev() Simply return NULL in this case, instead of crashing. This can simplify callers that would otherwise have to check for this explicitly. Signed-off-by: Johannes Berg --- net/mac80211/util.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 7fb2c7bacc8c..89e089c452c1 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -703,7 +703,12 @@ EXPORT_SYMBOL_GPL(wdev_to_ieee80211_vif); struct wireless_dev *ieee80211_vif_to_wdev(struct ieee80211_vif *vif) { - struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_sub_if_data *sdata; + + if (!vif) + return NULL; + + sdata = vif_to_sdata(vif); if (!ieee80211_sdata_running(sdata) || !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) -- cgit v1.2.3 From 7584f88f9e30d36c2c1041831121f1cd3a194bf1 Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Wed, 10 Jun 2015 20:19:37 +0300 Subject: mac80211: clear local->in_reconfig on reconfig error If reconfiguration fails, local->in_reconfig is never cleaned, resulting in rx frames being dropped next time the device is started. Signed-off-by: Eliad Peller Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/util.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 89e089c452c1..e54596f95663 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1665,6 +1665,7 @@ static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local) local->resuming = false; local->suspended = false; local->started = false; + local->in_reconfig = false; /* scheduled scan clearly can't be running any more, but tell * cfg80211 and clear local state -- cgit v1.2.3 From b98fb44ffceeac717789e8f2fb3497e6b8c5c65b Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Wed, 10 Jun 2015 20:42:59 +0300 Subject: mac80211: define TDLS wider BW support bits Allow a device to specify support for the TDLS wider-bandwidth feature. Indicate this support during TDLS setup in the ext-capab IE and set an appropriate station flag when our TDLS peer supports it. This feature gives TDLS peers the ability to use a wider channel than the base width of the BSS. For instance VHT capable TDLS peers connected on a 20MHz channel can extend the channel to 80MHz, if regulatory considerations allow it. Do not cap the bandwidth of such stations by the current BSS channel width in mac80211. Signed-off-by: Arik Nemtsov Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 ++++ net/mac80211/cfg.c | 6 ++++++ net/mac80211/debugfs.c | 1 + net/mac80211/sta_info.h | 3 +++ net/mac80211/tdls.c | 18 +++++++++++++----- net/mac80211/vht.c | 8 ++++++-- 6 files changed, 33 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 4d3d2686f278..8f61a230c482 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1887,6 +1887,9 @@ struct ieee80211_txq { * @IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS: The HW supports scanning on all bands * in one command, mac80211 doesn't have to run separate scans per band. * + * @IEEE80211_HW_TDLS_WIDER_BW: The device/driver supports wider bandwidth + * than then BSS bandwidth for a TDLS link on the base channel. + * * @NUM_IEEE80211_HW_FLAGS: number of hardware flags, used for sizing arrays */ enum ieee80211_hw_flags { @@ -1919,6 +1922,7 @@ enum ieee80211_hw_flags { IEEE80211_HW_CHANCTX_STA_CSA, IEEE80211_HW_SUPPORTS_CLONED_SKBS, IEEE80211_HW_SINGLE_SCAN_ON_ALL_BANDS, + IEEE80211_HW_TDLS_WIDER_BW, /* keep last, obviously */ NUM_IEEE80211_HW_FLAGS diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index b145942a7624..a32575bf0546 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1155,6 +1155,12 @@ static int sta_apply_parameters(struct ieee80211_local *local, params->ext_capab[3] & WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH) set_sta_flag(sta, WLAN_STA_TDLS_CHAN_SWITCH); + if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && + ieee80211_hw_check(&local->hw, TDLS_WIDER_BW) && + params->ext_capab_len >= 8 && + params->ext_capab[7] & WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED) + set_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW); + if (params->sta_modify_mask & STATION_PARAM_APPLY_UAPSD) { sta->sta.uapsd_queues = params->uapsd_queues; sta->sta.max_sp = params->max_sp; diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 2c79d777f0e4..ced6bf3be8d6 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -122,6 +122,7 @@ static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = { FLAG(CHANCTX_STA_CSA), FLAG(SUPPORTS_CLONED_SKBS), FLAG(SINGLE_SCAN_ON_ALL_BANDS), + FLAG(TDLS_WIDER_BW), /* keep last for the build bug below */ (void *)0x1 diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 9e568927c080..b9c1aaaa01ff 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -53,6 +53,8 @@ * @WLAN_STA_TDLS_CHAN_SWITCH: This TDLS peer supports TDLS channel-switching * @WLAN_STA_TDLS_OFF_CHANNEL: The local STA is currently off-channel with this * TDLS peer + * @WLAN_STA_TDLS_WIDER_BW: This TDLS peer supports working on a wider bw on + * the BSS base channel. * @WLAN_STA_UAPSD: Station requested unscheduled SP while driver was * keeping station in power-save mode, reply when the driver * unblocks the station. @@ -84,6 +86,7 @@ enum ieee80211_sta_info_flags { WLAN_STA_TDLS_INITIATOR, WLAN_STA_TDLS_CHAN_SWITCH, WLAN_STA_TDLS_OFF_CHANNEL, + WLAN_STA_TDLS_WIDER_BW, WLAN_STA_UAPSD, WLAN_STA_SP, WLAN_STA_4ADDR_EVENT, diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index ad31b2dab4f5..fec1b336d03c 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -35,20 +35,28 @@ void ieee80211_tdls_peer_del_work(struct work_struct *wk) mutex_unlock(&local->mtx); } -static void ieee80211_tdls_add_ext_capab(struct ieee80211_local *local, +static void ieee80211_tdls_add_ext_capab(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { - u8 *pos = (void *)skb_put(skb, 7); + struct ieee80211_local *local = sdata->local; bool chan_switch = local->hw.wiphy->features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH; + bool wider_band = ieee80211_hw_check(&local->hw, TDLS_WIDER_BW); + enum ieee80211_band band = ieee80211_get_sdata_band(sdata); + struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; + bool vht = sband && sband->vht_cap.vht_supported; + u8 *pos = (void *)skb_put(skb, 10); *pos++ = WLAN_EID_EXT_CAPABILITY; - *pos++ = 5; /* len */ + *pos++ = 8; /* len */ *pos++ = 0x0; *pos++ = 0x0; *pos++ = 0x0; *pos++ = chan_switch ? WLAN_EXT_CAPA4_TDLS_CHAN_SWITCH : 0; *pos++ = WLAN_EXT_CAPA5_TDLS_ENABLED; + *pos++ = 0; + *pos++ = 0; + *pos++ = (vht && wider_band) ? WLAN_EXT_CAPA8_TDLS_WIDE_BW_ENABLED : 0; } static u8 @@ -320,7 +328,7 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, offset = noffset; } - ieee80211_tdls_add_ext_capab(local, skb); + ieee80211_tdls_add_ext_capab(sdata, skb); /* add the QoS element if we support it */ if (local->hw.queues >= IEEE80211_NUM_ACS && @@ -784,7 +792,7 @@ ieee80211_tdls_build_mgmt_packet_data(struct ieee80211_sub_if_data *sdata, max(sizeof(struct ieee80211_mgmt), sizeof(struct ieee80211_tdls_data)) + 50 + /* supported rates */ - 7 + /* ext capab */ + 10 + /* ext capab */ 26 + /* max(WMM-info, WMM-param) */ 2 + max(sizeof(struct ieee80211_ht_cap), sizeof(struct ieee80211_ht_operation)) + diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index 80694d55db74..f05808d0d80f 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -308,11 +308,15 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; enum ieee80211_sta_rx_bandwidth bw; + enum nl80211_chan_width bss_width = sdata->vif.bss_conf.chandef.width; - bw = ieee80211_chan_width_to_rx_bw(sdata->vif.bss_conf.chandef.width); - bw = min(bw, ieee80211_sta_cap_rx_bw(sta)); + bw = ieee80211_sta_cap_rx_bw(sta); bw = min(bw, sta->cur_max_bandwidth); + /* do not cap the BW of TDLS WIDER_BW peers by the bss */ + if (!test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW)) + bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width)); + return bw; } -- cgit v1.2.3 From 0fabfaafec3ae017fc7c82997035872ff385752f Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Wed, 10 Jun 2015 20:41:23 +0300 Subject: mac80211: upgrade BW of TDLS peers when possible Define a station chandef, to be used for wider-bw TDLS peers. When both peers support the feature, upgrade the channel bandwidth to the maximum allowed by both peers and regulatory. Currently widths up to 80MHz are supported in the 5GHz band. When a TDLS peer connects/disconnects recalculate the channel type of the current chanctx. Make the chanctx width calculation consider wider-bw TDLS peers and similarly fix the max_required_bw calculation for the chanctx min_def. Since the sta->bandwidth is calculated only later on, take bss_conf.chandef.width as the minimal width for station interface. Set the upgraded channel width in the VHT-operation set during TDLS setup. Signed-off-by: Arik Nemtsov Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/chan.c | 31 +++++++++++-- net/mac80211/ieee80211_i.h | 3 ++ net/mac80211/sta_info.h | 4 ++ net/mac80211/tdls.c | 105 ++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 132 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index f01c18a3160e..1d1b9b7bdefe 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -190,7 +190,7 @@ ieee80211_find_reservation_chanctx(struct ieee80211_local *local, return NULL; } -static enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta) +enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta) { switch (sta->bandwidth) { case IEEE80211_STA_RX_BW_20: @@ -264,9 +264,17 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, case NL80211_IFTYPE_AP_VLAN: width = ieee80211_get_max_required_bw(sdata); break; + case NL80211_IFTYPE_STATION: + /* + * The ap's sta->bandwidth is not set yet at this + * point, so take the width from the chandef, but + * account also for TDLS peers + */ + width = max(vif->bss_conf.chandef.width, + ieee80211_get_max_required_bw(sdata)); + break; case NL80211_IFTYPE_P2P_DEVICE: continue; - case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_ADHOC: case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_MESH_POINT: @@ -554,12 +562,13 @@ static void ieee80211_free_chanctx(struct ieee80211_local *local, kfree_rcu(ctx, rcu_head); } -static void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx) +void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx) { struct ieee80211_chanctx_conf *conf = &ctx->conf; struct ieee80211_sub_if_data *sdata; const struct cfg80211_chan_def *compat = NULL; + struct sta_info *sta; lockdep_assert_held(&local->chanctx_mtx); @@ -581,6 +590,20 @@ static void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, if (WARN_ON_ONCE(!compat)) break; } + + /* TDLS peers can sometimes affect the chandef width */ + list_for_each_entry_rcu(sta, &local->sta_list, list) { + if (!sta->uploaded || + !test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW) || + !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || + !sta->tdls_chandef.chan) + continue; + + compat = cfg80211_chandef_compatible(&sta->tdls_chandef, + compat); + if (WARN_ON_ONCE(!compat)) + break; + } rcu_read_unlock(); if (!compat) diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 68b091a0cae1..6376c673a9fe 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2033,6 +2033,9 @@ int ieee80211_check_combinations(struct ieee80211_sub_if_data *sdata, enum ieee80211_chanctx_mode chanmode, u8 radar_detect); int ieee80211_max_num_channels(struct ieee80211_local *local); +enum nl80211_chan_width ieee80211_get_sta_bw(struct ieee80211_sta *sta); +void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx); /* TDLS */ int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index b9c1aaaa01ff..0fbf3f348446 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -403,6 +403,8 @@ struct mesh_sta { * @rx_msdu: MSDUs received from this station, using IEEE80211_NUM_TID * entry for non-QoS frames * @fast_tx: TX fastpath information + * @tdls_chandef: a TDLS peer can have a wider chandef that is compatible to + * the BSS one. */ struct sta_info { /* General information, mostly static */ @@ -511,6 +513,8 @@ struct sta_info { u8 reserved_tid; + struct cfg80211_chan_def tdls_chandef; + /* keep last! */ struct ieee80211_sta sta; }; diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index fec1b336d03c..fb846cb047d6 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -291,6 +291,60 @@ static void ieee80211_tdls_add_wmm_param_ie(struct ieee80211_sub_if_data *sdata, } } +static void +ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta) +{ + /* IEEE802.11ac-2013 Table E-4 */ + u16 centers_80mhz[] = { 5210, 5290, 5530, 5610, 5690, 5775 }; + struct cfg80211_chan_def uc = sta->tdls_chandef; + enum nl80211_chan_width max_width = ieee80211_get_sta_bw(&sta->sta); + int i; + + /* only support upgrading non-narrow channels up to 80Mhz */ + if (max_width == NL80211_CHAN_WIDTH_5 || + max_width == NL80211_CHAN_WIDTH_10) + return; + + if (max_width > NL80211_CHAN_WIDTH_80) + max_width = NL80211_CHAN_WIDTH_80; + + if (uc.width == max_width) + return; + /* + * Channel usage constrains in the IEEE802.11ac-2013 specification only + * allow expanding a 20MHz channel to 80MHz in a single way. In + * addition, there are no 40MHz allowed channels that are not part of + * the allowed 80MHz range in the 5GHz spectrum (the relevant one here). + */ + for (i = 0; i < ARRAY_SIZE(centers_80mhz); i++) + if (abs(uc.chan->center_freq - centers_80mhz[i]) <= 30) { + uc.center_freq1 = centers_80mhz[i]; + uc.width = NL80211_CHAN_WIDTH_80; + break; + } + + if (!uc.center_freq1) + return; + + /* proceed to downgrade the chandef until usable or the same */ + while (uc.width > max_width && + !cfg80211_reg_can_beacon(sdata->local->hw.wiphy, + &uc, sdata->wdev.iftype)) + ieee80211_chandef_downgrade(&uc); + + if (!cfg80211_chandef_identical(&uc, &sta->tdls_chandef)) { + tdls_dbg(sdata, "TDLS ch width upgraded %d -> %d\n", + sta->tdls_chandef.width, uc.width); + + /* + * the station is not yet authorized when BW upgrade is done, + * locking is not required + */ + sta->tdls_chandef = uc; + } +} + static void ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, const u8 *peer, @@ -358,15 +412,17 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, offset = noffset; } - rcu_read_lock(); + mutex_lock(&local->sta_mtx); /* we should have the peer STA if we're already responding */ if (action_code == WLAN_TDLS_SETUP_RESPONSE) { sta = sta_info_get(sdata, peer); if (WARN_ON_ONCE(!sta)) { - rcu_read_unlock(); + mutex_unlock(&local->sta_mtx); return; } + + sta->tdls_chandef = sdata->vif.bss_conf.chandef; } ieee80211_tdls_add_oper_classes(sdata, skb); @@ -456,9 +512,16 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, pos = skb_put(skb, sizeof(struct ieee80211_vht_cap) + 2); ieee80211_ie_build_vht_cap(pos, &vht_cap, vht_cap.cap); + + /* + * if both peers support WIDER_BW, we can expand the chandef to + * a wider compatible one, up to 80MHz + */ + if (test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW)) + ieee80211_tdls_chandef_vht_upgrade(sdata, sta); } - rcu_read_unlock(); + mutex_unlock(&local->sta_mtx); /* add any remaining IEs */ if (extra_ies_len) { @@ -482,15 +545,17 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, enum ieee80211_band band = ieee80211_get_sdata_band(sdata); u8 *pos; - rcu_read_lock(); + mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, peer); ap_sta = sta_info_get(sdata, ifmgd->bssid); if (WARN_ON_ONCE(!sta || !ap_sta)) { - rcu_read_unlock(); + mutex_unlock(&local->sta_mtx); return; } + sta->tdls_chandef = sdata->vif.bss_conf.chandef; + /* add any custom IEs that go before the QoS IE */ if (extra_ies_len) { static const u8 before_qos[] = { @@ -538,12 +603,19 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, /* only include VHT-operation if not on the 2.4GHz band */ if (band != IEEE80211_BAND_2GHZ && sta->sta.vht_cap.vht_supported) { + /* + * if both peers support WIDER_BW, we can expand the chandef to + * a wider compatible one, up to 80MHz + */ + if (test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW)) + ieee80211_tdls_chandef_vht_upgrade(sdata, sta); + pos = skb_put(skb, 2 + sizeof(struct ieee80211_vht_operation)); ieee80211_ie_build_vht_oper(pos, &sta->sta.vht_cap, - &sdata->vif.bss_conf.chandef); + &sta->tdls_chandef); } - rcu_read_unlock(); + mutex_unlock(&local->sta_mtx); /* add any remaining IEs */ if (extra_ies_len) { @@ -1154,6 +1226,22 @@ int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev, return ret; } +static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx_conf *conf; + struct ieee80211_chanctx *ctx; + + mutex_lock(&local->chanctx_mtx); + conf = rcu_dereference_protected(sdata->vif.chanctx_conf, + lockdep_is_held(&local->chanctx_mtx)); + if (conf) { + ctx = container_of(conf, struct ieee80211_chanctx, conf); + ieee80211_recalc_chanctx_chantype(local, ctx); + } + mutex_unlock(&local->chanctx_mtx); +} + int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, const u8 *peer, enum nl80211_tdls_operation oper) { @@ -1190,6 +1278,8 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, break; } + iee80211_tdls_recalc_chanctx(sdata); + rcu_read_lock(); sta = sta_info_get(sdata, peer); if (!sta) { @@ -1221,6 +1311,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, ieee80211_flush_queues(local, sdata, false); ret = sta_info_destroy_addr(sdata, peer); + iee80211_tdls_recalc_chanctx(sdata); break; default: ret = -ENOTSUPP; -- cgit v1.2.3 From 703ee73a41a74210dde9050c6669053866b133a0 Mon Sep 17 00:00:00 2001 From: Alexis Green Date: Wed, 10 Jun 2015 11:02:09 -0700 Subject: mac80211: mesh: add missing case to PERR processing When the nexthop is unable to resolve its own nexthop it will send back a PERR with a zero target_sn. According to section 13.10.11.4.3 step b in the 2012 standard that perr should be forwarded and the associated mpath->sn should be incremented. Neither one of those was happening which is rather bad because the originator was not told that packets are black holing. Signed-off-by: Alexis Green CC: Jesse Jones Signed-off-by: Johannes Berg --- net/mac80211/mesh_hwmp.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index cd02810038cb..d31d48f403f5 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -736,9 +736,12 @@ static void hwmp_perr_frame_process(struct ieee80211_sub_if_data *sdata, if (mpath->flags & MESH_PATH_ACTIVE && ether_addr_equal(ta, sta->sta.addr) && (!(mpath->flags & MESH_PATH_SN_VALID) || - SN_GT(target_sn, mpath->sn))) { + SN_GT(target_sn, mpath->sn) || target_sn == 0)) { mpath->flags &= ~MESH_PATH_ACTIVE; - mpath->sn = target_sn; + if (target_sn != 0) + mpath->sn = target_sn; + else + mpath->sn += 1; spin_unlock_bh(&mpath->state_lock); if (!ifmsh->mshcfg.dot11MeshForwarding) goto endperr; -- cgit v1.2.3 From d82547106ff9dee43e6ee4f4b3d70b5314ae266f Mon Sep 17 00:00:00 2001 From: Jesse Jones Date: Fri, 12 Jun 2015 14:13:09 -0700 Subject: mac80211: mesh: don't invalidate SN on discovery failure The 2012 spec mentions that path SNs can be invalid when created (see section 13.10.8.4 table 13-9) but AFAICT never talks about invalidating SNs. Which makes sense: if we have figured out the path to a target at a certain SN then we want to remember that fact. Failing to do so can lead to routing loops because if we don't have a valid SN then we have no way of knowing whether an incoming path message leads to or away from the target. However currently when discovery fails we zero out mpath->flags which clears MESH_PATH_SN_VALID. This patch fixes that so that only the discovery relevant flags are cleared. Signed-off-by: Alexis Green Signed-off-by: Johannes Berg --- net/mac80211/mesh_hwmp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index d31d48f403f5..5ed38c5a998f 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1179,7 +1179,9 @@ void mesh_path_timer(unsigned long data) spin_unlock_bh(&mpath->state_lock); mesh_queue_preq(mpath, 0); } else { - mpath->flags = 0; + mpath->flags &= ~(MESH_PATH_RESOLVING | + MESH_PATH_RESOLVED | + MESH_PATH_REQ_QUEUED); mpath->exp_time = jiffies; spin_unlock_bh(&mpath->state_lock); if (!mpath->is_gate && mesh_gate_num(sdata) > 0) { -- cgit v1.2.3 From d8f0300a7aca5cd9208112104c64d894ad82da1f Mon Sep 17 00:00:00 2001 From: Jesse Jones Date: Fri, 12 Jun 2015 15:38:07 -0700 Subject: mac80211: mac80211: Check SN for deactivated mpaths When processing a PREQ or PREP it's critical to use the incoming SN. If that is improperly done routing loops and other types of badness can happen. But the code was always processing path messages for deactivated paths. This path fixes that so that if we have a valid SN then we use it to verify that it is a message we can accept. For reference the relevant section of the standard is 13.10.8.4 which doesn't address the deactivated path case at all. I also included a special case for when our peer reboots or restarts networking. This is an important case because without it there can be a very long delay before we accept path messages from that peer. It's also a simple case and intimately associated with processing messages for deactivated paths so I used one patch instead of two. Signed-off-by: Alexis Green Signed-off-by: Johannes Berg --- net/mac80211/mesh_hwmp.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'net') diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 5ed38c5a998f..f053213e5adb 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -79,6 +79,12 @@ static inline u16 u16_field_get(const u8 *preq_elem, int offset, bool ae) #define MSEC_TO_TU(x) (x*1000/1024) #define SN_GT(x, y) ((s32)(y - x) < 0) #define SN_LT(x, y) ((s32)(x - y) < 0) +#define MAX_SANE_SN_DELTA 32 + +static inline u32 SN_DELTA(u32 x, u32 y) +{ + return x >= y ? x - y : y - x; +} #define net_traversal_jiffies(s) \ msecs_to_jiffies(s->u.mesh.mshcfg.dot11MeshHWMPnetDiameterTraversalTime) @@ -441,6 +447,26 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, process = false; fresh_info = false; } + } else if (!(mpath->flags & MESH_PATH_ACTIVE)) { + bool have_sn, newer_sn, bounced; + + have_sn = mpath->flags & MESH_PATH_SN_VALID; + newer_sn = have_sn && SN_GT(orig_sn, mpath->sn); + bounced = have_sn && + (SN_DELTA(orig_sn, mpath->sn) > + MAX_SANE_SN_DELTA); + + if (!have_sn || newer_sn) { + /* if SN is newer than what we had + * then we can take it */; + } else if (bounced) { + /* if SN is way different than what + * we had then assume the other side + * rebooted or restarted */; + } else { + process = false; + fresh_info = false; + } } } else { mpath = mesh_path_add(sdata, orig_addr); -- cgit v1.2.3 From 3633ebebab2bbe88124388b7620442315c968e8f Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Sat, 13 Jun 2015 10:16:31 -0400 Subject: mac80211: enable assoc check for mesh interfaces We already set a station to be associated when peering completes, both in user space and in the kernel. Thus we should always have an associated sta before sending data frames to that station. Failure to check assoc state can cause crashes in the lower-level driver due to transmitting unicast data frames before driver sta structures (e.g. ampdu state in ath9k) are initialized. This occurred when forwarding in the presence of fixed mesh paths: frames were transmitted to stations with whom we hadn't yet completed peering. Cc: stable@vger.kernel.org Reported-by: Alexis Green Tested-by: Jesse Jones Signed-off-by: Bob Copeland Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 7c6832f91dc3..c0d6af809640 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -311,9 +311,6 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx) if (tx->sdata->vif.type == NL80211_IFTYPE_WDS) return TX_CONTINUE; - if (tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT) - return TX_CONTINUE; - if (tx->flags & IEEE80211_TX_PS_BUFFERED) return TX_CONTINUE; -- cgit v1.2.3 From d51c2ea3704be07f030c78d57641d6b972e301ee Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Sun, 14 Jun 2015 16:53:46 +0300 Subject: mac80211: TDLS: correctly configure SMPS state The IEEE802.11-2012 specification is vague regarding SMPS operation during TDLS. It does not define a clear way to transition between SMPS states. To avoid interop issues, set SMPS to off when TDLS peers are connected. Accomplish this by extending the definition of the AUTOMATIC state. If the driver forces a state other than OFF, disconnect all TDLS peers. While at it, avoid changing the SMPS state of the peer STA. We have no way to control it, so try and behave correctly towards it. Move the TDLS peer-teardown function to where the rest of the TDLS code resides. Signed-off-by: Arik Nemtsov Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 21 ++++++++++++++++++--- net/mac80211/ieee80211_i.h | 1 + net/mac80211/mlme.c | 18 ------------------ net/mac80211/tdls.c | 36 ++++++++++++++++++++++++++++++++---- 4 files changed, 51 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index a32575bf0546..5789d8353505 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2368,6 +2368,8 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata, const u8 *ap; enum ieee80211_smps_mode old_req; int err; + struct sta_info *sta; + bool tdls_peer_found = false; lockdep_assert_held(&sdata->wdev.mtx); @@ -2392,11 +2394,22 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata, ap = sdata->u.mgd.associated->bssid; + rcu_read_lock(); + list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { + if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded || + !test_sta_flag(sta, WLAN_STA_AUTHORIZED)) + continue; + + tdls_peer_found = true; + break; + } + rcu_read_unlock(); + if (smps_mode == IEEE80211_SMPS_AUTOMATIC) { - if (sdata->u.mgd.powersave) - smps_mode = IEEE80211_SMPS_DYNAMIC; - else + if (tdls_peer_found || !sdata->u.mgd.powersave) smps_mode = IEEE80211_SMPS_OFF; + else + smps_mode = IEEE80211_SMPS_DYNAMIC; } /* send SM PS frame to AP */ @@ -2404,6 +2417,8 @@ int __ieee80211_request_smps_mgd(struct ieee80211_sub_if_data *sdata, ap, ap); if (err) sdata->u.mgd.req_smps = old_req; + else if (smps_mode != IEEE80211_SMPS_OFF && tdls_peer_found) + ieee80211_teardown_tdls_peers(sdata); return err; } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 6376c673a9fe..dd131e9b41d7 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2054,6 +2054,7 @@ void ieee80211_tdls_cancel_channel_switch(struct wiphy *wiphy, const u8 *addr); void ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); +void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata); extern const struct ethtool_ops ieee80211_ethtool_ops; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index ae5d6c48272d..6332ff705ec3 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1096,24 +1096,6 @@ static void ieee80211_chswitch_timer(unsigned long data) ieee80211_queue_work(&sdata->local->hw, &sdata->u.mgd.chswitch_work); } -static void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata) -{ - struct sta_info *sta; - u16 reason = WLAN_REASON_TDLS_TEARDOWN_UNSPECIFIED; - - rcu_read_lock(); - list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { - if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded || - !test_sta_flag(sta, WLAN_STA_AUTHORIZED)) - continue; - - ieee80211_tdls_oper_request(&sdata->vif, sta->sta.addr, - NL80211_TDLS_TEARDOWN, reason, - GFP_ATOMIC); - } - rcu_read_unlock(); -} - static void ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, u64 timestamp, u32 device_timestamp, diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index fb846cb047d6..8536789da17d 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -4,6 +4,7 @@ * Copyright 2006-2010 Johannes Berg * Copyright 2014, Intel Corporation * Copyright 2014 Intel Mobile Communications GmbH + * Copyright 2015 Intel Deutschland GmbH * * This file is GPLv2 as found in COPYING. */ @@ -448,10 +449,6 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, ieee80211_ie_build_ht_cap(pos, &ht_cap, ht_cap.cap); } else if (action_code == WLAN_TDLS_SETUP_RESPONSE && ht_cap.ht_supported && sta->sta.ht_cap.ht_supported) { - /* disable SMPS in TDLS responder */ - sta->sta.ht_cap.cap |= WLAN_HT_CAP_SM_PS_DISABLED - << IEEE80211_HT_CAP_SM_PS_SHIFT; - /* the peer caps are already intersected with our own */ memcpy(&ht_cap, &sta->sta.ht_cap, sizeof(ht_cap)); @@ -1063,8 +1060,17 @@ ieee80211_tdls_mgmt_setup(struct wiphy *wiphy, struct net_device *dev, { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; + enum ieee80211_smps_mode smps_mode = sdata->u.mgd.driver_smps_mode; int ret; + /* don't support setup with forced SMPS mode that's not off */ + if (smps_mode != IEEE80211_SMPS_AUTOMATIC && + smps_mode != IEEE80211_SMPS_OFF) { + tdls_dbg(sdata, "Aborting TDLS setup due to SMPS mode %d\n", + smps_mode); + return -ENOTSUPP; + } + mutex_lock(&local->mtx); /* we don't support concurrent TDLS peer setups */ @@ -1323,6 +1329,10 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev, eth_zero_addr(sdata->u.mgd.tdls_peer); } + if (ret == 0) + ieee80211_queue_work(&sdata->local->hw, + &sdata->u.mgd.request_smps_work); + mutex_unlock(&local->mtx); return ret; } @@ -1819,3 +1829,21 @@ void ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, return; } } + +void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata) +{ + struct sta_info *sta; + u16 reason = WLAN_REASON_TDLS_TEARDOWN_UNSPECIFIED; + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) { + if (!sta->sta.tdls || sta->sdata != sdata || !sta->uploaded || + !test_sta_flag(sta, WLAN_STA_AUTHORIZED)) + continue; + + ieee80211_tdls_oper_request(&sdata->vif, sta->sta.addr, + NL80211_TDLS_TEARDOWN, reason, + GFP_ATOMIC); + } + rcu_read_unlock(); +} -- cgit v1.2.3 From 932e628da2fe9be759e19b14e3e5bf1e0e6f0984 Mon Sep 17 00:00:00 2001 From: Chun-Yeow Yeoh Date: Mon, 15 Jun 2015 11:58:53 +0800 Subject: mac80211: mesh process the target only subfield for mesh hwmp This patch does the following: - Remove unnecessary flags field used by PERR element - Use the per target flags defined in - Process the target only subfield based on case E2 of IEEE802.11-2012 13.10.9.3 Signed-off-by: Chun-Yeow Yeoh Signed-off-by: Johannes Berg --- net/mac80211/mesh_hwmp.c | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index f053213e5adb..be635341c802 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -19,15 +19,6 @@ #define MAX_PREQ_QUEUE_LEN 64 -/* Destination only */ -#define MP_F_DO 0x1 -/* Reply and forward */ -#define MP_F_RF 0x2 -/* Unknown Sequence Number */ -#define MP_F_USN 0x01 -/* Reason code Present */ -#define MP_F_RCODE 0x02 - static void mesh_queue_preq(struct mesh_path *, u8); static inline u32 u32_field_get(const u8 *preq_elem, int offset, bool ae) @@ -285,15 +276,10 @@ int mesh_path_error_tx(struct ieee80211_sub_if_data *sdata, *pos++ = ttl; /* number of destinations */ *pos++ = 1; - /* - * flags bit, bit 1 is unset if we know the sequence number and - * bit 2 is set if we have a reason code + /* Flags field has AE bit only as defined in + * sec 8.4.2.117 IEEE802.11-2012 */ *pos = 0; - if (!target_sn) - *pos |= MP_F_USN; - if (target_rcode) - *pos |= MP_F_RCODE; pos++; memcpy(pos, target, ETH_ALEN); pos += ETH_ALEN; @@ -596,15 +582,13 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata, SN_LT(mpath->sn, target_sn)) { mpath->sn = target_sn; mpath->flags |= MESH_PATH_SN_VALID; - } else if ((!(target_flags & MP_F_DO)) && + } else if ((!(target_flags & IEEE80211_PREQ_TO_FLAG)) && (mpath->flags & MESH_PATH_ACTIVE)) { reply = true; target_metric = mpath->metric; target_sn = mpath->sn; - if (target_flags & MP_F_RF) - target_flags |= MP_F_DO; - else - forward = false; + /* Case E2 of sec 13.10.9.3 IEEE 802.11-2012*/ + target_flags |= IEEE80211_PREQ_TO_FLAG; } } rcu_read_unlock(); @@ -1003,7 +987,7 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata) struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct mesh_preq_queue *preq_node; struct mesh_path *mpath; - u8 ttl, target_flags; + u8 ttl, target_flags = 0; const u8 *da; u32 lifetime; @@ -1062,9 +1046,9 @@ void mesh_path_start_discovery(struct ieee80211_sub_if_data *sdata) } if (preq_node->flags & PREQ_Q_F_REFRESH) - target_flags = MP_F_DO; + target_flags |= IEEE80211_PREQ_TO_FLAG; else - target_flags = MP_F_RF; + target_flags &= ~IEEE80211_PREQ_TO_FLAG; spin_unlock_bh(&mpath->state_lock); da = (mpath->is_root) ? mpath->rann_snd_addr : broadcast_addr; -- cgit v1.2.3 From a3ebb4e1b76346156e8e7233c262ce24e3a86a24 Mon Sep 17 00:00:00 2001 From: Krishna Chaitanya Date: Fri, 12 Jun 2015 02:34:52 +0530 Subject: mac80211: minstrel_ht: handle peers in dynamic SMPS In case of Dynamic SMPS enable RTS/CTS for all rates. Signed-off-by: Chaitanya T K [change comment] Signed-off-by: Johannes Berg --- net/mac80211/rc80211_minstrel_ht.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 543b67233535..3928dbd24e25 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -867,7 +867,13 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, else idx = index % MCS_GROUP_RATES + (group->streams - 1) * 8; - if (offset > 0) { + /* enable RTS/CTS if needed: + * - if station is in dynamic SMPS (and streams > 1) + * - for fallback rates, to increase chances of getting through + */ + if (offset > 0 && + (mi->sta->smps_mode == IEEE80211_SMPS_DYNAMIC && + group->streams > 1)) { ratetbl->rate[offset].count = ratetbl->rate[offset].count_rts; flags |= IEEE80211_TX_RC_USE_RTS_CTS; } -- cgit v1.2.3 From a76d5e0a2311ad6b5a8bfa92d3d627194c8c389a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 18 Jun 2015 16:20:08 +0200 Subject: mac80211: mesh: move fail_avg into mesh struct This value is only used in mesh, so move it into the new mesh sub-struct of the station info. Signed-off-by: Johannes Berg --- net/mac80211/mesh_hwmp.c | 9 +++++---- net/mac80211/sta_info.h | 7 ++++--- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index be635341c802..d80e0a4c16cf 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -308,8 +308,9 @@ void ieee80211s_update_metric(struct ieee80211_local *local, failed = !(txinfo->flags & IEEE80211_TX_STAT_ACK); /* moving average, scaled to 100 */ - sta->fail_avg = ((80 * sta->fail_avg + 5) / 100 + 20 * failed); - if (sta->fail_avg > 95) + sta->mesh->fail_avg = + ((80 * sta->mesh->fail_avg + 5) / 100 + 20 * failed); + if (sta->mesh->fail_avg > 95) mesh_plink_broken(sta); } @@ -325,7 +326,7 @@ static u32 airtime_link_metric_get(struct ieee80211_local *local, u32 tx_time, estimated_retx; u64 result; - if (sta->fail_avg >= 100) + if (sta->mesh->fail_avg >= 100) return MAX_METRIC; sta_set_rate_info_tx(sta, &sta->last_tx_rate, &rinfo); @@ -333,7 +334,7 @@ static u32 airtime_link_metric_get(struct ieee80211_local *local, if (WARN_ON(!rate)) return MAX_METRIC; - err = (sta->fail_avg << ARITH_SHIFT) / 100; + err = (sta->mesh->fail_avg << ARITH_SHIFT) / 100; /* bitrate is in units of 100 Kbps, while we need rate in units of * 1Mbps. This will be corrected on tx_time computation. diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 0fbf3f348446..6dcb33484eac 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -290,6 +290,7 @@ struct ieee80211_fast_tx { * @nonpeer_pm: STA power save mode towards non-peer neighbors * @processed_beacon: set to true after peer rates and capabilities are * processed + * @fail_avg: moving percentage of failed MSDUs */ struct mesh_sta { struct timer_list plink_timer; @@ -312,6 +313,9 @@ struct mesh_sta { enum nl80211_mesh_power_mode local_pm; enum nl80211_mesh_power_mode peer_pm; enum nl80211_mesh_power_mode nonpeer_pm; + + /* moving percentage of failed MSDUs */ + unsigned int fail_avg; }; /** @@ -369,7 +373,6 @@ struct mesh_sta { * @tx_filtered_count: number of frames the hardware filtered for this STA * @tx_retry_failed: number of frames that failed retry * @tx_retry_count: total number of retries for frames to this STA - * @fail_avg: moving percentage of failed MSDUs * @tx_packets: number of RX/TX MSDUs * @tx_bytes: number of bytes transmitted to this STA * @tid_seq: per-TID sequence numbers for sending to this STA @@ -470,8 +473,6 @@ struct sta_info { /* Updated from TX status path only, no locking requirements */ unsigned long tx_filtered_count; unsigned long tx_retry_failed, tx_retry_count; - /* moving percentage of failed MSDUs */ - unsigned int fail_avg; /* Updated from TX path only, no locking requirements */ u64 tx_packets[IEEE80211_NUM_ACS]; -- cgit v1.2.3 From 69f132236827ce7d4531846cc2b9447dd5620aff Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 26 Jun 2015 09:21:01 +0200 Subject: mac80211: shrink struct ieee80211_fragment_entry Most of the fields in this struct use too wide types, change that to shrink the struct from 64 to 48 bytes (on 64-bit.) This results in a total saving of 64 bytes for each interface. Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index dd131e9b41d7..52930e91c0fd 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -84,13 +84,13 @@ struct ieee80211_local; #define IEEE80211_DEAUTH_FRAME_LEN (24 /* hdr */ + 2 /* reason */) struct ieee80211_fragment_entry { - unsigned long first_frag_time; - unsigned int seq; - unsigned int rx_queue; - unsigned int last_frag; - unsigned int extra_len; struct sk_buff_head skb_list; - int ccmp; /* Whether fragments were encrypted with CCMP */ + unsigned long first_frag_time; + u16 seq; + u16 extra_len; + u16 last_frag; + u8 rx_queue; + bool ccmp; /* Whether fragments were encrypted with CCMP */ u8 last_pn[6]; /* PN of the last fragment if CCMP was used */ }; -- cgit v1.2.3 From 33d8783c58427683b533664f67f8c4378ed64495 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 23 Jun 2015 17:47:05 +0200 Subject: cfg80211: allow mgmt_frame_register callback to sleep This callback is currently not allowed to sleep, which makes it more difficult to implement proper driver methods in mac80211 than it has to be. Instead of doing asynchronous work here in mac80211, make it possible for the callback to sleep by doing some asynchronous work in cfg80211. This also enables improvements to other drivers, like ath6kl, that would like to sleep in this callback. While at it, also fix the code to call the driver on the implicit unregistration when an interface is removed, and do that also when a P2P-Device wdev is destroyed (otherwise we leak the structs.) Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 3 +- net/wireless/core.c | 5 ++++ net/wireless/core.h | 5 ++++ net/wireless/mlme.c | 75 +++++++++++++++++++++++++++++++++++++++---------- net/wireless/rdev-ops.h | 2 ++ 5 files changed, 73 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index a741678f24a2..9a529c48f6ca 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2369,8 +2369,7 @@ struct cfg80211_qos_map { * method returns 0.) * * @mgmt_frame_register: Notify driver that a management frame type was - * registered. Note that this callback may not sleep, and cannot run - * concurrently with itself. + * registered. The callback is allowed to sleep. * * @set_antenna: Set antenna configuration (tx_ant, rx_ant) on the device. * Parameters are bitmaps of allowed antennas to use for TX/RX. Drivers may diff --git a/net/wireless/core.c b/net/wireless/core.c index 2a0bbd22854b..3893409dee95 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -407,6 +407,9 @@ use_default_name: INIT_LIST_HEAD(&rdev->bss_list); INIT_WORK(&rdev->scan_done_wk, __cfg80211_scan_done); INIT_WORK(&rdev->sched_scan_results_wk, __cfg80211_sched_scan_results); + INIT_LIST_HEAD(&rdev->mlme_unreg); + spin_lock_init(&rdev->mlme_unreg_lock); + INIT_WORK(&rdev->mlme_unreg_wk, cfg80211_mlme_unreg_wk); INIT_DELAYED_WORK(&rdev->dfs_update_channels_wk, cfg80211_dfs_channels_update_work); #ifdef CONFIG_CFG80211_WEXT @@ -802,6 +805,7 @@ void wiphy_unregister(struct wiphy *wiphy) cancel_delayed_work_sync(&rdev->dfs_update_channels_wk); flush_work(&rdev->destroy_work); flush_work(&rdev->sched_scan_stop_wk); + flush_work(&rdev->mlme_unreg_wk); #ifdef CONFIG_PM if (rdev->wiphy.wowlan_config && rdev->ops->set_wakeup) @@ -855,6 +859,7 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev) switch (wdev->iftype) { case NL80211_IFTYPE_P2P_DEVICE: + cfg80211_mlme_purge_registrations(wdev); cfg80211_stop_p2p_device(rdev, wdev); break; default: diff --git a/net/wireless/core.h b/net/wireless/core.h index 311eef26bf88..b9d5bc8c148d 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -59,6 +59,10 @@ struct cfg80211_registered_device { struct list_head beacon_registrations; spinlock_t beacon_registrations_lock; + struct list_head mlme_unreg; + spinlock_t mlme_unreg_lock; + struct work_struct mlme_unreg_wk; + /* protected by RTNL only */ int num_running_ifaces; int num_running_monitor_ifaces; @@ -348,6 +352,7 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, u16 frame_type, const u8 *match_data, int match_len); +void cfg80211_mlme_unreg_wk(struct work_struct *wk); void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid); void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev); int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 7aae329e2b4e..fb44fa3bf4ef 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -2,6 +2,7 @@ * cfg80211 MLME SAP interface * * Copyright (c) 2009, Jouni Malinen + * Copyright (c) 2015 Intel Deutschland GmbH */ #include @@ -389,6 +390,7 @@ void cfg80211_mlme_down(struct cfg80211_registered_device *rdev, struct cfg80211_mgmt_registration { struct list_head list; + struct wireless_dev *wdev; u32 nlportid; @@ -399,6 +401,46 @@ struct cfg80211_mgmt_registration { u8 match[]; }; +static void +cfg80211_process_mlme_unregistrations(struct cfg80211_registered_device *rdev) +{ + struct cfg80211_mgmt_registration *reg; + + ASSERT_RTNL(); + + spin_lock_bh(&rdev->mlme_unreg_lock); + while ((reg = list_first_entry_or_null(&rdev->mlme_unreg, + struct cfg80211_mgmt_registration, + list))) { + list_del(®->list); + spin_unlock_bh(&rdev->mlme_unreg_lock); + + if (rdev->ops->mgmt_frame_register) { + u16 frame_type = le16_to_cpu(reg->frame_type); + + rdev_mgmt_frame_register(rdev, reg->wdev, + frame_type, false); + } + + kfree(reg); + + spin_lock_bh(&rdev->mlme_unreg_lock); + } + spin_unlock_bh(&rdev->mlme_unreg_lock); +} + +void cfg80211_mlme_unreg_wk(struct work_struct *wk) +{ + struct cfg80211_registered_device *rdev; + + rdev = container_of(wk, struct cfg80211_registered_device, + mlme_unreg_wk); + + rtnl_lock(); + cfg80211_process_mlme_unregistrations(rdev); + rtnl_unlock(); +} + int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, u16 frame_type, const u8 *match_data, int match_len) @@ -449,11 +491,18 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid, nreg->match_len = match_len; nreg->nlportid = snd_portid; nreg->frame_type = cpu_to_le16(frame_type); + nreg->wdev = wdev; list_add(&nreg->list, &wdev->mgmt_registrations); + spin_unlock_bh(&wdev->mgmt_registrations_lock); + + /* process all unregistrations to avoid driver confusion */ + cfg80211_process_mlme_unregistrations(rdev); if (rdev->ops->mgmt_frame_register) rdev_mgmt_frame_register(rdev, wdev, frame_type, true); + return 0; + out: spin_unlock_bh(&wdev->mgmt_registrations_lock); @@ -472,15 +521,12 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) if (reg->nlportid != nlportid) continue; - if (rdev->ops->mgmt_frame_register) { - u16 frame_type = le16_to_cpu(reg->frame_type); - - rdev_mgmt_frame_register(rdev, wdev, - frame_type, false); - } - list_del(®->list); - kfree(reg); + spin_lock(&rdev->mlme_unreg_lock); + list_add_tail(®->list, &rdev->mlme_unreg); + spin_unlock(&rdev->mlme_unreg_lock); + + schedule_work(&rdev->mlme_unreg_wk); } spin_unlock_bh(&wdev->mgmt_registrations_lock); @@ -496,16 +542,15 @@ void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlportid) void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev) { - struct cfg80211_mgmt_registration *reg, *tmp; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); spin_lock_bh(&wdev->mgmt_registrations_lock); - - list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { - list_del(®->list); - kfree(reg); - } - + spin_lock(&rdev->mlme_unreg_lock); + list_splice_tail_init(&wdev->mgmt_registrations, &rdev->mlme_unreg); + spin_unlock(&rdev->mlme_unreg_lock); spin_unlock_bh(&wdev->mgmt_registrations_lock); + + cfg80211_process_mlme_unregistrations(rdev); } int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index c6e83a7468c0..c23516d0f807 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -733,6 +733,8 @@ static inline void rdev_mgmt_frame_register(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u16 frame_type, bool reg) { + might_sleep(); + trace_rdev_mgmt_frame_register(&rdev->wiphy, wdev , frame_type, reg); rdev->ops->mgmt_frame_register(&rdev->wiphy, wdev , frame_type, reg); trace_rdev_return_void(&rdev->wiphy); -- cgit v1.2.3 From 841b351cf98e0b4ef25f6459d35251e63c0a7a49 Mon Sep 17 00:00:00 2001 From: John Linville Date: Wed, 24 Jun 2015 11:42:25 -0400 Subject: wireless: remove superfluous if statement in regulatory code Commit eeca9fce1d71 ('cfg80211: Schedule timeout for all CRDA calls') left behind a superfluous check after it removed some earlier code. In reg_process_hint, the test of "treatment == REG_REQ_IGNORE || treatment == REG_REQ_ALREADY_SET" is superfluous because the code in the if-then branch is identical to the code after the if statement. Coverity CID #1295939 I also removed the unnecessary assignment of treatment in this case, and added a comment reminding any future patch authors to ensure that treatment is properly assigned before it is used after the switch. Signed-off-by: John W. Linville Signed-off-by: Johannes Berg --- net/wireless/reg.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index d359e0610198..62d8ea42dbfb 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2079,10 +2079,7 @@ static void reg_process_hint(struct regulatory_request *reg_request) reg_process_hint_core(reg_request); return; case NL80211_REGDOM_SET_BY_USER: - treatment = reg_process_hint_user(reg_request); - if (treatment == REG_REQ_IGNORE || - treatment == REG_REQ_ALREADY_SET) - return; + reg_process_hint_user(reg_request); return; case NL80211_REGDOM_SET_BY_DRIVER: if (!wiphy) @@ -2099,7 +2096,9 @@ static void reg_process_hint(struct regulatory_request *reg_request) goto out_free; } - /* This is required so that the orig_* parameters are saved */ + /* This is required so that the orig_* parameters are saved. + * NOTE: treatment must be set for any case that reaches here! + */ if (treatment == REG_REQ_ALREADY_SET && wiphy && wiphy->regulatory_flags & REGULATORY_STRICT_REG) { wiphy_update_regulatory(wiphy, reg_request->initiator); -- cgit v1.2.3 From ccc6bb96ff058ad737fb8236e15aeaa56e822296 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 23 Jun 2015 11:50:52 +0200 Subject: mac80211: account TX MSDUs properly with segmentation offload If an SKB will be segmented by the driver, count it for multiple MSDUs that are being transmitted rather than just a single. Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index c0d6af809640..d14f3618069f 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2771,7 +2771,11 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, sdata->sequence_number += 0x10; } - sta->tx_msdu[tid]++; + if (skb_shinfo(skb)->gso_size) + sta->tx_msdu[tid] += + DIV_ROUND_UP(skb->len, skb_shinfo(skb)->gso_size); + else + sta->tx_msdu[tid]++; info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; -- cgit v1.2.3 From 322cd406da0b102dda9c0eec46181458a3667dbb Mon Sep 17 00:00:00 2001 From: Sara Sharon Date: Wed, 8 Jul 2015 15:41:43 +0300 Subject: mac80211: Add support for declaring MU-MIMO capability Add support for declaring MU-MIMO beamformee capability for relevant hardware. When sending association request, the capability is included if both hardware and the AP support it, and no other virtual interface is using it. This is in order to avoid multiple interfaces using MU-MIMO in parallel which might lead to contradictions in the group-id mechanism. Signed-off-by: Sara Sharon Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 2 ++ net/mac80211/mlme.c | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 52930e91c0fd..90580e903926 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -721,6 +721,7 @@ struct ieee80211_if_mesh { * back to wireless media and to the local net stack. * @IEEE80211_SDATA_DISCONNECT_RESUME: Disconnect after resume. * @IEEE80211_SDATA_IN_DRIVER: indicates interface was added to driver + * @IEEE80211_SDATA_MU_MIMO_OWNER: indicates interface owns MU-MIMO capability */ enum ieee80211_sub_if_data_flags { IEEE80211_SDATA_ALLMULTI = BIT(0), @@ -728,6 +729,7 @@ enum ieee80211_sub_if_data_flags { IEEE80211_SDATA_DONT_BRIDGE_PACKETS = BIT(3), IEEE80211_SDATA_DISCONNECT_RESUME = BIT(4), IEEE80211_SDATA_IN_DRIVER = BIT(5), + IEEE80211_SDATA_MU_MIMO_OWNER = BIT(6), }; /** diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 6332ff705ec3..705ef1d040ed 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -6,6 +6,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2013-2014 Intel Mobile Communications GmbH + * Copyright (C) 2015 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -538,11 +539,16 @@ static void ieee80211_add_ht_ie(struct ieee80211_sub_if_data *sdata, ieee80211_ie_build_ht_cap(pos, &ht_cap, cap); } +/* This function determines vht capability flags for the association + * and builds the IE. + * Note - the function may set the owner of the MU-MIMO capability + */ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct ieee80211_supported_band *sband, struct ieee80211_vht_cap *ap_vht_cap) { + struct ieee80211_local *local = sdata->local; u8 *pos; u32 cap; struct ieee80211_sta_vht_cap vht_cap; @@ -576,7 +582,34 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata, */ if (!(ap_vht_cap->vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE))) - cap &= ~IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE; + cap &= ~(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE | + IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE); + else if (!(ap_vht_cap->vht_cap_info & + cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE))) + cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; + + /* + * If some other vif is using the MU-MIMO capablity we cannot associate + * using MU-MIMO - this will lead to contradictions in the group-id + * mechanism. + * Ownership is defined since association request, in order to avoid + * simultaneous associations with MU-MIMO. + */ + if (cap & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE) { + bool disable_mu_mimo = false; + struct ieee80211_sub_if_data *other; + + list_for_each_entry_rcu(other, &local->interfaces, list) { + if (other->flags & IEEE80211_SDATA_MU_MIMO_OWNER) { + disable_mu_mimo = true; + break; + } + } + if (disable_mu_mimo) + cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; + else + sdata->flags |= IEEE80211_SDATA_MU_MIMO_OWNER; + } mask = IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK; @@ -2058,6 +2091,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, memset(&ifmgd->ht_capa_mask, 0, sizeof(ifmgd->ht_capa_mask)); memset(&ifmgd->vht_capa, 0, sizeof(ifmgd->vht_capa)); memset(&ifmgd->vht_capa_mask, 0, sizeof(ifmgd->vht_capa_mask)); + sdata->flags &= ~IEEE80211_SDATA_MU_MIMO_OWNER; sdata->ap_power_level = IEEE80211_UNSET_POWER_LEVEL; @@ -2520,6 +2554,7 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata, eth_zero_addr(sdata->u.mgd.bssid); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID); sdata->u.mgd.flags = 0; + sdata->flags &= ~IEEE80211_SDATA_MU_MIMO_OWNER; mutex_lock(&sdata->local->mtx); ieee80211_vif_release_channel(sdata); mutex_unlock(&sdata->local->mtx); -- cgit v1.2.3 From c8ff71e667d9fcf775e8b8bbd568d32d48cfb864 Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Wed, 8 Jul 2015 15:41:45 +0300 Subject: mac80211: TDLS: handle chan-switch in RTNL locked work Move TDLS channel-switch Rx handling into an RTNL locked work. This is required to add proper regulatory checking to incoming channel-switch requests. Queue incoming requests in a dedicated skb queue and handle the request in a device-specific work to avoid deadlocking on interface removal. Signed-off-by: Arik Nemtsov Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/ieee80211_i.h | 8 +++++--- net/mac80211/iface.c | 2 -- net/mac80211/main.c | 5 +++++ net/mac80211/rx.c | 5 ++--- net/mac80211/tdls.c | 34 ++++++++++++++++++++++++++++++++-- 5 files changed, 44 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 90580e903926..36f217e842d8 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1008,7 +1008,6 @@ enum sdata_queue_type { IEEE80211_SDATA_QUEUE_AGG_STOP = 2, IEEE80211_SDATA_QUEUE_RX_AGG_START = 3, IEEE80211_SDATA_QUEUE_RX_AGG_STOP = 4, - IEEE80211_SDATA_QUEUE_TDLS_CHSW = 5, }; enum { @@ -1351,6 +1350,10 @@ struct ieee80211_local { /* extended capabilities provided by mac80211 */ u8 ext_capa[8]; + + /* TDLS channel switch */ + struct work_struct tdls_chsw_work; + struct sk_buff_head skb_queue_tdls_chsw; }; static inline struct ieee80211_sub_if_data * @@ -2054,9 +2057,8 @@ int ieee80211_tdls_channel_switch(struct wiphy *wiphy, struct net_device *dev, void ieee80211_tdls_cancel_channel_switch(struct wiphy *wiphy, struct net_device *dev, const u8 *addr); -void ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb); void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata); +void ieee80211_tdls_chsw_work(struct work_struct *wk); extern const struct ethtool_ops ieee80211_ethtool_ops; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 553ac6dd4867..0fba7f97a963 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1242,8 +1242,6 @@ static void ieee80211_iface_work(struct work_struct *work) WLAN_BACK_RECIPIENT, 0, false); mutex_unlock(&local->sta_mtx); - } else if (skb->pkt_type == IEEE80211_SDATA_QUEUE_TDLS_CHSW) { - ieee80211_process_tdls_channel_switch(sdata, skb); } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_BACK) { int len = skb->len; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index dba0a86dee18..ff79a13d231d 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -629,6 +629,8 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, INIT_WORK(&local->sched_scan_stopped_work, ieee80211_sched_scan_stopped_work); + INIT_WORK(&local->tdls_chsw_work, ieee80211_tdls_chsw_work); + spin_lock_init(&local->ack_status_lock); idr_init(&local->ack_status_frames); @@ -645,6 +647,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, skb_queue_head_init(&local->skb_queue); skb_queue_head_init(&local->skb_queue_unreliable); + skb_queue_head_init(&local->skb_queue_tdls_chsw); ieee80211_alloc_led_names(local); @@ -1161,6 +1164,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) cancel_work_sync(&local->restart_work); cancel_work_sync(&local->reconfig_filter); + cancel_work_sync(&local->tdls_chsw_work); flush_work(&local->sched_scan_stopped_work); ieee80211_clear_tx_pending(local); @@ -1171,6 +1175,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) wiphy_warn(local->hw.wiphy, "skb_queue not empty\n"); skb_queue_purge(&local->skb_queue); skb_queue_purge(&local->skb_queue_unreliable); + skb_queue_purge(&local->skb_queue_tdls_chsw); destroy_workqueue(local->workqueue); wiphy_unregister(local->hw.wiphy); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 3a1462810c8e..f673304f70f5 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2410,9 +2410,8 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) tf->category == WLAN_CATEGORY_TDLS && (tf->action_code == WLAN_TDLS_CHANNEL_SWITCH_REQUEST || tf->action_code == WLAN_TDLS_CHANNEL_SWITCH_RESPONSE)) { - rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TDLS_CHSW; - skb_queue_tail(&sdata->skb_queue, rx->skb); - ieee80211_queue_work(&rx->local->hw, &sdata->work); + skb_queue_tail(&local->skb_queue_tdls_chsw, rx->skb); + schedule_work(&local->tdls_chsw_work); if (rx->sta) rx->sta->rx_packets++; diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 20c9dbde3b2b..91e86bf76867 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "ieee80211_i.h" #include "driver-ops.h" @@ -1800,12 +1801,15 @@ out: return ret; } -void ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb) +static void +ieee80211_process_tdls_channel_switch(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) { struct ieee80211_tdls_data *tf = (void *)skb->data; struct wiphy *wiphy = sdata->local->hw.wiphy; + ASSERT_RTNL(); + /* make sure the driver supports it */ if (!(wiphy->features & NL80211_FEATURE_TDLS_CHANNEL_SWITCH)) return; @@ -1847,3 +1851,29 @@ void ieee80211_teardown_tdls_peers(struct ieee80211_sub_if_data *sdata) } rcu_read_unlock(); } + +void ieee80211_tdls_chsw_work(struct work_struct *wk) +{ + struct ieee80211_local *local = + container_of(wk, struct ieee80211_local, tdls_chsw_work); + struct ieee80211_sub_if_data *sdata; + struct sk_buff *skb; + struct ieee80211_tdls_data *tf; + + rtnl_lock(); + while ((skb = skb_dequeue(&local->skb_queue_tdls_chsw))) { + tf = (struct ieee80211_tdls_data *)skb->data; + list_for_each_entry(sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(sdata) || + sdata->vif.type != NL80211_IFTYPE_STATION || + !ether_addr_equal(tf->da, sdata->vif.addr)) + continue; + + ieee80211_process_tdls_channel_switch(sdata, skb); + break; + } + + kfree_skb(skb); + } + rtnl_unlock(); +} -- cgit v1.2.3 From 42d8d789615d539cb13733e516b94e874a34f775 Mon Sep 17 00:00:00 2001 From: Arik Nemtsov Date: Wed, 8 Jul 2015 15:41:46 +0300 Subject: mac80211: TDLS: deny ch-switch req on disallowed channels If a TDLS station is not allowed to beacon on a channel, don't accept a channel switch request to this channel. Move channel building code up to avoid lockdep violations - reg_can_beacon needs to take the wdev lock. Signed-off-by: Arik Nemtsov Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/tdls.c | 49 +++++++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index 91e86bf76867..aee701a5649e 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1737,6 +1737,31 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, return -EINVAL; } + if (!elems.sec_chan_offs) { + chan_type = NL80211_CHAN_HT20; + } else { + switch (elems.sec_chan_offs->sec_chan_offs) { + case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: + chan_type = NL80211_CHAN_HT40PLUS; + break; + case IEEE80211_HT_PARAM_CHA_SEC_BELOW: + chan_type = NL80211_CHAN_HT40MINUS; + break; + default: + chan_type = NL80211_CHAN_HT20; + break; + } + } + + cfg80211_chandef_create(&chandef, chan, chan_type); + + /* we will be active on the TDLS link */ + if (!cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &chandef, + sdata->wdev.iftype)) { + tdls_dbg(sdata, "TDLS chan switch to forbidden channel\n"); + return -EINVAL; + } + mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, tf->sa); if (!sta || !test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) { @@ -1757,27 +1782,15 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, goto out; } - if (!sta->sta.ht_cap.ht_supported) { - chan_type = NL80211_CHAN_NO_HT; - } else if (!elems.sec_chan_offs) { - chan_type = NL80211_CHAN_HT20; - } else { - switch (elems.sec_chan_offs->sec_chan_offs) { - case IEEE80211_HT_PARAM_CHA_SEC_ABOVE: - chan_type = NL80211_CHAN_HT40PLUS; - break; - case IEEE80211_HT_PARAM_CHA_SEC_BELOW: - chan_type = NL80211_CHAN_HT40MINUS; - break; - default: - chan_type = NL80211_CHAN_HT20; - break; - } + /* peer should have known better */ + if (!sta->sta.ht_cap.ht_supported && elems.sec_chan_offs && + elems.sec_chan_offs->sec_chan_offs) { + tdls_dbg(sdata, "TDLS chan switch - wide chan unsupported\n"); + ret = -ENOTSUPP; + goto out; } - cfg80211_chandef_create(&chandef, chan, chan_type); params.chandef = &chandef; - params.switch_time = le16_to_cpu(elems.ch_sw_timing->switch_time); params.switch_timeout = le16_to_cpu(elems.ch_sw_timing->switch_timeout); -- cgit v1.2.3 From b0485e9f3defbed6effcde595df9b9fdbdb2524e Mon Sep 17 00:00:00 2001 From: Eliad Peller Date: Wed, 8 Jul 2015 15:41:47 +0300 Subject: mac80211: clear local->suspended before calling drv_resume() Currently, mac80211 calls drv_resume() on wowlan resume, but drops any incoming frame until local->suspended is cleared later on. This requires the low-level driver to support a new state, in which it is expected to fully work (as it was resumed) but not passing rx frames yet (as they will be dropped). iwlwifi (and probably other drivers as well) has issues supporting such mode. Since in the wowlan case we already short-circuit ieee80211_reconfig, there's nothing that prevents us from clearing local->suspend before calling drv_resume(), and letting the low-level driver work normally. Signed-off-by: Eliad Peller Signed-off-by: Emmanuel Grumbach Signed-off-by: Johannes Berg --- net/mac80211/util.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/util.c b/net/mac80211/util.c index e54596f95663..1104421bc525 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1716,16 +1716,24 @@ int ieee80211_reconfig(struct ieee80211_local *local) struct ieee80211_sub_if_data *sched_scan_sdata; struct cfg80211_sched_scan_request *sched_scan_req; bool sched_scan_stopped = false; + bool suspended = local->suspended; /* nothing to do if HW shouldn't run */ if (!local->open_count) goto wake_up; #ifdef CONFIG_PM - if (local->suspended) + if (suspended) local->resuming = true; if (local->wowlan) { + /* + * In the wowlan case, both mac80211 and the device + * are functional when the resume op is called, so + * clear local->suspended so the device could operate + * normally (e.g. pass rx frames). + */ + local->suspended = false; res = drv_resume(local); local->wowlan = false; if (res < 0) { @@ -1738,8 +1746,10 @@ int ieee80211_reconfig(struct ieee80211_local *local) /* * res is 1, which means the driver requested * to go through a regular reset on wakeup. + * restore local->suspended in this case. */ reconfig_due_to_wowlan = true; + local->suspended = true; } #endif @@ -1751,7 +1761,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) */ res = drv_start(local); if (res) { - if (local->suspended) + if (suspended) WARN(1, "Hardware became unavailable upon resume. This could be a software issue prior to suspend or a hardware issue.\n"); else WARN(1, "Hardware became unavailable during restart.\n"); @@ -2045,10 +2055,10 @@ int ieee80211_reconfig(struct ieee80211_local *local) * If this is for hw restart things are still running. * We may want to change that later, however. */ - if (local->open_count && (!local->suspended || reconfig_due_to_wowlan)) + if (local->open_count && (!suspended || reconfig_due_to_wowlan)) drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_RESTART); - if (!local->suspended) + if (!suspended) return 0; #ifdef CONFIG_PM -- cgit v1.2.3 From fa87a6566ca8f17a92ba81980bd47c456262907c Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Tue, 14 Jul 2015 08:31:57 -0400 Subject: mac80211: reorder mesh_plink to remove forward decl Move mesh_plink_frame_tx() above the first caller to remove the forward declaration. Signed-off-by: Bob Copeland Signed-off-by: Johannes Berg --- net/mac80211/mesh_plink.c | 109 ++++++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 57 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index ac1029f28133..a5aa0345dd7e 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -53,11 +53,6 @@ static const char * const mplevents[] = { [CLS_IGNR] = "CLS_IGNR" }; -static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, - enum ieee80211_self_protected_actioncode action, - u8 *da, u16 llid, u16 plid, u16 reason); - - /* We only need a valid sta if user configured a minimum rssi_threshold. */ static bool rssi_threshold_check(struct ieee80211_sub_if_data *sdata, struct sta_info *sta) @@ -204,58 +199,6 @@ static u32 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata) return BSS_CHANGED_HT; } -/** - * __mesh_plink_deactivate - deactivate mesh peer link - * - * @sta: mesh peer link to deactivate - * - * All mesh paths with this peer as next hop will be flushed - * Returns beacon changed flag if the beacon content changed. - * - * Locking: the caller must hold sta->mesh->plink_lock - */ -static u32 __mesh_plink_deactivate(struct sta_info *sta) -{ - struct ieee80211_sub_if_data *sdata = sta->sdata; - u32 changed = 0; - - lockdep_assert_held(&sta->mesh->plink_lock); - - if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) - changed = mesh_plink_dec_estab_count(sdata); - sta->mesh->plink_state = NL80211_PLINK_BLOCKED; - mesh_path_flush_by_nexthop(sta); - - ieee80211_mps_sta_status_update(sta); - changed |= ieee80211_mps_set_sta_local_pm(sta, - NL80211_MESH_POWER_UNKNOWN); - - return changed; -} - -/** - * mesh_plink_deactivate - deactivate mesh peer link - * - * @sta: mesh peer link to deactivate - * - * All mesh paths with this peer as next hop will be flushed - */ -u32 mesh_plink_deactivate(struct sta_info *sta) -{ - struct ieee80211_sub_if_data *sdata = sta->sdata; - u32 changed; - - spin_lock_bh(&sta->mesh->plink_lock); - changed = __mesh_plink_deactivate(sta); - sta->mesh->reason = WLAN_REASON_MESH_PEER_CANCELED; - mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_CLOSE, - sta->sta.addr, sta->mesh->llid, sta->mesh->plid, - sta->mesh->reason); - spin_unlock_bh(&sta->mesh->plink_lock); - - return changed; -} - static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, enum ieee80211_self_protected_actioncode action, u8 *da, u16 llid, u16 plid, u16 reason) @@ -375,6 +318,58 @@ free: return err; } +/** + * __mesh_plink_deactivate - deactivate mesh peer link + * + * @sta: mesh peer link to deactivate + * + * All mesh paths with this peer as next hop will be flushed + * Returns beacon changed flag if the beacon content changed. + * + * Locking: the caller must hold sta->mesh->plink_lock + */ +static u32 __mesh_plink_deactivate(struct sta_info *sta) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + u32 changed = 0; + + lockdep_assert_held(&sta->mesh->plink_lock); + + if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) + changed = mesh_plink_dec_estab_count(sdata); + sta->mesh->plink_state = NL80211_PLINK_BLOCKED; + mesh_path_flush_by_nexthop(sta); + + ieee80211_mps_sta_status_update(sta); + changed |= ieee80211_mps_set_sta_local_pm(sta, + NL80211_MESH_POWER_UNKNOWN); + + return changed; +} + +/** + * mesh_plink_deactivate - deactivate mesh peer link + * + * @sta: mesh peer link to deactivate + * + * All mesh paths with this peer as next hop will be flushed + */ +u32 mesh_plink_deactivate(struct sta_info *sta) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + u32 changed; + + spin_lock_bh(&sta->mesh->plink_lock); + changed = __mesh_plink_deactivate(sta); + sta->mesh->reason = WLAN_REASON_MESH_PEER_CANCELED; + mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_CLOSE, + sta->sta.addr, sta->mesh->llid, sta->mesh->plid, + sta->mesh->reason); + spin_unlock_bh(&sta->mesh->plink_lock); + + return changed; +} + static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee802_11_elems *elems, bool insert) -- cgit v1.2.3 From a69bd8e60b02946896c097439b94eb77c0c2c9e4 Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Tue, 14 Jul 2015 08:31:58 -0400 Subject: mac80211: mesh: separate plid and aid concepts According to 802.11-2012 13.3.1, a mesh STA should assign an AID upon receipt of a mesh peering open frame rather than using the link id of the peer. Using the peer link id has two potential issues: it may not be unique among the peers, and by its nature it is random, so the TIM may not compress well. In preparation for allocating it properly, use sta->sta.aid, but keep the existing behavior of using the plid in the aid we send. Signed-off-by: Bob Copeland Signed-off-by: Johannes Berg --- net/mac80211/mesh_plink.c | 29 +++++++++++++++++++---------- net/mac80211/mesh_ps.c | 2 +- net/mac80211/sta_info.c | 5 +---- net/mac80211/sta_info.h | 2 ++ 4 files changed, 23 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index a5aa0345dd7e..3323413acb77 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -13,6 +13,7 @@ #include "rate.h" #include "mesh.h" +#define PLINK_CNF_AID(mgmt) ((mgmt)->u.action.u.self_prot.variable + 2) #define PLINK_GET_LLID(p) (p + 2) #define PLINK_GET_PLID(p) (p + 4) @@ -200,6 +201,7 @@ static u32 mesh_set_ht_prot_mode(struct ieee80211_sub_if_data *sdata) } static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, enum ieee80211_self_protected_actioncode action, u8 *da, u16 llid, u16 plid, u16 reason) { @@ -249,7 +251,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, if (action == WLAN_SP_MESH_PEERING_CONFIRM) { /* AID */ pos = skb_put(skb, 2); - put_unaligned_le16(plid, pos); + put_unaligned_le16(sta->sta.aid, pos); } if (ieee80211_add_srates_ie(sdata, skb, true, band) || ieee80211_add_ext_srates_ie(sdata, skb, true, band) || @@ -362,7 +364,7 @@ u32 mesh_plink_deactivate(struct sta_info *sta) spin_lock_bh(&sta->mesh->plink_lock); changed = __mesh_plink_deactivate(sta); sta->mesh->reason = WLAN_REASON_MESH_PEER_CANCELED; - mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_CLOSE, + mesh_plink_frame_tx(sdata, sta, WLAN_SP_MESH_PEERING_CLOSE, sta->sta.addr, sta->mesh->llid, sta->mesh->plid, sta->mesh->reason); spin_unlock_bh(&sta->mesh->plink_lock); @@ -619,7 +621,7 @@ static void mesh_plink_timer(unsigned long data) } spin_unlock_bh(&sta->mesh->plink_lock); if (action) - mesh_plink_frame_tx(sdata, action, sta->sta.addr, + mesh_plink_frame_tx(sdata, sta, action, sta->sta.addr, sta->mesh->llid, sta->mesh->plid, reason); } @@ -689,7 +691,7 @@ u32 mesh_plink_open(struct sta_info *sta) /* set the non-peer mode to active during peering */ changed = ieee80211_mps_local_status_update(sdata); - mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_OPEN, + mesh_plink_frame_tx(sdata, sta, WLAN_SP_MESH_PEERING_OPEN, sta->sta.addr, sta->mesh->llid, 0, 0); return changed; } @@ -871,13 +873,13 @@ static u32 mesh_plink_fsm(struct ieee80211_sub_if_data *sdata, } spin_unlock_bh(&sta->mesh->plink_lock); if (action) { - mesh_plink_frame_tx(sdata, action, sta->sta.addr, + mesh_plink_frame_tx(sdata, sta, action, sta->sta.addr, sta->mesh->llid, sta->mesh->plid, sta->mesh->reason); /* also send confirm in open case */ if (action == WLAN_SP_MESH_PEERING_OPEN) { - mesh_plink_frame_tx(sdata, + mesh_plink_frame_tx(sdata, sta, WLAN_SP_MESH_PEERING_CONFIRM, sta->sta.addr, sta->mesh->llid, sta->mesh->plid, 0); @@ -1067,8 +1069,9 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, goto unlock_rcu; } sta->mesh->plid = plid; + sta->sta.aid = plid; } else if (!sta && event == OPN_RJCT) { - mesh_plink_frame_tx(sdata, WLAN_SP_MESH_PEERING_CLOSE, + mesh_plink_frame_tx(sdata, NULL, WLAN_SP_MESH_PEERING_CLOSE, mgmt->sa, 0, plid, WLAN_REASON_MESH_CONFIG); goto unlock_rcu; @@ -1077,9 +1080,15 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, goto unlock_rcu; } - /* 802.11-2012 13.3.7.2 - update plid on CNF if not set */ - if (!sta->mesh->plid && event == CNF_ACPT) - sta->mesh->plid = plid; + if (event == CNF_ACPT) { + /* 802.11-2012 13.3.7.2 - update plid on CNF if not set */ + if (!sta->mesh->plid) { + sta->mesh->plid = plid; + sta->sta.aid = sta->mesh->plid; + } + + sta->mesh->aid = get_unaligned_le16(PLINK_CNF_AID(mgmt)); + } changed |= mesh_plink_fsm(sdata, sta, event); diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c index 29747f92b9b0..90a268abea17 100644 --- a/net/mac80211/mesh_ps.c +++ b/net/mac80211/mesh_ps.c @@ -579,7 +579,7 @@ void ieee80211_mps_frame_release(struct sta_info *sta, if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) has_buffered = ieee80211_check_tim(elems->tim, elems->tim_len, - sta->mesh->llid); + sta->mesh->aid); if (has_buffered) mps_dbg(sta->sdata, "%pM indicates buffered frames\n", diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 9da7d2bc271a..70cd9fa57424 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -635,7 +635,7 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) bool indicate_tim = false; u8 ignore_for_tim = sta->sta.uapsd_queues; int ac; - u16 id; + u16 id = sta->sta.aid; if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { @@ -643,12 +643,9 @@ static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) return; ps = &sta->sdata->bss->ps; - id = sta->sta.aid; #ifdef CONFIG_MAC80211_MESH } else if (ieee80211_vif_is_mesh(&sta->sdata->vif)) { ps = &sta->sdata->u.mesh.ps; - /* TIM map only for 1 <= PLID <= IEEE80211_MAX_AID */ - id = sta->mesh->plid % (IEEE80211_MAX_AID + 1); #endif } else { return; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 6dcb33484eac..1d2805c598c0 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -277,6 +277,7 @@ struct ieee80211_fast_tx { * @plink_lock: serialize access to plink fields * @llid: Local link ID * @plid: Peer link ID + * @aid: local aid supplied by peer * @reason: Cancel reason on PLINK_HOLDING state * @plink_retries: Retries in establishment * @plink_state: peer link state @@ -301,6 +302,7 @@ struct mesh_sta { spinlock_t plink_lock; u16 llid; u16 plid; + u16 aid; u16 reason; u8 plink_retries; -- cgit v1.2.3 From 0e0060fcfb3d0f5a53ef43e7b6a50227b934ab7c Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Tue, 14 Jul 2015 08:31:59 -0400 Subject: mac80211: select an AID when creating new mesh STAs Instead of using peer link id for AID, generate a new AID when creating mesh STAs in the kernel peering manager. This enables smaller TIM elements and more closely follows the standard, and it also enables mesh to work on drivers that require a valid AID when the STA is inserted (ath10k firmware has this requirement, for example). In the case of userspace-managed stations, we use the AID from NL80211_CMD_NEW_STATION. Signed-off-by: Bob Copeland Signed-off-by: Johannes Berg --- net/mac80211/mesh_plink.c | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 3323413acb77..e12be2e4e8df 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -422,20 +422,54 @@ out: spin_unlock_bh(&sta->mesh->plink_lock); } +static int mesh_allocate_aid(struct ieee80211_sub_if_data *sdata) +{ + struct sta_info *sta; + unsigned long *aid_map; + int aid; + + aid_map = kcalloc(BITS_TO_LONGS(IEEE80211_MAX_AID + 1), + sizeof(*aid_map), GFP_KERNEL); + if (!aid_map) + return -ENOMEM; + + /* reserve aid 0 for mcast indication */ + __set_bit(0, aid_map); + + rcu_read_lock(); + list_for_each_entry_rcu(sta, &sdata->local->sta_list, list) + __set_bit(sta->sta.aid, aid_map); + rcu_read_unlock(); + + aid = find_first_zero_bit(aid_map, IEEE80211_MAX_AID + 1); + kfree(aid_map); + + if (aid > IEEE80211_MAX_AID) + return -ENOBUFS; + + return aid; +} + static struct sta_info * __mesh_sta_info_alloc(struct ieee80211_sub_if_data *sdata, u8 *hw_addr) { struct sta_info *sta; + int aid; if (sdata->local->num_sta >= MESH_MAX_PLINKS) return NULL; + aid = mesh_allocate_aid(sdata); + if (aid < 0) + return NULL; + sta = sta_info_alloc(sdata, hw_addr, GFP_KERNEL); if (!sta) return NULL; sta->mesh->plink_state = NL80211_PLINK_LISTEN; sta->sta.wme = true; + sta->sta.aid = aid; sta_info_pre_move_state(sta, IEEE80211_STA_AUTH); sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC); @@ -659,8 +693,6 @@ static u16 mesh_get_new_llid(struct ieee80211_sub_if_data *sdata) do { get_random_bytes(&llid, sizeof(llid)); - /* for mesh PS we still only have the AID range for TIM bits */ - llid = (llid % IEEE80211_MAX_AID) + 1; } while (llid_in_use(sdata, llid)); return llid; @@ -1069,7 +1101,6 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, goto unlock_rcu; } sta->mesh->plid = plid; - sta->sta.aid = plid; } else if (!sta && event == OPN_RJCT) { mesh_plink_frame_tx(sdata, NULL, WLAN_SP_MESH_PEERING_CLOSE, mgmt->sa, 0, plid, @@ -1082,10 +1113,8 @@ mesh_process_plink_frame(struct ieee80211_sub_if_data *sdata, if (event == CNF_ACPT) { /* 802.11-2012 13.3.7.2 - update plid on CNF if not set */ - if (!sta->mesh->plid) { + if (!sta->mesh->plid) sta->mesh->plid = plid; - sta->sta.aid = sta->mesh->plid; - } sta->mesh->aid = get_unaligned_le16(PLINK_CNF_AID(mgmt)); } -- cgit v1.2.3 From 727da60be91c9fd59f1b084ca537b5123ab97744 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 15 Jul 2015 14:56:05 +0200 Subject: mac80211: deinline drv_sta_state With this .config: http://busybox.net/~vda/kernel_config, after deinlining the function size is 3132 bytes and there are 7 callsites. Total size reduction: about 20 kbytes. Signed-off-by: Denys Vlasenko CC: John Linville CC: Michal Kazior Cc: Johannes Berg Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Johannes Berg --- net/mac80211/Makefile | 1 + net/mac80211/driver-ops.c | 41 +++++++++++++++++++++++++++++++++++++++++ net/mac80211/driver-ops.h | 29 ++--------------------------- 3 files changed, 44 insertions(+), 27 deletions(-) create mode 100644 net/mac80211/driver-ops.c (limited to 'net') diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index 3275f01881be..783e891b7525 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_MAC80211) += mac80211.o # mac80211 objects mac80211-y := \ main.o status.o \ + driver-ops.o \ sta_info.o \ wep.o \ wpa.o \ diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c new file mode 100644 index 000000000000..267c3b1ca047 --- /dev/null +++ b/net/mac80211/driver-ops.c @@ -0,0 +1,41 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include "ieee80211_i.h" +#include "trace.h" +#include "driver-ops.h" + +__must_check +int drv_sta_state(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + enum ieee80211_sta_state old_state, + enum ieee80211_sta_state new_state) +{ + int ret = 0; + + might_sleep(); + + sdata = get_bss_sdata(sdata); + if (!check_sdata_in_driver(sdata)) + return -EIO; + + trace_drv_sta_state(local, sdata, &sta->sta, old_state, new_state); + if (local->ops->sta_state) { + ret = local->ops->sta_state(&local->hw, &sdata->vif, &sta->sta, + old_state, new_state); + } else if (old_state == IEEE80211_STA_AUTH && + new_state == IEEE80211_STA_ASSOC) { + ret = drv_sta_add(local, sdata, &sta->sta); + if (ret == 0) + sta->uploaded = true; + } else if (old_state == IEEE80211_STA_ASSOC && + new_state == IEEE80211_STA_AUTH) { + drv_sta_remove(local, sdata, &sta->sta); + } + trace_drv_return_int(local, ret); + return ret; +} diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 32a2e707e222..02d91332d7dd 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -573,37 +573,12 @@ static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local, trace_drv_return_void(local); } -static inline __must_check +__must_check int drv_sta_state(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta, enum ieee80211_sta_state old_state, - enum ieee80211_sta_state new_state) -{ - int ret = 0; - - might_sleep(); - - sdata = get_bss_sdata(sdata); - if (!check_sdata_in_driver(sdata)) - return -EIO; - - trace_drv_sta_state(local, sdata, &sta->sta, old_state, new_state); - if (local->ops->sta_state) { - ret = local->ops->sta_state(&local->hw, &sdata->vif, &sta->sta, - old_state, new_state); - } else if (old_state == IEEE80211_STA_AUTH && - new_state == IEEE80211_STA_ASSOC) { - ret = drv_sta_add(local, sdata, &sta->sta); - if (ret == 0) - sta->uploaded = true; - } else if (old_state == IEEE80211_STA_ASSOC && - new_state == IEEE80211_STA_AUTH) { - drv_sta_remove(local, sdata, &sta->sta); - } - trace_drv_return_int(local, ret); - return ret; -} + enum ieee80211_sta_state new_state); static inline void drv_sta_rc_update(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, -- cgit v1.2.3 From eb6d9293dfed245a114cad7d975259963e1e04c2 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 15 Jul 2015 14:56:06 +0200 Subject: mac80211: deinline rate_control_rate_init, rate_control_rate_update With this .config: http://busybox.net/~vda/kernel_config, after deinlining these functions have sizes and callsite counts as follows: rate_control_rate_init: 554 bytes, 8 calls rate_control_rate_update: 1596 bytes, 5 calls Total size reduction: about 11 kbytes. Signed-off-by: Denys Vlasenko CC: John Linville CC: Michal Kazior CC: Johannes Berg Cc: linux-wireless@vger.kernel.org Cc: netdev@vger.kernel.org CC: linux-kernel@vger.kernel.org Signed-off-by: Johannes Berg --- net/mac80211/rate.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/mac80211/rate.h | 60 +++-------------------------------------------------- 2 files changed, 62 insertions(+), 57 deletions(-) (limited to 'net') diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index fda33f961d83..03687d22b405 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -29,6 +29,65 @@ module_param(ieee80211_default_rc_algo, charp, 0644); MODULE_PARM_DESC(ieee80211_default_rc_algo, "Default rate control algorithm for mac80211 to use"); +void rate_control_rate_init(struct sta_info *sta) +{ + struct ieee80211_local *local = sta->sdata->local; + struct rate_control_ref *ref = sta->rate_ctrl; + struct ieee80211_sta *ista = &sta->sta; + void *priv_sta = sta->rate_ctrl_priv; + struct ieee80211_supported_band *sband; + struct ieee80211_chanctx_conf *chanctx_conf; + + ieee80211_sta_set_rx_nss(sta); + + if (!ref) + return; + + rcu_read_lock(); + + chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf); + if (WARN_ON(!chanctx_conf)) { + rcu_read_unlock(); + return; + } + + sband = local->hw.wiphy->bands[chanctx_conf->def.chan->band]; + + spin_lock_bh(&sta->rate_ctrl_lock); + ref->ops->rate_init(ref->priv, sband, &chanctx_conf->def, ista, + priv_sta); + spin_unlock_bh(&sta->rate_ctrl_lock); + rcu_read_unlock(); + set_sta_flag(sta, WLAN_STA_RATE_CONTROL); +} + +void rate_control_rate_update(struct ieee80211_local *local, + struct ieee80211_supported_band *sband, + struct sta_info *sta, u32 changed) +{ + struct rate_control_ref *ref = local->rate_ctrl; + struct ieee80211_sta *ista = &sta->sta; + void *priv_sta = sta->rate_ctrl_priv; + struct ieee80211_chanctx_conf *chanctx_conf; + + if (ref && ref->ops->rate_update) { + rcu_read_lock(); + + chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf); + if (WARN_ON(!chanctx_conf)) { + rcu_read_unlock(); + return; + } + + spin_lock_bh(&sta->rate_ctrl_lock); + ref->ops->rate_update(ref->priv, sband, &chanctx_conf->def, + ista, priv_sta, changed); + spin_unlock_bh(&sta->rate_ctrl_lock); + rcu_read_unlock(); + } + drv_sta_rc_update(local, sta->sdata, &sta->sta, changed); +} + int ieee80211_rate_control_register(const struct rate_control_ops *ops) { struct rate_control_alg *alg; diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 25c9be5dd7fd..624fe5b81615 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -71,64 +71,10 @@ rate_control_tx_status_noskb(struct ieee80211_local *local, spin_unlock_bh(&sta->rate_ctrl_lock); } -static inline void rate_control_rate_init(struct sta_info *sta) -{ - struct ieee80211_local *local = sta->sdata->local; - struct rate_control_ref *ref = sta->rate_ctrl; - struct ieee80211_sta *ista = &sta->sta; - void *priv_sta = sta->rate_ctrl_priv; - struct ieee80211_supported_band *sband; - struct ieee80211_chanctx_conf *chanctx_conf; - - ieee80211_sta_set_rx_nss(sta); - - if (!ref) - return; - - rcu_read_lock(); - - chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf); - if (WARN_ON(!chanctx_conf)) { - rcu_read_unlock(); - return; - } - - sband = local->hw.wiphy->bands[chanctx_conf->def.chan->band]; - - spin_lock_bh(&sta->rate_ctrl_lock); - ref->ops->rate_init(ref->priv, sband, &chanctx_conf->def, ista, - priv_sta); - spin_unlock_bh(&sta->rate_ctrl_lock); - rcu_read_unlock(); - set_sta_flag(sta, WLAN_STA_RATE_CONTROL); -} - -static inline void rate_control_rate_update(struct ieee80211_local *local, +void rate_control_rate_init(struct sta_info *sta); +void rate_control_rate_update(struct ieee80211_local *local, struct ieee80211_supported_band *sband, - struct sta_info *sta, u32 changed) -{ - struct rate_control_ref *ref = local->rate_ctrl; - struct ieee80211_sta *ista = &sta->sta; - void *priv_sta = sta->rate_ctrl_priv; - struct ieee80211_chanctx_conf *chanctx_conf; - - if (ref && ref->ops->rate_update) { - rcu_read_lock(); - - chanctx_conf = rcu_dereference(sta->sdata->vif.chanctx_conf); - if (WARN_ON(!chanctx_conf)) { - rcu_read_unlock(); - return; - } - - spin_lock_bh(&sta->rate_ctrl_lock); - ref->ops->rate_update(ref->priv, sband, &chanctx_conf->def, - ista, priv_sta, changed); - spin_unlock_bh(&sta->rate_ctrl_lock); - rcu_read_unlock(); - } - drv_sta_rc_update(local, sta->sdata, &sta->sta, changed); -} + struct sta_info *sta, u32 changed); static inline void *rate_control_alloc_sta(struct rate_control_ref *ref, struct sta_info *sta, gfp_t gfp) -- cgit v1.2.3 From e317fa505dcdfa25f0e4c888f991eb7fd1562e1e Mon Sep 17 00:00:00 2001 From: Eric W. Biederman Date: Sat, 18 Jul 2015 10:21:14 -0500 Subject: netfilter: Fix memory leak in nf_register_net_hook In the rare case that when it is a attempted to use a per network device netfilter hook and the network device does not exist the newly allocated structure can leak. Be a good citizen and free the newly allocated structure in the error handling code. Fixes: 085db2c04557 ("netfilter: Per network namespace netfilter hooks.") Reported-by: kbuild@01.org Reported-by: Dan Carpenter Signed-off-by: "Eric W. Biederman" Signed-off-by: Pablo Neira Ayuso --- net/netfilter/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 6896cee8b733..87d237d20870 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -96,8 +96,10 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) new->priority = reg->priority; nf_hook_list = find_nf_hook_list(net, reg); - if (!nf_hook_list) + if (!nf_hook_list) { + kfree(new); return -ENOENT; + } mutex_lock(&nf_hook_mutex); list_for_each_entry(elem, nf_hook_list, list) { -- cgit v1.2.3 From b87a173e25d6bf5c26f13d329cdddf57dbd4061a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Jul 2015 14:21:41 +0200 Subject: cls_cgroup: factor out classid retrieval Split out retrieving the cgroups net_cls classid retrieval into its own function, so that it can be reused later on from other parts of the traffic control subsystem. If there's no skb->sk, then the small helper returns 0 as well, which in cls_cgroup terms means 'could not classify'. Signed-off-by: Daniel Borkmann Cc: Thomas Graf Signed-off-by: David S. Miller --- include/net/cls_cgroup.h | 29 +++++++++++++++++++++++++++++ net/sched/cls_cgroup.c | 23 ++--------------------- 2 files changed, 31 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index c15d39456e14..ccd6d8bffa4d 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -49,9 +49,38 @@ static inline void sock_update_classid(struct sock *sk) if (classid != sk->sk_classid) sk->sk_classid = classid; } + +static inline u32 task_get_classid(const struct sk_buff *skb) +{ + u32 classid = task_cls_state(current)->classid; + + /* Due to the nature of the classifier it is required to ignore all + * packets originating from softirq context as accessing `current' + * would lead to false results. + * + * This test assumes that all callers of dev_queue_xmit() explicitly + * disable bh. Knowing this, it is possible to detect softirq based + * calls by looking at the number of nested bh disable calls because + * softirqs always disables bh. + */ + if (in_serving_softirq()) { + /* If there is an sk_classid we'll use that. */ + if (!skb->sk) + return 0; + + classid = skb->sk->sk_classid; + } + + return classid; +} #else /* !CONFIG_CGROUP_NET_CLASSID */ static inline void sock_update_classid(struct sock *sk) { } + +static inline u32 task_get_classid(const struct sk_buff *skb) +{ + return 0; +} #endif /* CONFIG_CGROUP_NET_CLASSID */ #endif /* _NET_CLS_CGROUP_H */ diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index ea611b216412..4c85bd3a750c 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -30,35 +30,16 @@ static int cls_cgroup_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_cgroup_head *head = rcu_dereference_bh(tp->root); - u32 classid; - - classid = task_cls_state(current)->classid; - - /* - * Due to the nature of the classifier it is required to ignore all - * packets originating from softirq context as accessing `current' - * would lead to false results. - * - * This test assumes that all callers of dev_queue_xmit() explicitely - * disable bh. Knowing this, it is possible to detect softirq based - * calls by looking at the number of nested bh disable calls because - * softirqs always disables bh. - */ - if (in_serving_softirq()) { - /* If there is an sk_classid we'll use that. */ - if (!skb->sk) - return -1; - classid = skb->sk->sk_classid; - } + u32 classid = task_get_classid(skb); if (!classid) return -1; - if (!tcf_em_tree_match(skb, &head->ematches, NULL)) return -1; res->classid = classid; res->class = 0; + return tcf_exts_exec(skb, &head->exts, res); } -- cgit v1.2.3 From 8d20aabe1c76cccac544d9fcc3ad7823d9e98a2d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 15 Jul 2015 14:21:42 +0200 Subject: ebpf: add helper to retrieve net_cls's classid cookie It would be very useful to retrieve the net_cls's classid from an eBPF program to allow for a more fine-grained classification, it could be directly used or in conjunction with additional policies. I.e. docker, but also tooling such as cgexec, can easily run applications via net_cls cgroups: cgcreate -g net_cls:/foo echo 42 > foo/net_cls.classid cgexec -g net_cls:foo Thus, their respecitve classid cookie of foo can then be looked up on the egress path to apply further policies. The helper is desigend such that a non-zero value returns the cgroup id. Signed-off-by: Daniel Borkmann Cc: Thomas Graf Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 7 +++++++ net/core/filter.c | 15 +++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 29ef6f99e43d..2de87e58b12b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -249,6 +249,13 @@ enum bpf_func_id { * Return: 0 on success */ BPF_FUNC_get_current_comm, + + /** + * bpf_get_cgroup_classid(skb) - retrieve a proc's classid + * @skb: pointer to skb + * Return: classid if != 0 + */ + BPF_FUNC_get_cgroup_classid, __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index be3098fb65e4..247450a5e387 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -47,6 +47,7 @@ #include #include #include +#include /** * sk_filter - run a packet through a socket filter @@ -1424,6 +1425,18 @@ const struct bpf_func_proto bpf_clone_redirect_proto = { .arg3_type = ARG_ANYTHING, }; +static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return task_get_classid((struct sk_buff *) (unsigned long) r1); +} + +static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { + .func = bpf_get_cgroup_classid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -1461,6 +1474,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_l4_csum_replace_proto; case BPF_FUNC_clone_redirect: return &bpf_clone_redirect_proto; + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_proto; default: return sk_filter_func_proto(func_id); } -- cgit v1.2.3 From ef8299de7e2bf61ed24d1da699fa5ba13549d5f8 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Wed, 15 Jul 2015 07:16:50 -0700 Subject: bridge: multicast: notify on group delete Group notifications were not sent when a group expired or was deleted due to bridge/port device being deleted. So add br_mdb_notify() to br_multicast_del_pg(). Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_multicast.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 5a44cd9473f2..55903d91d0a8 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -273,6 +273,8 @@ static void br_multicast_del_pg(struct net_bridge *br, rcu_assign_pointer(*pp, p->next); hlist_del_init(&p->mglist); del_timer(&p->timer); + br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB, + p->state); call_rcu_bh(&p->rcu, br_multicast_free_pg); if (!mp->ports && !mp->mglist && -- cgit v1.2.3 From e10177abf842d0c40dfecc43bd57a0a762a2fccf Mon Sep 17 00:00:00 2001 From: Satish Ashok Date: Wed, 15 Jul 2015 07:16:51 -0700 Subject: bridge: multicast: fix handling of temp and perm entries When the bridge (or port) is brought down/up flush only temp entries and leave the perm ones. Flush perm entries only when deleting the bridge device or the associated port. Signed-off-by: Satish Ashok Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_if.c | 1 + net/bridge/br_multicast.c | 27 ++++++++++++++++++++------- net/bridge/br_private.h | 1 + 3 files changed, 22 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index a538cb1199a3..45e4757c6fd2 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -281,6 +281,7 @@ void br_dev_delete(struct net_device *dev, struct list_head *head) br_fdb_delete_by_port(br, NULL, 0, 1); br_vlan_flush(br); + br_multicast_dev_del(br); del_timer_sync(&br->gc_timer); br_sysfs_delbr(br->dev); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 55903d91d0a8..0dd3cd90962c 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -916,6 +916,15 @@ void br_multicast_add_port(struct net_bridge_port *port) void br_multicast_del_port(struct net_bridge_port *port) { + struct net_bridge *br = port->br; + struct net_bridge_port_group *pg; + struct hlist_node *n; + + /* Take care of the remaining groups, only perm ones should be left */ + spin_lock_bh(&br->multicast_lock); + hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) + br_multicast_del_pg(br, pg); + spin_unlock_bh(&br->multicast_lock); del_timer_sync(&port->multicast_router_timer); } @@ -955,7 +964,8 @@ void br_multicast_disable_port(struct net_bridge_port *port) spin_lock(&br->multicast_lock); hlist_for_each_entry_safe(pg, n, &port->mglist, mglist) - br_multicast_del_pg(br, pg); + if (pg->state == MDB_TEMPORARY) + br_multicast_del_pg(br, pg); if (!hlist_unhashed(&port->rlist)) hlist_del_init_rcu(&port->rlist); @@ -1732,12 +1742,6 @@ void br_multicast_open(struct net_bridge *br) void br_multicast_stop(struct net_bridge *br) { - struct net_bridge_mdb_htable *mdb; - struct net_bridge_mdb_entry *mp; - struct hlist_node *n; - u32 ver; - int i; - del_timer_sync(&br->multicast_router_timer); del_timer_sync(&br->ip4_other_query.timer); del_timer_sync(&br->ip4_own_query.timer); @@ -1745,6 +1749,15 @@ void br_multicast_stop(struct net_bridge *br) del_timer_sync(&br->ip6_other_query.timer); del_timer_sync(&br->ip6_own_query.timer); #endif +} + +void br_multicast_dev_del(struct net_bridge *br) +{ + struct net_bridge_mdb_htable *mdb; + struct net_bridge_mdb_entry *mp; + struct hlist_node *n; + u32 ver; + int i; spin_lock_bh(&br->multicast_lock); mdb = mlock_dereference(br->mdb, br); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index c73fd785654d..2bada2279721 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -466,6 +466,7 @@ void br_multicast_disable_port(struct net_bridge_port *port); void br_multicast_init(struct net_bridge *br); void br_multicast_open(struct net_bridge *br); void br_multicast_stop(struct net_bridge *br); +void br_multicast_dev_del(struct net_bridge *br); void br_multicast_deliver(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb); void br_multicast_forward(struct net_bridge_mdb_entry *mdst, -- cgit v1.2.3 From a0a9f33bdf8bf9061c31faab46d8eb46ff644622 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 15 Jul 2015 21:56:26 +0200 Subject: net/ipv6: update flowi6_oif in ip6_dst_lookup_flow if not set Newly created flows don't have flowi6_oif set (at least if the associated socket is not interface-bound). This leads to a mismatch in __xfrm6_selector_match() for policies which specify an interface in the selector (sel->ifindex != 0). Backtracing shows this happens in code-paths originating from e.g. ip6_datagram_connect(), rawv6_sendmsg() or tcp_v6_connect(). (UDP was not tested for.) In summary, this patch fixes policy matching on outgoing interface for locally generated packets. Signed-off-by: Phil Sutter Signed-off-by: David S. Miller --- net/ipv6/ip6_output.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d5f7716662db..c5fc85286ef6 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1023,6 +1023,8 @@ struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, return ERR_PTR(err); if (final_dst) fl6->daddr = *final_dst; + if (!fl6->flowi6_oif) + fl6->flowi6_oif = dst->dev->ifindex; return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); } -- cgit v1.2.3 From a7ce45a74bedd0ed80b956b3e14f8bc6dec85327 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 20 Jul 2015 23:03:45 +0200 Subject: bridge: mcast: fix br_multicast_dev_del warn when igmp snooping is not defined Fix: net/bridge/br_if.c: In function 'br_dev_delete': >> net/bridge/br_if.c:284:2: error: implicit declaration of function >> 'br_multicast_dev_del' [-Werror=implicit-function-declaration] br_multicast_dev_del(br); ^ cc1: some warnings being treated as errors when igmp snooping is not defined. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_private.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 2bada2279721..3ad1290528af 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -566,6 +566,10 @@ static inline void br_multicast_stop(struct net_bridge *br) { } +static inline void br_multicast_dev_del(struct net_bridge *br) +{ +} + static inline void br_multicast_deliver(struct net_bridge_mdb_entry *mdst, struct sk_buff *skb) { -- cgit v1.2.3 From 0c4f691ff6791e55ac831666df0b49b1679c56e4 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sat, 18 Jul 2015 18:24:48 -0700 Subject: net: don't reforward packets already forwarded by offload device Just before queuing skb for xmit on port, check if skb has been marked by switchdev port driver as already fordwarded by device. If so, drop skb. A non-zero skb->offload_fwd_mark field is set by the switchdev port driver/device on ingress to indicate the skb has already been forwarded by the device to egress ports with matching dev->skb_mark. The switchdev port driver would assign a non-zero dev->offload_skb_mark for each device port netdev during registration, for example. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Acked-by: Roopa Prabhu Acked-by: Nicolas Dichtel Signed-off-by: David S. Miller --- include/linux/netdevice.h | 6 ++++++ include/linux/skbuff.h | 9 ++++++++- net/core/dev.c | 10 ++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 45cfd797eb77..8364f29e08be 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1456,6 +1456,8 @@ enum netdev_priv_flags { * * @xps_maps: XXX: need comments on this one * + * @offload_fwd_mark: Offload device fwding mark + * * @trans_start: Time (in jiffies) of last Tx * @watchdog_timeo: Represents the timeout that is used by * the watchdog ( see dev_watchdog() ) @@ -1697,6 +1699,10 @@ struct net_device { struct xps_dev_maps __rcu *xps_maps; #endif +#ifdef CONFIG_NET_SWITCHDEV + u32 offload_fwd_mark; +#endif + /* These may be needed for future network-power-down code. */ /* diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d6cdd6e87d53..af7a09650fa2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -506,6 +506,7 @@ static inline u32 skb_mstamp_us_delta(const struct skb_mstamp *t1, * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking + * @offload_fwd_mark: fwding offload mark * @mark: Generic packet mark * @vlan_proto: vlan encapsulation protocol * @vlan_tci: vlan tag control information @@ -650,9 +651,15 @@ struct sk_buff { unsigned int sender_cpu; }; #endif + union { #ifdef CONFIG_NETWORK_SECMARK - __u32 secmark; + __u32 secmark; +#endif +#ifdef CONFIG_NET_SWITCHDEV + __u32 offload_fwd_mark; #endif + }; + union { __u32 mark; __u32 reserved_tailroom; diff --git a/net/core/dev.c b/net/core/dev.c index 8810b6bbebfe..2ee15afb412d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3061,6 +3061,16 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) else skb_dst_force(skb); +#ifdef CONFIG_NET_SWITCHDEV + /* Don't forward if offload device already forwarded */ + if (skb->offload_fwd_mark && + skb->offload_fwd_mark == dev->offload_fwd_mark) { + consume_skb(skb); + rc = NET_XMIT_SUCCESS; + goto out; + } +#endif + txq = netdev_pick_tx(dev, skb, accel_priv); q = rcu_dereference_bh(txq->qdisc); -- cgit v1.2.3 From d754f98b502ad9a8c7570d494e1eaa0e6bc0350c Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sat, 18 Jul 2015 18:24:49 -0700 Subject: net: add phys ID compare helper to test if two IDs are the same Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ net/switchdev/switchdev.c | 8 ++------ 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8364f29e08be..607b5f41f46f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -766,6 +766,13 @@ struct netdev_phys_item_id { unsigned char id_len; }; +static inline bool netdev_phys_item_id_same(struct netdev_phys_item_id *a, + struct netdev_phys_item_id *b) +{ + return a->id_len == b->id_len && + memcmp(a->id, b->id, a->id_len) == 0; +} + typedef u16 (*select_queue_fallback_t)(struct net_device *dev, struct sk_buff *skb); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 9f2add3cba26..4e5bba50ccff 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -910,13 +910,9 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi) if (switchdev_port_attr_get(dev, &attr)) return NULL; - if (nhsel > 0) { - if (prev_attr.u.ppid.id_len != attr.u.ppid.id_len) + if (nhsel > 0 && + !netdev_phys_item_id_same(&prev_attr.u.ppid, &attr.u.ppid)) return NULL; - if (memcmp(prev_attr.u.ppid.id, attr.u.ppid.id, - attr.u.ppid.id_len)) - return NULL; - } prev_attr = attr; } -- cgit v1.2.3 From 1a3b2ec93d4277b121979321b4024b438cb09504 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Sat, 18 Jul 2015 18:24:50 -0700 Subject: switchdev: add offload_fwd_mark generator helper skb->offload_fwd_mark and dev->offload_fwd_mark are 32-bit and should be unique for device and may even be unique for a sub-set of ports within device, so add switchdev helper function to generate unique marks based on port's switch ID and group_ifindex. group_ifindex would typically be the container dev's ifindex, such as the bridge's ifindex. The generator uses a global hash table to store offload_fwd_marks hashed by {switch ID, group_ifindex} key. Signed-off-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/switchdev.h | 9 ++++ net/switchdev/switchdev.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) (limited to 'net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index d5671f118bfc..89da8934519b 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -157,6 +157,9 @@ int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int idx); +void switchdev_port_fwd_mark_set(struct net_device *dev, + struct net_device *group_dev, + bool joining); #else @@ -271,6 +274,12 @@ static inline int switchdev_port_fdb_dump(struct sk_buff *skb, return -EOPNOTSUPP; } +static inline void switchdev_port_fwd_mark_set(struct net_device *dev, + struct net_device *group_dev, + bool joining) +{ +} + #endif #endif /* _LINUX_SWITCHDEV_H_ */ diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 4e5bba50ccff..33bafa2e703e 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -1039,3 +1039,106 @@ void switchdev_fib_ipv4_abort(struct fib_info *fi) fi->fib_net->ipv4.fib_offload_disabled = true; } EXPORT_SYMBOL_GPL(switchdev_fib_ipv4_abort); + +static bool switchdev_port_same_parent_id(struct net_device *a, + struct net_device *b) +{ + struct switchdev_attr a_attr = { + .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .flags = SWITCHDEV_F_NO_RECURSE, + }; + struct switchdev_attr b_attr = { + .id = SWITCHDEV_ATTR_PORT_PARENT_ID, + .flags = SWITCHDEV_F_NO_RECURSE, + }; + + if (switchdev_port_attr_get(a, &a_attr) || + switchdev_port_attr_get(b, &b_attr)) + return false; + + return netdev_phys_item_id_same(&a_attr.u.ppid, &b_attr.u.ppid); +} + +static u32 switchdev_port_fwd_mark_get(struct net_device *dev, + struct net_device *group_dev) +{ + struct net_device *lower_dev; + struct list_head *iter; + + netdev_for_each_lower_dev(group_dev, lower_dev, iter) { + if (lower_dev == dev) + continue; + if (switchdev_port_same_parent_id(dev, lower_dev)) + return lower_dev->offload_fwd_mark; + return switchdev_port_fwd_mark_get(dev, lower_dev); + } + + return dev->ifindex; +} + +static void switchdev_port_fwd_mark_reset(struct net_device *group_dev, + u32 old_mark, u32 *reset_mark) +{ + struct net_device *lower_dev; + struct list_head *iter; + + netdev_for_each_lower_dev(group_dev, lower_dev, iter) { + if (lower_dev->offload_fwd_mark == old_mark) { + if (!*reset_mark) + *reset_mark = lower_dev->ifindex; + lower_dev->offload_fwd_mark = *reset_mark; + } + switchdev_port_fwd_mark_reset(lower_dev, old_mark, reset_mark); + } +} + +/** + * switchdev_port_fwd_mark_set - Set port offload forwarding mark + * + * @dev: port device + * @group_dev: containing device + * @joining: true if dev is joining group; false if leaving group + * + * An ungrouped port's offload mark is just its ifindex. A grouped + * port's (member of a bridge, for example) offload mark is the ifindex + * of one of the ports in the group with the same parent (switch) ID. + * Ports on the same device in the same group will have the same mark. + * + * Example: + * + * br0 ifindex=9 + * sw1p1 ifindex=2 mark=2 + * sw1p2 ifindex=3 mark=2 + * sw2p1 ifindex=4 mark=5 + * sw2p2 ifindex=5 mark=5 + * + * If sw2p2 leaves the bridge, we'll have: + * + * br0 ifindex=9 + * sw1p1 ifindex=2 mark=2 + * sw1p2 ifindex=3 mark=2 + * sw2p1 ifindex=4 mark=4 + * sw2p2 ifindex=5 mark=5 + */ +void switchdev_port_fwd_mark_set(struct net_device *dev, + struct net_device *group_dev, + bool joining) +{ + u32 mark = dev->ifindex; + u32 reset_mark = 0; + + if (group_dev && joining) { + mark = switchdev_port_fwd_mark_get(dev, group_dev); + } else if (group_dev && !joining) { + if (dev->offload_fwd_mark == mark) + /* Ohoh, this port was the mark reference port, + * but it's leaving the group, so reset the + * mark for the remaining ports in the group. + */ + switchdev_port_fwd_mark_reset(group_dev, mark, + &reset_mark); + } + + dev->offload_fwd_mark = mark; +} +EXPORT_SYMBOL_GPL(switchdev_port_fwd_mark_set); -- cgit v1.2.3 From 9d13ec65ede775f896c3da1cfa35283afe2f796c Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 16 Jul 2015 16:54:19 -0400 Subject: tipc: introduce link entry structure to struct tipc_node struct 'tipc_node' currently contains two arrays for link attributes, one for the link pointers, and one for the usable link MTUs. We now group those into a new struct 'tipc_link_entry', and intoduce one single array consisting of such enties. Apart from being a cosmetic improvement, this is a starting point for the strict master-slave relation between node and link that we will introduce in the following commits. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bcast.c | 2 +- net/tipc/discover.c | 2 +- net/tipc/link.c | 60 ++++++++++--------- net/tipc/name_distr.c | 2 +- net/tipc/node.c | 163 ++++++++++++++++++++++++-------------------------- net/tipc/node.h | 50 +++++++++------- 6 files changed, 143 insertions(+), 136 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index a816382fc8af..59b2f2a538e1 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -413,7 +413,7 @@ static void bclink_accept_pkt(struct tipc_node *node, u32 seqno) * all nodes in the cluster don't ACK at the same time */ if (((seqno - tn->own_addr) % TIPC_MIN_LINK_WIN) == 0) { - tipc_link_proto_xmit(node->active_links[node->addr & 1], + tipc_link_proto_xmit(node_active_link(node, node->addr), STATE_MSG, 0, 0, 0, 0); tn->bcl->stats.sent_acks++; } diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 967e292f53c8..933445337fb4 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -170,7 +170,7 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf, return; tipc_node_lock(node); node->capabilities = caps; - link = node->links[bearer->identity]; + link = node->links[bearer->identity].link; /* Prepare to validate requesting node's signature and media address */ sign_match = (signature == node->signature); diff --git a/net/tipc/link.c b/net/tipc/link.c index eaa9fe54b4ae..03372a7e98df 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -132,9 +132,11 @@ static void tipc_link_put(struct tipc_link *l_ptr) static struct tipc_link *tipc_parallel_link(struct tipc_link *l) { - if (l->owner->active_links[0] != l) - return l->owner->active_links[0]; - return l->owner->active_links[1]; + struct tipc_node *n = l->owner; + + if (node_active_link(n, 0) != l) + return node_active_link(n, 0); + return node_active_link(n, 1); } /* @@ -147,10 +149,11 @@ int tipc_link_is_up(struct tipc_link *l_ptr) return link_working_working(l_ptr) || link_working_unknown(l_ptr); } -int tipc_link_is_active(struct tipc_link *l_ptr) +int tipc_link_is_active(struct tipc_link *l) { - return (l_ptr->owner->active_links[0] == l_ptr) || - (l_ptr->owner->active_links[1] == l_ptr); + struct tipc_node *n = l->owner; + + return (node_active_link(n, 0) == l) || (node_active_link(n, 1) == l); } /** @@ -240,7 +243,7 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, return NULL; } - if (n_ptr->links[b_ptr->identity]) { + if (n_ptr->links[b_ptr->identity].link) { tipc_addr_string_fill(addr_string, n_ptr->addr); pr_err("Attempt to establish second link on <%s> to %s\n", b_ptr->name, addr_string); @@ -321,7 +324,7 @@ void tipc_link_delete_list(struct net *net, unsigned int bearer_id) rcu_read_lock(); list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_lock(node); - link = node->links[bearer_id]; + link = node->links[bearer_id].link; if (link) tipc_link_delete(link); tipc_node_unlock(node); @@ -446,7 +449,7 @@ void tipc_link_reset(struct tipc_link *l_ptr) if ((prev_state == RESET_UNKNOWN) || (prev_state == RESET_RESET)) return; - tipc_node_link_down(l_ptr->owner, l_ptr); + tipc_node_link_down(l_ptr->owner, l_ptr->bearer_id); tipc_bearer_remove_dest(owner->net, l_ptr->bearer_id, l_ptr->addr); if (was_active_link && tipc_node_is_up(l_ptr->owner) && (pl != l_ptr)) { @@ -482,7 +485,7 @@ static void link_activate(struct tipc_link *link) link->rcv_nxt = 1; link->stats.recv_info = 1; link->silent_intv_cnt = 0; - tipc_node_link_up(node, link); + tipc_node_link_up(node, link->bearer_id); tipc_bearer_add_dest(node->net, link->bearer_id, link->addr); } @@ -577,7 +580,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) case TRAFFIC_MSG_EVT: break; case ACTIVATE_MSG: - other = l_ptr->owner->active_links[0]; + other = node_active_link(l_ptr->owner, 0); if (other && link_working_unknown(other)) break; l_ptr->state = WORKING_WORKING; @@ -606,7 +609,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) switch (event) { case TRAFFIC_MSG_EVT: case ACTIVATE_MSG: - other = l_ptr->owner->active_links[0]; + other = node_active_link(l_ptr->owner, 0); if (other && link_working_unknown(other)) break; l_ptr->state = WORKING_WORKING; @@ -755,7 +758,7 @@ int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, node = tipc_node_find(net, dnode); if (node) { tipc_node_lock(node); - link = node->active_links[selector & 1]; + link = node_active_link(node, selector & 1); if (link) rc = __tipc_link_xmit(net, link, list); tipc_node_unlock(node); @@ -858,9 +861,9 @@ void tipc_link_reset_all(struct tipc_node *node) tipc_addr_string_fill(addr_string, node->addr)); for (i = 0; i < MAX_BEARERS; i++) { - if (node->links[i]) { - link_print(node->links[i], "Resetting link\n"); - tipc_link_reset(node->links[i]); + if (node->links[i].link) { + link_print(node->links[i].link, "Resetting link\n"); + tipc_link_reset(node->links[i].link); } } @@ -1029,7 +1032,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) tipc_node_lock(n_ptr); /* Locate unicast link endpoint that should handle message */ - l_ptr = n_ptr->links[b_ptr->identity]; + l_ptr = n_ptr->links[b_ptr->identity].link; if (unlikely(!l_ptr)) goto unlock; @@ -1496,7 +1499,7 @@ static void tipc_link_tunnel_xmit(struct tipc_link *l_ptr, struct sk_buff *skb; u32 length = msg_size(msg); - tunnel = l_ptr->owner->active_links[selector & 1]; + tunnel = node_active_link(l_ptr->owner, selector & 1); if (!tipc_link_is_up(tunnel)) { pr_warn("%stunnel link no longer available\n", link_co_err); return; @@ -1522,7 +1525,7 @@ static void tipc_link_tunnel_xmit(struct tipc_link *l_ptr, void tipc_link_failover_send_queue(struct tipc_link *l_ptr) { int msgcount; - struct tipc_link *tunnel = l_ptr->owner->active_links[0]; + struct tipc_link *tunnel = node_active_link(l_ptr->owner, 0); struct tipc_msg tunnel_hdr; struct sk_buff *skb; int split_bundles; @@ -1556,8 +1559,8 @@ void tipc_link_failover_send_queue(struct tipc_link *l_ptr) return; } - split_bundles = (l_ptr->owner->active_links[0] != - l_ptr->owner->active_links[1]); + split_bundles = (node_active_link(l_ptr->owner, 0) != + node_active_link(l_ptr->owner, 0)); skb_queue_walk(&l_ptr->transmq, skb) { struct tipc_msg *msg = buf_msg(skb); @@ -1660,7 +1663,7 @@ static bool tipc_link_failover_rcv(struct tipc_link *link, if (bearer_id == link->bearer_id) goto exit; - pl = link->owner->links[bearer_id]; + pl = link->owner->links[bearer_id].link; if (pl && tipc_link_is_up(pl)) tipc_link_reset(pl); @@ -1743,7 +1746,7 @@ static struct tipc_node *tipc_link_find_owner(struct net *net, list_for_each_entry_rcu(n_ptr, &tn->node_list, list) { tipc_node_lock(n_ptr); for (i = 0; i < MAX_BEARERS; i++) { - l_ptr = n_ptr->links[i]; + l_ptr = n_ptr->links[i].link; if (l_ptr && !strcmp(l_ptr->name, link_name)) { *bearer_id = i; found_node = n_ptr; @@ -1865,7 +1868,7 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info) tipc_node_lock(node); - link = node->links[bearer_id]; + link = node->links[bearer_id].link; if (!link) { res = -EINVAL; goto out; @@ -2055,10 +2058,11 @@ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, for (i = *prev_link; i < MAX_BEARERS; i++) { *prev_link = i; - if (!node->links[i]) + if (!node->links[i].link) continue; - err = __tipc_nl_add_link(net, msg, node->links[i], NLM_F_MULTI); + err = __tipc_nl_add_link(net, msg, + node->links[i].link, NLM_F_MULTI); if (err) return err; } @@ -2172,7 +2176,7 @@ int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info) return -EINVAL; tipc_node_lock(node); - link = node->links[bearer_id]; + link = node->links[bearer_id].link; if (!link) { tipc_node_unlock(node); nlmsg_free(msg.skb); @@ -2227,7 +2231,7 @@ int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info) tipc_node_lock(node); - link = node->links[bearer_id]; + link = node->links[bearer_id].link; if (!link) { tipc_node_unlock(node); return -EINVAL; diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 41e7b7e4dda0..3a1539e96294 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -96,7 +96,7 @@ void named_cluster_distribute(struct net *net, struct sk_buff *skb) dnode = node->addr; if (in_own_node(net, dnode)) continue; - if (!tipc_node_active_links(node)) + if (!tipc_node_is_up(node)) continue; oskb = pskb_copy(skb, GFP_ATOMIC); if (!oskb) diff --git a/net/tipc/node.c b/net/tipc/node.c index 0b1d61a5f853..db46e5d1d156 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -224,126 +224,119 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) * * Link becomes active (alone or shared) or standby, depending on its priority. */ -void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr) +void tipc_node_link_up(struct tipc_node *n, int bearer_id) { - struct tipc_link **active = &n_ptr->active_links[0]; + struct tipc_link_entry **actv = &n->active_links[0]; + struct tipc_link_entry *le = &n->links[bearer_id]; + struct tipc_link *l = le->link; - n_ptr->working_links++; - n_ptr->action_flags |= TIPC_NOTIFY_LINK_UP; - n_ptr->link_id = l_ptr->peer_bearer_id << 16 | l_ptr->bearer_id; + /* Leave room for tunnel header when returning 'mtu' to users: */ + n->links[bearer_id].mtu = l->mtu - INT_H_SIZE; + + n->working_links++; + n->action_flags |= TIPC_NOTIFY_LINK_UP; + n->link_id = l->peer_bearer_id << 16 | l->bearer_id; pr_debug("Established link <%s> on network plane %c\n", - l_ptr->name, l_ptr->net_plane); + l->name, l->net_plane); - if (!active[0]) { - active[0] = active[1] = l_ptr; - node_established_contact(n_ptr); - goto exit; + /* No active links ? => take both active slots */ + if (!actv[0]) { + actv[0] = le; + actv[1] = le; + node_established_contact(n); + return; } - if (l_ptr->priority < active[0]->priority) { - pr_debug("New link <%s> becomes standby\n", l_ptr->name); - goto exit; + if (l->priority < actv[0]->link->priority) { + pr_debug("New link <%s> becomes standby\n", l->name); + return; } - tipc_link_dup_queue_xmit(active[0], l_ptr); - if (l_ptr->priority == active[0]->priority) { - active[0] = l_ptr; - goto exit; + tipc_link_dup_queue_xmit(actv[0]->link, l); + + /* Take one active slot if applicable */ + if (l->priority == actv[0]->link->priority) { + actv[0] = le; + return; } - pr_debug("Old link <%s> becomes standby\n", active[0]->name); - if (active[1] != active[0]) - pr_debug("Old link <%s> becomes standby\n", active[1]->name); - active[0] = active[1] = l_ptr; -exit: - /* Leave room for changeover header when returning 'mtu' to users: */ - n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE; - n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE; + /* Higher prio than current active? => take both active slots */ + pr_debug("Old l <%s> becomes standby\n", actv[0]->link->name); + if (actv[1] != actv[0]) + pr_debug("Old link <%s> now standby\n", actv[1]->link->name); + actv[0] = le; + actv[1] = le; } /** - * node_select_active_links - select active link + * node_select_active_links - select which working links should be active */ -static void node_select_active_links(struct tipc_node *n_ptr) +static void node_select_active_links(struct tipc_node *n) { - struct tipc_link **active = &n_ptr->active_links[0]; - u32 i; - u32 highest_prio = 0; + struct tipc_link_entry **actv = &n->active_links[0]; + struct tipc_link *l; + u32 b, highest = 0; - active[0] = active[1] = NULL; - - for (i = 0; i < MAX_BEARERS; i++) { - struct tipc_link *l_ptr = n_ptr->links[i]; + actv[0] = NULL; + actv[1] = NULL; - if (!l_ptr || !tipc_link_is_up(l_ptr) || - (l_ptr->priority < highest_prio)) + for (b = 0; b < MAX_BEARERS; b++) { + l = n->links[b].link; + if (!l || !tipc_link_is_up(l) || (l->priority < highest)) + continue; + if (l->priority > highest) { + highest = l->priority; + actv[0] = &n->links[b]; + actv[1] = &n->links[b]; continue; - - if (l_ptr->priority > highest_prio) { - highest_prio = l_ptr->priority; - active[0] = active[1] = l_ptr; - } else { - active[1] = l_ptr; } + actv[1] = &n->links[b]; } } /** * tipc_node_link_down - handle loss of link */ -void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr) +void tipc_node_link_down(struct tipc_node *n, int bearer_id) { - struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id); - struct tipc_link **active; + struct tipc_link_entry **actv = &n->active_links[0]; + struct tipc_link_entry *le = &n->links[bearer_id]; + struct tipc_link *l = le->link; - n_ptr->working_links--; - n_ptr->action_flags |= TIPC_NOTIFY_LINK_DOWN; - n_ptr->link_id = l_ptr->peer_bearer_id << 16 | l_ptr->bearer_id; + n->working_links--; + n->action_flags |= TIPC_NOTIFY_LINK_DOWN; + n->link_id = l->peer_bearer_id << 16 | l->bearer_id; - if (!tipc_link_is_active(l_ptr)) { + if (!tipc_link_is_active(l)) { pr_debug("Lost standby link <%s> on network plane %c\n", - l_ptr->name, l_ptr->net_plane); + l->name, l->net_plane); return; } pr_debug("Lost link <%s> on network plane %c\n", - l_ptr->name, l_ptr->net_plane); - - active = &n_ptr->active_links[0]; - if (active[0] == l_ptr) - active[0] = active[1]; - if (active[1] == l_ptr) - active[1] = active[0]; - if (active[0] == l_ptr) - node_select_active_links(n_ptr); - if (tipc_node_is_up(n_ptr)) - tipc_link_failover_send_queue(l_ptr); - else - node_lost_contact(n_ptr); + l->name, l->net_plane); - /* Leave room for changeover header when returning 'mtu' to users: */ - if (active[0]) { - n_ptr->act_mtus[0] = active[0]->mtu - INT_H_SIZE; - n_ptr->act_mtus[1] = active[1]->mtu - INT_H_SIZE; - return; - } - /* Loopback link went down? No fragmentation needed from now on. */ - if (n_ptr->addr == tn->own_addr) { - n_ptr->act_mtus[0] = MAX_MSG_SIZE; - n_ptr->act_mtus[1] = MAX_MSG_SIZE; - } -} + /* Resdistribute active slots if applicable */ + if (actv[0] == le) + actv[0] = actv[1]; + if (actv[1] == le) + actv[1] = actv[0]; -int tipc_node_active_links(struct tipc_node *n_ptr) -{ - return n_ptr->active_links[0] != NULL; + /* Last link of this priority? => select other ones if available */ + if (actv[0] == le) + node_select_active_links(n); + + if (tipc_node_is_up(n)) + tipc_link_failover_send_queue(l); + else + node_lost_contact(n); } -int tipc_node_is_up(struct tipc_node *n_ptr) +bool tipc_node_is_up(struct tipc_node *n) { - return tipc_node_active_links(n_ptr); + return n->active_links[0]; } void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr) { - n_ptr->links[l_ptr->bearer_id] = l_ptr; + n_ptr->links[l_ptr->bearer_id].link = l_ptr; n_ptr->link_cnt++; } @@ -352,9 +345,9 @@ void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr) int i; for (i = 0; i < MAX_BEARERS; i++) { - if (l_ptr != n_ptr->links[i]) + if (l_ptr != n_ptr->links[i].link) continue; - n_ptr->links[i] = NULL; + n_ptr->links[i].link = NULL; n_ptr->link_cnt--; } } @@ -396,7 +389,7 @@ static void node_lost_contact(struct tipc_node *n_ptr) /* Abort any ongoing link failover */ for (i = 0; i < MAX_BEARERS; i++) { - struct tipc_link *l_ptr = n_ptr->links[i]; + struct tipc_link *l_ptr = n_ptr->links[i].link; if (!l_ptr) continue; l_ptr->flags &= ~LINK_FAILINGOVER; @@ -453,7 +446,7 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, goto exit; tipc_node_lock(node); - link = node->links[bearer_id]; + link = node->links[bearer_id].link; if (link) { strncpy(linkname, link->name, len); err = 0; diff --git a/net/tipc/node.h b/net/tipc/node.h index 5a834cf142c8..320cea313bdc 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -89,6 +89,11 @@ struct tipc_node_bclink { bool recv_permitted; }; +struct tipc_link_entry { + struct tipc_link *link; + u32 mtu; +}; + /** * struct tipc_node - TIPC node structure * @addr: network address of node @@ -98,9 +103,8 @@ struct tipc_node_bclink { * @hash: links to adjacent nodes in unsorted hash chain * @inputq: pointer to input queue containing messages for msg event * @namedq: pointer to name table input queue with name table messages - * @curr_link: the link holding the node lock, if any - * @active_links: pointers to active links to node - * @links: pointers to all links to node + * @active_links: pointer into links[] array, identifying which links are active + * @links: array containing references to all links to node * @action_flags: bit mask of different types of node actions * @bclink: broadcast-related info * @list: links to adjacent nodes in sorted list of cluster's nodes @@ -120,9 +124,8 @@ struct tipc_node { struct hlist_node hash; struct sk_buff_head *inputq; struct sk_buff_head *namedq; - struct tipc_link *active_links[2]; - u32 act_mtus[2]; - struct tipc_link *links[MAX_BEARERS]; + struct tipc_link_entry *active_links[2]; + struct tipc_link_entry links[MAX_BEARERS]; int action_flags; struct tipc_node_bclink bclink; struct list_head list; @@ -142,10 +145,9 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr); void tipc_node_stop(struct net *net); void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); -void tipc_node_link_down(struct tipc_node *n_ptr, struct tipc_link *l_ptr); -void tipc_node_link_up(struct tipc_node *n_ptr, struct tipc_link *l_ptr); -int tipc_node_active_links(struct tipc_node *n_ptr); -int tipc_node_is_up(struct tipc_node *n_ptr); +void tipc_node_link_down(struct tipc_node *n_ptr, int bearer_id); +void tipc_node_link_up(struct tipc_node *n_ptr, int bearer_id); +bool tipc_node_is_up(struct tipc_node *n); int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node, char *linkname, size_t len); void tipc_node_unlock(struct tipc_node *node); @@ -165,20 +167,28 @@ static inline bool tipc_node_blocked(struct tipc_node *node) TIPC_NOTIFY_NODE_DOWN | TIPC_WAIT_OWN_LINKS_DOWN)); } -static inline uint tipc_node_get_mtu(struct net *net, u32 addr, u32 selector) +static inline struct tipc_link *node_active_link(struct tipc_node *n, int sel) { - struct tipc_node *node; - u32 mtu; + struct tipc_link_entry *le = n->active_links[sel & 1]; - node = tipc_node_find(net, addr); + if (likely(le)) + return le->link; + return NULL; +} - if (likely(node)) { - mtu = node->act_mtus[selector & 1]; - tipc_node_put(node); - } else { - mtu = MAX_MSG_SIZE; - } +static inline uint tipc_node_get_mtu(struct net *net, u32 addr, u32 selector) +{ + struct tipc_node *n; + struct tipc_link_entry *le; + unsigned int mtu = MAX_MSG_SIZE; + n = tipc_node_find(net, addr); + if (unlikely(!n)) + return mtu; + le = n->active_links[selector & 1]; + if (likely(le)) + mtu = le->mtu; + tipc_node_put(n); return mtu; } -- cgit v1.2.3 From d3a43b907ae688af6cb753c53cd7de05f3c1ba85 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 16 Jul 2015 16:54:20 -0400 Subject: tipc: move link creation from neighbor discoverer to node As a step towards turning links into node internal entities, we move the creation of links from the neighbor discovery logics to the node's link control logics. We also create an additional entry for the link's media address in the newly introduced struct tipc_link_entry, since this is where it is needed in the upcoming commits. The current copy in struct tipc_link is kept for now, but will be removed later. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/discover.c | 20 ++++---------------- net/tipc/node.c | 27 +++++++++++++++++++++++++++ net/tipc/node.h | 6 ++++++ 3 files changed, 37 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 933445337fb4..164d08907d6f 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -35,7 +35,7 @@ */ #include "core.h" -#include "link.h" +#include "node.h" #include "discover.h" /* min delay during bearer start up */ @@ -125,7 +125,6 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf, { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *node; - struct tipc_link *link; struct tipc_media_addr maddr; struct sk_buff *rbuf; struct tipc_msg *msg = buf_msg(buf); @@ -170,13 +169,10 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf, return; tipc_node_lock(node); node->capabilities = caps; - link = node->links[bearer->identity].link; /* Prepare to validate requesting node's signature and media address */ sign_match = (signature == node->signature); - addr_match = link && !memcmp(&link->media_addr, &maddr, sizeof(maddr)); - link_up = link && tipc_link_is_up(link); - + tipc_node_check_dest(node, bearer, &link_up, &addr_match, &maddr); /* These three flags give us eight permutations: */ @@ -239,16 +235,8 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf, if (accept_sign) node->signature = signature; - if (accept_addr) { - if (!link) - link = tipc_link_create(node, bearer, &maddr); - if (link) { - memcpy(&link->media_addr, &maddr, sizeof(maddr)); - tipc_link_reset(link); - } else { - respond = false; - } - } + if (accept_addr && !tipc_node_update_dest(node, bearer, &maddr)) + respond = false; /* Send response, if necessary */ if (respond && (mtyp == DSC_REQ_MSG)) { diff --git a/net/tipc/node.c b/net/tipc/node.c index db46e5d1d156..06f642abdf38 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -334,6 +334,33 @@ bool tipc_node_is_up(struct tipc_node *n) return n->active_links[0]; } +void tipc_node_check_dest(struct tipc_node *n, struct tipc_bearer *b, + bool *link_up, bool *addr_match, + struct tipc_media_addr *maddr) +{ + struct tipc_link *l = n->links[b->identity].link; + struct tipc_media_addr *curr = &n->links[b->identity].maddr; + + *link_up = l && tipc_link_is_up(l); + *addr_match = l && !memcmp(curr, maddr, sizeof(*maddr)); +} + +bool tipc_node_update_dest(struct tipc_node *n, struct tipc_bearer *b, + struct tipc_media_addr *maddr) +{ + struct tipc_link *l = n->links[b->identity].link; + struct tipc_media_addr *curr = &n->links[b->identity].maddr; + + if (!l) + l = tipc_link_create(n, b, maddr); + if (!l) + return false; + memcpy(&l->media_addr, maddr, sizeof(*maddr)); + memcpy(curr, maddr, sizeof(*maddr)); + tipc_link_reset(l); + return true; +} + void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr) { n_ptr->links[l_ptr->bearer_id].link = l_ptr; diff --git a/net/tipc/node.h b/net/tipc/node.h index 320cea313bdc..68579c70748b 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -92,6 +92,7 @@ struct tipc_node_bclink { struct tipc_link_entry { struct tipc_link *link; u32 mtu; + struct tipc_media_addr maddr; }; /** @@ -143,6 +144,11 @@ struct tipc_node *tipc_node_find(struct net *net, u32 addr); void tipc_node_put(struct tipc_node *node); struct tipc_node *tipc_node_create(struct net *net, u32 addr); void tipc_node_stop(struct net *net); +void tipc_node_check_dest(struct tipc_node *n, struct tipc_bearer *bearer, + bool *link_up, bool *addr_match, + struct tipc_media_addr *maddr); +bool tipc_node_update_dest(struct tipc_node *n, struct tipc_bearer *bearer, + struct tipc_media_addr *maddr); void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); void tipc_node_link_down(struct tipc_node *n_ptr, int bearer_id); -- cgit v1.2.3 From d39bbd445dc44259c77bbbc8aadcce7dcdba39cc Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 16 Jul 2015 16:54:21 -0400 Subject: tipc: move link input queue to tipc_node At present, the link input queue and the name distributor receive queues are fields aggregated in struct tipc_link. This is a hazard, because a link might be deleted while a receiving socket still keeps reference to one of the queues. This commit fixes this bug. However, rather than adding yet another reference counter to the critical data path, we move the two queues to safe ground inside struct tipc_node, which is already protected, and let the link code only handle references to the queues. This is also in line with planned later changes in this area. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 27 +++++++++++++++------------ net/tipc/link.h | 12 +++++++----- net/tipc/node.c | 4 +++- net/tipc/node.h | 3 ++- 4 files changed, 27 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 03372a7e98df..f8e0e2ceceb4 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -227,7 +227,9 @@ static void link_set_timer(struct tipc_link *link, unsigned long time) */ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, struct tipc_bearer *b_ptr, - const struct tipc_media_addr *media_addr) + const struct tipc_media_addr *media_addr, + struct sk_buff_head *inputq, + struct sk_buff_head *namedq) { struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id); struct tipc_link *l_ptr; @@ -289,8 +291,9 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, __skb_queue_head_init(&l_ptr->backlogq); __skb_queue_head_init(&l_ptr->deferdq); skb_queue_head_init(&l_ptr->wakeupq); - skb_queue_head_init(&l_ptr->inputq); - skb_queue_head_init(&l_ptr->namedq); + l_ptr->inputq = inputq; + l_ptr->namedq = namedq; + skb_queue_head_init(l_ptr->inputq); link_reset_statistics(l_ptr); tipc_node_attach_link(n_ptr, l_ptr); setup_timer(&l_ptr->timer, link_timeout, (unsigned long)l_ptr); @@ -391,8 +394,8 @@ void link_prepare_wakeup(struct tipc_link *l) if ((pnd[imp] + l->backlog[imp].len) >= lim) break; skb_unlink(skb, &l->wakeupq); - skb_queue_tail(&l->inputq, skb); - l->owner->inputq = &l->inputq; + skb_queue_tail(l->inputq, skb); + l->owner->inputq = l->inputq; l->owner->action_flags |= TIPC_MSG_EVT; } } @@ -465,7 +468,7 @@ void tipc_link_reset(struct tipc_link *l_ptr) __skb_queue_purge(&l_ptr->transmq); __skb_queue_purge(&l_ptr->deferdq); if (!owner->inputq) - owner->inputq = &l_ptr->inputq; + owner->inputq = l_ptr->inputq; skb_queue_splice_init(&l_ptr->wakeupq, owner->inputq); if (!skb_queue_empty(owner->inputq)) owner->action_flags |= TIPC_MSG_EVT; @@ -962,7 +965,7 @@ static bool link_synch(struct tipc_link *l) /* Is it still in the input queue ? */ post_synch = mod(pl->rcv_nxt - l->synch_point) - 1; - if (skb_queue_len(&pl->inputq) > post_synch) + if (skb_queue_len(pl->inputq) > post_synch) return false; synched: l->flags &= ~LINK_SYNCHING; @@ -1141,16 +1144,16 @@ static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb) case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: case CONN_MANAGER: - if (tipc_skb_queue_tail(&link->inputq, skb, dport)) { - node->inputq = &link->inputq; + if (tipc_skb_queue_tail(link->inputq, skb, dport)) { + node->inputq = link->inputq; node->action_flags |= TIPC_MSG_EVT; } return true; case NAME_DISTRIBUTOR: node->bclink.recv_permitted = true; - node->namedq = &link->namedq; - skb_queue_tail(&link->namedq, skb); - if (skb_queue_len(&link->namedq) == 1) + node->namedq = link->namedq; + skb_queue_tail(link->namedq, skb); + if (skb_queue_len(link->namedq) == 1) node->action_flags |= TIPC_NAMED_MSG_EVT; return true; case MSG_BUNDLER: diff --git a/net/tipc/link.h b/net/tipc/link.h index ae0a0ea572f2..9c71d9e42e93 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -192,8 +192,8 @@ struct tipc_link { u16 rcv_nxt; u32 rcv_unacked; struct sk_buff_head deferdq; - struct sk_buff_head inputq; - struct sk_buff_head namedq; + struct sk_buff_head *inputq; + struct sk_buff_head *namedq; /* Congestion handling */ struct sk_buff_head wakeupq; @@ -207,9 +207,11 @@ struct tipc_link { struct tipc_port; -struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, - struct tipc_bearer *b_ptr, - const struct tipc_media_addr *media_addr); +struct tipc_link *tipc_link_create(struct tipc_node *n, + struct tipc_bearer *b, + const struct tipc_media_addr *maddr, + struct sk_buff_head *inputq, + struct sk_buff_head *namedq); void tipc_link_delete(struct tipc_link *link); void tipc_link_delete_list(struct net *net, unsigned int bearer_id); void tipc_link_failover_send_queue(struct tipc_link *l_ptr); diff --git a/net/tipc/node.c b/net/tipc/node.c index 06f642abdf38..20ec61ceffac 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -132,6 +132,7 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr) INIT_LIST_HEAD(&n_ptr->list); INIT_LIST_HEAD(&n_ptr->publ_list); INIT_LIST_HEAD(&n_ptr->conn_sks); + skb_queue_head_init(&n_ptr->bclink.namedq); __skb_queue_head_init(&n_ptr->bclink.deferdq); hlist_add_head_rcu(&n_ptr->hash, &tn->node_htable[tipc_hashfn(addr)]); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { @@ -350,9 +351,10 @@ bool tipc_node_update_dest(struct tipc_node *n, struct tipc_bearer *b, { struct tipc_link *l = n->links[b->identity].link; struct tipc_media_addr *curr = &n->links[b->identity].maddr; + struct sk_buff_head *inputq = &n->links[b->identity].inputq; if (!l) - l = tipc_link_create(n, b, maddr); + l = tipc_link_create(n, b, maddr, inputq, &n->bclink.namedq); if (!l) return false; memcpy(&l->media_addr, maddr, sizeof(*maddr)); diff --git a/net/tipc/node.h b/net/tipc/node.h index 68579c70748b..0657cbf1f5cd 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -85,13 +85,14 @@ struct tipc_node_bclink { u32 deferred_size; struct sk_buff_head deferdq; struct sk_buff *reasm_buf; - int inputq_map; + struct sk_buff_head namedq; bool recv_permitted; }; struct tipc_link_entry { struct tipc_link *link; u32 mtu; + struct sk_buff_head inputq; struct tipc_media_addr maddr; }; -- cgit v1.2.3 From 36e78a463b26c9b8017a2e11dcd6c4b8e34b4161 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 16 Jul 2015 16:54:22 -0400 Subject: tipc: use bearer index when looking up active links struct tipc_node currently holds two arrays of link pointers; one, indexed by bearer identity, which contains all links irrespective of current state, and one two-slot array for the currently active link or links. The latter array contains direct pointers into the elements of the former. This has the effect that we cannot know the bearer id of a link when accessing it via the "active_links[]" array without actually dereferencing the pointer, something we want to avoid in some cases. In this commit, we do instead store the bearer identity in the "active_links" array, and use this as an index to find the right element in the overall link entry array. This change should be seen as a preparation for the later commits in this series. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/node.c | 106 +++++++++++++++++++++++--------------------------------- net/tipc/node.h | 26 ++++++++------ 2 files changed, 59 insertions(+), 73 deletions(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 20ec61ceffac..19729645d494 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -142,6 +142,8 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr) list_add_tail_rcu(&n_ptr->list, &temp_node->list); n_ptr->action_flags = TIPC_WAIT_PEER_LINKS_DOWN; n_ptr->signature = INVALID_NODE_SIG; + n_ptr->active_links[0] = INVALID_BEARER_ID; + n_ptr->active_links[1] = INVALID_BEARER_ID; tipc_node_get(n_ptr); exit: spin_unlock_bh(&tn->node_list_lock); @@ -227,12 +229,13 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) */ void tipc_node_link_up(struct tipc_node *n, int bearer_id) { - struct tipc_link_entry **actv = &n->active_links[0]; - struct tipc_link_entry *le = &n->links[bearer_id]; - struct tipc_link *l = le->link; + int *slot0 = &n->active_links[0]; + int *slot1 = &n->active_links[1]; + struct tipc_link_entry *links = n->links; + struct tipc_link *l = n->links[bearer_id].link; /* Leave room for tunnel header when returning 'mtu' to users: */ - n->links[bearer_id].mtu = l->mtu - INT_H_SIZE; + links[bearer_id].mtu = l->mtu - INT_H_SIZE; n->working_links++; n->action_flags |= TIPC_NOTIFY_LINK_UP; @@ -242,55 +245,30 @@ void tipc_node_link_up(struct tipc_node *n, int bearer_id) l->name, l->net_plane); /* No active links ? => take both active slots */ - if (!actv[0]) { - actv[0] = le; - actv[1] = le; + if (*slot0 < 0) { + *slot0 = bearer_id; + *slot1 = bearer_id; node_established_contact(n); return; } - if (l->priority < actv[0]->link->priority) { + + /* Lower prio than current active ? => no slot */ + if (l->priority < links[*slot0].link->priority) { pr_debug("New link <%s> becomes standby\n", l->name); return; } - tipc_link_dup_queue_xmit(actv[0]->link, l); + tipc_link_dup_queue_xmit(links[*slot0].link, l); - /* Take one active slot if applicable */ - if (l->priority == actv[0]->link->priority) { - actv[0] = le; + /* Same prio as current active ? => take one slot */ + if (l->priority == links[*slot0].link->priority) { + *slot0 = bearer_id; return; } - /* Higher prio than current active? => take both active slots */ - pr_debug("Old l <%s> becomes standby\n", actv[0]->link->name); - if (actv[1] != actv[0]) - pr_debug("Old link <%s> now standby\n", actv[1]->link->name); - actv[0] = le; - actv[1] = le; -} - -/** - * node_select_active_links - select which working links should be active - */ -static void node_select_active_links(struct tipc_node *n) -{ - struct tipc_link_entry **actv = &n->active_links[0]; - struct tipc_link *l; - u32 b, highest = 0; - actv[0] = NULL; - actv[1] = NULL; - - for (b = 0; b < MAX_BEARERS; b++) { - l = n->links[b].link; - if (!l || !tipc_link_is_up(l) || (l->priority < highest)) - continue; - if (l->priority > highest) { - highest = l->priority; - actv[0] = &n->links[b]; - actv[1] = &n->links[b]; - continue; - } - actv[1] = &n->links[b]; - } + /* Higher prio than current active => take both active slots */ + pr_debug("Old link <%s> now standby\n", links[*slot0].link->name); + *slot0 = bearer_id; + *slot1 = bearer_id; } /** @@ -298,32 +276,36 @@ static void node_select_active_links(struct tipc_node *n) */ void tipc_node_link_down(struct tipc_node *n, int bearer_id) { - struct tipc_link_entry **actv = &n->active_links[0]; - struct tipc_link_entry *le = &n->links[bearer_id]; - struct tipc_link *l = le->link; + int *slot0 = &n->active_links[0]; + int *slot1 = &n->active_links[1]; + int i, highest = 0; + struct tipc_link *l, *_l; + l = n->links[bearer_id].link; n->working_links--; n->action_flags |= TIPC_NOTIFY_LINK_DOWN; n->link_id = l->peer_bearer_id << 16 | l->bearer_id; - if (!tipc_link_is_active(l)) { - pr_debug("Lost standby link <%s> on network plane %c\n", - l->name, l->net_plane); - return; - } pr_debug("Lost link <%s> on network plane %c\n", l->name, l->net_plane); - /* Resdistribute active slots if applicable */ - if (actv[0] == le) - actv[0] = actv[1]; - if (actv[1] == le) - actv[1] = actv[0]; - - /* Last link of this priority? => select other ones if available */ - if (actv[0] == le) - node_select_active_links(n); - + /* Select new active link if any available */ + *slot0 = INVALID_BEARER_ID; + *slot1 = INVALID_BEARER_ID; + for (i = 0; i < MAX_BEARERS; i++) { + _l = n->links[i].link; + if (!_l || !tipc_link_is_up(_l)) + continue; + if (_l->priority < highest) + continue; + if (_l->priority > highest) { + highest = _l->priority; + *slot0 = i; + *slot1 = i; + continue; + } + *slot1 = i; + } if (tipc_node_is_up(n)) tipc_link_failover_send_queue(l); else @@ -332,7 +314,7 @@ void tipc_node_link_down(struct tipc_node *n, int bearer_id) bool tipc_node_is_up(struct tipc_node *n) { - return n->active_links[0]; + return n->active_links[0] != INVALID_BEARER_ID; } void tipc_node_check_dest(struct tipc_node *n, struct tipc_bearer *b, diff --git a/net/tipc/node.h b/net/tipc/node.h index 0657cbf1f5cd..74f278adada3 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -45,6 +45,8 @@ /* Out-of-range value for node signature */ #define INVALID_NODE_SIG 0x10000 +#define INVALID_BEARER_ID -1 + /* Flags used to take different actions according to flag type * TIPC_WAIT_PEER_LINKS_DOWN: wait to see that peer's links are down * TIPC_WAIT_OWN_LINKS_DOWN: wait until peer node is declared down @@ -105,7 +107,7 @@ struct tipc_link_entry { * @hash: links to adjacent nodes in unsorted hash chain * @inputq: pointer to input queue containing messages for msg event * @namedq: pointer to name table input queue with name table messages - * @active_links: pointer into links[] array, identifying which links are active + * @active_links: bearer ids of active links, used as index into links[] array * @links: array containing references to all links to node * @action_flags: bit mask of different types of node actions * @bclink: broadcast-related info @@ -126,7 +128,7 @@ struct tipc_node { struct hlist_node hash; struct sk_buff_head *inputq; struct sk_buff_head *namedq; - struct tipc_link_entry *active_links[2]; + int active_links[2]; struct tipc_link_entry links[MAX_BEARERS]; int action_flags; struct tipc_node_bclink bclink; @@ -176,25 +178,27 @@ static inline bool tipc_node_blocked(struct tipc_node *node) static inline struct tipc_link *node_active_link(struct tipc_node *n, int sel) { - struct tipc_link_entry *le = n->active_links[sel & 1]; + int bearer_id = n->active_links[sel & 1]; + + if (unlikely(bearer_id == INVALID_BEARER_ID)) + return NULL; - if (likely(le)) - return le->link; - return NULL; + return n->links[bearer_id].link; } -static inline uint tipc_node_get_mtu(struct net *net, u32 addr, u32 selector) +static inline unsigned int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel) { struct tipc_node *n; - struct tipc_link_entry *le; + int bearer_id; unsigned int mtu = MAX_MSG_SIZE; n = tipc_node_find(net, addr); if (unlikely(!n)) return mtu; - le = n->active_links[selector & 1]; - if (likely(le)) - mtu = le->mtu; + + bearer_id = n->active_links[sel & 1]; + if (likely(bearer_id != INVALID_BEARER_ID)) + mtu = n->links[bearer_id].mtu; tipc_node_put(n); return mtu; } -- cgit v1.2.3 From 22d85c79428b8ca9a01623aa3e3a1fe29a30a119 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 16 Jul 2015 16:54:23 -0400 Subject: tipc: change sk_buffer handling in tipc_link_xmit() When the function tipc_link_xmit() is given a buffer list for transmission, it currently consumes the list both when transmission is successful and when it fails, except for the special case when it encounters link congestion. This behavior is inconsistent, and needs to be corrected if we want to avoid problems in later commits in this series. In this commit, we change this to let the function consume the list only when transmission is successful, and leave the list with the sender in all other cases. We also modifiy the socket code so that it adapts to this change, i.e., purges the list when a non-congestion error code is returned. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bcast.c | 5 ++--- net/tipc/link.c | 23 +++++++++-------------- net/tipc/socket.c | 49 ++++++++++++++++++++++++++----------------------- 3 files changed, 37 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 59b2f2a538e1..295bdc26f103 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -358,10 +358,9 @@ int tipc_bclink_xmit(struct net *net, struct sk_buff_head *list) /* Prepare clone of message for local node */ skb = tipc_msg_reassemble(list); - if (unlikely(!skb)) { - __skb_queue_purge(list); + if (unlikely(!skb)) return -EHOSTUNREACH; - } + /* Broadcast to all nodes */ if (likely(bclink)) { tipc_bclink_lock(net); diff --git a/net/tipc/link.c b/net/tipc/link.c index f8e0e2ceceb4..ea32679b6737 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -340,7 +340,7 @@ void tipc_link_delete_list(struct net *net, unsigned int bearer_id) * @link: congested link * @list: message that was attempted sent * Create pseudo msg to send back to user when congestion abates - * Only consumes message if there is an error + * Does not consume buffer list */ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) { @@ -354,7 +354,7 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) { pr_warn("%s<%s>, send queue full", link_rst_msg, link->name); tipc_link_reset(link); - goto err; + return -ENOBUFS; } /* Non-blocking sender: */ if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending) @@ -364,15 +364,12 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0, addr, addr, oport, 0, 0); if (!skb) - goto err; + return -ENOBUFS; TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list); TIPC_SKB_CB(skb)->chain_imp = imp; skb_queue_tail(&link->wakeupq, skb); link->stats.link_congs++; return -ELINKCONG; -err: - __skb_queue_purge(list); - return -ENOBUFS; } /** @@ -641,8 +638,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) * @link: link to use * @list: chain of buffers containing message * - * Consumes the buffer chain, except when returning -ELINKCONG, - * since the caller then may want to make more send attempts. + * Consumes the buffer chain, except when returning an error code, * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted */ @@ -666,10 +662,9 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, if (unlikely(link->backlog[i].len >= link->backlog[i].limit)) return link_schedule_user(link, list); } - if (unlikely(msg_size(msg) > mtu)) { - __skb_queue_purge(list); + if (unlikely(msg_size(msg) > mtu)) return -EMSGSIZE; - } + /* Prepare each packet for sending, and add to relevant queue: */ while (skb_queue_len(list)) { skb = skb_peek(list); @@ -722,7 +717,7 @@ static int __tipc_link_xmit_skb(struct tipc_link *link, struct sk_buff *skb) /* tipc_link_xmit_skb(): send single buffer to destination * Buffers sent via this functon are generally TIPC_SYSTEM_IMPORTANCE - * messages, which will not be rejected + * messages, which will not cause link congestion * The only exception is datagram messages rerouted after secondary * lookup, which are rare and safe to dispose of anyway. * TODO: Return real return value, and let callers use @@ -736,7 +731,7 @@ int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, skb2list(skb, &head); rc = tipc_link_xmit(net, &head, dnode, selector); - if (rc == -ELINKCONG) + if (rc) kfree_skb(skb); return 0; } @@ -748,7 +743,7 @@ int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, * @dsz: amount of user data to be sent * @dnode: address of destination node * @selector: a number used for deterministic link selection - * Consumes the buffer chain, except when returning -ELINKCONG + * Consumes the buffer chain, except when returning error * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE */ int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 3a7567f690f3..87fef25f6519 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -686,21 +686,22 @@ new_mtu: do { rc = tipc_bclink_xmit(net, pktchain); - if (likely(rc >= 0)) { - rc = dsz; - break; + if (likely(!rc)) + return dsz; + + if (rc == -ELINKCONG) { + tsk->link_cong = 1; + rc = tipc_wait_for_sndmsg(sock, &timeo); + if (!rc) + continue; } + __skb_queue_purge(pktchain); if (rc == -EMSGSIZE) { msg->msg_iter = save; goto new_mtu; } - if (rc != -ELINKCONG) - break; - tipc_sk(sk)->link_cong = 1; - rc = tipc_wait_for_sndmsg(sock, &timeo); - if (rc) - __skb_queue_purge(pktchain); - } while (!rc); + break; + } while (1); return rc; } @@ -925,23 +926,24 @@ new_mtu: skb = skb_peek(pktchain); TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong; rc = tipc_link_xmit(net, pktchain, dnode, tsk->portid); - if (likely(rc >= 0)) { + if (likely(!rc)) { if (sock->state != SS_READY) sock->state = SS_CONNECTING; - rc = dsz; - break; + return dsz; } + if (rc == -ELINKCONG) { + tsk->link_cong = 1; + rc = tipc_wait_for_sndmsg(sock, &timeo); + if (!rc) + continue; + } + __skb_queue_purge(pktchain); if (rc == -EMSGSIZE) { m->msg_iter = save; goto new_mtu; } - if (rc != -ELINKCONG) - break; - tsk->link_cong = 1; - rc = tipc_wait_for_sndmsg(sock, &timeo); - if (rc) - __skb_queue_purge(pktchain); - } while (!rc); + break; + } while (1); return rc; } @@ -1048,10 +1050,11 @@ next: tsk->sent_unacked++; sent += send; if (sent == dsz) - break; + return dsz; goto next; } if (rc == -EMSGSIZE) { + __skb_queue_purge(pktchain); tsk->max_pkt = tipc_node_get_mtu(net, dnode, portid); m->msg_iter = save; @@ -1059,13 +1062,13 @@ next: } if (rc != -ELINKCONG) break; + tsk->link_cong = 1; } rc = tipc_wait_for_sndpkt(sock, &timeo); - if (rc) - __skb_queue_purge(pktchain); } while (!rc); + __skb_queue_purge(pktchain); return sent ? sent : rc; } -- cgit v1.2.3 From af9b028e270fda6fb812d70d17d902297df1ceb5 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 16 Jul 2015 16:54:24 -0400 Subject: tipc: make media xmit call outside node spinlock context Currently, message sending is performed through a deep call chain, where the node spinlock is grabbed and held during a significant part of the transmission time. This is clearly detrimental to overall throughput performance; it would be better if we could send the message after the spinlock has been released. In this commit, we do instead let the call revert on the stack after the buffer chain has been added to the transmission queue, whereafter clones of the buffers are transmitted to the device layer outside the spinlock scope. As a further step in our effort to separate the roles of the node and link entities we also move the function tipc_link_xmit() to node.c, and rename it to tipc_node_xmit(). Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bearer.c | 26 ++++++++++ net/tipc/bearer.h | 3 ++ net/tipc/link.c | 132 +++++++++++++++++++++++++++----------------------- net/tipc/link.h | 6 +-- net/tipc/name_distr.c | 4 +- net/tipc/node.c | 78 +++++++++++++++++++++++++++++ net/tipc/node.h | 4 ++ net/tipc/socket.c | 22 ++++----- 8 files changed, 198 insertions(+), 77 deletions(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 00bc0e620532..eae58a6b121c 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -470,6 +470,32 @@ void tipc_bearer_send(struct net *net, u32 bearer_id, struct sk_buff *buf, rcu_read_unlock(); } +/* tipc_bearer_xmit() -send buffer to destination over bearer + */ +void tipc_bearer_xmit(struct net *net, u32 bearer_id, + struct sk_buff_head *xmitq, + struct tipc_media_addr *dst) +{ + struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_bearer *b; + struct sk_buff *skb, *tmp; + + if (skb_queue_empty(xmitq)) + return; + + rcu_read_lock(); + b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]); + if (likely(b)) { + skb_queue_walk_safe(xmitq, skb, tmp) { + __skb_dequeue(xmitq); + b->media->send_msg(net, skb, b, dst); + /* Until we remove cloning in tipc_l2_send_msg(): */ + kfree_skb(skb); + } + } + rcu_read_unlock(); +} + /** * tipc_l2_rcv_msg - handle incoming TIPC message from an interface * @buf: the received packet diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h index dc714d977768..6426f242f626 100644 --- a/net/tipc/bearer.h +++ b/net/tipc/bearer.h @@ -217,5 +217,8 @@ void tipc_bearer_cleanup(void); void tipc_bearer_stop(struct net *net); void tipc_bearer_send(struct net *net, u32 bearer_id, struct sk_buff *buf, struct tipc_media_addr *dest); +void tipc_bearer_xmit(struct net *net, u32 bearer_id, + struct sk_buff_head *xmitq, + struct tipc_media_addr *dst); #endif /* _TIPC_BEARER_H */ diff --git a/net/tipc/link.c b/net/tipc/link.c index ea32679b6737..c052437a7cfa 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -353,7 +353,6 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list) /* This really cannot happen... */ if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) { pr_warn("%s<%s>, send queue full", link_rst_msg, link->name); - tipc_link_reset(link); return -ENOBUFS; } /* Non-blocking sender: */ @@ -701,6 +700,78 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, return 0; } +/** + * tipc_link_xmit(): enqueue buffer list according to queue situation + * @link: link to use + * @list: chain of buffers containing message + * @xmitq: returned list of packets to be sent by caller + * + * Consumes the buffer chain, except when returning -ELINKCONG, + * since the caller then may want to make more send attempts. + * Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS + * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted + */ +int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, + struct sk_buff_head *xmitq) +{ + struct tipc_msg *hdr = buf_msg(skb_peek(list)); + unsigned int maxwin = l->window; + unsigned int i, imp = msg_importance(hdr); + unsigned int mtu = l->mtu; + u16 ack = l->rcv_nxt - 1; + u16 seqno = l->snd_nxt; + u16 bc_last_in = l->owner->bclink.last_in; + struct sk_buff_head *transmq = &l->transmq; + struct sk_buff_head *backlogq = &l->backlogq; + struct sk_buff *skb, *_skb, *bskb; + + /* Match msg importance against this and all higher backlog limits: */ + for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) { + if (unlikely(l->backlog[i].len >= l->backlog[i].limit)) + return link_schedule_user(l, list); + } + if (unlikely(msg_size(hdr) > mtu)) + return -EMSGSIZE; + + /* Prepare each packet for sending, and add to relevant queue: */ + while (skb_queue_len(list)) { + skb = skb_peek(list); + hdr = buf_msg(skb); + msg_set_seqno(hdr, seqno); + msg_set_ack(hdr, ack); + msg_set_bcast_ack(hdr, bc_last_in); + + if (likely(skb_queue_len(transmq) < maxwin)) { + _skb = skb_clone(skb, GFP_ATOMIC); + if (!_skb) + return -ENOBUFS; + __skb_dequeue(list); + __skb_queue_tail(transmq, skb); + __skb_queue_tail(xmitq, _skb); + l->rcv_unacked = 0; + seqno++; + continue; + } + if (tipc_msg_bundle(skb_peek_tail(backlogq), hdr, mtu)) { + kfree_skb(__skb_dequeue(list)); + l->stats.sent_bundled++; + continue; + } + if (tipc_msg_make_bundle(&bskb, hdr, mtu, l->addr)) { + kfree_skb(__skb_dequeue(list)); + __skb_queue_tail(backlogq, bskb); + l->backlog[msg_importance(buf_msg(bskb))].len++; + l->stats.sent_bundled++; + l->stats.sent_bundles++; + continue; + } + l->backlog[imp].len += skb_queue_len(list); + skb_queue_splice_tail_init(list, backlogq); + } + l->snd_nxt = seqno; + return 0; +} + static void skb2list(struct sk_buff *skb, struct sk_buff_head *list) { skb_queue_head_init(list); @@ -715,65 +786,6 @@ static int __tipc_link_xmit_skb(struct tipc_link *link, struct sk_buff *skb) return __tipc_link_xmit(link->owner->net, link, &head); } -/* tipc_link_xmit_skb(): send single buffer to destination - * Buffers sent via this functon are generally TIPC_SYSTEM_IMPORTANCE - * messages, which will not cause link congestion - * The only exception is datagram messages rerouted after secondary - * lookup, which are rare and safe to dispose of anyway. - * TODO: Return real return value, and let callers use - * tipc_wait_for_sendpkt() where applicable - */ -int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, - u32 selector) -{ - struct sk_buff_head head; - int rc; - - skb2list(skb, &head); - rc = tipc_link_xmit(net, &head, dnode, selector); - if (rc) - kfree_skb(skb); - return 0; -} - -/** - * tipc_link_xmit() is the general link level function for message sending - * @net: the applicable net namespace - * @list: chain of buffers containing message - * @dsz: amount of user data to be sent - * @dnode: address of destination node - * @selector: a number used for deterministic link selection - * Consumes the buffer chain, except when returning error - * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE - */ -int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, - u32 selector) -{ - struct tipc_link *link = NULL; - struct tipc_node *node; - int rc = -EHOSTUNREACH; - - node = tipc_node_find(net, dnode); - if (node) { - tipc_node_lock(node); - link = node_active_link(node, selector & 1); - if (link) - rc = __tipc_link_xmit(net, link, list); - tipc_node_unlock(node); - tipc_node_put(node); - } - if (link) - return rc; - - if (likely(in_own_node(net, dnode))) { - tipc_sk_rcv(net, list); - return 0; - } - - __skb_queue_purge(list); - return rc; -} - /* * tipc_link_sync_xmit - synchronize broadcast link endpoints. * diff --git a/net/tipc/link.h b/net/tipc/link.h index 9c71d9e42e93..7add2b90361d 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -223,12 +223,10 @@ void tipc_link_purge_queues(struct tipc_link *l_ptr); void tipc_link_purge_backlog(struct tipc_link *l); void tipc_link_reset_all(struct tipc_node *node); void tipc_link_reset(struct tipc_link *l_ptr); -int tipc_link_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest, - u32 selector); -int tipc_link_xmit(struct net *net, struct sk_buff_head *list, u32 dest, - u32 selector); int __tipc_link_xmit(struct net *net, struct tipc_link *link, struct sk_buff_head *list); +int tipc_link_xmit(struct tipc_link *link, struct sk_buff_head *list, + struct sk_buff_head *xmitq); void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int prob, u32 gap, u32 tolerance, u32 priority); void tipc_link_push_packets(struct tipc_link *l_ptr); diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 3a1539e96294..e6018b7eb197 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -102,7 +102,7 @@ void named_cluster_distribute(struct net *net, struct sk_buff *skb) if (!oskb) break; msg_set_destnode(buf_msg(oskb), dnode); - tipc_link_xmit_skb(net, oskb, dnode, dnode); + tipc_node_xmit_skb(net, oskb, dnode, dnode); } rcu_read_unlock(); @@ -223,7 +223,7 @@ void tipc_named_node_up(struct net *net, u32 dnode) &tn->nametbl->publ_list[TIPC_ZONE_SCOPE]); rcu_read_unlock(); - tipc_link_xmit(net, &head, dnode, dnode); + tipc_node_xmit(net, &head, dnode, dnode); } static void tipc_publ_subscribe(struct net *net, struct publication *publ, diff --git a/net/tipc/node.c b/net/tipc/node.c index 19729645d494..ad759bb034e7 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -563,6 +563,84 @@ msg_full: return -EMSGSIZE; } +static struct tipc_link *tipc_node_select_link(struct tipc_node *n, int sel, + int *bearer_id, + struct tipc_media_addr **maddr) +{ + int id = n->active_links[sel & 1]; + + if (unlikely(id < 0)) + return NULL; + + *bearer_id = id; + *maddr = &n->links[id].maddr; + return n->links[id].link; +} + +/** + * tipc_node_xmit() is the general link level function for message sending + * @net: the applicable net namespace + * @list: chain of buffers containing message + * @dnode: address of destination node + * @selector: a number used for deterministic link selection + * Consumes the buffer chain, except when returning -ELINKCONG + * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE + */ +int tipc_node_xmit(struct net *net, struct sk_buff_head *list, + u32 dnode, int selector) +{ + struct tipc_link *l = NULL; + struct tipc_node *n; + struct sk_buff_head xmitq; + struct tipc_media_addr *maddr; + int bearer_id; + int rc = -EHOSTUNREACH; + + __skb_queue_head_init(&xmitq); + n = tipc_node_find(net, dnode); + if (likely(n)) { + tipc_node_lock(n); + l = tipc_node_select_link(n, selector, &bearer_id, &maddr); + if (likely(l)) + rc = tipc_link_xmit(l, list, &xmitq); + if (unlikely(rc == -ENOBUFS)) + tipc_link_reset(l); + tipc_node_unlock(n); + tipc_node_put(n); + } + if (likely(!rc)) { + tipc_bearer_xmit(net, bearer_id, &xmitq, maddr); + return 0; + } + if (likely(in_own_node(net, dnode))) { + tipc_sk_rcv(net, list); + return 0; + } + return rc; +} + +/* tipc_node_xmit_skb(): send single buffer to destination + * Buffers sent via this functon are generally TIPC_SYSTEM_IMPORTANCE + * messages, which will not be rejected + * The only exception is datagram messages rerouted after secondary + * lookup, which are rare and safe to dispose of anyway. + * TODO: Return real return value, and let callers use + * tipc_wait_for_sendpkt() where applicable + */ +int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, + u32 selector) +{ + struct sk_buff_head head; + int rc; + + skb_queue_head_init(&head); + __skb_queue_tail(&head, skb); + rc = tipc_node_xmit(net, &head, dnode, selector); + if (rc == -ELINKCONG) + kfree_skb(skb); + return 0; +} + int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) { int err; diff --git a/net/tipc/node.h b/net/tipc/node.h index 74f278adada3..86b7c740cf84 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -160,6 +160,10 @@ bool tipc_node_is_up(struct tipc_node *n); int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node, char *linkname, size_t len); void tipc_node_unlock(struct tipc_node *node); +int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, + int selector); +int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest, + u32 selector); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 87fef25f6519..5b0b08d58fcc 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -261,7 +261,7 @@ static void tsk_rej_rx_queue(struct sock *sk) while ((skb = __skb_dequeue(&sk->sk_receive_queue))) { if (tipc_msg_reverse(own_node, skb, &dnode, TIPC_ERR_NO_PORT)) - tipc_link_xmit_skb(sock_net(sk), skb, dnode, 0); + tipc_node_xmit_skb(sock_net(sk), skb, dnode, 0); } } @@ -443,7 +443,7 @@ static int tipc_release(struct socket *sock) } if (tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode, TIPC_ERR_NO_PORT)) - tipc_link_xmit_skb(net, skb, dnode, 0); + tipc_node_xmit_skb(net, skb, dnode, 0); } } @@ -456,7 +456,7 @@ static int tipc_release(struct socket *sock) tsk_own_node(tsk), tsk_peer_port(tsk), tsk->portid, TIPC_ERR_NO_PORT); if (skb) - tipc_link_xmit_skb(net, skb, dnode, tsk->portid); + tipc_node_xmit_skb(net, skb, dnode, tsk->portid); tipc_node_remove_conn(net, dnode, tsk->portid); } @@ -925,7 +925,7 @@ new_mtu: do { skb = skb_peek(pktchain); TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong; - rc = tipc_link_xmit(net, pktchain, dnode, tsk->portid); + rc = tipc_node_xmit(net, pktchain, dnode, tsk->portid); if (likely(!rc)) { if (sock->state != SS_READY) sock->state = SS_CONNECTING; @@ -1045,7 +1045,7 @@ next: return rc; do { if (likely(!tsk_conn_cong(tsk))) { - rc = tipc_link_xmit(net, pktchain, dnode, portid); + rc = tipc_node_xmit(net, pktchain, dnode, portid); if (likely(!rc)) { tsk->sent_unacked++; sent += send; @@ -1224,7 +1224,7 @@ static void tipc_sk_send_ack(struct tipc_sock *tsk, uint ack) return; msg = buf_msg(skb); msg_set_msgcnt(msg, ack); - tipc_link_xmit_skb(net, skb, dnode, msg_link_selector(msg)); + tipc_node_xmit_skb(net, skb, dnode, msg_link_selector(msg)); } static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) @@ -1703,7 +1703,7 @@ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb) return 0; } if (!err || tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode, -err)) - tipc_link_xmit_skb(net, skb, dnode, tsk->portid); + tipc_node_xmit_skb(net, skb, dnode, tsk->portid); return 0; } @@ -1799,7 +1799,7 @@ int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) if (!tipc_msg_reverse(tn->own_addr, skb, &dnode, -err)) continue; xmit: - tipc_link_xmit_skb(net, skb, dnode, dport); + tipc_node_xmit_skb(net, skb, dnode, dport); } return err ? -EHOSTUNREACH : 0; } @@ -2092,7 +2092,7 @@ restart: } if (tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode, TIPC_CONN_SHUTDOWN)) - tipc_link_xmit_skb(net, skb, dnode, + tipc_node_xmit_skb(net, skb, dnode, tsk->portid); } else { dnode = tsk_peer_node(tsk); @@ -2102,7 +2102,7 @@ restart: 0, dnode, tsk_own_node(tsk), tsk_peer_port(tsk), tsk->portid, TIPC_CONN_SHUTDOWN); - tipc_link_xmit_skb(net, skb, dnode, tsk->portid); + tipc_node_xmit_skb(net, skb, dnode, tsk->portid); } tsk->connected = 0; sock->state = SS_DISCONNECTING; @@ -2164,7 +2164,7 @@ static void tipc_sk_timeout(unsigned long data) } bh_unlock_sock(sk); if (skb) - tipc_link_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid); + tipc_node_xmit_skb(sock_net(sk), skb, peer_node, tsk->portid); exit: sock_put(sk); } -- cgit v1.2.3 From d3504c3449fead545e5254bfb11da916f72c4734 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 16 Jul 2015 16:54:25 -0400 Subject: tipc: clean up definitions and usage of link flags The status flag LINK_STOPPED is not needed any more, since the mechanism for delayed deletion of links has been removed. Likewise, LINK_STARTED and LINK_START_EVT are unnecessary, because we can just as well start the link timer directly from inside tipc_link_create(). We eliminate these flags in this commit. Instead of the above flags, we now introduce three new link modes, TIPC_LINK_OPEN, TIPC_LINK_BLOCKED and TIPC_LINK_TUNNEL. The values indicate whether, and in the case of TIPC_LINK_TUNNEL, which, messages the link is allowed to receive in this state. TIPC_LINK_BLOCKED also blocks timer-driven protocol messages to be sent out, and any change to the link FSM. Since the modes are mutually exclusive, we convert them to state values, and rename the 'flags' field in struct tipc_link to 'exec_mode'. Finally, we move the #defines for link FSM states and events from link.h into enums inside the file link.c, which is the real usage scope of these definitions. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bcast.c | 1 - net/tipc/link.c | 98 +++++++++++++++++++++++++++++++++----------------------- net/tipc/link.h | 44 +++++-------------------- net/tipc/node.c | 2 +- 4 files changed, 67 insertions(+), 78 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 295bdc26f103..aab4e8dd7b32 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -924,7 +924,6 @@ int tipc_bclink_init(struct net *net) tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT); bcl->bearer_id = MAX_BEARERS; rcu_assign_pointer(tn->bearer_list[MAX_BEARERS], &bcbearer->bearer); - bcl->state = WORKING_WORKING; bcl->pmsg = (struct tipc_msg *)&bcl->proto_msg; msg_set_prevnode(bcl->pmsg, tn->own_addr); strlcpy(bcl->name, tipc_bclink_name, TIPC_MAX_LINK_NAME); diff --git a/net/tipc/link.c b/net/tipc/link.c index c052437a7cfa..35a2da688db1 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -79,19 +79,49 @@ static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { /* * Out-of-range value for link session numbers */ -#define INVALID_SESSION 0x10000 +#define WILDCARD_SESSION 0x10000 -/* - * Link state events: +/* State value stored in 'failover_pkts' */ -#define STARTING_EVT 856384768 /* link processing trigger */ -#define TRAFFIC_MSG_EVT 560815u /* rx'd ??? */ -#define SILENCE_EVT 560817u /* timer dicovered silence from peer */ +#define FIRST_FAILOVER 0xffffu -/* - * State value stored in 'failover_pkts' +/* Link FSM states and events: */ -#define FIRST_FAILOVER 0xffffu +enum { + WORKING_WORKING, + WORKING_UNKNOWN, + RESET_RESET, + RESET_UNKNOWN +}; + +enum { + PEER_RESET_EVT = RESET_MSG, + ACTIVATE_EVT = ACTIVATE_MSG, + TRAFFIC_EVT, /* Any other valid msg from peer */ + SILENCE_EVT /* Peer was silent during last timer interval*/ +}; + +/* Link FSM state checking routines + */ +static int link_working_working(struct tipc_link *l) +{ + return l->state == WORKING_WORKING; +} + +static int link_working_unknown(struct tipc_link *l) +{ + return l->state == WORKING_UNKNOWN; +} + +static int link_reset_unknown(struct tipc_link *l) +{ + return l->state == RESET_UNKNOWN; +} + +static int link_reset_reset(struct tipc_link *l) +{ + return l->state == RESET_RESET; +} static void link_handle_out_of_seq_msg(struct tipc_link *link, struct sk_buff *skb); @@ -268,7 +298,7 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, /* note: peer i/f name is updated by reset/activate message */ memcpy(&l_ptr->media_addr, media_addr, sizeof(*media_addr)); l_ptr->owner = n_ptr; - l_ptr->peer_session = INVALID_SESSION; + l_ptr->peer_session = WILDCARD_SESSION; l_ptr->bearer_id = b_ptr->identity; link_set_supervision_props(l_ptr, b_ptr->tolerance); l_ptr->state = RESET_UNKNOWN; @@ -297,8 +327,7 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, link_reset_statistics(l_ptr); tipc_node_attach_link(n_ptr, l_ptr); setup_timer(&l_ptr->timer, link_timeout, (unsigned long)l_ptr); - link_state_event(l_ptr, STARTING_EVT); - + link_set_timer(l_ptr, l_ptr->keepalive_intv); return l_ptr; } @@ -311,7 +340,6 @@ void tipc_link_delete(struct tipc_link *l) tipc_link_reset(l); if (del_timer(&l->timer)) tipc_link_put(l); - l->flags |= LINK_STOPPED; /* Delete link now, or when timer is finished: */ tipc_link_reset_fragments(l); tipc_node_detach_link(l->owner, l); @@ -438,7 +466,7 @@ void tipc_link_reset(struct tipc_link *l_ptr) msg_set_session(l_ptr->pmsg, ((msg_session(l_ptr->pmsg) + 1) & 0xffff)); /* Link is down, accept any session */ - l_ptr->peer_session = INVALID_SESSION; + l_ptr->peer_session = WILDCARD_SESSION; /* Prepare for renewed mtu size negotiation */ l_ptr->mtu = l_ptr->advertised_mtu; @@ -452,7 +480,7 @@ void tipc_link_reset(struct tipc_link *l_ptr) tipc_bearer_remove_dest(owner->net, l_ptr->bearer_id, l_ptr->addr); if (was_active_link && tipc_node_is_up(l_ptr->owner) && (pl != l_ptr)) { - l_ptr->flags |= LINK_FAILINGOVER; + l_ptr->exec_mode = TIPC_LINK_BLOCKED; l_ptr->failover_checkpt = l_ptr->rcv_nxt; pl->failover_pkts = FIRST_FAILOVER; pl->failover_checkpt = l_ptr->rcv_nxt; @@ -496,21 +524,14 @@ static void link_activate(struct tipc_link *link) static void link_state_event(struct tipc_link *l_ptr, unsigned int event) { struct tipc_link *other; - unsigned long timer_intv = l_ptr->keepalive_intv; - - if (l_ptr->flags & LINK_STOPPED) - return; - - if (!(l_ptr->flags & LINK_STARTED) && (event != STARTING_EVT)) - return; /* Not yet. */ - if (l_ptr->flags & LINK_FAILINGOVER) + if (l_ptr->exec_mode == TIPC_LINK_BLOCKED) return; switch (l_ptr->state) { case WORKING_WORKING: switch (event) { - case TRAFFIC_MSG_EVT: + case TRAFFIC_EVT: case ACTIVATE_MSG: l_ptr->silent_intv_cnt = 0; break; @@ -538,7 +559,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) break; case WORKING_UNKNOWN: switch (event) { - case TRAFFIC_MSG_EVT: + case TRAFFIC_EVT: case ACTIVATE_MSG: l_ptr->state = WORKING_WORKING; l_ptr->silent_intv_cnt = 0; @@ -576,7 +597,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) break; case RESET_UNKNOWN: switch (event) { - case TRAFFIC_MSG_EVT: + case TRAFFIC_EVT: break; case ACTIVATE_MSG: other = node_active_link(l_ptr->owner, 0); @@ -593,10 +614,6 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, 1, 0, 0, 0); break; - case STARTING_EVT: - l_ptr->flags |= LINK_STARTED; - link_set_timer(l_ptr, timer_intv); - break; case SILENCE_EVT: tipc_link_proto_xmit(l_ptr, RESET_MSG, 0, 0, 0, 0); break; @@ -606,7 +623,7 @@ static void link_state_event(struct tipc_link *l_ptr, unsigned int event) break; case RESET_RESET: switch (event) { - case TRAFFIC_MSG_EVT: + case TRAFFIC_EVT: case ACTIVATE_MSG: other = node_active_link(l_ptr->owner, 0); if (other && link_working_unknown(other)) @@ -975,7 +992,7 @@ static bool link_synch(struct tipc_link *l) if (skb_queue_len(pl->inputq) > post_synch) return false; synched: - l->flags &= ~LINK_SYNCHING; + l->exec_mode = TIPC_LINK_OPEN; return true; } @@ -1091,7 +1108,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) } /* Traffic message. Conditionally activate link */ - link_state_event(l_ptr, TRAFFIC_MSG_EVT); + link_state_event(l_ptr, TRAFFIC_EVT); if (link_working_working(l_ptr)) { /* Re-insert buffer in front of queue */ @@ -1112,7 +1129,8 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) l_ptr->silent_intv_cnt = 0; /* Synchronize with parallel link if applicable */ - if (unlikely((l_ptr->flags & LINK_SYNCHING) && !msg_dup(msg))) { + if (unlikely((l_ptr->exec_mode == TIPC_LINK_TUNNEL) && + !msg_dup(msg))) { if (!link_synch(l_ptr)) goto unlock; } @@ -1193,7 +1211,7 @@ static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb) switch (msg_user(msg)) { case TUNNEL_PROTOCOL: if (msg_dup(msg)) { - link->flags |= LINK_SYNCHING; + link->exec_mode = TIPC_LINK_TUNNEL; link->synch_point = msg_seqno(msg_get_wrapped(msg)); kfree_skb(skb); break; @@ -1315,7 +1333,7 @@ void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, u16 last_rcv; /* Don't send protocol message during link failover */ - if (l_ptr->flags & LINK_FAILINGOVER) + if (l_ptr->exec_mode == TIPC_LINK_BLOCKED) return; /* Abort non-RESET send if communication with node is prohibited */ @@ -1390,7 +1408,7 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, u32 msg_tol; struct tipc_msg *msg = buf_msg(buf); - if (l_ptr->flags & LINK_FAILINGOVER) + if (l_ptr->exec_mode == TIPC_LINK_BLOCKED) goto exit; if (l_ptr->net_plane != msg_net_plane(msg)) @@ -1401,7 +1419,7 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, case RESET_MSG: if (!link_working_unknown(l_ptr) && - (l_ptr->peer_session != INVALID_SESSION)) { + (l_ptr->peer_session != WILDCARD_SESSION)) { if (less_eq(msg_session(msg), l_ptr->peer_session)) break; /* duplicate or old reset: ignore */ } @@ -1465,7 +1483,7 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, /* Record reception; force mismatch at next timeout: */ l_ptr->silent_intv_cnt = 0; - link_state_event(l_ptr, TRAFFIC_MSG_EVT); + link_state_event(l_ptr, TRAFFIC_EVT); l_ptr->stats.recv_states++; if (link_reset_unknown(l_ptr)) break; @@ -1704,7 +1722,7 @@ static bool tipc_link_failover_rcv(struct tipc_link *link, } exit: if (!link->failover_pkts && pl) - pl->flags &= ~LINK_FAILINGOVER; + pl->exec_mode = TIPC_LINK_OPEN; kfree_skb(*skb); *skb = iskb; return *skb; diff --git a/net/tipc/link.h b/net/tipc/link.h index 7add2b90361d..0509c6de03cd 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -49,19 +49,14 @@ */ #define INVALID_LINK_SEQ 0x10000 -/* Link working states - */ -#define WORKING_WORKING 560810u -#define WORKING_UNKNOWN 560811u -#define RESET_UNKNOWN 560812u -#define RESET_RESET 560813u -/* Link endpoint execution states +/* Link endpoint receive states */ -#define LINK_STARTED 0x0001 -#define LINK_STOPPED 0x0002 -#define LINK_SYNCHING 0x0004 -#define LINK_FAILINGOVER 0x0008 +enum { + TIPC_LINK_OPEN, + TIPC_LINK_BLOCKED, + TIPC_LINK_TUNNEL +}; /* Starting value for maximum packet size negotiation on unicast links * (unless bearer MTU is less) @@ -106,7 +101,6 @@ struct tipc_stats { * @timer: link timer * @owner: pointer to peer node * @refcnt: reference counter for permanent references (owner node & timer) - * @flags: execution state flags for link endpoint instance * @peer_session: link session # being used by peer end of link * @peer_bearer_id: bearer id used by link's peer endpoint * @bearer_id: local bearer id used by link @@ -119,6 +113,7 @@ struct tipc_stats { * @pmsg: convenience pointer to "proto_msg" field * @priority: current link priority * @net_plane: current link network plane ('A' through 'H') + * @exec_mode: transmit/receive mode for link endpoint instance * @backlog_limit: backlog queue congestion thresholds (indexed by importance) * @exp_msg_count: # of tunnelled messages expected during link changeover * @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset @@ -149,7 +144,6 @@ struct tipc_link { struct kref ref; /* Management and link supervision data */ - unsigned int flags; u32 peer_session; u32 peer_bearer_id; u32 bearer_id; @@ -165,6 +159,7 @@ struct tipc_link { struct tipc_msg *pmsg; u32 priority; char net_plane; + u8 exec_mode; u16 synch_point; /* Failover */ @@ -249,27 +244,4 @@ static inline u32 link_own_addr(struct tipc_link *l) return msg_prevnode(l->pmsg); } -/* - * Link status checking routines - */ -static inline int link_working_working(struct tipc_link *l_ptr) -{ - return l_ptr->state == WORKING_WORKING; -} - -static inline int link_working_unknown(struct tipc_link *l_ptr) -{ - return l_ptr->state == WORKING_UNKNOWN; -} - -static inline int link_reset_unknown(struct tipc_link *l_ptr) -{ - return l_ptr->state == RESET_UNKNOWN; -} - -static inline int link_reset_reset(struct tipc_link *l_ptr) -{ - return l_ptr->state == RESET_RESET; -} - #endif diff --git a/net/tipc/node.c b/net/tipc/node.c index ad759bb034e7..b7a4457f653c 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -403,7 +403,7 @@ static void node_lost_contact(struct tipc_node *n_ptr) struct tipc_link *l_ptr = n_ptr->links[i].link; if (!l_ptr) continue; - l_ptr->flags &= ~LINK_FAILINGOVER; + l_ptr->exec_mode = TIPC_LINK_OPEN; l_ptr->failover_checkpt = 0; l_ptr->failover_pkts = 0; kfree_skb(l_ptr->failover_skb); -- cgit v1.2.3 From 426cc2b86d1813959497d608dcb52c32df2d448a Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 16 Jul 2015 16:54:26 -0400 Subject: tipc: introduce new link protocol msg create function As a preparation for later changes, we introduce a new function tipc_link_build_proto_msg(). Instead of actually sending the created protocol message, it only creates it and adds it to the head of a skb queue provided by the caller. Since we still need the existing function tipc_link_protocol_xmit() for a while, we redesign it to make use of the new function. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 144 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 77 insertions(+), 67 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 35a2da688db1..657ba91fde41 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -129,6 +129,9 @@ static void tipc_link_proto_rcv(struct tipc_link *link, struct sk_buff *skb); static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol); static void link_state_event(struct tipc_link *l_ptr, u32 event); +static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, + u16 rcvgap, int tolerance, int priority, + struct sk_buff_head *xmitq); static void link_reset_statistics(struct tipc_link *l_ptr); static void link_print(struct tipc_link *l_ptr, const char *str); static void tipc_link_sync_xmit(struct tipc_link *l); @@ -1323,77 +1326,21 @@ static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, /* * Send protocol message to the other endpoint. */ -void tipc_link_proto_xmit(struct tipc_link *l_ptr, u32 msg_typ, int probe_msg, +void tipc_link_proto_xmit(struct tipc_link *l, u32 msg_typ, int probe_msg, u32 gap, u32 tolerance, u32 priority) { - struct sk_buff *buf = NULL; - struct tipc_msg *msg = l_ptr->pmsg; - u32 msg_size = sizeof(l_ptr->proto_msg); - int r_flag; - u16 last_rcv; + struct sk_buff *skb = NULL; + struct sk_buff_head xmitq; - /* Don't send protocol message during link failover */ - if (l_ptr->exec_mode == TIPC_LINK_BLOCKED) - return; - - /* Abort non-RESET send if communication with node is prohibited */ - if ((tipc_node_blocked(l_ptr->owner)) && (msg_typ != RESET_MSG)) - return; - - /* Create protocol message with "out-of-sequence" sequence number */ - msg_set_type(msg, msg_typ); - msg_set_net_plane(msg, l_ptr->net_plane); - msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); - msg_set_last_bcast(msg, tipc_bclink_get_last_sent(l_ptr->owner->net)); - - if (msg_typ == STATE_MSG) { - u16 next_sent = l_ptr->snd_nxt; - - if (!tipc_link_is_up(l_ptr)) - return; - msg_set_next_sent(msg, next_sent); - if (!skb_queue_empty(&l_ptr->deferdq)) { - last_rcv = buf_seqno(skb_peek(&l_ptr->deferdq)); - gap = mod(last_rcv - l_ptr->rcv_nxt); - } - msg_set_seq_gap(msg, gap); - if (gap) - l_ptr->stats.sent_nacks++; - msg_set_link_tolerance(msg, tolerance); - msg_set_linkprio(msg, priority); - msg_set_max_pkt(msg, l_ptr->mtu); - msg_set_ack(msg, mod(l_ptr->rcv_nxt - 1)); - msg_set_probe(msg, probe_msg != 0); - if (probe_msg) - l_ptr->stats.sent_probes++; - l_ptr->stats.sent_states++; - } else { /* RESET_MSG or ACTIVATE_MSG */ - msg_set_ack(msg, mod(l_ptr->failover_checkpt - 1)); - msg_set_seq_gap(msg, 0); - msg_set_next_sent(msg, 1); - msg_set_probe(msg, 0); - msg_set_link_tolerance(msg, l_ptr->tolerance); - msg_set_linkprio(msg, l_ptr->priority); - msg_set_max_pkt(msg, l_ptr->advertised_mtu); - } - - r_flag = (l_ptr->owner->working_links > tipc_link_is_up(l_ptr)); - msg_set_redundant_link(msg, r_flag); - msg_set_linkprio(msg, l_ptr->priority); - msg_set_size(msg, msg_size); - - msg_set_seqno(msg, mod(l_ptr->snd_nxt + (0xffff / 2))); - - buf = tipc_buf_acquire(msg_size); - if (!buf) + __skb_queue_head_init(&xmitq); + tipc_link_build_proto_msg(l, msg_typ, probe_msg, gap, + tolerance, priority, &xmitq); + skb = __skb_dequeue(&xmitq); + if (!skb) return; - - skb_copy_to_linear_data(buf, msg, sizeof(l_ptr->proto_msg)); - buf->priority = TC_PRIO_CONTROL; - tipc_bearer_send(l_ptr->owner->net, l_ptr->bearer_id, buf, - &l_ptr->media_addr); - l_ptr->rcv_unacked = 0; - kfree_skb(buf); + tipc_bearer_send(l->owner->net, l->bearer_id, skb, &l->media_addr); + l->rcv_unacked = 0; + kfree_skb(skb); } /* @@ -1514,6 +1461,69 @@ exit: kfree_skb(buf); } +/* tipc_link_build_proto_msg: prepare link protocol message for transmission + */ +static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, + u16 rcvgap, int tolerance, int priority, + struct sk_buff_head *xmitq) +{ + struct sk_buff *skb = NULL; + struct tipc_msg *hdr = l->pmsg; + u16 snd_nxt = l->snd_nxt; + u16 rcv_nxt = l->rcv_nxt; + u16 rcv_last = rcv_nxt - 1; + int node_up = l->owner->bclink.recv_permitted; + + /* Don't send protocol message during reset or link failover */ + if (l->exec_mode == TIPC_LINK_BLOCKED) + return; + + /* Abort non-RESET send if communication with node is prohibited */ + if ((tipc_node_blocked(l->owner)) && (mtyp != RESET_MSG)) + return; + + msg_set_type(hdr, mtyp); + msg_set_net_plane(hdr, l->net_plane); + msg_set_bcast_ack(hdr, l->owner->bclink.last_in); + msg_set_last_bcast(hdr, tipc_bclink_get_last_sent(l->owner->net)); + msg_set_link_tolerance(hdr, tolerance); + msg_set_linkprio(hdr, priority); + msg_set_redundant_link(hdr, node_up); + msg_set_seq_gap(hdr, 0); + + /* Compatibility: created msg must not be in sequence with pkt flow */ + msg_set_seqno(hdr, snd_nxt + U16_MAX / 2); + + if (mtyp == STATE_MSG) { + if (!tipc_link_is_up(l)) + return; + msg_set_next_sent(hdr, snd_nxt); + + /* Override rcvgap if there are packets in deferred queue */ + if (!skb_queue_empty(&l->deferdq)) + rcvgap = buf_seqno(skb_peek(&l->deferdq)) - rcv_nxt; + if (rcvgap) { + msg_set_seq_gap(hdr, rcvgap); + l->stats.sent_nacks++; + } + msg_set_ack(hdr, rcv_last); + msg_set_probe(hdr, probe); + if (probe) + l->stats.sent_probes++; + l->stats.sent_states++; + } else { + /* RESET_MSG or ACTIVATE_MSG */ + msg_set_max_pkt(hdr, l->advertised_mtu); + msg_set_ack(hdr, l->failover_checkpt - 1); + msg_set_next_sent(hdr, 1); + } + skb = tipc_buf_acquire(msg_size(hdr)); + if (!skb) + return; + skb_copy_to_linear_data(skb, hdr, msg_size(hdr)); + skb->priority = TC_PRIO_CONTROL; + __skb_queue_head(xmitq, skb); +} /* tipc_link_tunnel_xmit(): Tunnel one packet via a link belonging to * a different bearer. Owner node is locked. -- cgit v1.2.3 From 6ab30f9cbe134d19559f48dc748587d036529aaf Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 16 Jul 2015 16:54:27 -0400 Subject: tipc: improve link FSM implementation The link FSM implementation is currently unnecessarily complex. It sometimes checks for conditional state outside the FSM data before deciding next state, and often performs actions directly inside the FSM logics. In this commit, we create a second, simpler FSM implementation, that as far as possible acts only on states and events that it is strictly defined for, and postpone any actions until it is finished with its decisions. It also returns an event flag field and an a buffer queue which may potentially contain a protocol message to be sent by the caller. Unfortunately, we cannot yet make the FSM "clean", in the sense that its decisions are only based on FSM state and event, and that state changes happen only here. That will have to wait until the activate/reset logics has been cleaned up in a future commit. We also rename the link states as follows: WORKING_WORKING -> TIPC_LINK_WORKING WORKING_UNKNOWN -> TIPC_LINK_PROBING RESET_UNKNOWN -> TIPC_LINK_RESETTING RESET_RESET -> TIPC_LINK_ESTABLISHING The existing FSM function, link_state_event(), is still needed for a while, so we redesign it to make use of the new function. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 344 +++++++++++++++++++++++++++++++------------------------- net/tipc/link.h | 7 ++ 2 files changed, 195 insertions(+), 156 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 657ba91fde41..5d2f9198c6bc 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -88,10 +88,10 @@ static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { /* Link FSM states and events: */ enum { - WORKING_WORKING, - WORKING_UNKNOWN, - RESET_RESET, - RESET_UNKNOWN + TIPC_LINK_WORKING, + TIPC_LINK_PROBING, + TIPC_LINK_RESETTING, + TIPC_LINK_ESTABLISHING }; enum { @@ -103,24 +103,24 @@ enum { /* Link FSM state checking routines */ -static int link_working_working(struct tipc_link *l) +static int link_working(struct tipc_link *l) { - return l->state == WORKING_WORKING; + return l->state == TIPC_LINK_WORKING; } -static int link_working_unknown(struct tipc_link *l) +static int link_probing(struct tipc_link *l) { - return l->state == WORKING_UNKNOWN; + return l->state == TIPC_LINK_PROBING; } -static int link_reset_unknown(struct tipc_link *l) +static int link_resetting(struct tipc_link *l) { - return l->state == RESET_UNKNOWN; + return l->state == TIPC_LINK_RESETTING; } -static int link_reset_reset(struct tipc_link *l) +static int link_establishing(struct tipc_link *l) { - return l->state == RESET_RESET; + return l->state == TIPC_LINK_ESTABLISHING; } static void link_handle_out_of_seq_msg(struct tipc_link *link, @@ -140,6 +140,8 @@ static void tipc_link_input(struct tipc_link *l, struct sk_buff *skb); static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb); static bool tipc_link_failover_rcv(struct tipc_link *l, struct sk_buff **skb); static void link_set_timer(struct tipc_link *link, unsigned long time); +static void link_activate(struct tipc_link *link); + /* * Simple link routines */ @@ -179,7 +181,7 @@ int tipc_link_is_up(struct tipc_link *l_ptr) { if (!l_ptr) return 0; - return link_working_working(l_ptr) || link_working_unknown(l_ptr); + return link_working(l_ptr) || link_probing(l_ptr); } int tipc_link_is_active(struct tipc_link *l) @@ -234,8 +236,11 @@ static void link_timeout(unsigned long data) } /* do all other link processing performed on a periodic basis */ - if (l_ptr->silent_intv_cnt || tipc_bclink_acks_missing(l_ptr->owner)) + if (l_ptr->silent_intv_cnt) link_state_event(l_ptr, SILENCE_EVT); + else if (link_working(l_ptr) && tipc_bclink_acks_missing(l_ptr->owner)) + tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0); + l_ptr->silent_intv_cnt++; if (skb_queue_len(&l_ptr->backlogq)) tipc_link_push_packets(l_ptr); @@ -304,7 +309,7 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, l_ptr->peer_session = WILDCARD_SESSION; l_ptr->bearer_id = b_ptr->identity; link_set_supervision_props(l_ptr, b_ptr->tolerance); - l_ptr->state = RESET_UNKNOWN; + l_ptr->state = TIPC_LINK_RESETTING; l_ptr->pmsg = (struct tipc_msg *)&l_ptr->proto_msg; msg = l_ptr->pmsg; @@ -366,6 +371,134 @@ void tipc_link_delete_list(struct net *net, unsigned int bearer_id) rcu_read_unlock(); } +/** + * tipc_link_fsm_evt - link finite state machine + * @l: pointer to link + * @evt: state machine event to be processed + * @xmitq: queue to prepend created protocol message, if any + */ +static int tipc_link_fsm_evt(struct tipc_link *l, int evt, + struct sk_buff_head *xmitq) +{ + int mtyp = 0, rc = 0; + struct tipc_link *pl; + enum { + LINK_RESET = 1, + LINK_ACTIVATE = (1 << 1), + SND_PROBE = (1 << 2), + SND_STATE = (1 << 3), + SND_RESET = (1 << 4), + SND_ACTIVATE = (1 << 5) + } actions = 0; + + if (l->exec_mode == TIPC_LINK_BLOCKED) + return rc; + + switch (l->state) { + case TIPC_LINK_WORKING: + switch (evt) { + case TRAFFIC_EVT: + case ACTIVATE_EVT: + break; + case SILENCE_EVT: + l->state = TIPC_LINK_PROBING; + actions |= SND_PROBE; + break; + case PEER_RESET_EVT: + actions |= LINK_RESET | SND_ACTIVATE; + break; + default: + pr_debug("%s%u WORKING\n", link_unk_evt, evt); + } + break; + case TIPC_LINK_PROBING: + switch (evt) { + case TRAFFIC_EVT: + case ACTIVATE_EVT: + l->state = TIPC_LINK_WORKING; + break; + case PEER_RESET_EVT: + actions |= LINK_RESET | SND_ACTIVATE; + break; + case SILENCE_EVT: + if (l->silent_intv_cnt <= l->abort_limit) { + actions |= SND_PROBE; + break; + } + actions |= LINK_RESET | SND_RESET; + break; + default: + pr_err("%s%u PROBING\n", link_unk_evt, evt); + } + break; + case TIPC_LINK_RESETTING: + switch (evt) { + case TRAFFIC_EVT: + break; + case ACTIVATE_EVT: + pl = node_active_link(l->owner, 0); + if (pl && link_probing(pl)) + break; + actions |= LINK_ACTIVATE; + if (l->owner->working_links == 1) + tipc_link_sync_xmit(l); + break; + case PEER_RESET_EVT: + l->state = TIPC_LINK_ESTABLISHING; + actions |= SND_ACTIVATE; + break; + case SILENCE_EVT: + actions |= SND_RESET; + break; + default: + pr_err("%s%u in RESETTING\n", link_unk_evt, evt); + } + break; + case TIPC_LINK_ESTABLISHING: + switch (evt) { + case TRAFFIC_EVT: + case ACTIVATE_EVT: + pl = node_active_link(l->owner, 0); + if (pl && link_probing(pl)) + break; + actions |= LINK_ACTIVATE; + if (l->owner->working_links == 1) + tipc_link_sync_xmit(l); + break; + case PEER_RESET_EVT: + break; + case SILENCE_EVT: + actions |= SND_ACTIVATE; + break; + default: + pr_err("%s%u ESTABLISHING\n", link_unk_evt, evt); + } + break; + default: + pr_err("Unknown link state %u/%u\n", l->state, evt); + } + + /* Perform actions as decided by FSM */ + if (actions & LINK_RESET) { + l->exec_mode = TIPC_LINK_BLOCKED; + rc |= TIPC_LINK_DOWN_EVT; + } + if (actions & LINK_ACTIVATE) { + l->exec_mode = TIPC_LINK_OPEN; + rc |= TIPC_LINK_UP_EVT; + } + if (actions & (SND_STATE | SND_PROBE)) + mtyp = STATE_MSG; + if (actions & SND_RESET) + mtyp = RESET_MSG; + if (actions & SND_ACTIVATE) + mtyp = ACTIVATE_MSG; + if (actions & (SND_PROBE | SND_STATE | SND_RESET | SND_ACTIVATE)) + tipc_link_build_proto_msg(l, mtyp, actions & SND_PROBE, + 0, 0, 0, xmitq); + return rc; +} + /** * link_schedule_user - schedule a message sender for wakeup after congestion * @link: congested link @@ -474,9 +607,10 @@ void tipc_link_reset(struct tipc_link *l_ptr) /* Prepare for renewed mtu size negotiation */ l_ptr->mtu = l_ptr->advertised_mtu; - l_ptr->state = RESET_UNKNOWN; + l_ptr->state = TIPC_LINK_RESETTING; - if ((prev_state == RESET_UNKNOWN) || (prev_state == RESET_RESET)) + if ((prev_state == TIPC_LINK_RESETTING) || + (prev_state == TIPC_LINK_ESTABLISHING)) return; tipc_node_link_down(l_ptr->owner, l_ptr->bearer_id); @@ -515,6 +649,8 @@ static void link_activate(struct tipc_link *link) link->rcv_nxt = 1; link->stats.recv_info = 1; link->silent_intv_cnt = 0; + link->state = TIPC_LINK_WORKING; + link->exec_mode = TIPC_LINK_OPEN; tipc_node_link_up(node, link->bearer_id); tipc_bearer_add_dest(node->net, link->bearer_id, link->addr); } @@ -524,132 +660,29 @@ static void link_activate(struct tipc_link *link) * @l_ptr: pointer to link * @event: state machine event to process */ -static void link_state_event(struct tipc_link *l_ptr, unsigned int event) +static void link_state_event(struct tipc_link *l, unsigned int evt) { - struct tipc_link *other; + int rc; + struct sk_buff_head xmitq; + struct sk_buff *skb; - if (l_ptr->exec_mode == TIPC_LINK_BLOCKED) + if (l->exec_mode == TIPC_LINK_BLOCKED) return; - switch (l_ptr->state) { - case WORKING_WORKING: - switch (event) { - case TRAFFIC_EVT: - case ACTIVATE_MSG: - l_ptr->silent_intv_cnt = 0; - break; - case SILENCE_EVT: - if (!l_ptr->silent_intv_cnt) { - if (tipc_bclink_acks_missing(l_ptr->owner)) - tipc_link_proto_xmit(l_ptr, STATE_MSG, - 0, 0, 0, 0); - break; - } - l_ptr->state = WORKING_UNKNOWN; - tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); - break; - case RESET_MSG: - pr_debug("%s<%s>, requested by peer\n", - link_rst_msg, l_ptr->name); - tipc_link_reset(l_ptr); - l_ptr->state = RESET_RESET; - tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 0, 0, 0, 0); - break; - default: - pr_debug("%s%u in WW state\n", link_unk_evt, event); - } - break; - case WORKING_UNKNOWN: - switch (event) { - case TRAFFIC_EVT: - case ACTIVATE_MSG: - l_ptr->state = WORKING_WORKING; - l_ptr->silent_intv_cnt = 0; - break; - case RESET_MSG: - pr_debug("%s<%s>, requested by peer while probing\n", - link_rst_msg, l_ptr->name); - tipc_link_reset(l_ptr); - l_ptr->state = RESET_RESET; - tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 0, 0, 0, 0); - break; - case SILENCE_EVT: - if (!l_ptr->silent_intv_cnt) { - l_ptr->state = WORKING_WORKING; - if (tipc_bclink_acks_missing(l_ptr->owner)) - tipc_link_proto_xmit(l_ptr, STATE_MSG, - 0, 0, 0, 0); - } else if (l_ptr->silent_intv_cnt < - l_ptr->abort_limit) { - tipc_link_proto_xmit(l_ptr, STATE_MSG, - 1, 0, 0, 0); - } else { /* Link has failed */ - pr_debug("%s<%s>, peer not responding\n", - link_rst_msg, l_ptr->name); - tipc_link_reset(l_ptr); - l_ptr->state = RESET_UNKNOWN; - tipc_link_proto_xmit(l_ptr, RESET_MSG, - 0, 0, 0, 0); - } - break; - default: - pr_err("%s%u in WU state\n", link_unk_evt, event); - } - break; - case RESET_UNKNOWN: - switch (event) { - case TRAFFIC_EVT: - break; - case ACTIVATE_MSG: - other = node_active_link(l_ptr->owner, 0); - if (other && link_working_unknown(other)) - break; - l_ptr->state = WORKING_WORKING; - link_activate(l_ptr); - tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); - if (l_ptr->owner->working_links == 1) - tipc_link_sync_xmit(l_ptr); - break; - case RESET_MSG: - l_ptr->state = RESET_RESET; - tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 1, 0, 0, 0); - break; - case SILENCE_EVT: - tipc_link_proto_xmit(l_ptr, RESET_MSG, 0, 0, 0, 0); - break; - default: - pr_err("%s%u in RU state\n", link_unk_evt, event); - } - break; - case RESET_RESET: - switch (event) { - case TRAFFIC_EVT: - case ACTIVATE_MSG: - other = node_active_link(l_ptr->owner, 0); - if (other && link_working_unknown(other)) - break; - l_ptr->state = WORKING_WORKING; - link_activate(l_ptr); - tipc_link_proto_xmit(l_ptr, STATE_MSG, 1, 0, 0, 0); - if (l_ptr->owner->working_links == 1) - tipc_link_sync_xmit(l_ptr); - break; - case RESET_MSG: - break; - case SILENCE_EVT: - tipc_link_proto_xmit(l_ptr, ACTIVATE_MSG, - 0, 0, 0, 0); - break; - default: - pr_err("%s%u in RR state\n", link_unk_evt, event); - } - break; - default: - pr_err("Unknown link state %u/%u\n", l_ptr->state, event); - } + __skb_queue_head_init(&xmitq); + + rc = tipc_link_fsm_evt(l, evt, &xmitq); + + if (rc & TIPC_LINK_UP_EVT) + link_activate(l); + + if (rc & TIPC_LINK_DOWN_EVT) + tipc_link_reset(l); + + skb = __skb_dequeue(&xmitq); + if (!skb) + return; + tipc_bearer_send(l->owner->net, l->bearer_id, skb, &l->media_addr); } /** @@ -1102,7 +1135,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) link_prepare_wakeup(l_ptr); /* Process the incoming packet */ - if (unlikely(!link_working_working(l_ptr))) { + if (unlikely(!link_working(l_ptr))) { if (msg_user(msg) == LINK_PROTOCOL) { tipc_link_proto_rcv(l_ptr, skb); link_retrieve_defq(l_ptr, &head); @@ -1113,7 +1146,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) /* Traffic message. Conditionally activate link */ link_state_event(l_ptr, TRAFFIC_EVT); - if (link_working_working(l_ptr)) { + if (link_working(l_ptr)) { /* Re-insert buffer in front of queue */ __skb_queue_head(&head, skb); skb = NULL; @@ -1122,7 +1155,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) goto unlock; } - /* Link is now in state WORKING_WORKING */ + /* Link is now in state TIPC_LINK_WORKING */ if (unlikely(seq_no != l_ptr->rcv_nxt)) { link_handle_out_of_seq_msg(l_ptr, skb); link_retrieve_defq(l_ptr, &head); @@ -1365,16 +1398,15 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, switch (msg_type(msg)) { case RESET_MSG: - if (!link_working_unknown(l_ptr) && + if (!link_probing(l_ptr) && (l_ptr->peer_session != WILDCARD_SESSION)) { if (less_eq(msg_session(msg), l_ptr->peer_session)) break; /* duplicate or old reset: ignore */ } - if (!msg_redundant_link(msg) && (link_working_working(l_ptr) || - link_working_unknown(l_ptr))) { - /* - * peer has lost contact -- don't allow peer's links + if (!msg_redundant_link(msg) && (link_working(l_ptr) || + link_probing(l_ptr))) { + /* peer has lost contact -- don't allow peer's links * to reactivate before we recognize loss & clean up */ l_ptr->owner->action_flags |= TIPC_WAIT_OWN_LINKS_DOWN; @@ -1432,7 +1464,7 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, link_state_event(l_ptr, TRAFFIC_EVT); l_ptr->stats.recv_states++; - if (link_reset_unknown(l_ptr)) + if (link_resetting(l_ptr)) break; if (less_eq(l_ptr->rcv_nxt, msg_next_sent(msg))) @@ -1822,14 +1854,14 @@ static void link_print(struct tipc_link *l_ptr, const char *str) pr_info("%s Link %x<%s>:", str, l_ptr->addr, b_ptr->name); rcu_read_unlock(); - if (link_working_unknown(l_ptr)) - pr_cont(":WU\n"); - else if (link_reset_reset(l_ptr)) - pr_cont(":RR\n"); - else if (link_reset_unknown(l_ptr)) - pr_cont(":RU\n"); - else if (link_working_working(l_ptr)) - pr_cont(":WW\n"); + if (link_probing(l_ptr)) + pr_cont(":P\n"); + else if (link_establishing(l_ptr)) + pr_cont(":E\n"); + else if (link_resetting(l_ptr)) + pr_cont(":R\n"); + else if (link_working(l_ptr)) + pr_cont(":W\n"); else pr_cont("\n"); } diff --git a/net/tipc/link.h b/net/tipc/link.h index 0509c6de03cd..ef68424f492d 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -58,6 +58,13 @@ enum { TIPC_LINK_TUNNEL }; +/* Events occurring at packet reception or at timeout + */ +enum { + TIPC_LINK_UP_EVT = 1, + TIPC_LINK_DOWN_EVT = (1 << 1) +}; + /* Starting value for maximum packet size negotiation on unicast links * (unless bearer MTU is less) */ -- cgit v1.2.3 From 333ef69ed2121f535e00ceb26e095d3745584c6e Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 16 Jul 2015 16:54:28 -0400 Subject: tipc: simplify link timer implementation We create a second, simpler, link timer function, tipc_link_timeout(). The new function makes use of the new FSM function introduced in the previous commit, and just like it, takes a buffer queue as parameter. It returns an event bit field and potentially a link protocol packet to the caller. The existing timer function, link_timeout(), is still needed for a while, so we redesign it to become a wrapper around the new function. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 116 ++++++++++++++++++++++++++++++++++---------------------- net/tipc/link.h | 1 + 2 files changed, 72 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 5d2f9198c6bc..f58bb434d1c8 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -193,60 +193,30 @@ int tipc_link_is_active(struct tipc_link *l) /** * link_timeout - handle expiration of link timer - * @l_ptr: pointer to link */ static void link_timeout(unsigned long data) { - struct tipc_link *l_ptr = (struct tipc_link *)data; + struct tipc_link *l = (struct tipc_link *)data; + struct sk_buff_head xmitq; struct sk_buff *skb; + int rc; - tipc_node_lock(l_ptr->owner); + __skb_queue_head_init(&xmitq); - /* update counters used in statistical profiling of send traffic */ - l_ptr->stats.accu_queue_sz += skb_queue_len(&l_ptr->transmq); - l_ptr->stats.queue_sz_counts++; + tipc_node_lock(l->owner); - skb = skb_peek(&l_ptr->transmq); - if (skb) { - struct tipc_msg *msg = buf_msg(skb); - u32 length = msg_size(msg); - - if ((msg_user(msg) == MSG_FRAGMENTER) && - (msg_type(msg) == FIRST_FRAGMENT)) { - length = msg_size(msg_get_wrapped(msg)); - } - if (length) { - l_ptr->stats.msg_lengths_total += length; - l_ptr->stats.msg_length_counts++; - if (length <= 64) - l_ptr->stats.msg_length_profile[0]++; - else if (length <= 256) - l_ptr->stats.msg_length_profile[1]++; - else if (length <= 1024) - l_ptr->stats.msg_length_profile[2]++; - else if (length <= 4096) - l_ptr->stats.msg_length_profile[3]++; - else if (length <= 16384) - l_ptr->stats.msg_length_profile[4]++; - else if (length <= 32768) - l_ptr->stats.msg_length_profile[5]++; - else - l_ptr->stats.msg_length_profile[6]++; - } - } + rc = tipc_link_timeout(l, &xmitq); - /* do all other link processing performed on a periodic basis */ - if (l_ptr->silent_intv_cnt) - link_state_event(l_ptr, SILENCE_EVT); - else if (link_working(l_ptr) && tipc_bclink_acks_missing(l_ptr->owner)) - tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0); + if (rc & TIPC_LINK_DOWN_EVT) + tipc_link_reset(l); - l_ptr->silent_intv_cnt++; - if (skb_queue_len(&l_ptr->backlogq)) - tipc_link_push_packets(l_ptr); - link_set_timer(l_ptr, l_ptr->keepalive_intv); - tipc_node_unlock(l_ptr->owner); - tipc_link_put(l_ptr); + skb = __skb_dequeue(&xmitq); + if (skb) + tipc_bearer_send(l->owner->net, l->bearer_id, + skb, &l->media_addr); + link_set_timer(l, l->keepalive_intv); + tipc_node_unlock(l->owner); + tipc_link_put(l); } static void link_set_timer(struct tipc_link *link, unsigned long time) @@ -499,6 +469,62 @@ static int tipc_link_fsm_evt(struct tipc_link *l, int evt, return rc; } +/* link_profile_stats - update statistical profiling of traffic + */ +static void link_profile_stats(struct tipc_link *l) +{ + struct sk_buff *skb; + struct tipc_msg *msg; + int length; + + /* Update counters used in statistical profiling of send traffic */ + l->stats.accu_queue_sz += skb_queue_len(&l->transmq); + l->stats.queue_sz_counts++; + + skb = skb_peek(&l->transmq); + if (!skb) + return; + msg = buf_msg(skb); + length = msg_size(msg); + + if (msg_user(msg) == MSG_FRAGMENTER) { + if (msg_type(msg) != FIRST_FRAGMENT) + return; + length = msg_size(msg_get_wrapped(msg)); + } + l->stats.msg_lengths_total += length; + l->stats.msg_length_counts++; + if (length <= 64) + l->stats.msg_length_profile[0]++; + else if (length <= 256) + l->stats.msg_length_profile[1]++; + else if (length <= 1024) + l->stats.msg_length_profile[2]++; + else if (length <= 4096) + l->stats.msg_length_profile[3]++; + else if (length <= 16384) + l->stats.msg_length_profile[4]++; + else if (length <= 32768) + l->stats.msg_length_profile[5]++; + else + l->stats.msg_length_profile[6]++; +} + +/* tipc_link_timeout - perform periodic task as instructed from node timeout + */ +int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) +{ + int rc = 0; + + link_profile_stats(l); + if (l->silent_intv_cnt) + rc = tipc_link_fsm_evt(l, SILENCE_EVT, xmitq); + else if (link_working(l) && tipc_bclink_acks_missing(l->owner)) + tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq); + l->silent_intv_cnt++; + return rc; +} + /** * link_schedule_user - schedule a message sender for wakeup after congestion * @link: congested link diff --git a/net/tipc/link.h b/net/tipc/link.h index ef68424f492d..98507b0f008d 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -245,6 +245,7 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info); int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info); int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); void link_prepare_wakeup(struct tipc_link *l); +int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq); static inline u32 link_own_addr(struct tipc_link *l) { -- cgit v1.2.3 From 8a1577c96f122308ac9b5f195f9f9a7dd74ac541 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 16 Jul 2015 16:54:29 -0400 Subject: tipc: move link supervision timer to node level In our effort to move control of the links to the link aggregation layer, we move the perodic link supervision timer to struct tipc_node. The new timer is shared between all links belonging to the node, thus saving resources, while still kicking the FSM on both its pertaining links at each expiration. The current link timer and corresponding functions are removed. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 78 +++------------------------------------------------------ net/tipc/link.h | 2 -- net/tipc/node.c | 66 +++++++++++++++++++++++++++++++++++++++++++++--- net/tipc/node.h | 2 ++ 4 files changed, 68 insertions(+), 80 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index f58bb434d1c8..5b4609bd0ddc 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -127,7 +127,6 @@ static void link_handle_out_of_seq_msg(struct tipc_link *link, struct sk_buff *skb); static void tipc_link_proto_rcv(struct tipc_link *link, struct sk_buff *skb); -static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol); static void link_state_event(struct tipc_link *l_ptr, u32 event); static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, u16 rcvgap, int tolerance, int priority, @@ -139,7 +138,6 @@ static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf); static void tipc_link_input(struct tipc_link *l, struct sk_buff *skb); static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb); static bool tipc_link_failover_rcv(struct tipc_link *l, struct sk_buff **skb); -static void link_set_timer(struct tipc_link *link, unsigned long time); static void link_activate(struct tipc_link *link); /* @@ -150,21 +148,6 @@ static unsigned int align(unsigned int i) return (i + 3) & ~3u; } -static void tipc_link_release(struct kref *kref) -{ - kfree(container_of(kref, struct tipc_link, ref)); -} - -static void tipc_link_get(struct tipc_link *l_ptr) -{ - kref_get(&l_ptr->ref); -} - -static void tipc_link_put(struct tipc_link *l_ptr) -{ - kref_put(&l_ptr->ref, tipc_link_release); -} - static struct tipc_link *tipc_parallel_link(struct tipc_link *l) { struct tipc_node *n = l->owner; @@ -191,40 +174,6 @@ int tipc_link_is_active(struct tipc_link *l) return (node_active_link(n, 0) == l) || (node_active_link(n, 1) == l); } -/** - * link_timeout - handle expiration of link timer - */ -static void link_timeout(unsigned long data) -{ - struct tipc_link *l = (struct tipc_link *)data; - struct sk_buff_head xmitq; - struct sk_buff *skb; - int rc; - - __skb_queue_head_init(&xmitq); - - tipc_node_lock(l->owner); - - rc = tipc_link_timeout(l, &xmitq); - - if (rc & TIPC_LINK_DOWN_EVT) - tipc_link_reset(l); - - skb = __skb_dequeue(&xmitq); - if (skb) - tipc_bearer_send(l->owner->net, l->bearer_id, - skb, &l->media_addr); - link_set_timer(l, l->keepalive_intv); - tipc_node_unlock(l->owner); - tipc_link_put(l); -} - -static void link_set_timer(struct tipc_link *link, unsigned long time) -{ - if (!mod_timer(&link->timer, jiffies + time)) - tipc_link_get(link); -} - /** * tipc_link_create - create a new link * @n_ptr: pointer to associated node @@ -265,7 +214,6 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, pr_warn("Link creation failed, no memory\n"); return NULL; } - kref_init(&l_ptr->ref); l_ptr->addr = peer; if_name = strchr(b_ptr->name, ':') + 1; sprintf(l_ptr->name, "%u.%u.%u:%s-%u.%u.%u:unknown", @@ -278,7 +226,7 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, l_ptr->owner = n_ptr; l_ptr->peer_session = WILDCARD_SESSION; l_ptr->bearer_id = b_ptr->identity; - link_set_supervision_props(l_ptr, b_ptr->tolerance); + l_ptr->tolerance = b_ptr->tolerance; l_ptr->state = TIPC_LINK_RESETTING; l_ptr->pmsg = (struct tipc_msg *)&l_ptr->proto_msg; @@ -304,8 +252,6 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, skb_queue_head_init(l_ptr->inputq); link_reset_statistics(l_ptr); tipc_node_attach_link(n_ptr, l_ptr); - setup_timer(&l_ptr->timer, link_timeout, (unsigned long)l_ptr); - link_set_timer(l_ptr, l_ptr->keepalive_intv); return l_ptr; } @@ -316,12 +262,8 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, void tipc_link_delete(struct tipc_link *l) { tipc_link_reset(l); - if (del_timer(&l->timer)) - tipc_link_put(l); - /* Delete link now, or when timer is finished: */ tipc_link_reset_fragments(l); tipc_node_detach_link(l->owner, l); - tipc_link_put(l); } void tipc_link_delete_list(struct net *net, unsigned int bearer_id) @@ -1447,7 +1389,7 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, msg_tol = msg_link_tolerance(msg); if (msg_tol > l_ptr->tolerance) - link_set_supervision_props(l_ptr, msg_tol); + l_ptr->tolerance = msg_tol; if (msg_linkprio(msg) > l_ptr->priority) l_ptr->priority = msg_linkprio(msg); @@ -1473,7 +1415,7 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, msg_tol = msg_link_tolerance(msg); if (msg_tol) - link_set_supervision_props(l_ptr, msg_tol); + l_ptr->tolerance = msg_tol; if (msg_linkprio(msg) && (msg_linkprio(msg) != l_ptr->priority)) { @@ -1796,18 +1738,6 @@ exit: return *skb; } -static void link_set_supervision_props(struct tipc_link *l_ptr, u32 tol) -{ - unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4; - - if ((tol < TIPC_MIN_LINK_TOL) || (tol > TIPC_MAX_LINK_TOL)) - return; - - l_ptr->tolerance = tol; - l_ptr->keepalive_intv = msecs_to_jiffies(intv); - l_ptr->abort_limit = tol / (jiffies_to_msecs(l_ptr->keepalive_intv)); -} - void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) { int max_bulk = TIPC_MAX_PUBLICATIONS / (l->mtu / ITEM_SIZE); @@ -1984,7 +1914,7 @@ int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info) u32 tol; tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); - link_set_supervision_props(link, tol); + link->tolerance = tol; tipc_link_proto_xmit(link, STATE_MSG, 0, 0, tol, 0); } if (props[TIPC_NLA_PROP_PRIO]) { diff --git a/net/tipc/link.h b/net/tipc/link.h index 98507b0f008d..0cf7d2b11803 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -146,9 +146,7 @@ struct tipc_link { u32 addr; char name[TIPC_MAX_LINK_NAME]; struct tipc_media_addr media_addr; - struct timer_list timer; struct tipc_node *owner; - struct kref ref; /* Management and link supervision data */ u32 peer_session; diff --git a/net/tipc/node.c b/net/tipc/node.c index b7a4457f653c..77effb233725 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -44,6 +44,7 @@ static void node_lost_contact(struct tipc_node *n_ptr); static void node_established_contact(struct tipc_node *n_ptr); static void tipc_node_delete(struct tipc_node *node); +static void tipc_node_timeout(unsigned long data); struct tipc_sock_conn { u32 port; @@ -145,11 +146,27 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr) n_ptr->active_links[0] = INVALID_BEARER_ID; n_ptr->active_links[1] = INVALID_BEARER_ID; tipc_node_get(n_ptr); + setup_timer(&n_ptr->timer, tipc_node_timeout, (unsigned long)n_ptr); + n_ptr->keepalive_intv = U32_MAX; exit: spin_unlock_bh(&tn->node_list_lock); return n_ptr; } +static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) +{ + unsigned long tol = l->tolerance; + unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4; + unsigned long keepalive_intv = msecs_to_jiffies(intv); + + /* Link with lowest tolerance determines timer interval */ + if (keepalive_intv < n->keepalive_intv) + n->keepalive_intv = keepalive_intv; + + /* Ensure link's abort limit corresponds to current interval */ + l->abort_limit = l->tolerance / jiffies_to_msecs(n->keepalive_intv); +} + static void tipc_node_delete(struct tipc_node *node) { list_del_rcu(&node->list); @@ -163,8 +180,11 @@ void tipc_node_stop(struct net *net) struct tipc_node *node, *t_node; spin_lock_bh(&tn->node_list_lock); - list_for_each_entry_safe(node, t_node, &tn->node_list, list) + list_for_each_entry_safe(node, t_node, &tn->node_list, list) { + if (del_timer(&node->timer)) + tipc_node_put(node); tipc_node_put(node); + } spin_unlock_bh(&tn->node_list_lock); } @@ -222,6 +242,38 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) tipc_node_put(node); } +/* tipc_node_timeout - handle expiration of node timer + */ +static void tipc_node_timeout(unsigned long data) +{ + struct tipc_node *n = (struct tipc_node *)data; + struct sk_buff_head xmitq; + struct tipc_link *l; + struct tipc_media_addr *maddr; + int bearer_id; + int rc = 0; + + __skb_queue_head_init(&xmitq); + + for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { + tipc_node_lock(n); + l = n->links[bearer_id].link; + if (l) { + /* Link tolerance may change asynchronously: */ + tipc_node_calculate_timer(n, l); + rc = tipc_link_timeout(l, &xmitq); + if (rc & TIPC_LINK_DOWN_EVT) + tipc_link_reset(l); + } + tipc_node_unlock(n); + maddr = &n->links[bearer_id].maddr; + tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr); + } + if (!mod_timer(&n->timer, jiffies + n->keepalive_intv)) + tipc_node_get(n); + tipc_node_put(n); +} + /** * tipc_node_link_up - handle addition of link * @@ -335,10 +387,16 @@ bool tipc_node_update_dest(struct tipc_node *n, struct tipc_bearer *b, struct tipc_media_addr *curr = &n->links[b->identity].maddr; struct sk_buff_head *inputq = &n->links[b->identity].inputq; - if (!l) + if (!l) { l = tipc_link_create(n, b, maddr, inputq, &n->bclink.namedq); - if (!l) - return false; + if (!l) + return false; + tipc_node_calculate_timer(n, l); + if (n->link_cnt == 1) { + if (!mod_timer(&n->timer, jiffies + n->keepalive_intv)) + tipc_node_get(n); + } + } memcpy(&l->media_addr, maddr, sizeof(*maddr)); memcpy(curr, maddr, sizeof(*maddr)); tipc_link_reset(l); diff --git a/net/tipc/node.h b/net/tipc/node.h index 86b7c740cf84..2d56344962e7 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -140,6 +140,8 @@ struct tipc_node { u32 link_id; struct list_head publ_list; struct list_head conn_sks; + unsigned long keepalive_intv; + struct timer_list timer; struct rcu_head rcu; }; -- cgit v1.2.3 From 1a20cc254e60e79929ef7edb5cf784df86b46e42 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 16 Jul 2015 16:54:30 -0400 Subject: tipc: introduce node contact FSM The logics for determining when a node is permitted to establish and maintain contact with its peer node becomes non-trivial in the presence of multiple parallel links that may come and go independently. A known failure scenario is that one endpoint registers both its links to the peer lost, cleans up it binding table, and prepares for a table update once contact is re-establihed, while the other endpoint may see its links reset and re-established one by one, hence seeing no need to re-synchronize the binding table. To avoid this, a node must not allow re-establishing contact until it has confirmation that even the peer has lost both links. Currently, the mechanism for handling this consists of setting and resetting two state flags from different locations in the code. This solution is hard to understand and maintain. A closer analysis even reveals that it is not completely safe. In this commit we do instead introduce an FSM that keeps track of the conditions for when the node can establish and maintain links. It has six states and four events, and is strictly based on explicit knowledge about the own node's and the peer node's contact states. Only events leading to state change are shown as edges in the figure below. +--------------+ | SELF_UP/ | +---------------->| PEER_COMING |-----------------+ SELF_ | +--------------+ |PEER_ ESTBL_ | | |ESTBL_ CONTACT| SELF_LOST_CONTACT | |CONTACT | v | | +--------------+ | | PEER_ | SELF_DOWN/ | SELF_ | | LOST_ +--| PEER_LEAVING |<--+ LOST_ v +-------------+ CONTACT | +--------------+ | CONTACT +-----------+ | SELF_DOWN/ |<----------+ +----------| SELF_UP/ | | PEER_DOWN |<----------+ +----------| PEER_UP | +-------------+ SELF_ | +--------------+ | PEER_ +-----------+ | LOST_ +--| SELF_LEAVING/|<--+ LOST_ A | CONTACT | PEER_DOWN | CONTACT | | +--------------+ | | A | PEER_ | PEER_LOST_CONTACT | |SELF_ ESTBL_ | | |ESTBL_ CONTACT| +--------------+ |CONTACT +---------------->| PEER_UP/ |-----------------+ | SELF_COMING | +--------------+ Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 74 ++++++++++++++------------------ net/tipc/msg.h | 7 +++ net/tipc/node.c | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- net/tipc/node.h | 28 +++++++++--- 4 files changed, 185 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 5b4609bd0ddc..eaccf4552d15 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -911,9 +911,13 @@ static void link_retransmit_failure(struct tipc_link *l_ptr, if (l_ptr->addr) { /* Handle failure on standard link */ - link_print(l_ptr, "Resetting link\n"); + link_print(l_ptr, "Resetting link "); + pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n", + msg_user(msg), msg_type(msg), msg_size(msg), + msg_errcode(msg)); + pr_info("sqno %u, prev: %x, src: %x\n", + msg_seqno(msg), msg_prevnode(msg), msg_orignode(msg)); tipc_link_reset(l_ptr); - } else { /* Handle failure on broadcast link */ struct tipc_node *n_ptr; @@ -1067,15 +1071,8 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) if (unlikely(!l_ptr)) goto unlock; - /* Verify that communication with node is currently allowed */ - if ((n_ptr->action_flags & TIPC_WAIT_PEER_LINKS_DOWN) && - msg_user(msg) == LINK_PROTOCOL && - (msg_type(msg) == RESET_MSG || - msg_type(msg) == ACTIVATE_MSG) && - !msg_redundant_link(msg)) - n_ptr->action_flags &= ~TIPC_WAIT_PEER_LINKS_DOWN; - - if (tipc_node_blocked(n_ptr)) + /* Is reception of this pkt permitted at the moment ? */ + if (!tipc_node_filter_skb(n_ptr, msg)) goto unlock; /* Validate message sequence number info */ @@ -1371,15 +1368,6 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, if (less_eq(msg_session(msg), l_ptr->peer_session)) break; /* duplicate or old reset: ignore */ } - - if (!msg_redundant_link(msg) && (link_working(l_ptr) || - link_probing(l_ptr))) { - /* peer has lost contact -- don't allow peer's links - * to reactivate before we recognize loss & clean up - */ - l_ptr->owner->action_flags |= TIPC_WAIT_OWN_LINKS_DOWN; - } - link_state_event(l_ptr, RESET_MSG); /* fall thru' */ @@ -1408,6 +1396,8 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, l_ptr->peer_session = msg_session(msg); l_ptr->peer_bearer_id = msg_bearer_id(msg); + if (!msg_peer_is_up(msg)) + tipc_node_fsm_evt(l_ptr->owner, PEER_LOST_CONTACT_EVT); if (msg_type(msg) == ACTIVATE_MSG) link_state_event(l_ptr, ACTIVATE_MSG); break; @@ -1419,11 +1409,11 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, if (msg_linkprio(msg) && (msg_linkprio(msg) != l_ptr->priority)) { - pr_debug("%s<%s>, priority change %u->%u\n", - link_rst_msg, l_ptr->name, - l_ptr->priority, msg_linkprio(msg)); + pr_info("%s<%s>, priority change %u->%u\n", + link_rst_msg, l_ptr->name, + l_ptr->priority, msg_linkprio(msg)); l_ptr->priority = msg_linkprio(msg); - tipc_link_reset(l_ptr); /* Enforce change to take effect */ + tipc_link_reset(l_ptr); break; } @@ -1446,15 +1436,18 @@ static void tipc_link_proto_rcv(struct tipc_link *l_ptr, tipc_bclink_update_link_state(l_ptr->owner, msg_last_bcast(msg)); - if (rec_gap || (msg_probe(msg))) { + if (rec_gap || (msg_probe(msg))) tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, rec_gap, 0, 0); - } + if (msg_seq_gap(msg)) { l_ptr->stats.recv_nacks++; tipc_link_retransmit(l_ptr, skb_peek(&l_ptr->transmq), msg_seq_gap(msg)); } + if (tipc_link_is_up(l_ptr)) + tipc_node_fsm_evt(l_ptr->owner, + PEER_ESTABL_CONTACT_EVT); break; } exit: @@ -1478,10 +1471,6 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, if (l->exec_mode == TIPC_LINK_BLOCKED) return; - /* Abort non-RESET send if communication with node is prohibited */ - if ((tipc_node_blocked(l->owner)) && (mtyp != RESET_MSG)) - return; - msg_set_type(hdr, mtyp); msg_set_net_plane(hdr, l->net_plane); msg_set_bcast_ack(hdr, l->owner->bclink.last_in); @@ -1799,27 +1788,28 @@ static void link_reset_statistics(struct tipc_link *l_ptr) l_ptr->stats.recv_info = l_ptr->rcv_nxt; } -static void link_print(struct tipc_link *l_ptr, const char *str) +static void link_print(struct tipc_link *l, const char *str) { - struct tipc_net *tn = net_generic(l_ptr->owner->net, tipc_net_id); - struct tipc_bearer *b_ptr; + struct sk_buff *hskb = skb_peek(&l->transmq); + u16 head = hskb ? msg_seqno(buf_msg(hskb)) : l->snd_nxt; + u16 tail = l->snd_nxt - 1; - rcu_read_lock(); - b_ptr = rcu_dereference_rtnl(tn->bearer_list[l_ptr->bearer_id]); - if (b_ptr) - pr_info("%s Link %x<%s>:", str, l_ptr->addr, b_ptr->name); - rcu_read_unlock(); + pr_info("%s Link <%s>:", str, l->name); - if (link_probing(l_ptr)) + if (link_probing(l)) pr_cont(":P\n"); - else if (link_establishing(l_ptr)) + else if (link_establishing(l)) pr_cont(":E\n"); - else if (link_resetting(l_ptr)) + else if (link_resetting(l)) pr_cont(":R\n"); - else if (link_working(l_ptr)) + else if (link_working(l)) pr_cont(":W\n"); else pr_cont("\n"); + + pr_info("XMTQ: %u [%u-%u], BKLGQ: %u, SNDNX: %u, RCVNX: %u\n", + skb_queue_len(&l->transmq), head, tail, + skb_queue_len(&l->backlogq), l->snd_nxt, l->rcv_nxt); } /* Parse and validate nested (link) properties valid for media, bearer and link diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 19c45fb66238..4dc66d9f69cc 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -766,6 +766,13 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) msg_set_bits(m, 9, 0, 0xffff, n); } +static inline bool msg_peer_is_up(struct tipc_msg *m) +{ + if (likely(msg_user(m) != LINK_PROTOCOL) || (msg_type(m) == STATE_MSG)) + return true; + return msg_redundant_link(m); +} + struct sk_buff *tipc_buf_acquire(u32 size); bool tipc_msg_validate(struct sk_buff *skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode, diff --git a/net/tipc/node.c b/net/tipc/node.c index 77effb233725..9dbbb5de287b 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -141,7 +141,7 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr) break; } list_add_tail_rcu(&n_ptr->list, &temp_node->list); - n_ptr->action_flags = TIPC_WAIT_PEER_LINKS_DOWN; + n_ptr->state = SELF_DOWN_PEER_DOWN; n_ptr->signature = INVALID_NODE_SIG; n_ptr->active_links[0] = INVALID_BEARER_ID; n_ptr->active_links[1] = INVALID_BEARER_ID; @@ -421,8 +421,131 @@ void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr) } } +/* tipc_node_fsm_evt - node finite state machine + * Determines when contact is allowed with peer node + */ +void tipc_node_fsm_evt(struct tipc_node *n, int evt) +{ + int state = n->state; + + switch (state) { + case SELF_DOWN_PEER_DOWN: + switch (evt) { + case SELF_ESTABL_CONTACT_EVT: + state = SELF_UP_PEER_COMING; + break; + case PEER_ESTABL_CONTACT_EVT: + state = SELF_COMING_PEER_UP; + break; + case SELF_LOST_CONTACT_EVT: + case PEER_LOST_CONTACT_EVT: + break; + default: + pr_err("Unknown node fsm evt %x/%x\n", state, evt); + } + break; + case SELF_UP_PEER_UP: + switch (evt) { + case SELF_LOST_CONTACT_EVT: + state = SELF_DOWN_PEER_LEAVING; + break; + case PEER_LOST_CONTACT_EVT: + state = SELF_LEAVING_PEER_DOWN; + break; + case SELF_ESTABL_CONTACT_EVT: + case PEER_ESTABL_CONTACT_EVT: + break; + default: + pr_err("Unknown node fsm evt %x/%x\n", state, evt); + } + break; + case SELF_DOWN_PEER_LEAVING: + switch (evt) { + case PEER_LOST_CONTACT_EVT: + state = SELF_DOWN_PEER_DOWN; + break; + case SELF_ESTABL_CONTACT_EVT: + case PEER_ESTABL_CONTACT_EVT: + case SELF_LOST_CONTACT_EVT: + break; + default: + pr_err("Unknown node fsm evt %x/%x\n", state, evt); + } + break; + case SELF_UP_PEER_COMING: + switch (evt) { + case PEER_ESTABL_CONTACT_EVT: + state = SELF_UP_PEER_UP; + break; + case SELF_LOST_CONTACT_EVT: + state = SELF_DOWN_PEER_LEAVING; + break; + case SELF_ESTABL_CONTACT_EVT: + case PEER_LOST_CONTACT_EVT: + break; + default: + pr_err("Unknown node fsm evt %x/%x\n", state, evt); + } + break; + case SELF_COMING_PEER_UP: + switch (evt) { + case SELF_ESTABL_CONTACT_EVT: + state = SELF_UP_PEER_UP; + break; + case PEER_LOST_CONTACT_EVT: + state = SELF_LEAVING_PEER_DOWN; + break; + case SELF_LOST_CONTACT_EVT: + case PEER_ESTABL_CONTACT_EVT: + break; + default: + pr_err("Unknown node fsm evt %x/%x\n", state, evt); + } + break; + case SELF_LEAVING_PEER_DOWN: + switch (evt) { + case SELF_LOST_CONTACT_EVT: + state = SELF_DOWN_PEER_DOWN; + break; + case SELF_ESTABL_CONTACT_EVT: + case PEER_ESTABL_CONTACT_EVT: + case PEER_LOST_CONTACT_EVT: + break; + default: + pr_err("Unknown node fsm evt %x/%x\n", state, evt); + } + break; + default: + pr_err("Unknown node fsm state %x\n", state); + break; + } + + n->state = state; +} + +bool tipc_node_filter_skb(struct tipc_node *n, struct tipc_msg *hdr) +{ + int state = n->state; + + if (likely(state == SELF_UP_PEER_UP)) + return true; + if (state == SELF_DOWN_PEER_DOWN) + return true; + if (state == SELF_UP_PEER_COMING) + return true; + if (state == SELF_COMING_PEER_UP) + return true; + if (state == SELF_LEAVING_PEER_DOWN) + return false; + if (state == SELF_DOWN_PEER_LEAVING) + if (!msg_peer_is_up(hdr)) + return true; + return false; +} + static void node_established_contact(struct tipc_node *n_ptr) { + tipc_node_fsm_evt(n_ptr, SELF_ESTABL_CONTACT_EVT); n_ptr->action_flags |= TIPC_NOTIFY_NODE_UP; n_ptr->bclink.oos_state = 0; n_ptr->bclink.acked = tipc_bclink_get_last_sent(n_ptr->net); @@ -468,11 +591,8 @@ static void node_lost_contact(struct tipc_node *n_ptr) l_ptr->failover_skb = NULL; tipc_link_reset_fragments(l_ptr); } - - n_ptr->action_flags &= ~TIPC_WAIT_OWN_LINKS_DOWN; - /* Prevent re-contact with node until cleanup is done */ - n_ptr->action_flags |= TIPC_WAIT_PEER_LINKS_DOWN; + tipc_node_fsm_evt(n_ptr, SELF_LOST_CONTACT_EVT); /* Notify publications from this node */ n_ptr->action_flags |= TIPC_NOTIFY_NODE_DOWN; diff --git a/net/tipc/node.h b/net/tipc/node.h index 2d56344962e7..270256e09ee5 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -47,6 +47,24 @@ #define INVALID_BEARER_ID -1 +/* Node FSM states and events: + */ +enum { + SELF_DOWN_PEER_DOWN = 0xdd, + SELF_UP_PEER_UP = 0xaa, + SELF_DOWN_PEER_LEAVING = 0xd1, + SELF_UP_PEER_COMING = 0xac, + SELF_COMING_PEER_UP = 0xca, + SELF_LEAVING_PEER_DOWN = 0x1d, +}; + +enum { + SELF_ESTABL_CONTACT_EVT = 0xec, + SELF_LOST_CONTACT_EVT = 0x1c, + PEER_ESTABL_CONTACT_EVT = 0xfec, + PEER_LOST_CONTACT_EVT = 0xf1c +}; + /* Flags used to take different actions according to flag type * TIPC_WAIT_PEER_LINKS_DOWN: wait to see that peer's links are down * TIPC_WAIT_OWN_LINKS_DOWN: wait until peer node is declared down @@ -56,8 +74,6 @@ */ enum { TIPC_MSG_EVT = 1, - TIPC_WAIT_PEER_LINKS_DOWN = (1 << 1), - TIPC_WAIT_OWN_LINKS_DOWN = (1 << 2), TIPC_NOTIFY_NODE_DOWN = (1 << 3), TIPC_NOTIFY_NODE_UP = (1 << 4), TIPC_WAKEUP_BCAST_USERS = (1 << 5), @@ -133,6 +149,7 @@ struct tipc_node { int action_flags; struct tipc_node_bclink bclink; struct list_head list; + int state; int link_cnt; u16 working_links; u16 capabilities; @@ -176,11 +193,8 @@ static inline void tipc_node_lock(struct tipc_node *node) spin_lock_bh(&node->lock); } -static inline bool tipc_node_blocked(struct tipc_node *node) -{ - return (node->action_flags & (TIPC_WAIT_PEER_LINKS_DOWN | - TIPC_NOTIFY_NODE_DOWN | TIPC_WAIT_OWN_LINKS_DOWN)); -} +void tipc_node_fsm_evt(struct tipc_node *n, int evt); +bool tipc_node_filter_skb(struct tipc_node *n, struct tipc_msg *hdr); static inline struct tipc_link *node_active_link(struct tipc_node *n, int sel) { -- cgit v1.2.3 From d999297c3dbbe7fdd832f7fa4ec84301e170b3e6 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 16 Jul 2015 16:54:31 -0400 Subject: tipc: reduce locking scope during packet reception We convert packet/message reception according to the same principle we have been using for message sending and timeout handling: We move the function tipc_rcv() to node.c, hence handling the initial packet reception at the link aggregation level. The function grabs the node lock, selects the receiving link, and accesses it via a new call tipc_link_rcv(). This function appends buffers to the input queue for delivery upwards, but it may also append outgoing packets to the xmit queue, just as we do during regular message sending. The latter will happen when buffers are forwarded from the link backlog, or when retransmission is requested. Upon return of this function, and after having released the node lock, tipc_rcv() delivers/tranmsits the contents of those queues, but it may also perform actions such as link activation or reset, as indicated by the return flags from the link. This reduces the number of cpu cycles spent inside the node spinlock, and reduces contention on that lock. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bcast.c | 23 ++ net/tipc/bcast.h | 1 + net/tipc/core.h | 5 + net/tipc/link.c | 673 +++++++++++++++++++++++++------------------------------ net/tipc/link.h | 6 +- net/tipc/msg.h | 50 ++++- net/tipc/node.c | 105 ++++++++- net/tipc/node.h | 4 - 8 files changed, 478 insertions(+), 389 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index aab4e8dd7b32..8b010c976b2f 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -316,6 +316,29 @@ void tipc_bclink_update_link_state(struct tipc_node *n_ptr, } } +void tipc_bclink_sync_state(struct tipc_node *n, struct tipc_msg *hdr) +{ + u16 last = msg_last_bcast(hdr); + int mtyp = msg_type(hdr); + + if (unlikely(msg_user(hdr) != LINK_PROTOCOL)) + return; + if (mtyp == STATE_MSG) { + tipc_bclink_update_link_state(n, last); + return; + } + /* Compatibility: older nodes don't know BCAST_PROTOCOL synchronization, + * and transfer synch info in LINK_PROTOCOL messages. + */ + if (tipc_node_is_up(n)) + return; + if ((mtyp != RESET_MSG) && (mtyp != ACTIVATE_MSG)) + return; + n->bclink.last_sent = last; + n->bclink.last_in = last; + n->bclink.oos_state = 0; +} + /** * bclink_peek_nack - monitor retransmission requests sent by other nodes * diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index 3c290a48f720..d74c69bcf60b 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -133,5 +133,6 @@ void tipc_bclink_wakeup_users(struct net *net); int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg); int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]); void tipc_bclink_input(struct net *net); +void tipc_bclink_sync_state(struct tipc_node *n, struct tipc_msg *msg); #endif diff --git a/net/tipc/core.h b/net/tipc/core.h index 0fcf133d5cb7..f4ed67778c54 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -129,6 +129,11 @@ static inline int less(u16 left, u16 right) return less_eq(left, right) && (mod(right) != mod(left)); } +static inline int in_range(u16 val, u16 min, u16 max) +{ + return !less(val, min) && !more(val, max); +} + #ifdef CONFIG_SYSCTL int tipc_register_sysctl(void); void tipc_unregister_sysctl(void); diff --git a/net/tipc/link.c b/net/tipc/link.c index eaccf4552d15..55b675d20de8 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -76,6 +76,10 @@ static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { [TIPC_NLA_PROP_WIN] = { .type = NLA_U32 } }; +/* + * Interval between NACKs when packets arrive out of order + */ +#define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2) /* * Out-of-range value for link session numbers */ @@ -123,22 +127,19 @@ static int link_establishing(struct tipc_link *l) return l->state == TIPC_LINK_ESTABLISHING; } -static void link_handle_out_of_seq_msg(struct tipc_link *link, - struct sk_buff *skb); -static void tipc_link_proto_rcv(struct tipc_link *link, - struct sk_buff *skb); -static void link_state_event(struct tipc_link *l_ptr, u32 event); +static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, + struct sk_buff_head *xmitq); static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, u16 rcvgap, int tolerance, int priority, struct sk_buff_head *xmitq); static void link_reset_statistics(struct tipc_link *l_ptr); static void link_print(struct tipc_link *l_ptr, const char *str); -static void tipc_link_sync_xmit(struct tipc_link *l); +static void tipc_link_build_bcast_sync_msg(struct tipc_link *l, + struct sk_buff_head *xmitq); static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf); static void tipc_link_input(struct tipc_link *l, struct sk_buff *skb); static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb); static bool tipc_link_failover_rcv(struct tipc_link *l, struct sk_buff **skb); -static void link_activate(struct tipc_link *link); /* * Simple link routines @@ -283,6 +284,26 @@ void tipc_link_delete_list(struct net *net, unsigned int bearer_id) rcu_read_unlock(); } +/* tipc_link_build_bcast_sync_msg() - synchronize broadcast link endpoints. + * + * Give a newly added peer node the sequence number where it should + * start receiving and acking broadcast packets. + */ +static void tipc_link_build_bcast_sync_msg(struct tipc_link *l, + struct sk_buff_head *xmitq) +{ + struct sk_buff *skb; + struct sk_buff_head list; + + skb = tipc_msg_create(BCAST_PROTOCOL, STATE_MSG, INT_H_SIZE, + 0, l->addr, link_own_addr(l), 0, 0, 0); + if (!skb) + return; + __skb_queue_head_init(&list); + __skb_queue_tail(&list, skb); + tipc_link_xmit(l, &list, xmitq); +} + /** * tipc_link_fsm_evt - link finite state machine * @l: pointer to link @@ -295,12 +316,13 @@ static int tipc_link_fsm_evt(struct tipc_link *l, int evt, int mtyp = 0, rc = 0; struct tipc_link *pl; enum { - LINK_RESET = 1, - LINK_ACTIVATE = (1 << 1), - SND_PROBE = (1 << 2), - SND_STATE = (1 << 3), - SND_RESET = (1 << 4), - SND_ACTIVATE = (1 << 5) + LINK_RESET = 1, + LINK_ACTIVATE = (1 << 1), + SND_PROBE = (1 << 2), + SND_STATE = (1 << 3), + SND_RESET = (1 << 4), + SND_ACTIVATE = (1 << 5), + SND_BCAST_SYNC = (1 << 6) } actions = 0; if (l->exec_mode == TIPC_LINK_BLOCKED) @@ -352,8 +374,8 @@ static int tipc_link_fsm_evt(struct tipc_link *l, int evt, if (pl && link_probing(pl)) break; actions |= LINK_ACTIVATE; - if (l->owner->working_links == 1) - tipc_link_sync_xmit(l); + if (!l->owner->working_links) + actions |= SND_BCAST_SYNC; break; case PEER_RESET_EVT: l->state = TIPC_LINK_ESTABLISHING; @@ -374,8 +396,8 @@ static int tipc_link_fsm_evt(struct tipc_link *l, int evt, if (pl && link_probing(pl)) break; actions |= LINK_ACTIVATE; - if (l->owner->working_links == 1) - tipc_link_sync_xmit(l); + if (!l->owner->working_links) + actions |= SND_BCAST_SYNC; break; case PEER_RESET_EVT: break; @@ -408,6 +430,8 @@ static int tipc_link_fsm_evt(struct tipc_link *l, int evt, if (actions & (SND_PROBE | SND_STATE | SND_RESET | SND_ACTIVATE)) tipc_link_build_proto_msg(l, mtyp, actions & SND_PROBE, 0, 0, 0, xmitq); + if (actions & SND_BCAST_SYNC) + tipc_link_build_bcast_sync_msg(l, xmitq); return rc; } @@ -605,12 +629,14 @@ void tipc_link_reset(struct tipc_link *l_ptr) l_ptr->reasm_buf = NULL; l_ptr->rcv_unacked = 0; l_ptr->snd_nxt = 1; + l_ptr->rcv_nxt = 1; l_ptr->silent_intv_cnt = 0; + l_ptr->stats.recv_info = 0; l_ptr->stale_count = 0; link_reset_statistics(l_ptr); } -static void link_activate(struct tipc_link *link) +void tipc_link_activate(struct tipc_link *link) { struct tipc_node *node = link->owner; @@ -623,36 +649,6 @@ static void link_activate(struct tipc_link *link) tipc_bearer_add_dest(node->net, link->bearer_id, link->addr); } -/** - * link_state_event - link finite state machine - * @l_ptr: pointer to link - * @event: state machine event to process - */ -static void link_state_event(struct tipc_link *l, unsigned int evt) -{ - int rc; - struct sk_buff_head xmitq; - struct sk_buff *skb; - - if (l->exec_mode == TIPC_LINK_BLOCKED) - return; - - __skb_queue_head_init(&xmitq); - - rc = tipc_link_fsm_evt(l, evt, &xmitq); - - if (rc & TIPC_LINK_UP_EVT) - link_activate(l); - - if (rc & TIPC_LINK_DOWN_EVT) - tipc_link_reset(l); - - skb = __skb_dequeue(&xmitq); - if (!skb) - return; - tipc_bearer_send(l->owner->net, l->bearer_id, skb, &l->media_addr); -} - /** * __tipc_link_xmit(): same as tipc_link_xmit, but destlink is known & locked * @link: link to use @@ -807,30 +803,6 @@ static int __tipc_link_xmit_skb(struct tipc_link *link, struct sk_buff *skb) return __tipc_link_xmit(link->owner->net, link, &head); } -/* - * tipc_link_sync_xmit - synchronize broadcast link endpoints. - * - * Give a newly added peer node the sequence number where it should - * start receiving and acking broadcast packets. - * - * Called with node locked - */ -static void tipc_link_sync_xmit(struct tipc_link *link) -{ - struct sk_buff *skb; - struct tipc_msg *msg; - - skb = tipc_buf_acquire(INT_H_SIZE); - if (!skb) - return; - - msg = buf_msg(skb); - tipc_msg_init(link_own_addr(link), msg, BCAST_PROTOCOL, STATE_MSG, - INT_H_SIZE, link->addr); - msg_set_last_bcast(msg, link->owner->bclink.acked); - __tipc_link_xmit_skb(link, skb); -} - /* * tipc_link_sync_rcv - synchronize broadcast link endpoints. * Receive the sequence number where we should start receiving and @@ -881,6 +853,34 @@ void tipc_link_push_packets(struct tipc_link *link) link->snd_nxt = seqno; } +void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) +{ + struct sk_buff *skb, *_skb; + struct tipc_msg *hdr; + u16 seqno = l->snd_nxt; + u16 ack = l->rcv_nxt - 1; + + while (skb_queue_len(&l->transmq) < l->window) { + skb = skb_peek(&l->backlogq); + if (!skb) + break; + _skb = skb_clone(skb, GFP_ATOMIC); + if (!_skb) + break; + __skb_dequeue(&l->backlogq); + hdr = buf_msg(skb); + l->backlog[msg_importance(hdr)].len--; + __skb_queue_tail(&l->transmq, skb); + __skb_queue_tail(xmitq, _skb); + msg_set_ack(hdr, ack); + msg_set_seqno(hdr, seqno); + msg_set_bcast_ack(hdr, l->owner->bclink.last_in); + l->rcv_unacked = 0; + seqno++; + } + l->snd_nxt = seqno; +} + void tipc_link_reset_all(struct tipc_node *node) { char addr_string[16]; @@ -978,6 +978,41 @@ void tipc_link_retransmit(struct tipc_link *l_ptr, struct sk_buff *skb, } } +static int tipc_link_retransm(struct tipc_link *l, int retransm, + struct sk_buff_head *xmitq) +{ + struct sk_buff *_skb, *skb = skb_peek(&l->transmq); + struct tipc_msg *hdr; + + if (!skb) + return 0; + + /* Detect repeated retransmit failures on same packet */ + if (likely(l->last_retransm != buf_seqno(skb))) { + l->last_retransm = buf_seqno(skb); + l->stale_count = 1; + } else if (++l->stale_count > 100) { + link_retransmit_failure(l, skb); + return TIPC_LINK_DOWN_EVT; + } + skb_queue_walk(&l->transmq, skb) { + if (!retransm) + return 0; + hdr = buf_msg(skb); + _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC); + if (!_skb) + return 0; + hdr = buf_msg(_skb); + msg_set_ack(hdr, l->rcv_nxt - 1); + msg_set_bcast_ack(hdr, l->owner->bclink.last_in); + _skb->priority = TC_PRIO_CONTROL; + __skb_queue_tail(xmitq, _skb); + retransm--; + l->stats.retransmitted++; + } + return 0; +} + /* link_synch(): check if all packets arrived before the synch * point have been consumed * Returns true if the parallel links are synched, otherwise false @@ -1004,155 +1039,6 @@ synched: return true; } -static void link_retrieve_defq(struct tipc_link *link, - struct sk_buff_head *list) -{ - u16 seq_no; - - if (skb_queue_empty(&link->deferdq)) - return; - - seq_no = buf_seqno(skb_peek(&link->deferdq)); - if (seq_no == link->rcv_nxt) - skb_queue_splice_tail_init(&link->deferdq, list); -} - -/** - * tipc_rcv - process TIPC packets/messages arriving from off-node - * @net: the applicable net namespace - * @skb: TIPC packet - * @b_ptr: pointer to bearer message arrived on - * - * Invoked with no locks held. Bearer pointer must point to a valid bearer - * structure (i.e. cannot be NULL), but bearer can be inactive. - */ -void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct sk_buff_head head; - struct tipc_node *n_ptr; - struct tipc_link *l_ptr; - struct sk_buff *skb1, *tmp; - struct tipc_msg *msg; - u16 seq_no; - u16 ackd; - u32 released; - - skb2list(skb, &head); - - while ((skb = __skb_dequeue(&head))) { - /* Ensure message is well-formed */ - if (unlikely(!tipc_msg_validate(skb))) - goto discard; - - /* Handle arrival of a non-unicast link message */ - msg = buf_msg(skb); - if (unlikely(msg_non_seq(msg))) { - if (msg_user(msg) == LINK_CONFIG) - tipc_disc_rcv(net, skb, b_ptr); - else - tipc_bclink_rcv(net, skb); - continue; - } - - /* Discard unicast link messages destined for another node */ - if (unlikely(!msg_short(msg) && - (msg_destnode(msg) != tn->own_addr))) - goto discard; - - /* Locate neighboring node that sent message */ - n_ptr = tipc_node_find(net, msg_prevnode(msg)); - if (unlikely(!n_ptr)) - goto discard; - - tipc_node_lock(n_ptr); - /* Locate unicast link endpoint that should handle message */ - l_ptr = n_ptr->links[b_ptr->identity].link; - if (unlikely(!l_ptr)) - goto unlock; - - /* Is reception of this pkt permitted at the moment ? */ - if (!tipc_node_filter_skb(n_ptr, msg)) - goto unlock; - - /* Validate message sequence number info */ - seq_no = msg_seqno(msg); - ackd = msg_ack(msg); - - /* Release acked messages */ - if (unlikely(n_ptr->bclink.acked != msg_bcast_ack(msg))) - tipc_bclink_acknowledge(n_ptr, msg_bcast_ack(msg)); - - released = 0; - skb_queue_walk_safe(&l_ptr->transmq, skb1, tmp) { - if (more(buf_seqno(skb1), ackd)) - break; - __skb_unlink(skb1, &l_ptr->transmq); - kfree_skb(skb1); - released = 1; - } - - /* Try sending any messages link endpoint has pending */ - if (unlikely(skb_queue_len(&l_ptr->backlogq))) - tipc_link_push_packets(l_ptr); - - if (released && !skb_queue_empty(&l_ptr->wakeupq)) - link_prepare_wakeup(l_ptr); - - /* Process the incoming packet */ - if (unlikely(!link_working(l_ptr))) { - if (msg_user(msg) == LINK_PROTOCOL) { - tipc_link_proto_rcv(l_ptr, skb); - link_retrieve_defq(l_ptr, &head); - skb = NULL; - goto unlock; - } - - /* Traffic message. Conditionally activate link */ - link_state_event(l_ptr, TRAFFIC_EVT); - - if (link_working(l_ptr)) { - /* Re-insert buffer in front of queue */ - __skb_queue_head(&head, skb); - skb = NULL; - goto unlock; - } - goto unlock; - } - - /* Link is now in state TIPC_LINK_WORKING */ - if (unlikely(seq_no != l_ptr->rcv_nxt)) { - link_handle_out_of_seq_msg(l_ptr, skb); - link_retrieve_defq(l_ptr, &head); - skb = NULL; - goto unlock; - } - l_ptr->silent_intv_cnt = 0; - - /* Synchronize with parallel link if applicable */ - if (unlikely((l_ptr->exec_mode == TIPC_LINK_TUNNEL) && - !msg_dup(msg))) { - if (!link_synch(l_ptr)) - goto unlock; - } - l_ptr->rcv_nxt++; - if (unlikely(!skb_queue_empty(&l_ptr->deferdq))) - link_retrieve_defq(l_ptr, &head); - if (unlikely(++l_ptr->rcv_unacked >= TIPC_MIN_LINK_WIN)) { - l_ptr->stats.sent_acks++; - tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0); - } - tipc_link_input(l_ptr, skb); - skb = NULL; -unlock: - tipc_node_unlock(n_ptr); - tipc_node_put(n_ptr); -discard: - if (unlikely(skb)) - kfree_skb(skb); - } -} - /* tipc_data_input - deliver data and name distr msgs to upper layer * * Consumes buffer if message is of right type @@ -1206,9 +1092,6 @@ static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb) struct sk_buff *iskb; int pos = 0; - if (likely(tipc_data_input(link, skb))) - return; - switch (msg_user(msg)) { case TUNNEL_PROTOCOL: if (msg_dup(msg)) { @@ -1247,6 +1130,110 @@ static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb) }; } +static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked) +{ + bool released = false; + struct sk_buff *skb, *tmp; + + skb_queue_walk_safe(&l->transmq, skb, tmp) { + if (more(buf_seqno(skb), acked)) + break; + __skb_unlink(skb, &l->transmq); + kfree_skb(skb); + released = true; + } + return released; +} + +/* tipc_link_rcv - process TIPC packets/messages arriving from off-node + * @link: the link that should handle the message + * @skb: TIPC packet + * @xmitq: queue to place packets to be sent after this call + */ +int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, + struct sk_buff_head *xmitq) +{ + struct sk_buff_head *arrvq = &l->deferdq; + struct sk_buff *tmp; + struct tipc_msg *hdr; + u16 seqno, rcv_nxt; + int rc = 0; + + if (unlikely(!__tipc_skb_queue_sorted(arrvq, skb))) { + if (!(skb_queue_len(arrvq) % TIPC_NACK_INTV)) + tipc_link_build_proto_msg(l, STATE_MSG, 0, + 0, 0, 0, xmitq); + return rc; + } + + skb_queue_walk_safe(arrvq, skb, tmp) { + hdr = buf_msg(skb); + + /* Verify and update link state */ + if (unlikely(msg_user(hdr) == LINK_PROTOCOL)) { + __skb_dequeue(arrvq); + rc |= tipc_link_proto_rcv(l, skb, xmitq); + continue; + } + + if (unlikely(!link_working(l))) { + rc |= tipc_link_fsm_evt(l, TRAFFIC_EVT, xmitq); + if (!link_working(l)) { + kfree_skb(__skb_dequeue(arrvq)); + return rc; + } + } + + l->silent_intv_cnt = 0; + + /* Forward queues and wake up waiting users */ + if (likely(tipc_link_release_pkts(l, msg_ack(hdr)))) { + tipc_link_advance_backlog(l, xmitq); + if (unlikely(!skb_queue_empty(&l->wakeupq))) + link_prepare_wakeup(l); + } + + /* Defer reception if there is a gap in the sequence */ + seqno = msg_seqno(hdr); + rcv_nxt = l->rcv_nxt; + if (unlikely(less(rcv_nxt, seqno))) { + l->stats.deferred_recv++; + return rc; + } + + __skb_dequeue(arrvq); + + /* Drop if packet already received */ + if (unlikely(more(rcv_nxt, seqno))) { + l->stats.duplicates++; + kfree_skb(skb); + return rc; + } + + /* Synchronize with parallel link if applicable */ + if (unlikely(l->exec_mode == TIPC_LINK_TUNNEL)) + if (!msg_dup(hdr) && !link_synch(l)) { + kfree_skb(skb); + return rc; + } + + /* Packet can be delivered */ + l->rcv_nxt++; + l->stats.recv_info++; + if (unlikely(!tipc_data_input(l, skb))) + tipc_link_input(l, skb); + + /* Ack at regular intervals */ + if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) { + l->rcv_unacked = 0; + l->stats.sent_acks++; + tipc_link_build_proto_msg(l, STATE_MSG, + 0, 0, 0, 0, xmitq); + } + } + return rc; +} + /** * tipc_link_defer_pkt - Add out-of-sequence message to deferred reception queue * @@ -1286,41 +1273,6 @@ u32 tipc_link_defer_pkt(struct sk_buff_head *list, struct sk_buff *skb) return 1; } -/* - * link_handle_out_of_seq_msg - handle arrival of out-of-sequence packet - */ -static void link_handle_out_of_seq_msg(struct tipc_link *l_ptr, - struct sk_buff *buf) -{ - u32 seq_no = buf_seqno(buf); - - if (likely(msg_user(buf_msg(buf)) == LINK_PROTOCOL)) { - tipc_link_proto_rcv(l_ptr, buf); - return; - } - - /* Record OOS packet arrival */ - l_ptr->silent_intv_cnt = 0; - - /* - * Discard packet if a duplicate; otherwise add it to deferred queue - * and notify peer of gap as per protocol specification - */ - if (less(seq_no, l_ptr->rcv_nxt)) { - l_ptr->stats.duplicates++; - kfree_skb(buf); - return; - } - - if (tipc_link_defer_pkt(&l_ptr->deferdq, buf)) { - l_ptr->stats.deferred_recv++; - if ((skb_queue_len(&l_ptr->deferdq) % TIPC_MIN_LINK_WIN) == 1) - tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, 0, 0, 0); - } else { - l_ptr->stats.duplicates++; - } -} - /* * Send protocol message to the other endpoint. */ @@ -1341,119 +1293,6 @@ void tipc_link_proto_xmit(struct tipc_link *l, u32 msg_typ, int probe_msg, kfree_skb(skb); } -/* - * Receive protocol message : - * Note that network plane id propagates through the network, and may - * change at any time. The node with lowest address rules - */ -static void tipc_link_proto_rcv(struct tipc_link *l_ptr, - struct sk_buff *buf) -{ - u32 rec_gap = 0; - u32 msg_tol; - struct tipc_msg *msg = buf_msg(buf); - - if (l_ptr->exec_mode == TIPC_LINK_BLOCKED) - goto exit; - - if (l_ptr->net_plane != msg_net_plane(msg)) - if (link_own_addr(l_ptr) > msg_prevnode(msg)) - l_ptr->net_plane = msg_net_plane(msg); - - switch (msg_type(msg)) { - - case RESET_MSG: - if (!link_probing(l_ptr) && - (l_ptr->peer_session != WILDCARD_SESSION)) { - if (less_eq(msg_session(msg), l_ptr->peer_session)) - break; /* duplicate or old reset: ignore */ - } - link_state_event(l_ptr, RESET_MSG); - - /* fall thru' */ - case ACTIVATE_MSG: - /* Update link settings according other endpoint's values */ - strcpy((strrchr(l_ptr->name, ':') + 1), (char *)msg_data(msg)); - - msg_tol = msg_link_tolerance(msg); - if (msg_tol > l_ptr->tolerance) - l_ptr->tolerance = msg_tol; - - if (msg_linkprio(msg) > l_ptr->priority) - l_ptr->priority = msg_linkprio(msg); - - if (l_ptr->mtu > msg_max_pkt(msg)) - l_ptr->mtu = msg_max_pkt(msg); - - /* Synchronize broadcast link info, if not done previously */ - if (!tipc_node_is_up(l_ptr->owner)) { - l_ptr->owner->bclink.last_sent = - l_ptr->owner->bclink.last_in = - msg_last_bcast(msg); - l_ptr->owner->bclink.oos_state = 0; - } - - l_ptr->peer_session = msg_session(msg); - l_ptr->peer_bearer_id = msg_bearer_id(msg); - - if (!msg_peer_is_up(msg)) - tipc_node_fsm_evt(l_ptr->owner, PEER_LOST_CONTACT_EVT); - if (msg_type(msg) == ACTIVATE_MSG) - link_state_event(l_ptr, ACTIVATE_MSG); - break; - case STATE_MSG: - - msg_tol = msg_link_tolerance(msg); - if (msg_tol) - l_ptr->tolerance = msg_tol; - - if (msg_linkprio(msg) && - (msg_linkprio(msg) != l_ptr->priority)) { - pr_info("%s<%s>, priority change %u->%u\n", - link_rst_msg, l_ptr->name, - l_ptr->priority, msg_linkprio(msg)); - l_ptr->priority = msg_linkprio(msg); - tipc_link_reset(l_ptr); - break; - } - - /* Record reception; force mismatch at next timeout: */ - l_ptr->silent_intv_cnt = 0; - - link_state_event(l_ptr, TRAFFIC_EVT); - l_ptr->stats.recv_states++; - if (link_resetting(l_ptr)) - break; - - if (less_eq(l_ptr->rcv_nxt, msg_next_sent(msg))) - rec_gap = mod(msg_next_sent(msg) - l_ptr->rcv_nxt); - - if (msg_probe(msg)) - l_ptr->stats.recv_probes++; - - /* Protocol message before retransmits, reduce loss risk */ - if (l_ptr->owner->bclink.recv_permitted) - tipc_bclink_update_link_state(l_ptr->owner, - msg_last_bcast(msg)); - - if (rec_gap || (msg_probe(msg))) - tipc_link_proto_xmit(l_ptr, STATE_MSG, 0, - rec_gap, 0, 0); - - if (msg_seq_gap(msg)) { - l_ptr->stats.recv_nacks++; - tipc_link_retransmit(l_ptr, skb_peek(&l_ptr->transmq), - msg_seq_gap(msg)); - } - if (tipc_link_is_up(l_ptr)) - tipc_node_fsm_evt(l_ptr->owner, - PEER_ESTABL_CONTACT_EVT); - break; - } -exit: - kfree_skb(buf); -} - /* tipc_link_build_proto_msg: prepare link protocol message for transmission */ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, @@ -1727,6 +1566,96 @@ exit: return *skb; } +/* tipc_link_proto_rcv(): receive link level protocol message : + * Note that network plane id propagates through the network, and may + * change at any time. The node with lowest numerical id determines + * network plane + */ +static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, + struct sk_buff_head *xmitq) +{ + struct tipc_msg *hdr = buf_msg(skb); + u16 rcvgap = 0; + u16 nacked_gap = msg_seq_gap(hdr); + u16 peers_snd_nxt = msg_next_sent(hdr); + u16 peers_tol = msg_link_tolerance(hdr); + u16 peers_prio = msg_linkprio(hdr); + char *if_name; + int rc = 0; + + if (l->exec_mode == TIPC_LINK_BLOCKED) + goto exit; + + if (link_own_addr(l) > msg_prevnode(hdr)) + l->net_plane = msg_net_plane(hdr); + + switch (msg_type(hdr)) { + case RESET_MSG: + + /* Ignore duplicate RESET with old session number */ + if ((less_eq(msg_session(hdr), l->peer_session)) && + (l->peer_session != WILDCARD_SESSION)) + break; + /* fall thru' */ + case ACTIVATE_MSG: + + /* Complete own link name with peer's interface name */ + if_name = strrchr(l->name, ':') + 1; + if (sizeof(l->name) - (if_name - l->name) <= TIPC_MAX_IF_NAME) + break; + if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME) + break; + strncpy(if_name, msg_data(hdr), TIPC_MAX_IF_NAME); + + /* Update own tolerance if peer indicates a non-zero value */ + if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) + l->tolerance = peers_tol; + + /* Update own priority if peer's priority is higher */ + if (in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI)) + l->priority = peers_prio; + + l->peer_session = msg_session(hdr); + l->peer_bearer_id = msg_bearer_id(hdr); + rc = tipc_link_fsm_evt(l, msg_type(hdr), xmitq); + if (l->mtu > msg_max_pkt(hdr)) + l->mtu = msg_max_pkt(hdr); + break; + case STATE_MSG: + /* Update own tolerance if peer indicates a non-zero value */ + if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) + l->tolerance = peers_tol; + + l->silent_intv_cnt = 0; + l->stats.recv_states++; + if (msg_probe(hdr)) + l->stats.recv_probes++; + rc = tipc_link_fsm_evt(l, TRAFFIC_EVT, xmitq); + if (!tipc_link_is_up(l)) + break; + + /* Has peer sent packets we haven't received yet ? */ + if (more(peers_snd_nxt, l->rcv_nxt)) + rcvgap = peers_snd_nxt - l->rcv_nxt; + if (rcvgap || (msg_probe(hdr))) + tipc_link_build_proto_msg(l, STATE_MSG, 0, rcvgap, + 0, l->mtu, xmitq); + tipc_link_release_pkts(l, msg_ack(hdr)); + + /* If NACK, retransmit will now start at right position */ + if (nacked_gap) { + rc |= tipc_link_retransm(l, nacked_gap, xmitq); + l->stats.recv_nacks++; + } + tipc_link_advance_backlog(l, xmitq); + if (unlikely(!skb_queue_empty(&l->wakeupq))) + link_prepare_wakeup(l); + } +exit: + kfree_skb(skb); + return rc; +} + void tipc_link_set_queue_limits(struct tipc_link *l, u32 win) { int max_bulk = TIPC_MAX_PUBLICATIONS / (l->mtu / ITEM_SIZE); diff --git a/net/tipc/link.h b/net/tipc/link.h index 0cf7d2b11803..37cfd7d7bf7d 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -58,7 +58,7 @@ enum { TIPC_LINK_TUNNEL }; -/* Events occurring at packet reception or at timeout +/* Events returned from link at packet reception or at timeout */ enum { TIPC_LINK_UP_EVT = 1, @@ -223,6 +223,7 @@ void tipc_link_purge_queues(struct tipc_link *l_ptr); void tipc_link_purge_backlog(struct tipc_link *l); void tipc_link_reset_all(struct tipc_node *node); void tipc_link_reset(struct tipc_link *l_ptr); +void tipc_link_activate(struct tipc_link *link); int __tipc_link_xmit(struct net *net, struct tipc_link *link, struct sk_buff_head *list); int tipc_link_xmit(struct tipc_link *link, struct sk_buff_head *list, @@ -244,7 +245,8 @@ int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info); int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); void link_prepare_wakeup(struct tipc_link *l); int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq); - +int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, + struct sk_buff_head *xmitq); static inline u32 link_own_addr(struct tipc_link *l) { return msg_prevnode(l->pmsg); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 4dc66d9f69cc..2f1563b47e24 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -38,6 +38,7 @@ #define _TIPC_MSG_H #include +#include "core.h" /* * Constants and routines used to read and write TIPC payload message headers @@ -658,12 +659,12 @@ static inline void msg_set_link_selector(struct tipc_msg *m, u32 n) /* * Word 5 */ -static inline u32 msg_session(struct tipc_msg *m) +static inline u16 msg_session(struct tipc_msg *m) { return msg_bits(m, 5, 16, 0xffff); } -static inline void msg_set_session(struct tipc_msg *m, u32 n) +static inline void msg_set_session(struct tipc_msg *m, u16 n) { msg_set_bits(m, 5, 16, 0xffff, n); } @@ -766,10 +767,19 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) msg_set_bits(m, 9, 0, 0xffff, n); } -static inline bool msg_peer_is_up(struct tipc_msg *m) +static inline bool msg_is_traffic(struct tipc_msg *m) { - if (likely(msg_user(m) != LINK_PROTOCOL) || (msg_type(m) == STATE_MSG)) + if (likely(msg_user(m) != LINK_PROTOCOL)) return true; + if ((msg_type(m) == RESET_MSG) || (msg_type(m) == ACTIVATE_MSG)) + return false; + return true; +} + +static inline bool msg_peer_is_up(struct tipc_msg *m) +{ + if (likely(msg_is_traffic(m))) + return false; return msg_redundant_link(m); } @@ -886,4 +896,36 @@ static inline bool tipc_skb_queue_tail(struct sk_buff_head *list, return rv; } +/* tipc_skb_queue_sorted(); sort pkt into list according to sequence number + * @list: list to be appended to + * @skb: buffer to add + * Returns true if queue should treated further, otherwise false + */ +static inline bool __tipc_skb_queue_sorted(struct sk_buff_head *list, + struct sk_buff *skb) +{ + struct sk_buff *_skb, *tmp; + struct tipc_msg *hdr = buf_msg(skb); + u16 seqno = msg_seqno(hdr); + + if (skb_queue_empty(list) || (msg_user(hdr) == LINK_PROTOCOL)) { + __skb_queue_head(list, skb); + return true; + } + if (likely(less(seqno, buf_seqno(skb_peek(list))))) { + __skb_queue_head(list, skb); + return true; + } + if (!more(seqno, buf_seqno(skb_peek_tail(list)))) { + skb_queue_walk_safe(list, _skb, tmp) { + if (likely(less(seqno, buf_seqno(_skb)))) { + __skb_queue_before(list, _skb, skb); + return true; + } + } + } + __skb_queue_tail(list, skb); + return false; +} + #endif diff --git a/net/tipc/node.c b/net/tipc/node.c index 9dbbb5de287b..e92f84afbf95 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -40,11 +40,13 @@ #include "name_distr.h" #include "socket.h" #include "bcast.h" +#include "discover.h" static void node_lost_contact(struct tipc_node *n_ptr); static void node_established_contact(struct tipc_node *n_ptr); static void tipc_node_delete(struct tipc_node *node); static void tipc_node_timeout(unsigned long data); +static void tipc_node_fsm_evt(struct tipc_node *n, int evt); struct tipc_sock_conn { u32 port; @@ -141,7 +143,7 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr) break; } list_add_tail_rcu(&n_ptr->list, &temp_node->list); - n_ptr->state = SELF_DOWN_PEER_DOWN; + n_ptr->state = SELF_DOWN_PEER_LEAVING; n_ptr->signature = INVALID_NODE_SIG; n_ptr->active_links[0] = INVALID_BEARER_ID; n_ptr->active_links[1] = INVALID_BEARER_ID; @@ -424,7 +426,7 @@ void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr) /* tipc_node_fsm_evt - node finite state machine * Determines when contact is allowed with peer node */ -void tipc_node_fsm_evt(struct tipc_node *n, int evt) +static void tipc_node_fsm_evt(struct tipc_node *n, int evt) { int state = n->state; @@ -523,23 +525,36 @@ void tipc_node_fsm_evt(struct tipc_node *n, int evt) n->state = state; } -bool tipc_node_filter_skb(struct tipc_node *n, struct tipc_msg *hdr) +bool tipc_node_filter_skb(struct tipc_node *n, struct tipc_link *l, + struct tipc_msg *hdr) { int state = n->state; if (likely(state == SELF_UP_PEER_UP)) return true; + if (state == SELF_DOWN_PEER_DOWN) return true; - if (state == SELF_UP_PEER_COMING) + + if (state == SELF_UP_PEER_COMING) { + /* If not traffic msg, peer may still be ESTABLISHING */ + if (tipc_link_is_up(l) && msg_is_traffic(hdr)) + tipc_node_fsm_evt(n, PEER_ESTABL_CONTACT_EVT); return true; + } + if (state == SELF_COMING_PEER_UP) return true; + if (state == SELF_LEAVING_PEER_DOWN) return false; - if (state == SELF_DOWN_PEER_LEAVING) - if (!msg_peer_is_up(hdr)) - return true; + + if (state == SELF_DOWN_PEER_LEAVING) { + if (msg_peer_is_up(hdr)) + return false; + tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); + return true; + } return false; } @@ -819,6 +834,82 @@ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, return 0; } +/** + * tipc_rcv - process TIPC packets/messages arriving from off-node + * @net: the applicable net namespace + * @skb: TIPC packet + * @bearer: pointer to bearer message arrived on + * + * Invoked with no locks held. Bearer pointer must point to a valid bearer + * structure (i.e. cannot be NULL), but bearer can be inactive. + */ +void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) +{ + struct sk_buff_head xmitq; + struct tipc_node *n; + struct tipc_link *l; + struct tipc_msg *hdr; + struct tipc_media_addr *maddr; + int bearer_id = b->identity; + int rc = 0; + + __skb_queue_head_init(&xmitq); + + /* Ensure message is well-formed */ + if (unlikely(!tipc_msg_validate(skb))) + goto discard; + + /* Handle arrival of a non-unicast link packet */ + hdr = buf_msg(skb); + if (unlikely(msg_non_seq(hdr))) { + if (msg_user(hdr) == LINK_CONFIG) + tipc_disc_rcv(net, skb, b); + else + tipc_bclink_rcv(net, skb); + return; + } + + /* Locate neighboring node that sent packet */ + n = tipc_node_find(net, msg_prevnode(hdr)); + if (unlikely(!n)) + goto discard; + tipc_node_lock(n); + + /* Locate link endpoint that should handle packet */ + l = n->links[bearer_id].link; + if (unlikely(!l)) + goto unlock; + + /* Is reception of this packet permitted at the moment ? */ + if (unlikely(n->state != SELF_UP_PEER_UP)) + if (!tipc_node_filter_skb(n, l, hdr)) + goto unlock; + + if (unlikely(msg_user(hdr) == LINK_PROTOCOL)) + tipc_bclink_sync_state(n, hdr); + + /* Release acked broadcast messages */ + if (unlikely(n->bclink.acked != msg_bcast_ack(hdr))) + tipc_bclink_acknowledge(n, msg_bcast_ack(hdr)); + + /* Check protocol and update link state */ + rc = tipc_link_rcv(l, skb, &xmitq); + + if (unlikely(rc & TIPC_LINK_UP_EVT)) + tipc_link_activate(l); + if (unlikely(rc & TIPC_LINK_DOWN_EVT)) + tipc_link_reset(l); + skb = NULL; +unlock: + tipc_node_unlock(n); + tipc_sk_rcv(net, &n->links[bearer_id].inputq); + maddr = &n->links[bearer_id].maddr; + tipc_bearer_xmit(net, bearer_id, &xmitq, maddr); + tipc_node_put(n); +discard: + kfree_skb(skb); +} + int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) { int err; diff --git a/net/tipc/node.h b/net/tipc/node.h index 270256e09ee5..5e7016802077 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -185,7 +185,6 @@ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest, u32 selector); int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port); void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port); - int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb); static inline void tipc_node_lock(struct tipc_node *node) @@ -193,9 +192,6 @@ static inline void tipc_node_lock(struct tipc_node *node) spin_lock_bh(&node->lock); } -void tipc_node_fsm_evt(struct tipc_node *n, int evt); -bool tipc_node_filter_skb(struct tipc_node *n, struct tipc_msg *hdr); - static inline struct tipc_link *node_active_link(struct tipc_node *n, int sel) { int bearer_id = n->active_links[sel & 1]; -- cgit v1.2.3 From 4e10df9a60d96ced321dd2af71da558c6b750078 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 20 Jul 2015 20:34:18 -0700 Subject: bpf: introduce bpf_skb_vlan_push/pop() helpers Allow eBPF programs attached to TC qdiscs call skb_vlan_push/pop via helper functions. These functions may change skb->data/hlen which are cached by some JITs to improve performance of ld_abs/ld_ind instructions. Therefore JITs need to recognize bpf_skb_vlan_push/pop() calls, re-compute header len and re-cache skb->data/hlen back into cpu registers. Note, skb->data/hlen are not directly accessible from the programs, so any changes to skb->data done either by these helpers or by other TC actions are safe. eBPF JIT supported by three architectures: - arm64 JIT is using bpf_load_pointer() without caching, so it's ok as-is. - x64 JIT re-caches skb->data/hlen unconditionally after vlan_push/pop calls (experiments showed that conditional re-caching is slower). - s390 JIT falls back to interpreter for now when bpf_skb_vlan_push() is present in the program (re-caching is tbd). These helpers allow more scalable handling of vlan from the programs. Instead of creating thousands of vlan netdevs on top of eth0 and attaching TC+ingress+bpf to all of them, the program can be attached to eth0 directly and manipulate vlans as necessary. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- arch/s390/net/bpf_jit_comp.c | 4 +++ arch/x86/net/bpf_jit_comp.c | 80 +++++++++++++++++++++++--------------------- include/linux/bpf.h | 2 ++ include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 2 ++ net/core/filter.c | 48 ++++++++++++++++++++++++++ 6 files changed, 99 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index fee782acc2ee..79c731e8d178 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -973,6 +973,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i */ const u64 func = (u64)__bpf_call_base + imm; + if (bpf_helper_changes_skb_data((void *)func)) + /* TODO reload skb->data, hlen */ + return -1; + REG_SET_SEEN(BPF_REG_5); jit->seen |= SEEN_FUNC; /* lg %w1,(%l) */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 579a8fd74be0..6c335a8fc086 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -315,6 +315,26 @@ static void emit_bpf_tail_call(u8 **pprog) *pprog = prog; } + +static void emit_load_skb_data_hlen(u8 **pprog) +{ + u8 *prog = *pprog; + int cnt = 0; + + /* r9d = skb->len - skb->data_len (headlen) + * r10 = skb->data + */ + /* mov %r9d, off32(%rdi) */ + EMIT3_off32(0x44, 0x8b, 0x8f, offsetof(struct sk_buff, len)); + + /* sub %r9d, off32(%rdi) */ + EMIT3_off32(0x44, 0x2b, 0x8f, offsetof(struct sk_buff, data_len)); + + /* mov %r10, off32(%rdi) */ + EMIT3_off32(0x4c, 0x8b, 0x97, offsetof(struct sk_buff, data)); + *pprog = prog; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int oldproglen, struct jit_context *ctx) { @@ -329,36 +349,8 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, emit_prologue(&prog); - if (seen_ld_abs) { - /* r9d : skb->len - skb->data_len (headlen) - * r10 : skb->data - */ - if (is_imm8(offsetof(struct sk_buff, len))) - /* mov %r9d, off8(%rdi) */ - EMIT4(0x44, 0x8b, 0x4f, - offsetof(struct sk_buff, len)); - else - /* mov %r9d, off32(%rdi) */ - EMIT3_off32(0x44, 0x8b, 0x8f, - offsetof(struct sk_buff, len)); - - if (is_imm8(offsetof(struct sk_buff, data_len))) - /* sub %r9d, off8(%rdi) */ - EMIT4(0x44, 0x2b, 0x4f, - offsetof(struct sk_buff, data_len)); - else - EMIT3_off32(0x44, 0x2b, 0x8f, - offsetof(struct sk_buff, data_len)); - - if (is_imm8(offsetof(struct sk_buff, data))) - /* mov %r10, off8(%rdi) */ - EMIT4(0x4c, 0x8b, 0x57, - offsetof(struct sk_buff, data)); - else - /* mov %r10, off32(%rdi) */ - EMIT3_off32(0x4c, 0x8b, 0x97, - offsetof(struct sk_buff, data)); - } + if (seen_ld_abs) + emit_load_skb_data_hlen(&prog); for (i = 0; i < insn_cnt; i++, insn++) { const s32 imm32 = insn->imm; @@ -367,6 +359,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 b1 = 0, b2 = 0, b3 = 0; s64 jmp_offset; u8 jmp_cond; + bool reload_skb_data; int ilen; u8 *func; @@ -818,12 +811,18 @@ xadd: if (is_imm8(insn->off)) func = (u8 *) __bpf_call_base + imm32; jmp_offset = func - (image + addrs[i]); if (seen_ld_abs) { - EMIT2(0x41, 0x52); /* push %r10 */ - EMIT2(0x41, 0x51); /* push %r9 */ - /* need to adjust jmp offset, since - * pop %r9, pop %r10 take 4 bytes after call insn - */ - jmp_offset += 4; + reload_skb_data = bpf_helper_changes_skb_data(func); + if (reload_skb_data) { + EMIT1(0x57); /* push %rdi */ + jmp_offset += 22; /* pop, mov, sub, mov */ + } else { + EMIT2(0x41, 0x52); /* push %r10 */ + EMIT2(0x41, 0x51); /* push %r9 */ + /* need to adjust jmp offset, since + * pop %r9, pop %r10 take 4 bytes after call insn + */ + jmp_offset += 4; + } } if (!imm32 || !is_simm32(jmp_offset)) { pr_err("unsupported bpf func %d addr %p image %p\n", @@ -832,8 +831,13 @@ xadd: if (is_imm8(insn->off)) } EMIT1_off32(0xE8, jmp_offset); if (seen_ld_abs) { - EMIT2(0x41, 0x59); /* pop %r9 */ - EMIT2(0x41, 0x5A); /* pop %r10 */ + if (reload_skb_data) { + EMIT1(0x5F); /* pop %rdi */ + emit_load_skb_data_hlen(&prog); + } else { + EMIT2(0x41, 0x59); /* pop %r9 */ + EMIT2(0x41, 0x5A); /* pop %r10 */ + } } break; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4383476a0d48..139d6d2e123f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -192,5 +192,7 @@ extern const struct bpf_func_proto bpf_ktime_get_ns_proto; extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; extern const struct bpf_func_proto bpf_get_current_comm_proto; +extern const struct bpf_func_proto bpf_skb_vlan_push_proto; +extern const struct bpf_func_proto bpf_skb_vlan_pop_proto; #endif /* _LINUX_BPF_H */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 17724f6ea983..69d00555ce35 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -411,6 +411,7 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); void bpf_int_jit_compile(struct bpf_prog *fp); +bool bpf_helper_changes_skb_data(void *func); #ifdef CONFIG_BPF_JIT typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2de87e58b12b..2f6c83d714e9 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -256,6 +256,8 @@ enum bpf_func_id { * Return: classid if != 0 */ BPF_FUNC_get_cgroup_classid, + BPF_FUNC_skb_vlan_push, /* bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) */ + BPF_FUNC_skb_vlan_pop, /* bpf_skb_vlan_pop(skb) */ __BPF_FUNC_MAX_ID, }; diff --git a/net/core/filter.c b/net/core/filter.c index 247450a5e387..50338071fac4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1437,6 +1437,50 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + __be16 vlan_proto = (__force __be16) r2; + + if (unlikely(vlan_proto != htons(ETH_P_8021Q) && + vlan_proto != htons(ETH_P_8021AD))) + vlan_proto = htons(ETH_P_8021Q); + + return skb_vlan_push(skb, vlan_proto, vlan_tci); +} + +const struct bpf_func_proto bpf_skb_vlan_push_proto = { + .func = bpf_skb_vlan_push, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + +static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + + return skb_vlan_pop(skb); +} + +const struct bpf_func_proto bpf_skb_vlan_pop_proto = { + .func = bpf_skb_vlan_pop, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +bool bpf_helper_changes_skb_data(void *func) +{ + if (func == bpf_skb_vlan_push) + return true; + if (func == bpf_skb_vlan_pop) + return true; + return false; +} + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -1476,6 +1520,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_clone_redirect_proto; case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_proto; + case BPF_FUNC_skb_vlan_push: + return &bpf_skb_vlan_push_proto; + case BPF_FUNC_skb_vlan_pop: + return &bpf_skb_vlan_pop_proto; default: return sk_filter_func_proto(func_id); } -- cgit v1.2.3 From 4d9c5c53ac99e4cb5d031897863203d7817b36e0 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 20 Jul 2015 20:34:19 -0700 Subject: test_bpf: add bpf_skb_vlan_push/pop() tests improve accuracy of timing in test_bpf and add two stress tests: - {skb->data[0], get_smp_processor_id} repeated 2k times - {skb->data[0], vlan_push} x 68 followed by {skb->data[0], vlan_pop} x 68 1st test is useful to test performance of JIT implementation of BPF_LD_ABS together with BPF_CALL instructions. 2nd test is stressing skb_vlan_push/pop logic together with skb->data access via BPF_LD_ABS insn which checks that re-caching of skb->data is done correctly. In order to call bpf_skb_vlan_push() from test_bpf.ko have to add three export_symbol_gpl. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 1 + lib/test_bpf.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- net/core/filter.c | 2 ++ 3 files changed, 98 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index bf38f5e8196c..fafa74161445 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -177,6 +177,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { return 0; } +EXPORT_SYMBOL_GPL(__bpf_call_base); /** * __bpf_prog_run - run eBPF program on a given context diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 9198f28a5528..8b5e66f008b0 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -355,6 +356,81 @@ static int bpf_fill_ja(struct bpf_test *self) return __bpf_fill_ja(self, 12, 9); } +static int bpf_fill_ld_abs_get_processor_id(struct bpf_test *self) +{ + unsigned int len = BPF_MAXINSNS; + struct sock_filter *insn; + int i; + + insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL); + if (!insn) + return -ENOMEM; + + for (i = 0; i < len - 1; i += 2) { + insn[i] = __BPF_STMT(BPF_LD | BPF_B | BPF_ABS, 0); + insn[i + 1] = __BPF_STMT(BPF_LD | BPF_W | BPF_ABS, + SKF_AD_OFF + SKF_AD_CPU); + } + + insn[len - 1] = __BPF_STMT(BPF_RET | BPF_K, 0xbee); + + self->u.ptr.insns = insn; + self->u.ptr.len = len; + + return 0; +} + +#define PUSH_CNT 68 +/* test: {skb->data[0], vlan_push} x 68 + {skb->data[0], vlan_pop} x 68 */ +static int bpf_fill_ld_abs_vlan_push_pop(struct bpf_test *self) +{ + unsigned int len = BPF_MAXINSNS; + struct bpf_insn *insn; + int i = 0, j, k = 0; + + insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL); + if (!insn) + return -ENOMEM; + + insn[i++] = BPF_MOV64_REG(R6, R1); +loop: + for (j = 0; j < PUSH_CNT; j++) { + insn[i++] = BPF_LD_ABS(BPF_B, 0); + insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0x34, len - i - 2); + i++; + insn[i++] = BPF_MOV64_REG(R1, R6); + insn[i++] = BPF_MOV64_IMM(R2, 1); + insn[i++] = BPF_MOV64_IMM(R3, 2); + insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + bpf_skb_vlan_push_proto.func - __bpf_call_base); + insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0, len - i - 2); + i++; + } + + for (j = 0; j < PUSH_CNT; j++) { + insn[i++] = BPF_LD_ABS(BPF_B, 0); + insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0x34, len - i - 2); + i++; + insn[i++] = BPF_MOV64_REG(R1, R6); + insn[i++] = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + bpf_skb_vlan_pop_proto.func - __bpf_call_base); + insn[i] = BPF_JMP_IMM(BPF_JNE, R0, 0, len - i - 2); + i++; + } + if (++k < 5) + goto loop; + + for (; i < len - 1; i++) + insn[i] = BPF_ALU32_IMM(BPF_MOV, R0, 0xbef); + + insn[len - 1] = BPF_EXIT_INSN(); + + self->u.ptr.insns = insn; + self->u.ptr.len = len; + + return 0; +} + static struct bpf_test tests[] = { { "TAX", @@ -4398,6 +4474,22 @@ static struct bpf_test tests[] = { { { 0, 0xababcbac } }, .fill_helper = bpf_fill_maxinsns11, }, + { + "BPF_MAXINSNS: ld_abs+get_processor_id", + { }, + CLASSIC, + { }, + { { 1, 0xbee } }, + .fill_helper = bpf_fill_ld_abs_get_processor_id, + }, + { + "BPF_MAXINSNS: ld_abs+vlan_push/pop", + { }, + INTERNAL, + { 0x34 }, + { { 1, 0xbef } }, + .fill_helper = bpf_fill_ld_abs_vlan_push_pop, + }, }; static struct net_device dev; @@ -4551,14 +4643,14 @@ static int __run_one(const struct bpf_prog *fp, const void *data, u64 start, finish; int ret = 0, i; - start = ktime_to_us(ktime_get()); + start = ktime_get_ns(); for (i = 0; i < runs; i++) ret = BPF_PROG_RUN(fp, data); - finish = ktime_to_us(ktime_get()); + finish = ktime_get_ns(); - *duration = (finish - start) * 1000ULL; + *duration = finish - start; do_div(*duration, runs); return ret; diff --git a/net/core/filter.c b/net/core/filter.c index 50338071fac4..786722a9c6f2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1457,6 +1457,7 @@ const struct bpf_func_proto bpf_skb_vlan_push_proto = { .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, }; +EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { @@ -1471,6 +1472,7 @@ const struct bpf_func_proto bpf_skb_vlan_pop_proto = { .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, }; +EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); bool bpf_helper_changes_skb_data(void *func) { -- cgit v1.2.3 From 07868284e530d52f72b14b800398a8fefa7cf5d0 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 17 Jul 2015 12:34:17 -0300 Subject: sctp: reduce indent level on sctp_v4_get_dst Paves the day for the next patch. Functionality stays untouched. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/protocol.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 59e80356672b..fa80fe4f2362 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -489,21 +489,23 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid) continue; - if ((laddr->state == SCTP_ADDR_SRC) && - (AF_INET == laddr->a.sa.sa_family)) { - fl4->fl4_sport = laddr->a.v4.sin_port; - flowi4_update_output(fl4, - asoc->base.sk->sk_bound_dev_if, - RT_CONN_FLAGS(asoc->base.sk), - daddr->v4.sin_addr.s_addr, - laddr->a.v4.sin_addr.s_addr); - - rt = ip_route_output_key(sock_net(sk), fl4); - if (!IS_ERR(rt)) { - dst = &rt->dst; - goto out_unlock; - } - } + if (laddr->state != SCTP_ADDR_SRC || + AF_INET != laddr->a.sa.sa_family) + continue; + + fl4->fl4_sport = laddr->a.v4.sin_port; + flowi4_update_output(fl4, + asoc->base.sk->sk_bound_dev_if, + RT_CONN_FLAGS(asoc->base.sk), + daddr->v4.sin_addr.s_addr, + laddr->a.v4.sin_addr.s_addr); + + rt = ip_route_output_key(sock_net(sk), fl4); + if (IS_ERR(rt)) + continue; + + dst = &rt->dst; + break; } out_unlock: -- cgit v1.2.3 From 0ca50d12fe46cdf3c0dc9ec5ca98607a52afdc62 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 17 Jul 2015 12:34:18 -0300 Subject: sctp: fix src address selection if using secondary addresses In short, sctp is likely to incorrectly choose src address if socket is bound to secondary addresses. This patch fixes it by adding a new check that checks if such src address belongs to the interface that routing identified as output. This is enough to avoid rp_filter drops on remote peer. Details: Currently, sctp will do a routing attempt without specifying the src address and compare the returned value (preferred source) with the addresses that the socket is bound to. When using secondary addresses, this will not match. Then it will try specifying each of the addresses that the socket is bound to and re-routing, checking if that address is valid as src for that dst. Thing is, this check alone is weak: # ip r l 192.168.100.0/24 dev eth1 proto kernel scope link src 192.168.100.149 192.168.122.0/24 dev eth0 proto kernel scope link src 192.168.122.147 # ip a l 1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 inet 127.0.0.1/8 scope host lo valid_lft forever preferred_lft forever inet6 ::1/128 scope host valid_lft forever preferred_lft forever 2: eth0: mtu 1500 qdisc pfifo_fast state UP group default qlen 1000 link/ether 52:54:00:15:18:6a brd ff:ff:ff:ff:ff:ff inet 192.168.122.147/24 brd 192.168.122.255 scope global dynamic eth0 valid_lft 2160sec preferred_lft 2160sec inet 192.168.122.148/24 scope global secondary eth0 valid_lft forever preferred_lft forever inet6 fe80::5054:ff:fe15:186a/64 scope link valid_lft forever preferred_lft forever 3: eth1: mtu 1500 qdisc pfifo_fast state UP group default qlen 1000 link/ether 52:54:00:b3:91:46 brd ff:ff:ff:ff:ff:ff inet 192.168.100.149/24 brd 192.168.100.255 scope global dynamic eth1 valid_lft 2162sec preferred_lft 2162sec inet 192.168.100.148/24 scope global secondary eth1 valid_lft forever preferred_lft forever inet6 fe80::5054:ff:feb3:9146/64 scope link valid_lft forever preferred_lft forever 4: ens9: mtu 1500 qdisc pfifo_fast state UP group default qlen 1000 link/ether 52:54:00:05:47:ee brd ff:ff:ff:ff:ff:ff inet6 fe80::5054:ff:fe05:47ee/64 scope link valid_lft forever preferred_lft forever # ip r g 192.168.100.193 from 192.168.122.148 192.168.100.193 from 192.168.122.148 dev eth1 cache Even if you specify an interface: # ip r g 192.168.100.193 from 192.168.122.148 oif eth1 192.168.100.193 from 192.168.122.148 dev eth1 cache Although this would be valid, peers using rp_filter will drop such packets as their src doesn't match the routes for that interface. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/protocol.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index fa80fe4f2362..4345790ad326 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -487,6 +487,8 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, */ rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { + struct net_device *odev; + if (!laddr->valid) continue; if (laddr->state != SCTP_ADDR_SRC || @@ -504,6 +506,14 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, if (IS_ERR(rt)) continue; + /* Ensure the src address belongs to the output + * interface. + */ + odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr, + false); + if (!odev || odev->ifindex != fl4->flowi4_oif) + continue; + dst = &rt->dst; break; } -- cgit v1.2.3 From b52effd263e0926cc52713703f53d0a6c77259a1 Mon Sep 17 00:00:00 2001 From: Marcelo Ricardo Leitner Date: Fri, 17 Jul 2015 13:50:21 -0300 Subject: sctp: fix cut and paste issue in comment Cookie ACK is always received by the association initiator, so fix the comment to avoid confusion. Signed-off-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/sm_statefuns.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 3ee27b7704ff..d7eaa7354cf7 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -853,7 +853,7 @@ nomem: /* * Respond to a normal COOKIE ACK chunk. - * We are the side that is being asked for an association. + * We are the side that is asking for an association. * * RFC 2960 5.1 Normal Establishment of an Association * -- cgit v1.2.3 From 0c199a903d1cbc67fd99a9b074aaff4f72fa84d0 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Sat, 18 Jul 2015 14:41:51 +0200 Subject: xfrm: Fix a typo Signed-off-by: Jakub Wilk Signed-off-by: David S. Miller --- net/xfrm/xfrm_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index bd16c6c7e1e7..0cebf1fc37a2 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2048,7 +2048,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, xfrm_audit_policy_delete(xp, 1, true); } else { // reset the timers here? - WARN(1, "Dont know what to do with soft policy expire\n"); + WARN(1, "Don't know what to do with soft policy expire\n"); } km_policy_expired(xp, p->dir, up->hard, nlh->nlmsg_pid); -- cgit v1.2.3 From 499a24256862714539e902c0499b67da2bb3ab72 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:46 +0200 Subject: lwtunnel: infrastructure for handling light weight tunnels like mpls Provides infrastructure to parse/dump/store encap information for light weight tunnels like mpls. Encap information for such tunnels is associated with fib routes. This infrastructure is based on previous suggestions from Eric Biederman to follow the xfrm infrastructure. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/lwtunnel.h | 6 ++ include/net/lwtunnel.h | 132 +++++++++++++++++++++++++++++++ include/uapi/linux/lwtunnel.h | 15 ++++ net/Kconfig | 7 ++ net/core/Makefile | 1 + net/core/lwtunnel.c | 179 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 340 insertions(+) create mode 100644 include/linux/lwtunnel.h create mode 100644 include/net/lwtunnel.h create mode 100644 include/uapi/linux/lwtunnel.h create mode 100644 net/core/lwtunnel.c (limited to 'net') diff --git a/include/linux/lwtunnel.h b/include/linux/lwtunnel.h new file mode 100644 index 000000000000..97f32f8b4ae1 --- /dev/null +++ b/include/linux/lwtunnel.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_LWTUNNEL_H_ +#define _LINUX_LWTUNNEL_H_ + +#include + +#endif /* _LINUX_LWTUNNEL_H_ */ diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h new file mode 100644 index 000000000000..df24b3611ff4 --- /dev/null +++ b/include/net/lwtunnel.h @@ -0,0 +1,132 @@ +#ifndef __NET_LWTUNNEL_H +#define __NET_LWTUNNEL_H 1 + +#include +#include +#include +#include +#include + +#define LWTUNNEL_HASH_BITS 7 +#define LWTUNNEL_HASH_SIZE (1 << LWTUNNEL_HASH_BITS) + +/* lw tunnel state flags */ +#define LWTUNNEL_STATE_OUTPUT_REDIRECT 0x1 + +struct lwtunnel_state { + __u16 type; + __u16 flags; + atomic_t refcnt; + int len; + __u8 data[0]; +}; + +struct lwtunnel_encap_ops { + int (*build_state)(struct net_device *dev, struct nlattr *encap, + struct lwtunnel_state **ts); + int (*output)(struct sock *sk, struct sk_buff *skb); + int (*fill_encap)(struct sk_buff *skb, + struct lwtunnel_state *lwtstate); + int (*get_encap_size)(struct lwtunnel_state *lwtstate); + int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b); +}; + +extern const struct lwtunnel_encap_ops __rcu * + lwtun_encaps[LWTUNNEL_ENCAP_MAX+1]; + +#ifdef CONFIG_LWTUNNEL +static inline void lwtunnel_state_get(struct lwtunnel_state *lws) +{ + atomic_inc(&lws->refcnt); +} + +static inline void lwtunnel_state_put(struct lwtunnel_state *lws) +{ + if (!lws) + return; + + if (atomic_dec_and_test(&lws->refcnt)) + kfree(lws); +} + +static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) +{ + if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_OUTPUT_REDIRECT)) + return true; + + return false; +} + +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, + unsigned int num); +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, + unsigned int num); +int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, + struct lwtunnel_state **lws); +int lwtunnel_fill_encap(struct sk_buff *skb, + struct lwtunnel_state *lwtstate); +int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); +struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); +int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); + +#else + +static inline void lwtunnel_state_get(struct lwtunnel_state *lws) +{ +} + +static inline void lwtunnel_state_put(struct lwtunnel_state *lws) +{ +} + +static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) +{ + return false; +} + +static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, + unsigned int num) +{ + return -EOPNOTSUPP; + +} + +static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, + unsigned int num) +{ + return -EOPNOTSUPP; +} + +static inline int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, + struct lwtunnel_state **lws) +{ + return -EOPNOTSUPP; +} + +static inline int lwtunnel_fill_encap(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + return 0; +} + +static inline int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) +{ + return 0; +} + +static inline struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len) +{ + return NULL; +} + +static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a, + struct lwtunnel_state *b) +{ + return 0; +} + +#endif + +#endif /* __NET_LWTUNNEL_H */ diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h new file mode 100644 index 000000000000..aa611d931a31 --- /dev/null +++ b/include/uapi/linux/lwtunnel.h @@ -0,0 +1,15 @@ +#ifndef _UAPI_LWTUNNEL_H_ +#define _UAPI_LWTUNNEL_H_ + +#include + +enum lwtunnel_encap_types { + LWTUNNEL_ENCAP_NONE, + LWTUNNEL_ENCAP_MPLS, + __LWTUNNEL_ENCAP_MAX, +}; + +#define LWTUNNEL_ENCAP_MAX (__LWTUNNEL_ENCAP_MAX - 1) + + +#endif /* _UAPI_LWTUNNEL_H_ */ diff --git a/net/Kconfig b/net/Kconfig index 57a7c5af3175..7021c1bf44d6 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -374,6 +374,13 @@ source "net/caif/Kconfig" source "net/ceph/Kconfig" source "net/nfc/Kconfig" +config LWTUNNEL + bool "Network light weight tunnels" + ---help--- + This feature provides an infrastructure to support light weight + tunnels like mpls. There is no netdevice associated with a light + weight tunnel endpoint. Tunnel encapsulation parameters are stored + with light weight tunnel state associated with fib routes. endif # if NET diff --git a/net/core/Makefile b/net/core/Makefile index fec0856dd6c0..086b01fbe1bd 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -23,3 +23,4 @@ obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o +obj-$(CONFIG_LWTUNNEL) += lwtunnel.o diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c new file mode 100644 index 000000000000..d7ae3a235b4b --- /dev/null +++ b/net/core/lwtunnel.c @@ -0,0 +1,179 @@ +/* + * lwtunnel Infrastructure for light weight tunnels like mpls + * + * Authors: Roopa Prabhu, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) +{ + struct lwtunnel_state *lws; + + lws = kzalloc(sizeof(*lws) + encap_len, GFP_ATOMIC); + + return lws; +} +EXPORT_SYMBOL(lwtunnel_state_alloc); + +const struct lwtunnel_encap_ops __rcu * + lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly; + +int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, + unsigned int num) +{ + if (num > LWTUNNEL_ENCAP_MAX) + return -ERANGE; + + return !cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[num], + NULL, ops) ? 0 : -1; +} +EXPORT_SYMBOL(lwtunnel_encap_add_ops); + +int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, + unsigned int encap_type) +{ + int ret; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) + return -ERANGE; + + ret = (cmpxchg((const struct lwtunnel_encap_ops **) + &lwtun_encaps[encap_type], + ops, NULL) == ops) ? 0 : -1; + + synchronize_net(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_encap_del_ops); + +int lwtunnel_build_state(struct net_device *dev, u16 encap_type, + struct nlattr *encap, struct lwtunnel_state **lws) +{ + const struct lwtunnel_encap_ops *ops; + int ret = -EINVAL; + + if (encap_type == LWTUNNEL_ENCAP_NONE || + encap_type > LWTUNNEL_ENCAP_MAX) + return ret; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[encap_type]); + if (likely(ops && ops->build_state)) + ret = ops->build_state(dev, encap, lws); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_build_state); + +int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + struct nlattr *nest; + int ret = -EINVAL; + + if (!lwtstate) + return 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + nest = nla_nest_start(skb, RTA_ENCAP); + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->fill_encap)) + ret = ops->fill_encap(skb, lwtstate); + rcu_read_unlock(); + + if (ret) + goto nla_put_failure; + nla_nest_end(skb, nest); + ret = nla_put_u16(skb, RTA_ENCAP_TYPE, lwtstate->type); + if (ret) + goto nla_put_failure; + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nest); + + return (ret == -EOPNOTSUPP ? 0 : ret); +} +EXPORT_SYMBOL(lwtunnel_fill_encap); + +int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (!lwtstate) + return 0; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->get_encap_size)) + ret = nla_total_size(ops->get_encap_size(lwtstate)); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_get_encap_size); + +int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + const struct lwtunnel_encap_ops *ops; + int ret = 0; + + if (!a && !b) + return 0; + + if (!a || !b) + return 1; + + if (a->type != b->type) + return 1; + + if (a->type == LWTUNNEL_ENCAP_NONE || + a->type > LWTUNNEL_ENCAP_MAX) + return 0; + + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[a->type]); + if (likely(ops && ops->cmp_encap)) + ret = ops->cmp_encap(a, b); + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(lwtunnel_cmp_encap); -- cgit v1.2.3 From 571e722676fe386bb66f72a75b64a6ebf535c077 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:47 +0200 Subject: ipv4: support for fib route lwtunnel encap attributes This patch adds support in ipv4 fib functions to parse user provided encap attributes and attach encap state data to fib_nh and rtable. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/ip_fib.h | 5 ++- include/net/route.h | 1 + net/ipv4/fib_frontend.c | 8 ++++ net/ipv4/fib_semantics.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++- net/ipv4/route.c | 16 +++++++- 5 files changed, 122 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 49c142bdf01e..5e0196084f1e 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -44,7 +44,9 @@ struct fib_config { u32 fc_flow; u32 fc_nlflags; struct nl_info fc_nlinfo; - }; + struct nlattr *fc_encap; + u16 fc_encap_type; +}; struct fib_info; struct rtable; @@ -89,6 +91,7 @@ struct fib_nh { struct rtable __rcu * __percpu *nh_pcpu_rth_output; struct rtable __rcu *nh_rth_input; struct fnhe_hash_bucket __rcu *nh_exceptions; + struct lwtunnel_state *nh_lwtstate; }; /* diff --git a/include/net/route.h b/include/net/route.h index fe22d03afb6a..2d45f419477f 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -66,6 +66,7 @@ struct rtable { struct list_head rt_uncached; struct uncached_list *rt_uncached_list; + struct lwtunnel_state *rt_lwtstate; }; static inline bool rt_is_input_route(const struct rtable *rt) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6bbc54940eb4..9b2019cc3586 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -591,6 +591,8 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_FLOW] = { .type = NLA_U32 }, + [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, + [RTA_ENCAP] = { .type = NLA_NESTED }, }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, @@ -656,6 +658,12 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_TABLE: cfg->fc_table = nla_get_u32(attr); break; + case RTA_ENCAP: + cfg->fc_encap = attr; + break; + case RTA_ENCAP_TYPE: + cfg->fc_encap_type = nla_get_u16(attr); + break; } } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c7358ea4ae93..6754c64b2fe0 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -42,6 +42,7 @@ #include #include #include +#include #include "fib_lookup.h" @@ -208,6 +209,7 @@ static void free_fib_info_rcu(struct rcu_head *head) change_nexthops(fi) { if (nexthop_nh->nh_dev) dev_put(nexthop_nh->nh_dev); + lwtunnel_state_put(nexthop_nh->nh_lwtstate); free_nh_exceptions(nexthop_nh); rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); rt_fibinfo_free(&nexthop_nh->nh_rth_input); @@ -266,6 +268,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) #ifdef CONFIG_IP_ROUTE_CLASSID nh->nh_tclassid != onh->nh_tclassid || #endif + lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) || ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK)) return -1; onh++; @@ -366,6 +369,7 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) payload += nla_total_size((RTAX_MAX * nla_total_size(4))); if (fi->fib_nhs) { + size_t nh_encapsize = 0; /* Also handles the special case fib_nhs == 1 */ /* each nexthop is packed in an attribute */ @@ -374,8 +378,21 @@ static inline size_t fib_nlmsg_size(struct fib_info *fi) /* may contain flow and gateway attribute */ nhsize += 2 * nla_total_size(4); + /* grab encap info */ + for_nexthops(fi) { + if (nh->nh_lwtstate) { + /* RTA_ENCAP_TYPE */ + nh_encapsize += lwtunnel_get_encap_size( + nh->nh_lwtstate); + /* RTA_ENCAP */ + nh_encapsize += nla_total_size(2); + } + } endfor_nexthops(fi); + /* all nexthops are packed in a nested attribute */ - payload += nla_total_size(fi->fib_nhs * nhsize); + payload += nla_total_size((fi->fib_nhs * nhsize) + + nh_encapsize); + } return payload; @@ -452,6 +469,9 @@ static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining) static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, int remaining, struct fib_config *cfg) { + struct net *net = cfg->fc_nlinfo.nl_net; + int ret; + change_nexthops(fi) { int attrlen; @@ -475,18 +495,66 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, if (nexthop_nh->nh_tclassid) fi->fib_net->ipv4.fib_num_tclassid_users++; #endif + nla = nla_find(attrs, attrlen, RTA_ENCAP); + if (nla) { + struct lwtunnel_state *lwtstate; + struct net_device *dev = NULL; + struct nlattr *nla_entype; + + nla_entype = nla_find(attrs, attrlen, + RTA_ENCAP_TYPE); + if (!nla_entype) + goto err_inval; + if (cfg->fc_oif) + dev = __dev_get_by_index(net, cfg->fc_oif); + ret = lwtunnel_build_state(dev, nla_get_u16( + nla_entype), + nla, &lwtstate); + if (ret) + goto errout; + lwtunnel_state_get(lwtstate); + nexthop_nh->nh_lwtstate = lwtstate; + } } rtnh = rtnh_next(rtnh, &remaining); } endfor_nexthops(fi); return 0; + +err_inval: + ret = -EINVAL; + +errout: + return ret; } #endif +int fib_encap_match(struct net *net, u16 encap_type, + struct nlattr *encap, + int oif, const struct fib_nh *nh) +{ + struct lwtunnel_state *lwtstate; + struct net_device *dev = NULL; + int ret; + + if (encap_type == LWTUNNEL_ENCAP_NONE) + return 0; + + if (oif) + dev = __dev_get_by_index(net, oif); + ret = lwtunnel_build_state(dev, encap_type, + encap, &lwtstate); + if (!ret) + return lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate); + + return 0; +} + int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) { + struct net *net = cfg->fc_nlinfo.nl_net; #ifdef CONFIG_IP_ROUTE_MULTIPATH struct rtnexthop *rtnh; int remaining; @@ -496,6 +564,12 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) return 1; if (cfg->fc_oif || cfg->fc_gw) { + if (cfg->fc_encap) { + if (fib_encap_match(net, cfg->fc_encap_type, + cfg->fc_encap, cfg->fc_oif, + fi->fib_nh)) + return 1; + } if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw)) return 0; @@ -882,6 +956,22 @@ struct fib_info *fib_create_info(struct fib_config *cfg) } else { struct fib_nh *nh = fi->fib_nh; + if (cfg->fc_encap) { + struct lwtunnel_state *lwtstate; + struct net_device *dev = NULL; + + if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE) + goto err_inval; + if (cfg->fc_oif) + dev = __dev_get_by_index(net, cfg->fc_oif); + err = lwtunnel_build_state(dev, cfg->fc_encap_type, + cfg->fc_encap, &lwtstate); + if (err) + goto failure; + + lwtunnel_state_get(lwtstate); + nh->nh_lwtstate = lwtstate; + } nh->nh_oif = cfg->fc_oif; nh->nh_gw = cfg->fc_gw; nh->nh_flags = cfg->fc_flags; @@ -1055,6 +1145,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid)) goto nla_put_failure; #endif + if (fi->fib_nh->nh_lwtstate) + lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate); } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fi->fib_nhs > 1) { @@ -1090,6 +1182,8 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event, nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid)) goto nla_put_failure; #endif + if (nh->nh_lwtstate) + lwtunnel_fill_encap(skb, nh->nh_lwtstate); /* length of rtnetlink header + attributes */ rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh; } endfor_nexthops(fi); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 04c83de4f79e..226570ba1ced 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -102,6 +102,7 @@ #include #include #include +#include #include #include #ifdef CONFIG_SYSCTL @@ -1355,6 +1356,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst) list_del(&rt->rt_uncached); spin_unlock_bh(&ul->lock); } + lwtunnel_state_put(rt->rt_lwtstate); } void rt_flush_dev(struct net_device *dev) @@ -1403,6 +1405,12 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif + if (nh->nh_lwtstate) { + lwtunnel_state_get(nh->nh_lwtstate); + rt->rt_lwtstate = nh->nh_lwtstate; + } else { + rt->rt_lwtstate = NULL; + } if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr); else if (!(rt->dst.flags & DST_NOCACHE)) @@ -1488,6 +1496,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + rth->rt_lwtstate = NULL; if (our) { rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; @@ -1617,6 +1626,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + rth->rt_lwtstate = NULL; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; @@ -1791,6 +1801,8 @@ local_input: rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + rth->rt_lwtstate = NULL; + RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; @@ -1980,7 +1992,7 @@ add: rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); - + rth->rt_lwtstate = NULL; RT_CACHE_STAT_INC(out_slow_tot); if (flags & RTCF_LOCAL) @@ -2260,7 +2272,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_uses_gateway = ort->rt_uses_gateway; INIT_LIST_HEAD(&rt->rt_uncached); - + rt->rt_lwtstate = NULL; dst_free(new); } -- cgit v1.2.3 From 19e42e45150672124b6a4341e2bc7982d247f0ac Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:48 +0200 Subject: ipv6: support for fib route lwtunnel encap attributes This patch adds support in ipv6 fib functions to parse Netlink RTA encap attributes and attach encap state data to rt6_info. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/ip6_fib.h | 3 +++ net/ipv6/ip6_fib.c | 2 ++ net/ipv6/route.c | 33 ++++++++++++++++++++++++++++++--- 3 files changed, 35 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 3b76849c190f..276328e3daa6 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -51,6 +51,8 @@ struct fib6_config { struct nlattr *fc_mp; struct nl_info fc_nlinfo; + struct nlattr *fc_encap; + u16 fc_encap_type; }; struct fib6_node { @@ -131,6 +133,7 @@ struct rt6_info { /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; + struct lwtunnel_state *rt6i_lwtstate; }; static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 55d19861ab20..d715f2e0c4e7 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -177,6 +178,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) { + lwtunnel_state_put(rt->rt6i_lwtstate); rt6_free_pcpu(rt); dst_free(&rt->dst); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6090969937f8..b3431b79dfb1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -58,6 +58,7 @@ #include #include #include +#include #include @@ -1770,6 +1771,17 @@ int ip6_route_add(struct fib6_config *cfg) rt->dst.output = ip6_output; + if (cfg->fc_encap) { + struct lwtunnel_state *lwtstate; + + err = lwtunnel_build_state(dev, cfg->fc_encap_type, + cfg->fc_encap, &lwtstate); + if (err) + goto out; + lwtunnel_state_get(lwtstate); + rt->rt6i_lwtstate = lwtstate; + } + ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); rt->rt6i_dst.plen = cfg->fc_dst_len; if (rt->rt6i_dst.plen == 128) @@ -2595,6 +2607,8 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = { [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, [RTA_PREF] = { .type = NLA_U8 }, + [RTA_ENCAP_TYPE] = { .type = NLA_U16 }, + [RTA_ENCAP] = { .type = NLA_NESTED }, }; static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -2689,6 +2703,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->fc_flags |= RTF_PREF(pref); } + if (tb[RTA_ENCAP]) + cfg->fc_encap = tb[RTA_ENCAP]; + + if (tb[RTA_ENCAP_TYPE]) + cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); + err = 0; errout: return err; @@ -2721,6 +2741,10 @@ beginning: r_cfg.fc_gateway = nla_get_in6_addr(nla); r_cfg.fc_flags |= RTF_GATEWAY; } + r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); + nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); + if (nla) + r_cfg.fc_encap_type = nla_get_u16(nla); } err = add ? ip6_route_add(&r_cfg) : ip6_route_del(&r_cfg); if (err) { @@ -2783,7 +2807,7 @@ static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) return ip6_route_add(&cfg); } -static inline size_t rt6_nlmsg_size(void) +static inline size_t rt6_nlmsg_size(struct rt6_info *rt) { return NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(16) /* RTA_SRC */ @@ -2797,7 +2821,8 @@ static inline size_t rt6_nlmsg_size(void) + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */ + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ - + nla_total_size(1); /* RTA_PREF */ + + nla_total_size(1) /* RTA_PREF */ + + lwtunnel_get_encap_size(rt->rt6i_lwtstate); } static int rt6_fill_node(struct net *net, @@ -2945,6 +2970,8 @@ static int rt6_fill_node(struct net *net, if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) goto nla_put_failure; + lwtunnel_fill_encap(skb, rt->rt6i_lwtstate); + nlmsg_end(skb, nlh); return 0; @@ -3071,7 +3098,7 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info) err = -ENOBUFS; seq = info->nlh ? info->nlh->nlmsg_seq : 0; - skb = nlmsg_new(rt6_nlmsg_size(), gfp_any()); + skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); if (!skb) goto errout; -- cgit v1.2.3 From ffce41962ef64b8e685e5b621caf24bf381addd9 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:49 +0200 Subject: lwtunnel: support dst output redirect function This patch introduces lwtunnel_output function to call corresponding lwtunnels output function to xmit the packet. It adds two variants lwtunnel_output and lwtunnel_output6 for ipv4 and ipv6 respectively today. But this is subject to change when lwtstate will reside in dst or dst_metadata (as per upstream discussions). Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 12 +++++++++++ net/core/lwtunnel.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) (limited to 'net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index df24b3611ff4..918e03c1dafa 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -69,6 +69,8 @@ int lwtunnel_fill_encap(struct sk_buff *skb, int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); +int lwtunnel_output(struct sock *sk, struct sk_buff *skb); +int lwtunnel_output6(struct sock *sk, struct sk_buff *skb); #else @@ -127,6 +129,16 @@ static inline int lwtunnel_cmp_encap(struct lwtunnel_state *a, return 0; } +static inline int lwtunnel_output(struct sock *sk, struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} + +static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} + #endif #endif /* __NET_LWTUNNEL_H */ diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index d7ae3a235b4b..bb58826c708d 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -25,6 +25,7 @@ #include #include +#include struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) { @@ -177,3 +178,58 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) return ret; } EXPORT_SYMBOL(lwtunnel_cmp_encap); + +int __lwtunnel_output(struct sock *sk, struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + int ret = -EINVAL; + + if (!lwtstate) + goto drop; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->output)) + ret = ops->output(sk, skb); + rcu_read_unlock(); + + if (ret == -EOPNOTSUPP) + goto drop; + + return ret; + +drop: + kfree(skb); + + return ret; +} + +int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) +{ + struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct lwtunnel_state *lwtstate = NULL; + + if (rt) + lwtstate = rt->rt6i_lwtstate; + + return __lwtunnel_output(sk, skb, lwtstate); +} +EXPORT_SYMBOL(lwtunnel_output6); + +int lwtunnel_output(struct sock *sk, struct sk_buff *skb) +{ + struct rtable *rt = (struct rtable *)skb_dst(skb); + struct lwtunnel_state *lwtstate = NULL; + + if (rt) + lwtstate = rt->rt_lwtstate; + + return __lwtunnel_output(sk, skb, lwtstate); +} +EXPORT_SYMBOL(lwtunnel_output); -- cgit v1.2.3 From 8602a625024737602630fe0dee423b3096d31524 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:50 +0200 Subject: ipv4: redirect dst output to lwtunnel output For input routes with tunnel encap state this patch redirects dst output functions to lwtunnel_output which later resolves to the corresponding lwtunnel output function. This has been tested to work with mpls ip tunnels. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 226570ba1ced..cd3157c464e6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1633,6 +1633,8 @@ static int __mkroute_input(struct sk_buff *skb, rth->dst.output = ip_output; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); + if (lwtunnel_output_redirect(rth->rt_lwtstate)) + rth->dst.output = lwtunnel_output; skb_dst_set(skb, &rth->dst); out: err = 0; -- cgit v1.2.3 From 74a0f2fe8ed51e3adbb1c882be04672fe7bb6996 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:51 +0200 Subject: ipv6: rt6_info output redirect to tunnel output This is similar to ipv4 redirect of dst output to lwtunnel output function for encapsulation and xmit. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/ipv6/route.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index b3431b79dfb1..7f2214f8fde7 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1780,6 +1780,7 @@ int ip6_route_add(struct fib6_config *cfg) goto out; lwtunnel_state_get(lwtstate); rt->rt6i_lwtstate = lwtstate; + rt->dst.output = lwtunnel_output6; } ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); -- cgit v1.2.3 From face0188e31b3cfc598d8dc3470e28e00fb3b07c Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:52 +0200 Subject: mpls: export mpls functions for use by mpls iptunnels Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 11 ++++++++--- net/mpls/internal.h | 9 +++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 1f93a5978f2a..6e669114f829 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -58,10 +58,11 @@ static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) return rcu_dereference_rtnl(dev->mpls_ptr); } -static bool mpls_output_possible(const struct net_device *dev) +bool mpls_output_possible(const struct net_device *dev) { return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev); } +EXPORT_SYMBOL_GPL(mpls_output_possible); static unsigned int mpls_rt_header_size(const struct mpls_route *rt) { @@ -69,13 +70,14 @@ static unsigned int mpls_rt_header_size(const struct mpls_route *rt) return rt->rt_labels * sizeof(struct mpls_shim_hdr); } -static unsigned int mpls_dev_mtu(const struct net_device *dev) +unsigned int mpls_dev_mtu(const struct net_device *dev) { /* The amount of data the layer 2 frame can hold */ return dev->mtu; } +EXPORT_SYMBOL_GPL(mpls_dev_mtu); -static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) +bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; @@ -85,6 +87,7 @@ static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) return true; } +EXPORT_SYMBOL_GPL(mpls_pkt_too_big); static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb, struct mpls_entry_decoded dec) @@ -626,6 +629,7 @@ int nla_put_labels(struct sk_buff *skb, int attrtype, return 0; } +EXPORT_SYMBOL_GPL(nla_put_labels); int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels, u32 label[]) @@ -671,6 +675,7 @@ int nla_get_labels(const struct nlattr *nla, *labels = nla_labels; return 0; } +EXPORT_SYMBOL_GPL(nla_get_labels); static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, struct mpls_route_config *cfg) diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 8cabeb5a1cb9..2681a4ba6c37 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -50,7 +50,12 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr * return result; } -int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]); -int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels, u32 label[]); +int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, + const u32 label[]); +int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels, + u32 label[]); +bool mpls_output_possible(const struct net_device *dev); +unsigned int mpls_dev_mtu(const struct net_device *dev); +bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu); #endif /* MPLS_INTERNAL_H */ -- cgit v1.2.3 From e3e4712ec0961ed586a8db340bd994c4ad7f5dba Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 10:43:53 +0200 Subject: mpls: ip tunnel support This implementation uses lwtunnel infrastructure to register hooks for mpls tunnel encaps. It picks cues from iptunnel_encaps infrastructure and previous mpls iptunnel RFC patches from Eric W. Biederman and Robert Shearman Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/mpls_iptunnel.h | 6 + include/net/mpls_iptunnel.h | 29 +++++ include/uapi/linux/mpls_iptunnel.h | 28 +++++ net/mpls/Kconfig | 8 +- net/mpls/Makefile | 1 + net/mpls/mpls_iptunnel.c | 233 +++++++++++++++++++++++++++++++++++++ 6 files changed, 304 insertions(+), 1 deletion(-) create mode 100644 include/linux/mpls_iptunnel.h create mode 100644 include/net/mpls_iptunnel.h create mode 100644 include/uapi/linux/mpls_iptunnel.h create mode 100644 net/mpls/mpls_iptunnel.c (limited to 'net') diff --git a/include/linux/mpls_iptunnel.h b/include/linux/mpls_iptunnel.h new file mode 100644 index 000000000000..ef29eb2d6dfd --- /dev/null +++ b/include/linux/mpls_iptunnel.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_MPLS_IPTUNNEL_H +#define _LINUX_MPLS_IPTUNNEL_H + +#include + +#endif /* _LINUX_MPLS_IPTUNNEL_H */ diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h new file mode 100644 index 000000000000..4757997f76ed --- /dev/null +++ b/include/net/mpls_iptunnel.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2015 Cumulus Networks, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef _NET_MPLS_IPTUNNEL_H +#define _NET_MPLS_IPTUNNEL_H 1 + +#define MAX_NEW_LABELS 2 + +struct mpls_iptunnel_encap { + u32 label[MAX_NEW_LABELS]; + u32 labels; +}; + +static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct lwtunnel_state *lwtstate) +{ + return (struct mpls_iptunnel_encap *)lwtstate->data; +} + +#endif diff --git a/include/uapi/linux/mpls_iptunnel.h b/include/uapi/linux/mpls_iptunnel.h new file mode 100644 index 000000000000..d80a0498f77e --- /dev/null +++ b/include/uapi/linux/mpls_iptunnel.h @@ -0,0 +1,28 @@ +/* + * mpls tunnel api + * + * Authors: + * Roopa Prabhu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _UAPI_LINUX_MPLS_IPTUNNEL_H +#define _UAPI_LINUX_MPLS_IPTUNNEL_H + +/* MPLS tunnel attributes + * [RTA_ENCAP] = { + * [MPLS_IPTUNNEL_DST] + * } + */ +enum { + MPLS_IPTUNNEL_UNSPEC, + MPLS_IPTUNNEL_DST, + __MPLS_IPTUNNEL_MAX, +}; +#define MPLS_IPTUNNEL_MAX (__MPLS_IPTUNNEL_MAX - 1) + +#endif /* _UAPI_LINUX_MPLS_IPTUNNEL_H */ diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig index 17bde799c854..5c467ef97311 100644 --- a/net/mpls/Kconfig +++ b/net/mpls/Kconfig @@ -24,7 +24,13 @@ config NET_MPLS_GSO config MPLS_ROUTING tristate "MPLS: routing support" - help + ---help--- Add support for forwarding of mpls packets. +config MPLS_IPTUNNEL + tristate "MPLS: IP over MPLS tunnel support" + depends on LWTUNNEL && MPLS_ROUTING + ---help--- + mpls ip tunnel support. + endif # MPLS diff --git a/net/mpls/Makefile b/net/mpls/Makefile index 65bbe68c72e6..9ca923625016 100644 --- a/net/mpls/Makefile +++ b/net/mpls/Makefile @@ -3,5 +3,6 @@ # obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o obj-$(CONFIG_MPLS_ROUTING) += mpls_router.o +obj-$(CONFIG_MPLS_IPTUNNEL) += mpls_iptunnel.o mpls_router-y := af_mpls.o diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c new file mode 100644 index 000000000000..eea096f21ba5 --- /dev/null +++ b/net/mpls/mpls_iptunnel.c @@ -0,0 +1,233 @@ +/* + * mpls tunnels An implementation mpls tunnels using the light weight tunnel + * infrastructure + * + * Authors: Roopa Prabhu, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = { + [MPLS_IPTUNNEL_DST] = { .type = NLA_U32 }, +}; + +static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) +{ + /* The size of the layer 2.5 labels to be added for this route */ + return en->labels * sizeof(struct mpls_shim_hdr); +} + +int mpls_output(struct sock *sk, struct sk_buff *skb) +{ + struct mpls_iptunnel_encap *tun_encap_info; + struct mpls_shim_hdr *hdr; + struct net_device *out_dev; + unsigned int hh_len; + unsigned int new_header_size; + unsigned int mtu; + struct dst_entry *dst = skb_dst(skb); + struct rtable *rt = NULL; + struct rt6_info *rt6 = NULL; + struct lwtunnel_state *lwtstate = NULL; + int err = 0; + bool bos; + int i; + unsigned int ttl; + + /* Obtain the ttl */ + if (skb->protocol == htons(ETH_P_IP)) { + ttl = ip_hdr(skb)->ttl; + rt = (struct rtable *)dst; + lwtstate = rt->rt_lwtstate; + } else if (skb->protocol == htons(ETH_P_IPV6)) { + ttl = ipv6_hdr(skb)->hop_limit; + rt6 = (struct rt6_info *)dst; + lwtstate = rt6->rt6i_lwtstate; + } else { + goto drop; + } + + skb_orphan(skb); + + /* Find the output device */ + out_dev = rcu_dereference(dst->dev); + if (!mpls_output_possible(out_dev) || + !lwtstate || skb_warn_if_lro(skb)) + goto drop; + + skb_forward_csum(skb); + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + /* Verify the destination can hold the packet */ + new_header_size = mpls_encap_size(tun_encap_info); + mtu = mpls_dev_mtu(out_dev); + if (mpls_pkt_too_big(skb, mtu - new_header_size)) + goto drop; + + hh_len = LL_RESERVED_SPACE(out_dev); + if (!out_dev->header_ops) + hh_len = 0; + + /* Ensure there is enough space for the headers in the skb */ + if (skb_cow(skb, hh_len + new_header_size)) + goto drop; + + skb_push(skb, new_header_size); + skb_reset_network_header(skb); + + skb->dev = out_dev; + skb->protocol = htons(ETH_P_MPLS_UC); + + /* Push the new labels */ + hdr = mpls_hdr(skb); + bos = true; + for (i = tun_encap_info->labels - 1; i >= 0; i--) { + hdr[i] = mpls_entry_encode(tun_encap_info->label[i], + ttl, 0, bos); + bos = false; + } + + if (rt) + err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway, + skb); + else if (rt6) + err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway, + skb); + if (err) + net_dbg_ratelimited("%s: packet transmission failed: %d\n", + __func__, err); + + return 0; + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static int mpls_build_state(struct net_device *dev, struct nlattr *nla, + struct lwtunnel_state **ts) +{ + struct mpls_iptunnel_encap *tun_encap_info; + struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1]; + struct lwtunnel_state *newts; + int tun_encap_info_len; + int ret; + + ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla, + mpls_iptunnel_policy); + if (ret < 0) + return ret; + + if (!tb[MPLS_IPTUNNEL_DST]) + return -EINVAL; + + tun_encap_info_len = sizeof(*tun_encap_info); + + newts = lwtunnel_state_alloc(tun_encap_info_len); + if (!newts) + return -ENOMEM; + + newts->len = tun_encap_info_len; + tun_encap_info = mpls_lwtunnel_encap(newts); + ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS, + &tun_encap_info->labels, tun_encap_info->label); + if (ret) + goto errout; + newts->type = LWTUNNEL_ENCAP_MPLS; + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + + *ts = newts; + + return 0; + +errout: + kfree(newts); + *ts = NULL; + + return ret; +} + +static int mpls_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct mpls_iptunnel_encap *tun_encap_info; + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels, + tun_encap_info->label)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + struct mpls_iptunnel_encap *tun_encap_info; + + tun_encap_info = mpls_lwtunnel_encap(lwtstate); + + return nla_total_size(tun_encap_info->labels * 4); +} + +static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct mpls_iptunnel_encap *a_hdr = mpls_lwtunnel_encap(a); + struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b); + int l; + + if (a_hdr->labels != b_hdr->labels) + return 1; + + for (l = 0; l < MAX_NEW_LABELS; l++) + if (a_hdr->label[l] != b_hdr->label[l]) + return 1; + return 0; +} + +static const struct lwtunnel_encap_ops mpls_iptun_ops = { + .build_state = mpls_build_state, + .output = mpls_output, + .fill_encap = mpls_fill_encap_info, + .get_encap_size = mpls_encap_nlsize, + .cmp_encap = mpls_encap_cmp, +}; + +static int __init mpls_iptunnel_init(void) +{ + return lwtunnel_encap_add_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); +} +module_init(mpls_iptunnel_init); + +static void __exit mpls_iptunnel_exit(void) +{ + lwtunnel_encap_del_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS); +} +module_exit(mpls_iptunnel_exit); + +MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 1d8fff907342d2339796dbd27ea47d0e76a6a2d0 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:43:54 +0200 Subject: ip_tunnel: Make ovs_tunnel_info and ovs_key_ipv4_tunnel generic Rename the tunnel metadata data structures currently internal to OVS and make them generic for use by all IP tunnels. Both structures are kernel internal and will stay that way. Their members are exposed to user space through individual Netlink attributes by OVS. It will therefore be possible to extend/modify these structures without affecting user ABI. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 63 +++++++++++++++++++++++++++++++++ include/uapi/linux/openvswitch.h | 2 +- net/openvswitch/actions.c | 2 +- net/openvswitch/datapath.h | 5 +-- net/openvswitch/flow.c | 4 +-- net/openvswitch/flow.h | 76 ++-------------------------------------- net/openvswitch/flow_netlink.c | 16 ++++----- net/openvswitch/flow_netlink.h | 2 +- net/openvswitch/vport-geneve.c | 17 +++++---- net/openvswitch/vport-gre.c | 16 ++++----- net/openvswitch/vport-vxlan.c | 18 +++++----- net/openvswitch/vport.c | 30 ++++++++-------- net/openvswitch/vport.h | 12 +++---- 13 files changed, 128 insertions(+), 135 deletions(-) (limited to 'net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index d8214cb88bbc..6b9d559ce5f5 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -22,6 +22,28 @@ /* Keep error state on tunnel for 30 sec */ #define IPTUNNEL_ERR_TIMEO (30*HZ) +/* Used to memset ip_tunnel padding. */ +#define IP_TUNNEL_KEY_SIZE \ + (offsetof(struct ip_tunnel_key, tp_dst) + \ + FIELD_SIZEOF(struct ip_tunnel_key, tp_dst)) + +struct ip_tunnel_key { + __be64 tun_id; + __be32 ipv4_src; + __be32 ipv4_dst; + __be16 tun_flags; + __u8 ipv4_tos; + __u8 ipv4_ttl; + __be16 tp_src; + __be16 tp_dst; +} __packed __aligned(4); /* Minimize padding. */ + +struct ip_tunnel_info { + struct ip_tunnel_key key; + const void *options; + u8 options_len; +}; + /* 6rd prefix/relay information */ #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd_parm { @@ -136,6 +158,47 @@ int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *op, int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); +static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info, + __be32 saddr, __be32 daddr, + u8 tos, u8 ttl, + __be16 tp_src, __be16 tp_dst, + __be64 tun_id, __be16 tun_flags, + const void *opts, u8 opts_len) +{ + tun_info->key.tun_id = tun_id; + tun_info->key.ipv4_src = saddr; + tun_info->key.ipv4_dst = daddr; + tun_info->key.ipv4_tos = tos; + tun_info->key.ipv4_ttl = ttl; + tun_info->key.tun_flags = tun_flags; + + /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of + * the upper tunnel are used. + * E.g: GRE over IPSEC, the tp_src and tp_port are zero. + */ + tun_info->key.tp_src = tp_src; + tun_info->key.tp_dst = tp_dst; + + /* Clear struct padding. */ + if (sizeof(tun_info->key) != IP_TUNNEL_KEY_SIZE) + memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_SIZE, + 0, sizeof(tun_info->key) - IP_TUNNEL_KEY_SIZE); + + tun_info->options = opts; + tun_info->options_len = opts_len; +} + +static inline void ip_tunnel_info_init(struct ip_tunnel_info *tun_info, + const struct iphdr *iph, + __be16 tp_src, __be16 tp_dst, + __be64 tun_id, __be16 tun_flags, + const void *opts, u8 opts_len) +{ + __ip_tunnel_info_init(tun_info, iph->saddr, iph->daddr, + iph->tos, iph->ttl, tp_src, tp_dst, + tun_id, tun_flags, opts, opts_len); +} + #ifdef CONFIG_INET int ip_tunnel_init(struct net_device *dev); diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 1dab77601c21..d6b885460187 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -321,7 +321,7 @@ enum ovs_key_attr { * the accepted length of the array. */ #ifdef __KERNEL__ - OVS_KEY_ATTR_TUNNEL_INFO, /* struct ovs_tunnel_info */ + OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */ #endif __OVS_KEY_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 8a8c0b8b4f63..27c1687cfd92 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -611,7 +611,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, const struct nlattr *actions, int actions_len) { - struct ovs_tunnel_info info; + struct ip_tunnel_info info; struct dp_upcall_info upcall; const struct nlattr *a; int rem; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index cd691e935e08..6b28c5cedb23 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -25,6 +25,7 @@ #include #include #include +#include #include "flow.h" #include "flow_table.h" @@ -98,7 +99,7 @@ struct datapath { * when a packet is received by OVS. */ struct ovs_skb_cb { - struct ovs_tunnel_info *egress_tun_info; + struct ip_tunnel_info *egress_tun_info; struct vport *input_vport; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -114,7 +115,7 @@ struct ovs_skb_cb { * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY. */ struct dp_upcall_info { - const struct ovs_tunnel_info *egress_tun_info; + const struct ip_tunnel_info *egress_tun_info; const struct nlattr *userdata; const struct nlattr *actions; int actions_len; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index bc7b0aba994a..8db22ef73626 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -682,12 +682,12 @@ int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) return key_extract(skb, key); } -int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, +int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key) { /* Extract metadata from packet. */ if (tun_info) { - memcpy(&key->tun_key, &tun_info->tunnel, sizeof(key->tun_key)); + memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key)); if (tun_info->options) { BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) * diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index a076e445ccc2..cadc6c5c3545 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -32,31 +32,10 @@ #include #include #include +#include struct sk_buff; -/* Used to memset ovs_key_ipv4_tunnel padding. */ -#define OVS_TUNNEL_KEY_SIZE \ - (offsetof(struct ovs_key_ipv4_tunnel, tp_dst) + \ - FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, tp_dst)) - -struct ovs_key_ipv4_tunnel { - __be64 tun_id; - __be32 ipv4_src; - __be32 ipv4_dst; - __be16 tun_flags; - u8 ipv4_tos; - u8 ipv4_ttl; - __be16 tp_src; - __be16 tp_dst; -} __packed __aligned(4); /* Minimize padding. */ - -struct ovs_tunnel_info { - struct ovs_key_ipv4_tunnel tunnel; - const void *options; - u8 options_len; -}; - /* Store options at the end of the array if they are less than the * maximum size. This allows us to get the benefits of variable length * matching for small options. @@ -66,55 +45,6 @@ struct ovs_tunnel_info { #define TUN_METADATA_OPTS(flow_key, opt_len) \ ((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len))) -static inline void __ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, - __be32 saddr, __be32 daddr, - u8 tos, u8 ttl, - __be16 tp_src, - __be16 tp_dst, - __be64 tun_id, - __be16 tun_flags, - const void *opts, - u8 opts_len) -{ - tun_info->tunnel.tun_id = tun_id; - tun_info->tunnel.ipv4_src = saddr; - tun_info->tunnel.ipv4_dst = daddr; - tun_info->tunnel.ipv4_tos = tos; - tun_info->tunnel.ipv4_ttl = ttl; - tun_info->tunnel.tun_flags = tun_flags; - - /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of - * the upper tunnel are used. - * E.g: GRE over IPSEC, the tp_src and tp_port are zero. - */ - tun_info->tunnel.tp_src = tp_src; - tun_info->tunnel.tp_dst = tp_dst; - - /* Clear struct padding. */ - if (sizeof(tun_info->tunnel) != OVS_TUNNEL_KEY_SIZE) - memset((unsigned char *)&tun_info->tunnel + OVS_TUNNEL_KEY_SIZE, - 0, sizeof(tun_info->tunnel) - OVS_TUNNEL_KEY_SIZE); - - tun_info->options = opts; - tun_info->options_len = opts_len; -} - -static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, - const struct iphdr *iph, - __be16 tp_src, - __be16 tp_dst, - __be64 tun_id, - __be16 tun_flags, - const void *opts, - u8 opts_len) -{ - __ovs_flow_tun_info_init(tun_info, iph->saddr, iph->daddr, - iph->tos, iph->ttl, - tp_src, tp_dst, - tun_id, tun_flags, - opts, opts_len); -} - #define OVS_SW_FLOW_KEY_METADATA_SIZE \ (offsetof(struct sw_flow_key, recirc_id) + \ FIELD_SIZEOF(struct sw_flow_key, recirc_id)) @@ -122,7 +52,7 @@ static inline void ovs_flow_tun_info_init(struct ovs_tunnel_info *tun_info, struct sw_flow_key { u8 tun_opts[255]; u8 tun_opts_len; - struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */ + struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */ struct { u32 priority; /* Packet QoS priority. */ u32 skb_mark; /* SKB mark. */ @@ -273,7 +203,7 @@ void ovs_flow_stats_clear(struct sw_flow *); u64 ovs_flow_used_time(unsigned long flow_jiffies); int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); -int ovs_flow_key_extract(const struct ovs_tunnel_info *tun_info, +int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 624e41c4267f..ecfa530d3461 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -641,7 +641,7 @@ static int vxlan_opt_to_nlattr(struct sk_buff *skb, } static int __ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *output, + const struct ip_tunnel_key *output, const void *tun_opts, int swkey_tun_opts_len) { if (output->tun_flags & TUNNEL_KEY && @@ -689,7 +689,7 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, } static int ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *output, + const struct ip_tunnel_key *output, const void *tun_opts, int swkey_tun_opts_len) { struct nlattr *nla; @@ -708,9 +708,9 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, } int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, - const struct ovs_tunnel_info *egress_tun_info) + const struct ip_tunnel_info *egress_tun_info) { - return __ipv4_tun_to_nlattr(skb, &egress_tun_info->tunnel, + return __ipv4_tun_to_nlattr(skb, &egress_tun_info->key, egress_tun_info->options, egress_tun_info->options_len); } @@ -1746,7 +1746,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, { struct sw_flow_match match; struct sw_flow_key key; - struct ovs_tunnel_info *tun_info; + struct ip_tunnel_info *tun_info; struct nlattr *a; int err = 0, start, opts_type; @@ -1777,7 +1777,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, return PTR_ERR(a); tun_info = nla_data(a); - tun_info->tunnel = key.tun_key; + tun_info->key = key.tun_key; tun_info->options_len = key.tun_opts_len; if (tun_info->options_len) { @@ -2227,13 +2227,13 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) switch (key_type) { case OVS_KEY_ATTR_TUNNEL_INFO: { - struct ovs_tunnel_info *tun_info = nla_data(ovs_key); + struct ip_tunnel_info *tun_info = nla_data(ovs_key); start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); if (!start) return -EMSGSIZE; - err = ipv4_tun_to_nlattr(skb, &tun_info->tunnel, + err = ipv4_tun_to_nlattr(skb, &tun_info->key, tun_info->options_len ? tun_info->options : NULL, tun_info->options_len); diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 5c3d75bff310..ec53eb6e632b 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -55,7 +55,7 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key, const struct nlattr *mask, bool log); int ovs_nla_put_egress_tunnel_key(struct sk_buff *, - const struct ovs_tunnel_info *); + const struct ip_tunnel_info *); bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log); int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 208c576bd1b6..1da3a14d1010 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -77,7 +77,7 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) struct vport *vport = gs->rcv_data; struct genevehdr *geneveh = geneve_hdr(skb); int opts_len; - struct ovs_tunnel_info tun_info; + struct ip_tunnel_info tun_info; __be64 key; __be16 flags; @@ -90,10 +90,9 @@ static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) key = vni_to_tunnel_id(geneveh->vni); - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, flags, - geneveh->options, opts_len); + ip_tunnel_info_init(&tun_info, ip_hdr(skb), + udp_hdr(skb)->source, udp_hdr(skb)->dest, + key, flags, geneveh->options, opts_len); ovs_vport_receive(vport, skb, &tun_info); } @@ -165,8 +164,8 @@ error: static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) { - const struct ovs_key_ipv4_tunnel *tun_key; - struct ovs_tunnel_info *tun_info; + const struct ip_tunnel_key *tun_key; + struct ip_tunnel_info *tun_info; struct net *net = ovs_dp_get_net(vport->dp); struct geneve_port *geneve_port = geneve_vport(vport); __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; @@ -183,7 +182,7 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) goto error; } - tun_key = &tun_info->tunnel; + tun_key = &tun_info->key; rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); if (IS_ERR(rt)) { err = PTR_ERR(rt); @@ -225,7 +224,7 @@ static const char *geneve_get_name(const struct vport *vport) } static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) + struct ip_tunnel_info *egress_tun_info) { struct geneve_port *geneve_port = geneve_vport(vport); struct net *net = ovs_dp_get_net(vport->dp); diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index f17ac9642f4e..b87656c66aaf 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -67,9 +67,9 @@ static struct sk_buff *__build_header(struct sk_buff *skb, int tunnel_hlen) { struct tnl_ptk_info tpi; - const struct ovs_key_ipv4_tunnel *tun_key; + const struct ip_tunnel_key *tun_key; - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; + tun_key = &OVS_CB(skb)->egress_tun_info->key; skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); if (IS_ERR(skb)) @@ -97,7 +97,7 @@ static __be64 key_to_tunnel_id(__be32 key, __be32 seq) static int gre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { - struct ovs_tunnel_info tun_info; + struct ip_tunnel_info tun_info; struct ovs_net *ovs_net; struct vport *vport; __be64 key; @@ -108,8 +108,8 @@ static int gre_rcv(struct sk_buff *skb, return PACKET_REJECT; key = key_to_tunnel_id(tpi->key, tpi->seq); - ovs_flow_tun_info_init(&tun_info, ip_hdr(skb), 0, 0, key, - filter_tnl_flags(tpi->flags), NULL, 0); + ip_tunnel_info_init(&tun_info, ip_hdr(skb), 0, 0, key, + filter_tnl_flags(tpi->flags), NULL, 0); ovs_vport_receive(vport, skb, &tun_info); return PACKET_RCVD; @@ -134,7 +134,7 @@ static int gre_err(struct sk_buff *skb, u32 info, static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) { struct net *net = ovs_dp_get_net(vport->dp); - const struct ovs_key_ipv4_tunnel *tun_key; + const struct ip_tunnel_key *tun_key; struct flowi4 fl; struct rtable *rt; int min_headroom; @@ -147,7 +147,7 @@ static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) goto err_free_skb; } - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; + tun_key = &OVS_CB(skb)->egress_tun_info->key; rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_GRE); if (IS_ERR(rt)) { err = PTR_ERR(rt); @@ -277,7 +277,7 @@ static void gre_tnl_destroy(struct vport *vport) } static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) + struct ip_tunnel_info *egress_tun_info) { return ovs_tunnel_get_egress_info(egress_tun_info, ovs_dp_get_net(vport->dp), diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 6d39766e7828..6f7986fabb70 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -64,7 +64,7 @@ static inline struct vxlan_port *vxlan_vport(const struct vport *vport) static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, struct vxlan_metadata *md) { - struct ovs_tunnel_info tun_info; + struct ip_tunnel_info tun_info; struct vxlan_port *vxlan_port; struct vport *vport = vs->data; struct iphdr *iph; @@ -82,9 +82,9 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, /* Save outer tunnel values */ iph = ip_hdr(skb); key = cpu_to_be64(ntohl(md->vni) >> 8); - ovs_flow_tun_info_init(&tun_info, iph, - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, flags, &opts, sizeof(opts)); + ip_tunnel_info_init(&tun_info, iph, + udp_hdr(skb)->source, udp_hdr(skb)->dest, + key, flags, &opts, sizeof(opts)); ovs_vport_receive(vport, skb, &tun_info); } @@ -205,13 +205,13 @@ error: static int vxlan_ext_gbp(struct sk_buff *skb) { - const struct ovs_tunnel_info *tun_info; + const struct ip_tunnel_info *tun_info; const struct ovs_vxlan_opts *opts; tun_info = OVS_CB(skb)->egress_tun_info; opts = tun_info->options; - if (tun_info->tunnel.tun_flags & TUNNEL_VXLAN_OPT && + if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT && tun_info->options_len >= sizeof(*opts)) return opts->gbp; else @@ -224,7 +224,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) struct vxlan_port *vxlan_port = vxlan_vport(vport); struct sock *sk = vxlan_port->vs->sock->sk; __be16 dst_port = inet_sk(sk)->inet_sport; - const struct ovs_key_ipv4_tunnel *tun_key; + const struct ip_tunnel_key *tun_key; struct vxlan_metadata md = {0}; struct rtable *rt; struct flowi4 fl; @@ -238,7 +238,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) goto error; } - tun_key = &OVS_CB(skb)->egress_tun_info->tunnel; + tun_key = &OVS_CB(skb)->egress_tun_info->key; rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); if (IS_ERR(rt)) { err = PTR_ERR(rt); @@ -269,7 +269,7 @@ error: } static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *egress_tun_info) + struct ip_tunnel_info *egress_tun_info) { struct net *net = ovs_dp_get_net(vport->dp); struct vxlan_port *vxlan_port = vxlan_vport(vport); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 067a3fff1d2c..af23ba077836 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -469,7 +469,7 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) * skb->data should point to the Ethernet header. */ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, - const struct ovs_tunnel_info *tun_info) + const struct ip_tunnel_info *tun_info) { struct pcpu_sw_netstats *stats; struct sw_flow_key key; @@ -572,22 +572,22 @@ void ovs_vport_deferred_free(struct vport *vport) } EXPORT_SYMBOL_GPL(ovs_vport_deferred_free); -int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, +int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, struct net *net, - const struct ovs_tunnel_info *tun_info, + const struct ip_tunnel_info *tun_info, u8 ipproto, u32 skb_mark, __be16 tp_src, __be16 tp_dst) { - const struct ovs_key_ipv4_tunnel *tun_key; + const struct ip_tunnel_key *tun_key; struct rtable *rt; struct flowi4 fl; if (unlikely(!tun_info)) return -EINVAL; - tun_key = &tun_info->tunnel; + tun_key = &tun_info->key; /* Route lookup to get srouce IP address. * The process may need to be changed if the corresponding process @@ -602,22 +602,22 @@ int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, /* Generate egress_tun_info based on tun_info, * saddr, tp_src and tp_dst */ - __ovs_flow_tun_info_init(egress_tun_info, - fl.saddr, tun_key->ipv4_dst, - tun_key->ipv4_tos, - tun_key->ipv4_ttl, - tp_src, tp_dst, - tun_key->tun_id, - tun_key->tun_flags, - tun_info->options, - tun_info->options_len); + __ip_tunnel_info_init(egress_tun_info, + fl.saddr, tun_key->ipv4_dst, + tun_key->ipv4_tos, + tun_key->ipv4_ttl, + tp_src, tp_dst, + tun_key->tun_id, + tun_key->tun_flags, + tun_info->options, + tun_info->options_len); return 0; } EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info); int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *info) + struct ip_tunnel_info *info) { /* get_egress_tun_info() is only implemented on tunnel ports. */ if (unlikely(!vport->ops->get_egress_tun_info)) diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index bc85331a6c60..4750fb673a9f 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -58,15 +58,15 @@ u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); int ovs_vport_send(struct vport *, struct sk_buff *); -int ovs_tunnel_get_egress_info(struct ovs_tunnel_info *egress_tun_info, +int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, struct net *net, - const struct ovs_tunnel_info *tun_info, + const struct ip_tunnel_info *tun_info, u8 ipproto, u32 skb_mark, __be16 tp_src, __be16 tp_dst); int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ovs_tunnel_info *info); + struct ip_tunnel_info *info); /* The following definitions are for implementers of vport devices: */ @@ -176,7 +176,7 @@ struct vport_ops { int (*send)(struct vport *, struct sk_buff *); int (*get_egress_tun_info)(struct vport *, struct sk_buff *, - struct ovs_tunnel_info *); + struct ip_tunnel_info *); struct module *owner; struct list_head list; @@ -226,7 +226,7 @@ static inline struct vport *vport_from_priv(void *priv) } void ovs_vport_receive(struct vport *, struct sk_buff *, - const struct ovs_tunnel_info *); + const struct ip_tunnel_info *); static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) @@ -239,7 +239,7 @@ int ovs_vport_ops_register(struct vport_ops *ops); void ovs_vport_ops_unregister(struct vport_ops *ops); static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, - const struct ovs_key_ipv4_tunnel *key, + const struct ip_tunnel_key *key, u32 mark, struct flowi4 *fl, u8 protocol) -- cgit v1.2.3 From 773a69d64bf65eb6c212c97e9737963a2cf668fd Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:43:55 +0200 Subject: icmp: Don't leak original dst into ip_route_input() ip_route_input() unconditionally overwrites the dst. Hide the original dst attached to the skb by calling skb_dst_set(skb, NULL) prior to ip_route_input(). Reported-by: Julian Anastasov Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index f5203fba6236..c0556f1e4bf0 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -496,6 +496,7 @@ static struct rtable *icmp_route_lookup(struct net *net, } /* Ugh! */ orefdst = skb_in->_skb_refdst; /* save old refdst */ + skb_dst_set(skb_in, NULL); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, RT_TOS(tos), rt2->dst.dev); -- cgit v1.2.3 From f38a9eb1f77b296ff07e000823884a0f64d67b2a Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:43:56 +0200 Subject: dst: Metadata destinations Introduces a new dst_metadata which enables to carry per packet metadata between forwarding and processing elements via the skb->dst pointer. The structure is set up to be a union. Thus, each separate type of metadata requires its own dst instance. If demand arises to carry multiple types of metadata concurrently, metadata dst entries can be made stackable. The metadata dst entry is refcnt'ed as expected for now but a non reference counted use is possible if the reference is forced before queueing the skb. In order to allow allocating dsts with variable length, the existing dst_alloc() is split into a dst_alloc() and dst_init() function. The existing dst_init() function to initialize the subsystem is being renamed to dst_subsys_init() to make it clear what is what. The check before ip_route_input() is changed to ignore metadata dsts and drop the dst inside the routing function thus allowing to interpret metadata in a later commit. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/dst.h | 6 +++- include/net/dst_metadata.h | 32 ++++++++++++++++++ net/core/dev.c | 2 +- net/core/dst.c | 84 ++++++++++++++++++++++++++++++++++++++-------- net/ipv4/ip_input.c | 3 +- net/ipv4/route.c | 2 ++ 6 files changed, 112 insertions(+), 17 deletions(-) create mode 100644 include/net/dst_metadata.h (limited to 'net') diff --git a/include/net/dst.h b/include/net/dst.h index 2bc73f8a00a9..2578811cef51 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -57,6 +57,7 @@ struct dst_entry { #define DST_FAKE_RTABLE 0x0040 #define DST_XFRM_TUNNEL 0x0080 #define DST_XFRM_QUEUE 0x0100 +#define DST_METADATA 0x0200 unsigned short pending_confirm; @@ -356,6 +357,9 @@ static inline int dst_discard(struct sk_buff *skb) } void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags); +void dst_init(struct dst_entry *dst, struct dst_ops *ops, + struct net_device *dev, int initial_ref, int initial_obsolete, + unsigned short flags); void __dst_free(struct dst_entry *dst); struct dst_entry *dst_destroy(struct dst_entry *dst); @@ -457,7 +461,7 @@ static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) return dst; } -void dst_init(void); +void dst_subsys_init(void); /* Flags for xfrm_lookup flags argument. */ enum { diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h new file mode 100644 index 000000000000..4f7694f3c7d0 --- /dev/null +++ b/include/net/dst_metadata.h @@ -0,0 +1,32 @@ +#ifndef __NET_DST_METADATA_H +#define __NET_DST_METADATA_H 1 + +#include +#include +#include + +struct metadata_dst { + struct dst_entry dst; + size_t opts_len; +}; + +static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) +{ + struct metadata_dst *md_dst = (struct metadata_dst *) skb_dst(skb); + + if (md_dst && md_dst->dst.flags & DST_METADATA) + return md_dst; + + return NULL; +} + +static inline bool skb_valid_dst(const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + + return dst && !(dst->flags & DST_METADATA); +} + +struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags); + +#endif /* __NET_DST_METADATA_H */ diff --git a/net/core/dev.c b/net/core/dev.c index 2ee15afb412d..cb52cba30ae8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7669,7 +7669,7 @@ static int __init net_dev_init(void) open_softirq(NET_RX_SOFTIRQ, net_rx_action); hotcpu_notifier(dev_cpu_callback, 0); - dst_init(); + dst_subsys_init(); rc = 0; out: return rc; diff --git a/net/core/dst.c b/net/core/dst.c index e956ce6d1378..917364f0d0be 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -22,6 +22,7 @@ #include #include +#include /* * Theory of operations: @@ -158,19 +159,10 @@ const u32 dst_default_metrics[RTAX_MAX + 1] = { [RTAX_MAX] = 0xdeadbeef, }; - -void *dst_alloc(struct dst_ops *ops, struct net_device *dev, - int initial_ref, int initial_obsolete, unsigned short flags) +void dst_init(struct dst_entry *dst, struct dst_ops *ops, + struct net_device *dev, int initial_ref, int initial_obsolete, + unsigned short flags) { - struct dst_entry *dst; - - if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { - if (ops->gc(ops)) - return NULL; - } - dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); - if (!dst) - return NULL; dst->child = NULL; dst->dev = dev; if (dev) @@ -200,6 +192,25 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); +} +EXPORT_SYMBOL(dst_init); + +void *dst_alloc(struct dst_ops *ops, struct net_device *dev, + int initial_ref, int initial_obsolete, unsigned short flags) +{ + struct dst_entry *dst; + + if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { + if (ops->gc(ops)) + return NULL; + } + + dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); + if (!dst) + return NULL; + + dst_init(dst, ops, dev, initial_ref, initial_obsolete, flags); + return dst; } EXPORT_SYMBOL(dst_alloc); @@ -248,7 +259,11 @@ again: dst->ops->destroy(dst); if (dst->dev) dev_put(dst->dev); - kmem_cache_free(dst->ops->kmem_cachep, dst); + + if (dst->flags & DST_METADATA) + kfree(dst); + else + kmem_cache_free(dst->ops->kmem_cachep, dst); dst = child; if (dst) { @@ -327,6 +342,47 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) } EXPORT_SYMBOL(__dst_destroy_metrics_generic); +static struct dst_ops md_dst_ops = { + .family = AF_UNSPEC, +}; + +static int dst_md_discard_sk(struct sock *sk, struct sk_buff *skb) +{ + WARN_ONCE(1, "Attempting to call output on metadata dst\n"); + kfree_skb(skb); + return 0; +} + +static int dst_md_discard(struct sk_buff *skb) +{ + WARN_ONCE(1, "Attempting to call input on metadata dst\n"); + kfree_skb(skb); + return 0; +} + +struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) +{ + struct metadata_dst *md_dst; + struct dst_entry *dst; + + md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); + if (!md_dst) + return ERR_PTR(-ENOMEM); + + dst = &md_dst->dst; + dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE, + DST_METADATA | DST_NOCACHE | DST_NOCOUNT); + + dst->input = dst_md_discard; + dst->output = dst_md_discard_sk; + + memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); + md_dst->opts_len = optslen; + + return md_dst; +} +EXPORT_SYMBOL_GPL(metadata_dst_alloc); + /* Dirty hack. We did it in 2.2 (in __dst_free), * we have _very_ good reasons not to repeat * this mistake in 2.3, but we have no choice @@ -391,7 +447,7 @@ static struct notifier_block dst_dev_notifier = { .priority = -10, /* must be called after other network notifiers */ }; -void __init dst_init(void) +void __init dst_subsys_init(void) { register_netdevice_notifier(&dst_dev_notifier); } diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 2db4c8773c1b..f4fc8a77aaa7 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -146,6 +146,7 @@ #include #include #include +#include /* * Process Router Attention IP option (RFC 2113) @@ -331,7 +332,7 @@ static int ip_rcv_finish(struct sock *sk, struct sk_buff *skb) * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ - if (!skb_dst(skb)) { + if (!skb_valid_dst(skb)) { int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, iph->tos, skb->dev); if (unlikely(err)) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index cd3157c464e6..4c8e84e75871 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1690,6 +1690,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. */ + skb_dst_drop(skb); + if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) goto martian_source; -- cgit v1.2.3 From 0accfc268f4d3345693d3af3d5780aae3ad93d8d Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:43:57 +0200 Subject: arp: Inherit metadata dst when creating ARP requests If output device wants to see the dst, inherit the dst of the original skb and pass it on to generate the ARP request. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/arp.c | 65 +++++++++++++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 933a92820d26..1d59e50ce8b7 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -291,6 +291,40 @@ static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) kfree_skb(skb); } +/* Create and send an arp packet. */ +static void arp_send_dst(int type, int ptype, __be32 dest_ip, + struct net_device *dev, __be32 src_ip, + const unsigned char *dest_hw, + const unsigned char *src_hw, + const unsigned char *target_hw, struct sk_buff *oskb) +{ + struct sk_buff *skb; + + /* arp on this interface. */ + if (dev->flags & IFF_NOARP) + return; + + skb = arp_create(type, ptype, dest_ip, dev, src_ip, + dest_hw, src_hw, target_hw); + if (!skb) + return; + + if (oskb) + skb_dst_copy(skb, oskb); + + arp_xmit(skb); +} + +void arp_send(int type, int ptype, __be32 dest_ip, + struct net_device *dev, __be32 src_ip, + const unsigned char *dest_hw, const unsigned char *src_hw, + const unsigned char *target_hw) +{ + arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, + target_hw, NULL); +} +EXPORT_SYMBOL(arp_send); + static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) { __be32 saddr = 0; @@ -346,8 +380,9 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) } } - arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, - dst_hw, dev->dev_addr, NULL); + arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, + dst_hw, dev->dev_addr, NULL, + dev->priv_flags & IFF_XMIT_DST_RELEASE ? NULL : skb); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) @@ -596,32 +631,6 @@ void arp_xmit(struct sk_buff *skb) } EXPORT_SYMBOL(arp_xmit); -/* - * Create and send an arp packet. - */ -void arp_send(int type, int ptype, __be32 dest_ip, - struct net_device *dev, __be32 src_ip, - const unsigned char *dest_hw, const unsigned char *src_hw, - const unsigned char *target_hw) -{ - struct sk_buff *skb; - - /* - * No arp on this interface. - */ - - if (dev->flags&IFF_NOARP) - return; - - skb = arp_create(type, ptype, dest_ip, dev, src_ip, - dest_hw, src_hw, target_hw); - if (!skb) - return; - - arp_xmit(skb); -} -EXPORT_SYMBOL(arp_send); - /* * Process an arp request. */ -- cgit v1.2.3 From 1b7179d3adff0ab71f85ee24d7de28ccb7734b89 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:43:59 +0200 Subject: route: Extend flow representation with tunnel key Add a new flowi_tunnel structure which is a subset of ip_tunnel_key to allow routes to match on tunnel metadata. For now, the tunnel id is added to flowi_tunnel which allows for routes to be bound to specific virtual tunnels. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/flow.h | 8 ++++++++ net/ipv4/fib_frontend.c | 2 ++ net/ipv4/route.c | 8 ++++++++ 3 files changed, 18 insertions(+) (limited to 'net') diff --git a/include/net/flow.h b/include/net/flow.h index 8109a159d1b3..3098ae33a178 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -19,6 +19,10 @@ #define LOOPBACK_IFINDEX 1 +struct flowi_tunnel { + __be64 tun_id; +}; + struct flowi_common { int flowic_oif; int flowic_iif; @@ -30,6 +34,7 @@ struct flowi_common { #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 __u32 flowic_secid; + struct flowi_tunnel flowic_tun_key; }; union flowi_uli { @@ -66,6 +71,7 @@ struct flowi4 { #define flowi4_proto __fl_common.flowic_proto #define flowi4_flags __fl_common.flowic_flags #define flowi4_secid __fl_common.flowic_secid +#define flowi4_tun_key __fl_common.flowic_tun_key /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; @@ -95,6 +101,7 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->flowi4_proto = proto; fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; + fl4->flowi4_tun_key.tun_id = 0; fl4->daddr = daddr; fl4->saddr = saddr; fl4->fl4_dport = dport; @@ -165,6 +172,7 @@ struct flowi { #define flowi_proto u.__fl_common.flowic_proto #define flowi_flags u.__fl_common.flowic_flags #define flowi_secid u.__fl_common.flowic_secid +#define flowi_tun_key u.__fl_common.flowic_tun_key } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 9b2019cc3586..6b98de0d7949 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -280,6 +280,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_scope = scope; fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0; + fl4.flowi4_tun_key.tun_id = 0; if (!fib_lookup(net, &fl4, &res, 0)) return FIB_RES_PREFSRC(net, res); } else { @@ -313,6 +314,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.saddr = dst; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_tun_key.tun_id = 0; no_addr = idev->ifa_list == NULL; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4c8e84e75871..91da18be0a71 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -91,6 +91,7 @@ #include #include #include +#include #include #include #include @@ -110,6 +111,7 @@ #include #endif #include +#include #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) @@ -1673,6 +1675,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, { struct fib_result res; struct in_device *in_dev = __in_dev_get_rcu(dev); + struct ip_tunnel_info *tun_info; struct flowi4 fl4; unsigned int flags = 0; u32 itag = 0; @@ -1690,6 +1693,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. */ + tun_info = skb_tunnel_info(skb); + if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX) + fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; + else + fl4.flowi4_tun_key.tun_id = 0; skb_dst_drop(skb); if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) -- cgit v1.2.3 From 3093fbe7ff4bc7d1571fc217dade1cf80330a714 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:44:00 +0200 Subject: route: Per route IP tunnel metadata via lightweight tunnel This introduces a new IP tunnel lightweight tunnel type which allows to specify IP tunnel instructions per route. Only IPv4 is supported at this point. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 10 +++- include/net/dst_metadata.h | 12 ++++- include/net/ip_tunnels.h | 7 ++- include/uapi/linux/lwtunnel.h | 1 + include/uapi/linux/rtnetlink.h | 15 ++++++ net/ipv4/ip_tunnel_core.c | 114 +++++++++++++++++++++++++++++++++++++++++ net/ipv4/route.c | 2 +- net/openvswitch/vport.h | 1 + 8 files changed, 157 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 06c092b05a51..9486d7ec128c 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1935,7 +1935,7 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, struct vxlan_rdst *rdst, bool did_rsc) { - struct ip_tunnel_info *info = skb_tunnel_info(skb); + struct ip_tunnel_info *info; struct vxlan_dev *vxlan = netdev_priv(dev); struct sock *sk = vxlan->vn_sock->sock->sk; struct rtable *rt = NULL; @@ -1952,6 +1952,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, int err; u32 flags = vxlan->flags; + /* FIXME: Support IPv6 */ + info = skb_tunnel_info(skb, AF_INET); + if (rdst) { dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port; vni = rdst->remote_vni; @@ -2141,12 +2144,15 @@ tx_free: static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) { struct vxlan_dev *vxlan = netdev_priv(dev); - const struct ip_tunnel_info *info = skb_tunnel_info(skb); + const struct ip_tunnel_info *info; struct ethhdr *eth; bool did_rsc = false; struct vxlan_rdst *rdst, *fdst = NULL; struct vxlan_fdb *f; + /* FIXME: Support IPv6 */ + info = skb_tunnel_info(skb, AF_INET); + skb_reset_mac_header(skb); eth = eth_hdr(skb); diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index e843937fb30a..7b0306894663 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -23,13 +23,23 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) return NULL; } -static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) +static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb, + int family) { struct metadata_dst *md_dst = skb_metadata_dst(skb); + struct rtable *rt; if (md_dst) return &md_dst->u.tun_info; + switch (family) { + case AF_INET: + rt = (struct rtable *)skb_dst(skb); + if (rt && rt->rt_lwtstate) + return lwt_tun_info(rt->rt_lwtstate); + break; + } + return NULL; } diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index d11530f1c1e2..0b7e18cfa0b4 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -9,9 +9,9 @@ #include #include #include -#include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -298,6 +298,11 @@ static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info, size_t n) return info + 1; } +static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) +{ + return (struct ip_tunnel_info *)lwtstate->data; +} + #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index aa611d931a31..31377bbea3f8 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -6,6 +6,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_NONE, LWTUNNEL_ENCAP_MPLS, + LWTUNNEL_ENCAP_IP, __LWTUNNEL_ENCAP_MAX, }; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 0d3d3cc43356..47d24cb3fbc1 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -286,6 +286,21 @@ enum rt_class_t { /* Routing message attributes */ +enum ip_tunnel_t { + IP_TUN_UNSPEC, + IP_TUN_ID, + IP_TUN_DST, + IP_TUN_SRC, + IP_TUN_TTL, + IP_TUN_TOS, + IP_TUN_SPORT, + IP_TUN_DPORT, + IP_TUN_FLAGS, + __IP_TUN_MAX, +}; + +#define IP_TUN_MAX (__IP_TUN_MAX - 1) + enum rtattr_type_t { RTA_UNSPEC, RTA_DST, diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 6a51a71a6c67..025b76e803fd 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -190,3 +190,117 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, return tot; } EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); + +static const struct nla_policy ip_tun_policy[IP_TUN_MAX + 1] = { + [IP_TUN_ID] = { .type = NLA_U64 }, + [IP_TUN_DST] = { .type = NLA_U32 }, + [IP_TUN_SRC] = { .type = NLA_U32 }, + [IP_TUN_TTL] = { .type = NLA_U8 }, + [IP_TUN_TOS] = { .type = NLA_U8 }, + [IP_TUN_SPORT] = { .type = NLA_U16 }, + [IP_TUN_DPORT] = { .type = NLA_U16 }, + [IP_TUN_FLAGS] = { .type = NLA_U16 }, +}; + +static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, + struct lwtunnel_state **ts) +{ + struct ip_tunnel_info *tun_info; + struct lwtunnel_state *new_state; + struct nlattr *tb[IP_TUN_MAX + 1]; + int err; + + err = nla_parse_nested(tb, IP_TUN_MAX, attr, ip_tun_policy); + if (err < 0) + return err; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + if (!new_state) + return -ENOMEM; + + new_state->type = LWTUNNEL_ENCAP_IP; + + tun_info = lwt_tun_info(new_state); + + if (tb[IP_TUN_ID]) + tun_info->key.tun_id = nla_get_u64(tb[IP_TUN_ID]); + + if (tb[IP_TUN_DST]) + tun_info->key.ipv4_dst = nla_get_be32(tb[IP_TUN_DST]); + + if (tb[IP_TUN_SRC]) + tun_info->key.ipv4_src = nla_get_be32(tb[IP_TUN_SRC]); + + if (tb[IP_TUN_TTL]) + tun_info->key.ipv4_ttl = nla_get_u8(tb[IP_TUN_TTL]); + + if (tb[IP_TUN_TOS]) + tun_info->key.ipv4_tos = nla_get_u8(tb[IP_TUN_TOS]); + + if (tb[IP_TUN_SPORT]) + tun_info->key.tp_src = nla_get_be16(tb[IP_TUN_SPORT]); + + if (tb[IP_TUN_DPORT]) + tun_info->key.tp_dst = nla_get_be16(tb[IP_TUN_DPORT]); + + if (tb[IP_TUN_FLAGS]) + tun_info->key.tun_flags = nla_get_u16(tb[IP_TUN_FLAGS]); + + tun_info->mode = IP_TUNNEL_INFO_TX; + tun_info->options = NULL; + tun_info->options_len = 0; + + *ts = new_state; + + return 0; +} + +static int ip_tun_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); + + if (nla_put_u64(skb, IP_TUN_ID, tun_info->key.tun_id) || + nla_put_be32(skb, IP_TUN_DST, tun_info->key.ipv4_dst) || + nla_put_be32(skb, IP_TUN_SRC, tun_info->key.ipv4_src) || + nla_put_u8(skb, IP_TUN_TOS, tun_info->key.ipv4_tos) || + nla_put_u8(skb, IP_TUN_TTL, tun_info->key.ipv4_ttl) || + nla_put_u16(skb, IP_TUN_SPORT, tun_info->key.tp_src) || + nla_put_u16(skb, IP_TUN_DPORT, tun_info->key.tp_dst) || + nla_put_u16(skb, IP_TUN_FLAGS, tun_info->key.tun_flags)) + return -ENOMEM; + + return 0; +} + +static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + return nla_total_size(8) /* IP_TUN_ID */ + + nla_total_size(4) /* IP_TUN_DST */ + + nla_total_size(4) /* IP_TUN_SRC */ + + nla_total_size(1) /* IP_TUN_TOS */ + + nla_total_size(1) /* IP_TUN_TTL */ + + nla_total_size(2) /* IP_TUN_SPORT */ + + nla_total_size(2) /* IP_TUN_DPORT */ + + nla_total_size(2); /* IP_TUN_FLAGS */ +} + +static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { + .build_state = ip_tun_build_state, + .fill_encap = ip_tun_fill_encap_info, + .get_encap_size = ip_tun_encap_nlsize, +}; + +static int __init ip_tunnel_core_init(void) +{ + lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); + + return 0; +} +module_init(ip_tunnel_core_init); + +static void __exit ip_tunnel_core_exit(void) +{ + lwtunnel_encap_del_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); +} +module_exit(ip_tunnel_core_exit); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 91da18be0a71..519ec232818d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1693,7 +1693,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. */ - tun_info = skb_tunnel_info(skb); + tun_info = skb_tunnel_info(skb, AF_INET); if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX) fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; else diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 4750fb673a9f..75d68248ba69 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -27,6 +27,7 @@ #include #include #include +#include #include "datapath.h" -- cgit v1.2.3 From e7030878fc8448492b6e5cecd574043f63271298 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:44:01 +0200 Subject: fib: Add fib rule match on tunnel id This add the ability to select a routing table based on the tunnel id which allows to maintain separate routing tables for each virtual tunnel network. ip rule add from all tunnel-id 100 lookup 100 ip rule add from all tunnel-id 200 lookup 200 A new static key controls the collection of metadata at tunnel level upon demand. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 3 ++- include/net/fib_rules.h | 1 + include/net/ip_tunnels.h | 11 +++++++++++ include/uapi/linux/fib_rules.h | 2 +- net/core/fib_rules.c | 24 ++++++++++++++++++++++-- net/ipv4/ip_tunnel_core.c | 16 ++++++++++++++++ 6 files changed, 53 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 9486d7ec128c..2587ac84f71a 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -143,7 +143,8 @@ static struct workqueue_struct *vxlan_wq; static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) { - return vs->flags & VXLAN_F_COLLECT_METADATA; + return vs->flags & VXLAN_F_COLLECT_METADATA || + ip_tunnel_collect_metadata(); } #if IS_ENABLED(CONFIG_IPV6) diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 903a55efbffe..4e8f804f4589 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -19,6 +19,7 @@ struct fib_rule { u8 action; /* 3 bytes hole, try to use */ u32 target; + __be64 tun_id; struct fib_rule __rcu *ctarget; struct net *fr_net; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 0b7e18cfa0b4..0a5a7763eec2 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -303,6 +303,17 @@ static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstat return (struct ip_tunnel_info *)lwtstate->data; } +extern struct static_key ip_tunnel_metadata_cnt; + +/* Returns > 0 if metadata should be collected */ +static inline int ip_tunnel_collect_metadata(void) +{ + return static_key_false(&ip_tunnel_metadata_cnt); +} + +void ip_tunnel_need_metadata(void); +void ip_tunnel_unneed_metadata(void); + #endif /* CONFIG_INET */ #endif /* __NET_IP_TUNNELS_H */ diff --git a/include/uapi/linux/fib_rules.h b/include/uapi/linux/fib_rules.h index 2b82d7e30974..96161b8202b5 100644 --- a/include/uapi/linux/fib_rules.h +++ b/include/uapi/linux/fib_rules.h @@ -43,7 +43,7 @@ enum { FRA_UNUSED5, FRA_FWMARK, /* mark */ FRA_FLOW, /* flow/class id */ - FRA_UNUSED6, + FRA_TUN_ID, FRA_SUPPRESS_IFGROUP, FRA_SUPPRESS_PREFIXLEN, FRA_TABLE, /* Extended table id */ diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 9a12668f7d62..ae8306e7c56f 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -16,6 +16,7 @@ #include #include #include +#include int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table, u32 flags) @@ -186,6 +187,9 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, if ((rule->mark ^ fl->flowi_mark) & rule->mark_mask) goto out; + if (rule->tun_id && (rule->tun_id != fl->flowi_tun_key.tun_id)) + goto out; + ret = ops->match(rule, fl, flags); out: return (rule->flags & FIB_RULE_INVERT) ? !ret : ret; @@ -330,6 +334,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (tb[FRA_FWMASK]) rule->mark_mask = nla_get_u32(tb[FRA_FWMASK]); + if (tb[FRA_TUN_ID]) + rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); + rule->action = frh->action; rule->flags = frh->flags; rule->table = frh_get_table(frh, tb); @@ -407,6 +414,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh) if (unresolved) ops->unresolved_rules++; + if (rule->tun_id) + ip_tunnel_need_metadata(); + notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid); flush_route_cache(ops); rules_ops_put(ops); @@ -473,6 +483,10 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) (rule->mark_mask != nla_get_u32(tb[FRA_FWMASK]))) continue; + if (tb[FRA_TUN_ID] && + (rule->tun_id != nla_get_be64(tb[FRA_TUN_ID]))) + continue; + if (!ops->compare(rule, frh, tb)) continue; @@ -487,6 +501,9 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh) goto errout; } + if (rule->tun_id) + ip_tunnel_unneed_metadata(); + list_del_rcu(&rule->list); if (rule->action == FR_ACT_GOTO) { @@ -535,7 +552,8 @@ static inline size_t fib_rule_nlmsg_size(struct fib_rules_ops *ops, + nla_total_size(4) /* FRA_SUPPRESS_PREFIXLEN */ + nla_total_size(4) /* FRA_SUPPRESS_IFGROUP */ + nla_total_size(4) /* FRA_FWMARK */ - + nla_total_size(4); /* FRA_FWMASK */ + + nla_total_size(4) /* FRA_FWMASK */ + + nla_total_size(8); /* FRA_TUN_ID */ if (ops->nlmsg_payload) payload += ops->nlmsg_payload(rule); @@ -591,7 +609,9 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule, ((rule->mark_mask || rule->mark) && nla_put_u32(skb, FRA_FWMASK, rule->mark_mask)) || (rule->target && - nla_put_u32(skb, FRA_GOTO, rule->target))) + nla_put_u32(skb, FRA_GOTO, rule->target)) || + (rule->tun_id && + nla_put_be64(skb, FRA_TUN_ID, rule->tun_id))) goto nla_put_failure; if (rule->suppress_ifgroup != -1) { diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 025b76e803fd..630e6d5712e8 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -304,3 +305,18 @@ static void __exit ip_tunnel_core_exit(void) lwtunnel_encap_del_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); } module_exit(ip_tunnel_core_exit); + +struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL(ip_tunnel_metadata_cnt); + +void ip_tunnel_need_metadata(void) +{ + static_key_slow_inc(&ip_tunnel_metadata_cnt); +} +EXPORT_SYMBOL_GPL(ip_tunnel_need_metadata); + +void ip_tunnel_unneed_metadata(void) +{ + static_key_slow_dec(&ip_tunnel_metadata_cnt); +} +EXPORT_SYMBOL_GPL(ip_tunnel_unneed_metadata); -- cgit v1.2.3 From 34ae932a40369be6bd6ea97d66b6686361b4370d Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:44:03 +0200 Subject: openvswitch: Make tunnel set action attach a metadata dst Utilize the new metadata dst to attach encapsulation instructions to the skb. The existing egress_tun_info via the OVS_CB() is left in place until all tunnel vports have been converted to the new method. Signed-off-by: Thomas Graf Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 10 ++++++- net/openvswitch/datapath.c | 8 +++--- net/openvswitch/flow.h | 5 ++++ net/openvswitch/flow_netlink.c | 64 +++++++++++++++++++++++++++++++++++++----- net/openvswitch/flow_netlink.h | 1 + net/openvswitch/flow_table.c | 4 ++- 6 files changed, 79 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 27c1687cfd92..cf04c2f8b32a 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -733,7 +733,15 @@ static int execute_set_action(struct sk_buff *skb, { /* Only tunnel set execution is supported without a mask. */ if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) { - OVS_CB(skb)->egress_tun_info = nla_data(a); + struct ovs_tunnel_info *tun = nla_data(a); + + skb_dst_drop(skb); + dst_hold((struct dst_entry *)tun->tun_dst); + skb_dst_set(skb, (struct dst_entry *)tun->tun_dst); + + /* FIXME: Remove when all vports have been converted */ + OVS_CB(skb)->egress_tun_info = &tun->tun_dst->u.tun_info; + return 0; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ff8c4a4c1609..02082107c74c 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1018,7 +1018,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } ovs_unlock(); - ovs_nla_free_flow_actions(old_acts); + ovs_nla_free_flow_actions_rcu(old_acts); ovs_flow_free(new_flow, false); } @@ -1030,7 +1030,7 @@ err_unlock_ovs: ovs_unlock(); kfree_skb(reply); err_kfree_acts: - kfree(acts); + ovs_nla_free_flow_actions(acts); err_kfree_flow: ovs_flow_free(new_flow, false); error: @@ -1157,7 +1157,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) if (reply) ovs_notify(&dp_flow_genl_family, reply, info); if (old_acts) - ovs_nla_free_flow_actions(old_acts); + ovs_nla_free_flow_actions_rcu(old_acts); return 0; @@ -1165,7 +1165,7 @@ err_unlock_ovs: ovs_unlock(); kfree_skb(reply); err_kfree_acts: - kfree(acts); + ovs_nla_free_flow_actions(acts); error: return error; } diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index cadc6c5c3545..b62cdb3e3589 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -33,6 +33,7 @@ #include #include #include +#include struct sk_buff; @@ -45,6 +46,10 @@ struct sk_buff; #define TUN_METADATA_OPTS(flow_key, opt_len) \ ((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len))) +struct ovs_tunnel_info { + struct metadata_dst *tun_dst; +}; + #define OVS_SW_FLOW_KEY_METADATA_SIZE \ (offsetof(struct sw_flow_key, recirc_id) + \ FIELD_SIZEOF(struct sw_flow_key, recirc_id)) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index ecfa530d3461..e7906dfb8814 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1548,11 +1548,48 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size, bool log) return sfa; } +static void ovs_nla_free_set_action(const struct nlattr *a) +{ + const struct nlattr *ovs_key = nla_data(a); + struct ovs_tunnel_info *ovs_tun; + + switch (nla_type(ovs_key)) { + case OVS_KEY_ATTR_TUNNEL_INFO: + ovs_tun = nla_data(ovs_key); + dst_release((struct dst_entry *)ovs_tun->tun_dst); + break; + } +} + +void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +{ + const struct nlattr *a; + int rem; + + if (!sf_acts) + return; + + nla_for_each_attr(a, sf_acts->actions, sf_acts->actions_len, rem) { + switch (nla_type(a)) { + case OVS_ACTION_ATTR_SET: + ovs_nla_free_set_action(a); + break; + } + } + + kfree(sf_acts); +} + +static void __ovs_nla_free_flow_actions(struct rcu_head *head) +{ + ovs_nla_free_flow_actions(container_of(head, struct sw_flow_actions, rcu)); +} + /* Schedules 'sf_acts' to be freed after the next RCU grace period. * The caller must hold rcu_read_lock for this to be sensible. */ -void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *sf_acts) { - kfree_rcu(sf_acts, rcu); + call_rcu(&sf_acts->rcu, __ovs_nla_free_flow_actions); } static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, @@ -1746,7 +1783,9 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, { struct sw_flow_match match; struct sw_flow_key key; + struct metadata_dst *tun_dst; struct ip_tunnel_info *tun_info; + struct ovs_tunnel_info *ovs_tun; struct nlattr *a; int err = 0, start, opts_type; @@ -1771,12 +1810,22 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, if (start < 0) return start; + tun_dst = metadata_dst_alloc(key.tun_opts_len, GFP_KERNEL); + if (!tun_dst) + return -ENOMEM; + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, - sizeof(*tun_info) + key.tun_opts_len, log); - if (IS_ERR(a)) + sizeof(*ovs_tun), log); + if (IS_ERR(a)) { + dst_release((struct dst_entry *)tun_dst); return PTR_ERR(a); + } + + ovs_tun = nla_data(a); + ovs_tun->tun_dst = tun_dst; - tun_info = nla_data(a); + tun_info = &tun_dst->u.tun_info; + tun_info->mode = IP_TUNNEL_INFO_TX; tun_info->key = key.tun_key; tun_info->options_len = key.tun_opts_len; @@ -2177,7 +2226,7 @@ int ovs_nla_copy_actions(const struct nlattr *attr, err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type, key->eth.tci, log); if (err) - kfree(*sfa); + ovs_nla_free_flow_actions(*sfa); return err; } @@ -2227,7 +2276,8 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) switch (key_type) { case OVS_KEY_ATTR_TUNNEL_INFO: { - struct ip_tunnel_info *tun_info = nla_data(ovs_key); + struct ovs_tunnel_info *ovs_tun = nla_data(ovs_key); + struct ip_tunnel_info *tun_info = &ovs_tun->tun_dst->u.tun_info; start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); if (!start) diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index ec53eb6e632b..acd074408f0a 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -69,5 +69,6 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb); void ovs_nla_free_flow_actions(struct sw_flow_actions *); +void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *); #endif /* flow_netlink.h */ diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 4613df8c8290..b70d845e7efb 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -18,6 +18,7 @@ #include "flow.h" #include "datapath.h" +#include "flow_netlink.h" #include #include #include @@ -143,7 +144,8 @@ static void flow_free(struct sw_flow *flow) if (ovs_identifier_is_key(&flow->id)) kfree(flow->id.unmasked_key); - kfree((struct sw_flow_actions __force *)flow->sf_acts); + if (flow->sf_acts) + ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts); for_each_node(node) if (flow->stats[node]) kmem_cache_free(flow_stats_cache, -- cgit v1.2.3 From be4ace6e6b1bc12e18b25fe764917e09a1f96d7b Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:44:04 +0200 Subject: openvswitch: Move dev pointer into vport itself This is the first step in representing all OVS vports as regular struct net_devices. Move the net_device pointer into the vport structure itself to get rid of struct vport_netdev. Signed-off-by: Thomas Graf Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 7 +-- net/openvswitch/dp_notify.c | 5 +-- net/openvswitch/vport-internal_dev.c | 37 +++++++--------- net/openvswitch/vport-netdev.c | 86 ++++++++++++++++-------------------- net/openvswitch/vport-netdev.h | 12 ----- net/openvswitch/vport.h | 3 +- 6 files changed, 59 insertions(+), 91 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 02082107c74c..19df28ee5094 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -188,7 +188,7 @@ static int get_dpifindex(const struct datapath *dp) local = ovs_vport_rcu(dp, OVSP_LOCAL); if (local) - ifindex = netdev_vport_priv(local)->dev->ifindex; + ifindex = local->dev->ifindex; else ifindex = 0; @@ -2219,13 +2219,10 @@ static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, struct vport *vport; hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) { - struct netdev_vport *netdev_vport; - if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL) continue; - netdev_vport = netdev_vport_priv(vport); - if (dev_net(netdev_vport->dev) == dnet) + if (dev_net(vport->dev) == dnet) list_add(&vport->detach_list, head); } } diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index 2c631fe76be1..a7a80a6b77b0 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -58,13 +58,10 @@ void ovs_dp_notify_wq(struct work_struct *work) struct hlist_node *n; hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { - struct netdev_vport *netdev_vport; - if (vport->ops->type != OVS_VPORT_TYPE_NETDEV) continue; - netdev_vport = netdev_vport_priv(vport); - if (!(netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH)) + if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH)) dp_detach_port_notify(vport); } } diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 6a55f7105505..a2c205d9a8d5 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -156,49 +156,44 @@ static void do_setup(struct net_device *netdev) static struct vport *internal_dev_create(const struct vport_parms *parms) { struct vport *vport; - struct netdev_vport *netdev_vport; struct internal_dev *internal_dev; int err; - vport = ovs_vport_alloc(sizeof(struct netdev_vport), - &ovs_internal_vport_ops, parms); + vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms); if (IS_ERR(vport)) { err = PTR_ERR(vport); goto error; } - netdev_vport = netdev_vport_priv(vport); - - netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev), - parms->name, NET_NAME_UNKNOWN, - do_setup); - if (!netdev_vport->dev) { + vport->dev = alloc_netdev(sizeof(struct internal_dev), + parms->name, NET_NAME_UNKNOWN, do_setup); + if (!vport->dev) { err = -ENOMEM; goto error_free_vport; } - dev_net_set(netdev_vport->dev, ovs_dp_get_net(vport->dp)); - internal_dev = internal_dev_priv(netdev_vport->dev); + dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); + internal_dev = internal_dev_priv(vport->dev); internal_dev->vport = vport; /* Restrict bridge port to current netns. */ if (vport->port_no == OVSP_LOCAL) - netdev_vport->dev->features |= NETIF_F_NETNS_LOCAL; + vport->dev->features |= NETIF_F_NETNS_LOCAL; rtnl_lock(); - err = register_netdevice(netdev_vport->dev); + err = register_netdevice(vport->dev); if (err) goto error_free_netdev; - dev_set_promiscuity(netdev_vport->dev, 1); + dev_set_promiscuity(vport->dev, 1); rtnl_unlock(); - netif_start_queue(netdev_vport->dev); + netif_start_queue(vport->dev); return vport; error_free_netdev: rtnl_unlock(); - free_netdev(netdev_vport->dev); + free_netdev(vport->dev); error_free_vport: ovs_vport_free(vport); error: @@ -207,21 +202,19 @@ error: static void internal_dev_destroy(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - - netif_stop_queue(netdev_vport->dev); + netif_stop_queue(vport->dev); rtnl_lock(); - dev_set_promiscuity(netdev_vport->dev, -1); + dev_set_promiscuity(vport->dev, -1); /* unregister_netdevice() waits for an RCU grace period. */ - unregister_netdevice(netdev_vport->dev); + unregister_netdevice(vport->dev); rtnl_unlock(); } static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) { - struct net_device *netdev = netdev_vport_priv(vport)->dev; + struct net_device *netdev = vport->dev; int len; if (unlikely(!(netdev->flags & IFF_UP))) { diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 33e6d6e2908f..1c9696693f66 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -83,104 +83,97 @@ static struct net_device *get_dpdev(const struct datapath *dp) local = ovs_vport_ovsl(dp, OVSP_LOCAL); BUG_ON(!local); - return netdev_vport_priv(local)->dev; + return local->dev; } -static struct vport *netdev_create(const struct vport_parms *parms) +static struct vport *netdev_link(struct vport *vport, const char *name) { - struct vport *vport; - struct netdev_vport *netdev_vport; int err; - vport = ovs_vport_alloc(sizeof(struct netdev_vport), - &ovs_netdev_vport_ops, parms); - if (IS_ERR(vport)) { - err = PTR_ERR(vport); - goto error; - } - - netdev_vport = netdev_vport_priv(vport); - - netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name); - if (!netdev_vport->dev) { + vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name); + if (!vport->dev) { err = -ENODEV; goto error_free_vport; } - if (netdev_vport->dev->flags & IFF_LOOPBACK || - netdev_vport->dev->type != ARPHRD_ETHER || - ovs_is_internal_dev(netdev_vport->dev)) { + if (vport->dev->flags & IFF_LOOPBACK || + vport->dev->type != ARPHRD_ETHER || + ovs_is_internal_dev(vport->dev)) { err = -EINVAL; goto error_put; } rtnl_lock(); - err = netdev_master_upper_dev_link(netdev_vport->dev, + err = netdev_master_upper_dev_link(vport->dev, get_dpdev(vport->dp)); if (err) goto error_unlock; - err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook, + err = netdev_rx_handler_register(vport->dev, netdev_frame_hook, vport); if (err) goto error_master_upper_dev_unlink; - dev_disable_lro(netdev_vport->dev); - dev_set_promiscuity(netdev_vport->dev, 1); - netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH; + dev_disable_lro(vport->dev); + dev_set_promiscuity(vport->dev, 1); + vport->dev->priv_flags |= IFF_OVS_DATAPATH; rtnl_unlock(); return vport; error_master_upper_dev_unlink: - netdev_upper_dev_unlink(netdev_vport->dev, get_dpdev(vport->dp)); + netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp)); error_unlock: rtnl_unlock(); error_put: - dev_put(netdev_vport->dev); + dev_put(vport->dev); error_free_vport: ovs_vport_free(vport); -error: return ERR_PTR(err); } +static struct vport *netdev_create(const struct vport_parms *parms) +{ + struct vport *vport; + + vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + return netdev_link(vport, parms->name); +} + static void free_port_rcu(struct rcu_head *rcu) { - struct netdev_vport *netdev_vport = container_of(rcu, - struct netdev_vport, rcu); + struct vport *vport = container_of(rcu, struct vport, rcu); - dev_put(netdev_vport->dev); - ovs_vport_free(vport_from_priv(netdev_vport)); + dev_put(vport->dev); + ovs_vport_free(vport); } void ovs_netdev_detach_dev(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - ASSERT_RTNL(); - netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; - netdev_rx_handler_unregister(netdev_vport->dev); - netdev_upper_dev_unlink(netdev_vport->dev, - netdev_master_upper_dev_get(netdev_vport->dev)); - dev_set_promiscuity(netdev_vport->dev, -1); + vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; + netdev_rx_handler_unregister(vport->dev); + netdev_upper_dev_unlink(vport->dev, + netdev_master_upper_dev_get(vport->dev)); + dev_set_promiscuity(vport->dev, -1); } static void netdev_destroy(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - rtnl_lock(); - if (netdev_vport->dev->priv_flags & IFF_OVS_DATAPATH) + if (vport->dev->priv_flags & IFF_OVS_DATAPATH) ovs_netdev_detach_dev(vport); rtnl_unlock(); - call_rcu(&netdev_vport->rcu, free_port_rcu); + call_rcu(&vport->rcu, free_port_rcu); } const char *ovs_netdev_get_name(const struct vport *vport) { - const struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - return netdev_vport->dev->name; + return vport->dev->name; } static unsigned int packet_length(const struct sk_buff *skb) @@ -195,18 +188,17 @@ static unsigned int packet_length(const struct sk_buff *skb) static int netdev_send(struct vport *vport, struct sk_buff *skb) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - int mtu = netdev_vport->dev->mtu; + int mtu = vport->dev->mtu; int len; if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", - netdev_vport->dev->name, + vport->dev->name, packet_length(skb), mtu); goto drop; } - skb->dev = netdev_vport->dev; + skb->dev = vport->dev; len = skb->len; dev_queue_xmit(skb); diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h index 6f7038e79c52..1c52aed255c5 100644 --- a/net/openvswitch/vport-netdev.h +++ b/net/openvswitch/vport-netdev.h @@ -26,18 +26,6 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev); -struct netdev_vport { - struct rcu_head rcu; - - struct net_device *dev; -}; - -static inline struct netdev_vport * -netdev_vport_priv(const struct vport *vport) -{ - return vport_priv(vport); -} - const char *ovs_netdev_get_name(const struct vport *); void ovs_netdev_detach_dev(struct vport *); diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 75d68248ba69..e05ec68439d1 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -107,7 +107,7 @@ struct vport_portids { * @detach_list: list used for detaching vport in net-exit call. */ struct vport { - struct rcu_head rcu; + struct net_device *dev; struct datapath *dp; struct vport_portids __rcu *upcall_portids; u16 port_no; @@ -120,6 +120,7 @@ struct vport { struct vport_err_stats err_stats; struct list_head detach_list; + struct rcu_head rcu; }; /** -- cgit v1.2.3 From c9db965c524ea27451e60d5ddcd242f6c33a70fd Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:44:05 +0200 Subject: openvswitch: Abstract vport name through ovs_vport_name() This allows to get rid of the get_name() vport ops later on. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 4 ++-- net/openvswitch/vport-internal_dev.c | 1 - net/openvswitch/vport-netdev.c | 6 ------ net/openvswitch/vport-netdev.h | 1 - net/openvswitch/vport.c | 4 ++-- net/openvswitch/vport.h | 5 +++++ 6 files changed, 9 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 19df28ee5094..ffe984f5b95c 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -176,7 +176,7 @@ static inline struct datapath *get_dp(struct net *net, int dp_ifindex) const char *ovs_dp_name(const struct datapath *dp) { struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); - return vport->ops->get_name(vport); + return ovs_vport_name(vport); } static int get_dpifindex(const struct datapath *dp) @@ -1800,7 +1800,7 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || nla_put_string(skb, OVS_VPORT_ATTR_NAME, - vport->ops->get_name(vport))) + ovs_vport_name(vport))) goto nla_put_failure; ovs_vport_get_stats(vport, &vport_stats); diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index a2c205d9a8d5..c058bbf876c3 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -242,7 +242,6 @@ static struct vport_ops ovs_internal_vport_ops = { .type = OVS_VPORT_TYPE_INTERNAL, .create = internal_dev_create, .destroy = internal_dev_destroy, - .get_name = ovs_netdev_get_name, .send = internal_dev_recv, }; diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 1c9696693f66..e682bdc34a5c 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -171,11 +171,6 @@ static void netdev_destroy(struct vport *vport) call_rcu(&vport->rcu, free_port_rcu); } -const char *ovs_netdev_get_name(const struct vport *vport) -{ - return vport->dev->name; -} - static unsigned int packet_length(const struct sk_buff *skb) { unsigned int length = skb->len - ETH_HLEN; @@ -223,7 +218,6 @@ static struct vport_ops ovs_netdev_vport_ops = { .type = OVS_VPORT_TYPE_NETDEV, .create = netdev_create, .destroy = netdev_destroy, - .get_name = ovs_netdev_get_name, .send = netdev_send, }; diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h index 1c52aed255c5..684fb88723a4 100644 --- a/net/openvswitch/vport-netdev.h +++ b/net/openvswitch/vport-netdev.h @@ -26,7 +26,6 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev); -const char *ovs_netdev_get_name(const struct vport *); void ovs_netdev_detach_dev(struct vport *); int __init ovs_netdev_init(void); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index af23ba077836..d14f59403c5e 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -113,7 +113,7 @@ struct vport *ovs_vport_locate(const struct net *net, const char *name) struct vport *vport; hlist_for_each_entry_rcu(vport, bucket, hash_node) - if (!strcmp(name, vport->ops->get_name(vport)) && + if (!strcmp(name, ovs_vport_name(vport)) && net_eq(ovs_dp_get_net(vport->dp), net)) return vport; @@ -226,7 +226,7 @@ struct vport *ovs_vport_add(const struct vport_parms *parms) } bucket = hash_bucket(ovs_dp_get_net(vport->dp), - vport->ops->get_name(vport)); + ovs_vport_name(vport)); hlist_add_head_rcu(&vport->hash_node, bucket); return vport; } diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index e05ec68439d1..1a689c28b5a6 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -237,6 +237,11 @@ static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); } +static inline const char *ovs_vport_name(struct vport *vport) +{ + return vport->dev ? vport->dev->name : vport->ops->get_name(vport); +} + int ovs_vport_ops_register(struct vport_ops *ops); void ovs_vport_ops_unregister(struct vport_ops *ops); -- cgit v1.2.3 From 614732eaa12dd462c0ab274700bed14f36afea5e Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 21 Jul 2015 10:44:06 +0200 Subject: openvswitch: Use regular VXLAN net_device device This gets rid of all OVS specific VXLAN code in the receive and transmit path by using a VXLAN net_device to represent the vport. Only a small shim layer remains which takes care of handling the VXLAN specific OVS Netlink configuration. Unexports vxlan_sock_add(), vxlan_sock_release(), vxlan_xmit_skb() since they are no longer needed. Signed-off-by: Thomas Graf Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 242 +++++++++++++++---------------- include/net/rtnetlink.h | 1 + include/net/vxlan.h | 24 +-- net/core/rtnetlink.c | 26 ++-- net/openvswitch/Kconfig | 12 -- net/openvswitch/Makefile | 1 - net/openvswitch/flow_netlink.c | 6 +- net/openvswitch/vport-netdev.c | 201 ++++++++++++++++++++++++- net/openvswitch/vport-vxlan.c | 322 ----------------------------------------- net/openvswitch/vport-vxlan.h | 11 -- 10 files changed, 339 insertions(+), 507 deletions(-) delete mode 100644 net/openvswitch/vport-vxlan.c delete mode 100644 net/openvswitch/vport-vxlan.h (limited to 'net') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 30e1f215af73..e9feefb41f0b 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -75,6 +75,9 @@ static struct rtnl_link_ops vxlan_link_ops; static const u8 all_zeros_mac[ETH_ALEN]; +static struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, + bool no_share, u32 flags); + /* per-network namespace private data for this module */ struct vxlan_net { struct list_head vxlan_list; @@ -1027,7 +1030,7 @@ static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev) return false; } -void vxlan_sock_release(struct vxlan_sock *vs) +static void vxlan_sock_release(struct vxlan_sock *vs) { struct sock *sk = vs->sock->sk; struct net *net = sock_net(sk); @@ -1043,7 +1046,6 @@ void vxlan_sock_release(struct vxlan_sock *vs) queue_work(vxlan_wq, &vs->del_work); } -EXPORT_SYMBOL_GPL(vxlan_sock_release); /* Update multicast group membership when first VNI on * multicast address is brought up @@ -1126,6 +1128,102 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, return vh; } +static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, + struct vxlan_metadata *md, u32 vni, + struct metadata_dst *tun_dst) +{ + struct iphdr *oip = NULL; + struct ipv6hdr *oip6 = NULL; + struct vxlan_dev *vxlan; + struct pcpu_sw_netstats *stats; + union vxlan_addr saddr; + int err = 0; + union vxlan_addr *remote_ip; + + /* For flow based devices, map all packets to VNI 0 */ + if (vs->flags & VXLAN_F_FLOW_BASED) + vni = 0; + + /* Is this VNI defined? */ + vxlan = vxlan_vs_find_vni(vs, vni); + if (!vxlan) + goto drop; + + remote_ip = &vxlan->default_dst.remote_ip; + skb_reset_mac_header(skb); + skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev))); + skb->protocol = eth_type_trans(skb, vxlan->dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + + /* Ignore packet loops (and multicast echo) */ + if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) + goto drop; + + /* Re-examine inner Ethernet packet */ + if (remote_ip->sa.sa_family == AF_INET) { + oip = ip_hdr(skb); + saddr.sin.sin_addr.s_addr = oip->saddr; + saddr.sa.sa_family = AF_INET; +#if IS_ENABLED(CONFIG_IPV6) + } else { + oip6 = ipv6_hdr(skb); + saddr.sin6.sin6_addr = oip6->saddr; + saddr.sa.sa_family = AF_INET6; +#endif + } + + if (tun_dst) { + skb_dst_set(skb, (struct dst_entry *)tun_dst); + tun_dst = NULL; + } + + if ((vxlan->flags & VXLAN_F_LEARN) && + vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source)) + goto drop; + + skb_reset_network_header(skb); + /* In flow-based mode, GBP is carried in dst_metadata */ + if (!(vs->flags & VXLAN_F_FLOW_BASED)) + skb->mark = md->gbp; + + if (oip6) + err = IP6_ECN_decapsulate(oip6, skb); + if (oip) + err = IP_ECN_decapsulate(oip, skb); + + if (unlikely(err)) { + if (log_ecn_error) { + if (oip6) + net_info_ratelimited("non-ECT from %pI6\n", + &oip6->saddr); + if (oip) + net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", + &oip->saddr, oip->tos); + } + if (err > 1) { + ++vxlan->dev->stats.rx_frame_errors; + ++vxlan->dev->stats.rx_errors; + goto drop; + } + } + + stats = this_cpu_ptr(vxlan->dev->tstats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += skb->len; + u64_stats_update_end(&stats->syncp); + + netif_rx(skb); + + return; +drop: + if (tun_dst) + dst_release((struct dst_entry *)tun_dst); + + /* Consume bad packet */ + kfree_skb(skb); +} + /* Callback from net/ipv4/udp.c to receive packets */ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) { @@ -1192,7 +1290,6 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) info->key.tun_flags |= TUNNEL_CSUM; md = ip_tunnel_info_opts(info, sizeof(*md)); - md->tun_dst = tun_dst; } else { memset(md, 0, sizeof(*md)); } @@ -1231,8 +1328,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) goto bad_flags; } - md->vni = vxh->vx_vni; - vs->rcv(vs, skb, md); + vxlan_rcv(vs, skb, md, vni >> 8, tun_dst); return 0; drop: @@ -1252,104 +1348,6 @@ error: return 1; } -static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, - struct vxlan_metadata *md) -{ - struct iphdr *oip = NULL; - struct ipv6hdr *oip6 = NULL; - struct vxlan_dev *vxlan; - struct pcpu_sw_netstats *stats; - union vxlan_addr saddr; - __u32 vni; - int err = 0; - union vxlan_addr *remote_ip; - - /* For flow based devices, map all packets to VNI 0 */ - if (vs->flags & VXLAN_F_FLOW_BASED) - vni = 0; - else - vni = ntohl(md->vni) >> 8; - - /* Is this VNI defined? */ - vxlan = vxlan_vs_find_vni(vs, vni); - if (!vxlan) - goto drop; - - remote_ip = &vxlan->default_dst.remote_ip; - skb_reset_mac_header(skb); - skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev))); - skb->protocol = eth_type_trans(skb, vxlan->dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - - /* Ignore packet loops (and multicast echo) */ - if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) - goto drop; - - /* Re-examine inner Ethernet packet */ - if (remote_ip->sa.sa_family == AF_INET) { - oip = ip_hdr(skb); - saddr.sin.sin_addr.s_addr = oip->saddr; - saddr.sa.sa_family = AF_INET; -#if IS_ENABLED(CONFIG_IPV6) - } else { - oip6 = ipv6_hdr(skb); - saddr.sin6.sin6_addr = oip6->saddr; - saddr.sa.sa_family = AF_INET6; -#endif - } - - if (md->tun_dst) { - skb_dst_set(skb, (struct dst_entry *)md->tun_dst); - md->tun_dst = NULL; - } - - if ((vxlan->flags & VXLAN_F_LEARN) && - vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source)) - goto drop; - - skb_reset_network_header(skb); - /* In flow-based mode, GBP is carried in dst_metadata */ - if (!(vs->flags & VXLAN_F_FLOW_BASED)) - skb->mark = md->gbp; - - if (oip6) - err = IP6_ECN_decapsulate(oip6, skb); - if (oip) - err = IP_ECN_decapsulate(oip, skb); - - if (unlikely(err)) { - if (log_ecn_error) { - if (oip6) - net_info_ratelimited("non-ECT from %pI6\n", - &oip6->saddr); - if (oip) - net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", - &oip->saddr, oip->tos); - } - if (err > 1) { - ++vxlan->dev->stats.rx_frame_errors; - ++vxlan->dev->stats.rx_errors; - goto drop; - } - } - - stats = this_cpu_ptr(vxlan->dev->tstats); - u64_stats_update_begin(&stats->syncp); - stats->rx_packets++; - stats->rx_bytes += skb->len; - u64_stats_update_end(&stats->syncp); - - netif_rx(skb); - - return; -drop: - if (md->tun_dst) - dst_release((struct dst_entry *)md->tun_dst); - - /* Consume bad packet */ - kfree_skb(skb); -} - static int arp_reduce(struct net_device *dev, struct sk_buff *skb) { struct vxlan_dev *vxlan = netdev_priv(dev); @@ -1688,7 +1686,7 @@ static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr, __u8 prio, __u8 ttl, - __be16 src_port, __be16 dst_port, + __be16 src_port, __be16 dst_port, __u32 vni, struct vxlan_metadata *md, bool xnet, u32 vxflags) { struct vxlanhdr *vxh; @@ -1738,7 +1736,7 @@ static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk, vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = htonl(VXLAN_HF_VNI); - vxh->vx_vni = md->vni; + vxh->vx_vni = vni; if (type & SKB_GSO_TUNNEL_REMCSUM) { u32 data = (skb_checksum_start_offset(skb) - hdrlen) >> @@ -1771,10 +1769,10 @@ err: } #endif -int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, - __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, - __be16 src_port, __be16 dst_port, - struct vxlan_metadata *md, bool xnet, u32 vxflags) +static int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, + __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, + __be16 src_port, __be16 dst_port, __u32 vni, + struct vxlan_metadata *md, bool xnet, u32 vxflags) { struct vxlanhdr *vxh; int min_headroom; @@ -1817,7 +1815,7 @@ int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); vxh->vx_flags = htonl(VXLAN_HF_VNI); - vxh->vx_vni = md->vni; + vxh->vx_vni = vni; if (type & SKB_GSO_TUNNEL_REMCSUM) { u32 data = (skb_checksum_start_offset(skb) - hdrlen) >> @@ -1844,7 +1842,6 @@ int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, ttl, df, src_port, dst_port, xnet, !(vxflags & VXLAN_F_UDP_CSUM)); } -EXPORT_SYMBOL_GPL(vxlan_xmit_skb); /* Bypass encapsulation if the destination is local */ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, @@ -2012,10 +2009,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, tos = ip_tunnel_ecn_encap(tos, old_iph, skb); ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - md->vni = htonl(vni << 8); err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr, dst->sin.sin_addr.s_addr, tos, ttl, df, - src_port, dst_port, md, + src_port, dst_port, htonl(vni << 8), md, !net_eq(vxlan->net, dev_net(vxlan->dev)), flags); if (err < 0) { @@ -2070,11 +2066,10 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, } ttl = ttl ? : ip6_dst_hoplimit(ndst); - md->vni = htonl(vni << 8); md->gbp = skb->mark; err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr, - 0, ttl, src_port, dst_port, md, + 0, ttl, src_port, dst_port, htonl(vni << 8), md, !net_eq(vxlan->net, dev_net(vxlan->dev)), vxlan->flags); #endif @@ -2269,8 +2264,8 @@ static int vxlan_open(struct net_device *dev) struct vxlan_sock *vs; int ret = 0; - vs = vxlan_sock_add(vxlan->net, vxlan->cfg.dst_port, vxlan_rcv, - NULL, vxlan->cfg.no_share, vxlan->flags); + vs = vxlan_sock_add(vxlan->net, vxlan->cfg.dst_port, + vxlan->cfg.no_share, vxlan->flags); if (IS_ERR(vs)) return PTR_ERR(vs); @@ -2563,7 +2558,6 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6, /* Create new listen socket if needed */ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, - vxlan_rcv_t *rcv, void *data, u32 flags) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); @@ -2592,8 +2586,6 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, vs->sock = sock; atomic_set(&vs->refcnt, 1); - vs->rcv = rcv; - vs->data = data; vs->flags = (flags & VXLAN_F_RCV_FLAGS); /* Initialize the vxlan udp offloads structure */ @@ -2617,9 +2609,8 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, return vs; } -struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, - vxlan_rcv_t *rcv, void *data, - bool no_share, u32 flags) +static struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, + bool no_share, u32 flags) { struct vxlan_net *vn = net_generic(net, vxlan_net_id); struct vxlan_sock *vs; @@ -2629,7 +2620,7 @@ struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, spin_lock(&vn->sock_lock); vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port, flags); - if (vs && vs->rcv == rcv) { + if (vs) { if (!atomic_add_unless(&vs->refcnt, 1, 0)) vs = ERR_PTR(-EBUSY); spin_unlock(&vn->sock_lock); @@ -2638,9 +2629,8 @@ struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, spin_unlock(&vn->sock_lock); } - return vxlan_socket_create(net, port, rcv, data, flags); + return vxlan_socket_create(net, port, flags); } -EXPORT_SYMBOL_GPL(vxlan_sock_add); static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, struct vxlan_config *conf) diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 343d922d15c2..18fdb98185ab 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -141,6 +141,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, unsigned char name_assign_type, const struct rtnl_link_ops *ops, struct nlattr *tb[]); +int rtnl_delete_link(struct net_device *dev); int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm); int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len); diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 19535f85eb2c..eb8d721cdb67 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -101,22 +101,12 @@ struct vxlanhdr { #define FDB_HASH_SIZE (1<vn_sock->sock->sk)->inet_sport; +} static inline netdev_features_t vxlan_features_check(struct sk_buff *skb, netdev_features_t features) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 03d61b54aac0..5fb4af20c6dd 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1960,16 +1960,30 @@ static int rtnl_group_dellink(const struct net *net, int group) return 0; } +int rtnl_delete_link(struct net_device *dev) +{ + const struct rtnl_link_ops *ops; + LIST_HEAD(list_kill); + + ops = dev->rtnl_link_ops; + if (!ops || !ops->dellink) + return -EOPNOTSUPP; + + ops->dellink(dev, &list_kill); + unregister_netdevice_many(&list_kill); + + return 0; +} +EXPORT_SYMBOL_GPL(rtnl_delete_link); + static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); - const struct rtnl_link_ops *ops; struct net_device *dev; struct ifinfomsg *ifm; char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; int err; - LIST_HEAD(list_kill); err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); if (err < 0) @@ -1991,13 +2005,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) if (!dev) return -ENODEV; - ops = dev->rtnl_link_ops; - if (!ops || !ops->dellink) - return -EOPNOTSUPP; - - ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); - return 0; + return rtnl_delete_link(dev); } int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 15840401a2ce..1119f46b80b4 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -44,18 +44,6 @@ config OPENVSWITCH_GRE If unsure, say Y. -config OPENVSWITCH_VXLAN - tristate "Open vSwitch VXLAN tunneling support" - depends on OPENVSWITCH - depends on VXLAN - default OPENVSWITCH - ---help--- - If you say Y here, then the Open vSwitch will be able create vxlan vport. - - Say N to exclude this support and reduce the binary size. - - If unsure, say Y. - config OPENVSWITCH_GENEVE tristate "Open vSwitch Geneve tunneling support" depends on OPENVSWITCH diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 91b9478413ef..38e0e149c55e 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -16,5 +16,4 @@ openvswitch-y := \ vport-netdev.o obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o -obj-$(CONFIG_OPENVSWITCH_VXLAN) += vport-vxlan.o obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index e7906dfb8814..a6eb77ab1a64 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -47,9 +47,9 @@ #include #include #include +#include #include "flow_netlink.h" -#include "vport-vxlan.h" struct ovs_len_tbl { int len; @@ -475,7 +475,7 @@ static int vxlan_tun_opt_from_nlattr(const struct nlattr *a, { struct nlattr *tb[OVS_VXLAN_EXT_MAX+1]; unsigned long opt_key_offset; - struct ovs_vxlan_opts opts; + struct vxlan_metadata opts; int err; BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); @@ -626,7 +626,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, static int vxlan_opt_to_nlattr(struct sk_buff *skb, const void *tun_opts, int swkey_tun_opts_len) { - const struct ovs_vxlan_opts *opts = tun_opts; + const struct vxlan_metadata *opts = tun_opts; struct nlattr *nla; nla = nla_nest_start(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index e682bdc34a5c..68d0582fc001 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -27,9 +27,13 @@ #include #include -#include +#include +#include +#include +#include #include "datapath.h" +#include "vport.h" #include "vport-internal_dev.h" #include "vport-netdev.h" @@ -147,7 +151,8 @@ static void free_port_rcu(struct rcu_head *rcu) { struct vport *vport = container_of(rcu, struct vport, rcu); - dev_put(vport->dev); + if (vport->dev) + dev_put(vport->dev); ovs_vport_free(vport); } @@ -221,12 +226,202 @@ static struct vport_ops ovs_netdev_vport_ops = { .send = netdev_send, }; +/* Compat code for old userspace. */ +#if IS_ENABLED(CONFIG_VXLAN) +static struct vport_ops ovs_vxlan_netdev_vport_ops; + +static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) +{ + struct vxlan_dev *vxlan = netdev_priv(vport->dev); + __be16 dst_port = vxlan->cfg.dst_port; + + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) + return -EMSGSIZE; + + if (vxlan->flags & VXLAN_F_GBP) { + struct nlattr *exts; + + exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION); + if (!exts) + return -EMSGSIZE; + + if (vxlan->flags & VXLAN_F_GBP && + nla_put_flag(skb, OVS_VXLAN_EXT_GBP)) + return -EMSGSIZE; + + nla_nest_end(skb, exts); + } + + return 0; +} + +static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX + 1] = { + [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, }, +}; + +static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr, + struct vxlan_config *conf) +{ + struct nlattr *exts[OVS_VXLAN_EXT_MAX + 1]; + int err; + + if (nla_len(attr) < sizeof(struct nlattr)) + return -EINVAL; + + err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy); + if (err < 0) + return err; + + if (exts[OVS_VXLAN_EXT_GBP]) + conf->flags |= VXLAN_F_GBP; + + return 0; +} + +static struct vport *vxlan_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct net_device *dev; + struct vport *vport; + struct nlattr *a; + int err; + struct vxlan_config conf = { + .no_share = true, + .flags = VXLAN_F_FLOW_BASED | VXLAN_F_COLLECT_METADATA, + }; + + if (!options) { + err = -EINVAL; + goto error; + } + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + conf.dst_port = htons(nla_get_u16(a)); + } else { + /* Require destination port from userspace. */ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(0, &ovs_vxlan_netdev_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION); + if (a) { + err = vxlan_configure_exts(vport, a, &conf); + if (err) { + ovs_vport_free(vport); + goto error; + } + } + + rtnl_lock(); + dev = vxlan_dev_create(net, parms->name, NET_NAME_USER, &conf); + if (IS_ERR(dev)) { + rtnl_unlock(); + ovs_vport_free(vport); + return ERR_CAST(dev); + } + + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); + return vport; +error: + return ERR_PTR(err); +} + +static struct vport *vxlan_create(const struct vport_parms *parms) +{ + struct vport *vport; + + vport = vxlan_tnl_create(parms); + if (IS_ERR(vport)) + return vport; + + return netdev_link(vport, parms->name); +} + +static void vxlan_destroy(struct vport *vport) +{ + rtnl_lock(); + if (vport->dev->priv_flags & IFF_OVS_DATAPATH) + ovs_netdev_detach_dev(vport); + + /* Early release so we can unregister the device */ + dev_put(vport->dev); + rtnl_delete_link(vport->dev); + vport->dev = NULL; + rtnl_unlock(); + + call_rcu(&vport->rcu, free_port_rcu); +} + +static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ip_tunnel_info *egress_tun_info) +{ + struct vxlan_dev *vxlan = netdev_priv(vport->dev); + struct net *net = ovs_dp_get_net(vport->dp); + __be16 dst_port = vxlan_dev_dst_port(vxlan); + __be16 src_port; + int port_min; + int port_max; + + inet_get_local_port_range(net, &port_min, &port_max); + src_port = udp_flow_src_port(net, skb, 0, 0, true); + + return ovs_tunnel_get_egress_info(egress_tun_info, net, + OVS_CB(skb)->egress_tun_info, + IPPROTO_UDP, skb->mark, + src_port, dst_port); +} + +static struct vport_ops ovs_vxlan_netdev_vport_ops = { + .type = OVS_VPORT_TYPE_VXLAN, + .create = vxlan_create, + .destroy = vxlan_destroy, + .get_options = vxlan_get_options, + .send = netdev_send, + .get_egress_tun_info = vxlan_get_egress_tun_info, +}; + +static int vxlan_compat_init(void) +{ + return ovs_vport_ops_register(&ovs_vxlan_netdev_vport_ops); +} + +static void vxlan_compat_exit(void) +{ + ovs_vport_ops_unregister(&ovs_vxlan_netdev_vport_ops); +} +#else +static int vxlan_compat_init(void) +{ + return 0; +} + +static void vxlan_compat_exit(void) +{ +} +#endif + int __init ovs_netdev_init(void) { - return ovs_vport_ops_register(&ovs_netdev_vport_ops); + int err; + + err = ovs_vport_ops_register(&ovs_netdev_vport_ops); + if (err) + return err; + err = vxlan_compat_init(); + if (err) + vxlan_compat_exit(); + return err; } void ovs_netdev_exit(void) { ovs_vport_ops_unregister(&ovs_netdev_vport_ops); + vxlan_compat_exit(); } diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c deleted file mode 100644 index 6f7986fabb70..000000000000 --- a/net/openvswitch/vport-vxlan.c +++ /dev/null @@ -1,322 +0,0 @@ -/* - * Copyright (c) 2014 Nicira, Inc. - * Copyright (c) 2013 Cisco Systems, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "datapath.h" -#include "vport.h" -#include "vport-vxlan.h" - -/** - * struct vxlan_port - Keeps track of open UDP ports - * @vs: vxlan_sock created for the port. - * @name: vport name. - */ -struct vxlan_port { - struct vxlan_sock *vs; - char name[IFNAMSIZ]; - u32 exts; /* VXLAN_F_* in */ -}; - -static struct vport_ops ovs_vxlan_vport_ops; - -static inline struct vxlan_port *vxlan_vport(const struct vport *vport) -{ - return vport_priv(vport); -} - -/* Called with rcu_read_lock and BH disabled. */ -static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, - struct vxlan_metadata *md) -{ - struct ip_tunnel_info tun_info; - struct vxlan_port *vxlan_port; - struct vport *vport = vs->data; - struct iphdr *iph; - struct ovs_vxlan_opts opts = { - .gbp = md->gbp, - }; - __be64 key; - __be16 flags; - - flags = TUNNEL_KEY | (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0); - vxlan_port = vxlan_vport(vport); - if (vxlan_port->exts & VXLAN_F_GBP && md->gbp) - flags |= TUNNEL_VXLAN_OPT; - - /* Save outer tunnel values */ - iph = ip_hdr(skb); - key = cpu_to_be64(ntohl(md->vni) >> 8); - ip_tunnel_info_init(&tun_info, iph, - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, flags, &opts, sizeof(opts)); - - ovs_vport_receive(vport, skb, &tun_info); -} - -static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) -{ - struct vxlan_port *vxlan_port = vxlan_vport(vport); - __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; - - if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) - return -EMSGSIZE; - - if (vxlan_port->exts) { - struct nlattr *exts; - - exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION); - if (!exts) - return -EMSGSIZE; - - if (vxlan_port->exts & VXLAN_F_GBP && - nla_put_flag(skb, OVS_VXLAN_EXT_GBP)) - return -EMSGSIZE; - - nla_nest_end(skb, exts); - } - - return 0; -} - -static void vxlan_tnl_destroy(struct vport *vport) -{ - struct vxlan_port *vxlan_port = vxlan_vport(vport); - - vxlan_sock_release(vxlan_port->vs); - - ovs_vport_deferred_free(vport); -} - -static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX+1] = { - [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, }, -}; - -static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr) -{ - struct nlattr *exts[OVS_VXLAN_EXT_MAX+1]; - struct vxlan_port *vxlan_port; - int err; - - if (nla_len(attr) < sizeof(struct nlattr)) - return -EINVAL; - - err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy); - if (err < 0) - return err; - - vxlan_port = vxlan_vport(vport); - - if (exts[OVS_VXLAN_EXT_GBP]) - vxlan_port->exts |= VXLAN_F_GBP; - - return 0; -} - -static struct vport *vxlan_tnl_create(const struct vport_parms *parms) -{ - struct net *net = ovs_dp_get_net(parms->dp); - struct nlattr *options = parms->options; - struct vxlan_port *vxlan_port; - struct vxlan_sock *vs; - struct vport *vport; - struct nlattr *a; - u16 dst_port; - int err; - - if (!options) { - err = -EINVAL; - goto error; - } - a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); - if (a && nla_len(a) == sizeof(u16)) { - dst_port = nla_get_u16(a); - } else { - /* Require destination port from userspace. */ - err = -EINVAL; - goto error; - } - - vport = ovs_vport_alloc(sizeof(struct vxlan_port), - &ovs_vxlan_vport_ops, parms); - if (IS_ERR(vport)) - return vport; - - vxlan_port = vxlan_vport(vport); - strncpy(vxlan_port->name, parms->name, IFNAMSIZ); - - a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION); - if (a) { - err = vxlan_configure_exts(vport, a); - if (err) { - ovs_vport_free(vport); - goto error; - } - } - - vs = vxlan_sock_add(net, htons(dst_port), vxlan_rcv, vport, true, - vxlan_port->exts); - if (IS_ERR(vs)) { - ovs_vport_free(vport); - return (void *)vs; - } - vxlan_port->vs = vs; - - return vport; - -error: - return ERR_PTR(err); -} - -static int vxlan_ext_gbp(struct sk_buff *skb) -{ - const struct ip_tunnel_info *tun_info; - const struct ovs_vxlan_opts *opts; - - tun_info = OVS_CB(skb)->egress_tun_info; - opts = tun_info->options; - - if (tun_info->key.tun_flags & TUNNEL_VXLAN_OPT && - tun_info->options_len >= sizeof(*opts)) - return opts->gbp; - else - return 0; -} - -static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb) -{ - struct net *net = ovs_dp_get_net(vport->dp); - struct vxlan_port *vxlan_port = vxlan_vport(vport); - struct sock *sk = vxlan_port->vs->sock->sk; - __be16 dst_port = inet_sk(sk)->inet_sport; - const struct ip_tunnel_key *tun_key; - struct vxlan_metadata md = {0}; - struct rtable *rt; - struct flowi4 fl; - __be16 src_port; - __be16 df; - int err; - u32 vxflags; - - if (unlikely(!OVS_CB(skb)->egress_tun_info)) { - err = -EINVAL; - goto error; - } - - tun_key = &OVS_CB(skb)->egress_tun_info->key; - rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto error; - } - - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? - htons(IP_DF) : 0; - - skb->ignore_df = 1; - - src_port = udp_flow_src_port(net, skb, 0, 0, true); - md.vni = htonl(be64_to_cpu(tun_key->tun_id) << 8); - md.gbp = vxlan_ext_gbp(skb); - vxflags = vxlan_port->exts | - (tun_key->tun_flags & TUNNEL_CSUM ? VXLAN_F_UDP_CSUM : 0); - - err = vxlan_xmit_skb(rt, sk, skb, fl.saddr, tun_key->ipv4_dst, - tun_key->ipv4_tos, tun_key->ipv4_ttl, df, - src_port, dst_port, - &md, false, vxflags); - if (err < 0) - ip_rt_put(rt); - return err; -error: - kfree_skb(skb); - return err; -} - -static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ip_tunnel_info *egress_tun_info) -{ - struct net *net = ovs_dp_get_net(vport->dp); - struct vxlan_port *vxlan_port = vxlan_vport(vport); - __be16 dst_port = inet_sk(vxlan_port->vs->sock->sk)->inet_sport; - __be16 src_port; - int port_min; - int port_max; - - inet_get_local_port_range(net, &port_min, &port_max); - src_port = udp_flow_src_port(net, skb, 0, 0, true); - - return ovs_tunnel_get_egress_info(egress_tun_info, net, - OVS_CB(skb)->egress_tun_info, - IPPROTO_UDP, skb->mark, - src_port, dst_port); -} - -static const char *vxlan_get_name(const struct vport *vport) -{ - struct vxlan_port *vxlan_port = vxlan_vport(vport); - return vxlan_port->name; -} - -static struct vport_ops ovs_vxlan_vport_ops = { - .type = OVS_VPORT_TYPE_VXLAN, - .create = vxlan_tnl_create, - .destroy = vxlan_tnl_destroy, - .get_name = vxlan_get_name, - .get_options = vxlan_get_options, - .send = vxlan_tnl_send, - .get_egress_tun_info = vxlan_get_egress_tun_info, - .owner = THIS_MODULE, -}; - -static int __init ovs_vxlan_tnl_init(void) -{ - return ovs_vport_ops_register(&ovs_vxlan_vport_ops); -} - -static void __exit ovs_vxlan_tnl_exit(void) -{ - ovs_vport_ops_unregister(&ovs_vxlan_vport_ops); -} - -module_init(ovs_vxlan_tnl_init); -module_exit(ovs_vxlan_tnl_exit); - -MODULE_DESCRIPTION("OVS: VXLAN switching port"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("vport-type-4"); diff --git a/net/openvswitch/vport-vxlan.h b/net/openvswitch/vport-vxlan.h deleted file mode 100644 index 4b08233e73d5..000000000000 --- a/net/openvswitch/vport-vxlan.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef VPORT_VXLAN_H -#define VPORT_VXLAN_H 1 - -#include -#include - -struct ovs_vxlan_opts { - __u32 gbp; -}; - -#endif -- cgit v1.2.3 From e181a5430491f038c198f0eacc3142d6e871c2da Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 19 Jul 2015 22:21:13 +0200 Subject: net: #ifdefify sk_classid member of struct sock The sk_classid member is only required when CONFIG_CGROUP_NET_CLASSID is enabled. #ifdefify it to reduce the size of struct sock on 32 bit systems, at least. Signed-off-by: Mathias Krause Signed-off-by: David S. Miller --- include/net/sock.h | 2 ++ net/netfilter/nft_meta.c | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 05a8c1aea251..4353ef70bf48 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -429,7 +429,9 @@ struct sock { void *sk_security; #endif __u32 sk_mark; +#ifdef CONFIG_CGROUP_NET_CLASSID u32 sk_classid; +#endif struct cg_proto *sk_cgrp; void (*sk_state_change)(struct sock *sk); void (*sk_data_ready)(struct sock *sk); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 52561e1c31e2..cb2f13ebb5a6 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -166,11 +166,13 @@ void nft_meta_get_eval(const struct nft_expr *expr, goto err; *dest = out->group; break; +#ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: if (skb->sk == NULL || !sk_fullsock(skb->sk)) goto err; *dest = skb->sk->sk_classid; break; +#endif default: WARN_ON(1); goto err; @@ -246,7 +248,9 @@ int nft_meta_get_init(const struct nft_ctx *ctx, case NFT_META_CPU: case NFT_META_IIFGROUP: case NFT_META_OIFGROUP: +#ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: +#endif len = sizeof(u32); break; case NFT_META_IIFNAME: -- cgit v1.2.3 From 16040894b26af9f85d9395f072c53d76a44eba21 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Tue, 21 Jul 2015 06:42:28 -0400 Subject: tipc: fix compatibility bug In commit d999297c3dbbe7fdd832f7fa4ec84301e170b3e6 ("tipc: reduce locking scope during packet reception") we introduced a new function tipc_link_proto_rcv(). This function contains a bug, so that it sometimes by error sends out a non-zero link priority value in created protocol messages. The bug may lead to an extra link reset at initial link establising with older nodes. This will never happen more than once, whereafter the link will work as intended. We fix this bug in this commit. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 55b675d20de8..b63d57390bb7 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1639,7 +1639,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, rcvgap = peers_snd_nxt - l->rcv_nxt; if (rcvgap || (msg_probe(hdr))) tipc_link_build_proto_msg(l, STATE_MSG, 0, rcvgap, - 0, l->mtu, xmitq); + 0, 0, xmitq); tipc_link_release_pkts(l, msg_ack(hdr)); /* If NACK, retransmit will now start at right position */ -- cgit v1.2.3 From 01faef2cebae02685e2bcfc9bbee8416d5ec19fc Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 09:16:24 -0700 Subject: mpls: make RTA_OIF optional If user did not specify an oif, try and get it from the via address. If failed to get device, return with -ENODEV. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 6e669114f829..49f1b0e41bdf 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "internal.h" #define LABEL_NOT_SPECIFIED (1<<20) @@ -330,6 +331,70 @@ static unsigned find_free_label(struct net *net) return LABEL_NOT_SPECIFIED; } +static struct net_device *inet_fib_lookup_dev(struct net *net, void *addr) +{ + struct net_device *dev = NULL; + struct rtable *rt; + struct in_addr daddr; + + memcpy(&daddr, addr, sizeof(struct in_addr)); + rt = ip_route_output(net, daddr.s_addr, 0, 0, 0); + if (IS_ERR(rt)) + goto errout; + + dev = rt->dst.dev; + dev_hold(dev); + + ip_rt_put(rt); + +errout: + return dev; +} + +static struct net_device *inet6_fib_lookup_dev(struct net *net, void *addr) +{ + struct net_device *dev = NULL; + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + memcpy(&fl6.daddr, addr, sizeof(struct in6_addr)); + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) + goto errout; + + dev = dst->dev; + dev_hold(dev); + +errout: + dst_release(dst); + + return dev; +} + +static struct net_device *find_outdev(struct net *net, + struct mpls_route_config *cfg) +{ + struct net_device *dev = NULL; + + if (!cfg->rc_ifindex) { + switch (cfg->rc_via_table) { + case NEIGH_ARP_TABLE: + dev = inet_fib_lookup_dev(net, cfg->rc_via); + break; + case NEIGH_ND_TABLE: + dev = inet6_fib_lookup_dev(net, cfg->rc_via); + break; + case NEIGH_LINK_TABLE: + break; + } + } else { + dev = dev_get_by_index(net, cfg->rc_ifindex); + } + + return dev; +} + static int mpls_route_add(struct mpls_route_config *cfg) { struct mpls_route __rcu **platform_label; @@ -361,7 +426,7 @@ static int mpls_route_add(struct mpls_route_config *cfg) goto errout; err = -ENODEV; - dev = dev_get_by_index(net, cfg->rc_ifindex); + dev = find_outdev(net, cfg); if (!dev) goto errout; -- cgit v1.2.3 From b56ea2985d389a3676638203323ebe22c261b7fe Mon Sep 17 00:00:00 2001 From: Rick Jones Date: Tue, 21 Jul 2015 16:14:13 -0700 Subject: net: track success and failure of TCP PMTU probing Track success and failure of TCP PMTU probing. Signed-off-by: Rick Jones Signed-off-by: David S. Miller --- include/uapi/linux/snmp.h | 2 ++ net/ipv4/proc.c | 2 ++ net/ipv4/tcp_input.c | 2 ++ 3 files changed, 6 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index eee8968407f0..25a9ad8bcef1 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -278,6 +278,8 @@ enum LINUX_MIB_TCPACKSKIPPEDCHALLENGE, /* TCPACKSkippedChallenge */ LINUX_MIB_TCPWINPROBE, /* TCPWinProbe */ LINUX_MIB_TCPKEEPALIVE, /* TCPKeepAlive */ + LINUX_MIB_TCPMTUPFAIL, /* TCPMTUPFail */ + LINUX_MIB_TCPMTUPSUCCESS, /* TCPMTUPSuccess */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index da5d483e236a..3abd9d7a3adf 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -300,6 +300,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPACKSkippedChallenge", LINUX_MIB_TCPACKSKIPPEDCHALLENGE), SNMP_MIB_ITEM("TCPWinProbe", LINUX_MIB_TCPWINPROBE), SNMP_MIB_ITEM("TCPKeepAlive", LINUX_MIB_TCPKEEPALIVE), + SNMP_MIB_ITEM("TCPMTUPFail", LINUX_MIB_TCPMTUPFAIL), + SNMP_MIB_ITEM("TCPMTUPSuccess", LINUX_MIB_TCPMTUPSUCCESS), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1578fc2a6f39..cda3ffedadb6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2593,6 +2593,7 @@ static void tcp_mtup_probe_failed(struct sock *sk) icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1; icsk->icsk_mtup.probe_size = 0; + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPFAIL); } static void tcp_mtup_probe_success(struct sock *sk) @@ -2612,6 +2613,7 @@ static void tcp_mtup_probe_success(struct sock *sk) icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; icsk->icsk_mtup.probe_size = 0; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMTUPSUCCESS); } /* Do a simple retransmit without using the backoff mechanisms in -- cgit v1.2.3 From de18547d48c0e735309d6874852f048352e08a88 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 21 Jul 2015 22:49:00 -0700 Subject: mpls_iptunnel: fix sparse warn: remove incorrect rcu_dereference fix for: net/mpls/mpls_iptunnel.c:73:19: sparse: incompatible types in comparison expression (different address spaces) remove incorrect rcu_dereference possibly left over from earlier revisions of the code. Reported-by: kbuild test robot Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/mpls/mpls_iptunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index eea096f21ba5..276f8c992218 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -70,7 +70,7 @@ int mpls_output(struct sock *sk, struct sk_buff *skb) skb_orphan(skb); /* Find the output device */ - out_dev = rcu_dereference(dst->dev); + out_dev = dst->dev; if (!mpls_output_possible(out_dev) || !lwtstate || skb_warn_if_lro(skb)) goto drop; -- cgit v1.2.3 From 3985e8a3611a93bb36789f65db862e5700aab65e Mon Sep 17 00:00:00 2001 From: Erik Kline Date: Wed, 22 Jul 2015 16:38:25 +0900 Subject: ipv6: sysctl to restrict candidate source addresses Per RFC 6724, section 4, "Candidate Source Addresses": It is RECOMMENDED that the candidate source addresses be the set of unicast addresses assigned to the interface that will be used to send to the destination (the "outgoing" interface). Add a sysctl to enable this behaviour. Signed-off-by: Erik Kline Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 7 +++++++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 22 +++++++++++++++++++--- 4 files changed, 28 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index f63aeefd2c24..1a5ab21bcca5 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1460,6 +1460,13 @@ router_solicitations - INTEGER routers are present. Default: 3 +use_oif_addrs_only - BOOLEAN + When enabled, the candidate source addresses for destinations + routed via this interface are restricted to the set of addresses + configured on this interface (vis. RFC 6724, section 4). + + Default: false + use_tempaddr - INTEGER Preference for Privacy Extensions (RFC3041). <= 0 : disable Privacy Extensions diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 1319a6bb6b82..06ed637225b8 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -57,6 +57,7 @@ struct ipv6_devconf { bool initialized; struct in6_addr secret; } stable_secret; + __s32 use_oif_addrs_only; void *sysctl; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 5efa54ae567c..641a146ead7d 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -171,6 +171,7 @@ enum { DEVCONF_USE_OPTIMISTIC, DEVCONF_ACCEPT_RA_MTU, DEVCONF_STABLE_SECRET, + DEVCONF_USE_OIF_ADDRS_ONLY, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 32153c248959..eb0c6a3a8a00 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -211,7 +211,8 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_ra_mtu = 1, .stable_secret = { .initialized = false, - } + }, + .use_oif_addrs_only = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -253,6 +254,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .stable_secret = { .initialized = false, }, + .use_oif_addrs_only = 0, }; /* Check if a valid qdisc is available */ @@ -1472,11 +1474,16 @@ int ipv6_dev_get_saddr(struct net *net, const struct net_device *dst_dev, * include addresses assigned to interfaces * belonging to the same site as the outgoing * interface.) + * - "It is RECOMMENDED that the candidate source addresses + * be the set of unicast addresses assigned to the + * interface that will be used to send to the destination + * (the 'outgoing' interface)." (RFC 6724) */ if (dst_dev) { + idev = __in6_dev_get(dst_dev); if ((dst_type & IPV6_ADDR_MULTICAST) || - dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL) { - idev = __in6_dev_get(dst_dev); + dst.scope <= IPV6_ADDR_SCOPE_LINKLOCAL || + (idev && idev->cnf.use_oif_addrs_only)) { use_oif_addr = true; } } @@ -4607,6 +4614,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu; /* we omit DEVCONF_STABLE_SECRET for now */ + array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only; } static inline size_t inet6_ifla6_size(void) @@ -5605,6 +5613,14 @@ static struct addrconf_sysctl_table .mode = 0600, .proc_handler = addrconf_sysctl_stable_secret, }, + { + .procname = "use_oif_addrs_only", + .data = &ipv6_devconf.use_oif_addrs_only, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + + }, { /* sentinel */ } -- cgit v1.2.3 From 045a0fa0c5f5ea0f16c009f924ea579634afbba8 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 23 Jul 2015 10:08:44 +0200 Subject: ip_tunnel: Call ip_tunnel_core_init() from inet_init() Convert the module_init() to a invocation from inet_init() since ip_tunnel_core is part of the INET built-in. Fixes: 3093fbe7ff4 ("route: Per route IP tunnel metadata via lightweight tunnel") Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 2 ++ net/ipv4/af_inet.c | 3 +++ net/ipv4/ip_tunnel_core.c | 11 +---------- 3 files changed, 6 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index d975b3ebd6c7..47984415f5d1 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -311,6 +311,8 @@ static inline int ip_tunnel_collect_metadata(void) return static_key_false(&ip_tunnel_metadata_cnt); } +void __init ip_tunnel_core_init(void); + void ip_tunnel_need_metadata(void); void ip_tunnel_unneed_metadata(void); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9532ee87151f..cc4e498a0ccf 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -112,6 +112,7 @@ #include #include #include +#include #include #include #include @@ -1780,6 +1781,8 @@ static int __init inet_init(void) dev_add_pack(&ip_packet_type); + ip_tunnel_core_init(); + rc = 0; out: return rc; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 630e6d5712e8..5512f4e4ec1b 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -292,19 +292,10 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { .get_encap_size = ip_tun_encap_nlsize, }; -static int __init ip_tunnel_core_init(void) +void __init ip_tunnel_core_init(void) { lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); - - return 0; -} -module_init(ip_tunnel_core_init); - -static void __exit ip_tunnel_core_exit(void) -{ - lwtunnel_encap_del_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); } -module_exit(ip_tunnel_core_exit); struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE; EXPORT_SYMBOL(ip_tunnel_metadata_cnt); -- cgit v1.2.3 From 2385eb0c5fbcb4316d3490b3affba8e15efc7eb8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 20 Jul 2015 12:55:02 +0200 Subject: netfilter: nf_queue: fix nf_queue_nf_hook_drop() This function reacquires the rtnl_lock() which is already held by nf_unregister_hook(). This can be triggered via: modprobe nf_conntrack_ipv4 && rmmod nf_conntrack_ipv4 [ 720.628746] INFO: task rmmod:3578 blocked for more than 120 seconds. [ 720.628749] Not tainted 4.2.0-rc2+ #113 [ 720.628752] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 720.628754] rmmod D ffff8800ca46fd58 0 3578 3571 0x00000080 [...] [ 720.628783] Call Trace: [ 720.628790] [] schedule+0x6b/0x90 [ 720.628795] [] schedule_preempt_disabled+0x13/0x20 [ 720.628799] [] mutex_lock_nested+0x1f5/0x380 [ 720.628803] [] ? rtnl_lock+0x12/0x20 [ 720.628807] [] ? rtnl_lock+0x12/0x20 [ 720.628812] [] rtnl_lock+0x12/0x20 [ 720.628817] [] nf_queue_nf_hook_drop+0x15/0x160 [ 720.628825] [] nf_unregister_net_hook+0x168/0x190 [ 720.628831] [] nf_unregister_hook+0x64/0x80 [ 720.628837] [] nf_unregister_hooks+0x20/0x30 [...] Moreover, nf_unregister_net_hook() should only destroy the queue for this netns, not for every netns. Reported-by: Fengguang Wu Fixes: 085db2c04557 ("netfilter: Per network namespace netfilter hooks.") Signed-off-by: Pablo Neira Ayuso Acked-by: "Eric W. Biederman" --- net/netfilter/core.c | 2 +- net/netfilter/nf_internals.h | 2 +- net/netfilter/nf_queue.c | 12 +++--------- 3 files changed, 5 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 87d237d20870..12504fbbeef7 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -154,7 +154,7 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); - nf_queue_nf_hook_drop(elem); + nf_queue_nf_hook_drop(net, elem); kfree(elem); } EXPORT_SYMBOL(nf_unregister_net_hook); diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 399210693c2a..065522564ac6 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -19,7 +19,7 @@ unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, /* nf_queue.c */ int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, struct nf_hook_state *state, unsigned int queuenum); -void nf_queue_nf_hook_drop(struct nf_hook_ops *ops); +void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops); int __init netfilter_queue_init(void); /* nf_log.c */ diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 8a8b2abc35ff..96777f9a9350 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -105,21 +105,15 @@ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); -void nf_queue_nf_hook_drop(struct nf_hook_ops *ops) +void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops) { const struct nf_queue_handler *qh; - struct net *net; - rtnl_lock(); rcu_read_lock(); qh = rcu_dereference(queue_handler); - if (qh) { - for_each_net(net) { - qh->nf_hook_drop(net, ops); - } - } + if (qh) + qh->nf_hook_drop(net, ops); rcu_read_unlock(); - rtnl_unlock(); } /* -- cgit v1.2.3 From 7181ebafd4306c9328fa1cd0ead69afa397ffe75 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 20 Jul 2015 09:31:25 +0200 Subject: netfilter: fix possible removal of wrong hook nf_unregister_net_hook() uses the nf_hook_ops fields as tuple to look up for the corresponding hook in the list. However, we may have two hooks with exactly the same configuration. This shouldn't be a problem for nftables since every new chain has an unique priv field set, but this may still cause us problems in the future, so better address this problem now by keeping a reference to the original nf_hook_ops structure to make sure we delete the right hook from nf_unregister_net_hook(). Fixes: 085db2c04557 ("netfilter: Per network namespace netfilter hooks.") Signed-off-by: Pablo Neira Ayuso Acked-by: "Eric W. Biederman" --- net/netfilter/core.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 12504fbbeef7..0ecb2b52f276 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -78,26 +78,27 @@ static struct list_head *find_nf_hook_list(struct net *net, return nf_hook_list; } +struct nf_hook_entry { + const struct nf_hook_ops *orig_ops; + struct nf_hook_ops ops; +}; + int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { struct list_head *nf_hook_list; - struct nf_hook_ops *elem, *new; + struct nf_hook_entry *entry; + struct nf_hook_ops *elem; - new = kzalloc(sizeof(*new), GFP_KERNEL); - if (!new) + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) return -ENOMEM; - new->hook = reg->hook; - new->dev = reg->dev; - new->owner = reg->owner; - new->priv = reg->priv; - new->pf = reg->pf; - new->hooknum = reg->hooknum; - new->priority = reg->priority; + entry->orig_ops = reg; + entry->ops = *reg; nf_hook_list = find_nf_hook_list(net, reg); if (!nf_hook_list) { - kfree(new); + kfree(entry); return -ENOENT; } @@ -106,7 +107,7 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) if (reg->priority < elem->priority) break; } - list_add_rcu(&new->list, elem->list.prev); + list_add_rcu(&entry->ops.list, elem->list.prev); mutex_unlock(&nf_hook_mutex); #ifdef CONFIG_NETFILTER_INGRESS if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) @@ -122,6 +123,7 @@ EXPORT_SYMBOL(nf_register_net_hook); void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { struct list_head *nf_hook_list; + struct nf_hook_entry *entry; struct nf_hook_ops *elem; nf_hook_list = find_nf_hook_list(net, reg); @@ -130,14 +132,9 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) mutex_lock(&nf_hook_mutex); list_for_each_entry(elem, nf_hook_list, list) { - if ((reg->hook == elem->hook) && - (reg->dev == elem->dev) && - (reg->owner == elem->owner) && - (reg->priv == elem->priv) && - (reg->pf == elem->pf) && - (reg->hooknum == elem->hooknum) && - (reg->priority == elem->priority)) { - list_del_rcu(&elem->list); + entry = container_of(elem, struct nf_hook_entry, ops); + if (entry->orig_ops == reg) { + list_del_rcu(&entry->ops.list); break; } } @@ -154,8 +151,8 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); - nf_queue_nf_hook_drop(net, elem); - kfree(elem); + nf_queue_nf_hook_drop(net, &entry->ops); + kfree(entry); } EXPORT_SYMBOL(nf_unregister_net_hook); -- cgit v1.2.3 From 3bbd14e0a2e3a988b1b5fe702a2539bd8d0ec622 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 20 Jul 2015 13:32:52 +0200 Subject: netfilter: rename local nf_hook_list to hook_list 085db2c04557 ("netfilter: Per network namespace netfilter hooks.") introduced a new nf_hook_list that is global, so let's avoid this overlap. Signed-off-by: Pablo Neira Ayuso Acked-by: "Eric W. Biederman" --- include/linux/netfilter.h | 14 +++++++------- net/netfilter/core.c | 28 ++++++++++++++-------------- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index e01da73ee6c4..d788ce62d826 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -140,20 +140,20 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg); #ifdef HAVE_JUMP_LABEL extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; -static inline bool nf_hook_list_active(struct list_head *nf_hook_list, +static inline bool nf_hook_list_active(struct list_head *hook_list, u_int8_t pf, unsigned int hook) { if (__builtin_constant_p(pf) && __builtin_constant_p(hook)) return static_key_false(&nf_hooks_needed[pf][hook]); - return !list_empty(nf_hook_list); + return !list_empty(hook_list); } #else -static inline bool nf_hook_list_active(struct list_head *nf_hook_list, +static inline bool nf_hook_list_active(struct list_head *hook_list, u_int8_t pf, unsigned int hook) { - return !list_empty(nf_hook_list); + return !list_empty(hook_list); } #endif @@ -175,12 +175,12 @@ static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook, int thresh) { struct net *net = dev_net(indev ? indev : outdev); - struct list_head *nf_hook_list = &net->nf.hooks[pf][hook]; + struct list_head *hook_list = &net->nf.hooks[pf][hook]; - if (nf_hook_list_active(nf_hook_list, pf, hook)) { + if (nf_hook_list_active(hook_list, pf, hook)) { struct nf_hook_state state; - nf_hook_state_init(&state, nf_hook_list, hook, thresh, + nf_hook_state_init(&state, hook_list, hook, thresh, pf, indev, outdev, sk, okfn); return nf_hook_slow(skb, &state); } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 0ecb2b52f276..2a5a0704245c 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -62,20 +62,20 @@ EXPORT_SYMBOL(nf_hooks_needed); static DEFINE_MUTEX(nf_hook_mutex); -static struct list_head *find_nf_hook_list(struct net *net, +static struct list_head *nf_find_hook_list(struct net *net, const struct nf_hook_ops *reg) { - struct list_head *nf_hook_list = NULL; + struct list_head *hook_list = NULL; if (reg->pf != NFPROTO_NETDEV) - nf_hook_list = &net->nf.hooks[reg->pf][reg->hooknum]; + hook_list = &net->nf.hooks[reg->pf][reg->hooknum]; else if (reg->hooknum == NF_NETDEV_INGRESS) { #ifdef CONFIG_NETFILTER_INGRESS if (reg->dev && dev_net(reg->dev) == net) - nf_hook_list = ®->dev->nf_hooks_ingress; + hook_list = ®->dev->nf_hooks_ingress; #endif } - return nf_hook_list; + return hook_list; } struct nf_hook_entry { @@ -85,7 +85,7 @@ struct nf_hook_entry { int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct list_head *nf_hook_list; + struct list_head *hook_list; struct nf_hook_entry *entry; struct nf_hook_ops *elem; @@ -96,14 +96,14 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) entry->orig_ops = reg; entry->ops = *reg; - nf_hook_list = find_nf_hook_list(net, reg); - if (!nf_hook_list) { + hook_list = nf_find_hook_list(net, reg); + if (!hook_list) { kfree(entry); return -ENOENT; } mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, nf_hook_list, list) { + list_for_each_entry(elem, hook_list, list) { if (reg->priority < elem->priority) break; } @@ -122,16 +122,16 @@ EXPORT_SYMBOL(nf_register_net_hook); void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { - struct list_head *nf_hook_list; + struct list_head *hook_list; struct nf_hook_entry *entry; struct nf_hook_ops *elem; - nf_hook_list = find_nf_hook_list(net, reg); - if (!nf_hook_list) + hook_list = nf_find_hook_list(net, reg); + if (!hook_list) return; mutex_lock(&nf_hook_mutex); - list_for_each_entry(elem, nf_hook_list, list) { + list_for_each_entry(elem, hook_list, list) { entry = container_of(elem, struct nf_hook_entry, ops); if (entry->orig_ops == reg) { list_del_rcu(&entry->ops.list); @@ -139,7 +139,7 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) } } mutex_unlock(&nf_hook_mutex); - if (&elem->list == nf_hook_list) { + if (&elem->list == hook_list) { WARN(1, "nf_unregister_net_hook: hook not found!\n"); return; } -- cgit v1.2.3 From d77b4852b4d3698a90eef219acf9cddc964b1f3a Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 21 Jun 2015 16:45:20 +0200 Subject: mac802154: add llsec address update workaround This patch adds a workaround for using the new nl802154 netlink interface with the old ieee802154 netlink interface togehter. The nl802154 currently supports no access for llsec layer, currently there are users outside which are using both interfaces at the same time. This patch adds a necessary call when addresses are updated. Reported-by: Simon Vincent Suggested-by: Phoebe Buckheister Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/cfg.c | 9 +++++++-- net/mac802154/ieee802154_i.h | 2 ++ net/mac802154/iface.c | 7 ++++++- 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index 317c4662e544..44db8613812e 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -145,13 +145,18 @@ static int ieee802154_set_pan_id(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, __le16 pan_id) { + int ret; + ASSERT_RTNL(); if (wpan_dev->pan_id == pan_id) return 0; - wpan_dev->pan_id = pan_id; - return 0; + ret = mac802154_wpan_update_llsec(wpan_dev->netdev); + if (!ret) + wpan_dev->pan_id = pan_id; + + return ret; } static int diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index 34755d5751a4..6d4f3ccc3c1c 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -167,6 +167,8 @@ void mac802154_get_table(struct net_device *dev, struct ieee802154_llsec_table **t); void mac802154_unlock_table(struct net_device *dev); +int mac802154_wpan_update_llsec(struct net_device *dev); + /* interface handling */ int ieee802154_iface_init(void); void ieee802154_iface_exit(void); diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 8b698246a51b..4760368a3493 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -30,7 +30,7 @@ #include "ieee802154_i.h" #include "driver-ops.h" -static int mac802154_wpan_update_llsec(struct net_device *dev) +int mac802154_wpan_update_llsec(struct net_device *dev) { struct ieee802154_sub_if_data *sdata = IEEE802154_DEV_TO_SUB_IF(dev); struct ieee802154_mlme_ops *ops = ieee802154_mlme_ops(dev); @@ -471,6 +471,7 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, enum nl802154_iftype type) { struct wpan_dev *wpan_dev = &sdata->wpan_dev; + int ret; u8 tmp; /* set some type-dependent values */ @@ -505,6 +506,10 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, mutex_init(&sdata->sec_mtx); mac802154_llsec_init(&sdata->sec); + ret = mac802154_wpan_update_llsec(sdata->dev); + if (ret < 0) + return ret; + break; case NL802154_IFTYPE_MONITOR: sdata->dev->destructor = free_netdev; -- cgit v1.2.3 From 301d7d42a207e0d4f5bf11c7e2b6862ddebc8c36 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sun, 21 Jun 2015 17:11:19 +0200 Subject: 6lowpan: add request for ipv6 module The iphc module depends on CONFIG_IPV6, because it's not very useful to build the module without IPv6 support. Recently an user reported about issues for setting an IPv6 address to a 6LoWPAN interface. The issues was solved by modprobe the ipv6 module before. To avoid such user issues we try to request the ipv6 module when the 6LoWPAN module is loaded. Signed-off-by: Alexander Aring Acked-by: Jukka Rissanen Signed-off-by: Marcel Holtmann --- net/6lowpan/iphc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c index 94a375c04f21..9055d7b9d112 100644 --- a/net/6lowpan/iphc.c +++ b/net/6lowpan/iphc.c @@ -613,6 +613,8 @@ EXPORT_SYMBOL_GPL(lowpan_header_compress); static int __init lowpan_module_init(void) { + request_module_nowait("ipv6"); + request_module_nowait("nhc_dest"); request_module_nowait("nhc_fragment"); request_module_nowait("nhc_hop"); -- cgit v1.2.3 From 8f451829dd97fb22f03844ca52a49828e2e1d666 Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Tue, 23 Jun 2015 11:41:03 +0530 Subject: mac802154: use WARN_ON() macro This patch will generate the warning if the required driver ops were not defined. Also it removes unnecessary debug message. Signed-off-by: Varka Bhadram Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/main.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 356b346e1ee8..4caf04b676d7 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -58,11 +58,9 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops) struct ieee802154_local *local; size_t priv_size; - if (!ops || !(ops->xmit_async || ops->xmit_sync) || !ops->ed || - !ops->start || !ops->stop || !ops->set_channel) { - pr_err("undefined IEEE802.15.4 device operations\n"); + if (WARN_ON(!ops || !(ops->xmit_async || ops->xmit_sync) || !ops->ed || + !ops->start || !ops->stop || !ops->set_channel)) return NULL; - } /* Ensure 32-byte alignment of our private data and hw private data. * We use the wpan_phy priv data for both our ieee802154_local and for -- cgit v1.2.3 From 927e031c7cb266e5f7fd600899f1603813ee7439 Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Tue, 23 Jun 2015 16:03:44 +0530 Subject: mac802154: remove unused macro This patch removes the unused macro which was removed with the rework of linux-wpan kernel. Signed-off-by: Varka Bhadram Signed-off-by: Marcel Holtmann --- net/mac802154/ieee802154_i.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index 6d4f3ccc3c1c..ed8746334bef 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -94,8 +94,6 @@ struct ieee802154_sub_if_data { struct mac802154_llsec sec; }; -#define MAC802154_CHAN_NONE 0xff /* No channel is assigned */ - /* utility functions/constants */ extern const void *const mac802154_wpan_phy_privid; /* for wpan_phy privid */ -- cgit v1.2.3 From c4227c8a6246b30793bd7360113ddc7b66d526dc Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 24 Jun 2015 11:36:34 +0200 Subject: mac802154: util: add stop_device utility function This patch adds ieee802154_stop_device for preparing a utility function to stop the ieee802154 device. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/ieee802154_i.h | 1 + net/mac802154/iface.c | 7 ++----- net/mac802154/util.c | 8 ++++++++ 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index ed8746334bef..04077830e88c 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -176,5 +176,6 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, unsigned char name_assign_type, enum nl802154_iftype type, __le64 extended_addr); void ieee802154_remove_interfaces(struct ieee802154_local *local); +void ieee802154_stop_device(struct ieee802154_local *local); #endif /* __IEEE802154_I_H */ diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 4760368a3493..416de903e467 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -314,11 +314,8 @@ static int mac802154_slave_close(struct net_device *dev) clear_bit(SDATA_STATE_RUNNING, &sdata->state); - if (!local->open_count) { - flush_workqueue(local->workqueue); - hrtimer_cancel(&local->ifs_timer); - drv_stop(local); - } + if (!local->open_count) + ieee802154_stop_device(local); return 0; } diff --git a/net/mac802154/util.c b/net/mac802154/util.c index 583435f38930..f9fd0957ab67 100644 --- a/net/mac802154/util.c +++ b/net/mac802154/util.c @@ -14,6 +14,7 @@ */ #include "ieee802154_i.h" +#include "driver-ops.h" /* privid for wpan_phys to determine whether they belong to us or not */ const void *const mac802154_wpan_phy_privid = &mac802154_wpan_phy_privid; @@ -92,3 +93,10 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, dev_consume_skb_any(skb); } EXPORT_SYMBOL(ieee802154_xmit_complete); + +void ieee802154_stop_device(struct ieee802154_local *local) +{ + flush_workqueue(local->workqueue); + hrtimer_cancel(&local->ifs_timer); + drv_stop(local); +} -- cgit v1.2.3 From a6cb869b3b7c16fd7c3ee766dd9f9a4fdda7edf9 Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Wed, 24 Jun 2015 11:36:35 +0200 Subject: cfg802154: add PM hooks This patch help to implement suspend/resume in mac802154, these hooks will be run before the device is suspended and after it resumes. Signed-off-by: Varka Bhadram Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/cfg802154.h | 2 ++ net/ieee802154/rdev-ops.h | 20 ++++++++++++++++++++ net/ieee802154/sysfs.c | 38 ++++++++++++++++++++++++++++++++++++++ net/ieee802154/trace.h | 22 ++++++++++++++++++++++ 4 files changed, 82 insertions(+) (limited to 'net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 290a9a69af07..382f94b59f2f 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -34,6 +34,8 @@ struct cfg802154_ops { int type); void (*del_virtual_intf_deprecated)(struct wpan_phy *wpan_phy, struct net_device *dev); + int (*suspend)(struct wpan_phy *wpan_phy); + int (*resume)(struct wpan_phy *wpan_phy); int (*add_virtual_intf)(struct wpan_phy *wpan_phy, const char *name, unsigned char name_assign_type, diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h index b2155a123f6c..8d5960a37195 100644 --- a/net/ieee802154/rdev-ops.h +++ b/net/ieee802154/rdev-ops.h @@ -23,6 +23,26 @@ rdev_del_virtual_intf_deprecated(struct cfg802154_registered_device *rdev, rdev->ops->del_virtual_intf_deprecated(&rdev->wpan_phy, dev); } +static inline int +rdev_suspend(struct cfg802154_registered_device *rdev) +{ + int ret; + trace_802154_rdev_suspend(&rdev->wpan_phy); + ret = rdev->ops->suspend(&rdev->wpan_phy); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; +} + +static inline int +rdev_resume(struct cfg802154_registered_device *rdev) +{ + int ret; + trace_802154_rdev_resume(&rdev->wpan_phy); + ret = rdev->ops->resume(&rdev->wpan_phy); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; +} + static inline int rdev_add_virtual_intf(struct cfg802154_registered_device *rdev, char *name, unsigned char name_assign_type, diff --git a/net/ieee802154/sysfs.c b/net/ieee802154/sysfs.c index 133b4280660c..bd88525b041e 100644 --- a/net/ieee802154/sysfs.c +++ b/net/ieee802154/sysfs.c @@ -14,11 +14,13 @@ */ #include +#include #include #include "core.h" #include "sysfs.h" +#include "rdev-ops.h" static inline struct cfg802154_registered_device * dev_to_rdev(struct device *dev) @@ -62,10 +64,46 @@ static struct attribute *pmib_attrs[] = { }; ATTRIBUTE_GROUPS(pmib); +#ifdef CONFIG_PM_SLEEP +static int wpan_phy_suspend(struct device *dev) +{ + struct cfg802154_registered_device *rdev = dev_to_rdev(dev); + int ret = 0; + + if (rdev->ops->suspend) { + rtnl_lock(); + ret = rdev_suspend(rdev); + rtnl_unlock(); + } + + return ret; +} + +static int wpan_phy_resume(struct device *dev) +{ + struct cfg802154_registered_device *rdev = dev_to_rdev(dev); + int ret = 0; + + if (rdev->ops->resume) { + rtnl_lock(); + ret = rdev_resume(rdev); + rtnl_unlock(); + } + + return ret; +} + +static SIMPLE_DEV_PM_OPS(wpan_phy_pm_ops, wpan_phy_suspend, wpan_phy_resume); +#define WPAN_PHY_PM_OPS (&wpan_phy_pm_ops) +#else +#define WPAN_PHY_PM_OPS NULL +#endif + struct class wpan_phy_class = { .name = "ieee802154", .dev_release = wpan_phy_release, .dev_groups = pmib_groups, + .pm = WPAN_PHY_PM_OPS, }; int wpan_phy_sysfs_init(void) diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h index 9b5f0eb36696..4399b7fbaa31 100644 --- a/net/ieee802154/trace.h +++ b/net/ieee802154/trace.h @@ -40,6 +40,28 @@ * rdev->ops traces * *************************************************************/ +DECLARE_EVENT_CLASS(wpan_phy_only_evt, + TP_PROTO(struct wpan_phy *wpan_phy), + TP_ARGS(wpan_phy), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + ), + TP_printk(WPAN_PHY_PR_FMT, WPAN_PHY_PR_ARG) +); + +DEFINE_EVENT(wpan_phy_only_evt, 802154_rdev_suspend, + TP_PROTO(struct wpan_phy *wpan_phy), + TP_ARGS(wpan_phy) +); + +DEFINE_EVENT(wpan_phy_only_evt, 802154_rdev_resume, + TP_PROTO(struct wpan_phy *wpan_phy), + TP_ARGS(wpan_phy) +); + TRACE_EVENT(802154_rdev_add_virtual_intf, TP_PROTO(struct wpan_phy *wpan_phy, char *name, enum nl802154_iftype type, __le64 extended_addr), -- cgit v1.2.3 From 3cf24cf8c3c06f9a6cacc8fc2cad94661b6096b6 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 24 Jun 2015 11:36:36 +0200 Subject: mac802154: cfg: add suspend and resume callbacks This patch introduces suspend and resume callbacks to mac802154. When doing suspend we calling the stop driver callback which should stop the receiving of frames. A transceiver should go into low-power mode then. Calling resume will call the start driver callback, which starts receiving again and allow to transmit frames. This was tested only with the fakelb driver and a qemu vm by doing the following commands: echo "devices" > /sys/power/pm_test echo "freeze" > /sys/power/state while doing some high traffic between two fakelb phys. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/cfg.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ net/mac802154/ieee802154_i.h | 1 + net/mac802154/rx.c | 10 ++++++++-- 3 files changed, 54 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index 44db8613812e..f7ba51e8b4ca 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -44,6 +44,49 @@ static void ieee802154_del_iface_deprecated(struct wpan_phy *wpan_phy, ieee802154_if_remove(sdata); } +#ifdef CONFIG_PM +static int ieee802154_suspend(struct wpan_phy *wpan_phy) +{ + struct ieee802154_local *local = wpan_phy_priv(wpan_phy); + + if (!local->open_count) + goto suspend; + + ieee802154_stop_queue(&local->hw); + synchronize_net(); + + /* stop hardware - this must stop RX */ + ieee802154_stop_device(local); + +suspend: + local->suspended = true; + return 0; +} + +static int ieee802154_resume(struct wpan_phy *wpan_phy) +{ + struct ieee802154_local *local = wpan_phy_priv(wpan_phy); + int ret; + + /* nothing to do if HW shouldn't run */ + if (!local->open_count) + goto wake_up; + + /* restart hardware */ + ret = drv_start(local); + if (ret) + return ret; + +wake_up: + ieee802154_wake_queue(&local->hw); + local->suspended = false; + return 0; +} +#else +#define ieee802154_suspend NULL +#define ieee802154_resume NULL +#endif + static int ieee802154_add_iface(struct wpan_phy *phy, const char *name, unsigned char name_assign_type, @@ -232,6 +275,8 @@ ieee802154_set_lbt_mode(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, const struct cfg802154_ops mac802154_config_ops = { .add_virtual_intf_deprecated = ieee802154_add_iface_deprecated, .del_virtual_intf_deprecated = ieee802154_del_iface_deprecated, + .suspend = ieee802154_suspend, + .resume = ieee802154_resume, .add_virtual_intf = ieee802154_add_iface, .del_virtual_intf = ieee802154_del_iface, .set_channel = ieee802154_set_channel, diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index 04077830e88c..0054f39d499b 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -56,6 +56,7 @@ struct ieee802154_local { struct hrtimer ifs_timer; bool started; + bool suspended; struct tasklet_struct tasklet; struct sk_buff_head skb_queue; diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index d93ad2d4a4fc..5a258c11ed3b 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -253,6 +253,9 @@ void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb) WARN_ON_ONCE(softirq_count() == 0); + if (local->suspended) + goto drop; + /* TODO: When a transceiver omits the checksum here, we * add an own calculated one. This is currently an ugly * solution because the monitor needs a crc here. @@ -273,8 +276,7 @@ void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb) crc = crc_ccitt(0, skb->data, skb->len); if (crc) { rcu_read_unlock(); - kfree_skb(skb); - return; + goto drop; } } /* remove crc */ @@ -283,6 +285,10 @@ void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb) __ieee802154_rx_handle_packet(local, skb); rcu_read_unlock(); + + return; +drop: + kfree_skb(skb); } EXPORT_SYMBOL(ieee802154_rx); -- cgit v1.2.3 From 729a8989b3fa8ae7965c537dfccbd08512e84d3c Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Tue, 7 Jul 2015 10:50:42 +0530 Subject: mac802154: do not export ieee802154_rx() Right now there are no other users for ieee802154_rx() in kernel. So lets remove EXPORT_SYMBOL() for this. Also it moves the function prototype from global header file to local header file. Signed-off-by: Varka Bhadram Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/mac802154.h | 17 ----------------- net/mac802154/ieee802154_i.h | 1 + net/mac802154/rx.c | 1 - 3 files changed, 1 insertion(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/mac802154.h b/include/net/mac802154.h index f534a46911dc..b7f99615224b 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -320,23 +320,6 @@ int ieee802154_register_hw(struct ieee802154_hw *hw); */ void ieee802154_unregister_hw(struct ieee802154_hw *hw); -/** - * ieee802154_rx - receive frame - * - * Use this function to hand received frames to mac802154. The receive - * buffer in @skb must start with an IEEE 802.15.4 header. In case of a - * paged @skb is used, the driver is recommended to put the ieee802154 - * header of the frame on the linear part of the @skb to avoid memory - * allocation and/or memcpy by the stack. - * - * This function may not be called in IRQ context. Calls to this function - * for a single hardware must be synchronized against each other. - * - * @hw: the hardware this frame came in on - * @skb: the buffer to receive, owned by mac802154 after this call - */ -void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb); - /** * ieee802154_rx_irqsafe - receive frame * diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index 0054f39d499b..eb8502a6e719 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -124,6 +124,7 @@ ieee802154_sdata_running(struct ieee802154_sub_if_data *sdata) extern struct ieee802154_mlme_ops mac802154_mlme_wpan; +void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb); netdev_tx_t ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index 5a258c11ed3b..7791c9b8cb57 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -290,7 +290,6 @@ void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb) drop: kfree_skb(skb); } -EXPORT_SYMBOL(ieee802154_rx); void ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb, u8 lqi) -- cgit v1.2.3 From d10270ce941ee89afd74076ea3ed8dbef8a0ff25 Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Tue, 7 Jul 2015 10:50:43 +0530 Subject: mac802154: fix ieee802154_rx handling Instead of passing ieee802154_hw pointer to ieee802154_rx, we can directly pass the ieee802154_local pointer. Signed-off-by: Varka Bhadram Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/ieee802154_i.h | 2 +- net/mac802154/main.c | 2 +- net/mac802154/rx.c | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index eb8502a6e719..6810d7a25aca 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -124,7 +124,7 @@ ieee802154_sdata_running(struct ieee802154_sub_if_data *sdata) extern struct ieee802154_mlme_ops mac802154_mlme_wpan; -void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb); +void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb); netdev_tx_t ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 4caf04b676d7..91f120845a45 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -40,7 +40,7 @@ static void ieee802154_tasklet_handler(unsigned long data) * netstack. */ skb->pkt_type = 0; - ieee802154_rx(&local->hw, skb); + ieee802154_rx(local, skb); break; default: WARN(1, "mac802154: Packet is of unknown type %d\n", diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index 7791c9b8cb57..d1c33c1d6b9b 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -246,9 +246,8 @@ ieee802154_monitors_rx(struct ieee802154_local *local, struct sk_buff *skb) } } -void ieee802154_rx(struct ieee802154_hw *hw, struct sk_buff *skb) +void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb) { - struct ieee802154_local *local = hw_to_local(hw); u16 crc; WARN_ON_ONCE(softirq_count() == 0); -- cgit v1.2.3 From 2baea85dec1aebe0b100d4836dee8bcf29a51e94 Mon Sep 17 00:00:00 2001 From: Dean Jenkins Date: Tue, 23 Jun 2015 17:59:32 +0100 Subject: Bluetooth: L2CAP ERTM shutdown protect sk and chan During execution of l2cap_sock_shutdown() which might sleep, the sk and chan structures can be in an unlocked condition which potentially allows the structures to be freed by other running threads. Therefore, there is a possibility of a malfunction or memory reuse after being freed. Keep the sk and chan structures alive during the execution of l2cap_sock_shutdown() by using their respective hold and put functions. This allows the structures to be freeable at the end of l2cap_sock_shutdown(). Signed-off-by: Kautuk Consul Signed-off-by: Dean Jenkins Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_sock.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 244287706f91..3794c2386c21 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1098,7 +1098,12 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) if (!sk) return 0; + /* prevent sk structure from being freed whilst unlocked */ + sock_hold(sk); + chan = l2cap_pi(sk)->chan; + /* prevent chan structure from being freed whilst unlocked */ + l2cap_chan_hold(chan); conn = chan->conn; BT_DBG("chan %p state %s", chan, state_to_string(chan->state)); @@ -1134,6 +1139,9 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) if (conn) mutex_unlock(&conn->chan_lock); + l2cap_chan_put(chan); + sock_put(sk); + return err; } -- cgit v1.2.3 From f65468f6e26c3bd05e642e10e80a485b99b7de05 Mon Sep 17 00:00:00 2001 From: Dean Jenkins Date: Tue, 23 Jun 2015 17:59:33 +0100 Subject: Bluetooth: Make __l2cap_wait_ack more efficient Use chan->state instead of chan->conn because waiting for ACK's is only possible in the BT_CONNECTED state. Also avoids reference to the conn structure so makes locking easier. Only call __l2cap_wait_ack() when the needed condition of chan->unacked_frames > 0 && chan->state == BT_CONNECTED is true and convert the while loop to a do while loop. __l2cap_wait_ack() change the function prototype to pass in the chan variable as chan is already available in the calling function l2cap_sock_shutdown(). Avoids locking issues. Signed-off-by: Dean Jenkins Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_sock.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 3794c2386c21..29042880c449 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1054,16 +1054,15 @@ static void l2cap_sock_kill(struct sock *sk) sock_put(sk); } -static int __l2cap_wait_ack(struct sock *sk) +static int __l2cap_wait_ack(struct sock *sk, struct l2cap_chan *chan) { - struct l2cap_chan *chan = l2cap_pi(sk)->chan; DECLARE_WAITQUEUE(wait, current); int err = 0; int timeo = HZ/5; add_wait_queue(sk_sleep(sk), &wait); set_current_state(TASK_INTERRUPTIBLE); - while (chan->unacked_frames > 0 && chan->conn) { + do { if (!timeo) timeo = HZ/5; @@ -1080,7 +1079,10 @@ static int __l2cap_wait_ack(struct sock *sk) err = sock_error(sk); if (err) break; - } + + } while (chan->unacked_frames > 0 && + chan->state == BT_CONNECTED); + set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return err; @@ -1115,8 +1117,10 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) lock_sock(sk); if (!sk->sk_shutdown) { - if (chan->mode == L2CAP_MODE_ERTM) - err = __l2cap_wait_ack(sk); + if (chan->mode == L2CAP_MODE_ERTM && + chan->unacked_frames > 0 && + chan->state == BT_CONNECTED) + err = __l2cap_wait_ack(sk, chan); sk->sk_shutdown = SHUTDOWN_MASK; -- cgit v1.2.3 From 451e4c6c6b3fd1a9f446a10eb9f6d4c2c476043c Mon Sep 17 00:00:00 2001 From: Dean Jenkins Date: Tue, 23 Jun 2015 17:59:37 +0100 Subject: Bluetooth: Add BT_DBG to l2cap_sock_shutdown() Add helpful BT_DBG debug to l2cap_sock_shutdown() and __l2cap_wait_ack() so that the code flow can be analysed. Signed-off-by: Dean Jenkins Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_sock.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 29042880c449..d915e4a96313 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1063,6 +1063,8 @@ static int __l2cap_wait_ack(struct sock *sk, struct l2cap_chan *chan) add_wait_queue(sk_sleep(sk), &wait); set_current_state(TASK_INTERRUPTIBLE); do { + BT_DBG("Waiting for %d ACKs", chan->unacked_frames); + if (!timeo) timeo = HZ/5; @@ -1146,6 +1148,8 @@ static int l2cap_sock_shutdown(struct socket *sock, int how) l2cap_chan_put(chan); sock_put(sk); + BT_DBG("err: %d", err); + return err; } -- cgit v1.2.3 From cb02a25583b59ce48267472cd092485d754964f9 Mon Sep 17 00:00:00 2001 From: Dean Jenkins Date: Tue, 23 Jun 2015 17:59:38 +0100 Subject: Bluetooth: __l2cap_wait_ack() use msecs_to_jiffies() Use msecs_to_jiffies() instead of using HZ so that it is easier to specify the time in milliseconds. Also add a #define L2CAP_WAIT_ACK_POLL_PERIOD to specify the 200ms polling period so that it is defined in a single place. Signed-off-by: Dean Jenkins Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 1 + net/bluetooth/l2cap_sock.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 2239a3753092..3dcad4159b0b 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -55,6 +55,7 @@ #define L2CAP_INFO_TIMEOUT msecs_to_jiffies(4000) #define L2CAP_MOVE_TIMEOUT msecs_to_jiffies(4000) #define L2CAP_MOVE_ERTX_TIMEOUT msecs_to_jiffies(60000) +#define L2CAP_WAIT_ACK_POLL_PERIOD msecs_to_jiffies(200) #define L2CAP_A2MP_DEFAULT_MTU 670 diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index d915e4a96313..f0b052a75e8a 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1058,7 +1058,7 @@ static int __l2cap_wait_ack(struct sock *sk, struct l2cap_chan *chan) { DECLARE_WAITQUEUE(wait, current); int err = 0; - int timeo = HZ/5; + int timeo = L2CAP_WAIT_ACK_POLL_PERIOD; add_wait_queue(sk_sleep(sk), &wait); set_current_state(TASK_INTERRUPTIBLE); @@ -1066,7 +1066,7 @@ static int __l2cap_wait_ack(struct sock *sk, struct l2cap_chan *chan) BT_DBG("Waiting for %d ACKs", chan->unacked_frames); if (!timeo) - timeo = HZ/5; + timeo = L2CAP_WAIT_ACK_POLL_PERIOD; if (signal_pending(current)) { err = sock_intr_errno(timeo); -- cgit v1.2.3 From e432c72c464d2deb6c66d1e2a5f548dc1f0ef4dc Mon Sep 17 00:00:00 2001 From: Dean Jenkins Date: Tue, 23 Jun 2015 17:59:39 +0100 Subject: Bluetooth: __l2cap_wait_ack() add defensive timeout Add a timeout to prevent the do while loop running in an infinite loop. This ensures that the channel will be instructed to close within 10 seconds so prevents l2cap_sock_shutdown() getting stuck forever. Returns -ENOLINK when the timeout is reached. The channel will be subequently closed and not all data will be ACK'ed. Signed-off-by: Dean Jenkins Signed-off-by: Marcel Holtmann --- include/net/bluetooth/l2cap.h | 1 + net/bluetooth/l2cap_sock.c | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 3dcad4159b0b..c98afc08cc26 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -56,6 +56,7 @@ #define L2CAP_MOVE_TIMEOUT msecs_to_jiffies(4000) #define L2CAP_MOVE_ERTX_TIMEOUT msecs_to_jiffies(60000) #define L2CAP_WAIT_ACK_POLL_PERIOD msecs_to_jiffies(200) +#define L2CAP_WAIT_ACK_TIMEOUT msecs_to_jiffies(10000) #define L2CAP_A2MP_DEFAULT_MTU 670 diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index f0b052a75e8a..586b3d580cfc 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -1059,11 +1059,15 @@ static int __l2cap_wait_ack(struct sock *sk, struct l2cap_chan *chan) DECLARE_WAITQUEUE(wait, current); int err = 0; int timeo = L2CAP_WAIT_ACK_POLL_PERIOD; + /* Timeout to prevent infinite loop */ + unsigned long timeout = jiffies + L2CAP_WAIT_ACK_TIMEOUT; add_wait_queue(sk_sleep(sk), &wait); set_current_state(TASK_INTERRUPTIBLE); do { - BT_DBG("Waiting for %d ACKs", chan->unacked_frames); + BT_DBG("Waiting for %d ACKs, timeout %04d ms", + chan->unacked_frames, time_after(jiffies, timeout) ? 0 : + jiffies_to_msecs(timeout - jiffies)); if (!timeo) timeo = L2CAP_WAIT_ACK_POLL_PERIOD; @@ -1082,6 +1086,11 @@ static int __l2cap_wait_ack(struct sock *sk, struct l2cap_chan *chan) if (err) break; + if (time_after(jiffies, timeout)) { + err = -ENOLINK; + break; + } + } while (chan->unacked_frames > 0 && chan->state == BT_CONNECTED); -- cgit v1.2.3 From 9a0a8a8e852dabeda848baafac19627cb469d5e9 Mon Sep 17 00:00:00 2001 From: Jakub Pawlowski Date: Mon, 20 Jul 2015 13:12:49 +0200 Subject: Bluetooth: Move IRK checking logic in preparation to new connect method Move IRK checking logic in preparation to new connect method. Also make sure that MGMT_STATUS_INVALID_PARAMS is returned when non identity address is passed to ADD_DEVICE. Right now MGMT_STATUS_FAILED is returned, which might be misleading. Signed-off-by: Jakub Pawlowski Signed-off-by: Johan Hedberg --- net/bluetooth/hci_core.c | 11 ----------- net/bluetooth/mgmt.c | 23 +++++++++++++++++++++++ 2 files changed, 23 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 2f8fb33067e1..bc43b6490555 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2822,10 +2822,6 @@ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, { struct hci_conn_params *params; - /* The conn params list only contains identity addresses */ - if (!hci_is_identity_address(addr, addr_type)) - return NULL; - list_for_each_entry(params, &hdev->le_conn_params, list) { if (bacmp(¶ms->addr, addr) == 0 && params->addr_type == addr_type) { @@ -2842,10 +2838,6 @@ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, { struct hci_conn_params *param; - /* The list only contains identity addresses */ - if (!hci_is_identity_address(addr, addr_type)) - return NULL; - list_for_each_entry(param, list, action) { if (bacmp(¶m->addr, addr) == 0 && param->addr_type == addr_type) @@ -2861,9 +2853,6 @@ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, { struct hci_conn_params *params; - if (!hci_is_identity_address(addr, addr_type)) - return NULL; - params = hci_conn_params_lookup(hdev, addr, addr_type); if (params) return params; diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 7998fb279165..7ab191589541 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -6226,6 +6226,17 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, else auto_conn = HCI_AUTO_CONN_REPORT; + /* Kernel internally uses conn_params with resolvable private + * address, but Add Device allows only identity addresses. + * Make sure it is enforced before calling + * hci_conn_params_lookup. + */ + if (!hci_is_identity_address(&cp->addr.bdaddr, addr_type)) { + err = cmd->cmd_complete(cmd, MGMT_STATUS_INVALID_PARAMS); + mgmt_pending_remove(cmd); + goto unlock; + } + /* If the connection parameters don't exist for this device, * they will be created and configured with defaults. */ @@ -6340,6 +6351,18 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, else addr_type = ADDR_LE_DEV_RANDOM; + /* Kernel internally uses conn_params with resolvable private + * address, but Remove Device allows only identity addresses. + * Make sure it is enforced before calling + * hci_conn_params_lookup. + */ + if (!hci_is_identity_address(&cp->addr.bdaddr, addr_type)) { + err = cmd->cmd_complete(cmd, + MGMT_STATUS_INVALID_PARAMS); + mgmt_pending_remove(cmd); + goto unlock; + } + params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, addr_type); if (!params) { -- cgit v1.2.3 From 6ca91c604074788dfeb7dd714e74f0ad6bea8a77 Mon Sep 17 00:00:00 2001 From: Rosen, Rami Date: Wed, 22 Jul 2015 07:57:02 +0300 Subject: bridge: Fix setting a flag in br_fill_ifvlaninfo_range(). This patch fixes setting of vinfo.flags in the br_fill_ifvlaninfo_range() method. The assignment of vinfo.flags &= ~BRIDGE_VLAN_INFO_RANGE_BEGIN has no effect and is unneeded, as vinfo.flags value is overriden by the immediately following vinfo.flags = flags | BRIDGE_VLAN_INFO_RANGE_END assignement. Signed-off-by: Rami Rosen Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 364bdc98bd9b..793d247ac2ca 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -164,8 +164,6 @@ static int br_fill_ifvlaninfo_range(struct sk_buff *skb, u16 vid_start, sizeof(vinfo), &vinfo)) goto nla_put_failure; - vinfo.flags &= ~BRIDGE_VLAN_INFO_RANGE_BEGIN; - vinfo.vid = vid_end; vinfo.flags = flags | BRIDGE_VLAN_INFO_RANGE_END; if (nla_put(skb, IFLA_BRIDGE_VLAN_INFO, -- cgit v1.2.3 From 29042e19f2c602fabe4705b5b719550b4627639c Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Wed, 22 Jul 2015 10:11:18 -0400 Subject: tipc: let function tipc_msg_reverse() expand header when needed The shortest TIPC message header, for cluster local CONNECTED messages, is 24 bytes long. With this format, the fields "dest_node" and "orig_node" are optimized away, since they in reality are redundant in this particular case. However, the absence of these fields leads to code inconsistencies that are difficult to handle in some cases, especially when we need to reverse or reject messages at the socket layer. In this commit, we concentrate the handling of the absent fields to one place, by letting the function tipc_msg_reverse() reallocate the buffer and expand the header to 32 bytes when necessary. This means that the socket code now can assume that the two previously absent fields are present in the header when a message needs to be rejected. This opens up for some further simplifications of the socket code. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/msg.c | 67 ++++++++++++++++++++++++++++++++++--------------------- net/tipc/msg.h | 3 +-- net/tipc/socket.c | 12 +++++----- 3 files changed, 48 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 08b4cc7d496d..4339aab93034 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -463,43 +463,58 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, /** * tipc_msg_reverse(): swap source and destination addresses and add error code - * @buf: buffer containing message to be reversed - * @dnode: return value: node where to send message after reversal - * @err: error code to be set in message - * Consumes buffer if failure + * @own_node: originating node id for reversed message + * @skb: buffer containing message to be reversed; may be replaced. + * @err: error code to be set in message, if any + * Consumes buffer at failure * Returns true if success, otherwise false */ -bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode, - int err) +bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, u32 *dnode, int err) { - struct tipc_msg *msg = buf_msg(buf); + struct sk_buff *_skb = *skb; + struct tipc_msg *hdr = buf_msg(_skb); struct tipc_msg ohdr; - uint rdsz = min_t(uint, msg_data_sz(msg), MAX_FORWARD_SIZE); + int dlen = min_t(uint, msg_data_sz(hdr), MAX_FORWARD_SIZE); - if (skb_linearize(buf)) + if (skb_linearize(_skb)) goto exit; - msg = buf_msg(buf); - if (msg_dest_droppable(msg)) + hdr = buf_msg(_skb); + if (msg_dest_droppable(hdr)) goto exit; - if (msg_errcode(msg)) + if (msg_errcode(hdr)) goto exit; - memcpy(&ohdr, msg, msg_hdr_sz(msg)); - msg_set_errcode(msg, err); - msg_set_origport(msg, msg_destport(&ohdr)); - msg_set_destport(msg, msg_origport(&ohdr)); - msg_set_prevnode(msg, own_addr); - if (!msg_short(msg)) { - msg_set_orignode(msg, msg_destnode(&ohdr)); - msg_set_destnode(msg, msg_orignode(&ohdr)); + + /* Take a copy of original header before altering message */ + memcpy(&ohdr, hdr, msg_hdr_sz(hdr)); + + /* Never return SHORT header; expand by replacing buffer if necessary */ + if (msg_short(hdr)) { + *skb = tipc_buf_acquire(BASIC_H_SIZE + dlen); + if (!*skb) + goto exit; + memcpy((*skb)->data + BASIC_H_SIZE, msg_data(hdr), dlen); + kfree_skb(_skb); + _skb = *skb; + hdr = buf_msg(_skb); + memcpy(hdr, &ohdr, BASIC_H_SIZE); + msg_set_hdr_sz(hdr, BASIC_H_SIZE); } - msg_set_size(msg, msg_hdr_sz(msg) + rdsz); - skb_trim(buf, msg_size(msg)); - skb_orphan(buf); - *dnode = msg_orignode(&ohdr); + + /* Now reverse the concerned fields */ + msg_set_errcode(hdr, err); + msg_set_origport(hdr, msg_destport(&ohdr)); + msg_set_destport(hdr, msg_origport(&ohdr)); + msg_set_destnode(hdr, msg_prevnode(&ohdr)); + msg_set_prevnode(hdr, own_node); + msg_set_orignode(hdr, own_node); + msg_set_size(hdr, msg_hdr_sz(hdr) + dlen); + *dnode = msg_destnode(hdr); + skb_trim(_skb, msg_size(hdr)); + skb_orphan(_skb); return true; exit: - kfree_skb(buf); - *dnode = 0; + kfree_skb(_skb); + *skb = NULL; return false; } diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 2f1563b47e24..0e96f59e3315 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -785,8 +785,7 @@ static inline bool msg_peer_is_up(struct tipc_msg *m) struct sk_buff *tipc_buf_acquire(u32 size); bool tipc_msg_validate(struct sk_buff *skb); -bool tipc_msg_reverse(u32 own_addr, struct sk_buff *buf, u32 *dnode, - int err); +bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, u32 *dnode, int err); void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode); struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 5b0b08d58fcc..e2d5b9831485 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -260,7 +260,7 @@ static void tsk_rej_rx_queue(struct sock *sk) u32 own_node = tsk_own_node(tipc_sk(sk)); while ((skb = __skb_dequeue(&sk->sk_receive_queue))) { - if (tipc_msg_reverse(own_node, skb, &dnode, TIPC_ERR_NO_PORT)) + if (tipc_msg_reverse(own_node, &skb, &dnode, TIPC_ERR_NO_PORT)) tipc_node_xmit_skb(sock_net(sk), skb, dnode, 0); } } @@ -441,7 +441,7 @@ static int tipc_release(struct socket *sock) tsk->connected = 0; tipc_node_remove_conn(net, dnode, tsk->portid); } - if (tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode, + if (tipc_msg_reverse(tsk_own_node(tsk), &skb, &dnode, TIPC_ERR_NO_PORT)) tipc_node_xmit_skb(net, skb, dnode, 0); } @@ -784,7 +784,7 @@ static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff **skb) if (conn_cong) tsk->sk.sk_write_space(&tsk->sk); } else if (msg_type(msg) == CONN_PROBE) { - if (tipc_msg_reverse(own_node, *skb, &dnode, TIPC_OK)) { + if (tipc_msg_reverse(own_node, skb, &dnode, TIPC_OK)) { msg_set_type(msg, CONN_PROBE_REPLY); return; } @@ -1702,7 +1702,7 @@ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb) atomic_add(truesize, dcnt); return 0; } - if (!err || tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode, -err)) + if (!err || tipc_msg_reverse(tsk_own_node(tsk), &skb, &dnode, -err)) tipc_node_xmit_skb(net, skb, dnode, tsk->portid); return 0; } @@ -1796,7 +1796,7 @@ int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) goto xmit; } tn = net_generic(net, tipc_net_id); - if (!tipc_msg_reverse(tn->own_addr, skb, &dnode, -err)) + if (!tipc_msg_reverse(tn->own_addr, &skb, &dnode, -err)) continue; xmit: tipc_node_xmit_skb(net, skb, dnode, dport); @@ -2090,7 +2090,7 @@ restart: kfree_skb(skb); goto restart; } - if (tipc_msg_reverse(tsk_own_node(tsk), skb, &dnode, + if (tipc_msg_reverse(tsk_own_node(tsk), &skb, &dnode, TIPC_CONN_SHUTDOWN)) tipc_node_xmit_skb(net, skb, dnode, tsk->portid); -- cgit v1.2.3 From bcd3ffd4f6d7c994c93be2ab8598fdfb2952a1f1 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Wed, 22 Jul 2015 10:11:19 -0400 Subject: tipc: introduce new tipc_sk_respond() function Currently, we use the code sequence if (msg_reverse()) tipc_link_xmit_skb() at numerous locations in socket.c. The preparation of arguments for these calls, as well as the sequence itself, makes the code unecessarily complex. In this commit, we introduce a new function, tipc_sk_respond(), that performs this call combination. We also replace some, but not yet all, of these explicit call sequences with calls to the new function. Notably, we let the function tipc_sk_proto_rcv() use the new function to directly send out PROBE_REPLY messages, instead of deferring this to the calling tipc_sk_rcv() function, as we do now. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/msg.c | 3 +-- net/tipc/msg.h | 2 +- net/tipc/socket.c | 81 ++++++++++++++++++++++++++++++------------------------- 3 files changed, 47 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 4339aab93034..b6cc58ec7346 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -469,7 +469,7 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, * Consumes buffer at failure * Returns true if success, otherwise false */ -bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, u32 *dnode, int err) +bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, int err) { struct sk_buff *_skb = *skb; struct tipc_msg *hdr = buf_msg(_skb); @@ -508,7 +508,6 @@ bool tipc_msg_reverse(u32 own_node, struct sk_buff **skb, u32 *dnode, int err) msg_set_prevnode(hdr, own_node); msg_set_orignode(hdr, own_node); msg_set_size(hdr, msg_hdr_sz(hdr) + dlen); - *dnode = msg_destnode(hdr); skb_trim(_skb, msg_size(hdr)); skb_orphan(_skb); return true; diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 0e96f59e3315..d0834bc519aa 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -785,7 +785,7 @@ static inline bool msg_peer_is_up(struct tipc_msg *m) struct sk_buff *tipc_buf_acquire(u32 size); bool tipc_msg_validate(struct sk_buff *skb); -bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, u32 *dnode, int err); +bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode); struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, diff --git a/net/tipc/socket.c b/net/tipc/socket.c index e2d5b9831485..71d88adadb18 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -248,6 +248,22 @@ static void tsk_advance_rx_queue(struct sock *sk) kfree_skb(__skb_dequeue(&sk->sk_receive_queue)); } +/* tipc_sk_respond() : send response message back to sender + */ +static void tipc_sk_respond(struct sock *sk, struct sk_buff *skb, int err) +{ + u32 selector; + u32 dnode; + u32 onode = tipc_own_addr(sock_net(sk)); + + if (!tipc_msg_reverse(onode, &skb, err)) + return; + + dnode = msg_destnode(buf_msg(skb)); + selector = msg_origport(buf_msg(skb)); + tipc_node_xmit_skb(sock_net(sk), skb, dnode, selector); +} + /** * tsk_rej_rx_queue - reject all buffers in socket receive queue * @@ -256,13 +272,9 @@ static void tsk_advance_rx_queue(struct sock *sk) static void tsk_rej_rx_queue(struct sock *sk) { struct sk_buff *skb; - u32 dnode; - u32 own_node = tsk_own_node(tipc_sk(sk)); - while ((skb = __skb_dequeue(&sk->sk_receive_queue))) { - if (tipc_msg_reverse(own_node, &skb, &dnode, TIPC_ERR_NO_PORT)) - tipc_node_xmit_skb(sock_net(sk), skb, dnode, 0); - } + while ((skb = __skb_dequeue(&sk->sk_receive_queue))) + tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT); } /* tsk_peer_msg - verify if message was sent by connected port's peer @@ -441,9 +453,7 @@ static int tipc_release(struct socket *sock) tsk->connected = 0; tipc_node_remove_conn(net, dnode, tsk->portid); } - if (tipc_msg_reverse(tsk_own_node(tsk), &skb, &dnode, - TIPC_ERR_NO_PORT)) - tipc_node_xmit_skb(net, skb, dnode, 0); + tipc_sk_respond(sk, skb, TIPC_ERR_NO_PORT); } } @@ -764,35 +774,35 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, /** * tipc_sk_proto_rcv - receive a connection mng protocol message * @tsk: receiving socket - * @skb: pointer to message buffer. Set to NULL if buffer is consumed. + * @skb: pointer to message buffer. */ -static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff **skb) +static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb) { - struct tipc_msg *msg = buf_msg(*skb); + struct sock *sk = &tsk->sk; + struct tipc_msg *hdr = buf_msg(skb); + int mtyp = msg_type(hdr); int conn_cong; - u32 dnode; - u32 own_node = tsk_own_node(tsk); + /* Ignore if connection cannot be validated: */ - if (!tsk_peer_msg(tsk, msg)) + if (!tsk_peer_msg(tsk, hdr)) goto exit; tsk->probing_state = TIPC_CONN_OK; - if (msg_type(msg) == CONN_ACK) { + if (mtyp == CONN_PROBE) { + msg_set_type(hdr, CONN_PROBE_REPLY); + tipc_sk_respond(sk, skb, TIPC_OK); + return; + } else if (mtyp == CONN_ACK) { conn_cong = tsk_conn_cong(tsk); - tsk->sent_unacked -= msg_msgcnt(msg); + tsk->sent_unacked -= msg_msgcnt(hdr); if (conn_cong) - tsk->sk.sk_write_space(&tsk->sk); - } else if (msg_type(msg) == CONN_PROBE) { - if (tipc_msg_reverse(own_node, skb, &dnode, TIPC_OK)) { - msg_set_type(msg, CONN_PROBE_REPLY); - return; - } + sk->sk_write_space(sk); + } else if (mtyp != CONN_PROBE_REPLY) { + pr_warn("Received unknown CONN_PROTO msg\n"); } - /* Do nothing if msg_type() == CONN_PROBE_REPLY */ exit: - kfree_skb(*skb); - *skb = NULL; + kfree_skb(skb); } static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p) @@ -1638,7 +1648,7 @@ static int filter_rcv(struct sock *sk, struct sk_buff **skb) int rc = TIPC_OK; if (unlikely(msg_user(msg) == CONN_MANAGER)) { - tipc_sk_proto_rcv(tsk, skb); + tipc_sk_proto_rcv(tsk, *skb); return TIPC_OK; } @@ -1690,7 +1700,7 @@ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int err; atomic_t *dcnt; - u32 dnode; + u32 dnode = msg_prevnode(buf_msg(skb)); struct tipc_sock *tsk = tipc_sk(sk); struct net *net = sock_net(sk); uint truesize = skb->truesize; @@ -1702,7 +1712,7 @@ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb) atomic_add(truesize, dcnt); return 0; } - if (!err || tipc_msg_reverse(tsk_own_node(tsk), &skb, &dnode, -err)) + if (!err || tipc_msg_reverse(tsk_own_node(tsk), &skb, -err)) tipc_node_xmit_skb(net, skb, dnode, tsk->portid); return 0; } @@ -1794,9 +1804,11 @@ int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) if (!err) { dnode = msg_destnode(buf_msg(skb)); goto xmit; + } else { + dnode = msg_prevnode(buf_msg(skb)); } tn = net_generic(net, tipc_net_id); - if (!tipc_msg_reverse(tn->own_addr, &skb, &dnode, -err)) + if (!tipc_msg_reverse(tn->own_addr, &skb, -err)) continue; xmit: tipc_node_xmit_skb(net, skb, dnode, dport); @@ -2083,6 +2095,8 @@ static int tipc_shutdown(struct socket *sock, int how) case SS_CONNECTED: restart: + dnode = tsk_peer_node(tsk); + /* Disconnect and send a 'FIN+' or 'FIN-' message to peer */ skb = __skb_dequeue(&sk->sk_receive_queue); if (skb) { @@ -2090,13 +2104,8 @@ restart: kfree_skb(skb); goto restart; } - if (tipc_msg_reverse(tsk_own_node(tsk), &skb, &dnode, - TIPC_CONN_SHUTDOWN)) - tipc_node_xmit_skb(net, skb, dnode, - tsk->portid); + tipc_sk_respond(sk, skb, TIPC_CONN_SHUTDOWN); } else { - dnode = tsk_peer_node(tsk); - skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, 0, dnode, tsk_own_node(tsk), -- cgit v1.2.3 From cda3696d3d26eb798c94de0dab5bd66ddb5627cb Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Wed, 22 Jul 2015 10:11:20 -0400 Subject: tipc: clean up socket layer message reception When a message is received in a socket, one of the call chains tipc_sk_rcv()->tipc_sk_enqueue()->filter_rcv()(->tipc_sk_proto_rcv()) or tipc_sk_backlog_rcv()->filter_rcv()(->tipc_sk_proto_rcv()) are followed. At each of these levels we may encounter situations where the message may need to be rejected, or a new message produced for transfer back to the sender. Despite recent improvements, the current code for doing this is perceived as awkward and hard to follow. Leveraging the two previous commits in this series, we now introduce a more uniform handling of such situations. We let each of the functions in the chain itself produce/reverse the message to be returned to the sender, but also perform the actual forwarding. This simplifies the necessary logics within each function. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/msg.c | 20 ++--- net/tipc/msg.h | 3 +- net/tipc/socket.c | 255 ++++++++++++++++++++++++++---------------------------- net/tipc/socket.h | 2 +- 4 files changed, 134 insertions(+), 146 deletions(-) (limited to 'net') diff --git a/net/tipc/msg.c b/net/tipc/msg.c index b6cc58ec7346..562c926a51cc 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -520,17 +520,15 @@ exit: /** * tipc_msg_lookup_dest(): try to find new destination for named message * @skb: the buffer containing the message. - * @dnode: return value: next-hop node, if destination found - * @err: return value: error code to use, if message to be rejected + * @err: error code to be used by caller if lookup fails * Does not consume buffer * Returns true if a destination is found, false otherwise */ -bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, - u32 *dnode, int *err) +bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err) { struct tipc_msg *msg = buf_msg(skb); - u32 dport; - u32 own_addr = tipc_own_addr(net); + u32 dport, dnode; + u32 onode = tipc_own_addr(net); if (!msg_isdata(msg)) return false; @@ -543,15 +541,15 @@ bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, return false; if (msg_reroute_cnt(msg)) return false; - *dnode = addr_domain(net, msg_lookup_scope(msg)); + dnode = addr_domain(net, msg_lookup_scope(msg)); dport = tipc_nametbl_translate(net, msg_nametype(msg), - msg_nameinst(msg), dnode); + msg_nameinst(msg), &dnode); if (!dport) return false; msg_incr_reroute_cnt(msg); - if (*dnode != own_addr) - msg_set_prevnode(msg, own_addr); - msg_set_destnode(msg, *dnode); + if (dnode != onode) + msg_set_prevnode(msg, onode); + msg_set_destnode(msg, dnode); msg_set_destport(msg, dport); *err = TIPC_OK; return true; diff --git a/net/tipc/msg.h b/net/tipc/msg.h index d0834bc519aa..234fb0531d1d 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -798,8 +798,7 @@ bool tipc_msg_make_bundle(struct sk_buff **skb, struct tipc_msg *msg, bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); -bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, u32 *dnode, - int *err); +bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); struct sk_buff *tipc_msg_reassemble(struct sk_buff_head *list); static inline u16 buf_seqno(struct sk_buff *skb) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 71d88adadb18..1060d52ff23e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1520,82 +1520,81 @@ static void tipc_data_ready(struct sock *sk) * @tsk: TIPC socket * @skb: pointer to message buffer. Set to NULL if buffer is consumed * - * Returns 0 (TIPC_OK) if everything ok, -TIPC_ERR_NO_PORT otherwise + * Returns true if everything ok, false otherwise */ -static int filter_connect(struct tipc_sock *tsk, struct sk_buff **skb) +static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) { struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); struct socket *sock = sk->sk_socket; - struct tipc_msg *msg = buf_msg(*skb); - int retval = -TIPC_ERR_NO_PORT; + struct tipc_msg *hdr = buf_msg(skb); - if (msg_mcast(msg)) - return retval; + if (unlikely(msg_mcast(hdr))) + return false; switch ((int)sock->state) { case SS_CONNECTED: + /* Accept only connection-based messages sent by peer */ - if (tsk_peer_msg(tsk, msg)) { - if (unlikely(msg_errcode(msg))) { - sock->state = SS_DISCONNECTING; - tsk->connected = 0; - /* let timer expire on it's own */ - tipc_node_remove_conn(net, tsk_peer_node(tsk), - tsk->portid); - } - retval = TIPC_OK; + if (unlikely(!tsk_peer_msg(tsk, hdr))) + return false; + + if (unlikely(msg_errcode(hdr))) { + sock->state = SS_DISCONNECTING; + tsk->connected = 0; + /* Let timer expire on it's own */ + tipc_node_remove_conn(net, tsk_peer_node(tsk), + tsk->portid); } - break; + return true; + case SS_CONNECTING: - /* Accept only ACK or NACK message */ - if (unlikely(!msg_connected(msg))) - break; + /* Accept only ACK or NACK message */ + if (unlikely(!msg_connected(hdr))) + return false; - if (unlikely(msg_errcode(msg))) { + if (unlikely(msg_errcode(hdr))) { sock->state = SS_DISCONNECTING; sk->sk_err = ECONNREFUSED; - retval = TIPC_OK; - break; + return true; } - if (unlikely(msg_importance(msg) > TIPC_CRITICAL_IMPORTANCE)) { + if (unlikely(!msg_isdata(hdr))) { sock->state = SS_DISCONNECTING; sk->sk_err = EINVAL; - retval = TIPC_OK; - break; + return true; } - tipc_sk_finish_conn(tsk, msg_origport(msg), msg_orignode(msg)); - msg_set_importance(&tsk->phdr, msg_importance(msg)); + tipc_sk_finish_conn(tsk, msg_origport(hdr), msg_orignode(hdr)); + msg_set_importance(&tsk->phdr, msg_importance(hdr)); sock->state = SS_CONNECTED; - /* If an incoming message is an 'ACK-', it should be - * discarded here because it doesn't contain useful - * data. In addition, we should try to wake up - * connect() routine if sleeping. - */ - if (msg_data_sz(msg) == 0) { - kfree_skb(*skb); - *skb = NULL; - if (waitqueue_active(sk_sleep(sk))) - wake_up_interruptible(sk_sleep(sk)); - } - retval = TIPC_OK; - break; + /* If 'ACK+' message, add to socket receive queue */ + if (msg_data_sz(hdr)) + return true; + + /* If empty 'ACK-' message, wake up sleeping connect() */ + if (waitqueue_active(sk_sleep(sk))) + wake_up_interruptible(sk_sleep(sk)); + + /* 'ACK-' message is neither accepted nor rejected: */ + msg_set_dest_droppable(hdr, 1); + return false; + case SS_LISTENING: case SS_UNCONNECTED: + /* Accept only SYN message */ - if (!msg_connected(msg) && !(msg_errcode(msg))) - retval = TIPC_OK; + if (!msg_connected(hdr) && !(msg_errcode(hdr))) + return true; break; case SS_DISCONNECTING: break; default: pr_err("Unknown socket state %u\n", sock->state); } - return retval; + return false; } /** @@ -1630,61 +1629,70 @@ static unsigned int rcvbuf_limit(struct sock *sk, struct sk_buff *buf) /** * filter_rcv - validate incoming message * @sk: socket - * @skb: pointer to message. Set to NULL if buffer is consumed. + * @skb: pointer to message. * * Enqueues message on receive queue if acceptable; optionally handles * disconnect indication for a connected socket. * * Called with socket lock already taken * - * Returns 0 (TIPC_OK) if message was ok, -TIPC error code if rejected + * Returns true if message was added to socket receive queue, otherwise false */ -static int filter_rcv(struct sock *sk, struct sk_buff **skb) +static bool filter_rcv(struct sock *sk, struct sk_buff *skb) { struct socket *sock = sk->sk_socket; struct tipc_sock *tsk = tipc_sk(sk); - struct tipc_msg *msg = buf_msg(*skb); - unsigned int limit = rcvbuf_limit(sk, *skb); - int rc = TIPC_OK; + struct tipc_msg *hdr = buf_msg(skb); + unsigned int limit = rcvbuf_limit(sk, skb); + int err = TIPC_OK; + int usr = msg_user(hdr); - if (unlikely(msg_user(msg) == CONN_MANAGER)) { - tipc_sk_proto_rcv(tsk, *skb); - return TIPC_OK; + if (unlikely(msg_user(hdr) == CONN_MANAGER)) { + tipc_sk_proto_rcv(tsk, skb); + return false; } - if (unlikely(msg_user(msg) == SOCK_WAKEUP)) { - kfree_skb(*skb); + if (unlikely(usr == SOCK_WAKEUP)) { + kfree_skb(skb); tsk->link_cong = 0; sk->sk_write_space(sk); - *skb = NULL; - return TIPC_OK; + return false; } - /* Reject message if it is wrong sort of message for socket */ - if (msg_type(msg) > TIPC_DIRECT_MSG) - return -TIPC_ERR_NO_PORT; + /* Drop if illegal message type */ + if (unlikely(msg_type(hdr) > TIPC_DIRECT_MSG)) { + kfree_skb(skb); + return false; + } - if (sock->state == SS_READY) { - if (msg_connected(msg)) - return -TIPC_ERR_NO_PORT; - } else { - rc = filter_connect(tsk, skb); - if (rc != TIPC_OK || !*skb) - return rc; + /* Reject if wrong message type for current socket state */ + if (unlikely(sock->state == SS_READY)) { + if (msg_connected(hdr)) { + err = TIPC_ERR_NO_PORT; + goto reject; + } + } else if (unlikely(!filter_connect(tsk, skb))) { + err = TIPC_ERR_NO_PORT; + goto reject; } /* Reject message if there isn't room to queue it */ - if (sk_rmem_alloc_get(sk) + (*skb)->truesize >= limit) - return -TIPC_ERR_OVERLOAD; + if (unlikely(sk_rmem_alloc_get(sk) + skb->truesize >= limit)) { + err = TIPC_ERR_OVERLOAD; + goto reject; + } /* Enqueue message */ - TIPC_SKB_CB(*skb)->handle = NULL; - __skb_queue_tail(&sk->sk_receive_queue, *skb); - skb_set_owner_r(*skb, sk); + TIPC_SKB_CB(skb)->handle = NULL; + __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_set_owner_r(skb, sk); sk->sk_data_ready(sk); - *skb = NULL; - return TIPC_OK; + return true; + +reject: + tipc_sk_respond(sk, skb, err); + return false; } /** @@ -1698,22 +1706,10 @@ static int filter_rcv(struct sock *sk, struct sk_buff **skb) */ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb) { - int err; - atomic_t *dcnt; - u32 dnode = msg_prevnode(buf_msg(skb)); - struct tipc_sock *tsk = tipc_sk(sk); - struct net *net = sock_net(sk); - uint truesize = skb->truesize; + unsigned int truesize = skb->truesize; - err = filter_rcv(sk, &skb); - if (likely(!skb)) { - dcnt = &tsk->dupl_rcvcnt; - if (atomic_read(dcnt) < TIPC_CONN_OVERLOAD_LIMIT) - atomic_add(truesize, dcnt); - return 0; - } - if (!err || tipc_msg_reverse(tsk_own_node(tsk), &skb, -err)) - tipc_node_xmit_skb(net, skb, dnode, tsk->portid); + if (likely(filter_rcv(sk, skb))) + atomic_add(truesize, &tipc_sk(sk)->dupl_rcvcnt); return 0; } @@ -1723,45 +1719,43 @@ static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb) * @inputq: list of incoming buffers with potentially different destinations * @sk: socket where the buffers should be enqueued * @dport: port number for the socket - * @_skb: returned buffer to be forwarded or rejected, if applicable * * Caller must hold socket lock - * - * Returns TIPC_OK if all buffers enqueued, otherwise -TIPC_ERR_OVERLOAD - * or -TIPC_ERR_NO_PORT */ -static int tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, - u32 dport, struct sk_buff **_skb) +static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, + u32 dport) { unsigned int lim; atomic_t *dcnt; - int err; struct sk_buff *skb; unsigned long time_limit = jiffies + 2; while (skb_queue_len(inputq)) { if (unlikely(time_after_eq(jiffies, time_limit))) - return TIPC_OK; + return; + skb = tipc_skb_dequeue(inputq, dport); if (unlikely(!skb)) - return TIPC_OK; + return; + + /* Add message directly to receive queue if possible */ if (!sock_owned_by_user(sk)) { - err = filter_rcv(sk, &skb); - if (likely(!skb)) - continue; - *_skb = skb; - return err; + filter_rcv(sk, skb); + continue; } + + /* Try backlog, compensating for double-counted bytes */ dcnt = &tipc_sk(sk)->dupl_rcvcnt; if (sk->sk_backlog.len) atomic_set(dcnt, 0); lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt); if (likely(!sk_add_backlog(sk, skb, lim))) continue; - *_skb = skb; - return -TIPC_ERR_OVERLOAD; + + /* Overload => reject message back to sender */ + tipc_sk_respond(sk, skb, TIPC_ERR_OVERLOAD); + break; } - return TIPC_OK; } /** @@ -1769,51 +1763,46 @@ static int tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, * @inputq: buffer list containing the buffers * Consumes all buffers in list until inputq is empty * Note: may be called in multiple threads referring to the same queue - * Returns 0 if last buffer was accepted, otherwise -EHOSTUNREACH - * Only node local calls check the return value, sending single-buffer queues */ -int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) +void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq) { u32 dnode, dport = 0; int err; - struct sk_buff *skb; struct tipc_sock *tsk; - struct tipc_net *tn; struct sock *sk; + struct sk_buff *skb; while (skb_queue_len(inputq)) { - err = -TIPC_ERR_NO_PORT; - skb = NULL; dport = tipc_skb_peek_port(inputq, dport); tsk = tipc_sk_lookup(net, dport); + if (likely(tsk)) { sk = &tsk->sk; if (likely(spin_trylock_bh(&sk->sk_lock.slock))) { - err = tipc_sk_enqueue(inputq, sk, dport, &skb); + tipc_sk_enqueue(inputq, sk, dport); spin_unlock_bh(&sk->sk_lock.slock); - dport = 0; } sock_put(sk); - } else { - skb = tipc_skb_dequeue(inputq, dport); - } - if (likely(!skb)) continue; - if (tipc_msg_lookup_dest(net, skb, &dnode, &err)) - goto xmit; - if (!err) { - dnode = msg_destnode(buf_msg(skb)); - goto xmit; - } else { - dnode = msg_prevnode(buf_msg(skb)); } - tn = net_generic(net, tipc_net_id); - if (!tipc_msg_reverse(tn->own_addr, &skb, -err)) + + /* No destination socket => dequeue skb if still there */ + skb = tipc_skb_dequeue(inputq, dport); + if (!skb) + return; + + /* Try secondary lookup if unresolved named message */ + err = TIPC_ERR_NO_PORT; + if (tipc_msg_lookup_dest(net, skb, &err)) + goto xmit; + + /* Prepare for message rejection */ + if (!tipc_msg_reverse(tipc_own_addr(net), &skb, err)) continue; xmit: + dnode = msg_destnode(buf_msg(skb)); tipc_node_xmit_skb(net, skb, dnode, dport); } - return err ? -EHOSTUNREACH : 0; } static int tipc_wait_for_connect(struct socket *sock, long *timeo_p) @@ -2082,7 +2071,10 @@ static int tipc_shutdown(struct socket *sock, int how) struct net *net = sock_net(sk); struct tipc_sock *tsk = tipc_sk(sk); struct sk_buff *skb; - u32 dnode; + u32 dnode = tsk_peer_node(tsk); + u32 dport = tsk_peer_port(tsk); + u32 onode = tipc_own_addr(net); + u32 oport = tsk->portid; int res; if (how != SHUT_RDWR) @@ -2108,9 +2100,8 @@ restart: } else { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, - 0, dnode, tsk_own_node(tsk), - tsk_peer_port(tsk), - tsk->portid, TIPC_CONN_SHUTDOWN); + 0, dnode, onode, dport, oport, + TIPC_CONN_SHUTDOWN); tipc_node_xmit_skb(net, skb, dnode, tsk->portid); } tsk->connected = 0; diff --git a/net/tipc/socket.h b/net/tipc/socket.h index bf6551389522..4241f22069dc 100644 --- a/net/tipc/socket.h +++ b/net/tipc/socket.h @@ -44,7 +44,7 @@ SKB_TRUESIZE(TIPC_MAX_USER_MSG_SIZE)) int tipc_socket_init(void); void tipc_socket_stop(void); -int tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq); +void tipc_sk_rcv(struct net *net, struct sk_buff_head *inputq); void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq, struct sk_buff_head *inputq); void tipc_sk_reinit(struct net *net); -- cgit v1.2.3 From 48fb6b554501914b6befcf9b38292cbeb91de330 Mon Sep 17 00:00:00 2001 From: Wei-Chun Chao Date: Wed, 22 Jul 2015 18:13:12 -0700 Subject: ipv6: fix crash over flow-based vxlan device Similar check was added in ip_rcv but not in ipv6_rcv. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] ipv6_rcv+0xfa/0x500 Call Trace: [] ? ip_rcv+0x296/0x400 [] ? packet_rcv+0x52/0x410 [] __netif_receive_skb_core+0x63f/0x9a0 [] ? br_handle_frame_finish+0x580/0x580 [bridge] [] ? update_rq_clock.part.81+0x1c/0x40 [] __netif_receive_skb+0x18/0x60 [] process_backlog+0x9f/0x150 Fixes: ee122c79d422 (vxlan: Flow based tunneling) Signed-off-by: Wei-Chun Chao Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv6/ip6_input.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 57990c929cd8..adba03ac7ce9 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -45,6 +45,7 @@ #include #include #include +#include int ip6_rcv_finish(struct sock *sk, struct sk_buff *skb) { @@ -55,7 +56,7 @@ int ip6_rcv_finish(struct sock *sk, struct sk_buff *skb) if (ipprot && ipprot->early_demux) ipprot->early_demux(skb); } - if (!skb_dst(skb)) + if (!skb_valid_dst(skb)) ip6_route_input(skb); return dst_input(skb); @@ -98,7 +99,7 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt * arrived via the sending interface (ethX), because of the * nature of scoping architecture. --yoshfuji */ - IP6CB(skb)->iif = skb_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex; + IP6CB(skb)->iif = skb_valid_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex; if (unlikely(!pskb_may_pull(skb, sizeof(*hdr)))) goto err; -- cgit v1.2.3 From 88f643203668b95b884b60b45ff182289153843c Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 23 Jul 2015 10:39:35 +0300 Subject: ipv4: be more aggressive when probing alternative gateways Currently, we do not notice if new alternative gateways are added. We can do it by checking for present neigh entry. Also, gateways that are currently probed (NUD_INCOMPLETE) can be skipped from round-robin probing. Suggested-by: Florian Westphal Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 6754c64b2fe0..d4c6732cfbfa 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -438,13 +438,15 @@ static int fib_detect_death(struct fib_info *fi, int order, if (n) { state = n->nud_state; neigh_release(n); + } else { + return 0; } if (state == NUD_REACHABLE) return 0; if ((state & NUD_VALID) && order != dflt) return 0; if ((state & NUD_VALID) || - (*last_idx < 0 && order > dflt)) { + (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) { *last_resort = fi; *last_idx = order; } -- cgit v1.2.3 From 2661371ace963280f34fe583d4a6697afecd87d5 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Thu, 23 Jul 2015 11:29:07 +0200 Subject: openvswitch: fix compilation when vxlan is a module With CONFIG_VXLAN=m and CONFIG_OPENVSWITCH=y, there was the following compilation error: LD init/built-in.o net/built-in.o: In function `vxlan_tnl_create': .../net/openvswitch/vport-netdev.c:322: undefined reference to `vxlan_dev_create' make: *** [vmlinux] Error 1 CC: Thomas Graf Fixes: 614732eaa12d ("openvswitch: Use regular VXLAN net_device device") Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/openvswitch/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 1119f46b80b4..6ed1d2da9fcd 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -5,6 +5,7 @@ config OPENVSWITCH tristate "Open vSwitch" depends on INET + depends on VXLAN select LIBCRC32C select MPLS select NET_MPLS_GSO -- cgit v1.2.3 From 597798e438371128519fa94dacdc42311b4254a6 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Thu, 23 Jul 2015 13:04:44 +0200 Subject: openvswitch: Retrieve tunnel metadata when receiving from vport-netdev Retrieve the tunnel metadata for packets received by a net_device and provide it to ovs_vport_receive() for flow key extraction. [This hunk was in the GRE patch in the initial series and missed the cut for the initial submission for merging.] Fixes: 614732eaa12d ("openvswitch: Use regular VXLAN net_device device") Signed-off-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/vport-netdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 68d0582fc001..2254b742168f 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -58,7 +58,7 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) skb_push(skb, ETH_HLEN); ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); - ovs_vport_receive(vport, skb, NULL); + ovs_vport_receive(vport, skb, skb_tunnel_info(skb, AF_INET)); return; error: -- cgit v1.2.3 From 949f1e39a6177503349ef8adeedd13fa1c89e5dd Mon Sep 17 00:00:00 2001 From: Satish Ashok Date: Thu, 23 Jul 2015 05:00:53 -0700 Subject: bridge: mdb: notify on router port add and del Send notifications on router port add and del/expire, re-use the already existing MDBA_ROUTER and send NEWMDB/DELMDB netlink notifications respectively. Signed-off-by: Satish Ashok Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_mdb.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_multicast.c | 10 +++++-- net/bridge/br_private.h | 2 ++ 3 files changed, 77 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 6a591e6e2d04..5e9d1c5e1194 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -247,6 +247,73 @@ void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, __br_mdb_notify(dev, &entry, type); } +static int nlmsg_populate_rtr_fill(struct sk_buff *skb, + struct net_device *dev, + int ifindex, u32 pid, + u32 seq, int type, unsigned int flags) +{ + struct br_port_msg *bpm; + struct nlmsghdr *nlh; + struct nlattr *nest; + + nlh = nlmsg_put(skb, pid, seq, type, sizeof(*bpm), NLM_F_MULTI); + if (!nlh) + return -EMSGSIZE; + + bpm = nlmsg_data(nlh); + memset(bpm, 0, sizeof(*bpm)); + bpm->family = AF_BRIDGE; + bpm->ifindex = dev->ifindex; + nest = nla_nest_start(skb, MDBA_ROUTER); + if (!nest) + goto cancel; + + if (nla_put_u32(skb, MDBA_ROUTER_PORT, ifindex)) + goto end; + + nla_nest_end(skb, nest); + nlmsg_end(skb, nlh); + return 0; + +end: + nla_nest_end(skb, nest); +cancel: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +static inline size_t rtnl_rtr_nlmsg_size(void) +{ + return NLMSG_ALIGN(sizeof(struct br_port_msg)) + + nla_total_size(sizeof(__u32)); +} + +void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, + int type) +{ + struct net *net = dev_net(dev); + struct sk_buff *skb; + int err = -ENOBUFS; + int ifindex; + + ifindex = port ? port->dev->ifindex : 0; + skb = nlmsg_new(rtnl_rtr_nlmsg_size(), GFP_ATOMIC); + if (!skb) + goto errout; + + err = nlmsg_populate_rtr_fill(skb, dev, ifindex, 0, 0, type, NTF_SELF); + if (err < 0) { + kfree_skb(skb); + goto errout; + } + + rtnl_notify(skb, net, 0, RTNLGRP_MDB, NULL, GFP_ATOMIC); + return; + +errout: + rtnl_set_sk_err(net, RTNLGRP_MDB, err); +} + static bool is_valid_mdb_entry(struct br_mdb_entry *entry) { if (entry->ifindex == 0) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index ed5dc684a4ce..fd238587e032 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -766,6 +766,7 @@ static void br_multicast_router_expired(unsigned long data) goto out; hlist_del_init_rcu(&port->rlist); + br_rtr_notify(br->dev, port, RTM_DELMDB); out: spin_unlock(&br->multicast_lock); @@ -977,8 +978,10 @@ void br_multicast_disable_port(struct net_bridge_port *port) if (pg->state == MDB_TEMPORARY) br_multicast_del_pg(br, pg); - if (!hlist_unhashed(&port->rlist)) + if (!hlist_unhashed(&port->rlist)) { hlist_del_init_rcu(&port->rlist); + br_rtr_notify(br->dev, port, RTM_DELMDB); + } del_timer(&port->multicast_router_timer); del_timer(&port->ip4_own_query.timer); #if IS_ENABLED(CONFIG_IPV6) @@ -1216,6 +1219,7 @@ static void br_multicast_add_router(struct net_bridge *br, hlist_add_behind_rcu(&port->rlist, slot); else hlist_add_head_rcu(&port->rlist, &br->router_list); + br_rtr_notify(br->dev, port, RTM_NEWMDB); } static void br_multicast_mark_router(struct net_bridge *br, @@ -1848,8 +1852,10 @@ int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val) p->multicast_router = val; err = 0; - if (val < 2 && !hlist_unhashed(&p->rlist)) + if (val < 2 && !hlist_unhashed(&p->rlist)) { hlist_del_init_rcu(&p->rlist); + br_rtr_notify(br->dev, p, RTM_DELMDB); + } if (val == 1) break; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 3ad1290528af..e2cb359f9dd3 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -490,6 +490,8 @@ void br_mdb_init(void); void br_mdb_uninit(void); void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port, struct br_ip *group, int type, u8 state); +void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port, + int type); #define mlock_dereference(X, br) \ rcu_dereference_protected(X, lockdep_is_held(&br->multicast_lock)) -- cgit v1.2.3 From b469139e81ca8265fb4797c007f8d3338f4191a5 Mon Sep 17 00:00:00 2001 From: subashab@codeaurora.org Date: Fri, 24 Jul 2015 03:03:29 +0000 Subject: dev: Spelling fix in comments Fix the following typo - unchainged -> unchanged Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index cb52cba30ae8..4870c3556a5a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4995,7 +4995,7 @@ EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu); * Gets the next netdev_adjacent->private from the dev's lower neighbour * list, starting from iter position. The caller must hold either hold the * RTNL lock or its own locking that guarantees that the neighbour lower - * list will remain unchainged. + * list will remain unchanged. */ void *netdev_lower_get_next_private(struct net_device *dev, struct list_head **iter) @@ -5050,7 +5050,7 @@ EXPORT_SYMBOL(netdev_lower_get_next_private_rcu); * Gets the next netdev_adjacent from the dev's lower neighbour * list, starting from iter position. The caller must hold RTNL lock or * its own locking that guarantees that the neighbour lower - * list will remain unchainged. + * list will remain unchanged. */ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter) { -- cgit v1.2.3 From 6673a9f4e35c1f0e9976cd4e88042f87674a6b02 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 24 Jul 2015 10:59:41 +0200 Subject: ipv6: use lwtunnel_output6() only if flag redirect is set This function make sense only when LWTUNNEL_STATE_OUTPUT_REDIRECT is set. The check is already done in IPv4. CC: Thomas Graf CC: Roopa Prabhu Fixes: 74a0f2fe8ed5 ("ipv6: rt6_info output redirect to tunnel output") Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/ipv6/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7f2214f8fde7..f91d2637072b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1780,7 +1780,8 @@ int ip6_route_add(struct fib6_config *cfg) goto out; lwtunnel_state_get(lwtstate); rt->rt6i_lwtstate = lwtstate; - rt->dst.output = lwtunnel_output6; + if (lwtunnel_output_redirect(rt->rt6i_lwtstate)) + rt->dst.output = lwtunnel_output6; } ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); -- cgit v1.2.3 From d943659508a4fb883507fdd3f998329e70a8f922 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 24 Jul 2015 12:28:35 +0200 Subject: ipv6: copy lwtstate in ip6_rt_copy_init() We need to copy this field (ip6_rt_cache_alloc() and ip6_rt_pcpu_alloc() use ip6_rt_copy_init() to build a dst). CC: Thomas Graf CC: Roopa Prabhu Fixes: 19e42e451506 ("ipv6: support for fib route lwtunnel encap attributes") Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index f91d2637072b..fbe27fb6bd3f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2161,6 +2161,10 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) #endif rt->rt6i_prefsrc = ort->rt6i_prefsrc; rt->rt6i_table = ort->rt6i_table; + if (ort->rt6i_lwtstate) { + lwtunnel_state_get(ort->rt6i_lwtstate); + rt->rt6i_lwtstate = ort->rt6i_lwtstate; + } } #ifdef CONFIG_IPV6_ROUTE_INFO -- cgit v1.2.3 From 5a6228a0b472062646434cd2536d109c102b606e Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Fri, 24 Jul 2015 12:28:36 +0200 Subject: lwtunnel: change prototype of lwtunnel_state_get() It saves some lines and simplify a bit the code when the state is returning by this function. It's also useful to handle a NULL entry. To avoid too long lines, I've also renamed lwtunnel_state_get() and lwtunnel_state_put() to lwtstate_get() and lwtstate_put(). CC: Thomas Graf CC: Roopa Prabhu Signed-off-by: Nicolas Dichtel Acked-by: Thomas Graf Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 16 +++++++++++----- net/ipv4/fib_semantics.c | 9 ++++----- net/ipv4/route.c | 9 ++------- net/ipv6/ip6_fib.c | 2 +- net/ipv6/route.c | 8 ++------ 5 files changed, 20 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 918e03c1dafa..b02039081b04 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -35,12 +35,16 @@ extern const struct lwtunnel_encap_ops __rcu * lwtun_encaps[LWTUNNEL_ENCAP_MAX+1]; #ifdef CONFIG_LWTUNNEL -static inline void lwtunnel_state_get(struct lwtunnel_state *lws) +static inline struct lwtunnel_state * +lwtstate_get(struct lwtunnel_state *lws) { - atomic_inc(&lws->refcnt); + if (lws) + atomic_inc(&lws->refcnt); + + return lws; } -static inline void lwtunnel_state_put(struct lwtunnel_state *lws) +static inline void lwtstate_put(struct lwtunnel_state *lws) { if (!lws) return; @@ -74,11 +78,13 @@ int lwtunnel_output6(struct sock *sk, struct sk_buff *skb); #else -static inline void lwtunnel_state_get(struct lwtunnel_state *lws) +static inline struct lwtunnel_state * +lwtstate_get(struct lwtunnel_state *lws) { + return lws; } -static inline void lwtunnel_state_put(struct lwtunnel_state *lws) +static inline void lwtstate_put(struct lwtunnel_state *lws) { } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d4c6732cfbfa..65e00399a9a6 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -209,7 +209,7 @@ static void free_fib_info_rcu(struct rcu_head *head) change_nexthops(fi) { if (nexthop_nh->nh_dev) dev_put(nexthop_nh->nh_dev); - lwtunnel_state_put(nexthop_nh->nh_lwtstate); + lwtstate_put(nexthop_nh->nh_lwtstate); free_nh_exceptions(nexthop_nh); rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output); rt_fibinfo_free(&nexthop_nh->nh_rth_input); @@ -514,8 +514,8 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, nla, &lwtstate); if (ret) goto errout; - lwtunnel_state_get(lwtstate); - nexthop_nh->nh_lwtstate = lwtstate; + nexthop_nh->nh_lwtstate = + lwtstate_get(lwtstate); } } @@ -971,8 +971,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (err) goto failure; - lwtunnel_state_get(lwtstate); - nh->nh_lwtstate = lwtstate; + nh->nh_lwtstate = lwtstate_get(lwtstate); } nh->nh_oif = cfg->fc_oif; nh->nh_gw = cfg->fc_gw; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 519ec232818d..11096396ef4a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1358,7 +1358,7 @@ static void ipv4_dst_destroy(struct dst_entry *dst) list_del(&rt->rt_uncached); spin_unlock_bh(&ul->lock); } - lwtunnel_state_put(rt->rt_lwtstate); + lwtstate_put(rt->rt_lwtstate); } void rt_flush_dev(struct net_device *dev) @@ -1407,12 +1407,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif - if (nh->nh_lwtstate) { - lwtunnel_state_get(nh->nh_lwtstate); - rt->rt_lwtstate = nh->nh_lwtstate; - } else { - rt->rt_lwtstate = NULL; - } + rt->rt_lwtstate = lwtstate_get(nh->nh_lwtstate); if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr); else if (!(rt->dst.flags & DST_NOCACHE)) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index d715f2e0c4e7..5693b5eb8482 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -178,7 +178,7 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) { - lwtunnel_state_put(rt->rt6i_lwtstate); + lwtstate_put(rt->rt6i_lwtstate); rt6_free_pcpu(rt); dst_free(&rt->dst); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index fbe27fb6bd3f..c9b2b9fe83fc 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1778,8 +1778,7 @@ int ip6_route_add(struct fib6_config *cfg) cfg->fc_encap, &lwtstate); if (err) goto out; - lwtunnel_state_get(lwtstate); - rt->rt6i_lwtstate = lwtstate; + rt->rt6i_lwtstate = lwtstate_get(lwtstate); if (lwtunnel_output_redirect(rt->rt6i_lwtstate)) rt->dst.output = lwtunnel_output6; } @@ -2161,10 +2160,7 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) #endif rt->rt6i_prefsrc = ort->rt6i_prefsrc; rt->rt6i_table = ort->rt6i_table; - if (ort->rt6i_lwtstate) { - lwtunnel_state_get(ort->rt6i_lwtstate); - rt->rt6i_lwtstate = ort->rt6i_lwtstate; - } + rt->rt6i_lwtstate = lwtstate_get(ort->rt6i_lwtstate); } #ifdef CONFIG_IPV6_ROUTE_INFO -- cgit v1.2.3 From 990edb428c2c85c22ca770330437db7183cbe8b5 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 24 Jul 2015 09:57:42 -0700 Subject: ipv6: Re-arrange code in rt6_probe() It is a prep work for the next patch to remove write_lock from rt6_probe(). 1. Reduce the number of if(neigh) check. From 4 to 1. 2. Bring the write_(un)lock() closer to the operations that the lock is protecting. Hopefully, the above make rt6_probe() more readable. Signed-off-by: Martin KaFai Lau Cc: Hannes Frederic Sowa Cc: Julian Anastasov Cc: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/route.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c9b2b9fe83fc..0ef52623fc77 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -545,6 +545,7 @@ static void rt6_probe_deferred(struct work_struct *w) static void rt6_probe(struct rt6_info *rt) { + struct __rt6_probe_work *work; struct neighbour *neigh; /* * Okay, this does not seem to be appropriate @@ -559,34 +560,29 @@ static void rt6_probe(struct rt6_info *rt) rcu_read_lock_bh(); neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); if (neigh) { + work = NULL; write_lock(&neigh->lock); - if (neigh->nud_state & NUD_VALID) - goto out; - } - - if (!neigh || - time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) { - struct __rt6_probe_work *work; - - work = kmalloc(sizeof(*work), GFP_ATOMIC); - - if (neigh && work) - __neigh_set_probe_once(neigh); - - if (neigh) - write_unlock(&neigh->lock); - - if (work) { - INIT_WORK(&work->work, rt6_probe_deferred); - work->target = rt->rt6i_gateway; - dev_hold(rt->dst.dev); - work->dev = rt->dst.dev; - schedule_work(&work->work); + if (!(neigh->nud_state & NUD_VALID) && + time_after(jiffies, + neigh->updated + + rt->rt6i_idev->cnf.rtr_probe_interval)) { + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) + __neigh_set_probe_once(neigh); } - } else { -out: write_unlock(&neigh->lock); + } else { + work = kmalloc(sizeof(*work), GFP_ATOMIC); + } + + if (work) { + INIT_WORK(&work->work, rt6_probe_deferred); + work->target = rt->rt6i_gateway; + dev_hold(rt->dst.dev); + work->dev = rt->dst.dev; + schedule_work(&work->work); } + rcu_read_unlock_bh(); } #else -- cgit v1.2.3 From 8d6c31bf574177c8de48dd1387d96e1ec3a8b8bc Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 24 Jul 2015 09:57:43 -0700 Subject: ipv6: Avoid rt6_probe() taking writer lock in the fast path The patch checks neigh->nud_state before acquiring the writer lock. Note that rt6_probe() is only used in CONFIG_IPV6_ROUTER_PREF. 40 udpflood processes and a /64 gateway route are used. The gateway has NUD_PERMANENT. Each of them is run for 30s. At the end, the total number of finished sendto(): Before: 55M After: 95M Signed-off-by: Martin KaFai Lau Cc: Hannes Frederic Sowa CC: Julian Anastasov CC: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/route.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0ef52623fc77..54fccf0d705d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -560,6 +560,9 @@ static void rt6_probe(struct rt6_info *rt) rcu_read_lock_bh(); neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); if (neigh) { + if (neigh->nud_state & NUD_VALID) + goto out; + work = NULL; write_lock(&neigh->lock); if (!(neigh->nud_state & NUD_VALID) && @@ -583,6 +586,7 @@ static void rt6_probe(struct rt6_info *rt) schedule_work(&work->work); } +out: rcu_read_unlock_bh(); } #else -- cgit v1.2.3 From 99d7662a04c4f9822b042f22ae37f8c7355c2d5a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 26 Jul 2015 09:45:24 +0200 Subject: tcp: tso: allow deferring under reordering state While doing experiments with reordering resilience, we found linux senders were not able to send at full speed under reordering, because every incoming SACK was releasing one MSS. This patch removes the limitation, as we did for CWR state in commit a0ea700e409 ("tcp: tso: allow CA_CWR state in tcp_tso_should_defer()") Neal Cardwell had a concern about limited transmit so Yuchung conducted experiments on GFE and found nothing worth adding an extra check on fast path : if (icsk->icsk_ca_state == TCP_CA_Disorder && tcp_sk(sk)->reordering == sysctl_tcp_reordering) goto send_now; Signed-off-by: Eric Dumazet Signed-off-by: Yuchung Cheng Cc: Neal Cardwell Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 71057849593a..7d1efa762b75 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1774,7 +1774,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto send_now; - if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR))) + if (icsk->icsk_ca_state >= TCP_CA_Recovery) goto send_now; /* Avoid bursty behavior by allowing defer -- cgit v1.2.3 From e11f40b9352f75d924cb216abcc5413b38d9288b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 27 Jul 2015 11:07:47 +0300 Subject: lwtunnel: use kfree_skb() instead of vanilla kfree() kfree_skb() is correct here. Fixes: ffce41962ef6 ('lwtunnel: support dst output redirect function') Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/core/lwtunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index bb58826c708d..5f7fae70ef68 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -205,7 +205,7 @@ int __lwtunnel_output(struct sock *sk, struct sk_buff *skb, return ret; drop: - kfree(skb); + kfree_skb(skb); return ret; } -- cgit v1.2.3 From 73d0fcf2f487465f73329ee11ae900a774408fe5 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Tue, 28 Jul 2015 14:21:26 +0200 Subject: packet: remove handling of tx_ring from prb_shutdown_retire_blk_timer() Follow e8e85cc5eb57 ("packet: remove handling of tx_ring") and remove the tx_ring parameter from prb_shutdown_retire_blk_timer() as it is only called with tx_ring = 0. Signed-off-by: Tobias Klauser Signed-off-by: David S. Miller --- net/packet/af_packet.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c9e8741226c6..2af8590d9e0f 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -518,13 +518,11 @@ static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) } static void prb_shutdown_retire_blk_timer(struct packet_sock *po, - int tx_ring, struct sk_buff_head *rb_queue) { struct tpacket_kbdq_core *pkc; - pkc = tx_ring ? GET_PBDQC_FROM_RB(&po->tx_ring) : - GET_PBDQC_FROM_RB(&po->rx_ring); + pkc = GET_PBDQC_FROM_RB(&po->rx_ring); spin_lock_bh(&rb_queue->lock); pkc->delete_blk_timer = 1; @@ -4044,7 +4042,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, if (closing && (po->tp_version > TPACKET_V2)) { /* Because we don't support block-based V3 on tx-ring */ if (!tx_ring) - prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue); + prb_shutdown_retire_blk_timer(po, rb_queue); } release_sock(sk); -- cgit v1.2.3 From 5a4c355229da12558b5ded0775f4d0bc6650d28d Mon Sep 17 00:00:00 2001 From: Jon Maloy Date: Wed, 29 Jul 2015 18:28:01 -0400 Subject: tipc: fix bug in broadcast synch message create function In commit d999297c3dbbe7fdd832f7fa4ec84301e170b3e6 ("tipc: reduce locking scope during packet reception") we introduced a new function tipc_build_bcast_sync_msg(), which carries initial synchronization data between two nodes at first contact and at re-contact. In this function, we missed to add synchronization data, with the effect that the broadcast link endpoints will fail to synchronize correctly at re-contact between a running and a restarted node. All other cases work as intended. With this commit, we fix this bug. Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index b63d57390bb7..cc40aa6eb66c 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -294,11 +294,14 @@ static void tipc_link_build_bcast_sync_msg(struct tipc_link *l, { struct sk_buff *skb; struct sk_buff_head list; + u16 last_sent; skb = tipc_msg_create(BCAST_PROTOCOL, STATE_MSG, INT_H_SIZE, 0, l->addr, link_own_addr(l), 0, 0, 0); if (!skb) return; + last_sent = tipc_bclink_get_last_sent(l->owner->net); + msg_set_last_bcast(buf_msg(skb), last_sent); __skb_queue_head_init(&list); __skb_queue_tail(&list, skb); tipc_link_xmit(l, &list, xmitq); -- cgit v1.2.3 From 877d1f6291f8e391237e324be58479a3e3a7407c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 28 Jul 2015 16:02:05 -0700 Subject: net: Set sk_txhash from a random number This patch creates sk_set_txhash and eliminates protocol specific inet_set_txhash and ip6_set_txhash. sk_set_txhash simply sets a random number instead of performing flow dissection. sk_set_txash is also allowed to be called multiple times for the same socket, we'll need this when redoing the hash for negative routing advice. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/ip.h | 16 ---------------- include/net/ipv6.h | 19 ------------------- include/net/sock.h | 8 ++++++++ net/ipv4/datagram.c | 2 +- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv6/datagram.c | 2 +- net/ipv6/tcp_ipv6.c | 4 ++-- 7 files changed, 14 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index d5fe9f2ab699..bee5f3582e38 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -370,22 +370,6 @@ static inline void iph_to_flow_copy_v4addrs(struct flow_keys *flow, flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } -static inline void inet_set_txhash(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - struct flow_keys keys; - - memset(&keys, 0, sizeof(keys)); - - keys.addrs.v4addrs.src = inet->inet_saddr; - keys.addrs.v4addrs.dst = inet->inet_daddr; - keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; - keys.ports.src = inet->inet_sport; - keys.ports.dst = inet->inet_dport; - - sk->sk_txhash = flow_hash_from_keys(&keys); -} - static inline __wsum inet_gro_compute_pseudo(struct sk_buff *skb, int proto) { const struct iphdr *iph = skb_gro_network_header(skb); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 82dbdb092a5d..7c79798bcaab 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -707,25 +707,6 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, } #if IS_ENABLED(CONFIG_IPV6) -static inline void ip6_set_txhash(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct flow_keys keys; - - memset(&keys, 0, sizeof(keys)); - - memcpy(&keys.addrs.v6addrs.src, &np->saddr, - sizeof(keys.addrs.v6addrs.src)); - memcpy(&keys.addrs.v6addrs.dst, &sk->sk_v6_daddr, - sizeof(keys.addrs.v6addrs.dst)); - keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; - keys.ports.src = inet->inet_sport; - keys.ports.dst = inet->inet_dport; - - sk->sk_txhash = flow_hash_from_keys(&keys); -} - static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel) { diff --git a/include/net/sock.h b/include/net/sock.h index 4353ef70bf48..fe735c4841f6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1687,6 +1687,14 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) kuid_t sock_i_uid(struct sock *sk); unsigned long sock_i_ino(struct sock *sk); +static inline void sk_set_txhash(struct sock *sk) +{ + sk->sk_txhash = prandom_u32(); + + if (unlikely(!sk->sk_txhash)) + sk->sk_txhash = 1; +} + static inline struct dst_entry * __sk_dst_get(struct sock *sk) { diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 574fad9cca05..f915abff1350 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -74,7 +74,7 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; sk->sk_state = TCP_ESTABLISHED; - inet_set_txhash(sk); + sk_set_txhash(sk); inet->inet_id = jiffies; sk_dst_set(sk, &rt->dst); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 486ba96ae91a..d27eb549ced6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -222,7 +222,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (err) goto failure; - inet_set_txhash(sk); + sk_set_txhash(sk); rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, inet->inet_sport, inet->inet_dport, sk); @@ -1277,7 +1277,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->mc_ttl = ip_hdr(skb)->ttl; newinet->rcv_tos = ip_hdr(skb)->tos; inet_csk(newsk)->icsk_ext_hdr_len = 0; - inet_set_txhash(newsk); + sk_set_txhash(newsk); if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = newtp->write_seq ^ jiffies; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 2572a324b345..9aadd57808a5 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -199,7 +199,7 @@ ipv4_connected: NULL); sk->sk_state = TCP_ESTABLISHED; - ip6_set_txhash(sk); + sk_set_txhash(sk); out: fl6_sock_release(flowlabel); return err; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d540846a1a79..52dd0d9974d6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -276,7 +276,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (err) goto late_failure; - ip6_set_txhash(sk); + sk_set_txhash(sk); if (!tp->write_seq && likely(!tp->repair)) tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32, @@ -1090,7 +1090,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; newsk->sk_bound_dev_if = ireq->ir_iif; - ip6_set_txhash(newsk); + sk_set_txhash(newsk); /* Now IPv6 options... -- cgit v1.2.3 From 92a99bf3bae7c1267db87bb3e3babda2c6dcc8a7 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 29 Jul 2015 09:45:40 +0200 Subject: lwtunnel: Make lwtun_encaps[] static Any external user should use the registration API instead of accessing this directly. Cc: Roopa Prabhu Signed-off-by: Thomas Graf Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 3 --- net/core/lwtunnel.c | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index b02039081b04..33bd30963a95 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -31,9 +31,6 @@ struct lwtunnel_encap_ops { int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b); }; -extern const struct lwtunnel_encap_ops __rcu * - lwtun_encaps[LWTUNNEL_ENCAP_MAX+1]; - #ifdef CONFIG_LWTUNNEL static inline struct lwtunnel_state * lwtstate_get(struct lwtunnel_state *lws) diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 5f7fae70ef68..c240c895b319 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -37,7 +37,7 @@ struct lwtunnel_state *lwtunnel_state_alloc(int encap_len) } EXPORT_SYMBOL(lwtunnel_state_alloc); -const struct lwtunnel_encap_ops __rcu * +static const struct lwtunnel_encap_ops __rcu * lwtun_encaps[LWTUNNEL_ENCAP_MAX + 1] __read_mostly; int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *ops, -- cgit v1.2.3 From 879c7220e828af8bd82ea9d774c7e45c46b976e4 Mon Sep 17 00:00:00 2001 From: Bogdan Hamciuc Date: Wed, 29 Jul 2015 12:45:58 +0300 Subject: net: pktgen: Observe needed_headroom of the device Allocate enough space so as not to force the outgoing net device to do skb_realloc_headroom(). Signed-off-by: Bogdan Hamciuc Signed-off-by: David S. Miller --- net/core/pktgen.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 1ebdf1c0d118..5961da69cb03 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2279,7 +2279,7 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) static inline void set_pkt_overhead(struct pktgen_dev *pkt_dev) { - pkt_dev->pkt_overhead = 0; + pkt_dev->pkt_overhead = LL_RESERVED_SPACE(pkt_dev->odev); pkt_dev->pkt_overhead += pkt_dev->nr_labels*sizeof(u32); pkt_dev->pkt_overhead += VLAN_TAG_SIZE(pkt_dev); pkt_dev->pkt_overhead += SVLAN_TAG_SIZE(pkt_dev); @@ -2788,6 +2788,7 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev, } else { skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT); } + skb_reserve(skb, LL_RESERVED_SPACE(dev)); return skb; } -- cgit v1.2.3 From 1ce4b2f44df74bac91ff5ddcbf903572577005bc Mon Sep 17 00:00:00 2001 From: Bogdan Hamciuc Date: Wed, 29 Jul 2015 12:47:16 +0300 Subject: net: pktgen: Remove unused 'allocated_skbs' field Field pktgen_dev.allocated_skbs had been written to, but never read from. The number of allocated skbs can be deduced anyway, from the total number of sent packets and the 'clone_skb' param. Signed-off-by: Bogdan Hamciuc Signed-off-by: David S. Miller --- net/core/pktgen.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 5961da69cb03..0e0fb30cbc04 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -273,7 +273,6 @@ struct pktgen_dev { /* runtime counters relating to clone_skb */ - __u64 allocated_skbs; __u32 clone_count; int last_ok; /* Was last skb sent? * Or a failed transmit of some sort? @@ -3398,7 +3397,6 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) return; } pkt_dev->last_pkt_size = pkt_dev->skb->len; - pkt_dev->allocated_skbs++; pkt_dev->clone_count = 0; /* reset counter */ } -- cgit v1.2.3 From dcc38c033b32b81b88b798f0c0b8453839ac996b Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 29 Jul 2015 13:52:06 +0200 Subject: openvswitch: Re-add CONFIG_OPENVSWITCH_VXLAN This readds the config option CONFIG_OPENVSWITCH_VXLAN to avoid a hard dependency of OVS on VXLAN. It moves the VXLAN config compat code to vport-vxlan.c and allows compliation as a module. Fixes: 614732eaa12d ("openvswitch: Use regular VXLAN net_device device") Fixes: 2661371ace96 ("openvswitch: fix compilation when vxlan is a module") Cc: Pravin B Shelar Cc: Nicolas Dichtel Signed-off-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/Kconfig | 13 ++- net/openvswitch/Makefile | 1 + net/openvswitch/vport-netdev.c | 211 +++-------------------------------------- net/openvswitch/vport-netdev.h | 3 + net/openvswitch/vport-vxlan.c | 207 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 235 insertions(+), 200 deletions(-) create mode 100644 net/openvswitch/vport-vxlan.c (limited to 'net') diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 6ed1d2da9fcd..15840401a2ce 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -5,7 +5,6 @@ config OPENVSWITCH tristate "Open vSwitch" depends on INET - depends on VXLAN select LIBCRC32C select MPLS select NET_MPLS_GSO @@ -45,6 +44,18 @@ config OPENVSWITCH_GRE If unsure, say Y. +config OPENVSWITCH_VXLAN + tristate "Open vSwitch VXLAN tunneling support" + depends on OPENVSWITCH + depends on VXLAN + default OPENVSWITCH + ---help--- + If you say Y here, then the Open vSwitch will be able create vxlan vport. + + Say N to exclude this support and reduce the binary size. + + If unsure, say Y. + config OPENVSWITCH_GENEVE tristate "Open vSwitch Geneve tunneling support" depends on OPENVSWITCH diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 38e0e149c55e..6e1701de04d8 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -15,5 +15,6 @@ openvswitch-y := \ vport-internal_dev.o \ vport-netdev.o +obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 2254b742168f..cddb7069b11b 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -26,11 +26,10 @@ #include #include #include +#include -#include #include #include -#include #include "datapath.h" #include "vport.h" @@ -90,7 +89,7 @@ static struct net_device *get_dpdev(const struct datapath *dp) return local->dev; } -static struct vport *netdev_link(struct vport *vport, const char *name) +struct vport *ovs_netdev_link(struct vport *vport, const char *name) { int err; @@ -135,6 +134,7 @@ error_free_vport: ovs_vport_free(vport); return ERR_PTR(err); } +EXPORT_SYMBOL_GPL(ovs_netdev_link); static struct vport *netdev_create(const struct vport_parms *parms) { @@ -144,10 +144,10 @@ static struct vport *netdev_create(const struct vport_parms *parms) if (IS_ERR(vport)) return vport; - return netdev_link(vport, parms->name); + return ovs_netdev_link(vport, parms->name); } -static void free_port_rcu(struct rcu_head *rcu) +void ovs_vport_free_rcu(struct rcu_head *rcu) { struct vport *vport = container_of(rcu, struct vport, rcu); @@ -155,6 +155,7 @@ static void free_port_rcu(struct rcu_head *rcu) dev_put(vport->dev); ovs_vport_free(vport); } +EXPORT_SYMBOL_GPL(ovs_vport_free_rcu); void ovs_netdev_detach_dev(struct vport *vport) { @@ -165,6 +166,7 @@ void ovs_netdev_detach_dev(struct vport *vport) netdev_master_upper_dev_get(vport->dev)); dev_set_promiscuity(vport->dev, -1); } +EXPORT_SYMBOL_GPL(ovs_netdev_detach_dev); static void netdev_destroy(struct vport *vport) { @@ -173,7 +175,7 @@ static void netdev_destroy(struct vport *vport) ovs_netdev_detach_dev(vport); rtnl_unlock(); - call_rcu(&vport->rcu, free_port_rcu); + call_rcu(&vport->rcu, ovs_vport_free_rcu); } static unsigned int packet_length(const struct sk_buff *skb) @@ -186,7 +188,7 @@ static unsigned int packet_length(const struct sk_buff *skb) return length; } -static int netdev_send(struct vport *vport, struct sk_buff *skb) +int ovs_netdev_send(struct vport *vport, struct sk_buff *skb) { int mtu = vport->dev->mtu; int len; @@ -208,6 +210,7 @@ drop: kfree_skb(skb); return 0; } +EXPORT_SYMBOL_GPL(ovs_netdev_send); /* Returns null if this device is not attached to a datapath. */ struct vport *ovs_netdev_get_vport(struct net_device *dev) @@ -223,205 +226,15 @@ static struct vport_ops ovs_netdev_vport_ops = { .type = OVS_VPORT_TYPE_NETDEV, .create = netdev_create, .destroy = netdev_destroy, - .send = netdev_send, + .send = ovs_netdev_send, }; -/* Compat code for old userspace. */ -#if IS_ENABLED(CONFIG_VXLAN) -static struct vport_ops ovs_vxlan_netdev_vport_ops; - -static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) -{ - struct vxlan_dev *vxlan = netdev_priv(vport->dev); - __be16 dst_port = vxlan->cfg.dst_port; - - if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) - return -EMSGSIZE; - - if (vxlan->flags & VXLAN_F_GBP) { - struct nlattr *exts; - - exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION); - if (!exts) - return -EMSGSIZE; - - if (vxlan->flags & VXLAN_F_GBP && - nla_put_flag(skb, OVS_VXLAN_EXT_GBP)) - return -EMSGSIZE; - - nla_nest_end(skb, exts); - } - - return 0; -} - -static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX + 1] = { - [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, }, -}; - -static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr, - struct vxlan_config *conf) -{ - struct nlattr *exts[OVS_VXLAN_EXT_MAX + 1]; - int err; - - if (nla_len(attr) < sizeof(struct nlattr)) - return -EINVAL; - - err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy); - if (err < 0) - return err; - - if (exts[OVS_VXLAN_EXT_GBP]) - conf->flags |= VXLAN_F_GBP; - - return 0; -} - -static struct vport *vxlan_tnl_create(const struct vport_parms *parms) -{ - struct net *net = ovs_dp_get_net(parms->dp); - struct nlattr *options = parms->options; - struct net_device *dev; - struct vport *vport; - struct nlattr *a; - int err; - struct vxlan_config conf = { - .no_share = true, - .flags = VXLAN_F_FLOW_BASED | VXLAN_F_COLLECT_METADATA, - }; - - if (!options) { - err = -EINVAL; - goto error; - } - - a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); - if (a && nla_len(a) == sizeof(u16)) { - conf.dst_port = htons(nla_get_u16(a)); - } else { - /* Require destination port from userspace. */ - err = -EINVAL; - goto error; - } - - vport = ovs_vport_alloc(0, &ovs_vxlan_netdev_vport_ops, parms); - if (IS_ERR(vport)) - return vport; - - a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION); - if (a) { - err = vxlan_configure_exts(vport, a, &conf); - if (err) { - ovs_vport_free(vport); - goto error; - } - } - - rtnl_lock(); - dev = vxlan_dev_create(net, parms->name, NET_NAME_USER, &conf); - if (IS_ERR(dev)) { - rtnl_unlock(); - ovs_vport_free(vport); - return ERR_CAST(dev); - } - - dev_change_flags(dev, dev->flags | IFF_UP); - rtnl_unlock(); - return vport; -error: - return ERR_PTR(err); -} - -static struct vport *vxlan_create(const struct vport_parms *parms) -{ - struct vport *vport; - - vport = vxlan_tnl_create(parms); - if (IS_ERR(vport)) - return vport; - - return netdev_link(vport, parms->name); -} - -static void vxlan_destroy(struct vport *vport) -{ - rtnl_lock(); - if (vport->dev->priv_flags & IFF_OVS_DATAPATH) - ovs_netdev_detach_dev(vport); - - /* Early release so we can unregister the device */ - dev_put(vport->dev); - rtnl_delete_link(vport->dev); - vport->dev = NULL; - rtnl_unlock(); - - call_rcu(&vport->rcu, free_port_rcu); -} - -static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ip_tunnel_info *egress_tun_info) -{ - struct vxlan_dev *vxlan = netdev_priv(vport->dev); - struct net *net = ovs_dp_get_net(vport->dp); - __be16 dst_port = vxlan_dev_dst_port(vxlan); - __be16 src_port; - int port_min; - int port_max; - - inet_get_local_port_range(net, &port_min, &port_max); - src_port = udp_flow_src_port(net, skb, 0, 0, true); - - return ovs_tunnel_get_egress_info(egress_tun_info, net, - OVS_CB(skb)->egress_tun_info, - IPPROTO_UDP, skb->mark, - src_port, dst_port); -} - -static struct vport_ops ovs_vxlan_netdev_vport_ops = { - .type = OVS_VPORT_TYPE_VXLAN, - .create = vxlan_create, - .destroy = vxlan_destroy, - .get_options = vxlan_get_options, - .send = netdev_send, - .get_egress_tun_info = vxlan_get_egress_tun_info, -}; - -static int vxlan_compat_init(void) -{ - return ovs_vport_ops_register(&ovs_vxlan_netdev_vport_ops); -} - -static void vxlan_compat_exit(void) -{ - ovs_vport_ops_unregister(&ovs_vxlan_netdev_vport_ops); -} -#else -static int vxlan_compat_init(void) -{ - return 0; -} - -static void vxlan_compat_exit(void) -{ -} -#endif - int __init ovs_netdev_init(void) { - int err; - - err = ovs_vport_ops_register(&ovs_netdev_vport_ops); - if (err) - return err; - err = vxlan_compat_init(); - if (err) - vxlan_compat_exit(); - return err; + return ovs_vport_ops_register(&ovs_netdev_vport_ops); } void ovs_netdev_exit(void) { ovs_vport_ops_unregister(&ovs_netdev_vport_ops); - vxlan_compat_exit(); } diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h index 684fb88723a4..804412697a90 100644 --- a/net/openvswitch/vport-netdev.h +++ b/net/openvswitch/vport-netdev.h @@ -26,7 +26,10 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev); +struct vport *ovs_netdev_link(struct vport *vport, const char *name); +int ovs_netdev_send(struct vport *vport, struct sk_buff *skb); void ovs_netdev_detach_dev(struct vport *); +void ovs_vport_free_rcu(struct rcu_head *); int __init ovs_netdev_init(void); void ovs_netdev_exit(void); diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c new file mode 100644 index 000000000000..547173336cd3 --- /dev/null +++ b/net/openvswitch/vport-vxlan.c @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2014 Nicira, Inc. + * Copyright (c) 2013 Cisco Systems, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "vport.h" +#include "vport-netdev.h" + +static struct vport_ops ovs_vxlan_netdev_vport_ops; + +static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) +{ + struct vxlan_dev *vxlan = netdev_priv(vport->dev); + __be16 dst_port = vxlan->cfg.dst_port; + + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) + return -EMSGSIZE; + + if (vxlan->flags & VXLAN_F_GBP) { + struct nlattr *exts; + + exts = nla_nest_start(skb, OVS_TUNNEL_ATTR_EXTENSION); + if (!exts) + return -EMSGSIZE; + + if (vxlan->flags & VXLAN_F_GBP && + nla_put_flag(skb, OVS_VXLAN_EXT_GBP)) + return -EMSGSIZE; + + nla_nest_end(skb, exts); + } + + return 0; +} + +static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX + 1] = { + [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, }, +}; + +static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr, + struct vxlan_config *conf) +{ + struct nlattr *exts[OVS_VXLAN_EXT_MAX + 1]; + int err; + + if (nla_len(attr) < sizeof(struct nlattr)) + return -EINVAL; + + err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy); + if (err < 0) + return err; + + if (exts[OVS_VXLAN_EXT_GBP]) + conf->flags |= VXLAN_F_GBP; + + return 0; +} + +static struct vport *vxlan_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct net_device *dev; + struct vport *vport; + struct nlattr *a; + int err; + struct vxlan_config conf = { + .no_share = true, + .flags = VXLAN_F_FLOW_BASED | VXLAN_F_COLLECT_METADATA, + }; + + if (!options) { + err = -EINVAL; + goto error; + } + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + conf.dst_port = htons(nla_get_u16(a)); + } else { + /* Require destination port from userspace. */ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(0, &ovs_vxlan_netdev_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION); + if (a) { + err = vxlan_configure_exts(vport, a, &conf); + if (err) { + ovs_vport_free(vport); + goto error; + } + } + + rtnl_lock(); + dev = vxlan_dev_create(net, parms->name, NET_NAME_USER, &conf); + if (IS_ERR(dev)) { + rtnl_unlock(); + ovs_vport_free(vport); + return ERR_CAST(dev); + } + + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); + return vport; +error: + return ERR_PTR(err); +} + +static struct vport *vxlan_create(const struct vport_parms *parms) +{ + struct vport *vport; + + vport = vxlan_tnl_create(parms); + if (IS_ERR(vport)) + return vport; + + return ovs_netdev_link(vport, parms->name); +} + +static void vxlan_destroy(struct vport *vport) +{ + rtnl_lock(); + if (vport->dev->priv_flags & IFF_OVS_DATAPATH) + ovs_netdev_detach_dev(vport); + + /* Early release so we can unregister the device */ + dev_put(vport->dev); + rtnl_delete_link(vport->dev); + vport->dev = NULL; + rtnl_unlock(); + + call_rcu(&vport->rcu, ovs_vport_free_rcu); +} + +static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ip_tunnel_info *egress_tun_info) +{ + struct vxlan_dev *vxlan = netdev_priv(vport->dev); + struct net *net = ovs_dp_get_net(vport->dp); + __be16 dst_port = vxlan_dev_dst_port(vxlan); + __be16 src_port; + int port_min; + int port_max; + + inet_get_local_port_range(net, &port_min, &port_max); + src_port = udp_flow_src_port(net, skb, 0, 0, true); + + return ovs_tunnel_get_egress_info(egress_tun_info, net, + OVS_CB(skb)->egress_tun_info, + IPPROTO_UDP, skb->mark, + src_port, dst_port); +} + +static struct vport_ops ovs_vxlan_netdev_vport_ops = { + .type = OVS_VPORT_TYPE_VXLAN, + .create = vxlan_create, + .destroy = vxlan_destroy, + .get_options = vxlan_get_options, + .send = ovs_netdev_send, + .get_egress_tun_info = vxlan_get_egress_tun_info, +}; + +static int __init ovs_vxlan_tnl_init(void) +{ + return ovs_vport_ops_register(&ovs_vxlan_netdev_vport_ops); +} + +static void __exit ovs_vxlan_tnl_exit(void) +{ + ovs_vport_ops_unregister(&ovs_vxlan_netdev_vport_ops); +} + +module_init(ovs_vxlan_tnl_init); +module_exit(ovs_vxlan_tnl_exit); + +MODULE_DESCRIPTION("OVS: VXLAN switching port"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("vport-type-4"); -- cgit v1.2.3 From d7ee3519042798be6224e97f259ed47a63da4620 Mon Sep 17 00:00:00 2001 From: Michal Kubeček Date: Fri, 17 Jul 2015 16:17:56 +0200 Subject: netfilter: nf_ct_sctp: minimal multihoming support Currently nf_conntrack_proto_sctp module handles only packets between primary addresses used to establish the connection. Any packets between secondary addresses are classified as invalid so that usual firewall configurations drop them. Allowing HEARTBEAT and HEARTBEAT-ACK chunks to establish a new conntrack would allow traffic between secondary addresses to pass through. A more sophisticated solution based on the addresses advertised in the initial handshake (and possibly also later dynamic address addition and removal) would be much harder to implement. Moreover, in general we cannot assume to always see the initial handshake as it can be routed through a different path. The patch adds two new conntrack states: SCTP_CONNTRACK_HEARTBEAT_SENT - a HEARTBEAT chunk seen but not acked SCTP_CONNTRACK_HEARTBEAT_ACKED - a HEARTBEAT acked by HEARTBEAT-ACK State transition rules: - HEARTBEAT_SENT responds to usual chunks the same way as NONE (so that the behaviour changes as little as possible) - HEARTBEAT_ACKED responds to usual chunks the same way as ESTABLISHED does, except the resulting state is HEARTBEAT_ACKED rather than ESTABLISHED - previously existing states except NONE are preserved when HEARTBEAT or HEARTBEAT-ACK is seen - NONE (in the initial direction) changes to HEARTBEAT_SENT on HEARTBEAT and to CLOSED on HEARTBEAT-ACK - HEARTBEAT_SENT changes to HEARTBEAT_ACKED on HEARTBEAT-ACK in the reply direction - HEARTBEAT_SENT and HEARTBEAT_ACKED are preserved on HEARTBEAT and HEARTBEAT-ACK otherwise Normally, vtag is set from the INIT chunk for the reply direction and from the INIT-ACK chunk for the originating direction (i.e. each of these defines vtag value for the opposite direction). For secondary conntracks, we can't rely on seeing INIT/INIT-ACK and even if we have seen them, we would need to connect two different conntracks. Therefore simplified logic is applied: vtag of first packet in each direction (HEARTBEAT in the originating and HEARTBEAT-ACK in reply direction) is saved and all following packets in that direction are compared with this saved value. While INIT and INIT-ACK define vtag for the opposite direction, vtags extracted from HEARTBEAT and HEARTBEAT-ACK are always for their direction. Default timeout values for new states are HEARTBEAT_SENT: 30 seconds (default hb_interval) HEARTBEAT_ACKED: 210 seconds (hb_interval * path_max_retry + max_rto) (We cannot expect to see the shutdown sequence so that, unlike ESTABLISHED, the HEARTBEAT_ACKED timeout shouldn't be too long.) Signed-off-by: Michal Kubecek Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_conntrack_sctp.h | 2 + include/uapi/linux/netfilter/nfnetlink_cttimeout.h | 2 + net/netfilter/nf_conntrack_proto_sctp.c | 101 ++++++++++++++++----- 3 files changed, 81 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_conntrack_sctp.h b/include/uapi/linux/netfilter/nf_conntrack_sctp.h index ceeefe6681b5..ed4e776e1242 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_sctp.h +++ b/include/uapi/linux/netfilter/nf_conntrack_sctp.h @@ -13,6 +13,8 @@ enum sctp_conntrack { SCTP_CONNTRACK_SHUTDOWN_SENT, SCTP_CONNTRACK_SHUTDOWN_RECD, SCTP_CONNTRACK_SHUTDOWN_ACK_SENT, + SCTP_CONNTRACK_HEARTBEAT_SENT, + SCTP_CONNTRACK_HEARTBEAT_ACKED, SCTP_CONNTRACK_MAX }; diff --git a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h index 1ab0b97b3a1e..f2c10dc140d6 100644 --- a/include/uapi/linux/netfilter/nfnetlink_cttimeout.h +++ b/include/uapi/linux/netfilter/nfnetlink_cttimeout.h @@ -92,6 +92,8 @@ enum ctattr_timeout_sctp { CTA_TIMEOUT_SCTP_SHUTDOWN_SENT, CTA_TIMEOUT_SCTP_SHUTDOWN_RECD, CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, + CTA_TIMEOUT_SCTP_HEARTBEAT_SENT, + CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED, __CTA_TIMEOUT_SCTP_MAX }; #define CTA_TIMEOUT_SCTP_MAX (__CTA_TIMEOUT_SCTP_MAX - 1) diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index b45da90fad32..67197731eb68 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -42,6 +42,8 @@ static const char *const sctp_conntrack_names[] = { "SHUTDOWN_SENT", "SHUTDOWN_RECD", "SHUTDOWN_ACK_SENT", + "HEARTBEAT_SENT", + "HEARTBEAT_ACKED", }; #define SECS * HZ @@ -57,6 +59,8 @@ static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = { [SCTP_CONNTRACK_SHUTDOWN_SENT] = 300 SECS / 1000, [SCTP_CONNTRACK_SHUTDOWN_RECD] = 300 SECS / 1000, [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS, + [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS, + [SCTP_CONNTRACK_HEARTBEAT_ACKED] = 210 SECS, }; #define sNO SCTP_CONNTRACK_NONE @@ -67,6 +71,8 @@ static unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] __read_mostly = { #define sSS SCTP_CONNTRACK_SHUTDOWN_SENT #define sSR SCTP_CONNTRACK_SHUTDOWN_RECD #define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT +#define sHS SCTP_CONNTRACK_HEARTBEAT_SENT +#define sHA SCTP_CONNTRACK_HEARTBEAT_ACKED #define sIV SCTP_CONNTRACK_MAX /* @@ -88,6 +94,10 @@ SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite to that of the SHUTDOWN chunk. CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of the SHUTDOWN chunk. Connection is closed. +HEARTBEAT_SENT - We have seen a HEARTBEAT in a new flow. +HEARTBEAT_ACKED - We have seen a HEARTBEAT-ACK in the direction opposite to + that of the HEARTBEAT chunk. Secondary connection is + established. */ /* TODO @@ -97,36 +107,40 @@ CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of - Check the error type in the reply dir before transitioning from cookie echoed to closed. - Sec 5.2.4 of RFC 2960 - - Multi Homing support. + - Full Multi Homing support. */ /* SCTP conntrack state transitions */ -static const u8 sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { +static const u8 sctp_conntracks[2][11][SCTP_CONNTRACK_MAX] = { { /* ORIGINAL */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ -/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA}, -/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA}, -/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA}, -/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, -/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't have Stale cookie*/ -/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ -/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in orig dir */ -/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ +/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA, sCW, sHA}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA, sCL, sSS}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA, sSA, sHA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sCL, sHA},/* Can't come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL, sCL, sHA}, +/* heartbeat */ {sHS, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, +/* heartbeat_ack*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA} }, { /* REPLY */ -/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ -/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */ -/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA}, -/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, -/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA}, -/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA}, -/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, -/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Can't come in reply dir */ -/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA}, -/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL} +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV, sCL}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA, sIV, sSR}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA, sIV, sHA}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA, sIV, sHA}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV, sHA},/* Can't come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA, sIV, sHA}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL, sIV, sHA}, +/* heartbeat */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHS, sHA}, +/* heartbeat_ack*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA, sHA, sHA} } }; @@ -278,9 +292,16 @@ static int sctp_new_state(enum ip_conntrack_dir dir, pr_debug("SCTP_CID_SHUTDOWN_COMPLETE\n"); i = 8; break; + case SCTP_CID_HEARTBEAT: + pr_debug("SCTP_CID_HEARTBEAT"); + i = 9; + break; + case SCTP_CID_HEARTBEAT_ACK: + pr_debug("SCTP_CID_HEARTBEAT_ACK"); + i = 10; + break; default: - /* Other chunks like DATA, SACK, HEARTBEAT and - its ACK do not cause a change in state */ + /* Other chunks like DATA or SACK do not change the state */ pr_debug("Unknown chunk type, Will stay in %s\n", sctp_conntrack_names[cur_state]); return cur_state; @@ -329,6 +350,8 @@ static int sctp_packet(struct nf_conn *ct, !test_bit(SCTP_CID_COOKIE_ECHO, map) && !test_bit(SCTP_CID_ABORT, map) && !test_bit(SCTP_CID_SHUTDOWN_ACK, map) && + !test_bit(SCTP_CID_HEARTBEAT, map) && + !test_bit(SCTP_CID_HEARTBEAT_ACK, map) && sh->vtag != ct->proto.sctp.vtag[dir]) { pr_debug("Verification tag check failed\n"); goto out; @@ -357,6 +380,16 @@ static int sctp_packet(struct nf_conn *ct, /* Sec 8.5.1 (D) */ if (sh->vtag != ct->proto.sctp.vtag[dir]) goto out_unlock; + } else if (sch->type == SCTP_CID_HEARTBEAT || + sch->type == SCTP_CID_HEARTBEAT_ACK) { + if (ct->proto.sctp.vtag[dir] == 0) { + pr_debug("Setting vtag %x for dir %d\n", + sh->vtag, dir); + ct->proto.sctp.vtag[dir] = sh->vtag; + } else if (sh->vtag != ct->proto.sctp.vtag[dir]) { + pr_debug("Verification tag check failed\n"); + goto out_unlock; + } } old_state = ct->proto.sctp.state; @@ -466,6 +499,10 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb, /* Sec 8.5.1 (A) */ return false; } + } else if (sch->type == SCTP_CID_HEARTBEAT) { + pr_debug("Setting vtag %x for secondary conntrack\n", + sh->vtag); + ct->proto.sctp.vtag[IP_CT_DIR_ORIGINAL] = sh->vtag; } /* If it is a shutdown ack OOTB packet, we expect a return shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ @@ -610,6 +647,8 @@ sctp_timeout_nla_policy[CTA_TIMEOUT_SCTP_MAX+1] = { [CTA_TIMEOUT_SCTP_SHUTDOWN_SENT] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_RECD] = { .type = NLA_U32 }, [CTA_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_HEARTBEAT_SENT] = { .type = NLA_U32 }, + [CTA_TIMEOUT_SCTP_HEARTBEAT_ACKED] = { .type = NLA_U32 }, }; #endif /* CONFIG_NF_CT_NETLINK_TIMEOUT */ @@ -658,6 +697,18 @@ static struct ctl_table sctp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "nf_conntrack_sctp_timeout_heartbeat_sent", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_sctp_timeout_heartbeat_acked", + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, { } }; @@ -730,6 +781,8 @@ static int sctp_kmemdup_sysctl_table(struct nf_proto_net *pn, pn->ctl_table[4].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_SENT]; pn->ctl_table[5].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_RECD]; pn->ctl_table[6].data = &sn->timeouts[SCTP_CONNTRACK_SHUTDOWN_ACK_SENT]; + pn->ctl_table[7].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_SENT]; + pn->ctl_table[8].data = &sn->timeouts[SCTP_CONNTRACK_HEARTBEAT_ACKED]; #endif return 0; } -- cgit v1.2.3 From 244bc377591c3882f454882357bc730c90cbedb5 Mon Sep 17 00:00:00 2001 From: Arron Wang Date: Fri, 24 Jul 2015 17:12:55 +0800 Subject: Bluetooth: Add BT_HS config option Move A2MP Module under BT_HS config option and allow the user have flexible option to choose the feature only they need a2mp_discover_amp() & a2mp_channel_create() are a2mp module entry point for master and slave, and this is dynamic invoked depends on the userspace or remote request, then we defined their implementation depends on BT_HS config Signed-off-by: Arron Wang Signed-off-by: Marcel Holtmann --- net/bluetooth/Kconfig | 5 +++++ net/bluetooth/Makefile | 3 ++- net/bluetooth/a2mp.h | 19 +++++++++++++++++++ net/bluetooth/amp.h | 14 ++++++++++++++ 4 files changed, 40 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index b8c794b87523..95d1a66ba03a 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -53,6 +53,11 @@ source "net/bluetooth/cmtp/Kconfig" source "net/bluetooth/hidp/Kconfig" +config BT_HS + bool "Bluetooth High Speed (HS) features" + depends on BT_BREDR + default y + config BT_LE bool "Bluetooth Low Energy (LE) features" depends on BT diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index 29c12ae72a66..2b15ae8c1def 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -13,9 +13,10 @@ bluetooth_6lowpan-y := 6lowpan.o bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o lib.o \ - a2mp.o amp.o ecc.o hci_request.o mgmt_util.o + ecc.o hci_request.o mgmt_util.o bluetooth-$(CONFIG_BT_BREDR) += sco.o +bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o diff --git a/net/bluetooth/a2mp.h b/net/bluetooth/a2mp.h index 296f665adb09..a4ff3ea9b38a 100644 --- a/net/bluetooth/a2mp.h +++ b/net/bluetooth/a2mp.h @@ -130,10 +130,29 @@ struct a2mp_physlink_rsp { #define A2MP_STATUS_SECURITY_VIOLATION 0x06 struct amp_mgr *amp_mgr_get(struct amp_mgr *mgr); + +#if IS_ENABLED(CONFIG_BT_HS) int amp_mgr_put(struct amp_mgr *mgr); struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn, struct sk_buff *skb); void a2mp_discover_amp(struct l2cap_chan *chan); +#else +static inline int amp_mgr_put(struct amp_mgr *mgr) +{ + return 0; +} + +static inline struct l2cap_chan *a2mp_channel_create(struct l2cap_conn *conn, + struct sk_buff *skb) +{ + return NULL; +} + +static inline void a2mp_discover_amp(struct l2cap_chan *chan) +{ +} +#endif + void a2mp_send_getinfo_rsp(struct hci_dev *hdev); void a2mp_send_getampassoc_rsp(struct hci_dev *hdev, u8 status); void a2mp_send_create_phy_link_req(struct hci_dev *hdev, u8 status); diff --git a/net/bluetooth/amp.h b/net/bluetooth/amp.h index 7ea3db77ba89..8848f8158ae4 100644 --- a/net/bluetooth/amp.h +++ b/net/bluetooth/amp.h @@ -44,6 +44,20 @@ void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr, struct hci_conn *hcon); void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr, struct hci_conn *hcon); + +#if IS_ENABLED(CONFIG_BT_HS) +void amp_create_logical_link(struct l2cap_chan *chan); +void amp_disconnect_logical_link(struct hci_chan *hchan); +#else +static inline void amp_create_logical_link(struct l2cap_chan *chan) +{ +} + +static inline void amp_disconnect_logical_link(struct hci_chan *hchan) +{ +} +#endif + void amp_write_remote_assoc(struct hci_dev *hdev, u8 handle); void amp_write_rem_assoc_continue(struct hci_dev *hdev, u8 handle); void amp_physical_cfm(struct hci_conn *bredr_hcon, struct hci_conn *hs_hcon); -- cgit v1.2.3 From a77a6a14e54a6853141d99512bfed4acb5c2657a Mon Sep 17 00:00:00 2001 From: Arron Wang Date: Fri, 24 Jul 2015 17:13:15 +0800 Subject: Bluetooth: Move high speed specific event under BT_HS option Signed-off-by: Arron Wang Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 44 ++++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 32363c2b7f83..4f0a97b94837 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4399,6 +4399,23 @@ unlock: hci_dev_unlock(hdev); } +#if IS_ENABLED(CONFIG_BT_HS) +static void hci_chan_selected_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_channel_selected *ev = (void *)skb->data; + struct hci_conn *hcon; + + BT_DBG("%s handle 0x%2.2x", hdev->name, ev->phy_handle); + + skb_pull(skb, sizeof(*ev)); + + hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle); + if (!hcon) + return; + + amp_read_loc_assoc_final_data(hdev, hcon); +} + static void hci_phy_link_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { @@ -4522,6 +4539,7 @@ static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev, hci_dev_unlock(hdev); } +#endif static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) { @@ -5206,22 +5224,6 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb) } } -static void hci_chan_selected_evt(struct hci_dev *hdev, struct sk_buff *skb) -{ - struct hci_ev_channel_selected *ev = (void *) skb->data; - struct hci_conn *hcon; - - BT_DBG("%s handle 0x%2.2x", hdev->name, ev->phy_handle); - - skb_pull(skb, sizeof(*ev)); - - hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle); - if (!hcon) - return; - - amp_read_loc_assoc_final_data(hdev, hcon); -} - static bool hci_get_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 event, struct sk_buff *skb) { @@ -5442,14 +5444,15 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) hci_le_meta_evt(hdev, skb); break; - case HCI_EV_CHANNEL_SELECTED: - hci_chan_selected_evt(hdev, skb); - break; - case HCI_EV_REMOTE_OOB_DATA_REQUEST: hci_remote_oob_data_request_evt(hdev, skb); break; +#if IS_ENABLED(CONFIG_BT_HS) + case HCI_EV_CHANNEL_SELECTED: + hci_chan_selected_evt(hdev, skb); + break; + case HCI_EV_PHY_LINK_COMPLETE: hci_phy_link_complete_evt(hdev, skb); break; @@ -5465,6 +5468,7 @@ void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) case HCI_EV_DISCONN_PHY_LINK_COMPLETE: hci_disconn_phylink_complete_evt(hdev, skb); break; +#endif case HCI_EV_NUM_COMP_BLOCKS: hci_num_comp_blocks_evt(hdev, skb); -- cgit v1.2.3 From 839278823c27b71b46d677c8cf56b2d1f95610af Mon Sep 17 00:00:00 2001 From: Arron Wang Date: Fri, 24 Jul 2015 17:10:16 +0800 Subject: Bluetooth: Move get info completed callback to a2mp.c To avoid a2mp module hooks from hci_event.c and send getinfo response operation only required by a2mp module, we can move this callback to a2mp.c Signed-off-by: Arron Wang Signed-off-by: Marcel Holtmann --- net/bluetooth/a2mp.c | 17 ++++++++++++++++- net/bluetooth/hci_event.c | 5 +---- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/a2mp.c b/net/bluetooth/a2mp.c index 5a04eb1a7e57..5f123c3320a7 100644 --- a/net/bluetooth/a2mp.c +++ b/net/bluetooth/a2mp.c @@ -16,6 +16,7 @@ #include #include +#include "hci_request.h" #include "a2mp.h" #include "amp.h" @@ -286,11 +287,21 @@ static int a2mp_change_notify(struct amp_mgr *mgr, struct sk_buff *skb, return 0; } +static void read_local_amp_info_complete(struct hci_dev *hdev, u8 status, + u16 opcode) +{ + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + a2mp_send_getinfo_rsp(hdev); +} + static int a2mp_getinfo_req(struct amp_mgr *mgr, struct sk_buff *skb, struct a2mp_cmd *hdr) { struct a2mp_info_req *req = (void *) skb->data; struct hci_dev *hdev; + struct hci_request hreq; + int err = 0; if (le16_to_cpu(hdr->len) < sizeof(*req)) return -EINVAL; @@ -311,7 +322,11 @@ static int a2mp_getinfo_req(struct amp_mgr *mgr, struct sk_buff *skb, } set_bit(READ_LOC_AMP_INFO, &mgr->state); - hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_INFO, 0, NULL); + hci_req_init(&hreq, hdev); + hci_req_add(&hreq, HCI_OP_READ_LOCAL_AMP_INFO, 0, NULL); + err = hci_req_run(&hreq, read_local_amp_info_complete); + if (err < 0) + a2mp_send_getinfo_rsp(hdev); done: if (hdev) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 4f0a97b94837..a9a8e453817e 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -823,7 +823,7 @@ static void hci_cc_read_local_amp_info(struct hci_dev *hdev, BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); if (rp->status) - goto a2mp_rsp; + return; hdev->amp_status = rp->amp_status; hdev->amp_total_bw = __le32_to_cpu(rp->total_bw); @@ -835,9 +835,6 @@ static void hci_cc_read_local_amp_info(struct hci_dev *hdev, hdev->amp_assoc_size = __le16_to_cpu(rp->max_assoc_size); hdev->amp_be_flush_to = __le32_to_cpu(rp->be_flush_to); hdev->amp_max_flush_to = __le32_to_cpu(rp->max_flush_to); - -a2mp_rsp: - a2mp_send_getinfo_rsp(hdev); } static void hci_cc_read_local_amp_assoc(struct hci_dev *hdev, -- cgit v1.2.3 From b3d3914006a05cae448684058760359ef0cded49 Mon Sep 17 00:00:00 2001 From: Arron Wang Date: Fri, 24 Jul 2015 17:10:50 +0800 Subject: Bluetooth: Move amp assoc read/write completed callback to amp.c To avoid amp module hooks from hci_event.c Signed-off-by: Arron Wang Signed-off-by: Marcel Holtmann --- net/bluetooth/amp.c | 81 ++++++++++++++++++++++++++++++++++++++++++++--- net/bluetooth/hci_event.c | 59 ---------------------------------- 2 files changed, 77 insertions(+), 63 deletions(-) (limited to 'net') diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c index ee016f039100..45888fad8362 100644 --- a/net/bluetooth/amp.c +++ b/net/bluetooth/amp.c @@ -16,6 +16,7 @@ #include #include +#include "hci_request.h" #include "a2mp.h" #include "amp.h" @@ -220,10 +221,49 @@ int phylink_gen_key(struct hci_conn *conn, u8 *data, u8 *len, u8 *type) return hmac_sha256(gamp_key, HCI_AMP_LINK_KEY_SIZE, "802b", 4, data); } +static void read_local_amp_assoc_complete(struct hci_dev *hdev, u8 status, + u16 opcode, struct sk_buff *skb) +{ + struct hci_rp_read_local_amp_assoc *rp = (void *)skb->data; + struct amp_assoc *assoc = &hdev->loc_assoc; + size_t rem_len, frag_len; + + BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); + + if (rp->status) + goto send_rsp; + + frag_len = skb->len - sizeof(*rp); + rem_len = __le16_to_cpu(rp->rem_len); + + if (rem_len > frag_len) { + BT_DBG("frag_len %zu rem_len %zu", frag_len, rem_len); + + memcpy(assoc->data + assoc->offset, rp->frag, frag_len); + assoc->offset += frag_len; + + /* Read other fragments */ + amp_read_loc_assoc_frag(hdev, rp->phy_handle); + + return; + } + + memcpy(assoc->data + assoc->offset, rp->frag, rem_len); + assoc->len = assoc->offset + rem_len; + assoc->offset = 0; + +send_rsp: + /* Send A2MP Rsp when all fragments are received */ + a2mp_send_getampassoc_rsp(hdev, rp->status); + a2mp_send_create_phy_link_req(hdev, rp->status); +} + void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle) { struct hci_cp_read_local_amp_assoc cp; struct amp_assoc *loc_assoc = &hdev->loc_assoc; + struct hci_request req; + int err = 0; BT_DBG("%s handle %d", hdev->name, phy_handle); @@ -231,12 +271,18 @@ void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle) cp.max_len = cpu_to_le16(hdev->amp_assoc_size); cp.len_so_far = cpu_to_le16(loc_assoc->offset); - hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); + hci_req_init(&req, hdev); + hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); + err = hci_req_run_skb(&req, read_local_amp_assoc_complete); + if (err < 0) + a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID); } void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr) { struct hci_cp_read_local_amp_assoc cp; + struct hci_request req; + int err = 0; memset(&hdev->loc_assoc, 0, sizeof(struct amp_assoc)); memset(&cp, 0, sizeof(cp)); @@ -244,7 +290,11 @@ void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr) cp.max_len = cpu_to_le16(hdev->amp_assoc_size); set_bit(READ_LOC_AMP_ASSOC, &mgr->state); - hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); + hci_req_init(&req, hdev); + hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); + hci_req_run_skb(&req, read_local_amp_assoc_complete); + if (err < 0) + a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID); } void amp_read_loc_assoc_final_data(struct hci_dev *hdev, @@ -252,6 +302,8 @@ void amp_read_loc_assoc_final_data(struct hci_dev *hdev, { struct hci_cp_read_local_amp_assoc cp; struct amp_mgr *mgr = hcon->amp_mgr; + struct hci_request req; + int err = 0; cp.phy_handle = hcon->handle; cp.len_so_far = cpu_to_le16(0); @@ -260,7 +312,25 @@ void amp_read_loc_assoc_final_data(struct hci_dev *hdev, set_bit(READ_LOC_AMP_ASSOC_FINAL, &mgr->state); /* Read Local AMP Assoc final link information data */ - hci_send_cmd(hdev, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); + hci_req_init(&req, hdev); + hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); + hci_req_run_skb(&req, read_local_amp_assoc_complete); + if (err < 0) + a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID); +} + +static void write_remote_amp_assoc_complete(struct hci_dev *hdev, u8 status, + u16 opcode, struct sk_buff *skb) +{ + struct hci_rp_write_remote_amp_assoc *rp = (void *)skb->data; + + BT_DBG("%s status 0x%2.2x phy_handle 0x%2.2x", + hdev->name, rp->status, rp->phy_handle); + + if (rp->status) + return; + + amp_write_rem_assoc_continue(hdev, rp->phy_handle); } /* Write AMP Assoc data fragments, returns true with last fragment written*/ @@ -270,6 +340,7 @@ static bool amp_write_rem_assoc_frag(struct hci_dev *hdev, struct hci_cp_write_remote_amp_assoc *cp; struct amp_mgr *mgr = hcon->amp_mgr; struct amp_ctrl *ctrl; + struct hci_request req; u16 frag_len, len; ctrl = amp_ctrl_lookup(mgr, hcon->remote_id); @@ -307,7 +378,9 @@ static bool amp_write_rem_assoc_frag(struct hci_dev *hdev, amp_ctrl_put(ctrl); - hci_send_cmd(hdev, HCI_OP_WRITE_REMOTE_AMP_ASSOC, len, cp); + hci_req_init(&req, hdev); + hci_req_add(&req, HCI_OP_WRITE_REMOTE_AMP_ASSOC, sizeof(cp), &cp); + hci_req_run_skb(&req, write_remote_amp_assoc_complete); kfree(cp); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index a9a8e453817e..1ee01864569f 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -837,43 +837,6 @@ static void hci_cc_read_local_amp_info(struct hci_dev *hdev, hdev->amp_max_flush_to = __le32_to_cpu(rp->max_flush_to); } -static void hci_cc_read_local_amp_assoc(struct hci_dev *hdev, - struct sk_buff *skb) -{ - struct hci_rp_read_local_amp_assoc *rp = (void *) skb->data; - struct amp_assoc *assoc = &hdev->loc_assoc; - size_t rem_len, frag_len; - - BT_DBG("%s status 0x%2.2x", hdev->name, rp->status); - - if (rp->status) - goto a2mp_rsp; - - frag_len = skb->len - sizeof(*rp); - rem_len = __le16_to_cpu(rp->rem_len); - - if (rem_len > frag_len) { - BT_DBG("frag_len %zu rem_len %zu", frag_len, rem_len); - - memcpy(assoc->data + assoc->offset, rp->frag, frag_len); - assoc->offset += frag_len; - - /* Read other fragments */ - amp_read_loc_assoc_frag(hdev, rp->phy_handle); - - return; - } - - memcpy(assoc->data + assoc->offset, rp->frag, rem_len); - assoc->len = assoc->offset + rem_len; - assoc->offset = 0; - -a2mp_rsp: - /* Send A2MP Rsp when all fragments are received */ - a2mp_send_getampassoc_rsp(hdev, rp->status); - a2mp_send_create_phy_link_req(hdev, rp->status); -} - static void hci_cc_read_inq_rsp_tx_power(struct hci_dev *hdev, struct sk_buff *skb) { @@ -1406,20 +1369,6 @@ static void hci_cc_set_adv_param(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_unlock(hdev); } -static void hci_cc_write_remote_amp_assoc(struct hci_dev *hdev, - struct sk_buff *skb) -{ - struct hci_rp_write_remote_amp_assoc *rp = (void *) skb->data; - - BT_DBG("%s status 0x%2.2x phy_handle 0x%2.2x", - hdev->name, rp->status, rp->phy_handle); - - if (rp->status) - return; - - amp_write_rem_assoc_continue(hdev, rp->phy_handle); -} - static void hci_cc_read_rssi(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_rp_read_rssi *rp = (void *) skb->data; @@ -2995,10 +2944,6 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_read_clock(hdev, skb); break; - case HCI_OP_READ_LOCAL_AMP_ASSOC: - hci_cc_read_local_amp_assoc(hdev, skb); - break; - case HCI_OP_READ_INQ_RSP_TX_POWER: hci_cc_read_inq_rsp_tx_power(hdev, skb); break; @@ -3103,10 +3048,6 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cc_set_adv_param(hdev, skb); break; - case HCI_OP_WRITE_REMOTE_AMP_ASSOC: - hci_cc_write_remote_amp_assoc(hdev, skb); - break; - case HCI_OP_READ_RSSI: hci_cc_read_rssi(hdev, skb); break; -- cgit v1.2.3 From df9b89c7e46cf88ebf5a29c28c45d3b0b8c5d502 Mon Sep 17 00:00:00 2001 From: Arron Wang Date: Fri, 24 Jul 2015 17:11:01 +0800 Subject: Bluetooth: Move create/accept phy link completed callback to amp.c To avoid amp module hooks from hci_event.c Signed-off-by: Arron Wang Signed-off-by: Marcel Holtmann --- net/bluetooth/amp.c | 53 +++++++++++++++++++++++++++++++++++++++++++++-- net/bluetooth/hci_event.c | 49 ------------------------------------------- 2 files changed, 51 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c index 45888fad8362..238ddd3cf95f 100644 --- a/net/bluetooth/amp.c +++ b/net/bluetooth/amp.c @@ -417,10 +417,37 @@ void amp_write_remote_assoc(struct hci_dev *hdev, u8 handle) amp_write_rem_assoc_frag(hdev, hcon); } +static void create_phylink_complete(struct hci_dev *hdev, u8 status, + u16 opcode) +{ + struct hci_cp_create_phy_link *cp; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + cp = hci_sent_cmd_data(hdev, HCI_OP_CREATE_PHY_LINK); + if (!cp) + return; + + hci_dev_lock(hdev); + + if (status) { + struct hci_conn *hcon; + + hcon = hci_conn_hash_lookup_handle(hdev, cp->phy_handle); + if (hcon) + hci_conn_del(hcon); + } else { + amp_write_remote_assoc(hdev, cp->phy_handle); + } + + hci_dev_unlock(hdev); +} + void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr, struct hci_conn *hcon) { struct hci_cp_create_phy_link cp; + struct hci_request req; cp.phy_handle = hcon->handle; @@ -433,13 +460,33 @@ void amp_create_phylink(struct hci_dev *hdev, struct amp_mgr *mgr, return; } - hci_send_cmd(hdev, HCI_OP_CREATE_PHY_LINK, sizeof(cp), &cp); + hci_req_init(&req, hdev); + hci_req_add(&req, HCI_OP_CREATE_PHY_LINK, sizeof(cp), &cp); + hci_req_run(&req, create_phylink_complete); +} + +static void accept_phylink_complete(struct hci_dev *hdev, u8 status, + u16 opcode) +{ + struct hci_cp_accept_phy_link *cp; + + BT_DBG("%s status 0x%2.2x", hdev->name, status); + + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_ACCEPT_PHY_LINK); + if (!cp) + return; + + amp_write_remote_assoc(hdev, cp->phy_handle); } void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr, struct hci_conn *hcon) { struct hci_cp_accept_phy_link cp; + struct hci_request req; cp.phy_handle = hcon->handle; @@ -452,7 +499,9 @@ void amp_accept_phylink(struct hci_dev *hdev, struct amp_mgr *mgr, return; } - hci_send_cmd(hdev, HCI_OP_ACCEPT_PHY_LINK, sizeof(cp), &cp); + hci_req_init(&req, hdev); + hci_req_add(&req, HCI_OP_ACCEPT_PHY_LINK, sizeof(cp), &cp); + hci_req_run(&req, accept_phylink_complete); } void amp_physical_cfm(struct hci_conn *bredr_hcon, struct hci_conn *hs_hcon) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 1ee01864569f..218d7dfc342f 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1890,47 +1890,6 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status) hci_dev_unlock(hdev); } -static void hci_cs_create_phylink(struct hci_dev *hdev, u8 status) -{ - struct hci_cp_create_phy_link *cp; - - BT_DBG("%s status 0x%2.2x", hdev->name, status); - - cp = hci_sent_cmd_data(hdev, HCI_OP_CREATE_PHY_LINK); - if (!cp) - return; - - hci_dev_lock(hdev); - - if (status) { - struct hci_conn *hcon; - - hcon = hci_conn_hash_lookup_handle(hdev, cp->phy_handle); - if (hcon) - hci_conn_del(hcon); - } else { - amp_write_remote_assoc(hdev, cp->phy_handle); - } - - hci_dev_unlock(hdev); -} - -static void hci_cs_accept_phylink(struct hci_dev *hdev, u8 status) -{ - struct hci_cp_accept_phy_link *cp; - - BT_DBG("%s status 0x%2.2x", hdev->name, status); - - if (status) - return; - - cp = hci_sent_cmd_data(hdev, HCI_OP_ACCEPT_PHY_LINK); - if (!cp) - return; - - amp_write_remote_assoc(hdev, cp->phy_handle); -} - static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status) { struct hci_cp_le_create_conn *cp; @@ -3131,14 +3090,6 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb, hci_cs_setup_sync_conn(hdev, ev->status); break; - case HCI_OP_CREATE_PHY_LINK: - hci_cs_create_phylink(hdev, ev->status); - break; - - case HCI_OP_ACCEPT_PHY_LINK: - hci_cs_accept_phylink(hdev, ev->status); - break; - case HCI_OP_SNIFF_MODE: hci_cs_sniff_mode(hdev, ev->status); break; -- cgit v1.2.3 From 72b1e5e4cac72efa6b739b47e41f53e4520b4194 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 23 Jul 2015 16:21:30 +0200 Subject: netfilter: bridge: reduce nf_bridge_info to 32 bytes again We can use union for most of the temporary cruft (original ipv4/ipv6 address, source mac, physoutdev) since they're used during different stages of br netfilter traversal. Also get rid of the last two ->mask users. Shrinks struct from 48 to 32 on 64bit arch. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter_bridge.h | 12 +++++++++--- include/linux/skbuff.h | 19 +++++++++++++------ net/bridge/br_netfilter_hooks.c | 14 ++++++-------- net/bridge/br_netfilter_ipv6.c | 2 +- net/ipv4/netfilter/nf_defrag_ipv4.c | 7 ++----- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 7 ++----- 6 files changed, 33 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter_bridge.h b/include/linux/netfilter_bridge.h index 6d80fc686323..2437b8a5d7a9 100644 --- a/include/linux/netfilter_bridge.h +++ b/include/linux/netfilter_bridge.h @@ -17,9 +17,6 @@ enum nf_br_hook_priorities { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) -#define BRNF_BRIDGED_DNAT 0x02 -#define BRNF_NF_BRIDGE_PREROUTING 0x08 - int br_handle_frame_finish(struct sock *sk, struct sk_buff *skb); static inline void br_drop_fake_rtable(struct sk_buff *skb) @@ -63,8 +60,17 @@ nf_bridge_get_physoutdev(const struct sk_buff *skb) { return skb->nf_bridge ? skb->nf_bridge->physoutdev : NULL; } + +static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb) +{ + return skb->nf_bridge && skb->nf_bridge->in_prerouting; +} #else #define br_drop_fake_rtable(skb) do { } while (0) +static inline bool nf_bridge_in_prerouting(const struct sk_buff *skb) +{ + return false; +} #endif /* CONFIG_BRIDGE_NETFILTER */ #endif diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d6cdd6e87d53..ac732e67a6c8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -173,17 +173,24 @@ struct nf_bridge_info { BRNF_PROTO_8021Q, BRNF_PROTO_PPPOE } orig_proto:8; - bool pkt_otherhost; + u8 pkt_otherhost:1; + u8 in_prerouting:1; + u8 bridged_dnat:1; __u16 frag_max_size; - unsigned int mask; struct net_device *physindev; union { - struct net_device *physoutdev; - char neigh_header[8]; - }; - union { + /* prerouting: detect dnat in orig/reply direction */ __be32 ipv4_daddr; struct in6_addr ipv6_daddr; + + /* after prerouting + nat detected: store original source + * mac since neigh resolution overwrites it, only used while + * skb is out in neigh layer. + */ + char neigh_header[8]; + + /* always valid & non-NULL from FORWARD on, for physdev match */ + struct net_device *physoutdev; }; }; #endif diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index c8b9bcfe997e..ec51c2ba30e9 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -284,7 +284,7 @@ int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb) nf_bridge->neigh_header, ETH_HLEN-ETH_ALEN); /* tell br_dev_xmit to continue with forwarding */ - nf_bridge->mask |= BRNF_BRIDGED_DNAT; + nf_bridge->bridged_dnat = 1; /* FIXME Need to refragment */ ret = neigh->output(neigh, skb); } @@ -356,7 +356,7 @@ static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb) skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } - nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; + nf_bridge->in_prerouting = 0; if (br_nf_ipv4_daddr_was_changed(skb, nf_bridge)) { if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -444,7 +444,7 @@ struct net_device *setup_pre_routing(struct sk_buff *skb) nf_bridge->pkt_otherhost = true; } - nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING; + nf_bridge->in_prerouting = 1; nf_bridge->physindev = skb->dev; skb->dev = brnf_get_logical_dev(skb, skb->dev); @@ -850,10 +850,8 @@ static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops, struct sk_buff *skb, const struct nf_hook_state *state) { - if (skb->nf_bridge && - !(skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)) { + if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) return NF_STOP; - } return NF_ACCEPT; } @@ -872,7 +870,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) struct nf_bridge_info *nf_bridge = nf_bridge_info_get(skb); skb_pull(skb, ETH_HLEN); - nf_bridge->mask &= ~BRNF_BRIDGED_DNAT; + nf_bridge->bridged_dnat = 0; BUILD_BUG_ON(sizeof(nf_bridge->neigh_header) != (ETH_HLEN - ETH_ALEN)); @@ -887,7 +885,7 @@ static void br_nf_pre_routing_finish_bridge_slow(struct sk_buff *skb) static int br_nf_dev_xmit(struct sk_buff *skb) { - if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) { + if (skb->nf_bridge && skb->nf_bridge->bridged_dnat) { br_nf_pre_routing_finish_bridge_slow(skb); return 1; } diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 13b7d1e3d185..77383bfe7ea3 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -174,7 +174,7 @@ static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb) skb->pkt_type = PACKET_OTHERHOST; nf_bridge->pkt_otherhost = false; } - nf_bridge->mask &= ~BRNF_NF_BRIDGE_PREROUTING; + nf_bridge->in_prerouting = 0; if (br_nf_ipv6_daddr_was_changed(skb, nf_bridge)) { skb_dst_drop(skb); v6ops->route_input(skb); diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index c88b7d434718..b69e82bda215 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -49,12 +49,9 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, if (skb->nfct) zone = nf_ct_zone((struct nf_conn *)skb->nfct); #endif - -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (skb->nf_bridge && - skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) + if (nf_bridge_in_prerouting(skb)) return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone; -#endif + if (hooknum == NF_INET_PRE_ROUTING) return IP_DEFRAG_CONNTRACK_IN + zone; else diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index a45db0b4785c..267fb8d5876e 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -39,12 +39,9 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, if (skb->nfct) zone = nf_ct_zone((struct nf_conn *)skb->nfct); #endif - -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - if (skb->nf_bridge && - skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) + if (nf_bridge_in_prerouting(skb)) return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone; -#endif + if (hooknum == NF_INET_PRE_ROUTING) return IP6_DEFRAG_CONNTRACK_IN + zone; else -- cgit v1.2.3 From f4b3eee727e876d625cfe3585af48f4983c435d7 Mon Sep 17 00:00:00 2001 From: Bernhard Thaler Date: Thu, 30 Jul 2015 06:06:12 +0200 Subject: netfilter: bridge: do not initialize statics to 0 or NULL Fix checkpatch.pl "ERROR: do not initialise statics to 0 or NULL" for all statics explicitly initialized to 0. Signed-off-by: Bernhard Thaler Signed-off-by: Pablo Neira Ayuso --- net/bridge/br_netfilter_hooks.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index ec51c2ba30e9..0a6f095bb0c9 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -49,9 +49,9 @@ static struct ctl_table_header *brnf_sysctl_header; static int brnf_call_iptables __read_mostly = 1; static int brnf_call_ip6tables __read_mostly = 1; static int brnf_call_arptables __read_mostly = 1; -static int brnf_filter_vlan_tagged __read_mostly = 0; -static int brnf_filter_pppoe_tagged __read_mostly = 0; -static int brnf_pass_vlan_indev __read_mostly = 0; +static int brnf_filter_vlan_tagged __read_mostly; +static int brnf_filter_pppoe_tagged __read_mostly; +static int brnf_pass_vlan_indev __read_mostly; #else #define brnf_call_iptables 1 #define brnf_call_ip6tables 1 -- cgit v1.2.3 From 54c9ee39923c0ffa27b99a1b7321e0d07408c97b Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 16 Jul 2015 22:17:13 +0200 Subject: Bluetooth: cmtp: Do not use list_for_each_safe when not needed There is no need to use the safe version of list_for_each here. Signed-off-by: Christophe JAILLET Signed-off-by: Marcel Holtmann --- net/bluetooth/cmtp/capi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c index b0c6c6af76ef..9a50338772f3 100644 --- a/net/bluetooth/cmtp/capi.c +++ b/net/bluetooth/cmtp/capi.c @@ -100,9 +100,9 @@ static void cmtp_application_del(struct cmtp_session *session, struct cmtp_appli static struct cmtp_application *cmtp_application_get(struct cmtp_session *session, int pattern, __u16 value) { struct cmtp_application *app; - struct list_head *p, *n; + struct list_head *p; - list_for_each_safe(p, n, &session->applications) { + list_for_each(p, &session->applications) { app = list_entry(p, struct cmtp_application, list); switch (pattern) { case CMTP_MSGNUM: @@ -511,13 +511,13 @@ static int cmtp_proc_show(struct seq_file *m, void *v) struct capi_ctr *ctrl = m->private; struct cmtp_session *session = ctrl->driverdata; struct cmtp_application *app; - struct list_head *p, *n; + struct list_head *p; seq_printf(m, "%s\n\n", cmtp_procinfo(ctrl)); seq_printf(m, "addr %s\n", session->name); seq_printf(m, "ctrl %d\n", session->num); - list_for_each_safe(p, n, &session->applications) { + list_for_each(p, &session->applications) { app = list_entry(p, struct cmtp_application, list); seq_printf(m, "appl %d -> %d\n", app->appl, app->mapping); } -- cgit v1.2.3 From c22ff7b4e74d8136a9911d8b8d0f25f9f7c3edc1 Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Tue, 21 Jul 2015 17:44:47 +0300 Subject: mac802154: Fix memory corruption with global deferred transmit state. When transmitting a packet via a mac802154 driver that can sleep in its transmit function, mac802154 defers the call to the driver's transmit function to a per-device workqueue. However, mac802154 uses a single global work_struct for this, which means that if you have more than one registered mac802154 interface in the system, and you transmit on more than one of them at the same time, you'll very easily cause memory corruption. This patch moves the deferred transmit processing state from global variables to struct ieee802154_local, and this seems to fix the memory corruption issue. Signed-off-by: Lennert Buytenhek Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/ieee802154_i.h | 4 ++++ net/mac802154/main.c | 2 ++ net/mac802154/tx.c | 27 ++++++--------------------- 3 files changed, 12 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index 6810d7a25aca..56ccffa3f2bf 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -60,6 +60,9 @@ struct ieee802154_local { struct tasklet_struct tasklet; struct sk_buff_head skb_queue; + + struct sk_buff *tx_skb; + struct work_struct tx_work; }; enum { @@ -125,6 +128,7 @@ ieee802154_sdata_running(struct ieee802154_sub_if_data *sdata) extern struct ieee802154_mlme_ops mac802154_mlme_wpan; void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb); +void ieee802154_xmit_worker(struct work_struct *work); netdev_tx_t ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 91f120845a45..9e55431b9a5c 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -105,6 +105,8 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops) skb_queue_head_init(&local->skb_queue); + INIT_WORK(&local->tx_work, ieee802154_xmit_worker); + /* init supported flags with 802.15.4 default ranges */ phy->supported.max_minbe = 8; phy->supported.min_maxbe = 3; diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index c62e95695c78..7ed439172f30 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -30,23 +30,11 @@ #include "ieee802154_i.h" #include "driver-ops.h" -/* IEEE 802.15.4 transceivers can sleep during the xmit session, so process - * packets through the workqueue. - */ -struct ieee802154_xmit_cb { - struct sk_buff *skb; - struct work_struct work; - struct ieee802154_local *local; -}; - -static struct ieee802154_xmit_cb ieee802154_xmit_cb; - -static void ieee802154_xmit_worker(struct work_struct *work) +void ieee802154_xmit_worker(struct work_struct *work) { - struct ieee802154_xmit_cb *cb = - container_of(work, struct ieee802154_xmit_cb, work); - struct ieee802154_local *local = cb->local; - struct sk_buff *skb = cb->skb; + struct ieee802154_local *local = + container_of(work, struct ieee802154_local, tx_work); + struct sk_buff *skb = local->tx_skb; struct net_device *dev = skb->dev; int res; @@ -106,11 +94,8 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) dev->stats.tx_packets++; dev->stats.tx_bytes += skb->len; } else { - INIT_WORK(&ieee802154_xmit_cb.work, ieee802154_xmit_worker); - ieee802154_xmit_cb.skb = skb; - ieee802154_xmit_cb.local = local; - - queue_work(local->workqueue, &ieee802154_xmit_cb.work); + local->tx_skb = skb; + queue_work(local->workqueue, &local->tx_work); } return NETDEV_TX_OK; -- cgit v1.2.3 From 5857d1dbae7d5bf4219efc39996ad002362a2951 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 30 Jul 2015 09:40:53 +0200 Subject: Bluetooth: 6lowpan: Fix possible race This patch fix a possible race after calling register_netdev. After calling netdev_register it could be possible that netdev_ops callbacks use the uninitialized private data of lowpan_dev. By moving the initialization of this data before netdev_register we can be sure that initialized private data is be used after netdev_register. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 2fb7b3064904..0ffe2e24020a 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -859,9 +859,22 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev) SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev); SET_NETDEV_DEVTYPE(netdev, &bt_type); + *dev = netdev_priv(netdev); + (*dev)->netdev = netdev; + (*dev)->hdev = chan->conn->hcon->hdev; + INIT_LIST_HEAD(&(*dev)->peers); + + spin_lock(&devices_lock); + INIT_LIST_HEAD(&(*dev)->list); + list_add_rcu(&(*dev)->list, &bt_6lowpan_devices); + spin_unlock(&devices_lock); + err = register_netdev(netdev); if (err < 0) { BT_INFO("register_netdev failed %d", err); + spin_lock(&devices_lock); + list_del_rcu(&(*dev)->list); + spin_unlock(&devices_lock); free_netdev(netdev); goto out; } @@ -871,16 +884,6 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev) &chan->src, chan->src_type); set_bit(__LINK_STATE_PRESENT, &netdev->state); - *dev = netdev_priv(netdev); - (*dev)->netdev = netdev; - (*dev)->hdev = chan->conn->hcon->hdev; - INIT_LIST_HEAD(&(*dev)->peers); - - spin_lock(&devices_lock); - INIT_LIST_HEAD(&(*dev)->list); - list_add_rcu(&(*dev)->list, &bt_6lowpan_devices); - spin_unlock(&devices_lock); - return 0; out: -- cgit v1.2.3 From 8013d1d7eafb0589ca766db6b74026f76b7f5cb4 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Thu, 30 Jul 2015 14:28:42 +0800 Subject: net/ipv6: add sysctl option accept_ra_min_hop_limit Commit 6fd99094de2b ("ipv6: Don't reduce hop limit for an interface") disabled accept hop limit from RA if it is smaller than the current hop limit for security stuff. But this behavior kind of break the RFC definition. RFC 4861, 6.3.4. Processing Received Router Advertisements A Router Advertisement field (e.g., Cur Hop Limit, Reachable Time, and Retrans Timer) may contain a value denoting that it is unspecified. In such cases, the parameter should be ignored and the host should continue using whatever value it is already using. If the received Cur Hop Limit value is non-zero, the host SHOULD set its CurHopLimit variable to the received value. So add sysctl option accept_ra_min_hop_limit to let user choose the minimum hop limit value they can accept from RA. And set default to 1 to meet RFC standards. Signed-off-by: Hangbin Liu Acked-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 8 ++++++++ include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 10 ++++++++++ net/ipv6/ndisc.c | 16 +++++++--------- 5 files changed, 27 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 1a5ab21bcca5..00d26d919459 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1340,6 +1340,14 @@ accept_ra_from_local - BOOLEAN disabled if accept_ra_from_local is disabled on a specific interface. +accept_ra_min_hop_limit - INTEGER + Minimum hop limit Information in Router Advertisement. + + Hop limit Information in Router Advertisement less than this + variable shall be ignored. + + Default: 1 + accept_ra_pinfo - BOOLEAN Learn Prefix Information in Router Advertisement. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 06ed637225b8..cb9dcad72372 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -29,6 +29,7 @@ struct ipv6_devconf { __s32 max_desync_factor; __s32 max_addresses; __s32 accept_ra_defrtr; + __s32 accept_ra_min_hop_limit; __s32 accept_ra_pinfo; #ifdef CONFIG_IPV6_ROUTER_PREF __s32 accept_ra_rtr_pref; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 641a146ead7d..80f3b74446a1 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -172,6 +172,7 @@ enum { DEVCONF_ACCEPT_RA_MTU, DEVCONF_STABLE_SECRET, DEVCONF_USE_OIF_ADDRS_ONLY, + DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index eb0c6a3a8a00..53e3a9d756b0 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -195,6 +195,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, .accept_ra_from_local = 0, + .accept_ra_min_hop_limit= 1, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -237,6 +238,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .max_addresses = IPV6_MAX_ADDRESSES, .accept_ra_defrtr = 1, .accept_ra_from_local = 0, + .accept_ra_min_hop_limit= 1, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -4588,6 +4590,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor; array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses; array[DEVCONF_ACCEPT_RA_DEFRTR] = cnf->accept_ra_defrtr; + array[DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT] = cnf->accept_ra_min_hop_limit; array[DEVCONF_ACCEPT_RA_PINFO] = cnf->accept_ra_pinfo; #ifdef CONFIG_IPV6_ROUTER_PREF array[DEVCONF_ACCEPT_RA_RTR_PREF] = cnf->accept_ra_rtr_pref; @@ -5484,6 +5487,13 @@ static struct addrconf_sysctl_table .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "accept_ra_min_hop_limit", + .data = &ipv6_devconf.accept_ra_min_hop_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "accept_ra_pinfo", .data = &ipv6_devconf.accept_ra_pinfo, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 0a05b35a90fc..6e184e02fd3c 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1225,18 +1225,16 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (rt) rt6_set_expires(rt, jiffies + (HZ * lifetime)); - if (ra_msg->icmph.icmp6_hop_limit) { - /* Only set hop_limit on the interface if it is higher than - * the current hop_limit. - */ - if (in6_dev->cnf.hop_limit < ra_msg->icmph.icmp6_hop_limit) { + if (in6_dev->cnf.accept_ra_min_hop_limit < 256 && + ra_msg->icmph.icmp6_hop_limit) { + if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) { in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; + if (rt) + dst_metric_set(&rt->dst, RTAX_HOPLIMIT, + ra_msg->icmph.icmp6_hop_limit); } else { - ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than current\n"); + ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n"); } - if (rt) - dst_metric_set(&rt->dst, RTAX_HOPLIMIT, - ra_msg->icmph.icmp6_hop_limit); } skip_defrtr: -- cgit v1.2.3 From cbeb83ca68dcedf69b336fd1c5263658cbe5b51e Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 30 Jul 2015 18:24:15 -0400 Subject: tipc: eliminate function tipc_link_activate() The function tipc_link_activate() is redundant, since it mostly performs settings that have already been done in a preceding tipc_link_reset(). There are three exceptions to this: - The actual state change to TIPC_LINK_WORKING. This should anyway be done in the FSM, and not in a separate function. - Registration of the link with the bearer. This should be done by the node, since we don't want the link to have any knowledge about its specific bearer. - Call to tipc_node_link_up() for user access registration. With the new role distribution between link aggregation and link level this becomes the wrong call order; tipc_node_link_up() should instead be called directly as a result of a TIPC_LINK_UP event, hence by the node itself. This commit implements those changes. Tested-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 17 ++++------------- net/tipc/link.h | 1 - net/tipc/node.c | 6 ++++-- 3 files changed, 8 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index cc40aa6eb66c..05837ba7b68c 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -228,6 +228,8 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, l_ptr->peer_session = WILDCARD_SESSION; l_ptr->bearer_id = b_ptr->identity; l_ptr->tolerance = b_ptr->tolerance; + l_ptr->snd_nxt = 1; + l_ptr->rcv_nxt = 1; l_ptr->state = TIPC_LINK_RESETTING; l_ptr->pmsg = (struct tipc_msg *)&l_ptr->proto_msg; @@ -376,6 +378,7 @@ static int tipc_link_fsm_evt(struct tipc_link *l, int evt, pl = node_active_link(l->owner, 0); if (pl && link_probing(pl)) break; + l->state = TIPC_LINK_WORKING; actions |= LINK_ACTIVATE; if (!l->owner->working_links) actions |= SND_BCAST_SYNC; @@ -398,6 +401,7 @@ static int tipc_link_fsm_evt(struct tipc_link *l, int evt, pl = node_active_link(l->owner, 0); if (pl && link_probing(pl)) break; + l->state = TIPC_LINK_WORKING; actions |= LINK_ACTIVATE; if (!l->owner->working_links) actions |= SND_BCAST_SYNC; @@ -639,19 +643,6 @@ void tipc_link_reset(struct tipc_link *l_ptr) link_reset_statistics(l_ptr); } -void tipc_link_activate(struct tipc_link *link) -{ - struct tipc_node *node = link->owner; - - link->rcv_nxt = 1; - link->stats.recv_info = 1; - link->silent_intv_cnt = 0; - link->state = TIPC_LINK_WORKING; - link->exec_mode = TIPC_LINK_OPEN; - tipc_node_link_up(node, link->bearer_id); - tipc_bearer_add_dest(node->net, link->bearer_id, link->addr); -} - /** * __tipc_link_xmit(): same as tipc_link_xmit, but destlink is known & locked * @link: link to use diff --git a/net/tipc/link.h b/net/tipc/link.h index 37cfd7d7bf7d..279196d6baac 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -223,7 +223,6 @@ void tipc_link_purge_queues(struct tipc_link *l_ptr); void tipc_link_purge_backlog(struct tipc_link *l); void tipc_link_reset_all(struct tipc_node *node); void tipc_link_reset(struct tipc_link *l_ptr); -void tipc_link_activate(struct tipc_link *link); int __tipc_link_xmit(struct net *net, struct tipc_link *link, struct sk_buff_head *list); int tipc_link_xmit(struct tipc_link *link, struct sk_buff_head *list, diff --git a/net/tipc/node.c b/net/tipc/node.c index e92f84afbf95..558df25a7fc6 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -295,11 +295,13 @@ void tipc_node_link_up(struct tipc_node *n, int bearer_id) n->action_flags |= TIPC_NOTIFY_LINK_UP; n->link_id = l->peer_bearer_id << 16 | l->bearer_id; + tipc_bearer_add_dest(n->net, bearer_id, n->addr); + pr_debug("Established link <%s> on network plane %c\n", l->name, l->net_plane); /* No active links ? => take both active slots */ - if (*slot0 < 0) { + if (!tipc_node_is_up(n)) { *slot0 = bearer_id; *slot1 = bearer_id; node_established_contact(n); @@ -896,7 +898,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) rc = tipc_link_rcv(l, skb, &xmitq); if (unlikely(rc & TIPC_LINK_UP_EVT)) - tipc_link_activate(l); + tipc_node_link_up(n, bearer_id); if (unlikely(rc & TIPC_LINK_DOWN_EVT)) tipc_link_reset(l); skb = NULL; -- cgit v1.2.3 From 6144a996a65199480eed7521c1c50590c282e78e Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 30 Jul 2015 18:24:16 -0400 Subject: tipc: move all link_reset() calls to link aggregation level In line with our effort to let the node level have full control over its links, we want to move all link reset calls from link.c to node.c. Some of the calls can be moved by simply moving the calling function, when this is the right thing to do. For the remaining calls we use the now established technique of returning a TIPC_LINK_DOWN_EVT flag from tipc_link_rcv(), whereafter we perform the reset call when the call returns. This change serves as a preparation for the coming commits. Tested-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/bearer.c | 4 +-- net/tipc/link.c | 81 +++++++++++++---------------------------------------- net/tipc/link.h | 3 -- net/tipc/node.c | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++-- net/tipc/node.h | 1 + 5 files changed, 104 insertions(+), 69 deletions(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index eae58a6b121c..ce9f7bfc0b92 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -343,7 +343,7 @@ restart: static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b_ptr) { pr_info("Resetting bearer <%s>\n", b_ptr->name); - tipc_link_delete_list(net, b_ptr->identity); + tipc_node_delete_links(net, b_ptr->identity); tipc_disc_reset(net, b_ptr); return 0; } @@ -361,7 +361,7 @@ static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr) pr_info("Disabling bearer <%s>\n", b_ptr->name); b_ptr->media->disable_media(b_ptr); - tipc_link_delete_list(net, b_ptr->identity); + tipc_node_delete_links(net, b_ptr->identity); if (b_ptr->link_req) tipc_disc_delete(b_ptr->link_req); diff --git a/net/tipc/link.c b/net/tipc/link.c index 05837ba7b68c..8c81db7b17f9 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -137,9 +137,9 @@ static void link_print(struct tipc_link *l_ptr, const char *str); static void tipc_link_build_bcast_sync_msg(struct tipc_link *l, struct sk_buff_head *xmitq); static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf); -static void tipc_link_input(struct tipc_link *l, struct sk_buff *skb); +static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb); static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb); -static bool tipc_link_failover_rcv(struct tipc_link *l, struct sk_buff **skb); +static int tipc_link_failover_rcv(struct tipc_link *l, struct sk_buff **skb); /* * Simple link routines @@ -258,34 +258,6 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, return l_ptr; } -/** - * tipc_link_delete - Delete a link - * @l: link to be deleted - */ -void tipc_link_delete(struct tipc_link *l) -{ - tipc_link_reset(l); - tipc_link_reset_fragments(l); - tipc_node_detach_link(l->owner, l); -} - -void tipc_link_delete_list(struct net *net, unsigned int bearer_id) -{ - struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_link *link; - struct tipc_node *node; - - rcu_read_lock(); - list_for_each_entry_rcu(node, &tn->node_list, list) { - tipc_node_lock(node); - link = node->links[bearer_id].link; - if (link) - tipc_link_delete(link); - tipc_node_unlock(node); - } - rcu_read_unlock(); -} - /* tipc_link_build_bcast_sync_msg() - synchronize broadcast link endpoints. * * Give a newly added peer node the sequence number where it should @@ -875,26 +847,6 @@ void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) l->snd_nxt = seqno; } -void tipc_link_reset_all(struct tipc_node *node) -{ - char addr_string[16]; - u32 i; - - tipc_node_lock(node); - - pr_warn("Resetting all links to %s\n", - tipc_addr_string_fill(addr_string, node->addr)); - - for (i = 0; i < MAX_BEARERS; i++) { - if (node->links[i].link) { - link_print(node->links[i].link, "Resetting link\n"); - tipc_link_reset(node->links[i].link); - } - } - - tipc_node_unlock(node); -} - static void link_retransmit_failure(struct tipc_link *l_ptr, struct sk_buff *buf) { @@ -911,7 +863,6 @@ static void link_retransmit_failure(struct tipc_link *l_ptr, msg_errcode(msg)); pr_info("sqno %u, prev: %x, src: %x\n", msg_seqno(msg), msg_prevnode(msg), msg_orignode(msg)); - tipc_link_reset(l_ptr); } else { /* Handle failure on broadcast link */ struct tipc_node *n_ptr; @@ -987,6 +938,7 @@ static int tipc_link_retransm(struct tipc_link *l, int retransm, l->stale_count = 1; } else if (++l->stale_count > 100) { link_retransmit_failure(l, skb); + l->exec_mode = TIPC_LINK_BLOCKED; return TIPC_LINK_DOWN_EVT; } skb_queue_walk(&l->transmq, skb) { @@ -1079,12 +1031,13 @@ static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb) * Consumes buffer * Node lock must be held */ -static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb) +static int tipc_link_input(struct tipc_link *link, struct sk_buff *skb) { struct tipc_node *node = link->owner; struct tipc_msg *msg = buf_msg(skb); struct sk_buff *iskb; int pos = 0; + int rc = 0; switch (msg_user(msg)) { case TUNNEL_PROTOCOL: @@ -1094,7 +1047,8 @@ static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb) kfree_skb(skb); break; } - if (!tipc_link_failover_rcv(link, &skb)) + rc |= tipc_link_failover_rcv(link, &skb); + if (!skb) break; if (msg_user(buf_msg(skb)) != MSG_BUNDLER) { tipc_data_input(link, skb); @@ -1113,7 +1067,8 @@ static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb) link->stats.recv_fragmented++; tipc_data_input(link, skb); } else if (!link->reasm_buf) { - tipc_link_reset(link); + link->exec_mode = TIPC_LINK_BLOCKED; + rc |= TIPC_LINK_DOWN_EVT; } break; case BCAST_PROTOCOL: @@ -1122,6 +1077,7 @@ static void tipc_link_input(struct tipc_link *link, struct sk_buff *skb) default: break; }; + return rc; } static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked) @@ -1215,7 +1171,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, l->rcv_nxt++; l->stats.recv_info++; if (unlikely(!tipc_data_input(l, skb))) - tipc_link_input(l, skb); + rc |= tipc_link_input(l, skb); /* Ack at regular intervals */ if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) { @@ -1504,14 +1460,15 @@ tunnel_queue: /* tipc_link_failover_rcv(): Receive a tunnelled FAILOVER_MSG packet * Owner node is locked. */ -static bool tipc_link_failover_rcv(struct tipc_link *link, - struct sk_buff **skb) +static int tipc_link_failover_rcv(struct tipc_link *link, + struct sk_buff **skb) { struct tipc_msg *msg = buf_msg(*skb); struct sk_buff *iskb = NULL; struct tipc_link *pl = NULL; int bearer_id = msg_bearer_id(msg); int pos = 0; + int rc = 0; if (msg_type(msg) != FAILOVER_MSG) { pr_warn("%sunknown tunnel pkt received\n", link_co_err); @@ -1524,8 +1481,6 @@ static bool tipc_link_failover_rcv(struct tipc_link *link, goto exit; pl = link->owner->links[bearer_id].link; - if (pl && tipc_link_is_up(pl)) - tipc_link_reset(pl); if (link->failover_pkts == FIRST_FAILOVER) link->failover_pkts = msg_msgcnt(msg); @@ -1550,14 +1505,18 @@ static bool tipc_link_failover_rcv(struct tipc_link *link, } if (msg_user(buf_msg(iskb)) == MSG_FRAGMENTER) { link->stats.recv_fragments++; - tipc_buf_append(&link->failover_skb, &iskb); + if (!tipc_buf_append(&link->failover_skb, &iskb) && + !link->failover_skb) { + link->exec_mode = TIPC_LINK_BLOCKED; + rc |= TIPC_LINK_DOWN_EVT; + } } exit: if (!link->failover_pkts && pl) pl->exec_mode = TIPC_LINK_OPEN; kfree_skb(*skb); *skb = iskb; - return *skb; + return rc; } /* tipc_link_proto_rcv(): receive link level protocol message : diff --git a/net/tipc/link.h b/net/tipc/link.h index 279196d6baac..bb1378b7cb59 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -212,8 +212,6 @@ struct tipc_link *tipc_link_create(struct tipc_node *n, const struct tipc_media_addr *maddr, struct sk_buff_head *inputq, struct sk_buff_head *namedq); -void tipc_link_delete(struct tipc_link *link); -void tipc_link_delete_list(struct net *net, unsigned int bearer_id); void tipc_link_failover_send_queue(struct tipc_link *l_ptr); void tipc_link_dup_queue_xmit(struct tipc_link *l_ptr, struct tipc_link *dest); void tipc_link_reset_fragments(struct tipc_link *l_ptr); @@ -221,7 +219,6 @@ int tipc_link_is_up(struct tipc_link *l_ptr); int tipc_link_is_active(struct tipc_link *l_ptr); void tipc_link_purge_queues(struct tipc_link *l_ptr); void tipc_link_purge_backlog(struct tipc_link *l); -void tipc_link_reset_all(struct tipc_node *node); void tipc_link_reset(struct tipc_link *l_ptr); int __tipc_link_xmit(struct net *net, struct tipc_link *link, struct sk_buff_head *list); diff --git a/net/tipc/node.c b/net/tipc/node.c index 558df25a7fc6..6a0680ba98a9 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -407,6 +407,44 @@ bool tipc_node_update_dest(struct tipc_node *n, struct tipc_bearer *b, return true; } +void tipc_node_delete_links(struct net *net, int bearer_id) +{ + struct tipc_net *tn = net_generic(net, tipc_net_id); + struct tipc_link *l; + struct tipc_node *n; + + rcu_read_lock(); + list_for_each_entry_rcu(n, &tn->node_list, list) { + tipc_node_lock(n); + l = n->links[bearer_id].link; + if (l) { + tipc_link_reset(l); + n->links[bearer_id].link = NULL; + n->link_cnt--; + } + tipc_node_unlock(n); + kfree(l); + } + rcu_read_unlock(); +} + +static void tipc_node_reset_links(struct tipc_node *n) +{ + char addr_string[16]; + u32 i; + + tipc_node_lock(n); + + pr_warn("Resetting all links to %s\n", + tipc_addr_string_fill(addr_string, n->addr)); + + for (i = 0; i < MAX_BEARERS; i++) { + if (n->links[i].link) + tipc_link_reset(n->links[i].link); + } + tipc_node_unlock(n); +} + void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr) { n_ptr->links[l_ptr->bearer_id].link = l_ptr; @@ -721,7 +759,7 @@ void tipc_node_unlock(struct tipc_node *node) tipc_bclink_input(net); if (flags & TIPC_BCAST_RESET) - tipc_link_reset_all(node); + tipc_node_reset_links(node); } /* Caller should hold node lock for the passed node */ @@ -836,6 +874,40 @@ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, return 0; } +/* tipc_node_tnl_init(): handle a received TUNNEL_PROTOCOL packet, + * in order to control parallel link failover or synchronization + */ +static void tipc_node_tnl_init(struct tipc_node *n, int bearer_id, + struct sk_buff *skb) +{ + struct tipc_link *tnl, *pl; + struct tipc_msg *hdr = buf_msg(skb); + u16 oseqno = msg_seqno(hdr); + int pb_id = msg_bearer_id(hdr); + + if (pb_id >= MAX_BEARERS) + return; + + tnl = n->links[bearer_id].link; + if (!tnl) + return; + + /* Ignore if duplicate */ + if (less(oseqno, tnl->rcv_nxt)) + return; + + pl = n->links[pb_id].link; + if (!pl) + return; + + if (msg_type(hdr) == FAILOVER_MSG) { + if (tipc_link_is_up(pl)) { + tipc_link_reset(pl); + pl->exec_mode = TIPC_LINK_BLOCKED; + } + } +} + /** * tipc_rcv - process TIPC packets/messages arriving from off-node * @net: the applicable net namespace @@ -854,6 +926,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) struct tipc_media_addr *maddr; int bearer_id = b->identity; int rc = 0; + int usr; __skb_queue_head_init(&xmitq); @@ -863,8 +936,9 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) /* Handle arrival of a non-unicast link packet */ hdr = buf_msg(skb); + usr = msg_user(hdr); if (unlikely(msg_non_seq(hdr))) { - if (msg_user(hdr) == LINK_CONFIG) + if (usr == LINK_CONFIG) tipc_disc_rcv(net, skb, b); else tipc_bclink_rcv(net, skb); @@ -877,6 +951,10 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) goto discard; tipc_node_lock(n); + /* Prepare links for tunneled reception if applicable */ + if (unlikely(usr == TUNNEL_PROTOCOL)) + tipc_node_tnl_init(n, bearer_id, skb); + /* Locate link endpoint that should handle packet */ l = n->links[bearer_id].link; if (unlikely(!l)) @@ -887,7 +965,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) if (!tipc_node_filter_skb(n, l, hdr)) goto unlock; - if (unlikely(msg_user(hdr) == LINK_PROTOCOL)) + if (unlikely(usr == LINK_PROTOCOL)) tipc_bclink_sync_state(n, hdr); /* Release acked broadcast messages */ diff --git a/net/tipc/node.h b/net/tipc/node.h index 5e7016802077..49df0e934a65 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -171,6 +171,7 @@ void tipc_node_check_dest(struct tipc_node *n, struct tipc_bearer *bearer, struct tipc_media_addr *maddr); bool tipc_node_update_dest(struct tipc_node *n, struct tipc_bearer *bearer, struct tipc_media_addr *maddr); +void tipc_node_delete_links(struct net *net, int bearer_id); void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); void tipc_node_link_down(struct tipc_node *n_ptr, int bearer_id); -- cgit v1.2.3 From 655fb243b8ae5e652f744311bcb6e806e83cea1e Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 30 Jul 2015 18:24:17 -0400 Subject: tipc: reverse call order for link_reset()->node_link_down() In many cases the call order when a link is reset goes as follows: tipc_node_xx()->tipc_link_reset()->tipc_node_link_down() This is not the right order if we want the node to be in control, so in this commit we change the order to: tipc_node_xx()->tipc_node_link_down()->tipc_link_reset() The fact that tipc_link_reset() now is called from only one location with a well-defined state will also facilitate later simplifications of tipc_link_reset() and the link FSM. Tested-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 6 +----- net/tipc/node.c | 30 +++++++++++++++++++++--------- 2 files changed, 22 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 8c81db7b17f9..2ccdb6ffd5c8 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -566,7 +566,6 @@ void tipc_link_purge_queues(struct tipc_link *l_ptr) void tipc_link_reset(struct tipc_link *l_ptr) { u32 prev_state = l_ptr->state; - int was_active_link = tipc_link_is_active(l_ptr); struct tipc_node *owner = l_ptr->owner; struct tipc_link *pl = tipc_parallel_link(l_ptr); @@ -584,10 +583,7 @@ void tipc_link_reset(struct tipc_link *l_ptr) (prev_state == TIPC_LINK_ESTABLISHING)) return; - tipc_node_link_down(l_ptr->owner, l_ptr->bearer_id); - tipc_bearer_remove_dest(owner->net, l_ptr->bearer_id, l_ptr->addr); - - if (was_active_link && tipc_node_is_up(l_ptr->owner) && (pl != l_ptr)) { + if (tipc_node_is_up(l_ptr->owner) && (pl != l_ptr)) { l_ptr->exec_mode = TIPC_LINK_BLOCKED; l_ptr->failover_checkpt = l_ptr->rcv_nxt; pl->failover_pkts = FIRST_FAILOVER; diff --git a/net/tipc/node.c b/net/tipc/node.c index 6a0680ba98a9..65c2c80cffe7 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -265,7 +265,7 @@ static void tipc_node_timeout(unsigned long data) tipc_node_calculate_timer(n, l); rc = tipc_link_timeout(l, &xmitq); if (rc & TIPC_LINK_DOWN_EVT) - tipc_link_reset(l); + tipc_node_link_down(n, bearer_id); } tipc_node_unlock(n); maddr = &n->links[bearer_id].maddr; @@ -338,10 +338,15 @@ void tipc_node_link_down(struct tipc_node *n, int bearer_id) struct tipc_link *l, *_l; l = n->links[bearer_id].link; + if (!l || !tipc_link_is_up(l)) + return; + n->working_links--; n->action_flags |= TIPC_NOTIFY_LINK_DOWN; n->link_id = l->peer_bearer_id << 16 | l->bearer_id; + tipc_bearer_remove_dest(n->net, l->bearer_id, n->addr); + pr_debug("Lost link <%s> on network plane %c\n", l->name, l->net_plane); @@ -352,6 +357,8 @@ void tipc_node_link_down(struct tipc_node *n, int bearer_id) _l = n->links[i].link; if (!_l || !tipc_link_is_up(_l)) continue; + if (_l == l) + continue; if (_l->priority < highest) continue; if (_l->priority > highest) { @@ -362,9 +369,13 @@ void tipc_node_link_down(struct tipc_node *n, int bearer_id) } *slot1 = i; } + if (tipc_node_is_up(n)) tipc_link_failover_send_queue(l); - else + + tipc_link_reset(l); + + if (!tipc_node_is_up(n)) node_lost_contact(n); } @@ -403,7 +414,7 @@ bool tipc_node_update_dest(struct tipc_node *n, struct tipc_bearer *b, } memcpy(&l->media_addr, maddr, sizeof(*maddr)); memcpy(curr, maddr, sizeof(*maddr)); - tipc_link_reset(l); + tipc_node_link_down(n, b->identity); return true; } @@ -418,7 +429,7 @@ void tipc_node_delete_links(struct net *net, int bearer_id) tipc_node_lock(n); l = n->links[bearer_id].link; if (l) { - tipc_link_reset(l); + tipc_node_link_down(n, bearer_id); n->links[bearer_id].link = NULL; n->link_cnt--; } @@ -439,8 +450,9 @@ static void tipc_node_reset_links(struct tipc_node *n) tipc_addr_string_fill(addr_string, n->addr)); for (i = 0; i < MAX_BEARERS; i++) { - if (n->links[i].link) - tipc_link_reset(n->links[i].link); + if (!n->links[i].link) + continue; + tipc_node_link_down(n, i); } tipc_node_unlock(n); } @@ -837,7 +849,7 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, if (likely(l)) rc = tipc_link_xmit(l, list, &xmitq); if (unlikely(rc == -ENOBUFS)) - tipc_link_reset(l); + tipc_node_link_down(n, bearer_id); tipc_node_unlock(n); tipc_node_put(n); } @@ -902,7 +914,7 @@ static void tipc_node_tnl_init(struct tipc_node *n, int bearer_id, if (msg_type(hdr) == FAILOVER_MSG) { if (tipc_link_is_up(pl)) { - tipc_link_reset(pl); + tipc_node_link_down(n, pb_id); pl->exec_mode = TIPC_LINK_BLOCKED; } } @@ -978,7 +990,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) if (unlikely(rc & TIPC_LINK_UP_EVT)) tipc_node_link_up(n, bearer_id); if (unlikely(rc & TIPC_LINK_DOWN_EVT)) - tipc_link_reset(l); + tipc_node_link_down(n, bearer_id); skb = NULL; unlock: tipc_node_unlock(n); -- cgit v1.2.3 From 66996b6c47ed7f6bbb01a768e23fae262c7db8e0 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 30 Jul 2015 18:24:18 -0400 Subject: tipc: extend node FSM In the next commit, we will move link synch/failover orchestration to the link aggregation level. In order to do this, we first need to extend the node FSM with two more states, NODE_SYNCHING and NODE_FAILINGOVER, plus four new events to enter and leave those states. This commit introduces this change, without yet making use of it. The node FSM now looks as follows: +-----------------------------------------+ | PEER_DOWN_EVT| | | +------------------------+----------------+ | |SELF_DOWN_EVT | | | | | | | | +-----------+ +-----------+ | | |NODE_ | |NODE_ | | | +----------|FAILINGOVER|<---------|SYNCHING |------------+ | | |SELF_ +-----------+ FAILOVER_+-----------+ PEER_ | | | |DOWN_EVT | A BEGIN_EVT A | DOWN_EVT| | | | | | | | | | | | | | | | | | | | |FAILOVER_|FAILOVER_ |SYNCH_ |SYNCH_ | | | | |END_EVT |BEGIN_EVT |BEGIN_EVT|END_EVT | | | | | | | | | | | | | | | | | | | | | +--------------+ | | | | | +------->| SELF_UP_ |<-------+ | | | | +----------------| PEER_UP |------------------+ | | | | |SELF_DOWN_EVT +--------------+ PEER_DOWN_EVT| | | | | | A A | | | | | | | | | | | | | | PEER_UP_EVT| |SELF_UP_EVT | | | | | | | | | | | V V V | | V V V +------------+ +-----------+ +-----------+ +------------+ |SELF_DOWN_ | |SELF_UP_ | |PEER_UP_ | |PEER_DOWN | |PEER_LEAVING|<------|PEER_COMING| |SELF_COMING|------>|SELF_LEAVING| +------------+ SELF_ +-----------+ +-----------+ PEER_ +------------+ | DOWN_EVT A A DOWN_EVT | | | | | | | | | | SELF_UP_EVT| |PEER_UP_EVT | | | | | | | | | |PEER_DOWN_EVT +--------------+ SELF_DOWN_EVT| +------------------->| SELF_DOWN_ |<--------------------+ | PEER_DOWN | +--------------+ Tested-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/node.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++----- net/tipc/node.h | 14 ++++++--- 2 files changed, 92 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 65c2c80cffe7..6b18d73830ca 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -494,8 +494,12 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) case SELF_LOST_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: break; + case NODE_SYNCH_END_EVT: + case NODE_SYNCH_BEGIN_EVT: + case NODE_FAILOVER_BEGIN_EVT: + case NODE_FAILOVER_END_EVT: default: - pr_err("Unknown node fsm evt %x/%x\n", state, evt); + goto illegal_evt; } break; case SELF_UP_PEER_UP: @@ -506,11 +510,19 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; + case NODE_SYNCH_BEGIN_EVT: + state = NODE_SYNCHING; + break; + case NODE_FAILOVER_BEGIN_EVT: + state = NODE_FAILINGOVER; + break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: + case NODE_SYNCH_END_EVT: + case NODE_FAILOVER_END_EVT: break; default: - pr_err("Unknown node fsm evt %x/%x\n", state, evt); + goto illegal_evt; } break; case SELF_DOWN_PEER_LEAVING: @@ -522,8 +534,12 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) case PEER_ESTABL_CONTACT_EVT: case SELF_LOST_CONTACT_EVT: break; + case NODE_SYNCH_END_EVT: + case NODE_SYNCH_BEGIN_EVT: + case NODE_FAILOVER_BEGIN_EVT: + case NODE_FAILOVER_END_EVT: default: - pr_err("Unknown node fsm evt %x/%x\n", state, evt); + goto illegal_evt; } break; case SELF_UP_PEER_COMING: @@ -537,8 +553,12 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) case SELF_ESTABL_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: break; + case NODE_SYNCH_END_EVT: + case NODE_SYNCH_BEGIN_EVT: + case NODE_FAILOVER_BEGIN_EVT: + case NODE_FAILOVER_END_EVT: default: - pr_err("Unknown node fsm evt %x/%x\n", state, evt); + goto illegal_evt; } break; case SELF_COMING_PEER_UP: @@ -552,8 +572,12 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) case SELF_LOST_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; + case NODE_SYNCH_END_EVT: + case NODE_SYNCH_BEGIN_EVT: + case NODE_FAILOVER_BEGIN_EVT: + case NODE_FAILOVER_END_EVT: default: - pr_err("Unknown node fsm evt %x/%x\n", state, evt); + goto illegal_evt; } break; case SELF_LEAVING_PEER_DOWN: @@ -565,16 +589,67 @@ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) case PEER_ESTABL_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: break; + case NODE_SYNCH_END_EVT: + case NODE_SYNCH_BEGIN_EVT: + case NODE_FAILOVER_BEGIN_EVT: + case NODE_FAILOVER_END_EVT: + default: + goto illegal_evt; + } + break; + case NODE_FAILINGOVER: + switch (evt) { + case SELF_LOST_CONTACT_EVT: + state = SELF_DOWN_PEER_LEAVING; + break; + case PEER_LOST_CONTACT_EVT: + state = SELF_LEAVING_PEER_DOWN; + break; + case NODE_FAILOVER_END_EVT: + state = SELF_UP_PEER_UP; + break; + case NODE_FAILOVER_BEGIN_EVT: + case SELF_ESTABL_CONTACT_EVT: + case PEER_ESTABL_CONTACT_EVT: + break; + case NODE_SYNCH_BEGIN_EVT: + case NODE_SYNCH_END_EVT: default: - pr_err("Unknown node fsm evt %x/%x\n", state, evt); + goto illegal_evt; + } + break; + case NODE_SYNCHING: + switch (evt) { + case SELF_LOST_CONTACT_EVT: + state = SELF_DOWN_PEER_LEAVING; + break; + case PEER_LOST_CONTACT_EVT: + state = SELF_LEAVING_PEER_DOWN; + break; + case NODE_SYNCH_END_EVT: + state = SELF_UP_PEER_UP; + break; + case NODE_FAILOVER_BEGIN_EVT: + state = NODE_FAILINGOVER; + break; + case NODE_SYNCH_BEGIN_EVT: + case SELF_ESTABL_CONTACT_EVT: + case PEER_ESTABL_CONTACT_EVT: + break; + case NODE_FAILOVER_END_EVT: + default: + goto illegal_evt; } break; default: pr_err("Unknown node fsm state %x\n", state); break; } - n->state = state; + return; + +illegal_evt: + pr_err("Illegal node fsm evt %x in state %x\n", evt, state); } bool tipc_node_filter_skb(struct tipc_node *n, struct tipc_link *l, diff --git a/net/tipc/node.h b/net/tipc/node.h index 49df0e934a65..65e2728f66a6 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -56,13 +56,19 @@ enum { SELF_UP_PEER_COMING = 0xac, SELF_COMING_PEER_UP = 0xca, SELF_LEAVING_PEER_DOWN = 0x1d, + NODE_FAILINGOVER = 0xf0, + NODE_SYNCHING = 0xcc }; enum { - SELF_ESTABL_CONTACT_EVT = 0xec, - SELF_LOST_CONTACT_EVT = 0x1c, - PEER_ESTABL_CONTACT_EVT = 0xfec, - PEER_LOST_CONTACT_EVT = 0xf1c + SELF_ESTABL_CONTACT_EVT = 0xece, + SELF_LOST_CONTACT_EVT = 0x1ce, + PEER_ESTABL_CONTACT_EVT = 0xfece, + PEER_LOST_CONTACT_EVT = 0xf1ce, + NODE_FAILOVER_BEGIN_EVT = 0xfbe, + NODE_FAILOVER_END_EVT = 0xfee, + NODE_SYNCH_BEGIN_EVT = 0xcbe, + NODE_SYNCH_END_EVT = 0xcee }; /* Flags used to take different actions according to flag type -- cgit v1.2.3 From 6e498158a827fd515b514842e9a06bdf0f75ab86 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 30 Jul 2015 18:24:19 -0400 Subject: tipc: move link synch and failover to link aggregation level Link failover and synchronization have until now been handled by the links themselves, forcing them to have knowledge about and to access parallel links in order to make the two algorithms work correctly. In this commit, we move the control part of this functionality to the link aggregation level in node.c, which is the right location for this. As a result, the two algorithms become easier to follow, and the link implementation becomes simpler. Tested-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 482 ++++++++++++++++---------------------------------------- net/tipc/link.h | 14 +- net/tipc/msg.h | 32 ++-- net/tipc/node.c | 291 ++++++++++++++++++++++------------ net/tipc/node.h | 31 +--- 5 files changed, 342 insertions(+), 508 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 2ccdb6ffd5c8..d5f4005f388f 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -48,7 +48,7 @@ /* * Error message prefixes */ -static const char *link_co_err = "Link changeover error, "; +static const char *link_co_err = "Link tunneling error, "; static const char *link_rst_msg = "Resetting link "; static const char *link_unk_evt = "Unknown link event "; @@ -139,24 +139,6 @@ static void tipc_link_build_bcast_sync_msg(struct tipc_link *l, static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf); static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb); static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb); -static int tipc_link_failover_rcv(struct tipc_link *l, struct sk_buff **skb); - -/* - * Simple link routines - */ -static unsigned int align(unsigned int i) -{ - return (i + 3) & ~3u; -} - -static struct tipc_link *tipc_parallel_link(struct tipc_link *l) -{ - struct tipc_node *n = l->owner; - - if (node_active_link(n, 0) != l) - return node_active_link(n, 0); - return node_active_link(n, 1); -} /* * Simple non-static link routines (i.e. referenced outside this file) @@ -394,12 +376,10 @@ static int tipc_link_fsm_evt(struct tipc_link *l, int evt, /* Perform actions as decided by FSM */ if (actions & LINK_RESET) { l->exec_mode = TIPC_LINK_BLOCKED; - rc |= TIPC_LINK_DOWN_EVT; - } - if (actions & LINK_ACTIVATE) { - l->exec_mode = TIPC_LINK_OPEN; - rc |= TIPC_LINK_UP_EVT; + rc = TIPC_LINK_DOWN_EVT; } + if (actions & LINK_ACTIVATE) + rc = TIPC_LINK_UP_EVT; if (actions & (SND_STATE | SND_PROBE)) mtyp = STATE_MSG; if (actions & SND_RESET) @@ -461,6 +441,9 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) { int rc = 0; + if (l->exec_mode == TIPC_LINK_BLOCKED) + return rc; + link_profile_stats(l); if (l->silent_intv_cnt) rc = tipc_link_fsm_evt(l, SILENCE_EVT, xmitq); @@ -563,52 +546,42 @@ void tipc_link_purge_queues(struct tipc_link *l_ptr) tipc_link_reset_fragments(l_ptr); } -void tipc_link_reset(struct tipc_link *l_ptr) +void tipc_link_reset(struct tipc_link *l) { - u32 prev_state = l_ptr->state; - struct tipc_node *owner = l_ptr->owner; - struct tipc_link *pl = tipc_parallel_link(l_ptr); + struct tipc_node *owner = l->owner; - msg_set_session(l_ptr->pmsg, ((msg_session(l_ptr->pmsg) + 1) & 0xffff)); + l->state = TIPC_LINK_RESETTING; /* Link is down, accept any session */ - l_ptr->peer_session = WILDCARD_SESSION; + l->peer_session = WILDCARD_SESSION; - /* Prepare for renewed mtu size negotiation */ - l_ptr->mtu = l_ptr->advertised_mtu; - - l_ptr->state = TIPC_LINK_RESETTING; + /* If peer is up, it only accepts an incremented session number */ + msg_set_session(l->pmsg, msg_session(l->pmsg) + 1); - if ((prev_state == TIPC_LINK_RESETTING) || - (prev_state == TIPC_LINK_ESTABLISHING)) - return; + /* Prepare for renewed mtu size negotiation */ + l->mtu = l->advertised_mtu; - if (tipc_node_is_up(l_ptr->owner) && (pl != l_ptr)) { - l_ptr->exec_mode = TIPC_LINK_BLOCKED; - l_ptr->failover_checkpt = l_ptr->rcv_nxt; - pl->failover_pkts = FIRST_FAILOVER; - pl->failover_checkpt = l_ptr->rcv_nxt; - pl->failover_skb = l_ptr->reasm_buf; - } else { - kfree_skb(l_ptr->reasm_buf); - } /* Clean up all queues, except inputq: */ - __skb_queue_purge(&l_ptr->transmq); - __skb_queue_purge(&l_ptr->deferdq); + __skb_queue_purge(&l->transmq); + __skb_queue_purge(&l->deferdq); if (!owner->inputq) - owner->inputq = l_ptr->inputq; - skb_queue_splice_init(&l_ptr->wakeupq, owner->inputq); + owner->inputq = l->inputq; + skb_queue_splice_init(&l->wakeupq, owner->inputq); if (!skb_queue_empty(owner->inputq)) owner->action_flags |= TIPC_MSG_EVT; - tipc_link_purge_backlog(l_ptr); - l_ptr->reasm_buf = NULL; - l_ptr->rcv_unacked = 0; - l_ptr->snd_nxt = 1; - l_ptr->rcv_nxt = 1; - l_ptr->silent_intv_cnt = 0; - l_ptr->stats.recv_info = 0; - l_ptr->stale_count = 0; - link_reset_statistics(l_ptr); + + tipc_link_purge_backlog(l); + kfree_skb(l->reasm_buf); + kfree_skb(l->failover_reasm_skb); + l->reasm_buf = NULL; + l->failover_reasm_skb = NULL; + l->rcv_unacked = 0; + l->snd_nxt = 1; + l->rcv_nxt = 1; + l->silent_intv_cnt = 0; + l->stats.recv_info = 0; + l->stale_count = 0; + link_reset_statistics(l); } /** @@ -751,20 +724,6 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, return 0; } -static void skb2list(struct sk_buff *skb, struct sk_buff_head *list) -{ - skb_queue_head_init(list); - __skb_queue_tail(list, skb); -} - -static int __tipc_link_xmit_skb(struct tipc_link *link, struct sk_buff *skb) -{ - struct sk_buff_head head; - - skb2list(skb, &head); - return __tipc_link_xmit(link->owner->net, link, &head); -} - /* * tipc_link_sync_rcv - synchronize broadcast link endpoints. * Receive the sequence number where we should start receiving and @@ -955,32 +914,6 @@ static int tipc_link_retransm(struct tipc_link *l, int retransm, return 0; } -/* link_synch(): check if all packets arrived before the synch - * point have been consumed - * Returns true if the parallel links are synched, otherwise false - */ -static bool link_synch(struct tipc_link *l) -{ - unsigned int post_synch; - struct tipc_link *pl; - - pl = tipc_parallel_link(l); - if (pl == l) - goto synched; - - /* Was last pre-synch packet added to input queue ? */ - if (less_eq(pl->rcv_nxt, l->synch_point)) - return false; - - /* Is it still in the input queue ? */ - post_synch = mod(pl->rcv_nxt - l->synch_point) - 1; - if (skb_queue_len(pl->inputq) > post_synch) - return false; -synched: - l->exec_mode = TIPC_LINK_OPEN; - return true; -} - /* tipc_data_input - deliver data and name distr msgs to upper layer * * Consumes buffer if message is of right type @@ -1025,54 +958,59 @@ static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb) /* tipc_link_input - process packet that has passed link protocol check * * Consumes buffer - * Node lock must be held */ -static int tipc_link_input(struct tipc_link *link, struct sk_buff *skb) +static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb) { - struct tipc_node *node = link->owner; - struct tipc_msg *msg = buf_msg(skb); + struct tipc_node *node = l->owner; + struct tipc_msg *hdr = buf_msg(skb); + struct sk_buff **reasm_skb = &l->reasm_buf; struct sk_buff *iskb; - int pos = 0; + int usr = msg_user(hdr); int rc = 0; + int pos = 0; + int ipos = 0; - switch (msg_user(msg)) { - case TUNNEL_PROTOCOL: - if (msg_dup(msg)) { - link->exec_mode = TIPC_LINK_TUNNEL; - link->synch_point = msg_seqno(msg_get_wrapped(msg)); - kfree_skb(skb); - break; + if (unlikely(usr == TUNNEL_PROTOCOL)) { + if (msg_type(hdr) == SYNCH_MSG) { + __skb_queue_purge(&l->deferdq); + goto drop; } - rc |= tipc_link_failover_rcv(link, &skb); - if (!skb) - break; - if (msg_user(buf_msg(skb)) != MSG_BUNDLER) { - tipc_data_input(link, skb); - break; - } - case MSG_BUNDLER: - link->stats.recv_bundles++; - link->stats.recv_bundled += msg_msgcnt(msg); + if (!tipc_msg_extract(skb, &iskb, &ipos)) + return rc; + kfree_skb(skb); + skb = iskb; + hdr = buf_msg(skb); + if (less(msg_seqno(hdr), l->drop_point)) + goto drop; + if (tipc_data_input(l, skb)) + return rc; + usr = msg_user(hdr); + reasm_skb = &l->failover_reasm_skb; + } + if (usr == MSG_BUNDLER) { + l->stats.recv_bundles++; + l->stats.recv_bundled += msg_msgcnt(hdr); while (tipc_msg_extract(skb, &iskb, &pos)) - tipc_data_input(link, iskb); - break; - case MSG_FRAGMENTER: - link->stats.recv_fragments++; - if (tipc_buf_append(&link->reasm_buf, &skb)) { - link->stats.recv_fragmented++; - tipc_data_input(link, skb); - } else if (!link->reasm_buf) { - link->exec_mode = TIPC_LINK_BLOCKED; - rc |= TIPC_LINK_DOWN_EVT; + tipc_data_input(l, iskb); + return rc; + } else if (usr == MSG_FRAGMENTER) { + l->stats.recv_fragments++; + if (tipc_buf_append(reasm_skb, &skb)) { + l->stats.recv_fragmented++; + tipc_data_input(l, skb); + } else if (!*reasm_skb) { + l->exec_mode = TIPC_LINK_BLOCKED; + l->state = TIPC_LINK_RESETTING; + rc = TIPC_LINK_DOWN_EVT; } - break; - case BCAST_PROTOCOL: + return rc; + } else if (usr == BCAST_PROTOCOL) { tipc_link_sync_rcv(node, skb); - break; - default: - break; - }; + return rc; + } +drop: + kfree_skb(skb); return rc; } @@ -1100,7 +1038,6 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *xmitq) { struct sk_buff_head *arrvq = &l->deferdq; - struct sk_buff *tmp; struct tipc_msg *hdr; u16 seqno, rcv_nxt; int rc = 0; @@ -1112,18 +1049,18 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, return rc; } - skb_queue_walk_safe(arrvq, skb, tmp) { + while ((skb = skb_peek(arrvq))) { hdr = buf_msg(skb); /* Verify and update link state */ if (unlikely(msg_user(hdr) == LINK_PROTOCOL)) { __skb_dequeue(arrvq); - rc |= tipc_link_proto_rcv(l, skb, xmitq); + rc = tipc_link_proto_rcv(l, skb, xmitq); continue; } if (unlikely(!link_working(l))) { - rc |= tipc_link_fsm_evt(l, TRAFFIC_EVT, xmitq); + rc = tipc_link_fsm_evt(l, TRAFFIC_EVT, xmitq); if (!link_working(l)) { kfree_skb(__skb_dequeue(arrvq)); return rc; @@ -1156,18 +1093,11 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, return rc; } - /* Synchronize with parallel link if applicable */ - if (unlikely(l->exec_mode == TIPC_LINK_TUNNEL)) - if (!msg_dup(hdr) && !link_synch(l)) { - kfree_skb(skb); - return rc; - } - /* Packet can be delivered */ l->rcv_nxt++; l->stats.recv_info++; if (unlikely(!tipc_data_input(l, skb))) - rc |= tipc_link_input(l, skb); + rc = tipc_link_input(l, skb); /* Ack at regular intervals */ if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) { @@ -1288,7 +1218,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, } else { /* RESET_MSG or ACTIVATE_MSG */ msg_set_max_pkt(hdr, l->advertised_mtu); - msg_set_ack(hdr, l->failover_checkpt - 1); + msg_set_ack(hdr, l->rcv_nxt - 1); msg_set_next_sent(hdr, 1); } skb = tipc_buf_acquire(msg_size(hdr)); @@ -1296,223 +1226,75 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, return; skb_copy_to_linear_data(skb, hdr, msg_size(hdr)); skb->priority = TC_PRIO_CONTROL; - __skb_queue_head(xmitq, skb); -} - -/* tipc_link_tunnel_xmit(): Tunnel one packet via a link belonging to - * a different bearer. Owner node is locked. - */ -static void tipc_link_tunnel_xmit(struct tipc_link *l_ptr, - struct tipc_msg *tunnel_hdr, - struct tipc_msg *msg, - u32 selector) -{ - struct tipc_link *tunnel; - struct sk_buff *skb; - u32 length = msg_size(msg); - - tunnel = node_active_link(l_ptr->owner, selector & 1); - if (!tipc_link_is_up(tunnel)) { - pr_warn("%stunnel link no longer available\n", link_co_err); - return; - } - msg_set_size(tunnel_hdr, length + INT_H_SIZE); - skb = tipc_buf_acquire(length + INT_H_SIZE); - if (!skb) { - pr_warn("%sunable to send tunnel msg\n", link_co_err); - return; - } - skb_copy_to_linear_data(skb, tunnel_hdr, INT_H_SIZE); - skb_copy_to_linear_data_offset(skb, INT_H_SIZE, msg, length); - __tipc_link_xmit_skb(tunnel, skb); + __skb_queue_tail(xmitq, skb); } - -/* tipc_link_failover_send_queue(): A link has gone down, but a second - * link is still active. We can do failover. Tunnel the failing link's - * whole send queue via the remaining link. This way, we don't lose - * any packets, and sequence order is preserved for subsequent traffic - * sent over the remaining link. Owner node is locked. +/* tipc_link_tnl_prepare(): prepare and return a list of tunnel packets + * with contents of the link's tranmsit and backlog queues. */ -void tipc_link_failover_send_queue(struct tipc_link *l_ptr) +void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, + int mtyp, struct sk_buff_head *xmitq) { - int msgcount; - struct tipc_link *tunnel = node_active_link(l_ptr->owner, 0); - struct tipc_msg tunnel_hdr; - struct sk_buff *skb; - int split_bundles; + struct sk_buff *skb, *tnlskb; + struct tipc_msg *hdr, tnlhdr; + struct sk_buff_head *queue = &l->transmq; + struct sk_buff_head tmpxq, tnlq; + u16 pktlen, pktcnt, seqno = l->snd_nxt; - if (!tunnel) + if (!tnl) return; - tipc_msg_init(link_own_addr(l_ptr), &tunnel_hdr, TUNNEL_PROTOCOL, - FAILOVER_MSG, INT_H_SIZE, l_ptr->addr); + skb_queue_head_init(&tnlq); + skb_queue_head_init(&tmpxq); - skb_queue_walk(&l_ptr->backlogq, skb) { - msg_set_seqno(buf_msg(skb), l_ptr->snd_nxt); - l_ptr->snd_nxt = mod(l_ptr->snd_nxt + 1); - } - skb_queue_splice_tail_init(&l_ptr->backlogq, &l_ptr->transmq); - tipc_link_purge_backlog(l_ptr); - msgcount = skb_queue_len(&l_ptr->transmq); - msg_set_bearer_id(&tunnel_hdr, l_ptr->peer_bearer_id); - msg_set_msgcnt(&tunnel_hdr, msgcount); - - if (skb_queue_empty(&l_ptr->transmq)) { - skb = tipc_buf_acquire(INT_H_SIZE); - if (skb) { - skb_copy_to_linear_data(skb, &tunnel_hdr, INT_H_SIZE); - msg_set_size(&tunnel_hdr, INT_H_SIZE); - __tipc_link_xmit_skb(tunnel, skb); - } else { - pr_warn("%sunable to send changeover msg\n", - link_co_err); - } + /* At least one packet required for safe algorithm => add dummy */ + skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, + BASIC_H_SIZE, 0, l->addr, link_own_addr(l), + 0, 0, TIPC_ERR_NO_PORT); + if (!skb) { + pr_warn("%sunable to create tunnel packet\n", link_co_err); return; } - - split_bundles = (node_active_link(l_ptr->owner, 0) != - node_active_link(l_ptr->owner, 0)); - - skb_queue_walk(&l_ptr->transmq, skb) { - struct tipc_msg *msg = buf_msg(skb); - - if ((msg_user(msg) == MSG_BUNDLER) && split_bundles) { - struct tipc_msg *m = msg_get_wrapped(msg); - unchar *pos = (unchar *)m; - - msgcount = msg_msgcnt(msg); - while (msgcount--) { - msg_set_seqno(m, msg_seqno(msg)); - tipc_link_tunnel_xmit(l_ptr, &tunnel_hdr, m, - msg_link_selector(m)); - pos += align(msg_size(m)); - m = (struct tipc_msg *)pos; - } - } else { - tipc_link_tunnel_xmit(l_ptr, &tunnel_hdr, msg, - msg_link_selector(msg)); - } - } -} - -/* tipc_link_dup_queue_xmit(): A second link has become active. Tunnel a - * duplicate of the first link's send queue via the new link. This way, we - * are guaranteed that currently queued packets from a socket are delivered - * before future traffic from the same socket, even if this is using the - * new link. The last arriving copy of each duplicate packet is dropped at - * the receiving end by the regular protocol check, so packet cardinality - * and sequence order is preserved per sender/receiver socket pair. - * Owner node is locked. - */ -void tipc_link_dup_queue_xmit(struct tipc_link *link, - struct tipc_link *tnl) -{ - struct sk_buff *skb; - struct tipc_msg tnl_hdr; - struct sk_buff_head *queue = &link->transmq; - int mcnt; - u16 seqno; - - tipc_msg_init(link_own_addr(link), &tnl_hdr, TUNNEL_PROTOCOL, - SYNCH_MSG, INT_H_SIZE, link->addr); - mcnt = skb_queue_len(&link->transmq) + skb_queue_len(&link->backlogq); - msg_set_msgcnt(&tnl_hdr, mcnt); - msg_set_bearer_id(&tnl_hdr, link->peer_bearer_id); - -tunnel_queue: + skb_queue_tail(&tnlq, skb); + tipc_link_xmit(l, &tnlq, &tmpxq); + __skb_queue_purge(&tmpxq); + + /* Initialize reusable tunnel packet header */ + tipc_msg_init(link_own_addr(l), &tnlhdr, TUNNEL_PROTOCOL, + mtyp, INT_H_SIZE, l->addr); + pktcnt = skb_queue_len(&l->transmq) + skb_queue_len(&l->backlogq); + msg_set_msgcnt(&tnlhdr, pktcnt); + msg_set_bearer_id(&tnlhdr, l->peer_bearer_id); +tnl: + /* Wrap each packet into a tunnel packet */ skb_queue_walk(queue, skb) { - struct sk_buff *outskb; - struct tipc_msg *msg = buf_msg(skb); - u32 len = msg_size(msg); - - msg_set_ack(msg, mod(link->rcv_nxt - 1)); - msg_set_bcast_ack(msg, link->owner->bclink.last_in); - msg_set_size(&tnl_hdr, len + INT_H_SIZE); - outskb = tipc_buf_acquire(len + INT_H_SIZE); - if (outskb == NULL) { - pr_warn("%sunable to send duplicate msg\n", - link_co_err); + hdr = buf_msg(skb); + if (queue == &l->backlogq) + msg_set_seqno(hdr, seqno++); + pktlen = msg_size(hdr); + msg_set_size(&tnlhdr, pktlen + INT_H_SIZE); + tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE); + if (!tnlskb) { + pr_warn("%sunable to send packet\n", link_co_err); return; } - skb_copy_to_linear_data(outskb, &tnl_hdr, INT_H_SIZE); - skb_copy_to_linear_data_offset(outskb, INT_H_SIZE, - skb->data, len); - __tipc_link_xmit_skb(tnl, outskb); - if (!tipc_link_is_up(link)) - return; + skb_copy_to_linear_data(tnlskb, &tnlhdr, INT_H_SIZE); + skb_copy_to_linear_data_offset(tnlskb, INT_H_SIZE, hdr, pktlen); + __skb_queue_tail(&tnlq, tnlskb); } - if (queue == &link->backlogq) - return; - seqno = link->snd_nxt; - skb_queue_walk(&link->backlogq, skb) { - msg_set_seqno(buf_msg(skb), seqno); - seqno = mod(seqno + 1); - } - queue = &link->backlogq; - goto tunnel_queue; -} - -/* tipc_link_failover_rcv(): Receive a tunnelled FAILOVER_MSG packet - * Owner node is locked. - */ -static int tipc_link_failover_rcv(struct tipc_link *link, - struct sk_buff **skb) -{ - struct tipc_msg *msg = buf_msg(*skb); - struct sk_buff *iskb = NULL; - struct tipc_link *pl = NULL; - int bearer_id = msg_bearer_id(msg); - int pos = 0; - int rc = 0; - - if (msg_type(msg) != FAILOVER_MSG) { - pr_warn("%sunknown tunnel pkt received\n", link_co_err); - goto exit; + if (queue != &l->backlogq) { + queue = &l->backlogq; + goto tnl; } - if (bearer_id >= MAX_BEARERS) - goto exit; - - if (bearer_id == link->bearer_id) - goto exit; - - pl = link->owner->links[bearer_id].link; - - if (link->failover_pkts == FIRST_FAILOVER) - link->failover_pkts = msg_msgcnt(msg); - - /* Should we expect an inner packet? */ - if (!link->failover_pkts) - goto exit; - if (!tipc_msg_extract(*skb, &iskb, &pos)) { - pr_warn("%sno inner failover pkt\n", link_co_err); - *skb = NULL; - goto exit; - } - link->failover_pkts--; - *skb = NULL; + tipc_link_xmit(tnl, &tnlq, xmitq); - /* Was this packet already delivered? */ - if (less(buf_seqno(iskb), link->failover_checkpt)) { - kfree_skb(iskb); - iskb = NULL; - goto exit; - } - if (msg_user(buf_msg(iskb)) == MSG_FRAGMENTER) { - link->stats.recv_fragments++; - if (!tipc_buf_append(&link->failover_skb, &iskb) && - !link->failover_skb) { - link->exec_mode = TIPC_LINK_BLOCKED; - rc |= TIPC_LINK_DOWN_EVT; - } + if (mtyp == FAILOVER_MSG) { + tnl->drop_point = l->rcv_nxt; + tnl->failover_reasm_skb = l->reasm_buf; + l->reasm_buf = NULL; + l->exec_mode = TIPC_LINK_BLOCKED; } -exit: - if (!link->failover_pkts && pl) - pl->exec_mode = TIPC_LINK_OPEN; - kfree_skb(*skb); - *skb = iskb; - return rc; } /* tipc_link_proto_rcv(): receive link level protocol message : @@ -1593,7 +1375,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, /* If NACK, retransmit will now start at right position */ if (nacked_gap) { - rc |= tipc_link_retransm(l, nacked_gap, xmitq); + rc = tipc_link_retransm(l, nacked_gap, xmitq); l->stats.recv_nacks++; } tipc_link_advance_backlog(l, xmitq); diff --git a/net/tipc/link.h b/net/tipc/link.h index bb1378b7cb59..e377d9ba41c5 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -164,13 +164,11 @@ struct tipc_link { struct tipc_msg *pmsg; u32 priority; char net_plane; - u8 exec_mode; - u16 synch_point; - /* Failover */ - u16 failover_pkts; - u16 failover_checkpt; - struct sk_buff *failover_skb; + /* Failover/synch */ + u8 exec_mode; + u16 drop_point; + struct sk_buff *failover_reasm_skb; /* Max packet negotiation */ u16 mtu; @@ -212,8 +210,8 @@ struct tipc_link *tipc_link_create(struct tipc_node *n, const struct tipc_media_addr *maddr, struct sk_buff_head *inputq, struct sk_buff_head *namedq); -void tipc_link_failover_send_queue(struct tipc_link *l_ptr); -void tipc_link_dup_queue_xmit(struct tipc_link *l_ptr, struct tipc_link *dest); +void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, + int mtyp, struct sk_buff_head *xmitq); void tipc_link_reset_fragments(struct tipc_link *l_ptr); int tipc_link_is_up(struct tipc_link *l_ptr); int tipc_link_is_active(struct tipc_link *l_ptr); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 234fb0531d1d..115bb2aa6bed 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -110,7 +110,6 @@ struct tipc_skb_cb { struct sk_buff *tail; bool validated; bool wakeup_pending; - bool bundling; u16 chain_sz; u16 chain_imp; }; @@ -559,15 +558,6 @@ static inline void msg_set_node_capabilities(struct tipc_msg *m, u32 n) msg_set_bits(m, 1, 15, 0x1fff, n); } -static inline bool msg_dup(struct tipc_msg *m) -{ - if (likely(msg_user(m) != TUNNEL_PROTOCOL)) - return false; - if (msg_type(m) != SYNCH_MSG) - return false; - return true; -} - /* * Word 2 */ @@ -621,12 +611,12 @@ static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n) } -static inline u32 msg_next_sent(struct tipc_msg *m) +static inline u16 msg_next_sent(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } -static inline void msg_set_next_sent(struct tipc_msg *m, u32 n) +static inline void msg_set_next_sent(struct tipc_msg *m, u16 n) { msg_set_bits(m, 4, 0, 0xffff, n); } @@ -727,12 +717,12 @@ static inline char *msg_media_addr(struct tipc_msg *m) /* * Word 9 */ -static inline u32 msg_msgcnt(struct tipc_msg *m) +static inline u16 msg_msgcnt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } -static inline void msg_set_msgcnt(struct tipc_msg *m, u32 n) +static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } @@ -767,19 +757,19 @@ static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) msg_set_bits(m, 9, 0, 0xffff, n); } -static inline bool msg_is_traffic(struct tipc_msg *m) +static inline bool msg_peer_link_is_up(struct tipc_msg *m) { if (likely(msg_user(m) != LINK_PROTOCOL)) return true; - if ((msg_type(m) == RESET_MSG) || (msg_type(m) == ACTIVATE_MSG)) - return false; - return true; + if (msg_type(m) == STATE_MSG) + return true; + return false; } -static inline bool msg_peer_is_up(struct tipc_msg *m) +static inline bool msg_peer_node_is_up(struct tipc_msg *m) { - if (likely(msg_is_traffic(m))) - return false; + if (msg_peer_link_is_up(m)) + return true; return msg_redundant_link(m); } diff --git a/net/tipc/node.c b/net/tipc/node.c index 6b18d73830ca..b0372bb107f6 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -42,6 +42,31 @@ #include "bcast.h" #include "discover.h" +/* Node FSM states and events: + */ +enum { + SELF_DOWN_PEER_DOWN = 0xdd, + SELF_UP_PEER_UP = 0xaa, + SELF_DOWN_PEER_LEAVING = 0xd1, + SELF_UP_PEER_COMING = 0xac, + SELF_COMING_PEER_UP = 0xca, + SELF_LEAVING_PEER_DOWN = 0x1d, + NODE_FAILINGOVER = 0xf0, + NODE_SYNCHING = 0xcc +}; + +enum { + SELF_ESTABL_CONTACT_EVT = 0xece, + SELF_LOST_CONTACT_EVT = 0x1ce, + PEER_ESTABL_CONTACT_EVT = 0x9ece, + PEER_LOST_CONTACT_EVT = 0x91ce, + NODE_FAILOVER_BEGIN_EVT = 0xfbe, + NODE_FAILOVER_END_EVT = 0xfee, + NODE_SYNCH_BEGIN_EVT = 0xcbe, + NODE_SYNCH_END_EVT = 0xcee +}; + +static void tipc_node_link_down(struct tipc_node *n, int bearer_id); static void node_lost_contact(struct tipc_node *n_ptr); static void node_established_contact(struct tipc_node *n_ptr); static void tipc_node_delete(struct tipc_node *node); @@ -281,69 +306,75 @@ static void tipc_node_timeout(unsigned long data) * * Link becomes active (alone or shared) or standby, depending on its priority. */ -void tipc_node_link_up(struct tipc_node *n, int bearer_id) +static void tipc_node_link_up(struct tipc_node *n, int bearer_id, + struct sk_buff_head *xmitq) { int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; - struct tipc_link_entry *links = n->links; - struct tipc_link *l = n->links[bearer_id].link; - - /* Leave room for tunnel header when returning 'mtu' to users: */ - links[bearer_id].mtu = l->mtu - INT_H_SIZE; + struct tipc_link *ol = node_active_link(n, 0); + struct tipc_link *nl = n->links[bearer_id].link; + if (n->working_links > 1) { + pr_warn("Attempt to establish 3rd link to %x\n", n->addr); + return; + } n->working_links++; n->action_flags |= TIPC_NOTIFY_LINK_UP; - n->link_id = l->peer_bearer_id << 16 | l->bearer_id; + n->link_id = nl->peer_bearer_id << 16 | bearer_id; + + /* Leave room for tunnel header when returning 'mtu' to users: */ + n->links[bearer_id].mtu = nl->mtu - INT_H_SIZE; tipc_bearer_add_dest(n->net, bearer_id, n->addr); pr_debug("Established link <%s> on network plane %c\n", - l->name, l->net_plane); + nl->name, nl->net_plane); - /* No active links ? => take both active slots */ - if (!tipc_node_is_up(n)) { + /* First link? => give it both slots */ + if (!ol) { *slot0 = bearer_id; *slot1 = bearer_id; + nl->exec_mode = TIPC_LINK_OPEN; node_established_contact(n); return; } - /* Lower prio than current active ? => no slot */ - if (l->priority < links[*slot0].link->priority) { - pr_debug("New link <%s> becomes standby\n", l->name); - return; - } - tipc_link_dup_queue_xmit(links[*slot0].link, l); - - /* Same prio as current active ? => take one slot */ - if (l->priority == links[*slot0].link->priority) { + /* Second link => redistribute slots */ + if (nl->priority > ol->priority) { + pr_debug("Old link <%s> becomes standby\n", ol->name); *slot0 = bearer_id; - return; + *slot1 = bearer_id; + } else if (nl->priority == ol->priority) { + *slot0 = bearer_id; + } else { + pr_debug("New link <%s> is standby\n", nl->name); } - /* Higher prio than current active => take both active slots */ - pr_debug("Old link <%s> now standby\n", links[*slot0].link->name); - *slot0 = bearer_id; - *slot1 = bearer_id; + /* Prepare synchronization with first link */ + tipc_link_tnl_prepare(ol, nl, SYNCH_MSG, xmitq); } /** * tipc_node_link_down - handle loss of link */ -void tipc_node_link_down(struct tipc_node *n, int bearer_id) +static void tipc_node_link_down(struct tipc_node *n, int bearer_id) { int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; + struct tipc_media_addr *maddr = &n->links[bearer_id].maddr; int i, highest = 0; - struct tipc_link *l, *_l; + struct tipc_link *l, *_l, *tnl; + struct sk_buff_head xmitq; l = n->links[bearer_id].link; if (!l || !tipc_link_is_up(l)) return; + __skb_queue_head_init(&xmitq); + n->working_links--; n->action_flags |= TIPC_NOTIFY_LINK_DOWN; - n->link_id = l->peer_bearer_id << 16 | l->bearer_id; + n->link_id = l->peer_bearer_id << 16 | bearer_id; tipc_bearer_remove_dest(n->net, l->bearer_id, n->addr); @@ -370,13 +401,19 @@ void tipc_node_link_down(struct tipc_node *n, int bearer_id) *slot1 = i; } - if (tipc_node_is_up(n)) - tipc_link_failover_send_queue(l); + if (!tipc_node_is_up(n)) { + tipc_link_reset(l); + node_lost_contact(n); + return; + } + /* There is still a working link => initiate failover */ + tnl = node_active_link(n, 0); + tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); + n->sync_point = tnl->rcv_nxt + (U16_MAX / 2 - 1); + tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, &xmitq); tipc_link_reset(l); - - if (!tipc_node_is_up(n)) - node_lost_contact(n); + tipc_bearer_xmit(n->net, tnl->bearer_id, &xmitq, maddr); } bool tipc_node_is_up(struct tipc_node *n) @@ -652,37 +689,22 @@ illegal_evt: pr_err("Illegal node fsm evt %x in state %x\n", evt, state); } -bool tipc_node_filter_skb(struct tipc_node *n, struct tipc_link *l, - struct tipc_msg *hdr) +bool tipc_node_filter_pkt(struct tipc_node *n, struct tipc_msg *hdr) { int state = n->state; if (likely(state == SELF_UP_PEER_UP)) return true; - if (state == SELF_DOWN_PEER_DOWN) - return true; - - if (state == SELF_UP_PEER_COMING) { - /* If not traffic msg, peer may still be ESTABLISHING */ - if (tipc_link_is_up(l) && msg_is_traffic(hdr)) - tipc_node_fsm_evt(n, PEER_ESTABL_CONTACT_EVT); - return true; - } - - if (state == SELF_COMING_PEER_UP) - return true; - if (state == SELF_LEAVING_PEER_DOWN) return false; if (state == SELF_DOWN_PEER_LEAVING) { - if (msg_peer_is_up(hdr)) + if (msg_peer_node_is_up(hdr)) return false; - tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); - return true; } - return false; + + return true; } static void node_established_contact(struct tipc_node *n_ptr) @@ -727,10 +749,8 @@ static void node_lost_contact(struct tipc_node *n_ptr) if (!l_ptr) continue; l_ptr->exec_mode = TIPC_LINK_OPEN; - l_ptr->failover_checkpt = 0; - l_ptr->failover_pkts = 0; - kfree_skb(l_ptr->failover_skb); - l_ptr->failover_skb = NULL; + kfree_skb(l_ptr->failover_reasm_skb); + l_ptr->failover_reasm_skb = NULL; tipc_link_reset_fragments(l_ptr); } /* Prevent re-contact with node until cleanup is done */ @@ -961,38 +981,111 @@ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, return 0; } -/* tipc_node_tnl_init(): handle a received TUNNEL_PROTOCOL packet, - * in order to control parallel link failover or synchronization +/** + * tipc_node_check_state - check and if necessary update node state + * @skb: TIPC packet + * @bearer_id: identity of bearer delivering the packet + * Returns true if state is ok, otherwise consumes buffer and returns false */ -static void tipc_node_tnl_init(struct tipc_node *n, int bearer_id, - struct sk_buff *skb) +static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, + int bearer_id) { - struct tipc_link *tnl, *pl; struct tipc_msg *hdr = buf_msg(skb); + int usr = msg_user(hdr); + int mtyp = msg_type(hdr); u16 oseqno = msg_seqno(hdr); - int pb_id = msg_bearer_id(hdr); + u16 iseqno = msg_seqno(msg_get_wrapped(hdr)); + u16 exp_pkts = msg_msgcnt(hdr); + u16 rcv_nxt, syncpt, dlv_nxt; + int state = n->state; + struct tipc_link *l, *pl = NULL; + struct sk_buff_head; + int i; - if (pb_id >= MAX_BEARERS) - return; + l = n->links[bearer_id].link; + if (!l) + return false; + rcv_nxt = l->rcv_nxt; - tnl = n->links[bearer_id].link; - if (!tnl) - return; - /* Ignore if duplicate */ - if (less(oseqno, tnl->rcv_nxt)) - return; + if (likely((state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) + return true; - pl = n->links[pb_id].link; - if (!pl) - return; + /* Find parallel link, if any */ + for (i = 0; i < MAX_BEARERS; i++) { + if ((i != bearer_id) && n->links[i].link) { + pl = n->links[i].link; + break; + } + } - if (msg_type(hdr) == FAILOVER_MSG) { - if (tipc_link_is_up(pl)) { - tipc_node_link_down(n, pb_id); + /* Update node accesibility if applicable */ + if (state == SELF_UP_PEER_COMING) { + if (!tipc_link_is_up(l)) + return true; + if (!msg_peer_link_is_up(hdr)) + return true; + tipc_node_fsm_evt(n, PEER_ESTABL_CONTACT_EVT); + } + + if (state == SELF_DOWN_PEER_LEAVING) { + if (msg_peer_node_is_up(hdr)) + return false; + tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); + } + + /* Ignore duplicate packets */ + if (less(oseqno, rcv_nxt)) + return true; + + /* Initiate or update failover mode if applicable */ + if ((usr == TUNNEL_PROTOCOL) && (mtyp == FAILOVER_MSG)) { + syncpt = oseqno + exp_pkts - 1; + if (pl && tipc_link_is_up(pl)) { + tipc_node_link_down(n, pl->bearer_id); pl->exec_mode = TIPC_LINK_BLOCKED; } + /* If pkts arrive out of order, use lowest calculated syncpt */ + if (less(syncpt, n->sync_point)) + n->sync_point = syncpt; + } + + /* Open parallel link when tunnel link reaches synch point */ + if ((n->state == NODE_FAILINGOVER) && (more(rcv_nxt, n->sync_point))) { + tipc_node_fsm_evt(n, NODE_FAILOVER_END_EVT); + if (pl) + pl->exec_mode = TIPC_LINK_OPEN; + return true; + } + + /* Initiate or update synch mode if applicable */ + if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG)) { + syncpt = iseqno + exp_pkts - 1; + if (n->state == SELF_UP_PEER_UP) { + n->sync_point = syncpt; + tipc_node_fsm_evt(n, NODE_SYNCH_BEGIN_EVT); + } + l->exec_mode = TIPC_LINK_TUNNEL; + if (less(syncpt, n->sync_point)) + n->sync_point = syncpt; } + + /* Open tunnel link when parallel link reaches synch point */ + if ((n->state == NODE_SYNCHING) && (l->exec_mode == TIPC_LINK_TUNNEL)) { + if (pl) + dlv_nxt = mod(pl->rcv_nxt - skb_queue_len(pl->inputq)); + if (!pl || more(dlv_nxt, n->sync_point)) { + tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); + l->exec_mode = TIPC_LINK_OPEN; + return true; + } + if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG)) + return true; + if (usr == LINK_PROTOCOL) + return true; + return false; + } + return true; } /** @@ -1008,12 +1101,11 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { struct sk_buff_head xmitq; struct tipc_node *n; - struct tipc_link *l; - struct tipc_msg *hdr; - struct tipc_media_addr *maddr; + struct tipc_msg *hdr = buf_msg(skb); + int usr = msg_user(hdr); int bearer_id = b->identity; + struct tipc_link_entry *le; int rc = 0; - int usr; __skb_queue_head_init(&xmitq); @@ -1022,8 +1114,6 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) goto discard; /* Handle arrival of a non-unicast link packet */ - hdr = buf_msg(skb); - usr = msg_user(hdr); if (unlikely(msg_non_seq(hdr))) { if (usr == LINK_CONFIG) tipc_disc_rcv(net, skb, b); @@ -1036,42 +1126,41 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) n = tipc_node_find(net, msg_prevnode(hdr)); if (unlikely(!n)) goto discard; - tipc_node_lock(n); + le = &n->links[bearer_id]; - /* Prepare links for tunneled reception if applicable */ - if (unlikely(usr == TUNNEL_PROTOCOL)) - tipc_node_tnl_init(n, bearer_id, skb); + tipc_node_lock(n); - /* Locate link endpoint that should handle packet */ - l = n->links[bearer_id].link; - if (unlikely(!l)) + /* Is reception permitted at the moment ? */ + if (!tipc_node_filter_pkt(n, hdr)) goto unlock; - /* Is reception of this packet permitted at the moment ? */ - if (unlikely(n->state != SELF_UP_PEER_UP)) - if (!tipc_node_filter_skb(n, l, hdr)) - goto unlock; - - if (unlikely(usr == LINK_PROTOCOL)) + if (unlikely(msg_user(hdr) == LINK_PROTOCOL)) tipc_bclink_sync_state(n, hdr); /* Release acked broadcast messages */ if (unlikely(n->bclink.acked != msg_bcast_ack(hdr))) tipc_bclink_acknowledge(n, msg_bcast_ack(hdr)); - /* Check protocol and update link state */ - rc = tipc_link_rcv(l, skb, &xmitq); + /* Check and if necessary update node state */ + if (likely(tipc_node_check_state(n, skb, bearer_id))) { + rc = tipc_link_rcv(le->link, skb, &xmitq); + skb = NULL; + } if (unlikely(rc & TIPC_LINK_UP_EVT)) - tipc_node_link_up(n, bearer_id); + tipc_node_link_up(n, bearer_id, &xmitq); + if (unlikely(rc & TIPC_LINK_DOWN_EVT)) tipc_node_link_down(n, bearer_id); - skb = NULL; unlock: tipc_node_unlock(n); - tipc_sk_rcv(net, &n->links[bearer_id].inputq); - maddr = &n->links[bearer_id].maddr; - tipc_bearer_xmit(net, bearer_id, &xmitq, maddr); + + if (!skb_queue_empty(&le->inputq)) + tipc_sk_rcv(net, &le->inputq); + + if (!skb_queue_empty(&xmitq)) + tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr); + tipc_node_put(n); discard: kfree_skb(skb); diff --git a/net/tipc/node.h b/net/tipc/node.h index 65e2728f66a6..406c6fe0dbb2 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -47,33 +47,7 @@ #define INVALID_BEARER_ID -1 -/* Node FSM states and events: - */ -enum { - SELF_DOWN_PEER_DOWN = 0xdd, - SELF_UP_PEER_UP = 0xaa, - SELF_DOWN_PEER_LEAVING = 0xd1, - SELF_UP_PEER_COMING = 0xac, - SELF_COMING_PEER_UP = 0xca, - SELF_LEAVING_PEER_DOWN = 0x1d, - NODE_FAILINGOVER = 0xf0, - NODE_SYNCHING = 0xcc -}; - -enum { - SELF_ESTABL_CONTACT_EVT = 0xece, - SELF_LOST_CONTACT_EVT = 0x1ce, - PEER_ESTABL_CONTACT_EVT = 0xfece, - PEER_LOST_CONTACT_EVT = 0xf1ce, - NODE_FAILOVER_BEGIN_EVT = 0xfbe, - NODE_FAILOVER_END_EVT = 0xfee, - NODE_SYNCH_BEGIN_EVT = 0xcbe, - NODE_SYNCH_END_EVT = 0xcee -}; - /* Flags used to take different actions according to flag type - * TIPC_WAIT_PEER_LINKS_DOWN: wait to see that peer's links are down - * TIPC_WAIT_OWN_LINKS_DOWN: wait until peer node is declared down * TIPC_NOTIFY_NODE_DOWN: notify node is down * TIPC_NOTIFY_NODE_UP: notify node is up * TIPC_DISTRIBUTE_NAME: publish or withdraw link state name type @@ -133,6 +107,8 @@ struct tipc_link_entry { * @links: array containing references to all links to node * @action_flags: bit mask of different types of node actions * @bclink: broadcast-related info + * @state: connectivity state vs peer node + * @sync_point: sequence number where synch/failover is finished * @list: links to adjacent nodes in sorted list of cluster's nodes * @working_links: number of working links to node (both active and standby) * @link_cnt: number of links to node @@ -156,6 +132,7 @@ struct tipc_node { struct tipc_node_bclink bclink; struct list_head list; int state; + u16 sync_point; int link_cnt; u16 working_links; u16 capabilities; @@ -180,8 +157,6 @@ bool tipc_node_update_dest(struct tipc_node *n, struct tipc_bearer *bearer, void tipc_node_delete_links(struct net *net, int bearer_id); void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); -void tipc_node_link_down(struct tipc_node *n_ptr, int bearer_id); -void tipc_node_link_up(struct tipc_node *n_ptr, int bearer_id); bool tipc_node_is_up(struct tipc_node *n); int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node, char *linkname, size_t len); -- cgit v1.2.3 From 5045f7b9009f1455268b98cecbcc271663934c85 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 30 Jul 2015 18:24:20 -0400 Subject: tipc: move protocol message sending away from link FSM The implementation of the link FSM currently takes decisions about and sends out link protocol messages. This is unnecessary, since such actions are not the result of any link state change, and are even decided based on non-FSM state information ("silent_intv_cnt"). We now move the sending of unicast link protocol messages to the function tipc_link_timeout(), and the initial broadcast synchronization message to tipc_node_link_up(). The latter is done because a link instance should not need to know whether it is the first or second link to a destination. Such information is now restricted to and handled by the link aggregation layer in node.c Tested-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 51 ++++++++++++++++++++++++++++++--------------------- net/tipc/link.h | 2 ++ net/tipc/node.c | 1 + 3 files changed, 33 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index d5f4005f388f..9a3ccf910c49 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -134,8 +134,6 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, struct sk_buff_head *xmitq); static void link_reset_statistics(struct tipc_link *l_ptr); static void link_print(struct tipc_link *l_ptr, const char *str); -static void tipc_link_build_bcast_sync_msg(struct tipc_link *l, - struct sk_buff_head *xmitq); static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf); static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb); static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb); @@ -245,8 +243,8 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, * Give a newly added peer node the sequence number where it should * start receiving and acking broadcast packets. */ -static void tipc_link_build_bcast_sync_msg(struct tipc_link *l, - struct sk_buff_head *xmitq) +void tipc_link_build_bcast_sync_msg(struct tipc_link *l, + struct sk_buff_head *xmitq) { struct sk_buff *skb; struct sk_buff_head list; @@ -272,7 +270,7 @@ static void tipc_link_build_bcast_sync_msg(struct tipc_link *l, static int tipc_link_fsm_evt(struct tipc_link *l, int evt, struct sk_buff_head *xmitq) { - int mtyp = 0, rc = 0; + int rc = 0; struct tipc_link *pl; enum { LINK_RESET = 1, @@ -380,17 +378,7 @@ static int tipc_link_fsm_evt(struct tipc_link *l, int evt, } if (actions & LINK_ACTIVATE) rc = TIPC_LINK_UP_EVT; - if (actions & (SND_STATE | SND_PROBE)) - mtyp = STATE_MSG; - if (actions & SND_RESET) - mtyp = RESET_MSG; - if (actions & SND_ACTIVATE) - mtyp = ACTIVATE_MSG; - if (actions & (SND_PROBE | SND_STATE | SND_RESET | SND_ACTIVATE)) - tipc_link_build_proto_msg(l, mtyp, actions & SND_PROBE, - 0, 0, 0, xmitq); - if (actions & SND_BCAST_SYNC) - tipc_link_build_bcast_sync_msg(l, xmitq); + return rc; } @@ -440,16 +428,37 @@ static void link_profile_stats(struct tipc_link *l) int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) { int rc = 0; + int mtyp = STATE_MSG; + bool xmit = false; + bool prb = false; if (l->exec_mode == TIPC_LINK_BLOCKED) return rc; link_profile_stats(l); - if (l->silent_intv_cnt) - rc = tipc_link_fsm_evt(l, SILENCE_EVT, xmitq); - else if (link_working(l) && tipc_bclink_acks_missing(l->owner)) - tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, xmitq); - l->silent_intv_cnt++; + + if (l->state == TIPC_LINK_WORKING) { + if (!l->silent_intv_cnt) { + if (tipc_bclink_acks_missing(l->owner)) + xmit = true; + } else if (l->silent_intv_cnt <= l->abort_limit) { + xmit = true; + prb = true; + } else { + l->exec_mode = TIPC_LINK_BLOCKED; + rc |= TIPC_LINK_DOWN_EVT; + } + l->silent_intv_cnt++; + } else if (l->state == TIPC_LINK_RESETTING) { + xmit = true; + mtyp = RESET_MSG; + } else if (l->state == TIPC_LINK_ESTABLISHING) { + xmit = true; + mtyp = ACTIVATE_MSG; + } + if (xmit) + tipc_link_build_proto_msg(l, mtyp, prb, 0, 0, 0, xmitq); + return rc; } diff --git a/net/tipc/link.h b/net/tipc/link.h index e377d9ba41c5..b317c4df9079 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -212,6 +212,8 @@ struct tipc_link *tipc_link_create(struct tipc_node *n, struct sk_buff_head *namedq); void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, int mtyp, struct sk_buff_head *xmitq); +void tipc_link_build_bcast_sync_msg(struct tipc_link *l, + struct sk_buff_head *xmitq); void tipc_link_reset_fragments(struct tipc_link *l_ptr); int tipc_link_is_up(struct tipc_link *l_ptr); int tipc_link_is_active(struct tipc_link *l_ptr); diff --git a/net/tipc/node.c b/net/tipc/node.c index b0372bb107f6..9e20acffb3d4 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -335,6 +335,7 @@ static void tipc_node_link_up(struct tipc_node *n, int bearer_id, *slot0 = bearer_id; *slot1 = bearer_id; nl->exec_mode = TIPC_LINK_OPEN; + tipc_link_build_bcast_sync_msg(nl, xmitq); node_established_contact(n); return; } -- cgit v1.2.3 From 662921cd0a53db4504838dfbb7d996f9e6e94001 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 30 Jul 2015 18:24:21 -0400 Subject: tipc: merge link->exec_mode and link->state into one FSM Until now, we have been handling link failover and synchronization by using an additional link state variable, "exec_mode". This variable is not independent of the link FSM state, something causing a risk of inconsistencies, apart from the fact that it clutters the code. The conditions are now in place to define a new link FSM that covers all existing use cases, including failover and synchronization, and eliminate the "exec_mode" field altogether. The FSM must also support non-atomic resetting of links, which will be introduced later. The new link FSM is shown below, with 7 states and 8 events. Only events leading to state change are shown as edges. +------------------------------------+ |RESET_EVT | | | | +--------------+ | +-----------------| SYNCHING |-----------------+ | |FAILURE_EVT +--------------+ PEER_RESET_EVT| | | A | | | | | | | | | | | | | | |SYNCH_ |SYNCH_ | | | |BEGIN_EVT |END_EVT | | | | | | | V | V V | +-------------+ +--------------+ +------------+ | | RESETTING |<---------| ESTABLISHED |--------->| PEER_RESET | | +-------------+ FAILURE_ +--------------+ PEER_ +------------+ | | EVT | A RESET_EVT | | | | | | | | | | | | | +--------------+ | | | RESET_EVT| |RESET_EVT |ESTABLISH_EVT | | | | | | | | | | | | V V | | | +-------------+ +--------------+ RESET_EVT| +--->| RESET |--------->| ESTABLISHING |<----------------+ +-------------+ PEER_ +--------------+ | A RESET_EVT | | | | | | | |FAILOVER_ |FAILOVER_ |FAILOVER_ |BEGIN_EVT |END_EVT |BEGIN_EVT | | | V | | +-------------+ | | FAILINGOVER |<----------------+ +-------------+ These changes are fully backwards compatible. Tested-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 350 +++++++++++++++++++++++++++++++------------------------- net/tipc/link.h | 25 ++-- net/tipc/node.c | 31 ++--- 3 files changed, 226 insertions(+), 180 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 9a3ccf910c49..9840b03348e1 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -50,7 +50,6 @@ */ static const char *link_co_err = "Link tunneling error, "; static const char *link_rst_msg = "Resetting link "; -static const char *link_unk_evt = "Unknown link event "; static const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC }, @@ -85,46 +84,23 @@ static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { */ #define WILDCARD_SESSION 0x10000 -/* State value stored in 'failover_pkts' +/* Link FSM states: */ -#define FIRST_FAILOVER 0xffffu - -/* Link FSM states and events: - */ -enum { - TIPC_LINK_WORKING, - TIPC_LINK_PROBING, - TIPC_LINK_RESETTING, - TIPC_LINK_ESTABLISHING -}; - enum { - PEER_RESET_EVT = RESET_MSG, - ACTIVATE_EVT = ACTIVATE_MSG, - TRAFFIC_EVT, /* Any other valid msg from peer */ - SILENCE_EVT /* Peer was silent during last timer interval*/ + LINK_ESTABLISHED = 0xe, + LINK_ESTABLISHING = 0xe << 4, + LINK_RESET = 0x1 << 8, + LINK_RESETTING = 0x2 << 12, + LINK_PEER_RESET = 0xd << 16, + LINK_FAILINGOVER = 0xf << 20, + LINK_SYNCHING = 0xc << 24 }; /* Link FSM state checking routines */ -static int link_working(struct tipc_link *l) -{ - return l->state == TIPC_LINK_WORKING; -} - -static int link_probing(struct tipc_link *l) -{ - return l->state == TIPC_LINK_PROBING; -} - -static int link_resetting(struct tipc_link *l) +static int link_is_up(struct tipc_link *l) { - return l->state == TIPC_LINK_RESETTING; -} - -static int link_establishing(struct tipc_link *l) -{ - return l->state == TIPC_LINK_ESTABLISHING; + return l->state & (LINK_ESTABLISHED | LINK_SYNCHING); } static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, @@ -141,11 +117,29 @@ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb); /* * Simple non-static link routines (i.e. referenced outside this file) */ -int tipc_link_is_up(struct tipc_link *l_ptr) +bool tipc_link_is_up(struct tipc_link *l) { - if (!l_ptr) - return 0; - return link_working(l_ptr) || link_probing(l_ptr); + return link_is_up(l); +} + +bool tipc_link_is_reset(struct tipc_link *l) +{ + return l->state & (LINK_RESET | LINK_FAILINGOVER | LINK_ESTABLISHING); +} + +bool tipc_link_is_synching(struct tipc_link *l) +{ + return l->state == LINK_SYNCHING; +} + +bool tipc_link_is_failingover(struct tipc_link *l) +{ + return l->state == LINK_FAILINGOVER; +} + +bool tipc_link_is_blocked(struct tipc_link *l) +{ + return l->state & (LINK_RESETTING | LINK_PEER_RESET | LINK_FAILINGOVER); } int tipc_link_is_active(struct tipc_link *l) @@ -210,7 +204,7 @@ struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, l_ptr->tolerance = b_ptr->tolerance; l_ptr->snd_nxt = 1; l_ptr->rcv_nxt = 1; - l_ptr->state = TIPC_LINK_RESETTING; + l_ptr->state = LINK_RESET; l_ptr->pmsg = (struct tipc_msg *)&l_ptr->proto_msg; msg = l_ptr->pmsg; @@ -265,120 +259,159 @@ void tipc_link_build_bcast_sync_msg(struct tipc_link *l, * tipc_link_fsm_evt - link finite state machine * @l: pointer to link * @evt: state machine event to be processed - * @xmitq: queue to prepend created protocol message, if any */ -static int tipc_link_fsm_evt(struct tipc_link *l, int evt, - struct sk_buff_head *xmitq) +int tipc_link_fsm_evt(struct tipc_link *l, int evt) { int rc = 0; - struct tipc_link *pl; - enum { - LINK_RESET = 1, - LINK_ACTIVATE = (1 << 1), - SND_PROBE = (1 << 2), - SND_STATE = (1 << 3), - SND_RESET = (1 << 4), - SND_ACTIVATE = (1 << 5), - SND_BCAST_SYNC = (1 << 6) - } actions = 0; - - if (l->exec_mode == TIPC_LINK_BLOCKED) - return rc; switch (l->state) { - case TIPC_LINK_WORKING: + case LINK_RESETTING: switch (evt) { - case TRAFFIC_EVT: - case ACTIVATE_EVT: + case LINK_PEER_RESET_EVT: + l->state = LINK_PEER_RESET; break; - case SILENCE_EVT: - l->state = TIPC_LINK_PROBING; - actions |= SND_PROBE; + case LINK_RESET_EVT: + l->state = LINK_RESET; + break; + case LINK_FAILURE_EVT: + case LINK_FAILOVER_BEGIN_EVT: + case LINK_ESTABLISH_EVT: + case LINK_FAILOVER_END_EVT: + case LINK_SYNCH_BEGIN_EVT: + case LINK_SYNCH_END_EVT: + default: + goto illegal_evt; + } + break; + case LINK_RESET: + switch (evt) { + case LINK_PEER_RESET_EVT: + l->state = LINK_ESTABLISHING; break; - case PEER_RESET_EVT: - actions |= LINK_RESET | SND_ACTIVATE; + case LINK_FAILOVER_BEGIN_EVT: + l->state = LINK_FAILINGOVER; + case LINK_FAILURE_EVT: + case LINK_RESET_EVT: + case LINK_ESTABLISH_EVT: + case LINK_FAILOVER_END_EVT: break; + case LINK_SYNCH_BEGIN_EVT: + case LINK_SYNCH_END_EVT: default: - pr_debug("%s%u WORKING\n", link_unk_evt, evt); + goto illegal_evt; } break; - case TIPC_LINK_PROBING: + case LINK_PEER_RESET: switch (evt) { - case TRAFFIC_EVT: - case ACTIVATE_EVT: - l->state = TIPC_LINK_WORKING; + case LINK_RESET_EVT: + l->state = LINK_ESTABLISHING; break; - case PEER_RESET_EVT: - actions |= LINK_RESET | SND_ACTIVATE; + case LINK_PEER_RESET_EVT: + case LINK_ESTABLISH_EVT: + case LINK_FAILURE_EVT: break; - case SILENCE_EVT: - if (l->silent_intv_cnt <= l->abort_limit) { - actions |= SND_PROBE; - break; - } - actions |= LINK_RESET | SND_RESET; + case LINK_SYNCH_BEGIN_EVT: + case LINK_SYNCH_END_EVT: + case LINK_FAILOVER_BEGIN_EVT: + case LINK_FAILOVER_END_EVT: + default: + goto illegal_evt; + } + break; + case LINK_FAILINGOVER: + switch (evt) { + case LINK_FAILOVER_END_EVT: + l->state = LINK_RESET; break; + case LINK_PEER_RESET_EVT: + case LINK_RESET_EVT: + case LINK_ESTABLISH_EVT: + case LINK_FAILURE_EVT: + break; + case LINK_FAILOVER_BEGIN_EVT: + case LINK_SYNCH_BEGIN_EVT: + case LINK_SYNCH_END_EVT: default: - pr_err("%s%u PROBING\n", link_unk_evt, evt); + goto illegal_evt; } break; - case TIPC_LINK_RESETTING: + case LINK_ESTABLISHING: switch (evt) { - case TRAFFIC_EVT: + case LINK_ESTABLISH_EVT: + l->state = LINK_ESTABLISHED; + rc |= TIPC_LINK_UP_EVT; break; - case ACTIVATE_EVT: - pl = node_active_link(l->owner, 0); - if (pl && link_probing(pl)) - break; - l->state = TIPC_LINK_WORKING; - actions |= LINK_ACTIVATE; - if (!l->owner->working_links) - actions |= SND_BCAST_SYNC; + case LINK_FAILOVER_BEGIN_EVT: + l->state = LINK_FAILINGOVER; + break; + case LINK_PEER_RESET_EVT: + case LINK_RESET_EVT: + case LINK_FAILURE_EVT: + case LINK_SYNCH_BEGIN_EVT: + case LINK_FAILOVER_END_EVT: + break; + case LINK_SYNCH_END_EVT: + default: + goto illegal_evt; + } + break; + case LINK_ESTABLISHED: + switch (evt) { + case LINK_PEER_RESET_EVT: + l->state = LINK_PEER_RESET; + rc |= TIPC_LINK_DOWN_EVT; + break; + case LINK_FAILURE_EVT: + l->state = LINK_RESETTING; + rc |= TIPC_LINK_DOWN_EVT; break; - case PEER_RESET_EVT: - l->state = TIPC_LINK_ESTABLISHING; - actions |= SND_ACTIVATE; + case LINK_RESET_EVT: + l->state = LINK_RESET; break; - case SILENCE_EVT: - actions |= SND_RESET; + case LINK_ESTABLISH_EVT: break; + case LINK_SYNCH_BEGIN_EVT: + l->state = LINK_SYNCHING; + break; + case LINK_SYNCH_END_EVT: + case LINK_FAILOVER_BEGIN_EVT: + case LINK_FAILOVER_END_EVT: default: - pr_err("%s%u in RESETTING\n", link_unk_evt, evt); + goto illegal_evt; } break; - case TIPC_LINK_ESTABLISHING: + case LINK_SYNCHING: switch (evt) { - case TRAFFIC_EVT: - case ACTIVATE_EVT: - pl = node_active_link(l->owner, 0); - if (pl && link_probing(pl)) - break; - l->state = TIPC_LINK_WORKING; - actions |= LINK_ACTIVATE; - if (!l->owner->working_links) - actions |= SND_BCAST_SYNC; + case LINK_PEER_RESET_EVT: + l->state = LINK_PEER_RESET; + rc |= TIPC_LINK_DOWN_EVT; + break; + case LINK_FAILURE_EVT: + l->state = LINK_RESETTING; + rc |= TIPC_LINK_DOWN_EVT; break; - case PEER_RESET_EVT: + case LINK_RESET_EVT: + l->state = LINK_RESET; break; - case SILENCE_EVT: - actions |= SND_ACTIVATE; + case LINK_ESTABLISH_EVT: + case LINK_SYNCH_BEGIN_EVT: break; + case LINK_SYNCH_END_EVT: + l->state = LINK_ESTABLISHED; + break; + case LINK_FAILOVER_BEGIN_EVT: + case LINK_FAILOVER_END_EVT: default: - pr_err("%s%u ESTABLISHING\n", link_unk_evt, evt); + goto illegal_evt; } break; default: - pr_err("Unknown link state %u/%u\n", l->state, evt); - } - - /* Perform actions as decided by FSM */ - if (actions & LINK_RESET) { - l->exec_mode = TIPC_LINK_BLOCKED; - rc = TIPC_LINK_DOWN_EVT; + pr_err("Unknown FSM state %x in %s\n", l->state, l->name); } - if (actions & LINK_ACTIVATE) - rc = TIPC_LINK_UP_EVT; - + return rc; +illegal_evt: + pr_err("Illegal FSM event %x in state %x on link %s\n", + evt, l->state, l->name); return rc; } @@ -432,12 +465,11 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) bool xmit = false; bool prb = false; - if (l->exec_mode == TIPC_LINK_BLOCKED) - return rc; - link_profile_stats(l); - if (l->state == TIPC_LINK_WORKING) { + switch (l->state) { + case LINK_ESTABLISHED: + case LINK_SYNCHING: if (!l->silent_intv_cnt) { if (tipc_bclink_acks_missing(l->owner)) xmit = true; @@ -445,17 +477,26 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) xmit = true; prb = true; } else { - l->exec_mode = TIPC_LINK_BLOCKED; - rc |= TIPC_LINK_DOWN_EVT; + rc |= tipc_link_fsm_evt(l, LINK_FAILURE_EVT); } l->silent_intv_cnt++; - } else if (l->state == TIPC_LINK_RESETTING) { + break; + case LINK_RESET: xmit = true; mtyp = RESET_MSG; - } else if (l->state == TIPC_LINK_ESTABLISHING) { + break; + case LINK_ESTABLISHING: xmit = true; mtyp = ACTIVATE_MSG; + break; + case LINK_RESETTING: + case LINK_PEER_RESET: + case LINK_FAILINGOVER: + break; + default: + break; } + if (xmit) tipc_link_build_proto_msg(l, mtyp, prb, 0, 0, 0, xmitq); @@ -559,7 +600,7 @@ void tipc_link_reset(struct tipc_link *l) { struct tipc_node *owner = l->owner; - l->state = TIPC_LINK_RESETTING; + tipc_link_fsm_evt(l, LINK_RESET_EVT); /* Link is down, accept any session */ l->peer_session = WILDCARD_SESSION; @@ -902,8 +943,7 @@ static int tipc_link_retransm(struct tipc_link *l, int retransm, l->stale_count = 1; } else if (++l->stale_count > 100) { link_retransmit_failure(l, skb); - l->exec_mode = TIPC_LINK_BLOCKED; - return TIPC_LINK_DOWN_EVT; + return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); } skb_queue_walk(&l->transmq, skb) { if (!retransm) @@ -1002,25 +1042,23 @@ static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb) l->stats.recv_bundled += msg_msgcnt(hdr); while (tipc_msg_extract(skb, &iskb, &pos)) tipc_data_input(l, iskb); - return rc; + return 0; } else if (usr == MSG_FRAGMENTER) { l->stats.recv_fragments++; if (tipc_buf_append(reasm_skb, &skb)) { l->stats.recv_fragmented++; tipc_data_input(l, skb); } else if (!*reasm_skb) { - l->exec_mode = TIPC_LINK_BLOCKED; - l->state = TIPC_LINK_RESETTING; - rc = TIPC_LINK_DOWN_EVT; + return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); } - return rc; + return 0; } else if (usr == BCAST_PROTOCOL) { tipc_link_sync_rcv(node, skb); - return rc; + return 0; } drop: kfree_skb(skb); - return rc; + return 0; } static bool tipc_link_release_pkts(struct tipc_link *l, u16 acked) @@ -1068,9 +1106,9 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, continue; } - if (unlikely(!link_working(l))) { - rc = tipc_link_fsm_evt(l, TRAFFIC_EVT, xmitq); - if (!link_working(l)) { + if (unlikely(!link_is_up(l))) { + rc = tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT); + if (!link_is_up(l)) { kfree_skb(__skb_dequeue(arrvq)); return rc; } @@ -1192,7 +1230,7 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, int node_up = l->owner->bclink.recv_permitted; /* Don't send protocol message during reset or link failover */ - if (l->exec_mode == TIPC_LINK_BLOCKED) + if (tipc_link_is_blocked(l)) return; msg_set_type(hdr, mtyp); @@ -1302,7 +1340,6 @@ tnl: tnl->drop_point = l->rcv_nxt; tnl->failover_reasm_skb = l->reasm_buf; l->reasm_buf = NULL; - l->exec_mode = TIPC_LINK_BLOCKED; } } @@ -1323,7 +1360,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, char *if_name; int rc = 0; - if (l->exec_mode == TIPC_LINK_BLOCKED) + if (tipc_link_is_blocked(l)) goto exit; if (link_own_addr(l) > msg_prevnode(hdr)) @@ -1337,6 +1374,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, (l->peer_session != WILDCARD_SESSION)) break; /* fall thru' */ + case ACTIVATE_MSG: /* Complete own link name with peer's interface name */ @@ -1355,13 +1393,20 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, if (in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI)) l->priority = peers_prio; + if (msg_type(hdr) == RESET_MSG) { + rc |= tipc_link_fsm_evt(l, LINK_PEER_RESET_EVT); + } else if (!link_is_up(l)) { + tipc_link_fsm_evt(l, LINK_PEER_RESET_EVT); + rc |= tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT); + } l->peer_session = msg_session(hdr); l->peer_bearer_id = msg_bearer_id(hdr); - rc = tipc_link_fsm_evt(l, msg_type(hdr), xmitq); if (l->mtu > msg_max_pkt(hdr)) l->mtu = msg_max_pkt(hdr); break; + case STATE_MSG: + /* Update own tolerance if peer indicates a non-zero value */ if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) l->tolerance = peers_tol; @@ -1370,11 +1415,11 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, l->stats.recv_states++; if (msg_probe(hdr)) l->stats.recv_probes++; - rc = tipc_link_fsm_evt(l, TRAFFIC_EVT, xmitq); - if (!tipc_link_is_up(l)) + rc = tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT); + if (!link_is_up(l)) break; - /* Has peer sent packets we haven't received yet ? */ + /* Send NACK if peer has sent pkts we haven't received yet */ if (more(peers_snd_nxt, l->rcv_nxt)) rcvgap = peers_snd_nxt - l->rcv_nxt; if (rcvgap || (msg_probe(hdr))) @@ -1387,6 +1432,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, rc = tipc_link_retransm(l, nacked_gap, xmitq); l->stats.recv_nacks++; } + tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) link_prepare_wakeup(l); @@ -1463,19 +1509,7 @@ static void link_print(struct tipc_link *l, const char *str) u16 head = hskb ? msg_seqno(buf_msg(hskb)) : l->snd_nxt; u16 tail = l->snd_nxt - 1; - pr_info("%s Link <%s>:", str, l->name); - - if (link_probing(l)) - pr_cont(":P\n"); - else if (link_establishing(l)) - pr_cont(":E\n"); - else if (link_resetting(l)) - pr_cont(":R\n"); - else if (link_working(l)) - pr_cont(":W\n"); - else - pr_cont("\n"); - + pr_info("%s Link <%s> state %x\n", str, l->name, l->state); pr_info("XMTQ: %u [%u-%u], BKLGQ: %u, SNDNX: %u, RCVNX: %u\n", skb_queue_len(&l->transmq), head, tail, skb_queue_len(&l->backlogq), l->snd_nxt, l->rcv_nxt); diff --git a/net/tipc/link.h b/net/tipc/link.h index b317c4df9079..39b8c4c5121e 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -49,13 +49,17 @@ */ #define INVALID_LINK_SEQ 0x10000 - -/* Link endpoint receive states +/* Link FSM events: */ enum { - TIPC_LINK_OPEN, - TIPC_LINK_BLOCKED, - TIPC_LINK_TUNNEL + LINK_ESTABLISH_EVT = 0xec1ab1e, + LINK_PEER_RESET_EVT = 0x9eed0e, + LINK_FAILURE_EVT = 0xfa110e, + LINK_RESET_EVT = 0x10ca1d0e, + LINK_FAILOVER_BEGIN_EVT = 0xfa110bee, + LINK_FAILOVER_END_EVT = 0xfa110ede, + LINK_SYNCH_BEGIN_EVT = 0xc1ccbee, + LINK_SYNCH_END_EVT = 0xc1ccede }; /* Events returned from link at packet reception or at timeout @@ -120,7 +124,6 @@ struct tipc_stats { * @pmsg: convenience pointer to "proto_msg" field * @priority: current link priority * @net_plane: current link network plane ('A' through 'H') - * @exec_mode: transmit/receive mode for link endpoint instance * @backlog_limit: backlog queue congestion thresholds (indexed by importance) * @exp_msg_count: # of tunnelled messages expected during link changeover * @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset @@ -155,7 +158,7 @@ struct tipc_link { u32 tolerance; unsigned long keepalive_intv; u32 abort_limit; - int state; + u32 state; u32 silent_intv_cnt; struct { unchar hdr[INT_H_SIZE]; @@ -166,7 +169,6 @@ struct tipc_link { char net_plane; /* Failover/synch */ - u8 exec_mode; u16 drop_point; struct sk_buff *failover_reasm_skb; @@ -214,8 +216,13 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, int mtyp, struct sk_buff_head *xmitq); void tipc_link_build_bcast_sync_msg(struct tipc_link *l, struct sk_buff_head *xmitq); +int tipc_link_fsm_evt(struct tipc_link *l, int evt); void tipc_link_reset_fragments(struct tipc_link *l_ptr); -int tipc_link_is_up(struct tipc_link *l_ptr); +bool tipc_link_is_up(struct tipc_link *l); +bool tipc_link_is_reset(struct tipc_link *l); +bool tipc_link_is_synching(struct tipc_link *l); +bool tipc_link_is_failingover(struct tipc_link *l); +bool tipc_link_is_blocked(struct tipc_link *l); int tipc_link_is_active(struct tipc_link *l_ptr); void tipc_link_purge_queues(struct tipc_link *l_ptr); void tipc_link_purge_backlog(struct tipc_link *l); diff --git a/net/tipc/node.c b/net/tipc/node.c index 9e20acffb3d4..a3ceeda2a80a 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -334,7 +334,6 @@ static void tipc_node_link_up(struct tipc_node *n, int bearer_id, if (!ol) { *slot0 = bearer_id; *slot1 = bearer_id; - nl->exec_mode = TIPC_LINK_OPEN; tipc_link_build_bcast_sync_msg(nl, xmitq); node_established_contact(n); return; @@ -368,7 +367,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id) struct sk_buff_head xmitq; l = n->links[bearer_id].link; - if (!l || !tipc_link_is_up(l)) + if (!l || tipc_link_is_reset(l)) return; __skb_queue_head_init(&xmitq); @@ -414,6 +413,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id) n->sync_point = tnl->rcv_nxt + (U16_MAX / 2 - 1); tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, &xmitq); tipc_link_reset(l); + tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); tipc_bearer_xmit(n->net, tnl->bearer_id, &xmitq, maddr); } @@ -749,7 +749,7 @@ static void node_lost_contact(struct tipc_node *n_ptr) struct tipc_link *l_ptr = n_ptr->links[i].link; if (!l_ptr) continue; - l_ptr->exec_mode = TIPC_LINK_OPEN; + tipc_link_fsm_evt(l_ptr, LINK_FAILOVER_END_EVT); kfree_skb(l_ptr->failover_reasm_skb); l_ptr->failover_reasm_skb = NULL; tipc_link_reset_fragments(l_ptr); @@ -989,7 +989,7 @@ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, * Returns true if state is ok, otherwise consumes buffer and returns false */ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, - int bearer_id) + int bearer_id, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb); int usr = msg_user(hdr); @@ -1042,42 +1042,47 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, /* Initiate or update failover mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == FAILOVER_MSG)) { syncpt = oseqno + exp_pkts - 1; - if (pl && tipc_link_is_up(pl)) { + if (pl && tipc_link_is_up(pl)) tipc_node_link_down(n, pl->bearer_id); - pl->exec_mode = TIPC_LINK_BLOCKED; - } + /* If pkts arrive out of order, use lowest calculated syncpt */ if (less(syncpt, n->sync_point)) n->sync_point = syncpt; } /* Open parallel link when tunnel link reaches synch point */ - if ((n->state == NODE_FAILINGOVER) && (more(rcv_nxt, n->sync_point))) { + if ((n->state == NODE_FAILINGOVER) && !tipc_link_is_failingover(l)) { + if (!more(rcv_nxt, n->sync_point)) + return true; tipc_node_fsm_evt(n, NODE_FAILOVER_END_EVT); if (pl) - pl->exec_mode = TIPC_LINK_OPEN; + tipc_link_fsm_evt(pl, LINK_FAILOVER_END_EVT); return true; } /* Initiate or update synch mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG)) { syncpt = iseqno + exp_pkts - 1; + if (!tipc_link_is_up(l)) { + tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT); + tipc_node_link_up(n, bearer_id, xmitq); + } if (n->state == SELF_UP_PEER_UP) { n->sync_point = syncpt; + tipc_link_fsm_evt(l, LINK_SYNCH_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_BEGIN_EVT); } - l->exec_mode = TIPC_LINK_TUNNEL; if (less(syncpt, n->sync_point)) n->sync_point = syncpt; } /* Open tunnel link when parallel link reaches synch point */ - if ((n->state == NODE_SYNCHING) && (l->exec_mode == TIPC_LINK_TUNNEL)) { + if ((n->state == NODE_SYNCHING) && tipc_link_is_synching(l)) { if (pl) dlv_nxt = mod(pl->rcv_nxt - skb_queue_len(pl->inputq)); if (!pl || more(dlv_nxt, n->sync_point)) { + tipc_link_fsm_evt(l, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); - l->exec_mode = TIPC_LINK_OPEN; return true; } if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG)) @@ -1143,7 +1148,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) tipc_bclink_acknowledge(n, msg_bcast_ack(hdr)); /* Check and if necessary update node state */ - if (likely(tipc_node_check_state(n, skb, bearer_id))) { + if (likely(tipc_node_check_state(n, skb, bearer_id, &xmitq))) { rc = tipc_link_rcv(le->link, skb, &xmitq); skb = NULL; } -- cgit v1.2.3 From cf148816acb6def45474001302368eb472995e62 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 30 Jul 2015 18:24:22 -0400 Subject: tipc: move received discovery data evaluation inside node.c The node lock is currently grabbed and and released in the function tipc_disc_rcv() in the file discover.c. As a preparation for the next commits, we need to move this node lock handling, along with the code area it is covering, to node.c. This commit introduces this change. Tested-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/discover.c | 116 +++++++++---------------------------------------- net/tipc/node.c | 123 +++++++++++++++++++++++++++++++++++++++++++--------- net/tipc/node.h | 11 +++-- 3 files changed, 127 insertions(+), 123 deletions(-) (limited to 'net') diff --git a/net/tipc/discover.c b/net/tipc/discover.c index 164d08907d6f..d14e0a4aa9af 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -120,29 +120,24 @@ static void disc_dupl_alert(struct tipc_bearer *b_ptr, u32 node_addr, * @buf: buffer containing message * @bearer: bearer that message arrived on */ -void tipc_disc_rcv(struct net *net, struct sk_buff *buf, +void tipc_disc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *bearer) { struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_node *node; struct tipc_media_addr maddr; - struct sk_buff *rbuf; - struct tipc_msg *msg = buf_msg(buf); - u32 ddom = msg_dest_domain(msg); - u32 onode = msg_prevnode(msg); - u32 net_id = msg_bc_netid(msg); - u32 mtyp = msg_type(msg); - u32 signature = msg_node_sig(msg); - u16 caps = msg_node_capabilities(msg); - bool addr_match = false; - bool sign_match = false; - bool link_up = false; - bool accept_addr = false; - bool accept_sign = false; + struct sk_buff *rskb; + struct tipc_msg *hdr = buf_msg(skb); + u32 ddom = msg_dest_domain(hdr); + u32 onode = msg_prevnode(hdr); + u32 net_id = msg_bc_netid(hdr); + u32 mtyp = msg_type(hdr); + u32 signature = msg_node_sig(hdr); + u16 caps = msg_node_capabilities(hdr); bool respond = false; + bool dupl_addr = false; - bearer->media->msg2addr(bearer, &maddr, msg_media_addr(msg)); - kfree_skb(buf); + bearer->media->msg2addr(bearer, &maddr, msg_media_addr(hdr)); + kfree_skb(skb); /* Ensure message from node is valid and communication is permitted */ if (net_id != tn->net_id) @@ -164,91 +159,20 @@ void tipc_disc_rcv(struct net *net, struct sk_buff *buf, if (!tipc_in_scope(bearer->domain, onode)) return; - node = tipc_node_create(net, onode); - if (!node) - return; - tipc_node_lock(node); - node->capabilities = caps; - - /* Prepare to validate requesting node's signature and media address */ - sign_match = (signature == node->signature); - tipc_node_check_dest(node, bearer, &link_up, &addr_match, &maddr); - - /* These three flags give us eight permutations: */ - - if (sign_match && addr_match && link_up) { - /* All is fine. Do nothing. */ - } else if (sign_match && addr_match && !link_up) { - /* Respond. The link will come up in due time */ - respond = true; - } else if (sign_match && !addr_match && link_up) { - /* Peer has changed i/f address without rebooting. - * If so, the link will reset soon, and the next - * discovery will be accepted. So we can ignore it. - * It may also be an cloned or malicious peer having - * chosen the same node address and signature as an - * existing one. - * Ignore requests until the link goes down, if ever. - */ + tipc_node_check_dest(net, onode, bearer, caps, signature, + &maddr, &respond, &dupl_addr); + if (dupl_addr) disc_dupl_alert(bearer, onode, &maddr); - } else if (sign_match && !addr_match && !link_up) { - /* Peer link has changed i/f address without rebooting. - * It may also be a cloned or malicious peer; we can't - * distinguish between the two. - * The signature is correct, so we must accept. - */ - accept_addr = true; - respond = true; - } else if (!sign_match && addr_match && link_up) { - /* Peer node rebooted. Two possibilities: - * - Delayed re-discovery; this link endpoint has already - * reset and re-established contact with the peer, before - * receiving a discovery message from that node. - * (The peer happened to receive one from this node first). - * - The peer came back so fast that our side has not - * discovered it yet. Probing from this side will soon - * reset the link, since there can be no working link - * endpoint at the peer end, and the link will re-establish. - * Accept the signature, since it comes from a known peer. - */ - accept_sign = true; - } else if (!sign_match && addr_match && !link_up) { - /* The peer node has rebooted. - * Accept signature, since it is a known peer. - */ - accept_sign = true; - respond = true; - } else if (!sign_match && !addr_match && link_up) { - /* Peer rebooted with new address, or a new/duplicate peer. - * Ignore until the link goes down, if ever. - */ - disc_dupl_alert(bearer, onode, &maddr); - } else if (!sign_match && !addr_match && !link_up) { - /* Peer rebooted with new address, or it is a new peer. - * Accept signature and address. - */ - accept_sign = true; - accept_addr = true; - respond = true; - } - - if (accept_sign) - node->signature = signature; - - if (accept_addr && !tipc_node_update_dest(node, bearer, &maddr)) - respond = false; /* Send response, if necessary */ if (respond && (mtyp == DSC_REQ_MSG)) { - rbuf = tipc_buf_acquire(MAX_H_SIZE); - if (rbuf) { - tipc_disc_init_msg(net, rbuf, DSC_RESP_MSG, bearer); - tipc_bearer_send(net, bearer->identity, rbuf, &maddr); - kfree_skb(rbuf); + rskb = tipc_buf_acquire(MAX_H_SIZE); + if (rskb) { + tipc_disc_init_msg(net, rskb, DSC_RESP_MSG, bearer); + tipc_bearer_send(net, bearer->identity, rskb, &maddr); + kfree_skb(rskb); } } - tipc_node_unlock(node); - tipc_node_put(node); } /** diff --git a/net/tipc/node.c b/net/tipc/node.c index a3ceeda2a80a..d03e88f2273b 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -138,7 +138,7 @@ struct tipc_node *tipc_node_find(struct net *net, u32 addr) return NULL; } -struct tipc_node *tipc_node_create(struct net *net, u32 addr) +struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n_ptr, *temp_node; @@ -154,6 +154,7 @@ struct tipc_node *tipc_node_create(struct net *net, u32 addr) } n_ptr->addr = addr; n_ptr->net = net; + n_ptr->capabilities = capabilities; kref_init(&n_ptr->kref); spin_lock_init(&n_ptr->lock); INIT_HLIST_NODE(&n_ptr->hash); @@ -422,38 +423,118 @@ bool tipc_node_is_up(struct tipc_node *n) return n->active_links[0] != INVALID_BEARER_ID; } -void tipc_node_check_dest(struct tipc_node *n, struct tipc_bearer *b, - bool *link_up, bool *addr_match, - struct tipc_media_addr *maddr) +void tipc_node_check_dest(struct net *net, u32 onode, + struct tipc_bearer *b, + u16 capabilities, u32 signature, + struct tipc_media_addr *maddr, + bool *respond, bool *dupl_addr) { - struct tipc_link *l = n->links[b->identity].link; - struct tipc_media_addr *curr = &n->links[b->identity].maddr; + struct tipc_node *n; + struct tipc_link *l; + struct tipc_media_addr *curr_maddr; + struct sk_buff_head *inputq; + bool addr_match = false; + bool sign_match = false; + bool link_up = false; + bool accept_addr = false; + + *dupl_addr = false; + *respond = false; + + n = tipc_node_create(net, onode, capabilities); + if (!n) + return; - *link_up = l && tipc_link_is_up(l); - *addr_match = l && !memcmp(curr, maddr, sizeof(*maddr)); -} + tipc_node_lock(n); -bool tipc_node_update_dest(struct tipc_node *n, struct tipc_bearer *b, - struct tipc_media_addr *maddr) -{ - struct tipc_link *l = n->links[b->identity].link; - struct tipc_media_addr *curr = &n->links[b->identity].maddr; - struct sk_buff_head *inputq = &n->links[b->identity].inputq; + curr_maddr = &n->links[b->identity].maddr; + inputq = &n->links[b->identity].inputq; + + /* Prepare to validate requesting node's signature and media address */ + l = n->links[b->identity].link; + link_up = l && tipc_link_is_up(l); + addr_match = l && !memcmp(curr_maddr, maddr, sizeof(*maddr)); + sign_match = (signature == n->signature); + + /* These three flags give us eight permutations: */ + + if (sign_match && addr_match && link_up) { + /* All is fine. Do nothing. */ + } else if (sign_match && addr_match && !link_up) { + /* Respond. The link will come up in due time */ + *respond = true; + } else if (sign_match && !addr_match && link_up) { + /* Peer has changed i/f address without rebooting. + * If so, the link will reset soon, and the next + * discovery will be accepted. So we can ignore it. + * It may also be an cloned or malicious peer having + * chosen the same node address and signature as an + * existing one. + * Ignore requests until the link goes down, if ever. + */ + *dupl_addr = true; + } else if (sign_match && !addr_match && !link_up) { + /* Peer link has changed i/f address without rebooting. + * It may also be a cloned or malicious peer; we can't + * distinguish between the two. + * The signature is correct, so we must accept. + */ + accept_addr = true; + *respond = true; + } else if (!sign_match && addr_match && link_up) { + /* Peer node rebooted. Two possibilities: + * - Delayed re-discovery; this link endpoint has already + * reset and re-established contact with the peer, before + * receiving a discovery message from that node. + * (The peer happened to receive one from this node first). + * - The peer came back so fast that our side has not + * discovered it yet. Probing from this side will soon + * reset the link, since there can be no working link + * endpoint at the peer end, and the link will re-establish. + * Accept the signature, since it comes from a known peer. + */ + n->signature = signature; + } else if (!sign_match && addr_match && !link_up) { + /* The peer node has rebooted. + * Accept signature, since it is a known peer. + */ + n->signature = signature; + *respond = true; + } else if (!sign_match && !addr_match && link_up) { + /* Peer rebooted with new address, or a new/duplicate peer. + * Ignore until the link goes down, if ever. + */ + *dupl_addr = true; + } else if (!sign_match && !addr_match && !link_up) { + /* Peer rebooted with new address, or it is a new peer. + * Accept signature and address. + */ + n->signature = signature; + accept_addr = true; + *respond = true; + } + + if (!accept_addr) + goto exit; + /* Now create new link if not already existing */ if (!l) { l = tipc_link_create(n, b, maddr, inputq, &n->bclink.namedq); - if (!l) - return false; + if (!l) { + *respond = false; + goto exit; + } tipc_node_calculate_timer(n, l); - if (n->link_cnt == 1) { + if (n->link_cnt == 1) if (!mod_timer(&n->timer, jiffies + n->keepalive_intv)) tipc_node_get(n); - } } memcpy(&l->media_addr, maddr, sizeof(*maddr)); - memcpy(curr, maddr, sizeof(*maddr)); + memcpy(curr_maddr, maddr, sizeof(*maddr)); tipc_node_link_down(n, b->identity); - return true; +exit: + tipc_node_unlock(n); + tipc_node_put(n); } void tipc_node_delete_links(struct net *net, int bearer_id) diff --git a/net/tipc/node.h b/net/tipc/node.h index 406c6fe0dbb2..9a977467fc46 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -147,13 +147,12 @@ struct tipc_node { struct tipc_node *tipc_node_find(struct net *net, u32 addr); void tipc_node_put(struct tipc_node *node); -struct tipc_node *tipc_node_create(struct net *net, u32 addr); void tipc_node_stop(struct net *net); -void tipc_node_check_dest(struct tipc_node *n, struct tipc_bearer *bearer, - bool *link_up, bool *addr_match, - struct tipc_media_addr *maddr); -bool tipc_node_update_dest(struct tipc_node *n, struct tipc_bearer *bearer, - struct tipc_media_addr *maddr); +void tipc_node_check_dest(struct net *net, u32 onode, + struct tipc_bearer *bearer, + u16 capabilities, u32 signature, + struct tipc_media_addr *maddr, + bool *respond, bool *dupl_addr); void tipc_node_delete_links(struct net *net, int bearer_id); void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr); -- cgit v1.2.3 From 598411d70f85dcf5b5c6c2369cc48637c251b656 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 30 Jul 2015 18:24:23 -0400 Subject: tipc: make resetting of links non-atomic In order to facilitate future improvements to the locking structure, we want to make resetting and establishing of links non-atomic. I.e., the functions tipc_node_link_up() and tipc_node_link_down() should be called from outside the node lock context, and grab/release the node lock themselves. This requires that we can freeze the link state from the moment it is set to RESETTING or PEER_RESET in one lock context until it is set to RESET or ESTABLISHING in a later context. The recently introduced link FSM makes this possible, so we are now ready to introduce the above change. This commit implements this. Tested-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 2 +- net/tipc/msg.h | 29 ++++++++++ net/tipc/node.c | 166 +++++++++++++++++++++++++++++++++----------------------- 3 files changed, 127 insertions(+), 70 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 9840b03348e1..3a92924711a1 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -489,8 +489,8 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) xmit = true; mtyp = ACTIVATE_MSG; break; - case LINK_RESETTING: case LINK_PEER_RESET: + case LINK_RESETTING: case LINK_FAILINGOVER: break; default: diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 115bb2aa6bed..53d98ef78650 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -916,4 +916,33 @@ static inline bool __tipc_skb_queue_sorted(struct sk_buff_head *list, return false; } +/* tipc_skb_queue_splice_tail - append an skb list to lock protected list + * @list: the new list to append. Not lock protected + * @head: target list. Lock protected. + */ +static inline void tipc_skb_queue_splice_tail(struct sk_buff_head *list, + struct sk_buff_head *head) +{ + spin_lock_bh(&head->lock); + skb_queue_splice_tail(list, head); + spin_unlock_bh(&head->lock); +} + +/* tipc_skb_queue_splice_tail_init - merge two lock protected skb lists + * @list: the new list to add. Lock protected. Will be reinitialized + * @head: target list. Lock protected. + */ +static inline void tipc_skb_queue_splice_tail_init(struct sk_buff_head *list, + struct sk_buff_head *head) +{ + struct sk_buff_head tmp; + + __skb_queue_head_init(&tmp); + + spin_lock_bh(&list->lock); + skb_queue_splice_tail_init(list, &tmp); + spin_unlock_bh(&list->lock); + tipc_skb_queue_splice_tail(&tmp, head); +} + #endif diff --git a/net/tipc/node.c b/net/tipc/node.c index d03e88f2273b..cdca57be85bf 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -66,8 +66,12 @@ enum { NODE_SYNCH_END_EVT = 0xcee }; -static void tipc_node_link_down(struct tipc_node *n, int bearer_id); -static void node_lost_contact(struct tipc_node *n_ptr); +static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, + struct sk_buff_head *xmitq, + struct tipc_media_addr **maddr); +static void tipc_node_link_down(struct tipc_node *n, int bearer_id, + bool delete); +static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq); static void node_established_contact(struct tipc_node *n_ptr); static void tipc_node_delete(struct tipc_node *node); static void tipc_node_timeout(unsigned long data); @@ -275,9 +279,8 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) static void tipc_node_timeout(unsigned long data) { struct tipc_node *n = (struct tipc_node *)data; + struct tipc_link_entry *le; struct sk_buff_head xmitq; - struct tipc_link *l; - struct tipc_media_addr *maddr; int bearer_id; int rc = 0; @@ -285,17 +288,16 @@ static void tipc_node_timeout(unsigned long data) for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { tipc_node_lock(n); - l = n->links[bearer_id].link; - if (l) { + le = &n->links[bearer_id]; + if (le->link) { /* Link tolerance may change asynchronously: */ - tipc_node_calculate_timer(n, l); - rc = tipc_link_timeout(l, &xmitq); - if (rc & TIPC_LINK_DOWN_EVT) - tipc_node_link_down(n, bearer_id); + tipc_node_calculate_timer(n, le->link); + rc = tipc_link_timeout(le->link, &xmitq); } tipc_node_unlock(n); - maddr = &n->links[bearer_id].maddr; - tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr); + tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr); + if (rc & TIPC_LINK_DOWN_EVT) + tipc_node_link_down(n, bearer_id, false); } if (!mod_timer(&n->timer, jiffies + n->keepalive_intv)) tipc_node_get(n); @@ -303,18 +305,21 @@ static void tipc_node_timeout(unsigned long data) } /** - * tipc_node_link_up - handle addition of link - * + * __tipc_node_link_up - handle addition of link + * Node lock must be held by caller * Link becomes active (alone or shared) or standby, depending on its priority. */ -static void tipc_node_link_up(struct tipc_node *n, int bearer_id, - struct sk_buff_head *xmitq) +static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, + struct sk_buff_head *xmitq) { int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; struct tipc_link *ol = node_active_link(n, 0); struct tipc_link *nl = n->links[bearer_id].link; + if (!nl || !tipc_link_is_up(nl)) + return; + if (n->working_links > 1) { pr_warn("Attempt to establish 3rd link to %x\n", n->addr); return; @@ -356,28 +361,40 @@ static void tipc_node_link_up(struct tipc_node *n, int bearer_id, } /** - * tipc_node_link_down - handle loss of link + * tipc_node_link_up - handle addition of link + * + * Link becomes active (alone or shared) or standby, depending on its priority. */ -static void tipc_node_link_down(struct tipc_node *n, int bearer_id) +static void tipc_node_link_up(struct tipc_node *n, int bearer_id, + struct sk_buff_head *xmitq) { + tipc_node_lock(n); + __tipc_node_link_up(n, bearer_id, xmitq); + tipc_node_unlock(n); +} + +/** + * __tipc_node_link_down - handle loss of link + */ +static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, + struct sk_buff_head *xmitq, + struct tipc_media_addr **maddr) +{ + struct tipc_link_entry *le = &n->links[*bearer_id]; int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; - struct tipc_media_addr *maddr = &n->links[bearer_id].maddr; int i, highest = 0; struct tipc_link *l, *_l, *tnl; - struct sk_buff_head xmitq; - l = n->links[bearer_id].link; + l = n->links[*bearer_id].link; if (!l || tipc_link_is_reset(l)) return; - __skb_queue_head_init(&xmitq); - n->working_links--; n->action_flags |= TIPC_NOTIFY_LINK_DOWN; - n->link_id = l->peer_bearer_id << 16 | bearer_id; + n->link_id = l->peer_bearer_id << 16 | *bearer_id; - tipc_bearer_remove_dest(n->net, l->bearer_id, n->addr); + tipc_bearer_remove_dest(n->net, *bearer_id, n->addr); pr_debug("Lost link <%s> on network plane %c\n", l->name, l->net_plane); @@ -404,18 +421,40 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id) if (!tipc_node_is_up(n)) { tipc_link_reset(l); - node_lost_contact(n); + node_lost_contact(n, &le->inputq); return; } /* There is still a working link => initiate failover */ tnl = node_active_link(n, 0); - tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); n->sync_point = tnl->rcv_nxt + (U16_MAX / 2 - 1); - tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, &xmitq); + tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); - tipc_bearer_xmit(n->net, tnl->bearer_id, &xmitq, maddr); + tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); + *maddr = &n->links[tnl->bearer_id].maddr; + *bearer_id = tnl->bearer_id; +} + +static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) +{ + struct tipc_link_entry *le = &n->links[bearer_id]; + struct tipc_media_addr *maddr; + struct sk_buff_head xmitq; + + __skb_queue_head_init(&xmitq); + + tipc_node_lock(n); + __tipc_node_link_down(n, &bearer_id, &xmitq, &maddr); + if (delete && le->link) { + kfree(le->link); + le->link = NULL; + n->link_cnt--; + } + tipc_node_unlock(n); + + tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr); + tipc_sk_rcv(n->net, &le->inputq); } bool tipc_node_is_up(struct tipc_node *n) @@ -437,7 +476,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, bool sign_match = false; bool link_up = false; bool accept_addr = false; - + bool reset = true; *dupl_addr = false; *respond = false; @@ -460,6 +499,7 @@ void tipc_node_check_dest(struct net *net, u32 onode, if (sign_match && addr_match && link_up) { /* All is fine. Do nothing. */ + reset = false; } else if (sign_match && addr_match && !link_up) { /* Respond. The link will come up in due time */ *respond = true; @@ -531,29 +571,21 @@ void tipc_node_check_dest(struct net *net, u32 onode, } memcpy(&l->media_addr, maddr, sizeof(*maddr)); memcpy(curr_maddr, maddr, sizeof(*maddr)); - tipc_node_link_down(n, b->identity); exit: tipc_node_unlock(n); + if (reset) + tipc_node_link_down(n, b->identity, false); tipc_node_put(n); } void tipc_node_delete_links(struct net *net, int bearer_id) { struct tipc_net *tn = net_generic(net, tipc_net_id); - struct tipc_link *l; struct tipc_node *n; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { - tipc_node_lock(n); - l = n->links[bearer_id].link; - if (l) { - tipc_node_link_down(n, bearer_id); - n->links[bearer_id].link = NULL; - n->link_cnt--; - } - tipc_node_unlock(n); - kfree(l); + tipc_node_link_down(n, bearer_id, true); } rcu_read_unlock(); } @@ -561,19 +593,14 @@ void tipc_node_delete_links(struct net *net, int bearer_id) static void tipc_node_reset_links(struct tipc_node *n) { char addr_string[16]; - u32 i; - - tipc_node_lock(n); + int i; pr_warn("Resetting all links to %s\n", tipc_addr_string_fill(addr_string, n->addr)); for (i = 0; i < MAX_BEARERS; i++) { - if (!n->links[i].link) - continue; - tipc_node_link_down(n, i); + tipc_node_link_down(n, i, false); } - tipc_node_unlock(n); } void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr) @@ -798,10 +825,12 @@ static void node_established_contact(struct tipc_node *n_ptr) tipc_bclink_add_node(n_ptr->net, n_ptr->addr); } -static void node_lost_contact(struct tipc_node *n_ptr) +static void node_lost_contact(struct tipc_node *n_ptr, + struct sk_buff_head *inputq) { char addr_string[16]; struct tipc_sock_conn *conn, *safe; + struct tipc_link *l; struct list_head *conns = &n_ptr->conn_sks; struct sk_buff *skb; struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id); @@ -827,14 +856,11 @@ static void node_lost_contact(struct tipc_node *n_ptr) /* Abort any ongoing link failover */ for (i = 0; i < MAX_BEARERS; i++) { - struct tipc_link *l_ptr = n_ptr->links[i].link; - if (!l_ptr) - continue; - tipc_link_fsm_evt(l_ptr, LINK_FAILOVER_END_EVT); - kfree_skb(l_ptr->failover_reasm_skb); - l_ptr->failover_reasm_skb = NULL; - tipc_link_reset_fragments(l_ptr); + l = n_ptr->links[i].link; + if (l) + tipc_link_fsm_evt(l, LINK_FAILOVER_END_EVT); } + /* Prevent re-contact with node until cleanup is done */ tipc_node_fsm_evt(n_ptr, SELF_LOST_CONTACT_EVT); @@ -848,7 +874,7 @@ static void node_lost_contact(struct tipc_node *n_ptr) conn->peer_node, conn->port, conn->peer_port, TIPC_ERR_NO_NODE); if (likely(skb)) { - skb_queue_tail(n_ptr->inputq, skb); + skb_queue_tail(inputq, skb); n_ptr->action_flags |= TIPC_MSG_EVT; } list_del(&conn->list); @@ -1025,9 +1051,9 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, l = tipc_node_select_link(n, selector, &bearer_id, &maddr); if (likely(l)) rc = tipc_link_xmit(l, list, &xmitq); - if (unlikely(rc == -ENOBUFS)) - tipc_node_link_down(n, bearer_id); tipc_node_unlock(n); + if (unlikely(rc == -ENOBUFS)) + tipc_node_link_down(n, bearer_id, false); tipc_node_put(n); } if (likely(!rc)) { @@ -1081,8 +1107,8 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, u16 rcv_nxt, syncpt, dlv_nxt; int state = n->state; struct tipc_link *l, *pl = NULL; - struct sk_buff_head; - int i; + struct tipc_media_addr *maddr; + int i, pb_id; l = n->links[bearer_id].link; if (!l) @@ -1123,9 +1149,11 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, /* Initiate or update failover mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == FAILOVER_MSG)) { syncpt = oseqno + exp_pkts - 1; - if (pl && tipc_link_is_up(pl)) - tipc_node_link_down(n, pl->bearer_id); - + if (pl && tipc_link_is_up(pl)) { + pb_id = pl->bearer_id; + __tipc_node_link_down(n, &pb_id, xmitq, &maddr); + tipc_skb_queue_splice_tail_init(pl->inputq, l->inputq); + } /* If pkts arrive out of order, use lowest calculated syncpt */ if (less(syncpt, n->sync_point)) n->sync_point = syncpt; @@ -1146,7 +1174,7 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, syncpt = iseqno + exp_pkts - 1; if (!tipc_link_is_up(l)) { tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT); - tipc_node_link_up(n, bearer_id, xmitq); + __tipc_node_link_up(n, bearer_id, xmitq); } if (n->state == SELF_UP_PEER_UP) { n->sync_point = syncpt; @@ -1224,7 +1252,7 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) if (unlikely(msg_user(hdr) == LINK_PROTOCOL)) tipc_bclink_sync_state(n, hdr); - /* Release acked broadcast messages */ + /* Release acked broadcast packets */ if (unlikely(n->bclink.acked != msg_bcast_ack(hdr))) tipc_bclink_acknowledge(n, msg_bcast_ack(hdr)); @@ -1233,14 +1261,14 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) rc = tipc_link_rcv(le->link, skb, &xmitq); skb = NULL; } +unlock: + tipc_node_unlock(n); if (unlikely(rc & TIPC_LINK_UP_EVT)) tipc_node_link_up(n, bearer_id, &xmitq); if (unlikely(rc & TIPC_LINK_DOWN_EVT)) - tipc_node_link_down(n, bearer_id); -unlock: - tipc_node_unlock(n); + tipc_node_link_down(n, bearer_id, false); if (!skb_queue_empty(&le->inputq)) tipc_sk_rcv(net, &le->inputq); -- cgit v1.2.3 From 23d8335d786472021b5c733f228c7074208dcfa0 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 30 Jul 2015 18:24:24 -0400 Subject: tipc: remove implicit message delivery in node_unlock() After the most recent changes, all access calls to a link which may entail addition of messages to the link's input queue are postpended by an explicit call to tipc_sk_rcv(), using a reference to the correct queue. This means that the potentially hazardous implicit delivery, using tipc_node_unlock() in combination with a binary flag and a cached queue pointer, now has become redundant. This commit removes this implicit delivery mechanism both for regular data messages and for binding table update messages. Tested-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 21 +++------------------ net/tipc/msg.h | 22 ---------------------- net/tipc/node.c | 26 +++++++------------------- net/tipc/node.h | 4 ---- 4 files changed, 10 insertions(+), 63 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 3a92924711a1..2aa19de715f6 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -559,8 +559,6 @@ void link_prepare_wakeup(struct tipc_link *l) break; skb_unlink(skb, &l->wakeupq); skb_queue_tail(l->inputq, skb); - l->owner->inputq = l->inputq; - l->owner->action_flags |= TIPC_MSG_EVT; } } @@ -598,8 +596,6 @@ void tipc_link_purge_queues(struct tipc_link *l_ptr) void tipc_link_reset(struct tipc_link *l) { - struct tipc_node *owner = l->owner; - tipc_link_fsm_evt(l, LINK_RESET_EVT); /* Link is down, accept any session */ @@ -611,14 +607,10 @@ void tipc_link_reset(struct tipc_link *l) /* Prepare for renewed mtu size negotiation */ l->mtu = l->advertised_mtu; - /* Clean up all queues, except inputq: */ + /* Clean up all queues: */ __skb_queue_purge(&l->transmq); __skb_queue_purge(&l->deferdq); - if (!owner->inputq) - owner->inputq = l->inputq; - skb_queue_splice_init(&l->wakeupq, owner->inputq); - if (!skb_queue_empty(owner->inputq)) - owner->action_flags |= TIPC_MSG_EVT; + skb_queue_splice_init(&l->wakeupq, l->inputq); tipc_link_purge_backlog(l); kfree_skb(l->reasm_buf); @@ -972,7 +964,6 @@ static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb) { struct tipc_node *node = link->owner; struct tipc_msg *msg = buf_msg(skb); - u32 dport = msg_destport(msg); switch (msg_user(msg)) { case TIPC_LOW_IMPORTANCE: @@ -980,17 +971,11 @@ static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb) case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: case CONN_MANAGER: - if (tipc_skb_queue_tail(link->inputq, skb, dport)) { - node->inputq = link->inputq; - node->action_flags |= TIPC_MSG_EVT; - } + skb_queue_tail(link->inputq, skb); return true; case NAME_DISTRIBUTOR: node->bclink.recv_permitted = true; - node->namedq = link->namedq; skb_queue_tail(link->namedq, skb); - if (skb_queue_len(link->namedq) == 1) - node->action_flags |= TIPC_NAMED_MSG_EVT; return true; case MSG_BUNDLER: case TUNNEL_PROTOCOL: diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 53d98ef78650..a82c5848d4bc 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -862,28 +862,6 @@ static inline struct sk_buff *tipc_skb_dequeue(struct sk_buff_head *list, return skb; } -/* tipc_skb_queue_tail(): add buffer to tail of list; - * @list: list to be appended to - * @skb: buffer to append. Always appended - * @dport: the destination port of the buffer - * returns true if dport differs from previous destination - */ -static inline bool tipc_skb_queue_tail(struct sk_buff_head *list, - struct sk_buff *skb, u32 dport) -{ - struct sk_buff *_skb = NULL; - bool rv = false; - - spin_lock_bh(&list->lock); - _skb = skb_peek_tail(list); - if (!_skb || (msg_destport(buf_msg(_skb)) != dport) || - (skb_queue_len(list) > 32)) - rv = true; - __skb_queue_tail(list, skb); - spin_unlock_bh(&list->lock); - return rv; -} - /* tipc_skb_queue_sorted(); sort pkt into list according to sequence number * @list: list to be appended to * @skb: buffer to add diff --git a/net/tipc/node.c b/net/tipc/node.c index cdca57be85bf..9e9b0938bd17 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -873,10 +873,8 @@ static void node_lost_contact(struct tipc_node *n_ptr, SHORT_H_SIZE, 0, tn->own_addr, conn->peer_node, conn->port, conn->peer_port, TIPC_ERR_NO_NODE); - if (likely(skb)) { + if (likely(skb)) skb_queue_tail(inputq, skb); - n_ptr->action_flags |= TIPC_MSG_EVT; - } list_del(&conn->list); kfree(conn); } @@ -923,27 +921,20 @@ void tipc_node_unlock(struct tipc_node *node) u32 flags = node->action_flags; u32 link_id = 0; struct list_head *publ_list; - struct sk_buff_head *inputq = node->inputq; - struct sk_buff_head *namedq; - if (likely(!flags || (flags == TIPC_MSG_EVT))) { - node->action_flags = 0; + if (likely(!flags)) { spin_unlock_bh(&node->lock); - if (flags == TIPC_MSG_EVT) - tipc_sk_rcv(net, inputq); return; } addr = node->addr; link_id = node->link_id; - namedq = node->namedq; publ_list = &node->publ_list; - node->action_flags &= ~(TIPC_MSG_EVT | - TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP | + node->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP | TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP | TIPC_WAKEUP_BCAST_USERS | TIPC_BCAST_MSG_EVT | - TIPC_NAMED_MSG_EVT | TIPC_BCAST_RESET); + TIPC_BCAST_RESET); spin_unlock_bh(&node->lock); @@ -964,12 +955,6 @@ void tipc_node_unlock(struct tipc_node *node) tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr, link_id, addr); - if (flags & TIPC_MSG_EVT) - tipc_sk_rcv(net, inputq); - - if (flags & TIPC_NAMED_MSG_EVT) - tipc_named_rcv(net, namedq); - if (flags & TIPC_BCAST_MSG_EVT) tipc_bclink_input(net); @@ -1270,6 +1255,9 @@ unlock: if (unlikely(rc & TIPC_LINK_DOWN_EVT)) tipc_node_link_down(n, bearer_id, false); + if (unlikely(!skb_queue_empty(&n->bclink.namedq))) + tipc_named_rcv(net, &n->bclink.namedq); + if (!skb_queue_empty(&le->inputq)) tipc_sk_rcv(net, &le->inputq); diff --git a/net/tipc/node.h b/net/tipc/node.h index 9a977467fc46..344b3e7594fd 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -53,13 +53,11 @@ * TIPC_DISTRIBUTE_NAME: publish or withdraw link state name type */ enum { - TIPC_MSG_EVT = 1, TIPC_NOTIFY_NODE_DOWN = (1 << 3), TIPC_NOTIFY_NODE_UP = (1 << 4), TIPC_WAKEUP_BCAST_USERS = (1 << 5), TIPC_NOTIFY_LINK_UP = (1 << 6), TIPC_NOTIFY_LINK_DOWN = (1 << 7), - TIPC_NAMED_MSG_EVT = (1 << 8), TIPC_BCAST_MSG_EVT = (1 << 9), TIPC_BCAST_RESET = (1 << 10) }; @@ -124,8 +122,6 @@ struct tipc_node { spinlock_t lock; struct net *net; struct hlist_node hash; - struct sk_buff_head *inputq; - struct sk_buff_head *namedq; int active_links[2]; struct tipc_link_entry links[MAX_BEARERS]; int action_flags; -- cgit v1.2.3 From 9073fb8be3ee6f89492b8ea8f6d3902913a9fc91 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 30 Jul 2015 18:24:25 -0400 Subject: tipc: use temporary, non-protected skb queue for bundle reception Currently, when we extract small messages from a message bundle, or when many messages have accumulated in the link arrival queue, those messages are added one by one to the lock protected link input queue. This may increase contention with the reader of that queue, in the function tipc_sk_rcv(). This commit introduces a temporary, unprotected input queue in tipc_link_rcv() for such cases. Only when the arrival queue has been emptied, and the function is ready to return, does it splice the whole temporary queue into the real input queue. Tested-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 2aa19de715f6..d683fe9f68c8 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -111,8 +111,6 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, static void link_reset_statistics(struct tipc_link *l_ptr); static void link_print(struct tipc_link *l_ptr, const char *str); static void tipc_link_sync_rcv(struct tipc_node *n, struct sk_buff *buf); -static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb); -static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb); /* * Simple non-static link routines (i.e. referenced outside this file) @@ -960,18 +958,18 @@ static int tipc_link_retransm(struct tipc_link *l, int retransm, * Consumes buffer if message is of right type * Node lock must be held */ -static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb) +static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb, + struct sk_buff_head *inputq) { struct tipc_node *node = link->owner; - struct tipc_msg *msg = buf_msg(skb); - switch (msg_user(msg)) { + switch (msg_user(buf_msg(skb))) { case TIPC_LOW_IMPORTANCE: case TIPC_MEDIUM_IMPORTANCE: case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: case CONN_MANAGER: - skb_queue_tail(link->inputq, skb); + __skb_queue_tail(inputq, skb); return true; case NAME_DISTRIBUTOR: node->bclink.recv_permitted = true; @@ -993,7 +991,8 @@ static bool tipc_data_input(struct tipc_link *link, struct sk_buff *skb) * * Consumes buffer */ -static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb) +static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb, + struct sk_buff_head *inputq) { struct tipc_node *node = l->owner; struct tipc_msg *hdr = buf_msg(skb); @@ -1016,7 +1015,7 @@ static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb) hdr = buf_msg(skb); if (less(msg_seqno(hdr), l->drop_point)) goto drop; - if (tipc_data_input(l, skb)) + if (tipc_data_input(l, skb, inputq)) return rc; usr = msg_user(hdr); reasm_skb = &l->failover_reasm_skb; @@ -1026,13 +1025,13 @@ static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb) l->stats.recv_bundles++; l->stats.recv_bundled += msg_msgcnt(hdr); while (tipc_msg_extract(skb, &iskb, &pos)) - tipc_data_input(l, iskb); + tipc_data_input(l, iskb, inputq); return 0; } else if (usr == MSG_FRAGMENTER) { l->stats.recv_fragments++; if (tipc_buf_append(reasm_skb, &skb)) { l->stats.recv_fragmented++; - tipc_data_input(l, skb); + tipc_data_input(l, skb, inputq); } else if (!*reasm_skb) { return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); } @@ -1070,10 +1069,13 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *xmitq) { struct sk_buff_head *arrvq = &l->deferdq; + struct sk_buff_head tmpq; struct tipc_msg *hdr; u16 seqno, rcv_nxt; int rc = 0; + __skb_queue_head_init(&tmpq); + if (unlikely(!__tipc_skb_queue_sorted(arrvq, skb))) { if (!(skb_queue_len(arrvq) % TIPC_NACK_INTV)) tipc_link_build_proto_msg(l, STATE_MSG, 0, @@ -1095,7 +1097,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, rc = tipc_link_fsm_evt(l, LINK_ESTABLISH_EVT); if (!link_is_up(l)) { kfree_skb(__skb_dequeue(arrvq)); - return rc; + goto exit; } } @@ -1113,7 +1115,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, rcv_nxt = l->rcv_nxt; if (unlikely(less(rcv_nxt, seqno))) { l->stats.deferred_recv++; - return rc; + goto exit; } __skb_dequeue(arrvq); @@ -1122,14 +1124,14 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, if (unlikely(more(rcv_nxt, seqno))) { l->stats.duplicates++; kfree_skb(skb); - return rc; + goto exit; } /* Packet can be delivered */ l->rcv_nxt++; l->stats.recv_info++; - if (unlikely(!tipc_data_input(l, skb))) - rc = tipc_link_input(l, skb); + if (unlikely(!tipc_data_input(l, skb, &tmpq))) + rc = tipc_link_input(l, skb, &tmpq); /* Ack at regular intervals */ if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) { @@ -1139,6 +1141,8 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, 0, 0, 0, 0, xmitq); } } +exit: + tipc_skb_queue_splice_tail(&tmpq, l->inputq); return rc; } -- cgit v1.2.3 From 440d8963cd590ec9387d76a36e60c02da9ed944d Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 30 Jul 2015 18:24:26 -0400 Subject: tipc: clean up link creation We simplify the link creation function tipc_link_create() and the way the link struct it is connected to the node struct. In particular, we remove the duplicate initialization of some fields which are anyway set in tipc_link_reset(). Tested-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/core.h | 5 +++ net/tipc/link.c | 136 +++++++++++++++++++++++++------------------------------- net/tipc/link.h | 18 +++----- net/tipc/node.c | 48 +++++++------------- 4 files changed, 86 insertions(+), 121 deletions(-) (limited to 'net') diff --git a/net/tipc/core.h b/net/tipc/core.h index f4ed67778c54..b96b41eabf12 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -109,6 +109,11 @@ struct tipc_net { atomic_t subscription_count; }; +static inline struct tipc_net *tipc_net(struct net *net) +{ + return net_generic(net, tipc_net_id); +} + static inline u16 mod(u16 x) { return x & 0xffffu; diff --git a/net/tipc/link.c b/net/tipc/link.c index d683fe9f68c8..f067e5425560 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -147,87 +147,71 @@ int tipc_link_is_active(struct tipc_link *l) return (node_active_link(n, 0) == l) || (node_active_link(n, 1) == l); } +static u32 link_own_addr(struct tipc_link *l) +{ + return msg_prevnode(l->pmsg); +} + /** * tipc_link_create - create a new link - * @n_ptr: pointer to associated node - * @b_ptr: pointer to associated bearer - * @media_addr: media address to use when sending messages over link + * @n: pointer to associated node + * @b: pointer to associated bearer + * @ownnode: identity of own node + * @peer: identity of peer node + * @maddr: media address to be used + * @inputq: queue to put messages ready for delivery + * @namedq: queue to put binding table update messages ready for delivery + * @link: return value, pointer to put the created link * - * Returns pointer to link. + * Returns true if link was created, otherwise false */ -struct tipc_link *tipc_link_create(struct tipc_node *n_ptr, - struct tipc_bearer *b_ptr, - const struct tipc_media_addr *media_addr, - struct sk_buff_head *inputq, - struct sk_buff_head *namedq) +bool tipc_link_create(struct tipc_node *n, struct tipc_bearer *b, u32 session, + u32 ownnode, u32 peer, struct tipc_media_addr *maddr, + struct sk_buff_head *inputq, struct sk_buff_head *namedq, + struct tipc_link **link) { - struct tipc_net *tn = net_generic(n_ptr->net, tipc_net_id); - struct tipc_link *l_ptr; - struct tipc_msg *msg; + struct tipc_link *l; + struct tipc_msg *hdr; char *if_name; - char addr_string[16]; - u32 peer = n_ptr->addr; - if (n_ptr->link_cnt >= MAX_BEARERS) { - tipc_addr_string_fill(addr_string, n_ptr->addr); - pr_err("Cannot establish %uth link to %s. Max %u allowed.\n", - n_ptr->link_cnt, addr_string, MAX_BEARERS); - return NULL; - } + l = kzalloc(sizeof(*l), GFP_ATOMIC); + if (!l) + return false; + *link = l; - if (n_ptr->links[b_ptr->identity].link) { - tipc_addr_string_fill(addr_string, n_ptr->addr); - pr_err("Attempt to establish second link on <%s> to %s\n", - b_ptr->name, addr_string); - return NULL; - } + /* Note: peer i/f name is completed by reset/activate message */ + if_name = strchr(b->name, ':') + 1; + sprintf(l->name, "%u.%u.%u:%s-%u.%u.%u:unknown", + tipc_zone(ownnode), tipc_cluster(ownnode), tipc_node(ownnode), + if_name, tipc_zone(peer), tipc_cluster(peer), tipc_node(peer)); - l_ptr = kzalloc(sizeof(*l_ptr), GFP_ATOMIC); - if (!l_ptr) { - pr_warn("Link creation failed, no memory\n"); - return NULL; - } - l_ptr->addr = peer; - if_name = strchr(b_ptr->name, ':') + 1; - sprintf(l_ptr->name, "%u.%u.%u:%s-%u.%u.%u:unknown", - tipc_zone(tn->own_addr), tipc_cluster(tn->own_addr), - tipc_node(tn->own_addr), - if_name, - tipc_zone(peer), tipc_cluster(peer), tipc_node(peer)); - /* note: peer i/f name is updated by reset/activate message */ - memcpy(&l_ptr->media_addr, media_addr, sizeof(*media_addr)); - l_ptr->owner = n_ptr; - l_ptr->peer_session = WILDCARD_SESSION; - l_ptr->bearer_id = b_ptr->identity; - l_ptr->tolerance = b_ptr->tolerance; - l_ptr->snd_nxt = 1; - l_ptr->rcv_nxt = 1; - l_ptr->state = LINK_RESET; - - l_ptr->pmsg = (struct tipc_msg *)&l_ptr->proto_msg; - msg = l_ptr->pmsg; - tipc_msg_init(tn->own_addr, msg, LINK_PROTOCOL, RESET_MSG, INT_H_SIZE, - l_ptr->addr); - msg_set_size(msg, sizeof(l_ptr->proto_msg)); - msg_set_session(msg, (tn->random & 0xffff)); - msg_set_bearer_id(msg, b_ptr->identity); - strcpy((char *)msg_data(msg), if_name); - l_ptr->net_plane = b_ptr->net_plane; - l_ptr->advertised_mtu = b_ptr->mtu; - l_ptr->mtu = l_ptr->advertised_mtu; - l_ptr->priority = b_ptr->priority; - tipc_link_set_queue_limits(l_ptr, b_ptr->window); - l_ptr->snd_nxt = 1; - __skb_queue_head_init(&l_ptr->transmq); - __skb_queue_head_init(&l_ptr->backlogq); - __skb_queue_head_init(&l_ptr->deferdq); - skb_queue_head_init(&l_ptr->wakeupq); - l_ptr->inputq = inputq; - l_ptr->namedq = namedq; - skb_queue_head_init(l_ptr->inputq); - link_reset_statistics(l_ptr); - tipc_node_attach_link(n_ptr, l_ptr); - return l_ptr; + l->addr = peer; + l->media_addr = maddr; + l->owner = n; + l->peer_session = WILDCARD_SESSION; + l->bearer_id = b->identity; + l->tolerance = b->tolerance; + l->net_plane = b->net_plane; + l->advertised_mtu = b->mtu; + l->mtu = b->mtu; + l->priority = b->priority; + tipc_link_set_queue_limits(l, b->window); + l->inputq = inputq; + l->namedq = namedq; + l->state = LINK_RESETTING; + l->pmsg = (struct tipc_msg *)&l->proto_msg; + hdr = l->pmsg; + tipc_msg_init(ownnode, hdr, LINK_PROTOCOL, RESET_MSG, INT_H_SIZE, peer); + msg_set_size(hdr, sizeof(l->proto_msg)); + msg_set_session(hdr, session); + msg_set_bearer_id(hdr, l->bearer_id); + strcpy((char *)msg_data(hdr), if_name); + __skb_queue_head_init(&l->transmq); + __skb_queue_head_init(&l->backlogq); + __skb_queue_head_init(&l->deferdq); + skb_queue_head_init(&l->wakeupq); + skb_queue_head_init(l->inputq); + return true; } /* tipc_link_build_bcast_sync_msg() - synchronize broadcast link endpoints. @@ -643,7 +627,7 @@ int __tipc_link_xmit(struct net *net, struct tipc_link *link, u16 ack = mod(link->rcv_nxt - 1); u16 seqno = link->snd_nxt; u16 bc_last_in = link->owner->bclink.last_in; - struct tipc_media_addr *addr = &link->media_addr; + struct tipc_media_addr *addr = link->media_addr; struct sk_buff_head *transmq = &link->transmq; struct sk_buff_head *backlogq = &link->backlogq; struct sk_buff *skb, *bskb; @@ -809,7 +793,7 @@ void tipc_link_push_packets(struct tipc_link *link) link->rcv_unacked = 0; __skb_queue_tail(&link->transmq, skb); tipc_bearer_send(link->owner->net, link->bearer_id, - skb, &link->media_addr); + skb, link->media_addr); } link->snd_nxt = seqno; } @@ -912,7 +896,7 @@ void tipc_link_retransmit(struct tipc_link *l_ptr, struct sk_buff *skb, msg_set_ack(msg, mod(l_ptr->rcv_nxt - 1)); msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); tipc_bearer_send(l_ptr->owner->net, l_ptr->bearer_id, skb, - &l_ptr->media_addr); + l_ptr->media_addr); retransmits--; l_ptr->stats.retransmitted++; } @@ -1200,7 +1184,7 @@ void tipc_link_proto_xmit(struct tipc_link *l, u32 msg_typ, int probe_msg, skb = __skb_dequeue(&xmitq); if (!skb) return; - tipc_bearer_send(l->owner->net, l->bearer_id, skb, &l->media_addr); + tipc_bearer_send(l->owner->net, l->bearer_id, skb, l->media_addr); l->rcv_unacked = 0; kfree_skb(skb); } diff --git a/net/tipc/link.h b/net/tipc/link.h index 39b8c4c5121e..39ff8b6919a4 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -148,7 +148,7 @@ struct tipc_stats { struct tipc_link { u32 addr; char name[TIPC_MAX_LINK_NAME]; - struct tipc_media_addr media_addr; + struct tipc_media_addr *media_addr; struct tipc_node *owner; /* Management and link supervision data */ @@ -205,13 +205,10 @@ struct tipc_link { struct tipc_stats stats; }; -struct tipc_port; - -struct tipc_link *tipc_link_create(struct tipc_node *n, - struct tipc_bearer *b, - const struct tipc_media_addr *maddr, - struct sk_buff_head *inputq, - struct sk_buff_head *namedq); +bool tipc_link_create(struct tipc_node *n, struct tipc_bearer *b, u32 session, + u32 ownnode, u32 peer, struct tipc_media_addr *maddr, + struct sk_buff_head *inputq, struct sk_buff_head *namedq, + struct tipc_link **link); void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, int mtyp, struct sk_buff_head *xmitq); void tipc_link_build_bcast_sync_msg(struct tipc_link *l, @@ -246,13 +243,8 @@ int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info); int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info); int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info); int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]); -void link_prepare_wakeup(struct tipc_link *l); int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq); int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *xmitq); -static inline u32 link_own_addr(struct tipc_link *l) -{ - return msg_prevnode(l->pmsg); -} #endif diff --git a/net/tipc/node.c b/net/tipc/node.c index 9e9b0938bd17..7c191641b44f 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -320,10 +320,6 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, if (!nl || !tipc_link_is_up(nl)) return; - if (n->working_links > 1) { - pr_warn("Attempt to establish 3rd link to %x\n", n->addr); - return; - } n->working_links++; n->action_flags |= TIPC_NOTIFY_LINK_UP; n->link_id = nl->peer_bearer_id << 16 | bearer_id; @@ -470,13 +466,13 @@ void tipc_node_check_dest(struct net *net, u32 onode, { struct tipc_node *n; struct tipc_link *l; - struct tipc_media_addr *curr_maddr; - struct sk_buff_head *inputq; + struct tipc_link_entry *le; bool addr_match = false; bool sign_match = false; bool link_up = false; bool accept_addr = false; bool reset = true; + *dupl_addr = false; *respond = false; @@ -486,13 +482,12 @@ void tipc_node_check_dest(struct net *net, u32 onode, tipc_node_lock(n); - curr_maddr = &n->links[b->identity].maddr; - inputq = &n->links[b->identity].inputq; + le = &n->links[b->identity]; /* Prepare to validate requesting node's signature and media address */ - l = n->links[b->identity].link; + l = le->link; link_up = l && tipc_link_is_up(l); - addr_match = l && !memcmp(curr_maddr, maddr, sizeof(*maddr)); + addr_match = l && !memcmp(&le->maddr, maddr, sizeof(*maddr)); sign_match = (signature == n->signature); /* These three flags give us eight permutations: */ @@ -559,18 +554,25 @@ void tipc_node_check_dest(struct net *net, u32 onode, /* Now create new link if not already existing */ if (!l) { - l = tipc_link_create(n, b, maddr, inputq, &n->bclink.namedq); - if (!l) { + if (n->link_cnt == 2) { + pr_warn("Cannot establish 3rd link to %x\n", n->addr); + goto exit; + } + if (!tipc_link_create(n, b, mod(tipc_net(net)->random), + tipc_own_addr(net), onode, &le->maddr, + &le->inputq, &n->bclink.namedq, &l)) { *respond = false; goto exit; } + tipc_link_reset(l); + le->link = l; + n->link_cnt++; tipc_node_calculate_timer(n, l); if (n->link_cnt == 1) if (!mod_timer(&n->timer, jiffies + n->keepalive_intv)) tipc_node_get(n); } - memcpy(&l->media_addr, maddr, sizeof(*maddr)); - memcpy(curr_maddr, maddr, sizeof(*maddr)); + memcpy(&le->maddr, maddr, sizeof(*maddr)); exit: tipc_node_unlock(n); if (reset) @@ -603,24 +605,6 @@ static void tipc_node_reset_links(struct tipc_node *n) } } -void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr) -{ - n_ptr->links[l_ptr->bearer_id].link = l_ptr; - n_ptr->link_cnt++; -} - -void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr) -{ - int i; - - for (i = 0; i < MAX_BEARERS; i++) { - if (l_ptr != n_ptr->links[i].link) - continue; - n_ptr->links[i].link = NULL; - n_ptr->link_cnt--; - } -} - /* tipc_node_fsm_evt - node finite state machine * Determines when contact is allowed with peer node */ -- cgit v1.2.3 From d3aa45ce6b94c65b83971257317867db13e5f492 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Jul 2015 15:36:57 -0700 Subject: bpf: add helpers to access tunnel metadata Introduce helpers to let eBPF programs attached to TC manipulate tunnel metadata: bpf_skb_[gs]et_tunnel_key(skb, key, size, flags) skb: pointer to skb key: pointer to 'struct bpf_tunnel_key' size: size of 'struct bpf_tunnel_key' flags: room for future extensions First eBPF program that uses these helpers will allocate per_cpu metadata_dst structures that will be used on TX. On RX metadata_dst is allocated by tunnel driver. Typical usage for TX: struct bpf_tunnel_key tkey; ... populate tkey ... bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), 0); bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); RX: struct bpf_tunnel_key tkey = {}; bpf_skb_get_tunnel_key(skb, &tkey, sizeof(tkey), 0); ... lookup or redirect based on tkey ... 'struct bpf_tunnel_key' will be extended in the future by adding elements to the end and the 'size' argument will indicate which fields are populated, thereby keeping backwards compatibility. The 'flags' argument may be used as well when the 'size' is not enough or to indicate completely different layout of bpf_tunnel_key. Signed-off-by: Alexei Starovoitov Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 1 + include/uapi/linux/bpf.h | 17 ++++++++++ net/core/dst.c | 35 +++++++++++++++++---- net/core/filter.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 124 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 7b0306894663..075f523ff23f 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -51,5 +51,6 @@ static inline bool skb_valid_dst(const struct sk_buff *skb) } struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags); +struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags); #endif /* __NET_DST_METADATA_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2f6c83d714e9..bc0d27d3fbdd 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -258,6 +258,18 @@ enum bpf_func_id { BPF_FUNC_get_cgroup_classid, BPF_FUNC_skb_vlan_push, /* bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) */ BPF_FUNC_skb_vlan_pop, /* bpf_skb_vlan_pop(skb) */ + + /** + * bpf_skb_[gs]et_tunnel_key(skb, key, size, flags) + * retrieve or populate tunnel metadata + * @skb: pointer to skb + * @key: pointer to 'struct bpf_tunnel_key' + * @size: size of 'struct bpf_tunnel_key' + * @flags: room for future extensions + * Retrun: 0 on success + */ + BPF_FUNC_skb_get_tunnel_key, + BPF_FUNC_skb_set_tunnel_key, __BPF_FUNC_MAX_ID, }; @@ -280,4 +292,9 @@ struct __sk_buff { __u32 cb[5]; }; +struct bpf_tunnel_key { + __u32 tunnel_id; + __u32 remote_ipv4; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/net/core/dst.c b/net/core/dst.c index 76a617f6d60a..f8694d1b8702 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -362,15 +362,10 @@ static int dst_md_discard(struct sk_buff *skb) return 0; } -struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) +static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen) { - struct metadata_dst *md_dst; struct dst_entry *dst; - md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); - if (!md_dst) - return ERR_PTR(-ENOMEM); - dst = &md_dst->dst; dst_init(dst, &md_dst_ops, NULL, 1, DST_OBSOLETE_NONE, DST_METADATA | DST_NOCACHE | DST_NOCOUNT); @@ -380,11 +375,39 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); md_dst->opts_len = optslen; +} + +struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) +{ + struct metadata_dst *md_dst; + + md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); + if (!md_dst) + return NULL; + + __metadata_dst_init(md_dst, optslen); return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc); +struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags) +{ + int cpu; + struct metadata_dst __percpu *md_dst; + + md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen, + __alignof__(struct metadata_dst), flags); + if (!md_dst) + return NULL; + + for_each_possible_cpu(cpu) + __metadata_dst_init(per_cpu_ptr(md_dst, cpu), optslen); + + return md_dst; +} +EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); + /* Dirty hack. We did it in 2.2 (in __dst_free), * we have _very_ good reasons not to repeat * this mistake in 2.3, but we have no choice diff --git a/net/core/filter.c b/net/core/filter.c index 786722a9c6f2..1b72264ff2ee 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -48,6 +48,7 @@ #include #include #include +#include /** * sk_filter - run a packet through a socket filter @@ -1483,6 +1484,78 @@ bool bpf_helper_changes_skb_data(void *func) return false; } +static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; + struct ip_tunnel_info *info = skb_tunnel_info(skb, AF_INET); + + if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info)) + return -EINVAL; + + to->tunnel_id = be64_to_cpu(info->key.tun_id); + to->remote_ipv4 = be32_to_cpu(info->key.ipv4_src); + + return 0; +} + +const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { + .func = bpf_skb_get_tunnel_key, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +static struct metadata_dst __percpu *md_dst; + +static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2; + struct metadata_dst *md = this_cpu_ptr(md_dst); + struct ip_tunnel_info *info; + + if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags)) + return -EINVAL; + + skb_dst_drop(skb); + dst_hold((struct dst_entry *) md); + skb_dst_set(skb, (struct dst_entry *) md); + + info = &md->u.tun_info; + info->mode = IP_TUNNEL_INFO_TX; + info->key.tun_id = cpu_to_be64(from->tunnel_id); + info->key.ipv4_dst = cpu_to_be32(from->remote_ipv4); + + return 0; +} + +const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { + .func = bpf_skb_set_tunnel_key, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_STACK, + .arg3_type = ARG_CONST_STACK_SIZE, + .arg4_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void) +{ + if (!md_dst) { + /* race is not possible, since it's called from + * verifier that is holding verifier mutex + */ + md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL); + if (!md_dst) + return NULL; + } + return &bpf_skb_set_tunnel_key_proto; +} + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -1526,6 +1599,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_vlan_push_proto; case BPF_FUNC_skb_vlan_pop: return &bpf_skb_vlan_pop_proto; + case BPF_FUNC_skb_get_tunnel_key: + return &bpf_skb_get_tunnel_key_proto; + case BPF_FUNC_skb_set_tunnel_key: + return bpf_get_skb_set_tunnel_key_proto(); default: return sk_filter_func_proto(func_id); } -- cgit v1.2.3 From 343d60aada5a358ca186d6e9e353230379c426d8 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 30 Jul 2015 13:34:53 -0700 Subject: ipv6: change ipv6_stub_impl.ipv6_dst_lookup to take net argument This patch adds net argument to ipv6_stub_impl.ipv6_dst_lookup for use cases where sk is not available (like mpls). sk appears to be needed to get the namespace 'net' and is optional otherwise. This patch series changes ipv6_stub_impl.ipv6_dst_lookup to take net argument. sk remains optional. All callers of ipv6_stub_impl.ipv6_dst_lookup have been modified to pass net. I have modified them to use already available 'net' in the scope of the call. I can change them to sock_net(sk) to avoid any unintended change in behaviour if sock namespace is different. They dont seem to be from code inspection. Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 2 +- include/net/addrconf.h | 4 ++-- include/net/ipv6.h | 3 ++- net/ipv6/addrconf_core.c | 11 ++++++++++- net/ipv6/icmp.c | 6 +++--- net/ipv6/ip6_output.c | 12 ++++++------ net/tipc/udp_media.c | 3 ++- 7 files changed, 26 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 81f0f24b2cfb..beed5d4025a3 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2034,7 +2034,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, fl6.flowi6_mark = skb->mark; fl6.flowi6_proto = IPPROTO_UDP; - if (ipv6_stub->ipv6_dst_lookup(sk, &ndst, &fl6)) { + if (ipv6_stub->ipv6_dst_lookup(vxlan->net, sk, &ndst, &fl6)) { netdev_dbg(dev, "no route to %pI6\n", &dst->sin6.sin6_addr); dev->stats.tx_carrier_errors++; diff --git a/include/net/addrconf.h b/include/net/addrconf.h index def59d3a34d5..0c3ac5acb85f 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -158,8 +158,8 @@ struct ipv6_stub { const struct in6_addr *addr); int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex, const struct in6_addr *addr); - int (*ipv6_dst_lookup)(struct sock *sk, struct dst_entry **dst, - struct flowi6 *fl6); + int (*ipv6_dst_lookup)(struct net *net, struct sock *sk, + struct dst_entry **dst, struct flowi6 *fl6); void (*udpv6_encap_enable)(void); void (*ndisc_send_na)(struct net_device *dev, struct neighbour *neigh, const struct in6_addr *daddr, diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 7c79798bcaab..eecdfc92f807 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -813,7 +813,8 @@ static inline struct sk_buff *ip6_finish_skb(struct sock *sk) &inet6_sk(sk)->cork); } -int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); +int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, + struct flowi6 *fl6); struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst); struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, diff --git a/net/ipv6/addrconf_core.c b/net/ipv6/addrconf_core.c index ca09bf49ac68..bfa941fc1165 100644 --- a/net/ipv6/addrconf_core.c +++ b/net/ipv6/addrconf_core.c @@ -107,7 +107,16 @@ int inet6addr_notifier_call_chain(unsigned long val, void *v) } EXPORT_SYMBOL(inet6addr_notifier_call_chain); -const struct ipv6_stub *ipv6_stub __read_mostly; +static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1, + struct dst_entry **u2, + struct flowi6 *u3) +{ + return -EAFNOSUPPORT; +} + +const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) { + .ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup, +}; EXPORT_SYMBOL_GPL(ipv6_stub); /* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 713d7434c911..6c2b2132c8d3 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -329,7 +329,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net, struct flowi6 fl2; int err; - err = ip6_dst_lookup(sk, &dst, fl6); + err = ip6_dst_lookup(net, sk, &dst, fl6); if (err) return ERR_PTR(err); @@ -361,7 +361,7 @@ static struct dst_entry *icmpv6_route_lookup(struct net *net, if (err) goto relookup_failed; - err = ip6_dst_lookup(sk, &dst2, &fl2); + err = ip6_dst_lookup(net, sk, &dst2, &fl2); if (err) goto relookup_failed; @@ -591,7 +591,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) else if (!fl6.flowi6_oif) fl6.flowi6_oif = np->ucast_oif; - err = ip6_dst_lookup(sk, &dst, &fl6); + err = ip6_dst_lookup(net, sk, &dst, &fl6); if (err) goto out; dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index c5fc85286ef6..92b7cf0dc1f9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -881,10 +881,9 @@ out: return dst; } -static int ip6_dst_lookup_tail(struct sock *sk, +static int ip6_dst_lookup_tail(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6) { - struct net *net = sock_net(sk); #ifdef CONFIG_IPV6_OPTIMISTIC_DAD struct neighbour *n; struct rt6_info *rt; @@ -994,10 +993,11 @@ out_err_release: * * It returns zero on success, or a standard errno code on error. */ -int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6) +int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, + struct flowi6 *fl6) { *dst = NULL; - return ip6_dst_lookup_tail(sk, dst, fl6); + return ip6_dst_lookup_tail(net, sk, dst, fl6); } EXPORT_SYMBOL_GPL(ip6_dst_lookup); @@ -1018,7 +1018,7 @@ struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, struct dst_entry *dst = NULL; int err; - err = ip6_dst_lookup_tail(sk, &dst, fl6); + err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6); if (err) return ERR_PTR(err); if (final_dst) @@ -1052,7 +1052,7 @@ struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, dst = ip6_sk_dst_check(sk, dst, fl6); - err = ip6_dst_lookup_tail(sk, &dst, fl6); + err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6); if (err) return ERR_PTR(err); if (final_dst) diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 66deebc66aa1..c170d3138953 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -194,7 +194,8 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb, .saddr = src->ipv6, .flowi6_proto = IPPROTO_UDP }; - err = ipv6_stub->ipv6_dst_lookup(ub->ubsock->sk, &ndst, &fl6); + err = ipv6_stub->ipv6_dst_lookup(net, ub->ubsock->sk, &ndst, + &fl6); if (err) goto tx_error; ttl = ip6_dst_hoplimit(ndst); -- cgit v1.2.3 From bf21563acc1de2391d21ac2141c3471aa4815c1a Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 30 Jul 2015 13:34:54 -0700 Subject: af_mpls: fix undefined reference to ip6_route_output Undefined reference to ip6_route_output and ip_route_output was reported with CONFIG_INET=n and CONFIG_IPV6=n. This patch uses ipv6_stub_impl.ipv6_dst_lookup instead of ip6_route_output. And wraps affected code under IS_ENABLED(CONFIG_INET) and IS_ENABLED(CONFIG_IPV6). Reported-by: kbuild test robot Reported-by: Thomas Graf Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 49f1b0e41bdf..88cfaa241c07 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -15,7 +15,10 @@ #include #include #include -#include +#if IS_ENABLED(CONFIG_IPV6) +#include +#include +#endif #include "internal.h" #define LABEL_NOT_SPECIFIED (1<<20) @@ -331,6 +334,7 @@ static unsigned find_free_label(struct net *net) return LABEL_NOT_SPECIFIED; } +#if IS_ENABLED(CONFIG_INET) static struct net_device *inet_fib_lookup_dev(struct net *net, void *addr) { struct net_device *dev = NULL; @@ -347,30 +351,49 @@ static struct net_device *inet_fib_lookup_dev(struct net *net, void *addr) ip_rt_put(rt); -errout: return dev; +errout: + return ERR_PTR(-ENODEV); +} +#else +static struct net_device *inet_fib_lookup_dev(struct net *net, void *addr) +{ + return ERR_PTR(-EAFNOSUPPORT); } +#endif +#if IS_ENABLED(CONFIG_IPV6) static struct net_device *inet6_fib_lookup_dev(struct net *net, void *addr) { struct net_device *dev = NULL; struct dst_entry *dst; struct flowi6 fl6; + int err; + + if (!ipv6_stub) + return ERR_PTR(-EAFNOSUPPORT); memset(&fl6, 0, sizeof(fl6)); memcpy(&fl6.daddr, addr, sizeof(struct in6_addr)); - dst = ip6_route_output(net, NULL, &fl6); - if (dst->error) + err = ipv6_stub->ipv6_dst_lookup(net, NULL, &dst, &fl6); + if (err) goto errout; dev = dst->dev; dev_hold(dev); - -errout: dst_release(dst); return dev; + +errout: + return ERR_PTR(err); } +#else +static struct net_device *inet6_fib_lookup_dev(struct net *net, void *addr) +{ + return ERR_PTR(-EAFNOSUPPORT); +} +#endif static struct net_device *find_outdev(struct net *net, struct mpls_route_config *cfg) @@ -425,10 +448,12 @@ static int mpls_route_add(struct mpls_route_config *cfg) if (cfg->rc_output_labels > MAX_NEW_LABELS) goto errout; - err = -ENODEV; dev = find_outdev(net, cfg); - if (!dev) + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + dev = NULL; goto errout; + } /* Ensure this is a supported device */ err = -EINVAL; -- cgit v1.2.3 From 85b1d8bbfd2ea6ae1dc6d0ccdb25abdcb52a5856 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 30 Jul 2015 23:54:28 -0700 Subject: br2684: Remove unnecessary formatting macros b1 and bs Use vsprintf extension %pI4 instead. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/atm/br2684.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/atm/br2684.c b/net/atm/br2684.c index cc78538d163b..aa0047c5c467 100644 --- a/net/atm/br2684.c +++ b/net/atm/br2684.c @@ -802,13 +802,10 @@ static int br2684_seq_show(struct seq_file *seq, void *v) (brdev->payload == p_bridged) ? "bridged" : "routed", brvcc->copies_failed, brvcc->copies_needed); #ifdef CONFIG_ATM_BR2684_IPFILTER -#define b1(var, byte) ((u8 *) &brvcc->filter.var)[byte] -#define bs(var) b1(var, 0), b1(var, 1), b1(var, 2), b1(var, 3) if (brvcc->filter.netmask != 0) - seq_printf(seq, " filter=%d.%d.%d.%d/" - "%d.%d.%d.%d\n", bs(prefix), bs(netmask)); -#undef bs -#undef b1 + seq_printf(seq, " filter=%pI4/%pI4\n", + &brvcc->filter.prefix, + &brvcc->filter.netmask); #endif /* CONFIG_ATM_BR2684_IPFILTER */ } return 0; -- cgit v1.2.3 From 4ed70ce9f01c998999e48642a768d9013bee2c4f Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 31 Jul 2015 11:42:56 -0700 Subject: net: dsa: Refactor transmit path to eliminate duplication All tagging protocols do the same thing: increment device statistics, make room for the tag to be inserted, create the tag, invoke the parent network device transmit function. In order to prepare for adding netpoll support, which requires the tag creation, but not using the parent network device transmit function, do some little refactoring which eliminates duplication between the 4 tagging protocols supported. We need to return a sk_buff pointer back to the caller because the tag specific transmit function may have to reallocate the original skb (e.g: tag_trailer.c) and this is the one we should be transmitting, not the original sk_buff we were passed. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 4 ++-- net/dsa/slave.c | 27 +++++++++++++++++++-------- net/dsa/tag_brcm.c | 15 +++------------ net/dsa/tag_dsa.c | 12 +++--------- net/dsa/tag_edsa.c | 12 +++--------- net/dsa/tag_trailer.c | 12 +++--------- 6 files changed, 33 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index d5f1f9b862ea..eeade901e67a 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -15,7 +15,7 @@ #include struct dsa_device_ops { - netdev_tx_t (*xmit)(struct sk_buff *skb, struct net_device *dev); + struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); int (*rcv)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); }; @@ -26,7 +26,7 @@ struct dsa_slave_priv { * switch port. */ struct net_device *dev; - netdev_tx_t (*xmit)(struct sk_buff *skb, + struct sk_buff * (*xmit)(struct sk_buff *skb, struct net_device *dev); /* diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 0917123790ea..5fc87ee53905 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -421,21 +421,32 @@ static int dsa_slave_port_attr_get(struct net_device *dev, static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); + struct sk_buff *nskb; - return p->xmit(skb, dev); -} + dev->stats.tx_packets++; + dev->stats.tx_bytes += skb->len; -static netdev_tx_t dsa_slave_notag_xmit(struct sk_buff *skb, - struct net_device *dev) -{ - struct dsa_slave_priv *p = netdev_priv(dev); + /* Transmit function may have to reallocate the original SKB */ + nskb = p->xmit(skb, dev); + if (!nskb) + return NETDEV_TX_OK; - skb->dev = p->parent->dst->master_netdev; - dev_queue_xmit(skb); + /* Queue the SKB for transmission on the parent interface, but + * do not modify its EtherType + */ + nskb->dev = p->parent->dst->master_netdev; + dev_queue_xmit(nskb); return NETDEV_TX_OK; } +static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + /* Just return the original SKB */ + return skb; +} + /* ethtool operations *******************************************************/ static int diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 83d3572cdb20..e2aadb73111d 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -58,14 +58,11 @@ #define BRCM_EG_TC_MASK 0x7 #define BRCM_EG_PID_MASK 0x1f -static netdev_tx_t brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); u8 *brcm_tag; - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; - if (skb_cow_head(skb, BRCM_TAG_LEN) < 0) goto out_free; @@ -87,17 +84,11 @@ static netdev_tx_t brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev) brcm_tag[2] = BRCM_IG_DSTMAP2_MASK; brcm_tag[3] = (1 << p->port) & BRCM_IG_DSTMAP1_MASK; - /* Queue the SKB for transmission on the parent interface, but - * do not modify its EtherType - */ - skb->dev = p->parent->dst->master_netdev; - dev_queue_xmit(skb); - - return NETDEV_TX_OK; + return skb; out_free: kfree_skb(skb); - return NETDEV_TX_OK; + return NULL; } static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 2dab27063273..aa780e4ac0bd 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -15,14 +15,11 @@ #define DSA_HLEN 4 -static netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); u8 *dsa_header; - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; - /* * Convert the outermost 802.1q tag to a DSA tag for tagged * packets, or insert a DSA tag between the addresses and @@ -63,14 +60,11 @@ static netdev_tx_t dsa_xmit(struct sk_buff *skb, struct net_device *dev) dsa_header[3] = 0x00; } - skb->dev = p->parent->dst->master_netdev; - dev_queue_xmit(skb); - - return NETDEV_TX_OK; + return skb; out_free: kfree_skb(skb); - return NETDEV_TX_OK; + return NULL; } static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 9aeda596f7ec..2288c8098c42 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -16,14 +16,11 @@ #define DSA_HLEN 4 #define EDSA_HLEN 8 -static netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); u8 *edsa_header; - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; - /* * Convert the outermost 802.1q tag to a DSA tag and prepend * a DSA ethertype field is the packet is tagged, or insert @@ -76,14 +73,11 @@ static netdev_tx_t edsa_xmit(struct sk_buff *skb, struct net_device *dev) edsa_header[7] = 0x00; } - skb->dev = p->parent->dst->master_netdev; - dev_queue_xmit(skb); - - return NETDEV_TX_OK; + return skb; out_free: kfree_skb(skb); - return NETDEV_TX_OK; + return NULL; } static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index e268f9db8893..d25efc93d8f1 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -13,16 +13,13 @@ #include #include "dsa_priv.h" -static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); struct sk_buff *nskb; int padlen; u8 *trailer; - dev->stats.tx_packets++; - dev->stats.tx_bytes += skb->len; - /* * We have to make sure that the trailer ends up as the very * last 4 bytes of the packet. This means that we have to pad @@ -36,7 +33,7 @@ static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev) nskb = alloc_skb(NET_IP_ALIGN + skb->len + padlen + 4, GFP_ATOMIC); if (nskb == NULL) { kfree_skb(skb); - return NETDEV_TX_OK; + return NULL; } skb_reserve(nskb, NET_IP_ALIGN); @@ -57,10 +54,7 @@ static netdev_tx_t trailer_xmit(struct sk_buff *skb, struct net_device *dev) trailer[2] = 0x10; trailer[3] = 0x00; - nskb->dev = p->parent->dst->master_netdev; - dev_queue_xmit(nskb); - - return NETDEV_TX_OK; + return nskb; } static int trailer_rcv(struct sk_buff *skb, struct net_device *dev, -- cgit v1.2.3 From 04ff53f96a931751a70c2bb3926770900b5fbebe Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 31 Jul 2015 11:42:57 -0700 Subject: net: dsa: Add netconsole support Add support for using DSA slave network devices with netconsole, which requires us to allocate and free custom netpoll instances and invoke the parent network device poll controller callback. In order for netconsole to work, we need to construct the DSA tag, but not queue the skb for transmission on the master network device xmit function. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa_priv.h | 4 ++++ net/dsa/slave.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) (limited to 'net') diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index eeade901e67a..311796c809af 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -13,6 +13,7 @@ #include #include +#include struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); @@ -47,6 +48,9 @@ struct dsa_slave_priv { int old_duplex; struct net_device *bridge_dev; +#ifdef CONFIG_NET_POLL_CONTROLLER + struct netpoll *netpoll; +#endif }; /* dsa.c */ diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 5fc87ee53905..0010c690cc67 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "dsa_priv.h" /* slave mii_bus handling ***************************************************/ @@ -418,6 +419,18 @@ static int dsa_slave_port_attr_get(struct net_device *dev, return 0; } +static inline netdev_tx_t dsa_netpoll_send_skb(struct dsa_slave_priv *p, + struct sk_buff *skb) +{ +#ifdef CONFIG_NET_POLL_CONTROLLER + if (p->netpoll) + netpoll_send_skb(p->netpoll, skb); +#else + BUG(); +#endif + return NETDEV_TX_OK; +} + static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -431,6 +444,12 @@ static netdev_tx_t dsa_slave_xmit(struct sk_buff *skb, struct net_device *dev) if (!nskb) return NETDEV_TX_OK; + /* SKB for netpoll still need to be mangled with the protocol-specific + * tag to be successfully transmitted + */ + if (unlikely(netpoll_tx_running(dev))) + return dsa_netpoll_send_skb(p, nskb); + /* Queue the SKB for transmission on the parent interface, but * do not modify its EtherType */ @@ -676,6 +695,49 @@ static int dsa_slave_get_eee(struct net_device *dev, struct ethtool_eee *e) return ret; } +#ifdef CONFIG_NET_POLL_CONTROLLER +static int dsa_slave_netpoll_setup(struct net_device *dev, + struct netpoll_info *ni) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + struct net_device *master = ds->dst->master_netdev; + struct netpoll *netpoll; + int err = 0; + + netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL); + if (!netpoll) + return -ENOMEM; + + err = __netpoll_setup(netpoll, master); + if (err) { + kfree(netpoll); + goto out; + } + + p->netpoll = netpoll; +out: + return err; +} + +static void dsa_slave_netpoll_cleanup(struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct netpoll *netpoll = p->netpoll; + + if (!netpoll) + return; + + p->netpoll = NULL; + + __netpoll_free_async(netpoll); +} + +static void dsa_slave_poll_controller(struct net_device *dev) +{ +} +#endif + static const struct ethtool_ops dsa_slave_ethtool_ops = { .get_settings = dsa_slave_get_settings, .set_settings = dsa_slave_set_settings, @@ -708,6 +770,11 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_fdb_dump = dsa_slave_fdb_dump, .ndo_do_ioctl = dsa_slave_ioctl, .ndo_get_iflink = dsa_slave_get_iflink, +#ifdef CONFIG_NET_POLL_CONTROLLER + .ndo_netpoll_setup = dsa_slave_netpoll_setup, + .ndo_netpoll_cleanup = dsa_slave_netpoll_cleanup, + .ndo_poll_controller = dsa_slave_poll_controller, +#endif }; static const struct switchdev_ops dsa_slave_switchdev_ops = { -- cgit v1.2.3 From f70ea018da0631e10c26a02f5a82d626ffef5bd7 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 31 Jul 2015 16:52:10 -0700 Subject: net: Add functions to get skb->hash based on flow structures Add skb_get_hash_flowi6 and skb_get_hash_flowi4 which derive an sk_buff hash from flowi6 and flowi4 structures respectively. These functions can be called when creating a packet in the output path where the new sk_buff does not yet contain a fully formed packet that is parsable by flow dissector. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 21 +++++++++++++++++ net/core/flow_dissector.c | 58 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 75 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 648a2c241993..b7c1286e247d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -37,6 +37,7 @@ #include #include #include +#include /* A. Checksumming of received packets by device. * @@ -945,6 +946,26 @@ static inline __u32 skb_get_hash(struct sk_buff *skb) return skb->hash; } +__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6); + +static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) +{ + if (!skb->l4_hash && !skb->sw_hash) + __skb_get_hash_flowi6(skb, fl6); + + return skb->hash; +} + +__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl); + +static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4) +{ + if (!skb->l4_hash && !skb->sw_hash) + __skb_get_hash_flowi4(skb, fl4); + + return skb->hash; +} + __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb); static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 2a834c6179b9..11e6540fa386 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -590,6 +590,15 @@ void make_flow_keys_digest(struct flow_keys_digest *digest, } EXPORT_SYMBOL(make_flow_keys_digest); +static inline void __skb_set_sw_hash(struct sk_buff *skb, u32 hash, + struct flow_keys *keys) +{ + if (keys->ports.ports) + skb->l4_hash = 1; + skb->sw_hash = 1; + skb->hash = hash; +} + /** * __skb_get_hash: calculate a flow hash * @skb: sk_buff to calculate flow hash from @@ -609,10 +618,8 @@ void __skb_get_hash(struct sk_buff *skb) hash = ___skb_get_hash(skb, &keys, hashrnd); if (!hash) return; - if (keys.ports.ports) - skb->l4_hash = 1; - skb->sw_hash = 1; - skb->hash = hash; + + __skb_set_sw_hash(skb, hash, &keys); } EXPORT_SYMBOL(__skb_get_hash); @@ -624,6 +631,49 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) } EXPORT_SYMBOL(skb_get_hash_perturb); +__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) +{ + struct flow_keys keys; + + memset(&keys, 0, sizeof(keys)); + + memcpy(&keys.addrs.v6addrs.src, &fl6->saddr, + sizeof(keys.addrs.v6addrs.src)); + memcpy(&keys.addrs.v6addrs.dst, &fl6->daddr, + sizeof(keys.addrs.v6addrs.dst)); + keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + keys.ports.src = fl6->fl6_sport; + keys.ports.dst = fl6->fl6_dport; + keys.keyid.keyid = fl6->fl6_gre_key; + keys.tags.flow_label = (__force u32)fl6->flowlabel; + keys.basic.ip_proto = fl6->flowi6_proto; + + __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys); + + return skb->hash; +} +EXPORT_SYMBOL(__skb_get_hash_flowi6); + +__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4) +{ + struct flow_keys keys; + + memset(&keys, 0, sizeof(keys)); + + keys.addrs.v4addrs.src = fl4->saddr; + keys.addrs.v4addrs.dst = fl4->daddr; + keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + keys.ports.src = fl4->fl4_sport; + keys.ports.dst = fl4->fl4_dport; + keys.keyid.keyid = fl4->fl4_gre_key; + keys.basic.ip_proto = fl4->flowi4_proto; + + __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys); + + return skb->hash; +} +EXPORT_SYMBOL(__skb_get_hash_flowi4); + u32 __skb_get_poff(const struct sk_buff *skb, void *data, const struct flow_keys *keys, int hlen) { -- cgit v1.2.3 From 67800f9b1f4eb5bbefc32e3f5044097354bc85b3 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 31 Jul 2015 16:52:11 -0700 Subject: ipv6: Call skb_get_hash_flowi6 to get skb->hash in ip6_make_flowlabel We can't call skb_get_hash here since the packet is not complete to do flow_dissector. Create hash based on flowi6 instead. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/ipv6.h | 5 +++-- net/ipv6/ip6_gre.c | 5 +++-- net/ipv6/ip6_output.c | 4 ++-- net/ipv6/ip6_tunnel.c | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index eecdfc92f807..3e334b33ef3a 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -708,12 +708,13 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, #if IS_ENABLED(CONFIG_IPV6) static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, - __be32 flowlabel, bool autolabel) + __be32 flowlabel, bool autolabel, + struct flowi6 *fl6) { if (!flowlabel && (autolabel || net->ipv6.sysctl.auto_flowlabels)) { u32 hash; - hash = skb_get_hash(skb); + hash = skb_get_hash_flowi6(skb, fl6); /* Since this is being sent on the wire obfuscate hash a bit * to minimize possbility that any useful information to an diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index a38d3ac0f18f..a7d1ca2337a9 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -728,7 +728,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, */ ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), - ip6_make_flowlabel(net, skb, fl6->flowlabel, false)); + ip6_make_flowlabel(net, skb, fl6->flowlabel, false, fl6)); ipv6h->hop_limit = tunnel->parms.hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; @@ -1182,7 +1182,8 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, ip6_flow_hdr(ipv6h, 0, ip6_make_flowlabel(dev_net(dev), skb, - t->fl.u.ip6.flowlabel, false)); + t->fl.u.ip6.flowlabel, false, + &t->fl.u.ip6)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; ipv6h->saddr = t->parms.laddr; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 92b7cf0dc1f9..26ea47930740 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -207,7 +207,7 @@ int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, hlimit = ip6_dst_hoplimit(dst); ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - np->autoflowlabel)); + np->autoflowlabel, fl6)); hdr->payload_len = htons(seg_len); hdr->nexthdr = proto; @@ -1649,7 +1649,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, ip6_flow_hdr(hdr, v6_cork->tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, - np->autoflowlabel)); + np->autoflowlabel, fl6)); hdr->hop_limit = v6_cork->hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 2e67b660118b..54e694c4af0e 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1095,7 +1095,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), - ip6_make_flowlabel(net, skb, fl6->flowlabel, false)); + ip6_make_flowlabel(net, skb, fl6->flowlabel, false, fl6)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; -- cgit v1.2.3 From 42240901f7c438636715b9cb6ed93f4441ffc091 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 31 Jul 2015 16:52:12 -0700 Subject: ipv6: Implement different admin modes for automatic flow labels Change the meaning of net.ipv6.auto_flowlabels to provide a mode for automatic flow labels generation. There are four modes: 0: flow labels are disabled 1: flow labels are enabled, sockets can opt-out 2: flow labels are allowed, sockets can opt-in 3: flow labels are enabled and enforced, no opt-out for sockets np->autoflowlabel is initialized according to the sysctl value. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 20 ++++++++---- include/net/ipv6.h | 59 ++++++++++++++++++++++++++-------- net/ipv6/af_inet6.c | 3 +- net/ipv6/ip6_gre.c | 4 +-- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/sysctl_net_ipv6.c | 7 +++- 6 files changed, 70 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 00d26d919459..9ac3af3ab739 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1215,14 +1215,20 @@ flowlabel_consistency - BOOLEAN FALSE: disabled Default: TRUE -auto_flowlabels - BOOLEAN - Automatically generate flow labels based based on a flow hash - of the packet. This allows intermediate devices, such as routers, - to idenfify packet flows for mechanisms like Equal Cost Multipath +auto_flowlabels - INTEGER + Automatically generate flow labels based on a flow hash of the + packet. This allows intermediate devices, such as routers, to + identify packet flows for mechanisms like Equal Cost Multipath Routing (see RFC 6438). - TRUE: enabled - FALSE: disabled - Default: false + 0: automatic flow labels are completely disabled + 1: automatic flow labels are enabled by default, they can be + disabled on a per socket basis using the IPV6_AUTOFLOWLABEL + socket option + 2: automatic flow labels are allowed, they may be enabled on a + per socket basis using the IPV6_AUTOFLOWLABEL socket option + 3: automatic flow labels are enabled and enforced, they cannot + be disabled by the socket option + Default: 0 flowlabel_state_ranges - BOOLEAN Split the flow label number space into two ranges. 0-0x7FFFF is diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 3e334b33ef3a..c02c1c03363a 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -707,36 +707,69 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, } #if IS_ENABLED(CONFIG_IPV6) + +/* Sysctl settings for net ipv6.auto_flowlabels */ +#define IP6_AUTO_FLOW_LABEL_OFF 0 +#define IP6_AUTO_FLOW_LABEL_OPTOUT 1 +#define IP6_AUTO_FLOW_LABEL_OPTIN 2 +#define IP6_AUTO_FLOW_LABEL_FORCED 3 + +#define IP6_AUTO_FLOW_LABEL_MAX IP6_AUTO_FLOW_LABEL_FORCED + +#define IP6_DEFAULT_AUTO_FLOW_LABELS IP6_AUTO_FLOW_LABEL_OFF + static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel, struct flowi6 *fl6) { - if (!flowlabel && (autolabel || net->ipv6.sysctl.auto_flowlabels)) { - u32 hash; + u32 hash; - hash = skb_get_hash_flowi6(skb, fl6); + if (flowlabel || + net->ipv6.sysctl.auto_flowlabels == IP6_AUTO_FLOW_LABEL_OFF || + (!autolabel && + net->ipv6.sysctl.auto_flowlabels != IP6_AUTO_FLOW_LABEL_FORCED)) + return flowlabel; - /* Since this is being sent on the wire obfuscate hash a bit - * to minimize possbility that any useful information to an - * attacker is leaked. Only lower 20 bits are relevant. - */ - hash ^= hash >> 12; + hash = skb_get_hash_flowi6(skb, fl6); - flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; + /* Since this is being sent on the wire obfuscate hash a bit + * to minimize possbility that any useful information to an + * attacker is leaked. Only lower 20 bits are relevant. + */ + rol32(hash, 16); - if (net->ipv6.sysctl.flowlabel_state_ranges) - flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG; - } + flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; + + if (net->ipv6.sysctl.flowlabel_state_ranges) + flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG; return flowlabel; } + +static inline int ip6_default_np_autolabel(struct net *net) +{ + switch (net->ipv6.sysctl.auto_flowlabels) { + case IP6_AUTO_FLOW_LABEL_OFF: + case IP6_AUTO_FLOW_LABEL_OPTIN: + default: + return 0; + case IP6_AUTO_FLOW_LABEL_OPTOUT: + case IP6_AUTO_FLOW_LABEL_FORCED: + return 1; + } +} #else static inline void ip6_set_txhash(struct sock *sk) { } static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, - __be32 flowlabel, bool autolabel) + __be32 flowlabel, bool autolabel, + struct flowi6 *fl6) { return flowlabel; } +static inline int ip6_default_np_autolabel(struct net *net) +{ + return 0; +} #endif diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 7bc92ea4ae8f..3f0ae3a7c0b1 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -197,6 +197,7 @@ lookup_protocol: np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; + np->autoflowlabel = ip6_default_np_autolabel(sock_net(sk)); sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; /* Init the ipv4 part of the socket since we can have sockets @@ -767,7 +768,7 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.bindv6only = 0; net->ipv6.sysctl.icmpv6_time = 1*HZ; net->ipv6.sysctl.flowlabel_consistency = 1; - net->ipv6.sysctl.auto_flowlabels = 0; + net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS; net->ipv6.sysctl.idgen_retries = 3; net->ipv6.sysctl.idgen_delay = 1 * HZ; net->ipv6.sysctl.flowlabel_state_ranges = 1; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index a7d1ca2337a9..34f121812a14 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -728,7 +728,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, */ ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), - ip6_make_flowlabel(net, skb, fl6->flowlabel, false, fl6)); + ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); ipv6h->hop_limit = tunnel->parms.hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; @@ -1182,7 +1182,7 @@ static int ip6gre_header(struct sk_buff *skb, struct net_device *dev, ip6_flow_hdr(ipv6h, 0, ip6_make_flowlabel(dev_net(dev), skb, - t->fl.u.ip6.flowlabel, false, + t->fl.u.ip6.flowlabel, true, &t->fl.u.ip6)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = NEXTHDR_GRE; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 54e694c4af0e..b0ab420612bc 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1095,7 +1095,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, skb_reset_network_header(skb); ipv6h = ipv6_hdr(skb); ip6_flow_hdr(ipv6h, INET_ECN_encapsulate(0, dsfield), - ip6_make_flowlabel(net, skb, fl6->flowlabel, false, fl6)); + ip6_make_flowlabel(net, skb, fl6->flowlabel, true, fl6)); ipv6h->hop_limit = t->parms.hop_limit; ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index db48aebd9c47..45243bbe5253 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -17,6 +17,9 @@ #include static int one = 1; +static int auto_flowlabels_min; +static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; + static struct ctl_table ipv6_table_template[] = { { @@ -45,7 +48,9 @@ static struct ctl_table ipv6_table_template[] = { .data = &init_net.ipv6.sysctl.auto_flowlabels, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dointvec_minmax, + .extra1 = &auto_flowlabels_min, + .extra2 = &auto_flowlabels_max }, { .procname = "fwmark_reflect", -- cgit v1.2.3 From be26849bfb6fcee4123c687fc39bd6da1b3be328 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Fri, 31 Jul 2015 16:52:13 -0700 Subject: ipv6: Disable flowlabel state ranges by default Per RFC6437 stateful flow labels (e.g. labels set by flow label manager) cannot "disturb" nodes taking part in stateless flow labels. While the ranges only reduce the flow label entropy by one bit, it is conceivable that this might bias the algorithm on some routers causing a load imbalance. For best results on the Internet we really need the full 20 bits. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 3f0ae3a7c0b1..44bb66bde0e2 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -771,7 +771,7 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.auto_flowlabels = IP6_DEFAULT_AUTO_FLOW_LABELS; net->ipv6.sysctl.idgen_retries = 3; net->ipv6.sysctl.idgen_delay = 1 * HZ; - net->ipv6.sysctl.flowlabel_state_ranges = 1; + net->ipv6.sysctl.flowlabel_state_ranges = 0; atomic_set(&net->ipv6.fib6_sernum, 1); err = ipv6_init_mibs(net); -- cgit v1.2.3 From ba7591d8b28bd16a2eface5d009ab0b60c7629a4 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 1 Aug 2015 00:46:29 +0200 Subject: ebpf: add skb->hash to offset map for usage in {cls, act}_bpf or filters Add skb->hash to the __sk_buff offset map, so it can be accessed from an eBPF program. We currently already do this for classic BPF filters, but not yet on eBPF, it might be useful as a demuxer in combination with helpers like bpf_clone_redirect(), toy example: __section("cls-lb") int ingress_main(struct __sk_buff *skb) { unsigned int which = 3 + (skb->hash & 7); /* bpf_skb_store_bytes(skb, ...); */ /* bpf_l{3,4}_csum_replace(skb, ...); */ bpf_clone_redirect(skb, which, 0); return -1; } I was thinking whether to add skb_get_hash(), but then concluded the raw skb->hash seems fine in this case: we can directly access the hash w/o extra eBPF helper function call, it's filled out by many NICs on ingress, and in case the entropy level would not be sufficient, people can still implement their own specific sw fallback hash mix anyway. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 7 +++++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bc0d27d3fbdd..2ce13c109b00 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -290,6 +290,7 @@ struct __sk_buff { __u32 ifindex; __u32 tc_index; __u32 cb[5]; + __u32 hash; }; struct bpf_tunnel_key { diff --git a/net/core/filter.c b/net/core/filter.c index 1b72264ff2ee..a50dbfa83ad9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1711,6 +1711,13 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, offsetof(struct net_device, ifindex)); break; + case offsetof(struct __sk_buff, hash): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, hash)); + break; + case offsetof(struct __sk_buff, mark): BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); -- cgit v1.2.3 From 6678053092e808485f4d56dca8705098436495e9 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Fri, 31 Jul 2015 15:03:26 +0900 Subject: bridge: Don't segment multiple tagged packets on bridge device Bridge devices don't need to segment multiple tagged packets since thier ports can segment them. Signed-off-by: Toshiaki Makita Signed-off-by: David S. Miller --- net/bridge/br_device.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 4ff77a16956c..0aa8f5cf46a1 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -339,6 +339,7 @@ static const struct net_device_ops br_netdev_ops = { .ndo_bridge_getlink = br_getlink, .ndo_bridge_setlink = br_setlink, .ndo_bridge_dellink = br_dellink, + .ndo_features_check = passthru_features_check, }; static void br_dev_free(struct net_device *dev) -- cgit v1.2.3 From e44deb2f0cce9183ca94d14effd4170a35eec31d Mon Sep 17 00:00:00 2001 From: Satish Ashok Date: Mon, 3 Aug 2015 13:29:16 +0200 Subject: bridge: mdb: add/del entry on all vlans if vlan_filter is enabled and vid is 0 Before this patch when a vid was not specified, the entry was added with vid 0 which is useless when vlan_filtering is enabled. This patch makes the entry to be added on all configured vlans when vlan filtering is enabled and respectively deleted from all, if the entry vid is 0. This is also closer to the way fdb works with regard to vid 0 and vlan filtering. Example: Setup: $ bridge vlan add vid 256 dev eth4 $ bridge vlan add vid 1024 dev eth4 $ bridge vlan add vid 64 dev eth3 $ bridge vlan add vid 128 dev eth3 $ bridge vlan port vlan ids eth3 1 PVID Egress Untagged 64 128 eth4 1 PVID Egress Untagged 256 1024 $ echo 1 > /sys/class/net/br0/bridge/vlan_filtering Before: $ bridge mdb add dev br0 port eth3 grp 239.0.0.1 $ bridge mdb dev br0 port eth3 grp 239.0.0.1 temp After: $ bridge mdb add dev br0 port eth3 grp 239.0.0.1 $ bridge mdb dev br0 port eth3 grp 239.0.0.1 temp vid 1 dev br0 port eth3 grp 239.0.0.1 temp vid 128 dev br0 port eth3 grp 239.0.0.1 temp vid 64 Signed-off-by: Satish Ashok Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_mdb.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 6a592856da1c..1df3ef4a73b9 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -464,8 +464,11 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); + unsigned short vid = VLAN_N_VID; + struct net_device *dev, *pdev; struct br_mdb_entry *entry; - struct net_device *dev; + struct net_bridge_port *p; + struct net_port_vlans *pv; struct net_bridge *br; int err; @@ -475,9 +478,32 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) br = netdev_priv(dev); - err = __br_mdb_add(net, br, entry); - if (!err) - __br_mdb_notify(dev, entry, RTM_NEWMDB); + /* If vlan filtering is enabled and VLAN is not specified + * install mdb entry on all vlans configured on the port. + */ + pdev = __dev_get_by_index(net, entry->ifindex); + if (!pdev) + return -ENODEV; + + p = br_port_get_rtnl(pdev); + if (!p || p->br != br || p->state == BR_STATE_DISABLED) + return -EINVAL; + + pv = nbp_get_vlan_info(p); + if (br->vlan_enabled && pv && entry->vid == 0) { + for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { + entry->vid = vid; + err = __br_mdb_add(net, br, entry); + if (err) + break; + __br_mdb_notify(dev, entry, RTM_NEWMDB); + } + } else { + err = __br_mdb_add(net, br, entry); + if (!err) + __br_mdb_notify(dev, entry, RTM_NEWMDB); + } + return err; } @@ -539,8 +565,12 @@ unlock: static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) { - struct net_device *dev; + struct net *net = sock_net(skb->sk); + unsigned short vid = VLAN_N_VID; + struct net_device *dev, *pdev; struct br_mdb_entry *entry; + struct net_bridge_port *p; + struct net_port_vlans *pv; struct net_bridge *br; int err; @@ -550,9 +580,31 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) br = netdev_priv(dev); - err = __br_mdb_del(br, entry); - if (!err) - __br_mdb_notify(dev, entry, RTM_DELMDB); + /* If vlan filtering is enabled and VLAN is not specified + * delete mdb entry on all vlans configured on the port. + */ + pdev = __dev_get_by_index(net, entry->ifindex); + if (!pdev) + return -ENODEV; + + p = br_port_get_rtnl(pdev); + if (!p || p->br != br || p->state == BR_STATE_DISABLED) + return -EINVAL; + + pv = nbp_get_vlan_info(p); + if (br->vlan_enabled && pv && entry->vid == 0) { + for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { + entry->vid = vid; + err = __br_mdb_del(br, entry); + if (!err) + __br_mdb_notify(dev, entry, RTM_DELMDB); + } + } else { + err = __br_mdb_del(br, entry); + if (!err) + __br_mdb_notify(dev, entry, RTM_DELMDB); + } + return err; } -- cgit v1.2.3 From a5c90b29e5ccdb90922b808fd4831cfbaa63006c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 3 Aug 2015 16:21:57 +0200 Subject: act_bpf: properly support late binding of bpf action to a classifier Since the introduction of the BPF action in d23b8ad8ab23 ("tc: add BPF based action"), late binding was not working as expected. I.e. setting the action part for a classifier only via 'bpf index ', where is the index of an existing action, is being rejected by the kernel due to other missing parameters. It doesn't make sense to require these parameters such as BPF opcodes etc, as they are not going to be used anyway: in this case, they're just allocated/parsed and then freed again w/o doing anything meaningful. Instead, parse and verify the remaining parameters *after* the test on tcf_hash_check(), when we really know that we're dealing with creation of a new action or replacement of an existing one and where late binding is thus irrelevant. After patch, test case is now working: FOO="1,6 0 0 4294967295," tc actions add action bpf bytecode "$FOO" tc filter add dev foo parent 1: bpf bytecode "$FOO" flowid 1:1 action bpf index 1 tc actions show action bpf action order 0: bpf bytecode '1,6 0 0 4294967295' default-action pipe index 1 ref 2 bind 1 tc filter show dev foo filter protocol all pref 49152 bpf filter protocol all pref 49152 bpf handle 0x1 flowid 1:1 bytecode '1,6 0 0 4294967295' action order 1: bpf bytecode '1,6 0 0 4294967295' default-action pipe index 1 ref 2 bind 1 Late binding of a BPF action can be useful for preloading maps (e.g. before they hit traffic) in case of eBPF programs, or to share a single eBPF action with multiple classifiers. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/sched/act_bpf.c | 51 +++++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index aaae8e83bf18..1b97dabc621a 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -278,7 +278,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct tc_act_bpf *parm; struct tcf_bpf *prog; bool is_bpf, is_ebpf; - int ret; + int ret, res = 0; if (!nla) return -EINVAL; @@ -287,41 +287,43 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (ret < 0) return ret; - is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; - is_ebpf = tb[TCA_ACT_BPF_FD]; - - if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) || - !tb[TCA_ACT_BPF_PARMS]) + if (!tb[TCA_ACT_BPF_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_ACT_BPF_PARMS]); - memset(&cfg, 0, sizeof(cfg)); - - ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) : - tcf_bpf_init_from_efd(tb, &cfg); - if (ret < 0) - return ret; - if (!tcf_hash_check(parm->index, act, bind)) { ret = tcf_hash_create(parm->index, est, act, sizeof(*prog), bind, false); if (ret < 0) - goto destroy_fp; + return ret; - ret = ACT_P_CREATED; + res = ACT_P_CREATED; } else { /* Don't override defaults. */ if (bind) - goto destroy_fp; + return 0; tcf_hash_release(act, bind); - if (!replace) { - ret = -EEXIST; - goto destroy_fp; - } + if (!replace) + return -EEXIST; } + is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; + is_ebpf = tb[TCA_ACT_BPF_FD]; + + if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) { + ret = -EINVAL; + goto out; + } + + memset(&cfg, 0, sizeof(cfg)); + + ret = is_bpf ? tcf_bpf_init_from_ops(tb, &cfg) : + tcf_bpf_init_from_efd(tb, &cfg); + if (ret < 0) + goto out; + prog = to_bpf(act); spin_lock_bh(&prog->tcf_lock); @@ -341,15 +343,16 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, spin_unlock_bh(&prog->tcf_lock); - if (ret == ACT_P_CREATED) + if (res == ACT_P_CREATED) tcf_hash_insert(act); else tcf_bpf_cfg_cleanup(&old); - return ret; + return res; +out: + if (res == ACT_P_CREATED) + tcf_hash_cleanup(act, est); -destroy_fp: - tcf_bpf_cfg_cleanup(&cfg); return ret; } -- cgit v1.2.3 From 58da018053531b9cb91423a64f2a762ef0fe7456 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 4 Aug 2015 01:19:58 +0200 Subject: bridge: mdb: fix vlan_enabled access when vlans are not configured Instead of trying to access br->vlan_enabled directly use the provided helper br_vlan_enabled(). Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_mdb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 1df3ef4a73b9..d747275fad18 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -490,7 +490,7 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; pv = nbp_get_vlan_info(p); - if (br->vlan_enabled && pv && entry->vid == 0) { + if (br_vlan_enabled(br) && pv && entry->vid == 0) { for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { entry->vid = vid; err = __br_mdb_add(net, br, entry); @@ -592,7 +592,7 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; pv = nbp_get_vlan_info(p); - if (br->vlan_enabled && pv && entry->vid == 0) { + if (br_vlan_enabled(br) && pv && entry->vid == 0) { for_each_set_bit(vid, pv->vlan_bitmap, VLAN_N_VID) { entry->vid = vid; err = __br_mdb_del(br, entry); -- cgit v1.2.3 From abf7c1c540f8330fead5d50730d92606dcbe7a7e Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Mon, 3 Aug 2015 17:39:20 +0100 Subject: lwtunnel: set skb protocol and dev In the locally-generated packet path skb->protocol may not be set and this is required for the lwtunnel encap in order to get the lwtstate. This would otherwise have been set by ip_output or ip6_output so set skb->protocol prior to calling the lwtunnel encap function. Additionally set skb->dev in case it is needed further down the transmit path. Signed-off-by: Robert Shearman Signed-off-by: David S. Miller --- net/core/lwtunnel.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index c240c895b319..5d6d8e3d450a 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -215,8 +215,12 @@ int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); struct lwtunnel_state *lwtstate = NULL; - if (rt) + if (rt) { lwtstate = rt->rt6i_lwtstate; + skb->dev = rt->dst.dev; + } + + skb->protocol = htons(ETH_P_IPV6); return __lwtunnel_output(sk, skb, lwtstate); } @@ -227,8 +231,12 @@ int lwtunnel_output(struct sock *sk, struct sk_buff *skb) struct rtable *rt = (struct rtable *)skb_dst(skb); struct lwtunnel_state *lwtstate = NULL; - if (rt) + if (rt) { lwtstate = rt->rt_lwtstate; + skb->dev = rt->dst.dev; + } + + skb->protocol = htons(ETH_P_IP); return __lwtunnel_output(sk, skb, lwtstate); } -- cgit v1.2.3 From 0335f5b500adcc28bc3fd5d8a1e4482c348cff4a Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Mon, 3 Aug 2015 17:39:21 +0100 Subject: ipv4: apply lwtunnel encap for locally-generated packets lwtunnel encap is applied for forwarded packets, but not for locally-generated packets. This is because the output function is not overridden in __mkroute_output, unlike it is in __mkroute_input. The lwtunnel state is correctly set on the rth through the call to rt_set_nexthop, so all that needs to be done is to override the dst output function to be lwtunnel_output if there is lwtunnel state present and it requires output redirection. Signed-off-by: Robert Shearman Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 908f7ef2f12a..18fd7c9095c7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2022,6 +2022,8 @@ add: } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); + if (lwtunnel_output_redirect(rth->rt_lwtstate)) + rth->dst.output = lwtunnel_output; return rth; } -- cgit v1.2.3 From a6affd24f439feddec04bab4d1e3ad6579868367 Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Mon, 3 Aug 2015 17:50:04 +0100 Subject: mpls: Use definition for reserved label checks In multiple locations there are checks for whether the label in hand is a reserved label or not using the arbritray value of 16. Factor this out into a #define for better maintainability and for documentation. Signed-off-by: Robert Shearman Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/uapi/linux/mpls.h | 2 ++ net/mpls/af_mpls.c | 21 +++++++++++---------- 2 files changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/mpls.h b/include/uapi/linux/mpls.h index 139d4dd1cab8..24a6cb1aec86 100644 --- a/include/uapi/linux/mpls.h +++ b/include/uapi/linux/mpls.h @@ -41,4 +41,6 @@ struct mpls_label { #define MPLS_LABEL_OAMALERT 14 /* RFC3429 */ #define MPLS_LABEL_EXTENSION 15 /* RFC7274 */ +#define MPLS_LABEL_FIRST_UNRESERVED 16 /* RFC3032 */ + #endif /* _UAPI_MPLS_H */ diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 88cfaa241c07..b6b9a6c4e784 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -293,7 +293,7 @@ static void mpls_notify_route(struct net *net, unsigned index, struct mpls_route *rt = new ? new : old; unsigned nlm_flags = (old && new) ? NLM_F_REPLACE : 0; /* Ignore reserved labels for now */ - if (rt && (index >= 16)) + if (rt && (index >= MPLS_LABEL_FIRST_UNRESERVED)) rtmsg_lfib(event, index, rt, nlh, net, portid, nlm_flags); } @@ -327,7 +327,8 @@ static unsigned find_free_label(struct net *net) platform_label = rtnl_dereference(net->mpls.platform_label); platform_labels = net->mpls.platform_labels; - for (index = 16; index < platform_labels; index++) { + for (index = MPLS_LABEL_FIRST_UNRESERVED; index < platform_labels; + index++) { if (!rtnl_dereference(platform_label[index])) return index; } @@ -436,8 +437,8 @@ static int mpls_route_add(struct mpls_route_config *cfg) index = find_free_label(net); } - /* The first 16 labels are reserved, and may not be set */ - if (index < 16) + /* Reserved labels may not be set */ + if (index < MPLS_LABEL_FIRST_UNRESERVED) goto errout; /* The full 20 bit range may not be supported. */ @@ -516,8 +517,8 @@ static int mpls_route_del(struct mpls_route_config *cfg) index = cfg->rc_label; - /* The first 16 labels are reserved, and may not be removed */ - if (index < 16) + /* Reserved labels may not be removed */ + if (index < MPLS_LABEL_FIRST_UNRESERVED) goto errout; /* The full 20 bit range may not be supported */ @@ -835,8 +836,8 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, &cfg->rc_label)) goto errout; - /* The first 16 labels are reserved, and may not be set */ - if (cfg->rc_label < 16) + /* Reserved labels may not be set */ + if (cfg->rc_label < MPLS_LABEL_FIRST_UNRESERVED) goto errout; break; @@ -961,8 +962,8 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) ASSERT_RTNL(); index = cb->args[0]; - if (index < 16) - index = 16; + if (index < MPLS_LABEL_FIRST_UNRESERVED) + index = MPLS_LABEL_FIRST_UNRESERVED; platform_label = rtnl_dereference(net->mpls.platform_label); platform_labels = net->mpls.platform_labels; -- cgit v1.2.3 From a6cd379b4d68867295ea35a719008e86d7a2ee9f Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Thu, 30 Jul 2015 16:53:45 +0000 Subject: netfilter: ip6t_REJECT: Remove debug messages from reject_tg6() Make it similar to reject_tg() in ipt_REJECT. Suggested-by: Pablo Neira Ayuso Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6t_REJECT.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 12331efd49cf..567367a75172 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -35,14 +35,12 @@ MODULE_AUTHOR("Yasuyuki KOZAKAI "); MODULE_DESCRIPTION("Xtables: packet \"rejection\" target for IPv6"); MODULE_LICENSE("GPL"); - static unsigned int reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct ip6t_reject_info *reject = par->targinfo; struct net *net = dev_net((par->in != NULL) ? par->in : par->out); - pr_debug("%s: medium point\n", __func__); switch (reject->with) { case IP6T_ICMP6_NO_ROUTE: nf_send_unreach6(net, skb, ICMPV6_NOROUTE, par->hooknum); @@ -65,9 +63,6 @@ reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) case IP6T_TCP_RESET: nf_send_reset6(net, skb, par->hooknum); break; - default: - net_info_ratelimited("case %u not handled yet\n", reject->with); - break; } return NF_DROP; -- cgit v1.2.3 From 5a9348b54d396c0b8bb877abcb107293034a87de Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 4 Aug 2015 10:44:22 +0300 Subject: mpls: small cleanup in inet/inet6_fib_lookup_dev() We recently changed this code from returning NULL to returning ERR_PTR. There are some left over NULL assignments which we can remove. We can preserve the error code from ip_route_output() instead of always returning -ENODEV. Also these functions use a mix of gotos and direct returns. There is no cleanup necessary so I changed the gotos to direct returns. Signed-off-by: Dan Carpenter Acked-by: Roopa Prabhu Acked-by: Robert Shearman Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index b6b9a6c4e784..d93c0301ad23 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -338,14 +338,14 @@ static unsigned find_free_label(struct net *net) #if IS_ENABLED(CONFIG_INET) static struct net_device *inet_fib_lookup_dev(struct net *net, void *addr) { - struct net_device *dev = NULL; + struct net_device *dev; struct rtable *rt; struct in_addr daddr; memcpy(&daddr, addr, sizeof(struct in_addr)); rt = ip_route_output(net, daddr.s_addr, 0, 0, 0); if (IS_ERR(rt)) - goto errout; + return ERR_CAST(rt); dev = rt->dst.dev; dev_hold(dev); @@ -353,8 +353,6 @@ static struct net_device *inet_fib_lookup_dev(struct net *net, void *addr) ip_rt_put(rt); return dev; -errout: - return ERR_PTR(-ENODEV); } #else static struct net_device *inet_fib_lookup_dev(struct net *net, void *addr) @@ -366,7 +364,7 @@ static struct net_device *inet_fib_lookup_dev(struct net *net, void *addr) #if IS_ENABLED(CONFIG_IPV6) static struct net_device *inet6_fib_lookup_dev(struct net *net, void *addr) { - struct net_device *dev = NULL; + struct net_device *dev; struct dst_entry *dst; struct flowi6 fl6; int err; @@ -378,16 +376,13 @@ static struct net_device *inet6_fib_lookup_dev(struct net *net, void *addr) memcpy(&fl6.daddr, addr, sizeof(struct in6_addr)); err = ipv6_stub->ipv6_dst_lookup(net, NULL, &dst, &fl6); if (err) - goto errout; + return ERR_PTR(err); dev = dst->dev; dev_hold(dev); dst_release(dst); return dev; - -errout: - return ERR_PTR(err); } #else static struct net_device *inet6_fib_lookup_dev(struct net *net, void *addr) -- cgit v1.2.3 From 3dcb615e6841eb945a49f605743a6923ed591400 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Tue, 4 Aug 2015 06:36:24 -0700 Subject: af_mpls: add null dev check in find_outdev This patch adds null dev check for the 'cfg->rc_via_table == NEIGH_LINK_TABLE or dev_get_by_index() failed' case Reported-by: Dan Carpenter Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index d93c0301ad23..04306989d054 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -411,6 +411,9 @@ static struct net_device *find_outdev(struct net *net, dev = dev_get_by_index(net, cfg->rc_ifindex); } + if (!dev) + return ERR_PTR(-ENODEV); + return dev; } -- cgit v1.2.3 From 0c45e76960ded9da56bd66e603a7f0cae215f87e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 8 Jun 2015 14:42:40 +0200 Subject: netfilter: nft_counter: convert it to use per-cpu counters This patch converts the existing seqlock to per-cpu counters. Suggested-by: Eric Dumazet Suggested-by: Patrick McHardy Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_counter.c | 97 ++++++++++++++++++++++++++++++++------------- 1 file changed, 69 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 17591239229f..1067fb4c1ffa 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -18,39 +18,59 @@ #include struct nft_counter { - seqlock_t lock; u64 bytes; u64 packets; }; +struct nft_counter_percpu { + struct nft_counter counter; + struct u64_stats_sync syncp; +}; + +struct nft_counter_percpu_priv { + struct nft_counter_percpu __percpu *counter; +}; + static void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - struct nft_counter *priv = nft_expr_priv(expr); - - write_seqlock_bh(&priv->lock); - priv->bytes += pkt->skb->len; - priv->packets++; - write_sequnlock_bh(&priv->lock); + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + struct nft_counter_percpu *this_cpu; + + local_bh_disable(); + this_cpu = this_cpu_ptr(priv->counter); + u64_stats_update_begin(&this_cpu->syncp); + this_cpu->counter.bytes += pkt->skb->len; + this_cpu->counter.packets++; + u64_stats_update_end(&this_cpu->syncp); + local_bh_enable(); } static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) { - struct nft_counter *priv = nft_expr_priv(expr); + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + struct nft_counter_percpu *cpu_stats; + struct nft_counter total; + u64 bytes, packets; unsigned int seq; - u64 bytes; - u64 packets; - - do { - seq = read_seqbegin(&priv->lock); - bytes = priv->bytes; - packets = priv->packets; - } while (read_seqretry(&priv->lock, seq)); - - if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(bytes))) - goto nla_put_failure; - if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(packets))) + int cpu; + + memset(&total, 0, sizeof(total)); + for_each_possible_cpu(cpu) { + cpu_stats = per_cpu_ptr(priv->counter, cpu); + do { + seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + bytes = cpu_stats->counter.bytes; + packets = cpu_stats->counter.packets; + } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq)); + + total.packets += packets; + total.bytes += bytes; + } + + if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes)) || + nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets))) goto nla_put_failure; return 0; @@ -67,23 +87,44 @@ static int nft_counter_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { - struct nft_counter *priv = nft_expr_priv(expr); + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); + struct nft_counter_percpu __percpu *cpu_stats; + struct nft_counter_percpu *this_cpu; + + cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu); + if (cpu_stats == NULL) + return ENOMEM; + + preempt_disable(); + this_cpu = this_cpu_ptr(cpu_stats); + if (tb[NFTA_COUNTER_PACKETS]) { + this_cpu->counter.packets = + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); + } + if (tb[NFTA_COUNTER_BYTES]) { + this_cpu->counter.bytes = + be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); + } + preempt_enable(); + priv->counter = cpu_stats; + return 0; +} - if (tb[NFTA_COUNTER_PACKETS]) - priv->packets = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_PACKETS])); - if (tb[NFTA_COUNTER_BYTES]) - priv->bytes = be64_to_cpu(nla_get_be64(tb[NFTA_COUNTER_BYTES])); +static void nft_counter_destroy(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); - seqlock_init(&priv->lock); - return 0; + free_percpu(priv->counter); } static struct nft_expr_type nft_counter_type; static const struct nft_expr_ops nft_counter_ops = { .type = &nft_counter_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_counter)), + .size = NFT_EXPR_SIZE(sizeof(struct nft_counter_percpu_priv)), .eval = nft_counter_eval, .init = nft_counter_init, + .destroy = nft_counter_destroy, .dump = nft_counter_dump, }; -- cgit v1.2.3 From 24b7811fa5de7bbbab3b782a38889034ea267f70 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 1 Jul 2015 16:38:10 +0200 Subject: netfilter: xt_TEE: get rid of WITH_CONNTRACK definition Use IS_ENABLED(CONFIG_NF_CONNTRACK) instead. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_TEE.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index c5d6556dbc5e..0ed9fb61d470 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -24,10 +24,8 @@ #include #include #include - #if IS_ENABLED(CONFIG_NF_CONNTRACK) -# define WITH_CONNTRACK 1 -# include +#include #endif struct xt_tee_priv { @@ -99,7 +97,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) if (skb == NULL) return XT_CONTINUE; -#ifdef WITH_CONNTRACK +#if IS_ENABLED(CONFIG_NF_CONNTRACK) /* Avoid counting cloned packets towards the original connection. */ nf_conntrack_put(skb->nfct); skb->nfct = &nf_ct_untracked_get()->ct_general; @@ -175,7 +173,7 @@ tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) if (skb == NULL) return XT_CONTINUE; -#ifdef WITH_CONNTRACK +#if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_conntrack_put(skb->nfct); skb->nfct = &nf_ct_untracked_get()->ct_general; skb->nfctinfo = IP_CT_NEW; -- cgit v1.2.3 From bbde9fc1824aab58bc78c084163007dd6c03fe5b Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 31 May 2015 17:54:44 +0200 Subject: netfilter: factor out packet duplication for IPv4/IPv6 Extracted from the xtables TEE target. This creates two new modules for IPv4 and IPv6 that are shared between the TEE target and the new nf_tables dup expressions. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/ipv4/nf_dup_ipv4.h | 7 ++ include/net/netfilter/ipv6/nf_dup_ipv6.h | 7 ++ net/ipv4/netfilter/Kconfig | 6 ++ net/ipv4/netfilter/Makefile | 2 + net/ipv4/netfilter/nf_dup_ipv4.c | 120 +++++++++++++++++++++++ net/ipv6/netfilter/Kconfig | 6 ++ net/ipv6/netfilter/Makefile | 2 + net/ipv6/netfilter/nf_dup_ipv6.c | 96 +++++++++++++++++++ net/netfilter/Kconfig | 2 + net/netfilter/xt_TEE.c | 158 ++----------------------------- 10 files changed, 254 insertions(+), 152 deletions(-) create mode 100644 include/net/netfilter/ipv4/nf_dup_ipv4.h create mode 100644 include/net/netfilter/ipv6/nf_dup_ipv6.h create mode 100644 net/ipv4/netfilter/nf_dup_ipv4.c create mode 100644 net/ipv6/netfilter/nf_dup_ipv6.c (limited to 'net') diff --git a/include/net/netfilter/ipv4/nf_dup_ipv4.h b/include/net/netfilter/ipv4/nf_dup_ipv4.h new file mode 100644 index 000000000000..42008f10dfc4 --- /dev/null +++ b/include/net/netfilter/ipv4/nf_dup_ipv4.h @@ -0,0 +1,7 @@ +#ifndef _NF_DUP_IPV4_H_ +#define _NF_DUP_IPV4_H_ + +void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum, + const struct in_addr *gw, int oif); + +#endif /* _NF_DUP_IPV4_H_ */ diff --git a/include/net/netfilter/ipv6/nf_dup_ipv6.h b/include/net/netfilter/ipv6/nf_dup_ipv6.h new file mode 100644 index 000000000000..ed6bd66fa5a0 --- /dev/null +++ b/include/net/netfilter/ipv6/nf_dup_ipv6.h @@ -0,0 +1,7 @@ +#ifndef _NF_DUP_IPV6_H_ +#define _NF_DUP_IPV6_H_ + +void nf_dup_ipv6(struct sk_buff *skb, unsigned int hooknum, + const struct in6_addr *gw, int oif); + +#endif /* _NF_DUP_IPV6_H_ */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 2199a5db25e6..0142ea259d7d 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -67,6 +67,12 @@ config NF_TABLES_ARP endif # NF_TABLES +config NF_DUP_IPV4 + tristate "Netfilter IPv4 packet duplication to alternate destination" + help + This option enables the nf_dup_ipv4 core, which duplicates an IPv4 + packet to be rerouted to another destination. + config NF_LOG_ARP tristate "ARP packet logging" default m if NETFILTER_ADVANCED=n diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 7fe6c703528f..9136ffc2d474 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -70,3 +70,5 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o # just filtering instance of ARP tables for now obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o + +obj-$(CONFIG_NF_DUP_IPV4) += nf_dup_ipv4.o diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c new file mode 100644 index 000000000000..eff85ab3f47d --- /dev/null +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -0,0 +1,120 @@ +/* + * (C) 2007 by Sebastian Claßen + * (C) 2007-2010 by Jan Engelhardt + * + * Extracted from xt_TEE.c + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 or later, as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include +#endif + +static struct net *pick_net(struct sk_buff *skb) +{ +#ifdef CONFIG_NET_NS + const struct dst_entry *dst; + + if (skb->dev != NULL) + return dev_net(skb->dev); + dst = skb_dst(skb); + if (dst != NULL && dst->dev != NULL) + return dev_net(dst->dev); +#endif + return &init_net; +} + +static bool nf_dup_ipv4_route(struct sk_buff *skb, const struct in_addr *gw, + int oif) +{ + const struct iphdr *iph = ip_hdr(skb); + struct net *net = pick_net(skb); + struct rtable *rt; + struct flowi4 fl4; + + memset(&fl4, 0, sizeof(fl4)); + if (oif != -1) + fl4.flowi4_oif = oif; + + fl4.daddr = gw->s_addr; + fl4.flowi4_tos = RT_TOS(iph->tos); + fl4.flowi4_scope = RT_SCOPE_UNIVERSE; + fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH; + rt = ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + return false; + + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + skb->dev = rt->dst.dev; + skb->protocol = htons(ETH_P_IP); + + return true; +} + +void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum, + const struct in_addr *gw, int oif) +{ + struct iphdr *iph; + + if (__this_cpu_read(nf_skb_duplicated)) + return; + /* + * Copy the skb, and route the copy. Will later return %XT_CONTINUE for + * the original skb, which should continue on its way as if nothing has + * happened. The copy should be independently delivered to the gateway. + */ + skb = pskb_copy(skb, GFP_ATOMIC); + if (skb == NULL) + return; + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + /* Avoid counting cloned packets towards the original connection. */ + nf_conntrack_put(skb->nfct); + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); +#endif + /* + * If we are in PREROUTING/INPUT, the checksum must be recalculated + * since the length could have changed as a result of defragmentation. + * + * We also decrease the TTL to mitigate potential loops between two + * hosts. + * + * Set %IP_DF so that the original source is notified of a potentially + * decreased MTU on the clone route. IPv6 does this too. + */ + iph = ip_hdr(skb); + iph->frag_off |= htons(IP_DF); + if (hooknum == NF_INET_PRE_ROUTING || + hooknum == NF_INET_LOCAL_IN) + --iph->ttl; + ip_send_check(iph); + + if (nf_dup_ipv4_route(skb, gw, oif)) { + __this_cpu_write(nf_skb_duplicated, true); + ip_local_out(skb); + __this_cpu_write(nf_skb_duplicated, false); + } else { + kfree_skb(skb); + } +} +EXPORT_SYMBOL_GPL(nf_dup_ipv4); + +MODULE_AUTHOR("Sebastian Claßen "); +MODULE_AUTHOR("Jan Engelhardt "); +MODULE_DESCRIPTION("nf_dup_ipv4: Duplicate IPv4 packet"); +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index b552cf0d6198..298daf30d857 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -50,6 +50,12 @@ config NFT_REJECT_IPV6 endif # NF_TABLES_IPV6 endif # NF_TABLES +config NF_DUP_IPV6 + tristate "Netfilter IPv6 packet duplication to alternate destination" + help + This option enables the nf_dup_ipv6 core, which duplicates an IPv6 + packet to be rerouted to another destination. + config NF_REJECT_IPV6 tristate "IPv6 packet rejection" default m if NETFILTER_ADVANCED=n diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index c36e0a5490de..dc6c732f98ca 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -30,6 +30,8 @@ obj-$(CONFIG_NF_LOG_IPV6) += nf_log_ipv6.o # reject obj-$(CONFIG_NF_REJECT_IPV6) += nf_reject_ipv6.o +obj-$(CONFIG_NF_DUP_IPV6) += nf_dup_ipv6.o + # nf_tables obj-$(CONFIG_NF_TABLES_IPV6) += nf_tables_ipv6.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV6) += nft_chain_route_ipv6.o diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c new file mode 100644 index 000000000000..399fdda18447 --- /dev/null +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -0,0 +1,96 @@ +/* + * (C) 2007 by Sebastian Claßen + * (C) 2007-2010 by Jan Engelhardt + * + * Extracted from xt_TEE.c + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 or later, as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include +#endif + +static struct net *pick_net(struct sk_buff *skb) +{ +#ifdef CONFIG_NET_NS + const struct dst_entry *dst; + + if (skb->dev != NULL) + return dev_net(skb->dev); + dst = skb_dst(skb); + if (dst != NULL && dst->dev != NULL) + return dev_net(dst->dev); +#endif + return &init_net; +} + +static bool nf_dup_ipv6_route(struct sk_buff *skb, const struct in6_addr *gw, + int oif) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct net *net = pick_net(skb); + struct dst_entry *dst; + struct flowi6 fl6; + + memset(&fl6, 0, sizeof(fl6)); + if (oif != -1) + fl6.flowi6_oif = oif; + + fl6.daddr = *gw; + fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) | + (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]; + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + dst_release(dst); + return false; + } + skb_dst_drop(skb); + skb_dst_set(skb, dst); + skb->dev = dst->dev; + skb->protocol = htons(ETH_P_IPV6); + + return true; +} + +void nf_dup_ipv6(struct sk_buff *skb, unsigned int hooknum, + const struct in6_addr *gw, int oif) +{ + if (__this_cpu_read(nf_skb_duplicated)) + return; + skb = pskb_copy(skb, GFP_ATOMIC); + if (skb == NULL) + return; + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + nf_conntrack_put(skb->nfct); + skb->nfct = &nf_ct_untracked_get()->ct_general; + skb->nfctinfo = IP_CT_NEW; + nf_conntrack_get(skb->nfct); +#endif + if (hooknum == NF_INET_PRE_ROUTING || + hooknum == NF_INET_LOCAL_IN) { + struct ipv6hdr *iph = ipv6_hdr(skb); + --iph->hop_limit; + } + if (nf_dup_ipv6_route(skb, gw, oif)) { + __this_cpu_write(nf_skb_duplicated, true); + ip6_local_out(skb); + __this_cpu_write(nf_skb_duplicated, false); + } else { + kfree_skb(skb); + } +} +EXPORT_SYMBOL_GPL(nf_dup_ipv6); + +MODULE_AUTHOR("Sebastian Claßen "); +MODULE_AUTHOR("Jan Engelhardt "); +MODULE_DESCRIPTION("nf_dup_ipv6: IPv6 packet duplication"); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 6eae69a698ed..3e1b4abf1897 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -867,6 +867,8 @@ config NETFILTER_XT_TARGET_TEE depends on NETFILTER_ADVANCED depends on IPV6 || IPV6=n depends on !NF_CONNTRACK || NF_CONNTRACK + select NF_DUP_IPV4 + select NF_DUP_IPV6 if IP6_NF_IPTABLES ---help--- This option adds a "TEE" target with which a packet can be cloned and this clone be rerouted to another nexthop. diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 0ed9fb61d470..49fee6aa2c0a 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -10,23 +10,14 @@ * modify it under the terms of the GNU General Public License * version 2 or later, as published by the Free Software Foundation. */ -#include #include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include +#include #include +#include +#include +#include #include -#if IS_ENABLED(CONFIG_NF_CONNTRACK) -#include -#endif struct xt_tee_priv { struct notifier_block notifier; @@ -36,161 +27,24 @@ struct xt_tee_priv { static const union nf_inet_addr tee_zero_address; -static struct net *pick_net(struct sk_buff *skb) -{ -#ifdef CONFIG_NET_NS - const struct dst_entry *dst; - - if (skb->dev != NULL) - return dev_net(skb->dev); - dst = skb_dst(skb); - if (dst != NULL && dst->dev != NULL) - return dev_net(dst->dev); -#endif - return &init_net; -} - -static bool -tee_tg_route4(struct sk_buff *skb, const struct xt_tee_tginfo *info) -{ - const struct iphdr *iph = ip_hdr(skb); - struct net *net = pick_net(skb); - struct rtable *rt; - struct flowi4 fl4; - - memset(&fl4, 0, sizeof(fl4)); - if (info->priv) { - if (info->priv->oif == -1) - return false; - fl4.flowi4_oif = info->priv->oif; - } - fl4.daddr = info->gw.ip; - fl4.flowi4_tos = RT_TOS(iph->tos); - fl4.flowi4_scope = RT_SCOPE_UNIVERSE; - fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH; - rt = ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) - return false; - - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - skb->dev = rt->dst.dev; - skb->protocol = htons(ETH_P_IP); - return true; -} - static unsigned int tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; - struct iphdr *iph; - if (__this_cpu_read(nf_skb_duplicated)) - return XT_CONTINUE; - /* - * Copy the skb, and route the copy. Will later return %XT_CONTINUE for - * the original skb, which should continue on its way as if nothing has - * happened. The copy should be independently delivered to the TEE - * --gateway. - */ - skb = pskb_copy(skb, GFP_ATOMIC); - if (skb == NULL) - return XT_CONTINUE; - -#if IS_ENABLED(CONFIG_NF_CONNTRACK) - /* Avoid counting cloned packets towards the original connection. */ - nf_conntrack_put(skb->nfct); - skb->nfct = &nf_ct_untracked_get()->ct_general; - skb->nfctinfo = IP_CT_NEW; - nf_conntrack_get(skb->nfct); -#endif - /* - * If we are in PREROUTING/INPUT, the checksum must be recalculated - * since the length could have changed as a result of defragmentation. - * - * We also decrease the TTL to mitigate potential TEE loops - * between two hosts. - * - * Set %IP_DF so that the original source is notified of a potentially - * decreased MTU on the clone route. IPv6 does this too. - */ - iph = ip_hdr(skb); - iph->frag_off |= htons(IP_DF); - if (par->hooknum == NF_INET_PRE_ROUTING || - par->hooknum == NF_INET_LOCAL_IN) - --iph->ttl; - ip_send_check(iph); + nf_dup_ipv4(skb, par->hooknum, &info->gw.in, info->priv->oif); - if (tee_tg_route4(skb, info)) { - __this_cpu_write(nf_skb_duplicated, true); - ip_local_out(skb); - __this_cpu_write(nf_skb_duplicated, false); - } else { - kfree_skb(skb); - } return XT_CONTINUE; } #if IS_ENABLED(CONFIG_IPV6) -static bool -tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info) -{ - const struct ipv6hdr *iph = ipv6_hdr(skb); - struct net *net = pick_net(skb); - struct dst_entry *dst; - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - if (info->priv) { - if (info->priv->oif == -1) - return false; - fl6.flowi6_oif = info->priv->oif; - } - fl6.daddr = info->gw.in6; - fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) | - (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]; - fl6.flowi6_flags = FLOWI_FLAG_KNOWN_NH; - dst = ip6_route_output(net, NULL, &fl6); - if (dst->error) { - dst_release(dst); - return false; - } - skb_dst_drop(skb); - skb_dst_set(skb, dst); - skb->dev = dst->dev; - skb->protocol = htons(ETH_P_IPV6); - return true; -} - static unsigned int tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; - if (__this_cpu_read(nf_skb_duplicated)) - return XT_CONTINUE; - skb = pskb_copy(skb, GFP_ATOMIC); - if (skb == NULL) - return XT_CONTINUE; + nf_dup_ipv6(skb, par->hooknum, &info->gw.in6, info->priv->oif); -#if IS_ENABLED(CONFIG_NF_CONNTRACK) - nf_conntrack_put(skb->nfct); - skb->nfct = &nf_ct_untracked_get()->ct_general; - skb->nfctinfo = IP_CT_NEW; - nf_conntrack_get(skb->nfct); -#endif - if (par->hooknum == NF_INET_PRE_ROUTING || - par->hooknum == NF_INET_LOCAL_IN) { - struct ipv6hdr *iph = ipv6_hdr(skb); - --iph->hop_limit; - } - if (tee_tg_route6(skb, info)) { - __this_cpu_write(nf_skb_duplicated, true); - ip6_local_out(skb); - __this_cpu_write(nf_skb_duplicated, false); - } else { - kfree_skb(skb); - } return XT_CONTINUE; } #endif -- cgit v1.2.3 From d877f07112f1e5a247c6b585c971a93895c9f738 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 31 May 2015 18:04:11 +0200 Subject: netfilter: nf_tables: add nft_dup expression This new expression uses the nf_dup engine to clone packets to a given gateway. Unlike xt_TEE, we use an index to indicate output interface which should be fine at this stage. Moreover, change to the preemtion-safe this_cpu_read(nf_skb_duplicated) from nf_dup_ipv{4,6} to silence a lockdep splat. Based on the original tee expression from Arturo Borrero Gonzalez, although this patch has diverted quite a bit from this initial effort due to the change to support maps. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_dup.h | 9 +++ include/uapi/linux/netfilter/nf_tables.h | 14 ++++ net/ipv4/netfilter/Kconfig | 6 ++ net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/nf_dup_ipv4.c | 2 +- net/ipv4/netfilter/nft_dup_ipv4.c | 110 +++++++++++++++++++++++++++++++ net/ipv6/netfilter/Kconfig | 6 ++ net/ipv6/netfilter/Makefile | 1 + net/ipv6/netfilter/nf_dup_ipv6.c | 2 +- net/ipv6/netfilter/nft_dup_ipv6.c | 108 ++++++++++++++++++++++++++++++ 10 files changed, 257 insertions(+), 2 deletions(-) create mode 100644 include/net/netfilter/nft_dup.h create mode 100644 net/ipv4/netfilter/nft_dup_ipv4.c create mode 100644 net/ipv6/netfilter/nft_dup_ipv6.c (limited to 'net') diff --git a/include/net/netfilter/nft_dup.h b/include/net/netfilter/nft_dup.h new file mode 100644 index 000000000000..6b84cf6491a2 --- /dev/null +++ b/include/net/netfilter/nft_dup.h @@ -0,0 +1,9 @@ +#ifndef _NFT_DUP_H_ +#define _NFT_DUP_H_ + +struct nft_dup_inet { + enum nft_registers sreg_addr:8; + enum nft_registers sreg_dev:8; +}; + +#endif /* _NFT_DUP_H_ */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index a99e6a997140..2ef35f2c9bda 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -935,6 +935,20 @@ enum nft_redir_attributes { }; #define NFTA_REDIR_MAX (__NFTA_REDIR_MAX - 1) +/** + * enum nft_dup_attributes - nf_tables dup expression netlink attributes + * + * @NFTA_DUP_SREG_ADDR: source register of address (NLA_U32: nft_registers) + * @NFTA_DUP_SREG_DEV: source register of output interface (NLA_U32: nft_register) + */ +enum nft_dup_attributes { + NFTA_DUP_UNSPEC, + NFTA_DUP_SREG_ADDR, + NFTA_DUP_SREG_DEV, + __NFTA_DUP_MAX +}; +#define NFTA_DUP_MAX (__NFTA_DUP_MAX - 1) + /** * enum nft_gen_attributes - nf_tables ruleset generation attributes * diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 0142ea259d7d..690d27d3f2f9 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -58,6 +58,12 @@ config NFT_REJECT_IPV4 default NFT_REJECT tristate +config NFT_DUP_IPV4 + tristate "IPv4 nf_tables packet duplication support" + select NF_DUP_IPV4 + help + This module enables IPv4 packet duplication support for nf_tables. + endif # NF_TABLES_IPV4 config NF_TABLES_ARP diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 9136ffc2d474..87b073da14c9 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -41,6 +41,7 @@ obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NFT_MASQ_IPV4) += nft_masq_ipv4.o obj-$(CONFIG_NFT_REDIR_IPV4) += nft_redir_ipv4.o +obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # generic IP tables diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index eff85ab3f47d..b5bb37564b0e 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -69,7 +69,7 @@ void nf_dup_ipv4(struct sk_buff *skb, unsigned int hooknum, { struct iphdr *iph; - if (__this_cpu_read(nf_skb_duplicated)) + if (this_cpu_read(nf_skb_duplicated)) return; /* * Copy the skb, and route the copy. Will later return %XT_CONTINUE for diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c new file mode 100644 index 000000000000..25419fbddcb6 --- /dev/null +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2015 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct nft_dup_ipv4 { + enum nft_registers sreg_addr:8; + enum nft_registers sreg_dev:8; +}; + +static void nft_dup_ipv4_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_dup_ipv4 *priv = nft_expr_priv(expr); + struct in_addr gw = { + .s_addr = regs->data[priv->sreg_addr], + }; + int oif = regs->data[priv->sreg_dev]; + + nf_dup_ipv4(pkt->skb, pkt->ops->hooknum, &gw, oif); +} + +static int nft_dup_ipv4_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_dup_ipv4 *priv = nft_expr_priv(expr); + int err; + + if (tb[NFTA_DUP_SREG_ADDR] == NULL) + return -EINVAL; + + priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]); + err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in_addr)); + if (err < 0) + return err; + + if (tb[NFTA_DUP_SREG_DEV] != NULL) { + priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); + return nft_validate_register_load(priv->sreg_dev, sizeof(int)); + } + return 0; +} + +static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_dup_ipv4 *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) || + nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_dup_ipv4_type; +static const struct nft_expr_ops nft_dup_ipv4_ops = { + .type = &nft_dup_ipv4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_dup_ipv4)), + .eval = nft_dup_ipv4_eval, + .init = nft_dup_ipv4_init, + .dump = nft_dup_ipv4_dump, +}; + +static const struct nla_policy nft_dup_ipv4_policy[NFTA_DUP_MAX + 1] = { + [NFTA_DUP_SREG_ADDR] = { .type = NLA_U32 }, + [NFTA_DUP_SREG_DEV] = { .type = NLA_U32 }, +}; + +static struct nft_expr_type nft_dup_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, + .name = "dup", + .ops = &nft_dup_ipv4_ops, + .policy = nft_dup_ipv4_policy, + .maxattr = NFTA_DUP_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_dup_ipv4_module_init(void) +{ + return nft_register_expr(&nft_dup_ipv4_type); +} + +static void __exit nft_dup_ipv4_module_exit(void) +{ + nft_unregister_expr(&nft_dup_ipv4_type); +} + +module_init(nft_dup_ipv4_module_init); +module_exit(nft_dup_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "dup"); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 298daf30d857..96833e4b3193 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -47,6 +47,12 @@ config NFT_REJECT_IPV6 default NFT_REJECT tristate +config NFT_DUP_IPV6 + tristate "IPv6 nf_tables packet duplication support" + select NF_DUP_IPV6 + help + This module enables IPv6 packet duplication support for nf_tables. + endif # NF_TABLES_IPV6 endif # NF_TABLES diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index dc6c732f98ca..b4f7d0b4e2af 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_NFT_CHAIN_NAT_IPV6) += nft_chain_nat_ipv6.o obj-$(CONFIG_NFT_REJECT_IPV6) += nft_reject_ipv6.o obj-$(CONFIG_NFT_MASQ_IPV6) += nft_masq_ipv6.o obj-$(CONFIG_NFT_REDIR_IPV6) += nft_redir_ipv6.o +obj-$(CONFIG_NFT_DUP_IPV6) += nft_dup_ipv6.o # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index 399fdda18447..d8ab654080b4 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -63,7 +63,7 @@ static bool nf_dup_ipv6_route(struct sk_buff *skb, const struct in6_addr *gw, void nf_dup_ipv6(struct sk_buff *skb, unsigned int hooknum, const struct in6_addr *gw, int oif) { - if (__this_cpu_read(nf_skb_duplicated)) + if (this_cpu_read(nf_skb_duplicated)) return; skb = pskb_copy(skb, GFP_ATOMIC); if (skb == NULL) diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c new file mode 100644 index 000000000000..0eaa4f65fdea --- /dev/null +++ b/net/ipv6/netfilter/nft_dup_ipv6.c @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2015 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct nft_dup_ipv6 { + enum nft_registers sreg_addr:8; + enum nft_registers sreg_dev:8; +}; + +static void nft_dup_ipv6_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_dup_ipv6 *priv = nft_expr_priv(expr); + struct in6_addr *gw = (struct in6_addr *)®s->data[priv->sreg_addr]; + int oif = regs->data[priv->sreg_dev]; + + nf_dup_ipv6(pkt->skb, pkt->ops->hooknum, gw, oif); +} + +static int nft_dup_ipv6_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_dup_ipv6 *priv = nft_expr_priv(expr); + int err; + + if (tb[NFTA_DUP_SREG_ADDR] == NULL) + return -EINVAL; + + priv->sreg_addr = nft_parse_register(tb[NFTA_DUP_SREG_ADDR]); + err = nft_validate_register_load(priv->sreg_addr, sizeof(struct in6_addr)); + if (err < 0) + return err; + + if (tb[NFTA_DUP_SREG_DEV] != NULL) { + priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]); + return nft_validate_register_load(priv->sreg_dev, sizeof(int)); + } + return 0; +} + +static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_dup_ipv6 *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) || + nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static struct nft_expr_type nft_dup_ipv6_type; +static const struct nft_expr_ops nft_dup_ipv6_ops = { + .type = &nft_dup_ipv6_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_dup_ipv6)), + .eval = nft_dup_ipv6_eval, + .init = nft_dup_ipv6_init, + .dump = nft_dup_ipv6_dump, +}; + +static const struct nla_policy nft_dup_ipv6_policy[NFTA_DUP_MAX + 1] = { + [NFTA_DUP_SREG_ADDR] = { .type = NLA_U32 }, + [NFTA_DUP_SREG_DEV] = { .type = NLA_U32 }, +}; + +static struct nft_expr_type nft_dup_ipv6_type __read_mostly = { + .family = NFPROTO_IPV6, + .name = "dup", + .ops = &nft_dup_ipv6_ops, + .policy = nft_dup_ipv6_policy, + .maxattr = NFTA_DUP_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_dup_ipv6_module_init(void) +{ + return nft_register_expr(&nft_dup_ipv6_type); +} + +static void __exit nft_dup_ipv6_module_exit(void) +{ + nft_unregister_expr(&nft_dup_ipv6_type); +} + +module_init(nft_dup_ipv6_module_init); +module_exit(nft_dup_ipv6_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET6, "dup"); -- cgit v1.2.3 From 09e4e42a00b99e94cfce27e63b06daca0c26e841 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 31 Jul 2015 14:16:51 +0200 Subject: netfilter: nft_limit: rename to nft_limit_pkts To prepare introduction of bytes ratelimit support. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_limit.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 435c1ccd6c0e..d0788e172b4c 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -26,9 +26,9 @@ struct nft_limit { unsigned long stamp; }; -static void nft_limit_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static void nft_limit_pkts_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { struct nft_limit *priv = nft_expr_priv(expr); @@ -85,17 +85,17 @@ nla_put_failure: } static struct nft_expr_type nft_limit_type; -static const struct nft_expr_ops nft_limit_ops = { +static const struct nft_expr_ops nft_limit_pkts_ops = { .type = &nft_limit_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_limit)), - .eval = nft_limit_eval, + .eval = nft_limit_pkts_eval, .init = nft_limit_init, .dump = nft_limit_dump, }; static struct nft_expr_type nft_limit_type __read_mostly = { .name = "limit", - .ops = &nft_limit_ops, + .ops = &nft_limit_pkts_ops, .policy = nft_limit_policy, .maxattr = NFTA_LIMIT_MAX, .flags = NFT_EXPR_STATEFUL, -- cgit v1.2.3 From dba27ec1bc382014aa2ba46e96f421b5f6536dae Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 31 Jul 2015 14:10:22 +0200 Subject: netfilter: nft_limit: convert to token-based limiting at nanosecond granularity Rework the limit expression to use a token-based limiting approach that refills the bucket gradually. The tokens are calculated at nanosecond granularity instead jiffies to improve precision. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_limit.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index d0788e172b4c..c79703e5baad 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -20,10 +20,11 @@ static DEFINE_SPINLOCK(limit_lock); struct nft_limit { + u64 last; u64 tokens; + u64 tokens_max; u64 rate; - u64 unit; - unsigned long stamp; + u64 nsecs; }; static void nft_limit_pkts_eval(const struct nft_expr *expr, @@ -31,18 +32,23 @@ static void nft_limit_pkts_eval(const struct nft_expr *expr, const struct nft_pktinfo *pkt) { struct nft_limit *priv = nft_expr_priv(expr); + u64 now, tokens, cost = div_u64(priv->nsecs, priv->rate); + s64 delta; spin_lock_bh(&limit_lock); - if (time_after_eq(jiffies, priv->stamp)) { - priv->tokens = priv->rate; - priv->stamp = jiffies + priv->unit * HZ; - } - - if (priv->tokens >= 1) { - priv->tokens--; + now = ktime_get_ns(); + tokens = priv->tokens + now - priv->last; + if (tokens > priv->tokens_max) + tokens = priv->tokens_max; + + priv->last = now; + delta = tokens - cost; + if (delta >= 0) { + priv->tokens = delta; spin_unlock_bh(&limit_lock); return; } + priv->tokens = tokens; spin_unlock_bh(&limit_lock); regs->verdict.code = NFT_BREAK; @@ -58,25 +64,29 @@ static int nft_limit_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_limit *priv = nft_expr_priv(expr); + u64 unit; if (tb[NFTA_LIMIT_RATE] == NULL || tb[NFTA_LIMIT_UNIT] == NULL) return -EINVAL; - priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); - priv->unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); - priv->stamp = jiffies + priv->unit * HZ; - priv->tokens = priv->rate; + priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); + unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); + priv->nsecs = unit * NSEC_PER_SEC; + if (priv->rate == 0 || priv->nsecs < unit) + return -EOVERFLOW; + priv->tokens = priv->tokens_max = priv->nsecs; + priv->last = ktime_get_ns(); return 0; } static int nft_limit_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_limit *priv = nft_expr_priv(expr); + u64 secs = div_u64(priv->nsecs, NSEC_PER_SEC); - if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate))) - goto nla_put_failure; - if (nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(priv->unit))) + if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate)) || + nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs))) goto nla_put_failure; return 0; -- cgit v1.2.3 From f8d3a6bc76011bb86d2515ebd0b3b300641f2f8c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 2 Aug 2015 14:16:42 +0200 Subject: netfilter: nft_limit: factor out shared code with per-byte limiting This patch prepares the introduction of per-byte limiting. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_limit.c | 86 +++++++++++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index c79703e5baad..c4d1b1b75b8f 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -27,65 +27,54 @@ struct nft_limit { u64 nsecs; }; -static void nft_limit_pkts_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost) { - struct nft_limit *priv = nft_expr_priv(expr); - u64 now, tokens, cost = div_u64(priv->nsecs, priv->rate); + u64 now, tokens; s64 delta; spin_lock_bh(&limit_lock); now = ktime_get_ns(); - tokens = priv->tokens + now - priv->last; - if (tokens > priv->tokens_max) - tokens = priv->tokens_max; + tokens = limit->tokens + now - limit->last; + if (tokens > limit->tokens_max) + tokens = limit->tokens_max; - priv->last = now; + limit->last = now; delta = tokens - cost; if (delta >= 0) { - priv->tokens = delta; + limit->tokens = delta; spin_unlock_bh(&limit_lock); - return; + return false; } - priv->tokens = tokens; + limit->tokens = tokens; spin_unlock_bh(&limit_lock); - - regs->verdict.code = NFT_BREAK; + return true; } -static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { - [NFTA_LIMIT_RATE] = { .type = NLA_U64 }, - [NFTA_LIMIT_UNIT] = { .type = NLA_U64 }, -}; - -static int nft_limit_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, +static int nft_limit_init(struct nft_limit *limit, const struct nlattr * const tb[]) { - struct nft_limit *priv = nft_expr_priv(expr); u64 unit; if (tb[NFTA_LIMIT_RATE] == NULL || tb[NFTA_LIMIT_UNIT] == NULL) return -EINVAL; - priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); + limit->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); - priv->nsecs = unit * NSEC_PER_SEC; - if (priv->rate == 0 || priv->nsecs < unit) + limit->nsecs = unit * NSEC_PER_SEC; + if (limit->rate == 0 || limit->nsecs < unit) return -EOVERFLOW; - priv->tokens = priv->tokens_max = priv->nsecs; - priv->last = ktime_get_ns(); + limit->tokens = limit->tokens_max = limit->nsecs; + limit->last = ktime_get_ns(); + return 0; } -static int nft_limit_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit) { - const struct nft_limit *priv = nft_expr_priv(expr); - u64 secs = div_u64(priv->nsecs, NSEC_PER_SEC); + u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC); - if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(priv->rate)) || + if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(limit->rate)) || nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs))) goto nla_put_failure; return 0; @@ -94,13 +83,44 @@ nla_put_failure: return -1; } +static void nft_limit_pkts_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_limit *priv = nft_expr_priv(expr); + + if (nft_limit_eval(priv, div_u64(priv->nsecs, priv->rate))) + regs->verdict.code = NFT_BREAK; +} + +static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { + [NFTA_LIMIT_RATE] = { .type = NLA_U64 }, + [NFTA_LIMIT_UNIT] = { .type = NLA_U64 }, +}; + +static int nft_limit_pkts_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_limit *priv = nft_expr_priv(expr); + + return nft_limit_init(priv, tb); +} + +static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_limit *priv = nft_expr_priv(expr); + + return nft_limit_dump(skb, priv); +} + static struct nft_expr_type nft_limit_type; static const struct nft_expr_ops nft_limit_pkts_ops = { .type = &nft_limit_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_limit)), .eval = nft_limit_pkts_eval, - .init = nft_limit_init, - .dump = nft_limit_dump, + .init = nft_limit_pkts_init, + .dump = nft_limit_pkts_dump, }; static struct nft_expr_type nft_limit_type __read_mostly = { -- cgit v1.2.3 From 3e87baafa4f476017b2453e52a1ca7308b4e6ad5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 2 Aug 2015 18:02:14 +0200 Subject: netfilter: nft_limit: add burst parameter This patch adds the burst parameter. This burst indicates the number of packets that can exceed the limit. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_limit.c | 20 ++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 2ef35f2c9bda..cafd78943c7d 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -761,11 +761,13 @@ enum nft_ct_attributes { * * @NFTA_LIMIT_RATE: refill rate (NLA_U64) * @NFTA_LIMIT_UNIT: refill unit (NLA_U64) + * @NFTA_LIMIT_BURST: burst (NLA_U32) */ enum nft_limit_attributes { NFTA_LIMIT_UNSPEC, NFTA_LIMIT_RATE, NFTA_LIMIT_UNIT, + NFTA_LIMIT_BURST, __NFTA_LIMIT_MAX }; #define NFTA_LIMIT_MAX (__NFTA_LIMIT_MAX - 1) diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index c4d1b1b75b8f..d8c5ff1bf7dd 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -25,6 +25,7 @@ struct nft_limit { u64 tokens_max; u64 rate; u64 nsecs; + u32 burst; }; static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost) @@ -65,6 +66,18 @@ static int nft_limit_init(struct nft_limit *limit, if (limit->rate == 0 || limit->nsecs < unit) return -EOVERFLOW; limit->tokens = limit->tokens_max = limit->nsecs; + + if (tb[NFTA_LIMIT_BURST]) { + u64 rate; + + limit->burst = ntohl(nla_get_be32(tb[NFTA_LIMIT_BURST])); + + rate = limit->rate + limit->burst; + if (rate < limit->rate) + return -EOVERFLOW; + + limit->rate = rate; + } limit->last = ktime_get_ns(); return 0; @@ -73,9 +86,11 @@ static int nft_limit_init(struct nft_limit *limit, static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit) { u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC); + u64 rate = limit->rate - limit->burst; - if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(limit->rate)) || - nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs))) + if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate)) || + nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs)) || + nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst))) goto nla_put_failure; return 0; @@ -96,6 +111,7 @@ static void nft_limit_pkts_eval(const struct nft_expr *expr, static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { [NFTA_LIMIT_RATE] = { .type = NLA_U64 }, [NFTA_LIMIT_UNIT] = { .type = NLA_U64 }, + [NFTA_LIMIT_BURST] = { .type = NLA_U32 }, }; static int nft_limit_pkts_init(const struct nft_ctx *ctx, -- cgit v1.2.3 From 8bdf3626425520e9bc55996d5d46f60e12dd3ad7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 2 Aug 2015 14:24:24 +0200 Subject: netfilter: nft_limit: constant token cost per packet The cost per packet can be calculated from the control plane path since this doesn't ever change. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_limit.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index d8c5ff1bf7dd..b418698e26b0 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -98,13 +98,18 @@ nla_put_failure: return -1; } +struct nft_limit_pkts { + struct nft_limit limit; + u64 cost; +}; + static void nft_limit_pkts_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { - struct nft_limit *priv = nft_expr_priv(expr); + struct nft_limit_pkts *priv = nft_expr_priv(expr); - if (nft_limit_eval(priv, div_u64(priv->nsecs, priv->rate))) + if (nft_limit_eval(&priv->limit, priv->cost)) regs->verdict.code = NFT_BREAK; } @@ -118,22 +123,28 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { - struct nft_limit *priv = nft_expr_priv(expr); + struct nft_limit_pkts *priv = nft_expr_priv(expr); + int err; - return nft_limit_init(priv, tb); + err = nft_limit_init(&priv->limit, tb); + if (err < 0) + return err; + + priv->cost = div_u64(priv->limit.nsecs, priv->limit.rate); + return 0; } static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr) { - const struct nft_limit *priv = nft_expr_priv(expr); + const struct nft_limit_pkts *priv = nft_expr_priv(expr); - return nft_limit_dump(skb, priv); + return nft_limit_dump(skb, &priv->limit); } static struct nft_expr_type nft_limit_type; static const struct nft_expr_ops nft_limit_pkts_ops = { .type = &nft_limit_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_limit)), + .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)), .eval = nft_limit_pkts_eval, .init = nft_limit_pkts_init, .dump = nft_limit_pkts_dump, -- cgit v1.2.3 From d2168e849ebf617b2b7feae44c0c0baf739cb610 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 5 Aug 2015 12:38:44 +0200 Subject: netfilter: nft_limit: add per-byte limiting This patch adds a new NFTA_LIMIT_TYPE netlink attribute to indicate the type of limiting. Contrary to per-packet limiting, the cost is calculated from the packet path since this depends on the packet length. The burst attribute indicates the number of bytes in which the rate can be exceeded. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 7 ++++ net/netfilter/nft_limit.c | 63 ++++++++++++++++++++++++++++++-- 2 files changed, 66 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index cafd78943c7d..d8c8a7c9d88a 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -756,18 +756,25 @@ enum nft_ct_attributes { }; #define NFTA_CT_MAX (__NFTA_CT_MAX - 1) +enum nft_limit_type { + NFT_LIMIT_PKTS, + NFT_LIMIT_PKT_BYTES +}; + /** * enum nft_limit_attributes - nf_tables limit expression netlink attributes * * @NFTA_LIMIT_RATE: refill rate (NLA_U64) * @NFTA_LIMIT_UNIT: refill unit (NLA_U64) * @NFTA_LIMIT_BURST: burst (NLA_U32) + * @NFTA_LIMIT_TYPE: type of limit (NLA_U32: enum nft_limit_type) */ enum nft_limit_attributes { NFTA_LIMIT_UNSPEC, NFTA_LIMIT_RATE, NFTA_LIMIT_UNIT, NFTA_LIMIT_BURST, + NFTA_LIMIT_TYPE, __NFTA_LIMIT_MAX }; #define NFTA_LIMIT_MAX (__NFTA_LIMIT_MAX - 1) diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index b418698e26b0..5d67938f8b2f 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -83,14 +83,16 @@ static int nft_limit_init(struct nft_limit *limit, return 0; } -static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit) +static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit, + enum nft_limit_type type) { u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC); u64 rate = limit->rate - limit->burst; if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate)) || nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs)) || - nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst))) + nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) || + nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type))) goto nla_put_failure; return 0; @@ -117,6 +119,7 @@ static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = { [NFTA_LIMIT_RATE] = { .type = NLA_U64 }, [NFTA_LIMIT_UNIT] = { .type = NLA_U64 }, [NFTA_LIMIT_BURST] = { .type = NLA_U32 }, + [NFTA_LIMIT_TYPE] = { .type = NLA_U32 }, }; static int nft_limit_pkts_init(const struct nft_ctx *ctx, @@ -138,7 +141,7 @@ static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_limit_pkts *priv = nft_expr_priv(expr); - return nft_limit_dump(skb, &priv->limit); + return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS); } static struct nft_expr_type nft_limit_type; @@ -150,9 +153,61 @@ static const struct nft_expr_ops nft_limit_pkts_ops = { .dump = nft_limit_pkts_dump, }; +static void nft_limit_pkt_bytes_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_limit *priv = nft_expr_priv(expr); + u64 cost = div_u64(priv->nsecs * pkt->skb->len, priv->rate); + + if (nft_limit_eval(priv, cost)) + regs->verdict.code = NFT_BREAK; +} + +static int nft_limit_pkt_bytes_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_limit *priv = nft_expr_priv(expr); + + return nft_limit_init(priv, tb); +} + +static int nft_limit_pkt_bytes_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_limit *priv = nft_expr_priv(expr); + + return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES); +} + +static const struct nft_expr_ops nft_limit_pkt_bytes_ops = { + .type = &nft_limit_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_limit)), + .eval = nft_limit_pkt_bytes_eval, + .init = nft_limit_pkt_bytes_init, + .dump = nft_limit_pkt_bytes_dump, +}; + +static const struct nft_expr_ops * +nft_limit_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_LIMIT_TYPE] == NULL) + return &nft_limit_pkts_ops; + + switch (ntohl(nla_get_be32(tb[NFTA_LIMIT_TYPE]))) { + case NFT_LIMIT_PKTS: + return &nft_limit_pkts_ops; + case NFT_LIMIT_PKT_BYTES: + return &nft_limit_pkt_bytes_ops; + } + return ERR_PTR(-EOPNOTSUPP); +} + static struct nft_expr_type nft_limit_type __read_mostly = { .name = "limit", - .ops = &nft_limit_pkts_ops, + .select_ops = nft_limit_select_ops, .policy = nft_limit_policy, .maxattr = NFTA_LIMIT_MAX, .flags = NFT_EXPR_STATEFUL, -- cgit v1.2.3 From 3499abb249bb5ed9d21031944bc3059ec4aa2909 Mon Sep 17 00:00:00 2001 From: Andreas Schultz Date: Wed, 5 Aug 2015 17:51:45 +0200 Subject: netfilter: nfacct: per network namespace support - Move the nfnl_acct_list into the network namespace, initialize and destroy it per namespace - Keep track of refcnt on nfacct objects, the old logic does not longer work with a per namespace list - Adjust xt_nfacct to pass the namespace when registring objects Signed-off-by: Andreas Schultz Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink_acct.h | 3 +- include/net/net_namespace.h | 3 ++ net/netfilter/nfnetlink_acct.c | 71 ++++++++++++++++++++++---------- net/netfilter/xt_nfacct.c | 2 +- 4 files changed, 56 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/nfnetlink_acct.h b/include/linux/netfilter/nfnetlink_acct.h index 6ec975748742..80ca889b164e 100644 --- a/include/linux/netfilter/nfnetlink_acct.h +++ b/include/linux/netfilter/nfnetlink_acct.h @@ -2,6 +2,7 @@ #define _NFNL_ACCT_H_ #include +#include enum { NFACCT_NO_QUOTA = -1, @@ -11,7 +12,7 @@ enum { struct nf_acct; -struct nf_acct *nfnl_acct_find_get(const char *filter_name); +struct nf_acct *nfnl_acct_find_get(struct net *net, const char *filter_name); void nfnl_acct_put(struct nf_acct *acct); void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct); extern int nfnl_acct_overquota(const struct sk_buff *skb, diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index e951453e0a23..2dcea635ecce 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -118,6 +118,9 @@ struct net { #endif struct sock *nfnl; struct sock *nfnl_stash; +#if IS_ENABLED(CONFIG_NETFILTER_NETLINK_ACCT) + struct list_head nfnl_acct_list; +#endif #endif #ifdef CONFIG_WEXT_CORE struct sk_buff_head wext_nlevents; diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index c18af2f63eef..fefbf5f0b28d 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -27,8 +27,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso "); MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure"); -static LIST_HEAD(nfnl_acct_list); - struct nf_acct { atomic64_t pkts; atomic64_t bytes; @@ -53,6 +51,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { struct nf_acct *nfacct, *matching = NULL; + struct net *net = sock_net(nfnl); char *acct_name; unsigned int size = 0; u32 flags = 0; @@ -64,7 +63,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, if (strlen(acct_name) == 0) return -EINVAL; - list_for_each_entry(nfacct, &nfnl_acct_list, head) { + list_for_each_entry(nfacct, &net->nfnl_acct_list, head) { if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0) continue; @@ -124,7 +123,7 @@ nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS]))); } atomic_set(&nfacct->refcnt, 1); - list_add_tail_rcu(&nfacct->head, &nfnl_acct_list); + list_add_tail_rcu(&nfacct->head, &net->nfnl_acct_list); return 0; } @@ -185,6 +184,7 @@ nla_put_failure: static int nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); struct nf_acct *cur, *last; const struct nfacct_filter *filter = cb->data; @@ -196,7 +196,7 @@ nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) cb->args[1] = 0; rcu_read_lock(); - list_for_each_entry_rcu(cur, &nfnl_acct_list, head) { + list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) { if (last) { if (cur != last) continue; @@ -257,6 +257,7 @@ static int nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { + struct net *net = sock_net(nfnl); int ret = -ENOENT; struct nf_acct *cur; char *acct_name; @@ -283,7 +284,7 @@ nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, return -EINVAL; acct_name = nla_data(tb[NFACCT_NAME]); - list_for_each_entry(cur, &nfnl_acct_list, head) { + list_for_each_entry(cur, &net->nfnl_acct_list, head) { struct sk_buff *skb2; if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) @@ -336,19 +337,20 @@ static int nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const tb[]) { + struct net *net = sock_net(nfnl); char *acct_name; struct nf_acct *cur; int ret = -ENOENT; if (!tb[NFACCT_NAME]) { - list_for_each_entry(cur, &nfnl_acct_list, head) + list_for_each_entry(cur, &net->nfnl_acct_list, head) nfnl_acct_try_del(cur); return 0; } acct_name = nla_data(tb[NFACCT_NAME]); - list_for_each_entry(cur, &nfnl_acct_list, head) { + list_for_each_entry(cur, &net->nfnl_acct_list, head) { if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0) continue; @@ -394,12 +396,12 @@ static const struct nfnetlink_subsystem nfnl_acct_subsys = { MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT); -struct nf_acct *nfnl_acct_find_get(const char *acct_name) +struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name) { struct nf_acct *cur, *acct = NULL; rcu_read_lock(); - list_for_each_entry_rcu(cur, &nfnl_acct_list, head) { + list_for_each_entry_rcu(cur, &net->nfnl_acct_list, head) { if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) continue; @@ -422,7 +424,9 @@ EXPORT_SYMBOL_GPL(nfnl_acct_find_get); void nfnl_acct_put(struct nf_acct *acct) { - atomic_dec(&acct->refcnt); + if (atomic_dec_and_test(&acct->refcnt)) + kfree_rcu(acct, rcu_head); + module_put(THIS_MODULE); } EXPORT_SYMBOL_GPL(nfnl_acct_put); @@ -478,34 +482,59 @@ int nfnl_acct_overquota(const struct sk_buff *skb, struct nf_acct *nfacct) } EXPORT_SYMBOL_GPL(nfnl_acct_overquota); +static int __net_init nfnl_acct_net_init(struct net *net) +{ + INIT_LIST_HEAD(&net->nfnl_acct_list); + + return 0; +} + +static void __net_exit nfnl_acct_net_exit(struct net *net) +{ + struct nf_acct *cur, *tmp; + + list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) { + list_del_rcu(&cur->head); + + if (atomic_dec_and_test(&cur->refcnt)) + kfree_rcu(cur, rcu_head); + } +} + +static struct pernet_operations nfnl_acct_ops = { + .init = nfnl_acct_net_init, + .exit = nfnl_acct_net_exit, +}; + static int __init nfnl_acct_init(void) { int ret; + ret = register_pernet_subsys(&nfnl_acct_ops); + if (ret < 0) { + pr_err("nfnl_acct_init: failed to register pernet ops\n"); + goto err_out; + } + pr_info("nfnl_acct: registering with nfnetlink.\n"); ret = nfnetlink_subsys_register(&nfnl_acct_subsys); if (ret < 0) { pr_err("nfnl_acct_init: cannot register with nfnetlink.\n"); - goto err_out; + goto cleanup_pernet; } return 0; + +cleanup_pernet: + unregister_pernet_subsys(&nfnl_acct_ops); err_out: return ret; } static void __exit nfnl_acct_exit(void) { - struct nf_acct *cur, *tmp; - pr_info("nfnl_acct: unregistering from nfnetlink.\n"); nfnetlink_subsys_unregister(&nfnl_acct_subsys); - - list_for_each_entry_safe(cur, tmp, &nfnl_acct_list, head) { - list_del_rcu(&cur->head); - /* We are sure that our objects have no clients at this point, - * it's safe to release them all without checking refcnt. */ - kfree_rcu(cur, rcu_head); - } + unregister_pernet_subsys(&nfnl_acct_ops); } module_init(nfnl_acct_init); diff --git a/net/netfilter/xt_nfacct.c b/net/netfilter/xt_nfacct.c index 8c646ed9c921..3048a7e3a90a 100644 --- a/net/netfilter/xt_nfacct.c +++ b/net/netfilter/xt_nfacct.c @@ -37,7 +37,7 @@ nfacct_mt_checkentry(const struct xt_mtchk_param *par) struct xt_nfacct_match_info *info = par->matchinfo; struct nf_acct *nfacct; - nfacct = nfnl_acct_find_get(info->name); + nfacct = nfnl_acct_find_get(par->net, info->name); if (nfacct == NULL) { pr_info("xt_nfacct: accounting object with name `%s' " "does not exists\n", info->name); -- cgit v1.2.3 From d5a8ac28a7ff2f250d1bedbb6008dd2f6f6f1638 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Wed, 5 Aug 2015 01:43:25 -0400 Subject: RDS-TCP: Make RDS-TCP work correctly when it is set up in a netns other than init_net Open the sockets calling sock_create_kern() with the correct struct net pointer, and use that struct net pointer when verifying the address passed to rds_bind(). Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/bind.c | 3 ++- net/rds/connection.c | 16 ++++++++++------ net/rds/ib.c | 2 +- net/rds/ib_cm.c | 5 +++-- net/rds/iw.c | 2 +- net/rds/iw_cm.c | 5 +++-- net/rds/rds.h | 23 +++++++++++++++++++---- net/rds/send.c | 3 ++- net/rds/tcp.c | 4 ++-- net/rds/tcp_connect.c | 3 ++- net/rds/tcp_listen.c | 16 ++++++++++++---- net/rds/transport.c | 4 ++-- 12 files changed, 59 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/rds/bind.c b/net/rds/bind.c index 4ebd29c128b6..dd666fb9b4e1 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -185,7 +185,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) ret = 0; goto out; } - trans = rds_trans_get_preferred(sin->sin_addr.s_addr); + trans = rds_trans_get_preferred(sock_net(sock->sk), + sin->sin_addr.s_addr); if (!trans) { ret = -EADDRNOTAVAIL; rds_remove_bound(rs); diff --git a/net/rds/connection.c b/net/rds/connection.c index da6da57e5f36..d4fecb21ca25 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -117,7 +117,8 @@ static void rds_conn_reset(struct rds_connection *conn) * For now they are not garbage collected once they're created. They * are torn down as the module is removed, if ever. */ -static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, +static struct rds_connection *__rds_conn_create(struct net *net, + __be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp, int is_outgoing) { @@ -157,6 +158,7 @@ new_conn: conn->c_faddr = faddr; spin_lock_init(&conn->c_lock); conn->c_next_tx_seq = 1; + rds_conn_net_set(conn, net); init_waitqueue_head(&conn->c_waitq); INIT_LIST_HEAD(&conn->c_send_queue); @@ -174,7 +176,7 @@ new_conn: * can bind to the destination address then we'd rather the messages * flow through loopback rather than either transport. */ - loop_trans = rds_trans_get_preferred(faddr); + loop_trans = rds_trans_get_preferred(net, faddr); if (loop_trans) { rds_trans_put(loop_trans); conn->c_loopback = 1; @@ -260,17 +262,19 @@ out: return conn; } -struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, +struct rds_connection *rds_conn_create(struct net *net, + __be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp) { - return __rds_conn_create(laddr, faddr, trans, gfp, 0); + return __rds_conn_create(net, laddr, faddr, trans, gfp, 0); } EXPORT_SYMBOL_GPL(rds_conn_create); -struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, +struct rds_connection *rds_conn_create_outgoing(struct net *net, + __be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp) { - return __rds_conn_create(laddr, faddr, trans, gfp, 1); + return __rds_conn_create(net, laddr, faddr, trans, gfp, 1); } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); diff --git a/net/rds/ib.c b/net/rds/ib.c index ba2dffeff608..13814227b3b2 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -317,7 +317,7 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len, * allowed to influence which paths have priority. We could call userspace * asserting this policy "routing". */ -static int rds_ib_laddr_check(__be32 addr) +static int rds_ib_laddr_check(struct net *net, __be32 addr) { int ret; struct rdma_cm_id *cm_id; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 0da2a45b33bd..f40d8f52b753 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -448,8 +448,9 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, (unsigned long long)be64_to_cpu(lguid), (unsigned long long)be64_to_cpu(fguid)); - conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport, - GFP_KERNEL); + /* RDS/IB is not currently netns aware, thus init_net */ + conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr, + &rds_ib_transport, GFP_KERNEL); if (IS_ERR(conn)) { rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); conn = NULL; diff --git a/net/rds/iw.c b/net/rds/iw.c index 589935661d66..5d5a9d258658 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -218,7 +218,7 @@ static void rds_iw_ic_info(struct socket *sock, unsigned int len, * allowed to influence which paths have priority. We could call userspace * asserting this policy "routing". */ -static int rds_iw_laddr_check(__be32 addr) +static int rds_iw_laddr_check(struct net *net, __be32 addr) { int ret; struct rdma_cm_id *cm_id; diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c index 8f486fa32079..a6553a6fb2bc 100644 --- a/net/rds/iw_cm.c +++ b/net/rds/iw_cm.c @@ -398,8 +398,9 @@ int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, &dp->dp_saddr, &dp->dp_daddr, RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version)); - conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport, - GFP_KERNEL); + /* RDS/IW is not currently netns aware, thus init_net */ + conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr, + &rds_iw_transport, GFP_KERNEL); if (IS_ERR(conn)) { rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); conn = NULL; diff --git a/net/rds/rds.h b/net/rds/rds.h index 2260c1e434b1..9005fb0586f6 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -128,8 +128,21 @@ struct rds_connection { /* Protocol version */ unsigned int c_version; + possible_net_t c_net; }; +static inline +struct net *rds_conn_net(struct rds_connection *conn) +{ + return read_pnet(&conn->c_net); +} + +static inline +void rds_conn_net_set(struct rds_connection *conn, struct net *net) +{ + write_pnet(&conn->c_net, net); +} + #define RDS_FLAG_CONG_BITMAP 0x01 #define RDS_FLAG_ACK_REQUIRED 0x02 #define RDS_FLAG_RETRANSMITTED 0x04 @@ -417,7 +430,7 @@ struct rds_transport { unsigned int t_prefer_loopback:1; unsigned int t_type; - int (*laddr_check)(__be32 addr); + int (*laddr_check)(struct net *net, __be32 addr); int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); void (*conn_free)(void *data); int (*conn_connect)(struct rds_connection *conn); @@ -608,9 +621,11 @@ struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); /* conn.c */ int rds_conn_init(void); void rds_conn_exit(void); -struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, +struct rds_connection *rds_conn_create(struct net *net, + __be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp); -struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, +struct rds_connection *rds_conn_create_outgoing(struct net *net, + __be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp); void rds_conn_shutdown(struct rds_connection *conn); void rds_conn_destroy(struct rds_connection *conn); @@ -795,7 +810,7 @@ void rds_connect_complete(struct rds_connection *conn); /* transport.c */ int rds_trans_register(struct rds_transport *trans); void rds_trans_unregister(struct rds_transport *trans); -struct rds_transport *rds_trans_get_preferred(__be32 addr); +struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr); void rds_trans_put(struct rds_transport *trans); unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); diff --git a/net/rds/send.c b/net/rds/send.c index e9430f537f9c..2581b8e3dbe7 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1023,7 +1023,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) if (rs->rs_conn && rs->rs_conn->c_faddr == daddr) conn = rs->rs_conn; else { - conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr, + conn = rds_conn_create_outgoing(sock_net(sock->sk), + rs->rs_bound_addr, daddr, rs->rs_transport, sock->sk->sk_allocation); if (IS_ERR(conn)) { diff --git a/net/rds/tcp.c b/net/rds/tcp.c index edac9ef2bc8b..98f5de3a1c7b 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -189,9 +189,9 @@ out: spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags); } -static int rds_tcp_laddr_check(__be32 addr) +static int rds_tcp_laddr_check(struct net *net, __be32 addr) { - if (inet_addr_type(&init_net, addr) == RTN_LOCAL) + if (inet_addr_type(net, addr) == RTN_LOCAL) return 0; return -EADDRNOTAVAIL; } diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 973109c7b8e8..6473b7b377ae 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -79,7 +79,8 @@ int rds_tcp_conn_connect(struct rds_connection *conn) struct sockaddr_in src, dest; int ret; - ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + ret = sock_create_kern(rds_conn_net(conn), PF_INET, + SOCK_STREAM, IPPROTO_TCP, &sock); if (ret < 0) goto out; diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 0da49e34495f..398ffe5fc1d8 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -85,8 +85,9 @@ static int rds_tcp_accept_one(struct socket *sock) struct inet_sock *inet; struct rds_tcp_connection *rs_tcp; - ret = sock_create_lite(sock->sk->sk_family, sock->sk->sk_type, - sock->sk->sk_protocol, &new_sock); + ret = sock_create_kern(sock_net(sock->sk), sock->sk->sk_family, + sock->sk->sk_type, sock->sk->sk_protocol, + &new_sock); if (ret) goto out; @@ -108,7 +109,8 @@ static int rds_tcp_accept_one(struct socket *sock) &inet->inet_saddr, ntohs(inet->inet_sport), &inet->inet_daddr, ntohs(inet->inet_dport)); - conn = rds_conn_create(inet->inet_saddr, inet->inet_daddr, + conn = rds_conn_create(sock_net(sock->sk), + inet->inet_saddr, inet->inet_daddr, &rds_tcp_transport, GFP_KERNEL); if (IS_ERR(conn)) { ret = PTR_ERR(conn); @@ -187,7 +189,13 @@ int rds_tcp_listen_init(void) struct socket *sock = NULL; int ret; - ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + /* MUST call sock_create_kern directly so that we avoid get_net() + * in sk_alloc(). Doing a get_net() will result in cleanup_net() + * never getting invoked, which will leave sock and other things + * in limbo. + */ + ret = sock_create_kern(current->nsproxy->net_ns, PF_INET, + SOCK_STREAM, IPPROTO_TCP, &sock); if (ret < 0) goto out; diff --git a/net/rds/transport.c b/net/rds/transport.c index 83498e1c75b8..f3afd1d60d3c 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c @@ -77,7 +77,7 @@ void rds_trans_put(struct rds_transport *trans) module_put(trans->t_owner); } -struct rds_transport *rds_trans_get_preferred(__be32 addr) +struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr) { struct rds_transport *ret = NULL; struct rds_transport *trans; @@ -90,7 +90,7 @@ struct rds_transport *rds_trans_get_preferred(__be32 addr) for (i = 0; i < RDS_TRANS_COUNT; i++) { trans = transports[i]; - if (trans && (trans->laddr_check(addr) == 0) && + if (trans && (trans->laddr_check(net, addr) == 0) && (!trans->t_owner || try_module_get(trans->t_owner))) { ret = trans; break; -- cgit v1.2.3 From 467fa15356acfb7b2efa38839c3e76caa4e6e0ea Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Wed, 5 Aug 2015 01:43:26 -0400 Subject: RDS-TCP: Support multiple RDS-TCP listen endpoints, one per netns. Register pernet subsys init/stop functions that will set up and tear down per-net RDS-TCP listen endpoints. Unregister pernet subusys functions on 'modprobe -r' to clean up these end points. Enable keepalive on both accept and connect socket endpoints. The keepalive timer expiration will ensure that client socket endpoints will be removed as appropriate from the netns when an interface is removed from a namespace. Register a device notifier callback that will clean up all sockets (and thus avoid the need to wait for keepalive timeout) when the loopback device is unregistered from the netns indicating that the netns is getting deleted. Signed-off-by: Sowmini Varadhan Signed-off-by: David S. Miller --- net/rds/tcp.c | 161 +++++++++++++++++++++++++++++++++++++++++++++----- net/rds/tcp.h | 7 ++- net/rds/tcp_connect.c | 6 +- net/rds/tcp_listen.c | 38 +++--------- 4 files changed, 162 insertions(+), 50 deletions(-) (limited to 'net') diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 98f5de3a1c7b..c42b60bf4c68 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -35,6 +35,9 @@ #include #include #include +#include +#include +#include #include "rds.h" #include "tcp.h" @@ -250,16 +253,7 @@ static void rds_tcp_destroy_conns(void) } } -static void rds_tcp_exit(void) -{ - rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); - rds_tcp_listen_stop(); - rds_tcp_destroy_conns(); - rds_trans_unregister(&rds_tcp_transport); - rds_tcp_recv_exit(); - kmem_cache_destroy(rds_tcp_conn_slab); -} -module_exit(rds_tcp_exit); +static void rds_tcp_exit(void); struct rds_transport rds_tcp_transport = { .laddr_check = rds_tcp_laddr_check, @@ -281,6 +275,136 @@ struct rds_transport rds_tcp_transport = { .t_prefer_loopback = 1, }; +static int rds_tcp_netid; + +/* per-network namespace private data for this module */ +struct rds_tcp_net { + struct socket *rds_tcp_listen_sock; + struct work_struct rds_tcp_accept_w; +}; + +static void rds_tcp_accept_worker(struct work_struct *work) +{ + struct rds_tcp_net *rtn = container_of(work, + struct rds_tcp_net, + rds_tcp_accept_w); + + while (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) == 0) + cond_resched(); +} + +void rds_tcp_accept_work(struct sock *sk) +{ + struct net *net = sock_net(sk); + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + + queue_work(rds_wq, &rtn->rds_tcp_accept_w); +} + +static __net_init int rds_tcp_init_net(struct net *net) +{ + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + + rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net); + if (!rtn->rds_tcp_listen_sock) { + pr_warn("could not set up listen sock\n"); + return -EAFNOSUPPORT; + } + INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker); + return 0; +} + +static void __net_exit rds_tcp_exit_net(struct net *net) +{ + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + + /* If rds_tcp_exit_net() is called as a result of netns deletion, + * the rds_tcp_kill_sock() device notifier would already have cleaned + * up the listen socket, thus there is no work to do in this function. + * + * If rds_tcp_exit_net() is called as a result of module unload, + * i.e., due to rds_tcp_exit() -> unregister_pernet_subsys(), then + * we do need to clean up the listen socket here. + */ + if (rtn->rds_tcp_listen_sock) { + rds_tcp_listen_stop(rtn->rds_tcp_listen_sock); + rtn->rds_tcp_listen_sock = NULL; + flush_work(&rtn->rds_tcp_accept_w); + } +} + +static struct pernet_operations rds_tcp_net_ops = { + .init = rds_tcp_init_net, + .exit = rds_tcp_exit_net, + .id = &rds_tcp_netid, + .size = sizeof(struct rds_tcp_net), +}; + +static void rds_tcp_kill_sock(struct net *net) +{ + struct rds_tcp_connection *tc, *_tc; + struct sock *sk; + LIST_HEAD(tmp_list); + struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); + + rds_tcp_listen_stop(rtn->rds_tcp_listen_sock); + rtn->rds_tcp_listen_sock = NULL; + flush_work(&rtn->rds_tcp_accept_w); + spin_lock_irq(&rds_tcp_conn_lock); + list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { + struct net *c_net = read_pnet(&tc->conn->c_net); + + if (net != c_net || !tc->t_sock) + continue; + list_move_tail(&tc->t_tcp_node, &tmp_list); + } + spin_unlock_irq(&rds_tcp_conn_lock); + list_for_each_entry_safe(tc, _tc, &tmp_list, t_tcp_node) { + sk = tc->t_sock->sk; + sk->sk_prot->disconnect(sk, 0); + tcp_done(sk); + if (tc->conn->c_passive) + rds_conn_destroy(tc->conn->c_passive); + rds_conn_destroy(tc->conn); + } +} + +static int rds_tcp_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + /* rds-tcp registers as a pernet subys, so the ->exit will only + * get invoked after network acitivity has quiesced. We need to + * clean up all sockets to quiesce network activity, and use + * the unregistration of the per-net loopback device as a trigger + * to start that cleanup. + */ + if (event == NETDEV_UNREGISTER_FINAL && + dev->ifindex == LOOPBACK_IFINDEX) + rds_tcp_kill_sock(dev_net(dev)); + + return NOTIFY_DONE; +} + +static struct notifier_block rds_tcp_dev_notifier = { + .notifier_call = rds_tcp_dev_event, + .priority = -10, /* must be called after other network notifiers */ +}; + +static void rds_tcp_exit(void) +{ + rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); + unregister_pernet_subsys(&rds_tcp_net_ops); + if (unregister_netdevice_notifier(&rds_tcp_dev_notifier)) + pr_warn("could not unregister rds_tcp_dev_notifier\n"); + rds_tcp_destroy_conns(); + rds_trans_unregister(&rds_tcp_transport); + rds_tcp_recv_exit(); + kmem_cache_destroy(rds_tcp_conn_slab); +} +module_exit(rds_tcp_exit); + static int rds_tcp_init(void) { int ret; @@ -293,6 +417,16 @@ static int rds_tcp_init(void) goto out; } + ret = register_netdevice_notifier(&rds_tcp_dev_notifier); + if (ret) { + pr_warn("could not register rds_tcp_dev_notifier\n"); + goto out; + } + + ret = register_pernet_subsys(&rds_tcp_net_ops); + if (ret) + goto out_slab; + ret = rds_tcp_recv_init(); if (ret) goto out_slab; @@ -301,19 +435,14 @@ static int rds_tcp_init(void) if (ret) goto out_recv; - ret = rds_tcp_listen_init(); - if (ret) - goto out_register; - rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); goto out; -out_register: - rds_trans_unregister(&rds_tcp_transport); out_recv: rds_tcp_recv_exit(); out_slab: + unregister_pernet_subsys(&rds_tcp_net_ops); kmem_cache_destroy(rds_tcp_conn_slab); out: return ret; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 0dbdd37162da..64f873c0c6b6 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -52,6 +52,7 @@ u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc); u32 rds_tcp_snd_una(struct rds_tcp_connection *tc); u64 rds_tcp_map_seq(struct rds_tcp_connection *tc, u32 seq); extern struct rds_transport rds_tcp_transport; +void rds_tcp_accept_work(struct sock *sk); /* tcp_connect.c */ int rds_tcp_conn_connect(struct rds_connection *conn); @@ -59,9 +60,11 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn); void rds_tcp_state_change(struct sock *sk); /* tcp_listen.c */ -int rds_tcp_listen_init(void); -void rds_tcp_listen_stop(void); +struct socket *rds_tcp_listen_init(struct net *); +void rds_tcp_listen_stop(struct socket *); void rds_tcp_listen_data_ready(struct sock *sk); +int rds_tcp_accept_one(struct socket *sock); +int rds_tcp_keepalive(struct socket *sock); /* tcp_recv.c */ int rds_tcp_recv_init(void); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 6473b7b377ae..5cb16875c460 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -112,10 +112,12 @@ int rds_tcp_conn_connect(struct rds_connection *conn) rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); if (ret == -EINPROGRESS) ret = 0; - if (ret == 0) + if (ret == 0) { + rds_tcp_keepalive(sock); sock = NULL; - else + } else { rds_tcp_restore_callbacks(sock, conn->c_transport_data); + } out: if (sock) diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 398ffe5fc1d8..444d78d0bd77 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -38,14 +38,7 @@ #include "rds.h" #include "tcp.h" -/* - * cheesy, but simple.. - */ -static void rds_tcp_accept_worker(struct work_struct *work); -static DECLARE_WORK(rds_tcp_listen_work, rds_tcp_accept_worker); -static struct socket *rds_tcp_listen_sock; - -static int rds_tcp_keepalive(struct socket *sock) +int rds_tcp_keepalive(struct socket *sock) { /* values below based on xs_udp_default_timeout */ int keepidle = 5; /* send a probe 'keepidle' secs after last data */ @@ -77,7 +70,7 @@ bail: return ret; } -static int rds_tcp_accept_one(struct socket *sock) +int rds_tcp_accept_one(struct socket *sock) { struct socket *new_sock = NULL; struct rds_connection *conn; @@ -150,12 +143,6 @@ out: return ret; } -static void rds_tcp_accept_worker(struct work_struct *work) -{ - while (rds_tcp_accept_one(rds_tcp_listen_sock) == 0) - cond_resched(); -} - void rds_tcp_listen_data_ready(struct sock *sk) { void (*ready)(struct sock *sk); @@ -176,26 +163,20 @@ void rds_tcp_listen_data_ready(struct sock *sk) * socket */ if (sk->sk_state == TCP_LISTEN) - queue_work(rds_wq, &rds_tcp_listen_work); + rds_tcp_accept_work(sk); out: read_unlock(&sk->sk_callback_lock); ready(sk); } -int rds_tcp_listen_init(void) +struct socket *rds_tcp_listen_init(struct net *net) { struct sockaddr_in sin; struct socket *sock = NULL; int ret; - /* MUST call sock_create_kern directly so that we avoid get_net() - * in sk_alloc(). Doing a get_net() will result in cleanup_net() - * never getting invoked, which will leave sock and other things - * in limbo. - */ - ret = sock_create_kern(current->nsproxy->net_ns, PF_INET, - SOCK_STREAM, IPPROTO_TCP, &sock); + ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); if (ret < 0) goto out; @@ -219,17 +200,15 @@ int rds_tcp_listen_init(void) if (ret < 0) goto out; - rds_tcp_listen_sock = sock; - sock = NULL; + return sock; out: if (sock) sock_release(sock); - return ret; + return NULL; } -void rds_tcp_listen_stop(void) +void rds_tcp_listen_stop(struct socket *sock) { - struct socket *sock = rds_tcp_listen_sock; struct sock *sk; if (!sock) @@ -250,5 +229,4 @@ void rds_tcp_listen_stop(void) /* wait for accepts to stop and close the socket */ flush_workqueue(rds_wq); sock_release(sock); - rds_tcp_listen_sock = NULL; } -- cgit v1.2.3 From da8b43c0e1dcea3bcac5f37ea59934ddaa137aed Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 4 Aug 2015 22:51:07 -0700 Subject: vxlan: combine VXLAN_FLOWBASED into VXLAN_COLLECT_METADATA IFLA_VXLAN_FLOWBASED is useless without IFLA_VXLAN_COLLECT_METADATA, so combine them into single IFLA_VXLAN_COLLECT_METADATA flag. 'flowbased' doesn't convey real meaning of the vxlan tunnel mode. This mode can be used by routing, tc+bpf and ovs. Only ovs is strictly flow based, so 'collect metadata' is a better name for this tunnel mode. Signed-off-by: Alexei Starovoitov Acked-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 17 ++++++----------- include/net/vxlan.h | 4 +--- include/uapi/linux/if_link.h | 1 - net/openvswitch/vport-vxlan.c | 2 +- 4 files changed, 8 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index e90f7a484e1c..b6731fad19ba 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1141,7 +1141,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, union vxlan_addr *remote_ip; /* For flow based devices, map all packets to VNI 0 */ - if (vs->flags & VXLAN_F_FLOW_BASED) + if (vs->flags & VXLAN_F_COLLECT_METADATA) vni = 0; /* Is this VNI defined? */ @@ -1183,7 +1183,7 @@ static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, skb_reset_network_header(skb); /* In flow-based mode, GBP is carried in dst_metadata */ - if (!(vs->flags & VXLAN_F_FLOW_BASED)) + if (!(vs->flags & VXLAN_F_COLLECT_METADATA)) skb->mark = md->gbp; if (oip6) @@ -2129,7 +2129,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) #endif } - if (vxlan->flags & VXLAN_F_FLOW_BASED && + if (vxlan->flags & VXLAN_F_COLLECT_METADATA && info && info->mode == IP_TUNNEL_INFO_TX) { vxlan_xmit_one(skb, dev, NULL, false); return NETDEV_TX_OK; @@ -2462,7 +2462,6 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, - [IFLA_VXLAN_FLOWBASED] = { .type = NLA_U8 }, [IFLA_VXLAN_COLLECT_METADATA] = { .type = NLA_U8 }, [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, @@ -2814,10 +2813,6 @@ static int vxlan_newlink(struct net *src_net, struct net_device *dev, if (data[IFLA_VXLAN_LIMIT]) conf.addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); - if (data[IFLA_VXLAN_FLOWBASED] && - nla_get_u8(data[IFLA_VXLAN_FLOWBASED])) - conf.flags |= VXLAN_F_FLOW_BASED; - if (data[IFLA_VXLAN_COLLECT_METADATA] && nla_get_u8(data[IFLA_VXLAN_COLLECT_METADATA])) conf.flags |= VXLAN_F_COLLECT_METADATA; @@ -2903,7 +2898,7 @@ static size_t vxlan_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ - nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_FLOWBASED */ + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_COLLECT_METADATA */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ nla_total_size(sizeof(struct ifla_vxlan_port_range)) + @@ -2970,8 +2965,8 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) !!(vxlan->flags & VXLAN_F_L2MISS)) || nla_put_u8(skb, IFLA_VXLAN_L3MISS, !!(vxlan->flags & VXLAN_F_L3MISS)) || - nla_put_u8(skb, IFLA_VXLAN_FLOWBASED, - !!(vxlan->flags & VXLAN_F_FLOW_BASED)) || + nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA, + !!(vxlan->flags & VXLAN_F_COLLECT_METADATA)) || nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) || nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) || nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) || diff --git a/include/net/vxlan.h b/include/net/vxlan.h index eb8d721cdb67..e4534f1b2d8c 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -181,7 +181,6 @@ struct vxlan_dev { #define VXLAN_F_GBP 0x800 #define VXLAN_F_REMCSUM_NOPARTIAL 0x1000 #define VXLAN_F_COLLECT_METADATA 0x2000 -#define VXLAN_F_FLOW_BASED 0x4000 /* Flags that are used in the receive path. These flags must match in * order for a socket to be shareable @@ -190,8 +189,7 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_REMCSUM_RX | \ VXLAN_F_REMCSUM_NOPARTIAL | \ - VXLAN_F_COLLECT_METADATA | \ - VXLAN_F_FLOW_BASED) + VXLAN_F_COLLECT_METADATA) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ea047480a1f0..f24ec99a2262 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -382,7 +382,6 @@ enum { IFLA_VXLAN_REMCSUM_RX, IFLA_VXLAN_GBP, IFLA_VXLAN_REMCSUM_NOPARTIAL, - IFLA_VXLAN_FLOWBASED, IFLA_VXLAN_COLLECT_METADATA, __IFLA_VXLAN_MAX }; diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 547173336cd3..c6e937e36f8b 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -90,7 +90,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) int err; struct vxlan_config conf = { .no_share = true, - .flags = VXLAN_F_FLOW_BASED | VXLAN_F_COLLECT_METADATA, + .flags = VXLAN_F_COLLECT_METADATA, }; if (!options) { -- cgit v1.2.3 From e05176a3283822bd32a1f3d929ce2050232299a8 Mon Sep 17 00:00:00 2001 From: Wenyu Zhang Date: Wed, 5 Aug 2015 00:30:47 -0700 Subject: openvswitch: Make 100 percents packets sampled when sampling rate is 1. When sampling rate is 1, the sampling probability is UINT32_MAX. The packet should be sampled even the prandom32() generate the number of UINT32_MAX. And none packet need be sampled when the probability is 0. Signed-off-by: Wenyu Zhang Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index cf04c2f8b32a..a0ac410e9570 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -669,9 +669,12 @@ static int sample(struct datapath *dp, struct sk_buff *skb, for (a = nla_data(attr), rem = nla_len(attr); rem > 0; a = nla_next(a, &rem)) { + u32 probability; + switch (nla_type(a)) { case OVS_SAMPLE_ATTR_PROBABILITY: - if (prandom_u32() >= nla_get_u32(a)) + probability = nla_get_u32(a); + if (!probability || prandom_u32() > probability) return 0; break; -- cgit v1.2.3 From 10e4ea75149d11883a9e04c3b32ee1d7600d481e Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 5 Aug 2015 09:39:27 -0700 Subject: net: Fix race condition in store_rps_map There is a race condition in store_rps_map that allows jump label count in rps_needed to go below zero. This can happen when concurrently attempting to set and a clear map. Scenario: 1. rps_needed count is zero 2. New map is assigned by setting thread, but rps_needed count _not_ yet incremented (rps_needed count still zero) 2. Map is cleared by second thread, old_map set to that just assigned 3. Second thread performs static_key_slow_dec, rps_needed count now goes negative Fix is to increment or decrement rps_needed under the spinlock. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 194c1d03b2b3..39ec6949c1e6 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -726,14 +726,17 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, old_map = rcu_dereference_protected(queue->rps_map, lockdep_is_held(&rps_map_lock)); rcu_assign_pointer(queue->rps_map, map); - spin_unlock(&rps_map_lock); if (map) static_key_slow_inc(&rps_needed); - if (old_map) { - kfree_rcu(old_map, rcu); + if (old_map) static_key_slow_dec(&rps_needed); - } + + spin_unlock(&rps_map_lock); + + if (old_map) + kfree_rcu(old_map, rcu); + free_cpumask_var(mask); return len; } -- cgit v1.2.3 From 4933d85c5173832ebd261756522095837583c458 Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Thu, 6 Aug 2015 12:52:47 +0900 Subject: net:wimax: Fix doucble word "the the" in networking.xml This patch fix a double word "the the" in Documentation/DocBook/networking.xml and Documentation/DocBook/networking/API-Wimax-report-rfkill-sw.html. These files are generated from comment in source, so I had to fix the typo in net/wimax/io-rfkill.c Signed-off-by: Masanari Iida Signed-off-by: David S. Miller --- net/wimax/op-rfkill.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/wimax/op-rfkill.c b/net/wimax/op-rfkill.c index 7d730543f243..477364ad750e 100644 --- a/net/wimax/op-rfkill.c +++ b/net/wimax/op-rfkill.c @@ -135,8 +135,7 @@ EXPORT_SYMBOL_GPL(wimax_report_rfkill_hw); * @state: New state of the RF kill switch. %WIMAX_RF_ON radio on, * %WIMAX_RF_OFF radio off. * - * Reports changes in the software RF switch state to the the WiMAX - * stack. + * Reports changes in the software RF switch state to the WiMAX stack. * * The main use is during initialization, so the driver can query the * device for its current software radio kill switch state and feed it -- cgit v1.2.3 From 1525c386a1f01612c6f3f27241113d7fc8e6d72d Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 6 Aug 2015 01:44:02 -0400 Subject: net: switchdev: change fdb addr for a byte array The address in the switchdev_obj_fdb structure is currently represented as a pointer. Replacing it for a 6-byte array allows switchdev to carry addresses directly read from hardware registers, not stored by the switch chip driver (as in Rocker). Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker.c | 2 +- include/net/switchdev.h | 2 +- net/bridge/br_fdb.c | 2 +- net/switchdev/switchdev.c | 5 +++-- 4 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index b77e0e7307d4..80bb25c5a644 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -4543,7 +4543,7 @@ static int rocker_port_fdb_dump(const struct rocker_port *rocker_port, hash_for_each_safe(rocker->fdb_tbl, bkt, tmp, found, entry) { if (found->key.pport != rocker_port->pport) continue; - fdb->addr = found->key.addr; + ether_addr_copy(fdb->addr, found->key.addr); fdb->vid = rocker_port_vlan_to_vid(rocker_port, found->key.vlan_id); err = obj->cb(rocker_port->dev, obj); diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 89da8934519b..e90e1a0fa579 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -70,7 +70,7 @@ struct switchdev_obj { u32 tb_id; } ipv4_fib; struct switchdev_obj_fdb { /* PORT_FDB */ - const unsigned char *addr; + u8 addr[ETH_ALEN]; u16 vid; } fdb; } u; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 9e9875da0a4f..5656b44bf3de 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -136,11 +136,11 @@ static void fdb_del_external_learn(struct net_bridge_fdb_entry *f) struct switchdev_obj obj = { .id = SWITCHDEV_OBJ_PORT_FDB, .u.fdb = { - .addr = f->addr.addr, .vid = f->vlan_id, }, }; + ether_addr_copy(obj.u.fdb.addr, f->addr.addr); switchdev_port_obj_del(f->dst->dev, &obj); } diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 33bafa2e703e..9db87a34f866 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -742,11 +743,11 @@ int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct switchdev_obj obj = { .id = SWITCHDEV_OBJ_PORT_FDB, .u.fdb = { - .addr = addr, .vid = vid, }, }; + ether_addr_copy(obj.u.fdb.addr, addr); return switchdev_port_obj_add(dev, &obj); } EXPORT_SYMBOL_GPL(switchdev_port_fdb_add); @@ -769,11 +770,11 @@ int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct switchdev_obj obj = { .id = SWITCHDEV_OBJ_PORT_FDB, .u.fdb = { - .addr = addr, .vid = vid, }, }; + ether_addr_copy(obj.u.fdb.addr, addr); return switchdev_port_obj_del(dev, &obj); } EXPORT_SYMBOL_GPL(switchdev_port_fdb_del); -- cgit v1.2.3 From 890248261a18c7ae22923095dfadea2c0a2a304a Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 6 Aug 2015 01:44:03 -0400 Subject: net: switchdev: support static FDB addresses This patch adds a is_static boolean to the switchdev_obj_fdb structure, in order to set the ndm_state to either NUD_NOARP or NUD_REACHABLE. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/switchdev.h | 1 + net/switchdev/switchdev.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index e90e1a0fa579..0e296b82aef3 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -72,6 +72,7 @@ struct switchdev_obj { struct switchdev_obj_fdb { /* PORT_FDB */ u8 addr[ETH_ALEN]; u16 vid; + bool is_static; } fdb; } u; }; diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 9db87a34f866..e9d1cacc4060 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -811,7 +811,7 @@ static int switchdev_port_fdb_dump_cb(struct net_device *dev, ndm->ndm_flags = NTF_SELF; ndm->ndm_type = 0; ndm->ndm_ifindex = dev->ifindex; - ndm->ndm_state = NUD_REACHABLE; + ndm->ndm_state = obj->u.fdb.is_static ? NUD_NOARP : NUD_REACHABLE; if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, obj->u.fdb.addr)) goto nla_put_failure; -- cgit v1.2.3 From 55045ddded0f39d84c2ca019508973be8c595a78 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 6 Aug 2015 01:44:04 -0400 Subject: net: dsa: add support for switchdev FDB objects Remove the fdb_{add,del,getnext} function pointer in favor of new port_fdb_{add,del,getnext}. Implement the switchdev_port_obj_{add,del,dump} functions in DSA to support the SWITCHDEV_OBJ_PORT_FDB objects. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6171.c | 3 - drivers/net/dsa/mv88e6352.c | 3 - include/net/dsa.h | 16 ++-- net/dsa/slave.c | 218 +++++++++++++++++++++++--------------------- 4 files changed, 126 insertions(+), 114 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/mv88e6171.c b/drivers/net/dsa/mv88e6171.c index 1c7808495a9d..cfa21ed1f734 100644 --- a/drivers/net/dsa/mv88e6171.c +++ b/drivers/net/dsa/mv88e6171.c @@ -116,9 +116,6 @@ struct dsa_switch_driver mv88e6171_switch_driver = { .port_join_bridge = mv88e6xxx_join_bridge, .port_leave_bridge = mv88e6xxx_leave_bridge, .port_stp_update = mv88e6xxx_port_stp_update, - .fdb_add = mv88e6xxx_port_fdb_add, - .fdb_del = mv88e6xxx_port_fdb_del, - .fdb_getnext = mv88e6xxx_port_fdb_getnext, }; MODULE_ALIAS("platform:mv88e6171"); diff --git a/drivers/net/dsa/mv88e6352.c b/drivers/net/dsa/mv88e6352.c index af210efecc55..eb4630fec6f1 100644 --- a/drivers/net/dsa/mv88e6352.c +++ b/drivers/net/dsa/mv88e6352.c @@ -341,9 +341,6 @@ struct dsa_switch_driver mv88e6352_switch_driver = { .port_join_bridge = mv88e6xxx_join_bridge, .port_leave_bridge = mv88e6xxx_leave_bridge, .port_stp_update = mv88e6xxx_port_stp_update, - .fdb_add = mv88e6xxx_port_fdb_add, - .fdb_del = mv88e6xxx_port_fdb_del, - .fdb_getnext = mv88e6xxx_port_fdb_getnext, }; MODULE_ALIAS("platform:mv88e6172"); diff --git a/include/net/dsa.h b/include/net/dsa.h index fbca63ba8f73..091d35f77180 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -296,12 +296,16 @@ struct dsa_switch_driver { u32 br_port_mask); int (*port_stp_update)(struct dsa_switch *ds, int port, u8 state); - int (*fdb_add)(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid); - int (*fdb_del)(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid); - int (*fdb_getnext)(struct dsa_switch *ds, int port, - unsigned char *addr, bool *is_static); + + /* + * Forwarding database + */ + int (*port_fdb_add)(struct dsa_switch *ds, int port, u16 vid, + const u8 addr[ETH_ALEN]); + int (*port_fdb_del)(struct dsa_switch *ds, int port, u16 vid, + const u8 addr[ETH_ALEN]); + int (*port_fdb_getnext)(struct dsa_switch *ds, int port, u16 *vid, + u8 addr[ETH_ALEN], bool *is_static); }; void register_switch_driver(struct dsa_switch_driver *type); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 0010c690cc67..1dbdeaab2bb4 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "dsa_priv.h" /* slave mii_bus handling ***************************************************/ @@ -200,105 +201,6 @@ out: return 0; } -static int dsa_slave_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid, u16 nlm_flags) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; - int ret = -EOPNOTSUPP; - - if (ds->drv->fdb_add) - ret = ds->drv->fdb_add(ds, p->port, addr, vid); - - return ret; -} - -static int dsa_slave_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; - int ret = -EOPNOTSUPP; - - if (ds->drv->fdb_del) - ret = ds->drv->fdb_del(ds, p->port, addr, vid); - - return ret; -} - -static int dsa_slave_fill_info(struct net_device *dev, struct sk_buff *skb, - const unsigned char *addr, u16 vid, - bool is_static, - u32 portid, u32 seq, int type, - unsigned int flags) -{ - struct nlmsghdr *nlh; - struct ndmsg *ndm; - - nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); - if (!nlh) - return -EMSGSIZE; - - ndm = nlmsg_data(nlh); - ndm->ndm_family = AF_BRIDGE; - ndm->ndm_pad1 = 0; - ndm->ndm_pad2 = 0; - ndm->ndm_flags = NTF_EXT_LEARNED; - ndm->ndm_type = 0; - ndm->ndm_ifindex = dev->ifindex; - ndm->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE; - - if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr)) - goto nla_put_failure; - - if (vid && nla_put_u16(skb, NDA_VLAN, vid)) - goto nla_put_failure; - - nlmsg_end(skb, nlh); - return 0; - -nla_put_failure: - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; -} - -/* Dump information about entries, in response to GETNEIGH */ -static int dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct net_device *dev, - struct net_device *filter_dev, int idx) -{ - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; - unsigned char addr[ETH_ALEN] = { 0 }; - int ret; - - if (!ds->drv->fdb_getnext) - return -EOPNOTSUPP; - - for (; ; idx++) { - bool is_static; - - ret = ds->drv->fdb_getnext(ds, p->port, addr, &is_static); - if (ret < 0) - break; - - if (idx < cb->args[0]) - continue; - - ret = dsa_slave_fill_info(dev, skb, addr, 0, - is_static, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, NLM_F_MULTI); - if (ret < 0) - break; - } - - return idx; -} - static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -364,6 +266,115 @@ static int dsa_slave_port_attr_set(struct net_device *dev, return ret; } +static int dsa_slave_port_fdb_add(struct net_device *dev, + struct switchdev_obj *obj) +{ + struct switchdev_obj_fdb *fdb = &obj->u.fdb; + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int err; + + if (obj->trans == SWITCHDEV_TRANS_PREPARE) + err = ds->drv->port_fdb_add ? 0 : -EOPNOTSUPP; + else if (obj->trans == SWITCHDEV_TRANS_COMMIT) + err = ds->drv->port_fdb_add(ds, p->port, fdb->vid, fdb->addr); + else + err = -EOPNOTSUPP; + + return err; +} + +static int dsa_slave_port_fdb_del(struct net_device *dev, + struct switchdev_obj *obj) +{ + struct switchdev_obj_fdb *fdb = &obj->u.fdb; + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + + if (!ds->drv->port_fdb_del) + return -EOPNOTSUPP; + + return ds->drv->port_fdb_del(ds, p->port, fdb->vid, fdb->addr); +} + +static int dsa_slave_port_fdb_dump(struct net_device *dev, + struct switchdev_obj *obj) +{ + struct switchdev_obj_fdb *fdb = &obj->u.fdb; + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int err; + + if (!ds->drv->port_fdb_getnext) + return -EOPNOTSUPP; + + memset(fdb, 0, sizeof(*fdb)); + + for (;;) { + err = ds->drv->port_fdb_getnext(ds, p->port, &fdb->vid, + fdb->addr, &fdb->is_static); + if (err) + break; + + err = obj->cb(dev, obj); + if (err) + break; + } + + return err == -ENOENT ? 0 : err; +} + +static int dsa_slave_port_obj_add(struct net_device *dev, + struct switchdev_obj *obj) +{ + int err; + + switch (obj->id) { + case SWITCHDEV_OBJ_PORT_FDB: + err = dsa_slave_port_fdb_add(dev, obj); + break; + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + +static int dsa_slave_port_obj_del(struct net_device *dev, + struct switchdev_obj *obj) +{ + int err; + + switch (obj->id) { + case SWITCHDEV_OBJ_PORT_FDB: + err = dsa_slave_port_fdb_del(dev, obj); + break; + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + +static int dsa_slave_port_obj_dump(struct net_device *dev, + struct switchdev_obj *obj) +{ + int err; + + switch (obj->id) { + case SWITCHDEV_OBJ_PORT_FDB: + err = dsa_slave_port_fdb_dump(dev, obj); + break; + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + static int dsa_slave_bridge_port_join(struct net_device *dev, struct net_device *br) { @@ -765,9 +776,9 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_change_rx_flags = dsa_slave_change_rx_flags, .ndo_set_rx_mode = dsa_slave_set_rx_mode, .ndo_set_mac_address = dsa_slave_set_mac_address, - .ndo_fdb_add = dsa_slave_fdb_add, - .ndo_fdb_del = dsa_slave_fdb_del, - .ndo_fdb_dump = dsa_slave_fdb_dump, + .ndo_fdb_add = switchdev_port_fdb_add, + .ndo_fdb_del = switchdev_port_fdb_del, + .ndo_fdb_dump = switchdev_port_fdb_dump, .ndo_do_ioctl = dsa_slave_ioctl, .ndo_get_iflink = dsa_slave_get_iflink, #ifdef CONFIG_NET_POLL_CONTROLLER @@ -780,6 +791,9 @@ static const struct net_device_ops dsa_slave_netdev_ops = { static const struct switchdev_ops dsa_slave_switchdev_ops = { .switchdev_port_attr_get = dsa_slave_port_attr_get, .switchdev_port_attr_set = dsa_slave_port_attr_set, + .switchdev_port_obj_add = dsa_slave_port_obj_add, + .switchdev_port_obj_del = dsa_slave_port_obj_del, + .switchdev_port_obj_dump = dsa_slave_port_obj_dump, }; static void dsa_slave_adjust_link(struct net_device *dev) -- cgit v1.2.3 From 118d5234636ca3718f47ca2c8a3b117c19dfdffd Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Thu, 6 Aug 2015 11:04:56 +0100 Subject: mpls: Enforce payload type of traffic sent using explicit NULL RFC 4182 s2 states that if an IPv4 Explicit NULL label is the only label on the stack, then after popping the resulting packet must be treated as a IPv4 packet and forwarded based on the IPv4 header. The same is true for IPv6 Explicit NULL with an IPv6 packet following. Therefore, when installing the IPv4/IPv6 Explicit NULL label routes, add an attribute that specifies the expected payload type for use at forwarding time for determining the type of the encapsulated packet instead of inspecting the first nibble of the packet. Signed-off-by: Robert Shearman Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 71 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 04306989d054..8c5707db53c5 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -27,11 +27,23 @@ /* This maximum ha length copied from the definition of struct neighbour */ #define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))) +enum mpls_payload_type { + MPT_UNSPEC, /* IPv4 or IPv6 */ + MPT_IPV4 = 4, + MPT_IPV6 = 6, + + /* Other types not implemented: + * - Pseudo-wire with or without control word (RFC4385) + * - GAL (RFC5586) + */ +}; + struct mpls_route { /* next hop label forwarding entry */ struct net_device __rcu *rt_dev; struct rcu_head rt_rcu; u32 rt_label[MAX_NEW_LABELS]; u8 rt_protocol; /* routing protocol that set this entry */ + u8 rt_payload_type; u8 rt_labels; u8 rt_via_alen; u8 rt_via_table; @@ -96,16 +108,8 @@ EXPORT_SYMBOL_GPL(mpls_pkt_too_big); static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb, struct mpls_entry_decoded dec) { - /* RFC4385 and RFC5586 encode other packets in mpls such that - * they don't conflict with the ip version number, making - * decoding by examining the ip version correct in everything - * except for the strangest cases. - * - * The strange cases if we choose to support them will require - * manual configuration. - */ - struct iphdr *hdr4; - bool success = true; + enum mpls_payload_type payload_type; + bool success = false; /* The IPv4 code below accesses through the IPv4 header * checksum, which is 12 bytes into the packet. @@ -120,23 +124,32 @@ static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb, if (!pskb_may_pull(skb, 12)) return false; - /* Use ip_hdr to find the ip protocol version */ - hdr4 = ip_hdr(skb); - if (hdr4->version == 4) { + payload_type = rt->rt_payload_type; + if (payload_type == MPT_UNSPEC) + payload_type = ip_hdr(skb)->version; + + switch (payload_type) { + case MPT_IPV4: { + struct iphdr *hdr4 = ip_hdr(skb); skb->protocol = htons(ETH_P_IP); csum_replace2(&hdr4->check, htons(hdr4->ttl << 8), htons(dec.ttl << 8)); hdr4->ttl = dec.ttl; + success = true; + break; } - else if (hdr4->version == 6) { + case MPT_IPV6: { struct ipv6hdr *hdr6 = ipv6_hdr(skb); skb->protocol = htons(ETH_P_IPV6); hdr6->hop_limit = dec.ttl; + success = true; + break; } - else - /* version 0 and version 1 are used by pseudo wires */ - success = false; + case MPT_UNSPEC: + break; + } + return success; } @@ -255,16 +268,17 @@ static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = { }; struct mpls_route_config { - u32 rc_protocol; - u32 rc_ifindex; - u16 rc_via_table; - u16 rc_via_alen; - u8 rc_via[MAX_VIA_ALEN]; - u32 rc_label; - u32 rc_output_labels; - u32 rc_output_label[MAX_NEW_LABELS]; - u32 rc_nlflags; - struct nl_info rc_nlinfo; + u32 rc_protocol; + u32 rc_ifindex; + u16 rc_via_table; + u16 rc_via_alen; + u8 rc_via[MAX_VIA_ALEN]; + u32 rc_label; + u32 rc_output_labels; + u32 rc_output_label[MAX_NEW_LABELS]; + u32 rc_nlflags; + enum mpls_payload_type rc_payload_type; + struct nl_info rc_nlinfo; }; static struct mpls_route *mpls_rt_alloc(size_t alen) @@ -493,6 +507,7 @@ static int mpls_route_add(struct mpls_route_config *cfg) rt->rt_label[i] = cfg->rc_output_label[i]; rt->rt_protocol = cfg->rc_protocol; RCU_INIT_POINTER(rt->rt_dev, dev); + rt->rt_payload_type = cfg->rc_payload_type; rt->rt_via_table = cfg->rc_via_table; memcpy(rt->rt_via, cfg->rc_via, cfg->rc_via_alen); @@ -1047,6 +1062,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) goto nort0; RCU_INIT_POINTER(rt0->rt_dev, lo); rt0->rt_protocol = RTPROT_KERNEL; + rt0->rt_payload_type = MPT_IPV4; rt0->rt_via_table = NEIGH_LINK_TABLE; memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len); } @@ -1057,6 +1073,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) goto nort2; RCU_INIT_POINTER(rt2->rt_dev, lo); rt2->rt_protocol = RTPROT_KERNEL; + rt2->rt_payload_type = MPT_IPV6; rt2->rt_via_table = NEIGH_LINK_TABLE; memcpy(rt2->rt_via, lo->dev_addr, lo->addr_len); } -- cgit v1.2.3 From ecea49914b329e3f5cf746979c10e05e6bdad77a Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Thu, 6 Aug 2015 21:27:54 +0900 Subject: net: ethernet: Fix double word "the the" in eth.c This patch fix double word "the the" in Documentation/DocBook/networking/API-eth-get-headlen.html Documentation/DocBook/networking/netdev.html Documentation/DocBook/networking.xml These files are generated from comment in source, so I have to fix comment in net/ethernet/eth.c. Signed-off-by: Masanari Iida Signed-off-by: David S. Miller --- net/ethernet/eth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 77e0f0e7a88e..217127c3a3ef 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -114,7 +114,7 @@ int eth_header(struct sk_buff *skb, struct net_device *dev, EXPORT_SYMBOL(eth_header); /** - * eth_get_headlen - determine the the length of header for an ethernet frame + * eth_get_headlen - determine the length of header for an ethernet frame * @data: pointer to start of frame * @len: total length of frame * -- cgit v1.2.3 From 0208bc8803918d7e84f247f1d4f1730171a24c16 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 1 Aug 2015 15:30:08 +0300 Subject: Bluetooth: Fix breakage in amp_write_rem_assoc_frag() We should be passing the pointer itself instead of the address of the pointer. This was a copy and paste bug when we replaced the calls to hci_send_cmd(). Originally, the arguments were "len, cp" but we overwrote them with "sizeof(cp), &cp" by mistake. Fixes: b3d3914006a0 ('Bluetooth: Move amp assoc read/write completed callback to amp.c') Signed-off-by: Dan Carpenter Signed-off-by: Marcel Holtmann --- net/bluetooth/amp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c index 238ddd3cf95f..e32f34189007 100644 --- a/net/bluetooth/amp.c +++ b/net/bluetooth/amp.c @@ -379,7 +379,7 @@ static bool amp_write_rem_assoc_frag(struct hci_dev *hdev, amp_ctrl_put(ctrl); hci_req_init(&req, hdev); - hci_req_add(&req, HCI_OP_WRITE_REMOTE_AMP_ASSOC, sizeof(cp), &cp); + hci_req_add(&req, HCI_OP_WRITE_REMOTE_AMP_ASSOC, len, cp); hci_req_run_skb(&req, write_remote_amp_assoc_complete); kfree(cp); -- cgit v1.2.3 From 77e867b5f133f6bb3debcfcc75ce4536d644e62e Mon Sep 17 00:00:00 2001 From: Lukasz Duda Date: Mon, 10 Aug 2015 21:15:52 +0200 Subject: 6lowpan: Fix extraction of flow label field The lowpan_fetch_skb function is used to fetch the first byte, which also increments the data pointer in skb structure, making subsequent array lookup of byte 0 actually being byte 1. To decompress the first byte of the Flow Label when the TF flag is set to 0x01, the second half of the first byte is needed. The patch fixes the extraction of the Flow Label field. Acked-by: Jukka Rissanen Signed-off-by: Lukasz Duda Signed-off-by: Glenn Ruben Bakke Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/6lowpan/iphc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c index 9055d7b9d112..74e56d7449c8 100644 --- a/net/6lowpan/iphc.c +++ b/net/6lowpan/iphc.c @@ -284,7 +284,7 @@ lowpan_header_decompress(struct sk_buff *skb, struct net_device *dev, if (lowpan_fetch_skb(skb, &tmp, sizeof(tmp))) return -EINVAL; - hdr.flow_lbl[0] = (skb->data[0] & 0x0F) | ((tmp >> 2) & 0x30); + hdr.flow_lbl[0] = (tmp & 0x0F) | ((tmp >> 2) & 0x30); memcpy(&hdr.flow_lbl[1], &skb->data[0], 2); skb_pull(skb, 2); break; -- cgit v1.2.3 From 51e0e5d8124ece158927a4c2288c0929d3b53aa3 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 10 Aug 2015 21:15:53 +0200 Subject: ieee802154: 6lowpan: remove multiple lowpan per wpan support We currently supports multiple lowpan interfaces per wpan interface. I never saw any use case into such functionality. We drop this feature now because it's much easier do deal with address changes inside the under laying wpan interface. This patch removes the multiple lowpan interface and adds a lowpan_dev netdev pointer into the wpan_dev, if this pointer isn't null the wpan interface belongs to the assigned lowpan interface. Reviewed-by: Stefan Schmidt Tested-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/cfg802154.h | 3 ++ net/ieee802154/6lowpan/6lowpan_i.h | 8 ----- net/ieee802154/6lowpan/core.c | 67 +++++++++++--------------------------- net/ieee802154/6lowpan/rx.c | 38 +++++---------------- 4 files changed, 30 insertions(+), 86 deletions(-) (limited to 'net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 382f94b59f2f..e53b6bfda976 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -173,6 +173,9 @@ struct wpan_dev { struct list_head list; struct net_device *netdev; + /* lowpan interface, set when the wpan_dev belongs to one lowpan_dev */ + struct net_device *lowpan_dev; + u32 identifier; /* MAC PIB */ diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h index e50f69da78eb..923b680adb61 100644 --- a/net/ieee802154/6lowpan/6lowpan_i.h +++ b/net/ieee802154/6lowpan/6lowpan_i.h @@ -37,15 +37,9 @@ static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a) } } -struct lowpan_dev_record { - struct net_device *ldev; - struct list_head list; -}; - /* private device info */ struct lowpan_dev_info { struct net_device *real_dev; /* real WPAN device ptr */ - struct mutex dev_list_mtx; /* mutex for list ops */ u16 fragment_tag; }; @@ -55,8 +49,6 @@ lowpan_dev_info *lowpan_dev_info(const struct net_device *dev) return netdev_priv(dev); } -extern struct list_head lowpan_devices; - int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type); void lowpan_net_frag_exit(void); int lowpan_net_frag_init(void); diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index f20a387a1011..a4edee8fdc79 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -52,9 +52,6 @@ #include "6lowpan_i.h" -LIST_HEAD(lowpan_devices); -static int lowpan_open_count; - static struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; @@ -114,7 +111,6 @@ static int lowpan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct net_device *real_dev; - struct lowpan_dev_record *entry; int ret; ASSERT_RTNL(); @@ -133,31 +129,19 @@ static int lowpan_newlink(struct net *src_net, struct net_device *dev, return -EINVAL; } - lowpan_dev_info(dev)->real_dev = real_dev; - mutex_init(&lowpan_dev_info(dev)->dev_list_mtx); - - entry = kzalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) { + if (real_dev->ieee802154_ptr->lowpan_dev) { dev_put(real_dev); - lowpan_dev_info(dev)->real_dev = NULL; - return -ENOMEM; + return -EBUSY; } - entry->ldev = dev; - + lowpan_dev_info(dev)->real_dev = real_dev; /* Set the lowpan hardware address to the wpan hardware address. */ memcpy(dev->dev_addr, real_dev->dev_addr, IEEE802154_ADDR_LEN); - mutex_lock(&lowpan_dev_info(dev)->dev_list_mtx); - INIT_LIST_HEAD(&entry->list); - list_add_tail(&entry->list, &lowpan_devices); - mutex_unlock(&lowpan_dev_info(dev)->dev_list_mtx); - ret = register_netdevice(dev); if (ret >= 0) { - if (!lowpan_open_count) - lowpan_rx_init(); - lowpan_open_count++; + real_dev->ieee802154_ptr->lowpan_dev = dev; + lowpan_rx_init(); } return ret; @@ -167,27 +151,12 @@ static void lowpan_dellink(struct net_device *dev, struct list_head *head) { struct lowpan_dev_info *lowpan_dev = lowpan_dev_info(dev); struct net_device *real_dev = lowpan_dev->real_dev; - struct lowpan_dev_record *entry, *tmp; ASSERT_RTNL(); - lowpan_open_count--; - if (!lowpan_open_count) - lowpan_rx_exit(); - - mutex_lock(&lowpan_dev_info(dev)->dev_list_mtx); - list_for_each_entry_safe(entry, tmp, &lowpan_devices, list) { - if (entry->ldev == dev) { - list_del(&entry->list); - kfree(entry); - } - } - mutex_unlock(&lowpan_dev_info(dev)->dev_list_mtx); - - mutex_destroy(&lowpan_dev_info(dev)->dev_list_mtx); - - unregister_netdevice_queue(dev, head); - + lowpan_rx_exit(); + real_dev->ieee802154_ptr->lowpan_dev = NULL; + unregister_netdevice(dev); dev_put(real_dev); } @@ -214,19 +183,21 @@ static int lowpan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - LIST_HEAD(del_list); - struct lowpan_dev_record *entry, *tmp; if (dev->type != ARPHRD_IEEE802154) goto out; - if (event == NETDEV_UNREGISTER) { - list_for_each_entry_safe(entry, tmp, &lowpan_devices, list) { - if (lowpan_dev_info(entry->ldev)->real_dev == dev) - lowpan_dellink(entry->ldev, &del_list); - } - - unregister_netdevice_many(&del_list); + switch (event) { + case NETDEV_UNREGISTER: + /* Check if wpan interface is unregistered that we + * also delete possible lowpan interfaces which belongs + * to the wpan interface. + */ + if (dev->ieee802154_ptr && dev->ieee802154_ptr->lowpan_dev) + lowpan_dellink(dev->ieee802154_ptr->lowpan_dev, NULL); + break; + default: + break; } out: diff --git a/net/ieee802154/6lowpan/rx.c b/net/ieee802154/6lowpan/rx.c index 4be1d289ab2d..d6f5e8ee6fda 100644 --- a/net/ieee802154/6lowpan/rx.c +++ b/net/ieee802154/6lowpan/rx.c @@ -15,36 +15,14 @@ #include "6lowpan_i.h" -static int lowpan_give_skb_to_devices(struct sk_buff *skb, - struct net_device *dev) +static int lowpan_give_skb_to_device(struct sk_buff *skb, + struct net_device *dev) { - struct lowpan_dev_record *entry; - struct sk_buff *skb_cp; - int stat = NET_RX_SUCCESS; - + skb->dev = dev->ieee802154_ptr->lowpan_dev; skb->protocol = htons(ETH_P_IPV6); skb->pkt_type = PACKET_HOST; - rcu_read_lock(); - list_for_each_entry_rcu(entry, &lowpan_devices, list) - if (lowpan_dev_info(entry->ldev)->real_dev == skb->dev) { - skb_cp = skb_copy(skb, GFP_ATOMIC); - if (!skb_cp) { - kfree_skb(skb); - rcu_read_unlock(); - return NET_RX_DROP; - } - - skb_cp->dev = entry->ldev; - stat = netif_rx(skb_cp); - if (stat == NET_RX_DROP) - break; - } - rcu_read_unlock(); - - consume_skb(skb); - - return stat; + return netif_rx(skb); } static int @@ -109,7 +87,7 @@ static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev, if (skb->data[0] == LOWPAN_DISPATCH_IPV6) { /* Pull off the 1-byte of 6lowpan header. */ skb_pull(skb, 1); - return lowpan_give_skb_to_devices(skb, NULL); + return lowpan_give_skb_to_device(skb, dev); } else { switch (skb->data[0] & 0xe0) { case LOWPAN_DISPATCH_IPHC: /* ipv6 datagram */ @@ -117,7 +95,7 @@ static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev, if (ret < 0) goto drop_skb; - return lowpan_give_skb_to_devices(skb, NULL); + return lowpan_give_skb_to_device(skb, dev); case LOWPAN_DISPATCH_FRAG1: /* first fragment header */ ret = lowpan_frag_rcv(skb, LOWPAN_DISPATCH_FRAG1); if (ret == 1) { @@ -125,7 +103,7 @@ static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev, if (ret < 0) goto drop_skb; - return lowpan_give_skb_to_devices(skb, NULL); + return lowpan_give_skb_to_device(skb, dev); } else if (ret == -1) { return NET_RX_DROP; } else { @@ -138,7 +116,7 @@ static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev, if (ret < 0) goto drop_skb; - return lowpan_give_skb_to_devices(skb, NULL); + return lowpan_give_skb_to_device(skb, dev); } else if (ret == -1) { return NET_RX_DROP; } else { -- cgit v1.2.3 From 09095fdc9e5d5438051fc4e92867f1aff764cd21 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 10 Aug 2015 21:15:54 +0200 Subject: mac802154: fix wpan mac setting while lowpan is there If we currently change the mac address inside the wpan interface while we have a lowpan interface on top of the wpan interface, the mac address setting doesn't reach the lowpan interface. The effect would be that the IPv6 lowpan interface has the old SLAAC address and isn't working anymore because the lowpan interface use in internal mechanism sometimes dev->addr which is the old mac address of the wpan interface. This patch checks if a wpan interface belongs to lowpan interface, if yes then we need to check if the lowpan interface is down and change the mac address also at the lowpan interface. When the lowpan interface will be set up afterwards, it will use the correct SLAAC address which based on the updated mac address setting. Reviewed-by: Stefan Schmidt Tested-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/iface.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net') diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 416de903e467..ff99055631f9 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -125,6 +125,14 @@ static int mac802154_wpan_mac_addr(struct net_device *dev, void *p) if (netif_running(dev)) return -EBUSY; + /* lowpan need to be down for update + * SLAAC address after ifup + */ + if (sdata->wpan_dev.lowpan_dev) { + if (netif_running(sdata->wpan_dev.lowpan_dev)) + return -EBUSY; + } + ieee802154_be64_to_le64(&extended_addr, addr->sa_data); if (!ieee802154_is_valid_extended_unicast_addr(extended_addr)) return -EINVAL; @@ -132,6 +140,13 @@ static int mac802154_wpan_mac_addr(struct net_device *dev, void *p) memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); sdata->wpan_dev.extended_addr = extended_addr; + /* update lowpan interface mac address when + * wpan mac has been changed + */ + if (sdata->wpan_dev.lowpan_dev) + memcpy(sdata->wpan_dev.lowpan_dev->dev_addr, dev->dev_addr, + dev->addr_len); + return mac802154_wpan_update_llsec(dev); } -- cgit v1.2.3 From 91f02b3dd8311b48c021e8667a84dfc2d6445a03 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 10 Aug 2015 21:15:55 +0200 Subject: mac802154: cfg: remove test and set checks This patch removes several checks if a value is really changed. This makes only sense if we have another layer call e.g. calling the driver_ops which is done by callbacks like "set_channel". For MAC settings which need to be set by phy registers (if the phy supports that handling) this is set by doing an interface up currently and are not direct driver_ops calls, so we remove the checks from these configuration callbacks. Reviewed-by: Stefan Schmidt Suggested-by: Phoebe Buckheister Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/cfg.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'net') diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index f7ba51e8b4ca..cecfcda09aac 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -209,10 +209,6 @@ ieee802154_set_backoff_exponent(struct wpan_phy *wpan_phy, { ASSERT_RTNL(); - if (wpan_dev->min_be == min_be && - wpan_dev->max_be == max_be) - return 0; - wpan_dev->min_be = min_be; wpan_dev->max_be = max_be; return 0; @@ -224,9 +220,6 @@ ieee802154_set_short_addr(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, { ASSERT_RTNL(); - if (wpan_dev->short_addr == short_addr) - return 0; - wpan_dev->short_addr = short_addr; return 0; } @@ -238,9 +231,6 @@ ieee802154_set_max_csma_backoffs(struct wpan_phy *wpan_phy, { ASSERT_RTNL(); - if (wpan_dev->csma_retries == max_csma_backoffs) - return 0; - wpan_dev->csma_retries = max_csma_backoffs; return 0; } @@ -252,9 +242,6 @@ ieee802154_set_max_frame_retries(struct wpan_phy *wpan_phy, { ASSERT_RTNL(); - if (wpan_dev->frame_retries == max_frame_retries) - return 0; - wpan_dev->frame_retries = max_frame_retries; return 0; } @@ -265,9 +252,6 @@ ieee802154_set_lbt_mode(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, { ASSERT_RTNL(); - if (wpan_dev->lbt == mode) - return 0; - wpan_dev->lbt = mode; return 0; } -- cgit v1.2.3 From 89c7d788f89d58136a2e5596796c298942ee32d6 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 10 Aug 2015 21:15:56 +0200 Subject: mac802154: change frame_retries behaviour This patch changes the default minimum value of frame_retries to 0 and changes the frame_retries default value to 3 which is also 802.15.4 default. We don't use the frame_retries "-1" value as indicator for no-aret mode anymore, instead we checking on the ack request bit inside the 802.15.4 frame control field. This allows a acknowledge handling per frame. This checking is done by transceiver or inside xmit callback of driver layer. If a transceiver doesn't support ARET handling the transmit functionality ignores ack frames then, which isn't well but should not effect anything of current functionality. Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/mac802154/iface.c | 3 +-- net/mac802154/main.c | 9 +++------ 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index ff99055631f9..ed26952f9e14 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -498,8 +498,7 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, wpan_dev->min_be = 3; wpan_dev->max_be = 5; wpan_dev->csma_retries = 4; - /* for compatibility, actual default is 3 */ - wpan_dev->frame_retries = -1; + wpan_dev->frame_retries = 3; wpan_dev->pan_id = cpu_to_le16(IEEE802154_PANID_BROADCAST); wpan_dev->short_addr = cpu_to_le16(IEEE802154_ADDR_BROADCAST); diff --git a/net/mac802154/main.c b/net/mac802154/main.c index 9e55431b9a5c..e8cab5bb80c6 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -111,7 +111,7 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops) phy->supported.max_minbe = 8; phy->supported.min_maxbe = 3; phy->supported.max_maxbe = 8; - phy->supported.min_frame_retries = -1; + phy->supported.min_frame_retries = 0; phy->supported.max_frame_retries = 7; phy->supported.max_csma_backoffs = 5; phy->supported.lbt = NL802154_SUPPORTED_BOOL_FALSE; @@ -177,11 +177,8 @@ int ieee802154_register_hw(struct ieee802154_hw *hw) } if (!(hw->flags & IEEE802154_HW_FRAME_RETRIES)) { - /* TODO should be 3, but our default value is -1 which means - * no ARET handling. - */ - local->phy->supported.min_frame_retries = -1; - local->phy->supported.max_frame_retries = -1; + local->phy->supported.min_frame_retries = 3; + local->phy->supported.max_frame_retries = 3; } if (hw->flags & IEEE802154_HW_PROMISCUOUS) -- cgit v1.2.3 From c91208d819c814e7f418c7a083059cf533ad0396 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 10 Aug 2015 21:15:58 +0200 Subject: ieee802154: add ack request default handling This patch introduce a new mib entry which isn't part of 802.15.4 but useful as default behaviour to set the ack request bit or not if we don't know if the ack request bit should set. This is currently used for stacks like IEEE 802.15.4 6LoWPAN. Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/cfg802154.h | 5 +++++ include/net/nl802154.h | 4 ++++ net/ieee802154/6lowpan/tx.c | 2 +- net/ieee802154/nl802154.c | 33 +++++++++++++++++++++++++++++++++ net/ieee802154/rdev-ops.h | 13 +++++++++++++ net/ieee802154/trace.h | 19 +++++++++++++++++++ net/mac802154/cfg.c | 11 +++++++++++ 7 files changed, 86 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index e53b6bfda976..76b1ffaea863 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -63,6 +63,8 @@ struct cfg802154_ops { s8 max_frame_retries); int (*set_lbt_mode)(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, bool mode); + int (*set_ackreq_default)(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, bool ackreq); }; static inline bool @@ -196,6 +198,9 @@ struct wpan_dev { bool lbt; bool promiscuous_mode; + + /* fallback for acknowledgment bit setting */ + bool ackreq; }; #define to_phy(_dev) container_of(_dev, struct wpan_phy, dev) diff --git a/include/net/nl802154.h b/include/net/nl802154.h index b0ab530d28cd..cf2713d8b975 100644 --- a/include/net/nl802154.h +++ b/include/net/nl802154.h @@ -52,6 +52,8 @@ enum nl802154_commands { NL802154_CMD_SET_LBT_MODE, + NL802154_CMD_SET_ACKREQ_DEFAULT, + /* add new commands above here */ /* used to define NL802154_CMD_MAX below */ @@ -104,6 +106,8 @@ enum nl802154_attrs { NL802154_ATTR_SUPPORTED_COMMANDS, + NL802154_ATTR_ACKREQ_DEFAULT, + /* add attributes here, update the policy in nl802154.c */ __NL802154_ATTR_AFTER_LAST, diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index 2597abbf7f4b..1bf4a304b5c4 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -224,7 +224,7 @@ static int lowpan_header(struct sk_buff *skb, struct net_device *dev) } else { da.mode = IEEE802154_ADDR_LONG; da.extended_addr = ieee802154_devaddr_from_raw(daddr); - cb->ackreq = wpan_dev->frame_retries >= 0; + cb->ackreq = wpan_dev->ackreq; } return dev_hard_header(skb, lowpan_dev_info(dev)->real_dev, diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 68f24016860c..1b00a14850cb 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -230,6 +230,8 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { [NL802154_ATTR_WPAN_PHY_CAPS] = { .type = NLA_NESTED }, [NL802154_ATTR_SUPPORTED_COMMANDS] = { .type = NLA_NESTED }, + + [NL802154_ATTR_ACKREQ_DEFAULT] = { .type = NLA_U8 }, }; /* message building helper */ @@ -458,6 +460,7 @@ static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev, CMD(set_max_csma_backoffs, SET_MAX_CSMA_BACKOFFS); CMD(set_max_frame_retries, SET_MAX_FRAME_RETRIES); CMD(set_lbt_mode, SET_LBT_MODE); + CMD(set_ackreq_default, SET_ACKREQ_DEFAULT); if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) CMD(set_tx_power, SET_TX_POWER); @@ -656,6 +659,10 @@ nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags, if (nla_put_u8(msg, NL802154_ATTR_LBT_MODE, wpan_dev->lbt)) goto nla_put_failure; + /* ackreq default behaviour */ + if (nla_put_u8(msg, NL802154_ATTR_ACKREQ_DEFAULT, wpan_dev->ackreq)) + goto nla_put_failure; + genlmsg_end(msg, hdr); return 0; @@ -1042,6 +1049,24 @@ static int nl802154_set_lbt_mode(struct sk_buff *skb, struct genl_info *info) return rdev_set_lbt_mode(rdev, wpan_dev, mode); } +static int +nl802154_set_ackreq_default(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg802154_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wpan_dev *wpan_dev = dev->ieee802154_ptr; + bool ackreq; + + if (netif_running(dev)) + return -EBUSY; + + if (!info->attrs[NL802154_ATTR_ACKREQ_DEFAULT]) + return -EINVAL; + + ackreq = !!nla_get_u8(info->attrs[NL802154_ATTR_ACKREQ_DEFAULT]); + return rdev_set_ackreq_default(rdev, wpan_dev, ackreq); +} + #define NL802154_FLAG_NEED_WPAN_PHY 0x01 #define NL802154_FLAG_NEED_NETDEV 0x02 #define NL802154_FLAG_NEED_RTNL 0x04 @@ -1248,6 +1273,14 @@ static const struct genl_ops nl802154_ops[] = { .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, + { + .cmd = NL802154_CMD_SET_ACKREQ_DEFAULT, + .doit = nl802154_set_ackreq_default, + .policy = nl802154_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = NL802154_FLAG_NEED_NETDEV | + NL802154_FLAG_NEED_RTNL, + }, }; /* initialisation/exit functions */ diff --git a/net/ieee802154/rdev-ops.h b/net/ieee802154/rdev-ops.h index 8d5960a37195..03b357501cc5 100644 --- a/net/ieee802154/rdev-ops.h +++ b/net/ieee802154/rdev-ops.h @@ -195,4 +195,17 @@ rdev_set_lbt_mode(struct cfg802154_registered_device *rdev, return ret; } +static inline int +rdev_set_ackreq_default(struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, bool ackreq) +{ + int ret; + + trace_802154_rdev_set_ackreq_default(&rdev->wpan_phy, wpan_dev, + ackreq); + ret = rdev->ops->set_ackreq_default(&rdev->wpan_phy, wpan_dev, ackreq); + trace_802154_rdev_return_int(&rdev->wpan_phy, ret); + return ret; +} + #endif /* __CFG802154_RDEV_OPS */ diff --git a/net/ieee802154/trace.h b/net/ieee802154/trace.h index 4399b7fbaa31..9a471e41ec73 100644 --- a/net/ieee802154/trace.h +++ b/net/ieee802154/trace.h @@ -275,6 +275,25 @@ TRACE_EVENT(802154_rdev_set_lbt_mode, WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->mode)) ); +TRACE_EVENT(802154_rdev_set_ackreq_default, + TP_PROTO(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + bool ackreq), + TP_ARGS(wpan_phy, wpan_dev, ackreq), + TP_STRUCT__entry( + WPAN_PHY_ENTRY + WPAN_DEV_ENTRY + __field(bool, ackreq) + ), + TP_fast_assign( + WPAN_PHY_ASSIGN; + WPAN_DEV_ASSIGN; + __entry->ackreq = ackreq; + ), + TP_printk(WPAN_PHY_PR_FMT ", " WPAN_DEV_PR_FMT + ", ackreq default: %s", WPAN_PHY_PR_ARG, + WPAN_DEV_PR_ARG, BOOL_TO_STR(__entry->ackreq)) +); + TRACE_EVENT(802154_rdev_return_int, TP_PROTO(struct wpan_phy *wpan_phy, int ret), TP_ARGS(wpan_phy, ret), diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index cecfcda09aac..c865ebb2ace2 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -256,6 +256,16 @@ ieee802154_set_lbt_mode(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, return 0; } +static int +ieee802154_set_ackreq_default(struct wpan_phy *wpan_phy, + struct wpan_dev *wpan_dev, bool ackreq) +{ + ASSERT_RTNL(); + + wpan_dev->ackreq = ackreq; + return 0; +} + const struct cfg802154_ops mac802154_config_ops = { .add_virtual_intf_deprecated = ieee802154_add_iface_deprecated, .del_virtual_intf_deprecated = ieee802154_del_iface_deprecated, @@ -273,4 +283,5 @@ const struct cfg802154_ops mac802154_config_ops = { .set_max_csma_backoffs = ieee802154_set_max_csma_backoffs, .set_max_frame_retries = ieee802154_set_max_frame_retries, .set_lbt_mode = ieee802154_set_lbt_mode, + .set_ackreq_default = ieee802154_set_ackreq_default, }; -- cgit v1.2.3 From 8f8db91840747e95df231f67dfa09eafeb4cc6d0 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 10 Aug 2015 21:15:59 +0200 Subject: ieee802154: 6lowpan: fix error frag handling This patch fixes the error handling for lowpan_xmit_fragment by replace "-PTR_ERR" to "PTR_ERR". PTR_ERR returns already a negative errno code. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/6lowpan/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/tx.c b/net/ieee802154/6lowpan/tx.c index 1bf4a304b5c4..f6263fc12340 100644 --- a/net/ieee802154/6lowpan/tx.c +++ b/net/ieee802154/6lowpan/tx.c @@ -112,7 +112,7 @@ lowpan_xmit_fragment(struct sk_buff *skb, const struct ieee802154_hdr *wpan_hdr, frag = lowpan_alloc_frag(skb, frag_hdrlen + len, wpan_hdr); if (IS_ERR(frag)) - return -PTR_ERR(frag); + return PTR_ERR(frag); memcpy(skb_put(frag, frag_hdrlen), frag_hdr, frag_hdrlen); memcpy(skb_put(frag, len), skb_network_header(skb) + offset, len); -- cgit v1.2.3 From e7d9ab731ac7babaf2e1b7b5e2280f5f555d263f Mon Sep 17 00:00:00 2001 From: Jakub Pawlowski Date: Fri, 7 Aug 2015 20:22:52 +0200 Subject: Bluetooth: add hci_lookup_le_connect This patch adds hci_lookup_le_connect method, that will be used to check wether outgoing le connection attempt is in progress. Signed-off-by: Jakub Pawlowski Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 20 ++++++++++++++++++++ net/bluetooth/hci_conn.c | 5 ++--- net/bluetooth/hci_event.c | 4 ++-- net/bluetooth/hci_request.c | 6 ++---- net/bluetooth/mgmt.c | 2 +- 5 files changed, 27 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index c8d2b5a89d08..f0a9fc1d06e0 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -811,6 +811,26 @@ static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev, return NULL; } +static inline struct hci_conn *hci_lookup_le_connect(struct hci_dev *hdev) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct hci_conn *c; + + rcu_read_lock(); + + list_for_each_entry_rcu(c, &h->list, list) { + if (c->type == LE_LINK && c->state == BT_CONNECT && + !test_bit(HCI_CONN_SCANNING, &c->flags)) { + rcu_read_unlock(); + return c; + } + } + + rcu_read_unlock(); + + return NULL; +} + int hci_disconnect(struct hci_conn *conn, __u8 reason); bool hci_setup_sync(struct hci_conn *conn, __u16 handle); void hci_sco_setup(struct hci_conn *conn, __u8 status); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 2c48bf0b5afb..0b4d919c8d96 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -645,7 +645,7 @@ static void create_le_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode) hci_dev_lock(hdev); - conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); + conn = hci_lookup_le_connect(hdev); if (!conn) goto done; @@ -759,8 +759,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, /* Since the controller supports only one LE connection attempt at a * time, we return -EBUSY if there is any connection attempt running. */ - conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); - if (conn) + if (hci_lookup_le_connect(hdev)) return ERR_PTR(-EBUSY); /* When given an identity address with existing identity diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 218d7dfc342f..128c5b70ee5e 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1059,7 +1059,7 @@ static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_set_flag(hdev, HCI_LE_ADV); - conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); + conn = hci_lookup_le_connect(hdev); if (conn) queue_delayed_work(hdev->workqueue, &conn->le_conn_timeout, @@ -4447,7 +4447,7 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) */ hci_dev_clear_flag(hdev, HCI_LE_ADV); - conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); + conn = hci_lookup_le_connect(hdev); if (!conn) { conn = hci_conn_add(hdev, LE_LINK, &ev->bdaddr, ev->role); if (!conn) { diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index d6025d6e6d59..b7369220c9ef 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -317,7 +317,7 @@ static void set_random_addr(struct hci_request *req, bdaddr_t *rpa) * address be updated at the next cycle. */ if (hci_dev_test_flag(hdev, HCI_LE_ADV) || - hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT)) { + hci_lookup_le_connect(hdev)) { BT_DBG("Deferring random address update"); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); return; @@ -479,7 +479,6 @@ void hci_update_page_scan(struct hci_dev *hdev) void __hci_update_background_scan(struct hci_request *req) { struct hci_dev *hdev = req->hdev; - struct hci_conn *conn; if (!test_bit(HCI_UP, &hdev->flags) || test_bit(HCI_INIT, &hdev->flags) || @@ -529,8 +528,7 @@ void __hci_update_background_scan(struct hci_request *req) * since some controllers are not able to scan and connect at * the same time. */ - conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); - if (conn) + if (hci_lookup_le_connect(hdev)) return; /* If controller is currently scanning, we stop it to ensure we diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 7ab191589541..35418bbe6b15 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4210,7 +4210,7 @@ static bool trigger_le_scan(struct hci_request *req, u16 interval, u8 *status) /* Don't let discovery abort an outgoing connection attempt * that's using directed advertising. */ - if (hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT)) { + if (hci_lookup_le_connect(hdev)) { *status = MGMT_STATUS_REJECTED; return false; } -- cgit v1.2.3 From f75113a26008980ca13834fb6573145523596776 Mon Sep 17 00:00:00 2001 From: Jakub Pawlowski Date: Fri, 7 Aug 2015 20:22:53 +0200 Subject: Bluetooth: add hci_connect_le_scan Currently, when trying to connect to already paired device that just rotated its RPA MAC address, old address would be used and connection would fail. In order to fix that, kernel must scan and receive advertisement with fresh RPA before connecting. This patch adds hci_connect_le_scan with dependencies, new method that will be used to connect to remote LE devices. Instead of just sending connect request, it adds a device to whitelist. Later patches will make use of this whitelist to send conenct request when advertisement is received, and properly handle timeouts. Signed-off-by: Jakub Pawlowski Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 6 ++ net/bluetooth/hci_conn.c | 174 +++++++++++++++++++++++++++++++++++++++ net/bluetooth/hci_core.c | 33 ++++++++ 3 files changed, 213 insertions(+) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index f0a9fc1d06e0..9e1a59e01fa2 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -846,6 +846,9 @@ void hci_chan_del(struct hci_chan *chan); void hci_chan_list_flush(struct hci_conn *conn); struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle); +struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, + u8 dst_type, u8 sec_level, + u16 conn_timeout, u8 role); struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, u8 sec_level, u16 conn_timeout, u8 role); @@ -1011,6 +1014,9 @@ void hci_conn_params_clear_disabled(struct hci_dev *hdev); struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, bdaddr_t *addr, u8 addr_type); +struct hci_conn_params *hci_explicit_connect_lookup(struct hci_dev *hdev, + bdaddr_t *addr, + u8 addr_type); void hci_uuids_clear(struct hci_dev *hdev); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 0b4d919c8d96..534feb7956a3 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -64,6 +64,48 @@ static void hci_le_create_connection_cancel(struct hci_conn *conn) hci_send_cmd(conn->hdev, HCI_OP_LE_CREATE_CONN_CANCEL, 0, NULL); } +/* This function requires the caller holds hdev->lock */ +static void hci_connect_le_scan_cleanup(struct hci_conn *conn) +{ + struct hci_conn_params *params; + struct smp_irk *irk; + bdaddr_t *bdaddr; + u8 bdaddr_type; + + bdaddr = &conn->dst; + bdaddr_type = conn->dst_type; + + /* Check if we need to convert to identity address */ + irk = hci_get_irk(conn->hdev, bdaddr, bdaddr_type); + if (irk) { + bdaddr = &irk->bdaddr; + bdaddr_type = irk->addr_type; + } + + params = hci_explicit_connect_lookup(conn->hdev, bdaddr, bdaddr_type); + if (!params) + return; + + /* The connection attempt was doing scan for new RPA, and is + * in scan phase. If params are not associated with any other + * autoconnect action, remove them completely. If they are, just unmark + * them as waiting for connection, by clearing explicit_connect field. + */ + if (params->auto_connect == HCI_AUTO_CONN_EXPLICIT) + hci_conn_params_del(conn->hdev, bdaddr, bdaddr_type); + else + params->explicit_connect = false; +} + +/* This function requires the caller holds hdev->lock */ +static void hci_connect_le_scan_remove(struct hci_conn *conn) +{ + hci_connect_le_scan_cleanup(conn); + + hci_conn_hash_del(conn->hdev, conn); + hci_update_background_scan(conn->hdev); +} + static void hci_acl_create_connection(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; @@ -858,6 +900,138 @@ done: return conn; } +static void hci_connect_le_scan_complete(struct hci_dev *hdev, u8 status, + u16 opcode) +{ + struct hci_conn *conn; + + if (!status) + return; + + BT_ERR("Failed to add device to auto conn whitelist: status 0x%2.2x", + status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT); + if (conn) + hci_le_conn_failed(conn, status); + + hci_dev_unlock(hdev); +} + +static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type) +{ + struct hci_conn *conn; + + conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, addr); + if (!conn) + return false; + + if (conn->dst_type != type) + return false; + + if (conn->state != BT_CONNECTED) + return false; + + return true; +} + +/* This function requires the caller holds hdev->lock */ +static int hci_explicit_conn_params_set(struct hci_request *req, + bdaddr_t *addr, u8 addr_type) +{ + struct hci_dev *hdev = req->hdev; + struct hci_conn_params *params; + + if (is_connected(hdev, addr, addr_type)) + return -EISCONN; + + params = hci_conn_params_add(hdev, addr, addr_type); + if (!params) + return -EIO; + + /* If we created new params, or existing params were marked as disabled, + * mark them to be used just once to connect. + */ + if (params->auto_connect == HCI_AUTO_CONN_DISABLED) { + params->auto_connect = HCI_AUTO_CONN_EXPLICIT; + list_del_init(¶ms->action); + list_add(¶ms->action, &hdev->pend_le_conns); + } + + params->explicit_connect = true; + __hci_update_background_scan(req); + + BT_DBG("addr %pMR (type %u) auto_connect %u", addr, addr_type, + params->auto_connect); + + return 0; +} + +/* This function requires the caller holds hdev->lock */ +struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, + u8 dst_type, u8 sec_level, + u16 conn_timeout, u8 role) +{ + struct hci_conn *conn; + struct hci_request req; + int err; + + /* Let's make sure that le is enabled.*/ + if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { + if (lmp_le_capable(hdev)) + return ERR_PTR(-ECONNREFUSED); + + return ERR_PTR(-EOPNOTSUPP); + } + + /* Some devices send ATT messages as soon as the physical link is + * established. To be able to handle these ATT messages, the user- + * space first establishes the connection and then starts the pairing + * process. + * + * So if a hci_conn object already exists for the following connection + * attempt, we simply update pending_sec_level and auth_type fields + * and return the object found. + */ + conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, dst); + if (conn) { + if (conn->pending_sec_level < sec_level) + conn->pending_sec_level = sec_level; + goto done; + } + + BT_DBG("requesting refresh of dst_addr"); + + conn = hci_conn_add(hdev, LE_LINK, dst, role); + if (!conn) + return ERR_PTR(-ENOMEM); + + hci_req_init(&req, hdev); + + if (hci_explicit_conn_params_set(&req, dst, dst_type) < 0) + return ERR_PTR(-EBUSY); + + conn->state = BT_CONNECT; + set_bit(HCI_CONN_SCANNING, &conn->flags); + + err = hci_req_run(&req, hci_connect_le_scan_complete); + if (err && err != -ENODATA) { + hci_conn_del(conn); + return ERR_PTR(err); + } + + conn->dst_type = dst_type; + conn->sec_level = BT_SECURITY_LOW; + conn->pending_sec_level = sec_level; + conn->conn_timeout = conn_timeout; + +done: + hci_conn_hold(conn); + return conn; +} + struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, u8 sec_level, u8 auth_type) { diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index bc43b6490555..adcbc74c2432 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2847,6 +2847,30 @@ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, return NULL; } +/* This function requires the caller holds hdev->lock */ +struct hci_conn_params *hci_explicit_connect_lookup(struct hci_dev *hdev, + bdaddr_t *addr, + u8 addr_type) +{ + struct hci_conn_params *param; + + list_for_each_entry(param, &hdev->pend_le_conns, action) { + if (bacmp(¶m->addr, addr) == 0 && + param->addr_type == addr_type && + param->explicit_connect) + return param; + } + + list_for_each_entry(param, &hdev->pend_le_reports, action) { + if (bacmp(¶m->addr, addr) == 0 && + param->addr_type == addr_type && + param->explicit_connect) + return param; + } + + return NULL; +} + /* This function requires the caller holds hdev->lock */ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) @@ -2916,6 +2940,15 @@ void hci_conn_params_clear_disabled(struct hci_dev *hdev) list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) { if (params->auto_connect != HCI_AUTO_CONN_DISABLED) continue; + + /* If trying to estabilish one time connection to disabled + * device, leave the params, but mark them as just once. + */ + if (params->explicit_connect) { + params->auto_connect = HCI_AUTO_CONN_EXPLICIT; + continue; + } + list_del(¶ms->list); kfree(params); } -- cgit v1.2.3 From 28a667c9c279df5a6467842ee2b3b73ddf874732 Mon Sep 17 00:00:00 2001 From: Jakub Pawlowski Date: Fri, 7 Aug 2015 20:22:54 +0200 Subject: Bluetooth: advertisement handling in new connect procedure Currently, when trying to connect to already paired device that just rotated its RPA MAC address, old address would be used and connection would fail. In order to fix that, kernel must scan and receive advertisement with fresh RPA before connecting. This path makes sure that after advertisement is received from device that we try to connect to, it is properly handled in check_pending_le_conn and trigger connect attempt. It also modifies hci_le_connect to make sure that connect attempt will be properly continued. Signed-off-by: Jakub Pawlowski Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 52 +++++++++++++++++++++++++++++++++-------------- net/bluetooth/hci_event.c | 51 ++++++++++++++++++++++++++-------------------- net/bluetooth/mgmt.c | 6 ++++++ 3 files changed, 72 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 534feb7956a3..85c6aa5d5bbc 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -679,15 +679,18 @@ static void create_le_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode) { struct hci_conn *conn; - if (status == 0) - return; + hci_dev_lock(hdev); + + conn = hci_lookup_le_connect(hdev); + + if (!status) { + hci_connect_le_scan_cleanup(conn); + goto done; + } BT_ERR("HCI request failed to create LE connection: status 0x%2.2x", status); - hci_dev_lock(hdev); - - conn = hci_lookup_le_connect(hdev); if (!conn) goto done; @@ -727,6 +730,7 @@ static void hci_req_add_le_create_conn(struct hci_request *req, hci_req_add(req, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp); conn->state = BT_CONNECT; + clear_bit(HCI_CONN_SCANNING, &conn->flags); } static void hci_req_directed_advertising(struct hci_request *req, @@ -770,7 +774,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u8 role) { struct hci_conn_params *params; - struct hci_conn *conn; + struct hci_conn *conn, *conn_unfinished; struct smp_irk *irk; struct hci_request req; int err; @@ -793,9 +797,17 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, * and return the object found. */ conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, dst); + conn_unfinished = NULL; if (conn) { - conn->pending_sec_level = sec_level; - goto done; + if (conn->state == BT_CONNECT && + test_bit(HCI_CONN_SCANNING, &conn->flags)) { + BT_DBG("will continue unfinished conn %pMR", dst); + conn_unfinished = conn; + } else { + if (conn->pending_sec_level < sec_level) + conn->pending_sec_level = sec_level; + goto done; + } } /* Since the controller supports only one LE connection attempt at a @@ -808,10 +820,6 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, * resolving key, the connection needs to be established * to a resolvable random address. * - * This uses the cached random resolvable address from - * a previous scan. When no cached address is available, - * try connecting to the identity address instead. - * * Storing the resolvable random address is required here * to handle connection failures. The address will later * be resolved back into the original identity address @@ -823,15 +831,23 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, dst_type = ADDR_LE_DEV_RANDOM; } - conn = hci_conn_add(hdev, LE_LINK, dst, role); + if (conn_unfinished) { + conn = conn_unfinished; + bacpy(&conn->dst, dst); + } else { + conn = hci_conn_add(hdev, LE_LINK, dst, role); + } + if (!conn) return ERR_PTR(-ENOMEM); conn->dst_type = dst_type; conn->sec_level = BT_SECURITY_LOW; - conn->pending_sec_level = sec_level; conn->conn_timeout = conn_timeout; + if (!conn_unfinished) + conn->pending_sec_level = sec_level; + hci_req_init(&req, hdev); /* Disable advertising if we're active. For master role @@ -896,7 +912,13 @@ create_conn: } done: - hci_conn_hold(conn); + /* If this is continuation of connect started by hci_connect_le_scan, + * it already called hci_conn_hold and calling it again would mess the + * counter. + */ + if (!conn_unfinished) + hci_conn_hold(conn); + return conn; } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 128c5b70ee5e..7ba35a9ba6b7 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4640,42 +4640,49 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, /* If we're not connectable only connect devices that we have in * our pend_le_conns list. */ - params = hci_pend_le_action_lookup(&hdev->pend_le_conns, - addr, addr_type); + params = hci_explicit_connect_lookup(hdev, addr, addr_type); + if (!params) return NULL; - switch (params->auto_connect) { - case HCI_AUTO_CONN_DIRECT: - /* Only devices advertising with ADV_DIRECT_IND are - * triggering a connection attempt. This is allowing - * incoming connections from slave devices. - */ - if (adv_type != LE_ADV_DIRECT_IND) + if (!params->explicit_connect) { + switch (params->auto_connect) { + case HCI_AUTO_CONN_DIRECT: + /* Only devices advertising with ADV_DIRECT_IND are + * triggering a connection attempt. This is allowing + * incoming connections from slave devices. + */ + if (adv_type != LE_ADV_DIRECT_IND) + return NULL; + break; + case HCI_AUTO_CONN_ALWAYS: + /* Devices advertising with ADV_IND or ADV_DIRECT_IND + * are triggering a connection attempt. This means + * that incoming connectioms from slave device are + * accepted and also outgoing connections to slave + * devices are established when found. + */ + break; + default: return NULL; - break; - case HCI_AUTO_CONN_ALWAYS: - /* Devices advertising with ADV_IND or ADV_DIRECT_IND - * are triggering a connection attempt. This means - * that incoming connectioms from slave device are - * accepted and also outgoing connections to slave - * devices are established when found. - */ - break; - default: - return NULL; + } } conn = hci_connect_le(hdev, addr, addr_type, BT_SECURITY_LOW, HCI_LE_AUTOCONN_TIMEOUT, HCI_ROLE_MASTER); if (!IS_ERR(conn)) { - /* Store the pointer since we don't really have any + /* If HCI_AUTO_CONN_EXPLICIT is set, conn is already owned + * by higher layer that tried to connect, if no then + * store the pointer since we don't really have any * other owner of the object besides the params that * triggered it. This way we can abort the connection if * the parameters get removed and keep the reference * count consistent once the connection is established. */ - params->conn = hci_conn_get(conn); + + if (!params->explicit_connect) + params->conn = hci_conn_get(conn); + return conn; } diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 35418bbe6b15..5f5d7851f5bf 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -6107,6 +6107,12 @@ static int hci_conn_params_set(struct hci_request *req, bdaddr_t *addr, switch (auto_connect) { case HCI_AUTO_CONN_DISABLED: case HCI_AUTO_CONN_LINK_LOSS: + /* If auto connect is being disabled when we're trying to + * connect to device, keep connecting. + */ + if (params->explicit_connect) + list_add(¶ms->action, &hdev->pend_le_conns); + __hci_update_background_scan(req); break; case HCI_AUTO_CONN_REPORT: -- cgit v1.2.3 From cc2b6911a205b327b29c1d771925f8ab2f80295f Mon Sep 17 00:00:00 2001 From: Jakub Pawlowski Date: Fri, 7 Aug 2015 20:22:55 +0200 Subject: Bluetooth: timeout handling in new connect procedure Currently, when trying to connect to already paired device that just rotated its RPA MAC address, old address would be used and connection would fail. In order to fix that, kernel must scan and receive advertisement with fresh RPA before connecting. This patch makes sure that when new procedure is in use, and we're stuck in scan phase because no advertisement was received and timeout happened, or app decided to close socket, scan whitelist gets properly cleaned up. Signed-off-by: Jakub Pawlowski Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 85c6aa5d5bbc..b4548c739a64 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -382,8 +382,12 @@ static void hci_conn_timeout(struct work_struct *work) if (conn->out) { if (conn->type == ACL_LINK) hci_acl_create_connection_cancel(conn); - else if (conn->type == LE_LINK) - hci_le_create_connection_cancel(conn); + else if (conn->type == LE_LINK) { + if (test_bit(HCI_CONN_SCANNING, &conn->flags)) + hci_connect_le_scan_remove(conn); + else + hci_le_create_connection_cancel(conn); + } } else if (conn->type == SCO_LINK || conn->type == ESCO_LINK) { hci_reject_sco(conn); } -- cgit v1.2.3 From fa1422207756833096b54356a539e3a7d7edec4f Mon Sep 17 00:00:00 2001 From: Jakub Pawlowski Date: Fri, 7 Aug 2015 20:22:56 +0200 Subject: Bluetooth: Enable new connection establishment procedure. Currently, when trying to connect to already paired device that just rotated its RPA MAC address, old address would be used and connection would fail. In order to fix that, kernel must scan and receive advertisement with fresh RPA before connecting. This patch enables new connection establishment procedure. Instead of just sending HCI_OP_LE_CREATE_CONN to controller, "connect" will add device to kernel whitelist and start scan. If advertisement is received, it'll be compared against whitelist and then trigger connection if it matches. That fixes mentioned reconnect issue for already paired devices. It also make whole connection procedure more robust. We can try to connect to multiple devices at same time now, even though controller allow only one. Signed-off-by: Jakub Pawlowski Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap_core.c | 6 ++++-- net/bluetooth/mgmt.c | 7 ++++--- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 45fffa413642..7c65ee200c29 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -7113,8 +7113,10 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, else role = HCI_ROLE_MASTER; - hcon = hci_connect_le(hdev, dst, dst_type, chan->sec_level, - HCI_LE_CONN_TIMEOUT, role); + hcon = hci_connect_le_scan(hdev, dst, dst_type, + chan->sec_level, + HCI_LE_CONN_TIMEOUT, + role); } else { u8 auth_type = l2cap_get_auth_type(chan); hcon = hci_connect_acl(hdev, dst, chan->sec_level, auth_type); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 5f5d7851f5bf..3a15f0964fab 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3564,9 +3564,10 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data, */ hci_conn_params_add(hdev, &cp->addr.bdaddr, addr_type); - conn = hci_connect_le(hdev, &cp->addr.bdaddr, addr_type, - sec_level, HCI_LE_CONN_TIMEOUT, - HCI_ROLE_MASTER); + conn = hci_connect_le_scan(hdev, &cp->addr.bdaddr, + addr_type, sec_level, + HCI_LE_CONN_TIMEOUT, + HCI_ROLE_MASTER); } if (IS_ERR(conn)) { -- cgit v1.2.3 From a7854037da006a7472c48773e3190db55217ec9b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 7 Aug 2015 19:40:45 +0300 Subject: bridge: netlink: add support for vlan_filtering attribute This patch adds the ability to toggle the vlan filtering support via netlink. Since we're already running with rtnl in .changelink() we don't need to take any additional locks. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_netlink.c | 14 +++++++++++++- net/bridge/br_private.h | 7 +++++++ net/bridge/br_vlan.c | 18 ++++++++++++------ 4 files changed, 33 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index f24ec99a2262..d450be36add2 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -230,6 +230,7 @@ enum { IFLA_BR_AGEING_TIME, IFLA_BR_STP_STATE, IFLA_BR_PRIORITY, + IFLA_BR_VLAN_FILTERING, __IFLA_BR_MAX, }; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 91a2e08c2bb8..6eb683d8e0c5 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -724,6 +724,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_AGEING_TIME] = { .type = NLA_U32 }, [IFLA_BR_STP_STATE] = { .type = NLA_U32 }, [IFLA_BR_PRIORITY] = { .type = NLA_U16 }, + [IFLA_BR_VLAN_FILTERING] = { .type = NLA_U8 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -771,6 +772,14 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], br_stp_set_bridge_priority(br, priority); } + if (data[IFLA_BR_VLAN_FILTERING]) { + u8 vlan_filter = nla_get_u8(data[IFLA_BR_VLAN_FILTERING]); + + err = __br_vlan_filter_toggle(br, vlan_filter); + if (err) + return err; + } + return 0; } @@ -782,6 +791,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u32)) + /* IFLA_BR_AGEING_TIME */ nla_total_size(sizeof(u32)) + /* IFLA_BR_STP_STATE */ nla_total_size(sizeof(u16)) + /* IFLA_BR_PRIORITY */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_FILTERING */ 0; } @@ -794,13 +804,15 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) u32 ageing_time = jiffies_to_clock_t(br->ageing_time); u32 stp_enabled = br->stp_enabled; u16 priority = (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]; + u8 vlan_enabled = br_vlan_enabled(br); if (nla_put_u32(skb, IFLA_BR_FORWARD_DELAY, forward_delay) || nla_put_u32(skb, IFLA_BR_HELLO_TIME, hello_time) || nla_put_u32(skb, IFLA_BR_MAX_AGE, age_time) || nla_put_u32(skb, IFLA_BR_AGEING_TIME, ageing_time) || nla_put_u32(skb, IFLA_BR_STP_STATE, stp_enabled) || - nla_put_u16(skb, IFLA_BR_PRIORITY, priority)) + nla_put_u16(skb, IFLA_BR_PRIORITY, priority) || + nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled)) return -EMSGSIZE; return 0; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index e2cb359f9dd3..3d95647039d0 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -614,6 +614,7 @@ int br_vlan_delete(struct net_bridge *br, u16 vid); void br_vlan_flush(struct net_bridge *br); bool br_vlan_find(struct net_bridge *br, u16 vid); void br_recalculate_fwd_mask(struct net_bridge *br); +int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int br_vlan_set_proto(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); @@ -771,6 +772,12 @@ static inline int br_vlan_enabled(struct net_bridge *br) { return 0; } + +static inline int __br_vlan_filter_toggle(struct net_bridge *br, + unsigned long val) +{ + return -EOPNOTSUPP; +} #endif struct nf_br_ops { diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 0d41f81838ff..3cef6892c0bb 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -468,21 +468,27 @@ void br_recalculate_fwd_mask(struct net_bridge *br) ~(1u << br->group_addr[5]); } -int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) +int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) { - if (!rtnl_trylock()) - return restart_syscall(); - if (br->vlan_enabled == val) - goto unlock; + return 0; br->vlan_enabled = val; br_manage_promisc(br); recalculate_group_addr(br); br_recalculate_fwd_mask(br); -unlock: + return 0; +} + +int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) +{ + if (!rtnl_trylock()) + return restart_syscall(); + + __br_vlan_filter_toggle(br, val); rtnl_unlock(); + return 0; } -- cgit v1.2.3 From fb811395cd5a71b9e94a068f524a6f4a21b67bdb Mon Sep 17 00:00:00 2001 From: Rick Jones Date: Fri, 7 Aug 2015 11:10:37 -0700 Subject: net: add explicit logging and stat for neighbour table overflow Add an explicit neighbour table overflow message (ratelimited) and statistic to make diagnosing neighbour table overflows tractable in the wild. Diagnosing a neighbour table overflow can be quite difficult in the wild because there is no explicit dmesg logged. Callers to neighbour code seem to use net_dbg_ratelimit when the neighbour call fails which means the "base message" is not emitted and the callback suppressed messages from the ratelimiting can end-up juxtaposed with unrelated messages. Further, a forced garbage collection will increment a stat on each call whether it was successful in freeing-up a table entry or not, so that statistic is only a hint. So, add a net_info_ratelimited message and explicit statistic to the neighbour code. Signed-off-by: Rick Jones Signed-off-by: David S. Miller --- include/net/neighbour.h | 1 + include/uapi/linux/neighbour.h | 1 + net/core/neighbour.c | 14 ++++++++++---- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index bd33e66f49aa..8b683841e574 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -125,6 +125,7 @@ struct neigh_statistics { unsigned long forced_gc_runs; /* number of forced GC runs */ unsigned long unres_discards; /* number of unresolved drops */ + unsigned long table_fulls; /* times even gc couldn't help */ }; #define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field) diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 2e35c61bbdd1..788655bfa0f3 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -106,6 +106,7 @@ struct ndt_stats { __u64 ndts_rcv_probes_ucast; __u64 ndts_periodic_gc_runs; __u64 ndts_forced_gc_runs; + __u64 ndts_table_fulls; }; enum { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 84195dacb8b6..2b515ba7e94f 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -274,8 +274,12 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device (entries >= tbl->gc_thresh2 && time_after(now, tbl->last_flush + 5 * HZ))) { if (!neigh_forced_gc(tbl) && - entries >= tbl->gc_thresh3) + entries >= tbl->gc_thresh3) { + net_info_ratelimited("%s: neighbor table overflow!\n", + tbl->id); + NEIGH_CACHE_STAT_INC(tbl, table_fulls); goto out_entries; + } } n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); @@ -1849,6 +1853,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, ndst.ndts_rcv_probes_ucast += st->rcv_probes_ucast; ndst.ndts_periodic_gc_runs += st->periodic_gc_runs; ndst.ndts_forced_gc_runs += st->forced_gc_runs; + ndst.ndts_table_fulls += st->table_fulls; } if (nla_put(skb, NDTA_STATS, sizeof(ndst), &ndst)) @@ -2717,12 +2722,12 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v) struct neigh_statistics *st = v; if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards\n"); + seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs unresolved_discards table_fulls\n"); return 0; } seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " - "%08lx %08lx %08lx %08lx %08lx\n", + "%08lx %08lx %08lx %08lx %08lx %08lx\n", atomic_read(&tbl->entries), st->allocs, @@ -2739,7 +2744,8 @@ static int neigh_stat_seq_show(struct seq_file *seq, void *v) st->periodic_gc_runs, st->forced_gc_runs, - st->unres_discards + st->unres_discards, + st->table_fulls ); return 0; -- cgit v1.2.3 From a9020fde67a6eb77f8130feff633189f99264db1 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 7 Aug 2015 23:51:33 -0700 Subject: openvswitch: Move tunnel destroy function to oppenvswitch module. This function will be used in gre and geneve vport implementations. Signed-off-by: Pravin B Shelar Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/openvswitch/vport-netdev.c | 21 ++++++++++++++++++--- net/openvswitch/vport-netdev.h | 2 +- net/openvswitch/vport-vxlan.c | 17 +---------------- 3 files changed, 20 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index cddb7069b11b..4b70aaa4a746 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -147,7 +147,7 @@ static struct vport *netdev_create(const struct vport_parms *parms) return ovs_netdev_link(vport, parms->name); } -void ovs_vport_free_rcu(struct rcu_head *rcu) +static void vport_netdev_free(struct rcu_head *rcu) { struct vport *vport = container_of(rcu, struct vport, rcu); @@ -155,7 +155,6 @@ void ovs_vport_free_rcu(struct rcu_head *rcu) dev_put(vport->dev); ovs_vport_free(vport); } -EXPORT_SYMBOL_GPL(ovs_vport_free_rcu); void ovs_netdev_detach_dev(struct vport *vport) { @@ -175,9 +174,25 @@ static void netdev_destroy(struct vport *vport) ovs_netdev_detach_dev(vport); rtnl_unlock(); - call_rcu(&vport->rcu, ovs_vport_free_rcu); + call_rcu(&vport->rcu, vport_netdev_free); } +void ovs_netdev_tunnel_destroy(struct vport *vport) +{ + rtnl_lock(); + if (vport->dev->priv_flags & IFF_OVS_DATAPATH) + ovs_netdev_detach_dev(vport); + + /* Early release so we can unregister the device */ + dev_put(vport->dev); + rtnl_delete_link(vport->dev); + vport->dev = NULL; + rtnl_unlock(); + + call_rcu(&vport->rcu, vport_netdev_free); +} +EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy); + static unsigned int packet_length(const struct sk_buff *skb) { unsigned int length = skb->len - ETH_HLEN; diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h index 804412697a90..497cc81f1aca 100644 --- a/net/openvswitch/vport-netdev.h +++ b/net/openvswitch/vport-netdev.h @@ -29,9 +29,9 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev); struct vport *ovs_netdev_link(struct vport *vport, const char *name); int ovs_netdev_send(struct vport *vport, struct sk_buff *skb); void ovs_netdev_detach_dev(struct vport *); -void ovs_vport_free_rcu(struct rcu_head *); int __init ovs_netdev_init(void); void ovs_netdev_exit(void); +void ovs_netdev_tunnel_destroy(struct vport *vport); #endif /* vport_netdev.h */ diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index c6e937e36f8b..1e8b00a23a23 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -146,21 +146,6 @@ static struct vport *vxlan_create(const struct vport_parms *parms) return ovs_netdev_link(vport, parms->name); } -static void vxlan_destroy(struct vport *vport) -{ - rtnl_lock(); - if (vport->dev->priv_flags & IFF_OVS_DATAPATH) - ovs_netdev_detach_dev(vport); - - /* Early release so we can unregister the device */ - dev_put(vport->dev); - rtnl_delete_link(vport->dev); - vport->dev = NULL; - rtnl_unlock(); - - call_rcu(&vport->rcu, ovs_vport_free_rcu); -} - static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, struct ip_tunnel_info *egress_tun_info) { @@ -183,7 +168,7 @@ static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, static struct vport_ops ovs_vxlan_netdev_vport_ops = { .type = OVS_VPORT_TYPE_VXLAN, .create = vxlan_create, - .destroy = vxlan_destroy, + .destroy = ovs_netdev_tunnel_destroy, .get_options = vxlan_get_options, .send = ovs_netdev_send, .get_egress_tun_info = vxlan_get_egress_tun_info, -- cgit v1.2.3 From 2e15ea390e6f4466655066d97e22ec66870a042c Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 7 Aug 2015 23:51:42 -0700 Subject: ip_gre: Add support to collect tunnel metadata. Following patch create new tunnel flag which enable tunnel metadata collection on given device. Signed-off-by: Pravin B Shelar Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ip_tunnels.h | 7 +- include/uapi/linux/if_tunnel.h | 1 + net/ipv4/ip_gre.c | 195 +++++++++++++++++++++++++++++++++++++---- net/ipv4/ip_tunnel.c | 37 ++++++-- net/ipv4/ipip.c | 2 +- net/ipv6/sit.c | 2 +- 6 files changed, 216 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 47984415f5d1..984dbfa15e13 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -82,6 +82,8 @@ struct ip_tunnel_dst { __be32 saddr; }; +struct metadata_dst; + struct ip_tunnel { struct ip_tunnel __rcu *next; struct hlist_node hash_node; @@ -115,6 +117,7 @@ struct ip_tunnel { unsigned int prl_count; /* # of entries in PRL */ int ip_tnl_net_id; struct gro_cells gro_cells; + bool collect_md; }; #define TUNNEL_CSUM __cpu_to_be16(0x01) @@ -149,6 +152,7 @@ struct tnl_ptk_info { struct ip_tunnel_net { struct net_device *fb_tunnel_dev; struct hlist_head tunnels[IP_TNL_HASH_SIZE]; + struct ip_tunnel __rcu *collect_md_tun; }; struct ip_tunnel_encap_ops { @@ -235,7 +239,8 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, __be32 key); int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, - const struct tnl_ptk_info *tpi, bool log_ecn_error); + const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, + bool log_ecn_error); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_parm *p); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index bd3cc11a431f..af4de90ba27d 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -112,6 +112,7 @@ enum { IFLA_GRE_ENCAP_FLAGS, IFLA_GRE_ENCAP_SPORT, IFLA_GRE_ENCAP_DPORT, + IFLA_GRE_COLLECT_METADATA, __IFLA_GRE_MAX, }; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 5fd706473c73..554a760c2cd0 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -200,9 +202,29 @@ static int ipgre_err(struct sk_buff *skb, u32 info, return PACKET_RCVD; } +static __be64 key_to_tunnel_id(__be32 key) +{ +#ifdef __BIG_ENDIAN + return (__force __be64)((__force u32)key); +#else + return (__force __be64)((__force u64)key << 32); +#endif +} + +/* Returns the least-significant 32 bits of a __be64. */ +static __be32 tunnel_id_to_key(__be64 x) +{ +#ifdef __BIG_ENDIAN + return (__force __be32)x; +#else + return (__force __be32)((__force u64)x >> 32); +#endif +} + static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) { struct net *net = dev_net(skb->dev); + struct metadata_dst *tun_dst = NULL; struct ip_tunnel_net *itn; const struct iphdr *iph; struct ip_tunnel *tunnel; @@ -218,40 +240,162 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) if (tunnel) { skb_pop_mac_header(skb); - ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error); + if (tunnel->collect_md) { + struct ip_tunnel_info *info; + + tun_dst = metadata_dst_alloc(0, GFP_ATOMIC); + if (!tun_dst) + return PACKET_REJECT; + + info = &tun_dst->u.tun_info; + info->key.ipv4_src = iph->saddr; + info->key.ipv4_dst = iph->daddr; + info->key.ipv4_tos = iph->tos; + info->key.ipv4_ttl = iph->ttl; + + info->mode = IP_TUNNEL_INFO_RX; + info->key.tun_flags = tpi->flags & + (TUNNEL_CSUM | TUNNEL_KEY); + info->key.tun_id = key_to_tunnel_id(tpi->key); + + info->key.tp_src = 0; + info->key.tp_dst = 0; + } + + ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); return PACKET_RCVD; } return PACKET_REJECT; } +static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, + __be16 proto, __be32 key, __be32 seq) +{ + struct gre_base_hdr *greh; + + skb_push(skb, hdr_len); + + skb_reset_transport_header(skb); + greh = (struct gre_base_hdr *)skb->data; + greh->flags = tnl_flags_to_gre_flags(flags); + greh->protocol = proto; + + if (flags & (TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_SEQ)) { + __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); + + if (flags & TUNNEL_SEQ) { + *ptr = seq; + ptr--; + } + if (flags & TUNNEL_KEY) { + *ptr = key; + ptr--; + } + if (flags & TUNNEL_CSUM && + !(skb_shinfo(skb)->gso_type & + (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) { + *ptr = 0; + *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, + skb->len, 0)); + } + } +} + static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, const struct iphdr *tnl_params, __be16 proto) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct tnl_ptk_info tpi; - tpi.flags = tunnel->parms.o_flags; - tpi.proto = proto; - tpi.key = tunnel->parms.o_key; if (tunnel->parms.o_flags & TUNNEL_SEQ) tunnel->o_seqno++; - tpi.seq = htonl(tunnel->o_seqno); /* Push GRE header. */ - gre_build_header(skb, &tpi, tunnel->tun_hlen); - - skb_set_inner_protocol(skb, tpi.proto); + build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags, + proto, tunnel->parms.o_key, htonl(tunnel->o_seqno)); + skb_set_inner_protocol(skb, proto); ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } +static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel_info *tun_info; + struct net *net = dev_net(dev); + const struct ip_tunnel_key *key; + struct flowi4 fl; + struct rtable *rt; + int min_headroom; + int tunnel_hlen; + __be16 df, flags; + int err; + + tun_info = skb_tunnel_info(skb, AF_INET); + if (unlikely(!tun_info || tun_info->mode != IP_TUNNEL_INFO_TX)) + goto err_free_skb; + + key = &tun_info->key; + memset(&fl, 0, sizeof(fl)); + fl.daddr = key->ipv4_dst; + fl.saddr = key->ipv4_src; + fl.flowi4_tos = RT_TOS(key->ipv4_tos); + fl.flowi4_mark = skb->mark; + fl.flowi4_proto = IPPROTO_GRE; + + rt = ip_route_output_key(net, &fl); + if (IS_ERR(rt)) + goto err_free_skb; + + tunnel_hlen = ip_gre_calc_hlen(key->tun_flags); + + min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + + tunnel_hlen + sizeof(struct iphdr); + if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { + int head_delta = SKB_DATA_ALIGN(min_headroom - + skb_headroom(skb) + + 16); + err = pskb_expand_head(skb, max_t(int, head_delta, 0), + 0, GFP_ATOMIC); + if (unlikely(err)) + goto err_free_rt; + } + + /* Push Tunnel header. */ + skb = gre_handle_offloads(skb, !!(tun_info->key.tun_flags & TUNNEL_CSUM)); + if (IS_ERR(skb)) { + skb = NULL; + goto err_free_rt; + } + + flags = tun_info->key.tun_flags & (TUNNEL_CSUM | TUNNEL_KEY); + build_header(skb, tunnel_hlen, flags, htons(ETH_P_TEB), + tunnel_id_to_key(tun_info->key.tun_id), 0); + + df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; + err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr, + key->ipv4_dst, IPPROTO_GRE, + key->ipv4_tos, key->ipv4_ttl, df, false); + iptunnel_xmit_stats(err, &dev->stats, dev->tstats); + return; + +err_free_rt: + ip_rt_put(rt); +err_free_skb: + kfree_skb(skb); + dev->stats.tx_dropped++; +} + static netdev_tx_t ipgre_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tnl_params; + if (tunnel->collect_md) { + gre_fb_xmit(skb, dev); + return NETDEV_TX_OK; + } + if (dev->header_ops) { /* Need space for new headers */ if (skb_cow_head(skb, dev->needed_headroom - @@ -277,7 +421,6 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, goto out; __gre_xmit(skb, dev, tnl_params, skb->protocol); - return NETDEV_TX_OK; free_skb: @@ -292,6 +435,11 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, { struct ip_tunnel *tunnel = netdev_priv(dev); + if (tunnel->collect_md) { + gre_fb_xmit(skb, dev); + return NETDEV_TX_OK; + } + skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM)); if (IS_ERR(skb)) goto out; @@ -300,7 +448,6 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, goto free_skb; __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_TEB)); - return NETDEV_TX_OK; free_skb: @@ -596,8 +743,10 @@ out: return ipgre_tunnel_validate(tb, data); } -static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], - struct ip_tunnel_parm *parms) +static void ipgre_netlink_parms(struct net_device *dev, + struct nlattr *data[], + struct nlattr *tb[], + struct ip_tunnel_parm *parms) { memset(parms, 0, sizeof(*parms)); @@ -635,6 +784,12 @@ static void ipgre_netlink_parms(struct nlattr *data[], struct nlattr *tb[], if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC])) parms->iph.frag_off = htons(IP_DF); + + if (data[IFLA_GRE_COLLECT_METADATA]) { + struct ip_tunnel *t = netdev_priv(dev); + + t->collect_md = true; + } } /* This function returns true when ENCAP attributes are present in the nl msg */ @@ -712,7 +867,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, return err; } - ipgre_netlink_parms(data, tb, &p); + ipgre_netlink_parms(dev, data, tb, &p); return ip_tunnel_newlink(dev, tb, &p); } @@ -730,7 +885,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], return err; } - ipgre_netlink_parms(data, tb, &p); + ipgre_netlink_parms(dev, data, tb, &p); return ip_tunnel_changelink(dev, tb, &p); } @@ -765,6 +920,8 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_GRE_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_GRE_COLLECT_METADATA */ + nla_total_size(0) + 0; } @@ -796,6 +953,11 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) t->encap.flags)) goto nla_put_failure; + if (t->collect_md) { + if (nla_put_flag(skb, IFLA_GRE_COLLECT_METADATA)) + goto nla_put_failure; + } + return 0; nla_put_failure: @@ -817,6 +979,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { @@ -851,7 +1014,7 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { static int __net_init ipgre_tap_init_net(struct net *net) { - return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, NULL); + return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } static void __net_exit ipgre_tap_exit_net(struct net *net) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 626d9e56a6bd..cbb51f3fac06 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -230,10 +230,13 @@ skip_key_lookup: if (cand) return cand; + t = rcu_dereference(itn->collect_md_tun); + if (t) + return t; + if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) return netdev_priv(itn->fb_tunnel_dev); - return NULL; } EXPORT_SYMBOL_GPL(ip_tunnel_lookup); @@ -261,11 +264,15 @@ static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t) { struct hlist_head *head = ip_bucket(itn, &t->parms); + if (t->collect_md) + rcu_assign_pointer(itn->collect_md_tun, t); hlist_add_head_rcu(&t->hash_node, head); } -static void ip_tunnel_del(struct ip_tunnel *t) +static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t) { + if (t->collect_md) + rcu_assign_pointer(itn->collect_md_tun, NULL); hlist_del_init_rcu(&t->hash_node); } @@ -419,7 +426,8 @@ static struct ip_tunnel *ip_tunnel_create(struct net *net, } int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, - const struct tnl_ptk_info *tpi, bool log_ecn_error) + const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, + bool log_ecn_error) { struct pcpu_sw_netstats *tstats; const struct iphdr *iph = ip_hdr(skb); @@ -478,6 +486,9 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, skb->dev = tunnel->dev; } + if (tun_dst) + skb_dst_set(skb, (struct dst_entry *)tun_dst); + gro_cells_receive(&tunnel->gro_cells, skb); return 0; @@ -806,7 +817,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, struct ip_tunnel_parm *p, bool set_mtu) { - ip_tunnel_del(t); + ip_tunnel_del(itn, t); t->parms.iph.saddr = p->iph.saddr; t->parms.iph.daddr = p->iph.daddr; t->parms.i_key = p->i_key; @@ -967,7 +978,7 @@ void ip_tunnel_dellink(struct net_device *dev, struct list_head *head) itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id); if (itn->fb_tunnel_dev != dev) { - ip_tunnel_del(netdev_priv(dev)); + ip_tunnel_del(itn, netdev_priv(dev)); unregister_netdevice_queue(dev, head); } } @@ -1072,8 +1083,13 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], nt = netdev_priv(dev); itn = net_generic(net, nt->ip_tnl_net_id); - if (ip_tunnel_find(itn, p, dev->type)) - return -EEXIST; + if (nt->collect_md) { + if (rtnl_dereference(itn->collect_md_tun)) + return -EEXIST; + } else { + if (ip_tunnel_find(itn, p, dev->type)) + return -EEXIST; + } nt->net = net; nt->parms = *p; @@ -1089,7 +1105,6 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], dev->mtu = mtu; ip_tunnel_add(itn, nt); - out: return err; } @@ -1163,6 +1178,10 @@ int ip_tunnel_init(struct net_device *dev) iph->version = 4; iph->ihl = 5; + if (tunnel->collect_md) { + dev->features |= NETIF_F_NETNS_LOCAL; + netif_keep_dst(dev); + } return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_init); @@ -1176,7 +1195,7 @@ void ip_tunnel_uninit(struct net_device *dev) itn = net_generic(net, tunnel->ip_tnl_net_id); /* fb_tunnel_dev will be unregisted in net-exit call. */ if (itn->fb_tunnel_dev != dev) - ip_tunnel_del(netdev_priv(dev)); + ip_tunnel_del(itn, netdev_priv(dev)); ip_tunnel_dst_reset_all(tunnel); } diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 254238daf58b..f34c31defafe 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -198,7 +198,7 @@ static int ipip_rcv(struct sk_buff *skb) goto drop; if (iptunnel_pull_header(skb, 0, tpi.proto)) goto drop; - return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); + return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); } return -1; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index ac35a28599be..94428fd85b2f 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -742,7 +742,7 @@ static int ipip_rcv(struct sk_buff *skb) goto drop; if (iptunnel_pull_header(skb, 0, tpi.proto)) goto drop; - return ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error); + return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error); } return 1; -- cgit v1.2.3 From b2acd1dc3949cd60c571844d495594f05f0351f4 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 7 Aug 2015 23:51:47 -0700 Subject: openvswitch: Use regular GRE net_device instead of vport Using GRE tunnel meta data collection feature, we can implement OVS GRE vport. This patch removes all of the OVS specific GRE code and make OVS use a ip_gre net_device. Minimal GRE vport is kept to handle compatibility with current userspace application. Signed-off-by: Pravin B Shelar Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/gre.h | 12 +-- net/ipv4/gre_demux.c | 34 ------- net/ipv4/ip_gre.c | 36 +++++++ net/openvswitch/Kconfig | 2 +- net/openvswitch/vport-gre.c | 237 ++++---------------------------------------- 5 files changed, 61 insertions(+), 260 deletions(-) (limited to 'net') diff --git a/include/net/gre.h b/include/net/gre.h index b53182018743..e3e08459bf67 100644 --- a/include/net/gre.h +++ b/include/net/gre.h @@ -33,16 +33,8 @@ struct gre_cisco_protocol { int gre_cisco_register(struct gre_cisco_protocol *proto); int gre_cisco_unregister(struct gre_cisco_protocol *proto); -void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, - int hdr_len); - -static inline struct sk_buff *gre_handle_offloads(struct sk_buff *skb, - bool csum) -{ - return iptunnel_handle_offloads(skb, csum, - csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); -} - +struct net_device *gretap_fb_dev_create(struct net *net, const char *name, + u8 name_assign_type); static inline int ip_gre_calc_hlen(__be16 o_flags) { diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 4a7b5b2a1ce3..77562e0ac66b 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -61,40 +61,6 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version) } EXPORT_SYMBOL_GPL(gre_del_protocol); -void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi, - int hdr_len) -{ - struct gre_base_hdr *greh; - - skb_push(skb, hdr_len); - - skb_reset_transport_header(skb); - greh = (struct gre_base_hdr *)skb->data; - greh->flags = tnl_flags_to_gre_flags(tpi->flags); - greh->protocol = tpi->proto; - - if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) { - __be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4); - - if (tpi->flags&TUNNEL_SEQ) { - *ptr = tpi->seq; - ptr--; - } - if (tpi->flags&TUNNEL_KEY) { - *ptr = tpi->key; - ptr--; - } - if (tpi->flags&TUNNEL_CSUM && - !(skb_shinfo(skb)->gso_type & - (SKB_GSO_GRE|SKB_GSO_GRE_CSUM))) { - *ptr = 0; - *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0, - skb->len, 0)); - } - } -} -EXPORT_SYMBOL_GPL(gre_build_header); - static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, bool *csum_err) { diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 554a760c2cd0..49d140200d03 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -318,6 +318,13 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol); } +static struct sk_buff *gre_handle_offloads(struct sk_buff *skb, + bool csum) +{ + return iptunnel_handle_offloads(skb, csum, + csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); +} + static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel_info *tun_info; @@ -1012,6 +1019,35 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { .get_link_net = ip_tunnel_get_link_net, }; +struct net_device *gretap_fb_dev_create(struct net *net, const char *name, + u8 name_assign_type) +{ + struct nlattr *tb[IFLA_MAX + 1]; + struct net_device *dev; + struct ip_tunnel *t; + int err; + + memset(&tb, 0, sizeof(tb)); + + dev = rtnl_create_link(net, name, name_assign_type, + &ipgre_tap_ops, tb); + if (IS_ERR(dev)) + return dev; + + /* Configure flow based GRE device. */ + t = netdev_priv(dev); + t->collect_md = true; + + err = ipgre_newlink(net, dev, tb, NULL); + if (err < 0) + goto out; + return dev; +out: + free_netdev(dev); + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(gretap_fb_dev_create); + static int __net_init ipgre_tap_init_net(struct net *net) { return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 15840401a2ce..422dc0567de9 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -34,7 +34,7 @@ config OPENVSWITCH config OPENVSWITCH_GRE tristate "Open vSwitch GRE tunneling support" depends on OPENVSWITCH - depends on NET_IPGRE_DEMUX + depends on NET_IPGRE default OPENVSWITCH ---help--- If you say Y here, then the Open vSwitch will be able create GRE diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index b87656c66aaf..871801d2ac23 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -45,235 +45,43 @@ #include "datapath.h" #include "vport.h" +#include "vport-netdev.h" static struct vport_ops ovs_gre_vport_ops; -/* Returns the least-significant 32 bits of a __be64. */ -static __be32 be64_get_low32(__be64 x) +static struct vport *gre_tnl_create(const struct vport_parms *parms) { -#ifdef __BIG_ENDIAN - return (__force __be32)x; -#else - return (__force __be32)((__force u64)x >> 32); -#endif -} - -static __be16 filter_tnl_flags(__be16 flags) -{ - return flags & (TUNNEL_CSUM | TUNNEL_KEY); -} - -static struct sk_buff *__build_header(struct sk_buff *skb, - int tunnel_hlen) -{ - struct tnl_ptk_info tpi; - const struct ip_tunnel_key *tun_key; - - tun_key = &OVS_CB(skb)->egress_tun_info->key; - - skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); - if (IS_ERR(skb)) - return skb; - - tpi.flags = filter_tnl_flags(tun_key->tun_flags); - tpi.proto = htons(ETH_P_TEB); - tpi.key = be64_get_low32(tun_key->tun_id); - tpi.seq = 0; - gre_build_header(skb, &tpi, tunnel_hlen); - - return skb; -} - -static __be64 key_to_tunnel_id(__be32 key, __be32 seq) -{ -#ifdef __BIG_ENDIAN - return (__force __be64)((__force u64)seq << 32 | (__force u32)key); -#else - return (__force __be64)((__force u64)key << 32 | (__force u32)seq); -#endif -} - -/* Called with rcu_read_lock and BH disabled. */ -static int gre_rcv(struct sk_buff *skb, - const struct tnl_ptk_info *tpi) -{ - struct ip_tunnel_info tun_info; - struct ovs_net *ovs_net; - struct vport *vport; - __be64 key; - - ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); - vport = rcu_dereference(ovs_net->vport_net.gre_vport); - if (unlikely(!vport)) - return PACKET_REJECT; - - key = key_to_tunnel_id(tpi->key, tpi->seq); - ip_tunnel_info_init(&tun_info, ip_hdr(skb), 0, 0, key, - filter_tnl_flags(tpi->flags), NULL, 0); - - ovs_vport_receive(vport, skb, &tun_info); - return PACKET_RCVD; -} - -/* Called with rcu_read_lock and BH disabled. */ -static int gre_err(struct sk_buff *skb, u32 info, - const struct tnl_ptk_info *tpi) -{ - struct ovs_net *ovs_net; + struct net *net = ovs_dp_get_net(parms->dp); + struct net_device *dev; struct vport *vport; - ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); - vport = rcu_dereference(ovs_net->vport_net.gre_vport); - - if (unlikely(!vport)) - return PACKET_REJECT; - else - return PACKET_RCVD; -} - -static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) -{ - struct net *net = ovs_dp_get_net(vport->dp); - const struct ip_tunnel_key *tun_key; - struct flowi4 fl; - struct rtable *rt; - int min_headroom; - int tunnel_hlen; - __be16 df; - int err; - - if (unlikely(!OVS_CB(skb)->egress_tun_info)) { - err = -EINVAL; - goto err_free_skb; - } - - tun_key = &OVS_CB(skb)->egress_tun_info->key; - rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_GRE); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto err_free_skb; - } - - tunnel_hlen = ip_gre_calc_hlen(tun_key->tun_flags); - - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + tunnel_hlen + sizeof(struct iphdr) - + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { - int head_delta = SKB_DATA_ALIGN(min_headroom - - skb_headroom(skb) + - 16); - err = pskb_expand_head(skb, max_t(int, head_delta, 0), - 0, GFP_ATOMIC); - if (unlikely(err)) - goto err_free_rt; - } - - skb = vlan_hwaccel_push_inside(skb); - if (unlikely(!skb)) { - err = -ENOMEM; - goto err_free_rt; - } - - /* Push Tunnel header. */ - skb = __build_header(skb, tunnel_hlen); - if (IS_ERR(skb)) { - err = PTR_ERR(skb); - skb = NULL; - goto err_free_rt; + vport = ovs_vport_alloc(0, &ovs_gre_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + rtnl_lock(); + dev = gretap_fb_dev_create(net, parms->name, NET_NAME_USER); + if (IS_ERR(dev)) { + rtnl_unlock(); + ovs_vport_free(vport); + return ERR_CAST(dev); } - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? - htons(IP_DF) : 0; - - skb->ignore_df = 1; - - return iptunnel_xmit(skb->sk, rt, skb, fl.saddr, - tun_key->ipv4_dst, IPPROTO_GRE, - tun_key->ipv4_tos, tun_key->ipv4_ttl, df, false); -err_free_rt: - ip_rt_put(rt); -err_free_skb: - kfree_skb(skb); - return err; -} - -static struct gre_cisco_protocol gre_protocol = { - .handler = gre_rcv, - .err_handler = gre_err, - .priority = 1, -}; - -static int gre_ports; -static int gre_init(void) -{ - int err; - - gre_ports++; - if (gre_ports > 1) - return 0; - - err = gre_cisco_register(&gre_protocol); - if (err) - pr_warn("cannot register gre protocol handler\n"); - - return err; -} - -static void gre_exit(void) -{ - gre_ports--; - if (gre_ports > 0) - return; - - gre_cisco_unregister(&gre_protocol); -} + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); -static const char *gre_get_name(const struct vport *vport) -{ - return vport_priv(vport); + return vport; } static struct vport *gre_create(const struct vport_parms *parms) { - struct net *net = ovs_dp_get_net(parms->dp); - struct ovs_net *ovs_net; struct vport *vport; - int err; - - err = gre_init(); - if (err) - return ERR_PTR(err); - - ovs_net = net_generic(net, ovs_net_id); - if (ovsl_dereference(ovs_net->vport_net.gre_vport)) { - vport = ERR_PTR(-EEXIST); - goto error; - } - vport = ovs_vport_alloc(IFNAMSIZ, &ovs_gre_vport_ops, parms); + vport = gre_tnl_create(parms); if (IS_ERR(vport)) - goto error; - - strncpy(vport_priv(vport), parms->name, IFNAMSIZ); - rcu_assign_pointer(ovs_net->vport_net.gre_vport, vport); - return vport; - -error: - gre_exit(); - return vport; -} - -static void gre_tnl_destroy(struct vport *vport) -{ - struct net *net = ovs_dp_get_net(vport->dp); - struct ovs_net *ovs_net; - - ovs_net = net_generic(net, ovs_net_id); + return vport; - RCU_INIT_POINTER(ovs_net->vport_net.gre_vport, NULL); - ovs_vport_deferred_free(vport); - gre_exit(); + return ovs_netdev_link(vport, parms->name); } static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, @@ -288,10 +96,9 @@ static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, static struct vport_ops ovs_gre_vport_ops = { .type = OVS_VPORT_TYPE_GRE, .create = gre_create, - .destroy = gre_tnl_destroy, - .get_name = gre_get_name, - .send = gre_tnl_send, + .send = ovs_netdev_send, .get_egress_tun_info = gre_get_egress_tun_info, + .destroy = ovs_netdev_tunnel_destroy, .owner = THIS_MODULE, }; -- cgit v1.2.3 From 9f57c67c379d88a10e8ad676426fee5ae7341b14 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Fri, 7 Aug 2015 23:51:52 -0700 Subject: gre: Remove support for sharing GRE protocol hook. Support for sharing GREPROTO_CISCO port was added so that OVS gre port and kernel GRE devices can co-exist. After flow-based tunneling patches OVS GRE protocol processing is completely moved to ip_gre module. so there is no need for GRE protocol hook. Following patch consolidates GRE protocol related functions into ip_gre module. Signed-off-by: Pravin B Shelar Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/gre.h | 80 ++----------------- net/ipv4/gre_demux.c | 201 +---------------------------------------------- net/ipv4/ip_gre.c | 215 +++++++++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 206 insertions(+), 290 deletions(-) (limited to 'net') diff --git a/include/net/gre.h b/include/net/gre.h index e3e08459bf67..97eafdc47eea 100644 --- a/include/net/gre.h +++ b/include/net/gre.h @@ -4,6 +4,12 @@ #include #include +struct gre_base_hdr { + __be16 flags; + __be16 protocol; +}; +#define GRE_HEADER_SECTION 4 + #define GREPROTO_CISCO 0 #define GREPROTO_PPTP 1 #define GREPROTO_MAX 2 @@ -14,83 +20,9 @@ struct gre_protocol { void (*err_handler)(struct sk_buff *skb, u32 info); }; -struct gre_base_hdr { - __be16 flags; - __be16 protocol; -}; -#define GRE_HEADER_SECTION 4 - int gre_add_protocol(const struct gre_protocol *proto, u8 version); int gre_del_protocol(const struct gre_protocol *proto, u8 version); -struct gre_cisco_protocol { - int (*handler)(struct sk_buff *skb, const struct tnl_ptk_info *tpi); - int (*err_handler)(struct sk_buff *skb, u32 info, - const struct tnl_ptk_info *tpi); - u8 priority; -}; - -int gre_cisco_register(struct gre_cisco_protocol *proto); -int gre_cisco_unregister(struct gre_cisco_protocol *proto); - struct net_device *gretap_fb_dev_create(struct net *net, const char *name, u8 name_assign_type); - -static inline int ip_gre_calc_hlen(__be16 o_flags) -{ - int addend = 4; - - if (o_flags&TUNNEL_CSUM) - addend += 4; - if (o_flags&TUNNEL_KEY) - addend += 4; - if (o_flags&TUNNEL_SEQ) - addend += 4; - return addend; -} - -static inline __be16 gre_flags_to_tnl_flags(__be16 flags) -{ - __be16 tflags = 0; - - if (flags & GRE_CSUM) - tflags |= TUNNEL_CSUM; - if (flags & GRE_ROUTING) - tflags |= TUNNEL_ROUTING; - if (flags & GRE_KEY) - tflags |= TUNNEL_KEY; - if (flags & GRE_SEQ) - tflags |= TUNNEL_SEQ; - if (flags & GRE_STRICT) - tflags |= TUNNEL_STRICT; - if (flags & GRE_REC) - tflags |= TUNNEL_REC; - if (flags & GRE_VERSION) - tflags |= TUNNEL_VERSION; - - return tflags; -} - -static inline __be16 tnl_flags_to_gre_flags(__be16 tflags) -{ - __be16 flags = 0; - - if (tflags & TUNNEL_CSUM) - flags |= GRE_CSUM; - if (tflags & TUNNEL_ROUTING) - flags |= GRE_ROUTING; - if (tflags & TUNNEL_KEY) - flags |= GRE_KEY; - if (tflags & TUNNEL_SEQ) - flags |= GRE_SEQ; - if (tflags & TUNNEL_STRICT) - flags |= GRE_STRICT; - if (tflags & TUNNEL_REC) - flags |= GRE_REC; - if (tflags & TUNNEL_VERSION) - flags |= GRE_VERSION; - - return flags; -} - #endif diff --git a/net/ipv4/gre_demux.c b/net/ipv4/gre_demux.c index 77562e0ac66b..d9c552a721fc 100644 --- a/net/ipv4/gre_demux.c +++ b/net/ipv4/gre_demux.c @@ -31,7 +31,6 @@ #include static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly; -static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX]; int gre_add_protocol(const struct gre_protocol *proto, u8 version) { @@ -61,163 +60,6 @@ int gre_del_protocol(const struct gre_protocol *proto, u8 version) } EXPORT_SYMBOL_GPL(gre_del_protocol); -static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, - bool *csum_err) -{ - const struct gre_base_hdr *greh; - __be32 *options; - int hdr_len; - - if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) - return -EINVAL; - - greh = (struct gre_base_hdr *)skb_transport_header(skb); - if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) - return -EINVAL; - - tpi->flags = gre_flags_to_tnl_flags(greh->flags); - hdr_len = ip_gre_calc_hlen(tpi->flags); - - if (!pskb_may_pull(skb, hdr_len)) - return -EINVAL; - - greh = (struct gre_base_hdr *)skb_transport_header(skb); - tpi->proto = greh->protocol; - - options = (__be32 *)(greh + 1); - if (greh->flags & GRE_CSUM) { - if (skb_checksum_simple_validate(skb)) { - *csum_err = true; - return -EINVAL; - } - - skb_checksum_try_convert(skb, IPPROTO_GRE, 0, - null_compute_pseudo); - - options++; - } - - if (greh->flags & GRE_KEY) { - tpi->key = *options; - options++; - } else - tpi->key = 0; - - if (unlikely(greh->flags & GRE_SEQ)) { - tpi->seq = *options; - options++; - } else - tpi->seq = 0; - - /* WCCP version 1 and 2 protocol decoding. - * - Change protocol to IP - * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header - */ - if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { - tpi->proto = htons(ETH_P_IP); - if ((*(u8 *)options & 0xF0) != 0x40) { - hdr_len += 4; - if (!pskb_may_pull(skb, hdr_len)) - return -EINVAL; - } - } - - return iptunnel_pull_header(skb, hdr_len, tpi->proto); -} - -static int gre_cisco_rcv(struct sk_buff *skb) -{ - struct tnl_ptk_info tpi; - int i; - bool csum_err = false; - -#ifdef CONFIG_NET_IPGRE_BROADCAST - if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { - /* Looped back packet, drop it! */ - if (rt_is_output_route(skb_rtable(skb))) - goto drop; - } -#endif - - if (parse_gre_header(skb, &tpi, &csum_err) < 0) - goto drop; - - rcu_read_lock(); - for (i = 0; i < GRE_IP_PROTO_MAX; i++) { - struct gre_cisco_protocol *proto; - int ret; - - proto = rcu_dereference(gre_cisco_proto_list[i]); - if (!proto) - continue; - ret = proto->handler(skb, &tpi); - if (ret == PACKET_RCVD) { - rcu_read_unlock(); - return 0; - } - } - rcu_read_unlock(); - - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); -drop: - kfree_skb(skb); - return 0; -} - -static void gre_cisco_err(struct sk_buff *skb, u32 info) -{ - /* All the routers (except for Linux) return only - * 8 bytes of packet payload. It means, that precise relaying of - * ICMP in the real Internet is absolutely infeasible. - * - * Moreover, Cisco "wise men" put GRE key to the third word - * in GRE header. It makes impossible maintaining even soft - * state for keyed - * GRE tunnels with enabled checksum. Tell them "thank you". - * - * Well, I wonder, rfc1812 was written by Cisco employee, - * what the hell these idiots break standards established - * by themselves??? - */ - - const int type = icmp_hdr(skb)->type; - const int code = icmp_hdr(skb)->code; - struct tnl_ptk_info tpi; - bool csum_err = false; - int i; - - if (parse_gre_header(skb, &tpi, &csum_err)) { - if (!csum_err) /* ignore csum errors. */ - return; - } - - if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { - ipv4_update_pmtu(skb, dev_net(skb->dev), info, - skb->dev->ifindex, 0, IPPROTO_GRE, 0); - return; - } - if (type == ICMP_REDIRECT) { - ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0, - IPPROTO_GRE, 0); - return; - } - - rcu_read_lock(); - for (i = 0; i < GRE_IP_PROTO_MAX; i++) { - struct gre_cisco_protocol *proto; - - proto = rcu_dereference(gre_cisco_proto_list[i]); - if (!proto) - continue; - - if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD) - goto out; - - } -out: - rcu_read_unlock(); -} - static int gre_rcv(struct sk_buff *skb) { const struct gre_protocol *proto; @@ -268,60 +110,19 @@ static const struct net_protocol net_gre_protocol = { .netns_ok = 1, }; -static const struct gre_protocol ipgre_protocol = { - .handler = gre_cisco_rcv, - .err_handler = gre_cisco_err, -}; - -int gre_cisco_register(struct gre_cisco_protocol *newp) -{ - struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **) - &gre_cisco_proto_list[newp->priority]; - - return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY; -} -EXPORT_SYMBOL_GPL(gre_cisco_register); - -int gre_cisco_unregister(struct gre_cisco_protocol *del_proto) -{ - struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **) - &gre_cisco_proto_list[del_proto->priority]; - int ret; - - ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 0 : -EINVAL; - - if (ret) - return ret; - - synchronize_net(); - return 0; -} -EXPORT_SYMBOL_GPL(gre_cisco_unregister); - static int __init gre_init(void) { pr_info("GRE over IPv4 demultiplexor driver\n"); if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { pr_err("can't add protocol\n"); - goto err; + return -EAGAIN; } - - if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) { - pr_info("%s: can't add ipgre handler\n", __func__); - goto err_gre; - } - return 0; -err_gre: - inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); -err: - return -EAGAIN; } static void __exit gre_exit(void) { - gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 49d140200d03..fb44d693796e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -123,8 +123,127 @@ static int ipgre_tunnel_init(struct net_device *dev); static int ipgre_net_id __read_mostly; static int gre_tap_net_id __read_mostly; -static int ipgre_err(struct sk_buff *skb, u32 info, - const struct tnl_ptk_info *tpi) +static int ip_gre_calc_hlen(__be16 o_flags) +{ + int addend = 4; + + if (o_flags & TUNNEL_CSUM) + addend += 4; + if (o_flags & TUNNEL_KEY) + addend += 4; + if (o_flags & TUNNEL_SEQ) + addend += 4; + return addend; +} + +static __be16 gre_flags_to_tnl_flags(__be16 flags) +{ + __be16 tflags = 0; + + if (flags & GRE_CSUM) + tflags |= TUNNEL_CSUM; + if (flags & GRE_ROUTING) + tflags |= TUNNEL_ROUTING; + if (flags & GRE_KEY) + tflags |= TUNNEL_KEY; + if (flags & GRE_SEQ) + tflags |= TUNNEL_SEQ; + if (flags & GRE_STRICT) + tflags |= TUNNEL_STRICT; + if (flags & GRE_REC) + tflags |= TUNNEL_REC; + if (flags & GRE_VERSION) + tflags |= TUNNEL_VERSION; + + return tflags; +} + +static __be16 tnl_flags_to_gre_flags(__be16 tflags) +{ + __be16 flags = 0; + + if (tflags & TUNNEL_CSUM) + flags |= GRE_CSUM; + if (tflags & TUNNEL_ROUTING) + flags |= GRE_ROUTING; + if (tflags & TUNNEL_KEY) + flags |= GRE_KEY; + if (tflags & TUNNEL_SEQ) + flags |= GRE_SEQ; + if (tflags & TUNNEL_STRICT) + flags |= GRE_STRICT; + if (tflags & TUNNEL_REC) + flags |= GRE_REC; + if (tflags & TUNNEL_VERSION) + flags |= GRE_VERSION; + + return flags; +} + +static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi, + bool *csum_err) +{ + const struct gre_base_hdr *greh; + __be32 *options; + int hdr_len; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr)))) + return -EINVAL; + + greh = (struct gre_base_hdr *)skb_transport_header(skb); + if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING))) + return -EINVAL; + + tpi->flags = gre_flags_to_tnl_flags(greh->flags); + hdr_len = ip_gre_calc_hlen(tpi->flags); + + if (!pskb_may_pull(skb, hdr_len)) + return -EINVAL; + + greh = (struct gre_base_hdr *)skb_transport_header(skb); + tpi->proto = greh->protocol; + + options = (__be32 *)(greh + 1); + if (greh->flags & GRE_CSUM) { + if (skb_checksum_simple_validate(skb)) { + *csum_err = true; + return -EINVAL; + } + + skb_checksum_try_convert(skb, IPPROTO_GRE, 0, + null_compute_pseudo); + options++; + } + + if (greh->flags & GRE_KEY) { + tpi->key = *options; + options++; + } else { + tpi->key = 0; + } + if (unlikely(greh->flags & GRE_SEQ)) { + tpi->seq = *options; + options++; + } else { + tpi->seq = 0; + } + /* WCCP version 1 and 2 protocol decoding. + * - Change protocol to IP + * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header + */ + if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) { + tpi->proto = htons(ETH_P_IP); + if ((*(u8 *)options & 0xF0) != 0x40) { + hdr_len += 4; + if (!pskb_may_pull(skb, hdr_len)) + return -EINVAL; + } + } + return iptunnel_pull_header(skb, hdr_len, tpi->proto); +} + +static void ipgre_err(struct sk_buff *skb, u32 info, + const struct tnl_ptk_info *tpi) { /* All the routers (except for Linux) return only @@ -150,14 +269,14 @@ static int ipgre_err(struct sk_buff *skb, u32 info, switch (type) { default: case ICMP_PARAMETERPROB: - return PACKET_RCVD; + return; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: case ICMP_PORT_UNREACH: /* Impossible event. */ - return PACKET_RCVD; + return; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, @@ -166,9 +285,10 @@ static int ipgre_err(struct sk_buff *skb, u32 info, break; } break; + case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) - return PACKET_RCVD; + return; break; case ICMP_REDIRECT: @@ -185,21 +305,60 @@ static int ipgre_err(struct sk_buff *skb, u32 info, iph->daddr, iph->saddr, tpi->key); if (!t) - return PACKET_REJECT; + return; if (t->parms.iph.daddr == 0 || ipv4_is_multicast(t->parms.iph.daddr)) - return PACKET_RCVD; + return; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) - return PACKET_RCVD; + return; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; - return PACKET_RCVD; +} + +static void gre_err(struct sk_buff *skb, u32 info) +{ + /* All the routers (except for Linux) return only + * 8 bytes of packet payload. It means, that precise relaying of + * ICMP in the real Internet is absolutely infeasible. + * + * Moreover, Cisco "wise men" put GRE key to the third word + * in GRE header. It makes impossible maintaining even soft + * state for keyed + * GRE tunnels with enabled checksum. Tell them "thank you". + * + * Well, I wonder, rfc1812 was written by Cisco employee, + * what the hell these idiots break standards established + * by themselves??? + */ + + const int type = icmp_hdr(skb)->type; + const int code = icmp_hdr(skb)->code; + struct tnl_ptk_info tpi; + bool csum_err = false; + + if (parse_gre_header(skb, &tpi, &csum_err)) { + if (!csum_err) /* ignore csum errors. */ + return; + } + + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + ipv4_update_pmtu(skb, dev_net(skb->dev), info, + skb->dev->ifindex, 0, IPPROTO_GRE, 0); + return; + } + if (type == ICMP_REDIRECT) { + ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0, + IPPROTO_GRE, 0); + return; + } + + ipgre_err(skb, info, &tpi); } static __be64 key_to_tunnel_id(__be32 key) @@ -268,6 +427,31 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) return PACKET_REJECT; } +static int gre_rcv(struct sk_buff *skb) +{ + struct tnl_ptk_info tpi; + bool csum_err = false; + +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (ipv4_is_multicast(ip_hdr(skb)->daddr)) { + /* Looped back packet, drop it! */ + if (rt_is_output_route(skb_rtable(skb))) + goto drop; + } +#endif + + if (parse_gre_header(skb, &tpi, &csum_err) < 0) + goto drop; + + if (ipgre_rcv(skb, &tpi) == PACKET_RCVD) + return 0; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); +drop: + kfree_skb(skb); + return 0; +} + static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags, __be16 proto, __be32 key, __be32 seq) { @@ -684,10 +868,9 @@ static int ipgre_tunnel_init(struct net_device *dev) return ip_tunnel_init(dev); } -static struct gre_cisco_protocol ipgre_protocol = { - .handler = ipgre_rcv, - .err_handler = ipgre_err, - .priority = 0, +static const struct gre_protocol ipgre_protocol = { + .handler = gre_rcv, + .err_handler = gre_err, }; static int __net_init ipgre_init_net(struct net *net) @@ -1080,7 +1263,7 @@ static int __init ipgre_init(void) if (err < 0) goto pnet_tap_faied; - err = gre_cisco_register(&ipgre_protocol); + err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); if (err < 0) { pr_info("%s: can't add protocol\n", __func__); goto add_proto_failed; @@ -1099,7 +1282,7 @@ static int __init ipgre_init(void) tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: - gre_cisco_unregister(&ipgre_protocol); + gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); add_proto_failed: unregister_pernet_device(&ipgre_tap_net_ops); pnet_tap_faied: @@ -1111,7 +1294,7 @@ static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); - gre_cisco_unregister(&ipgre_protocol); + gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); unregister_pernet_device(&ipgre_tap_net_ops); unregister_pernet_device(&ipgre_net_ops); } -- cgit v1.2.3 From 6bc6d0a88179b732b9a5e40e05099dc219d1b3cb Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Sat, 8 Aug 2015 17:09:14 +0200 Subject: dsa: Support multiple MDIO busses When using a cluster of switches, some topologies will have an MDIO bus per switch, not one for the whole cluster. Allow this to be represented in the device tree, by adding an optional mii-bus property at the switch level. The old platform_device method of instantiation supports this already, so only the device tree binding needs extending with an additional optional phandle. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/dsa/dsa.txt | 5 +++++ net/dsa/dsa.c | 12 +++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/devicetree/bindings/net/dsa/dsa.txt b/Documentation/devicetree/bindings/net/dsa/dsa.txt index f0b4cd72411d..9cf9a0ec333c 100644 --- a/Documentation/devicetree/bindings/net/dsa/dsa.txt +++ b/Documentation/devicetree/bindings/net/dsa/dsa.txt @@ -58,6 +58,10 @@ Optionnal property: Documentation/devicetree/bindings/net/ethernet.txt for details. +- mii-bus : Should be a phandle to a valid MDIO bus device node. + This mii-bus will be used in preference to the + global dsa,mii-bus defined above, for this switch. + Optional subnodes: - fixed-link : Fixed-link subnode describing a link to a non-MDIO managed entity. See @@ -107,6 +111,7 @@ Example: #address-cells = <1>; #size-cells = <0>; reg = <17 1>; /* MDIO address 17, switch 1 in tree */ + mii-bus = <&mii_bus1>; switch1uplink: port@0 { reg = <0>; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index b445d492c115..78d4ac97aae3 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -574,7 +574,7 @@ static int dsa_of_probe(struct device *dev) { struct device_node *np = dev->of_node; struct device_node *child, *mdio, *ethernet, *port, *link; - struct mii_bus *mdio_bus; + struct mii_bus *mdio_bus, *mdio_bus_switch; struct net_device *ethernet_dev; struct dsa_platform_data *pd; struct dsa_chip_data *cd; @@ -636,6 +636,16 @@ static int dsa_of_probe(struct device *dev) if (!of_property_read_u32(child, "eeprom-length", &eeprom_len)) cd->eeprom_len = eeprom_len; + mdio = of_parse_phandle(child, "mii-bus", 0); + if (mdio) { + mdio_bus_switch = of_mdio_find_bus(mdio); + if (!mdio_bus_switch) { + ret = -EPROBE_DEFER; + goto out_free_chip; + } + cd->host_dev = &mdio_bus_switch->dev; + } + for_each_available_child_of_node(child, port) { port_reg = of_get_property(port, "reg", NULL); if (!port_reg) -- cgit v1.2.3 From 308ac9143ee2208f54d061eca54a89da509b5d92 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 8 Aug 2015 21:40:01 +0200 Subject: netfilter: nf_conntrack: push zone object into functions This patch replaces the zone id which is pushed down into functions with the actual zone object. It's a bigger one-time change, but needed for later on extending zones with a direction parameter, and thus decoupling this additional information from all call-sites. No functional changes in this patch. The default zone becomes a global const object, namely nf_ct_zone_dflt and will be returned directly in various cases, one being, when there's f.e. no zoning support. Signed-off-by: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 10 ++- include/net/netfilter/nf_conntrack_core.h | 3 +- include/net/netfilter/nf_conntrack_expect.h | 11 +++- include/net/netfilter/nf_conntrack_zones.h | 33 +++++++--- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 3 +- net/ipv4/netfilter/nf_defrag_ipv4.c | 11 ++-- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 2 +- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 3 +- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 12 ++-- net/netfilter/ipvs/ip_vs_nfct.c | 2 +- net/netfilter/nf_conntrack_core.c | 75 ++++++++++++++--------- net/netfilter/nf_conntrack_expect.c | 21 ++++--- net/netfilter/nf_conntrack_netlink.c | 84 ++++++++++++++------------ net/netfilter/nf_conntrack_pptp.c | 3 +- net/netfilter/nf_conntrack_standalone.c | 17 ++++-- net/netfilter/nf_nat_core.c | 19 +++--- net/netfilter/nf_synproxy_core.c | 4 +- net/netfilter/xt_CT.c | 6 +- net/netfilter/xt_connlimit.c | 9 +-- net/sched/act_connmark.c | 5 +- 21 files changed, 203 insertions(+), 132 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 37cd3911d5c5..f5e23c6dee8b 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -250,8 +250,12 @@ void nf_ct_untracked_status_or(unsigned long bits); void nf_ct_iterate_cleanup(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data, u32 portid, int report); + +struct nf_conntrack_zone; + void nf_conntrack_free(struct nf_conn *ct); -struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone, +struct nf_conn *nf_conntrack_alloc(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp); @@ -291,7 +295,9 @@ extern unsigned int nf_conntrack_max; extern unsigned int nf_conntrack_hash_rnd; void init_nf_conntrack_hash_rnd(void); -struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags); +struct nf_conn *nf_ct_tmpl_alloc(struct net *net, + const struct nf_conntrack_zone *zone, + gfp_t flags); #define NF_CT_STAT_INC(net, count) __this_cpu_inc((net)->ct.stat->count) #define NF_CT_STAT_INC_ATOMIC(net, count) this_cpu_inc((net)->ct.stat->count) diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index f2f0fa3bb150..c03f9c42b3cd 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -52,7 +52,8 @@ bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, /* Find a connection corresponding to a tuple. */ struct nf_conntrack_tuple_hash * -nf_conntrack_find_get(struct net *net, u16 zone, +nf_conntrack_find_get(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple); int __nf_conntrack_confirm(struct sk_buff *skb); diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index 3f3aecbc8632..dce56f09ac9a 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -4,7 +4,9 @@ #ifndef _NF_CONNTRACK_EXPECT_H #define _NF_CONNTRACK_EXPECT_H + #include +#include extern unsigned int nf_ct_expect_hsize; extern unsigned int nf_ct_expect_max; @@ -76,15 +78,18 @@ int nf_conntrack_expect_init(void); void nf_conntrack_expect_fini(void); struct nf_conntrack_expect * -__nf_ct_expect_find(struct net *net, u16 zone, +__nf_ct_expect_find(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple); struct nf_conntrack_expect * -nf_ct_expect_find_get(struct net *net, u16 zone, +nf_ct_expect_find_get(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple); struct nf_conntrack_expect * -nf_ct_find_expectation(struct net *net, u16 zone, +nf_ct_find_expectation(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple); void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h index 034efe8d45a5..0788bb0f267d 100644 --- a/include/net/netfilter/nf_conntrack_zones.h +++ b/include/net/netfilter/nf_conntrack_zones.h @@ -1,25 +1,38 @@ #ifndef _NF_CONNTRACK_ZONES_H #define _NF_CONNTRACK_ZONES_H -#define NF_CT_DEFAULT_ZONE 0 - -#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) -#include +#define NF_CT_DEFAULT_ZONE_ID 0 struct nf_conntrack_zone { u16 id; }; -static inline u16 nf_ct_zone(const struct nf_conn *ct) +extern const struct nf_conntrack_zone nf_ct_zone_dflt; + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +#include + +static inline const struct nf_conntrack_zone * +nf_ct_zone(const struct nf_conn *ct) { + const struct nf_conntrack_zone *nf_ct_zone = NULL; + #ifdef CONFIG_NF_CONNTRACK_ZONES - struct nf_conntrack_zone *nf_ct_zone; nf_ct_zone = nf_ct_ext_find(ct, NF_CT_EXT_ZONE); - if (nf_ct_zone) - return nf_ct_zone->id; #endif - return NF_CT_DEFAULT_ZONE; + return nf_ct_zone ? nf_ct_zone : &nf_ct_zone_dflt; } -#endif /* CONFIG_NF_CONNTRACK || CONFIG_NF_CONNTRACK_MODULE */ +static inline const struct nf_conntrack_zone * +nf_ct_zone_tmpl(const struct nf_conn *tmpl) +{ + return tmpl ? nf_ct_zone(tmpl) : &nf_ct_zone_dflt; +} + +static inline bool nf_ct_zone_equal(const struct nf_conn *a, + const struct nf_conntrack_zone *b) +{ + return nf_ct_zone(a)->id == b->id; +} +#endif /* IS_ENABLED(CONFIG_NF_CONNTRACK) */ #endif /* _NF_CONNTRACK_ZONES_H */ diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 30ad9554b5e9..8a2caaf3940b 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -280,7 +280,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -EINVAL; } - h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple); + h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); if (h) { struct sockaddr_in sin; struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 80d5554b9a88..8a2f41c2fe6f 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -134,9 +134,10 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, struct nf_conntrack_tuple innertuple, origtuple; const struct nf_conntrack_l4proto *innerproto; const struct nf_conntrack_tuple_hash *h; - u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; + const struct nf_conntrack_zone *zone; NF_CT_ASSERT(skb->nfct == NULL); + zone = nf_ct_zone_tmpl(tmpl); /* Are they talking about one of our connections? */ if (!nf_ct_get_tuplepr(skb, diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index b69e82bda215..20fe8e67c09b 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -43,19 +43,18 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, struct sk_buff *skb) { - u16 zone = NF_CT_DEFAULT_ZONE; - + u16 zone_id = NF_CT_DEFAULT_ZONE_ID; #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (skb->nfct) - zone = nf_ct_zone((struct nf_conn *)skb->nfct); + zone_id = nf_ct_zone((struct nf_conn *)skb->nfct)->id; #endif if (nf_bridge_in_prerouting(skb)) - return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone; + return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id; if (hooknum == NF_INET_PRE_ROUTING) - return IP_DEFRAG_CONNTRACK_IN + zone; + return IP_DEFRAG_CONNTRACK_IN + zone_id; else - return IP_DEFRAG_CONNTRACK_OUT + zone; + return IP_DEFRAG_CONNTRACK_OUT + zone_id; } static unsigned int ipv4_conntrack_defrag(const struct nf_hook_ops *ops, diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 4ba0c34c627b..7302900c321a 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -251,7 +251,7 @@ ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len) if (*len < 0 || (unsigned int) *len < sizeof(sin6)) return -EINVAL; - h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple); + h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); if (!h) { pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n", &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port), diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 90388d606483..202914151360 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -150,7 +150,6 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, struct nf_conntrack_tuple intuple, origtuple; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_l4proto *inproto; - u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; NF_CT_ASSERT(skb->nfct == NULL); @@ -177,7 +176,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, *ctinfo = IP_CT_RELATED; - h = nf_conntrack_find_get(net, zone, &intuple); + h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl), &intuple); if (!h) { pr_debug("icmpv6_error: no match\n"); return -NF_ACCEPT; diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index 267fb8d5876e..9d3de9b74856 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -33,20 +33,18 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, struct sk_buff *skb) { - u16 zone = NF_CT_DEFAULT_ZONE; - + u16 zone_id = NF_CT_DEFAULT_ZONE_ID; #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (skb->nfct) - zone = nf_ct_zone((struct nf_conn *)skb->nfct); + zone_id = nf_ct_zone((struct nf_conn *)skb->nfct)->id; #endif if (nf_bridge_in_prerouting(skb)) - return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone; + return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id; if (hooknum == NF_INET_PRE_ROUTING) - return IP6_DEFRAG_CONNTRACK_IN + zone; + return IP6_DEFRAG_CONNTRACK_IN + zone_id; else - return IP6_DEFRAG_CONNTRACK_OUT + zone; - + return IP6_DEFRAG_CONNTRACK_OUT + zone_id; } static unsigned int ipv6_defrag(const struct nf_hook_ops *ops, diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 5882bbfd198c..136184572fc9 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -274,7 +274,7 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) " for conn " FMT_CONN "\n", __func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); - h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE, + h = nf_conntrack_find_get(ip_vs_conn_net(cp), &nf_ct_zone_dflt, &tuple); if (h) { ct = nf_ct_tuplehash_to_ctrack(h); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 651039ad1681..0bb26e84f849 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -126,7 +126,8 @@ EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); unsigned int nf_conntrack_hash_rnd __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd); -static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone) +static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) { unsigned int n; @@ -135,7 +136,7 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone) * three bytes manually. */ n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); - return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^ + return jhash2((u32 *)tuple, n, zone->id ^ nf_conntrack_hash_rnd ^ (((__force __u16)tuple->dst.u.all << 16) | tuple->dst.protonum)); } @@ -151,12 +152,14 @@ static u32 hash_bucket(u32 hash, const struct net *net) } static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, - u16 zone, unsigned int size) + const struct nf_conntrack_zone *zone, + unsigned int size) { return __hash_bucket(hash_conntrack_raw(tuple, zone), size); } -static inline u_int32_t hash_conntrack(const struct net *net, u16 zone, +static inline u_int32_t hash_conntrack(const struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { return __hash_conntrack(tuple, zone, net->ct.htable_size); @@ -288,7 +291,9 @@ static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct) } /* Released via destroy_conntrack() */ -struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags) +struct nf_conn *nf_ct_tmpl_alloc(struct net *net, + const struct nf_conntrack_zone *zone, + gfp_t flags) { struct nf_conn *tmpl; @@ -306,7 +311,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, u16 zone, gfp_t flags) nf_ct_zone = nf_ct_ext_add(tmpl, NF_CT_EXT_ZONE, GFP_ATOMIC); if (!nf_ct_zone) goto out_free; - nf_ct_zone->id = zone; + nf_ct_zone->id = zone->id; } #endif atomic_set(&tmpl->ct_general.use, 0); @@ -371,11 +376,12 @@ destroy_conntrack(struct nf_conntrack *nfct) static void nf_ct_delete_from_lists(struct nf_conn *ct) { + const struct nf_conntrack_zone *zone; struct net *net = nf_ct_net(ct); unsigned int hash, reply_hash; - u16 zone = nf_ct_zone(ct); unsigned int sequence; + zone = nf_ct_zone(ct); nf_ct_helper_destroy(ct); local_bh_disable(); @@ -431,8 +437,8 @@ static void death_by_timeout(unsigned long ul_conntrack) static inline bool nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, - const struct nf_conntrack_tuple *tuple, - u16 zone) + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); @@ -440,8 +446,8 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, * so we need to check that the conntrack is confirmed */ return nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone(ct) == zone && - nf_ct_is_confirmed(ct); + nf_ct_zone_equal(ct, zone) && + nf_ct_is_confirmed(ct); } /* @@ -450,7 +456,7 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, * and recheck nf_ct_tuple_equal(tuple, &h->tuple) */ static struct nf_conntrack_tuple_hash * -____nf_conntrack_find(struct net *net, u16 zone, +____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; @@ -486,7 +492,7 @@ begin: /* Find a connection corresponding to a tuple. */ static struct nf_conntrack_tuple_hash * -__nf_conntrack_find_get(struct net *net, u16 zone, +__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; @@ -513,7 +519,7 @@ begin: } struct nf_conntrack_tuple_hash * -nf_conntrack_find_get(struct net *net, u16 zone, +nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { return __nf_conntrack_find_get(net, zone, tuple, @@ -536,11 +542,11 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, int nf_conntrack_hash_check_insert(struct nf_conn *ct) { + const struct nf_conntrack_zone *zone; struct net *net = nf_ct_net(ct); unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; - u16 zone; unsigned int sequence; zone = nf_ct_zone(ct); @@ -558,12 +564,12 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &h->tuple) && - zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone)) goto out; hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple) && - zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone)) goto out; add_timer(&ct->timeout); @@ -588,6 +594,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); int __nf_conntrack_confirm(struct sk_buff *skb) { + const struct nf_conntrack_zone *zone; unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; @@ -596,7 +603,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) struct hlist_nulls_node *n; enum ip_conntrack_info ctinfo; struct net *net; - u16 zone; unsigned int sequence; ct = nf_ct_get(skb, &ctinfo); @@ -649,12 +655,12 @@ __nf_conntrack_confirm(struct sk_buff *skb) hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &h->tuple) && - zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone)) goto out; hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple) && - zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone)) goto out; /* Timer relative to confirmation time, not original @@ -707,11 +713,14 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) { struct net *net = nf_ct_net(ignored_conntrack); + const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; struct nf_conn *ct; - u16 zone = nf_ct_zone(ignored_conntrack); - unsigned int hash = hash_conntrack(net, zone, tuple); + unsigned int hash; + + zone = nf_ct_zone(ignored_conntrack); + hash = hash_conntrack(net, zone, tuple); /* Disable BHs the entire time since we need to disable them at * least once for the stats anyway. @@ -721,7 +730,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, ct = nf_ct_tuplehash_to_ctrack(h); if (ct != ignored_conntrack && nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone(ct) == zone) { + nf_ct_zone_equal(ct, zone)) { NF_CT_STAT_INC(net, found); rcu_read_unlock_bh(); return 1; @@ -810,7 +819,8 @@ void init_nf_conntrack_hash_rnd(void) } static struct nf_conn * -__nf_conntrack_alloc(struct net *net, u16 zone, +__nf_conntrack_alloc(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp, u32 hash) @@ -864,7 +874,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone, nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC); if (!nf_ct_zone) goto out_free; - nf_ct_zone->id = zone; + nf_ct_zone->id = zone->id; } #endif /* Because we use RCU lookups, we set ct_general.use to zero before @@ -881,7 +891,8 @@ out_free: #endif } -struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone, +struct nf_conn *nf_conntrack_alloc(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp) @@ -923,7 +934,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, struct nf_conntrack_tuple repl_tuple; struct nf_conntrack_ecache *ecache; struct nf_conntrack_expect *exp = NULL; - u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; + const struct nf_conntrack_zone *zone; struct nf_conn_timeout *timeout_ext; unsigned int *timeouts; @@ -932,6 +943,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, return NULL; } + zone = nf_ct_zone_tmpl(tmpl); ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, hash); if (IS_ERR(ct)) @@ -1026,10 +1038,10 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, int *set_reply, enum ip_conntrack_info *ctinfo) { + const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; - u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; u32 hash; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), @@ -1040,6 +1052,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, } /* look for tuple match */ + zone = nf_ct_zone_tmpl(tmpl); hash = hash_conntrack_raw(&tuple, zone); h = __nf_conntrack_find_get(net, zone, &tuple, hash); if (!h) { @@ -1290,6 +1303,12 @@ bool __nf_ct_kill_acct(struct nf_conn *ct, } EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); +/* Built-in default zone used e.g. by modules. */ +const struct nf_conntrack_zone nf_ct_zone_dflt = { + .id = NF_CT_DEFAULT_ZONE_ID, +}; +EXPORT_SYMBOL_GPL(nf_ct_zone_dflt); + #ifdef CONFIG_NF_CONNTRACK_ZONES static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = { .len = sizeof(struct nf_conntrack_zone), diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index b45a4223cb05..980db854c3c8 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -88,7 +88,8 @@ static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple } struct nf_conntrack_expect * -__nf_ct_expect_find(struct net *net, u16 zone, +__nf_ct_expect_find(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; @@ -100,7 +101,7 @@ __nf_ct_expect_find(struct net *net, u16 zone, h = nf_ct_expect_dst_hash(tuple); hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) { if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone(i->master) == zone) + nf_ct_zone_equal(i->master, zone)) return i; } return NULL; @@ -109,7 +110,8 @@ EXPORT_SYMBOL_GPL(__nf_ct_expect_find); /* Just find a expectation corresponding to a tuple. */ struct nf_conntrack_expect * -nf_ct_expect_find_get(struct net *net, u16 zone, +nf_ct_expect_find_get(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; @@ -127,7 +129,8 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); /* If an expectation for this connection is found, it gets delete from * global list then returned. */ struct nf_conntrack_expect * -nf_ct_find_expectation(struct net *net, u16 zone, +nf_ct_find_expectation(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i, *exp = NULL; @@ -140,7 +143,7 @@ nf_ct_find_expectation(struct net *net, u16 zone, hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone(i->master) == zone) { + nf_ct_zone_equal(i->master, zone)) { exp = i; break; } @@ -220,16 +223,16 @@ static inline int expect_clash(const struct nf_conntrack_expect *a, } return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) && - nf_ct_zone(a->master) == nf_ct_zone(b->master); + nf_ct_zone_equal(a->master, nf_ct_zone(b->master)); } static inline int expect_matches(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b) { return a->master == b->master && a->class == b->class && - nf_ct_tuple_equal(&a->tuple, &b->tuple) && - nf_ct_tuple_mask_equal(&a->mask, &b->mask) && - nf_ct_zone(a->master) == nf_ct_zone(b->master); + nf_ct_tuple_equal(&a->tuple, &b->tuple) && + nf_ct_tuple_mask_equal(&a->mask, &b->mask) && + nf_ct_zone_equal(a->master, nf_ct_zone(b->master)); } /* Generally a bad idea to call this: could have matched already. */ diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 6b8b0abbfab4..95f7f01e253d 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -458,6 +458,7 @@ static int ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct nf_conn *ct) { + const struct nf_conntrack_zone *zone; struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; struct nlattr *nest_parms; @@ -487,8 +488,9 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, goto nla_put_failure; nla_nest_end(skb, nest_parms); - if (nf_ct_zone(ct) && - nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct)))) + zone = nf_ct_zone(ct); + if (zone->id != NF_CT_DEFAULT_ZONE_ID && + nla_put_be16(skb, CTA_ZONE, htons(zone->id))) goto nla_put_failure; if (ctnetlink_dump_status(skb, ct) < 0 || @@ -609,6 +611,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) static int ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) { + const struct nf_conntrack_zone *zone; struct net *net; struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; @@ -669,8 +672,9 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) goto nla_put_failure; nla_nest_end(skb, nest_parms); - if (nf_ct_zone(ct) && - nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct)))) + zone = nf_ct_zone(ct); + if (zone->id != NF_CT_DEFAULT_ZONE_ID && + nla_put_be16(skb, CTA_ZONE, htons(zone->id))) goto nla_put_failure; if (ctnetlink_dump_id(skb, ct) < 0) @@ -965,17 +969,18 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], } static int -ctnetlink_parse_zone(const struct nlattr *attr, u16 *zone) +ctnetlink_parse_zone(const struct nlattr *attr, + struct nf_conntrack_zone *zone) { - if (attr) + zone->id = NF_CT_DEFAULT_ZONE_ID; + #ifdef CONFIG_NF_CONNTRACK_ZONES - *zone = ntohs(nla_get_be16(attr)); + if (attr) + zone->id = ntohs(nla_get_be16(attr)); #else + if (attr) return -EOPNOTSUPP; #endif - else - *zone = 0; - return 0; } @@ -1058,7 +1063,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nf_conn *ct; struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; - u16 zone; + struct nf_conntrack_zone zone; int err; err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); @@ -1078,7 +1083,7 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - h = nf_conntrack_find_get(net, zone, &tuple); + h = nf_conntrack_find_get(net, &zone, &tuple); if (!h) return -ENOENT; @@ -1112,7 +1117,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, struct sk_buff *skb2 = NULL; struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; - u16 zone; + struct nf_conntrack_zone zone; int err; if (nlh->nlmsg_flags & NLM_F_DUMP) { @@ -1147,7 +1152,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - h = nf_conntrack_find_get(net, zone, &tuple); + h = nf_conntrack_find_get(net, &zone, &tuple); if (!h) return -ENOENT; @@ -1645,7 +1650,8 @@ ctnetlink_change_conntrack(struct nf_conn *ct, } static struct nf_conn * -ctnetlink_create_conntrack(struct net *net, u16 zone, +ctnetlink_create_conntrack(struct net *net, + const struct nf_conntrack_zone *zone, const struct nlattr * const cda[], struct nf_conntrack_tuple *otuple, struct nf_conntrack_tuple *rtuple, @@ -1804,7 +1810,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct nf_conn *ct; u_int8_t u3 = nfmsg->nfgen_family; - u16 zone; + struct nf_conntrack_zone zone; int err; err = ctnetlink_parse_zone(cda[CTA_ZONE], &zone); @@ -1824,9 +1830,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, } if (cda[CTA_TUPLE_ORIG]) - h = nf_conntrack_find_get(net, zone, &otuple); + h = nf_conntrack_find_get(net, &zone, &otuple); else if (cda[CTA_TUPLE_REPLY]) - h = nf_conntrack_find_get(net, zone, &rtuple); + h = nf_conntrack_find_get(net, &zone, &rtuple); if (h == NULL) { err = -ENOENT; @@ -1836,7 +1842,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, if (!cda[CTA_TUPLE_ORIG] || !cda[CTA_TUPLE_REPLY]) return -EINVAL; - ct = ctnetlink_create_conntrack(net, zone, cda, &otuple, + ct = ctnetlink_create_conntrack(net, &zone, cda, &otuple, &rtuple, u3); if (IS_ERR(ct)) return PTR_ERR(ct); @@ -2091,6 +2097,7 @@ ctnetlink_nfqueue_build_size(const struct nf_conn *ct) static int ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct) { + const struct nf_conntrack_zone *zone; struct nlattr *nest_parms; rcu_read_lock(); @@ -2108,10 +2115,10 @@ ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct) goto nla_put_failure; nla_nest_end(skb, nest_parms); - if (nf_ct_zone(ct)) { - if (nla_put_be16(skb, CTA_ZONE, htons(nf_ct_zone(ct)))) - goto nla_put_failure; - } + zone = nf_ct_zone(ct); + if (zone->id != NF_CT_DEFAULT_ZONE_ID && + nla_put_be16(skb, CTA_ZONE, htons(zone->id))) + goto nla_put_failure; if (ctnetlink_dump_id(skb, ct) < 0) goto nla_put_failure; @@ -2612,7 +2619,7 @@ static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb, struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; - u16 zone = 0; + struct nf_conntrack_zone zone; struct netlink_dump_control c = { .dump = ctnetlink_exp_ct_dump_table, .done = ctnetlink_exp_done, @@ -2622,13 +2629,11 @@ static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - if (cda[CTA_EXPECT_ZONE]) { - err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); - if (err < 0) - return err; - } + err = ctnetlink_parse_zone(cda[CTA_EXPECT_ZONE], &zone); + if (err < 0) + return err; - h = nf_conntrack_find_get(net, zone, &tuple); + h = nf_conntrack_find_get(net, &zone, &tuple); if (!h) return -ENOENT; @@ -2652,7 +2657,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, struct sk_buff *skb2; struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; - u16 zone; + struct nf_conntrack_zone zone; int err; if (nlh->nlmsg_flags & NLM_F_DUMP) { @@ -2681,7 +2686,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - exp = nf_ct_expect_find_get(net, zone, &tuple); + exp = nf_ct_expect_find_get(net, &zone, &tuple); if (!exp) return -ENOENT; @@ -2732,8 +2737,8 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, struct nfgenmsg *nfmsg = nlmsg_data(nlh); struct hlist_node *next; u_int8_t u3 = nfmsg->nfgen_family; + struct nf_conntrack_zone zone; unsigned int i; - u16 zone; int err; if (cda[CTA_EXPECT_TUPLE]) { @@ -2747,7 +2752,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, return err; /* bump usage count to 2 */ - exp = nf_ct_expect_find_get(net, zone, &tuple); + exp = nf_ct_expect_find_get(net, &zone, &tuple); if (!exp) return -ENOENT; @@ -2937,7 +2942,8 @@ err_out: } static int -ctnetlink_create_expect(struct net *net, u16 zone, +ctnetlink_create_expect(struct net *net, + const struct nf_conntrack_zone *zone, const struct nlattr * const cda[], u_int8_t u3, u32 portid, int report) { @@ -3011,7 +3017,7 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, struct nf_conntrack_expect *exp; struct nfgenmsg *nfmsg = nlmsg_data(nlh); u_int8_t u3 = nfmsg->nfgen_family; - u16 zone; + struct nf_conntrack_zone zone; int err; if (!cda[CTA_EXPECT_TUPLE] @@ -3028,14 +3034,12 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, return err; spin_lock_bh(&nf_conntrack_expect_lock); - exp = __nf_ct_expect_find(net, zone, &tuple); - + exp = __nf_ct_expect_find(net, &zone, &tuple); if (!exp) { spin_unlock_bh(&nf_conntrack_expect_lock); err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) { - err = ctnetlink_create_expect(net, zone, cda, - u3, + err = ctnetlink_create_expect(net, &zone, cda, u3, NETLINK_CB(skb).portid, nlmsg_report(nlh)); } diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 825c3e3f8305..5588c7ae1ac2 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -143,13 +143,14 @@ static int destroy_sibling_or_exp(struct net *net, struct nf_conn *ct, const struct nf_conntrack_tuple *t) { const struct nf_conntrack_tuple_hash *h; + const struct nf_conntrack_zone *zone; struct nf_conntrack_expect *exp; struct nf_conn *sibling; - u16 zone = nf_ct_zone(ct); pr_debug("trying to timeout ct or exp for tuple "); nf_ct_dump_tuple(t); + zone = nf_ct_zone(ct); h = nf_conntrack_find_get(net, zone, t); if (h) { sibling = nf_ct_tuplehash_to_ctrack(h); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index fc823fa5dcf5..28c8b2b982ec 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -140,6 +140,17 @@ static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) } #endif +#ifdef CONFIG_NF_CONNTRACK_ZONES +static void ct_show_zone(struct seq_file *s, const struct nf_conn *ct) +{ + seq_printf(s, "zone=%u ", nf_ct_zone(ct)->id); +} +#else +static inline void ct_show_zone(struct seq_file *s, const struct nf_conn *ct) +{ +} +#endif + #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP static void ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct) { @@ -228,11 +239,7 @@ static int ct_seq_show(struct seq_file *s, void *v) #endif ct_show_secctx(s, ct); - -#ifdef CONFIG_NF_CONNTRACK_ZONES - seq_printf(s, "zone=%u ", nf_ct_zone(ct)); -#endif - + ct_show_zone(s, ct); ct_show_delta_time(s, ct); seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 4e0b47831d43..65ebaf9fc4f9 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -118,14 +118,15 @@ EXPORT_SYMBOL(nf_xfrm_me_harder); /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int -hash_by_src(const struct net *net, u16 zone, +hash_by_src(const struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { unsigned int hash; /* Original src, to ensure we map it consistently if poss. */ hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ zone ^ nf_conntrack_hash_rnd); + tuple->dst.protonum ^ zone->id ^ nf_conntrack_hash_rnd); return reciprocal_scale(hash, net->ct.nat_htable_size); } @@ -185,7 +186,8 @@ same_src(const struct nf_conn *ct, /* Only called for SRC manip */ static int -find_appropriate_src(struct net *net, u16 zone, +find_appropriate_src(struct net *net, + const struct nf_conntrack_zone *zone, const struct nf_nat_l3proto *l3proto, const struct nf_nat_l4proto *l4proto, const struct nf_conntrack_tuple *tuple, @@ -198,7 +200,7 @@ find_appropriate_src(struct net *net, u16 zone, hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) { ct = nat->ct; - if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) { + if (same_src(ct, tuple) && nf_ct_zone_equal(ct, zone)) { /* Copy source part from reply tuple. */ nf_ct_invert_tuplepr(result, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); @@ -218,7 +220,8 @@ find_appropriate_src(struct net *net, u16 zone, * the ip with the lowest src-ip/dst-ip/proto usage. */ static void -find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple, +find_best_ips_proto(const struct nf_conntrack_zone *zone, + struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, const struct nf_conn *ct, enum nf_nat_manip_type maniptype) @@ -258,7 +261,7 @@ find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple, */ j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32), range->flags & NF_NAT_RANGE_PERSISTENT ? - 0 : (__force u32)tuple->dst.u3.all[max] ^ zone); + 0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id); full_range = false; for (i = 0; i <= max; i++) { @@ -297,10 +300,12 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { + const struct nf_conntrack_zone *zone; const struct nf_nat_l3proto *l3proto; const struct nf_nat_l4proto *l4proto; struct net *net = nf_ct_net(ct); - u16 zone = nf_ct_zone(ct); + + zone = nf_ct_zone(ct); rcu_read_lock(); l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num); diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 71f1e9fdfa18..58b2e84dab27 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -17,10 +17,12 @@ #include #include #include + #include #include #include #include +#include int synproxy_net_id; EXPORT_SYMBOL_GPL(synproxy_net_id); @@ -352,7 +354,7 @@ static int __net_init synproxy_net_init(struct net *net) struct nf_conn *ct; int err = -ENOMEM; - ct = nf_ct_tmpl_alloc(net, 0, GFP_KERNEL); + ct = nf_ct_tmpl_alloc(net, &nf_ct_zone_dflt, GFP_KERNEL); if (IS_ERR(ct)) { err = PTR_ERR(ct); goto err1; diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index c6630030c912..29e2856063ff 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -184,6 +184,7 @@ out: static int xt_ct_tg_check(const struct xt_tgchk_param *par, struct xt_ct_target_info_v1 *info) { + struct nf_conntrack_zone zone; struct nf_conn *ct; int ret = -EOPNOTSUPP; @@ -201,7 +202,10 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, if (ret < 0) goto err1; - ct = nf_ct_tmpl_alloc(par->net, info->zone, GFP_KERNEL); + memset(&zone, 0, sizeof(zone)); + zone.id = info->zone; + + ct = nf_ct_tmpl_alloc(par->net, &zone, GFP_KERNEL); ret = PTR_ERR(ct); if (IS_ERR(ct)) goto err2; diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 29ba6218a820..075d89d94d28 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -134,7 +134,7 @@ static bool add_hlist(struct hlist_head *head, static unsigned int check_hlist(struct net *net, struct hlist_head *head, const struct nf_conntrack_tuple *tuple, - u16 zone, + const struct nf_conntrack_zone *zone, bool *addit) { const struct nf_conntrack_tuple_hash *found; @@ -201,7 +201,7 @@ static unsigned int count_tree(struct net *net, struct rb_root *root, const struct nf_conntrack_tuple *tuple, const union nf_inet_addr *addr, const union nf_inet_addr *mask, - u8 family, u16 zone) + u8 family, const struct nf_conntrack_zone *zone) { struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES]; struct rb_node **rbnode, *parent; @@ -290,7 +290,8 @@ static int count_them(struct net *net, const struct nf_conntrack_tuple *tuple, const union nf_inet_addr *addr, const union nf_inet_addr *mask, - u_int8_t family, u16 zone) + u_int8_t family, + const struct nf_conntrack_zone *zone) { struct rb_root *root; int count; @@ -321,10 +322,10 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) union nf_inet_addr addr; struct nf_conntrack_tuple tuple; const struct nf_conntrack_tuple *tuple_ptr = &tuple; + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; unsigned int connections; - u16 zone = NF_CT_DEFAULT_ZONE; ct = nf_ct_get(skb, &ctinfo); if (ct != NULL) { diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index f2b540220ad0..e67a1bdd0929 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -37,6 +37,7 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, struct nf_conntrack_tuple tuple; enum ip_conntrack_info ctinfo; struct tcf_connmark_info *ca = a->priv; + struct nf_conntrack_zone zone; struct nf_conn *c; int proto; @@ -70,7 +71,9 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, proto, &tuple)) goto out; - thash = nf_conntrack_find_get(dev_net(skb->dev), ca->zone, &tuple); + zone.id = ca->zone; + + thash = nf_conntrack_find_get(dev_net(skb->dev), &zone, &tuple); if (!thash) goto out; -- cgit v1.2.3 From eae8dee992af622fd992cb2370cd596ac80ef141 Mon Sep 17 00:00:00 2001 From: Thomas Egerer Date: Mon, 27 Jul 2015 10:50:19 +0200 Subject: xfrm6: Fix IPv6 ECN decapsulation Using ipv6_get_dsfield on the outer IP header implies that inner and outer header are of the the same address family. For interfamily tunnels, particularly 646, the code reading the DSCP field obtains the wrong values (IHL and the upper four bits of the DSCP field). This can cause the code to detect a congestion encoutered state in the outer header and enable the corresponding bits in the inner header, too. Since the DSCP field is stored in the xfrm mode common buffer independently from the IP version of the outer header, it's safe (and correct) to take this value from there. Signed-off-by: Thomas Egerer Signed-off-by: Steffen Klassert --- net/ipv6/xfrm6_mode_tunnel.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index 901ef6f8addc..f7fbdbabe50e 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -20,10 +20,9 @@ static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) { - const struct ipv6hdr *outer_iph = ipv6_hdr(skb); struct ipv6hdr *inner_iph = ipipv6_hdr(skb); - if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) + if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos)) IP6_ECN_set_ce(inner_iph); } -- cgit v1.2.3 From df367561ffe5a66cd0b2970fdb8897d5487d38e6 Mon Sep 17 00:00:00 2001 From: Andrzej Hajda Date: Fri, 7 Aug 2015 09:59:34 +0200 Subject: net/xfrm: use kmemdup rather than duplicating its implementation The patch was generated using fixed coccinelle semantic patch scripts/coccinelle/api/memdup.cocci [1]. [1]: http://permalink.gmane.org/gmane.linux.kernel/2014320 Signed-off-by: Andrzej Hajda Signed-off-by: Steffen Klassert --- net/xfrm/xfrm_user.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 0cebf1fc37a2..a8de9e300200 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -925,12 +925,10 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) return err; if (attrs[XFRMA_ADDRESS_FILTER]) { - filter = kmalloc(sizeof(*filter), GFP_KERNEL); + filter = kmemdup(nla_data(attrs[XFRMA_ADDRESS_FILTER]), + sizeof(*filter), GFP_KERNEL); if (filter == NULL) return -ENOMEM; - - memcpy(filter, nla_data(attrs[XFRMA_ADDRESS_FILTER]), - sizeof(*filter)); } if (attrs[XFRMA_PROTO]) -- cgit v1.2.3 From 42a7b32b73d6bf22e4bdd7bf68746e2d71f4cd8d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Mon, 10 Aug 2015 16:58:11 -0600 Subject: xfrm: Add oif to dst lookups Rules can be installed that direct route lookups to specific tables based on oif. Plumb the oif through the xfrm lookups so it gets set in the flow struct and passed to the resolver routines. Signed-off-by: David Ahern Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 7 +++++-- net/ipv4/xfrm4_policy.c | 11 ++++++----- net/ipv6/xfrm6_policy.c | 7 ++++--- net/xfrm/xfrm_policy.c | 24 ++++++++++++++---------- 4 files changed, 29 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index f0ee97eec24d..312e3fee9ccf 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -285,10 +285,13 @@ struct xfrm_policy_afinfo { unsigned short family; struct dst_ops *dst_ops; void (*garbage_collect)(struct net *net); - struct dst_entry *(*dst_lookup)(struct net *net, int tos, + struct dst_entry *(*dst_lookup)(struct net *net, + int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr); - int (*get_saddr)(struct net *net, xfrm_address_t *saddr, xfrm_address_t *daddr); + int (*get_saddr)(struct net *net, int oif, + xfrm_address_t *saddr, + xfrm_address_t *daddr); void (*decode_session)(struct sk_buff *skb, struct flowi *fl, int reverse); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index bff69746e05f..55b3c0f4dde5 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -19,7 +19,7 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo; static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, - int tos, + int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr) { @@ -28,6 +28,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, memset(fl4, 0, sizeof(*fl4)); fl4->daddr = daddr->a4; fl4->flowi4_tos = tos; + fl4->flowi4_oif = oif; if (saddr) fl4->saddr = saddr->a4; @@ -38,22 +39,22 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, return ERR_CAST(rt); } -static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, +static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr) { struct flowi4 fl4; - return __xfrm4_dst_lookup(net, &fl4, tos, saddr, daddr); + return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr); } -static int xfrm4_get_saddr(struct net *net, +static int xfrm4_get_saddr(struct net *net, int oif, xfrm_address_t *saddr, xfrm_address_t *daddr) { struct dst_entry *dst; struct flowi4 fl4; - dst = __xfrm4_dst_lookup(net, &fl4, 0, NULL, daddr); + dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr); if (IS_ERR(dst)) return -EHOSTUNREACH; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index ed0583c1b9fc..a74013d3eceb 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -26,7 +26,7 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo; -static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, +static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr) { @@ -35,6 +35,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int err; memset(&fl6, 0, sizeof(fl6)); + fl6.flowi6_oif = oif; memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); if (saddr) memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr)); @@ -50,13 +51,13 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, return dst; } -static int xfrm6_get_saddr(struct net *net, +static int xfrm6_get_saddr(struct net *net, int oif, xfrm_address_t *saddr, xfrm_address_t *daddr) { struct dst_entry *dst; struct net_device *dev; - dst = xfrm6_dst_lookup(net, 0, NULL, daddr); + dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr); if (IS_ERR(dst)) return -EHOSTUNREACH; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 18cead7645be..94af3d065785 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -115,7 +115,8 @@ static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo) rcu_read_unlock(); } -static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, +static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, + int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, int family) @@ -127,14 +128,15 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, if (unlikely(afinfo == NULL)) return ERR_PTR(-EAFNOSUPPORT); - dst = afinfo->dst_lookup(net, tos, saddr, daddr); + dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr); xfrm_policy_put_afinfo(afinfo); return dst; } -static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos, +static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, + int tos, int oif, xfrm_address_t *prev_saddr, xfrm_address_t *prev_daddr, int family) @@ -153,7 +155,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos, daddr = x->coaddr; } - dst = __xfrm_dst_lookup(net, tos, saddr, daddr, family); + dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family); if (!IS_ERR(dst)) { if (prev_saddr != saddr) @@ -1373,15 +1375,15 @@ int __xfrm_sk_clone_policy(struct sock *sk) } static int -xfrm_get_saddr(struct net *net, xfrm_address_t *local, xfrm_address_t *remote, - unsigned short family) +xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local, + xfrm_address_t *remote, unsigned short family) { int err; struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return -EINVAL; - err = afinfo->get_saddr(net, local, remote); + err = afinfo->get_saddr(net, oif, local, remote); xfrm_policy_put_afinfo(afinfo); return err; } @@ -1410,7 +1412,9 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, remote = &tmpl->id.daddr; local = &tmpl->saddr; if (xfrm_addr_any(local, tmpl->encap_family)) { - error = xfrm_get_saddr(net, &tmp, remote, tmpl->encap_family); + error = xfrm_get_saddr(net, fl->flowi_oif, + &tmp, remote, + tmpl->encap_family); if (error) goto fail; local = &tmp; @@ -1690,8 +1694,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { family = xfrm[i]->props.family; - dst = xfrm_dst_lookup(xfrm[i], tos, &saddr, &daddr, - family); + dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif, + &saddr, &daddr, family); err = PTR_ERR(dst); if (IS_ERR(dst)) goto put_states; -- cgit v1.2.3 From e071d93eb40c969dc8c578dde5ddd89a30fb01cb Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 22 Jun 2015 09:13:23 +0200 Subject: batman-adv: Replace gw_reselect divisor with simple shift The gw_factor is divided by BATADV_TQ_LOCAL_WINDOW_SIZE ** 2 * 64. But the rest of the calculation has nothing to do with the tq window size and therefore the calculation is just (tmp_gw_factor / (64 ** 3)). Replace it with a simple shift to avoid a costly 64-bit divide when the max_gw_factor is changed from u32 to u64. This type change is necessary to avoid an overflow bug. Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/gateway_client.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index bb0158620628..e1e1f317b915 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -154,14 +154,10 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) struct batadv_neigh_ifinfo *router_ifinfo; struct batadv_gw_node *gw_node, *curr_gw = NULL; uint32_t max_gw_factor = 0, tmp_gw_factor = 0; - uint32_t gw_divisor; uint8_t max_tq = 0; uint8_t tq_avg; struct batadv_orig_node *orig_node; - gw_divisor = BATADV_TQ_LOCAL_WINDOW_SIZE * BATADV_TQ_LOCAL_WINDOW_SIZE; - gw_divisor *= 64; - rcu_read_lock(); hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { if (gw_node->deleted) @@ -187,7 +183,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) tmp_gw_factor = tq_avg * tq_avg; tmp_gw_factor *= gw_node->bandwidth_down; tmp_gw_factor *= 100 * 100; - tmp_gw_factor /= gw_divisor; + tmp_gw_factor >>= 18; if ((tmp_gw_factor > max_gw_factor) || ((tmp_gw_factor == max_gw_factor) && -- cgit v1.2.3 From 571a963768e7711e0bb5a30f71247b46ec9ca7b9 Mon Sep 17 00:00:00 2001 From: Ruben Wisniewski Date: Tue, 16 Jun 2015 21:06:23 +0200 Subject: batman-adv: Avoid u32 overflow during gateway select The gateway selection based on fast connections is using a single value calculated from the average tq (0-255) and the download bandwidth (in 100Kibit). The formula for the first step (tq ** 2 * 10000 * bandwidth) tends to overflow a u32 with low bandwidth settings like 50 [100KiBit] and a tq value of over 92. Changing this to a 64 bit unsigned integer allows to support a bandwidth_down with up to ~2.8e10 [100KiBit] and a perfect tq of 255. This is ~6.6 times higher than the maximum possible value of the gateway announcement TVLV. This problem only affects the non-default gw_sel_class 1. Signed-off-by: Ruben Wisniewsi [sven@narfation.org: rewritten commit message] Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/gateway_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index e1e1f317b915..4ac24d8d151b 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -153,7 +153,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) struct batadv_neigh_node *router; struct batadv_neigh_ifinfo *router_ifinfo; struct batadv_gw_node *gw_node, *curr_gw = NULL; - uint32_t max_gw_factor = 0, tmp_gw_factor = 0; + uint64_t max_gw_factor = 0, tmp_gw_factor = 0; uint8_t max_tq = 0; uint8_t tq_avg; struct batadv_orig_node *orig_node; -- cgit v1.2.3 From cdf0969763e020923abe28fddc605add572febc2 Mon Sep 17 00:00:00 2001 From: David S. Miller Date: Tue, 11 Aug 2015 12:00:37 -0700 Subject: Revert "Merge branch 'mv88e6xxx-switchdev-fdb'" This reverts commit f1d5ca434413b20cd3f8c18ff2b634b7782149a5, reversing changes made to 4933d85c5173832ebd261756522095837583c458. I applied v2 instead of v3. Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6171.c | 6 +- drivers/net/dsa/mv88e6352.c | 6 +- drivers/net/dsa/mv88e6xxx.c | 223 +++++++++++------------------------ drivers/net/dsa/mv88e6xxx.h | 31 ++--- drivers/net/ethernet/rocker/rocker.c | 2 +- include/net/dsa.h | 16 +-- include/net/switchdev.h | 3 +- net/bridge/br_fdb.c | 2 +- net/dsa/slave.c | 218 ++++++++++++++++------------------ net/switchdev/switchdev.c | 7 +- 10 files changed, 197 insertions(+), 317 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/mv88e6171.c b/drivers/net/dsa/mv88e6171.c index 735f04cd83ee..1c7808495a9d 100644 --- a/drivers/net/dsa/mv88e6171.c +++ b/drivers/net/dsa/mv88e6171.c @@ -116,9 +116,9 @@ struct dsa_switch_driver mv88e6171_switch_driver = { .port_join_bridge = mv88e6xxx_join_bridge, .port_leave_bridge = mv88e6xxx_leave_bridge, .port_stp_update = mv88e6xxx_port_stp_update, - .port_fdb_add = mv88e6xxx_port_fdb_add, - .port_fdb_del = mv88e6xxx_port_fdb_del, - .port_fdb_getnext = mv88e6xxx_port_fdb_getnext, + .fdb_add = mv88e6xxx_port_fdb_add, + .fdb_del = mv88e6xxx_port_fdb_del, + .fdb_getnext = mv88e6xxx_port_fdb_getnext, }; MODULE_ALIAS("platform:mv88e6171"); diff --git a/drivers/net/dsa/mv88e6352.c b/drivers/net/dsa/mv88e6352.c index a18f7c83d4cb..7e935852e192 100644 --- a/drivers/net/dsa/mv88e6352.c +++ b/drivers/net/dsa/mv88e6352.c @@ -343,9 +343,9 @@ struct dsa_switch_driver mv88e6352_switch_driver = { .port_join_bridge = mv88e6xxx_join_bridge, .port_leave_bridge = mv88e6xxx_leave_bridge, .port_stp_update = mv88e6xxx_port_stp_update, - .port_fdb_add = mv88e6xxx_port_fdb_add, - .port_fdb_del = mv88e6xxx_port_fdb_del, - .port_fdb_getnext = mv88e6xxx_port_fdb_getnext, + .fdb_add = mv88e6xxx_port_fdb_add, + .fdb_del = mv88e6xxx_port_fdb_del, + .fdb_getnext = mv88e6xxx_port_fdb_getnext, }; MODULE_ALIAS("platform:mv88e6172"); diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c index 9c6781de533b..109452056eff 100644 --- a/drivers/net/dsa/mv88e6xxx.c +++ b/drivers/net/dsa/mv88e6xxx.c @@ -964,7 +964,7 @@ static int _mv88e6xxx_atu_cmd(struct dsa_switch *ds, int fid, u16 cmd) { int ret; - ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, GLOBAL_ATU_FID, fid); + ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, 0x01, fid); if (ret < 0) return ret; @@ -1091,7 +1091,7 @@ int mv88e6xxx_join_bridge(struct dsa_switch *ds, int port, u32 br_port_mask) ps->bridge_mask[fid] = br_port_mask; if (fid != ps->fid[port]) { - clear_bit(ps->fid[port], ps->fid_bitmap); + ps->fid_mask |= 1 << ps->fid[port]; ps->fid[port] = fid; ret = _mv88e6xxx_update_bridge_config(ds, fid); } @@ -1125,16 +1125,9 @@ int mv88e6xxx_leave_bridge(struct dsa_switch *ds, int port, u32 br_port_mask) mutex_lock(&ps->smi_mutex); - newfid = find_next_zero_bit(ps->fid_bitmap, VLAN_N_VID, 1); - if (unlikely(newfid > ps->num_ports)) { - netdev_err(ds->ports[port], "all first %d FIDs are used\n", - ps->num_ports); - ret = -ENOSPC; - goto unlock; - } - + newfid = __ffs(ps->fid_mask); ps->fid[port] = newfid; - set_bit(newfid, ps->fid_bitmap); + ps->fid_mask &= ~(1 << newfid); ps->bridge_mask[fid] &= ~(1 << port); ps->bridge_mask[newfid] = 1 << port; @@ -1142,7 +1135,6 @@ int mv88e6xxx_leave_bridge(struct dsa_switch *ds, int port, u32 br_port_mask) if (!ret) ret = _mv88e6xxx_update_bridge_config(ds, newfid); -unlock: mutex_unlock(&ps->smi_mutex); return ret; @@ -1182,8 +1174,8 @@ int mv88e6xxx_port_stp_update(struct dsa_switch *ds, int port, u8 state) return 0; } -static int _mv88e6xxx_atu_mac_write(struct dsa_switch *ds, - const u8 addr[ETH_ALEN]) +static int __mv88e6xxx_write_addr(struct dsa_switch *ds, + const unsigned char *addr) { int i, ret; @@ -1198,7 +1190,7 @@ static int _mv88e6xxx_atu_mac_write(struct dsa_switch *ds, return 0; } -static int _mv88e6xxx_atu_mac_read(struct dsa_switch *ds, u8 addr[ETH_ALEN]) +static int __mv88e6xxx_read_addr(struct dsa_switch *ds, unsigned char *addr) { int i, ret; @@ -1214,190 +1206,109 @@ static int _mv88e6xxx_atu_mac_read(struct dsa_switch *ds, u8 addr[ETH_ALEN]) return 0; } -static int _mv88e6xxx_atu_load(struct dsa_switch *ds, - struct mv88e6xxx_atu_entry *entry) -{ - u16 reg = 0; - int ret; - - ret = _mv88e6xxx_atu_wait(ds); - if (ret < 0) - return ret; - - ret = _mv88e6xxx_atu_mac_write(ds, entry->mac); - if (ret < 0) - return ret; - - if (entry->state != GLOBAL_ATU_DATA_STATE_UNUSED) { - unsigned int mask, shift; - - if (entry->trunk) { - reg |= GLOBAL_ATU_DATA_TRUNK; - mask = GLOBAL_ATU_DATA_TRUNK_ID_MASK; - shift = GLOBAL_ATU_DATA_TRUNK_ID_SHIFT; - } else { - mask = GLOBAL_ATU_DATA_PORT_VECTOR_MASK; - shift = GLOBAL_ATU_DATA_PORT_VECTOR_SHIFT; - } - - reg |= (entry->portv_trunkid << shift) & mask; - } - - reg |= entry->state & GLOBAL_ATU_DATA_STATE_MASK; - - ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, GLOBAL_ATU_DATA, reg); - if (ret < 0) - return ret; - - return _mv88e6xxx_atu_cmd(ds, entry->fid, GLOBAL_ATU_OP_LOAD_DB); -} - -static int _mv88e6xxx_atu_getnext(struct dsa_switch *ds, u16 fid, - const u8 addr[ETH_ALEN], - struct mv88e6xxx_atu_entry *entry) +static int __mv88e6xxx_port_fdb_cmd(struct dsa_switch *ds, int port, + const unsigned char *addr, int state) { - struct mv88e6xxx_atu_entry next = { 0 }; + struct mv88e6xxx_priv_state *ps = ds_to_priv(ds); + u8 fid = ps->fid[port]; int ret; - next.fid = fid; - ret = _mv88e6xxx_atu_wait(ds); if (ret < 0) return ret; - ret = _mv88e6xxx_atu_mac_write(ds, addr); + ret = __mv88e6xxx_write_addr(ds, addr); if (ret < 0) return ret; - ret = _mv88e6xxx_atu_cmd(ds, fid, GLOBAL_ATU_OP_GET_NEXT_DB); - if (ret < 0) - return ret; - - ret = _mv88e6xxx_atu_mac_read(ds, next.mac); - if (ret < 0) - return ret; - - ret = _mv88e6xxx_reg_read(ds, REG_GLOBAL, GLOBAL_ATU_DATA); - if (ret < 0) - return ret; - - next.state = ret & GLOBAL_ATU_DATA_STATE_MASK; - if (next.state != GLOBAL_ATU_DATA_STATE_UNUSED) { - unsigned int mask, shift; - - if (ret & GLOBAL_ATU_DATA_TRUNK) { - next.trunk = true; - mask = GLOBAL_ATU_DATA_TRUNK_ID_MASK; - shift = GLOBAL_ATU_DATA_TRUNK_ID_SHIFT; - } else { - next.trunk = false; - mask = GLOBAL_ATU_DATA_PORT_VECTOR_MASK; - shift = GLOBAL_ATU_DATA_PORT_VECTOR_SHIFT; - } - - next.portv_trunkid = (ret & mask) >> shift; - } - - *entry = next; - return 0; -} - -static int _mv88e6xxx_port_vid_to_fid(struct dsa_switch *ds, int port, u16 vid) -{ - struct mv88e6xxx_priv_state *ps = ds_to_priv(ds); - - if (vid == 0) - return ps->fid[port]; - - return -ENOENT; -} - -static int _mv88e6xxx_port_fdb_load(struct dsa_switch *ds, int port, u16 vid, - const u8 addr[ETH_ALEN], u8 state) -{ - struct mv88e6xxx_atu_entry entry = { 0 }; - int ret; - - ret = _mv88e6xxx_port_vid_to_fid(ds, port, vid); - if (ret < 0) + ret = _mv88e6xxx_reg_write(ds, REG_GLOBAL, GLOBAL_ATU_DATA, + (0x10 << port) | state); + if (ret) return ret; - entry.fid = ret; - entry.state = state; - ether_addr_copy(entry.mac, addr); - if (state != GLOBAL_ATU_DATA_STATE_UNUSED) { - entry.trunk = false; - entry.portv_trunkid = BIT(port); - } + ret = _mv88e6xxx_atu_cmd(ds, fid, GLOBAL_ATU_OP_LOAD_DB); - return _mv88e6xxx_atu_load(ds, &entry); + return ret; } -int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port, u16 vid, - const u8 addr[ETH_ALEN]) +int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid) { - struct mv88e6xxx_priv_state *ps = ds_to_priv(ds); - u8 state = is_multicast_ether_addr(addr) ? + int state = is_multicast_ether_addr(addr) ? GLOBAL_ATU_DATA_STATE_MC_STATIC : GLOBAL_ATU_DATA_STATE_UC_STATIC; + struct mv88e6xxx_priv_state *ps = ds_to_priv(ds); int ret; mutex_lock(&ps->smi_mutex); - ret = _mv88e6xxx_port_fdb_load(ds, port, vid, addr, state); + ret = __mv88e6xxx_port_fdb_cmd(ds, port, addr, state); mutex_unlock(&ps->smi_mutex); return ret; } -int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port, u16 vid, - const u8 addr[ETH_ALEN]) +int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid) { struct mv88e6xxx_priv_state *ps = ds_to_priv(ds); - u8 state = GLOBAL_ATU_DATA_STATE_UNUSED; int ret; mutex_lock(&ps->smi_mutex); - ret = _mv88e6xxx_port_fdb_load(ds, port, vid, addr, state); + ret = __mv88e6xxx_port_fdb_cmd(ds, port, addr, + GLOBAL_ATU_DATA_STATE_UNUSED); mutex_unlock(&ps->smi_mutex); return ret; } -int mv88e6xxx_port_fdb_getnext(struct dsa_switch *ds, int port, u16 *vid, - u8 addr[ETH_ALEN], bool *is_static) +static int __mv88e6xxx_port_getnext(struct dsa_switch *ds, int port, + unsigned char *addr, bool *is_static) { struct mv88e6xxx_priv_state *ps = ds_to_priv(ds); - struct mv88e6xxx_atu_entry next; - u16 fid; - int ret; + u8 fid = ps->fid[port]; + int ret, state; - mutex_lock(&ps->smi_mutex); + ret = _mv88e6xxx_atu_wait(ds); + if (ret < 0) + return ret; - ret = _mv88e6xxx_port_vid_to_fid(ds, port, *vid); + ret = __mv88e6xxx_write_addr(ds, addr); if (ret < 0) - goto unlock; - fid = ret; + return ret; do { - if (is_broadcast_ether_addr(addr)) { - ret = -ENOENT; - goto unlock; - } + ret = _mv88e6xxx_atu_cmd(ds, fid, GLOBAL_ATU_OP_GET_NEXT_DB); + if (ret < 0) + return ret; - ret = _mv88e6xxx_atu_getnext(ds, fid, addr, &next); + ret = _mv88e6xxx_reg_read(ds, REG_GLOBAL, GLOBAL_ATU_DATA); if (ret < 0) - goto unlock; + return ret; + state = ret & GLOBAL_ATU_DATA_STATE_MASK; + if (state == GLOBAL_ATU_DATA_STATE_UNUSED) + return -ENOENT; + } while (!(((ret >> 4) & 0xff) & (1 << port))); - ether_addr_copy(addr, next.mac); + ret = __mv88e6xxx_read_addr(ds, addr); + if (ret < 0) + return ret; - if (next.state == GLOBAL_ATU_DATA_STATE_UNUSED) - continue; - } while (next.trunk || (next.portv_trunkid & BIT(port)) == 0); + *is_static = state == (is_multicast_ether_addr(addr) ? + GLOBAL_ATU_DATA_STATE_MC_STATIC : + GLOBAL_ATU_DATA_STATE_UC_STATIC); + + return 0; +} + +/* get next entry for port */ +int mv88e6xxx_port_fdb_getnext(struct dsa_switch *ds, int port, + unsigned char *addr, bool *is_static) +{ + struct mv88e6xxx_priv_state *ps = ds_to_priv(ds); + int ret; - *is_static = next.state == (is_multicast_ether_addr(addr) ? - GLOBAL_ATU_DATA_STATE_MC_STATIC : - GLOBAL_ATU_DATA_STATE_UC_STATIC); -unlock: + mutex_lock(&ps->smi_mutex); + ret = __mv88e6xxx_port_getnext(ds, port, addr, is_static); mutex_unlock(&ps->smi_mutex); return ret; @@ -1641,9 +1552,9 @@ static int mv88e6xxx_setup_port(struct dsa_switch *ds, int port) * ports, and allow each of the 'real' ports to only talk to * the upstream port. */ - fid = port + 1; + fid = __ffs(ps->fid_mask); ps->fid[port] = fid; - set_bit(fid, ps->fid_bitmap); + ps->fid_mask &= ~(1 << fid); if (!dsa_is_cpu_port(ds, port)) ps->bridge_mask[fid] = 1 << port; @@ -1740,7 +1651,7 @@ static int mv88e6xxx_atu_show_db(struct seq_file *s, struct dsa_switch *ds, unsigned char addr[6]; int ret, data, state; - ret = _mv88e6xxx_atu_mac_write(ds, bcast); + ret = __mv88e6xxx_write_addr(ds, bcast); if (ret < 0) return ret; @@ -1755,7 +1666,7 @@ static int mv88e6xxx_atu_show_db(struct seq_file *s, struct dsa_switch *ds, state = data & GLOBAL_ATU_DATA_STATE_MASK; if (state == GLOBAL_ATU_DATA_STATE_UNUSED) break; - ret = _mv88e6xxx_atu_mac_read(ds, addr); + ret = __mv88e6xxx_read_addr(ds, addr); if (ret < 0) return ret; mv88e6xxx_atu_show_entry(s, dbnum, addr, data); @@ -1942,6 +1853,8 @@ int mv88e6xxx_setup_common(struct dsa_switch *ds) ps->id = REG_READ(REG_PORT(0), PORT_SWITCH_ID) & 0xfff0; + ps->fid_mask = (1 << DSA_MAX_PORTS) - 1; + INIT_WORK(&ps->bridge_work, mv88e6xxx_bridge_work); name = kasprintf(GFP_KERNEL, "dsa%d", ds->index); diff --git a/drivers/net/dsa/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx.h index a94c0cbb3813..8b017d65b691 100644 --- a/drivers/net/dsa/mv88e6xxx.h +++ b/drivers/net/dsa/mv88e6xxx.h @@ -11,8 +11,6 @@ #ifndef __MV88E6XXX_H #define __MV88E6XXX_H -#include - #ifndef UINT64_MAX #define UINT64_MAX (u64)(~((u64)0)) #endif @@ -171,7 +169,6 @@ #define GLOBAL_MAC_01 0x01 #define GLOBAL_MAC_23 0x02 #define GLOBAL_MAC_45 0x03 -#define GLOBAL_ATU_FID 0x01 /* 6097 6165 6351 6352 */ #define GLOBAL_CONTROL 0x04 #define GLOBAL_CONTROL_SW_RESET BIT(15) #define GLOBAL_CONTROL_PPU_ENABLE BIT(14) @@ -206,8 +203,6 @@ #define GLOBAL_ATU_OP_GET_CLR_VIOLATION ((7 << 12) | GLOBAL_ATU_OP_BUSY) #define GLOBAL_ATU_DATA 0x0c #define GLOBAL_ATU_DATA_TRUNK BIT(15) -#define GLOBAL_ATU_DATA_TRUNK_ID_MASK 0x00f0 -#define GLOBAL_ATU_DATA_TRUNK_ID_SHIFT 4 #define GLOBAL_ATU_DATA_PORT_VECTOR_MASK 0x3ff0 #define GLOBAL_ATU_DATA_PORT_VECTOR_SHIFT 4 #define GLOBAL_ATU_DATA_STATE_MASK 0x0f @@ -318,14 +313,6 @@ #define GLOBAL2_QOS_WEIGHT 0x1c #define GLOBAL2_MISC 0x1d -struct mv88e6xxx_atu_entry { - u16 fid; - u8 state; - bool trunk; - u16 portv_trunkid; - u8 mac[ETH_ALEN]; -}; - struct mv88e6xxx_priv_state { /* When using multi-chip addressing, this mutex protects * access to the indirect access registers. (In single-chip @@ -364,9 +351,9 @@ struct mv88e6xxx_priv_state { /* hw bridging */ - DECLARE_BITMAP(fid_bitmap, VLAN_N_VID); /* FIDs 1 to 4095 available */ - u16 fid[DSA_MAX_PORTS]; /* per (non-bridged) port FID */ - u16 bridge_mask[DSA_MAX_PORTS]; /* br groups (indexed by FID) */ + u32 fid_mask; + u8 fid[DSA_MAX_PORTS]; + u16 bridge_mask[DSA_MAX_PORTS]; unsigned long port_state_update_mask; u8 port_state[DSA_MAX_PORTS]; @@ -426,15 +413,15 @@ int mv88e6xxx_set_eee(struct dsa_switch *ds, int port, int mv88e6xxx_join_bridge(struct dsa_switch *ds, int port, u32 br_port_mask); int mv88e6xxx_leave_bridge(struct dsa_switch *ds, int port, u32 br_port_mask); int mv88e6xxx_port_stp_update(struct dsa_switch *ds, int port, u8 state); +int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid); +int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid); +int mv88e6xxx_port_fdb_getnext(struct dsa_switch *ds, int port, + unsigned char *addr, bool *is_static); int mv88e6xxx_phy_page_read(struct dsa_switch *ds, int port, int page, int reg); int mv88e6xxx_phy_page_write(struct dsa_switch *ds, int port, int page, int reg, int val); -int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port, u16 vid, - const u8 addr[ETH_ALEN]); -int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port, u16 vid, - const u8 addr[ETH_ALEN]); -int mv88e6xxx_port_fdb_getnext(struct dsa_switch *ds, int port, u16 *vid, - u8 addr[ETH_ALEN], bool *is_static); extern struct dsa_switch_driver mv88e6131_switch_driver; extern struct dsa_switch_driver mv88e6123_61_65_switch_driver; diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index 80bb25c5a644..b77e0e7307d4 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -4543,7 +4543,7 @@ static int rocker_port_fdb_dump(const struct rocker_port *rocker_port, hash_for_each_safe(rocker->fdb_tbl, bkt, tmp, found, entry) { if (found->key.pport != rocker_port->pport) continue; - ether_addr_copy(fdb->addr, found->key.addr); + fdb->addr = found->key.addr; fdb->vid = rocker_port_vlan_to_vid(rocker_port, found->key.vlan_id); err = obj->cb(rocker_port->dev, obj); diff --git a/include/net/dsa.h b/include/net/dsa.h index 091d35f77180..fbca63ba8f73 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -296,16 +296,12 @@ struct dsa_switch_driver { u32 br_port_mask); int (*port_stp_update)(struct dsa_switch *ds, int port, u8 state); - - /* - * Forwarding database - */ - int (*port_fdb_add)(struct dsa_switch *ds, int port, u16 vid, - const u8 addr[ETH_ALEN]); - int (*port_fdb_del)(struct dsa_switch *ds, int port, u16 vid, - const u8 addr[ETH_ALEN]); - int (*port_fdb_getnext)(struct dsa_switch *ds, int port, u16 *vid, - u8 addr[ETH_ALEN], bool *is_static); + int (*fdb_add)(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid); + int (*fdb_del)(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid); + int (*fdb_getnext)(struct dsa_switch *ds, int port, + unsigned char *addr, bool *is_static); }; void register_switch_driver(struct dsa_switch_driver *type); diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 0e296b82aef3..89da8934519b 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -70,9 +70,8 @@ struct switchdev_obj { u32 tb_id; } ipv4_fib; struct switchdev_obj_fdb { /* PORT_FDB */ - u8 addr[ETH_ALEN]; + const unsigned char *addr; u16 vid; - bool is_static; } fdb; } u; }; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 5656b44bf3de..9e9875da0a4f 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -136,11 +136,11 @@ static void fdb_del_external_learn(struct net_bridge_fdb_entry *f) struct switchdev_obj obj = { .id = SWITCHDEV_OBJ_PORT_FDB, .u.fdb = { + .addr = f->addr.addr, .vid = f->vlan_id, }, }; - ether_addr_copy(obj.u.fdb.addr, f->addr.addr); switchdev_port_obj_del(f->dst->dev, &obj); } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 1dbdeaab2bb4..0010c690cc67 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -19,7 +19,6 @@ #include #include #include -#include #include "dsa_priv.h" /* slave mii_bus handling ***************************************************/ @@ -201,6 +200,105 @@ out: return 0; } +static int dsa_slave_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, u16 vid, u16 nlm_flags) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret = -EOPNOTSUPP; + + if (ds->drv->fdb_add) + ret = ds->drv->fdb_add(ds, p->port, addr, vid); + + return ret; +} + +static int dsa_slave_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, u16 vid) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + int ret = -EOPNOTSUPP; + + if (ds->drv->fdb_del) + ret = ds->drv->fdb_del(ds, p->port, addr, vid); + + return ret; +} + +static int dsa_slave_fill_info(struct net_device *dev, struct sk_buff *skb, + const unsigned char *addr, u16 vid, + bool is_static, + u32 portid, u32 seq, int type, + unsigned int flags) +{ + struct nlmsghdr *nlh; + struct ndmsg *ndm; + + nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); + if (!nlh) + return -EMSGSIZE; + + ndm = nlmsg_data(nlh); + ndm->ndm_family = AF_BRIDGE; + ndm->ndm_pad1 = 0; + ndm->ndm_pad2 = 0; + ndm->ndm_flags = NTF_EXT_LEARNED; + ndm->ndm_type = 0; + ndm->ndm_ifindex = dev->ifindex; + ndm->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE; + + if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr)) + goto nla_put_failure; + + if (vid && nla_put_u16(skb, NDA_VLAN, vid)) + goto nla_put_failure; + + nlmsg_end(skb, nlh); + return 0; + +nla_put_failure: + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; +} + +/* Dump information about entries, in response to GETNEIGH */ +static int dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, + struct net_device *dev, + struct net_device *filter_dev, int idx) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + unsigned char addr[ETH_ALEN] = { 0 }; + int ret; + + if (!ds->drv->fdb_getnext) + return -EOPNOTSUPP; + + for (; ; idx++) { + bool is_static; + + ret = ds->drv->fdb_getnext(ds, p->port, addr, &is_static); + if (ret < 0) + break; + + if (idx < cb->args[0]) + continue; + + ret = dsa_slave_fill_info(dev, skb, addr, 0, + is_static, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, NLM_F_MULTI); + if (ret < 0) + break; + } + + return idx; +} + static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct dsa_slave_priv *p = netdev_priv(dev); @@ -266,115 +364,6 @@ static int dsa_slave_port_attr_set(struct net_device *dev, return ret; } -static int dsa_slave_port_fdb_add(struct net_device *dev, - struct switchdev_obj *obj) -{ - struct switchdev_obj_fdb *fdb = &obj->u.fdb; - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; - int err; - - if (obj->trans == SWITCHDEV_TRANS_PREPARE) - err = ds->drv->port_fdb_add ? 0 : -EOPNOTSUPP; - else if (obj->trans == SWITCHDEV_TRANS_COMMIT) - err = ds->drv->port_fdb_add(ds, p->port, fdb->vid, fdb->addr); - else - err = -EOPNOTSUPP; - - return err; -} - -static int dsa_slave_port_fdb_del(struct net_device *dev, - struct switchdev_obj *obj) -{ - struct switchdev_obj_fdb *fdb = &obj->u.fdb; - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; - - if (!ds->drv->port_fdb_del) - return -EOPNOTSUPP; - - return ds->drv->port_fdb_del(ds, p->port, fdb->vid, fdb->addr); -} - -static int dsa_slave_port_fdb_dump(struct net_device *dev, - struct switchdev_obj *obj) -{ - struct switchdev_obj_fdb *fdb = &obj->u.fdb; - struct dsa_slave_priv *p = netdev_priv(dev); - struct dsa_switch *ds = p->parent; - int err; - - if (!ds->drv->port_fdb_getnext) - return -EOPNOTSUPP; - - memset(fdb, 0, sizeof(*fdb)); - - for (;;) { - err = ds->drv->port_fdb_getnext(ds, p->port, &fdb->vid, - fdb->addr, &fdb->is_static); - if (err) - break; - - err = obj->cb(dev, obj); - if (err) - break; - } - - return err == -ENOENT ? 0 : err; -} - -static int dsa_slave_port_obj_add(struct net_device *dev, - struct switchdev_obj *obj) -{ - int err; - - switch (obj->id) { - case SWITCHDEV_OBJ_PORT_FDB: - err = dsa_slave_port_fdb_add(dev, obj); - break; - default: - err = -EOPNOTSUPP; - break; - } - - return err; -} - -static int dsa_slave_port_obj_del(struct net_device *dev, - struct switchdev_obj *obj) -{ - int err; - - switch (obj->id) { - case SWITCHDEV_OBJ_PORT_FDB: - err = dsa_slave_port_fdb_del(dev, obj); - break; - default: - err = -EOPNOTSUPP; - break; - } - - return err; -} - -static int dsa_slave_port_obj_dump(struct net_device *dev, - struct switchdev_obj *obj) -{ - int err; - - switch (obj->id) { - case SWITCHDEV_OBJ_PORT_FDB: - err = dsa_slave_port_fdb_dump(dev, obj); - break; - default: - err = -EOPNOTSUPP; - break; - } - - return err; -} - static int dsa_slave_bridge_port_join(struct net_device *dev, struct net_device *br) { @@ -776,9 +765,9 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_change_rx_flags = dsa_slave_change_rx_flags, .ndo_set_rx_mode = dsa_slave_set_rx_mode, .ndo_set_mac_address = dsa_slave_set_mac_address, - .ndo_fdb_add = switchdev_port_fdb_add, - .ndo_fdb_del = switchdev_port_fdb_del, - .ndo_fdb_dump = switchdev_port_fdb_dump, + .ndo_fdb_add = dsa_slave_fdb_add, + .ndo_fdb_del = dsa_slave_fdb_del, + .ndo_fdb_dump = dsa_slave_fdb_dump, .ndo_do_ioctl = dsa_slave_ioctl, .ndo_get_iflink = dsa_slave_get_iflink, #ifdef CONFIG_NET_POLL_CONTROLLER @@ -791,9 +780,6 @@ static const struct net_device_ops dsa_slave_netdev_ops = { static const struct switchdev_ops dsa_slave_switchdev_ops = { .switchdev_port_attr_get = dsa_slave_port_attr_get, .switchdev_port_attr_set = dsa_slave_port_attr_set, - .switchdev_port_obj_add = dsa_slave_port_obj_add, - .switchdev_port_obj_del = dsa_slave_port_obj_del, - .switchdev_port_obj_dump = dsa_slave_port_obj_dump, }; static void dsa_slave_adjust_link(struct net_device *dev) diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index e9d1cacc4060..33bafa2e703e 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -743,11 +742,11 @@ int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct switchdev_obj obj = { .id = SWITCHDEV_OBJ_PORT_FDB, .u.fdb = { + .addr = addr, .vid = vid, }, }; - ether_addr_copy(obj.u.fdb.addr, addr); return switchdev_port_obj_add(dev, &obj); } EXPORT_SYMBOL_GPL(switchdev_port_fdb_add); @@ -770,11 +769,11 @@ int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct switchdev_obj obj = { .id = SWITCHDEV_OBJ_PORT_FDB, .u.fdb = { + .addr = addr, .vid = vid, }, }; - ether_addr_copy(obj.u.fdb.addr, addr); return switchdev_port_obj_del(dev, &obj); } EXPORT_SYMBOL_GPL(switchdev_port_fdb_del); @@ -811,7 +810,7 @@ static int switchdev_port_fdb_dump_cb(struct net_device *dev, ndm->ndm_flags = NTF_SELF; ndm->ndm_type = 0; ndm->ndm_ifindex = dev->ifindex; - ndm->ndm_state = obj->u.fdb.is_static ? NUD_NOARP : NUD_REACHABLE; + ndm->ndm_state = NUD_REACHABLE; if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, obj->u.fdb.addr)) goto nla_put_failure; -- cgit v1.2.3 From 2a778e1b58990e15de5cba4badec1fa7ecb87e80 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 10 Aug 2015 09:09:49 -0400 Subject: net: dsa: change FDB routines prototypes Change the prototype of port_getnext to include a vid parameter. This is necessary to introduce the support for VLAN. Also rename the fdb_{add,del,getnext} function pointers to port_fdb_{add,del,getnext} since they are specific to a given port. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- drivers/net/dsa/mv88e6171.c | 6 +++--- drivers/net/dsa/mv88e6352.c | 6 +++--- drivers/net/dsa/mv88e6xxx.c | 2 +- drivers/net/dsa/mv88e6xxx.h | 2 +- include/net/dsa.h | 17 +++++++++++------ net/dsa/slave.c | 16 +++++++++------- 6 files changed, 28 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/drivers/net/dsa/mv88e6171.c b/drivers/net/dsa/mv88e6171.c index 1c7808495a9d..735f04cd83ee 100644 --- a/drivers/net/dsa/mv88e6171.c +++ b/drivers/net/dsa/mv88e6171.c @@ -116,9 +116,9 @@ struct dsa_switch_driver mv88e6171_switch_driver = { .port_join_bridge = mv88e6xxx_join_bridge, .port_leave_bridge = mv88e6xxx_leave_bridge, .port_stp_update = mv88e6xxx_port_stp_update, - .fdb_add = mv88e6xxx_port_fdb_add, - .fdb_del = mv88e6xxx_port_fdb_del, - .fdb_getnext = mv88e6xxx_port_fdb_getnext, + .port_fdb_add = mv88e6xxx_port_fdb_add, + .port_fdb_del = mv88e6xxx_port_fdb_del, + .port_fdb_getnext = mv88e6xxx_port_fdb_getnext, }; MODULE_ALIAS("platform:mv88e6171"); diff --git a/drivers/net/dsa/mv88e6352.c b/drivers/net/dsa/mv88e6352.c index 7e935852e192..a18f7c83d4cb 100644 --- a/drivers/net/dsa/mv88e6352.c +++ b/drivers/net/dsa/mv88e6352.c @@ -343,9 +343,9 @@ struct dsa_switch_driver mv88e6352_switch_driver = { .port_join_bridge = mv88e6xxx_join_bridge, .port_leave_bridge = mv88e6xxx_leave_bridge, .port_stp_update = mv88e6xxx_port_stp_update, - .fdb_add = mv88e6xxx_port_fdb_add, - .fdb_del = mv88e6xxx_port_fdb_del, - .fdb_getnext = mv88e6xxx_port_fdb_getnext, + .port_fdb_add = mv88e6xxx_port_fdb_add, + .port_fdb_del = mv88e6xxx_port_fdb_del, + .port_fdb_getnext = mv88e6xxx_port_fdb_getnext, }; MODULE_ALIAS("platform:mv88e6172"); diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c index 0cc83785d194..d68e3fdd6c99 100644 --- a/drivers/net/dsa/mv88e6xxx.c +++ b/drivers/net/dsa/mv88e6xxx.c @@ -1310,7 +1310,7 @@ static int __mv88e6xxx_port_getnext(struct dsa_switch *ds, int port, /* get next entry for port */ int mv88e6xxx_port_fdb_getnext(struct dsa_switch *ds, int port, - unsigned char *addr, bool *is_static) + unsigned char *addr, u16 *vid, bool *is_static) { struct mv88e6xxx_priv_state *ps = ds_to_priv(ds); int ret; diff --git a/drivers/net/dsa/mv88e6xxx.h b/drivers/net/dsa/mv88e6xxx.h index 200327b7ea7d..55a6190ce159 100644 --- a/drivers/net/dsa/mv88e6xxx.h +++ b/drivers/net/dsa/mv88e6xxx.h @@ -421,7 +421,7 @@ int mv88e6xxx_port_fdb_add(struct dsa_switch *ds, int port, int mv88e6xxx_port_fdb_del(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid); int mv88e6xxx_port_fdb_getnext(struct dsa_switch *ds, int port, - unsigned char *addr, bool *is_static); + unsigned char *addr, u16 *vid, bool *is_static); int mv88e6xxx_phy_page_read(struct dsa_switch *ds, int port, int page, int reg); int mv88e6xxx_phy_page_write(struct dsa_switch *ds, int port, int page, int reg, int val); diff --git a/include/net/dsa.h b/include/net/dsa.h index fbca63ba8f73..6356f437e911 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -296,12 +296,17 @@ struct dsa_switch_driver { u32 br_port_mask); int (*port_stp_update)(struct dsa_switch *ds, int port, u8 state); - int (*fdb_add)(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid); - int (*fdb_del)(struct dsa_switch *ds, int port, - const unsigned char *addr, u16 vid); - int (*fdb_getnext)(struct dsa_switch *ds, int port, - unsigned char *addr, bool *is_static); + + /* + * Forwarding database + */ + int (*port_fdb_add)(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid); + int (*port_fdb_del)(struct dsa_switch *ds, int port, + const unsigned char *addr, u16 vid); + int (*port_fdb_getnext)(struct dsa_switch *ds, int port, + unsigned char *addr, u16 *vid, + bool *is_static); }; void register_switch_driver(struct dsa_switch_driver *type); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 0010c690cc67..3d341b694ecf 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -208,8 +208,8 @@ static int dsa_slave_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct dsa_switch *ds = p->parent; int ret = -EOPNOTSUPP; - if (ds->drv->fdb_add) - ret = ds->drv->fdb_add(ds, p->port, addr, vid); + if (ds->drv->port_fdb_add) + ret = ds->drv->port_fdb_add(ds, p->port, addr, vid); return ret; } @@ -222,8 +222,8 @@ static int dsa_slave_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct dsa_switch *ds = p->parent; int ret = -EOPNOTSUPP; - if (ds->drv->fdb_del) - ret = ds->drv->fdb_del(ds, p->port, addr, vid); + if (ds->drv->port_fdb_del) + ret = ds->drv->port_fdb_del(ds, p->port, addr, vid); return ret; } @@ -272,22 +272,24 @@ static int dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; unsigned char addr[ETH_ALEN] = { 0 }; + u16 vid = 0; int ret; - if (!ds->drv->fdb_getnext) + if (!ds->drv->port_fdb_getnext) return -EOPNOTSUPP; for (; ; idx++) { bool is_static; - ret = ds->drv->fdb_getnext(ds, p->port, addr, &is_static); + ret = ds->drv->port_fdb_getnext(ds, p->port, addr, &vid, + &is_static); if (ret < 0) break; if (idx < cb->args[0]) continue; - ret = dsa_slave_fill_info(dev, skb, addr, 0, + ret = dsa_slave_fill_info(dev, skb, addr, vid, is_static, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, -- cgit v1.2.3 From ce80e7bc57e25062c361de8fb6444129a63bac6d Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 10 Aug 2015 09:09:52 -0400 Subject: net: switchdev: support static FDB addresses This patch adds an ndm_state member to the switchdev_obj_fdb structure, in order to support static FDB addresses. Set Rocker ndm_state to NUD_REACHABLE. Signed-off-by: Vivien Didelot Acked-by: Scott Feldman Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker.c | 1 + include/net/switchdev.h | 1 + net/switchdev/switchdev.c | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index b77e0e7307d4..af050759eb44 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -4544,6 +4544,7 @@ static int rocker_port_fdb_dump(const struct rocker_port *rocker_port, if (found->key.pport != rocker_port->pport) continue; fdb->addr = found->key.addr; + fdb->ndm_state = NUD_REACHABLE; fdb->vid = rocker_port_vlan_to_vid(rocker_port, found->key.vlan_id); err = obj->cb(rocker_port->dev, obj); diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 89da8934519b..319baab3b48e 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -72,6 +72,7 @@ struct switchdev_obj { struct switchdev_obj_fdb { /* PORT_FDB */ const unsigned char *addr; u16 vid; + u16 ndm_state; } fdb; } u; }; diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 33bafa2e703e..16c1c43980a1 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -810,7 +810,7 @@ static int switchdev_port_fdb_dump_cb(struct net_device *dev, ndm->ndm_flags = NTF_SELF; ndm->ndm_type = 0; ndm->ndm_ifindex = dev->ifindex; - ndm->ndm_state = NUD_REACHABLE; + ndm->ndm_state = obj->u.fdb.ndm_state; if (nla_put(dump->skb, NDA_LLADDR, ETH_ALEN, obj->u.fdb.addr)) goto nla_put_failure; -- cgit v1.2.3 From ba14d9eb1999cad5b810f1fd97d1cb2d3f00869e Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Mon, 10 Aug 2015 09:09:53 -0400 Subject: net: dsa: add support for switchdev FDB objects Implement the switchdev_port_obj_{add,del,dump} functions in DSA to support the SWITCHDEV_OBJ_PORT_FDB objects. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/slave.c | 142 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 81 insertions(+), 61 deletions(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 3d341b694ecf..276758406065 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -200,74 +200,38 @@ out: return 0; } -static int dsa_slave_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid, u16 nlm_flags) +static int dsa_slave_port_fdb_add(struct net_device *dev, + struct switchdev_obj *obj) { + struct switchdev_obj_fdb *fdb = &obj->u.fdb; struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; int ret = -EOPNOTSUPP; - if (ds->drv->port_fdb_add) - ret = ds->drv->port_fdb_add(ds, p->port, addr, vid); + if (obj->trans == SWITCHDEV_TRANS_PREPARE) + ret = ds->drv->port_fdb_add ? 0 : -EOPNOTSUPP; + else if (obj->trans == SWITCHDEV_TRANS_COMMIT) + ret = ds->drv->port_fdb_add(ds, p->port, fdb->addr, fdb->vid); return ret; } -static int dsa_slave_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], - struct net_device *dev, - const unsigned char *addr, u16 vid) +static int dsa_slave_port_fdb_del(struct net_device *dev, + struct switchdev_obj *obj) { + struct switchdev_obj_fdb *fdb = &obj->u.fdb; struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; int ret = -EOPNOTSUPP; if (ds->drv->port_fdb_del) - ret = ds->drv->port_fdb_del(ds, p->port, addr, vid); + ret = ds->drv->port_fdb_del(ds, p->port, fdb->addr, fdb->vid); return ret; } -static int dsa_slave_fill_info(struct net_device *dev, struct sk_buff *skb, - const unsigned char *addr, u16 vid, - bool is_static, - u32 portid, u32 seq, int type, - unsigned int flags) -{ - struct nlmsghdr *nlh; - struct ndmsg *ndm; - - nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); - if (!nlh) - return -EMSGSIZE; - - ndm = nlmsg_data(nlh); - ndm->ndm_family = AF_BRIDGE; - ndm->ndm_pad1 = 0; - ndm->ndm_pad2 = 0; - ndm->ndm_flags = NTF_EXT_LEARNED; - ndm->ndm_type = 0; - ndm->ndm_ifindex = dev->ifindex; - ndm->ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE; - - if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr)) - goto nla_put_failure; - - if (vid && nla_put_u16(skb, NDA_VLAN, vid)) - goto nla_put_failure; - - nlmsg_end(skb, nlh); - return 0; - -nla_put_failure: - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; -} - -/* Dump information about entries, in response to GETNEIGH */ -static int dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, - struct net_device *dev, - struct net_device *filter_dev, int idx) +static int dsa_slave_port_fdb_dump(struct net_device *dev, + struct switchdev_obj *obj) { struct dsa_slave_priv *p = netdev_priv(dev); struct dsa_switch *ds = p->parent; @@ -278,7 +242,7 @@ static int dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, if (!ds->drv->port_fdb_getnext) return -EOPNOTSUPP; - for (; ; idx++) { + for (;;) { bool is_static; ret = ds->drv->port_fdb_getnext(ds, p->port, addr, &vid, @@ -286,19 +250,16 @@ static int dsa_slave_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, if (ret < 0) break; - if (idx < cb->args[0]) - continue; + obj->u.fdb.addr = addr; + obj->u.fdb.vid = vid; + obj->u.fdb.ndm_state = is_static ? NUD_NOARP : NUD_REACHABLE; - ret = dsa_slave_fill_info(dev, skb, addr, vid, - is_static, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - RTM_NEWNEIGH, NLM_F_MULTI); + ret = obj->cb(dev, obj); if (ret < 0) break; } - return idx; + return ret == -ENOENT ? 0 : ret; } static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) @@ -366,6 +327,62 @@ static int dsa_slave_port_attr_set(struct net_device *dev, return ret; } +static int dsa_slave_port_obj_add(struct net_device *dev, + struct switchdev_obj *obj) +{ + int err; + + /* For the prepare phase, ensure the full set of changes is feasable in + * one go in order to signal a failure properly. If an operation is not + * supported, return -EOPNOTSUPP. + */ + + switch (obj->id) { + case SWITCHDEV_OBJ_PORT_FDB: + err = dsa_slave_port_fdb_add(dev, obj); + break; + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + +static int dsa_slave_port_obj_del(struct net_device *dev, + struct switchdev_obj *obj) +{ + int err; + + switch (obj->id) { + case SWITCHDEV_OBJ_PORT_FDB: + err = dsa_slave_port_fdb_del(dev, obj); + break; + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + +static int dsa_slave_port_obj_dump(struct net_device *dev, + struct switchdev_obj *obj) +{ + int err; + + switch (obj->id) { + case SWITCHDEV_OBJ_PORT_FDB: + err = dsa_slave_port_fdb_dump(dev, obj); + break; + default: + err = -EOPNOTSUPP; + break; + } + + return err; +} + static int dsa_slave_bridge_port_join(struct net_device *dev, struct net_device *br) { @@ -767,9 +784,9 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_change_rx_flags = dsa_slave_change_rx_flags, .ndo_set_rx_mode = dsa_slave_set_rx_mode, .ndo_set_mac_address = dsa_slave_set_mac_address, - .ndo_fdb_add = dsa_slave_fdb_add, - .ndo_fdb_del = dsa_slave_fdb_del, - .ndo_fdb_dump = dsa_slave_fdb_dump, + .ndo_fdb_add = switchdev_port_fdb_add, + .ndo_fdb_del = switchdev_port_fdb_del, + .ndo_fdb_dump = switchdev_port_fdb_dump, .ndo_do_ioctl = dsa_slave_ioctl, .ndo_get_iflink = dsa_slave_get_iflink, #ifdef CONFIG_NET_POLL_CONTROLLER @@ -782,6 +799,9 @@ static const struct net_device_ops dsa_slave_netdev_ops = { static const struct switchdev_ops dsa_slave_switchdev_ops = { .switchdev_port_attr_get = dsa_slave_port_attr_get, .switchdev_port_attr_set = dsa_slave_port_attr_set, + .switchdev_port_obj_add = dsa_slave_port_obj_add, + .switchdev_port_obj_del = dsa_slave_port_obj_del, + .switchdev_port_obj_dump = dsa_slave_port_obj_dump, }; static void dsa_slave_adjust_link(struct net_device *dev) -- cgit v1.2.3 From 9e6080936c3e507da60492a8a606bdd1164baa81 Mon Sep 17 00:00:00 2001 From: Frederic Danis Date: Tue, 11 Aug 2015 16:35:36 +0200 Subject: net: rfkill: gpio: Remove BCM2E39 support Power management support for BCM2E39 is now performed in Bluetooth BCM UART driver. Signed-off-by: Frederic Danis Signed-off-by: Marcel Holtmann --- net/rfkill/rfkill-gpio.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index d5d58d919552..93127220cb54 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -164,7 +164,6 @@ static int rfkill_gpio_remove(struct platform_device *pdev) #ifdef CONFIG_ACPI static const struct acpi_device_id rfkill_acpi_match[] = { { "BCM2E1A", RFKILL_TYPE_BLUETOOTH }, - { "BCM2E39", RFKILL_TYPE_BLUETOOTH }, { "BCM2E3D", RFKILL_TYPE_BLUETOOTH }, { "BCM2E40", RFKILL_TYPE_BLUETOOTH }, { "BCM2E64", RFKILL_TYPE_BLUETOOTH }, -- cgit v1.2.3 From a42bbba5afd8613374ed043af3b3eda7e0e0e6bb Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 11 Aug 2015 21:44:07 +0200 Subject: Bluetooth: 6lowpan: change netdev_priv to lowpan_dev The usually way to get the btle lowpan private data is to use the introduced lowpan_dev inline function. This patch will cleanup by using lowpan_dev consequently. Reviewed-by: Stefan Schmidt Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/bluetooth/6lowpan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 0ffe2e24020a..24ed5b02cefc 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -859,7 +859,7 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev) SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev); SET_NETDEV_DEVTYPE(netdev, &bt_type); - *dev = netdev_priv(netdev); + *dev = lowpan_dev(netdev); (*dev)->netdev = netdev; (*dev)->hdev = chan->conn->hcon->hdev; INIT_LIST_HEAD(&(*dev)->peers); -- cgit v1.2.3 From b72f6f51dc5abce94c1b5ee0186e9407ea0f919f Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 11 Aug 2015 21:44:08 +0200 Subject: 6lowpan: add generic 6lowpan netdev private data This patch introduced the 6lowpan netdev private data struct. We name it lowpan_priv and it's placed at the beginning of netdev private data. All lowpan interfaces should allocate this room at first of netdev private data. 6LoWPAN LL private data can be allocate by additional netdev private data, e.g. dev->priv_size should be "sizeof(struct lowpan_priv) + sizeof(LL_LOWPAN_PRIVATE_DATA)". Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- include/net/6lowpan.h | 23 +++++++++++++++++++++++ net/6lowpan/Makefile | 2 +- net/6lowpan/core.c | 20 ++++++++++++++++++++ net/bluetooth/6lowpan.c | 9 ++++++--- net/ieee802154/6lowpan/6lowpan_i.h | 3 ++- net/ieee802154/6lowpan/core.c | 4 +++- 6 files changed, 55 insertions(+), 6 deletions(-) create mode 100644 net/6lowpan/core.c (limited to 'net') diff --git a/include/net/6lowpan.h b/include/net/6lowpan.h index dc03d77ad23b..a2f59ec98d24 100644 --- a/include/net/6lowpan.h +++ b/include/net/6lowpan.h @@ -197,6 +197,27 @@ #define LOWPAN_NHC_UDP_CS_P_11 0xF3 /* source & dest = 0xF0B + 4bit inline */ #define LOWPAN_NHC_UDP_CS_C 0x04 /* checksum elided */ +#define LOWPAN_PRIV_SIZE(llpriv_size) \ + (sizeof(struct lowpan_priv) + llpriv_size) + +enum lowpan_lltypes { + LOWPAN_LLTYPE_BTLE, + LOWPAN_LLTYPE_IEEE802154, +}; + +struct lowpan_priv { + enum lowpan_lltypes lltype; + + /* must be last */ + u8 priv[0] __aligned(sizeof(void *)); +}; + +static inline +struct lowpan_priv *lowpan_priv(const struct net_device *dev) +{ + return netdev_priv(dev); +} + #ifdef DEBUG /* print data in line */ static inline void raw_dump_inline(const char *caller, char *msg, @@ -372,6 +393,8 @@ lowpan_uncompress_size(const struct sk_buff *skb, u16 *dgram_offset) return skb->len + uncomp_header - ret; } +void lowpan_netdev_setup(struct net_device *dev, enum lowpan_lltypes lltype); + int lowpan_header_decompress(struct sk_buff *skb, struct net_device *dev, const u8 *saddr, const u8 saddr_type, diff --git a/net/6lowpan/Makefile b/net/6lowpan/Makefile index eb8baa72adc8..c6ffc55ee0d7 100644 --- a/net/6lowpan/Makefile +++ b/net/6lowpan/Makefile @@ -1,6 +1,6 @@ obj-$(CONFIG_6LOWPAN) += 6lowpan.o -6lowpan-y := iphc.o nhc.o +6lowpan-y := core.o iphc.o nhc.o #rfc6282 nhcs obj-$(CONFIG_6LOWPAN_NHC_DEST) += nhc_dest.o diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c new file mode 100644 index 000000000000..ed0eec9b41a1 --- /dev/null +++ b/net/6lowpan/core.c @@ -0,0 +1,20 @@ +/* This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: + * (C) 2015 Pengutronix, Alexander Aring + */ + +#include + +void lowpan_netdev_setup(struct net_device *dev, enum lowpan_lltypes lltype) +{ + lowpan_priv(dev)->lltype = lltype; +} +EXPORT_SYMBOL(lowpan_netdev_setup); diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 24ed5b02cefc..131e79cde350 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -85,7 +85,7 @@ struct lowpan_dev { static inline struct lowpan_dev *lowpan_dev(const struct net_device *netdev) { - return netdev_priv(netdev); + return (struct lowpan_dev *)lowpan_priv(netdev)->priv; } static inline void peer_add(struct lowpan_dev *dev, struct lowpan_peer *peer) @@ -848,8 +848,9 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev) struct net_device *netdev; int err = 0; - netdev = alloc_netdev(sizeof(struct lowpan_dev), IFACE_NAME_TEMPLATE, - NET_NAME_UNKNOWN, netdev_setup); + netdev = alloc_netdev(LOWPAN_PRIV_SIZE(sizeof(struct lowpan_dev)), + IFACE_NAME_TEMPLATE, NET_NAME_UNKNOWN, + netdev_setup); if (!netdev) return -ENOMEM; @@ -869,6 +870,8 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev) list_add_rcu(&(*dev)->list, &bt_6lowpan_devices); spin_unlock(&devices_lock); + lowpan_netdev_setup(netdev, LOWPAN_LLTYPE_BTLE); + err = register_netdev(netdev); if (err < 0) { BT_INFO("register_netdev failed %d", err); diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h index 923b680adb61..ea339fa94c27 100644 --- a/net/ieee802154/6lowpan/6lowpan_i.h +++ b/net/ieee802154/6lowpan/6lowpan_i.h @@ -5,6 +5,7 @@ #include #include +#include struct lowpan_create_arg { u16 tag; @@ -46,7 +47,7 @@ struct lowpan_dev_info { static inline struct lowpan_dev_info *lowpan_dev_info(const struct net_device *dev) { - return netdev_priv(dev); + return (struct lowpan_dev_info *)lowpan_priv(dev)->priv; } int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type); diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index a4edee8fdc79..180e9f5f86c3 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -138,6 +138,8 @@ static int lowpan_newlink(struct net *src_net, struct net_device *dev, /* Set the lowpan hardware address to the wpan hardware address. */ memcpy(dev->dev_addr, real_dev->dev_addr, IEEE802154_ADDR_LEN); + lowpan_netdev_setup(dev, LOWPAN_LLTYPE_IEEE802154); + ret = register_netdevice(dev); if (ret >= 0) { real_dev->ieee802154_ptr->lowpan_dev = dev; @@ -162,7 +164,7 @@ static void lowpan_dellink(struct net_device *dev, struct list_head *head) static struct rtnl_link_ops lowpan_link_ops __read_mostly = { .kind = "lowpan", - .priv_size = sizeof(struct lowpan_dev_info), + .priv_size = LOWPAN_PRIV_SIZE(sizeof(struct lowpan_dev_info)), .setup = lowpan_setup, .newlink = lowpan_newlink, .dellink = lowpan_dellink, -- cgit v1.2.3 From 4ae935c127f701bc06afd0d5567e17511a6d25e8 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Tue, 11 Aug 2015 21:44:09 +0200 Subject: 6lowpan: move module_init into core functionality This patch moves module_init of 6lowpan module into core functionality of 6lowpan module. To load the ipv6 module at probing of the 6lowpan module should be core functionality. Loading next header compression modules is iphc specific. Nevertheless we only support IPHC for the generic 6LoWPAN branch right now so we can put it into the core functionality. If possible new compression formats are introduced nhc should load only when iphc is build. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/6lowpan/core.c | 20 ++++++++++++++++++++ net/6lowpan/iphc.c | 19 ------------------- 2 files changed, 20 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index ed0eec9b41a1..ae1896fa45e2 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -11,6 +11,8 @@ * (C) 2015 Pengutronix, Alexander Aring */ +#include + #include void lowpan_netdev_setup(struct net_device *dev, enum lowpan_lltypes lltype) @@ -18,3 +20,21 @@ void lowpan_netdev_setup(struct net_device *dev, enum lowpan_lltypes lltype) lowpan_priv(dev)->lltype = lltype; } EXPORT_SYMBOL(lowpan_netdev_setup); + +static int __init lowpan_module_init(void) +{ + request_module_nowait("ipv6"); + + request_module_nowait("nhc_dest"); + request_module_nowait("nhc_fragment"); + request_module_nowait("nhc_hop"); + request_module_nowait("nhc_ipv6"); + request_module_nowait("nhc_mobility"); + request_module_nowait("nhc_routing"); + request_module_nowait("nhc_udp"); + + return 0; +} +module_init(lowpan_module_init); + +MODULE_LICENSE("GPL"); diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c index 74e56d7449c8..1e0071fdcf72 100644 --- a/net/6lowpan/iphc.c +++ b/net/6lowpan/iphc.c @@ -48,7 +48,6 @@ #include #include -#include #include #include #include @@ -610,21 +609,3 @@ int lowpan_header_compress(struct sk_buff *skb, struct net_device *dev, return 0; } EXPORT_SYMBOL_GPL(lowpan_header_compress); - -static int __init lowpan_module_init(void) -{ - request_module_nowait("ipv6"); - - request_module_nowait("nhc_dest"); - request_module_nowait("nhc_fragment"); - request_module_nowait("nhc_hop"); - request_module_nowait("nhc_ipv6"); - request_module_nowait("nhc_mobility"); - request_module_nowait("nhc_routing"); - request_module_nowait("nhc_udp"); - - return 0; -} -module_init(lowpan_module_init); - -MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 76550786c6780d95db1b2de56a115a84347b242d Mon Sep 17 00:00:00 2001 From: Mugunthan V N Date: Wed, 12 Aug 2015 15:31:43 +0530 Subject: net: ipv4: increase dhcp inter device timeout When a system has multiple ethernet devices and during DHCP request (for using NFS), the system waits only for HZ/2 which is 500mS before switching to another interface for DHCP. There are some routers (Ex: Trendnet routers) which responds to DHCP request at about 560mS. When the system has only one ethernet interface there is no issue as the timeout is 2S and the dev xid doesn't changes and only retries. But when the system has multiple Ethernet like DRA74x with CPSW in dual EMAC mode, the DHCP response is dropped as the dev xid changes while shifting to the next device. So changing inter device timeout to HZ (which is 1S). Signed-off-by: Mugunthan V N Signed-off-by: David S. Miller --- net/ipv4/ipconfig.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 8e7328c6a390..ed4ef09c2136 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -94,7 +94,7 @@ /* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */ #define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */ #define CONF_SEND_RETRIES 6 /* Send six requests per open */ -#define CONF_INTER_TIMEOUT (HZ/2) /* Inter-device timeout: 1/2 second */ +#define CONF_INTER_TIMEOUT (HZ) /* Inter-device timeout: 1 second */ #define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */ #define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */ #define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */ -- cgit v1.2.3 From 36890997b0d219427e9d9d2aad5d46eb44fe808d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 2 Aug 2015 11:09:54 +0200 Subject: rfkill: Allow compile test of GPIO consumers if !GPIOLIB The GPIO subsystem provides dummy GPIO consumer functions if GPIOLIB is not enabled. Hence drivers that depend on GPIOLIB, but use GPIO consumer functionality only, can still be compiled if GPIOLIB is not enabled. Relax the dependency on GPIOLIB if COMPILE_TEST is enabled, where appropriate. Signed-off-by: Geert Uytterhoeven Acked-by: Linus Walleij Signed-off-by: Johannes Berg --- net/rfkill/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig index 4c10e7e6c9f6..598d374f6a35 100644 --- a/net/rfkill/Kconfig +++ b/net/rfkill/Kconfig @@ -36,7 +36,8 @@ config RFKILL_REGULATOR config RFKILL_GPIO tristate "GPIO RFKILL driver" - depends on RFKILL && GPIOLIB + depends on RFKILL + depends on GPIOLIB || COMPILE_TEST default n help If you say yes here you get support of a generic gpio RFKILL -- cgit v1.2.3 From 4b58c37bb9d4282446f7a0194dbc44325787ac8c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 8 Jul 2015 15:41:48 +0300 Subject: mac80211: remove ieee80211_aes_cmac_calculate_k1_k2() The iwlwifi driver was the only driver that used this, but as it turns out it never needed it, so we can remove it. Signed-off-by: Johannes Berg --- include/net/mac80211.h | 13 ------------- net/mac80211/aes_cmac.c | 17 ----------------- 2 files changed, 30 deletions(-) (limited to 'net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 484cc14fb947..e3314e516681 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4330,19 +4330,6 @@ void ieee80211_get_tkip_rx_p1k(struct ieee80211_key_conf *keyconf, void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf, struct sk_buff *skb, u8 *p2k); -/** - * ieee80211_aes_cmac_calculate_k1_k2 - calculate the AES-CMAC sub keys - * - * This function computes the two AES-CMAC sub-keys, based on the - * previously installed master key. - * - * @keyconf: the parameter passed with the set key - * @k1: a buffer to be filled with the 1st sub-key - * @k2: a buffer to be filled with the 2nd sub-key - */ -void ieee80211_aes_cmac_calculate_k1_k2(struct ieee80211_key_conf *keyconf, - u8 *k1, u8 *k2); - /** * ieee80211_get_key_tx_seq - get key TX sequence counter * diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c index 4192806be3d3..bdf0790d89cc 100644 --- a/net/mac80211/aes_cmac.c +++ b/net/mac80211/aes_cmac.c @@ -145,20 +145,3 @@ void ieee80211_aes_cmac_key_free(struct crypto_cipher *tfm) { crypto_free_cipher(tfm); } - -void ieee80211_aes_cmac_calculate_k1_k2(struct ieee80211_key_conf *keyconf, - u8 *k1, u8 *k2) -{ - u8 l[AES_BLOCK_SIZE] = {}; - struct ieee80211_key *key = - container_of(keyconf, struct ieee80211_key, conf); - - crypto_cipher_encrypt_one(key->u.aes_cmac.tfm, l, l); - - memcpy(k1, l, AES_BLOCK_SIZE); - gf_mulx(k1); - - memcpy(k2, k1, AES_BLOCK_SIZE); - gf_mulx(k2); -} -EXPORT_SYMBOL(ieee80211_aes_cmac_calculate_k1_k2); -- cgit v1.2.3 From 75dbf00b443c1763138486e87f4978ff43506f9e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 27 Jul 2015 11:11:11 +0300 Subject: mac80211: remove always true condition The outside if statement checks that IEEE80211_TX_INTFL_MLME_CONN_TX is set so this condition is always true. Checking twice upsets the static checkers. Signed-off-by: Dan Carpenter Signed-off-by: Johannes Berg --- net/mac80211/status.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 45628f37c083..8ba583243509 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -515,7 +515,7 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, if (!sdata) { skb->dev = NULL; - } else if (info->flags & IEEE80211_TX_INTFL_MLME_CONN_TX) { + } else { unsigned int hdr_size = ieee80211_hdrlen(hdr->frame_control); @@ -529,9 +529,6 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, ieee80211_mgd_conn_tx_status(sdata, hdr->frame_control, acked); - } else { - /* we assign ack frame ID for the others */ - WARN_ON(1); } rcu_read_unlock(); -- cgit v1.2.3 From fcd16c0a95a8faf4d310f94d831b22f901c5a744 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 12 Aug 2015 11:18:18 -0700 Subject: tcp: don't extend RTO on failed loss probe attempts If TLP was unable to send a probe, it extended the RTO to now + icsk_rto. But extending the RTO makes little sense if no TLP probe went out. With this commit, instead of extending the RTO we re-arm it relative to the transmit time of the write queue head. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Nandita Dukkipati Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7d1efa762b75..78fc89c1c43c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2275,13 +2275,12 @@ void tcp_send_loss_probe(struct sock *sk) tp->tlp_high_seq = tp->snd_nxt; rearm_timer: - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); - - if (likely(!err)) - NET_INC_STATS_BH(sock_net(sk), - LINUX_MIB_TCPLOSSPROBES); + if (likely(!err)) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); + /* Reset s.t. tcp_rearm_rto will restart timer from now */ + inet_csk(sk)->icsk_pending = 0; + } + tcp_rearm_rto(sk); } /* Push out any pending frames which were held back due to -- cgit v1.2.3 From b340b26454e7ba0f5d04b4bd5c4c782f1628ebe1 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 12 Aug 2015 11:18:19 -0700 Subject: tcp: TLP retransmits last if failed to send new packet When TLP fails to send new packet because of receive window limit, it should fall back to retransmit the last packet instead. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Nandita Dukkipati Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 78fc89c1c43c..444ab5beecbd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2149,7 +2149,7 @@ repair: tcp_cwnd_validate(sk, is_cwnd_limited); return false; } - return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); + return !tp->packets_out && tcp_send_head(sk); } bool tcp_schedule_loss_probe(struct sock *sk) @@ -2226,7 +2226,7 @@ static bool skb_still_in_host_queue(const struct sock *sk, return false; } -/* When probe timeout (PTO) fires, send a new segment if one exists, else +/* When probe timeout (PTO) fires, try send a new segment if possible, else * retransmit the last segment. */ void tcp_send_loss_probe(struct sock *sk) @@ -2235,11 +2235,19 @@ void tcp_send_loss_probe(struct sock *sk) struct sk_buff *skb; int pcount; int mss = tcp_current_mss(sk); - int err = -1; - if (tcp_send_head(sk)) { - err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); - goto rearm_timer; + skb = tcp_send_head(sk); + if (skb) { + if (tcp_snd_wnd_test(tp, skb, mss)) { + pcount = tp->packets_out; + tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); + if (tp->packets_out > pcount) + goto probe_sent; + goto rearm_timer; + } + skb = tcp_write_queue_prev(sk, skb); + } else { + skb = tcp_write_queue_tail(sk); } /* At most one outstanding TLP retransmission. */ @@ -2247,7 +2255,6 @@ void tcp_send_loss_probe(struct sock *sk) goto rearm_timer; /* Retransmit last segment. */ - skb = tcp_write_queue_tail(sk); if (WARN_ON(!skb)) goto rearm_timer; @@ -2262,24 +2269,23 @@ void tcp_send_loss_probe(struct sock *sk) if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss, GFP_ATOMIC))) goto rearm_timer; - skb = tcp_write_queue_tail(sk); + skb = tcp_write_queue_next(sk, skb); } if (WARN_ON(!skb || !tcp_skb_pcount(skb))) goto rearm_timer; - err = __tcp_retransmit_skb(sk, skb); + if (__tcp_retransmit_skb(sk, skb)) + goto rearm_timer; /* Record snd_nxt for loss detection. */ - if (likely(!err)) - tp->tlp_high_seq = tp->snd_nxt; + tp->tlp_high_seq = tp->snd_nxt; +probe_sent: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); + /* Reset s.t. tcp_rearm_rto will restart timer from now */ + inet_csk(sk)->icsk_pending = 0; rearm_timer: - if (likely(!err)) { - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSPROBES); - /* Reset s.t. tcp_rearm_rto will restart timer from now */ - inet_csk(sk)->icsk_pending = 0; - } tcp_rearm_rto(sk); } -- cgit v1.2.3 From cea45e208d700e9d633a636384a49f19cda979b7 Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Thu, 13 Aug 2015 10:39:00 -0400 Subject: net: track link status of ipv6 nexthops Add support to track current link status of ipv6 nexthops to match recent changes that added support for ipv4 nexthops. This takes a simple approach to track linkdown status for next-hops and simply checks the dev for the dst entry and sets proper flags that to be used in the netlink message. v2: drop use of rt6i_nhflags since it is not needed right now Signed-off-by: Andy Gospodarek Signed-off-by: Dinesh Dutt Signed-off-by: David S. Miller --- net/ipv6/route.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c0fa61eba8f2..370f72785385 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2887,6 +2887,8 @@ static int rt6_fill_node(struct net *net, else rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; + if (!netif_carrier_ok(rt->dst.dev)) + rtm->rtm_flags |= RTNH_F_LINKDOWN; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; if (rt->rt6i_flags & RTF_DYNAMIC) -- cgit v1.2.3 From 35103d11173b8fea874183f8aa508ae71234d299 Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Thu, 13 Aug 2015 10:39:01 -0400 Subject: net: ipv6 sysctl option to ignore routes when nexthop link is down Like the ipv4 patch with a similar title, this adds a sysctl to allow the user to change routing behavior based on whether or not the interface associated with the nexthop was an up or down link. The default setting preserves the current behavior, but anyone that enables it will notice that nexthops on down interfaces will no longer be selected: net.ipv6.conf.all.ignore_routes_with_linkdown = 0 net.ipv6.conf.default.ignore_routes_with_linkdown = 0 net.ipv6.conf.lo.ignore_routes_with_linkdown = 0 ... When the above sysctls are set, not only will link status be reported to userspace, but an indication that a nexthop is dead and will not be used is also reported. 1000::/8 via 7000::2 dev p7p1 metric 1024 dead linkdown pref medium 1000::/8 via 8000::2 dev p8p1 metric 1024 pref medium 7000::/8 dev p7p1 proto kernel metric 256 dead linkdown pref medium 8000::/8 dev p8p1 proto kernel metric 256 pref medium 9000::/8 via 8000::2 dev p8p1 metric 2048 pref medium 9000::/8 via 7000::2 dev p7p1 metric 1024 dead linkdown pref medium fe80::/64 dev p7p1 proto kernel metric 256 dead linkdown pref medium fe80::/64 dev p8p1 proto kernel metric 256 pref medium This also adds devconf support and notification when sysctl values change. v2: drop use of rt6i_nhflags since it is not needed right now Signed-off-by: Andy Gospodarek Signed-off-by: Dinesh Dutt Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 105 +++++++++++++++++++++++++++++++++++++++++++++- net/ipv6/route.c | 11 ++++- 4 files changed, 116 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index cb9dcad72372..f1f32af6d9b9 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -31,6 +31,7 @@ struct ipv6_devconf { __s32 accept_ra_defrtr; __s32 accept_ra_min_hop_limit; __s32 accept_ra_pinfo; + __s32 ignore_routes_with_linkdown; #ifdef CONFIG_IPV6_ROUTER_PREF __s32 accept_ra_rtr_pref; __s32 rtr_probe_interval; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 80f3b74446a1..38b4fef20219 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -173,6 +173,7 @@ enum { DEVCONF_STABLE_SECRET, DEVCONF_USE_OIF_ADDRS_ONLY, DEVCONF_ACCEPT_RA_MIN_HOP_LIMIT, + DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 53e3a9d756b0..5dfbac72f1ab 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -214,6 +214,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .initialized = false, }, .use_oif_addrs_only = 0, + .ignore_routes_with_linkdown = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -257,6 +258,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .initialized = false, }, .use_oif_addrs_only = 0, + .ignore_routes_with_linkdown = 0, }; /* Check if a valid qdisc is available */ @@ -472,6 +474,9 @@ static int inet6_netconf_msgsize_devconf(int type) if (type == -1 || type == NETCONFA_PROXY_NEIGH) size += nla_total_size(4); + if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) + size += nla_total_size(4); + return size; } @@ -508,6 +513,11 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0) goto nla_put_failure; + if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) && + nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + devconf->ignore_routes_with_linkdown) < 0) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; @@ -544,6 +554,7 @@ static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = { [NETCONFA_IFINDEX] = { .len = sizeof(int) }, [NETCONFA_FORWARDING] = { .len = sizeof(int) }, [NETCONFA_PROXY_NEIGH] = { .len = sizeof(int) }, + [NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN] = { .len = sizeof(int) }, }; static int inet6_netconf_get_devconf(struct sk_buff *in_skb, @@ -766,6 +777,63 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) rt6_purge_dflt_routers(net); return 1; } + +static void addrconf_linkdown_change(struct net *net, __s32 newf) +{ + struct net_device *dev; + struct inet6_dev *idev; + + for_each_netdev(net, dev) { + idev = __in6_dev_get(dev); + if (idev) { + int changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf); + + idev->cnf.ignore_routes_with_linkdown = newf; + if (changed) + inet6_netconf_notify_devconf(dev_net(dev), + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + dev->ifindex, + &idev->cnf); + } + } +} + +static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf) +{ + struct net *net; + int old; + + if (!rtnl_trylock()) + return restart_syscall(); + + net = (struct net *)table->extra2; + old = *p; + *p = newf; + + if (p == &net->ipv6.devconf_dflt->ignore_routes_with_linkdown) { + if ((!newf) ^ (!old)) + inet6_netconf_notify_devconf(net, + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + NETCONFA_IFINDEX_DEFAULT, + net->ipv6.devconf_dflt); + rtnl_unlock(); + return 0; + } + + if (p == &net->ipv6.devconf_all->ignore_routes_with_linkdown) { + net->ipv6.devconf_dflt->ignore_routes_with_linkdown = newf; + addrconf_linkdown_change(net, newf); + if ((!newf) ^ (!old)) + inet6_netconf_notify_devconf(net, + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + NETCONFA_IFINDEX_ALL, + net->ipv6.devconf_all); + } + rtnl_unlock(); + + return 1; +} + #endif /* Nobody refers to this ifaddr, destroy it */ @@ -4616,6 +4684,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_SUPPRESS_FRAG_NDISC] = cnf->suppress_frag_ndisc; array[DEVCONF_ACCEPT_RA_FROM_LOCAL] = cnf->accept_ra_from_local; array[DEVCONF_ACCEPT_RA_MTU] = cnf->accept_ra_mtu; + array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown; /* we omit DEVCONF_STABLE_SECRET for now */ array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only; } @@ -5338,6 +5407,34 @@ out: return err; } +static +int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, + int write, + void __user *buffer, + size_t *lenp, + loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + loff_t pos = *ppos; + struct ctl_table lctl; + int ret; + + /* ctl->data points to idev->cnf.ignore_routes_when_linkdown + * we should not modify it until we get the rtnl lock. + */ + lctl = *ctl; + lctl.data = &val; + + ret = proc_dointvec(&lctl, write, buffer, lenp, ppos); + + if (write) + ret = addrconf_fixup_linkdown(ctl, valp, val); + if (ret) + *ppos = pos; + return ret; +} + static struct addrconf_sysctl_table { struct ctl_table_header *sysctl_header; @@ -5629,7 +5726,13 @@ static struct addrconf_sysctl_table .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - + }, + { + .procname = "ignore_routes_with_linkdown", + .data = &ipv6_devconf.ignore_routes_with_linkdown, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_ignore_routes_with_linkdown, }, { /* sentinel */ diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 370f72785385..1c0217e61357 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -665,6 +665,12 @@ static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict, { int m; bool match_do_rr = false; + struct inet6_dev *idev = rt->rt6i_idev; + struct net_device *dev = rt->dst.dev; + + if (dev && !netif_carrier_ok(dev) && + idev->cnf.ignore_routes_with_linkdown) + goto out; if (rt6_check_expired(rt)) goto out; @@ -2887,8 +2893,11 @@ static int rt6_fill_node(struct net *net, else rtm->rtm_type = RTN_UNICAST; rtm->rtm_flags = 0; - if (!netif_carrier_ok(rt->dst.dev)) + if (!netif_carrier_ok(rt->dst.dev)) { rtm->rtm_flags |= RTNH_F_LINKDOWN; + if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown) + rtm->rtm_flags |= RTNH_F_DEAD; + } rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; if (rt->rt6i_flags & RTF_DYNAMIC) -- cgit v1.2.3 From 111495361598205967f1be4e07d4726b0f762d60 Mon Sep 17 00:00:00 2001 From: Vivien Didelot Date: Thu, 13 Aug 2015 12:52:17 -0400 Subject: net: dsa: add support for switchdev VLAN objects Add new functions in DSA drivers to access hardware VLAN entries through SWITCHDEV_OBJ_PORT_VLAN objects: - port_pvid_get() and vlan_getnext() to dump a VLAN - port_vlan_del() to exclude a port from a VLAN - port_pvid_set() and port_vlan_add() to join a port to a VLAN The DSA infrastructure will ensure that each VLAN of the given range does not already belong to another bridge. If it does, it will fallback to software VLAN and won't program the hardware. Signed-off-by: Vivien Didelot Signed-off-by: David S. Miller --- include/net/dsa.h | 11 ++++ net/dsa/slave.c | 158 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 169 insertions(+) (limited to 'net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 6356f437e911..bd9b76502458 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -297,6 +297,17 @@ struct dsa_switch_driver { int (*port_stp_update)(struct dsa_switch *ds, int port, u8 state); + /* + * VLAN support + */ + int (*port_pvid_get)(struct dsa_switch *ds, int port, u16 *pvid); + int (*port_pvid_set)(struct dsa_switch *ds, int port, u16 pvid); + int (*port_vlan_add)(struct dsa_switch *ds, int port, u16 vid, + bool untagged); + int (*port_vlan_del)(struct dsa_switch *ds, int port, u16 vid); + int (*vlan_getnext)(struct dsa_switch *ds, u16 *vid, + unsigned long *ports, unsigned long *untagged); + /* * Forwarding database */ diff --git a/net/dsa/slave.c b/net/dsa/slave.c index aa0266f7d0ce..373ff315030d 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -200,6 +200,152 @@ out: return 0; } +static int dsa_bridge_check_vlan_range(struct dsa_switch *ds, + const struct net_device *bridge, + u16 vid_begin, u16 vid_end) +{ + struct dsa_slave_priv *p; + struct net_device *dev, *vlan_br; + DECLARE_BITMAP(members, DSA_MAX_PORTS); + DECLARE_BITMAP(untagged, DSA_MAX_PORTS); + u16 vid; + int member, err; + + if (!ds->drv->vlan_getnext || !vid_begin) + return -EOPNOTSUPP; + + vid = vid_begin - 1; + + do { + err = ds->drv->vlan_getnext(ds, &vid, members, untagged); + if (err) + break; + + if (vid > vid_end) + break; + + member = find_first_bit(members, DSA_MAX_PORTS); + if (member == DSA_MAX_PORTS) + continue; + + dev = ds->ports[member]; + p = netdev_priv(dev); + vlan_br = p->bridge_dev; + if (vlan_br == bridge) + continue; + + netdev_dbg(vlan_br, "hardware VLAN %d already in use\n", vid); + return -EOPNOTSUPP; + } while (vid < vid_end); + + return err == -ENOENT ? 0 : err; +} + +static int dsa_slave_port_vlan_add(struct net_device *dev, + struct switchdev_obj *obj) +{ + struct switchdev_obj_vlan *vlan = &obj->u.vlan; + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + u16 vid; + int err; + + switch (obj->trans) { + case SWITCHDEV_TRANS_PREPARE: + if (!ds->drv->port_vlan_add || !ds->drv->port_pvid_set) + return -EOPNOTSUPP; + + /* If the requested port doesn't belong to the same bridge as + * the VLAN members, fallback to software VLAN (hopefully). + */ + err = dsa_bridge_check_vlan_range(ds, p->bridge_dev, + vlan->vid_begin, + vlan->vid_end); + if (err) + return err; + break; + case SWITCHDEV_TRANS_COMMIT: + for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) { + err = ds->drv->port_vlan_add(ds, p->port, vid, + vlan->flags & + BRIDGE_VLAN_INFO_UNTAGGED); + if (!err && vlan->flags & BRIDGE_VLAN_INFO_PVID) + err = ds->drv->port_pvid_set(ds, p->port, vid); + if (err) + return err; + } + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +static int dsa_slave_port_vlan_del(struct net_device *dev, + struct switchdev_obj *obj) +{ + struct switchdev_obj_vlan *vlan = &obj->u.vlan; + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + u16 vid; + int err; + + if (!ds->drv->port_vlan_del) + return -EOPNOTSUPP; + + for (vid = vlan->vid_begin; vid <= vlan->vid_end; ++vid) { + err = ds->drv->port_vlan_del(ds, p->port, vid); + if (err) + return err; + } + + return 0; +} + +static int dsa_slave_port_vlan_dump(struct net_device *dev, + struct switchdev_obj *obj) +{ + struct switchdev_obj_vlan *vlan = &obj->u.vlan; + struct dsa_slave_priv *p = netdev_priv(dev); + struct dsa_switch *ds = p->parent; + DECLARE_BITMAP(members, DSA_MAX_PORTS); + DECLARE_BITMAP(untagged, DSA_MAX_PORTS); + u16 pvid, vid = 0; + int err; + + if (!ds->drv->vlan_getnext || !ds->drv->port_pvid_get) + return -EOPNOTSUPP; + + err = ds->drv->port_pvid_get(ds, p->port, &pvid); + if (err) + return err; + + for (;;) { + err = ds->drv->vlan_getnext(ds, &vid, members, untagged); + if (err) + break; + + if (!test_bit(p->port, members)) + continue; + + memset(vlan, 0, sizeof(*vlan)); + vlan->vid_begin = vlan->vid_end = vid; + + if (vid == pvid) + vlan->flags |= BRIDGE_VLAN_INFO_PVID; + + if (test_bit(p->port, untagged)) + vlan->flags |= BRIDGE_VLAN_INFO_UNTAGGED; + + err = obj->cb(dev, obj); + if (err) + break; + } + + return err == -ENOENT ? 0 : err; +} + static int dsa_slave_port_fdb_add(struct net_device *dev, struct switchdev_obj *obj) { @@ -341,6 +487,9 @@ static int dsa_slave_port_obj_add(struct net_device *dev, case SWITCHDEV_OBJ_PORT_FDB: err = dsa_slave_port_fdb_add(dev, obj); break; + case SWITCHDEV_OBJ_PORT_VLAN: + err = dsa_slave_port_vlan_add(dev, obj); + break; default: err = -EOPNOTSUPP; break; @@ -358,6 +507,9 @@ static int dsa_slave_port_obj_del(struct net_device *dev, case SWITCHDEV_OBJ_PORT_FDB: err = dsa_slave_port_fdb_del(dev, obj); break; + case SWITCHDEV_OBJ_PORT_VLAN: + err = dsa_slave_port_vlan_del(dev, obj); + break; default: err = -EOPNOTSUPP; break; @@ -375,6 +527,9 @@ static int dsa_slave_port_obj_dump(struct net_device *dev, case SWITCHDEV_OBJ_PORT_FDB: err = dsa_slave_port_fdb_dump(dev, obj); break; + case SWITCHDEV_OBJ_PORT_VLAN: + err = dsa_slave_port_vlan_dump(dev, obj); + break; default: err = -EOPNOTSUPP; break; @@ -794,6 +949,9 @@ static const struct net_device_ops dsa_slave_netdev_ops = { .ndo_netpoll_cleanup = dsa_slave_netpoll_cleanup, .ndo_poll_controller = dsa_slave_poll_controller, #endif + .ndo_bridge_getlink = switchdev_port_bridge_getlink, + .ndo_bridge_setlink = switchdev_port_bridge_setlink, + .ndo_bridge_dellink = switchdev_port_bridge_dellink, }; static const struct switchdev_ops dsa_slave_switchdev_ops = { -- cgit v1.2.3 From da65ad1fe399d77234a4caa50008c6b87f1fe401 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 13 Aug 2015 14:03:16 -0400 Subject: net: allow sleeping when modifying store_rps_map Commit 10e4ea751 ("net: Fix race condition in store_rps_map") has moved the manipulation of the rps_needed jump label under a spinlock. Since changing the state of a jump label may sleep this is incorrect and causes warnings during runtime. Make rps_map_lock a mutex to allow sleeping under it. Fixes: 10e4ea751 ("net: Fix race condition in store_rps_map") Signed-off-by: Sasha Levin Acked-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 39ec6949c1e6..b279077c3089 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -689,7 +689,7 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, struct rps_map *old_map, *map; cpumask_var_t mask; int err, cpu, i; - static DEFINE_SPINLOCK(rps_map_lock); + static DEFINE_MUTEX(rps_map_mutex); if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -722,9 +722,9 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, map = NULL; } - spin_lock(&rps_map_lock); + mutex_lock(&rps_map_mutex); old_map = rcu_dereference_protected(queue->rps_map, - lockdep_is_held(&rps_map_lock)); + mutex_is_locked(&rps_map_mutex)); rcu_assign_pointer(queue->rps_map, map); if (map) @@ -732,7 +732,7 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, if (old_map) static_key_slow_dec(&rps_needed); - spin_unlock(&rps_map_lock); + mutex_unlock(&rps_map_mutex); if (old_map) kfree_rcu(old_map, rcu); -- cgit v1.2.3 From 0344338bd883e5e4a2f80409ed8260cd65d69e3b Mon Sep 17 00:00:00 2001 From: Andy Gospodarek Date: Thu, 13 Aug 2015 15:26:35 -0400 Subject: net: addr IFLA_OPERSTATE to netlink message for ipv6 ifinfo This is useful information to include in ipv6 netlink messages that report interface information. IFLA_OPERSTATE is already included in ipv4 messages, but missing for ipv6. This closes that gap. Signed-off-by: Andy Gospodarek Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5dfbac72f1ab..59242399b0b5 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4706,6 +4706,7 @@ static inline size_t inet6_if_nlmsg_size(void) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(4) /* IFLA_MTU */ + nla_total_size(4) /* IFLA_LINK */ + + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(inet6_ifla6_size()); /* IFLA_PROTINFO */ } @@ -4962,7 +4963,9 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || (dev->ifindex != dev_get_iflink(dev) && - nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) + nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) || + nla_put_u8(skb, IFLA_OPERSTATE, + netif_running(dev) ? dev->operstate : IF_OPER_DOWN)) goto nla_put_failure; protoinfo = nla_nest_start(skb, IFLA_PROTINFO); if (!protoinfo) -- cgit v1.2.3 From cd2fbe1b6b517ca7c0e80b103c674fdf5bd50f76 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 13 Aug 2015 14:59:01 -0600 Subject: net: Use VRF device index for lookups on RX On ingress use index of VRF master device for route lookups if real device is enslaved. Rules are expected to be installed for the VRF device to direct lookups to a specific table. Signed-off-by: Shrijeet Mukherjee Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 8 +++++++- net/ipv4/route.c | 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6b98de0d7949..d8ced1d89f1b 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -45,6 +45,7 @@ #include #include #include +#include #ifndef CONFIG_IP_MULTIPLE_TABLES @@ -309,7 +310,9 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, bool dev_match; fl4.flowi4_oif = 0; - fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; + fl4.flowi4_iif = vrf_master_ifindex_rcu(dev); + if (!fl4.flowi4_iif) + fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_tos = tos; @@ -339,6 +342,9 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (nh->nh_dev == dev) { dev_match = true; break; + } else if (vrf_master_ifindex_rcu(nh->nh_dev) == dev->ifindex) { + dev_match = true; + break; } } #else diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 18fd7c9095c7..c26ff1f7067d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -112,6 +112,7 @@ #endif #include #include +#include #define RT_FL_TOS(oldflp4) \ ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)) @@ -1726,7 +1727,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, * Now we are ready to route packet. */ fl4.flowi4_oif = 0; - fl4.flowi4_iif = dev->ifindex; + fl4.flowi4_iif = vrf_master_ifindex_rcu(dev) ? : dev->ifindex; fl4.flowi4_mark = skb->mark; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; -- cgit v1.2.3 From 613d09b30f8b589d5a9b49775054c8865db95d1c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 13 Aug 2015 14:59:02 -0600 Subject: net: Use VRF device index for lookups on TX As with ingress use the index of VRF master device for route lookups on egress. However, the oif should only be used to direct the lookups to a specific table. Routes in the table are not based on the VRF device but rather interfaces that are part of the VRF so do not consider the oif for lookups within the table. The FLOWI_FLAG_VRFSRC is used to control this latter part. Signed-off-by: Shrijeet Mukherjee Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/flow.h | 1 + include/net/route.h | 3 +++ net/ipv4/fib_trie.c | 7 +++++-- net/ipv4/icmp.c | 4 ++++ net/ipv4/route.c | 5 +++++ 5 files changed, 18 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/flow.h b/include/net/flow.h index 3098ae33a178..f305588fc162 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -33,6 +33,7 @@ struct flowi_common { __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 +#define FLOWI_FLAG_VRFSRC 0x04 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; }; diff --git a/include/net/route.h b/include/net/route.h index 2d45f419477f..94189d4bd899 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -251,6 +251,9 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 if (inet_sk(sk)->transparent) flow_flags |= FLOWI_FLAG_ANYSRC; + if (netif_index_is_vrf(sock_net(sk), oif)) + flow_flags |= FLOWI_FLAG_VRFSRC; + flowi4_init_output(fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, protocol, flow_flags, dst, src, dport, sport); } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 37c4bb89a708..1243c79cb5b0 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1423,8 +1423,11 @@ found: nh->nh_flags & RTNH_F_LINKDOWN && !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE)) continue; - if (flp->flowi4_oif && flp->flowi4_oif != nh->nh_oif) - continue; + if (!(flp->flowi4_flags & FLOWI_FLAG_VRFSRC)) { + if (flp->flowi4_oif && + flp->flowi4_oif != nh->nh_oif) + continue; + } if (!(fib_flags & FIB_LOOKUP_NOREF)) atomic_inc(&fi->fib_clntref); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index c0556f1e4bf0..1164fc4ce3bc 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -96,6 +96,7 @@ #include #include #include +#include /* * Build xmit assembly blocks @@ -425,6 +426,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.flowi4_mark = mark; fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; + fl4.flowi4_oif = vrf_master_ifindex_rcu(skb->dev) ? : skb->dev->ifindex; security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) @@ -458,6 +460,8 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; + fl4->flowi4_oif = vrf_master_ifindex_rcu(skb_in->dev) ? : skb_in->dev->ifindex; + security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); rt = __ip_route_output_key(net, fl4); if (IS_ERR(rt)) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c26ff1f7067d..2c89d294b669 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2131,6 +2131,11 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } + if (netif_is_vrf(dev_out) && + !(fl4->flowi4_flags & FLOWI_FLAG_VRFSRC)) { + rth = vrf_dev_get_rth(dev_out); + goto out; + } } if (!fl4->daddr) { -- cgit v1.2.3 From 9a24abfa42613fefc68963a98c2b7ab7fd7e374c Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 13 Aug 2015 14:59:03 -0600 Subject: udp: Handle VRF device in sendmsg For unconnected UDP sockets using a VRF device lookup source address based on VRF table. This allows the UDP header to be properly setup before showing up at the VRF device via the dst. Signed-off-by: Shrijeet Mukherjee Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/udp.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1b8c5ba7d5f7..c0a15e7f359f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1013,11 +1013,31 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (!rt) { struct net *net = sock_net(sk); + __u8 flow_flags = inet_sk_flowi_flags(sk); fl4 = &fl4_stack; + + /* unconnected socket. If output device is enslaved to a VRF + * device lookup source address from VRF table. This mimics + * behavior of ip_route_connect{_init}. + */ + if (netif_index_is_vrf(net, ipc.oif)) { + flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, + RT_SCOPE_UNIVERSE, sk->sk_protocol, + (flow_flags | FLOWI_FLAG_VRFSRC), + faddr, saddr, dport, + inet->inet_sport); + + rt = ip_route_output_flow(net, fl4, sk); + if (!IS_ERR(rt)) { + saddr = fl4->saddr; + ip_rt_put(rt); + } + } + flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, sk->sk_protocol, - inet_sk_flowi_flags(sk), + flow_flags, faddr, saddr, dport, inet->inet_sport); security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); -- cgit v1.2.3 From 15be405eb2ea943ac5fa2aab7d0ba282e9ef1301 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 13 Aug 2015 14:59:04 -0600 Subject: net: Add inet_addr lookup by table Currently inet_addr_type and inet_dev_addr_type expect local addresses to be in the local table. With the VRF device local routes for devices associated with a VRF will be in the table associated with the VRF. Provide an alternate inet_addr lookup to use a specific table rather than defaulting to the local table. Signed-off-by: Shrijeet Mukherjee Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/route.h | 1 + net/ipv4/fib_frontend.c | 22 +++++++++++++++------- 2 files changed, 16 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/route.h b/include/net/route.h index 94189d4bd899..6ba681f0b98d 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -189,6 +189,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk); void ip_rt_send_redirect(struct sk_buff *skb); unsigned int inet_addr_type(struct net *net, __be32 addr); +unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr); void ip_rt_multicast_event(struct in_device *); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index d8ced1d89f1b..b11321a8e58d 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -212,12 +212,12 @@ void fib_flush_external(struct net *net) */ static inline unsigned int __inet_dev_addr_type(struct net *net, const struct net_device *dev, - __be32 addr) + __be32 addr, int tb_id) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; unsigned int ret = RTN_BROADCAST; - struct fib_table *local_table; + struct fib_table *table; if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) return RTN_BROADCAST; @@ -226,10 +226,10 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, rcu_read_lock(); - local_table = fib_get_table(net, RT_TABLE_LOCAL); - if (local_table) { + table = fib_get_table(net, tb_id); + if (table) { ret = RTN_UNICAST; - if (!fib_table_lookup(local_table, &fl4, &res, FIB_LOOKUP_NOREF)) { + if (!fib_table_lookup(table, &fl4, &res, FIB_LOOKUP_NOREF)) { if (!dev || dev == res.fi->fib_dev) ret = res.type; } @@ -239,16 +239,24 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, return ret; } +unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id) +{ + return __inet_dev_addr_type(net, NULL, addr, tb_id); +} +EXPORT_SYMBOL(inet_addr_type_table); + unsigned int inet_addr_type(struct net *net, __be32 addr) { - return __inet_dev_addr_type(net, NULL, addr); + return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL); } EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { - return __inet_dev_addr_type(net, dev, addr); + int rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL; + + return __inet_dev_addr_type(net, dev, addr, rt_table); } EXPORT_SYMBOL(inet_dev_addr_type); -- cgit v1.2.3 From 30bbaa19500559d7625c65632195413f639b3b97 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 13 Aug 2015 14:59:05 -0600 Subject: net: Fix up inet_addr_type checks Currently inet_addr_type and inet_dev_addr_type expect local addresses to be in the local table. With the VRF device local routes for devices associated with a VRF will be in the table associated with the VRF. Provide an alternate inet_addr lookup to use a specific table rather than defaulting to the local table. inet_addr_type_dev_table keeps the same semantics as inet_addr_type but if the passed in device is enslaved to a VRF then the table for that VRF is used for the lookup. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/route.h | 3 +++ net/ipv4/af_inet.c | 13 ++++++++++++- net/ipv4/arp.c | 15 +++++++++------ net/ipv4/fib_frontend.c | 25 ++++++++++++++++++++++--- net/ipv4/fib_semantics.c | 6 ++++-- net/ipv4/icmp.c | 5 +++-- 6 files changed, 53 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/route.h b/include/net/route.h index 6ba681f0b98d..6dda2c1bf8c6 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -192,6 +192,9 @@ unsigned int inet_addr_type(struct net *net, __be32 addr); unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr); +unsigned int inet_addr_type_dev_table(struct net *net, + const struct net_device *dev, + __be32 addr); void ip_rt_multicast_event(struct in_device *); int ip_rt_ioctl(struct net *, unsigned int cmd, void __user *arg); void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index cc4e498a0ccf..c8b855882fa5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -119,6 +119,7 @@ #ifdef CONFIG_IP_MROUTE #include #endif +#include /* The inetsw table contains everything that inet_create needs to @@ -427,6 +428,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct net *net = sock_net(sk); unsigned short snum; int chk_addr_ret; + int tb_id = RT_TABLE_LOCAL; int err; /* If the socket has its own bind function then use it. (RAW) */ @@ -448,7 +450,16 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } - chk_addr_ret = inet_addr_type(net, addr->sin_addr.s_addr); + if (sk->sk_bound_dev_if) { + struct net_device *dev; + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); + if (dev) + tb_id = vrf_dev_table_rcu(dev) ? : tb_id; + rcu_read_unlock(); + } + chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); /* Not specified by any standard per-se, however it breaks too * many applications when removed. It is unfortunate since diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 34a308573f4b..30409b75e925 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -233,7 +233,7 @@ static int arp_constructor(struct neighbour *neigh) return -EINVAL; } - neigh->type = inet_addr_type(dev_net(dev), addr); + neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr); parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); @@ -343,7 +343,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ - if (skb && inet_addr_type(dev_net(dev), + if (skb && inet_addr_type_dev_table(dev_net(dev), dev, ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; break; @@ -351,7 +351,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) if (!skb) break; saddr = ip_hdr(skb)->saddr; - if (inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL) { + if (inet_addr_type_dev_table(dev_net(dev), dev, + saddr) == RTN_LOCAL) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) break; @@ -751,7 +752,7 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && - inet_addr_type(net, tip) == RTN_LOCAL && + inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha); @@ -811,16 +812,18 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) n = __neigh_lookup(&arp_tbl, &sip, dev, 0); if (IN_DEV_ARP_ACCEPT(in_dev)) { + unsigned int addr_type = inet_addr_type_dev_table(net, dev, sip); + /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) */ is_garp = arp->ar_op == htons(ARPOP_REQUEST) && tip == sip && - inet_addr_type(net, sip) == RTN_UNICAST; + addr_type == RTN_UNICAST; if (!n && ((arp->ar_op == htons(ARPOP_REPLY) && - inet_addr_type(net, sip) == RTN_UNICAST) || is_garp)) + addr_type == RTN_UNICAST) || is_garp)) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b11321a8e58d..c55723ec4c3e 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -260,6 +260,19 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, } EXPORT_SYMBOL(inet_dev_addr_type); +/* inet_addr_type with dev == NULL but using the table from a dev + * if one is associated + */ +unsigned int inet_addr_type_dev_table(struct net *net, + const struct net_device *dev, + __be32 addr) +{ + int rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL; + + return __inet_dev_addr_type(net, NULL, addr, rt_table); +} +EXPORT_SYMBOL(inet_addr_type_dev_table); + __be32 fib_compute_spec_dst(struct sk_buff *skb) { struct net_device *dev = skb->dev; @@ -510,9 +523,12 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, addr = sk_extract_addr(&rt->rt_gateway); if (rt->rt_gateway.sa_family == AF_INET && addr) { + unsigned int addr_type; + cfg->fc_gw = addr; + addr_type = inet_addr_type_table(net, addr, cfg->fc_table); if (rt->rt_flags & RTF_GATEWAY && - inet_addr_type(net, addr) == RTN_UNICAST) + addr_type == RTN_UNICAST) cfg->fc_scope = RT_SCOPE_UNIVERSE; } @@ -984,11 +1000,14 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim) fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); } if (!(ok & LOCAL_OK)) { + unsigned int addr_type; + fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); /* Check, that this local address finally disappeared. */ - if (gone && - inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { + addr_type = inet_addr_type_dev_table(dev_net(dev), dev, + ifa->ifa_local); + if (gone && addr_type != RTN_LOCAL) { /* And the last, but not the least thing. * We must flush stray FIB entries. * diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 558e196bae0f..410ddb67221e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -670,16 +670,18 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, struct fib_result res; if (nh->nh_flags & RTNH_F_ONLINK) { + unsigned int addr_type; if (cfg->fc_scope >= RT_SCOPE_LINK) return -EINVAL; - if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST) - return -EINVAL; dev = __dev_get_by_index(net, nh->nh_oif); if (!dev) return -ENODEV; if (!(dev->flags & IFF_UP)) return -ENETDOWN; + addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw); + if (addr_type != RTN_UNICAST) + return -EINVAL; if (!netif_carrier_ok(dev)) nh->nh_flags |= RTNH_F_LINKDOWN; nh->nh_dev = dev; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 1164fc4ce3bc..c6f1ce149ffb 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -484,7 +484,8 @@ static struct rtable *icmp_route_lookup(struct net *net, if (err) goto relookup_failed; - if (inet_addr_type(net, fl4_dec.saddr) == RTN_LOCAL) { + if (inet_addr_type_dev_table(net, skb_in->dev, + fl4_dec.saddr) == RTN_LOCAL) { rt2 = __ip_route_output_key(net, &fl4_dec); if (IS_ERR(rt2)) err = PTR_ERR(rt2); @@ -833,7 +834,7 @@ static bool icmp_unreach(struct sk_buff *skb) */ if (!net->ipv4.sysctl_icmp_ignore_bogus_error_responses && - inet_addr_type(net, iph->daddr) == RTN_BROADCAST) { + inet_addr_type_dev_table(net, skb->dev, iph->daddr) == RTN_BROADCAST) { net_warn_ratelimited("%pI4 sent an invalid ICMP type %u, code %u error to a broadcast: %pI4 on %s\n", &ip_hdr(skb)->saddr, icmph->type, icmph->code, -- cgit v1.2.3 From 021dd3b8a142d482cb65a27bf6644e3764001460 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 13 Aug 2015 14:59:06 -0600 Subject: net: Add routes to the table associated with the device When a device associated with a VRF is brought up or down routes should be added to/removed from the table associated with the VRF. fib_magic defaults to using the main or local tables. Have it use the table with the device if there is one. A part of this is directing prefsrc validations to the correct table as well. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 8 ++++---- net/ipv4/fib_semantics.c | 25 +++++++++++++++++++------ 2 files changed, 23 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index c55723ec4c3e..7fa277176c33 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -800,6 +800,7 @@ out: static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) { struct net *net = dev_net(ifa->ifa_dev->dev); + int tb_id = vrf_dev_table_rtnl(ifa->ifa_dev->dev); struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, @@ -814,11 +815,10 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad }, }; - if (type == RTN_UNICAST) - tb = fib_new_table(net, RT_TABLE_MAIN); - else - tb = fib_new_table(net, RT_TABLE_LOCAL); + if (!tb_id) + tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL; + tb = fib_new_table(net, tb_id); if (!tb) return; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 410ddb67221e..85e9a8abf15c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -838,6 +838,23 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) return nh->nh_saddr; } +static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) +{ + if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || + fib_prefsrc != cfg->fc_dst) { + int tb_id = cfg->fc_table; + + if (tb_id == RT_TABLE_MAIN) + tb_id = RT_TABLE_LOCAL; + + if (inet_addr_type_table(cfg->fc_nlinfo.nl_net, + fib_prefsrc, tb_id) != RTN_LOCAL) { + return false; + } + } + return true; +} + struct fib_info *fib_create_info(struct fib_config *cfg) { int err; @@ -1033,12 +1050,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_flags |= RTNH_F_LINKDOWN; } - if (fi->fib_prefsrc) { - if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || - fi->fib_prefsrc != cfg->fc_dst) - if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL) - goto err_inval; - } + if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc)) + goto err_inval; change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); -- cgit v1.2.3 From 3bfd847203c6d89532f836ad3f5b4ff4ced26dd9 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 13 Aug 2015 14:59:07 -0600 Subject: net: Use passed in table for nexthop lookups If a user passes in a table for new routes use that table for nexthop lookups. Specifically, this solves the case where a connected route does not exist in the main table, but only another table and then a subsequent route is added with a next hop using the connected route. ie., $ ip route ls default via 10.0.2.2 dev eth0 10.0.2.0/24 dev eth0 proto kernel scope link src 10.0.2.15 169.254.0.0/16 dev eth0 scope link metric 1003 192.168.56.0/24 dev eth1 proto kernel scope link src 192.168.56.51 $ ip route ls table 10 1.1.1.0/24 dev eth2 scope link Without this patch adding a nexthop route fails: $ ip route add table 10 2.2.2.0/24 via 1.1.1.10 RTNETLINK answers: Network is unreachable With this patch the route is added successfully. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 85e9a8abf15c..b7f1d20a9615 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -691,6 +691,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, } rcu_read_lock(); { + struct fib_table *tbl = NULL; struct flowi4 fl4 = { .daddr = nh->nh_gw, .flowi4_scope = cfg->fc_scope + 1, @@ -701,8 +702,16 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, /* It is not necessary, but requires a bit of thinking */ if (fl4.flowi4_scope < RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; - err = fib_lookup(net, &fl4, &res, - FIB_LOOKUP_IGNORE_LINKSTATE); + + if (cfg->fc_table) + tbl = fib_get_table(net, cfg->fc_table); + + if (tbl) + err = fib_table_lookup(tbl, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE); + else + err = fib_lookup(net, &fl4, &res, + FIB_LOOKUP_IGNORE_LINKSTATE); if (err) { rcu_read_unlock(); return err; -- cgit v1.2.3 From f7ba868b71bc858cf06de11fc8150c4552cfad81 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 13 Aug 2015 14:59:08 -0600 Subject: net: Use VRF index for oif in ip_send_unicast_reply If output device is not specified use VRF device if input device is enslaved. This is needed to ensure tcp acks and resets go out VRF device. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6bf89a6312bc..0138fada0951 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1542,6 +1542,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, struct net *net = sock_net(sk); struct sk_buff *nskb; int err; + int oif; if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) return; @@ -1559,7 +1560,11 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, daddr = replyopts.opt.opt.faddr; } - flowi4_init_output(&fl4, arg->bound_dev_if, + oif = arg->bound_dev_if; + if (!oif && netif_index_is_vrf(net, skb->skb_iif)) + oif = skb->skb_iif; + + flowi4_init_output(&fl4, oif, IP4_REPLY_MARK(net, skb->mark), RT_TOS(arg->tos), RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, -- cgit v1.2.3 From 9972f134a273d6dc52d912a3513fa06b426de9b4 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 13 Aug 2015 14:59:09 -0600 Subject: net: frags: Add VRF device index to cache and lookup Fragmentation cache uses information from the IP header to reassemble packets. That information can be duplicated across VRFs -- same source and destination addresses, protocol and id. Handle fragmentation with VRFs by adding the VRF device index to entries in the cache and the lookup arg. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index d96722ae8979..15762e758861 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -48,6 +48,7 @@ #include #include #include +#include /* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c @@ -77,6 +78,7 @@ struct ipq { u8 ecn; /* RFC3168 support */ u16 max_df_size; /* largest frag with DF set seen */ int iif; + int vif; /* VRF device index */ unsigned int rid; struct inet_peer *peer; }; @@ -99,6 +101,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct ip4_create_arg { struct iphdr *iph; u32 user; + int vif; }; static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) @@ -127,7 +130,8 @@ static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a) qp->saddr == arg->iph->saddr && qp->daddr == arg->iph->daddr && qp->protocol == arg->iph->protocol && - qp->user == arg->user; + qp->user == arg->user && + qp->vif == arg->vif; } static void ip4_frag_init(struct inet_frag_queue *q, const void *a) @@ -144,6 +148,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) qp->ecn = ip4_frag_ecn(arg->iph->tos); qp->saddr = arg->iph->saddr; qp->daddr = arg->iph->daddr; + qp->vif = arg->vif; qp->user = arg->user; qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; @@ -244,7 +249,8 @@ out: /* Find the correct entry in the "incomplete datagrams" queue for * this IP datagram, and create new one, if nothing is found. */ -static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) +static struct ipq *ip_find(struct net *net, struct iphdr *iph, + u32 user, int vif) { struct inet_frag_queue *q; struct ip4_create_arg arg; @@ -252,6 +258,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) arg.iph = iph; arg.user = user; + arg.vif = vif; hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); @@ -648,14 +655,15 @@ out_fail: /* Process an incoming IP datagram fragment. */ int ip_defrag(struct sk_buff *skb, u32 user) { + struct net_device *dev = skb->dev ? : skb_dst(skb)->dev; + int vif = vrf_master_ifindex_rcu(dev); + struct net *net = dev_net(dev); struct ipq *qp; - struct net *net; - net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev); IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); /* Lookup (or create) queue header */ - qp = ip_find(net, ip_hdr(skb), user); + qp = ip_find(net, ip_hdr(skb), user, vif); if (qp) { int ret; -- cgit v1.2.3 From 4edd56981c8fbb349b1529a2feaf772636eb1c83 Mon Sep 17 00:00:00 2001 From: Matthias May Date: Fri, 17 Jul 2015 15:28:39 +0200 Subject: cfg80211: regulatory: handle 5 and 10 MHz channels properly The original assumption of 20MHz wide channels hasn't been true since the addition of support for 5 and 10 MHz channels. Change the code to no longer disable all channels that don't fit into the 20MHz grid, but instead set the appropriate flags to disable operation on specific bandwidths. Signed-off-by: Matthias May [reword commit message] Signed-off-by: Johannes Berg --- net/wireless/reg.c | 64 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 50bfdc11222d..b144485946f2 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1004,7 +1004,7 @@ static u32 map_regdom_flags(u32 rd_flags) static const struct ieee80211_reg_rule * freq_reg_info_regd(struct wiphy *wiphy, u32 center_freq, - const struct ieee80211_regdomain *regd) + const struct ieee80211_regdomain *regd, u32 bw) { int i; bool band_rule_found = false; @@ -1028,7 +1028,7 @@ freq_reg_info_regd(struct wiphy *wiphy, u32 center_freq, if (!band_rule_found) band_rule_found = freq_in_rule_band(fr, center_freq); - bw_fits = reg_does_bw_fit(fr, center_freq, MHZ_TO_KHZ(20)); + bw_fits = reg_does_bw_fit(fr, center_freq, bw); if (band_rule_found && bw_fits) return rr; @@ -1040,14 +1040,26 @@ freq_reg_info_regd(struct wiphy *wiphy, u32 center_freq, return ERR_PTR(-EINVAL); } -const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy, - u32 center_freq) +const struct ieee80211_reg_rule *__freq_reg_info(struct wiphy *wiphy, + u32 center_freq, u32 min_bw) { - const struct ieee80211_regdomain *regd; + const struct ieee80211_regdomain *regd = reg_get_regdomain(wiphy); + const struct ieee80211_reg_rule *reg_rule = NULL; + u32 bw; - regd = reg_get_regdomain(wiphy); + for (bw = MHZ_TO_KHZ(20); bw >= min_bw; bw = bw / 2) { + reg_rule = freq_reg_info_regd(wiphy, center_freq, regd, bw); + if (!IS_ERR(reg_rule)) + return reg_rule; + } - return freq_reg_info_regd(wiphy, center_freq, regd); + return reg_rule; +} + +const struct ieee80211_reg_rule *freq_reg_info(struct wiphy *wiphy, + u32 center_freq) +{ + return __freq_reg_info(wiphy, center_freq, MHZ_TO_KHZ(20)); } EXPORT_SYMBOL(freq_reg_info); @@ -1176,8 +1188,20 @@ static void handle_channel(struct wiphy *wiphy, if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); + /* If we get a reg_rule we can assume that at least 5Mhz fit */ + if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), + MHZ_TO_KHZ(10))) + bw_flags |= IEEE80211_CHAN_NO_10MHZ; + if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), + MHZ_TO_KHZ(20))) + bw_flags |= IEEE80211_CHAN_NO_20MHZ; + + if (max_bandwidth_khz < MHZ_TO_KHZ(10)) + bw_flags |= IEEE80211_CHAN_NO_10MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(20)) + bw_flags |= IEEE80211_CHAN_NO_20MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(40)) - bw_flags = IEEE80211_CHAN_NO_HT40; + bw_flags |= IEEE80211_CHAN_NO_HT40; if (max_bandwidth_khz < MHZ_TO_KHZ(80)) bw_flags |= IEEE80211_CHAN_NO_80MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(160)) @@ -1695,9 +1719,15 @@ static void handle_channel_custom(struct wiphy *wiphy, const struct ieee80211_power_rule *power_rule = NULL; const struct ieee80211_freq_range *freq_range = NULL; u32 max_bandwidth_khz; + u32 bw; - reg_rule = freq_reg_info_regd(wiphy, MHZ_TO_KHZ(chan->center_freq), - regd); + for (bw = MHZ_TO_KHZ(20); bw >= MHZ_TO_KHZ(5); bw = bw / 2) { + reg_rule = freq_reg_info_regd(wiphy, + MHZ_TO_KHZ(chan->center_freq), + regd, bw); + if (!IS_ERR(reg_rule)) + break; + } if (IS_ERR(reg_rule)) { REG_DBG_PRINT("Disabling freq %d MHz as custom regd has no rule that fits it\n", @@ -1721,8 +1751,20 @@ static void handle_channel_custom(struct wiphy *wiphy, if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); + /* If we get a reg_rule we can assume that at least 5Mhz fit */ + if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), + MHZ_TO_KHZ(10))) + bw_flags |= IEEE80211_CHAN_NO_10MHZ; + if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq), + MHZ_TO_KHZ(20))) + bw_flags |= IEEE80211_CHAN_NO_20MHZ; + + if (max_bandwidth_khz < MHZ_TO_KHZ(10)) + bw_flags |= IEEE80211_CHAN_NO_10MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(20)) + bw_flags |= IEEE80211_CHAN_NO_20MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(40)) - bw_flags = IEEE80211_CHAN_NO_HT40; + bw_flags |= IEEE80211_CHAN_NO_HT40; if (max_bandwidth_khz < MHZ_TO_KHZ(80)) bw_flags |= IEEE80211_CHAN_NO_80MHZ; if (max_bandwidth_khz < MHZ_TO_KHZ(160)) -- cgit v1.2.3 From 9189ee31df40f88808daee10aa7f99ba43ff8b13 Mon Sep 17 00:00:00 2001 From: Michal Kazior Date: Mon, 3 Aug 2015 10:55:24 +0200 Subject: cfg80211: propagate set_wiphy failure to userspace If driver failed to setup wiphy params (e.g. rts threshold, fragmentation treshold) userspace wasn't properly notified about this. This could lead to user confusion who would think the command succeeded even if that wasn't the case. Signed-off-by: Michal Kazior Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 76b41578a838..5849fa199f77 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2321,6 +2321,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev->wiphy.frag_threshold = old_frag_threshold; rdev->wiphy.rts_threshold = old_rts_threshold; rdev->wiphy.coverage_class = old_coverage_class; + return result; } } return 0; -- cgit v1.2.3 From 876dc9308e8b8a8fb57059234e57f4145c870c3c Mon Sep 17 00:00:00 2001 From: Bertold Van den Bergh Date: Wed, 5 Aug 2015 16:02:21 +0200 Subject: nl80211: Allow setting multicast rate on OCB interfaces Allow setting multicast rate on OCB interfaces. Current behaviour results in EOPNOTSUPP when attempting this. Signed-off-by: Bertold Van den Bergh Signed-off-by: Johannes Berg --- net/wireless/nl80211.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5849fa199f77..5d8748b4c8a2 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7391,7 +7391,8 @@ static int nl80211_set_mcast_rate(struct sk_buff *skb, struct genl_info *info) int err; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_OCB) return -EOPNOTSUPP; if (!rdev->ops->set_mcast_rate) -- cgit v1.2.3 From 5765f9f66e72ddedfe04e057a5a01454d7b67157 Mon Sep 17 00:00:00 2001 From: Bertold Van den Bergh Date: Wed, 5 Aug 2015 16:02:28 +0200 Subject: mac80211: Set txrc.bss to true for OCB interfaces To make mac80211 accept the multicast rate requested by the user the rate control should be told that it is operating in BSS mode. Without this, the default rate is selected in rate_control_send_low (!pubsta and !txrc->bss) Signed-off-by: Bertold Van den Bergh Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 2079d480cd7b..84e0e8c7fb23 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -686,7 +686,8 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) txrc.bss = (tx->sdata->vif.type == NL80211_IFTYPE_AP || tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT || - tx->sdata->vif.type == NL80211_IFTYPE_ADHOC); + tx->sdata->vif.type == NL80211_IFTYPE_ADHOC || + tx->sdata->vif.type == NL80211_IFTYPE_OCB); /* set up RTS protection if desired */ if (len > tx->local->hw.wiphy->rts_threshold) { -- cgit v1.2.3 From cc11729893558b374316e6142dc383f0508436c8 Mon Sep 17 00:00:00 2001 From: Bertold Van den Bergh Date: Wed, 5 Aug 2015 16:02:42 +0200 Subject: mac80211: Only accept data frames in OCB mode Currently OCB mode accepts frames with bssid==broadcast and type!=beacon. Some non-data frames are sent matching this, for example probe responses. This results in unnecessary creation of STA entries. Signed-off-by: Bertold Van den Bergh Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index f673304f70f5..4d217d3265f4 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -3316,7 +3316,7 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) case NL80211_IFTYPE_OCB: if (!bssid) return false; - if (ieee80211_is_beacon(hdr->frame_control)) + if (!ieee80211_is_data_present(hdr->frame_control)) return false; if (!is_broadcast_ether_addr(bssid)) return false; -- cgit v1.2.3 From 4b819f6cc4221ea6dd250e006f7b9ab0f6c71b45 Mon Sep 17 00:00:00 2001 From: Bertold Van den Bergh Date: Wed, 5 Aug 2015 16:02:50 +0200 Subject: mac80211: Make OCB mode set BSSID Perform the BSS_CHANGED_BSSID action when joining an OCB network. This is required to set the broadcast BSSID in some network drivers. Signed-off-by: Bertold Van den Bergh Signed-off-by: Johannes Berg --- net/mac80211/ocb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/ocb.c b/net/mac80211/ocb.c index 358d5f9d8207..573b81a1fb2d 100644 --- a/net/mac80211/ocb.c +++ b/net/mac80211/ocb.c @@ -179,7 +179,7 @@ int ieee80211_ocb_join(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_if_ocb *ifocb = &sdata->u.ocb; - u32 changed = BSS_CHANGED_OCB; + u32 changed = BSS_CHANGED_OCB | BSS_CHANGED_BSSID; int err; if (ifocb->joined == true) -- cgit v1.2.3 From 35225eb7a5589407299033bfa7e1ac723b17e2b5 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 6 Aug 2015 23:47:30 +0200 Subject: mac80211: remove ieee80211_tx_info from rate_control_apply_mask signature Remove unnecessary ieee80211_tx_info pointer from rate_control_apply_mask signature. rate_control_apply_mask() will be used to define a ratemask in rate_control_set_rates() for station rate table Signed-off-by: Lorenzo Bianconi Signed-off-by: Johannes Berg --- net/mac80211/rate.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 03687d22b405..4f02e07ecc7b 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -631,7 +631,6 @@ static void rate_control_fill_sta_table(struct ieee80211_sta *sta, static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_supported_band *sband, - struct ieee80211_tx_info *info, struct ieee80211_tx_rate *rates, int max_rates) { @@ -647,8 +646,8 @@ static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, * default mask (allow all rates) is used to save some processing for * the common case. */ - mask = sdata->rc_rateidx_mask[info->band]; - has_mcs_mask = sdata->rc_has_mcs_mask[info->band]; + mask = sdata->rc_rateidx_mask[sband->band]; + has_mcs_mask = sdata->rc_has_mcs_mask[sband->band]; rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef); for (i = 0; i < sband->n_bitrates; i++) @@ -659,14 +658,14 @@ static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, return; if (has_mcs_mask) - memcpy(mcs_mask, sdata->rc_rateidx_mcs_mask[info->band], + memcpy(mcs_mask, sdata->rc_rateidx_mcs_mask[sband->band], sizeof(mcs_mask)); else memset(mcs_mask, 0xff, sizeof(mcs_mask)); if (sta) { /* Filter out rates that the STA does not support */ - mask &= sta->supp_rates[info->band]; + mask &= sta->supp_rates[sband->band]; for (i = 0; i < sizeof(mcs_mask); i++) mcs_mask[i] &= sta->ht_cap.mcs.rx_mask[i]; } @@ -707,7 +706,7 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, sband = sdata->local->hw.wiphy->bands[info->band]; if (ieee80211_is_data(hdr->frame_control)) - rate_control_apply_mask(sdata, sta, sband, info, dest, max_rates); + rate_control_apply_mask(sdata, sta, sband, dest, max_rates); if (dest[0].idx < 0) __rate_control_send_low(&sdata->local->hw, sband, sta, info, -- cgit v1.2.3 From 90c66bd2232ae6d3c88c1f3378e3028fded642b3 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 6 Aug 2015 23:47:31 +0200 Subject: mac80211: remove ieee80211_tx_rate dependency in rate mask code Remove ieee80211_tx_rate dependency in rate_idx_match_legacy_mask(), rate_idx_match_mcs_mask() and rate_idx_match_mask() in order to use the previous logic to define a ratemask in rate_control_set_rates() for station rate table. Moreover move rate mask definition logic in rate_control_cap_mask() Signed-off-by: Lorenzo Bianconi Signed-off-by: Johannes Berg --- net/mac80211/rate.c | 139 +++++++++++++++++++++++++++------------------------- 1 file changed, 71 insertions(+), 68 deletions(-) (limited to 'net') diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 4f02e07ecc7b..4f61ca026ecc 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -353,39 +353,37 @@ bool rate_control_send_low(struct ieee80211_sta *pubsta, } EXPORT_SYMBOL(rate_control_send_low); -static bool rate_idx_match_legacy_mask(struct ieee80211_tx_rate *rate, - int n_bitrates, u32 mask) +static bool rate_idx_match_legacy_mask(s8 *rate_idx, int n_bitrates, u32 mask) { int j; /* See whether the selected rate or anything below it is allowed. */ - for (j = rate->idx; j >= 0; j--) { + for (j = *rate_idx; j >= 0; j--) { if (mask & (1 << j)) { /* Okay, found a suitable rate. Use it. */ - rate->idx = j; + *rate_idx = j; return true; } } /* Try to find a higher rate that would be allowed */ - for (j = rate->idx + 1; j < n_bitrates; j++) { + for (j = *rate_idx + 1; j < n_bitrates; j++) { if (mask & (1 << j)) { /* Okay, found a suitable rate. Use it. */ - rate->idx = j; + *rate_idx = j; return true; } } return false; } -static bool rate_idx_match_mcs_mask(struct ieee80211_tx_rate *rate, - u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]) +static bool rate_idx_match_mcs_mask(s8 *rate_idx, u8 *mcs_mask) { int i, j; int ridx, rbit; - ridx = rate->idx / 8; - rbit = rate->idx % 8; + ridx = *rate_idx / 8; + rbit = *rate_idx % 8; /* sanity check */ if (ridx < 0 || ridx >= IEEE80211_HT_MCS_MASK_LEN) @@ -395,20 +393,20 @@ static bool rate_idx_match_mcs_mask(struct ieee80211_tx_rate *rate, for (i = ridx; i >= 0; i--) { for (j = rbit; j >= 0; j--) if (mcs_mask[i] & BIT(j)) { - rate->idx = i * 8 + j; + *rate_idx = i * 8 + j; return true; } rbit = 7; } /* Try to find a higher rate that would be allowed */ - ridx = (rate->idx + 1) / 8; - rbit = (rate->idx + 1) % 8; + ridx = (*rate_idx + 1) / 8; + rbit = (*rate_idx + 1) % 8; for (i = ridx; i < IEEE80211_HT_MCS_MASK_LEN; i++) { for (j = rbit; j < 8; j++) if (mcs_mask[i] & BIT(j)) { - rate->idx = i * 8 + j; + *rate_idx = i * 8 + j; return true; } rbit = 0; @@ -418,35 +416,30 @@ static bool rate_idx_match_mcs_mask(struct ieee80211_tx_rate *rate, -static void rate_idx_match_mask(struct ieee80211_tx_rate *rate, +static void rate_idx_match_mask(s8 *rate_idx, u16 *rate_flags, struct ieee80211_supported_band *sband, enum nl80211_chan_width chan_width, u32 mask, u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]) { - struct ieee80211_tx_rate alt_rate; - /* handle HT rates */ - if (rate->flags & IEEE80211_TX_RC_MCS) { - if (rate_idx_match_mcs_mask(rate, mcs_mask)) + if (*rate_flags & IEEE80211_TX_RC_MCS) { + if (rate_idx_match_mcs_mask(rate_idx, mcs_mask)) return; /* also try the legacy rates. */ - alt_rate.idx = 0; + *rate_idx = 0; /* keep protection flags */ - alt_rate.flags = rate->flags & - (IEEE80211_TX_RC_USE_RTS_CTS | - IEEE80211_TX_RC_USE_CTS_PROTECT | - IEEE80211_TX_RC_USE_SHORT_PREAMBLE); - alt_rate.count = rate->count; - if (rate_idx_match_legacy_mask(&alt_rate, - sband->n_bitrates, mask)) { - *rate = alt_rate; + *rate_flags &= (IEEE80211_TX_RC_USE_RTS_CTS | + IEEE80211_TX_RC_USE_CTS_PROTECT | + IEEE80211_TX_RC_USE_SHORT_PREAMBLE); + if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates, + mask)) return; - } - } else if (!(rate->flags & IEEE80211_TX_RC_VHT_MCS)) { + } else if (!(*rate_flags & IEEE80211_TX_RC_VHT_MCS)) { /* handle legacy rates */ - if (rate_idx_match_legacy_mask(rate, sband->n_bitrates, mask)) + if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates, + mask)) return; /* if HT BSS, and we handle a data frame, also try HT rates */ @@ -459,23 +452,19 @@ static void rate_idx_match_mask(struct ieee80211_tx_rate *rate, break; } - alt_rate.idx = 0; + *rate_idx = 0; /* keep protection flags */ - alt_rate.flags = rate->flags & - (IEEE80211_TX_RC_USE_RTS_CTS | - IEEE80211_TX_RC_USE_CTS_PROTECT | - IEEE80211_TX_RC_USE_SHORT_PREAMBLE); - alt_rate.count = rate->count; + *rate_flags &= (IEEE80211_TX_RC_USE_RTS_CTS | + IEEE80211_TX_RC_USE_CTS_PROTECT | + IEEE80211_TX_RC_USE_SHORT_PREAMBLE); - alt_rate.flags |= IEEE80211_TX_RC_MCS; + *rate_flags |= IEEE80211_TX_RC_MCS; if (chan_width == NL80211_CHAN_WIDTH_40) - alt_rate.flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; + *rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; - if (rate_idx_match_mcs_mask(&alt_rate, mcs_mask)) { - *rate = alt_rate; + if (rate_idx_match_mcs_mask(rate_idx, mcs_mask)) return; - } } /* @@ -628,6 +617,40 @@ static void rate_control_fill_sta_table(struct ieee80211_sta *sta, } } +static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + struct ieee80211_sta *sta, u32 *mask, + u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]) +{ + u32 i, flags; + + *mask = sdata->rc_rateidx_mask[sband->band]; + flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef); + for (i = 0; i < sband->n_bitrates; i++) { + if ((flags & sband->bitrates[i].flags) != flags) + *mask &= ~BIT(i); + } + + if (*mask == (1 << sband->n_bitrates) - 1 && + !sdata->rc_has_mcs_mask[sband->band]) + return false; + + if (sdata->rc_has_mcs_mask[sband->band]) + memcpy(mcs_mask, sdata->rc_rateidx_mcs_mask[sband->band], + IEEE80211_HT_MCS_MASK_LEN); + else + memset(mcs_mask, 0xff, IEEE80211_HT_MCS_MASK_LEN); + + if (sta) { + /* Filter out rates that the STA does not support */ + *mask &= sta->supp_rates[sband->band]; + for (i = 0; i < sizeof(mcs_mask); i++) + mcs_mask[i] &= sta->ht_cap.mcs.rx_mask[i]; + } + + return true; +} + static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_supported_band *sband, @@ -636,9 +659,8 @@ static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, { enum nl80211_chan_width chan_width; u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]; - bool has_mcs_mask; u32 mask; - u32 rate_flags; + u16 rate_flags; int i; /* @@ -646,30 +668,9 @@ static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, * default mask (allow all rates) is used to save some processing for * the common case. */ - mask = sdata->rc_rateidx_mask[sband->band]; - has_mcs_mask = sdata->rc_has_mcs_mask[sband->band]; - rate_flags = - ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef); - for (i = 0; i < sband->n_bitrates; i++) - if ((rate_flags & sband->bitrates[i].flags) != rate_flags) - mask &= ~BIT(i); - - if (mask == (1 << sband->n_bitrates) - 1 && !has_mcs_mask) + if (!rate_control_cap_mask(sdata, sband, sta, &mask, mcs_mask)) return; - if (has_mcs_mask) - memcpy(mcs_mask, sdata->rc_rateidx_mcs_mask[sband->band], - sizeof(mcs_mask)); - else - memset(mcs_mask, 0xff, sizeof(mcs_mask)); - - if (sta) { - /* Filter out rates that the STA does not support */ - mask &= sta->supp_rates[sband->band]; - for (i = 0; i < sizeof(mcs_mask); i++) - mcs_mask[i] &= sta->ht_cap.mcs.rx_mask[i]; - } - /* * Make sure the rate index selected for each TX rate is * included in the configured mask and change the rate indexes @@ -681,8 +682,10 @@ static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, if (rates[i].idx < 0) break; - rate_idx_match_mask(&rates[i], sband, chan_width, mask, - mcs_mask); + rate_flags = rates[i].flags; + rate_idx_match_mask(&rates[i].idx, &rate_flags, sband, + chan_width, mask, mcs_mask); + rates[i].flags = rate_flags; } } -- cgit v1.2.3 From e910867bd285bb8470c47076d99d0325aaea895c Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 6 Aug 2015 23:47:32 +0200 Subject: mac80211: define rate_control_apply_mask_ratetbl() Define rate_control_apply_mask_ratetbl() in order to apply ratemask in rate_control_set_rates() for station rate table Signed-off-by: Lorenzo Bianconi Signed-off-by: Johannes Berg --- net/mac80211/rate.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'net') diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 4f61ca026ecc..7e71de98297c 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -651,6 +651,30 @@ static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata, return true; } +static void +rate_control_apply_mask_ratetbl(struct sta_info *sta, + struct ieee80211_supported_band *sband, + struct ieee80211_sta_rates *rates) +{ + int i; + u32 mask; + u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]; + enum nl80211_chan_width chan_width; + + if (!rate_control_cap_mask(sta->sdata, sband, &sta->sta, &mask, + mcs_mask)) + return; + + chan_width = sta->sdata->vif.bss_conf.chandef.width; + for (i = 0; i < IEEE80211_TX_RATE_TABLE_SIZE; i++) { + if (rates->rate[i].idx < 0) + break; + + rate_idx_match_mask(&rates->rate[i].idx, &rates->rate[i].flags, + sband, chan_width, mask, mcs_mask); + } +} + static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_supported_band *sband, @@ -766,7 +790,10 @@ int rate_control_set_rates(struct ieee80211_hw *hw, { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_sta_rates *old; + struct ieee80211_supported_band *sband; + sband = hw->wiphy->bands[ieee80211_get_sdata_band(sta->sdata)]; + rate_control_apply_mask_ratetbl(sta, sband, rates); /* * mac80211 guarantees that this function will not be called * concurrently, so the following RCU access is safe, even without -- cgit v1.2.3 From b119ad6e726cc805f739f8f6843b9de4df1f895e Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 6 Aug 2015 23:47:33 +0200 Subject: mac80211: add rate mask logic for vht rates Define rc_rateidx_vht_mcs_mask array and rate_idx_match_vht_mcs_mask() method in order to apply mcs mask for vht rates Signed-off-by: Lorenzo Bianconi Signed-off-by: Johannes Berg --- net/mac80211/cfg.c | 16 +++++-- net/mac80211/debugfs_netdev.c | 34 ++++++++++++++ net/mac80211/ieee80211_i.h | 5 +++ net/mac80211/iface.c | 14 +++++- net/mac80211/rate.c | 102 +++++++++++++++++++++++++++++++++++++----- net/mac80211/vht.c | 26 +++++++++++ 6 files changed, 181 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 5789d8353505..685ec13ed7c2 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2504,16 +2504,26 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, sdata->rc_rateidx_mask[i] = mask->control[i].legacy; memcpy(sdata->rc_rateidx_mcs_mask[i], mask->control[i].ht_mcs, sizeof(mask->control[i].ht_mcs)); + memcpy(sdata->rc_rateidx_vht_mcs_mask[i], + mask->control[i].vht_mcs, + sizeof(mask->control[i].vht_mcs)); sdata->rc_has_mcs_mask[i] = false; + sdata->rc_has_vht_mcs_mask[i] = false; if (!sband) continue; - for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) - if (~sdata->rc_rateidx_mcs_mask[i][j]) { + for (j = 0; j < IEEE80211_HT_MCS_MASK_LEN; j++) { + if (~sdata->rc_rateidx_mcs_mask[i][j]) sdata->rc_has_mcs_mask[i] = true; + + if (~sdata->rc_rateidx_vht_mcs_mask[i][j]) + sdata->rc_has_vht_mcs_mask[i] = true; + + if (sdata->rc_has_mcs_mask[i] && + sdata->rc_has_vht_mcs_mask[i]) break; - } + } } return 0; diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index c09c0131bfa2..1021e87c051f 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -186,6 +186,38 @@ IEEE80211_IF_FILE(rc_rateidx_mcs_mask_2ghz, IEEE80211_IF_FILE(rc_rateidx_mcs_mask_5ghz, rc_rateidx_mcs_mask[IEEE80211_BAND_5GHZ], HEXARRAY); +static ssize_t ieee80211_if_fmt_rc_rateidx_vht_mcs_mask_2ghz( + const struct ieee80211_sub_if_data *sdata, + char *buf, int buflen) +{ + int i, len = 0; + const u16 *mask = sdata->rc_rateidx_vht_mcs_mask[IEEE80211_BAND_2GHZ]; + + for (i = 0; i < NL80211_VHT_NSS_MAX; i++) + len += scnprintf(buf + len, buflen - len, "%04x ", mask[i]); + len += scnprintf(buf + len, buflen - len, "\n"); + + return len; +} + +IEEE80211_IF_FILE_R(rc_rateidx_vht_mcs_mask_2ghz); + +static ssize_t ieee80211_if_fmt_rc_rateidx_vht_mcs_mask_5ghz( + const struct ieee80211_sub_if_data *sdata, + char *buf, int buflen) +{ + int i, len = 0; + const u16 *mask = sdata->rc_rateidx_vht_mcs_mask[IEEE80211_BAND_5GHZ]; + + for (i = 0; i < NL80211_VHT_NSS_MAX; i++) + len += scnprintf(buf + len, buflen - len, "%04x ", mask[i]); + len += scnprintf(buf + len, buflen - len, "\n"); + + return len; +} + +IEEE80211_IF_FILE_R(rc_rateidx_vht_mcs_mask_5ghz); + IEEE80211_IF_FILE(flags, flags, HEX); IEEE80211_IF_FILE(state, state, LHEX); IEEE80211_IF_FILE(txpower, vif.bss_conf.txpower, DEC); @@ -565,6 +597,8 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata) DEBUGFS_ADD(rc_rateidx_mask_5ghz); DEBUGFS_ADD(rc_rateidx_mcs_mask_2ghz); DEBUGFS_ADD(rc_rateidx_mcs_mask_5ghz); + DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_2ghz); + DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_5ghz); DEBUGFS_ADD(hw_queues); } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 36f217e842d8..6e52659f923f 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -901,6 +901,9 @@ struct ieee80211_sub_if_data { bool rc_has_mcs_mask[IEEE80211_NUM_BANDS]; u8 rc_rateidx_mcs_mask[IEEE80211_NUM_BANDS][IEEE80211_HT_MCS_MASK_LEN]; + bool rc_has_vht_mcs_mask[IEEE80211_NUM_BANDS]; + u16 rc_rateidx_vht_mcs_mask[IEEE80211_NUM_BANDS][NL80211_VHT_NSS_MAX]; + union { struct ieee80211_if_ap ap; struct ieee80211_if_wds wds; @@ -1713,6 +1716,8 @@ void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, enum ieee80211_band band, bool nss_only); void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta_vht_cap *vht_cap); +void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, + u16 vht_mask[NL80211_VHT_NSS_MAX]); /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 0fba7f97a963..6964fc6a8ea2 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1788,13 +1788,23 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, sband = local->hw.wiphy->bands[i]; sdata->rc_rateidx_mask[i] = sband ? (1 << sband->n_bitrates) - 1 : 0; - if (sband) + if (sband) { + __le16 cap; + u16 *vht_rate_mask; + memcpy(sdata->rc_rateidx_mcs_mask[i], sband->ht_cap.mcs.rx_mask, sizeof(sdata->rc_rateidx_mcs_mask[i])); - else + + cap = sband->vht_cap.vht_mcs.rx_mcs_map; + vht_rate_mask = sdata->rc_rateidx_vht_mcs_mask[i]; + ieee80211_get_vht_mask_from_cap(cap, vht_rate_mask); + } else { memset(sdata->rc_rateidx_mcs_mask[i], 0, sizeof(sdata->rc_rateidx_mcs_mask[i])); + memset(sdata->rc_rateidx_vht_mcs_mask[i], 0, + sizeof(sdata->rc_rateidx_vht_mcs_mask[i])); + } } ieee80211_set_default_queues(sdata); diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 7e71de98297c..9857693b91ec 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -414,16 +414,77 @@ static bool rate_idx_match_mcs_mask(s8 *rate_idx, u8 *mcs_mask) return false; } +static bool rate_idx_match_vht_mcs_mask(s8 *rate_idx, u16 *vht_mask) +{ + int i, j; + int ridx, rbit; + + ridx = *rate_idx >> 4; + rbit = *rate_idx & 0xf; + + if (ridx < 0 || ridx >= NL80211_VHT_NSS_MAX) + return false; + /* See whether the selected rate or anything below it is allowed. */ + for (i = ridx; i >= 0; i--) { + for (j = rbit; j >= 0; j--) { + if (vht_mask[i] & BIT(j)) { + *rate_idx = (i << 4) | j; + return true; + } + } + rbit = 15; + } + + /* Try to find a higher rate that would be allowed */ + ridx = (*rate_idx + 1) >> 4; + rbit = (*rate_idx + 1) & 0xf; + + for (i = ridx; i < NL80211_VHT_NSS_MAX; i++) { + for (j = rbit; j < 16; j++) { + if (vht_mask[i] & BIT(j)) { + *rate_idx = (i << 4) | j; + return true; + } + } + rbit = 0; + } + return false; +} static void rate_idx_match_mask(s8 *rate_idx, u16 *rate_flags, struct ieee80211_supported_band *sband, enum nl80211_chan_width chan_width, u32 mask, - u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]) + u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN], + u16 vht_mask[NL80211_VHT_NSS_MAX]) { - /* handle HT rates */ - if (*rate_flags & IEEE80211_TX_RC_MCS) { + if (*rate_flags & IEEE80211_TX_RC_VHT_MCS) { + /* handle VHT rates */ + if (rate_idx_match_vht_mcs_mask(rate_idx, vht_mask)) + return; + + *rate_idx = 0; + /* keep protection flags */ + *rate_flags &= (IEEE80211_TX_RC_USE_RTS_CTS | + IEEE80211_TX_RC_USE_CTS_PROTECT | + IEEE80211_TX_RC_USE_SHORT_PREAMBLE); + + *rate_flags |= IEEE80211_TX_RC_MCS; + if (chan_width == NL80211_CHAN_WIDTH_40) + *rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; + + if (rate_idx_match_mcs_mask(rate_idx, mcs_mask)) + return; + + /* also try the legacy rates. */ + *rate_flags &= ~(IEEE80211_TX_RC_MCS | + IEEE80211_TX_RC_40_MHZ_WIDTH); + if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates, + mask)) + return; + } else if (*rate_flags & IEEE80211_TX_RC_MCS) { + /* handle HT rates */ if (rate_idx_match_mcs_mask(rate_idx, mcs_mask)) return; @@ -436,7 +497,7 @@ static void rate_idx_match_mask(s8 *rate_idx, u16 *rate_flags, if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates, mask)) return; - } else if (!(*rate_flags & IEEE80211_TX_RC_VHT_MCS)) { + } else { /* handle legacy rates */ if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates, mask)) @@ -620,7 +681,8 @@ static void rate_control_fill_sta_table(struct ieee80211_sta *sta, static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct ieee80211_sta *sta, u32 *mask, - u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]) + u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN], + u16 vht_mask[NL80211_VHT_NSS_MAX]) { u32 i, flags; @@ -632,7 +694,8 @@ static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata, } if (*mask == (1 << sband->n_bitrates) - 1 && - !sdata->rc_has_mcs_mask[sband->band]) + !sdata->rc_has_mcs_mask[sband->band] && + !sdata->rc_has_vht_mcs_mask[sband->band]) return false; if (sdata->rc_has_mcs_mask[sband->band]) @@ -641,11 +704,25 @@ static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata, else memset(mcs_mask, 0xff, IEEE80211_HT_MCS_MASK_LEN); + if (sdata->rc_has_vht_mcs_mask[sband->band]) + memcpy(vht_mask, sdata->rc_rateidx_vht_mcs_mask[sband->band], + sizeof(u16) * NL80211_VHT_NSS_MAX); + else + memset(vht_mask, 0xff, sizeof(u16) * NL80211_VHT_NSS_MAX); + if (sta) { + __le16 sta_vht_cap; + u16 sta_vht_mask[NL80211_VHT_NSS_MAX]; + /* Filter out rates that the STA does not support */ *mask &= sta->supp_rates[sband->band]; for (i = 0; i < sizeof(mcs_mask); i++) mcs_mask[i] &= sta->ht_cap.mcs.rx_mask[i]; + + sta_vht_cap = sta->vht_cap.vht_mcs.rx_mcs_map; + ieee80211_get_vht_mask_from_cap(sta_vht_cap, sta_vht_mask); + for (i = 0; i < NL80211_VHT_NSS_MAX; i++) + vht_mask[i] &= sta_vht_mask[i]; } return true; @@ -659,10 +736,11 @@ rate_control_apply_mask_ratetbl(struct sta_info *sta, int i; u32 mask; u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]; + u16 vht_mask[NL80211_VHT_NSS_MAX]; enum nl80211_chan_width chan_width; if (!rate_control_cap_mask(sta->sdata, sband, &sta->sta, &mask, - mcs_mask)) + mcs_mask, vht_mask)) return; chan_width = sta->sdata->vif.bss_conf.chandef.width; @@ -671,7 +749,8 @@ rate_control_apply_mask_ratetbl(struct sta_info *sta, break; rate_idx_match_mask(&rates->rate[i].idx, &rates->rate[i].flags, - sband, chan_width, mask, mcs_mask); + sband, chan_width, mask, mcs_mask, + vht_mask); } } @@ -684,7 +763,7 @@ static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, enum nl80211_chan_width chan_width; u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]; u32 mask; - u16 rate_flags; + u16 rate_flags, vht_mask[NL80211_VHT_NSS_MAX]; int i; /* @@ -692,7 +771,8 @@ static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, * default mask (allow all rates) is used to save some processing for * the common case. */ - if (!rate_control_cap_mask(sdata, sband, sta, &mask, mcs_mask)) + if (!rate_control_cap_mask(sdata, sband, sta, &mask, mcs_mask, + vht_mask)) return; /* @@ -708,7 +788,7 @@ static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, rate_flags = rates[i].flags; rate_idx_match_mask(&rates[i].idx, &rate_flags, sband, - chan_width, mask, mcs_mask); + chan_width, mask, mcs_mask, vht_mask); rates[i].flags = rate_flags; } } diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index f05808d0d80f..834ccdbc74be 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -426,3 +426,29 @@ void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, if (changed > 0) rate_control_rate_update(local, sband, sta, changed); } + +void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, + u16 vht_mask[NL80211_VHT_NSS_MAX]) +{ + int i; + u16 mask, cap = le16_to_cpu(vht_cap); + + for (i = 0; i < NL80211_VHT_NSS_MAX; i++) { + mask = (cap >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; + switch (mask) { + case IEEE80211_VHT_MCS_SUPPORT_0_7: + vht_mask[i] = 0x00FF; + break; + case IEEE80211_VHT_MCS_SUPPORT_0_8: + vht_mask[i] = 0x01FF; + break; + case IEEE80211_VHT_MCS_SUPPORT_0_9: + vht_mask[i] = 0x03FF; + break; + case IEEE80211_VHT_MCS_NOT_SUPPORTED: + default: + vht_mask[i] = 0; + break; + } + } +} -- cgit v1.2.3 From 40d9a38ad3b7029be9c278738b67cbdb6349ce85 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 13 Jul 2015 12:26:46 +0200 Subject: mac80211: use DECLARE_EWMA Instead of using the out-of-line average calculation, use the new DECLARE_EWMA() macro to declare a signal EWMA, and use that. This actually *reduces* the code size slightly (on x86-64) while also reducing the station info size by 80 bytes. Signed-off-by: Johannes Berg --- net/mac80211/Kconfig | 1 - net/mac80211/mesh_plink.c | 2 +- net/mac80211/rx.c | 4 ++-- net/mac80211/sta_info.c | 9 +++++---- net/mac80211/sta_info.h | 6 ++++-- 5 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig index 086de496a4c1..3891cbd2adea 100644 --- a/net/mac80211/Kconfig +++ b/net/mac80211/Kconfig @@ -7,7 +7,6 @@ config MAC80211 select CRYPTO_CCM select CRYPTO_GCM select CRC32 - select AVERAGE ---help--- This option enables the hardware independent IEEE 802.11 networking stack. diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index e12be2e4e8df..58384642e03c 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -60,7 +60,7 @@ static bool rssi_threshold_check(struct ieee80211_sub_if_data *sdata, { s32 rssi_threshold = sdata->u.mesh.mshcfg.rssi_threshold; return rssi_threshold == 0 || - (sta && (s8) -ewma_read(&sta->avg_signal) > rssi_threshold); + (sta && (s8) -ewma_signal_read(&sta->avg_signal) > rssi_threshold); } /** diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 4d217d3265f4..5bc0b88d9eb1 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1428,7 +1428,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) sta->rx_bytes += rx->skb->len; if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) { sta->last_signal = status->signal; - ewma_add(&sta->avg_signal, -status->signal); + ewma_signal_add(&sta->avg_signal, -status->signal); } if (status->chains) { @@ -1440,7 +1440,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) continue; sta->chain_signal_last[i] = signal; - ewma_add(&sta->chain_signal_avg[i], -signal); + ewma_signal_add(&sta->chain_signal_avg[i], -signal); } } diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 70cd9fa57424..64f1936350c6 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -341,9 +341,9 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, ktime_get_ts(&uptime); sta->last_connected = uptime.tv_sec; - ewma_init(&sta->avg_signal, 1024, 8); + ewma_signal_init(&sta->avg_signal); for (i = 0; i < ARRAY_SIZE(sta->chain_signal_avg); i++) - ewma_init(&sta->chain_signal_avg[i], 1024, 8); + ewma_signal_init(&sta->chain_signal_avg[i]); if (local->ops->wake_tx_queue) { void *txq_data; @@ -1896,7 +1896,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) } if (!(sinfo->filled & BIT(NL80211_STA_INFO_SIGNAL_AVG))) { - sinfo->signal_avg = (s8) -ewma_read(&sta->avg_signal); + sinfo->signal_avg = + (s8) -ewma_signal_read(&sta->avg_signal); sinfo->filled |= BIT(NL80211_STA_INFO_SIGNAL_AVG); } } @@ -1911,7 +1912,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) for (i = 0; i < ARRAY_SIZE(sinfo->chain_signal); i++) { sinfo->chain_signal[i] = sta->chain_signal_last[i]; sinfo->chain_signal_avg[i] = - (s8) -ewma_read(&sta->chain_signal_avg[i]); + (s8) -ewma_signal_read(&sta->chain_signal_avg[i]); } } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 1d2805c598c0..b087c71ff7fe 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -320,6 +320,8 @@ struct mesh_sta { unsigned int fail_avg; }; +DECLARE_EWMA(signal, 1024, 8) + /** * struct sta_info - STA information * @@ -462,12 +464,12 @@ struct sta_info { unsigned long rx_fragments; unsigned long rx_dropped; int last_signal; - struct ewma avg_signal; + struct ewma_signal avg_signal; int last_ack_signal; u8 chains; s8 chain_signal_last[IEEE80211_MAX_CHAINS]; - struct ewma chain_signal_avg[IEEE80211_MAX_CHAINS]; + struct ewma_signal chain_signal_avg[IEEE80211_MAX_CHAINS]; /* Plus 1 for non-QoS frames */ __le16 last_seq_ctrl[IEEE80211_NUM_TIDS + 1]; -- cgit v1.2.3 From 65d7d46050704bcdb8121ddbf4110bfbf2b38baa Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 16 Jun 2015 17:10:22 +0200 Subject: batman-adv: Make DAT capability changes atomic Bitwise OR/AND assignments in C aren't guaranteed to be atomic. One OGM handler might undo the set/clear of a specific bit from another handler run in between. Fix this by using the atomic set_bit()/clear_bit()/test_bit() functions. Fixes: 17cf0ea455f1 ("batman-adv: tvlv - add distributed arp table container") Signed-off-by: Linus Lüssing Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/distributed-arp-table.c | 7 ++++--- net/batman-adv/types.h | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index fb54e6aed096..1cfba20d5112 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -19,6 +19,7 @@ #include "main.h" #include +#include #include #include #include @@ -453,7 +454,7 @@ static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res, int j; /* check if orig node candidate is running DAT */ - if (!(candidate->capabilities & BATADV_ORIG_CAPA_HAS_DAT)) + if (!test_bit(BATADV_ORIG_CAPA_HAS_DAT, &candidate->capabilities)) goto out; /* Check if this node has already been selected... */ @@ -713,9 +714,9 @@ static void batadv_dat_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, uint16_t tvlv_value_len) { if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) - orig->capabilities &= ~BATADV_ORIG_CAPA_HAS_DAT; + clear_bit(BATADV_ORIG_CAPA_HAS_DAT, &orig->capabilities); else - orig->capabilities |= BATADV_ORIG_CAPA_HAS_DAT; + set_bit(BATADV_ORIG_CAPA_HAS_DAT, &orig->capabilities); } /** diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 67d63483618e..29fd62562546 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -273,7 +273,7 @@ struct batadv_orig_node { struct hlist_node mcast_want_all_ipv4_node; struct hlist_node mcast_want_all_ipv6_node; #endif - uint8_t capabilities; + unsigned long capabilities; uint8_t capa_initialized; atomic_t last_ttvn; unsigned char *tt_buff; @@ -313,7 +313,7 @@ struct batadv_orig_node { * (= orig node announces a tvlv of type BATADV_TVLV_MCAST) */ enum batadv_orig_capabilities { - BATADV_ORIG_CAPA_HAS_DAT = BIT(0), + BATADV_ORIG_CAPA_HAS_DAT, BATADV_ORIG_CAPA_HAS_NC = BIT(1), BATADV_ORIG_CAPA_HAS_TT = BIT(2), BATADV_ORIG_CAPA_HAS_MCAST = BIT(3), -- cgit v1.2.3 From 4635469f5c617282f18c69643af36cd8c0acf707 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 16 Jun 2015 17:10:23 +0200 Subject: batman-adv: Make NC capability changes atomic Bitwise OR/AND assignments in C aren't guaranteed to be atomic. One OGM handler might undo the set/clear of a specific bit from another handler run in between. Fix this by using the atomic set_bit()/clear_bit()/test_bit() functions. Fixes: 3f4841ffb336 ("batman-adv: tvlv - add network coding container") Signed-off-by: Linus Lüssing Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/network-coding.c | 7 ++++--- net/batman-adv/types.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index f0a50f31d822..46604010dcd4 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -19,6 +19,7 @@ #include "main.h" #include +#include #include #include #include @@ -134,9 +135,9 @@ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, uint16_t tvlv_value_len) { if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) - orig->capabilities &= ~BATADV_ORIG_CAPA_HAS_NC; + clear_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities); else - orig->capabilities |= BATADV_ORIG_CAPA_HAS_NC; + set_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities); } /** @@ -894,7 +895,7 @@ void batadv_nc_update_nc_node(struct batadv_priv *bat_priv, goto out; /* check if orig node is network coding enabled */ - if (!(orig_node->capabilities & BATADV_ORIG_CAPA_HAS_NC)) + if (!test_bit(BATADV_ORIG_CAPA_HAS_NC, &orig_node->capabilities)) goto out; /* accept ogms from 'good' neighbors and single hop neighbors */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 29fd62562546..ed4aec5cb8cc 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -314,7 +314,7 @@ struct batadv_orig_node { */ enum batadv_orig_capabilities { BATADV_ORIG_CAPA_HAS_DAT, - BATADV_ORIG_CAPA_HAS_NC = BIT(1), + BATADV_ORIG_CAPA_HAS_NC, BATADV_ORIG_CAPA_HAS_TT = BIT(2), BATADV_ORIG_CAPA_HAS_MCAST = BIT(3), }; -- cgit v1.2.3 From ac4eebd48461ec993e7cb614d5afe7df8c72e6b7 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 16 Jun 2015 17:10:24 +0200 Subject: batman-adv: Make TT capability changes atomic Bitwise OR/AND assignments in C aren't guaranteed to be atomic. One OGM handler might undo the set/clear of a specific bit from another handler run in between. Fix this by using the atomic set_bit()/clear_bit()/test_bit() functions. Fixes: e17931d1a61d ("batman-adv: introduce capability initialization bitfield") Signed-off-by: Linus Lüssing Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/translation-table.c | 8 +++++--- net/batman-adv/types.h | 4 ++-- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index b4824951010b..1573489c1f8a 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -19,6 +19,7 @@ #include "main.h" #include +#include #include #include #include @@ -1862,7 +1863,7 @@ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, } spin_unlock_bh(list_lock); } - orig_node->capa_initialized &= ~BATADV_ORIG_CAPA_HAS_TT; + clear_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized); } static bool batadv_tt_global_to_purge(struct batadv_tt_global_entry *tt_global, @@ -2821,7 +2822,7 @@ static void _batadv_tt_update_changes(struct batadv_priv *bat_priv, return; } } - orig_node->capa_initialized |= BATADV_ORIG_CAPA_HAS_TT; + set_bit(BATADV_ORIG_CAPA_HAS_TT, &orig_node->capa_initialized); } static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv, @@ -3321,7 +3322,8 @@ static void batadv_tt_update_orig(struct batadv_priv *bat_priv, bool has_tt_init; tt_vlan = (struct batadv_tvlv_tt_vlan_data *)tt_buff; - has_tt_init = orig_node->capa_initialized & BATADV_ORIG_CAPA_HAS_TT; + has_tt_init = test_bit(BATADV_ORIG_CAPA_HAS_TT, + &orig_node->capa_initialized); /* orig table not initialised AND first diff is in the OGM OR the ttvn * increased by one -> we can apply the attached changes diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index ed4aec5cb8cc..6f801efc3ae2 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -274,7 +274,7 @@ struct batadv_orig_node { struct hlist_node mcast_want_all_ipv6_node; #endif unsigned long capabilities; - uint8_t capa_initialized; + unsigned long capa_initialized; atomic_t last_ttvn; unsigned char *tt_buff; int16_t tt_buff_len; @@ -315,7 +315,7 @@ struct batadv_orig_node { enum batadv_orig_capabilities { BATADV_ORIG_CAPA_HAS_DAT, BATADV_ORIG_CAPA_HAS_NC, - BATADV_ORIG_CAPA_HAS_TT = BIT(2), + BATADV_ORIG_CAPA_HAS_TT, BATADV_ORIG_CAPA_HAS_MCAST = BIT(3), }; -- cgit v1.2.3 From 9c936e3f4c4fad07abb6c082a89508b8f724c88f Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 16 Jun 2015 17:10:25 +0200 Subject: batman-adv: Make MCAST capability changes atomic Bitwise OR/AND assignments in C aren't guaranteed to be atomic. One OGM handler might undo the set/clear of a specific bit from another handler run in between. Fix this by using the atomic set_bit()/clear_bit()/test_bit() functions. Fixes: 60432d756cf0 ("batman-adv: Announce new capability via multicast TVLV") Signed-off-by: Linus Lüssing Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/multicast.c | 18 ++++++++++-------- net/batman-adv/types.h | 2 +- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 7aa480b7edd0..8f1ec21bf2d0 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -19,6 +19,7 @@ #include "main.h" #include +#include #include #include #include @@ -697,29 +698,30 @@ static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, uint8_t mcast_flags = BATADV_NO_FLAGS; bool orig_initialized; - orig_initialized = orig->capa_initialized & BATADV_ORIG_CAPA_HAS_MCAST; + orig_initialized = test_bit(BATADV_ORIG_CAPA_HAS_MCAST, + &orig->capa_initialized); /* If mcast support is turned on decrease the disabled mcast node * counter only if we had increased it for this node before. If this * is a completely new orig_node no need to decrease the counter. */ if (orig_mcast_enabled && - !(orig->capabilities & BATADV_ORIG_CAPA_HAS_MCAST)) { + !test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) { if (orig_initialized) atomic_dec(&bat_priv->mcast.num_disabled); - orig->capabilities |= BATADV_ORIG_CAPA_HAS_MCAST; + set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); /* If mcast support is being switched off or if this is an initial * OGM without mcast support then increase the disabled mcast * node counter. */ } else if (!orig_mcast_enabled && - (orig->capabilities & BATADV_ORIG_CAPA_HAS_MCAST || + (test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) || !orig_initialized)) { atomic_inc(&bat_priv->mcast.num_disabled); - orig->capabilities &= ~BATADV_ORIG_CAPA_HAS_MCAST; + clear_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); } - orig->capa_initialized |= BATADV_ORIG_CAPA_HAS_MCAST; + set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized); if (orig_mcast_enabled && tvlv_value && (tvlv_value_len >= sizeof(mcast_flags))) @@ -763,8 +765,8 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig) { struct batadv_priv *bat_priv = orig->bat_priv; - if (!(orig->capabilities & BATADV_ORIG_CAPA_HAS_MCAST) && - orig->capa_initialized & BATADV_ORIG_CAPA_HAS_MCAST) + if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) && + test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized)) atomic_dec(&bat_priv->mcast.num_disabled); batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS); diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 6f801efc3ae2..1eeed1847bc7 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -316,7 +316,7 @@ enum batadv_orig_capabilities { BATADV_ORIG_CAPA_HAS_DAT, BATADV_ORIG_CAPA_HAS_NC, BATADV_ORIG_CAPA_HAS_TT, - BATADV_ORIG_CAPA_HAS_MCAST = BIT(3), + BATADV_ORIG_CAPA_HAS_MCAST, }; /** -- cgit v1.2.3 From 8a4023c5b5e30b11f1f383186f4a7222b3b823cf Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 16 Jun 2015 17:10:26 +0200 Subject: batman-adv: Fix potential synchronization issues in mcast tvlv handler So far the mcast tvlv handler did not anticipate the processing of multiple incoming OGMs from the same originator at the same time. This can lead to various issues: * Broken refcounting: For instance two mcast handlers might both assume that an originator just got multicast capabilities and will together wrongly decrease mcast.num_disabled by two, potentially leading to an integer underflow. * Potential kernel panic on hlist_del_rcu(): Two mcast handlers might one after another try to do an hlist_del_rcu(&orig->mcast_want_all_*_node). The second one will cause memory corruption / crashes. (Reported by: Sven Eckelmann ) Right in the beginning the code path makes assumptions about the current multicast related state of an originator and bases all updates on that. The easiest and least error prune way to fix the issues in this case is to serialize multiple mcast handler invocations with a spinlock. Fixes: 60432d756cf0 ("batman-adv: Announce new capability via multicast TVLV") Signed-off-by: Linus Lüssing Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/multicast.c | 63 +++++++++++++++++++++++++++++++++++---------- net/batman-adv/originator.c | 5 ++++ net/batman-adv/types.h | 3 +++ 3 files changed, 58 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 8f1ec21bf2d0..68a9554961eb 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -589,19 +590,26 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, * * If the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag of this originator, * orig, has toggled then this method updates counter and list accordingly. + * + * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, uint8_t mcast_flags) { + struct hlist_node *node = &orig->mcast_want_all_unsnoopables_node; + struct hlist_head *head = &bat_priv->mcast.want_all_unsnoopables_list; + /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES)) { atomic_inc(&bat_priv->mcast.num_want_all_unsnoopables); spin_lock_bh(&bat_priv->mcast.want_lists_lock); - hlist_add_head_rcu(&orig->mcast_want_all_unsnoopables_node, - &bat_priv->mcast.want_all_unsnoopables_list); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(!hlist_unhashed(node)); + + hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) && @@ -609,7 +617,10 @@ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, atomic_dec(&bat_priv->mcast.num_want_all_unsnoopables); spin_lock_bh(&bat_priv->mcast.want_lists_lock); - hlist_del_rcu(&orig->mcast_want_all_unsnoopables_node); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(hlist_unhashed(node)); + + hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } @@ -622,19 +633,26 @@ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, * * If the BATADV_MCAST_WANT_ALL_IPV4 flag of this originator, orig, has * toggled then this method updates counter and list accordingly. + * + * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, uint8_t mcast_flags) { + struct hlist_node *node = &orig->mcast_want_all_ipv4_node; + struct hlist_head *head = &bat_priv->mcast.want_all_ipv4_list; + /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4 && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4)) { atomic_inc(&bat_priv->mcast.num_want_all_ipv4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); - hlist_add_head_rcu(&orig->mcast_want_all_ipv4_node, - &bat_priv->mcast.want_all_ipv4_list); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(!hlist_unhashed(node)); + + hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) && @@ -642,7 +660,10 @@ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, atomic_dec(&bat_priv->mcast.num_want_all_ipv4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); - hlist_del_rcu(&orig->mcast_want_all_ipv4_node); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(hlist_unhashed(node)); + + hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } @@ -655,19 +676,26 @@ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, * * If the BATADV_MCAST_WANT_ALL_IPV6 flag of this originator, orig, has * toggled then this method updates counter and list accordingly. + * + * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, uint8_t mcast_flags) { + struct hlist_node *node = &orig->mcast_want_all_ipv6_node; + struct hlist_head *head = &bat_priv->mcast.want_all_ipv6_list; + /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6 && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6)) { atomic_inc(&bat_priv->mcast.num_want_all_ipv6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); - hlist_add_head_rcu(&orig->mcast_want_all_ipv6_node, - &bat_priv->mcast.want_all_ipv6_list); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(!hlist_unhashed(node)); + + hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) && @@ -675,7 +703,10 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, atomic_dec(&bat_priv->mcast.num_want_all_ipv6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); - hlist_del_rcu(&orig->mcast_want_all_ipv6_node); + /* flag checks above + mcast_handler_lock prevents this */ + WARN_ON(hlist_unhashed(node)); + + hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } @@ -698,6 +729,11 @@ static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, uint8_t mcast_flags = BATADV_NO_FLAGS; bool orig_initialized; + if (orig_mcast_enabled && tvlv_value && + (tvlv_value_len >= sizeof(mcast_flags))) + mcast_flags = *(uint8_t *)tvlv_value; + + spin_lock_bh(&orig->mcast_handler_lock); orig_initialized = test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized); @@ -723,15 +759,12 @@ static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized); - if (orig_mcast_enabled && tvlv_value && - (tvlv_value_len >= sizeof(mcast_flags))) - mcast_flags = *(uint8_t *)tvlv_value; - batadv_mcast_want_unsnoop_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv4_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags); orig->mcast_flags = mcast_flags; + spin_unlock_bh(&orig->mcast_handler_lock); } /** @@ -765,6 +798,8 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig) { struct batadv_priv *bat_priv = orig->bat_priv; + spin_lock_bh(&orig->mcast_handler_lock); + if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) && test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized)) atomic_dec(&bat_priv->mcast.num_disabled); @@ -772,4 +807,6 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig) batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS); + + spin_unlock_bh(&orig->mcast_handler_lock); } diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 018b7495ad84..32a0fcfab36d 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -696,8 +696,13 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, orig_node->last_seen = jiffies; reset_time = jiffies - 1 - msecs_to_jiffies(BATADV_RESET_PROTECTION_MS); orig_node->bcast_seqno_reset = reset_time; + #ifdef CONFIG_BATMAN_ADV_MCAST orig_node->mcast_flags = BATADV_NO_FLAGS; + INIT_HLIST_NODE(&orig_node->mcast_want_all_unsnoopables_node); + INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv4_node); + INIT_HLIST_NODE(&orig_node->mcast_want_all_ipv6_node); + spin_lock_init(&orig_node->mcast_handler_lock); #endif /* create a vlan object for the "untagged" LAN */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 1eeed1847bc7..55610a805b53 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -221,6 +221,7 @@ struct batadv_orig_bat_iv { * @batadv_dat_addr_t: address of the orig node in the distributed hash * @last_seen: time when last packet from this node was received * @bcast_seqno_reset: time when the broadcast seqno window was reset + * @mcast_handler_lock: synchronizes mcast-capability and -flag changes * @mcast_flags: multicast flags announced by the orig node * @mcast_want_all_unsnoop_node: a list node for the * mcast.want_all_unsnoopables list @@ -268,6 +269,8 @@ struct batadv_orig_node { unsigned long last_seen; unsigned long bcast_seqno_reset; #ifdef CONFIG_BATMAN_ADV_MCAST + /* synchronizes mcast tvlv specific orig changes */ + spinlock_t mcast_handler_lock; uint8_t mcast_flags; struct hlist_node mcast_want_all_unsnoopables_node; struct hlist_node mcast_want_all_ipv4_node; -- cgit v1.2.3 From 1f155101646e6f6cda218534ff6e252ff625137b Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Mon, 22 Jun 2015 00:36:28 +0800 Subject: batman-adv: protect tt request from double deletion The list_del() calls were changed to list_del_init() to prevent an accidental double deletion in batadv_tt_req_node_new(). Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/translation-table.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 1573489c1f8a..cd35bb846582 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -2196,7 +2196,7 @@ static void batadv_tt_req_list_free(struct batadv_priv *bat_priv) spin_lock_bh(&bat_priv->tt.req_list_lock); list_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { - list_del(&node->list); + list_del_init(&node->list); kfree(node); } @@ -2232,7 +2232,7 @@ static void batadv_tt_req_purge(struct batadv_priv *bat_priv) list_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { if (batadv_has_timed_out(node->issued_at, BATADV_TT_REQUEST_TIMEOUT)) { - list_del(&node->list); + list_del_init(&node->list); kfree(node); } } @@ -2514,7 +2514,8 @@ out: batadv_hardif_free_ref(primary_if); if (ret && tt_req_node) { spin_lock_bh(&bat_priv->tt.req_list_lock); - list_del(&tt_req_node->list); + /* list_del_init() verifies tt_req_node still is in the list */ + list_del_init(&tt_req_node->list); spin_unlock_bh(&bat_priv->tt.req_list_lock); kfree(tt_req_node); } @@ -2951,7 +2952,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, list_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { if (!batadv_compare_eth(node->addr, resp_src)) continue; - list_del(&node->list); + list_del_init(&node->list); kfree(node); } -- cgit v1.2.3 From 3f1e08d0ae6746379b9e21264dae52f4f35c7ad2 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Wed, 24 Jun 2015 14:50:20 +0200 Subject: batman-adv: remove broadcast packets scheduled for purged outgoing if When an interface is purged, the broadcast packets scheduled for this interface should get purged as well. Signed-off-by: Simon Wunderlich Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/send.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 0a01992e65ab..191076ef1eca 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -616,7 +616,8 @@ batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, * we delete only packets belonging to the given interface */ if ((hard_iface) && - (forw_packet->if_incoming != hard_iface)) + (forw_packet->if_incoming != hard_iface) && + (forw_packet->if_outgoing != hard_iface)) continue; spin_unlock_bh(&bat_priv->forw_bcast_list_lock); -- cgit v1.2.3 From 53cf037bf846417fd92dc92ddf97267f69b110f4 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Tue, 30 Jun 2015 23:45:26 +0200 Subject: batman-adv: Fix potentially broken skb network header access The two commits noted below added calls to ip_hdr() and ipv6_hdr(). They need a correctly set skb network header. Unfortunately we cannot rely on the device drivers to set it for us. Therefore setting it in the beginning of the according ndo_start_xmit handler. Fixes: 1d8ab8d3c176 ("batman-adv: Modified forwarding behaviour for multicast packets") Fixes: ab49886e3da7 ("batman-adv: Add IPv4 link-local/IPv6-ll-all-nodes multicast support") Signed-off-by: Linus Lüssing Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/soft-interface.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index c002961da75d..926292d5ffa8 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -202,6 +202,7 @@ static int batadv_interface_tx(struct sk_buff *skb, int gw_mode; enum batadv_forw_mode forw_mode; struct batadv_orig_node *mcast_single_orig = NULL; + int network_offset = ETH_HLEN; if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) goto dropped; @@ -214,14 +215,18 @@ static int batadv_interface_tx(struct sk_buff *skb, case ETH_P_8021Q: vhdr = vlan_eth_hdr(skb); - if (vhdr->h_vlan_encapsulated_proto != ethertype) + if (vhdr->h_vlan_encapsulated_proto != ethertype) { + network_offset += VLAN_HLEN; break; + } /* fall through */ case ETH_P_BATMAN: goto dropped; } + skb_set_network_header(skb, network_offset); + if (batadv_bla_tx(bat_priv, skb, vid)) goto dropped; -- cgit v1.2.3 From 0751272880f3a0c74c786ecfaba2b3d98748482f Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sat, 15 Aug 2015 11:00:32 +0200 Subject: ieee802154: 6lowpan: fix packet layer registration This patch fixes 802.15.4 packet layer registration when mutliple lowpan interfaces will be added. We need to register the packet layer at the first lowpan interface and deregister it at the last interface. This done by open_count variable which is protected by rtnl. Additional do a quiet fix by adding dev_put(real_dev) when netdev registration fails, which fix the refcount for the wpan dev. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/6lowpan/core.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 180e9f5f86c3..27c25ad935b4 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -52,6 +52,8 @@ #include "6lowpan_i.h" +static int open_count; + static struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; @@ -141,12 +143,18 @@ static int lowpan_newlink(struct net *src_net, struct net_device *dev, lowpan_netdev_setup(dev, LOWPAN_LLTYPE_IEEE802154); ret = register_netdevice(dev); - if (ret >= 0) { - real_dev->ieee802154_ptr->lowpan_dev = dev; - lowpan_rx_init(); + if (ret < 0) { + dev_put(real_dev); + return ret; } - return ret; + real_dev->ieee802154_ptr->lowpan_dev = dev; + if (!open_count) + lowpan_rx_init(); + + open_count++; + + return 0; } static void lowpan_dellink(struct net_device *dev, struct list_head *head) @@ -156,7 +164,11 @@ static void lowpan_dellink(struct net_device *dev, struct list_head *head) ASSERT_RTNL(); - lowpan_rx_exit(); + open_count--; + + if (!open_count) + lowpan_rx_exit(); + real_dev->ieee802154_ptr->lowpan_dev = NULL; unregister_netdevice(dev); dev_put(real_dev); -- cgit v1.2.3 From c0015bf3a34961342a27b021672049e535ab36a1 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sat, 15 Aug 2015 11:00:33 +0200 Subject: ieee802154: 6lowpan: fix non-lowpan wpan interfaces We receive all 802.15.4 frames on the packet handler "lowpan_rcv" this patch checks if the wpan device belongs to a lowpan interface. Signed-off-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/6lowpan/rx.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/rx.c b/net/ieee802154/6lowpan/rx.c index d6f5e8ee6fda..12e10201d263 100644 --- a/net/ieee802154/6lowpan/rx.c +++ b/net/ieee802154/6lowpan/rx.c @@ -67,6 +67,10 @@ static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev, struct ieee802154_hdr hdr; int ret; + if (dev->type != ARPHRD_IEEE802154 || + !dev->ieee802154_ptr->lowpan_dev) + goto drop; + skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto drop; @@ -77,9 +81,6 @@ static int lowpan_rcv(struct sk_buff *skb, struct net_device *dev, if (skb->pkt_type == PACKET_OTHERHOST) goto drop_skb; - if (dev->type != ARPHRD_IEEE802154) - goto drop_skb; - if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0) goto drop_skb; -- cgit v1.2.3 From fe202fe95564023223ce1910c9e352f391abb1d5 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Fri, 14 Aug 2015 22:33:40 +0200 Subject: nfc: netlink: Add check on NFC_ATTR_VENDOR_DATA NFC_ATTR_VENDOR_DATA is an optional vendor_cmd argument. The current code was potentially using a non existing argument leading to potential catastrophic results. Cc: stable@vger.kernel.org Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- net/nfc/netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index f85f37ed19b2..81dfaaacfc4d 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1518,8 +1518,8 @@ static int nfc_genl_vendor_cmd(struct sk_buff *skb, if (!dev || !dev->vendor_cmds || !dev->n_vendor_cmds) return -ENODEV; - data = nla_data(info->attrs[NFC_ATTR_VENDOR_DATA]); - if (data) { + if (info->attrs[NFC_ATTR_VENDOR_DATA]) { + data = nla_data(info->attrs[NFC_ATTR_VENDOR_DATA]); data_len = nla_len(info->attrs[NFC_ATTR_VENDOR_DATA]); if (data_len == 0) return -EINVAL; -- cgit v1.2.3 From 1e3136789975f03e461798149309034e5213c1b4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 15 Aug 2015 10:54:07 -0700 Subject: ipv4: fix refcount leak in fib_check_nh() fib_lookup() forces FIB_LOOKUP_NOREF flag, while fib_table_lookup() does not. This patch solves the typical message at reboot time or device dismantle : unregister_netdevice: waiting for eth0 to become free. Usage count = 4 Fixes: 3bfd847203c6 ("net: Use passed in table for nexthop lookups") Signed-off-by: Eric Dumazet Cc: David Ahern Acked-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b7f1d20a9615..c8025851dac7 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -708,7 +708,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (tbl) err = fib_table_lookup(tbl, &fl4, &res, - FIB_LOOKUP_IGNORE_LINKSTATE); + FIB_LOOKUP_IGNORE_LINKSTATE | + FIB_LOOKUP_NOREF); else err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE); -- cgit v1.2.3 From adca3c38d807b341a965d0aba8721d0784d8471b Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Mon, 17 Aug 2015 08:33:43 +0200 Subject: nfc: netlink: Warning fix When NFC_ATTR_VENDOR_DATA is not set, data_len is 0 and data is NULL. Fixes the following warning: net/nfc/netlink.c:1536:3: warning: 'data' may be used uninitialized +in this function [-Wmaybe-uninitialized] return cmd->doit(dev, data, data_len); Cc: stable@vger.kernel.org Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- net/nfc/netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 81dfaaacfc4d..73d1ca7c546c 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1524,6 +1524,7 @@ static int nfc_genl_vendor_cmd(struct sk_buff *skb, if (data_len == 0) return -EINVAL; } else { + data = NULL; data_len = 0; } -- cgit v1.2.3 From 8f8ff9135b28a7560a5627aceaf289e3f0d4cd64 Mon Sep 17 00:00:00 2001 From: Richard Alpe Date: Mon, 17 Aug 2015 14:15:10 +0200 Subject: tipc: don't sanity check non-existing TLV (NL compat) A zero length payload means that no TLV (Type Length Value) data has been passed. Prior to this patch a non-existing TLV could be sanity checked with TLV_OK() resulting in random behavior where a user sending an empty message occasionally got a incorrect "operation not supported" message back. Signed-off-by: Richard Alpe Reviewed-by: Erik Hugne Signed-off-by: David S. Miller --- net/tipc/netlink_compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 53e0fee80086..1eadc95e1132 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -1114,7 +1114,7 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info) } len = nlmsg_attrlen(req_nlh, GENL_HDRLEN + TIPC_GENL_HDRLEN); - if (TLV_GET_LEN(msg.req) && !TLV_OK(msg.req, len)) { + if (len && !TLV_OK(msg.req, len)) { msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_SUPPORTED); err = -EOPNOTSUPP; goto send; -- cgit v1.2.3 From 4b469955685d58c2f8198bf817fc661600b7e3d0 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 13 Aug 2015 19:01:07 +0200 Subject: net: sch_generic: react upon IFF_NO_QUEUE flag Handle IFF_NO_QUEUE as alternative to tx_queue_len being zero. Signed-off-by: Phil Sutter Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 6efca30894aa..942fea8405a4 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -735,7 +735,7 @@ static void attach_one_default_qdisc(struct net_device *dev, { struct Qdisc *qdisc = &noqueue_qdisc; - if (dev->tx_queue_len) { + if (dev->tx_queue_len && !(dev->priv_flags & IFF_NO_QUEUE)) { qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, TC_H_ROOT); if (!qdisc) { @@ -755,7 +755,9 @@ static void attach_default_qdiscs(struct net_device *dev) txq = netdev_get_tx_queue(dev, 0); - if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) { + if (!netif_is_multiqueue(dev) || + dev->tx_queue_len == 0 || + dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); dev->qdisc = txq->qdisc_sleeping; atomic_inc(&dev->qdisc->refcnt); -- cgit v1.2.3 From a1c234f95cae2d293047bb6c36e7a4840dbac815 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 14 Aug 2015 16:40:40 +0200 Subject: lwtunnel: rename ip lwtunnel attributes We already have IFLA_IPTUN_ netlink attributes. The IP_TUN_ attributes look very similar, yet they serve very different purpose. This is confusing for anyone trying to implement a user space tool supporting lwt. As the IP_TUN_ attributes are used only for the lightweight tunnels, prefix them with LWTUNNEL_IP_ instead to make their purpose clear. Also, it's more logical to have them in lwtunnel.h together with the encap enum. Fixes: 3093fbe7ff4b ("route: Per route IP tunnel metadata via lightweight tunnel") Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/uapi/linux/lwtunnel.h | 14 +++++++ include/uapi/linux/rtnetlink.h | 15 -------- net/ipv4/ip_tunnel_core.c | 86 +++++++++++++++++++++--------------------- 3 files changed, 57 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 31377bbea3f8..3bf223bc2367 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -12,5 +12,19 @@ enum lwtunnel_encap_types { #define LWTUNNEL_ENCAP_MAX (__LWTUNNEL_ENCAP_MAX - 1) +enum lwtunnel_ip_t { + LWTUNNEL_IP_UNSPEC, + LWTUNNEL_IP_ID, + LWTUNNEL_IP_DST, + LWTUNNEL_IP_SRC, + LWTUNNEL_IP_TTL, + LWTUNNEL_IP_TOS, + LWTUNNEL_IP_SPORT, + LWTUNNEL_IP_DPORT, + LWTUNNEL_IP_FLAGS, + __LWTUNNEL_IP_MAX, +}; + +#define LWTUNNEL_IP_MAX (__LWTUNNEL_IP_MAX - 1) #endif /* _UAPI_LWTUNNEL_H_ */ diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 47d24cb3fbc1..0d3d3cc43356 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -286,21 +286,6 @@ enum rt_class_t { /* Routing message attributes */ -enum ip_tunnel_t { - IP_TUN_UNSPEC, - IP_TUN_ID, - IP_TUN_DST, - IP_TUN_SRC, - IP_TUN_TTL, - IP_TUN_TOS, - IP_TUN_SPORT, - IP_TUN_DPORT, - IP_TUN_FLAGS, - __IP_TUN_MAX, -}; - -#define IP_TUN_MAX (__IP_TUN_MAX - 1) - enum rtattr_type_t { RTA_UNSPEC, RTA_DST, diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 5512f4e4ec1b..fd6319681c50 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -192,15 +192,15 @@ struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev, } EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64); -static const struct nla_policy ip_tun_policy[IP_TUN_MAX + 1] = { - [IP_TUN_ID] = { .type = NLA_U64 }, - [IP_TUN_DST] = { .type = NLA_U32 }, - [IP_TUN_SRC] = { .type = NLA_U32 }, - [IP_TUN_TTL] = { .type = NLA_U8 }, - [IP_TUN_TOS] = { .type = NLA_U8 }, - [IP_TUN_SPORT] = { .type = NLA_U16 }, - [IP_TUN_DPORT] = { .type = NLA_U16 }, - [IP_TUN_FLAGS] = { .type = NLA_U16 }, +static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { + [LWTUNNEL_IP_ID] = { .type = NLA_U64 }, + [LWTUNNEL_IP_DST] = { .type = NLA_U32 }, + [LWTUNNEL_IP_SRC] = { .type = NLA_U32 }, + [LWTUNNEL_IP_TTL] = { .type = NLA_U8 }, + [LWTUNNEL_IP_TOS] = { .type = NLA_U8 }, + [LWTUNNEL_IP_SPORT] = { .type = NLA_U16 }, + [LWTUNNEL_IP_DPORT] = { .type = NLA_U16 }, + [LWTUNNEL_IP_FLAGS] = { .type = NLA_U16 }, }; static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, @@ -208,10 +208,10 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, { struct ip_tunnel_info *tun_info; struct lwtunnel_state *new_state; - struct nlattr *tb[IP_TUN_MAX + 1]; + struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; int err; - err = nla_parse_nested(tb, IP_TUN_MAX, attr, ip_tun_policy); + err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy); if (err < 0) return err; @@ -223,29 +223,29 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, tun_info = lwt_tun_info(new_state); - if (tb[IP_TUN_ID]) - tun_info->key.tun_id = nla_get_u64(tb[IP_TUN_ID]); + if (tb[LWTUNNEL_IP_ID]) + tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP_ID]); - if (tb[IP_TUN_DST]) - tun_info->key.ipv4_dst = nla_get_be32(tb[IP_TUN_DST]); + if (tb[LWTUNNEL_IP_DST]) + tun_info->key.ipv4_dst = nla_get_be32(tb[LWTUNNEL_IP_DST]); - if (tb[IP_TUN_SRC]) - tun_info->key.ipv4_src = nla_get_be32(tb[IP_TUN_SRC]); + if (tb[LWTUNNEL_IP_SRC]) + tun_info->key.ipv4_src = nla_get_be32(tb[LWTUNNEL_IP_SRC]); - if (tb[IP_TUN_TTL]) - tun_info->key.ipv4_ttl = nla_get_u8(tb[IP_TUN_TTL]); + if (tb[LWTUNNEL_IP_TTL]) + tun_info->key.ipv4_ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]); - if (tb[IP_TUN_TOS]) - tun_info->key.ipv4_tos = nla_get_u8(tb[IP_TUN_TOS]); + if (tb[LWTUNNEL_IP_TOS]) + tun_info->key.ipv4_tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); - if (tb[IP_TUN_SPORT]) - tun_info->key.tp_src = nla_get_be16(tb[IP_TUN_SPORT]); + if (tb[LWTUNNEL_IP_SPORT]) + tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP_SPORT]); - if (tb[IP_TUN_DPORT]) - tun_info->key.tp_dst = nla_get_be16(tb[IP_TUN_DPORT]); + if (tb[LWTUNNEL_IP_DPORT]) + tun_info->key.tp_dst = nla_get_be16(tb[LWTUNNEL_IP_DPORT]); - if (tb[IP_TUN_FLAGS]) - tun_info->key.tun_flags = nla_get_u16(tb[IP_TUN_FLAGS]); + if (tb[LWTUNNEL_IP_FLAGS]) + tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP_FLAGS]); tun_info->mode = IP_TUNNEL_INFO_TX; tun_info->options = NULL; @@ -261,14 +261,14 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, { struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); - if (nla_put_u64(skb, IP_TUN_ID, tun_info->key.tun_id) || - nla_put_be32(skb, IP_TUN_DST, tun_info->key.ipv4_dst) || - nla_put_be32(skb, IP_TUN_SRC, tun_info->key.ipv4_src) || - nla_put_u8(skb, IP_TUN_TOS, tun_info->key.ipv4_tos) || - nla_put_u8(skb, IP_TUN_TTL, tun_info->key.ipv4_ttl) || - nla_put_u16(skb, IP_TUN_SPORT, tun_info->key.tp_src) || - nla_put_u16(skb, IP_TUN_DPORT, tun_info->key.tp_dst) || - nla_put_u16(skb, IP_TUN_FLAGS, tun_info->key.tun_flags)) + if (nla_put_u64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) || + nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.ipv4_dst) || + nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.ipv4_src) || + nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.ipv4_tos) || + nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ipv4_ttl) || + nla_put_u16(skb, LWTUNNEL_IP_SPORT, tun_info->key.tp_src) || + nla_put_u16(skb, LWTUNNEL_IP_DPORT, tun_info->key.tp_dst) || + nla_put_u16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) return -ENOMEM; return 0; @@ -276,14 +276,14 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) { - return nla_total_size(8) /* IP_TUN_ID */ - + nla_total_size(4) /* IP_TUN_DST */ - + nla_total_size(4) /* IP_TUN_SRC */ - + nla_total_size(1) /* IP_TUN_TOS */ - + nla_total_size(1) /* IP_TUN_TTL */ - + nla_total_size(2) /* IP_TUN_SPORT */ - + nla_total_size(2) /* IP_TUN_DPORT */ - + nla_total_size(2); /* IP_TUN_FLAGS */ + return nla_total_size(8) /* LWTUNNEL_IP_ID */ + + nla_total_size(4) /* LWTUNNEL_IP_DST */ + + nla_total_size(4) /* LWTUNNEL_IP_SRC */ + + nla_total_size(1) /* LWTUNNEL_IP_TOS */ + + nla_total_size(1) /* LWTUNNEL_IP_TTL */ + + nla_total_size(2) /* LWTUNNEL_IP_SPORT */ + + nla_total_size(2) /* LWTUNNEL_IP_DPORT */ + + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */ } static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { -- cgit v1.2.3 From 47dceb8ecdc1c3ad1818dfea3d659a05b74c3fc2 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Aug 2015 22:31:34 -0400 Subject: packet: add classic BPF fanout mode Add fanout mode PACKET_FANOUT_CBPF that accepts a classic BPF program to select a socket. This avoids having to keep adding special case fanout modes. One example use case is application layer load balancing. The QUIC protocol, for instance, encodes a connection ID in UDP payload. Also add socket option SOL_PACKET/PACKET_FANOUT_DATA that updates data associated with the socket group. Fanout mode PACKET_FANOUT_CBPF is the only user so far. Signed-off-by: Willem de Bruijn Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/uapi/linux/if_packet.h | 2 + net/packet/af_packet.c | 99 +++++++++++++++++++++++++++++++++++++++++- net/packet/internal.h | 5 ++- 3 files changed, 104 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index d3d715f8c88f..a4bb16fa822e 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -55,6 +55,7 @@ struct sockaddr_ll { #define PACKET_TX_HAS_OFF 19 #define PACKET_QDISC_BYPASS 20 #define PACKET_ROLLOVER_STATS 21 +#define PACKET_FANOUT_DATA 22 #define PACKET_FANOUT_HASH 0 #define PACKET_FANOUT_LB 1 @@ -62,6 +63,7 @@ struct sockaddr_ll { #define PACKET_FANOUT_ROLLOVER 3 #define PACKET_FANOUT_RND 4 #define PACKET_FANOUT_QM 5 +#define PACKET_FANOUT_CBPF 6 #define PACKET_FANOUT_FLAG_ROLLOVER 0x1000 #define PACKET_FANOUT_FLAG_DEFRAG 0x8000 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index b5afe538bb88..8869d07013e6 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -92,6 +92,7 @@ #ifdef CONFIG_INET #include #endif +#include #include "internal.h" @@ -1410,6 +1411,22 @@ static unsigned int fanout_demux_qm(struct packet_fanout *f, return skb_get_queue_mapping(skb) % num; } +static unsigned int fanout_demux_bpf(struct packet_fanout *f, + struct sk_buff *skb, + unsigned int num) +{ + struct bpf_prog *prog; + unsigned int ret = 0; + + rcu_read_lock(); + prog = rcu_dereference(f->bpf_prog); + if (prog) + ret = BPF_PROG_RUN(prog, skb) % num; + rcu_read_unlock(); + + return ret; +} + static bool fanout_has_flag(struct packet_fanout *f, u16 flag) { return f->flags & (flag >> 8); @@ -1454,6 +1471,9 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, case PACKET_FANOUT_ROLLOVER: idx = fanout_demux_rollover(f, skb, 0, false, num); break; + case PACKET_FANOUT_CBPF: + idx = fanout_demux_bpf(f, skb, num); + break; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) @@ -1502,6 +1522,74 @@ static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) return false; } +static void fanout_init_data(struct packet_fanout *f) +{ + switch (f->type) { + case PACKET_FANOUT_LB: + atomic_set(&f->rr_cur, 0); + break; + case PACKET_FANOUT_CBPF: + RCU_INIT_POINTER(f->bpf_prog, NULL); + break; + } +} + +static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new) +{ + struct bpf_prog *old; + + spin_lock(&f->lock); + old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock)); + rcu_assign_pointer(f->bpf_prog, new); + spin_unlock(&f->lock); + + if (old) { + synchronize_net(); + bpf_prog_destroy(old); + } +} + +static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data, + unsigned int len) +{ + struct bpf_prog *new; + struct sock_fprog fprog; + int ret; + + if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) + return -EPERM; + if (len != sizeof(fprog)) + return -EINVAL; + if (copy_from_user(&fprog, data, len)) + return -EFAULT; + + ret = bpf_prog_create_from_user(&new, &fprog, NULL); + if (ret) + return ret; + + __fanout_set_data_bpf(po->fanout, new); + return 0; +} + +static int fanout_set_data(struct packet_sock *po, char __user *data, + unsigned int len) +{ + switch (po->fanout->type) { + case PACKET_FANOUT_CBPF: + return fanout_set_data_cbpf(po, data, len); + default: + return -EINVAL; + }; +} + +static void fanout_release_data(struct packet_fanout *f) +{ + switch (f->type) { + case PACKET_FANOUT_CBPF: + __fanout_set_data_bpf(f, NULL); + }; +} + static int fanout_add(struct sock *sk, u16 id, u16 type_flags) { struct packet_sock *po = pkt_sk(sk); @@ -1519,6 +1607,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) case PACKET_FANOUT_CPU: case PACKET_FANOUT_RND: case PACKET_FANOUT_QM: + case PACKET_FANOUT_CBPF: break; default: return -EINVAL; @@ -1561,10 +1650,10 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) match->id = id; match->type = type; match->flags = flags; - atomic_set(&match->rr_cur, 0); INIT_LIST_HEAD(&match->list); spin_lock_init(&match->lock); atomic_set(&match->sk_ref, 0); + fanout_init_data(match); match->prot_hook.type = po->prot_hook.type; match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; @@ -1610,6 +1699,7 @@ static void fanout_release(struct sock *sk) if (atomic_dec_and_test(&f->sk_ref)) { list_del(&f->list); dev_remove_pack(&f->prot_hook); + fanout_release_data(f); kfree(f); } mutex_unlock(&fanout_mutex); @@ -3529,6 +3619,13 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv return fanout_add(sk, val & 0xffff, val >> 16); } + case PACKET_FANOUT_DATA: + { + if (!po->fanout) + return -EINVAL; + + return fanout_set_data(po, optval, optlen); + } case PACKET_TX_HAS_OFF: { unsigned int val; diff --git a/net/packet/internal.h b/net/packet/internal.h index e20b3e8829b8..9ee46314b7d7 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -79,7 +79,10 @@ struct packet_fanout { u16 id; u8 type; u8 flags; - atomic_t rr_cur; + union { + atomic_t rr_cur; + struct bpf_prog __rcu *bpf_prog; + }; struct list_head list; struct sock *arr[PACKET_FANOUT_MAX]; spinlock_t lock; -- cgit v1.2.3 From f2e520956a1ab636698f8160194c9b8ac0989aab Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 14 Aug 2015 22:31:35 -0400 Subject: packet: add extended BPF fanout mode Add fanout mode PACKET_FANOUT_EBPF that accepts an en extended BPF program to select a socket. Update the internal eBPF program by passing to socket option SOL_PACKET/PACKET_FANOUT_DATA a file descriptor returned by bpf(). Signed-off-by: Willem de Bruijn Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/if_packet.h | 1 + net/packet/af_packet.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index a4bb16fa822e..9e7edfd8141e 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -64,6 +64,7 @@ struct sockaddr_ll { #define PACKET_FANOUT_RND 4 #define PACKET_FANOUT_QM 5 #define PACKET_FANOUT_CBPF 6 +#define PACKET_FANOUT_EBPF 7 #define PACKET_FANOUT_FLAG_ROLLOVER 0x1000 #define PACKET_FANOUT_FLAG_DEFRAG 0x8000 diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8869d07013e6..7b8e39a22387 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1472,6 +1472,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, idx = fanout_demux_rollover(f, skb, 0, false, num); break; case PACKET_FANOUT_CBPF: + case PACKET_FANOUT_EBPF: idx = fanout_demux_bpf(f, skb, num); break; } @@ -1529,6 +1530,7 @@ static void fanout_init_data(struct packet_fanout *f) atomic_set(&f->rr_cur, 0); break; case PACKET_FANOUT_CBPF: + case PACKET_FANOUT_EBPF: RCU_INIT_POINTER(f->bpf_prog, NULL); break; } @@ -1571,12 +1573,39 @@ static int fanout_set_data_cbpf(struct packet_sock *po, char __user *data, return 0; } +static int fanout_set_data_ebpf(struct packet_sock *po, char __user *data, + unsigned int len) +{ + struct bpf_prog *new; + u32 fd; + + if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) + return -EPERM; + if (len != sizeof(fd)) + return -EINVAL; + if (copy_from_user(&fd, data, len)) + return -EFAULT; + + new = bpf_prog_get(fd); + if (IS_ERR(new)) + return PTR_ERR(new); + if (new->type != BPF_PROG_TYPE_SOCKET_FILTER) { + bpf_prog_put(new); + return -EINVAL; + } + + __fanout_set_data_bpf(po->fanout, new); + return 0; +} + static int fanout_set_data(struct packet_sock *po, char __user *data, unsigned int len) { switch (po->fanout->type) { case PACKET_FANOUT_CBPF: return fanout_set_data_cbpf(po, data, len); + case PACKET_FANOUT_EBPF: + return fanout_set_data_ebpf(po, data, len); default: return -EINVAL; }; @@ -1586,6 +1615,7 @@ static void fanout_release_data(struct packet_fanout *f) { switch (f->type) { case PACKET_FANOUT_CBPF: + case PACKET_FANOUT_EBPF: __fanout_set_data_bpf(f, NULL); }; } @@ -1608,6 +1638,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) case PACKET_FANOUT_RND: case PACKET_FANOUT_QM: case PACKET_FANOUT_CBPF: + case PACKET_FANOUT_EBPF: break; default: return -EINVAL; -- cgit v1.2.3 From ec120da6f0fe59f175c2a8faa0a7700280c39644 Mon Sep 17 00:00:00 2001 From: Ian Morris Date: Fri, 14 Aug 2015 22:43:38 +0100 Subject: ipv6: trivial whitespace fix Change brace placement to be in line with coding standards Signed-off-by: Ian Morris Signed-off-by: David S. Miller --- net/ipv6/udp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e51fc3eee6db..0aba654f5b91 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1496,7 +1496,8 @@ int __net_init udp6_proc_init(struct net *net) return udp_proc_register(net, &udp6_seq_afinfo); } -void udp6_proc_exit(struct net *net) { +void udp6_proc_exit(struct net *net) +{ udp_proc_unregister(net, &udp6_seq_afinfo); } #endif /* CONFIG_PROC_FS */ -- cgit v1.2.3 From 2ea273d76a4869c097c6b1a1070162add2f66a6e Mon Sep 17 00:00:00 2001 From: David S. Miller Date: Mon, 17 Aug 2015 14:37:06 -0700 Subject: net: Export bpf_prog_create_from_user(). Signed-off-by: David S. Miller --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index a50dbfa83ad9..f8184222465e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1124,6 +1124,7 @@ int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, *pfp = fp; return 0; } +EXPORT_SYMBOL_GPL(bpf_prog_create_from_user); void bpf_prog_destroy(struct bpf_prog *fp) { -- cgit v1.2.3 From dc028da54ed353edd44dca88b7eb19fd5126c354 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 16 Aug 2015 17:13:27 -0600 Subject: inet: Move VRF table lookup to inlined function Table lookup compiles out when VRF is not enabled. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/vrf.h | 24 ++++++++++++++++++++++++ net/ipv4/af_inet.c | 10 +--------- 2 files changed, 25 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/vrf.h b/include/net/vrf.h index 0484d29d4589..40e3793c7a05 100644 --- a/include/net/vrf.h +++ b/include/net/vrf.h @@ -81,6 +81,25 @@ static inline int vrf_dev_table(const struct net_device *dev) return tb_id; } +static inline int vrf_dev_table_ifindex(struct net *net, int ifindex) +{ + struct net_device *dev; + int tb_id = 0; + + if (!ifindex) + return 0; + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, ifindex); + if (dev) + tb_id = vrf_dev_table_rcu(dev); + + rcu_read_unlock(); + + return tb_id; +} + /* called with rtnl */ static inline int vrf_dev_table_rtnl(const struct net_device *dev) { @@ -125,6 +144,11 @@ static inline int vrf_dev_table(const struct net_device *dev) return 0; } +static inline int vrf_dev_table_ifindex(struct net *net, int ifindex) +{ + return 0; +} + static inline int vrf_dev_table_rtnl(const struct net_device *dev) { return 0; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c8b855882fa5..675e88cac2b4 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -450,15 +450,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } - if (sk->sk_bound_dev_if) { - struct net_device *dev; - - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); - if (dev) - tb_id = vrf_dev_table_rcu(dev) ? : tb_id; - rcu_read_unlock(); - } + tb_id = vrf_dev_table_ifindex(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); /* Not specified by any standard per-se, however it breaks too -- cgit v1.2.3 From deedb59039f111c41aa5a54ee384c8e7c08bc78a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 14 Aug 2015 16:03:39 +0200 Subject: netfilter: nf_conntrack: add direction support for zones This work adds a direction parameter to netfilter zones, so identity separation can be performed only in original/reply or both directions (default). This basically opens up the possibility of doing NAT with conflicting IP address/port tuples from multiple, isolated tenants on a host (e.g. from a netns) without requiring each tenant to NAT twice resp. to use its own dedicated IP address to SNAT to, meaning overlapping tuples can be made unique with the zone identifier in original direction, where the NAT engine will then allocate a unique tuple in the commonly shared default zone for the reply direction. In some restricted, local DNAT cases, also port redirection could be used for making the reply traffic unique w/o requiring SNAT. The consensus we've reached and discussed at NFWS and since the initial implementation [1] was to directly integrate the direction meta data into the existing zones infrastructure, as opposed to the ct->mark approach we proposed initially. As we pass the nf_conntrack_zone object directly around, we don't have to touch all call-sites, but only those, that contain equality checks of zones. Thus, based on the current direction (original or reply), we either return the actual id, or the default NF_CT_DEFAULT_ZONE_ID. CT expectations are direction-agnostic entities when expectations are being compared among themselves, so we can only use the identifier in this case. Note that zone identifiers can not be included into the hash mix anymore as they don't contain a "stable" value that would be equal for both directions at all times, f.e. if only zone->id would unconditionally be xor'ed into the table slot hash, then replies won't find the corresponding conntracking entry anymore. If no particular direction is specified when configuring zones, the behaviour is exactly as we expect currently (both directions). Support has been added for the CT netlink interface as well as the x_tables raw CT target, which both already offer existing interfaces to user space for the configuration of zones. Below a minimal, simplified collision example (script in [2]) with netperf sessions: +--- tenant-1 ---+ mark := 1 | netperf |--+ +----------------+ | CT zone := mark [ORIGINAL] [ip,sport] := X +--------------+ +--- gateway ---+ | mark routing |--| SNAT |-- ... + +--------------+ +---------------+ | +--- tenant-2 ---+ | ~~~|~~~ | netperf |--+ +-----------+ | +----------------+ mark := 2 | netserver |------ ... + [ip,sport] := X +-----------+ [ip,port] := Y On the gateway netns, example: iptables -t raw -A PREROUTING -j CT --zone mark --zone-dir ORIGINAL iptables -t nat -A POSTROUTING -o -j SNAT --to-source --random-fully iptables -t mangle -A PREROUTING -m conntrack --ctdir ORIGINAL -j CONNMARK --save-mark iptables -t mangle -A POSTROUTING -m conntrack --ctdir REPLY -j CONNMARK --restore-mark conntrack dump from gateway netns: netperf -H 10.1.1.2 -t TCP_STREAM -l60 -p12865,5555 from each tenant netns tcp 6 431995 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=5555 dport=12865 zone-orig=1 src=10.1.1.2 dst=10.1.1.1 sport=12865 dport=1024 [ASSURED] mark=1 secctx=system_u:object_r:unlabeled_t:s0 use=1 tcp 6 431994 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=5555 dport=12865 zone-orig=2 src=10.1.1.2 dst=10.1.1.1 sport=12865 dport=5555 [ASSURED] mark=2 secctx=system_u:object_r:unlabeled_t:s0 use=1 tcp 6 299 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=39438 dport=33768 zone-orig=1 src=10.1.1.2 dst=10.1.1.1 sport=33768 dport=39438 [ASSURED] mark=1 secctx=system_u:object_r:unlabeled_t:s0 use=1 tcp 6 300 ESTABLISHED src=40.1.1.1 dst=10.1.1.2 sport=32889 dport=40206 zone-orig=2 src=10.1.1.2 dst=10.1.1.1 sport=40206 dport=32889 [ASSURED] mark=2 secctx=system_u:object_r:unlabeled_t:s0 use=2 Taking this further, test script in [2] creates 200 tenants and runs original-tuple colliding netperf sessions each. A conntrack -L dump in the gateway netns also confirms 200 overlapping entries, all in ESTABLISHED state as expected. I also did run various other tests with some permutations of the script, to mention some: SNAT in random/random-fully/persistent mode, no zones (no overlaps), static zones (original, reply, both directions), etc. [1] http://thread.gmane.org/gmane.comp.security.firewalls.netfilter.devel/57412/ [2] https://paste.fedoraproject.org/242835/65657871/ Signed-off-by: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_zones.h | 31 +++- include/uapi/linux/netfilter/nfnetlink_conntrack.h | 1 + include/uapi/linux/netfilter/xt_CT.h | 6 +- net/ipv4/netfilter/nf_defrag_ipv4.c | 8 +- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 8 +- net/netfilter/nf_conntrack_core.c | 53 +++--- net/netfilter/nf_conntrack_expect.c | 8 +- net/netfilter/nf_conntrack_netlink.c | 177 +++++++++++++++------ net/netfilter/nf_conntrack_standalone.c | 30 +++- net/netfilter/nf_nat_core.c | 13 +- net/netfilter/xt_CT.c | 17 +- net/sched/act_connmark.c | 1 + 12 files changed, 259 insertions(+), 94 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h index 0788bb0f267d..3942ddf0d4ff 100644 --- a/include/net/netfilter/nf_conntrack_zones.h +++ b/include/net/netfilter/nf_conntrack_zones.h @@ -1,10 +1,18 @@ #ifndef _NF_CONNTRACK_ZONES_H #define _NF_CONNTRACK_ZONES_H +#include + #define NF_CT_DEFAULT_ZONE_ID 0 +#define NF_CT_ZONE_DIR_ORIG (1 << IP_CT_DIR_ORIGINAL) +#define NF_CT_ZONE_DIR_REPL (1 << IP_CT_DIR_REPLY) + +#define NF_CT_DEFAULT_ZONE_DIR (NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL) + struct nf_conntrack_zone { u16 id; + u16 dir; }; extern const struct nf_conntrack_zone nf_ct_zone_dflt; @@ -29,8 +37,29 @@ nf_ct_zone_tmpl(const struct nf_conn *tmpl) return tmpl ? nf_ct_zone(tmpl) : &nf_ct_zone_dflt; } +static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone, + enum ip_conntrack_dir dir) +{ + return zone->dir & (1 << dir); +} + +static inline u16 nf_ct_zone_id(const struct nf_conntrack_zone *zone, + enum ip_conntrack_dir dir) +{ + return nf_ct_zone_matches_dir(zone, dir) ? + zone->id : NF_CT_DEFAULT_ZONE_ID; +} + static inline bool nf_ct_zone_equal(const struct nf_conn *a, - const struct nf_conntrack_zone *b) + const struct nf_conntrack_zone *b, + enum ip_conntrack_dir dir) +{ + return nf_ct_zone_id(nf_ct_zone(a), dir) == + nf_ct_zone_id(b, dir); +} + +static inline bool nf_ct_zone_equal_any(const struct nf_conn *a, + const struct nf_conntrack_zone *b) { return nf_ct_zone(a)->id == b->id; } diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index acad6c52a652..c1a4e1441a25 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -61,6 +61,7 @@ enum ctattr_tuple { CTA_TUPLE_UNSPEC, CTA_TUPLE_IP, CTA_TUPLE_PROTO, + CTA_TUPLE_ZONE, __CTA_TUPLE_MAX }; #define CTA_TUPLE_MAX (__CTA_TUPLE_MAX - 1) diff --git a/include/uapi/linux/netfilter/xt_CT.h b/include/uapi/linux/netfilter/xt_CT.h index 5a688c1ca4d7..452005ff0e9e 100644 --- a/include/uapi/linux/netfilter/xt_CT.h +++ b/include/uapi/linux/netfilter/xt_CT.h @@ -6,7 +6,11 @@ enum { XT_CT_NOTRACK = 1 << 0, XT_CT_NOTRACK_ALIAS = 1 << 1, - XT_CT_MASK = XT_CT_NOTRACK | XT_CT_NOTRACK_ALIAS, + XT_CT_ZONE_DIR_ORIG = 1 << 2, + XT_CT_ZONE_DIR_REPL = 1 << 3, + + XT_CT_MASK = XT_CT_NOTRACK | XT_CT_NOTRACK_ALIAS | + XT_CT_ZONE_DIR_ORIG | XT_CT_ZONE_DIR_REPL, }; struct xt_ct_target_info { diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 20fe8e67c09b..9306ec4fab41 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -45,8 +45,12 @@ static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, { u16 zone_id = NF_CT_DEFAULT_ZONE_ID; #if IS_ENABLED(CONFIG_NF_CONNTRACK) - if (skb->nfct) - zone_id = nf_ct_zone((struct nf_conn *)skb->nfct)->id; + if (skb->nfct) { + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); + } #endif if (nf_bridge_in_prerouting(skb)) return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id; diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index 9d3de9b74856..6d9c0b3d5b8c 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -35,8 +35,12 @@ static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, { u16 zone_id = NF_CT_DEFAULT_ZONE_ID; #if IS_ENABLED(CONFIG_NF_CONNTRACK) - if (skb->nfct) - zone_id = nf_ct_zone((struct nf_conn *)skb->nfct)->id; + if (skb->nfct) { + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + + zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); + } #endif if (nf_bridge_in_prerouting(skb)) return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0bb26e84f849..acc06222ce6a 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -126,8 +126,7 @@ EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); unsigned int nf_conntrack_hash_rnd __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd); -static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple) { unsigned int n; @@ -136,7 +135,7 @@ static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, * three bytes manually. */ n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); - return jhash2((u32 *)tuple, n, zone->id ^ nf_conntrack_hash_rnd ^ + return jhash2((u32 *)tuple, n, nf_conntrack_hash_rnd ^ (((__force __u16)tuple->dst.u.all << 16) | tuple->dst.protonum)); } @@ -152,17 +151,15 @@ static u32 hash_bucket(u32 hash, const struct net *net) } static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone, unsigned int size) { - return __hash_bucket(hash_conntrack_raw(tuple, zone), size); + return __hash_bucket(hash_conntrack_raw(tuple), size); } static inline u_int32_t hash_conntrack(const struct net *net, - const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { - return __hash_conntrack(tuple, zone, net->ct.htable_size); + return __hash_conntrack(tuple, net->ct.htable_size); } bool @@ -312,6 +309,7 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, if (!nf_ct_zone) goto out_free; nf_ct_zone->id = zone->id; + nf_ct_zone->dir = zone->dir; } #endif atomic_set(&tmpl->ct_general.use, 0); @@ -376,20 +374,18 @@ destroy_conntrack(struct nf_conntrack *nfct) static void nf_ct_delete_from_lists(struct nf_conn *ct) { - const struct nf_conntrack_zone *zone; struct net *net = nf_ct_net(ct); unsigned int hash, reply_hash; unsigned int sequence; - zone = nf_ct_zone(ct); nf_ct_helper_destroy(ct); local_bh_disable(); do { sequence = read_seqcount_begin(&net->ct.generation); - hash = hash_conntrack(net, zone, + hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - reply_hash = hash_conntrack(net, zone, + reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); @@ -446,7 +442,7 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, * so we need to check that the conntrack is confirmed */ return nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone_equal(ct, zone) && + nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) && nf_ct_is_confirmed(ct); } @@ -523,7 +519,7 @@ nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { return __nf_conntrack_find_get(net, zone, tuple, - hash_conntrack_raw(tuple, zone)); + hash_conntrack_raw(tuple)); } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -554,9 +550,9 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) local_bh_disable(); do { sequence = read_seqcount_begin(&net->ct.generation); - hash = hash_conntrack(net, zone, + hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - reply_hash = hash_conntrack(net, zone, + reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); @@ -564,12 +560,14 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone)) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, + NF_CT_DIRECTION(h))) goto out; hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone)) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, + NF_CT_DIRECTION(h))) goto out; add_timer(&ct->timeout); @@ -623,7 +621,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* reuse the hash saved before */ hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; hash = hash_bucket(hash, net); - reply_hash = hash_conntrack(net, zone, + reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); @@ -655,12 +653,14 @@ __nf_conntrack_confirm(struct sk_buff *skb) hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone)) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, + NF_CT_DIRECTION(h))) goto out; hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone)) + nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, + NF_CT_DIRECTION(h))) goto out; /* Timer relative to confirmation time, not original @@ -720,7 +720,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, unsigned int hash; zone = nf_ct_zone(ignored_conntrack); - hash = hash_conntrack(net, zone, tuple); + hash = hash_conntrack(net, tuple); /* Disable BHs the entire time since we need to disable them at * least once for the stats anyway. @@ -730,7 +730,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, ct = nf_ct_tuplehash_to_ctrack(h); if (ct != ignored_conntrack && nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone_equal(ct, zone)) { + nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h))) { NF_CT_STAT_INC(net, found); rcu_read_unlock_bh(); return 1; @@ -830,7 +830,7 @@ __nf_conntrack_alloc(struct net *net, if (unlikely(!nf_conntrack_hash_rnd)) { init_nf_conntrack_hash_rnd(); /* recompute the hash as nf_conntrack_hash_rnd is initialized */ - hash = hash_conntrack_raw(orig, zone); + hash = hash_conntrack_raw(orig); } /* We don't want any race condition at early drop stage */ @@ -875,6 +875,7 @@ __nf_conntrack_alloc(struct net *net, if (!nf_ct_zone) goto out_free; nf_ct_zone->id = zone->id; + nf_ct_zone->dir = zone->dir; } #endif /* Because we use RCU lookups, we set ct_general.use to zero before @@ -1053,7 +1054,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl); - hash = hash_conntrack_raw(&tuple, zone); + hash = hash_conntrack_raw(&tuple); h = __nf_conntrack_find_get(net, zone, &tuple, hash); if (!h) { h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, @@ -1306,6 +1307,7 @@ EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); /* Built-in default zone used e.g. by modules. */ const struct nf_conntrack_zone nf_ct_zone_dflt = { .id = NF_CT_DEFAULT_ZONE_ID, + .dir = NF_CT_DEFAULT_ZONE_DIR, }; EXPORT_SYMBOL_GPL(nf_ct_zone_dflt); @@ -1617,8 +1619,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) struct nf_conntrack_tuple_hash, hnnode); ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&h->hnnode); - bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct), - hashsize); + bucket = __hash_conntrack(&h->tuple, hashsize); hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 980db854c3c8..acf5c7b3f378 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -101,7 +101,7 @@ __nf_ct_expect_find(struct net *net, h = nf_ct_expect_dst_hash(tuple); hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) { if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone_equal(i->master, zone)) + nf_ct_zone_equal_any(i->master, zone)) return i; } return NULL; @@ -143,7 +143,7 @@ nf_ct_find_expectation(struct net *net, hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone_equal(i->master, zone)) { + nf_ct_zone_equal_any(i->master, zone)) { exp = i; break; } @@ -223,7 +223,7 @@ static inline int expect_clash(const struct nf_conntrack_expect *a, } return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) && - nf_ct_zone_equal(a->master, nf_ct_zone(b->master)); + nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } static inline int expect_matches(const struct nf_conntrack_expect *a, @@ -232,7 +232,7 @@ static inline int expect_matches(const struct nf_conntrack_expect *a, return a->master == b->master && a->class == b->class && nf_ct_tuple_equal(&a->tuple, &b->tuple) && nf_ct_tuple_mask_equal(&a->mask, &b->mask) && - nf_ct_zone_equal(a->master, nf_ct_zone(b->master)); + nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } /* Generally a bad idea to call this: could have matched already. */ diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 95f7f01e253d..4eaf925bead4 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -127,6 +127,20 @@ ctnetlink_dump_tuples(struct sk_buff *skb, return ret; } +static inline int +ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype, + const struct nf_conntrack_zone *zone, int dir) +{ + if (zone->id == NF_CT_DEFAULT_ZONE_ID || zone->dir != dir) + return 0; + if (nla_put_be16(skb, attrtype, htons(zone->id))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + static inline int ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) { @@ -474,11 +488,16 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, nfmsg->version = NFNETLINK_V0; nfmsg->res_id = 0; + zone = nf_ct_zone(ct); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_ORIG) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); @@ -486,11 +505,13 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_REPL) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); - zone = nf_ct_zone(ct); - if (zone->id != NF_CT_DEFAULT_ZONE_ID && - nla_put_be16(skb, CTA_ZONE, htons(zone->id))) + if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, + NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; if (ctnetlink_dump_status(skb, ct) < 0 || @@ -600,7 +621,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif #ifdef CONFIG_NF_CONNTRACK_ZONES - + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */ #endif + ctnetlink_proto_size(ct) + ctnetlink_label_size(ct) @@ -658,11 +679,16 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) nfmsg->res_id = 0; rcu_read_lock(); + zone = nf_ct_zone(ct); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_ORIG) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); @@ -670,11 +696,13 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_REPL) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); - zone = nf_ct_zone(ct); - if (zone->id != NF_CT_DEFAULT_ZONE_ID && - nla_put_be16(skb, CTA_ZONE, htons(zone->id))) + if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, + NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; if (ctnetlink_dump_id(skb, ct) < 0) @@ -924,15 +952,55 @@ ctnetlink_parse_tuple_proto(struct nlattr *attr, return ret; } +static int +ctnetlink_parse_zone(const struct nlattr *attr, + struct nf_conntrack_zone *zone) +{ + zone->id = NF_CT_DEFAULT_ZONE_ID; + zone->dir = NF_CT_DEFAULT_ZONE_DIR; + +#ifdef CONFIG_NF_CONNTRACK_ZONES + if (attr) + zone->id = ntohs(nla_get_be16(attr)); +#else + if (attr) + return -EOPNOTSUPP; +#endif + return 0; +} + +static int +ctnetlink_parse_tuple_zone(struct nlattr *attr, enum ctattr_type type, + struct nf_conntrack_zone *zone) +{ + int ret; + + if (zone->id != NF_CT_DEFAULT_ZONE_ID) + return -EINVAL; + + ret = ctnetlink_parse_zone(attr, zone); + if (ret < 0) + return ret; + + if (type == CTA_TUPLE_REPLY) + zone->dir = NF_CT_ZONE_DIR_REPL; + else + zone->dir = NF_CT_ZONE_DIR_ORIG; + + return 0; +} + static const struct nla_policy tuple_nla_policy[CTA_TUPLE_MAX+1] = { [CTA_TUPLE_IP] = { .type = NLA_NESTED }, [CTA_TUPLE_PROTO] = { .type = NLA_NESTED }, + [CTA_TUPLE_ZONE] = { .type = NLA_U16 }, }; static int ctnetlink_parse_tuple(const struct nlattr * const cda[], struct nf_conntrack_tuple *tuple, - enum ctattr_type type, u_int8_t l3num) + enum ctattr_type type, u_int8_t l3num, + struct nf_conntrack_zone *zone) { struct nlattr *tb[CTA_TUPLE_MAX+1]; int err; @@ -959,6 +1027,16 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], if (err < 0) return err; + if (tb[CTA_TUPLE_ZONE]) { + if (!zone) + return -EINVAL; + + err = ctnetlink_parse_tuple_zone(tb[CTA_TUPLE_ZONE], + type, zone); + if (err < 0) + return err; + } + /* orig and expect tuples get DIR_ORIGINAL */ if (type == CTA_TUPLE_REPLY) tuple->dst.dir = IP_CT_DIR_REPLY; @@ -968,22 +1046,6 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], return 0; } -static int -ctnetlink_parse_zone(const struct nlattr *attr, - struct nf_conntrack_zone *zone) -{ - zone->id = NF_CT_DEFAULT_ZONE_ID; - -#ifdef CONFIG_NF_CONNTRACK_ZONES - if (attr) - zone->id = ntohs(nla_get_be16(attr)); -#else - if (attr) - return -EOPNOTSUPP; -#endif - return 0; -} - static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { [CTA_HELP_NAME] = { .type = NLA_NUL_STRING, .len = NF_CT_HELPER_NAME_LEN - 1 }, @@ -1071,9 +1133,11 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, return err; if (cda[CTA_TUPLE_ORIG]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, + u3, &zone); else if (cda[CTA_TUPLE_REPLY]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, + u3, &zone); else { return ctnetlink_flush_conntrack(net, cda, NETLINK_CB(skb).portid, @@ -1143,9 +1207,11 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, return err; if (cda[CTA_TUPLE_ORIG]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG, + u3, &zone); else if (cda[CTA_TUPLE_REPLY]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, + u3, &zone); else return -EINVAL; @@ -1767,7 +1833,8 @@ ctnetlink_create_conntrack(struct net *net, struct nf_conntrack_tuple_hash *master_h; struct nf_conn *master_ct; - err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, u3); + err = ctnetlink_parse_tuple(cda, &master, CTA_TUPLE_MASTER, + u3, NULL); if (err < 0) goto err2; @@ -1818,13 +1885,15 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, return err; if (cda[CTA_TUPLE_ORIG]) { - err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, u3); + err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG, + u3, &zone); if (err < 0) return err; } if (cda[CTA_TUPLE_REPLY]) { - err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, u3); + err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY, + u3, &zone); if (err < 0) return err; } @@ -2088,7 +2157,7 @@ ctnetlink_nfqueue_build_size(const struct nf_conn *ct) + nla_total_size(sizeof(u_int32_t)) /* CTA_MARK */ #endif #ifdef CONFIG_NF_CONNTRACK_ZONES - + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE */ + + nla_total_size(sizeof(u_int16_t)) /* CTA_ZONE|CTA_TUPLE_ZONE */ #endif + ctnetlink_proto_size(ct) ; @@ -2101,11 +2170,16 @@ ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct) struct nlattr *nest_parms; rcu_read_lock(); + zone = nf_ct_zone(ct); + nest_parms = nla_nest_start(skb, CTA_TUPLE_ORIG | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_ORIG) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); nest_parms = nla_nest_start(skb, CTA_TUPLE_REPLY | NLA_F_NESTED); @@ -2113,11 +2187,13 @@ ctnetlink_nfqueue_build(struct sk_buff *skb, struct nf_conn *ct) goto nla_put_failure; if (ctnetlink_dump_tuples(skb, nf_ct_tuple(ct, IP_CT_DIR_REPLY)) < 0) goto nla_put_failure; + if (ctnetlink_dump_zone_id(skb, CTA_TUPLE_ZONE, zone, + NF_CT_ZONE_DIR_REPL) < 0) + goto nla_put_failure; nla_nest_end(skb, nest_parms); - zone = nf_ct_zone(ct); - if (zone->id != NF_CT_DEFAULT_ZONE_ID && - nla_put_be16(skb, CTA_ZONE, htons(zone->id))) + if (ctnetlink_dump_zone_id(skb, CTA_ZONE, zone, + NF_CT_DEFAULT_ZONE_DIR) < 0) goto nla_put_failure; if (ctnetlink_dump_id(skb, ct) < 0) @@ -2225,12 +2301,12 @@ static int ctnetlink_nfqueue_exp_parse(const struct nlattr * const *cda, int err; err = ctnetlink_parse_tuple(cda, tuple, CTA_EXPECT_TUPLE, - nf_ct_l3num(ct)); + nf_ct_l3num(ct), NULL); if (err < 0) return err; return ctnetlink_parse_tuple(cda, mask, CTA_EXPECT_MASK, - nf_ct_l3num(ct)); + nf_ct_l3num(ct), NULL); } static int @@ -2625,7 +2701,8 @@ static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb, .done = ctnetlink_exp_done, }; - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, + u3, NULL); if (err < 0) return err; @@ -2677,9 +2754,11 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, return err; if (cda[CTA_EXPECT_TUPLE]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, + u3, NULL); else if (cda[CTA_EXPECT_MASTER]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, + u3, NULL); else return -EINVAL; @@ -2747,7 +2826,8 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, + u3, NULL); if (err < 0) return err; @@ -2854,7 +2934,8 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr, return -EINVAL; err = ctnetlink_parse_tuple((const struct nlattr * const *)tb, - &nat_tuple, CTA_EXPECT_NAT_TUPLE, u3); + &nat_tuple, CTA_EXPECT_NAT_TUPLE, + u3, NULL); if (err < 0) return err; @@ -2955,13 +3036,16 @@ ctnetlink_create_expect(struct net *net, int err; /* caller guarantees that those three CTA_EXPECT_* exist */ - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, + u3, NULL); if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, u3); + err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK, + u3, NULL); if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, u3); + err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER, + u3, NULL); if (err < 0) return err; @@ -3029,7 +3113,8 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, u3); + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE, + u3, NULL); if (err < 0) return err; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 28c8b2b982ec..1fb3cacc04e1 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -141,12 +141,30 @@ static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct) #endif #ifdef CONFIG_NF_CONNTRACK_ZONES -static void ct_show_zone(struct seq_file *s, const struct nf_conn *ct) +static void ct_show_zone(struct seq_file *s, const struct nf_conn *ct, + int dir) { - seq_printf(s, "zone=%u ", nf_ct_zone(ct)->id); + const struct nf_conntrack_zone *zone = nf_ct_zone(ct); + + if (zone->dir != dir) + return; + switch (zone->dir) { + case NF_CT_DEFAULT_ZONE_DIR: + seq_printf(s, "zone=%u ", zone->id); + break; + case NF_CT_ZONE_DIR_ORIG: + seq_printf(s, "zone-orig=%u ", zone->id); + break; + case NF_CT_ZONE_DIR_REPL: + seq_printf(s, "zone-reply=%u ", zone->id); + break; + default: + break; + } } #else -static inline void ct_show_zone(struct seq_file *s, const struct nf_conn *ct) +static inline void ct_show_zone(struct seq_file *s, const struct nf_conn *ct, + int dir) { } #endif @@ -213,6 +231,8 @@ static int ct_seq_show(struct seq_file *s, void *v) print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, l3proto, l4proto); + ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG); + if (seq_has_overflowed(s)) goto release; @@ -225,6 +245,8 @@ static int ct_seq_show(struct seq_file *s, void *v) print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, l3proto, l4proto); + ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL); + if (seq_print_acct(s, ct, IP_CT_DIR_REPLY)) goto release; @@ -239,7 +261,7 @@ static int ct_seq_show(struct seq_file *s, void *v) #endif ct_show_secctx(s, ct); - ct_show_zone(s, ct); + ct_show_zone(s, ct, NF_CT_DEFAULT_ZONE_DIR); ct_show_delta_time(s, ct); seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 65ebaf9fc4f9..5113dfd39df9 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -118,15 +118,13 @@ EXPORT_SYMBOL(nf_xfrm_me_harder); /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int -hash_by_src(const struct net *net, - const struct nf_conntrack_zone *zone, - const struct nf_conntrack_tuple *tuple) +hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple) { unsigned int hash; /* Original src, to ensure we map it consistently if poss. */ hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ zone->id ^ nf_conntrack_hash_rnd); + tuple->dst.protonum ^ nf_conntrack_hash_rnd); return reciprocal_scale(hash, net->ct.nat_htable_size); } @@ -194,13 +192,14 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { - unsigned int h = hash_by_src(net, zone, tuple); + unsigned int h = hash_by_src(net, tuple); const struct nf_conn_nat *nat; const struct nf_conn *ct; hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) { ct = nat->ct; - if (same_src(ct, tuple) && nf_ct_zone_equal(ct, zone)) { + if (same_src(ct, tuple) && + nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { /* Copy source part from reply tuple. */ nf_ct_invert_tuplepr(result, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); @@ -425,7 +424,7 @@ nf_nat_setup_info(struct nf_conn *ct, if (maniptype == NF_NAT_MANIP_SRC) { unsigned int srchash; - srchash = hash_by_src(net, nf_ct_zone(ct), + srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_lock); /* nf_conntrack_alter_reply might re-allocate extension aera */ diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 29e2856063ff..536cb67928ad 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -181,6 +181,19 @@ out: #endif } +static u16 xt_ct_flags_to_dir(const struct xt_ct_target_info_v1 *info) +{ + switch (info->flags & (XT_CT_ZONE_DIR_ORIG | + XT_CT_ZONE_DIR_REPL)) { + case XT_CT_ZONE_DIR_ORIG: + return NF_CT_ZONE_DIR_ORIG; + case XT_CT_ZONE_DIR_REPL: + return NF_CT_ZONE_DIR_REPL; + default: + return NF_CT_DEFAULT_ZONE_DIR; + } +} + static int xt_ct_tg_check(const struct xt_tgchk_param *par, struct xt_ct_target_info_v1 *info) { @@ -194,7 +207,8 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, } #ifndef CONFIG_NF_CONNTRACK_ZONES - if (info->zone) + if (info->zone || info->flags & (XT_CT_ZONE_DIR_ORIG | + XT_CT_ZONE_DIR_REPL)) goto err1; #endif @@ -204,6 +218,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, memset(&zone, 0, sizeof(zone)); zone.id = info->zone; + zone.dir = xt_ct_flags_to_dir(info); ct = nf_ct_tmpl_alloc(par->net, &zone, GFP_KERNEL); ret = PTR_ERR(ct); diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index e67a1bdd0929..5019a47b9270 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -72,6 +72,7 @@ static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a, goto out; zone.id = ca->zone; + zone.dir = NF_CT_DEFAULT_ZONE_DIR; thash = nf_conntrack_find_get(dev_net(skb->dev), &zone, &tuple); if (!thash) -- cgit v1.2.3 From 5e8018fc61423e677398d4ad4d72df70b9788e77 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 14 Aug 2015 16:03:40 +0200 Subject: netfilter: nf_conntrack: add efficient mark to zone mapping This work adds the possibility of deriving the zone id from the skb->mark field in a scalable manner. This allows for having only a single template serving hundreds/thousands of different zones, for example, instead of the need to have one match for each zone as an extra CT jump target. Note that we'd need to have this information attached to the template as at the time when we're trying to lookup a possible ct object, we already need to know zone information for a possible match when going into __nf_conntrack_find_get(). This work provides a minimal implementation for a possible mapping. In order to not add/expose an extra ct->status bit, the zone structure has been extended to carry a flag for deriving the mark. Signed-off-by: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_zones.h | 45 +++++++++++++++++++++-- include/uapi/linux/netfilter/xt_CT.h | 4 ++- net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 3 +- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 4 ++- net/netfilter/nf_conntrack_core.c | 50 +++++++++----------------- net/netfilter/nf_conntrack_netlink.c | 5 ++- net/netfilter/xt_CT.c | 5 ++- 7 files changed, 72 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h index 3942ddf0d4ff..5316c7b3a374 100644 --- a/include/net/netfilter/nf_conntrack_zones.h +++ b/include/net/netfilter/nf_conntrack_zones.h @@ -10,9 +10,12 @@ #define NF_CT_DEFAULT_ZONE_DIR (NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL) +#define NF_CT_FLAG_MARK 1 + struct nf_conntrack_zone { u16 id; - u16 dir; + u8 flags; + u8 dir; }; extern const struct nf_conntrack_zone nf_ct_zone_dflt; @@ -32,9 +35,45 @@ nf_ct_zone(const struct nf_conn *ct) } static inline const struct nf_conntrack_zone * -nf_ct_zone_tmpl(const struct nf_conn *tmpl) +nf_ct_zone_init(struct nf_conntrack_zone *zone, u16 id, u8 dir, u8 flags) +{ + zone->id = id; + zone->flags = flags; + zone->dir = dir; + + return zone; +} + +static inline const struct nf_conntrack_zone * +nf_ct_zone_tmpl(const struct nf_conn *tmpl, const struct sk_buff *skb, + struct nf_conntrack_zone *tmp) +{ + const struct nf_conntrack_zone *zone; + + if (!tmpl) + return &nf_ct_zone_dflt; + + zone = nf_ct_zone(tmpl); + if (zone->flags & NF_CT_FLAG_MARK) + zone = nf_ct_zone_init(tmp, skb->mark, zone->dir, 0); + + return zone; +} + +static inline int nf_ct_zone_add(struct nf_conn *ct, gfp_t flags, + const struct nf_conntrack_zone *info) { - return tmpl ? nf_ct_zone(tmpl) : &nf_ct_zone_dflt; +#ifdef CONFIG_NF_CONNTRACK_ZONES + struct nf_conntrack_zone *nf_ct_zone; + + nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, flags); + if (!nf_ct_zone) + return -ENOMEM; + + nf_ct_zone_init(nf_ct_zone, info->id, info->dir, + info->flags); +#endif + return 0; } static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone, diff --git a/include/uapi/linux/netfilter/xt_CT.h b/include/uapi/linux/netfilter/xt_CT.h index 452005ff0e9e..9e520418b858 100644 --- a/include/uapi/linux/netfilter/xt_CT.h +++ b/include/uapi/linux/netfilter/xt_CT.h @@ -8,9 +8,11 @@ enum { XT_CT_NOTRACK_ALIAS = 1 << 1, XT_CT_ZONE_DIR_ORIG = 1 << 2, XT_CT_ZONE_DIR_REPL = 1 << 3, + XT_CT_ZONE_MARK = 1 << 4, XT_CT_MASK = XT_CT_NOTRACK | XT_CT_NOTRACK_ALIAS | - XT_CT_ZONE_DIR_ORIG | XT_CT_ZONE_DIR_REPL, + XT_CT_ZONE_DIR_ORIG | XT_CT_ZONE_DIR_REPL | + XT_CT_ZONE_MARK, }; struct xt_ct_target_info { diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 8a2f41c2fe6f..cdde3ec496e9 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -135,9 +135,10 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, const struct nf_conntrack_l4proto *innerproto; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_zone *zone; + struct nf_conntrack_zone tmp; NF_CT_ASSERT(skb->nfct == NULL); - zone = nf_ct_zone_tmpl(tmpl); + zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); /* Are they talking about one of our connections? */ if (!nf_ct_get_tuplepr(skb, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 202914151360..0e6fae103d33 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -150,6 +150,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, struct nf_conntrack_tuple intuple, origtuple; const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_l4proto *inproto; + struct nf_conntrack_zone tmp; NF_CT_ASSERT(skb->nfct == NULL); @@ -176,7 +177,8 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl, *ctinfo = IP_CT_RELATED; - h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl), &intuple); + h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp), + &intuple); if (!h) { pr_debug("icmpv6_error: no match\n"); return -NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index acc06222ce6a..48521d62c672 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -301,25 +301,15 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, tmpl->status = IPS_TEMPLATE; write_pnet(&tmpl->ct_net, net); -#ifdef CONFIG_NF_CONNTRACK_ZONES - if (zone) { - struct nf_conntrack_zone *nf_ct_zone; - - nf_ct_zone = nf_ct_ext_add(tmpl, NF_CT_EXT_ZONE, GFP_ATOMIC); - if (!nf_ct_zone) - goto out_free; - nf_ct_zone->id = zone->id; - nf_ct_zone->dir = zone->dir; - } -#endif + if (nf_ct_zone_add(tmpl, flags, zone) < 0) + goto out_free; + atomic_set(&tmpl->ct_general.use, 0); return tmpl; -#ifdef CONFIG_NF_CONNTRACK_ZONES out_free: kfree(tmpl); return NULL; -#endif } EXPORT_SYMBOL_GPL(nf_ct_tmpl_alloc); @@ -850,10 +840,9 @@ __nf_conntrack_alloc(struct net *net, * SLAB_DESTROY_BY_RCU. */ ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp); - if (ct == NULL) { - atomic_dec(&net->ct.count); - return ERR_PTR(-ENOMEM); - } + if (ct == NULL) + goto out; + spin_lock_init(&ct->lock); ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; @@ -867,29 +856,20 @@ __nf_conntrack_alloc(struct net *net, memset(&ct->__nfct_init_offset[0], 0, offsetof(struct nf_conn, proto) - offsetof(struct nf_conn, __nfct_init_offset[0])); -#ifdef CONFIG_NF_CONNTRACK_ZONES - if (zone) { - struct nf_conntrack_zone *nf_ct_zone; - - nf_ct_zone = nf_ct_ext_add(ct, NF_CT_EXT_ZONE, GFP_ATOMIC); - if (!nf_ct_zone) - goto out_free; - nf_ct_zone->id = zone->id; - nf_ct_zone->dir = zone->dir; - } -#endif + + if (zone && nf_ct_zone_add(ct, GFP_ATOMIC, zone) < 0) + goto out_free; + /* Because we use RCU lookups, we set ct_general.use to zero before * this is inserted in any list. */ atomic_set(&ct->ct_general.use, 0); return ct; - -#ifdef CONFIG_NF_CONNTRACK_ZONES out_free: - atomic_dec(&net->ct.count); kmem_cache_free(net->ct.nf_conntrack_cachep, ct); +out: + atomic_dec(&net->ct.count); return ERR_PTR(-ENOMEM); -#endif } struct nf_conn *nf_conntrack_alloc(struct net *net, @@ -937,6 +917,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, struct nf_conntrack_expect *exp = NULL; const struct nf_conntrack_zone *zone; struct nf_conn_timeout *timeout_ext; + struct nf_conntrack_zone tmp; unsigned int *timeouts; if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) { @@ -944,7 +925,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, return NULL; } - zone = nf_ct_zone_tmpl(tmpl); + zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, hash); if (IS_ERR(ct)) @@ -1042,6 +1023,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_zone tmp; struct nf_conn *ct; u32 hash; @@ -1053,7 +1035,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, } /* look for tuple match */ - zone = nf_ct_zone_tmpl(tmpl); + zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); hash = hash_conntrack_raw(&tuple); h = __nf_conntrack_find_get(net, zone, &tuple, hash); if (!h) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 4eaf925bead4..94a66541e0b7 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -956,9 +956,8 @@ static int ctnetlink_parse_zone(const struct nlattr *attr, struct nf_conntrack_zone *zone) { - zone->id = NF_CT_DEFAULT_ZONE_ID; - zone->dir = NF_CT_DEFAULT_ZONE_DIR; - + nf_ct_zone_init(zone, NF_CT_DEFAULT_ZONE_ID, + NF_CT_DEFAULT_ZONE_DIR, 0); #ifdef CONFIG_NF_CONNTRACK_ZONES if (attr) zone->id = ntohs(nla_get_be16(attr)); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 536cb67928ad..346509825a80 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -208,7 +208,8 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, #ifndef CONFIG_NF_CONNTRACK_ZONES if (info->zone || info->flags & (XT_CT_ZONE_DIR_ORIG | - XT_CT_ZONE_DIR_REPL)) + XT_CT_ZONE_DIR_REPL | + XT_CT_ZONE_MARK)) goto err1; #endif @@ -219,6 +220,8 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, memset(&zone, 0, sizeof(zone)); zone.id = info->zone; zone.dir = xt_ct_flags_to_dir(info); + if (info->flags & XT_CT_ZONE_MARK) + zone.flags |= NF_CT_FLAG_MARK; ct = nf_ct_tmpl_alloc(par->net, &zone, GFP_KERNEL); ret = PTR_ERR(ct); -- cgit v1.2.3 From 2536862311d2276454ddef9dc36d6551a4b400fd Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 17 Aug 2015 13:42:24 -0700 Subject: lwt: Add support to redirect dst.input This patch adds the capability to redirect dst input in the same way that dst output is redirected by LWT. Also, save the original dst.input and and dst.out when setting up lwtunnel redirection. These can be called by the client as a pass- through. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 30 ++++++++++++++++++++++++++- net/core/lwtunnel.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/route.c | 8 +++++++- net/ipv6/route.c | 8 +++++++- 4 files changed, 98 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 33bd30963a95..e25b60eb262d 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -11,12 +11,15 @@ #define LWTUNNEL_HASH_SIZE (1 << LWTUNNEL_HASH_BITS) /* lw tunnel state flags */ -#define LWTUNNEL_STATE_OUTPUT_REDIRECT 0x1 +#define LWTUNNEL_STATE_OUTPUT_REDIRECT BIT(0) +#define LWTUNNEL_STATE_INPUT_REDIRECT BIT(1) struct lwtunnel_state { __u16 type; __u16 flags; atomic_t refcnt; + int (*orig_output)(struct sock *sk, struct sk_buff *skb); + int (*orig_input)(struct sk_buff *); int len; __u8 data[0]; }; @@ -25,6 +28,7 @@ struct lwtunnel_encap_ops { int (*build_state)(struct net_device *dev, struct nlattr *encap, struct lwtunnel_state **ts); int (*output)(struct sock *sk, struct sk_buff *skb); + int (*input)(struct sk_buff *skb); int (*fill_encap)(struct sk_buff *skb, struct lwtunnel_state *lwtstate); int (*get_encap_size)(struct lwtunnel_state *lwtstate); @@ -58,6 +62,13 @@ static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) return false; } +static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) +{ + if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_INPUT_REDIRECT)) + return true; + + return false; +} int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, @@ -72,6 +83,8 @@ struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); int lwtunnel_output(struct sock *sk, struct sk_buff *skb); int lwtunnel_output6(struct sock *sk, struct sk_buff *skb); +int lwtunnel_input(struct sk_buff *skb); +int lwtunnel_input6(struct sk_buff *skb); #else @@ -90,6 +103,11 @@ static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) return false; } +static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) +{ + return false; +} + static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, unsigned int num) { @@ -142,6 +160,16 @@ static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) return -EOPNOTSUPP; } +static inline int lwtunnel_input(struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} + +static inline int lwtunnel_input6(struct sk_buff *skb) +{ + return -EOPNOTSUPP; +} + #endif #endif /* __NET_LWTUNNEL_H */ diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 5d6d8e3d450a..3331585174d9 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -241,3 +241,58 @@ int lwtunnel_output(struct sock *sk, struct sk_buff *skb) return __lwtunnel_output(sk, skb, lwtstate); } EXPORT_SYMBOL(lwtunnel_output); + +int __lwtunnel_input(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + const struct lwtunnel_encap_ops *ops; + int ret = -EINVAL; + + if (!lwtstate) + goto drop; + + if (lwtstate->type == LWTUNNEL_ENCAP_NONE || + lwtstate->type > LWTUNNEL_ENCAP_MAX) + return 0; + + ret = -EOPNOTSUPP; + rcu_read_lock(); + ops = rcu_dereference(lwtun_encaps[lwtstate->type]); + if (likely(ops && ops->input)) + ret = ops->input(skb); + rcu_read_unlock(); + + if (ret == -EOPNOTSUPP) + goto drop; + + return ret; + +drop: + kfree_skb(skb); + + return ret; +} + +int lwtunnel_input6(struct sk_buff *skb) +{ + struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); + struct lwtunnel_state *lwtstate = NULL; + + if (rt) + lwtstate = rt->rt6i_lwtstate; + + return __lwtunnel_input(skb, lwtstate); +} +EXPORT_SYMBOL(lwtunnel_input6); + +int lwtunnel_input(struct sk_buff *skb) +{ + struct rtable *rt = (struct rtable *)skb_dst(skb); + struct lwtunnel_state *lwtstate = NULL; + + if (rt) + lwtstate = rt->rt_lwtstate; + + return __lwtunnel_input(skb, lwtstate); +} +EXPORT_SYMBOL(lwtunnel_input); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2c89d294b669..2403e85107f0 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1631,8 +1631,14 @@ static int __mkroute_input(struct sk_buff *skb, rth->dst.output = ip_output; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); - if (lwtunnel_output_redirect(rth->rt_lwtstate)) + if (lwtunnel_output_redirect(rth->rt_lwtstate)) { + rth->rt_lwtstate->orig_output = rth->dst.output; rth->dst.output = lwtunnel_output; + } + if (lwtunnel_input_redirect(rth->rt_lwtstate)) { + rth->rt_lwtstate->orig_input = rth->dst.input; + rth->dst.input = lwtunnel_input; + } skb_dst_set(skb, &rth->dst); out: err = 0; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1c0217e61357..c3733049715e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1785,8 +1785,14 @@ int ip6_route_add(struct fib6_config *cfg) if (err) goto out; rt->rt6i_lwtstate = lwtstate_get(lwtstate); - if (lwtunnel_output_redirect(rt->rt6i_lwtstate)) + if (lwtunnel_output_redirect(rt->rt6i_lwtstate)) { + rt->rt6i_lwtstate->orig_output = rt->dst.output; rt->dst.output = lwtunnel_output6; + } + if (lwtunnel_input_redirect(rt->rt6i_lwtstate)) { + rt->rt6i_lwtstate->orig_input = rt->dst.input; + rt->dst.input = lwtunnel_input6; + } } ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len); -- cgit v1.2.3 From 4b048d6d9d0b0b90e1e94f2393796bbf1fa8df4e Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 17 Aug 2015 13:42:25 -0700 Subject: net: Change pseudohdr argument of inet_proto_csum_replace* to be a bool inet_proto_csum_replace4,2,16 take a pseudohdr argument which indicates the checksum field carries a pseudo header. This argument should be a boolean instead of an int. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/checksum.h | 6 +++--- net/core/filter.c | 2 +- net/core/utils.c | 4 ++-- net/ipv4/netfilter/ipt_ECN.c | 2 +- net/ipv4/netfilter/nf_nat_l3proto_ipv4.c | 4 ++-- net/ipv4/netfilter/nf_nat_proto_icmp.c | 2 +- net/ipv6/netfilter/nf_nat_l3proto_ipv6.c | 4 ++-- net/ipv6/netfilter/nf_nat_proto_icmpv6.c | 2 +- net/netfilter/nf_conntrack_seqadj.c | 9 +++++---- net/netfilter/nf_nat_proto_dccp.c | 2 +- net/netfilter/nf_nat_proto_tcp.c | 2 +- net/netfilter/nf_nat_proto_udp.c | 2 +- net/netfilter/nf_nat_proto_udplite.c | 2 +- net/netfilter/nf_synproxy_core.c | 2 +- net/netfilter/xt_TCPMSS.c | 8 ++++---- net/netfilter/xt_TCPOPTSTRIP.c | 2 +- net/openvswitch/actions.c | 12 ++++++------ net/sched/act_nat.c | 7 ++++--- 18 files changed, 38 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/include/net/checksum.h b/include/net/checksum.h index 2d1d73cb773e..619f3445d57e 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -140,14 +140,14 @@ static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) struct sk_buff; void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, - __be32 from, __be32 to, int pseudohdr); + __be32 from, __be32 to, bool pseudohdr); void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, - int pseudohdr); + bool pseudohdr); static inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, __be16 from, __be16 to, - int pseudohdr) + bool pseudohdr) { inet_proto_csum_replace4(sum, skb, (__force __be32)from, (__force __be32)to, pseudohdr); diff --git a/net/core/filter.c b/net/core/filter.c index f8184222465e..83f08cefeab7 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1349,7 +1349,7 @@ const struct bpf_func_proto bpf_l3_csum_replace_proto = { static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) { struct sk_buff *skb = (struct sk_buff *) (long) r1; - u32 is_pseudo = BPF_IS_PSEUDO_HEADER(flags); + bool is_pseudo = !!BPF_IS_PSEUDO_HEADER(flags); int offset = (int) r2; __sum16 sum, *ptr; diff --git a/net/core/utils.c b/net/core/utils.c index a7732a068043..cd7d202f340e 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -301,7 +301,7 @@ out: EXPORT_SYMBOL(in6_pton); void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, - __be32 from, __be32 to, int pseudohdr) + __be32 from, __be32 to, bool pseudohdr) { if (skb->ip_summed != CHECKSUM_PARTIAL) { csum_replace4(sum, from, to); @@ -318,7 +318,7 @@ EXPORT_SYMBOL(inet_proto_csum_replace4); void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, - int pseudohdr) + bool pseudohdr) { __be32 diff[] = { ~from[0], ~from[1], ~from[2], ~from[3], diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index 4bf3dc49ad1e..270765236f5e 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -72,7 +72,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo) tcph->cwr = einfo->proto.tcp.cwr; inet_proto_csum_replace2(&tcph->check, skb, - oldval, ((__be16 *)tcph)[6], 0); + oldval, ((__be16 *)tcph)[6], false); return true; } diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index e59cc05c09e9..22f4579b0c2a 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -120,7 +120,7 @@ static void nf_nat_ipv4_csum_update(struct sk_buff *skb, oldip = iph->daddr; newip = t->dst.u3.ip; } - inet_proto_csum_replace4(check, skb, oldip, newip, 1); + inet_proto_csum_replace4(check, skb, oldip, newip, true); } static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, @@ -151,7 +151,7 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, } } else inet_proto_csum_replace2(check, skb, - htons(oldlen), htons(datalen), 1); + htons(oldlen), htons(datalen), true); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c b/net/ipv4/netfilter/nf_nat_proto_icmp.c index 4557b4ab8342..7b98baa13ede 100644 --- a/net/ipv4/netfilter/nf_nat_proto_icmp.c +++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c @@ -67,7 +67,7 @@ icmp_manip_pkt(struct sk_buff *skb, hdr = (struct icmphdr *)(skb->data + hdroff); inet_proto_csum_replace2(&hdr->checksum, skb, - hdr->un.echo.id, tuple->src.u.icmp.id, 0); + hdr->un.echo.id, tuple->src.u.icmp.id, false); hdr->un.echo.id = tuple->src.u.icmp.id; return true; } diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index e76900e0aa92..70fbaed49edb 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -124,7 +124,7 @@ static void nf_nat_ipv6_csum_update(struct sk_buff *skb, newip = &t->dst.u3.in6; } inet_proto_csum_replace16(check, skb, oldip->s6_addr32, - newip->s6_addr32, 1); + newip->s6_addr32, true); } static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, @@ -155,7 +155,7 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, } } else inet_proto_csum_replace2(check, skb, - htons(oldlen), htons(datalen), 1); + htons(oldlen), htons(datalen), true); } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c index 2205e8eeeacf..57593b00c5b4 100644 --- a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c @@ -73,7 +73,7 @@ icmpv6_manip_pkt(struct sk_buff *skb, hdr->icmp6_type == ICMPV6_ECHO_REPLY) { inet_proto_csum_replace2(&hdr->icmp6_cksum, skb, hdr->icmp6_identifier, - tuple->src.u.icmp.id, 0); + tuple->src.u.icmp.id, false); hdr->icmp6_identifier = tuple->src.u.icmp.id; } return true; diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c index ce3e840c8704..dff0f0cc59e4 100644 --- a/net/netfilter/nf_conntrack_seqadj.c +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -103,9 +103,9 @@ static void nf_ct_sack_block_adjust(struct sk_buff *skb, ntohl(sack->end_seq), ntohl(new_end_seq)); inet_proto_csum_replace4(&tcph->check, skb, - sack->start_seq, new_start_seq, 0); + sack->start_seq, new_start_seq, false); inet_proto_csum_replace4(&tcph->check, skb, - sack->end_seq, new_end_seq, 0); + sack->end_seq, new_end_seq, false); sack->start_seq = new_start_seq; sack->end_seq = new_end_seq; sackoff += sizeof(*sack); @@ -193,8 +193,9 @@ int nf_ct_seq_adjust(struct sk_buff *skb, newseq = htonl(ntohl(tcph->seq) + seqoff); newack = htonl(ntohl(tcph->ack_seq) - ackoff); - inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, 0); - inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, 0); + inet_proto_csum_replace4(&tcph->check, skb, tcph->seq, newseq, false); + inet_proto_csum_replace4(&tcph->check, skb, tcph->ack_seq, newack, + false); pr_debug("Adjusting sequence number from %u->%u, ack from %u->%u\n", ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), diff --git a/net/netfilter/nf_nat_proto_dccp.c b/net/netfilter/nf_nat_proto_dccp.c index b8067b53ff3a..15c47b246d0d 100644 --- a/net/netfilter/nf_nat_proto_dccp.c +++ b/net/netfilter/nf_nat_proto_dccp.c @@ -69,7 +69,7 @@ dccp_manip_pkt(struct sk_buff *skb, l3proto->csum_update(skb, iphdroff, &hdr->dccph_checksum, tuple, maniptype); inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, - 0); + false); return true; } diff --git a/net/netfilter/nf_nat_proto_tcp.c b/net/netfilter/nf_nat_proto_tcp.c index 37f5505f4529..4f8820fc5148 100644 --- a/net/netfilter/nf_nat_proto_tcp.c +++ b/net/netfilter/nf_nat_proto_tcp.c @@ -70,7 +70,7 @@ tcp_manip_pkt(struct sk_buff *skb, return true; l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); - inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, 0); + inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, false); return true; } diff --git a/net/netfilter/nf_nat_proto_udp.c b/net/netfilter/nf_nat_proto_udp.c index b0ede2f0d8bc..b1e627227b6e 100644 --- a/net/netfilter/nf_nat_proto_udp.c +++ b/net/netfilter/nf_nat_proto_udp.c @@ -57,7 +57,7 @@ udp_manip_pkt(struct sk_buff *skb, l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, - 0); + false); if (!hdr->check) hdr->check = CSUM_MANGLED_0; } diff --git a/net/netfilter/nf_nat_proto_udplite.c b/net/netfilter/nf_nat_proto_udplite.c index 368f14e01e75..58340c97bd83 100644 --- a/net/netfilter/nf_nat_proto_udplite.c +++ b/net/netfilter/nf_nat_proto_udplite.c @@ -56,7 +56,7 @@ udplite_manip_pkt(struct sk_buff *skb, } l3proto->csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); - inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, 0); + inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, false); if (!hdr->check) hdr->check = CSUM_MANGLED_0; diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index d7f168527903..14f8b43ec5a7 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -225,7 +225,7 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, synproxy->tsoff); } inet_proto_csum_replace4(&th->check, skb, - old, *ptr, 0); + old, *ptr, false); return 1; } optoff += op[1]; diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index 8c3190e2fc6a..8c02501a530f 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -144,7 +144,7 @@ tcpmss_mangle_packet(struct sk_buff *skb, inet_proto_csum_replace2(&tcph->check, skb, htons(oldmss), htons(newmss), - 0); + false); return 0; } } @@ -185,18 +185,18 @@ tcpmss_mangle_packet(struct sk_buff *skb, memmove(opt + TCPOLEN_MSS, opt, len - sizeof(struct tcphdr)); inet_proto_csum_replace2(&tcph->check, skb, - htons(len), htons(len + TCPOLEN_MSS), 1); + htons(len), htons(len + TCPOLEN_MSS), true); opt[0] = TCPOPT_MSS; opt[1] = TCPOLEN_MSS; opt[2] = (newmss & 0xff00) >> 8; opt[3] = newmss & 0x00ff; - inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), 0); + inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), false); oldval = ((__be16 *)tcph)[6]; tcph->doff += TCPOLEN_MSS/4; inet_proto_csum_replace2(&tcph->check, skb, - oldval, ((__be16 *)tcph)[6], 0); + oldval, ((__be16 *)tcph)[6], false); return TCPOLEN_MSS; } diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 625fa1d636a0..eb92bffff11c 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -80,7 +80,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb, n <<= 8; } inet_proto_csum_replace2(&tcph->check, skb, htons(o), - htons(n), 0); + htons(n), false); } memset(opt + i, TCPOPT_NOP, optl); } diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 14da52ddd327..4f4200717bef 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -284,14 +284,14 @@ static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh, if (nh->protocol == IPPROTO_TCP) { if (likely(transport_len >= sizeof(struct tcphdr))) inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb, - addr, new_addr, 1); + addr, new_addr, true); } else if (nh->protocol == IPPROTO_UDP) { if (likely(transport_len >= sizeof(struct udphdr))) { struct udphdr *uh = udp_hdr(skb); if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace4(&uh->check, skb, - addr, new_addr, 1); + addr, new_addr, true); if (!uh->check) uh->check = CSUM_MANGLED_0; } @@ -316,14 +316,14 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, if (l4_proto == NEXTHDR_TCP) { if (likely(transport_len >= sizeof(struct tcphdr))) inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb, - addr, new_addr, 1); + addr, new_addr, true); } else if (l4_proto == NEXTHDR_UDP) { if (likely(transport_len >= sizeof(struct udphdr))) { struct udphdr *uh = udp_hdr(skb); if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace16(&uh->check, skb, - addr, new_addr, 1); + addr, new_addr, true); if (!uh->check) uh->check = CSUM_MANGLED_0; } @@ -331,7 +331,7 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, } else if (l4_proto == NEXTHDR_ICMP) { if (likely(transport_len >= sizeof(struct icmp6hdr))) inet_proto_csum_replace16(&icmp6_hdr(skb)->icmp6_cksum, - skb, addr, new_addr, 1); + skb, addr, new_addr, true); } } @@ -498,7 +498,7 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, static void set_tp_port(struct sk_buff *skb, __be16 *port, __be16 new_port, __sum16 *check) { - inet_proto_csum_replace2(check, skb, *port, new_port, 0); + inet_proto_csum_replace2(check, skb, *port, new_port, false); *port = new_port; } diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 5be0b3c1c5b0..b7c4ead8b5a8 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -162,7 +162,8 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, goto drop; tcph = (void *)(skb_network_header(skb) + ihl); - inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, 1); + inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, + true); break; } case IPPROTO_UDP: @@ -178,7 +179,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, udph = (void *)(skb_network_header(skb) + ihl); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace4(&udph->check, skb, addr, - new_addr, 1); + new_addr, true); if (!udph->check) udph->check = CSUM_MANGLED_0; } @@ -231,7 +232,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a, iph->saddr = new_addr; inet_proto_csum_replace4(&icmph->checksum, skb, addr, new_addr, - 0); + false); break; } default: -- cgit v1.2.3 From abc5d1ff3e8f9b4a9d274818459b123e31981dc9 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 17 Aug 2015 13:42:26 -0700 Subject: net: Add inet_proto_csum_replace_by_diff utility function This function updates a checksum field value and skb->csum based on a value which is the difference between the old and new checksum. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/checksum.h | 2 ++ net/core/utils.c | 13 +++++++++++++ 2 files changed, 15 insertions(+) (limited to 'net') diff --git a/include/net/checksum.h b/include/net/checksum.h index 619f3445d57e..9fcaedf994ee 100644 --- a/include/net/checksum.h +++ b/include/net/checksum.h @@ -144,6 +144,8 @@ void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, bool pseudohdr); +void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, + __wsum diff, bool pseudohdr); static inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, __be16 from, __be16 to, diff --git a/net/core/utils.c b/net/core/utils.c index cd7d202f340e..3dffce953c39 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -336,6 +336,19 @@ void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, } EXPORT_SYMBOL(inet_proto_csum_replace16); +void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, + __wsum diff, bool pseudohdr) +{ + if (skb->ip_summed != CHECKSUM_PARTIAL) { + *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); + if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr) + skb->csum = ~csum_add(diff, ~skb->csum); + } else if (pseudohdr) { + *sum = ~csum_fold(csum_add(diff, csum_unfold(*sum))); + } +} +EXPORT_SYMBOL(inet_proto_csum_replace_by_diff); + struct __net_random_once_work { struct work_struct work; struct static_key *key; -- cgit v1.2.3 From 65d7ab8de582bc668e3dabb6ff48f750098a6e78 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 17 Aug 2015 13:42:27 -0700 Subject: net: Identifier Locator Addressing module Adding new module name ila. This implements ILA translation. Light weight tunnel redirection is used to perform the translation in the data path. This is configured by the "ip -6 route" command using the "encap ila " option, where is the value to set in destination locator of the packet. e.g. ip -6 route add 3333:0:0:1:5555:0:1:0/128 \ encap ila 2001:0:0:1 via 2401:db00:20:911a:face:0:25:0 Sets a route where 3333:0:0:1 will be overwritten by 2001:0:0:1 on output. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/uapi/linux/ila.h | 15 +++ include/uapi/linux/lwtunnel.h | 1 + net/ipv6/Kconfig | 19 ++++ net/ipv6/Makefile | 1 + net/ipv6/ila.c | 216 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 252 insertions(+) create mode 100644 include/uapi/linux/ila.h create mode 100644 net/ipv6/ila.c (limited to 'net') diff --git a/include/uapi/linux/ila.h b/include/uapi/linux/ila.h new file mode 100644 index 000000000000..7ed9e670814e --- /dev/null +++ b/include/uapi/linux/ila.h @@ -0,0 +1,15 @@ +/* ila.h - ILA Interface */ + +#ifndef _UAPI_LINUX_ILA_H +#define _UAPI_LINUX_ILA_H + +enum { + ILA_ATTR_UNSPEC, + ILA_ATTR_LOCATOR, /* u64 */ + + __ILA_ATTR_MAX, +}; + +#define ILA_ATTR_MAX (__ILA_ATTR_MAX - 1) + +#endif /* _UAPI_LINUX_ILA_H */ diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 3bf223bc2367..aa84ca396bcb 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -7,6 +7,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_NONE, LWTUNNEL_ENCAP_MPLS, LWTUNNEL_ENCAP_IP, + LWTUNNEL_ENCAP_ILA, __LWTUNNEL_ENCAP_MAX, }; diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 643f61339e7b..983bb999738c 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -92,6 +92,25 @@ config IPV6_MIP6 If unsure, say N. +config IPV6_ILA + tristate "IPv6: Identifier Locator Addressing (ILA)" + select LWTUNNEL + ---help--- + Support for IPv6 Identifier Locator Addressing (ILA). + + ILA is a mechanism to do network virtualization without + encapsulation. The basic concept of ILA is that we split an + IPv6 address into a 64 bit locator and 64 bit identifier. The + identifier is the identity of an entity in communication + ("who") and the locator expresses the location of the + entity ("where"). + + ILA can be configured using the "encap ila" option with + "ip -6 route" command. ILA is described in + https://tools.ietf.org/html/draft-herbert-nvo3-ila-00. + + If unsure, say N. + config INET6_XFRM_TUNNEL tristate select INET6_TUNNEL diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 0f3f1999719a..2c900c7b7eb1 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o obj-$(CONFIG_IPV6_MIP6) += mip6.o +obj-$(CONFIG_IPV6_ILA) += ila.o obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IPV6_VTI) += ip6_vti.o diff --git a/net/ipv6/ila.c b/net/ipv6/ila.c new file mode 100644 index 000000000000..2540ab4b76d1 --- /dev/null +++ b/net/ipv6/ila.c @@ -0,0 +1,216 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ila_params { + __be64 locator; +}; + +static inline struct ila_params *ila_params_lwtunnel( + struct lwtunnel_state *lwstate) +{ + return (struct ila_params *)lwstate->data; +} + +static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to) +{ + __be32 diff[] = { + ~from[0], ~from[1], to[0], to[1], + }; + + return csum_partial(diff, sizeof(diff), 0); +} + +static inline __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) +{ + return compute_csum_diff8((__be32 *)&ip6h->daddr, + (__be32 *)&p->locator); +} + +static void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) +{ + __wsum diff; + struct ipv6hdr *ip6h = ipv6_hdr(skb); + size_t nhoff = sizeof(struct ipv6hdr); + + /* First update checksum */ + switch (ip6h->nexthdr) { + case NEXTHDR_TCP: + if (likely(pskb_may_pull(skb, nhoff + sizeof(struct tcphdr)))) { + struct tcphdr *th = (struct tcphdr *) + (skb_network_header(skb) + nhoff); + + diff = get_csum_diff(ip6h, p); + inet_proto_csum_replace_by_diff(&th->check, skb, + diff, true); + } + break; + case NEXTHDR_UDP: + if (likely(pskb_may_pull(skb, nhoff + sizeof(struct udphdr)))) { + struct udphdr *uh = (struct udphdr *) + (skb_network_header(skb) + nhoff); + + if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { + diff = get_csum_diff(ip6h, p); + inet_proto_csum_replace_by_diff(&uh->check, skb, + diff, true); + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } + } + break; + case NEXTHDR_ICMP: + if (likely(pskb_may_pull(skb, + nhoff + sizeof(struct icmp6hdr)))) { + struct icmp6hdr *ih = (struct icmp6hdr *) + (skb_network_header(skb) + nhoff); + + diff = get_csum_diff(ip6h, p); + inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb, + diff, true); + } + break; + } + + /* Now change destination address */ + *(__be64 *)&ip6h->daddr = p->locator; +} + +static int ila_output(struct sock *sk, struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct rt6_info *rt6 = NULL; + + if (skb->protocol != htons(ETH_P_IPV6)) + goto drop; + + rt6 = (struct rt6_info *)dst; + + update_ipv6_locator(skb, ila_params_lwtunnel(rt6->rt6i_lwtstate)); + + return rt6->rt6i_lwtstate->orig_output(sk, skb); + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static int ila_input(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + struct rt6_info *rt6 = NULL; + + if (skb->protocol != htons(ETH_P_IPV6)) + goto drop; + + rt6 = (struct rt6_info *)dst; + + update_ipv6_locator(skb, ila_params_lwtunnel(rt6->rt6i_lwtstate)); + + return rt6->rt6i_lwtstate->orig_input(skb); + +drop: + kfree_skb(skb); + return -EINVAL; +} + +static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { + [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, +}; + +static int ila_build_state(struct net_device *dev, struct nlattr *nla, + struct lwtunnel_state **ts) +{ + struct ila_params *p; + struct nlattr *tb[ILA_ATTR_MAX + 1]; + size_t encap_len = sizeof(*p); + struct lwtunnel_state *newts; + int ret; + + ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, + ila_nl_policy); + if (ret < 0) + return ret; + + if (!tb[ILA_ATTR_LOCATOR]) + return -EINVAL; + + newts = lwtunnel_state_alloc(encap_len); + if (!newts) + return -ENOMEM; + + newts->len = encap_len; + p = ila_params_lwtunnel(newts); + + p->locator = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); + + newts->type = LWTUNNEL_ENCAP_ILA; + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT | + LWTUNNEL_STATE_INPUT_REDIRECT; + + *ts = newts; + + return 0; +} + +static int ila_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct ila_params *p = ila_params_lwtunnel(lwtstate); + + if (nla_put_u64(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator)) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int ila_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + /* No encapsulation overhead */ + return 0; +} + +static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct ila_params *a_p = ila_params_lwtunnel(a); + struct ila_params *b_p = ila_params_lwtunnel(b); + + return (a_p->locator != b_p->locator); +} + +static const struct lwtunnel_encap_ops ila_encap_ops = { + .build_state = ila_build_state, + .output = ila_output, + .input = ila_input, + .fill_encap = ila_fill_encap_info, + .get_encap_size = ila_encap_nlsize, + .cmp_encap = ila_encap_cmp, +}; + +static int __init ila_init(void) +{ + return lwtunnel_encap_add_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA); +} + +static void __exit ila_fini(void) +{ + lwtunnel_encap_del_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA); +} + +module_init(ila_init); +module_exit(ila_fini); +MODULE_AUTHOR("Tom Herbert "); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 2e659c0551c99c9712724d96acd3d9271587265c Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 18 Aug 2015 10:30:36 +0200 Subject: net: 8021q: convert to using IFF_NO_QUEUE Signed-off-by: Phil Sutter Cc: Patrick McHardy Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 01d7ba840df8..fded86508117 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -791,10 +791,9 @@ void vlan_setup(struct net_device *dev) { ether_setup(dev); - dev->priv_flags |= IFF_802_1Q_VLAN; + dev->priv_flags |= IFF_802_1Q_VLAN | IFF_NO_QUEUE; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); - dev->tx_queue_len = 0; dev->netdev_ops = &vlan_netdev_ops; dev->destructor = vlan_dev_free; -- cgit v1.2.3 From ccecb2a47ceb0fc59b23b966cd63b5f19315b2a2 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 18 Aug 2015 10:30:37 +0200 Subject: net: bridge: convert to using IFF_NO_QUEUE Signed-off-by: Phil Sutter Cc: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_device.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 0aa8f5cf46a1..6ed2feb51e3c 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -365,8 +365,7 @@ void br_dev_setup(struct net_device *dev) dev->destructor = br_dev_free; dev->ethtool_ops = &br_ethtool_ops; SET_NETDEV_DEVTYPE(dev, &br_type); - dev->tx_queue_len = 0; - dev->priv_flags = IFF_EBRIDGE; + dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE; dev->features = COMMON_FEATURES | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; -- cgit v1.2.3 From 4afbc0db720555f1bf0dd9f3e112819041231818 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 18 Aug 2015 10:30:38 +0200 Subject: net: 6lowpan: convert to using IFF_NO_QUEUE Signed-off-by: Phil Sutter Cc: Alexander Aring Signed-off-by: David S. Miller --- net/ieee802154/6lowpan/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 27c25ad935b4..953b1c49f5d1 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -90,7 +90,7 @@ static void lowpan_setup(struct net_device *dev) dev->hard_header_len = 2 + 1 + 20 + 14; dev->needed_tailroom = 2; /* FCS */ dev->mtu = IPV6_MIN_MTU; - dev->tx_queue_len = 0; + dev->priv_flags |= IFF_NO_QUEUE; dev->flags = IFF_BROADCAST | IFF_MULTICAST; dev->watchdog_timeo = 0; -- cgit v1.2.3 From 0a5f107b6774aa9c48ca3e924dccaf1296ef6a43 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 18 Aug 2015 10:30:41 +0200 Subject: net: dsa: convert to using IFF_NO_QUEUE Signed-off-by: Phil Sutter Cc: Lennert Buytenhek Signed-off-by: David S. Miller --- net/dsa/slave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 373ff315030d..cce97385f743 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1147,7 +1147,7 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent, slave_dev->features = master->vlan_features; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; eth_hw_addr_inherit(slave_dev, master); - slave_dev->tx_queue_len = 0; + slave_dev->priv_flags |= IFF_NO_QUEUE; slave_dev->netdev_ops = &dsa_slave_netdev_ops; slave_dev->switchdev_ops = &dsa_slave_switchdev_ops; -- cgit v1.2.3 From cdf7370391d3a482ef2f2a8c73d16c6db3dbecf0 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 18 Aug 2015 10:30:44 +0200 Subject: net: batman-adv: convert to using IFF_NO_QUEUE Signed-off-by: Phil Sutter Cc: Marek Lindner Cc: Simon Wunderlich Cc: Antonio Quartulli Signed-off-by: David S. Miller --- net/batman-adv/soft-interface.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 51cda3a7c51d..49d3d3aa59cb 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -941,7 +941,7 @@ static void batadv_softif_init_early(struct net_device *dev) dev->netdev_ops = &batadv_netdev_ops; dev->destructor = batadv_softif_free; dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER; - dev->tx_queue_len = 0; + dev->priv_flags |= IFF_NO_QUEUE; /* can't call min_mtu, because the needed variables * have not been initialized yet -- cgit v1.2.3 From 9ad09c5c05f7ce718d135ba8b55f9af733fc9b3f Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 18 Aug 2015 10:30:45 +0200 Subject: net: hsr: convert to using IFF_NO_QUEUE Signed-off-by: Phil Sutter Cc: Arvid Brodin Signed-off-by: David S. Miller --- net/hsr/hsr_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 44d27469ae55..35a9788bb3ae 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -392,7 +392,7 @@ void hsr_dev_setup(struct net_device *dev) dev->header_ops = &hsr_header_ops; dev->netdev_ops = &hsr_device_ops; SET_NETDEV_DEVTYPE(dev, &hsr_type); - dev->tx_queue_len = 0; + dev->priv_flags |= IFF_NO_QUEUE; dev->destructor = hsr_dev_destroy; -- cgit v1.2.3 From 4676a15207e3bc5e18b7e39b934ce0e890ee54fe Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 18 Aug 2015 10:30:46 +0200 Subject: net: caif: convert to using IFF_NO_QUEUE Signed-off-by: Phil Sutter Cc: Dmitry Tarnyagin Signed-off-by: David S. Miller --- drivers/net/caif/caif_hsi.c | 2 +- drivers/net/caif/caif_serial.c | 2 +- drivers/net/caif/caif_spi.c | 2 +- drivers/net/wan/hdlc_fr.c | 2 +- net/caif/caif_dev.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c index b3b922adc0e4..615c65da39be 100644 --- a/drivers/net/caif/caif_hsi.c +++ b/drivers/net/caif/caif_hsi.c @@ -1120,7 +1120,7 @@ static void cfhsi_setup(struct net_device *dev) dev->type = ARPHRD_CAIF; dev->flags = IFF_POINTOPOINT | IFF_NOARP; dev->mtu = CFHSI_MAX_CAIF_FRAME_SZ; - dev->tx_queue_len = 0; + dev->priv_flags |= IFF_NO_QUEUE; dev->destructor = free_netdev; dev->netdev_ops = &cfhsi_netdevops; for (i = 0; i < CFHSI_PRIO_LAST; ++i) diff --git a/drivers/net/caif/caif_serial.c b/drivers/net/caif/caif_serial.c index 9da06537237f..c2dea4916e5d 100644 --- a/drivers/net/caif/caif_serial.c +++ b/drivers/net/caif/caif_serial.c @@ -427,7 +427,7 @@ static void caifdev_setup(struct net_device *dev) dev->type = ARPHRD_CAIF; dev->flags = IFF_POINTOPOINT | IFF_NOARP; dev->mtu = CAIF_MAX_MTU; - dev->tx_queue_len = 0; + dev->priv_flags |= IFF_NO_QUEUE; dev->destructor = free_netdev; skb_queue_head_init(&serdev->head); serdev->common.link_select = CAIF_LINK_LOW_LATENCY; diff --git a/drivers/net/caif/caif_spi.c b/drivers/net/caif/caif_spi.c index 72ea9ff9bb9c..de3962014af7 100644 --- a/drivers/net/caif/caif_spi.c +++ b/drivers/net/caif/caif_spi.c @@ -710,7 +710,7 @@ static void cfspi_setup(struct net_device *dev) dev->netdev_ops = &cfspi_ops; dev->type = ARPHRD_CAIF; dev->flags = IFF_NOARP | IFF_POINTOPOINT; - dev->tx_queue_len = 0; + dev->priv_flags |= IFF_NO_QUEUE; dev->mtu = SPI_MAX_PAYLOAD_SIZE; dev->destructor = free_netdev; skb_queue_head_init(&cfspi->qhead); diff --git a/drivers/net/wan/hdlc_fr.c b/drivers/net/wan/hdlc_fr.c index 3ebed1c40abb..e92aaf615901 100644 --- a/drivers/net/wan/hdlc_fr.c +++ b/drivers/net/wan/hdlc_fr.c @@ -1096,7 +1096,7 @@ static int fr_add_pvc(struct net_device *frad, unsigned int dlci, int type) } dev->netdev_ops = &pvc_ops; dev->mtu = HDLC_MAX_MTU; - dev->tx_queue_len = 0; + dev->priv_flags |= IFF_NO_QUEUE; dev->ml_priv = pvc; if (register_netdevice(dev) != 0) { diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index edbca468fa73..d730a0f68f46 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -177,7 +177,7 @@ static int transmit(struct cflayer *layer, struct cfpkt *pkt) skb->protocol = htons(ETH_P_CAIF); /* Check if we need to handle xoff */ - if (likely(caifd->netdev->tx_queue_len == 0)) + if (likely(caifd->netdev->priv_flags & IFF_NO_QUEUE)) goto noxoff; if (unlikely(caifd->xoff)) -- cgit v1.2.3 From 906470c19da771e638e7c8e16e16c31995b139cc Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 18 Aug 2015 10:30:48 +0200 Subject: net: warn if drivers set tx_queue_len = 0 Due to the introduction of IFF_NO_QUEUE, there is a better way for drivers to indicate that no qdisc should be attached by default. Though, the old convention can't be dropped since ignoring that setting would break drivers still using it. Instead, add a warning so out-of-tree driver maintainers get a chance to adjust their code before we finally get rid of any special handling of tx_queue_len == 0. Signed-off-by: Phil Sutter Signed-off-by: David S. Miller --- net/core/dev.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 4870c3556a5a..b1f3f4844e60 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6997,6 +6997,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; setup(dev); + if (!dev->tx_queue_len) + printk(KERN_WARNING "%s uses DEPRECATED zero tx_queue_len - convert driver to use IFF_NO_QUEUE instead.\n", name); + dev->num_tx_queues = txqs; dev->real_num_tx_queues = txqs; if (netif_alloc_netdev_queues(dev)) -- cgit v1.2.3 From 348e3435cbefa815bd56a5205c1412b5afe7b92e Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 18 Aug 2015 10:30:49 +0200 Subject: net: sched: drop all special handling of tx_queue_len == 0 Those were all workarounds for the formerly double meaning of tx_queue_len, which broke scheduling algorithms if untreated. Now that all in-tree drivers have been converted away from setting tx_queue_len = 0, it should be safe to drop these workarounds for categorically broken setups. Signed-off-by: Phil Sutter Cc: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/sch_fifo.c | 2 +- net/sched/sch_gred.c | 8 +++----- net/sched/sch_htb.c | 6 ++---- net/sched/sch_plug.c | 8 ++------ net/sched/sch_sfb.c | 2 +- 5 files changed, 9 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 2e2398cfc694..2177eac0a61e 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -54,7 +54,7 @@ static int fifo_init(struct Qdisc *sch, struct nlattr *opt) bool is_bfifo = sch->ops == &bfifo_qdisc_ops; if (opt == NULL) { - u32 limit = qdisc_dev(sch)->tx_queue_len ? : 1; + u32 limit = qdisc_dev(sch)->tx_queue_len; if (is_bfifo) limit *= psched_mtu(qdisc_dev(sch)); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index abb9f2fec28f..80105109f756 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -512,11 +512,9 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_GRED_LIMIT]) sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); - else { - u32 qlen = qdisc_dev(sch)->tx_queue_len ? : 1; - - sch->limit = qlen * psched_mtu(qdisc_dev(sch)); - } + else + sch->limit = qdisc_dev(sch)->tx_queue_len + * psched_mtu(qdisc_dev(sch)); return gred_change_table_def(sch, tb[TCA_GRED_DPS]); } diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index f1acb0f60dc3..cf4b0f865d1b 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1048,11 +1048,9 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) if (tb[TCA_HTB_DIRECT_QLEN]) q->direct_qlen = nla_get_u32(tb[TCA_HTB_DIRECT_QLEN]); - else { + else q->direct_qlen = qdisc_dev(sch)->tx_queue_len; - if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */ - q->direct_qlen = 2; - } + if ((q->rate2quantum = gopt->rate2quantum) < 1) q->rate2quantum = 1; q->defcls = gopt->defcls; diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c index ade9445a55ab..5abfe44678d4 100644 --- a/net/sched/sch_plug.c +++ b/net/sched/sch_plug.c @@ -130,12 +130,8 @@ static int plug_init(struct Qdisc *sch, struct nlattr *opt) q->unplug_indefinite = false; if (opt == NULL) { - /* We will set a default limit of 100 pkts (~150kB) - * in case tx_queue_len is not available. The - * default value is completely arbitrary. - */ - u32 pkt_limit = qdisc_dev(sch)->tx_queue_len ? : 100; - q->limit = pkt_limit * psched_mtu(qdisc_dev(sch)); + q->limit = qdisc_dev(sch)->tx_queue_len + * psched_mtu(qdisc_dev(sch)); } else { struct tc_plug_qopt *ctl = nla_data(opt); diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 4b815193326c..dcdff5c769a1 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -502,7 +502,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt) limit = ctl->limit; if (limit == 0) - limit = max_t(u32, qdisc_dev(sch)->tx_queue_len, 1); + limit = qdisc_dev(sch)->tx_queue_len; child = fifo_create_dflt(sch, &pfifo_qdisc_ops, limit); if (IS_ERR(child)) -- cgit v1.2.3 From 1e72e6f8859a598bfc22cf268c2dafe8ddb9f1b4 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 17 Aug 2015 23:52:50 +0200 Subject: net: dsa: Allow multi hop routes to be expressed With more than two switches in a hierarchy, it becomes necessary to describe multi-hop routes between switches. The current binding does not allow this, although the older platform_data did. Extend the link property to be a list rather than a single phandle to a remote switch. It is then possible to express that a port should be used to reach more than one switch and the switch maybe more than one hop away. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/dsa/dsa.txt | 33 +++++++++++++++---- net/dsa/dsa.c | 40 +++++++++++++++++------ 2 files changed, 57 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/Documentation/devicetree/bindings/net/dsa/dsa.txt b/Documentation/devicetree/bindings/net/dsa/dsa.txt index 9cf9a0ec333c..04e6bef3ac3f 100644 --- a/Documentation/devicetree/bindings/net/dsa/dsa.txt +++ b/Documentation/devicetree/bindings/net/dsa/dsa.txt @@ -44,9 +44,10 @@ Note that a port labelled "dsa" will imply checking for the uplink phandle described below. Optionnal property: -- link : Should be a phandle to another switch's DSA port. +- link : Should be a list of phandles to another switch's DSA port. This property is only used when switches are being - chained/cascaded together. + chained/cascaded together. This port is used as outgoing port + towards the phandle port, which can be more than one hop away. - phy-handle : Phandle to a PHY on an external MDIO bus, not the switch internal one. See @@ -100,10 +101,11 @@ Example: label = "cpu"; }; - switch0uplink: port@6 { + switch0port6: port@6 { reg = <6>; label = "dsa"; - link = <&switch1uplink>; + link = <&switch1port0 + &switch2port0>; }; }; @@ -113,10 +115,29 @@ Example: reg = <17 1>; /* MDIO address 17, switch 1 in tree */ mii-bus = <&mii_bus1>; - switch1uplink: port@0 { + switch1port0: port@0 { reg = <0>; label = "dsa"; - link = <&switch0uplink>; + link = <&switch0port6>; + }; + switch1port1: port@1 { + reg = <1>; + label = "dsa"; + link = <&switch2port1>; + }; + }; + + switch@2 { + #address-cells = <1>; + #size-cells = <0>; + reg = <18 2>; /* MDIO address 18, switch 2 in tree */ + mii-bus = <&mii_bus1>; + + switch2port0: port@0 { + reg = <0>; + label = "dsa"; + link = <&switch1port1 + &switch0port6>; }; }; }; diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 78d4ac97aae3..053eb2b8e682 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -554,6 +554,31 @@ static int dsa_of_setup_routing_table(struct dsa_platform_data *pd, return 0; } +static int dsa_of_probe_links(struct dsa_platform_data *pd, + struct dsa_chip_data *cd, + int chip_index, int port_index, + struct device_node *port, + const char *port_name) +{ + struct device_node *link; + int link_index; + int ret; + + for (link_index = 0;; link_index++) { + link = of_parse_phandle(port, "link", link_index); + if (!link) + break; + + if (!strcmp(port_name, "dsa") && pd->nr_chips > 1) { + ret = dsa_of_setup_routing_table(pd, cd, chip_index, + port_index, link); + if (ret) + return ret; + } + } + return 0; +} + static void dsa_of_free_platform_data(struct dsa_platform_data *pd) { int i; @@ -573,7 +598,7 @@ static void dsa_of_free_platform_data(struct dsa_platform_data *pd) static int dsa_of_probe(struct device *dev) { struct device_node *np = dev->of_node; - struct device_node *child, *mdio, *ethernet, *port, *link; + struct device_node *child, *mdio, *ethernet, *port; struct mii_bus *mdio_bus, *mdio_bus_switch; struct net_device *ethernet_dev; struct dsa_platform_data *pd; @@ -668,15 +693,10 @@ static int dsa_of_probe(struct device *dev) goto out_free_chip; } - link = of_parse_phandle(port, "link", 0); - - if (!strcmp(port_name, "dsa") && link && - pd->nr_chips > 1) { - ret = dsa_of_setup_routing_table(pd, cd, - chip_index, port_index, link); - if (ret) - goto out_free_chip; - } + ret = dsa_of_probe_links(pd, cd, chip_index, + port_index, port, port_name); + if (ret) + goto out_free_chip; } } -- cgit v1.2.3 From df383e6240ef222703648072dafd2a1ae21b0d2a Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 18 Aug 2015 18:41:13 +0200 Subject: lwtunnel: fix memory leak The built lwtunnel_state struct has to be freed after comparison. Fixes: 571e722676fe3 ("ipv4: support for fib route lwtunnel encap attributes") Signed-off-by: Jiri Benc Acked-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 7 ++++++- net/ipv4/fib_semantics.c | 10 ++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index e25b60eb262d..34fd8f70c2ca 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -36,6 +36,11 @@ struct lwtunnel_encap_ops { }; #ifdef CONFIG_LWTUNNEL +static inline void lwtstate_free(struct lwtunnel_state *lws) +{ + kfree(lws); +} + static inline struct lwtunnel_state * lwtstate_get(struct lwtunnel_state *lws) { @@ -51,7 +56,7 @@ static inline void lwtstate_put(struct lwtunnel_state *lws) return; if (atomic_dec_and_test(&lws->refcnt)) - kfree(lws); + lwtstate_free(lws); } static inline bool lwtunnel_output_redirect(struct lwtunnel_state *lwtstate) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index c8025851dac7..d5253071f71f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -539,7 +539,7 @@ int fib_encap_match(struct net *net, u16 encap_type, { struct lwtunnel_state *lwtstate; struct net_device *dev = NULL; - int ret; + int ret, result = 0; if (encap_type == LWTUNNEL_ENCAP_NONE) return 0; @@ -548,10 +548,12 @@ int fib_encap_match(struct net *net, u16 encap_type, dev = __dev_get_by_index(net, oif); ret = lwtunnel_build_state(dev, encap_type, encap, &lwtstate); - if (!ret) - return lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate); + if (!ret) { + result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate); + lwtstate_free(lwtstate); + } - return 0; + return result; } int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) -- cgit v1.2.3 From 2d79849903e00fb445038ddd8afb2acfbf89eb7f Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Tue, 18 Aug 2015 18:42:09 +0200 Subject: lwtunnel: ip tunnel: fix multiple routes with different encap Currently, two routes going through the same tunnel interface are considered the same even when they are routed to a different host after encapsulation. This causes all routes added after the first one to have incorrect encapsulation parameters. This is nicely visible by doing: # ip r a 192.168.1.2/32 dev vxlan0 tunnel dst 10.0.0.2 # ip r a 192.168.1.3/32 dev vxlan0 tunnel dst 10.0.0.3 # ip r [...] 192.168.1.2/32 tunnel id 0 src 0.0.0.0 dst 10.0.0.2 [...] 192.168.1.3/32 tunnel id 0 src 0.0.0.0 dst 10.0.0.2 [...] Implement the missing comparison function. Fixes: 3093fbe7ff4bc ("route: Per route IP tunnel metadata via lightweight tunnel") Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel_core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index fd6319681c50..1c2389d582a6 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -286,10 +286,17 @@ static int ip_tun_encap_nlsize(struct lwtunnel_state *lwtstate) + nla_total_size(2); /* LWTUNNEL_IP_FLAGS */ } +static int ip_tun_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + return memcmp(lwt_tun_info(a), lwt_tun_info(b), + sizeof(struct ip_tunnel_info)); +} + static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { .build_state = ip_tun_build_state, .fill_encap = ip_tun_fill_encap_info, .get_encap_size = ip_tun_encap_nlsize, + .cmp_encap = ip_tun_cmp_encap, }; void __init ip_tunnel_core_init(void) -- cgit v1.2.3 From 8cfd23e6740158817d2045915f6ea5a2daf11bce Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 17 Aug 2015 18:09:55 +0200 Subject: netfilter: nft_payload: work around vlan header stripping make payload expression aware of the fact that VLAN offload may have removed a vlan header. When we encounter tagged skb, transparently insert the tag into the register so that vlan header matching can work without userspace being aware of offload features. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_payload.c | 57 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 56 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 94fb3b27a2c5..09b4b07eb676 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -17,6 +18,53 @@ #include #include +/* add vlan header into the user buffer for if tag was removed by offloads */ +static bool +nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) +{ + int mac_off = skb_mac_header(skb) - skb->data; + u8 vlan_len, *vlanh, *dst_u8 = (u8 *) d; + struct vlan_ethhdr veth; + + vlanh = (u8 *) &veth; + if (offset < ETH_HLEN) { + u8 ethlen = min_t(u8, len, ETH_HLEN - offset); + + if (skb_copy_bits(skb, mac_off, &veth, ETH_HLEN)) + return false; + + veth.h_vlan_proto = skb->vlan_proto; + + memcpy(dst_u8, vlanh + offset, ethlen); + + len -= ethlen; + if (len == 0) + return true; + + dst_u8 += ethlen; + offset = ETH_HLEN; + } else if (offset >= VLAN_ETH_HLEN) { + offset -= VLAN_HLEN; + goto skip; + } + + veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb)); + veth.h_vlan_encapsulated_proto = skb->protocol; + + vlanh += offset; + + vlan_len = min_t(u8, len, VLAN_ETH_HLEN - offset); + memcpy(dst_u8, vlanh, vlan_len); + + len -= vlan_len; + if (!len) + return true; + + dst_u8 += vlan_len; + skip: + return skb_copy_bits(skb, offset + mac_off, dst_u8, len) == 0; +} + static void nft_payload_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -26,10 +74,18 @@ static void nft_payload_eval(const struct nft_expr *expr, u32 *dest = ®s->data[priv->dreg]; int offset; + dest[priv->len / NFT_REG32_SIZE] = 0; switch (priv->base) { case NFT_PAYLOAD_LL_HEADER: if (!skb_mac_header_was_set(skb)) goto err; + + if (skb_vlan_tag_present(skb)) { + if (!nft_payload_copy_vlan(dest, skb, + priv->offset, priv->len)) + goto err; + return; + } offset = skb_mac_header(skb) - skb->data; break; case NFT_PAYLOAD_NETWORK_HEADER: @@ -43,7 +99,6 @@ static void nft_payload_eval(const struct nft_expr *expr, } offset += priv->offset; - dest[priv->len / NFT_REG32_SIZE] = 0; if (skb_copy_bits(skb, offset, dest, priv->len) < 0) goto err; return; -- cgit v1.2.3 From 18041e31743d278b6323518d20a2ef656c3cc689 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 18 Aug 2015 21:40:16 +0300 Subject: vrf: vrf_master_ifindex_rcu is not always called with rcu read lock While running net-next I hit this: [ 634.073119] =============================== [ 634.073150] [ INFO: suspicious RCU usage. ] [ 634.073182] 4.2.0-rc6+ #45 Not tainted [ 634.073213] ------------------------------- [ 634.073244] include/net/vrf.h:38 suspicious rcu_dereference_check() usage! [ 634.073274] other info that might help us debug this: [ 634.073307] rcu_scheduler_active = 1, debug_locks = 1 [ 634.073338] 2 locks held by swapper/0/0: [ 634.073369] #0: (((&n->timer))){+.-...}, at: [] call_timer_fn+0x5/0x480 [ 634.073412] #1: (slock-AF_INET){+.-...}, at: [] icmp_send+0x155/0x5f0 [ 634.073450] stack backtrace: [ 634.073483] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 4.2.0-rc6+ #45 [ 634.073514] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 634.073545] 0000000000000000 0593ba8242d9ace4 ffff88002fc03b48 ffffffff81803f1b [ 634.073612] 0000000000000000 ffffffff81e12500 ffff88002fc03b78 ffffffff811003c5 [ 634.073642] 0000000000000000 ffff88002ec4e600 ffffffff81f00f80 ffff88002fc03cf0 [ 634.073669] Call Trace: [ 634.073694] [] dump_stack+0x4c/0x65 [ 634.073728] [] lockdep_rcu_suspicious+0xc5/0x100 [ 634.073763] [] icmp_route_lookup+0x176/0x5c0 [ 634.073793] [] ? icmp_send+0x35b/0x5f0 [ 634.073818] [] ? icmp_send+0x2d4/0x5f0 [ 634.073844] [] icmp_send+0x42e/0x5f0 [ 634.073873] [] ipv4_link_failure+0x22/0xa0 [ 634.073899] [] arp_error_report+0x3a/0x80 [ 634.073926] [] ? neigh_lookup+0x2c0/0x2c0 [ 634.073952] [] neigh_invalidate+0x8e/0x110 [ 634.073984] [] neigh_timer_handler+0x1ae/0x290 [ 634.074013] [] ? neigh_lookup+0x2c0/0x2c0 [ 634.074013] [] call_timer_fn+0xb3/0x480 [ 634.074013] [] ? call_timer_fn+0x5/0x480 [ 634.074013] [] ? neigh_lookup+0x2c0/0x2c0 [ 634.074013] [] run_timer_softirq+0x20c/0x430 [ 634.074013] [] __do_softirq+0xde/0x630 [ 634.074013] [] irq_exit+0x117/0x120 [ 634.074013] [] smp_apic_timer_interrupt+0x46/0x60 [ 634.074013] [] apic_timer_interrupt+0x70/0x80 [ 634.074013] [] ? native_safe_halt+0x6/0x10 [ 634.074013] [] ? trace_hardirqs_on+0xd/0x10 [ 634.074013] [] default_idle+0x23/0x200 [ 634.074013] [] arch_cpu_idle+0xf/0x20 [ 634.074013] [] default_idle_call+0x2a/0x40 [ 634.074013] [] cpu_startup_entry+0x39c/0x4c0 [ 634.074013] [] rest_init+0x13d/0x150 [ 634.074013] [] start_kernel+0x4a8/0x4c9 [ 634.074013] [] ? early_idt_handler_array+0x120/0x120 [ 634.074013] [] x86_64_start_reservations+0x2a/0x2c [ 634.074013] [] x86_64_start_kernel+0x14a/0x16d It would seem vrf_master_ifindex_rcu() can be called without RCU held in other contexts as well so introduce a new helper which acquires rcu and returns the ifindex. Also add curly braces around both the "if" and "else" parts as per the style guide. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/vrf.h | 20 ++++++++++++++++++-- net/ipv4/icmp.c | 4 ++-- 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/vrf.h b/include/net/vrf.h index 3bb4af462ed6..5bfb16237fd7 100644 --- a/include/net/vrf.h +++ b/include/net/vrf.h @@ -43,9 +43,9 @@ static inline int vrf_master_ifindex_rcu(const struct net_device *dev) if (!dev) return 0; - if (netif_is_vrf(dev)) + if (netif_is_vrf(dev)) { ifindex = dev->ifindex; - else { + } else { vrf_ptr = rcu_dereference(dev->vrf_ptr); if (vrf_ptr) ifindex = vrf_ptr->ifindex; @@ -54,6 +54,17 @@ static inline int vrf_master_ifindex_rcu(const struct net_device *dev) return ifindex; } +static inline int vrf_master_ifindex(const struct net_device *dev) +{ + int ifindex; + + rcu_read_lock(); + ifindex = vrf_master_ifindex_rcu(dev); + rcu_read_unlock(); + + return ifindex; +} + /* called with rcu_read_lock */ static inline int vrf_dev_table_rcu(const struct net_device *dev) { @@ -133,6 +144,11 @@ static inline int vrf_master_ifindex_rcu(const struct net_device *dev) return 0; } +static inline int vrf_master_ifindex(const struct net_device *dev) +{ + return 0; +} + static inline int vrf_dev_table_rcu(const struct net_device *dev) { return 0; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index c6f1ce149ffb..f16488efa1c8 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -426,7 +426,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.flowi4_mark = mark; fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos); fl4.flowi4_proto = IPPROTO_ICMP; - fl4.flowi4_oif = vrf_master_ifindex_rcu(skb->dev) ? : skb->dev->ifindex; + fl4.flowi4_oif = vrf_master_ifindex(skb->dev) ? : skb->dev->ifindex; security_skb_classify_flow(skb, flowi4_to_flowi(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) @@ -460,7 +460,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - fl4->flowi4_oif = vrf_master_ifindex_rcu(skb_in->dev) ? : skb_in->dev->ifindex; + fl4->flowi4_oif = vrf_master_ifindex(skb_in->dev) ? : skb_in->dev->ifindex; security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); rt = __ip_route_output_key(net, fl4); -- cgit v1.2.3 From fdf79bd48876812acf0de58ed7a8bc1b3a3c67d6 Mon Sep 17 00:00:00 2001 From: Robert Baldyga Date: Thu, 20 Aug 2015 17:26:00 +0200 Subject: NFC: nci: Add post_setup handler Some drivers require non-standard configuration after NCI_CORE_INIT request, because they need to know ndev->manufact_specific_info or ndev->manufact_id. This patch adds post_setup handler allowing to do such custom configuration. Signed-off-by: Robert Baldyga Signed-off-by: Samuel Ortiz --- include/net/nfc/nci_core.h | 1 + net/nfc/nci/core.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'net') diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index 01fc8c531115..1bdaa5f51107 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -79,6 +79,7 @@ struct nci_ops { int (*close)(struct nci_dev *ndev); int (*send)(struct nci_dev *ndev, struct sk_buff *skb); int (*setup)(struct nci_dev *ndev); + int (*post_setup)(struct nci_dev *ndev); int (*fw_download)(struct nci_dev *ndev, const char *firmware_name); __u32 (*get_rfprotocol)(struct nci_dev *ndev, __u8 rf_protocol); int (*discover_se)(struct nci_dev *ndev); diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 95af2d24d5be..d9045ec172e3 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -388,6 +388,10 @@ static int nci_open_device(struct nci_dev *ndev) msecs_to_jiffies(NCI_INIT_TIMEOUT)); } + if (ndev->ops->post_setup) { + rc = ndev->ops->post_setup(ndev); + } + if (!rc) { rc = __nci_request(ndev, nci_init_complete_req, 0, msecs_to_jiffies(NCI_INIT_TIMEOUT)); -- cgit v1.2.3 From 025a0cb8380b7100d39fb426db9192b6c59595dc Mon Sep 17 00:00:00 2001 From: Robert Baldyga Date: Thu, 20 Aug 2015 17:26:01 +0200 Subject: NFC: nci: export nci_core_reset and nci_core_init Some drivers needs to have ability to reinit NCI core, for example after updating firmware in setup() of post_setup() callback. This patch makes nci_core_reset() and nci_core_init() functions public, to make it possible. Signed-off-by: Robert Baldyga Signed-off-by: Samuel Ortiz --- include/net/nfc/nci_core.h | 2 ++ net/nfc/nci/core.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'net') diff --git a/include/net/nfc/nci_core.h b/include/net/nfc/nci_core.h index 1bdaa5f51107..d0d0f1e53bb9 100644 --- a/include/net/nfc/nci_core.h +++ b/include/net/nfc/nci_core.h @@ -278,6 +278,8 @@ int nci_request(struct nci_dev *ndev, unsigned long opt), unsigned long opt, __u32 timeout); int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload); +int nci_core_reset(struct nci_dev *ndev); +int nci_core_init(struct nci_dev *ndev); int nci_recv_frame(struct nci_dev *ndev, struct sk_buff *skb); int nci_set_config(struct nci_dev *ndev, __u8 id, size_t len, __u8 *val); diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index d9045ec172e3..943889b87a34 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -351,6 +351,20 @@ int nci_prop_cmd(struct nci_dev *ndev, __u8 oid, size_t len, __u8 *payload) } EXPORT_SYMBOL(nci_prop_cmd); +int nci_core_reset(struct nci_dev *ndev) +{ + return __nci_request(ndev, nci_reset_req, 0, + msecs_to_jiffies(NCI_RESET_TIMEOUT)); +} +EXPORT_SYMBOL(nci_core_reset); + +int nci_core_init(struct nci_dev *ndev) +{ + return __nci_request(ndev, nci_init_req, 0, + msecs_to_jiffies(NCI_INIT_TIMEOUT)); +} +EXPORT_SYMBOL(nci_core_init); + static int nci_open_device(struct nci_dev *ndev) { int rc = 0; -- cgit v1.2.3 From 5a9e0ffc0f128ecdf7c770f76c268e4f9f3c9118 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Wed, 19 Aug 2015 21:26:42 +0200 Subject: nfc: nci: hci: Add check on skb nci_hci_send_cmd parameter skb can be NULL and may lead to a NULL pointer error. Add a check condition before setting HCI rx buffer. Cc: stable@vger.kernel.org Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- net/nfc/nci/hci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c index af002df640c7..609f92283d1b 100644 --- a/net/nfc/nci/hci.c +++ b/net/nfc/nci/hci.c @@ -233,7 +233,7 @@ int nci_hci_send_cmd(struct nci_dev *ndev, u8 gate, u8 cmd, r = nci_request(ndev, nci_hci_send_data_req, (unsigned long)&data, msecs_to_jiffies(NCI_DATA_TIMEOUT)); - if (r == NCI_STATUS_OK) + if (r == NCI_STATUS_OK && skb) *skb = conn_info->rx_skb; return r; -- cgit v1.2.3 From 29e76924cf087bc6a9114a9244828fd13ae959bb Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Wed, 19 Aug 2015 21:26:43 +0200 Subject: nfc: netlink: Add capability to reply to vendor_cmd with data A proprietary vendor command may send back useful data to the user application. For example, the field level applied on the NFC router antenna. Still based on net/wireless/nl80211.c implementation, add nfc_vendor_cmd_alloc_reply_skb and nfc_vendor_cmd_reply in order to send back over netlink data generated by a proprietary command. Signed-off-by: Christophe Ricard Signed-off-by: Samuel Ortiz --- include/net/nfc/nfc.h | 41 ++++++++++++++++++++++++ net/nfc/netlink.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 125 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index f9e58ae45f9c..30afc9a6718c 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -203,6 +203,7 @@ struct nfc_dev { int n_vendor_cmds; struct nfc_ops *ops; + struct genl_info *cur_cmd_info; }; #define to_nfc_dev(_dev) container_of(_dev, struct nfc_dev, dev) @@ -318,4 +319,44 @@ static inline int nfc_set_vendor_cmds(struct nfc_dev *dev, return 0; } +struct sk_buff *__nfc_alloc_vendor_cmd_reply_skb(struct nfc_dev *dev, + enum nfc_attrs attr, + u32 oui, u32 subcmd, + int approxlen); +int nfc_vendor_cmd_reply(struct sk_buff *skb); + +/** + * nfc_vendor_cmd_alloc_reply_skb - allocate vendor command reply + * @dev: nfc device + * @oui: vendor oui + * @approxlen: an upper bound of the length of the data that will + * be put into the skb + * + * This function allocates and pre-fills an skb for a reply to + * a vendor command. Since it is intended for a reply, calling + * it outside of a vendor command's doit() operation is invalid. + * + * The returned skb is pre-filled with some identifying data in + * a way that any data that is put into the skb (with skb_put(), + * nla_put() or similar) will end up being within the + * %NFC_ATTR_VENDOR_DATA attribute, so all that needs to be done + * with the skb is adding data for the corresponding userspace tool + * which can then read that data out of the vendor data attribute. + * You must not modify the skb in any other way. + * + * When done, call nfc_vendor_cmd_reply() with the skb and return + * its error code as the result of the doit() operation. + * + * Return: An allocated and pre-filled skb. %NULL if any errors happen. + */ +static inline struct sk_buff * +nfc_vendor_cmd_alloc_reply_skb(struct nfc_dev *dev, + u32 oui, u32 subcmd, int approxlen) +{ + return __nfc_alloc_vendor_cmd_reply_skb(dev, + NFC_ATTR_VENDOR_DATA, + oui, + subcmd, approxlen); +} + #endif /* __NET_NFC_H */ diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 73d1ca7c546c..853172c27f68 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -63,6 +63,8 @@ static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { [NFC_ATTR_FIRMWARE_NAME] = { .type = NLA_STRING, .len = NFC_FIRMWARE_NAME_MAXSIZE }, [NFC_ATTR_SE_APDU] = { .type = NLA_BINARY }, + [NFC_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, + }; static const struct nla_policy nfc_sdp_genl_policy[NFC_SDP_ATTR_MAX + 1] = { @@ -1503,7 +1505,7 @@ static int nfc_genl_vendor_cmd(struct sk_buff *skb, u32 dev_idx, vid, subcmd; u8 *data; size_t data_len; - int i; + int i, err; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_VENDOR_ID] || @@ -1534,12 +1536,92 @@ static int nfc_genl_vendor_cmd(struct sk_buff *skb, if (cmd->vendor_id != vid || cmd->subcmd != subcmd) continue; - return cmd->doit(dev, data, data_len); + dev->cur_cmd_info = info; + err = cmd->doit(dev, data, data_len); + dev->cur_cmd_info = NULL; + return err; } return -EOPNOTSUPP; } +/* message building helper */ +static inline void *nfc_hdr_put(struct sk_buff *skb, u32 portid, u32 seq, + int flags, u8 cmd) +{ + /* since there is no private header just add the generic one */ + return genlmsg_put(skb, portid, seq, &nfc_genl_family, flags, cmd); +} + +static struct sk_buff * +__nfc_alloc_vendor_cmd_skb(struct nfc_dev *dev, int approxlen, + u32 portid, u32 seq, + enum nfc_attrs attr, + u32 oui, u32 subcmd, gfp_t gfp) +{ + struct sk_buff *skb; + void *hdr; + + skb = nlmsg_new(approxlen + 100, gfp); + if (!skb) + return NULL; + + hdr = nfc_hdr_put(skb, portid, seq, 0, NFC_CMD_VENDOR); + if (!hdr) { + kfree_skb(skb); + return NULL; + } + + if (nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx)) + goto nla_put_failure; + if (nla_put_u32(skb, NFC_ATTR_VENDOR_ID, oui)) + goto nla_put_failure; + if (nla_put_u32(skb, NFC_ATTR_VENDOR_SUBCMD, subcmd)) + goto nla_put_failure; + + ((void **)skb->cb)[0] = dev; + ((void **)skb->cb)[1] = hdr; + + return skb; + +nla_put_failure: + kfree_skb(skb); + return NULL; +} + +struct sk_buff *__nfc_alloc_vendor_cmd_reply_skb(struct nfc_dev *dev, + enum nfc_attrs attr, + u32 oui, u32 subcmd, + int approxlen) +{ + if (WARN_ON(!dev->cur_cmd_info)) + return NULL; + + return __nfc_alloc_vendor_cmd_skb(dev, approxlen, + dev->cur_cmd_info->snd_portid, + dev->cur_cmd_info->snd_seq, attr, + oui, subcmd, GFP_KERNEL); +} +EXPORT_SYMBOL(__nfc_alloc_vendor_cmd_reply_skb); + +int nfc_vendor_cmd_reply(struct sk_buff *skb) +{ + struct nfc_dev *dev = ((void **)skb->cb)[0]; + void *hdr = ((void **)skb->cb)[1]; + + /* clear CB data for netlink core to own from now on */ + memset(skb->cb, 0, sizeof(skb->cb)); + + if (WARN_ON(!dev->cur_cmd_info)) { + kfree_skb(skb); + return -EINVAL; + } + + genlmsg_end(skb, hdr); + return genlmsg_reply(skb, dev->cur_cmd_info); +} +EXPORT_SYMBOL(nfc_vendor_cmd_reply); + static const struct genl_ops nfc_genl_ops[] = { { .cmd = NFC_CMD_GET_DEVICE, -- cgit v1.2.3 From e01286ef03a9c7b1d4937309f923c226ab05bc4d Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 19 Aug 2015 16:04:51 +0800 Subject: ipv4: Make fib_encap_match static Make fib_encap_match() static as it isn't used outside the file. Signed-off-by: Ying Xue Reviewed-by: Jiri Benc Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index d5253071f71f..8fdf6c26a012 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -533,9 +533,9 @@ errout: #endif -int fib_encap_match(struct net *net, u16 encap_type, - struct nlattr *encap, - int oif, const struct fib_nh *nh) +static int fib_encap_match(struct net *net, u16 encap_type, + struct nlattr *encap, + int oif, const struct fib_nh *nh) { struct lwtunnel_state *lwtstate; struct net_device *dev = NULL; -- cgit v1.2.3 From eb4cb85180cd3baee4a01fd32e296fc28c2cffc1 Mon Sep 17 00:00:00 2001 From: Scott Feldman Date: Wed, 19 Aug 2015 11:29:35 -0700 Subject: bridge: fix netlink max attr size .maxtype should match .policy. Probably just been getting lucky here because IFLA_BRPORT_MAX > IFLA_BR_MAX. Fixes: 13323516 ("bridge: implement rtnl_link_ops->changelink") Signed-off-by: Scott Feldman Signed-off-by: David S. Miller --- net/bridge/br_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 0f2408f6cdfe..dbcb1949ea58 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -849,7 +849,7 @@ struct rtnl_link_ops br_link_ops __read_mostly = { .kind = "bridge", .priv_size = sizeof(struct net_bridge), .setup = br_dev_setup, - .maxtype = IFLA_BRPORT_MAX, + .maxtype = IFLA_BR_MAX, .policy = br_policy, .validate = br_validate, .newlink = br_dev_newlink, -- cgit v1.2.3 From 4c9bcd117918ba6096d01194565e9d1814e5ef22 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 19 Aug 2015 11:40:31 -0700 Subject: net: Fix nexthop lookups Andreas reported breakage adding routes with local nexthops: $ ip route show table main ... 172.28.0.0/24 dev vnf-xe1p0 proto kernel scope link src 172.28.0.16 $ ip route add 10.0.0.0/8 via 172.28.0.32 table 100 dev vnf-xe1p0 RTNETLINK answers: Resource temporarily unavailable 3bfd847203c changed the lookup to use the passed in table but for cases like this the nexthop is in the local table rather than the passed in table. Fixes: 3bfd847203c ("net: Use passed in table for nexthop lookups") Reported-by: Andreas Schultz Signed-off-by: David Ahern Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 8fdf6c26a012..01f1c7dcd329 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -712,9 +712,16 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, err = fib_table_lookup(tbl, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE | FIB_LOOKUP_NOREF); - else + + /* on error or if no table given do full lookup. This + * is needed for example when nexthops are in the local + * table rather than the given table + */ + if (!tbl || err) { err = fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE); + } + if (err) { rcu_read_unlock(); return err; -- cgit v1.2.3 From c1ea5d672aaff08da337dee735dbb548e3415585 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:23 +0200 Subject: ip_tunnels: add IPv6 addresses to ip_tunnel_key Add the IPv6 addresses as an union with IPv4 ones. When using IPv4, the newly introduced padding after the IPv4 addresses needs to be zeroed out. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 6 +++--- include/net/ip_tunnels.h | 24 ++++++++++++++++++++---- net/core/filter.c | 4 ++-- net/ipv4/ip_gre.c | 10 +++++----- net/ipv4/ip_tunnel_core.c | 8 ++++---- net/openvswitch/flow_netlink.c | 18 +++++++++--------- net/openvswitch/flow_table.c | 2 +- net/openvswitch/vport-geneve.c | 2 +- net/openvswitch/vport.c | 2 +- net/openvswitch/vport.h | 4 ++-- 10 files changed, 48 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ad51dac88d19..30a7abcf2c09 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1276,8 +1276,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) goto drop; info = &tun_dst->u.tun_info; - info->key.ipv4_src = iph->saddr; - info->key.ipv4_dst = iph->daddr; + info->key.u.ipv4.src = iph->saddr; + info->key.u.ipv4.dst = iph->daddr; info->key.ipv4_tos = iph->tos; info->key.ipv4_ttl = iph->ttl; info->key.tp_src = udp_hdr(skb)->source; @@ -1925,7 +1925,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; vni = be64_to_cpu(info->key.tun_id); remote_ip.sin.sin_family = AF_INET; - remote_ip.sin.sin_addr.s_addr = info->key.ipv4_dst; + remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst; dst = &remote_ip; } diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index cc3b39e9010b..6a51371dad00 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -25,10 +25,24 @@ /* Used to memset ip_tunnel padding. */ #define IP_TUNNEL_KEY_SIZE offsetofend(struct ip_tunnel_key, tp_dst) +/* Used to memset ipv4 address padding. */ +#define IP_TUNNEL_KEY_IPV4_PAD offsetofend(struct ip_tunnel_key, u.ipv4.dst) +#define IP_TUNNEL_KEY_IPV4_PAD_LEN \ + (FIELD_SIZEOF(struct ip_tunnel_key, u) - \ + FIELD_SIZEOF(struct ip_tunnel_key, u.ipv4)) + struct ip_tunnel_key { __be64 tun_id; - __be32 ipv4_src; - __be32 ipv4_dst; + union { + struct { + __be32 src; + __be32 dst; + } ipv4; + struct { + struct in6_addr src; + struct in6_addr dst; + } ipv6; + } u; __be16 tun_flags; u8 ipv4_tos; u8 ipv4_ttl; @@ -177,8 +191,10 @@ static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info, const void *opts, u8 opts_len) { tun_info->key.tun_id = tun_id; - tun_info->key.ipv4_src = saddr; - tun_info->key.ipv4_dst = daddr; + tun_info->key.u.ipv4.src = saddr; + tun_info->key.u.ipv4.dst = daddr; + memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_IPV4_PAD, + 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); tun_info->key.ipv4_tos = tos; tun_info->key.ipv4_ttl = ttl; tun_info->key.tun_flags = tun_flags; diff --git a/net/core/filter.c b/net/core/filter.c index 83f08cefeab7..379568562ffb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1495,7 +1495,7 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) return -EINVAL; to->tunnel_id = be64_to_cpu(info->key.tun_id); - to->remote_ipv4 = be32_to_cpu(info->key.ipv4_src); + to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); return 0; } @@ -1529,7 +1529,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info = &md->u.tun_info; info->mode = IP_TUNNEL_INFO_TX; info->key.tun_id = cpu_to_be64(from->tunnel_id); - info->key.ipv4_dst = cpu_to_be32(from->remote_ipv4); + info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); return 0; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index fb44d693796e..b7bb7d6aa7a8 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -407,8 +407,8 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) return PACKET_REJECT; info = &tun_dst->u.tun_info; - info->key.ipv4_src = iph->saddr; - info->key.ipv4_dst = iph->daddr; + info->key.u.ipv4.src = iph->saddr; + info->key.u.ipv4.dst = iph->daddr; info->key.ipv4_tos = iph->tos; info->key.ipv4_ttl = iph->ttl; @@ -527,8 +527,8 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) key = &tun_info->key; memset(&fl, 0, sizeof(fl)); - fl.daddr = key->ipv4_dst; - fl.saddr = key->ipv4_src; + fl.daddr = key->u.ipv4.dst; + fl.saddr = key->u.ipv4.src; fl.flowi4_tos = RT_TOS(key->ipv4_tos); fl.flowi4_mark = skb->mark; fl.flowi4_proto = IPPROTO_GRE; @@ -564,7 +564,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr, - key->ipv4_dst, IPPROTO_GRE, + key->u.ipv4.dst, IPPROTO_GRE, key->ipv4_tos, key->ipv4_ttl, df, false); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 1c2389d582a6..93907d71cda6 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -227,10 +227,10 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP_ID]); if (tb[LWTUNNEL_IP_DST]) - tun_info->key.ipv4_dst = nla_get_be32(tb[LWTUNNEL_IP_DST]); + tun_info->key.u.ipv4.dst = nla_get_be32(tb[LWTUNNEL_IP_DST]); if (tb[LWTUNNEL_IP_SRC]) - tun_info->key.ipv4_src = nla_get_be32(tb[LWTUNNEL_IP_SRC]); + tun_info->key.u.ipv4.src = nla_get_be32(tb[LWTUNNEL_IP_SRC]); if (tb[LWTUNNEL_IP_TTL]) tun_info->key.ipv4_ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]); @@ -262,8 +262,8 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); if (nla_put_u64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) || - nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.ipv4_dst) || - nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.ipv4_src) || + nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) || + nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.ipv4_tos) || nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ipv4_ttl) || nla_put_u16(skb, LWTUNNEL_IP_SPORT, tun_info->key.tp_src) || diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index a6eb77ab1a64..a7f866374817 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -534,11 +534,11 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, tun_flags |= TUNNEL_KEY; break; case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_src, + SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.src, nla_get_in_addr(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_IPV4_DST: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_dst, + SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.dst, nla_get_in_addr(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_TOS: @@ -609,7 +609,7 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, } if (!is_mask) { - if (!match->key->tun_key.ipv4_dst) { + if (!match->key->tun_key.u.ipv4.dst) { OVS_NLERR(log, "IPv4 tunnel dst address is zero"); return -EINVAL; } @@ -647,13 +647,13 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, if (output->tun_flags & TUNNEL_KEY && nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id)) return -EMSGSIZE; - if (output->ipv4_src && + if (output->u.ipv4.src && nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, - output->ipv4_src)) + output->u.ipv4.src)) return -EMSGSIZE; - if (output->ipv4_dst && + if (output->u.ipv4.dst && nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, - output->ipv4_dst)) + output->u.ipv4.dst)) return -EMSGSIZE; if (output->ipv4_tos && nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) @@ -1116,7 +1116,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, /* The userspace does not send tunnel attributes that * are 0, but we should not wildcard them nonetheless. */ - if (match->key->tun_key.ipv4_dst) + if (match->key->tun_key.u.ipv4.dst) SW_FLOW_KEY_MEMSET_FIELD(match, tun_key, 0xff, true); @@ -1287,7 +1287,7 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) goto nla_put_failure; - if ((swkey->tun_key.ipv4_dst || is_mask)) { + if ((swkey->tun_key.u.ipv4.dst || is_mask)) { const void *opts = NULL; if (output->tun_key.tun_flags & TUNNEL_OPTIONS_PRESENT) diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 3a9d1dde76ed..d22d8e948d0f 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -426,7 +426,7 @@ static u32 flow_hash(const struct sw_flow_key *key, static int flow_key_start(const struct sw_flow_key *key) { - if (key->tun_key.ipv4_dst) + if (key->tun_key.u.ipv4.dst) return 0; else return rounddown(offsetof(struct sw_flow_key, phy), diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 1da3a14d1010..023813d05f88 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -203,7 +203,7 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) } err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr, - tun_key->ipv4_dst, tun_key->ipv4_tos, + tun_key->u.ipv4.dst, tun_key->ipv4_tos, tun_key->ipv4_ttl, df, sport, dport, tun_key->tun_flags, vni, opts_len, opts, !!(tun_key->tun_flags & TUNNEL_CSUM), false); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index d14f59403c5e..a06adc72a58d 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -603,7 +603,7 @@ int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, * saddr, tp_src and tp_dst */ __ip_tunnel_info_init(egress_tun_info, - fl.saddr, tun_key->ipv4_dst, + fl.saddr, tun_key->u.ipv4.dst, tun_key->ipv4_tos, tun_key->ipv4_ttl, tp_src, tp_dst, diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 1a689c28b5a6..43d8f5a835cb 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -254,8 +254,8 @@ static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, struct rtable *rt; memset(fl, 0, sizeof(*fl)); - fl->daddr = key->ipv4_dst; - fl->saddr = key->ipv4_src; + fl->daddr = key->u.ipv4.dst; + fl->saddr = key->u.ipv4.src; fl->flowi4_tos = RT_TOS(key->ipv4_tos); fl->flowi4_mark = mark; fl->flowi4_proto = protocol; -- cgit v1.2.3 From 7c383fb2254c44e096427470da6a36380169b548 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:24 +0200 Subject: ip_tunnels: use tos and ttl fields also for IPv6 Rename the ipv4_tos and ipv4_ttl fields to just 'tos' and 'ttl', as they'll be used with IPv6 tunnels, too. Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 8 ++++---- include/net/ip_tunnels.h | 8 ++++---- net/ipv4/ip_gre.c | 8 ++++---- net/ipv4/ip_tunnel_core.c | 8 ++++---- net/openvswitch/flow_netlink.c | 10 +++++----- net/openvswitch/vport-geneve.c | 4 ++-- net/openvswitch/vport.c | 4 ++-- net/openvswitch/vport.h | 2 +- 8 files changed, 26 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 30a7abcf2c09..ebeb3def06c5 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1278,8 +1278,8 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) info = &tun_dst->u.tun_info; info->key.u.ipv4.src = iph->saddr; info->key.u.ipv4.dst = iph->daddr; - info->key.ipv4_tos = iph->tos; - info->key.ipv4_ttl = iph->ttl; + info->key.tos = iph->tos; + info->key.ttl = iph->ttl; info->key.tp_src = udp_hdr(skb)->source; info->key.tp_dst = udp_hdr(skb)->dest; @@ -1960,8 +1960,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, else flags &= ~VXLAN_F_UDP_CSUM; - ttl = info->key.ipv4_ttl; - tos = info->key.ipv4_tos; + ttl = info->key.ttl; + tos = info->key.tos; if (info->options_len) md = ip_tunnel_info_opts(info, sizeof(*md)); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 6a51371dad00..224e4ecec91b 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -44,8 +44,8 @@ struct ip_tunnel_key { } ipv6; } u; __be16 tun_flags; - u8 ipv4_tos; - u8 ipv4_ttl; + u8 tos; /* TOS for IPv4, TC for IPv6 */ + u8 ttl; /* TTL for IPv4, HL for IPv6 */ __be16 tp_src; __be16 tp_dst; }; @@ -195,8 +195,8 @@ static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info, tun_info->key.u.ipv4.dst = daddr; memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_IPV4_PAD, 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); - tun_info->key.ipv4_tos = tos; - tun_info->key.ipv4_ttl = ttl; + tun_info->key.tos = tos; + tun_info->key.ttl = ttl; tun_info->key.tun_flags = tun_flags; /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index b7bb7d6aa7a8..5193618b2600 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -409,8 +409,8 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) info = &tun_dst->u.tun_info; info->key.u.ipv4.src = iph->saddr; info->key.u.ipv4.dst = iph->daddr; - info->key.ipv4_tos = iph->tos; - info->key.ipv4_ttl = iph->ttl; + info->key.tos = iph->tos; + info->key.ttl = iph->ttl; info->mode = IP_TUNNEL_INFO_RX; info->key.tun_flags = tpi->flags & @@ -529,7 +529,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) memset(&fl, 0, sizeof(fl)); fl.daddr = key->u.ipv4.dst; fl.saddr = key->u.ipv4.src; - fl.flowi4_tos = RT_TOS(key->ipv4_tos); + fl.flowi4_tos = RT_TOS(key->tos); fl.flowi4_mark = skb->mark; fl.flowi4_proto = IPPROTO_GRE; @@ -565,7 +565,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE, - key->ipv4_tos, key->ipv4_ttl, df, false); + key->tos, key->ttl, df, false); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 93907d71cda6..f0514e39e57c 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -233,10 +233,10 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, tun_info->key.u.ipv4.src = nla_get_be32(tb[LWTUNNEL_IP_SRC]); if (tb[LWTUNNEL_IP_TTL]) - tun_info->key.ipv4_ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]); + tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP_TTL]); if (tb[LWTUNNEL_IP_TOS]) - tun_info->key.ipv4_tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); + tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]); if (tb[LWTUNNEL_IP_SPORT]) tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP_SPORT]); @@ -264,8 +264,8 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb, if (nla_put_u64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) || nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) || nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) || - nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.ipv4_tos) || - nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ipv4_ttl) || + nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) || + nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) || nla_put_u16(skb, LWTUNNEL_IP_SPORT, tun_info->key.tp_src) || nla_put_u16(skb, LWTUNNEL_IP_DPORT, tun_info->key.tp_dst) || nla_put_u16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags)) diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index a7f866374817..4e7a3f7facc2 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -542,11 +542,11 @@ static int ipv4_tun_from_nlattr(const struct nlattr *attr, nla_get_in_addr(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_TOS: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_tos, + SW_FLOW_KEY_PUT(match, tun_key.tos, nla_get_u8(a), is_mask); break; case OVS_TUNNEL_KEY_ATTR_TTL: - SW_FLOW_KEY_PUT(match, tun_key.ipv4_ttl, + SW_FLOW_KEY_PUT(match, tun_key.ttl, nla_get_u8(a), is_mask); ttl = true; break; @@ -655,10 +655,10 @@ static int __ipv4_tun_to_nlattr(struct sk_buff *skb, nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, output->u.ipv4.dst)) return -EMSGSIZE; - if (output->ipv4_tos && - nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->ipv4_tos)) + if (output->tos && + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->tos)) return -EMSGSIZE; - if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ipv4_ttl)) + if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ttl)) return -EMSGSIZE; if ((output->tun_flags & TUNNEL_DONT_FRAGMENT) && nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 023813d05f88..d01bd6360970 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -203,8 +203,8 @@ static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) } err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr, - tun_key->u.ipv4.dst, tun_key->ipv4_tos, - tun_key->ipv4_ttl, df, sport, dport, + tun_key->u.ipv4.dst, tun_key->tos, + tun_key->ttl, df, sport, dport, tun_key->tun_flags, vni, opts_len, opts, !!(tun_key->tun_flags & TUNNEL_CSUM), false); if (err < 0) diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index a06adc72a58d..d73e5a16e7ca 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -604,8 +604,8 @@ int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, */ __ip_tunnel_info_init(egress_tun_info, fl.saddr, tun_key->u.ipv4.dst, - tun_key->ipv4_tos, - tun_key->ipv4_ttl, + tun_key->tos, + tun_key->ttl, tp_src, tp_dst, tun_key->tun_id, tun_key->tun_flags, diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 43d8f5a835cb..b88b3ee86f07 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -256,7 +256,7 @@ static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, memset(fl, 0, sizeof(*fl)); fl->daddr = key->u.ipv4.dst; fl->saddr = key->u.ipv4.src; - fl->flowi4_tos = RT_TOS(key->ipv4_tos); + fl->flowi4_tos = RT_TOS(key->tos); fl->flowi4_mark = mark; fl->flowi4_proto = protocol; -- cgit v1.2.3 From 61adedf3e3f1d3f032c5a6a299978d91eff6d555 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:25 +0200 Subject: route: move lwtunnel state to dst_entry Currently, the lwtunnel state resides in per-protocol data. This is a problem if we encapsulate ipv6 traffic in an ipv4 tunnel (or vice versa). The xmit function of the tunnel does not know whether the packet has been routed to it by ipv4 or ipv6, yet it needs the lwtstate data. Moving the lwtstate data to dst_entry makes such inter-protocol tunneling possible. As a bonus, this brings a nice diffstat. Signed-off-by: Jiri Benc Acked-by: Roopa Prabhu Acked-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vrf.c | 1 - drivers/net/vxlan.c | 4 +-- include/net/dst.h | 3 +- include/net/dst_metadata.h | 15 +++------ include/net/ip6_fib.h | 1 - include/net/lwtunnel.h | 12 -------- include/net/route.h | 1 - net/core/dst.c | 3 ++ net/core/filter.c | 2 +- net/core/lwtunnel.c | 70 ++++++------------------------------------ net/ipv4/ip_gre.c | 2 +- net/ipv4/route.c | 20 +++++------- net/ipv6/ila.c | 14 +++------ net/ipv6/ip6_fib.c | 1 - net/ipv6/route.c | 20 ++++++------ net/mpls/mpls_iptunnel.c | 7 ++--- net/openvswitch/vport-netdev.c | 2 +- 17 files changed, 48 insertions(+), 130 deletions(-) (limited to 'net') diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index dbeffe789185..b3d9c5546c79 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -295,7 +295,6 @@ static struct rtable *vrf_rtable_create(struct net_device *dev) rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); rth->rt_uncached_list = NULL; - rth->rt_lwtstate = NULL; } return rth; diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index ebeb3def06c5..93613ffd8d7e 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1909,7 +1909,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, u32 flags = vxlan->flags; /* FIXME: Support IPv6 */ - info = skb_tunnel_info(skb, AF_INET); + info = skb_tunnel_info(skb); if (rdst) { dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; @@ -2105,7 +2105,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) struct vxlan_fdb *f; /* FIXME: Support IPv6 */ - info = skb_tunnel_info(skb, AF_INET); + info = skb_tunnel_info(skb); skb_reset_mac_header(skb); eth = eth_hdr(skb); diff --git a/include/net/dst.h b/include/net/dst.h index 2578811cef51..0a9a723f6c19 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -44,6 +44,7 @@ struct dst_entry { #else void *__pad1; #endif + struct lwtunnel_state *lwtstate; int (*input)(struct sk_buff *); int (*output)(struct sock *sk, struct sk_buff *skb); @@ -89,7 +90,7 @@ struct dst_entry { * (L1_CACHE_SIZE would be too much) */ #ifdef CONFIG_64BIT - long __pad_to_align_refcnt[2]; + long __pad_to_align_refcnt[1]; #endif /* * __refcnt wants to be on a different cache line from diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 075f523ff23f..2cb52d562272 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -23,22 +23,17 @@ static inline struct metadata_dst *skb_metadata_dst(struct sk_buff *skb) return NULL; } -static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb, - int family) +static inline struct ip_tunnel_info *skb_tunnel_info(struct sk_buff *skb) { struct metadata_dst *md_dst = skb_metadata_dst(skb); - struct rtable *rt; + struct dst_entry *dst; if (md_dst) return &md_dst->u.tun_info; - switch (family) { - case AF_INET: - rt = (struct rtable *)skb_dst(skb); - if (rt && rt->rt_lwtstate) - return lwt_tun_info(rt->rt_lwtstate); - break; - } + dst = skb_dst(skb); + if (dst && dst->lwtstate) + return lwt_tun_info(dst->lwtstate); return NULL; } diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 276328e3daa6..063d30474cf6 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -133,7 +133,6 @@ struct rt6_info { /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; u8 rt6i_protocol; - struct lwtunnel_state *rt6i_lwtstate; }; static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index cfee53916ba5..843489884448 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -87,9 +87,7 @@ int lwtunnel_get_encap_size(struct lwtunnel_state *lwtstate); struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); int lwtunnel_output(struct sock *sk, struct sk_buff *skb); -int lwtunnel_output6(struct sock *sk, struct sk_buff *skb); int lwtunnel_input(struct sk_buff *skb); -int lwtunnel_input6(struct sk_buff *skb); #else @@ -164,21 +162,11 @@ static inline int lwtunnel_output(struct sock *sk, struct sk_buff *skb) return -EOPNOTSUPP; } -static inline int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) -{ - return -EOPNOTSUPP; -} - static inline int lwtunnel_input(struct sk_buff *skb) { return -EOPNOTSUPP; } -static inline int lwtunnel_input6(struct sk_buff *skb) -{ - return -EOPNOTSUPP; -} - #endif #endif /* __NET_LWTUNNEL_H */ diff --git a/include/net/route.h b/include/net/route.h index 6dda2c1bf8c6..395d79bb556c 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -66,7 +66,6 @@ struct rtable { struct list_head rt_uncached; struct uncached_list *rt_uncached_list; - struct lwtunnel_state *rt_lwtstate; }; static inline bool rt_is_input_route(const struct rtable *rt) diff --git a/net/core/dst.c b/net/core/dst.c index f8694d1b8702..50dcdbb0ee46 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -184,6 +185,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, #ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = 0; #endif + dst->lwtstate = NULL; atomic_set(&dst->__refcnt, initial_ref); dst->__use = 0; dst->lastuse = jiffies; @@ -264,6 +266,7 @@ again: kfree(dst); else kmem_cache_free(dst->ops->kmem_cachep, dst); + lwtstate_put(dst->lwtstate); dst = child; if (dst) { diff --git a/net/core/filter.c b/net/core/filter.c index 379568562ffb..b4adc961413f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1489,7 +1489,7 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) { struct sk_buff *skb = (struct sk_buff *) (long) r1; struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; - struct ip_tunnel_info *info = skb_tunnel_info(skb, AF_INET); + struct ip_tunnel_info *info = skb_tunnel_info(skb); if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info)) return -EINVAL; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 3331585174d9..e924c2e08554 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -179,14 +179,16 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b) } EXPORT_SYMBOL(lwtunnel_cmp_encap); -int __lwtunnel_output(struct sock *sk, struct sk_buff *skb, - struct lwtunnel_state *lwtstate) +int lwtunnel_output(struct sock *sk, struct sk_buff *skb) { + struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; int ret = -EINVAL; - if (!lwtstate) + if (!dst) goto drop; + lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) @@ -209,47 +211,18 @@ drop: return ret; } - -int lwtunnel_output6(struct sock *sk, struct sk_buff *skb) -{ - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); - struct lwtunnel_state *lwtstate = NULL; - - if (rt) { - lwtstate = rt->rt6i_lwtstate; - skb->dev = rt->dst.dev; - } - - skb->protocol = htons(ETH_P_IPV6); - - return __lwtunnel_output(sk, skb, lwtstate); -} -EXPORT_SYMBOL(lwtunnel_output6); - -int lwtunnel_output(struct sock *sk, struct sk_buff *skb) -{ - struct rtable *rt = (struct rtable *)skb_dst(skb); - struct lwtunnel_state *lwtstate = NULL; - - if (rt) { - lwtstate = rt->rt_lwtstate; - skb->dev = rt->dst.dev; - } - - skb->protocol = htons(ETH_P_IP); - - return __lwtunnel_output(sk, skb, lwtstate); -} EXPORT_SYMBOL(lwtunnel_output); -int __lwtunnel_input(struct sk_buff *skb, - struct lwtunnel_state *lwtstate) +int lwtunnel_input(struct sk_buff *skb) { + struct dst_entry *dst = skb_dst(skb); const struct lwtunnel_encap_ops *ops; + struct lwtunnel_state *lwtstate; int ret = -EINVAL; - if (!lwtstate) + if (!dst) goto drop; + lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || lwtstate->type > LWTUNNEL_ENCAP_MAX) @@ -272,27 +245,4 @@ drop: return ret; } - -int lwtunnel_input6(struct sk_buff *skb) -{ - struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); - struct lwtunnel_state *lwtstate = NULL; - - if (rt) - lwtstate = rt->rt6i_lwtstate; - - return __lwtunnel_input(skb, lwtstate); -} -EXPORT_SYMBOL(lwtunnel_input6); - -int lwtunnel_input(struct sk_buff *skb) -{ - struct rtable *rt = (struct rtable *)skb_dst(skb); - struct lwtunnel_state *lwtstate = NULL; - - if (rt) - lwtstate = rt->rt_lwtstate; - - return __lwtunnel_input(skb, lwtstate); -} EXPORT_SYMBOL(lwtunnel_input); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 5193618b2600..1bf328182697 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -521,7 +521,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) __be16 df, flags; int err; - tun_info = skb_tunnel_info(skb, AF_INET); + tun_info = skb_tunnel_info(skb); if (unlikely(!tun_info || tun_info->mode != IP_TUNNEL_INFO_TX)) goto err_free_skb; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2403e85107f0..f3087aaa6dd8 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1359,7 +1359,6 @@ static void ipv4_dst_destroy(struct dst_entry *dst) list_del(&rt->rt_uncached); spin_unlock_bh(&ul->lock); } - lwtstate_put(rt->rt_lwtstate); } void rt_flush_dev(struct net_device *dev) @@ -1408,7 +1407,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif - rt->rt_lwtstate = lwtstate_get(nh->nh_lwtstate); + rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate); if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr); else if (!(rt->dst.flags & DST_NOCACHE)) @@ -1494,7 +1493,6 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); - rth->rt_lwtstate = NULL; if (our) { rth->dst.input= ip_local_deliver; rth->rt_flags |= RTCF_LOCAL; @@ -1624,19 +1622,18 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); - rth->rt_lwtstate = NULL; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; rth->dst.output = ip_output; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag); - if (lwtunnel_output_redirect(rth->rt_lwtstate)) { - rth->rt_lwtstate->orig_output = rth->dst.output; + if (lwtunnel_output_redirect(rth->dst.lwtstate)) { + rth->dst.lwtstate->orig_output = rth->dst.output; rth->dst.output = lwtunnel_output; } - if (lwtunnel_input_redirect(rth->rt_lwtstate)) { - rth->rt_lwtstate->orig_input = rth->dst.input; + if (lwtunnel_input_redirect(rth->dst.lwtstate)) { + rth->dst.lwtstate->orig_input = rth->dst.input; rth->dst.input = lwtunnel_input; } skb_dst_set(skb, &rth->dst); @@ -1695,7 +1692,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, by fib_lookup. */ - tun_info = skb_tunnel_info(skb, AF_INET); + tun_info = skb_tunnel_info(skb); if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX) fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; else @@ -1815,7 +1812,6 @@ local_input: rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); - rth->rt_lwtstate = NULL; RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { @@ -2006,7 +2002,6 @@ add: rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); - rth->rt_lwtstate = NULL; RT_CACHE_STAT_INC(out_slow_tot); if (flags & RTCF_LOCAL) @@ -2029,7 +2024,7 @@ add: } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0); - if (lwtunnel_output_redirect(rth->rt_lwtstate)) + if (lwtunnel_output_redirect(rth->dst.lwtstate)) rth->dst.output = lwtunnel_output; return rth; @@ -2293,7 +2288,6 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_uses_gateway = ort->rt_uses_gateway; INIT_LIST_HEAD(&rt->rt_uncached); - rt->rt_lwtstate = NULL; dst_free(new); } diff --git a/net/ipv6/ila.c b/net/ipv6/ila.c index 2540ab4b76d1..f011c3d5ca40 100644 --- a/net/ipv6/ila.c +++ b/net/ipv6/ila.c @@ -89,16 +89,13 @@ static void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p) static int ila_output(struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct rt6_info *rt6 = NULL; if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - rt6 = (struct rt6_info *)dst; + update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); - update_ipv6_locator(skb, ila_params_lwtunnel(rt6->rt6i_lwtstate)); - - return rt6->rt6i_lwtstate->orig_output(sk, skb); + return dst->lwtstate->orig_output(sk, skb); drop: kfree_skb(skb); @@ -108,16 +105,13 @@ drop: static int ila_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct rt6_info *rt6 = NULL; if (skb->protocol != htons(ETH_P_IPV6)) goto drop; - rt6 = (struct rt6_info *)dst; - - update_ipv6_locator(skb, ila_params_lwtunnel(rt6->rt6i_lwtstate)); + update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate)); - return rt6->rt6i_lwtstate->orig_input(skb); + return dst->lwtstate->orig_input(skb); drop: kfree_skb(skb); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 5693b5eb8482..865e777ae20c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -178,7 +178,6 @@ static void rt6_free_pcpu(struct rt6_info *non_pcpu_rt) static void rt6_release(struct rt6_info *rt) { if (atomic_dec_and_test(&rt->rt6i_ref)) { - lwtstate_put(rt->rt6i_lwtstate); rt6_free_pcpu(rt); dst_free(&rt->dst); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c3733049715e..e6bbcdee7707 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1784,14 +1784,14 @@ int ip6_route_add(struct fib6_config *cfg) cfg->fc_encap, &lwtstate); if (err) goto out; - rt->rt6i_lwtstate = lwtstate_get(lwtstate); - if (lwtunnel_output_redirect(rt->rt6i_lwtstate)) { - rt->rt6i_lwtstate->orig_output = rt->dst.output; - rt->dst.output = lwtunnel_output6; + rt->dst.lwtstate = lwtstate_get(lwtstate); + if (lwtunnel_output_redirect(rt->dst.lwtstate)) { + rt->dst.lwtstate->orig_output = rt->dst.output; + rt->dst.output = lwtunnel_output; } - if (lwtunnel_input_redirect(rt->rt6i_lwtstate)) { - rt->rt6i_lwtstate->orig_input = rt->dst.input; - rt->dst.input = lwtunnel_input6; + if (lwtunnel_input_redirect(rt->dst.lwtstate)) { + rt->dst.lwtstate->orig_input = rt->dst.input; + rt->dst.input = lwtunnel_input; } } @@ -2174,7 +2174,7 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort) #endif rt->rt6i_prefsrc = ort->rt6i_prefsrc; rt->rt6i_table = ort->rt6i_table; - rt->rt6i_lwtstate = lwtstate_get(ort->rt6i_lwtstate); + rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate); } #ifdef CONFIG_IPV6_ROUTE_INFO @@ -2838,7 +2838,7 @@ static inline size_t rt6_nlmsg_size(struct rt6_info *rt) + nla_total_size(sizeof(struct rta_cacheinfo)) + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */ + nla_total_size(1) /* RTA_PREF */ - + lwtunnel_get_encap_size(rt->rt6i_lwtstate); + + lwtunnel_get_encap_size(rt->dst.lwtstate); } static int rt6_fill_node(struct net *net, @@ -2991,7 +2991,7 @@ static int rt6_fill_node(struct net *net, if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags))) goto nla_put_failure; - lwtunnel_fill_encap(skb, rt->rt6i_lwtstate); + lwtunnel_fill_encap(skb, rt->dst.lwtstate); nlmsg_end(skb, nlh); return 0; diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 276f8c992218..3da5ca3ba563 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -48,7 +48,6 @@ int mpls_output(struct sock *sk, struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct rtable *rt = NULL; struct rt6_info *rt6 = NULL; - struct lwtunnel_state *lwtstate = NULL; int err = 0; bool bos; int i; @@ -58,11 +57,9 @@ int mpls_output(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) { ttl = ip_hdr(skb)->ttl; rt = (struct rtable *)dst; - lwtstate = rt->rt_lwtstate; } else if (skb->protocol == htons(ETH_P_IPV6)) { ttl = ipv6_hdr(skb)->hop_limit; rt6 = (struct rt6_info *)dst; - lwtstate = rt6->rt6i_lwtstate; } else { goto drop; } @@ -72,12 +69,12 @@ int mpls_output(struct sock *sk, struct sk_buff *skb) /* Find the output device */ out_dev = dst->dev; if (!mpls_output_possible(out_dev) || - !lwtstate || skb_warn_if_lro(skb)) + !dst->lwtstate || skb_warn_if_lro(skb)) goto drop; skb_forward_csum(skb); - tun_encap_info = mpls_lwtunnel_encap(lwtstate); + tun_encap_info = mpls_lwtunnel_encap(dst->lwtstate); /* Verify the destination can hold the packet */ new_header_size = mpls_encap_size(tun_encap_info); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 4b70aaa4a746..a75011505039 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -57,7 +57,7 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) skb_push(skb, ETH_HLEN); ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); - ovs_vport_receive(vport, skb, skb_tunnel_info(skb, AF_INET)); + ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; error: -- cgit v1.2.3 From 06e9d040ba08b0f645783ff958384d5837b3fa3a Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:26 +0200 Subject: ipv6: drop metadata dst in ip6_route_input The fix in commit 48fb6b554501 is incomplete, as now ip6_route_input can be called with non-NULL dst if it's a metadata dst and the reference is leaked. Drop the reference. Fixes: 48fb6b554501 ("ipv6: fix crash over flow-based vxlan device") Fixes: ee122c79d422 ("vxlan: Flow based tunneling") CC: Wei-Chun Chao CC: Thomas Graf Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv6/route.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e6bbcdee7707..0947ad0b3de8 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1140,6 +1140,7 @@ void ip6_route_input(struct sk_buff *skb) .flowi6_proto = iph->nexthdr, }; + skb_dst_drop(skb); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); } -- cgit v1.2.3 From ab450605b35caa768ca33e86db9403229bf42be4 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:27 +0200 Subject: ipv6: ndisc: inherit metadata dst when creating ndisc requests If output device wants to see the dst, inherit the dst of the original skb in the ndisc request. This is an IPv6 counterpart of commit 0accfc268f4d ("arp: Inherit metadata dst when creating ARP requests"). Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/ndisc.h | 3 ++- net/ipv6/addrconf.c | 2 +- net/ipv6/ndisc.c | 10 +++++++--- net/ipv6/route.c | 2 +- 4 files changed, 11 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index b3a7751251b4..aba5695fadb0 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -182,7 +182,8 @@ int ndisc_rcv(struct sk_buff *skb); void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, const struct in6_addr *solicit, - const struct in6_addr *daddr, const struct in6_addr *saddr); + const struct in6_addr *daddr, const struct in6_addr *saddr, + struct sk_buff *oskb); void ndisc_send_rs(struct net_device *dev, const struct in6_addr *saddr, const struct in6_addr *daddr); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 59242399b0b5..0f08d3b9e238 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3656,7 +3656,7 @@ static void addrconf_dad_work(struct work_struct *w) /* send a neighbour solicitation for our addr */ addrconf_addr_solict_mult(&ifp->addr, &mcaddr); - ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any); + ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &in6addr_any, NULL); out: in6_ifa_put(ifp); rtnl_unlock(); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index b3054611f88a..13d3c2beb93e 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -553,7 +553,8 @@ static void ndisc_send_unsol_na(struct net_device *dev) void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, const struct in6_addr *solicit, - const struct in6_addr *daddr, const struct in6_addr *saddr) + const struct in6_addr *daddr, const struct in6_addr *saddr, + struct sk_buff *oskb) { struct sk_buff *skb; struct in6_addr addr_buf; @@ -589,6 +590,9 @@ void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr); + if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE) && oskb) + skb_dst_copy(skb, oskb); + ndisc_send_skb(skb, daddr, saddr); } @@ -675,12 +679,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) "%s: trying to ucast probe in NUD_INVALID: %pI6\n", __func__, target); } - ndisc_send_ns(dev, neigh, target, target, saddr); + ndisc_send_ns(dev, neigh, target, target, saddr, skb); } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { neigh_app_ns(neigh); } else { addrconf_addr_solict_mult(target, &mcaddr); - ndisc_send_ns(dev, NULL, target, &mcaddr, saddr); + ndisc_send_ns(dev, NULL, target, &mcaddr, saddr, skb); } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0947ad0b3de8..c4f3b9fcca9d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -538,7 +538,7 @@ static void rt6_probe_deferred(struct work_struct *w) container_of(w, struct __rt6_probe_work, work); addrconf_addr_solict_mult(&work->target, &mcaddr); - ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL); + ndisc_send_ns(work->dev, NULL, &work->target, &mcaddr, NULL, NULL); dev_put(work->dev); kfree(work); } -- cgit v1.2.3 From 904af04d30f303d96902584206457128c3051d8d Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:31 +0200 Subject: ipv6: route: extend flow representation with tunnel key Use flowi_tunnel in flowi6 similarly to what is done with IPv4. This complements commit 1b7179d3adff ("route: Extend flow representation with tunnel key"). Signed-off-by: Jiri Benc Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/flow.h | 1 + net/ipv6/route.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'net') diff --git a/include/net/flow.h b/include/net/flow.h index f305588fc162..9e0297c4c11d 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -130,6 +130,7 @@ struct flowi6 { #define flowi6_proto __fl_common.flowic_proto #define flowi6_flags __fl_common.flowic_flags #define flowi6_secid __fl_common.flowic_secid +#define flowi6_tun_key __fl_common.flowic_tun_key struct in6_addr daddr; struct in6_addr saddr; __be32 flowlabel; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c4f3b9fcca9d..6c0fe4c7ce8d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -54,11 +54,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include @@ -1131,6 +1133,7 @@ void ip6_route_input(struct sk_buff *skb) const struct ipv6hdr *iph = ipv6_hdr(skb); struct net *net = dev_net(skb->dev); int flags = RT6_LOOKUP_F_HAS_SADDR; + struct ip_tunnel_info *tun_info; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .daddr = iph->daddr, @@ -1140,6 +1143,9 @@ void ip6_route_input(struct sk_buff *skb) .flowi6_proto = iph->nexthdr, }; + tun_info = skb_tunnel_info(skb); + if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX) + fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; skb_dst_drop(skb); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); } -- cgit v1.2.3 From 32a2b002ce615eadd3bfaddabde290f70a1dd17b Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 20 Aug 2015 13:56:32 +0200 Subject: ipv6: route: per route IP tunnel metadata via lightweight tunnel Allow specification of per route IP tunnel instructions also for IPv6. This complements commit 3093fbe7ff4b ("route: Per route IP tunnel metadata via lightweight tunnel"). Signed-off-by: Jiri Benc CC: YOSHIFUJI Hideaki Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/uapi/linux/lwtunnel.h | 16 +++++++ net/ipv4/ip_tunnel_core.c | 102 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) (limited to 'net') diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index aa84ca396bcb..34141a5dfe74 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -8,6 +8,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_MPLS, LWTUNNEL_ENCAP_IP, LWTUNNEL_ENCAP_ILA, + LWTUNNEL_ENCAP_IP6, __LWTUNNEL_ENCAP_MAX, }; @@ -28,4 +29,19 @@ enum lwtunnel_ip_t { #define LWTUNNEL_IP_MAX (__LWTUNNEL_IP_MAX - 1) +enum lwtunnel_ip6_t { + LWTUNNEL_IP6_UNSPEC, + LWTUNNEL_IP6_ID, + LWTUNNEL_IP6_DST, + LWTUNNEL_IP6_SRC, + LWTUNNEL_IP6_HOPLIMIT, + LWTUNNEL_IP6_TC, + LWTUNNEL_IP6_SPORT, + LWTUNNEL_IP6_DPORT, + LWTUNNEL_IP6_FLAGS, + __LWTUNNEL_IP6_MAX, +}; + +#define LWTUNNEL_IP6_MAX (__LWTUNNEL_IP6_MAX - 1) + #endif /* _UAPI_LWTUNNEL_H_ */ diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index f0514e39e57c..289b6c26ce37 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -299,9 +299,111 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = { .cmp_encap = ip_tun_cmp_encap, }; +static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { + [LWTUNNEL_IP6_ID] = { .type = NLA_U64 }, + [LWTUNNEL_IP6_DST] = { .len = sizeof(struct in6_addr) }, + [LWTUNNEL_IP6_SRC] = { .len = sizeof(struct in6_addr) }, + [LWTUNNEL_IP6_HOPLIMIT] = { .type = NLA_U8 }, + [LWTUNNEL_IP6_TC] = { .type = NLA_U8 }, + [LWTUNNEL_IP6_SPORT] = { .type = NLA_U16 }, + [LWTUNNEL_IP6_DPORT] = { .type = NLA_U16 }, + [LWTUNNEL_IP6_FLAGS] = { .type = NLA_U16 }, +}; + +static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, + struct lwtunnel_state **ts) +{ + struct ip_tunnel_info *tun_info; + struct lwtunnel_state *new_state; + struct nlattr *tb[LWTUNNEL_IP6_MAX + 1]; + int err; + + err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy); + if (err < 0) + return err; + + new_state = lwtunnel_state_alloc(sizeof(*tun_info)); + if (!new_state) + return -ENOMEM; + + new_state->type = LWTUNNEL_ENCAP_IP6; + + tun_info = lwt_tun_info(new_state); + + if (tb[LWTUNNEL_IP6_ID]) + tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP6_ID]); + + if (tb[LWTUNNEL_IP6_DST]) + tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]); + + if (tb[LWTUNNEL_IP6_SRC]) + tun_info->key.u.ipv6.src = nla_get_in6_addr(tb[LWTUNNEL_IP6_SRC]); + + if (tb[LWTUNNEL_IP6_HOPLIMIT]) + tun_info->key.ttl = nla_get_u8(tb[LWTUNNEL_IP6_HOPLIMIT]); + + if (tb[LWTUNNEL_IP6_TC]) + tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]); + + if (tb[LWTUNNEL_IP6_SPORT]) + tun_info->key.tp_src = nla_get_be16(tb[LWTUNNEL_IP6_SPORT]); + + if (tb[LWTUNNEL_IP6_DPORT]) + tun_info->key.tp_dst = nla_get_be16(tb[LWTUNNEL_IP6_DPORT]); + + if (tb[LWTUNNEL_IP6_FLAGS]) + tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]); + + tun_info->mode = IP_TUNNEL_INFO_TX; + tun_info->options = NULL; + tun_info->options_len = 0; + + *ts = new_state; + + return 0; +} + +static int ip6_tun_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwtstate) +{ + struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate); + + if (nla_put_u64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) || + nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) || + nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) || + nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) || + nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.ttl) || + nla_put_u16(skb, LWTUNNEL_IP6_SPORT, tun_info->key.tp_src) || + nla_put_u16(skb, LWTUNNEL_IP6_DPORT, tun_info->key.tp_dst) || + nla_put_u16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags)) + return -ENOMEM; + + return 0; +} + +static int ip6_tun_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + return nla_total_size(8) /* LWTUNNEL_IP6_ID */ + + nla_total_size(16) /* LWTUNNEL_IP6_DST */ + + nla_total_size(16) /* LWTUNNEL_IP6_SRC */ + + nla_total_size(1) /* LWTUNNEL_IP6_HOPLIMIT */ + + nla_total_size(1) /* LWTUNNEL_IP6_TC */ + + nla_total_size(2) /* LWTUNNEL_IP6_SPORT */ + + nla_total_size(2) /* LWTUNNEL_IP6_DPORT */ + + nla_total_size(2); /* LWTUNNEL_IP6_FLAGS */ +} + +static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = { + .build_state = ip6_tun_build_state, + .fill_encap = ip6_tun_fill_encap_info, + .get_encap_size = ip6_tun_encap_nlsize, + .cmp_encap = ip_tun_cmp_encap, +}; + void __init ip_tunnel_core_init(void) { lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP); + lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6); } struct static_key ip_tunnel_metadata_cnt = STATIC_KEY_INIT_FALSE; -- cgit v1.2.3 From eefa32d3f3c54bc7f9704968bc78adf0439c6c2a Mon Sep 17 00:00:00 2001 From: Raducu Deaconu Date: Fri, 17 Jul 2015 08:45:40 +0300 Subject: ipvs: Add ovf scheduler The weighted overflow scheduling algorithm directs network connections to the server with the highest weight that is currently available and overflows to the next when active connections exceed the node's weight. Signed-off-by: Raducu Deaconu Acked-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/Kconfig | 11 ++++++ net/netfilter/ipvs/Makefile | 1 + net/netfilter/ipvs/ip_vs_ovf.c | 86 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+) create mode 100644 net/netfilter/ipvs/ip_vs_ovf.c (limited to 'net') diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 3b6929dec748..b32fb0dbe237 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -162,6 +162,17 @@ config IP_VS_FO If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +config IP_VS_OVF + tristate "weighted overflow scheduling" + ---help--- + The weighted overflow scheduling algorithm directs network + connections to the server with the highest weight that is + currently available and overflows to the next when active + connections exceed the node's weight. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + config IP_VS_LBLC tristate "locality-based least-connection scheduling" ---help--- diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index 38b2723b2e3d..67f3f4389602 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o obj-$(CONFIG_IP_VS_FO) += ip_vs_fo.o +obj-$(CONFIG_IP_VS_OVF) += ip_vs_ovf.o obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c new file mode 100644 index 000000000000..f7d62c3b7329 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_ovf.c @@ -0,0 +1,86 @@ +/* + * IPVS: Overflow-Connection Scheduling module + * + * Authors: Raducu Deaconu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Scheduler implements "overflow" loadbalancing according to number of active + * connections , will keep all conections to the node with the highest weight + * and overflow to the next node if the number of connections exceeds the node's + * weight. + * Note that this scheduler might not be suitable for UDP because it only uses + * active connections + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include + +/* OVF Connection scheduling */ +static struct ip_vs_dest * +ip_vs_ovf_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, + struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest, *h = NULL; + int hw = 0, w; + + IP_VS_DBG(6, "ip_vs_ovf_schedule(): Scheduling...\n"); + /* select the node with highest weight, go to next in line if active + * connections exceed weight + */ + list_for_each_entry_rcu(dest, &svc->destinations, n_list) { + w = atomic_read(&dest->weight); + if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || + atomic_read(&dest->activeconns) > w || + w == 0) + continue; + if (!h || w > hw) { + h = dest; + hw = w; + } + } + + if (h) { + IP_VS_DBG_BUF(6, "OVF: server %s:%u active %d w %d\n", + IP_VS_DBG_ADDR(h->af, &h->addr), + ntohs(h->port), + atomic_read(&h->activeconns), + atomic_read(&h->weight)); + return h; + } + + ip_vs_scheduler_err(svc, "no destination available"); + return NULL; +} + +static struct ip_vs_scheduler ip_vs_ovf_scheduler = { + .name = "ovf", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_ovf_scheduler.n_list), + .schedule = ip_vs_ovf_schedule, +}; + +static int __init ip_vs_ovf_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_ovf_scheduler); +} + +static void __exit ip_vs_ovf_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_ovf_scheduler); + synchronize_rcu(); +} + +module_init(ip_vs_ovf_init); +module_exit(ip_vs_ovf_cleanup); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From e0b26cc997d57305b4097711e12e13992580ae34 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 26 Jul 2015 14:57:34 +0300 Subject: ipvs: call rtnl_lock early When the sync damon is started we need to hold rtnl lock while calling ip_mc_join_group. Currently, we have a wrong locking order because the correct one is rtnl_lock->__ip_vs_mutex. It is implied from the usage of __ip_vs_mutex in ip_vs_dst_event() which is called under rtnl lock during NETDEV_* notifications. Fix the problem by calling rtnl_lock early only for the start_sync_thread call. As a bonus this fixes the usage __dev_get_by_name which was not called under rtnl lock. This patch actually extends and depends on commit 54ff9ef36bdf ("ipv4, ipv6: kill ip_mc_{join, leave}_group and ipv6_sock_mc_{join, drop}"). Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_ctl.c | 50 +++++++++++++++++++++++++++-------------- net/netfilter/ipvs/ip_vs_sync.c | 2 -- 2 files changed, 33 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 24c554201a76..af0b69e411b7 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2335,13 +2335,18 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) cmd == IP_VS_SO_SET_STOPDAEMON) { struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - mutex_lock(&ipvs->sync_mutex); - if (cmd == IP_VS_SO_SET_STARTDAEMON) + if (cmd == IP_VS_SO_SET_STARTDAEMON) { + rtnl_lock(); + mutex_lock(&ipvs->sync_mutex); ret = start_sync_thread(net, dm->state, dm->mcast_ifn, dm->syncid); - else + mutex_unlock(&ipvs->sync_mutex); + rtnl_unlock(); + } else { + mutex_lock(&ipvs->sync_mutex); ret = stop_sync_thread(net, dm->state); - mutex_unlock(&ipvs->sync_mutex); + mutex_unlock(&ipvs->sync_mutex); + } goto out_dec; } @@ -3342,6 +3347,9 @@ nla_put_failure: static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) { + struct netns_ipvs *ipvs = net_ipvs(net); + int ret; + if (!(attrs[IPVS_DAEMON_ATTR_STATE] && attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && attrs[IPVS_DAEMON_ATTR_SYNC_ID])) @@ -3353,19 +3361,30 @@ static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) if (net_ipvs(net)->mixed_address_family_dests > 0) return -EINVAL; - return start_sync_thread(net, - nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), - nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), - nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); + rtnl_lock(); + mutex_lock(&ipvs->sync_mutex); + ret = start_sync_thread(net, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), + nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), + nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); + mutex_unlock(&ipvs->sync_mutex); + rtnl_unlock(); + return ret; } static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs) { + struct netns_ipvs *ipvs = net_ipvs(net); + int ret; + if (!attrs[IPVS_DAEMON_ATTR_STATE]) return -EINVAL; - return stop_sync_thread(net, - nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); + mutex_lock(&ipvs->sync_mutex); + ret = stop_sync_thread(net, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); + mutex_unlock(&ipvs->sync_mutex); + return ret; } static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs) @@ -3389,7 +3408,7 @@ static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs) static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) { - int ret = 0, cmd; + int ret = -EINVAL, cmd; struct net *net; struct netns_ipvs *ipvs; @@ -3400,22 +3419,19 @@ static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) { struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; - mutex_lock(&ipvs->sync_mutex); if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], - ip_vs_daemon_policy)) { - ret = -EINVAL; + ip_vs_daemon_policy)) goto out; - } if (cmd == IPVS_CMD_NEW_DAEMON) ret = ip_vs_genl_new_daemon(net, daemon_attrs); else ret = ip_vs_genl_del_daemon(net, daemon_attrs); -out: - mutex_unlock(&ipvs->sync_mutex); } + +out: return ret; } diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index d99ad93eb855..6bc6dca9bca8 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1405,11 +1405,9 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) mreq.imr_ifindex = dev->ifindex; - rtnl_lock(); lock_sock(sk); ret = ip_mc_join_group(sk, &mreq); release_sock(sk); - rtnl_unlock(); return ret; } -- cgit v1.2.3 From e4ff67513096e6e196ca58043fce04d0f87babbe Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 26 Jul 2015 15:03:27 +0300 Subject: ipvs: add sync_maxlen parameter for the sync daemon Allow setups with large MTU to send large sync packets by adding sync_maxlen parameter. The default value is now based on MTU but no more than 1500 for compatibility reasons. To avoid problems if MTU changes allow fragmentation by sending packets with DF=0. Problem reported by Dan Carpenter. Reported-by: Dan Carpenter Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 19 +++--- include/uapi/linux/ip_vs.h | 1 + net/netfilter/ipvs/ip_vs_ctl.c | 53 ++++++++++------ net/netfilter/ipvs/ip_vs_sync.c | 137 ++++++++++++++++++---------------------- 4 files changed, 108 insertions(+), 102 deletions(-) (limited to 'net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 4e3731ee4eac..2fdc13caf712 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -846,6 +846,13 @@ struct ipvs_master_sync_state { /* How much time to keep dests in trash */ #define IP_VS_DEST_TRASH_PERIOD (120 * HZ) +struct ipvs_sync_daemon_cfg { + int syncid; + u16 sync_maxlen; + /* multicast interface name */ + char mcast_ifn[IP_VS_IFNAME_MAXLEN]; +}; + /* IPVS in network namespace */ struct netns_ipvs { int gen; /* Generation */ @@ -961,15 +968,10 @@ struct netns_ipvs { spinlock_t sync_buff_lock; struct task_struct **backup_threads; int threads_mask; - int send_mesg_maxlen; - int recv_mesg_maxlen; volatile int sync_state; - volatile int master_syncid; - volatile int backup_syncid; struct mutex sync_mutex; - /* multicast interface name */ - char master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; - char backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; + struct ipvs_sync_daemon_cfg mcfg; /* Master Configuration */ + struct ipvs_sync_daemon_cfg bcfg; /* Backup Configuration */ /* net name space ptr */ struct net *net; /* Needed by timer routines */ /* Number of heterogeneous destinations, needed becaus heterogeneous @@ -1408,7 +1410,8 @@ static inline void ip_vs_dest_put_and_free(struct ip_vs_dest *dest) /* IPVS sync daemon data and function prototypes * (from ip_vs_sync.c) */ -int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid); +int start_sync_thread(struct net *net, struct ipvs_sync_daemon_cfg *cfg, + int state); int stop_sync_thread(struct net *net, int state); void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts); diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index 3199243f2028..68377d8c8870 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -406,6 +406,7 @@ enum { IPVS_DAEMON_ATTR_STATE, /* sync daemon state (master/backup) */ IPVS_DAEMON_ATTR_MCAST_IFN, /* multicast interface name */ IPVS_DAEMON_ATTR_SYNC_ID, /* SyncID we belong to */ + IPVS_DAEMON_ATTR_SYNC_MAXLEN, /* UDP Payload Size */ __IPVS_DAEMON_ATTR_MAX, }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index af0b69e411b7..96f7bbfd5e1d 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2336,10 +2336,15 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; if (cmd == IP_VS_SO_SET_STARTDAEMON) { + struct ipvs_sync_daemon_cfg cfg; + + memset(&cfg, 0, sizeof(cfg)); + strlcpy(cfg.mcast_ifn, dm->mcast_ifn, + sizeof(cfg.mcast_ifn)); + cfg.syncid = dm->syncid; rtnl_lock(); mutex_lock(&ipvs->sync_mutex); - ret = start_sync_thread(net, dm->state, dm->mcast_ifn, - dm->syncid); + ret = start_sync_thread(net, &cfg, dm->state); mutex_unlock(&ipvs->sync_mutex); rtnl_unlock(); } else { @@ -2650,15 +2655,15 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) mutex_lock(&ipvs->sync_mutex); if (ipvs->sync_state & IP_VS_STATE_MASTER) { d[0].state = IP_VS_STATE_MASTER; - strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn, + strlcpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, sizeof(d[0].mcast_ifn)); - d[0].syncid = ipvs->master_syncid; + d[0].syncid = ipvs->mcfg.syncid; } if (ipvs->sync_state & IP_VS_STATE_BACKUP) { d[1].state = IP_VS_STATE_BACKUP; - strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn, + strlcpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, sizeof(d[1].mcast_ifn)); - d[1].syncid = ipvs->backup_syncid; + d[1].syncid = ipvs->bcfg.syncid; } if (copy_to_user(user, &d, sizeof(d)) != 0) ret = -EFAULT; @@ -2813,6 +2818,7 @@ static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, .len = IP_VS_IFNAME_MAXLEN }, [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 }, }; /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ @@ -3271,7 +3277,7 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, } static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, - const char *mcast_ifn, __u32 syncid) + struct ipvs_sync_daemon_cfg *c) { struct nlattr *nl_daemon; @@ -3280,8 +3286,9 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, return -EMSGSIZE; if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || - nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) || - nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid)) + nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) || + nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) || + nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen)) goto nla_put_failure; nla_nest_end(skb, nl_daemon); @@ -3293,7 +3300,7 @@ nla_put_failure: } static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, - const char *mcast_ifn, __u32 syncid, + struct ipvs_sync_daemon_cfg *c, struct netlink_callback *cb) { void *hdr; @@ -3303,7 +3310,7 @@ static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state, if (!hdr) return -EMSGSIZE; - if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) + if (ip_vs_genl_fill_daemon(skb, state, c)) goto nla_put_failure; genlmsg_end(skb, hdr); @@ -3323,8 +3330,7 @@ static int ip_vs_genl_dump_daemons(struct sk_buff *skb, mutex_lock(&ipvs->sync_mutex); if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, - ipvs->master_mcast_ifn, - ipvs->master_syncid, cb) < 0) + &ipvs->mcfg, cb) < 0) goto nla_put_failure; cb->args[0] = 1; @@ -3332,8 +3338,7 @@ static int ip_vs_genl_dump_daemons(struct sk_buff *skb, if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, - ipvs->backup_mcast_ifn, - ipvs->backup_syncid, cb) < 0) + &ipvs->bcfg, cb) < 0) goto nla_put_failure; cb->args[1] = 1; @@ -3348,25 +3353,33 @@ nla_put_failure: static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) { struct netns_ipvs *ipvs = net_ipvs(net); + struct ipvs_sync_daemon_cfg c; + struct nlattr *a; int ret; + memset(&c, 0, sizeof(c)); if (!(attrs[IPVS_DAEMON_ATTR_STATE] && attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && attrs[IPVS_DAEMON_ATTR_SYNC_ID])) return -EINVAL; + strlcpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), + sizeof(c.mcast_ifn)); + c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]); + + a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN]; + if (a) + c.sync_maxlen = nla_get_u16(a); /* The synchronization protocol is incompatible with mixed family * services */ - if (net_ipvs(net)->mixed_address_family_dests > 0) + if (ipvs->mixed_address_family_dests > 0) return -EINVAL; rtnl_lock(); mutex_lock(&ipvs->sync_mutex); - ret = start_sync_thread(net, - nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), - nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), - nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); + ret = start_sync_thread(net, &c, + nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); mutex_unlock(&ipvs->sync_mutex); rtnl_unlock(); return ret; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 6bc6dca9bca8..e68a43421479 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -320,26 +320,28 @@ sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms) * Create a new sync buffer for Version 1 proto. */ static inline struct ip_vs_sync_buff * -ip_vs_sync_buff_create(struct netns_ipvs *ipvs) +ip_vs_sync_buff_create(struct netns_ipvs *ipvs, unsigned int len) { struct ip_vs_sync_buff *sb; if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) return NULL; - sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg), + ipvs->mcfg.sync_maxlen); + sb->mesg = kmalloc(len, GFP_ATOMIC); if (!sb->mesg) { kfree(sb); return NULL; } sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */ sb->mesg->version = SYNC_PROTO_VER; - sb->mesg->syncid = ipvs->master_syncid; + sb->mesg->syncid = ipvs->mcfg.syncid; sb->mesg->size = htons(sizeof(struct ip_vs_sync_mesg)); sb->mesg->nr_conns = 0; sb->mesg->spare = 0; sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg); - sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen; + sb->end = (unsigned char *)sb->mesg + len; sb->firstuse = jiffies; return sb; @@ -402,7 +404,7 @@ select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp) * Create a new sync buffer for Version 0 proto. */ static inline struct ip_vs_sync_buff * -ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) +ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs, unsigned int len) { struct ip_vs_sync_buff *sb; struct ip_vs_sync_mesg_v0 *mesg; @@ -410,17 +412,19 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs) if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) return NULL; - sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC); + len = max_t(unsigned int, len + sizeof(struct ip_vs_sync_mesg_v0), + ipvs->mcfg.sync_maxlen); + sb->mesg = kmalloc(len, GFP_ATOMIC); if (!sb->mesg) { kfree(sb); return NULL; } mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg; mesg->nr_conns = 0; - mesg->syncid = ipvs->master_syncid; + mesg->syncid = ipvs->mcfg.syncid; mesg->size = htons(sizeof(struct ip_vs_sync_mesg_v0)); sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0); - sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen; + sb->end = (unsigned char *)mesg + len; sb->firstuse = jiffies; return sb; } @@ -533,7 +537,7 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, struct ip_vs_sync_buff *buff; struct ipvs_master_sync_state *ms; int id; - int len; + unsigned int len; if (unlikely(cp->af != AF_INET)) return; @@ -553,17 +557,19 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, id = select_master_thread_id(ipvs, cp); ms = &ipvs->ms[id]; buff = ms->sync_buff; + len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : + SIMPLE_CONN_SIZE; if (buff) { m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; /* Send buffer if it is for v1 */ - if (!m->nr_conns) { + if (buff->head + len > buff->end || !m->nr_conns) { sb_queue_tail(ipvs, ms); ms->sync_buff = NULL; buff = NULL; } } if (!buff) { - buff = ip_vs_sync_buff_create_v0(ipvs); + buff = ip_vs_sync_buff_create_v0(ipvs, len); if (!buff) { spin_unlock_bh(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); @@ -572,8 +578,6 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, ms->sync_buff = buff; } - len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : - SIMPLE_CONN_SIZE; m = (struct ip_vs_sync_mesg_v0 *) buff->mesg; s = (struct ip_vs_sync_conn_v0 *) buff->head; @@ -597,12 +601,6 @@ static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp, m->nr_conns++; m->size = htons(ntohs(m->size) + len); buff->head += len; - - /* check if there is a space for next one */ - if (buff->head + FULL_CONN_SIZE > buff->end) { - sb_queue_tail(ipvs, ms); - ms->sync_buff = NULL; - } spin_unlock_bh(&ipvs->sync_buff_lock); /* synchronize its controller if it has */ @@ -694,7 +692,7 @@ sloop: } if (!buff) { - buff = ip_vs_sync_buff_create(ipvs); + buff = ip_vs_sync_buff_create(ipvs, len); if (!buff) { spin_unlock_bh(&ipvs->sync_buff_lock); pr_err("ip_vs_sync_buff_create failed.\n"); @@ -1219,7 +1217,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer, return; } /* SyncID sanity check */ - if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) { + if (ipvs->bcfg.syncid != 0 && m2->syncid != ipvs->bcfg.syncid) { IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid); return; } @@ -1319,6 +1317,17 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl) release_sock(sk); } +/* Control fragmentation of messages */ +static void set_mcast_pmtudisc(struct sock *sk, int val) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ + lock_sock(sk); + inet->pmtudisc = val; + release_sock(sk); +} + /* * Specifiy default interface for outgoing multicasts */ @@ -1344,43 +1353,6 @@ static int set_mcast_if(struct sock *sk, char *ifname) } -/* - * Set the maximum length of sync message according to the - * specified interface's MTU. - */ -static int set_sync_mesg_maxlen(struct net *net, int sync_state) -{ - struct netns_ipvs *ipvs = net_ipvs(net); - struct net_device *dev; - int num; - - if (sync_state == IP_VS_STATE_MASTER) { - dev = __dev_get_by_name(net, ipvs->master_mcast_ifn); - if (!dev) - return -ENODEV; - - num = (dev->mtu - sizeof(struct iphdr) - - sizeof(struct udphdr) - - SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; - ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN + - SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); - IP_VS_DBG(7, "setting the maximum length of sync sending " - "message %d.\n", ipvs->send_mesg_maxlen); - } else if (sync_state == IP_VS_STATE_BACKUP) { - dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn); - if (!dev) - return -ENODEV; - - ipvs->recv_mesg_maxlen = dev->mtu - - sizeof(struct iphdr) - sizeof(struct udphdr); - IP_VS_DBG(7, "setting the maximum length of sync receiving " - "message %d.\n", ipvs->recv_mesg_maxlen); - } - - return 0; -} - - /* * Join a multicast group. * the group is specified by a class D multicast address 224.0.0.0/8 @@ -1461,7 +1433,7 @@ static struct socket *make_send_sock(struct net *net, int id) pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); + result = set_mcast_if(sock->sk, ipvs->mcfg.mcast_ifn); if (result < 0) { pr_err("Error setting outbound mcast interface\n"); goto error; @@ -1469,11 +1441,13 @@ static struct socket *make_send_sock(struct net *net, int id) set_mcast_loop(sock->sk, 0); set_mcast_ttl(sock->sk, 1); + /* Allow fragmentation if MTU changes */ + set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT); result = sysctl_sync_sock_size(ipvs); if (result > 0) set_sock_size(sock->sk, 1, result); - result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn); + result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn); if (result < 0) { pr_err("Error binding address of the mcast interface\n"); goto error; @@ -1531,7 +1505,7 @@ static struct socket *make_receive_sock(struct net *net, int id) /* join the multicast group */ result = join_mcast_group(sock->sk, (struct in_addr *) &mcast_addr.sin_addr, - ipvs->backup_mcast_ifn); + ipvs->bcfg.mcast_ifn); if (result < 0) { pr_err("Error joining to the multicast group\n"); goto error; @@ -1639,7 +1613,7 @@ static int sync_thread_master(void *data) pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " "syncid = %d, id = %d\n", - ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id); + ipvs->mcfg.mcast_ifn, ipvs->mcfg.syncid, tinfo->id); for (;;) { sb = next_sync_buff(ipvs, ms); @@ -1693,7 +1667,7 @@ static int sync_thread_backup(void *data) pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " "syncid = %d, id = %d\n", - ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id); + ipvs->bcfg.mcast_ifn, ipvs->bcfg.syncid, tinfo->id); while (!kthread_should_stop()) { wait_event_interruptible(*sk_sleep(tinfo->sock->sk), @@ -1703,7 +1677,7 @@ static int sync_thread_backup(void *data) /* do we have data now? */ while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { len = ip_vs_receive(tinfo->sock, tinfo->buf, - ipvs->recv_mesg_maxlen); + ipvs->bcfg.sync_maxlen); if (len <= 0) { if (len != -EAGAIN) pr_err("receiving message error\n"); @@ -1723,16 +1697,19 @@ static int sync_thread_backup(void *data) } -int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) +int start_sync_thread(struct net *net, struct ipvs_sync_daemon_cfg *c, + int state) { struct ip_vs_sync_thread_data *tinfo; struct task_struct **array = NULL, *task; struct socket *sock; struct netns_ipvs *ipvs = net_ipvs(net); + struct net_device *dev; char *name; int (*threadfn)(void *data); - int id, count; + int id, count, hlen; int result = -ENOMEM; + u16 mtu, min_mtu; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", @@ -1744,22 +1721,35 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) } else count = ipvs->threads_mask + 1; + dev = __dev_get_by_name(net, c->mcast_ifn); + if (!dev) { + pr_err("Unknown mcast interface: %s\n", c->mcast_ifn); + return -ENODEV; + } + hlen = sizeof(struct iphdr) + sizeof(struct udphdr); + mtu = (state == IP_VS_STATE_BACKUP) ? + clamp(dev->mtu, 1500U, 65535U) : 1500U; + min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1; + + if (c->sync_maxlen) + c->sync_maxlen = clamp_t(unsigned int, + c->sync_maxlen, min_mtu, + 65535 - hlen); + else + c->sync_maxlen = mtu - hlen; + if (state == IP_VS_STATE_MASTER) { if (ipvs->ms) return -EEXIST; - strlcpy(ipvs->master_mcast_ifn, mcast_ifn, - sizeof(ipvs->master_mcast_ifn)); - ipvs->master_syncid = syncid; + ipvs->mcfg = *c; name = "ipvs-m:%d:%d"; threadfn = sync_thread_master; } else if (state == IP_VS_STATE_BACKUP) { if (ipvs->backup_threads) return -EEXIST; - strlcpy(ipvs->backup_mcast_ifn, mcast_ifn, - sizeof(ipvs->backup_mcast_ifn)); - ipvs->backup_syncid = syncid; + ipvs->bcfg = *c; name = "ipvs-b:%d:%d"; threadfn = sync_thread_backup; } else { @@ -1787,7 +1777,6 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) if (!array) goto out; } - set_sync_mesg_maxlen(net, state); tinfo = NULL; for (id = 0; id < count; id++) { @@ -1805,7 +1794,7 @@ int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid) tinfo->net = net; tinfo->sock = sock; if (state == IP_VS_STATE_BACKUP) { - tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen, + tinfo->buf = kmalloc(ipvs->bcfg.sync_maxlen, GFP_KERNEL); if (!tinfo->buf) goto outtinfo; -- cgit v1.2.3 From d33288172e72c4729e8b9f2243fb40601afabc8f Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 26 Jul 2015 15:03:28 +0300 Subject: ipvs: add more mcast parameters for the sync daemon - mcast_group: configure the multicast address, now IPv6 is supported too - mcast_port: configure the multicast port - mcast_ttl: configure the multicast TTL/HOP_LIMIT Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- include/net/ip_vs.h | 4 ++ include/uapi/linux/ip_vs.h | 4 ++ net/netfilter/ipvs/ip_vs_ctl.c | 50 ++++++++++++++- net/netfilter/ipvs/ip_vs_sync.c | 138 +++++++++++++++++++++++++++++++++------- 4 files changed, 172 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 2fdc13caf712..9b9ca87a4210 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -847,8 +847,12 @@ struct ipvs_master_sync_state { #define IP_VS_DEST_TRASH_PERIOD (120 * HZ) struct ipvs_sync_daemon_cfg { + union nf_inet_addr mcast_group; int syncid; u16 sync_maxlen; + u16 mcast_port; + u8 mcast_af; + u8 mcast_ttl; /* multicast interface name */ char mcast_ifn[IP_VS_IFNAME_MAXLEN]; }; diff --git a/include/uapi/linux/ip_vs.h b/include/uapi/linux/ip_vs.h index 68377d8c8870..391395c06c7e 100644 --- a/include/uapi/linux/ip_vs.h +++ b/include/uapi/linux/ip_vs.h @@ -407,6 +407,10 @@ enum { IPVS_DAEMON_ATTR_MCAST_IFN, /* multicast interface name */ IPVS_DAEMON_ATTR_SYNC_ID, /* SyncID we belong to */ IPVS_DAEMON_ATTR_SYNC_MAXLEN, /* UDP Payload Size */ + IPVS_DAEMON_ATTR_MCAST_GROUP, /* IPv4 Multicast Address */ + IPVS_DAEMON_ATTR_MCAST_GROUP6, /* IPv6 Multicast Address */ + IPVS_DAEMON_ATTR_MCAST_PORT, /* Multicast Port (base) */ + IPVS_DAEMON_ATTR_MCAST_TTL, /* Multicast TTL */ __IPVS_DAEMON_ATTR_MAX, }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 96f7bbfd5e1d..1a23e91d50d8 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2819,6 +2819,10 @@ static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { .len = IP_VS_IFNAME_MAXLEN }, [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 }, + [IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = sizeof(struct in6_addr) }, + [IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 }, + [IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 }, }; /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ @@ -3288,8 +3292,21 @@ static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state, if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) || nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) || nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) || - nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen)) + nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) || + nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) || + nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl)) goto nla_put_failure; +#ifdef CONFIG_IP_VS_IPV6 + if (c->mcast_af == AF_INET6) { + if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6, + &c->mcast_group.in6)) + goto nla_put_failure; + } else +#endif + if (c->mcast_af == AF_INET && + nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP, + c->mcast_group.ip)) + goto nla_put_failure; nla_nest_end(skb, nl_daemon); return 0; @@ -3370,6 +3387,37 @@ static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs) if (a) c.sync_maxlen = nla_get_u16(a); + a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP]; + if (a) { + c.mcast_af = AF_INET; + c.mcast_group.ip = nla_get_in_addr(a); + if (!ipv4_is_multicast(c.mcast_group.ip)) + return -EINVAL; + } else { + a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6]; + if (a) { +#ifdef CONFIG_IP_VS_IPV6 + int addr_type; + + c.mcast_af = AF_INET6; + c.mcast_group.in6 = nla_get_in6_addr(a); + addr_type = ipv6_addr_type(&c.mcast_group.in6); + if (!(addr_type & IPV6_ADDR_MULTICAST)) + return -EINVAL; +#else + return -EAFNOSUPPORT; +#endif + } + } + + a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT]; + if (a) + c.mcast_port = nla_get_u16(a); + + a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL]; + if (a) + c.mcast_ttl = nla_get_u8(a); + /* The synchronization protocol is incompatible with mixed family * services */ diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index e68a43421479..43f140950075 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -262,6 +262,11 @@ struct ip_vs_sync_mesg { /* ip_vs_sync_conn entries start here */ }; +union ipvs_sockaddr { + struct sockaddr_in in; + struct sockaddr_in6 in6; +}; + struct ip_vs_sync_buff { struct list_head list; unsigned long firstuse; @@ -1301,6 +1306,14 @@ static void set_mcast_loop(struct sock *sk, u_char loop) /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ lock_sock(sk); inet->mc_loop = loop ? 1 : 0; +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MULTICAST_LOOP */ + np->mc_loop = loop ? 1 : 0; + } +#endif release_sock(sk); } @@ -1314,6 +1327,14 @@ static void set_mcast_ttl(struct sock *sk, u_char ttl) /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ lock_sock(sk); inet->mc_ttl = ttl; +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MULTICAST_HOPS */ + np->mcast_hops = ttl; + } +#endif release_sock(sk); } @@ -1325,6 +1346,14 @@ static void set_mcast_pmtudisc(struct sock *sk, int val) /* setsockopt(sock, SOL_IP, IP_MTU_DISCOVER, &val, sizeof(val)); */ lock_sock(sk); inet->pmtudisc = val; +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MTU_DISCOVER */ + np->pmtudisc = val; + } +#endif release_sock(sk); } @@ -1347,6 +1376,14 @@ static int set_mcast_if(struct sock *sk, char *ifname) lock_sock(sk); inet->mc_index = dev->ifindex; /* inet->mc_addr = 0; */ +#ifdef CONFIG_IP_VS_IPV6 + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + /* IPV6_MULTICAST_IF */ + np->mcast_oif = dev->ifindex; + } +#endif release_sock(sk); return 0; @@ -1384,6 +1421,27 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) return ret; } +#ifdef CONFIG_IP_VS_IPV6 +static int join_mcast_group6(struct sock *sk, struct in6_addr *addr, + char *ifname) +{ + struct net *net = sock_net(sk); + struct net_device *dev; + int ret; + + dev = __dev_get_by_name(net, ifname); + if (!dev) + return -ENODEV; + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + lock_sock(sk); + ret = ipv6_sock_mc_join(sk, dev->ifindex, addr); + release_sock(sk); + + return ret; +} +#endif static int bind_mcastif_addr(struct socket *sock, char *ifname) { @@ -1412,6 +1470,26 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname) return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); } +static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, + struct ipvs_sync_daemon_cfg *c, int id) +{ + if (AF_INET6 == c->mcast_af) { + sa->in6 = (struct sockaddr_in6) { + .sin6_family = AF_INET6, + .sin6_port = htons(c->mcast_port + id), + }; + sa->in6.sin6_addr = c->mcast_group.in6; + *salen = sizeof(sa->in6); + } else { + sa->in = (struct sockaddr_in) { + .sin_family = AF_INET, + .sin_port = htons(c->mcast_port + id), + }; + sa->in.sin_addr = c->mcast_group.in; + *salen = sizeof(sa->in); + } +} + /* * Set up sending multicast socket over UDP */ @@ -1419,16 +1497,13 @@ static struct socket *make_send_sock(struct net *net, int id) { struct netns_ipvs *ipvs = net_ipvs(net); /* multicast addr */ - struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), - .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), - }; + union ipvs_sockaddr mcast_addr; struct socket *sock; - int result; + int result, salen; /* First create a socket */ - result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + result = sock_create_kern(net, ipvs->mcfg.mcast_af, SOCK_DGRAM, + IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); @@ -1440,21 +1515,25 @@ static struct socket *make_send_sock(struct net *net, int id) } set_mcast_loop(sock->sk, 0); - set_mcast_ttl(sock->sk, 1); + set_mcast_ttl(sock->sk, ipvs->mcfg.mcast_ttl); /* Allow fragmentation if MTU changes */ set_mcast_pmtudisc(sock->sk, IP_PMTUDISC_DONT); result = sysctl_sync_sock_size(ipvs); if (result > 0) set_sock_size(sock->sk, 1, result); - result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn); + if (AF_INET == ipvs->mcfg.mcast_af) + result = bind_mcastif_addr(sock, ipvs->mcfg.mcast_ifn); + else + result = 0; if (result < 0) { pr_err("Error binding address of the mcast interface\n"); goto error; } + get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, - sizeof(struct sockaddr), 0); + salen, 0); if (result < 0) { pr_err("Error connecting to the multicast addr\n"); goto error; @@ -1475,16 +1554,13 @@ static struct socket *make_receive_sock(struct net *net, int id) { struct netns_ipvs *ipvs = net_ipvs(net); /* multicast addr */ - struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id), - .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), - }; + union ipvs_sockaddr mcast_addr; struct socket *sock; - int result; + int result, salen; /* First create a socket */ - result = sock_create_kern(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + result = sock_create_kern(net, ipvs->bcfg.mcast_af, SOCK_DGRAM, + IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); @@ -1495,17 +1571,22 @@ static struct socket *make_receive_sock(struct net *net, int id) if (result > 0) set_sock_size(sock->sk, 0, result); - result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, - sizeof(struct sockaddr)); + get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); + result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); if (result < 0) { pr_err("Error binding to the multicast addr\n"); goto error; } /* join the multicast group */ - result = join_mcast_group(sock->sk, - (struct in_addr *) &mcast_addr.sin_addr, - ipvs->bcfg.mcast_ifn); +#ifdef CONFIG_IP_VS_IPV6 + if (ipvs->bcfg.mcast_af == AF_INET6) + result = join_mcast_group6(sock->sk, &mcast_addr.in6.sin6_addr, + ipvs->bcfg.mcast_ifn); + else +#endif + result = join_mcast_group(sock->sk, &mcast_addr.in.sin_addr, + ipvs->bcfg.mcast_ifn); if (result < 0) { pr_err("Error joining to the multicast group\n"); goto error; @@ -1721,12 +1802,23 @@ int start_sync_thread(struct net *net, struct ipvs_sync_daemon_cfg *c, } else count = ipvs->threads_mask + 1; + if (c->mcast_af == AF_UNSPEC) { + c->mcast_af = AF_INET; + c->mcast_group.ip = cpu_to_be32(IP_VS_SYNC_GROUP); + } + if (!c->mcast_port) + c->mcast_port = IP_VS_SYNC_PORT; + if (!c->mcast_ttl) + c->mcast_ttl = 1; + dev = __dev_get_by_name(net, c->mcast_ifn); if (!dev) { pr_err("Unknown mcast interface: %s\n", c->mcast_ifn); return -ENODEV; } - hlen = sizeof(struct iphdr) + sizeof(struct udphdr); + hlen = (AF_INET6 == c->mcast_af) ? + sizeof(struct ipv6hdr) + sizeof(struct udphdr) : + sizeof(struct iphdr) + sizeof(struct udphdr); mtu = (state == IP_VS_STATE_BACKUP) ? clamp(dev->mtu, 1500U, 65535U) : 1500U; min_mtu = (state == IP_VS_STATE_BACKUP) ? 1024 : 1; -- cgit v1.2.3 From 59e26423e00263b6b06b39d219147f22610ce5d6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 21 Aug 2015 21:28:10 +0200 Subject: netfilter: nf_dup: fix sparse warnings >> net/ipv4/netfilter/nft_dup_ipv4.c:29:37: sparse: incorrect type in initializer (different base types) net/ipv4/netfilter/nft_dup_ipv4.c:29:37: expected restricted __be32 [user type] s_addr net/ipv4/netfilter/nft_dup_ipv4.c:29:37: got unsigned int [unsigned] >> net/ipv6/netfilter/nf_dup_ipv6.c:48:23: sparse: incorrect type in assignment (different base types) net/ipv6/netfilter/nf_dup_ipv6.c:48:23: expected restricted __be32 [addressable] [assigned] [usertype] flowlabel net/ipv6/netfilter/nf_dup_ipv6.c:48:23: got int Reported-by: kbuild test robot Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/nft_dup_ipv4.c | 2 +- net/ipv6/netfilter/nf_dup_ipv6.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index 25419fbddcb6..b45932d43b69 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -26,7 +26,7 @@ static void nft_dup_ipv4_eval(const struct nft_expr *expr, { struct nft_dup_ipv4 *priv = nft_expr_priv(expr); struct in_addr gw = { - .s_addr = regs->data[priv->sreg_addr], + .s_addr = (__force __be32)regs->data[priv->sreg_addr], }; int oif = regs->data[priv->sreg_dev]; diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index d8ab654080b4..c5c87e921ccd 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -45,8 +45,8 @@ static bool nf_dup_ipv6_route(struct sk_buff *skb, const struct in6_addr *gw, fl6.flowi6_oif = oif; fl6.daddr = *gw; - fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) | - (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]; + fl6.flowlabel = (__force __be32)(((iph->flow_lbl[0] & 0xF) << 16) | + (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); -- cgit v1.2.3 From 116984a316c3a3200f8a7912110cc4a6d6c0989e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 21 Aug 2015 21:34:08 +0200 Subject: netfilter: xt_TEE: use IS_ENABLED(CONFIG_NF_DUP_IPV6) Instead of IS_ENABLED(CONFIG_IPV6), otherwise we hit: et/built-in.o: In function `tee_tg6': >> xt_TEE.c:(.text+0x6cd8c): undefined reference to `nf_dup_ipv6' when: CONFIG_IPV6=y CONFIG_NF_DUP_IPV4=y # CONFIG_NF_DUP_IPV6 is not set CONFIG_NETFILTER_XT_TARGET_TEE=y Reported-by: kbuild test robot Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_TEE.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 49fee6aa2c0a..fd980aa7715d 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -37,7 +37,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) return XT_CONTINUE; } -#if IS_ENABLED(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_NF_DUP_IPV6) static unsigned int tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { @@ -129,7 +129,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = { .destroy = tee_tg_destroy, .me = THIS_MODULE, }, -#if IS_ENABLED(CONFIG_IPV6) +#if IS_ENABLED(CONFIG_NF_DUP_IPV6) { .name = "TEE", .revision = 1, -- cgit v1.2.3 From b7fe10e5ebac2a3f37e95535e616494b65fa020f Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 19 Aug 2015 17:07:32 -0700 Subject: gro: Fix remcsum offload to deal with frags in GRO The remote checksum offload GRO did not consider the case that frag0 might be in use. This patch fixes that by accessing headers using the skb_gro functions and not saving offsets relative to skb->head. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 23 +++++++++-------------- include/linux/netdevice.h | 44 ++++++++++++++++++++++++++++++++------------ net/ipv4/fou.c | 28 ++++++++++++---------------- 3 files changed, 53 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 54615bb9d916..64fcd2402562 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -519,10 +519,10 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, u32 data, struct gro_remcsum *grc, bool nopartial) { - size_t start, offset, plen; + size_t start, offset; if (skb->remcsum_offload) - return NULL; + return vh; if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; @@ -532,17 +532,8 @@ static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, offsetof(struct udphdr, check) : offsetof(struct tcphdr, check)); - plen = hdrlen + offset + sizeof(u16); - - /* Pull checksum that will be written */ - if (skb_gro_header_hard(skb, off + plen)) { - vh = skb_gro_header_slow(skb, off + plen, off); - if (!vh) - return NULL; - } - - skb_gro_remcsum_process(skb, (void *)vh + hdrlen, - start, offset, grc, nopartial); + vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen, + start, offset, grc, nopartial); skb->remcsum_offload = 1; @@ -573,7 +564,6 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, goto out; } - skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); flags = ntohl(vh->vx_flags); @@ -588,6 +578,8 @@ static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, goto out; } + skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ + flush = 0; for (p = *head; p; p = p->next) { @@ -1110,6 +1102,9 @@ static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh, { size_t start, offset, plen; + if (skb->remcsum_offload) + return vh; + start = (data & VXLAN_RCO_MASK) << VXLAN_RCO_SHIFT; offset = start + ((data & VXLAN_RCO_UDP) ? offsetof(struct udphdr, check) : diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4bd177faa90c..6abe0d6f1e1d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2311,8 +2311,7 @@ __sum16 __skb_gro_checksum_complete(struct sk_buff *skb); static inline bool skb_at_gro_remcsum_start(struct sk_buff *skb) { - return (NAPI_GRO_CB(skb)->gro_remcsum_start - skb_headroom(skb) == - skb_gro_offset(skb)); + return (NAPI_GRO_CB(skb)->gro_remcsum_start == skb_gro_offset(skb)); } static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb, @@ -2408,37 +2407,58 @@ static inline void skb_gro_remcsum_init(struct gro_remcsum *grc) grc->delta = 0; } -static inline void skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, - int start, int offset, - struct gro_remcsum *grc, - bool nopartial) +static inline void *skb_gro_remcsum_process(struct sk_buff *skb, void *ptr, + unsigned int off, size_t hdrlen, + int start, int offset, + struct gro_remcsum *grc, + bool nopartial) { __wsum delta; + size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); BUG_ON(!NAPI_GRO_CB(skb)->csum_valid); if (!nopartial) { - NAPI_GRO_CB(skb)->gro_remcsum_start = - ((unsigned char *)ptr + start) - skb->head; - return; + NAPI_GRO_CB(skb)->gro_remcsum_start = off + hdrlen + start; + return ptr; + } + + ptr = skb_gro_header_fast(skb, off); + if (skb_gro_header_hard(skb, off + plen)) { + ptr = skb_gro_header_slow(skb, off + plen, off); + if (!ptr) + return NULL; } - delta = remcsum_adjust(ptr, NAPI_GRO_CB(skb)->csum, start, offset); + delta = remcsum_adjust(ptr + hdrlen, NAPI_GRO_CB(skb)->csum, + start, offset); /* Adjust skb->csum since we changed the packet */ NAPI_GRO_CB(skb)->csum = csum_add(NAPI_GRO_CB(skb)->csum, delta); - grc->offset = (ptr + offset) - (void *)skb->head; + grc->offset = off + hdrlen + offset; grc->delta = delta; + + return ptr; } static inline void skb_gro_remcsum_cleanup(struct sk_buff *skb, struct gro_remcsum *grc) { + void *ptr; + size_t plen = grc->offset + sizeof(u16); + if (!grc->delta) return; - remcsum_unadjust((__sum16 *)(skb->head + grc->offset), grc->delta); + ptr = skb_gro_header_fast(skb, grc->offset); + if (skb_gro_header_hard(skb, grc->offset + sizeof(u16))) { + ptr = skb_gro_header_slow(skb, plen, grc->offset); + if (!ptr) + return; + } + + remcsum_unadjust((__sum16 *)ptr, grc->delta); } static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev, diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 34968cd5c146..eb11f9506894 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -79,7 +79,11 @@ static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, __be16 *pd = data; size_t start = ntohs(pd[0]); size_t offset = ntohs(pd[1]); - size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); + size_t plen = sizeof(struct udphdr) + hdrlen + + max_t(size_t, offset + sizeof(u16), start); + + if (skb->remcsum_offload) + return guehdr; if (!pskb_may_pull(skb, plen)) return NULL; @@ -221,29 +225,21 @@ out_unlock: static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, struct guehdr *guehdr, void *data, - size_t hdrlen, u8 ipproto, - struct gro_remcsum *grc, bool nopartial) + size_t hdrlen, struct gro_remcsum *grc, + bool nopartial) { __be16 *pd = data; size_t start = ntohs(pd[0]); size_t offset = ntohs(pd[1]); - size_t plen = hdrlen + max_t(size_t, offset + sizeof(u16), start); if (skb->remcsum_offload) - return NULL; + return guehdr; if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; - /* Pull checksum that will be written */ - if (skb_gro_header_hard(skb, off + plen)) { - guehdr = skb_gro_header_slow(skb, off + plen, off); - if (!guehdr) - return NULL; - } - - skb_gro_remcsum_process(skb, (void *)guehdr + hdrlen, - start, offset, grc, nopartial); + guehdr = skb_gro_remcsum_process(skb, (void *)guehdr, off, hdrlen, + start, offset, grc, nopartial); skb->remcsum_offload = 1; @@ -307,10 +303,10 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, if (flags & GUE_PFLAG_REMCSUM) { guehdr = gue_gro_remcsum(skb, off, guehdr, - data + doffset, hdrlen, - guehdr->proto_ctype, &grc, + data + doffset, hdrlen, &grc, !!(fou->flags & FOU_F_REMCSUM_NOPARTIAL)); + if (!guehdr) goto out; -- cgit v1.2.3 From 270136613bf7306e2b83457628e2b2f6c6be3989 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Wed, 19 Aug 2015 17:07:34 -0700 Subject: fou: Do WARN_ON_ONCE in gue_gro_receive for bad proto callbacks Do WARN_ON_ONCE instead of WARN_ON in gue_gro_receive when the offload callcaks are bad (either don't exist or gro_receive is not specified). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/fou.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index eb11f9506894..2d1646cff057 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -347,7 +347,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head, rcu_read_lock(); offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[guehdr->proto_ctype]); - if (WARN_ON(!ops || !ops->callbacks.gro_receive)) + if (WARN_ON_ONCE(!ops || !ops->callbacks.gro_receive)) goto out_unlock; pp = ops->callbacks.gro_receive(head, skb); -- cgit v1.2.3 From 17b2063077a7478e5fd3c34b04a059dbb8474638 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 20 Aug 2015 02:12:54 -0400 Subject: tipc: eliminate risk of premature link setup during failover When a link goes down, and there is still a working link towards its destination node, a failover is initiated, and the failed link is not allowed to re-establish until that procedure is finished. To ensure this, the concerned link endpoints are set to state LINK_FAILINGOVER, and the node endpoints to NODE_FAILINGOVER during the failover period. However, if the link reset is due to a disabled bearer, the corres- ponding link endpoint is deleted, and only the node endpoint knows about the ongoing failover. Now, if the disabled bearer is re-enabled during the failover period, the discovery mechanism may create a new link endpoint that is ready to be established, despite that this is not permitted. This situation may cause both the ongoing failover and any subsequent link synchronization to fail. In this commit, we ensure that a newly created link goes directly to state LINK_FAILINGOVER if the corresponding node state is NODE_FAILINGOVER. This eliminates the problem described above. Furthermore, we tighten the criteria for which packets are allowed to end a failover state in the function tipc_node_check_state(). By checking that the receiving link is up and running, instead of just checking that it is not in failover mode, we eliminate the risk that protocol packets from the re-created link may cause the failover to be prematurely terminated. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/node.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index 7c191641b44f..004834bd1605 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -565,6 +565,8 @@ void tipc_node_check_dest(struct net *net, u32 onode, goto exit; } tipc_link_reset(l); + if (n->state == NODE_FAILINGOVER) + tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); le->link = l; n->link_cnt++; tipc_node_calculate_timer(n, l); @@ -1129,7 +1131,7 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, } /* Open parallel link when tunnel link reaches synch point */ - if ((n->state == NODE_FAILINGOVER) && !tipc_link_is_failingover(l)) { + if ((n->state == NODE_FAILINGOVER) && tipc_link_is_up(l)) { if (!more(rcv_nxt, n->sync_point)) return true; tipc_node_fsm_evt(n, NODE_FAILOVER_END_EVT); -- cgit v1.2.3 From 5ae2f8e6857968d6dddbd3879ed0a32b860e02d1 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 20 Aug 2015 02:12:55 -0400 Subject: tipc: interrupt link synchronization when a link goes down When we introduced the new link failover/synch mechanism in commit 6e498158a827fd515b514842e9a06bdf0f75ab86 ("tipc: move link synch and failover to link aggregation level"), we missed the case when the non-tunnel link goes down during the link synchronization period. In this case the tunnel link will remain in state LINK_SYNCHING, something leading to unpredictable behavior when the failover procedure is initiated. In this commit, we ensure that the node and remaining link goes back to regular communication state (SELF_UP_PEER_UP/LINK_ESTABLISHED) when one of the parallel links goes down. We also ensure that we don't re-enter synch mode if subsequent SYNCH packets arrive on the remaining link. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 2 +- net/tipc/node.c | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index f067e5425560..7058c86f5e48 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -351,11 +351,11 @@ int tipc_link_fsm_evt(struct tipc_link *l, int evt) l->state = LINK_RESET; break; case LINK_ESTABLISH_EVT: + case LINK_SYNCH_END_EVT: break; case LINK_SYNCH_BEGIN_EVT: l->state = LINK_SYNCHING; break; - case LINK_SYNCH_END_EVT: case LINK_FAILOVER_BEGIN_EVT: case LINK_FAILOVER_END_EVT: default: diff --git a/net/tipc/node.c b/net/tipc/node.c index 004834bd1605..937cc6192bcf 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -423,6 +423,8 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, /* There is still a working link => initiate failover */ tnl = node_active_link(n, 0); + tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); + tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); n->sync_point = tnl->rcv_nxt + (U16_MAX / 2 - 1); tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq); tipc_link_reset(l); @@ -1140,6 +1142,10 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, return true; } + /* No synching needed if only one link */ + if (!pl || !tipc_link_is_up(pl)) + return true; + /* Initiate or update synch mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG)) { syncpt = iseqno + exp_pkts - 1; @@ -1158,9 +1164,8 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, /* Open tunnel link when parallel link reaches synch point */ if ((n->state == NODE_SYNCHING) && tipc_link_is_synching(l)) { - if (pl) - dlv_nxt = mod(pl->rcv_nxt - skb_queue_len(pl->inputq)); - if (!pl || more(dlv_nxt, n->sync_point)) { + dlv_nxt = pl->rcv_nxt - mod(skb_queue_len(pl->inputq)); + if (more(dlv_nxt, n->sync_point)) { tipc_link_fsm_evt(l, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); return true; -- cgit v1.2.3 From 2be80c2d87de789550982e74a11e9f9ff5940845 Mon Sep 17 00:00:00 2001 From: Jon Paul Maloy Date: Thu, 20 Aug 2015 02:12:56 -0400 Subject: tipc: fix stale link problem during synchronization Recent changes to the link synchronization means that we can now just drop packets arriving on the synchronizing link before the synch point is reached. This has lead to significant simplifications to the implementation, but also turns out to have a flip side that we need to consider. Under unlucky circumstances, the two endpoints may end up repeatedly dropping each other's packets, while immediately asking for retransmission of the same packets, just to drop them once more. This pattern will eventually be broken when the synch point is reached on the other link, but before that, the endpoints may have arrived at the retransmission limit (stale counter) that indicates that the link should be broken. We see this happen at rare occasions. The fix for this is to not ask for retransmissions when a link is in state LINK_SYNCHING. The fact that the link has reached this state means that it has already received the first SYNCH packet, and that it knows the synch point. Hence, it doesn't need any more packets until the other link has reached the synch point, whereafter it can go ahead and ask for the missing packets. However, because of the reduced traffic on the synching link that follows this change, it may now take longer to discover that the synch point has been reached. We compensate for this by letting all packets, on any of the links, trig a check for synchronization termination. This is possible because the packets themselves don't contain any information that is needed for discovering this condition. Reviewed-by: Ying Xue Signed-off-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/link.c | 3 ++- net/tipc/node.c | 12 ++++++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 7058c86f5e48..75db07c78a69 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1330,6 +1330,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, u16 peers_snd_nxt = msg_next_sent(hdr); u16 peers_tol = msg_link_tolerance(hdr); u16 peers_prio = msg_linkprio(hdr); + u16 rcv_nxt = l->rcv_nxt; char *if_name; int rc = 0; @@ -1393,7 +1394,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, break; /* Send NACK if peer has sent pkts we haven't received yet */ - if (more(peers_snd_nxt, l->rcv_nxt)) + if (more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l)) rcvgap = peers_snd_nxt - l->rcv_nxt; if (rcvgap || (msg_probe(hdr))) tipc_link_build_proto_msg(l, STATE_MSG, 0, rcvgap, diff --git a/net/tipc/node.c b/net/tipc/node.c index 937cc6192bcf..703875fd6cde 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1079,7 +1079,7 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, u16 exp_pkts = msg_msgcnt(hdr); u16 rcv_nxt, syncpt, dlv_nxt; int state = n->state; - struct tipc_link *l, *pl = NULL; + struct tipc_link *l, *tnl, *pl = NULL; struct tipc_media_addr *maddr; int i, pb_id; @@ -1164,12 +1164,20 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, /* Open tunnel link when parallel link reaches synch point */ if ((n->state == NODE_SYNCHING) && tipc_link_is_synching(l)) { + if (tipc_link_is_synching(l)) { + tnl = l; + } else { + tnl = pl; + pl = l; + } dlv_nxt = pl->rcv_nxt - mod(skb_queue_len(pl->inputq)); if (more(dlv_nxt, n->sync_point)) { - tipc_link_fsm_evt(l, LINK_SYNCH_END_EVT); + tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); return true; } + if (l == pl) + return true; if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG)) return true; if (usr == LINK_PROTOCOL) -- cgit v1.2.3 From 127eb7cd3c210afead788991a30950a9e36759ea Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 24 Aug 2015 09:45:41 -0700 Subject: lwt: Add cfg argument to build_state Add cfg and family arguments to lwt build state functions. cfg is a void pointer and will either be a pointer to a fib_config or fib6_config structure. The family parameter indicates which one (either AF_INET or AF_INET6). LWT encpasulation implementation may use the fib configuration to build the LWT state. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/lwtunnel.h | 3 +++ net/core/lwtunnel.c | 5 +++-- net/ipv4/fib_semantics.c | 17 ++++++++++------- net/ipv4/ip_tunnel_core.c | 2 ++ net/ipv6/ila.c | 1 + net/ipv6/route.c | 3 ++- net/mpls/mpls_iptunnel.c | 1 + 7 files changed, 22 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 843489884448..fce0e35e74d0 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -26,6 +26,7 @@ struct lwtunnel_state { struct lwtunnel_encap_ops { int (*build_state)(struct net_device *dev, struct nlattr *encap, + unsigned int family, const void *cfg, struct lwtunnel_state **ts); int (*output)(struct sock *sk, struct sk_buff *skb); int (*input)(struct sk_buff *skb); @@ -80,6 +81,7 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_build_state(struct net_device *dev, u16 encap_type, struct nlattr *encap, + unsigned int family, const void *cfg, struct lwtunnel_state **lws); int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate); @@ -130,6 +132,7 @@ static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, static inline int lwtunnel_build_state(struct net_device *dev, u16 encap_type, struct nlattr *encap, + unsigned int family, const void *cfg, struct lwtunnel_state **lws) { return -EOPNOTSUPP; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index e924c2e08554..dfb1a9ca0835 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -72,7 +72,8 @@ int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *ops, EXPORT_SYMBOL(lwtunnel_encap_del_ops); int lwtunnel_build_state(struct net_device *dev, u16 encap_type, - struct nlattr *encap, struct lwtunnel_state **lws) + struct nlattr *encap, unsigned int family, + const void *cfg, struct lwtunnel_state **lws) { const struct lwtunnel_encap_ops *ops; int ret = -EINVAL; @@ -85,7 +86,7 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type, rcu_read_lock(); ops = rcu_dereference(lwtun_encaps[encap_type]); if (likely(ops && ops->build_state)) - ret = ops->build_state(dev, encap, lws); + ret = ops->build_state(dev, encap, family, cfg, lws); rcu_read_unlock(); return ret; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 01f1c7dcd329..1b2d01170a4d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -511,7 +511,8 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, dev = __dev_get_by_index(net, cfg->fc_oif); ret = lwtunnel_build_state(dev, nla_get_u16( nla_entype), - nla, &lwtstate); + nla, AF_INET, cfg, + &lwtstate); if (ret) goto errout; nexthop_nh->nh_lwtstate = @@ -535,7 +536,8 @@ errout: static int fib_encap_match(struct net *net, u16 encap_type, struct nlattr *encap, - int oif, const struct fib_nh *nh) + int oif, const struct fib_nh *nh, + const struct fib_config *cfg) { struct lwtunnel_state *lwtstate; struct net_device *dev = NULL; @@ -546,8 +548,8 @@ static int fib_encap_match(struct net *net, u16 encap_type, if (oif) dev = __dev_get_by_index(net, oif); - ret = lwtunnel_build_state(dev, encap_type, - encap, &lwtstate); + ret = lwtunnel_build_state(dev, encap_type, encap, + AF_INET, cfg, &lwtstate); if (!ret) { result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate); lwtstate_free(lwtstate); @@ -571,7 +573,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) if (cfg->fc_encap) { if (fib_encap_match(net, cfg->fc_encap_type, cfg->fc_encap, cfg->fc_oif, - fi->fib_nh)) + fi->fib_nh, cfg)) return 1; } if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && @@ -663,7 +665,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, struct fib_nh *nh) { - int err; + int err = 0; struct net *net; struct net_device *dev; @@ -1005,7 +1007,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) if (cfg->fc_oif) dev = __dev_get_by_index(net, cfg->fc_oif); err = lwtunnel_build_state(dev, cfg->fc_encap_type, - cfg->fc_encap, &lwtstate); + cfg->fc_encap, AF_INET, cfg, + &lwtstate); if (err) goto failure; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 289b6c26ce37..934f2ac8ad61 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -204,6 +204,7 @@ static const struct nla_policy ip_tun_policy[LWTUNNEL_IP_MAX + 1] = { }; static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, + unsigned int family, const void *cfg, struct lwtunnel_state **ts) { struct ip_tunnel_info *tun_info; @@ -311,6 +312,7 @@ static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = { }; static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, + unsigned int family, const void *cfg, struct lwtunnel_state **ts) { struct ip_tunnel_info *tun_info; diff --git a/net/ipv6/ila.c b/net/ipv6/ila.c index f011c3d5ca40..ffe4dcad6088 100644 --- a/net/ipv6/ila.c +++ b/net/ipv6/ila.c @@ -123,6 +123,7 @@ static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { }; static int ila_build_state(struct net_device *dev, struct nlattr *nla, + unsigned int family, const void *cfg, struct lwtunnel_state **ts) { struct ila_params *p; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e476f01add87..df3e353a012d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1819,7 +1819,8 @@ int ip6_route_add(struct fib6_config *cfg) struct lwtunnel_state *lwtstate; err = lwtunnel_build_state(dev, cfg->fc_encap_type, - cfg->fc_encap, &lwtstate); + cfg->fc_encap, AF_INET6, cfg, + &lwtstate); if (err) goto out; rt->dst.lwtstate = lwtstate_get(lwtstate); diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 3da5ca3ba563..21e70bc9af98 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -123,6 +123,7 @@ drop: } static int mpls_build_state(struct net_device *dev, struct nlattr *nla, + unsigned int family, const void *cfg, struct lwtunnel_state **ts) { struct mpls_iptunnel_encap *tun_encap_info; -- cgit v1.2.3 From 92b78aff855a4815afe5311e3473ec829d3f2a9e Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 24 Aug 2015 09:45:42 -0700 Subject: ila: Precompute checksum difference for translations In the ILA build state for LWT compute the checksum difference to apply to transport checksums that include the IPv6 pseudo header. The difference is between the route destination (from fib6_config) and the locator to write. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/ipv6/ila.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'net') diff --git a/net/ipv6/ila.c b/net/ipv6/ila.c index ffe4dcad6088..678d2df4b8d9 100644 --- a/net/ipv6/ila.c +++ b/net/ipv6/ila.c @@ -14,6 +14,8 @@ struct ila_params { __be64 locator; + __be64 locator_match; + __wsum csum_diff; }; static inline struct ila_params *ila_params_lwtunnel( @@ -33,6 +35,9 @@ static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to) static inline __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) { + if (*(__be64 *)&ip6h->daddr == p->locator_match) + return p->csum_diff; + else return compute_csum_diff8((__be32 *)&ip6h->daddr, (__be32 *)&p->locator); } @@ -130,8 +135,12 @@ static int ila_build_state(struct net_device *dev, struct nlattr *nla, struct nlattr *tb[ILA_ATTR_MAX + 1]; size_t encap_len = sizeof(*p); struct lwtunnel_state *newts; + const struct fib6_config *cfg6 = cfg; int ret; + if (family != AF_INET6) + return -EINVAL; + ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, ila_nl_policy); if (ret < 0) @@ -149,6 +158,15 @@ static int ila_build_state(struct net_device *dev, struct nlattr *nla, p->locator = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); + if (cfg6->fc_dst_len > sizeof(__be64)) { + /* Precompute checksum difference for translation since we + * know both the old locator and the new one. + */ + p->locator_match = *(__be64 *)&cfg6->fc_dst; + p->csum_diff = compute_csum_diff8( + (__be32 *)&p->locator_match, (__be32 *)&p->locator); + } + newts->type = LWTUNNEL_ENCAP_ILA; newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT | LWTUNNEL_STATE_INPUT_REDIRECT; -- cgit v1.2.3 From 6b5e971a282c0e7b18b47823103d695352b5a3c2 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Tue, 26 May 2015 18:34:26 +0200 Subject: batman-adv: Replace C99 int types with kernel type (s|u)(8|16|32|64) are the preferred types in the kernel. The use of the standard C99 types u?int(8|16|32|64)_t are objected by some people and even checkpatch now warns about using them. Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/bat_iv_ogm.c | 94 +++++++------- net/batman-adv/bitarray.c | 6 +- net/batman-adv/bitarray.h | 10 +- net/batman-adv/bridge_loop_avoidance.c | 72 ++++++----- net/batman-adv/bridge_loop_avoidance.h | 5 +- net/batman-adv/distributed-arp-table.c | 59 +++++---- net/batman-adv/distributed-arp-table.h | 8 +- net/batman-adv/fragmentation.c | 10 +- net/batman-adv/gateway_client.c | 14 +-- net/batman-adv/gateway_client.h | 2 +- net/batman-adv/gateway_common.c | 11 +- net/batman-adv/hash.c | 6 +- net/batman-adv/hash.h | 12 +- net/batman-adv/icmp_socket.c | 2 +- net/batman-adv/main.c | 69 +++++----- net/batman-adv/main.h | 48 ++++--- net/batman-adv/multicast.c | 16 +-- net/batman-adv/network-coding.c | 44 ++++--- net/batman-adv/originator.c | 15 ++- net/batman-adv/originator.h | 11 +- net/batman-adv/packet.h | 204 +++++++++++++++--------------- net/batman-adv/routing.c | 24 ++-- net/batman-adv/routing.h | 2 +- net/batman-adv/send.c | 8 +- net/batman-adv/send.h | 8 +- net/batman-adv/soft-interface.c | 26 ++-- net/batman-adv/sysfs.c | 4 +- net/batman-adv/translation-table.c | 224 ++++++++++++++++----------------- net/batman-adv/translation-table.h | 29 +++-- net/batman-adv/types.h | 105 ++++++++-------- 30 files changed, 563 insertions(+), 585 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 753383c2215c..f550a6ab77bb 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -77,8 +77,7 @@ enum batadv_dup_status { * @lq_index: index to store the value at * @value: value to store in the ring buffer */ -static void batadv_ring_buffer_set(uint8_t lq_recv[], uint8_t *lq_index, - uint8_t value) +static void batadv_ring_buffer_set(u8 lq_recv[], u8 *lq_index, u8 value) { lq_recv[*lq_index] = value; *lq_index = (*lq_index + 1) % BATADV_TQ_GLOBAL_WINDOW_SIZE; @@ -91,12 +90,12 @@ static void batadv_ring_buffer_set(uint8_t lq_recv[], uint8_t *lq_index, * * Returns computed average value. */ -static uint8_t batadv_ring_buffer_avg(const uint8_t lq_recv[]) +static u8 batadv_ring_buffer_avg(const u8 lq_recv[]) { - const uint8_t *ptr; - uint16_t count = 0; - uint16_t i = 0; - uint16_t sum = 0; + const u8 *ptr; + u16 count = 0; + u16 i = 0; + u16 sum = 0; ptr = lq_recv; @@ -113,7 +112,7 @@ static uint8_t batadv_ring_buffer_avg(const uint8_t lq_recv[]) if (count == 0) return 0; - return (uint8_t)(sum / count); + return (u8)(sum / count); } /** @@ -155,14 +154,14 @@ static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node, kfree(orig_node->bat_iv.bcast_own); orig_node->bat_iv.bcast_own = data_ptr; - data_ptr = kmalloc_array(max_if_num, sizeof(uint8_t), GFP_ATOMIC); + data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC); if (!data_ptr) { kfree(orig_node->bat_iv.bcast_own); goto unlock; } memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum, - (max_if_num - 1) * sizeof(uint8_t)); + (max_if_num - 1) * sizeof(u8)); kfree(orig_node->bat_iv.bcast_own_sum); orig_node->bat_iv.bcast_own_sum = data_ptr; @@ -215,19 +214,19 @@ free_bcast_own: if (max_if_num == 0) goto free_own_sum; - data_ptr = kmalloc_array(max_if_num, sizeof(uint8_t), GFP_ATOMIC); + data_ptr = kmalloc_array(max_if_num, sizeof(u8), GFP_ATOMIC); if (!data_ptr) { kfree(orig_node->bat_iv.bcast_own); goto unlock; } memcpy(data_ptr, orig_node->bat_iv.bcast_own_sum, - del_if_num * sizeof(uint8_t)); + del_if_num * sizeof(u8)); - if_offset = (del_if_num + 1) * sizeof(uint8_t); - memcpy((char *)data_ptr + del_if_num * sizeof(uint8_t), + if_offset = (del_if_num + 1) * sizeof(u8); + memcpy((char *)data_ptr + del_if_num * sizeof(u8), orig_node->bat_iv.bcast_own_sum + if_offset, - (max_if_num - del_if_num) * sizeof(uint8_t)); + (max_if_num - del_if_num) * sizeof(u8)); free_own_sum: kfree(orig_node->bat_iv.bcast_own_sum); @@ -250,7 +249,7 @@ unlock: * If the object does not exists it is created an initialised. */ static struct batadv_orig_node * -batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const uint8_t *addr) +batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr) { struct batadv_orig_node *orig_node; int size, hash_added; @@ -270,7 +269,7 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const uint8_t *addr) if (!orig_node->bat_iv.bcast_own) goto free_orig_node; - size = bat_priv->num_ifaces * sizeof(uint8_t); + size = bat_priv->num_ifaces * sizeof(u8); orig_node->bat_iv.bcast_own_sum = kzalloc(size, GFP_ATOMIC); if (!orig_node->bat_iv.bcast_own_sum) goto free_orig_node; @@ -293,7 +292,7 @@ free_orig_node: static struct batadv_neigh_node * batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, - const uint8_t *neigh_addr, + const u8 *neigh_addr, struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh) { @@ -339,7 +338,7 @@ static int batadv_iv_ogm_iface_enable(struct batadv_hard_iface *hard_iface) { struct batadv_ogm_packet *batadv_ogm_packet; unsigned char *ogm_buff; - uint32_t random_seqno; + u32 random_seqno; /* randomize initial seqno to avoid collision */ get_random_bytes(&random_seqno, sizeof(random_seqno)); @@ -411,8 +410,7 @@ static unsigned long batadv_iv_ogm_fwd_send_time(void) } /* apply hop penalty for a normal link */ -static uint8_t batadv_hop_penalty(uint8_t tq, - const struct batadv_priv *bat_priv) +static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv) { int hop_penalty = atomic_read(&bat_priv->hop_penalty); int new_tq; @@ -442,11 +440,11 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet, { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); const char *fwd_str; - uint8_t packet_num; - int16_t buff_pos; + u8 packet_num; + s16 buff_pos; struct batadv_ogm_packet *batadv_ogm_packet; struct sk_buff *skb; - uint8_t *packet_pos; + u8 *packet_pos; if (hard_iface->if_status != BATADV_IF_ACTIVE) return; @@ -837,7 +835,7 @@ static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); - uint16_t tvlv_len; + u16 tvlv_len; if (batadv_ogm_packet->ttl <= 1) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n"); @@ -896,9 +894,9 @@ batadv_iv_ogm_slide_own_bcast_window(struct batadv_hard_iface *hard_iface) struct hlist_head *head; struct batadv_orig_node *orig_node; unsigned long *word; - uint32_t i; + u32 i; size_t word_index; - uint8_t *w; + u8 *w; int if_num; for (i = 0; i < hash->size; i++) { @@ -927,8 +925,8 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) struct batadv_ogm_packet *batadv_ogm_packet; struct batadv_hard_iface *primary_if, *tmp_hard_iface; int *ogm_buff_len = &hard_iface->bat_iv.ogm_buff_len; - uint32_t seqno; - uint16_t tvlv_len = 0; + u32 seqno; + u16 tvlv_len = 0; unsigned long send_time; primary_if = batadv_primary_if_get_selected(bat_priv); @@ -947,7 +945,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) batadv_ogm_packet->tvlv_len = htons(tvlv_len); /* change sequence number to network order */ - seqno = (uint32_t)atomic_read(&hard_iface->bat_iv.ogm_seqno); + seqno = (u32)atomic_read(&hard_iface->bat_iv.ogm_seqno); batadv_ogm_packet->seqno = htonl(seqno); atomic_inc(&hard_iface->bat_iv.ogm_seqno); @@ -1010,9 +1008,9 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, struct batadv_neigh_node *router = NULL; struct batadv_orig_node *orig_node_tmp; int if_num; - uint8_t sum_orig, sum_neigh; - uint8_t *neigh_addr; - uint8_t tq_avg; + u8 sum_orig, sum_neigh; + u8 *neigh_addr; + u8 tq_avg; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "update_originator(): Searching and updating originator entry of received packet\n"); @@ -1164,8 +1162,8 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node, struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; - uint8_t total_count; - uint8_t orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; + u8 total_count; + u8 orig_eq_count, neigh_rq_count, neigh_rq_inv, tq_own; unsigned int neigh_rq_inv_cube, neigh_rq_max_cube; int tq_asym_penalty, inv_asym_penalty, if_num, ret = 0; unsigned int combined_tq; @@ -1311,13 +1309,13 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr, struct batadv_neigh_node *neigh_node; struct batadv_neigh_ifinfo *neigh_ifinfo; int is_dup; - int32_t seq_diff; + s32 seq_diff; int need_update = 0; int set_mark; enum batadv_dup_status ret = BATADV_NO_DUP; - uint32_t seqno = ntohl(batadv_ogm_packet->seqno); - uint8_t *neigh_addr; - uint8_t packet_count; + u32 seqno = ntohl(batadv_ogm_packet->seqno); + u8 *neigh_addr; + u8 packet_count; unsigned long *bitmap; orig_node = batadv_iv_ogm_orig_get(bat_priv, batadv_ogm_packet->orig); @@ -1418,7 +1416,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, bool sameseq, similar_ttl; struct sk_buff *skb_priv; struct ethhdr *ethhdr; - uint8_t *prev_sender; + u8 *prev_sender; int is_bidirect; /* create a private copy of the skb, as some functions change tq value @@ -1600,7 +1598,7 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, struct batadv_orig_node *orig_neigh_node, *orig_node; struct batadv_hard_iface *hard_iface; struct batadv_ogm_packet *ogm_packet; - uint32_t if_incoming_seqno; + u32 if_incoming_seqno; bool has_directlink_flag; struct ethhdr *ethhdr; bool is_my_oldorig = false; @@ -1673,9 +1671,9 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset, if (is_my_orig) { unsigned long *word; int offset; - int32_t bit_pos; - int16_t if_num; - uint8_t *weight; + s32 bit_pos; + s16 if_num; + u8 *weight; orig_neigh_node = batadv_iv_ogm_orig_get(bat_priv, ethhdr->h_source); @@ -1751,7 +1749,7 @@ static int batadv_iv_ogm_receive(struct sk_buff *skb, { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_ogm_packet *ogm_packet; - uint8_t *packet_pos; + u8 *packet_pos; int ogm_offset; bool ret; @@ -1835,7 +1833,7 @@ static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv, unsigned long last_seen_jiffies; struct hlist_head *head; int batman_count = 0; - uint32_t i; + u32 i; seq_printf(seq, " %-15s %s (%s/%i) %17s [%10s]: %20s ...\n", "Originator", "last-seen", "#", BATADV_TQ_MAX_VALUE, @@ -1903,7 +1901,7 @@ static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing2) { struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo; - uint8_t tq1, tq2; + u8 tq1, tq2; int diff; neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); @@ -1945,7 +1943,7 @@ batadv_iv_ogm_neigh_is_eob(struct batadv_neigh_node *neigh1, struct batadv_hard_iface *if_outgoing2) { struct batadv_neigh_ifinfo *neigh1_ifinfo, *neigh2_ifinfo; - uint8_t tq1, tq2; + u8 tq1, tq2; bool ret; neigh1_ifinfo = batadv_neigh_ifinfo_get(neigh1, if_outgoing1); diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index cf68c328345e..25cbc36e997a 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -21,7 +21,7 @@ #include /* shift the packet array by n places. */ -static void batadv_bitmap_shift_left(unsigned long *seq_bits, int32_t n) +static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n) { if (n <= 0 || n >= BATADV_TQ_LOCAL_WINDOW_SIZE) return; @@ -35,8 +35,8 @@ static void batadv_bitmap_shift_left(unsigned long *seq_bits, int32_t n) * 1 if the window was moved (either new or very old) * 0 if the window was not moved/shifted. */ -int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, - int32_t seq_num_diff, int set_mark) +int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, + int set_mark) { struct batadv_priv *bat_priv = priv; diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index 0c2456225fae..0226b220fe5b 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -28,9 +28,9 @@ * and curr_seqno is within range of last_seqno. Otherwise returns 0. */ static inline int batadv_test_bit(const unsigned long *seq_bits, - uint32_t last_seqno, uint32_t curr_seqno) + u32 last_seqno, u32 curr_seqno) { - int32_t diff; + s32 diff; diff = last_seqno - curr_seqno; if (diff < 0 || diff >= BATADV_TQ_LOCAL_WINDOW_SIZE) @@ -39,7 +39,7 @@ static inline int batadv_test_bit(const unsigned long *seq_bits, } /* turn corresponding bit on, so we can remember that we got the packet */ -static inline void batadv_set_bit(unsigned long *seq_bits, int32_t n) +static inline void batadv_set_bit(unsigned long *seq_bits, s32 n) { /* if too old, just drop it */ if (n < 0 || n >= BATADV_TQ_LOCAL_WINDOW_SIZE) @@ -51,7 +51,7 @@ static inline void batadv_set_bit(unsigned long *seq_bits, int32_t n) /* receive and process one packet, returns 1 if received seq_num is considered * new, 0 if old */ -int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, - int32_t seq_num_diff, int set_mark); +int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff, + int set_mark); #endif /* _NET_BATMAN_ADV_BITARRAY_H_ */ diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index ba0609292ae7..8f9059f15ebb 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -51,7 +51,7 @@ #include "packet.h" #include "translation-table.h" -static const uint8_t batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05}; +static const u8 batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05}; static void batadv_bla_periodic_work(struct work_struct *work); static void @@ -59,10 +59,10 @@ batadv_bla_send_announce(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw); /* return the index of the claim */ -static inline uint32_t batadv_choose_claim(const void *data, uint32_t size) +static inline u32 batadv_choose_claim(const void *data, u32 size) { struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data; - uint32_t hash = 0; + u32 hash = 0; hash = jhash(&claim->addr, sizeof(claim->addr), hash); hash = jhash(&claim->vid, sizeof(claim->vid), hash); @@ -71,11 +71,10 @@ static inline uint32_t batadv_choose_claim(const void *data, uint32_t size) } /* return the index of the backbone gateway */ -static inline uint32_t batadv_choose_backbone_gw(const void *data, - uint32_t size) +static inline u32 batadv_choose_backbone_gw(const void *data, u32 size) { const struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data; - uint32_t hash = 0; + u32 hash = 0; hash = jhash(&claim->addr, sizeof(claim->addr), hash); hash = jhash(&claim->vid, sizeof(claim->vid), hash); @@ -192,8 +191,8 @@ static struct batadv_bla_claim * Returns claim if found or NULL otherwise. */ static struct batadv_bla_backbone_gw * -batadv_backbone_hash_find(struct batadv_priv *bat_priv, - uint8_t *addr, unsigned short vid) +batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr, + unsigned short vid) { struct batadv_hashtable *hash = bat_priv->bla.backbone_hash; struct hlist_head *head; @@ -269,14 +268,14 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) * @vid: the VLAN ID * @claimtype: the type of the claim (CLAIM, UNCLAIM, ANNOUNCE, ...) */ -static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac, +static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, unsigned short vid, int claimtype) { struct sk_buff *skb; struct ethhdr *ethhdr; struct batadv_hard_iface *primary_if; struct net_device *soft_iface; - uint8_t *hw_src; + u8 *hw_src; struct batadv_bla_claim_dst local_claim_dest; __be32 zeroip = 0; @@ -304,13 +303,13 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, uint8_t *mac, * with XX = claim type * and YY:YY = group id */ - (uint8_t *)&local_claim_dest); + (u8 *)&local_claim_dest); if (!skb) goto out; ethhdr = (struct ethhdr *)skb->data; - hw_src = (uint8_t *)ethhdr + ETH_HLEN + sizeof(struct arphdr); + hw_src = (u8 *)ethhdr + ETH_HLEN + sizeof(struct arphdr); /* now we pretend that the client would have sent this ... */ switch (claimtype) { @@ -383,7 +382,7 @@ out: * be found. */ static struct batadv_bla_backbone_gw * -batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, uint8_t *orig, +batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid, bool own_backbone) { struct batadv_bla_backbone_gw *entry; @@ -552,7 +551,7 @@ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw) static void batadv_bla_send_announce(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw) { - uint8_t mac[ETH_ALEN]; + u8 mac[ETH_ALEN]; __be16 crc; memcpy(mac, batadv_announce_mac, 4); @@ -571,7 +570,7 @@ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, * @backbone_gw: the backbone gateway which claims it */ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, - const uint8_t *mac, const unsigned short vid, + const u8 *mac, const unsigned short vid, struct batadv_bla_backbone_gw *backbone_gw) { struct batadv_bla_claim *claim; @@ -635,7 +634,7 @@ claim_free_ref: * given mac address and vid. */ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, - const uint8_t *mac, const unsigned short vid) + const u8 *mac, const unsigned short vid) { struct batadv_bla_claim search_claim, *claim; @@ -659,12 +658,11 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, } /* check for ANNOUNCE frame, return 1 if handled */ -static int batadv_handle_announce(struct batadv_priv *bat_priv, - uint8_t *an_addr, uint8_t *backbone_addr, - unsigned short vid) +static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, + u8 *backbone_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; - uint16_t crc; + u16 crc; if (memcmp(an_addr, batadv_announce_mac, 4) != 0) return 0; @@ -708,8 +706,8 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, /* check for REQUEST frame, return 1 if handled */ static int batadv_handle_request(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, - uint8_t *backbone_addr, - struct ethhdr *ethhdr, unsigned short vid) + u8 *backbone_addr, struct ethhdr *ethhdr, + unsigned short vid) { /* check for REQUEST frame */ if (!batadv_compare_eth(backbone_addr, ethhdr->h_dest)) @@ -732,8 +730,8 @@ static int batadv_handle_request(struct batadv_priv *bat_priv, /* check for UNCLAIM frame, return 1 if handled */ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, - uint8_t *backbone_addr, - uint8_t *claim_addr, unsigned short vid) + u8 *backbone_addr, u8 *claim_addr, + unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; @@ -761,7 +759,7 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv, /* check for CLAIM frame, return 1 if handled */ static int batadv_handle_claim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, - uint8_t *backbone_addr, uint8_t *claim_addr, + u8 *backbone_addr, u8 *claim_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; @@ -805,10 +803,10 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv, */ static int batadv_check_claim_group(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, - uint8_t *hw_src, uint8_t *hw_dst, + u8 *hw_src, u8 *hw_dst, struct ethhdr *ethhdr) { - uint8_t *backbone_addr; + u8 *backbone_addr; struct batadv_orig_node *orig_node; struct batadv_bla_claim_dst *bla_dst, *bla_dst_own; @@ -877,7 +875,7 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, struct sk_buff *skb) { struct batadv_bla_claim_dst *bla_dst, *bla_dst_own; - uint8_t *hw_src, *hw_dst; + u8 *hw_src, *hw_dst; struct vlan_hdr *vhdr, vhdr_buf; struct ethhdr *ethhdr; struct arphdr *arphdr; @@ -923,7 +921,7 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, /* pskb_may_pull() may have modified the pointers, get ethhdr again */ ethhdr = eth_hdr(skb); - arphdr = (struct arphdr *)((uint8_t *)ethhdr + headlen); + arphdr = (struct arphdr *)((u8 *)ethhdr + headlen); /* Check whether the ARP frame carries a valid * IP information @@ -937,7 +935,7 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv, if (arphdr->ar_pln != 4) return 0; - hw_src = (uint8_t *)arphdr + sizeof(struct arphdr); + hw_src = (u8 *)arphdr + sizeof(struct arphdr); hw_dst = hw_src + ETH_ALEN + 4; bla_dst = (struct batadv_bla_claim_dst *)hw_dst; bla_dst_own = &bat_priv->bla.claim_dest; @@ -1238,9 +1236,9 @@ static struct lock_class_key batadv_backbone_hash_lock_class_key; int batadv_bla_init(struct batadv_priv *bat_priv) { int i; - uint8_t claim_dest[ETH_ALEN] = {0xff, 0x43, 0x05, 0x00, 0x00, 0x00}; + u8 claim_dest[ETH_ALEN] = {0xff, 0x43, 0x05, 0x00, 0x00, 0x00}; struct batadv_hard_iface *primary_if; - uint16_t crc; + u16 crc; unsigned long entrytime; spin_lock_init(&bat_priv->bla.bcast_duplist_lock); @@ -1368,7 +1366,7 @@ out: * * Returns true if orig is a backbone for this vid, false otherwise. */ -bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, uint8_t *orig, +bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->bla.backbone_hash; @@ -1647,9 +1645,9 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) struct batadv_bla_claim *claim; struct batadv_hard_iface *primary_if; struct hlist_head *head; - uint32_t i; + u32 i; bool is_own; - uint8_t *primary_addr; + u8 *primary_addr; primary_if = batadv_seq_print_text_primary_if_get(seq); if (!primary_if) @@ -1692,9 +1690,9 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) struct batadv_hard_iface *primary_if; struct hlist_head *head; int secs, msecs; - uint32_t i; + u32 i; bool is_own; - uint8_t *primary_addr; + u8 *primary_addr; primary_if = batadv_seq_print_text_primary_if_get(seq); if (!primary_if) diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 0282690389ac..449f2f1350a5 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -38,7 +38,7 @@ int batadv_bla_is_backbone_gw(struct sk_buff *skb, int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset); int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset); -bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, uint8_t *orig, +bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid); int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb); @@ -84,8 +84,7 @@ static inline int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, } static inline bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, - uint8_t *orig, - unsigned short vid) + u8 *orig, unsigned short vid) { return false; } diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index cc7d87d64987..a63ea27eca6c 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -102,7 +102,7 @@ static void __batadv_dat_purge(struct batadv_priv *bat_priv, struct batadv_dat_entry *dat_entry; struct hlist_node *node_tmp; struct hlist_head *head; - uint32_t i; + u32 i; if (!bat_priv->dat.hash) return; @@ -168,11 +168,11 @@ static int batadv_compare_dat(const struct hlist_node *node, const void *data2) * * Returns the value of the hw_src field in the ARP packet. */ -static uint8_t *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size) +static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size) { - uint8_t *addr; + u8 *addr; - addr = (uint8_t *)(skb->data + hdr_size); + addr = (u8 *)(skb->data + hdr_size); addr += ETH_HLEN + sizeof(struct arphdr); return addr; @@ -197,7 +197,7 @@ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size) * * Returns the value of the hw_dst field in the ARP packet. */ -static uint8_t *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size) +static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size) { return batadv_arp_hw_src(skb, hdr_size) + ETH_ALEN + 4; } @@ -221,12 +221,12 @@ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size) * * Returns the selected index in the hash table for the given data. */ -static uint32_t batadv_hash_dat(const void *data, uint32_t size) +static u32 batadv_hash_dat(const void *data, u32 size) { - uint32_t hash = 0; + u32 hash = 0; const struct batadv_dat_entry *dat = data; const unsigned char *key; - uint32_t i; + u32 i; key = (const unsigned char *)&dat->ip; for (i = 0; i < sizeof(dat->ip); i++) { @@ -265,7 +265,7 @@ batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip, struct hlist_head *head; struct batadv_dat_entry to_find, *dat_entry, *dat_entry_tmp = NULL; struct batadv_hashtable *hash = bat_priv->dat.hash; - uint32_t index; + u32 index; if (!hash) return NULL; @@ -300,7 +300,7 @@ batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip, * @vid: VLAN identifier */ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, - uint8_t *mac_addr, unsigned short vid) + u8 *mac_addr, unsigned short vid) { struct batadv_dat_entry *dat_entry; int hash_added; @@ -357,11 +357,11 @@ out: * @msg: message to print together with the debugging information */ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, - uint16_t type, int hdr_size, char *msg) + u16 type, int hdr_size, char *msg) { struct batadv_unicast_4addr_packet *unicast_4addr_packet; struct batadv_bcast_packet *bcast_pkt; - uint8_t *orig_addr; + u8 *orig_addr; __be32 ip_src, ip_dst; if (msg) @@ -424,7 +424,7 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, #else static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb, - uint16_t type, int hdr_size, char *msg) + u16 type, int hdr_size, char *msg) { } @@ -709,9 +709,8 @@ void batadv_dat_status_update(struct net_device *net_dev) */ static void batadv_dat_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t flags, - void *tvlv_value, - uint16_t tvlv_value_len) + u8 flags, + void *tvlv_value, u16 tvlv_value_len) { if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) clear_bit(BATADV_ORIG_CAPA_HAS_DAT, &orig->capabilities); @@ -787,7 +786,7 @@ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset) struct hlist_head *head; unsigned long last_seen_jiffies; int last_seen_msecs, last_seen_secs, last_seen_mins; - uint32_t i; + u32 i; primary_if = batadv_seq_print_text_primary_if_get(seq); if (!primary_if) @@ -830,14 +829,14 @@ out: * * Returns the ARP type if the skb contains a valid ARP packet, 0 otherwise. */ -static uint16_t batadv_arp_get_type(struct batadv_priv *bat_priv, - struct sk_buff *skb, int hdr_size) +static u16 batadv_arp_get_type(struct batadv_priv *bat_priv, + struct sk_buff *skb, int hdr_size) { struct arphdr *arphdr; struct ethhdr *ethhdr; __be32 ip_src, ip_dst; - uint8_t *hw_src, *hw_dst; - uint16_t type = 0; + u8 *hw_src, *hw_dst; + u16 type = 0; /* pull the ethernet header */ if (unlikely(!pskb_may_pull(skb, hdr_size + ETH_HLEN))) @@ -934,9 +933,9 @@ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size) bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, struct sk_buff *skb) { - uint16_t type = 0; + u16 type = 0; __be32 ip_dst, ip_src; - uint8_t *hw_src; + u8 *hw_src; bool ret = false; struct batadv_dat_entry *dat_entry = NULL; struct sk_buff *skb_new; @@ -1022,9 +1021,9 @@ out: bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) { - uint16_t type; + u16 type; __be32 ip_src, ip_dst; - uint8_t *hw_src; + u8 *hw_src; struct sk_buff *skb_new; struct batadv_dat_entry *dat_entry = NULL; bool ret = false; @@ -1100,9 +1099,9 @@ out: void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv, struct sk_buff *skb) { - uint16_t type; + u16 type; __be32 ip_src, ip_dst; - uint8_t *hw_src, *hw_dst; + u8 *hw_src, *hw_dst; int hdr_size = 0; unsigned short vid; @@ -1146,9 +1145,9 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv, bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv, struct sk_buff *skb, int hdr_size) { - uint16_t type; + u16 type; __be32 ip_src, ip_dst; - uint8_t *hw_src, *hw_dst; + u8 *hw_src, *hw_dst; bool dropped = false; unsigned short vid; @@ -1202,7 +1201,7 @@ out: bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv, struct batadv_forw_packet *forw_packet) { - uint16_t type; + u16 type; __be32 ip_dst; struct batadv_dat_entry *dat_entry = NULL; bool ret = false; diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index 3181507ebc14..26d4a525a798 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -54,7 +54,7 @@ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv, static inline void batadv_dat_init_orig_node_addr(struct batadv_orig_node *orig_node) { - uint32_t addr; + u32 addr; addr = batadv_choose_orig(orig_node->orig, BATADV_DAT_ADDR_MAX); orig_node->dat_addr = (batadv_dat_addr_t)addr; @@ -69,7 +69,7 @@ static inline void batadv_dat_init_own_addr(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if) { - uint32_t addr; + u32 addr; addr = batadv_choose_orig(primary_if->net_dev->dev_addr, BATADV_DAT_ADDR_MAX); @@ -89,7 +89,7 @@ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset); * Updates the ethtool statistics for the received packet if it is a DAT subtype */ static inline void batadv_dat_inc_counter(struct batadv_priv *bat_priv, - uint8_t subtype) + u8 subtype) { switch (subtype) { case BATADV_P_DAT_DHT_GET: @@ -169,7 +169,7 @@ static inline void batadv_dat_free(struct batadv_priv *bat_priv) } static inline void batadv_dat_inc_counter(struct batadv_priv *bat_priv, - uint8_t subtype) + u8 subtype) { } diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index c0f0d01ab244..ec5f7bcad66e 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -66,7 +66,7 @@ void batadv_frag_purge_orig(struct batadv_orig_node *orig_node, bool (*check_cb)(struct batadv_frag_table_entry *)) { struct batadv_frag_table_entry *chain; - uint8_t i; + u8 i; for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) { chain = &orig_node->fragments[i]; @@ -110,7 +110,7 @@ static int batadv_frag_size_limit(void) * without searching for the right position. */ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain, - uint16_t seqno) + u16 seqno) { if (chain->seqno == seqno) return false; @@ -145,8 +145,8 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, struct batadv_frag_list_entry *frag_entry_new = NULL, *frag_entry_curr; struct batadv_frag_list_entry *frag_entry_last = NULL; struct batadv_frag_packet *frag_packet; - uint8_t bucket; - uint16_t seqno, hdr_size = sizeof(struct batadv_frag_packet); + u8 bucket; + u16 seqno, hdr_size = sizeof(struct batadv_frag_packet); bool ret = false; /* Linearize packet to avoid linearizing 16 packets in a row when doing @@ -351,7 +351,7 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb, struct batadv_orig_node *orig_node_dst = NULL; struct batadv_neigh_node *neigh_node = NULL; struct batadv_frag_packet *packet; - uint16_t total_size; + u16 total_size; bool ret = false; packet = (struct batadv_frag_packet *)skb->data; diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 6012e2b4af4f..6269c75919bf 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -153,9 +153,9 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) struct batadv_neigh_node *router; struct batadv_neigh_ifinfo *router_ifinfo; struct batadv_gw_node *gw_node, *curr_gw = NULL; - uint64_t max_gw_factor = 0, tmp_gw_factor = 0; - uint8_t max_tq = 0; - uint8_t tq_avg; + u64 max_gw_factor = 0, tmp_gw_factor = 0; + u8 max_tq = 0; + u8 tq_avg; struct batadv_orig_node *orig_node; rcu_read_lock(); @@ -348,7 +348,7 @@ void batadv_gw_check_election(struct batadv_priv *bat_priv, struct batadv_neigh_ifinfo *router_gw_tq = NULL; struct batadv_orig_node *curr_gw_orig; struct batadv_neigh_node *router_gw = NULL, *router_orig = NULL; - uint8_t gw_tq_avg, orig_tq_avg; + u8 gw_tq_avg, orig_tq_avg; curr_gw_orig = batadv_gw_get_selected_orig(bat_priv); if (!curr_gw_orig) @@ -688,7 +688,7 @@ out: */ enum batadv_dhcp_recipient batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, - uint8_t *chaddr) + u8 *chaddr) { enum batadv_dhcp_recipient ret = BATADV_DHCP_NO; struct ethhdr *ethhdr; @@ -698,7 +698,7 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, struct vlan_ethhdr *vhdr; int chaddr_offset; __be16 proto; - uint8_t *p; + u8 *p; /* check for ethernet header */ if (!pskb_may_pull(skb, *header_len + ETH_HLEN)) @@ -814,7 +814,7 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct batadv_neigh_ifinfo *curr_ifinfo, *old_ifinfo; struct ethhdr *ethhdr = (struct ethhdr *)skb->data; bool out_of_range = false; - uint8_t curr_tq_avg; + u8 curr_tq_avg; unsigned short vid; vid = batadv_get_vid(skb, 0); diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index 89565b451c18..ef4d7e336651 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -43,6 +43,6 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset); bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb); enum batadv_dhcp_recipient batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, - uint8_t *chaddr); + u8 *chaddr); #endif /* _NET_BATMAN_ADV_GATEWAY_CLIENT_H_ */ diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 39cf44ccebd4..9e32c8ebc28f 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -39,7 +39,7 @@ * Returns false on parse error and true otherwise. */ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, - uint32_t *down, uint32_t *up) + u32 *down, u32 *up) { enum batadv_bandwidth_units bw_unit_type = BATADV_BW_UNIT_KBIT; char *slash_ptr, *tmp_ptr; @@ -124,7 +124,7 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv) { struct batadv_tvlv_gateway_data gw; - uint32_t down, up; + u32 down, up; char gw_mode; gw_mode = atomic_read(&bat_priv->gw_mode); @@ -149,7 +149,7 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, size_t count) { struct batadv_priv *bat_priv = netdev_priv(net_dev); - uint32_t down_curr, up_curr, down_new = 0, up_new = 0; + u32 down_curr, up_curr, down_new = 0, up_new = 0; bool ret; down_curr = (unsigned int)atomic_read(&bat_priv->gw.bandwidth_down); @@ -195,9 +195,8 @@ end: */ static void batadv_gw_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t flags, - void *tvlv_value, - uint16_t tvlv_value_len) + u8 flags, + void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_gateway_data gateway, *gateway_ptr; diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index e89f3146b092..2ea6a18d793f 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -25,7 +25,7 @@ /* clears the hash */ static void batadv_hash_init(struct batadv_hashtable *hash) { - uint32_t i; + u32 i; for (i = 0; i < hash->size; i++) { INIT_HLIST_HEAD(&hash->table[i]); @@ -42,7 +42,7 @@ void batadv_hash_destroy(struct batadv_hashtable *hash) } /* allocates and clears the hash */ -struct batadv_hashtable *batadv_hash_new(uint32_t size) +struct batadv_hashtable *batadv_hash_new(u32 size) { struct batadv_hashtable *hash; @@ -73,7 +73,7 @@ free_hash: void batadv_hash_set_lock_class(struct batadv_hashtable *hash, struct lock_class_key *key) { - uint32_t i; + u32 i; for (i = 0; i < hash->size; i++) lockdep_set_class(&hash->list_locks[i], key); diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 5065f50c9c3c..377626250ac7 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -39,17 +39,17 @@ typedef int (*batadv_hashdata_compare_cb)(const struct hlist_node *, * based on the key in the data of the first * argument and the size the second */ -typedef uint32_t (*batadv_hashdata_choose_cb)(const void *, uint32_t); +typedef u32 (*batadv_hashdata_choose_cb)(const void *, u32); typedef void (*batadv_hashdata_free_cb)(struct hlist_node *, void *); struct batadv_hashtable { struct hlist_head *table; /* the hashtable itself with the buckets */ spinlock_t *list_locks; /* spinlock for each hash list entry */ - uint32_t size; /* size of hashtable */ + u32 size; /* size of hashtable */ }; /* allocates and clears the hash */ -struct batadv_hashtable *batadv_hash_new(uint32_t size); +struct batadv_hashtable *batadv_hash_new(u32 size); /* set class key for all locks */ void batadv_hash_set_lock_class(struct batadv_hashtable *hash, @@ -69,7 +69,7 @@ static inline void batadv_hash_delete(struct batadv_hashtable *hash, struct hlist_head *head; struct hlist_node *node, *node_tmp; spinlock_t *list_lock; /* spinlock to protect write access */ - uint32_t i; + u32 i; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -105,7 +105,7 @@ static inline int batadv_hash_add(struct batadv_hashtable *hash, const void *data, struct hlist_node *data_node) { - uint32_t index; + u32 index; int ret = -1; struct hlist_head *head; struct hlist_node *node; @@ -149,7 +149,7 @@ static inline void *batadv_hash_remove(struct batadv_hashtable *hash, batadv_hashdata_choose_cb choose, void *data) { - uint32_t index; + u32 index; struct hlist_node *node; struct hlist_head *head; void *data_save = NULL; diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 07061bcbaa04..5a70b7ed2558 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -183,7 +183,7 @@ static ssize_t batadv_socket_write(struct file *file, const char __user *buff, struct batadv_orig_node *orig_node = NULL; struct batadv_neigh_node *neigh_node = NULL; size_t packet_len = sizeof(struct batadv_icmp_packet); - uint8_t *addr; + u8 *addr; if (len < sizeof(struct batadv_icmp_header)) { batadv_dbg(BATADV_DBG_BATMAN, bat_priv, diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 8457097f1643..40750cbe4f4f 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -234,7 +234,7 @@ void batadv_mesh_free(struct net_device *soft_iface) * * Returns 'true' if the mac address was found, false otherwise. */ -bool batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr) +bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr) { const struct batadv_hard_iface *hard_iface; bool is_my_mac = false; @@ -387,7 +387,7 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, struct batadv_priv *bat_priv; struct batadv_ogm_packet *batadv_ogm_packet; struct batadv_hard_iface *hard_iface; - uint8_t idx; + u8 idx; int ret; hard_iface = container_of(ptype, struct batadv_hard_iface, @@ -496,7 +496,7 @@ static void batadv_recv_handler_init(void) } int -batadv_recv_handler_register(uint8_t packet_type, +batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, struct batadv_hard_iface *)) { @@ -512,7 +512,7 @@ batadv_recv_handler_register(uint8_t packet_type, return 0; } -void batadv_recv_handler_unregister(uint8_t packet_type) +void batadv_recv_handler_unregister(u8 packet_type) { batadv_rx_handler[packet_type] = batadv_recv_unhandled_packet; } @@ -642,8 +642,7 @@ batadv_tvlv_handler_free_ref(struct batadv_tvlv_handler *tvlv_handler) * Returns tvlv handler if found or NULL otherwise. */ static struct batadv_tvlv_handler -*batadv_tvlv_handler_get(struct batadv_priv *bat_priv, - uint8_t type, uint8_t version) +*batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_handler *tvlv_handler_tmp, *tvlv_handler = NULL; @@ -691,8 +690,7 @@ static void batadv_tvlv_container_free_ref(struct batadv_tvlv_container *tvlv) * Returns tvlv container if found or NULL otherwise. */ static struct batadv_tvlv_container -*batadv_tvlv_container_get(struct batadv_priv *bat_priv, - uint8_t type, uint8_t version) +*batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version) { struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL; @@ -723,10 +721,10 @@ static struct batadv_tvlv_container * * Returns size of all currently registered tvlv containers in bytes. */ -static uint16_t batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) +static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) { struct batadv_tvlv_container *tvlv; - uint16_t tvlv_len = 0; + u16 tvlv_len = 0; hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) { tvlv_len += sizeof(struct batadv_tvlv_hdr); @@ -764,7 +762,7 @@ static void batadv_tvlv_container_remove(struct batadv_tvlv_container *tvlv) * @version: tvlv container type to unregister */ void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, - uint8_t type, uint8_t version) + u8 type, u8 version) { struct batadv_tvlv_container *tvlv; @@ -787,8 +785,8 @@ void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, * content is going to replace the old one. */ void batadv_tvlv_container_register(struct batadv_priv *bat_priv, - uint8_t type, uint8_t version, - void *tvlv_value, uint16_t tvlv_value_len) + u8 type, u8 version, + void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_container *tvlv_old, *tvlv_new; @@ -861,14 +859,13 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff, * * Returns size of all appended tvlv containers in bytes. */ -uint16_t batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, - unsigned char **packet_buff, - int *packet_buff_len, - int packet_min_len) +u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, + unsigned char **packet_buff, + int *packet_buff_len, int packet_min_len) { struct batadv_tvlv_container *tvlv; struct batadv_tvlv_hdr *tvlv_hdr; - uint16_t tvlv_value_len; + u16 tvlv_value_len; void *tvlv_value; bool ret; @@ -893,7 +890,7 @@ uint16_t batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, tvlv_hdr->len = tvlv->tvlv_hdr.len; tvlv_value = tvlv_hdr + 1; memcpy(tvlv_value, tvlv + 1, ntohs(tvlv->tvlv_hdr.len)); - tvlv_value = (uint8_t *)tvlv_value + ntohs(tvlv->tvlv_hdr.len); + tvlv_value = (u8 *)tvlv_value + ntohs(tvlv->tvlv_hdr.len); } end: @@ -920,8 +917,8 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, struct batadv_tvlv_handler *tvlv_handler, bool ogm_source, struct batadv_orig_node *orig_node, - uint8_t *src, uint8_t *dst, - void *tvlv_value, uint16_t tvlv_value_len) + u8 *src, u8 *dst, + void *tvlv_value, u16 tvlv_value_len) { if (!tvlv_handler) return NET_RX_SUCCESS; @@ -972,13 +969,13 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv, int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, bool ogm_source, struct batadv_orig_node *orig_node, - uint8_t *src, uint8_t *dst, - void *tvlv_value, uint16_t tvlv_value_len) + u8 *src, u8 *dst, + void *tvlv_value, u16 tvlv_value_len) { struct batadv_tvlv_handler *tvlv_handler; struct batadv_tvlv_hdr *tvlv_hdr; - uint16_t tvlv_value_cont_len; - uint8_t cifnotfound = BATADV_TVLV_HANDLER_OGM_CIFNOTFND; + u16 tvlv_value_cont_len; + u8 cifnotfound = BATADV_TVLV_HANDLER_OGM_CIFNOTFND; int ret = NET_RX_SUCCESS; while (tvlv_value_len >= sizeof(*tvlv_hdr)) { @@ -1000,7 +997,7 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, tvlv_value_cont_len); if (tvlv_handler) batadv_tvlv_handler_free_ref(tvlv_handler); - tvlv_value = (uint8_t *)tvlv_value + tvlv_value_cont_len; + tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len; tvlv_value_len -= tvlv_value_cont_len; } @@ -1034,7 +1031,7 @@ void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { void *tvlv_value; - uint16_t tvlv_value_len; + u16 tvlv_value_len; if (!batadv_ogm_packet) return; @@ -1066,14 +1063,14 @@ void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, void (*optr)(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t flags, + u8 flags, void *tvlv_value, - uint16_t tvlv_value_len), + u16 tvlv_value_len), int (*uptr)(struct batadv_priv *bat_priv, - uint8_t *src, uint8_t *dst, + u8 *src, u8 *dst, void *tvlv_value, - uint16_t tvlv_value_len), - uint8_t type, uint8_t version, uint8_t flags) + u16 tvlv_value_len), + u8 type, u8 version, u8 flags) { struct batadv_tvlv_handler *tvlv_handler; @@ -1108,7 +1105,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, * @version: tvlv handler version to be unregistered */ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, - uint8_t type, uint8_t version) + u8 type, u8 version) { struct batadv_tvlv_handler *tvlv_handler; @@ -1134,9 +1131,9 @@ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, * @tvlv_value: tvlv content * @tvlv_value_len: tvlv content length */ -void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, uint8_t *src, - uint8_t *dst, uint8_t type, uint8_t version, - void *tvlv_value, uint16_t tvlv_value_len) +void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, + u8 *dst, u8 type, u8 version, + void *tvlv_value, u16 tvlv_value_len) { struct batadv_unicast_tvlv_packet *unicast_tvlv_packet; struct batadv_tvlv_hdr *tvlv_hdr; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 41d27c7872b9..78500ac725d6 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -193,7 +193,7 @@ extern struct workqueue_struct *batadv_event_workqueue; int batadv_mesh_init(struct net_device *soft_iface); void batadv_mesh_free(struct net_device *soft_iface); -bool batadv_is_my_mac(struct batadv_priv *bat_priv, const uint8_t *addr); +bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr); struct batadv_hard_iface * batadv_seq_print_text_primary_if_get(struct seq_file *seq); int batadv_max_header_len(void); @@ -202,10 +202,10 @@ int batadv_batman_skb_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev); int -batadv_recv_handler_register(uint8_t packet_type, +batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, struct batadv_hard_iface *)); -void batadv_recv_handler_unregister(uint8_t packet_type); +void batadv_recv_handler_unregister(u8 packet_type); int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops); int batadv_algo_select(struct batadv_priv *bat_priv, char *name); int batadv_algo_seq_print_text(struct seq_file *seq, void *offset); @@ -304,7 +304,7 @@ static inline bool batadv_has_timed_out(unsigned long timestamp, * they handle overflows/underflows and can correctly check for a * predecessor/successor unless the variable sequence number has grown by * more then 2**(bitwidth(x)-1)-1. - * This means that for a uint8_t with the maximum value 255, it would think: + * This means that for a u8 with the maximum value 255, it would think: * - when adding nothing - it is neither a predecessor nor a successor * - before adding more than 127 to the starting value - it is a predecessor, * - when adding 128 - it is neither a predecessor nor a successor, @@ -327,10 +327,9 @@ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx, #define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1) /* Sum and return the cpu-local counters for index 'idx' */ -static inline uint64_t batadv_sum_counter(struct batadv_priv *bat_priv, - size_t idx) +static inline u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx) { - uint64_t *counters, sum = 0; + u64 *counters, sum = 0; int cpu; for_each_possible_cpu(cpu) { @@ -348,39 +347,38 @@ static inline uint64_t batadv_sum_counter(struct batadv_priv *bat_priv, #define BATADV_SKB_CB(__skb) ((struct batadv_skb_cb *)&((__skb)->cb[0])) void batadv_tvlv_container_register(struct batadv_priv *bat_priv, - uint8_t type, uint8_t version, - void *tvlv_value, uint16_t tvlv_value_len); -uint16_t batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, - unsigned char **packet_buff, - int *packet_buff_len, - int packet_min_len); + u8 type, u8 version, + void *tvlv_value, u16 tvlv_value_len); +u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv, + unsigned char **packet_buff, + int *packet_buff_len, int packet_min_len); void batadv_tvlv_ogm_receive(struct batadv_priv *bat_priv, struct batadv_ogm_packet *batadv_ogm_packet, struct batadv_orig_node *orig_node); void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, - uint8_t type, uint8_t version); + u8 type, u8 version); void batadv_tvlv_handler_register(struct batadv_priv *bat_priv, void (*optr)(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t flags, + u8 flags, void *tvlv_value, - uint16_t tvlv_value_len), + u16 tvlv_value_len), int (*uptr)(struct batadv_priv *bat_priv, - uint8_t *src, uint8_t *dst, + u8 *src, u8 *dst, void *tvlv_value, - uint16_t tvlv_value_len), - uint8_t type, uint8_t version, uint8_t flags); + u16 tvlv_value_len), + u8 type, u8 version, u8 flags); void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv, - uint8_t type, uint8_t version); + u8 type, u8 version); int batadv_tvlv_containers_process(struct batadv_priv *bat_priv, bool ogm_source, struct batadv_orig_node *orig_node, - uint8_t *src, uint8_t *dst, - void *tvlv_buff, uint16_t tvlv_buff_len); -void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, uint8_t *src, - uint8_t *dst, uint8_t type, uint8_t version, - void *tvlv_value, uint16_t tvlv_value_len); + u8 *src, u8 *dst, + void *tvlv_buff, u16 tvlv_buff_len); +void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src, + u8 *dst, u8 type, u8 version, + void *tvlv_value, u16 tvlv_value_len); unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len); bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid); diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 68a9554961eb..2593f0fe0bad 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -89,7 +89,7 @@ static int batadv_mcast_mla_softif_get(struct net_device *dev, * Returns true if the given address is already in the given list. * Otherwise returns false. */ -static bool batadv_mcast_mla_is_duplicate(uint8_t *mcast_addr, +static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; @@ -595,7 +595,7 @@ batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, */ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t mcast_flags) + u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_unsnoopables_node; struct hlist_head *head = &bat_priv->mcast.want_all_unsnoopables_list; @@ -638,7 +638,7 @@ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, */ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t mcast_flags) + u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_ipv4_node; struct hlist_head *head = &bat_priv->mcast.want_all_ipv4_list; @@ -681,7 +681,7 @@ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, */ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t mcast_flags) + u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_ipv6_node; struct hlist_head *head = &bat_priv->mcast.want_all_ipv6_list; @@ -721,17 +721,17 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, */ static void batadv_mcast_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t flags, + u8 flags, void *tvlv_value, - uint16_t tvlv_value_len) + u16 tvlv_value_len) { bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND); - uint8_t mcast_flags = BATADV_NO_FLAGS; + u8 mcast_flags = BATADV_NO_FLAGS; bool orig_initialized; if (orig_mcast_enabled && tvlv_value && (tvlv_value_len >= sizeof(mcast_flags))) - mcast_flags = *(uint8_t *)tvlv_value; + mcast_flags = *(u8 *)tvlv_value; spin_lock_bh(&orig->mcast_handler_lock); orig_initialized = test_bit(BATADV_ORIG_CAPA_HAS_MCAST, diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 46604010dcd4..60dc7a6910b1 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -130,9 +130,8 @@ void batadv_nc_status_update(struct net_device *net_dev) */ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t flags, - void *tvlv_value, - uint16_t tvlv_value_len) + u8 flags, + void *tvlv_value, u16 tvlv_value_len) { if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) clear_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities); @@ -382,7 +381,7 @@ static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv) struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; struct batadv_orig_node *orig_node; - uint32_t i; + u32 i; if (!hash) return; @@ -418,7 +417,7 @@ static void batadv_nc_purge_paths(struct batadv_priv *bat_priv, struct hlist_node *node_tmp; struct batadv_nc_path *nc_path; spinlock_t *lock; /* Protects lists in hash */ - uint32_t i; + u32 i; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -478,10 +477,10 @@ static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src, * * Returns the selected index in the hash table for the given data. */ -static uint32_t batadv_nc_hash_choose(const void *data, uint32_t size) +static u32 batadv_nc_hash_choose(const void *data, u32 size) { const struct batadv_nc_path *nc_path = data; - uint32_t hash = 0; + u32 hash = 0; hash = jhash(&nc_path->prev_hop, sizeof(nc_path->prev_hop), hash); hash = jhash(&nc_path->next_hop, sizeof(nc_path->next_hop), hash); @@ -744,8 +743,8 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv, struct batadv_ogm_packet *ogm_packet) { struct batadv_orig_ifinfo *orig_ifinfo; - uint32_t last_real_seqno; - uint8_t last_ttl; + u32 last_real_seqno; + u8 last_ttl; orig_ifinfo = batadv_orig_ifinfo_get(orig_node, BATADV_IF_DEFAULT); if (!orig_ifinfo) @@ -938,8 +937,8 @@ out: */ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, struct batadv_hashtable *hash, - uint8_t *src, - uint8_t *dst) + u8 *src, + u8 *dst) { int hash_added; struct batadv_nc_path *nc_path, nc_path_key; @@ -991,9 +990,9 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, * selection of a receiver with slightly lower TQ than the other * @tq: to be weighted tq value */ -static uint8_t batadv_nc_random_weight_tq(uint8_t tq) +static u8 batadv_nc_random_weight_tq(u8 tq) { - uint8_t rand_val, rand_tq; + u8 rand_val, rand_tq; get_random_bytes(&rand_val, sizeof(rand_val)); @@ -1038,7 +1037,7 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, struct batadv_nc_packet *nc_packet, struct batadv_neigh_node *neigh_node) { - uint8_t tq_weighted_neigh, tq_weighted_coding, tq_tmp; + u8 tq_weighted_neigh, tq_weighted_coding, tq_tmp; struct sk_buff *skb_dest, *skb_src; struct batadv_unicast_packet *packet1; struct batadv_unicast_packet *packet2; @@ -1047,7 +1046,7 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, struct batadv_neigh_node *router_coding = NULL; struct batadv_neigh_ifinfo *router_neigh_ifinfo = NULL; struct batadv_neigh_ifinfo *router_coding_ifinfo = NULL; - uint8_t *first_source, *first_dest, *second_source, *second_dest; + u8 *first_source, *first_dest, *second_source, *second_dest; __be32 packet_id1, packet_id2; size_t count; bool res = false; @@ -1231,8 +1230,7 @@ out: * * Returns true if coding of a decoded packet is allowed. */ -static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, - uint8_t *dst, uint8_t *src) +static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src) { if (BATADV_SKB_CB(skb)->decoded && !batadv_compare_eth(dst, src)) return false; @@ -1255,7 +1253,7 @@ batadv_nc_path_search(struct batadv_priv *bat_priv, struct batadv_nc_node *in_nc_node, struct batadv_nc_node *out_nc_node, struct sk_buff *skb, - uint8_t *eth_dst) + u8 *eth_dst) { struct batadv_nc_path *nc_path, nc_path_key; struct batadv_nc_packet *nc_packet_out = NULL; @@ -1321,8 +1319,8 @@ batadv_nc_path_search(struct batadv_priv *bat_priv, static struct batadv_nc_packet * batadv_nc_skb_src_search(struct batadv_priv *bat_priv, struct sk_buff *skb, - uint8_t *eth_dst, - uint8_t *eth_src, + u8 *eth_dst, + u8 *eth_src, struct batadv_nc_node *in_nc_node) { struct batadv_orig_node *orig_node; @@ -1362,7 +1360,7 @@ batadv_nc_skb_src_search(struct batadv_priv *bat_priv, */ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, struct sk_buff *skb, - uint8_t *eth_dst_new) + u8 *eth_dst_new) { struct ethhdr *ethhdr; @@ -1638,7 +1636,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_unicast_packet *unicast_packet; struct batadv_coded_packet coded_packet_tmp; struct ethhdr *ethhdr, ethhdr_tmp; - uint8_t *orig_dest, ttl, ttvn; + u8 *orig_dest, ttl, ttvn; unsigned int coding_len; int err; @@ -1730,7 +1728,7 @@ batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv, struct batadv_hashtable *hash = bat_priv->nc.decoding_hash; struct batadv_nc_packet *tmp_nc_packet, *nc_packet = NULL; struct batadv_nc_path *nc_path, nc_path_key; - uint8_t *dest, *source; + u8 *dest, *source; __be32 packet_id; int index; diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 32a0fcfab36d..4500e3a08719 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -452,8 +452,7 @@ out: */ struct batadv_neigh_node * batadv_neigh_node_new(struct batadv_hard_iface *hard_iface, - const uint8_t *neigh_addr, - struct batadv_orig_node *orig_node) + const u8 *neigh_addr, struct batadv_orig_node *orig_node) { struct batadv_neigh_node *neigh_node; @@ -489,7 +488,7 @@ out: struct batadv_neigh_node * batadv_neigh_node_get(const struct batadv_orig_node *orig_node, const struct batadv_hard_iface *hard_iface, - const uint8_t *addr) + const u8 *addr) { struct batadv_neigh_node *tmp_neigh_node, *res = NULL; @@ -624,7 +623,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv) struct hlist_head *head; spinlock_t *list_lock; /* spinlock to protect write access */ struct batadv_orig_node *orig_node; - uint32_t i; + u32 i; if (!hash) return; @@ -659,7 +658,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv) * Returns the newly created object or NULL on failure. */ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, - const uint8_t *addr) + const u8 *addr) { struct batadv_orig_node *orig_node; struct batadv_orig_node_vlan *vlan; @@ -981,7 +980,7 @@ static void _batadv_purge_orig(struct batadv_priv *bat_priv) struct hlist_head *head; spinlock_t *list_lock; /* spinlock to protect write access */ struct batadv_orig_node *orig_node; - uint32_t i; + u32 i; if (!hash) return; @@ -1115,7 +1114,7 @@ int batadv_orig_hash_add_if(struct batadv_hard_iface *hard_iface, struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; struct batadv_orig_node *orig_node; - uint32_t i; + u32 i; int ret; /* resize all orig nodes because orig_node->bcast_own(_sum) depend on @@ -1152,7 +1151,7 @@ int batadv_orig_hash_del_if(struct batadv_hard_iface *hard_iface, struct batadv_hard_iface *hard_iface_tmp; struct batadv_orig_node *orig_node; struct batadv_algo_ops *bao = bat_priv->bat_algo_ops; - uint32_t i; + u32 i; int ret; /* resize all orig nodes because orig_node->bcast_own(_sum) depend on diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 79734d302010..3fc76f6f710c 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -40,15 +40,14 @@ void batadv_purge_orig_ref(struct batadv_priv *bat_priv); void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node); void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node); struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, - const uint8_t *addr); + const u8 *addr); struct batadv_neigh_node * batadv_neigh_node_get(const struct batadv_orig_node *orig_node, const struct batadv_hard_iface *hard_iface, - const uint8_t *addr); + const u8 *addr); struct batadv_neigh_node * batadv_neigh_node_new(struct batadv_hard_iface *hard_iface, - const uint8_t *neigh_addr, - struct batadv_orig_node *orig_node); + const u8 *neigh_addr, struct batadv_orig_node *orig_node); void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node); struct batadv_neigh_node * batadv_orig_router_get(struct batadv_orig_node *orig_node, @@ -86,9 +85,9 @@ void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan); /* hashfunction to choose an entry in a hash table of given size * hash algorithm from http://en.wikipedia.org/wiki/Hash_table */ -static inline uint32_t batadv_choose_orig(const void *data, uint32_t size) +static inline u32 batadv_choose_orig(const void *data, u32 size) { - uint32_t hash = 0; + u32 hash = 0; hash = jhash(data, ETH_ALEN, hash); return hash % size; diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h index 9e747c08d0bc..11f996b39fef 100644 --- a/net/batman-adv/packet.h +++ b/net/batman-adv/packet.h @@ -197,8 +197,8 @@ enum batadv_tvlv_type { * transport the claim type and the group id */ struct batadv_bla_claim_dst { - uint8_t magic[3]; /* FF:43:05 */ - uint8_t type; /* bla_claimframe */ + u8 magic[3]; /* FF:43:05 */ + u8 type; /* bla_claimframe */ __be16 group; /* group id */ }; @@ -213,16 +213,16 @@ struct batadv_bla_claim_dst { * @tvlv_len: length of tvlv data following the ogm header */ struct batadv_ogm_packet { - uint8_t packet_type; - uint8_t version; - uint8_t ttl; - uint8_t flags; - __be32 seqno; - uint8_t orig[ETH_ALEN]; - uint8_t prev_sender[ETH_ALEN]; - uint8_t reserved; - uint8_t tq; - __be16 tvlv_len; + u8 packet_type; + u8 version; + u8 ttl; + u8 flags; + __be32 seqno; + u8 orig[ETH_ALEN]; + u8 prev_sender[ETH_ALEN]; + u8 reserved; + u8 tq; + __be16 tvlv_len; /* __packed is not needed as the struct size is divisible by 4, * and the largest data type in this struct has a size of 4. */ @@ -246,14 +246,14 @@ struct batadv_ogm_packet { * members are padded the same way as they are in real packets. */ struct batadv_icmp_header { - uint8_t packet_type; - uint8_t version; - uint8_t ttl; - uint8_t msg_type; /* see ICMP message types above */ - uint8_t dst[ETH_ALEN]; - uint8_t orig[ETH_ALEN]; - uint8_t uid; - uint8_t align[3]; + u8 packet_type; + u8 version; + u8 ttl; + u8 msg_type; /* see ICMP message types above */ + u8 dst[ETH_ALEN]; + u8 orig[ETH_ALEN]; + u8 uid; + u8 align[3]; }; /** @@ -269,15 +269,15 @@ struct batadv_icmp_header { * @seqno: ICMP sequence number */ struct batadv_icmp_packet { - uint8_t packet_type; - uint8_t version; - uint8_t ttl; - uint8_t msg_type; /* see ICMP message types above */ - uint8_t dst[ETH_ALEN]; - uint8_t orig[ETH_ALEN]; - uint8_t uid; - uint8_t reserved; - __be16 seqno; + u8 packet_type; + u8 version; + u8 ttl; + u8 msg_type; /* see ICMP message types above */ + u8 dst[ETH_ALEN]; + u8 orig[ETH_ALEN]; + u8 uid; + u8 reserved; + __be16 seqno; }; #define BATADV_RR_LEN 16 @@ -296,16 +296,16 @@ struct batadv_icmp_packet { * @rr: route record array */ struct batadv_icmp_packet_rr { - uint8_t packet_type; - uint8_t version; - uint8_t ttl; - uint8_t msg_type; /* see ICMP message types above */ - uint8_t dst[ETH_ALEN]; - uint8_t orig[ETH_ALEN]; - uint8_t uid; - uint8_t rr_cur; - __be16 seqno; - uint8_t rr[BATADV_RR_LEN][ETH_ALEN]; + u8 packet_type; + u8 version; + u8 ttl; + u8 msg_type; /* see ICMP message types above */ + u8 dst[ETH_ALEN]; + u8 orig[ETH_ALEN]; + u8 uid; + u8 rr_cur; + __be16 seqno; + u8 rr[BATADV_RR_LEN][ETH_ALEN]; }; #define BATADV_ICMP_MAX_PACKET_SIZE sizeof(struct batadv_icmp_packet_rr) @@ -331,11 +331,11 @@ struct batadv_icmp_packet_rr { * @dest: originator destination of the unicast packet */ struct batadv_unicast_packet { - uint8_t packet_type; - uint8_t version; - uint8_t ttl; - uint8_t ttvn; /* destination translation table version number */ - uint8_t dest[ETH_ALEN]; + u8 packet_type; + u8 version; + u8 ttl; + u8 ttvn; /* destination translation table version number */ + u8 dest[ETH_ALEN]; /* "4 bytes boundary + 2 bytes" long to make the payload after the * following ethernet header again 4 bytes boundary aligned */ @@ -349,9 +349,9 @@ struct batadv_unicast_packet { */ struct batadv_unicast_4addr_packet { struct batadv_unicast_packet u; - uint8_t src[ETH_ALEN]; - uint8_t subtype; - uint8_t reserved; + u8 src[ETH_ALEN]; + u8 subtype; + u8 reserved; /* "4 bytes boundary + 2 bytes" long to make the payload after the * following ethernet header again 4 bytes boundary aligned */ @@ -370,22 +370,22 @@ struct batadv_unicast_4addr_packet { * @total_size: size of the merged packet */ struct batadv_frag_packet { - uint8_t packet_type; - uint8_t version; /* batman version field */ - uint8_t ttl; + u8 packet_type; + u8 version; /* batman version field */ + u8 ttl; #if defined(__BIG_ENDIAN_BITFIELD) - uint8_t no:4; - uint8_t reserved:4; + u8 no:4; + u8 reserved:4; #elif defined(__LITTLE_ENDIAN_BITFIELD) - uint8_t reserved:4; - uint8_t no:4; + u8 reserved:4; + u8 no:4; #else #error "unknown bitfield endianness" #endif - uint8_t dest[ETH_ALEN]; - uint8_t orig[ETH_ALEN]; - __be16 seqno; - __be16 total_size; + u8 dest[ETH_ALEN]; + u8 orig[ETH_ALEN]; + __be16 seqno; + __be16 total_size; }; /** @@ -398,12 +398,12 @@ struct batadv_frag_packet { * @orig: originator of the broadcast packet */ struct batadv_bcast_packet { - uint8_t packet_type; - uint8_t version; /* batman version field */ - uint8_t ttl; - uint8_t reserved; - __be32 seqno; - uint8_t orig[ETH_ALEN]; + u8 packet_type; + u8 version; /* batman version field */ + u8 ttl; + u8 reserved; + __be32 seqno; + u8 orig[ETH_ALEN]; /* "4 bytes boundary + 2 bytes" long to make the payload after the * following ethernet header again 4 bytes boundary aligned */ @@ -428,21 +428,21 @@ struct batadv_bcast_packet { * @coded_len: length of network coded part of the payload */ struct batadv_coded_packet { - uint8_t packet_type; - uint8_t version; /* batman version field */ - uint8_t ttl; - uint8_t first_ttvn; - /* uint8_t first_dest[ETH_ALEN]; - saved in mac header destination */ - uint8_t first_source[ETH_ALEN]; - uint8_t first_orig_dest[ETH_ALEN]; - __be32 first_crc; - uint8_t second_ttl; - uint8_t second_ttvn; - uint8_t second_dest[ETH_ALEN]; - uint8_t second_source[ETH_ALEN]; - uint8_t second_orig_dest[ETH_ALEN]; - __be32 second_crc; - __be16 coded_len; + u8 packet_type; + u8 version; /* batman version field */ + u8 ttl; + u8 first_ttvn; + /* u8 first_dest[ETH_ALEN]; - saved in mac header destination */ + u8 first_source[ETH_ALEN]; + u8 first_orig_dest[ETH_ALEN]; + __be32 first_crc; + u8 second_ttl; + u8 second_ttvn; + u8 second_dest[ETH_ALEN]; + u8 second_source[ETH_ALEN]; + u8 second_orig_dest[ETH_ALEN]; + __be32 second_crc; + __be16 coded_len; }; #pragma pack() @@ -459,14 +459,14 @@ struct batadv_coded_packet { * @align: 2 bytes to align the header to a 4 byte boundary */ struct batadv_unicast_tvlv_packet { - uint8_t packet_type; - uint8_t version; /* batman version field */ - uint8_t ttl; - uint8_t reserved; - uint8_t dst[ETH_ALEN]; - uint8_t src[ETH_ALEN]; - __be16 tvlv_len; - uint16_t align; + u8 packet_type; + u8 version; /* batman version field */ + u8 ttl; + u8 reserved; + u8 dst[ETH_ALEN]; + u8 src[ETH_ALEN]; + __be16 tvlv_len; + u16 align; }; /** @@ -476,9 +476,9 @@ struct batadv_unicast_tvlv_packet { * @len: tvlv container length */ struct batadv_tvlv_hdr { - uint8_t type; - uint8_t version; - __be16 len; + u8 type; + u8 version; + __be16 len; }; /** @@ -500,9 +500,9 @@ struct batadv_tvlv_gateway_data { * one batadv_tvlv_tt_vlan_data object per announced vlan */ struct batadv_tvlv_tt_data { - uint8_t flags; - uint8_t ttvn; - __be16 num_vlan; + u8 flags; + u8 ttvn; + __be16 num_vlan; }; /** @@ -513,9 +513,9 @@ struct batadv_tvlv_tt_data { * @reserved: unused, useful for alignment purposes */ struct batadv_tvlv_tt_vlan_data { - __be32 crc; - __be16 vid; - uint16_t reserved; + __be32 crc; + __be16 vid; + u16 reserved; }; /** @@ -527,9 +527,9 @@ struct batadv_tvlv_tt_vlan_data { * @vid: VLAN identifier */ struct batadv_tvlv_tt_change { - uint8_t flags; - uint8_t reserved[3]; - uint8_t addr[ETH_ALEN]; + u8 flags; + u8 reserved[3]; + u8 addr[ETH_ALEN]; __be16 vid; }; @@ -539,7 +539,7 @@ struct batadv_tvlv_tt_change { * @vid: VLAN identifier */ struct batadv_tvlv_roam_adv { - uint8_t client[ETH_ALEN]; + u8 client[ETH_ALEN]; __be16 vid; }; @@ -549,8 +549,8 @@ struct batadv_tvlv_roam_adv { * @reserved: reserved field */ struct batadv_tvlv_mcast_data { - uint8_t flags; - uint8_t reserved[3]; + u8 flags; + u8 reserved[3]; }; #endif /* _NET_BATMAN_ADV_PACKET_H_ */ diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index c360c0cd19c2..8d990b070a2e 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -145,7 +145,7 @@ out: * 0 if the packet is to be accepted * 1 if the packet is to be ignored. */ -int batadv_window_protected(struct batadv_priv *bat_priv, int32_t seq_num_diff, +int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, unsigned long *last_reset) { if (seq_num_diff <= -BATADV_TQ_LOCAL_WINDOW_SIZE || @@ -653,19 +653,19 @@ out: static bool batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, struct batadv_unicast_packet *unicast_packet, - uint8_t *dst_addr, unsigned short vid) + u8 *dst_addr, unsigned short vid) { struct batadv_orig_node *orig_node = NULL; struct batadv_hard_iface *primary_if = NULL; bool ret = false; - uint8_t *orig_addr, orig_ttvn; + u8 *orig_addr, orig_ttvn; if (batadv_is_my_client(bat_priv, dst_addr, vid)) { primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; orig_addr = primary_if->net_dev->dev_addr; - orig_ttvn = (uint8_t)atomic_read(&bat_priv->tt.vn); + orig_ttvn = (u8)atomic_read(&bat_priv->tt.vn); } else { orig_node = batadv_transtable_search(bat_priv, NULL, dst_addr, vid); @@ -676,7 +676,7 @@ batadv_reroute_unicast_packet(struct batadv_priv *bat_priv, goto out; orig_addr = orig_node->orig; - orig_ttvn = (uint8_t)atomic_read(&orig_node->last_ttvn); + orig_ttvn = (u8)atomic_read(&orig_node->last_ttvn); } /* update the packet header */ @@ -698,7 +698,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, struct batadv_unicast_packet *unicast_packet; struct batadv_hard_iface *primary_if; struct batadv_orig_node *orig_node; - uint8_t curr_ttvn, old_ttvn; + u8 curr_ttvn, old_ttvn; struct ethhdr *ethhdr; unsigned short vid; int is_old_ttvn; @@ -740,7 +740,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, * value is used later to check if the node which sent (or re-routed * last time) the packet had an updated information or not */ - curr_ttvn = (uint8_t)atomic_read(&bat_priv->tt.vn); + curr_ttvn = (u8)atomic_read(&bat_priv->tt.vn); if (!batadv_is_my_mac(bat_priv, unicast_packet->dest)) { orig_node = batadv_orig_hash_find(bat_priv, unicast_packet->dest); @@ -751,7 +751,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv, if (!orig_node) return 0; - curr_ttvn = (uint8_t)atomic_read(&orig_node->last_ttvn); + curr_ttvn = (u8)atomic_read(&orig_node->last_ttvn); batadv_orig_node_free_ref(orig_node); } @@ -833,7 +833,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); struct batadv_unicast_packet *unicast_packet; struct batadv_unicast_4addr_packet *unicast_4addr_packet; - uint8_t *orig_addr; + u8 *orig_addr; struct batadv_orig_node *orig_node = NULL; int check, hdr_size = sizeof(*unicast_packet); bool is4addr; @@ -904,7 +904,7 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb, struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface); struct batadv_unicast_tvlv_packet *unicast_tvlv_packet; unsigned char *tvlv_buff; - uint16_t tvlv_buff_len; + u16 tvlv_buff_len; int hdr_size = sizeof(*unicast_tvlv_packet); int ret = NET_RX_DROP; @@ -1007,8 +1007,8 @@ int batadv_recv_bcast_packet(struct sk_buff *skb, struct ethhdr *ethhdr; int hdr_size = sizeof(*bcast_packet); int ret = NET_RX_DROP; - int32_t seq_diff; - uint32_t seqno; + s32 seq_diff; + u32 seqno; /* drop packet if it has not necessary minimum size */ if (unlikely(!pskb_may_pull(skb, hdr_size))) diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 6bc29d33abc1..3c185a1fb7a9 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -55,7 +55,7 @@ struct batadv_neigh_node * batadv_find_router(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if); -int batadv_window_protected(struct batadv_priv *bat_priv, int32_t seq_num_diff, +int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff, unsigned long *last_reset); #endif /* _NET_BATMAN_ADV_ROUTING_H_ */ diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 191076ef1eca..f664324805eb 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -54,7 +54,7 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work); */ int batadv_send_skb_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, - const uint8_t *dst_addr) + const u8 *dst_addr) { struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); struct ethhdr *ethhdr; @@ -172,7 +172,7 @@ batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node) { struct batadv_unicast_packet *unicast_packet; - uint8_t ttvn = (uint8_t)atomic_read(&orig_node->last_ttvn); + u8 ttvn = (u8)atomic_read(&orig_node->last_ttvn); if (batadv_skb_head_push(skb, hdr_size) < 0) return false; @@ -343,12 +343,12 @@ out: */ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_type, - int packet_subtype, uint8_t *dst_hint, + int packet_subtype, u8 *dst_hint, unsigned short vid) { struct ethhdr *ethhdr = (struct ethhdr *)skb->data; struct batadv_orig_node *orig_node; - uint8_t *src, *dst; + u8 *src, *dst; src = ethhdr->h_source; dst = ethhdr->h_dest; diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 0536835fe503..33ef6fc1030d 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -33,7 +33,7 @@ struct work_struct; int batadv_send_skb_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, - const uint8_t *dst_addr); + const u8 *dst_addr); int batadv_send_skb_to_orig(struct sk_buff *skb, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if); @@ -56,7 +56,7 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv, unsigned short vid); int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_type, - int packet_subtype, uint8_t *dst_hint, + int packet_subtype, u8 *dst_hint, unsigned short vid); int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid); @@ -75,7 +75,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv, - struct sk_buff *skb, uint8_t *dst_hint, + struct sk_buff *skb, u8 *dst_hint, unsigned short vid) { return batadv_send_skb_via_tt_generic(bat_priv, skb, BATADV_UNICAST, 0, @@ -100,7 +100,7 @@ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv, static inline int batadv_send_skb_via_tt_4addr(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_subtype, - uint8_t *dst_hint, + u8 *dst_hint, unsigned short vid) { return batadv_send_skb_via_tt_generic(bat_priv, skb, diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 49d3d3aa59cb..d5c5ad93a611 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -131,7 +131,7 @@ static int batadv_interface_set_mac_addr(struct net_device *dev, void *p) struct batadv_priv *bat_priv = netdev_priv(dev); struct batadv_softif_vlan *vlan; struct sockaddr *addr = p; - uint8_t old_addr[ETH_ALEN]; + u8 old_addr[ETH_ALEN]; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; @@ -186,19 +186,19 @@ static int batadv_interface_tx(struct sk_buff *skb, struct batadv_hard_iface *primary_if = NULL; struct batadv_bcast_packet *bcast_packet; __be16 ethertype = htons(ETH_P_BATMAN); - static const uint8_t stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00, - 0x00, 0x00}; - static const uint8_t ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00, - 0x00, 0x00}; + static const u8 stp_addr[ETH_ALEN] = {0x01, 0x80, 0xC2, 0x00, + 0x00, 0x00}; + static const u8 ectp_addr[ETH_ALEN] = {0xCF, 0x00, 0x00, 0x00, + 0x00, 0x00}; enum batadv_dhcp_recipient dhcp_rcp = BATADV_DHCP_NO; - uint8_t *dst_hint = NULL, chaddr[ETH_ALEN]; + u8 *dst_hint = NULL, chaddr[ETH_ALEN]; struct vlan_ethhdr *vhdr; unsigned int header_len = 0; int data_len = skb->len, ret; unsigned long brd_delay = 1; bool do_bcast = false, client_added; unsigned short vid; - uint32_t seqno; + u32 seqno; int gw_mode; enum batadv_forw_mode forw_mode; struct batadv_orig_node *mcast_single_orig = NULL; @@ -750,9 +750,9 @@ static void batadv_softif_destroy_finish(struct work_struct *work) static int batadv_softif_init_late(struct net_device *dev) { struct batadv_priv *bat_priv; - uint32_t random_seqno; + u32 random_seqno; int ret; - size_t cnt_len = sizeof(uint64_t) * BATADV_CNT_NUM; + size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM; batadv_set_lockdep_class(dev); @@ -763,7 +763,7 @@ static int batadv_softif_init_late(struct net_device *dev) /* batadv_interface_stats() needs to be available as soon as * register_netdevice() has been called */ - bat_priv->bat_counters = __alloc_percpu(cnt_len, __alignof__(uint64_t)); + bat_priv->bat_counters = __alloc_percpu(cnt_len, __alignof__(u64)); if (!bat_priv->bat_counters) return -ENOMEM; @@ -1117,8 +1117,7 @@ static const struct { #endif }; -static void batadv_get_strings(struct net_device *dev, uint32_t stringset, - uint8_t *data) +static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data) { if (stringset == ETH_SS_STATS) memcpy(data, batadv_counters_strings, @@ -1126,8 +1125,7 @@ static void batadv_get_strings(struct net_device *dev, uint32_t stringset, } static void batadv_get_ethtool_stats(struct net_device *dev, - struct ethtool_stats *stats, - uint64_t *data) + struct ethtool_stats *stats, u64 *data) { struct batadv_priv *bat_priv = netdev_priv(dev); int i; diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index d6a312a82c03..9de3c8804ff4 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -457,7 +457,7 @@ static ssize_t batadv_show_gw_bwidth(struct kobject *kobj, struct attribute *attr, char *buff) { struct batadv_priv *bat_priv = batadv_kobj_to_batpriv(kobj); - uint32_t down, up; + u32 down, up; down = atomic_read(&bat_priv->gw.bandwidth_down); up = atomic_read(&bat_priv->gw.bandwidth_up); @@ -512,7 +512,7 @@ static ssize_t batadv_store_isolation_mark(struct kobject *kobj, { struct net_device *net_dev = batadv_kobj_to_netdev(kobj); struct batadv_priv *bat_priv = netdev_priv(net_dev); - uint32_t mark, mask; + u32 mark, mask; char *mask_ptr; /* parse the mask if it has been specified, otherwise assume the mask is diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index c1eb7b72ab15..f90898881eee 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -56,7 +56,7 @@ static struct lock_class_key batadv_tt_local_hash_lock_class_key; static struct lock_class_key batadv_tt_global_hash_lock_class_key; -static void batadv_send_roam_adv(struct batadv_priv *bat_priv, uint8_t *client, +static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client, unsigned short vid, struct batadv_orig_node *orig_node); static void batadv_tt_purge(struct work_struct *work); @@ -85,10 +85,10 @@ static int batadv_compare_tt(const struct hlist_node *node, const void *data2) * Returns the hash index where the object represented by 'data' should be * stored at. */ -static inline uint32_t batadv_choose_tt(const void *data, uint32_t size) +static inline u32 batadv_choose_tt(const void *data, u32 size) { struct batadv_tt_common_entry *tt; - uint32_t hash = 0; + u32 hash = 0; tt = (struct batadv_tt_common_entry *)data; hash = jhash(&tt->addr, ETH_ALEN, hash); @@ -107,12 +107,12 @@ static inline uint32_t batadv_choose_tt(const void *data, uint32_t size) * found, NULL otherwise. */ static struct batadv_tt_common_entry * -batadv_tt_hash_find(struct batadv_hashtable *hash, const uint8_t *addr, +batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr, unsigned short vid) { struct hlist_head *head; struct batadv_tt_common_entry to_search, *tt, *tt_tmp = NULL; - uint32_t index; + u32 index; if (!hash) return NULL; @@ -152,7 +152,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const uint8_t *addr, * found, NULL otherwise. */ static struct batadv_tt_local_entry * -batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const uint8_t *addr, +batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_common_entry *tt_common_entry; @@ -177,7 +177,7 @@ batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const uint8_t *addr, * is found, NULL otherwise. */ static struct batadv_tt_global_entry * -batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const uint8_t *addr, +batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_common_entry *tt_common_entry; @@ -223,7 +223,7 @@ batadv_tt_global_entry_free_ref(struct batadv_tt_global_entry *tt_global_entry) * (excluding ourself). */ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, - const uint8_t *addr, unsigned short vid) + const u8 *addr, unsigned short vid) { struct batadv_tt_global_entry *tt_global_entry; int count; @@ -364,11 +364,11 @@ batadv_tt_orig_list_entry_free_ref(struct batadv_tt_orig_list_entry *orig_entry) */ static void batadv_tt_local_event(struct batadv_priv *bat_priv, struct batadv_tt_local_entry *tt_local_entry, - uint8_t event_flags) + u8 event_flags) { struct batadv_tt_change_node *tt_change_node, *entry, *safe; struct batadv_tt_common_entry *common = &tt_local_entry->common; - uint8_t flags = common->flags | event_flags; + u8 flags = common->flags | event_flags; bool event_removed = false; bool del_op_requested, del_op_entry; @@ -448,7 +448,7 @@ static int batadv_tt_len(int changes_num) * * Returns the number of entries. */ -static uint16_t batadv_tt_entries(uint16_t tt_len) +static u16 batadv_tt_entries(u16 tt_len) { return tt_len / batadv_tt_len(1); } @@ -462,7 +462,7 @@ static uint16_t batadv_tt_entries(uint16_t tt_len) */ static int batadv_tt_local_table_transmit_size(struct batadv_priv *bat_priv) { - uint16_t num_vlan = 0, tt_local_entries = 0; + u16 num_vlan = 0, tt_local_entries = 0; struct batadv_softif_vlan *vlan; int hdr_size; @@ -525,8 +525,8 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv, * * Returns true if the client was successfully added, false otherwise. */ -bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, - unsigned short vid, int ifindex, uint32_t mark) +bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, + unsigned short vid, int ifindex, u32 mark) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); struct batadv_tt_local_entry *tt_local; @@ -537,8 +537,8 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, struct batadv_tt_orig_list_entry *orig_entry; int hash_added, table_size, packet_size_max; bool ret = false, roamed_back = false; - uint8_t remote_flags; - uint32_t match_mark; + u8 remote_flags; + u32 match_mark; if (ifindex != BATADV_NULL_IFINDEX) in_dev = dev_get_by_index(&init_net, ifindex); @@ -605,7 +605,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, batadv_dbg(BATADV_DBG_TT, bat_priv, "Creating new local tt entry: %pM (vid: %d, ttvn: %d)\n", addr, BATADV_PRINT_VID(vid), - (uint8_t)atomic_read(&bat_priv->tt.vn)); + (u8)atomic_read(&bat_priv->tt.vn)); ether_addr_copy(tt_local->common.addr, addr); /* The local entry has to be marked as NEW to avoid to send it in @@ -724,16 +724,16 @@ out: * * Return the size of the allocated buffer or 0 in case of failure. */ -static uint16_t +static u16 batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, struct batadv_tvlv_tt_data **tt_data, struct batadv_tvlv_tt_change **tt_change, - int32_t *tt_len) + s32 *tt_len) { - uint16_t num_vlan = 0, num_entries = 0, change_offset, tvlv_len; + u16 num_vlan = 0, num_entries = 0, change_offset, tvlv_len; struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_orig_node_vlan *vlan; - uint8_t *tt_change_ptr; + u8 *tt_change_ptr; rcu_read_lock(); list_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) { @@ -769,7 +769,7 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, tt_vlan++; } - tt_change_ptr = (uint8_t *)*tt_data + change_offset; + tt_change_ptr = (u8 *)*tt_data + change_offset; *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr; out: @@ -795,16 +795,16 @@ out: * * Return the size of the allocated buffer or 0 in case of failure. */ -static uint16_t +static u16 batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data **tt_data, struct batadv_tvlv_tt_change **tt_change, - int32_t *tt_len) + s32 *tt_len) { struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_softif_vlan *vlan; - uint16_t num_vlan = 0, num_entries = 0, tvlv_len; - uint8_t *tt_change_ptr; + u16 num_vlan = 0, num_entries = 0, tvlv_len; + u8 *tt_change_ptr; int change_offset; rcu_read_lock(); @@ -841,7 +841,7 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, tt_vlan++; } - tt_change_ptr = (uint8_t *)*tt_data + change_offset; + tt_change_ptr = (u8 *)*tt_data + change_offset; *tt_change = (struct batadv_tvlv_tt_change *)tt_change_ptr; out: @@ -861,7 +861,7 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) struct batadv_tvlv_tt_change *tt_change; int tt_diff_len, tt_change_len = 0; int tt_diff_entries_num = 0, tt_diff_entries_count = 0; - uint16_t tvlv_len; + u16 tvlv_len; tt_diff_entries_num = atomic_read(&bat_priv->tt.local_changes); tt_diff_len = batadv_tt_len(tt_diff_entries_num); @@ -935,12 +935,12 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) struct batadv_softif_vlan *vlan; struct hlist_head *head; unsigned short vid; - uint32_t i; + u32 i; int last_seen_secs; int last_seen_msecs; unsigned long last_seen_jiffies; bool no_purge; - uint16_t np_flag = BATADV_TT_CLIENT_NOPURGE; + u16 np_flag = BATADV_TT_CLIENT_NOPURGE; primary_if = batadv_seq_print_text_primary_if_get(seq); if (!primary_if) @@ -948,7 +948,7 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) seq_printf(seq, "Locally retrieved addresses (from %s) announced via TT (TTVN: %u):\n", - net_dev->name, (uint8_t)atomic_read(&bat_priv->tt.vn)); + net_dev->name, (u8)atomic_read(&bat_priv->tt.vn)); seq_printf(seq, " %-13s %s %-8s %-9s (%-10s)\n", "Client", "VID", "Flags", "Last seen", "CRC"); @@ -1008,7 +1008,7 @@ out: static void batadv_tt_local_set_pending(struct batadv_priv *bat_priv, struct batadv_tt_local_entry *tt_local_entry, - uint16_t flags, const char *message) + u16 flags, const char *message) { batadv_tt_local_event(bat_priv, tt_local_entry, flags); @@ -1034,12 +1034,12 @@ batadv_tt_local_set_pending(struct batadv_priv *bat_priv, * * Returns the flags assigned to the local entry before being deleted */ -uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv, - const uint8_t *addr, unsigned short vid, - const char *message, bool roaming) +u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr, + unsigned short vid, const char *message, + bool roaming) { struct batadv_tt_local_entry *tt_local_entry; - uint16_t flags, curr_flags = BATADV_NO_FLAGS; + u16 flags, curr_flags = BATADV_NO_FLAGS; struct batadv_softif_vlan *vlan; void *tt_entry_exists; @@ -1142,7 +1142,7 @@ static void batadv_tt_local_purge(struct batadv_priv *bat_priv, struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ - uint32_t i; + u32 i; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; @@ -1163,7 +1163,7 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv) struct batadv_softif_vlan *vlan; struct hlist_node *node_tmp; struct hlist_head *head; - uint32_t i; + u32 i; if (!bat_priv->tt.local_hash) return; @@ -1338,15 +1338,14 @@ out: static bool batadv_tt_global_add(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *tt_addr, - unsigned short vid, uint16_t flags, - uint8_t ttvn) + unsigned short vid, u16 flags, u8 ttvn) { struct batadv_tt_global_entry *tt_global_entry; struct batadv_tt_local_entry *tt_local_entry; bool ret = false; int hash_added; struct batadv_tt_common_entry *common; - uint16_t local_flags; + u16 local_flags; /* ignore global entries from backbone nodes */ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid)) @@ -1543,8 +1542,8 @@ batadv_tt_global_print_entry(struct batadv_priv *bat_priv, struct batadv_tt_common_entry *tt_common_entry; struct batadv_orig_node_vlan *vlan; struct hlist_head *head; - uint8_t last_ttvn; - uint16_t flags; + u8 last_ttvn; + u16 flags; tt_common_entry = &tt_global_entry->common; flags = tt_common_entry->flags; @@ -1618,7 +1617,7 @@ int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset) struct batadv_tt_global_entry *tt_global; struct batadv_hard_iface *primary_if; struct hlist_head *head; - uint32_t i; + u32 i; primary_if = batadv_seq_print_text_primary_if_get(seq); if (!primary_if) @@ -1838,12 +1837,12 @@ out: */ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, - int32_t match_vid, + s32 match_vid, const char *message) { struct batadv_tt_global_entry *tt_global; struct batadv_tt_common_entry *tt_common_entry; - uint32_t i; + u32 i; struct batadv_hashtable *hash = bat_priv->tt.global_hash; struct hlist_node *safe; struct hlist_head *head; @@ -1914,7 +1913,7 @@ static void batadv_tt_global_purge(struct batadv_priv *bat_priv) struct hlist_head *head; struct hlist_node *node_tmp; spinlock_t *list_lock; /* protects write access to the hash lists */ - uint32_t i; + u32 i; char *msg = NULL; struct batadv_tt_common_entry *tt_common; struct batadv_tt_global_entry *tt_global; @@ -1955,7 +1954,7 @@ static void batadv_tt_global_table_free(struct batadv_priv *bat_priv) struct batadv_tt_global_entry *tt_global; struct hlist_node *node_tmp; struct hlist_head *head; - uint32_t i; + u32 i; if (!bat_priv->tt.global_hash) return; @@ -2016,8 +2015,8 @@ _batadv_is_ap_isolated(struct batadv_tt_local_entry *tt_local_entry, * If the two clients are AP isolated the function returns NULL. */ struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, - const uint8_t *src, - const uint8_t *addr, + const u8 *src, + const u8 *addr, unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry = NULL; @@ -2085,16 +2084,16 @@ out: * * Returns the checksum of the global table of a given originator. */ -static uint32_t batadv_tt_global_crc(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - unsigned short vid) +static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, + struct batadv_orig_node *orig_node, + unsigned short vid) { struct batadv_hashtable *hash = bat_priv->tt.global_hash; struct batadv_tt_common_entry *tt_common; struct batadv_tt_global_entry *tt_global; struct hlist_head *head; - uint32_t i, crc_tmp, crc = 0; - uint8_t flags; + u32 i, crc_tmp, crc = 0; + u8 flags; __be16 tmp_vid; for (i = 0; i < hash->size; i++) { @@ -2162,14 +2161,14 @@ static uint32_t batadv_tt_global_crc(struct batadv_priv *bat_priv, * * Returns the checksum of the local table */ -static uint32_t batadv_tt_local_crc(struct batadv_priv *bat_priv, - unsigned short vid) +static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv, + unsigned short vid) { struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common; struct hlist_head *head; - uint32_t i, crc_tmp, crc = 0; - uint8_t flags; + u32 i, crc_tmp, crc = 0; + u8 flags; __be16 tmp_vid; for (i = 0; i < hash->size; i++) { @@ -2226,7 +2225,7 @@ static void batadv_tt_req_list_free(struct batadv_priv *bat_priv) static void batadv_tt_save_orig_buffer(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const void *tt_buff, - uint16_t tt_buff_len) + u16 tt_buff_len) { /* Replace the old buffer only if I received something in the * last OGM (the OGM could carry no changes) @@ -2335,15 +2334,15 @@ static int batadv_tt_global_valid(const void *entry_ptr, */ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, struct batadv_hashtable *hash, - void *tvlv_buff, uint16_t tt_len, + void *tvlv_buff, u16 tt_len, int (*valid_cb)(const void *, const void *), void *cb_data) { struct batadv_tt_common_entry *tt_common_entry; struct batadv_tvlv_tt_change *tt_change; struct hlist_head *head; - uint16_t tt_tot, tt_num_entries = 0; - uint32_t i; + u16 tt_tot, tt_num_entries = 0; + u32 i; tt_tot = batadv_tt_entries(tt_len); tt_change = (struct batadv_tvlv_tt_change *)tvlv_buff; @@ -2385,11 +2384,11 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv, */ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node, struct batadv_tvlv_tt_vlan_data *tt_vlan, - uint16_t num_vlan) + u16 num_vlan) { struct batadv_tvlv_tt_vlan_data *tt_vlan_tmp; struct batadv_orig_node_vlan *vlan; - uint32_t crc; + u32 crc; int i; /* check if each received CRC matches the locally stored one */ @@ -2444,7 +2443,7 @@ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_orig_node_vlan *vlan; - uint32_t crc; + u32 crc; /* recompute the global CRC for each VLAN */ rcu_read_lock(); @@ -2474,9 +2473,9 @@ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv, */ static int batadv_send_tt_request(struct batadv_priv *bat_priv, struct batadv_orig_node *dst_orig_node, - uint8_t ttvn, + u8 ttvn, struct batadv_tvlv_tt_vlan_data *tt_vlan, - uint16_t num_vlan, bool full_table) + u16 num_vlan, bool full_table) { struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; struct batadv_tt_req_node *tt_req_node = NULL; @@ -2555,7 +2554,7 @@ out: */ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, - uint8_t *req_src, uint8_t *req_dst) + u8 *req_src, u8 *req_dst) { struct batadv_orig_node *req_dst_orig_node; struct batadv_orig_node *res_dst_orig_node = NULL; @@ -2563,9 +2562,9 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; struct batadv_tvlv_tt_vlan_data *tt_vlan; bool ret = false, full_table; - uint8_t orig_ttvn, req_ttvn; - uint16_t tvlv_len; - int32_t tt_len; + u8 orig_ttvn, req_ttvn; + u16 tvlv_len; + s32 tt_len; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_REQUEST from %pM for ttvn: %u (%pM) [%c]\n", @@ -2581,7 +2580,7 @@ static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv, if (!res_dst_orig_node) goto out; - orig_ttvn = (uint8_t)atomic_read(&req_dst_orig_node->last_ttvn); + orig_ttvn = (u8)atomic_read(&req_dst_orig_node->last_ttvn); req_ttvn = tt_data->ttvn; tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(tt_data + 1); @@ -2687,16 +2686,16 @@ out: */ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, - uint8_t *req_src) + u8 *req_src) { struct batadv_tvlv_tt_data *tvlv_tt_data = NULL; struct batadv_hard_iface *primary_if = NULL; struct batadv_tvlv_tt_change *tt_change; struct batadv_orig_node *orig_node; - uint8_t my_ttvn, req_ttvn; - uint16_t tvlv_len; + u8 my_ttvn, req_ttvn; + u16 tvlv_len; bool full_table; - int32_t tt_len; + s32 tt_len; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_REQUEST from %pM for ttvn: %u (me) [%c]\n", @@ -2705,7 +2704,7 @@ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv, spin_lock_bh(&bat_priv->tt.commit_lock); - my_ttvn = (uint8_t)atomic_read(&bat_priv->tt.vn); + my_ttvn = (u8)atomic_read(&bat_priv->tt.vn); req_ttvn = tt_data->ttvn; orig_node = batadv_orig_hash_find(bat_priv, req_src); @@ -2744,7 +2743,7 @@ static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv, bat_priv->tt.last_changeset_len); spin_unlock_bh(&bat_priv->tt.last_changeset_lock); } else { - req_ttvn = (uint8_t)atomic_read(&bat_priv->tt.vn); + req_ttvn = (u8)atomic_read(&bat_priv->tt.vn); /* allocate the tvlv, put the tt_data and all the tt_vlan_data * in the initial part @@ -2805,7 +2804,7 @@ out: */ static bool batadv_send_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, - uint8_t *req_src, uint8_t *req_dst) + u8 *req_src, u8 *req_dst) { if (batadv_is_my_mac(bat_priv, req_dst)) return batadv_send_my_tt_response(bat_priv, tt_data, req_src); @@ -2816,7 +2815,7 @@ static bool batadv_send_tt_response(struct batadv_priv *bat_priv, static void _batadv_tt_update_changes(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, struct batadv_tvlv_tt_change *tt_change, - uint16_t tt_num_changes, uint8_t ttvn) + u16 tt_num_changes, u8 ttvn) { int i; int roams; @@ -2848,8 +2847,8 @@ static void _batadv_tt_update_changes(struct batadv_priv *bat_priv, static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_change *tt_change, - uint8_t ttvn, uint8_t *resp_src, - uint16_t num_entries) + u8 ttvn, u8 *resp_src, + u16 num_entries) { struct batadv_orig_node *orig_node; @@ -2879,7 +2878,7 @@ out: static void batadv_tt_update_changes(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, - uint16_t tt_num_changes, uint8_t ttvn, + u16 tt_num_changes, u8 ttvn, struct batadv_tvlv_tt_change *tt_change) { _batadv_tt_update_changes(bat_priv, orig_node, tt_change, @@ -2898,7 +2897,7 @@ static void batadv_tt_update_changes(struct batadv_priv *bat_priv, * * Returns true if the client is served by this node, false otherwise. */ -bool batadv_is_my_client(struct batadv_priv *bat_priv, const uint8_t *addr, +bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry; @@ -2929,13 +2928,13 @@ out: */ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, - uint8_t *resp_src, uint16_t num_entries) + u8 *resp_src, u16 num_entries) { struct batadv_tt_req_node *node, *safe; struct batadv_orig_node *orig_node = NULL; struct batadv_tvlv_tt_change *tt_change; - uint8_t *tvlv_ptr = (uint8_t *)tt_data; - uint16_t change_offset; + u8 *tvlv_ptr = (u8 *)tt_data; + u16 change_offset; batadv_dbg(BATADV_DBG_TT, bat_priv, "Received TT_RESPONSE from %pM for ttvn %d t_size: %d [%c]\n", @@ -3018,8 +3017,7 @@ static void batadv_tt_roam_purge(struct batadv_priv *bat_priv) * * returns true if the ROAMING_ADV can be sent, false otherwise */ -static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, - uint8_t *client) +static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, u8 *client) { struct batadv_tt_roam_node *tt_roam_node; bool ret = false; @@ -3074,7 +3072,7 @@ unlock: * for this particular roamed client has to be forwarded to the sender of the * roaming message. */ -static void batadv_send_roam_adv(struct batadv_priv *bat_priv, uint8_t *client, +static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client, unsigned short vid, struct batadv_orig_node *orig_node) { @@ -3152,14 +3150,14 @@ void batadv_tt_free(struct batadv_priv *bat_priv) * @enable: whether to set or unset the flag * @count: whether to increase the TT size by the number of changed entries */ -static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, - uint16_t flags, bool enable, bool count) +static void batadv_tt_local_set_flags(struct batadv_priv *bat_priv, u16 flags, + bool enable, bool count) { struct batadv_hashtable *hash = bat_priv->tt.local_hash; struct batadv_tt_common_entry *tt_common_entry; - uint16_t changed_num = 0; + u16 changed_num = 0; struct hlist_head *head; - uint32_t i; + u32 i; if (!hash) return; @@ -3201,7 +3199,7 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) struct hlist_node *node_tmp; struct hlist_head *head; spinlock_t *list_lock; /* protects write access to the hash lists */ - uint32_t i; + u32 i; if (!hash) return; @@ -3267,7 +3265,7 @@ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv) atomic_inc(&bat_priv->tt.vn); batadv_dbg(BATADV_DBG_TT, bat_priv, "Local changes committed, updating to ttvn %u\n", - (uint8_t)atomic_read(&bat_priv->tt.vn)); + (u8)atomic_read(&bat_priv->tt.vn)); /* reset the sending counter */ atomic_set(&bat_priv->tt.ogm_append_cnt, BATADV_TT_OGM_APPEND_MAX); @@ -3286,8 +3284,8 @@ void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv) spin_unlock_bh(&bat_priv->tt.commit_lock); } -bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, uint8_t *src, - uint8_t *dst, unsigned short vid) +bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, + unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry = NULL; struct batadv_tt_global_entry *tt_global_entry = NULL; @@ -3335,11 +3333,11 @@ out: */ static void batadv_tt_update_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, - const void *tt_buff, uint16_t tt_num_vlan, + const void *tt_buff, u16 tt_num_vlan, struct batadv_tvlv_tt_change *tt_change, - uint16_t tt_num_changes, uint8_t ttvn) + u16 tt_num_changes, u8 ttvn) { - uint8_t orig_ttvn = (uint8_t)atomic_read(&orig_node->last_ttvn); + u8 orig_ttvn = (u8)atomic_read(&orig_node->last_ttvn); struct batadv_tvlv_tt_vlan_data *tt_vlan; bool full_table = true; bool has_tt_init; @@ -3418,7 +3416,7 @@ request_table: * deleted later by a DEL or because of timeout */ bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv, - uint8_t *addr, unsigned short vid) + u8 *addr, unsigned short vid) { struct batadv_tt_global_entry *tt_global_entry; bool ret = false; @@ -3444,7 +3442,7 @@ out: * to keep the latter consistent with the node TTVN */ bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv, - uint8_t *addr, unsigned short vid) + u8 *addr, unsigned short vid) { struct batadv_tt_local_entry *tt_local_entry; bool ret = false; @@ -3530,13 +3528,13 @@ void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface) */ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t flags, void *tvlv_value, - uint16_t tvlv_value_len) + u8 flags, void *tvlv_value, + u16 tvlv_value_len) { struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_tvlv_tt_change *tt_change; struct batadv_tvlv_tt_data *tt_data; - uint16_t num_entries, num_vlan; + u16 num_entries, num_vlan; if (tvlv_value_len < sizeof(*tt_data)) return; @@ -3572,12 +3570,12 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, * otherwise. */ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, - uint8_t *src, uint8_t *dst, + u8 *src, u8 *dst, void *tvlv_value, - uint16_t tvlv_value_len) + u16 tvlv_value_len) { struct batadv_tvlv_tt_data *tt_data; - uint16_t tt_vlan_len, tt_num_entries; + u16 tt_vlan_len, tt_num_entries; char tt_flag; bool ret; @@ -3653,9 +3651,9 @@ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, * otherwise. */ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv, - uint8_t *src, uint8_t *dst, + u8 *src, u8 *dst, void *tvlv_value, - uint16_t tvlv_value_len) + u16 tvlv_value_len) { struct batadv_tvlv_roam_adv *roaming_adv; struct batadv_orig_node *orig_node = NULL; @@ -3737,7 +3735,7 @@ int batadv_tt_init(struct batadv_priv *bat_priv) * otherwise */ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, - const uint8_t *addr, unsigned short vid) + const u8 *addr, unsigned short vid) { struct batadv_tt_global_entry *tt; bool ret; diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 6acc25d3a925..9b82d5dd56e0 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -28,38 +28,37 @@ struct net_device; struct seq_file; int batadv_tt_init(struct batadv_priv *bat_priv); -bool batadv_tt_local_add(struct net_device *soft_iface, const uint8_t *addr, - unsigned short vid, int ifindex, uint32_t mark); -uint16_t batadv_tt_local_remove(struct batadv_priv *bat_priv, - const uint8_t *addr, unsigned short vid, - const char *message, bool roaming); +bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, + unsigned short vid, int ifindex, u32 mark); +u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, + const u8 *addr, unsigned short vid, + const char *message, bool roaming); int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset); int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset); void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, - int32_t match_vid, const char *message); + s32 match_vid, const char *message); int batadv_tt_global_hash_count(struct batadv_priv *bat_priv, - const uint8_t *addr, unsigned short vid); + const u8 *addr, unsigned short vid); struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv, - const uint8_t *src, - const uint8_t *addr, + const u8 *src, const u8 *addr, unsigned short vid); void batadv_tt_free(struct batadv_priv *bat_priv); -bool batadv_is_my_client(struct batadv_priv *bat_priv, const uint8_t *addr, +bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid); -bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, uint8_t *src, - uint8_t *dst, unsigned short vid); +bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst, + unsigned short vid); void batadv_tt_local_commit_changes(struct batadv_priv *bat_priv); bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv, - uint8_t *addr, unsigned short vid); + u8 *addr, unsigned short vid); bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv, - uint8_t *addr, unsigned short vid); + u8 *addr, unsigned short vid); void batadv_tt_local_resize_to_mtu(struct net_device *soft_iface); bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node, const unsigned char *addr, unsigned short vid); bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv, - const uint8_t *addr, unsigned short vid); + const u8 *addr, unsigned short vid); #endif /* _NET_BATMAN_ADV_TRANSLATION_TABLE_H_ */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 55610a805b53..f2ebe33d9b59 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -44,7 +44,7 @@ struct seq_file; * * *Please be careful: batadv_dat_addr_t must be UNSIGNED* */ -#define batadv_dat_addr_t uint16_t +#define batadv_dat_addr_t u16 #endif /* CONFIG_BATMAN_ADV_DAT */ @@ -103,10 +103,10 @@ struct batadv_hard_iface_bat_iv { */ struct batadv_hard_iface { struct list_head list; - int16_t if_num; + s16 if_num; char if_status; struct net_device *net_dev; - uint8_t num_bcasts; + u8 num_bcasts; struct kobject *hardif_obj; atomic_t refcount; struct packet_type batman_adv_ptype; @@ -132,8 +132,8 @@ struct batadv_orig_ifinfo { struct hlist_node list; struct batadv_hard_iface *if_outgoing; struct batadv_neigh_node __rcu *router; /* rcu protected pointer */ - uint32_t last_real_seqno; - uint8_t last_ttl; + u32 last_real_seqno; + u8 last_ttl; unsigned long batman_seqno_reset; atomic_t refcount; struct rcu_head rcu; @@ -152,9 +152,9 @@ struct batadv_frag_table_entry { struct hlist_head head; spinlock_t lock; /* protects head */ unsigned long timestamp; - uint16_t seqno; - uint16_t size; - uint16_t total_size; + u16 seqno; + u16 size; + u16 total_size; }; /** @@ -166,7 +166,7 @@ struct batadv_frag_table_entry { struct batadv_frag_list_entry { struct hlist_node list; struct sk_buff *skb; - uint8_t no; + u8 no; }; /** @@ -175,7 +175,7 @@ struct batadv_frag_list_entry { * @num_entries: number of TT entries for this VLAN */ struct batadv_vlan_tt { - uint32_t crc; + u32 crc; atomic_t num_entries; }; @@ -206,7 +206,7 @@ struct batadv_orig_node_vlan { */ struct batadv_orig_bat_iv { unsigned long *bcast_own; - uint8_t *bcast_own_sum; + u8 *bcast_own_sum; /* ogm_cnt_lock protects: bcast_own, bcast_own_sum, * neigh_node->bat_iv.real_bits & neigh_node->bat_iv.real_packet_count */ @@ -260,7 +260,7 @@ struct batadv_orig_bat_iv { * @bat_iv: B.A.T.M.A.N. IV private structure */ struct batadv_orig_node { - uint8_t orig[ETH_ALEN]; + u8 orig[ETH_ALEN]; struct hlist_head ifinfo_list; struct batadv_orig_ifinfo *last_bonding_candidate; #ifdef CONFIG_BATMAN_ADV_DAT @@ -271,7 +271,7 @@ struct batadv_orig_node { #ifdef CONFIG_BATMAN_ADV_MCAST /* synchronizes mcast tvlv specific orig changes */ spinlock_t mcast_handler_lock; - uint8_t mcast_flags; + u8 mcast_flags; struct hlist_node mcast_want_all_unsnoopables_node; struct hlist_node mcast_want_all_ipv4_node; struct hlist_node mcast_want_all_ipv6_node; @@ -280,12 +280,12 @@ struct batadv_orig_node { unsigned long capa_initialized; atomic_t last_ttvn; unsigned char *tt_buff; - int16_t tt_buff_len; + s16 tt_buff_len; spinlock_t tt_buff_lock; /* protects tt_buff & tt_buff_len */ /* prevents from changing the table while reading it */ spinlock_t tt_lock; DECLARE_BITMAP(bcast_bits, BATADV_TQ_LOCAL_WINDOW_SIZE); - uint32_t last_bcast_seqno; + u32 last_bcast_seqno; struct hlist_head neigh_list; /* neigh_list_lock protects: neigh_list and router */ spinlock_t neigh_list_lock; @@ -335,8 +335,8 @@ enum batadv_orig_capabilities { struct batadv_gw_node { struct hlist_node list; struct batadv_orig_node *orig_node; - uint32_t bandwidth_down; - uint32_t bandwidth_up; + u32 bandwidth_down; + u32 bandwidth_up; unsigned long deleted; atomic_t refcount; struct rcu_head rcu; @@ -358,7 +358,7 @@ struct batadv_gw_node { struct batadv_neigh_node { struct hlist_node list; struct batadv_orig_node *orig_node; - uint8_t addr[ETH_ALEN]; + u8 addr[ETH_ALEN]; struct hlist_head ifinfo_list; spinlock_t ifinfo_lock; /* protects ifinfo_list and its members */ struct batadv_hard_iface *if_incoming; @@ -378,11 +378,11 @@ struct batadv_neigh_node { * @real_packet_count: counted result of real_bits */ struct batadv_neigh_ifinfo_bat_iv { - uint8_t tq_recv[BATADV_TQ_GLOBAL_WINDOW_SIZE]; - uint8_t tq_index; - uint8_t tq_avg; + u8 tq_recv[BATADV_TQ_GLOBAL_WINDOW_SIZE]; + u8 tq_index; + u8 tq_avg; DECLARE_BITMAP(real_bits, BATADV_TQ_LOCAL_WINDOW_SIZE); - uint8_t real_packet_count; + u8 real_packet_count; }; /** @@ -398,7 +398,7 @@ struct batadv_neigh_ifinfo { struct hlist_node list; struct batadv_hard_iface *if_outgoing; struct batadv_neigh_ifinfo_bat_iv bat_iv; - uint8_t last_ttl; + u8 last_ttl; atomic_t refcount; struct rcu_head rcu; }; @@ -411,7 +411,7 @@ struct batadv_neigh_ifinfo { */ #ifdef CONFIG_BATMAN_ADV_BLA struct batadv_bcast_duplist_entry { - uint8_t orig[ETH_ALEN]; + u8 orig[ETH_ALEN]; __be32 crc; unsigned long entrytime; }; @@ -543,7 +543,7 @@ struct batadv_priv_tt { spinlock_t req_list_lock; /* protects req_list */ spinlock_t roam_list_lock; /* protects roam_list */ unsigned char *last_changeset; - int16_t last_changeset_len; + s16 last_changeset_len; /* protects last_changeset & last_changeset_len */ spinlock_t last_changeset_lock; /* prevents from executing a commit while reading the table */ @@ -663,7 +663,7 @@ struct batadv_priv_mcast { struct hlist_head want_all_unsnoopables_list; struct hlist_head want_all_ipv4_list; struct hlist_head want_all_ipv6_list; - uint8_t flags; + u8 flags; bool enabled; atomic_t num_disabled; atomic_t num_want_all_unsnoopables; @@ -781,7 +781,7 @@ struct batadv_priv { atomic_t mesh_state; struct net_device *soft_iface; struct net_device_stats stats; - uint64_t __percpu *bat_counters; /* Per cpu counters */ + u64 __percpu *bat_counters; /* Per cpu counters */ atomic_t aggregated_ogms; atomic_t bonding; atomic_t fragmentation; @@ -803,8 +803,8 @@ struct batadv_priv { #ifdef CONFIG_BATMAN_ADV_DEBUG atomic_t log_level; #endif - uint32_t isolation_mark; - uint32_t isolation_mark_mask; + u32 isolation_mark; + u32 isolation_mark_mask; atomic_t bcast_seqno; atomic_t bcast_queue_left; atomic_t batman_queue_left; @@ -870,7 +870,7 @@ struct batadv_socket_client { struct batadv_socket_packet { struct list_head list; size_t icmp_len; - uint8_t icmp_packet[BATADV_ICMP_MAX_PACKET_SIZE]; + u8 icmp_packet[BATADV_ICMP_MAX_PACKET_SIZE]; }; /** @@ -891,14 +891,14 @@ struct batadv_socket_packet { */ #ifdef CONFIG_BATMAN_ADV_BLA struct batadv_bla_backbone_gw { - uint8_t orig[ETH_ALEN]; + u8 orig[ETH_ALEN]; unsigned short vid; struct hlist_node hash_entry; struct batadv_priv *bat_priv; unsigned long lasttime; atomic_t wait_periods; atomic_t request_sent; - uint16_t crc; + u16 crc; atomic_t refcount; struct rcu_head rcu; }; @@ -914,7 +914,7 @@ struct batadv_bla_backbone_gw { * @rcu: struct used for freeing in an RCU-safe manner */ struct batadv_bla_claim { - uint8_t addr[ETH_ALEN]; + u8 addr[ETH_ALEN]; unsigned short vid; struct batadv_bla_backbone_gw *backbone_gw; unsigned long lasttime; @@ -936,10 +936,10 @@ struct batadv_bla_claim { * @rcu: struct used for freeing in an RCU-safe manner */ struct batadv_tt_common_entry { - uint8_t addr[ETH_ALEN]; + u8 addr[ETH_ALEN]; unsigned short vid; struct hlist_node hash_entry; - uint16_t flags; + u16 flags; unsigned long added_at; atomic_t refcount; struct rcu_head rcu; @@ -981,7 +981,7 @@ struct batadv_tt_global_entry { */ struct batadv_tt_orig_list_entry { struct batadv_orig_node *orig_node; - uint8_t ttvn; + u8 ttvn; struct hlist_node list; atomic_t refcount; struct rcu_head rcu; @@ -1004,7 +1004,7 @@ struct batadv_tt_change_node { * @list: list node for batadv_priv_tt::req_list */ struct batadv_tt_req_node { - uint8_t addr[ETH_ALEN]; + u8 addr[ETH_ALEN]; unsigned long issued_at; struct list_head list; }; @@ -1018,7 +1018,7 @@ struct batadv_tt_req_node { * @list: list node for batadv_priv_tt::roam_list */ struct batadv_tt_roam_node { - uint8_t addr[ETH_ALEN]; + u8 addr[ETH_ALEN]; atomic_t counter; unsigned long first_time; struct list_head list; @@ -1035,7 +1035,7 @@ struct batadv_tt_roam_node { */ struct batadv_nc_node { struct list_head list; - uint8_t addr[ETH_ALEN]; + u8 addr[ETH_ALEN]; atomic_t refcount; struct rcu_head rcu; struct batadv_orig_node *orig_node; @@ -1059,8 +1059,8 @@ struct batadv_nc_path { atomic_t refcount; struct list_head packet_list; spinlock_t packet_list_lock; /* Protects packet_list */ - uint8_t next_hop[ETH_ALEN]; - uint8_t prev_hop[ETH_ALEN]; + u8 next_hop[ETH_ALEN]; + u8 prev_hop[ETH_ALEN]; unsigned long last_valid; }; @@ -1112,11 +1112,11 @@ struct batadv_skb_cb { struct batadv_forw_packet { struct hlist_node list; unsigned long send_time; - uint8_t own; + u8 own; struct sk_buff *skb; - uint16_t packet_len; - uint32_t direct_link_flags; - uint8_t num_packets; + u16 packet_len; + u32 direct_link_flags; + u8 num_packets; struct delayed_work delayed_work; struct batadv_hard_iface *if_incoming; struct batadv_hard_iface *if_outgoing; @@ -1191,7 +1191,7 @@ struct batadv_algo_ops { */ struct batadv_dat_entry { __be32 ip; - uint8_t mac_addr[ETH_ALEN]; + u8 mac_addr[ETH_ALEN]; unsigned short vid; unsigned long last_update; struct hlist_node hash_entry; @@ -1253,14 +1253,13 @@ struct batadv_tvlv_handler { struct hlist_node list; void (*ogm_handler)(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, - uint8_t flags, - void *tvlv_value, uint16_t tvlv_value_len); + u8 flags, void *tvlv_value, u16 tvlv_value_len); int (*unicast_handler)(struct batadv_priv *bat_priv, - uint8_t *src, uint8_t *dst, - void *tvlv_value, uint16_t tvlv_value_len); - uint8_t type; - uint8_t version; - uint8_t flags; + u8 *src, u8 *dst, + void *tvlv_value, u16 tvlv_value_len); + u8 type; + u8 version; + u8 flags; atomic_t refcount; struct rcu_head rcu; }; -- cgit v1.2.3 From 34473822da06510d708087bb76d9ad757ac09567 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 31 May 2015 10:10:20 +0200 Subject: batman-adv: Fix kerneldoc over 80 column lines Kerneldoc required single line documentation in the past (before 2009). Therefore, the 80 columns limit per line check of checkpatch was disabled for kerneldoc. But kerneldoc is not excluded anymore from it and checkpatch now enabled the check again. Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/icmp_socket.c | 4 ++-- net/batman-adv/network-coding.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index 5a70b7ed2558..bcabb5e3f4d3 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -337,8 +337,8 @@ err: } /** - * batadv_socket_receive_packet - schedule an icmp packet to be sent to userspace - * on an icmp socket. + * batadv_socket_receive_packet - schedule an icmp packet to be sent to + * userspace on an icmp socket. * @socket_client: the socket this packet belongs to * @icmph: pointer to the header of the icmp packet * @icmp_len: total length of the icmp packet diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 60dc7a6910b1..5cab144000be 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -872,8 +872,8 @@ free: } /** - * batadv_nc_update_nc_node - updates stored incoming and outgoing nc node structs - * (best called on incoming OGMs) + * batadv_nc_update_nc_node - updates stored incoming and outgoing nc node + * structs (best called on incoming OGMs) * @bat_priv: the bat priv with all the soft interface information * @orig_node: orig node originating the ogm packet * @orig_neigh_node: neighboring orig node from which we received the ogm packet -- cgit v1.2.3 From 4f248cff9e21720bd5f057661f752fba067f3779 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Tue, 9 Jun 2015 20:50:49 +0200 Subject: batman-adv: Remove multiple assignment per line The Linux CodingStyle disallows multiple assignments in a single line. (see chapter 1) Reported-by: Markus Pargmann Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/bat_iv_ogm.c | 6 ++++-- net/batman-adv/bridge_loop_avoidance.c | 6 ++++-- net/batman-adv/distributed-arp-table.c | 3 ++- net/batman-adv/gateway_client.c | 15 ++++++++++----- net/batman-adv/gateway_common.c | 5 ++++- net/batman-adv/network-coding.c | 3 ++- net/batman-adv/translation-table.c | 18 +++++++++++++----- 7 files changed, 39 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index f550a6ab77bb..df5411845c7d 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1004,7 +1004,8 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv, { struct batadv_neigh_ifinfo *neigh_ifinfo = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL; - struct batadv_neigh_node *neigh_node = NULL, *tmp_neigh_node = NULL; + struct batadv_neigh_node *neigh_node = NULL; + struct batadv_neigh_node *tmp_neigh_node = NULL; struct batadv_neigh_node *router = NULL; struct batadv_orig_node *orig_node_tmp; int if_num; @@ -1404,7 +1405,8 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, struct batadv_hard_iface *if_outgoing) { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); - struct batadv_neigh_node *router = NULL, *router_router = NULL; + struct batadv_neigh_node *router = NULL; + struct batadv_neigh_node *router_router = NULL; struct batadv_orig_node *orig_neigh_node; struct batadv_orig_ifinfo *orig_ifinfo; struct batadv_neigh_node *orig_neigh_router = NULL; diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 8f9059f15ebb..191a70290dca 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -88,7 +88,8 @@ static int batadv_compare_backbone_gw(const struct hlist_node *node, { const void *data1 = container_of(node, struct batadv_bla_backbone_gw, hash_entry); - const struct batadv_bla_backbone_gw *gw1 = data1, *gw2 = data2; + const struct batadv_bla_backbone_gw *gw1 = data1; + const struct batadv_bla_backbone_gw *gw2 = data2; if (!batadv_compare_eth(gw1->orig, gw2->orig)) return 0; @@ -105,7 +106,8 @@ static int batadv_compare_claim(const struct hlist_node *node, { const void *data1 = container_of(node, struct batadv_bla_claim, hash_entry); - const struct batadv_bla_claim *cl1 = data1, *cl2 = data2; + const struct batadv_bla_claim *cl1 = data1; + const struct batadv_bla_claim *cl2 = data2; if (!batadv_compare_eth(cl1->addr, cl2->addr)) return 0; diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index a63ea27eca6c..83bc1aaf5800 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -497,7 +497,8 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv, int select, batadv_dat_addr_t ip_key, batadv_dat_addr_t *last_max) { - batadv_dat_addr_t max = 0, tmp_max = 0; + batadv_dat_addr_t max = 0; + batadv_dat_addr_t tmp_max = 0; struct batadv_orig_node *orig_node, *max_orig_node = NULL; struct batadv_hashtable *hash = bat_priv->orig_hash; struct hlist_head *head; diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 6269c75919bf..d7ca2144e62c 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -153,7 +153,8 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) struct batadv_neigh_node *router; struct batadv_neigh_ifinfo *router_ifinfo; struct batadv_gw_node *gw_node, *curr_gw = NULL; - u64 max_gw_factor = 0, tmp_gw_factor = 0; + u64 max_gw_factor = 0; + u64 tmp_gw_factor = 0; u8 max_tq = 0; u8 tq_avg; struct batadv_orig_node *orig_node; @@ -263,7 +264,8 @@ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv) void batadv_gw_election(struct batadv_priv *bat_priv) { - struct batadv_gw_node *curr_gw = NULL, *next_gw = NULL; + struct batadv_gw_node *curr_gw = NULL; + struct batadv_gw_node *next_gw = NULL; struct batadv_neigh_node *router = NULL; struct batadv_neigh_ifinfo *router_ifinfo = NULL; char gw_addr[18] = { '\0' }; @@ -347,7 +349,8 @@ void batadv_gw_check_election(struct batadv_priv *bat_priv, struct batadv_neigh_ifinfo *router_orig_tq = NULL; struct batadv_neigh_ifinfo *router_gw_tq = NULL; struct batadv_orig_node *curr_gw_orig; - struct batadv_neigh_node *router_gw = NULL, *router_orig = NULL; + struct batadv_neigh_node *router_gw = NULL; + struct batadv_neigh_node *router_orig = NULL; u8 gw_tq_avg, orig_tq_avg; curr_gw_orig = batadv_gw_get_selected_orig(bat_priv); @@ -808,9 +811,11 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len, bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb) { - struct batadv_neigh_node *neigh_curr = NULL, *neigh_old = NULL; + struct batadv_neigh_node *neigh_curr = NULL; + struct batadv_neigh_node *neigh_old = NULL; struct batadv_orig_node *orig_dst_node = NULL; - struct batadv_gw_node *gw_node = NULL, *curr_gw = NULL; + struct batadv_gw_node *gw_node = NULL; + struct batadv_gw_node *curr_gw = NULL; struct batadv_neigh_ifinfo *curr_ifinfo, *old_ifinfo; struct ethhdr *ethhdr = (struct ethhdr *)skb->data; bool out_of_range = false; diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 9e32c8ebc28f..c50931cbfbb5 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -149,7 +149,10 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, size_t count) { struct batadv_priv *bat_priv = netdev_priv(net_dev); - u32 down_curr, up_curr, down_new = 0, up_new = 0; + u32 down_curr; + u32 up_curr; + u32 down_new = 0; + u32 up_new = 0; bool ret; down_curr = (unsigned int)atomic_read(&bat_priv->gw.bandwidth_down); diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 5cab144000be..ccfe5d86b106 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -887,7 +887,8 @@ void batadv_nc_update_nc_node(struct batadv_priv *bat_priv, struct batadv_ogm_packet *ogm_packet, int is_single_hop_neigh) { - struct batadv_nc_node *in_nc_node = NULL, *out_nc_node = NULL; + struct batadv_nc_node *in_nc_node = NULL; + struct batadv_nc_node *out_nc_node = NULL; /* Check if network coding is enabled */ if (!atomic_read(&bat_priv->network_coding)) diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index f90898881eee..7e34901e214c 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -462,7 +462,8 @@ static u16 batadv_tt_entries(u16 tt_len) */ static int batadv_tt_local_table_transmit_size(struct batadv_priv *bat_priv) { - u16 num_vlan = 0, tt_local_entries = 0; + u16 num_vlan = 0; + u16 tt_local_entries = 0; struct batadv_softif_vlan *vlan; int hdr_size; @@ -536,7 +537,8 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, struct hlist_head *head; struct batadv_tt_orig_list_entry *orig_entry; int hash_added, table_size, packet_size_max; - bool ret = false, roamed_back = false; + bool ret = false; + bool roamed_back = false; u8 remote_flags; u32 match_mark; @@ -730,7 +732,10 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, struct batadv_tvlv_tt_change **tt_change, s32 *tt_len) { - u16 num_vlan = 0, num_entries = 0, change_offset, tvlv_len; + u16 num_vlan = 0; + u16 num_entries = 0; + u16 change_offset; + u16 tvlv_len; struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_orig_node_vlan *vlan; u8 *tt_change_ptr; @@ -803,7 +808,9 @@ batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv, { struct batadv_tvlv_tt_vlan_data *tt_vlan; struct batadv_softif_vlan *vlan; - u16 num_vlan = 0, num_entries = 0, tvlv_len; + u16 num_vlan = 0; + u16 num_entries = 0; + u16 tvlv_len; u8 *tt_change_ptr; int change_offset; @@ -860,7 +867,8 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv) struct batadv_tvlv_tt_data *tt_data; struct batadv_tvlv_tt_change *tt_change; int tt_diff_len, tt_change_len = 0; - int tt_diff_entries_num = 0, tt_diff_entries_count = 0; + int tt_diff_entries_num = 0; + int tt_diff_entries_count = 0; u16 tvlv_len; tt_diff_entries_num = atomic_read(&bat_priv->tt.local_changes); -- cgit v1.2.3 From 433ff98f3f1d6baa56e23f6b50370a5ab16c0e66 Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Thu, 18 Jun 2015 16:11:07 +0800 Subject: batman-adv: update kernel doc of batadv_tt_global_del_orig_entry() The updated kernel doc & additional comment shall prevent accidental copy & paste errors or calling the function without the required precautions. Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/translation-table.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 7e34901e214c..6b5f718f6705 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1658,20 +1658,26 @@ out: } /** - * batadv_tt_global_del_orig_entry - remove and free an orig_entry + * _batadv_tt_global_del_orig_entry - remove and free an orig_entry * @tt_global_entry: the global entry to remove the orig_entry from * @orig_entry: the orig entry to remove and free * * Remove an orig_entry from its list in the given tt_global_entry and * free this orig_entry afterwards. + * + * Caller must hold tt_global_entry->list_lock and ensure orig_entry->list is + * part of a list. */ static void -batadv_tt_global_del_orig_entry(struct batadv_tt_global_entry *tt_global_entry, - struct batadv_tt_orig_list_entry *orig_entry) +_batadv_tt_global_del_orig_entry(struct batadv_tt_global_entry *tt_global_entry, + struct batadv_tt_orig_list_entry *orig_entry) { batadv_tt_global_size_dec(orig_entry->orig_node, tt_global_entry->common.vid); atomic_dec(&tt_global_entry->orig_list_count); + /* requires holding tt_global_entry->list_lock and orig_entry->list + * being part of a list + */ hlist_del_rcu(&orig_entry->list); batadv_tt_orig_list_entry_free_ref(orig_entry); } @@ -1687,7 +1693,7 @@ batadv_tt_global_del_orig_list(struct batadv_tt_global_entry *tt_global_entry) spin_lock_bh(&tt_global_entry->list_lock); head = &tt_global_entry->orig_list; hlist_for_each_entry_safe(orig_entry, safe, head, list) - batadv_tt_global_del_orig_entry(tt_global_entry, orig_entry); + _batadv_tt_global_del_orig_entry(tt_global_entry, orig_entry); spin_unlock_bh(&tt_global_entry->list_lock); } @@ -1722,8 +1728,8 @@ batadv_tt_global_del_orig_node(struct batadv_priv *bat_priv, orig_node->orig, tt_global_entry->common.addr, BATADV_PRINT_VID(vid), message); - batadv_tt_global_del_orig_entry(tt_global_entry, - orig_entry); + _batadv_tt_global_del_orig_entry(tt_global_entry, + orig_entry); } } spin_unlock_bh(&tt_global_entry->list_lock); -- cgit v1.2.3 From 383b863620aa5127200bff591f7465d593290997 Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Thu, 18 Jun 2015 16:24:24 +0800 Subject: batman-adv: rename batadv_new_tt_req_node to batadv_tt_req_node_new Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/translation-table.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 6b5f718f6705..3932df200db6 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -2272,11 +2272,16 @@ static void batadv_tt_req_purge(struct batadv_priv *bat_priv) spin_unlock_bh(&bat_priv->tt.req_list_lock); } -/* returns the pointer to the new tt_req_node struct if no request - * has already been issued for this orig_node, NULL otherwise +/** + * batadv_tt_req_node_new - search and possibly create a tt_req_node object + * @bat_priv: the bat priv with all the soft interface information + * @orig_node: orig node this request is being issued for + * + * Returns the pointer to the new tt_req_node struct if no request + * has already been issued for this orig_node, NULL otherwise. */ static struct batadv_tt_req_node * -batadv_new_tt_req_node(struct batadv_priv *bat_priv, +batadv_tt_req_node_new(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node) { struct batadv_tt_req_node *tt_req_node_tmp, *tt_req_node = NULL; @@ -2505,7 +2510,7 @@ static int batadv_send_tt_request(struct batadv_priv *bat_priv, /* The new tt_req will be issued only if I'm not waiting for a * reply from the same orig_node yet */ - tt_req_node = batadv_new_tt_req_node(bat_priv, dst_orig_node); + tt_req_node = batadv_tt_req_node_new(bat_priv, dst_orig_node); if (!tt_req_node) goto out; -- cgit v1.2.3 From 2bdd1888f147576fb870a926a4d015305b5bbeee Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Thu, 18 Jun 2015 18:53:19 +0200 Subject: batman-adv: Remove batadv_ types forward declarations main.h is included in every file and is the only way to access types.h. This makes forward declarations for all types defined in types.h unnecessary. Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/bridge_loop_avoidance.h | 3 --- net/batman-adv/debugfs.h | 1 - net/batman-adv/gateway_common.h | 1 - net/batman-adv/icmp_socket.h | 1 - net/batman-adv/multicast.h | 2 -- net/batman-adv/network-coding.h | 4 ---- net/batman-adv/routing.h | 4 ---- net/batman-adv/send.h | 3 --- net/batman-adv/soft-interface.h | 4 ---- net/batman-adv/sysfs.h | 2 -- net/batman-adv/translation-table.h | 2 -- 11 files changed, 27 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 449f2f1350a5..025152b34282 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -22,9 +22,6 @@ #include -struct batadv_hard_iface; -struct batadv_orig_node; -struct batadv_priv; struct seq_file; struct sk_buff; diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 187acdc85dfa..80ab8d6f0ab3 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -22,7 +22,6 @@ #include -struct batadv_hard_iface; struct net_device; #define BATADV_DEBUGFS_SUBDIR "batman_adv" diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index bd5c812cebf4..ab893e318229 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -22,7 +22,6 @@ #include -struct batadv_priv; struct net_device; enum batadv_gw_modes { diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index 7de7fce4b48c..e937143f0b10 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -23,7 +23,6 @@ #include struct batadv_icmp_header; -struct batadv_priv; #define BATADV_ICMP_SOCKET "socket" diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index beb6e56c624a..8f3cb04b9f13 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -20,8 +20,6 @@ #include "main.h" -struct batadv_orig_node; -struct batadv_priv; struct sk_buff; /** diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index 5b79aa8c64c1..8f6d4ad8778a 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -22,11 +22,7 @@ #include -struct batadv_nc_node; -struct batadv_neigh_node; struct batadv_ogm_packet; -struct batadv_orig_node; -struct batadv_priv; struct net_device; struct seq_file; struct sk_buff; diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index 3c185a1fb7a9..204bbe4952a6 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -22,10 +22,6 @@ #include -struct batadv_hard_iface; -struct batadv_neigh_node; -struct batadv_orig_node; -struct batadv_priv; struct sk_buff; bool batadv_check_management_packet(struct sk_buff *skb, diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 33ef6fc1030d..82059f259e46 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -25,9 +25,6 @@ #include "packet.h" -struct batadv_hard_iface; -struct batadv_orig_node; -struct batadv_priv; struct sk_buff; struct work_struct; diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index 578e8a663c30..8e82176f40b1 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -22,10 +22,6 @@ #include -struct batadv_hard_iface; -struct batadv_orig_node; -struct batadv_priv; -struct batadv_softif_vlan; struct net_device; struct sk_buff; diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index 2294583f7cf9..61974428a7af 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -23,8 +23,6 @@ #include #include -struct batadv_priv; -struct batadv_softif_vlan; struct kobject; struct net_device; diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 9b82d5dd56e0..abd8e116e5fb 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -22,8 +22,6 @@ #include -struct batadv_orig_node; -struct batadv_priv; struct net_device; struct seq_file; -- cgit v1.2.3 From d0fa4f3f5b0419a016ff27eea2fa615e4c0d1713 Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Mon, 22 Jun 2015 00:30:22 +0800 Subject: batman-adv: convert orig_node->vlan_list to hlist Since the list's tail is never accessed using a double linked list head wastes memory. Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/originator.c | 7 ++++--- net/batman-adv/translation-table.c | 8 ++++---- net/batman-adv/types.h | 4 ++-- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 4500e3a08719..610620aa8d26 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -70,7 +71,7 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node, struct batadv_orig_node_vlan *vlan = NULL, *tmp; rcu_read_lock(); - list_for_each_entry_rcu(tmp, &orig_node->vlan_list, list) { + hlist_for_each_entry_rcu(tmp, &orig_node->vlan_list, list) { if (tmp->vid != vid) continue; @@ -118,7 +119,7 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node, atomic_set(&vlan->refcount, 2); vlan->vid = vid; - list_add_rcu(&vlan->list, &orig_node->vlan_list); + hlist_add_head_rcu(&vlan->list, &orig_node->vlan_list); out: spin_unlock_bh(&orig_node->vlan_list_lock); @@ -673,7 +674,7 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, return NULL; INIT_HLIST_HEAD(&orig_node->neigh_list); - INIT_LIST_HEAD(&orig_node->vlan_list); + INIT_HLIST_HEAD(&orig_node->vlan_list); INIT_HLIST_HEAD(&orig_node->ifinfo_list); spin_lock_init(&orig_node->bcast_seqno_lock); spin_lock_init(&orig_node->neigh_list_lock); diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 3932df200db6..9e1f866a59f4 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -315,7 +315,7 @@ static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node, if (atomic_add_return(v, &vlan->tt.num_entries) == 0) { spin_lock_bh(&orig_node->vlan_list_lock); - list_del_rcu(&vlan->list); + hlist_del_rcu(&vlan->list); spin_unlock_bh(&orig_node->vlan_list_lock); batadv_orig_node_vlan_free_ref(vlan); } @@ -741,7 +741,7 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, u8 *tt_change_ptr; rcu_read_lock(); - list_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) { + hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) { num_vlan++; num_entries += atomic_read(&vlan->tt.num_entries); } @@ -767,7 +767,7 @@ batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node, (*tt_data)->num_vlan = htons(num_vlan); tt_vlan = (struct batadv_tvlv_tt_vlan_data *)(*tt_data + 1); - list_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) { + hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) { tt_vlan->vid = htons(vlan->vid); tt_vlan->crc = htonl(vlan->tt.crc); @@ -2466,7 +2466,7 @@ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv, /* recompute the global CRC for each VLAN */ rcu_read_lock(); - list_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) { + hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list) { /* if orig_node is a backbone node for this VLAN, don't compute * the CRC as we ignore all the global entries over it */ diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index f2ebe33d9b59..fe3695cb1d40 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -190,7 +190,7 @@ struct batadv_vlan_tt { struct batadv_orig_node_vlan { unsigned short vid; struct batadv_vlan_tt tt; - struct list_head list; + struct hlist_node list; atomic_t refcount; struct rcu_head rcu; }; @@ -302,7 +302,7 @@ struct batadv_orig_node { spinlock_t out_coding_list_lock; /* Protects out_coding_list */ #endif struct batadv_frag_table_entry fragments[BATADV_FRAG_BUFFER_COUNT]; - struct list_head vlan_list; + struct hlist_head vlan_list; spinlock_t vlan_list_lock; /* protects vlan_list */ struct batadv_orig_bat_iv bat_iv; }; -- cgit v1.2.3 From a121048a89f8186fa2c8388d5b41528466013857 Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Mon, 22 Jun 2015 00:30:23 +0800 Subject: batman-adv: prevent potential hlist double deletion The hlist_del_rcu() call in batadv_tt_global_size_mod() does not check if the element still is part of the list prior to deletion. The atomic list counter should prevent the worst but converting to hlist_del_init_rcu() ensures the element can't be deleted more than once. Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/translation-table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 9e1f866a59f4..596e326b01d0 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -315,7 +315,7 @@ static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node, if (atomic_add_return(v, &vlan->tt.num_entries) == 0) { spin_lock_bh(&orig_node->vlan_list_lock); - hlist_del_rcu(&vlan->list); + hlist_del_init_rcu(&vlan->list); spin_unlock_bh(&orig_node->vlan_list_lock); batadv_orig_node_vlan_free_ref(vlan); } -- cgit v1.2.3 From 77927b7d9dd6c9c5f59f0f49c76cda54b908090e Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 21 Jun 2015 14:42:49 +0200 Subject: batman-adv: Return EINVAL on invalid gw_bandwidth change Invalid speed settings by the user are currently acknowledged as correct but not stored. Instead the return of the store operation of the file "gw_bandwidth" should indicate that the given value is not acceptable. Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/gateway_common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index c50931cbfbb5..6b930a651f38 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -19,6 +19,7 @@ #include "main.h" #include +#include #include #include #include @@ -160,7 +161,7 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, ret = batadv_parse_gw_bandwidth(net_dev, buff, &down_new, &up_new); if (!ret) - goto end; + return -EINVAL; if (!down_new) down_new = 1; @@ -184,7 +185,6 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff, atomic_set(&bat_priv->gw.bandwidth_up, up_new); batadv_gw_tvlv_container_update(bat_priv); -end: return count; } -- cgit v1.2.3 From 0b8336f5fc26f263821534d7a9a3633748692207 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 21 Jun 2015 19:40:09 +0200 Subject: batman-adv: Fix gw_bandwidth calculation on 32 bit systems The TVLV for the gw_bandwidth stores everything as u32. But the gw_bandwidth reads the signed long which limits the maximum value to (2 ** 31 - 1) on systems with 4 byte long. Also the input value is always converted from either Mibit/s or Kibit/s to 100Kibit/s. This reduces the values even further when the user sets it via the default unit Kibit/s. It may even cause an integer overflow and end up with a value the user never intended. Instead read the values as u64, check for possible overflows, do the unit adjustments and then reduce the size to u32. Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/gateway_common.c | 49 +++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 6b930a651f38..0cb5e6b6f6d4 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -44,7 +45,7 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, { enum batadv_bandwidth_units bw_unit_type = BATADV_BW_UNIT_KBIT; char *slash_ptr, *tmp_ptr; - long ldown, lup; + u64 ldown, lup; int ret; slash_ptr = strchr(buff, '/'); @@ -62,7 +63,7 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, *tmp_ptr = '\0'; } - ret = kstrtol(buff, 10, &ldown); + ret = kstrtou64(buff, 10, &ldown); if (ret) { batadv_err(net_dev, "Download speed of gateway mode invalid: %s\n", @@ -72,14 +73,31 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, switch (bw_unit_type) { case BATADV_BW_UNIT_MBIT: - *down = ldown * 10; + /* prevent overflow */ + if (U64_MAX / 10 < ldown) { + batadv_err(net_dev, + "Download speed of gateway mode too large: %s\n", + buff); + return false; + } + + ldown *= 10; break; case BATADV_BW_UNIT_KBIT: default: - *down = ldown / 100; + ldown = div_u64(ldown, 100); break; } + if (U32_MAX < ldown) { + batadv_err(net_dev, + "Download speed of gateway mode too large: %s\n", + buff); + return false; + } + + *down = ldown; + /* we also got some upload info */ if (slash_ptr) { bw_unit_type = BATADV_BW_UNIT_KBIT; @@ -95,7 +113,7 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, *tmp_ptr = '\0'; } - ret = kstrtol(slash_ptr + 1, 10, &lup); + ret = kstrtou64(slash_ptr + 1, 10, &lup); if (ret) { batadv_err(net_dev, "Upload speed of gateway mode invalid: %s\n", @@ -105,13 +123,30 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff, switch (bw_unit_type) { case BATADV_BW_UNIT_MBIT: - *up = lup * 10; + /* prevent overflow */ + if (U64_MAX / 10 < lup) { + batadv_err(net_dev, + "Upload speed of gateway mode too large: %s\n", + slash_ptr + 1); + return false; + } + + lup *= 10; break; case BATADV_BW_UNIT_KBIT: default: - *up = lup / 100; + lup = div_u64(lup, 100); break; } + + if (U32_MAX < lup) { + batadv_err(net_dev, + "Upload speed of gateway mode too large: %s\n", + slash_ptr + 1); + return false; + } + + *up = lup; } return true; -- cgit v1.2.3 From 7c26a53ba5e7e4bf2be3cb93f2fffaefccff2e3b Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Sun, 28 Jun 2015 22:16:06 +0800 Subject: batman-adv: convert bat_priv->tt.req_list to hlist Since the list's tail is never accessed using a double linked list head wastes memory. Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/main.c | 2 +- net/batman-adv/translation-table.c | 29 ++++++++++++++++------------- net/batman-adv/types.h | 4 ++-- 3 files changed, 19 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 40750cbe4f4f..d277ba724a86 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -148,7 +148,7 @@ int batadv_mesh_init(struct net_device *soft_iface) INIT_HLIST_HEAD(&bat_priv->mcast.want_all_ipv6_list); #endif INIT_LIST_HEAD(&bat_priv->tt.changes_list); - INIT_LIST_HEAD(&bat_priv->tt.req_list); + INIT_HLIST_HEAD(&bat_priv->tt.req_list); INIT_LIST_HEAD(&bat_priv->tt.roam_list); #ifdef CONFIG_BATMAN_ADV_MCAST INIT_HLIST_HEAD(&bat_priv->mcast.mla_list); diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 596e326b01d0..734b456efaf4 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -2224,12 +2224,13 @@ static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv, static void batadv_tt_req_list_free(struct batadv_priv *bat_priv) { - struct batadv_tt_req_node *node, *safe; + struct batadv_tt_req_node *node; + struct hlist_node *safe; spin_lock_bh(&bat_priv->tt.req_list_lock); - list_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { - list_del_init(&node->list); + hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { + hlist_del_init(&node->list); kfree(node); } @@ -2259,13 +2260,14 @@ static void batadv_tt_save_orig_buffer(struct batadv_priv *bat_priv, static void batadv_tt_req_purge(struct batadv_priv *bat_priv) { - struct batadv_tt_req_node *node, *safe; + struct batadv_tt_req_node *node; + struct hlist_node *safe; spin_lock_bh(&bat_priv->tt.req_list_lock); - list_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { + hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { if (batadv_has_timed_out(node->issued_at, BATADV_TT_REQUEST_TIMEOUT)) { - list_del_init(&node->list); + hlist_del_init(&node->list); kfree(node); } } @@ -2287,7 +2289,7 @@ batadv_tt_req_node_new(struct batadv_priv *bat_priv, struct batadv_tt_req_node *tt_req_node_tmp, *tt_req_node = NULL; spin_lock_bh(&bat_priv->tt.req_list_lock); - list_for_each_entry(tt_req_node_tmp, &bat_priv->tt.req_list, list) { + hlist_for_each_entry(tt_req_node_tmp, &bat_priv->tt.req_list, list) { if (batadv_compare_eth(tt_req_node_tmp, orig_node) && !batadv_has_timed_out(tt_req_node_tmp->issued_at, BATADV_TT_REQUEST_TIMEOUT)) @@ -2301,7 +2303,7 @@ batadv_tt_req_node_new(struct batadv_priv *bat_priv, ether_addr_copy(tt_req_node->addr, orig_node->orig); tt_req_node->issued_at = jiffies; - list_add(&tt_req_node->list, &bat_priv->tt.req_list); + hlist_add_head(&tt_req_node->list, &bat_priv->tt.req_list); unlock: spin_unlock_bh(&bat_priv->tt.req_list_lock); return tt_req_node; @@ -2552,8 +2554,8 @@ out: batadv_hardif_free_ref(primary_if); if (ret && tt_req_node) { spin_lock_bh(&bat_priv->tt.req_list_lock); - /* list_del_init() verifies tt_req_node still is in the list */ - list_del_init(&tt_req_node->list); + /* hlist_del_init() verifies tt_req_node still is in the list */ + hlist_del_init(&tt_req_node->list); spin_unlock_bh(&bat_priv->tt.req_list_lock); kfree(tt_req_node); } @@ -2949,7 +2951,8 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, struct batadv_tvlv_tt_data *tt_data, u8 *resp_src, u16 num_entries) { - struct batadv_tt_req_node *node, *safe; + struct batadv_tt_req_node *node; + struct hlist_node *safe; struct batadv_orig_node *orig_node = NULL; struct batadv_tvlv_tt_change *tt_change; u8 *tvlv_ptr = (u8 *)tt_data; @@ -2987,10 +2990,10 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv, /* Delete the tt_req_node from pending tt_requests list */ spin_lock_bh(&bat_priv->tt.req_list_lock); - list_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { + hlist_for_each_entry_safe(node, safe, &bat_priv->tt.req_list, list) { if (!batadv_compare_eth(node->addr, resp_src)) continue; - list_del_init(&node->list); + hlist_del_init(&node->list); kfree(node); } diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index fe3695cb1d40..2f5e6c39f913 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -537,7 +537,7 @@ struct batadv_priv_tt { struct list_head changes_list; struct batadv_hashtable *local_hash; struct batadv_hashtable *global_hash; - struct list_head req_list; + struct hlist_head req_list; struct list_head roam_list; spinlock_t changes_list_lock; /* protects changes */ spinlock_t req_list_lock; /* protects req_list */ @@ -1006,7 +1006,7 @@ struct batadv_tt_change_node { struct batadv_tt_req_node { u8 addr[ETH_ALEN]; unsigned long issued_at; - struct list_head list; + struct hlist_node list; }; /** -- cgit v1.2.3 From 2c72d655b04450056566bcbfe89c2427376b60b4 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 21 Jun 2015 14:45:14 +0200 Subject: batman-adv: Annotate deleting functions with external lock via lockdep Functions which use (h)list_del* are requiring correct locking when they operate on global lists. Most of the time the search in the list and the delete are done in the same function. All other cases should have it visible that they require a special lock to avoid race conditions. Lockdep asserts can be used to check these problem during runtime when the lockdep functionality is enabled. Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/main.c | 11 ++++++++--- net/batman-adv/multicast.c | 13 +++++++++++-- net/batman-adv/network-coding.c | 4 ++++ net/batman-adv/translation-table.c | 2 ++ 4 files changed, 25 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index d277ba724a86..50fc07b9d353 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -737,13 +738,17 @@ static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv) /** * batadv_tvlv_container_remove - remove tvlv container from the tvlv container * list + * @bat_priv: the bat priv with all the soft interface information * @tvlv: the to be removed tvlv container * * Has to be called with the appropriate locks being acquired * (tvlv.container_list_lock). */ -static void batadv_tvlv_container_remove(struct batadv_tvlv_container *tvlv) +static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv, + struct batadv_tvlv_container *tvlv) { + lockdep_assert_held(&bat_priv->tvlv.handler_list_lock); + if (!tvlv) return; @@ -768,7 +773,7 @@ void batadv_tvlv_container_unregister(struct batadv_priv *bat_priv, spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv = batadv_tvlv_container_get(bat_priv, type, version); - batadv_tvlv_container_remove(tvlv); + batadv_tvlv_container_remove(bat_priv, tvlv); spin_unlock_bh(&bat_priv->tvlv.container_list_lock); } @@ -807,7 +812,7 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv, spin_lock_bh(&bat_priv->tvlv.container_list_lock); tvlv_old = batadv_tvlv_container_get(bat_priv, type, version); - batadv_tvlv_container_remove(tvlv_old); + batadv_tvlv_container_remove(bat_priv, tvlv_old); hlist_add_head(&tvlv_new->list, &bat_priv->tvlv.container_list); spin_unlock_bh(&bat_priv->tvlv.container_list_lock); } diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 2593f0fe0bad..410f34cf85c2 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -103,15 +104,19 @@ static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, /** * batadv_mcast_mla_list_free - free a list of multicast addresses + * @bat_priv: the bat priv with all the soft interface information * @mcast_list: the list to free * * Removes and frees all items in the given mcast_list. */ -static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list) +static void batadv_mcast_mla_list_free(struct batadv_priv *bat_priv, + struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; + lockdep_assert_held(&bat_priv->tt.commit_lock); + hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) { hlist_del(&mcast_entry->list); kfree(mcast_entry); @@ -134,6 +139,8 @@ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv, struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; + lockdep_assert_held(&bat_priv->tt.commit_lock); + hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list, list) { if (mcast_list && @@ -164,6 +171,8 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; + lockdep_assert_held(&bat_priv->tt.commit_lock); + if (!mcast_list) return; @@ -268,7 +277,7 @@ update: batadv_mcast_mla_tt_add(bat_priv, &mcast_list); out: - batadv_mcast_mla_list_free(&mcast_list); + batadv_mcast_mla_list_free(bat_priv, &mcast_list); } /** diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index ccfe5d86b106..f5276be2c77c 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -586,6 +586,8 @@ static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv, unsigned long timeout = bat_priv->nc.max_buffer_time; bool res = false; + lockdep_assert_held(&nc_path->packet_list_lock); + /* Packets are added to tail, so the remaining packets did not time * out and we can stop processing the current queue */ @@ -622,6 +624,8 @@ static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv, { unsigned long timeout = bat_priv->nc.max_fwd_delay; + lockdep_assert_held(&nc_path->packet_list_lock); + /* Packets are added to tail, so the remaining packets did not time * out and we can stop processing the current queue */ diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 734b456efaf4..79cee7b771dc 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1672,6 +1672,8 @@ static void _batadv_tt_global_del_orig_entry(struct batadv_tt_global_entry *tt_global_entry, struct batadv_tt_orig_list_entry *orig_entry) { + lockdep_assert_held(&tt_global_entry->list_lock); + batadv_tt_global_size_dec(orig_entry->orig_node, tt_global_entry->common.vid); atomic_dec(&tt_global_entry->orig_list_count); -- cgit v1.2.3 From 5274cd68d744b4bc59b32d87cbde70803130eb3f Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 21 Jun 2015 14:45:15 +0200 Subject: batman-adv: Add lockdep_asserts for documented external locks Some functions already have documentation about locks they require inside their kerneldoc header. These can be directly tested during runtime using the lockdep asserts. Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/fragmentation.c | 3 +++ net/batman-adv/multicast.c | 6 ++++++ net/batman-adv/translation-table.c | 2 ++ 3 files changed, 11 insertions(+) (limited to 'net') diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index ec5f7bcad66e..700c96c82a15 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -112,6 +113,8 @@ static int batadv_frag_size_limit(void) static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain, u16 seqno) { + lockdep_assert_held(&chain->lock); + if (chain->seqno == seqno) return false; diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 410f34cf85c2..eb76386f8d4b 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -609,6 +609,8 @@ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, struct hlist_node *node = &orig->mcast_want_all_unsnoopables_node; struct hlist_head *head = &bat_priv->mcast.want_all_unsnoopables_list; + lockdep_assert_held(&orig->mcast_handler_lock); + /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES)) { @@ -652,6 +654,8 @@ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, struct hlist_node *node = &orig->mcast_want_all_ipv4_node; struct hlist_head *head = &bat_priv->mcast.want_all_ipv4_list; + lockdep_assert_held(&orig->mcast_handler_lock); + /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4 && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4)) { @@ -695,6 +699,8 @@ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, struct hlist_node *node = &orig->mcast_want_all_ipv6_node; struct hlist_head *head = &bat_priv->mcast.want_all_ipv6_list; + lockdep_assert_held(&orig->mcast_handler_lock); + /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6 && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6)) { diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 79cee7b771dc..4228b10c47ea 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -3271,6 +3271,8 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) */ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv) { + lockdep_assert_held(&bat_priv->tt.commit_lock); + /* Update multicast addresses in local translation table */ batadv_mcast_mla_update(bat_priv); -- cgit v1.2.3 From 87b40f534d2a266eec55a897e9a93f3b022c901d Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 17 Jul 2015 10:03:42 +0200 Subject: batman-adv: Fix conditional statements indentation commit 29b9256e6631 ("batman-adv: consider outgoing interface in OGM sending") incorrectly indented the interface check code. Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/bat_iv_ogm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index df5411845c7d..5c122000688f 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -968,7 +968,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface) rcu_read_lock(); list_for_each_entry_rcu(tmp_hard_iface, &batadv_hardif_list, list) { if (tmp_hard_iface->soft_iface != hard_iface->soft_iface) - continue; + continue; batadv_iv_ogm_queue_add(bat_priv, *ogm_buff, *ogm_buff_len, hard_iface, tmp_hard_iface, 1, send_time); -- cgit v1.2.3 From 854d2a63de86a769db4dbed75b660f544b3c0c7a Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Fri, 17 Jul 2015 22:25:59 +0800 Subject: batman-adv: beautify supported routing algorithm list Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 50fc07b9d353..e61c5f3633d0 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -584,7 +584,7 @@ int batadv_algo_seq_print_text(struct seq_file *seq, void *offset) seq_puts(seq, "Available routing algorithms:\n"); hlist_for_each_entry(bat_algo_ops, &batadv_algo_list, list) { - seq_printf(seq, "%s\n", bat_algo_ops->name); + seq_printf(seq, " * %s\n", bat_algo_ops->name); } return 0; -- cgit v1.2.3 From 6f021c62d64f38092bc2a0c5fe7b81d5e5b21a00 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Aug 2015 12:30:00 -0700 Subject: tcp: fix slow start after idle vs TSO/GSO slow start after idle might reduce cwnd, but we perform this after first packet was cooked and sent. With TSO/GSO, it means that we might send a full TSO packet even if cwnd should have been reduced to IW10. Moving the SSAI check in skb_entail() makes sense, because we slightly reduce number of times this check is done, especially for large send() and TCP Small queue callbacks from softirq context. As Neal pointed out, we also need to perform the check if/when receive window opens. Tested: Following packetdrill test demonstrates the problem // Test of slow start after idle `sysctl -q net.ipv4.tcp_slow_start_after_idle=1` 0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 +0 < S 0:0(0) win 65535 +0 > S. 0:0(0) ack 1 +.100 < . 1:1(0) ack 1 win 511 +0 accept(3, ..., ...) = 4 +0 setsockopt(4, SOL_SOCKET, SO_SNDBUF, [200000], 4) = 0 +0 write(4, ..., 26000) = 26000 +0 > . 1:5001(5000) ack 1 +0 > . 5001:10001(5000) ack 1 +0 %{ assert tcpi_snd_cwnd == 10 }% +.100 < . 1:1(0) ack 10001 win 511 +0 %{ assert tcpi_snd_cwnd == 20, tcpi_snd_cwnd }% +0 > . 10001:20001(10000) ack 1 +0 > P. 20001:26001(6000) ack 1 +.100 < . 1:1(0) ack 26001 win 511 +0 %{ assert tcpi_snd_cwnd == 36, tcpi_snd_cwnd }% +4 write(4, ..., 20000) = 20000 // If slow start after idle works properly, we should send 5 MSS here (cwnd/2) +0 > . 26001:31001(5000) ack 1 +0 %{ assert tcpi_snd_cwnd == 10, tcpi_snd_cwnd }% +0 > . 31001:36001(5000) ack 1 Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 13 +++++++++++++ net/ipv4/tcp.c | 2 ++ net/ipv4/tcp_input.c | 3 +++ net/ipv4/tcp_output.c | 12 ++++-------- 4 files changed, 22 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 364426a2be5a..309801f7eb82 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1165,6 +1165,19 @@ static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) } u32 tcp_default_init_rwnd(u32 mss); +void tcp_cwnd_restart(struct sock *sk, s32 delta); + +static inline void tcp_slow_start_after_idle_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + s32 delta; + + if (!sysctl_tcp_slow_start_after_idle || tp->packets_out) + return; + delta = tcp_time_stamp - tp->lsndtime; + if (delta > inet_csk(sk)->icsk_rto) + tcp_cwnd_restart(sk, delta); +} /* Determine a window scaling and initial window to offer. */ void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 45534a5ab430..b8b8fa184f75 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -627,6 +627,8 @@ static void skb_entail(struct sock *sk, struct sk_buff *skb) sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; + + tcp_slow_start_after_idle_check(sk); } static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4e4d6bcd0ca9..0abca2841de2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3332,6 +3332,9 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 tp->pred_flags = 0; tcp_fast_path_check(sk); + if (tcp_send_head(sk)) + tcp_slow_start_after_idle_check(sk); + if (nwin > tp->max_window) { tp->max_window = nwin; tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 444ab5beecbd..1188e4fcf23b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -137,12 +137,12 @@ static __u16 tcp_advertise_mss(struct sock *sk) } /* RFC2861. Reset CWND after idle period longer RTO to "restart window". - * This is the first part of cwnd validation mechanism. */ -static void tcp_cwnd_restart(struct sock *sk, const struct dst_entry *dst) + * This is the first part of cwnd validation mechanism. + */ +void tcp_cwnd_restart(struct sock *sk, s32 delta) { struct tcp_sock *tp = tcp_sk(sk); - s32 delta = tcp_time_stamp - tp->lsndtime; - u32 restart_cwnd = tcp_init_cwnd(tp, dst); + u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); u32 cwnd = tp->snd_cwnd; tcp_ca_event(sk, CA_EVENT_CWND_RESTART); @@ -164,10 +164,6 @@ static void tcp_event_data_sent(struct tcp_sock *tp, struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; - if (sysctl_tcp_slow_start_after_idle && - (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) - tcp_cwnd_restart(sk, __sk_dst_get(sk)); - tp->lsndtime = now; /* If it is a reply for ato after last received -- cgit v1.2.3 From 4ec3b28c2763e11a423d03810ff0be65f02e635e Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 20 Aug 2015 15:06:30 -0700 Subject: xfrm: Use VRF master index if output device is enslaved Directs route lookups to VRF table. Compiles out if NET_VRF is not enabled. With this patch able to successfully bring up ipsec tunnels in VRFs, even with duplicate network configuration. Signed-off-by: David Ahern Acked-by: Nikolay Aleksandrov Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv4/xfrm4_policy.c | 7 +++++-- net/ipv6/xfrm6_policy.c | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 55b3c0f4dde5..bb919b28619f 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -15,6 +15,7 @@ #include #include #include +#include static struct xfrm_policy_afinfo xfrm4_policy_afinfo; @@ -107,8 +108,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) struct flowi4 *fl4 = &fl->u.ip4; int oif = 0; - if (skb_dst(skb)) - oif = skb_dst(skb)->dev->ifindex; + if (skb_dst(skb)) { + oif = vrf_master_ifindex(skb_dst(skb)->dev) ? + : skb_dst(skb)->dev->ifindex; + } memset(fl4, 0, sizeof(struct flowi4)); fl4->flowi4_mark = skb->mark; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index a74013d3eceb..30caa289c5db 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -20,6 +20,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6_MIP6) #include #endif @@ -131,8 +132,10 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse) nexthdr = nh[nhoff]; - if (skb_dst(skb)) - oif = skb_dst(skb)->dev->ifindex; + if (skb_dst(skb)) { + oif = vrf_master_ifindex(skb_dst(skb)->dev) ? + : skb_dst(skb)->dev->ifindex; + } memset(fl6, 0, sizeof(struct flowi6)); fl6->flowi6_mark = skb->mark; -- cgit v1.2.3 From 43e122b014c955a33220fabbd09c4b5e4f422c3c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 21 Aug 2015 17:38:02 -0700 Subject: tcp: refine pacing rate determination When TCP pacing was added back in linux-3.12, we chose to apply a fixed ratio of 200 % against current rate, to allow probing for optimal throughput even during slow start phase, where cwnd can be doubled every other gRTT. At Google, we found it was better applying a different ratio while in Congestion Avoidance phase. This ratio was set to 120 %. We've used the normal tcp_in_slow_start() helper for a while, then tuned the condition to select the conservative ratio as soon as cwnd >= ssthresh/2 : - After cwnd reduction, it is safer to ramp up more slowly, as we approach optimal cwnd. - Initial ramp up (ssthresh == INFINITY) still allows doubling cwnd every other RTT. Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 15 +++++++++++++++ include/net/tcp.h | 2 ++ net/ipv4/sysctl_net_ipv4.c | 19 +++++++++++++++++++ net/ipv4/tcp_input.c | 18 +++++++++++++++++- 4 files changed, 53 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 46e88ed7f41d..ac77a13d2ea2 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -586,6 +586,21 @@ tcp_min_tso_segs - INTEGER if available window is too small. Default: 2 +tcp_pacing_ss_ratio - INTEGER + sk->sk_pacing_rate is set by TCP stack using a ratio applied + to current rate. (current_rate = cwnd * mss / srtt) + If TCP is in slow start, tcp_pacing_ss_ratio is applied + to let TCP probe for bigger speeds, assuming cwnd can be + doubled every other RTT. + Default: 200 + +tcp_pacing_ca_ratio - INTEGER + sk->sk_pacing_rate is set by TCP stack using a ratio applied + to current rate. (current_rate = cwnd * mss / srtt) + If TCP is in congestion avoidance phase, tcp_pacing_ca_ratio + is applied to conservatively probe for bigger throughput. + Default: 120 + tcp_tso_win_divisor - INTEGER This allows control over what percentage of the congestion window can be consumed by a single TSO frame. diff --git a/include/net/tcp.h b/include/net/tcp.h index 309801f7eb82..4a7b03947a38 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -281,6 +281,8 @@ extern unsigned int sysctl_tcp_notsent_lowat; extern int sysctl_tcp_min_tso_segs; extern int sysctl_tcp_autocorking; extern int sysctl_tcp_invalid_ratelimit; +extern int sysctl_tcp_pacing_ss_ratio; +extern int sysctl_tcp_pacing_ca_ratio; extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0330ab2e2b63..879bdc5c95b1 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -29,6 +29,7 @@ static int zero; static int one = 1; static int four = 4; +static int thousand = 1000; static int gso_max_segs = GSO_MAX_SEGS; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; @@ -711,6 +712,24 @@ static struct ctl_table ipv4_table[] = { .extra1 = &one, .extra2 = &gso_max_segs, }, + { + .procname = "tcp_pacing_ss_ratio", + .data = &sysctl_tcp_pacing_ss_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &thousand, + }, + { + .procname = "tcp_pacing_ca_ratio", + .data = &sysctl_tcp_pacing_ca_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &thousand, + }, { .procname = "tcp_autocorking", .data = &sysctl_tcp_autocorking, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0abca2841de2..dc08e2352665 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -753,13 +753,29 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) * TCP pacing, to smooth the burst on large writes when packets * in flight is significantly lower than cwnd (or rwin) */ +int sysctl_tcp_pacing_ss_ratio __read_mostly = 200; +int sysctl_tcp_pacing_ca_ratio __read_mostly = 120; + static void tcp_update_pacing_rate(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); u64 rate; /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ - rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3); + rate = (u64)tp->mss_cache * ((USEC_PER_SEC / 100) << 3); + + /* current rate is (cwnd * mss) / srtt + * In Slow Start [1], set sk_pacing_rate to 200 % the current rate. + * In Congestion Avoidance phase, set it to 120 % the current rate. + * + * [1] : Normal Slow Start condition is (tp->snd_cwnd < tp->snd_ssthresh) + * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching + * end of slow start and should slow down. + */ + if (tp->snd_cwnd < tp->snd_ssthresh / 2) + rate *= sysctl_tcp_pacing_ss_ratio; + else + rate *= sysctl_tcp_pacing_ca_ratio; rate *= max(tp->snd_cwnd, tp->packets_out); -- cgit v1.2.3 From 1d2e3f396c532b95a37bbee92269f37efe908457 Mon Sep 17 00:00:00 2001 From: santosh.shilimkar@oracle.com Date: Sat, 22 Aug 2015 15:45:22 -0700 Subject: RDS: restore return value in rds_cmsg_rdma_args() In rds_cmsg_rdma_args() 'ret' is used by rds_pin_pages() which returns number of pinned pages on success. And the same value is returned to the caller of rds_cmsg_rdma_args() on success which is not intended. Commit f4a3fc03c1d7 ("RDS: Clean up error handling in rds_cmsg_rdma_args") removed the 'ret = 0' line which broke RDS RDMA mode. Fix it by restoring the return value on rds_pin_pages() success keeping the clean-up in place. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/rdma.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 40084d843e9f..6401b501a215 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -658,6 +658,8 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); if (ret < 0) goto out; + else + ret = 0; rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", nr_bytes, nr, iov->bytes, iov->addr); -- cgit v1.2.3 From 43962dd7ee192299c6e0c6cd7f0a65997308f1f4 Mon Sep 17 00:00:00 2001 From: santosh.shilimkar@oracle.com Date: Sat, 22 Aug 2015 15:45:23 -0700 Subject: RDS: always free recv frag as we free its ring entry We were still seeing rare occurrences of the WARN_ON(recv->r_frag) which indicates that the recv refill path was finding allocated frags in ring entries that were marked free. These were usually followed by OOM crashes. They only seem to be occurring in the presence of completion errors and connection resets. This patch ensures that we free the frag as we mark the ring entry free. This should stop the refill path from finding allocated frags in ring entries that were marked free. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_recv.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index cac5b4506ee3..2a6a75c59943 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -982,10 +982,17 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic, } /* - * It's very important that we only free this ring entry if we've truly - * freed the resources allocated to the entry. The refilling path can - * leak if we don't. + * rds_ib_process_recv() doesn't always consume the frag, and + * we might not have called it at all if the wc didn't indicate + * success. We already unmapped the frag's pages, though, and + * the following rds_ib_ring_free() call tells the refill path + * that it will not find an allocated frag here. Make sure we + * keep that promise by freeing a frag that's still on the ring. */ + if (recv->r_frag) { + rds_ib_frag_free(ic, recv->r_frag); + recv->r_frag = NULL; + } rds_ib_ring_free(&ic->i_recv_ring, 1); } } -- cgit v1.2.3 From 1bc7b863f230e429dd7a06c0956ada7933d69f50 Mon Sep 17 00:00:00 2001 From: santosh.shilimkar@oracle.com Date: Sat, 22 Aug 2015 15:45:24 -0700 Subject: RDS: destroy the ib state earlier during shutdown Destroy ib state early during shutdown. Otherwise we can get callbacks after the QP isn't really able to handle them. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_cm.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index f40d8f52b753..94d4427377b2 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -640,6 +640,16 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) (atomic_read(&ic->i_signaled_sends) == 0)); tasklet_kill(&ic->i_recv_tasklet); + /* first destroy the ib state that generates callbacks */ + if (ic->i_cm_id->qp) + rdma_destroy_qp(ic->i_cm_id); + if (ic->i_send_cq) + ib_destroy_cq(ic->i_send_cq); + if (ic->i_recv_cq) + ib_destroy_cq(ic->i_recv_cq); + rdma_destroy_id(ic->i_cm_id); + + /* then free the resources that ib callbacks use */ if (ic->i_send_hdrs) ib_dma_free_coherent(dev, ic->i_send_ring.w_nr * @@ -663,14 +673,6 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) if (ic->i_recvs) rds_ib_recv_clear_ring(ic); - if (ic->i_cm_id->qp) - rdma_destroy_qp(ic->i_cm_id); - if (ic->i_send_cq) - ib_destroy_cq(ic->i_send_cq); - if (ic->i_recv_cq) - ib_destroy_cq(ic->i_recv_cq); - rdma_destroy_id(ic->i_cm_id); - /* * Move connection back to the nodev list. */ -- cgit v1.2.3 From e1f475a738e4c861d8aff84b737a0538680cbe05 Mon Sep 17 00:00:00 2001 From: santosh.shilimkar@oracle.com Date: Sat, 22 Aug 2015 15:45:25 -0700 Subject: RDS: don't update ip address tables if the address hasn't changed If the ip address tables hasn't changed, there is no need to remove them only to be added back again. Lets fix it. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 657ba9f5d308..e49c9568b4a5 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -151,12 +151,17 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) struct rds_ib_device *rds_ibdev_old; rds_ibdev_old = rds_ib_get_device(ipaddr); - if (rds_ibdev_old) { + if (!rds_ibdev_old) + return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + + if (rds_ibdev_old != rds_ibdev) { rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); rds_ib_dev_put(rds_ibdev_old); + return rds_ib_add_ipaddr(rds_ibdev, ipaddr); } + rds_ib_dev_put(rds_ibdev_old); - return rds_ib_add_ipaddr(rds_ibdev, ipaddr); + return 0; } void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) -- cgit v1.2.3 From 73ce4317bf983282593aff710b112a7e705620c3 Mon Sep 17 00:00:00 2001 From: santosh.shilimkar@oracle.com Date: Sat, 22 Aug 2015 15:45:26 -0700 Subject: RDS: make sure we post recv buffers If we get an ENOMEM during rds_ib_recv_refill, we might never come back and refill again later. Patch makes sure to kick krdsd into helping out. To achieve this we add RDS_RECV_REFILL flag and update in the refill path based on that so that at least some therad will keep posting receive buffers. Since krdsd and softirq both might race for refill, we decide to schedule on work queue based on ring_low instead of ring_empty. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/connection.c | 2 ++ net/rds/ib.h | 2 +- net/rds/ib_cm.c | 2 +- net/rds/ib_recv.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++------ net/rds/rds.h | 1 + 5 files changed, 57 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index d4fecb21ca25..a50e652eb269 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -301,6 +301,8 @@ void rds_conn_shutdown(struct rds_connection *conn) wait_event(conn->c_waitq, !test_bit(RDS_IN_XMIT, &conn->c_flags)); + wait_event(conn->c_waitq, + !test_bit(RDS_RECV_REFILL, &conn->c_flags)); conn->c_trans->conn_shutdown(conn); rds_conn_reset(conn); diff --git a/net/rds/ib.h b/net/rds/ib.h index 86d88ec5d556..6422c52682e5 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -320,7 +320,7 @@ void rds_ib_recv_exit(void); int rds_ib_recv(struct rds_connection *conn); int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); void rds_ib_recv_free_caches(struct rds_ib_connection *ic); -void rds_ib_recv_refill(struct rds_connection *conn, int prefill); +void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp); void rds_ib_inc_free(struct rds_incoming *inc); int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to); void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 94d4427377b2..04243dd1c2ea 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -135,7 +135,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even rds_ib_recv_init_ring(ic); /* Post receive buffers - as a side effect, this will update * the posted credit count. */ - rds_ib_recv_refill(conn, 1); + rds_ib_recv_refill(conn, 1, GFP_KERNEL); /* Tune RNR behavior */ rds_ib_tune_rnr(ic, &qp_attr); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 2a6a75c59943..3afdcbdd06b4 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -297,7 +297,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic } static int rds_ib_recv_refill_one(struct rds_connection *conn, - struct rds_ib_recv_work *recv, int prefill) + struct rds_ib_recv_work *recv, gfp_t gfp) { struct rds_ib_connection *ic = conn->c_transport_data; struct ib_sge *sge; @@ -305,7 +305,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, gfp_t slab_mask = GFP_NOWAIT; gfp_t page_mask = GFP_NOWAIT; - if (prefill) { + if (gfp & __GFP_WAIT) { slab_mask = GFP_KERNEL; page_mask = GFP_HIGHUSER; } @@ -347,6 +347,24 @@ out: return ret; } +static int acquire_refill(struct rds_connection *conn) +{ + return test_and_set_bit(RDS_RECV_REFILL, &conn->c_flags) == 0; +} + +static void release_refill(struct rds_connection *conn) +{ + clear_bit(RDS_RECV_REFILL, &conn->c_flags); + + /* We don't use wait_on_bit()/wake_up_bit() because our waking is in a + * hot path and finding waiters is very rare. We don't want to walk + * the system-wide hashed waitqueue buckets in the fast path only to + * almost never find waiters. + */ + if (waitqueue_active(&conn->c_waitq)) + wake_up_all(&conn->c_waitq); +} + /* * This tries to allocate and post unused work requests after making sure that * they have all the allocations they need to queue received fragments into @@ -354,15 +372,23 @@ out: * * -1 is returned if posting fails due to temporary resource exhaustion. */ -void rds_ib_recv_refill(struct rds_connection *conn, int prefill) +void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) { struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_recv_work *recv; struct ib_recv_wr *failed_wr; unsigned int posted = 0; int ret = 0; + int can_wait = gfp & __GFP_WAIT; u32 pos; + /* the goal here is to just make sure that someone, somewhere + * is posting buffers. If we can't get the refill lock, + * let them do their thing + */ + if (!acquire_refill(conn)) + return; + while ((prefill || rds_conn_up(conn)) && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) { if (pos >= ic->i_recv_ring.w_nr) { @@ -372,7 +398,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill) } recv = &ic->i_recvs[pos]; - ret = rds_ib_recv_refill_one(conn, recv, prefill); + ret = rds_ib_recv_refill_one(conn, recv, gfp); if (ret) { break; } @@ -402,6 +428,24 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill) if (ret) rds_ib_ring_unalloc(&ic->i_recv_ring, 1); + + release_refill(conn); + + /* if we're called from the softirq handler, we'll be GFP_NOWAIT. + * in this case the ring being low is going to lead to more interrupts + * and we can safely let the softirq code take care of it unless the + * ring is completely empty. + * + * if we're called from krdsd, we'll be GFP_KERNEL. In this case + * we might have raced with the softirq code while we had the refill + * lock held. Use rds_ib_ring_low() instead of ring_empty to decide + * if we should requeue. + */ + if (rds_conn_up(conn) && + ((can_wait && rds_ib_ring_low(&ic->i_recv_ring)) || + rds_ib_ring_empty(&ic->i_recv_ring))) { + queue_delayed_work(rds_wq, &conn->c_recv_w, 1); + } } /* @@ -1023,7 +1067,7 @@ void rds_ib_recv_tasklet_fn(unsigned long data) rds_ib_stats_inc(s_ib_rx_ring_empty); if (rds_ib_ring_low(&ic->i_recv_ring)) - rds_ib_recv_refill(conn, 0); + rds_ib_recv_refill(conn, 0, GFP_NOWAIT); } int rds_ib_recv(struct rds_connection *conn) @@ -1032,8 +1076,10 @@ int rds_ib_recv(struct rds_connection *conn) int ret = 0; rdsdebug("conn %p\n", conn); - if (rds_conn_up(conn)) + if (rds_conn_up(conn)) { rds_ib_attempt_ack(ic); + rds_ib_recv_refill(conn, 0, GFP_KERNEL); + } return ret; } diff --git a/net/rds/rds.h b/net/rds/rds.h index 9005fb0586f6..afb4048d0cfd 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -80,6 +80,7 @@ enum { #define RDS_LL_SEND_FULL 0 #define RDS_RECONNECT_PENDING 1 #define RDS_IN_XMIT 2 +#define RDS_RECV_REFILL 3 struct rds_connection { struct hlist_node c_hash_node; -- cgit v1.2.3 From 0c484240210a8c835a216dbca6f1d690e4094a7c Mon Sep 17 00:00:00 2001 From: santosh.shilimkar@oracle.com Date: Sat, 22 Aug 2015 15:45:27 -0700 Subject: RDS: check for congestion updates during rds_send_xmit Ensure we don't keep sending the data if the link is congested. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/send.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 2581b8e3dbe7..dd901c324edf 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -411,7 +411,8 @@ over_batch: */ if (ret == 0) { smp_mb(); - if (!list_empty(&conn->c_send_queue) && + if ((test_bit(0, &conn->c_map_queued) || + !list_empty(&conn->c_send_queue)) && send_gen == conn->c_send_gen) { rds_stats_inc(s_send_lock_queue_raced); goto restart; -- cgit v1.2.3 From 0df5f9a68a79f31e608b69a9369d6415a38843e1 Mon Sep 17 00:00:00 2001 From: santosh.shilimkar@oracle.com Date: Sat, 22 Aug 2015 15:45:28 -0700 Subject: RDS: add a sock_destruct callback debug aid This helps to detect the accidental processes/apps trying to destroy the RDS socket which they are sharing with other processes/apps. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/af_rds.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 896834cd3b9a..a2f28a6d4dc5 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -438,6 +438,14 @@ static const struct proto_ops rds_proto_ops = { .sendpage = sock_no_sendpage, }; +static void rds_sock_destruct(struct sock *sk) +{ + struct rds_sock *rs = rds_sk_to_rs(sk); + + WARN_ON((&rs->rs_item != rs->rs_item.next || + &rs->rs_item != rs->rs_item.prev)); +} + static int __rds_create(struct socket *sock, struct sock *sk, int protocol) { struct rds_sock *rs; @@ -445,6 +453,7 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) sock_init_data(sock, sk); sock->ops = &rds_proto_ops; sk->sk_protocol = protocol; + sk->sk_destruct = rds_sock_destruct; rs = rds_sk_to_rs(sk); spin_lock_init(&rs->rs_lock); -- cgit v1.2.3 From 4f73113c63bae5cd092c93c2a9835b9d3c7f0b54 Mon Sep 17 00:00:00 2001 From: santosh.shilimkar@oracle.com Date: Sat, 22 Aug 2015 15:45:29 -0700 Subject: RDS: Mark message mapped before transmit rds_send_xmit() marks the rds message map flag after xmit_[rdma/atomic]() which is clearly wrong. We need to maintain the ownership between transport and rds. Also take care of error path. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/send.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index dd901c324edf..faad1be61ab6 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -282,26 +282,34 @@ restart: /* The transport either sends the whole rdma or none of it */ if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { rm->m_final_op = &rm->rdma; + /* The transport owns the mapped memory for now. + * You can't unmap it while it's on the send queue + */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); - if (ret) + if (ret) { + clear_bit(RDS_MSG_MAPPED, &rm->m_flags); + wake_up_interruptible(&rm->m_flush_wait); break; + } conn->c_xmit_rdma_sent = 1; - /* The transport owns the mapped memory for now. - * You can't unmap it while it's on the send queue */ - set_bit(RDS_MSG_MAPPED, &rm->m_flags); } if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { rm->m_final_op = &rm->atomic; + /* The transport owns the mapped memory for now. + * You can't unmap it while it's on the send queue + */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); - if (ret) + if (ret) { + clear_bit(RDS_MSG_MAPPED, &rm->m_flags); + wake_up_interruptible(&rm->m_flush_wait); break; + } conn->c_xmit_atomic_sent = 1; - /* The transport owns the mapped memory for now. - * You can't unmap it while it's on the send queue */ - set_bit(RDS_MSG_MAPPED, &rm->m_flags); } /* -- cgit v1.2.3 From 3049147ca7c8426268433d16d7546a51dede316d Mon Sep 17 00:00:00 2001 From: santosh.shilimkar@oracle.com Date: Sat, 22 Aug 2015 15:45:30 -0700 Subject: RDS: Make sure we do a signaled send for large-send WR(Work Requests )always generate a WC(Work Completion) with signaled send. Default RDS ib code is setup for un-signaled completion. Since RDS connction is persistent, we can end up sending the data even after large-send when the remote end is not active(for any reason). By doing a signaled send at least once per large-send, we can at least detect the problem in work completion handler there by avoiding sending more data to inactive remote. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_send.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 5d0a704fa039..c576ebeb4115 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -709,6 +709,11 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, if (scat == &rm->data.op_sg[rm->data.op_count]) { prev->s_op = ic->i_data_op; prev->s_wr.send_flags |= IB_SEND_SOLICITED; + if (!(prev->s_wr.send_flags & IB_SEND_SIGNALED)) { + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + prev->s_wr.send_flags |= IB_SEND_SIGNALED; + nr_sig++; + } ic->i_data_op = NULL; } -- cgit v1.2.3 From 5c240fa2ab394af1dbde280e00cc038cbc7f0409 Mon Sep 17 00:00:00 2001 From: santosh.shilimkar@oracle.com Date: Sat, 22 Aug 2015 15:45:31 -0700 Subject: RDS: Fix assertion level from fatal to warning Fix the asserion level since its not fatal and can be hit in normal execution paths. There is no need to take the system down. We keep the WARN_ON() to detect the condition if we get here with bad pages. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 2 +- net/rds/rdma.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index e49c9568b4a5..7b7aac8cdb56 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -490,7 +490,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) /* FIXME we need a way to tell a r/w MR * from a r/o MR */ - BUG_ON(irqs_disabled()); + WARN_ON(!page->mapping && irqs_disabled()); set_page_dirty(page); put_page(page); } diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 6401b501a215..c1df9b1cf3b2 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -451,7 +451,7 @@ void rds_rdma_free_op(struct rm_rdma_op *ro) * is the case for a RDMA_READ which copies from remote * to local memory */ if (!ro->op_write) { - BUG_ON(irqs_disabled()); + WARN_ON(!page->mapping && irqs_disabled()); set_page_dirty(page); } put_page(page); -- cgit v1.2.3 From 1c3be624f40acd6f2bd0f22ade081ac2467e8617 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Sat, 22 Aug 2015 15:45:32 -0700 Subject: RDS: Don't destroy the rdma id until after we're done using it During connection resets, we are destroying the rdma id too soon. We can't destroy it when it is still in use. So lets move rdma_destroy_id() after we clear the rings. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_cm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 04243dd1c2ea..d150bb4aa3cb 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -647,7 +647,6 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) ib_destroy_cq(ic->i_send_cq); if (ic->i_recv_cq) ib_destroy_cq(ic->i_recv_cq); - rdma_destroy_id(ic->i_cm_id); /* then free the resources that ib callbacks use */ if (ic->i_send_hdrs) @@ -673,6 +672,8 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) if (ic->i_recvs) rds_ib_recv_clear_ring(ic); + rdma_destroy_id(ic->i_cm_id); + /* * Move connection back to the nodev list. */ -- cgit v1.2.3 From dfcec251d22bab947d10cf37e9ad67085cf7f097 Mon Sep 17 00:00:00 2001 From: santosh.shilimkar@oracle.com Date: Sat, 22 Aug 2015 15:45:33 -0700 Subject: RDS: make sure rds_send_drop_to properly takes the m_rs_lock rds_send_drop_to() is used during socket tear down to find all the messages on the socket and flush them . It can race with the acking code unless it takes the m_rs_lock on each and every message. This plugs a hole where we didn't take m_rs_lock on any message that didn't have the RDS_MSG_ON_CONN set. Taking m_rs_lock avoids double frees and other memory corruptions as the ack code trusts the message m_rs pointer on a socket that had actually been freed. We must take m_rs_lock to access m_rs. Because of lock nesting and rs access, we also need to acquire rs_lock. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/send.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index faad1be61ab6..b40c2ea8e89a 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -778,8 +778,22 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) while (!list_empty(&list)) { rm = list_entry(list.next, struct rds_message, m_sock_item); list_del_init(&rm->m_sock_item); - rds_message_wait(rm); + + /* just in case the code above skipped this message + * because RDS_MSG_ON_CONN wasn't set, run it again here + * taking m_rs_lock is the only thing that keeps us + * from racing with ack processing. + */ + spin_lock_irqsave(&rm->m_rs_lock, flags); + + spin_lock(&rs->rs_lock); + __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); + spin_unlock(&rs->rs_lock); + + rm->m_rs = NULL; + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + rds_message_put(rm); } } -- cgit v1.2.3 From 06e8941e22f5cbaa4051f06d7ac99e3f302f6d48 Mon Sep 17 00:00:00 2001 From: Mukesh Kacker Date: Sat, 22 Aug 2015 15:45:34 -0700 Subject: RDS: return EMSGSIZE for oversize requests before processing/queueing rds_send_queue_rm() allows for the "current datagram" being queued to exceed SO_SNDBUF thresholds by checking bytes queued without counting in length of current datagram. (Since sk_sndbuf is set to twice requested SO_SNDBUF value as a kernel heuristic this is usually fine!) If this "current datagram" squeezing past the threshold is itself many times the size of the sk_sndbuf threshold itself then even twice the SO_SNDBUF does not save us and it gets queued but cannot be transmitted. Threads block and deadlock and device becomes unusable. The check for this datagram not exceeding SNDBUF thresholds (EMSGSIZE) is not done on this datagram as that check is only done if queueing attempt fails. (Datagrams that follow this datagram fail queueing attempts, go through the check and eventually trip EMSGSIZE error but zero length datagrams silently fail!) This fix moves the check for datagrams exceeding SNDBUF limits before any processing or queueing is attempted and returns EMSGSIZE early in the rds_sndmsg() code. This change also ensures that all datagrams get checked for exceeding SNDBUF/sk_sndbuf size limits and the large datagrams that exceed those limits do not get to rds_send_queue_rm() code for processing. Signed-off-by: Mukesh Kacker Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/send.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index b40c2ea8e89a..4df61a515b83 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1015,6 +1015,11 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) goto out; } + if (payload_len > rds_sk_sndbuf(rs)) { + ret = -EMSGSIZE; + goto out; + } + /* size of rm including all sgs */ ret = rds_rm_size(msg, payload_len); if (ret < 0) @@ -1087,11 +1092,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port, dport, &queued)) { rds_stats_inc(s_send_queue_full); - /* XXX make sure this is reasonable */ - if (payload_len > rds_sk_sndbuf(rs)) { - ret = -EMSGSIZE; - goto out; - } + if (nonblock) { ret = -EAGAIN; goto out; -- cgit v1.2.3 From ae05368afa7ea27fd906477874ed3ebc7df7e53c Mon Sep 17 00:00:00 2001 From: santosh.shilimkar@oracle.com Date: Sat, 22 Aug 2015 15:45:35 -0700 Subject: RDS: check for valid cm_id before initiating connection Connection could have been dropped while the route is being resolved so check for valid cm_id before initiating the connection. Reviewed-by: Ajaykumar Hotchandani Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/rdma_transport.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 208240836043..b9b40af5345b 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -34,6 +34,7 @@ #include #include "rdma_transport.h" +#include "ib.h" static struct rdma_cm_id *rds_rdma_listen_id; @@ -82,8 +83,18 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, break; case RDMA_CM_EVENT_ROUTE_RESOLVED: - /* XXX worry about racing with listen acceptance */ - ret = trans->cm_initiate_connect(cm_id); + /* Connection could have been dropped so make sure the + * cm_id is valid before proceeding + */ + if (conn) { + struct rds_ib_connection *ibic; + + ibic = conn->c_transport_data; + if (ibic && ibic->i_cm_id == cm_id) + ret = trans->cm_initiate_connect(cm_id); + else + rds_conn_drop(conn); + } break; case RDMA_CM_EVENT_ESTABLISHED: -- cgit v1.2.3 From 25105051fde444cac11a2e2599350e5388936428 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 23 Aug 2015 02:11:21 +0200 Subject: ah6: fix error return code Return a negative error code on failure. A simplified version of the semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @@ identifier ret; expression e1,e2; @@ ( if (\(ret < 0\|ret != 0\)) { ... return ret; } | ret = 0 ) ... when != ret = e1 when != &ret *if(...) { ... when != ret = e2 when forall return ret; } // Signed-off-by: Julia Lawall Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv6/ah6.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index ed7d4e3f9c10..0630a4d5daaa 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -577,8 +577,10 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) work_iph = ah_alloc_tmp(ahash, nfrags + sglists, hdr_len + ahp->icv_trunc_len + seqhi_len); - if (!work_iph) + if (!work_iph) { + err = -ENOMEM; goto out; + } auth_data = ah_tmp_auth((u8 *)work_iph, hdr_len); seqhi = (__be32 *)(auth_data + ahp->icv_trunc_len); -- cgit v1.2.3 From 94c10f0ea303473884f69edd6bbb0cedcf570105 Mon Sep 17 00:00:00 2001 From: David S. Miller Date: Tue, 25 Aug 2015 13:38:50 -0700 Subject: ah4: Fix error return in ah_input(). Noticed by Herbert Xu. Signed-off-by: David S. Miller --- net/ipv4/ah4.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index ac9a32ec3ee4..f2a71025a770 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -360,8 +360,10 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + ahp->icv_trunc_len + seqhi_len); - if (!work_iph) + if (!work_iph) { + err = -ENOMEM; goto out; + } seqhi = (__be32 *)((char *)work_iph + ihl); auth_data = ah_tmp_auth(seqhi, seqhi_len); -- cgit v1.2.3 From d7499160107dd1367cf34873564b522a5516430c Mon Sep 17 00:00:00 2001 From: Masanari Iida Date: Mon, 24 Aug 2015 22:56:54 +0900 Subject: net-next: Fix warning while make xmldocs caused by skbuff.c This patch fix following warnings. .//net/core/skbuff.c:407: warning: No description found for parameter 'len' .//net/core/skbuff.c:407: warning: Excess function parameter 'length' description in '__netdev_alloc_skb' .//net/core/skbuff.c:476: warning: No description found for parameter 'len' .//net/core/skbuff.c:476: warning: Excess function parameter 'length' description in '__napi_alloc_skb' Signed-off-by: Masanari Iida Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index bf9a5d93c2d1..8a725cc50a90 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -392,7 +392,7 @@ EXPORT_SYMBOL(napi_alloc_frag); /** * __netdev_alloc_skb - allocate an skbuff for rx on a specific device * @dev: network device to receive on - * @length: length to allocate + * @len: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb * * Allocate a new &sk_buff and assign it a usage count of one. The @@ -461,7 +461,7 @@ EXPORT_SYMBOL(__netdev_alloc_skb); /** * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance * @napi: napi instance this buffer was allocated for - * @length: length to allocate + * @len: length to allocate * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages * * Allocate a new sk_buff for use in NAPI receive. This buffer will -- cgit v1.2.3 From e252b3d1a1744af1431aca30e091420734c2b012 Mon Sep 17 00:00:00 2001 From: WANG Cong Date: Tue, 25 Aug 2015 10:38:53 -0700 Subject: route: fix a use-after-free This patch fixes the following crash: general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.2.0-rc7+ #166 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff88010656d280 ti: ffff880106570000 task.ti: ffff880106570000 RIP: 0010:[] [] dst_destroy+0xa6/0xef RSP: 0018:ffff880107603e38 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffff8800d225a000 RCX: ffffffff82250fd0 RDX: 0000000000000001 RSI: ffffffff82250fd0 RDI: 6b6b6b6b6b6b6b6b RBP: ffff880107603e58 R08: 0000000000000001 R09: 0000000000000001 R10: 000000000000b530 R11: ffff880107609000 R12: 0000000000000000 R13: ffffffff82343c40 R14: 0000000000000000 R15: ffffffff8182fb4f FS: 0000000000000000(0000) GS:ffff880107600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007fcabd9d3000 CR3: 00000000d7279000 CR4: 00000000000006e0 Stack: ffffffff82250fd0 ffff8801077d6f00 ffffffff82253c40 ffff8800d225a000 ffff880107603e68 ffffffff8182fb5d ffff880107603f08 ffffffff810d795e ffffffff810d7648 ffff880106574000 ffff88010656d280 ffff88010656d280 Call Trace: [] dst_destroy_rcu+0xe/0x1d [] rcu_process_callbacks+0x618/0x7eb [] ? rcu_process_callbacks+0x302/0x7eb [] ? dst_gc_task+0x1eb/0x1eb [] __do_softirq+0x178/0x39f [] irq_exit+0x41/0x95 [] smp_apic_timer_interrupt+0x34/0x40 [] apic_timer_interrupt+0x6d/0x80 [] ? default_idle+0x21/0x32 [] ? default_idle+0x1f/0x32 [] arch_cpu_idle+0xf/0x11 [] default_idle_call+0x1f/0x21 [] cpu_startup_entry+0x1ad/0x273 [] start_secondary+0x135/0x156 dst is freed right before lwtstate_put(), this is not correct... Fixes: 61adedf3e3f1 ("route: move lwtunnel state to dst_entry") Acked-by: Jiri Benc Signed-off-by: Cong Wang Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/dst.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dst.c b/net/core/dst.c index 50dcdbb0ee46..477035ed7903 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -262,11 +262,12 @@ again: if (dst->dev) dev_put(dst->dev); + lwtstate_put(dst->lwtstate); + if (dst->flags & DST_METADATA) kfree(dst); else kmem_cache_free(dst->ops->kmem_cachep, dst); - lwtstate_put(dst->lwtstate); dst = child; if (dst) { -- cgit v1.2.3 From b01d04aa511b78c795dd258bf8689a421a6b4d0c Mon Sep 17 00:00:00 2001 From: David S. Miller Date: Tue, 25 Aug 2015 15:54:25 -0700 Subject: rds: Fix improper gfp_t usage. >> net/rds/ib_recv.c:382:28: sparse: incorrect type in initializer (different base types) net/rds/ib_recv.c:382:28: expected int [signed] can_wait net/rds/ib_recv.c:382:28: got restricted gfp_t net/rds/ib_recv.c:828:23: sparse: cast to restricted __le64 Reported-by: kbuild test robot Signed-off-by: David S. Miller --- net/rds/ib_recv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 3afdcbdd06b4..ed9b41e3b277 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -379,7 +379,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) struct ib_recv_wr *failed_wr; unsigned int posted = 0; int ret = 0; - int can_wait = gfp & __GFP_WAIT; + bool can_wait = !!(gfp & __GFP_WAIT); u32 pos; /* the goal here is to just make sure that someone, somewhere -- cgit v1.2.3 From ba54d3ced958435e1802daf992cfd44c26cd4cb7 Mon Sep 17 00:00:00 2001 From: santosh.shilimkar@oracle.com Date: Tue, 25 Aug 2015 12:01:58 -0700 Subject: RDS: fix the dangling reference to rds_ib_incoming_slab On rds_ib_frag_slab allocation failure, ensure rds_ib_incoming_slab is not pointing to the detsroyed memory. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_recv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index ed9b41e3b277..6bbe62060060 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1102,9 +1102,10 @@ int rds_ib_recv_init(void) rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", sizeof(struct rds_page_frag), 0, SLAB_HWCACHE_ALIGN, NULL); - if (!rds_ib_frag_slab) + if (!rds_ib_frag_slab) { kmem_cache_destroy(rds_ib_incoming_slab); - else + rds_ib_incoming_slab = NULL; + } else ret = 0; out: return ret; -- cgit v1.2.3 From 3f6b3143031b678a8577df1f24ca977510aefcf5 Mon Sep 17 00:00:00 2001 From: santosh.shilimkar@oracle.com Date: Tue, 25 Aug 2015 12:01:59 -0700 Subject: RDS: Fix rds MR reference count in rds_rdma_unuse() rds_rdma_unuse() drops the mr reference count which it hasn't taken. Correct way of removing mr is to remove mr from the tree and then rdma_destroy_mr() it first, then rds_mr_put() to decrement its reference count. Whichever thread holds last reference will free the mr via rds_mr_put() This bug was triggering weird null pointer crashes. One if the trace for it is captured below. BUG: unable to handle kernel NULL pointer dereference at 0000000000000104 IP: [] rds_ib_free_mr+0x31/0x130 [rds_rdma] PGD 4366fa067 PUD 4366f9067 PMD 0 Oops: 0000 [#1] SMP [...] task: ffff88046da6a000 ti: ffff88046da6c000 task.ti: ffff88046da6c000 RIP: 0010:[] [] rds_ib_free_mr+0x31/0x130 [rds_rdma] RSP: 0018:ffff88046fa43bd8 EFLAGS: 00010286 RAX: 0000000071d38b80 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff880079e7ff40 RBP: ffff88046fa43bf8 R08: 0000000000000000 R09: 0000000000000000 R10: ffff88046fa43ca8 R11: ffff88046a802ed8 R12: ffff880079e7fa40 R13: 0000000000000000 R14: ffff880079e7ff40 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff88046fa40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000104 CR3: 00000004366fb000 CR4: 00000000000006e0 Stack: ffff880079e7fa40 ffff880671d38f08 ffff880079e7ff40 0000000000000296 ffff88046fa43c28 ffffffffa087a38b ffff880079e7fa40 ffff880671d38f10 0000000000000000 0000000000000292 ffff88046fa43c48 ffffffffa087a3b6 Call Trace: [] rds_destroy_mr+0x8b/0xa0 [rds] [] __rds_put_mr_final+0x16/0x30 [rds] [] rds_rdma_unuse+0xc2/0x120 [rds] [] rds_recv_incoming_exthdrs+0x83/0xa0 [rds] [] rds_recv_incoming+0x92/0x200 [rds] [] rds_ib_process_recv+0x259/0x320 [rds_rdma] [] rds_ib_recv_tasklet_fn+0x1a8/0x490 [rds_rdma] [] ? __remove_hrtimer+0x58/0x90 [] tasklet_action+0xb1/0xc0 [] __do_softirq+0xe2/0x290 [] irq_exit+0xa6/0xb0 [] do_IRQ+0x65/0xf0 [] common_interrupt+0x6b/0x6b Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/rdma.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/rdma.c b/net/rds/rdma.c index c1df9b1cf3b2..4c93badeabf2 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -435,9 +435,10 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) /* If the MR was marked as invalidate, this will * trigger an async flush. */ - if (zot_me) + if (zot_me) { rds_destroy_mr(mr); - rds_mr_put(mr); + rds_mr_put(mr); + } } void rds_rdma_free_op(struct rm_rdma_op *ro) -- cgit v1.2.3 From 6116c2030fff91950f68b7fffb5959c91a05aaf6 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Tue, 25 Aug 2015 12:02:00 -0700 Subject: RDS: fix fmr pool dirty_count In rds_ib_flush_mr_pool(), dirty_count accounts the clean ones which is wrong. This can lead to a negative dirty count value. Lets fix it. Signed-off-by: Wengang Wang Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 7b7aac8cdb56..a275b7d205ef 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -528,11 +528,13 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr /* * given an llist of mrs, put them all into the list_head for more processing */ -static void llist_append_to_list(struct llist_head *llist, struct list_head *list) +static unsigned int llist_append_to_list(struct llist_head *llist, + struct list_head *list) { struct rds_ib_mr *ibmr; struct llist_node *node; struct llist_node *next; + unsigned int count = 0; node = llist_del_all(llist); while (node) { @@ -540,7 +542,9 @@ static void llist_append_to_list(struct llist_head *llist, struct list_head *lis ibmr = llist_entry(node, struct rds_ib_mr, llnode); list_add_tail(&ibmr->unmap_list, list); node = next; + count++; } + return count; } /* @@ -581,7 +585,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, LIST_HEAD(unmap_list); LIST_HEAD(fmr_list); unsigned long unpinned = 0; - unsigned int nfreed = 0, ncleaned = 0, free_goal; + unsigned int nfreed = 0, dirty_to_clean = 0, free_goal; int ret = 0; rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); @@ -623,8 +627,8 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, /* Get the list of all MRs to be dropped. Ordering matters - * we want to put drop_list ahead of free_list. */ - llist_append_to_list(&pool->drop_list, &unmap_list); - llist_append_to_list(&pool->free_list, &unmap_list); + dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list); + dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list); if (free_all) llist_append_to_list(&pool->clean_list, &unmap_list); @@ -652,7 +656,6 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, kfree(ibmr); nfreed++; } - ncleaned++; } if (!list_empty(&unmap_list)) { @@ -678,7 +681,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, } atomic_sub(unpinned, &pool->free_pinned); - atomic_sub(ncleaned, &pool->dirty_count); + atomic_sub(dirty_to_clean, &pool->dirty_count); atomic_sub(nfreed, &pool->item_count); out: -- cgit v1.2.3 From ad1d7dc0d79d3dd2c5d2931b13edbd4fe33e5fac Mon Sep 17 00:00:00 2001 From: santosh.shilimkar@oracle.com Date: Tue, 25 Aug 2015 12:02:01 -0700 Subject: RDS: push FMR pool flush work to its own worker RDS FMR flush operation and also it races with connect/reconect which happes a lot with RDS. FMR flush being on common rds_wq aggrevates the problem. Lets push RDS FMR pool flush work to its own worker. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib.c | 9 ++++++++- net/rds/ib.h | 2 ++ net/rds/ib_rdma.c | 27 ++++++++++++++++++++++++--- 3 files changed, 34 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index 13814227b3b2..d020fade312c 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -366,6 +366,7 @@ void rds_ib_exit(void) rds_ib_sysctl_exit(); rds_ib_recv_exit(); rds_trans_unregister(&rds_ib_transport); + rds_ib_fmr_exit(); } struct rds_transport rds_ib_transport = { @@ -401,10 +402,14 @@ int rds_ib_init(void) INIT_LIST_HEAD(&rds_ib_devices); - ret = ib_register_client(&rds_ib_client); + ret = rds_ib_fmr_init(); if (ret) goto out; + ret = ib_register_client(&rds_ib_client); + if (ret) + goto out_fmr_exit; + ret = rds_ib_sysctl_init(); if (ret) goto out_ibreg; @@ -427,6 +432,8 @@ out_sysctl: rds_ib_sysctl_exit(); out_ibreg: rds_ib_unregister_client(); +out_fmr_exit: + rds_ib_fmr_exit(); out: return ret; } diff --git a/net/rds/ib.h b/net/rds/ib.h index 6422c52682e5..9fc95e38659a 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -313,6 +313,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, void rds_ib_sync_mr(void *trans_private, int dir); void rds_ib_free_mr(void *trans_private, int invalidate); void rds_ib_flush_mrs(void); +int rds_ib_fmr_init(void); +void rds_ib_fmr_exit(void); /* ib_recv.c */ int rds_ib_recv_init(void); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index a275b7d205ef..2ac78c9879ea 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -83,6 +83,25 @@ struct rds_ib_mr_pool { struct ib_fmr_attr fmr_attr; }; +struct workqueue_struct *rds_ib_fmr_wq; + +int rds_ib_fmr_init(void) +{ + rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd"); + if (!rds_ib_fmr_wq) + return -ENOMEM; + return 0; +} + +/* By the time this is called all the IB devices should have been torn down and + * had their pools freed. As each pool is freed its work struct is waited on, + * so the pool flushing work queue should be idle by the time we get here. + */ +void rds_ib_fmr_exit(void) +{ + destroy_workqueue(rds_ib_fmr_wq); +} + static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **); static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); static void rds_ib_mr_pool_flush_worker(struct work_struct *work); @@ -719,15 +738,17 @@ void rds_ib_free_mr(void *trans_private, int invalidate) /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || atomic_read(&pool->dirty_count) >= pool->max_items / 10) - schedule_delayed_work(&pool->flush_worker, 10); + queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); if (invalidate) { if (likely(!in_interrupt())) { rds_ib_flush_mr_pool(pool, 0, NULL); } else { /* We get here if the user created a MR marked - * as use_once and invalidate at the same time. */ - schedule_delayed_work(&pool->flush_worker, 10); + * as use_once and invalidate at the same time. + */ + queue_delayed_work(rds_ib_fmr_wq, + &pool->flush_worker, 10); } } -- cgit v1.2.3 From ef5217a6e2e60bc3d0679f2652480b99730956fe Mon Sep 17 00:00:00 2001 From: santosh.shilimkar@oracle.com Date: Tue, 25 Aug 2015 12:02:02 -0700 Subject: RDS: flush the FMR pool less often FMR flush is an expensive and time consuming operation. Reduce the frequency of FMR pool flush by 50% so that more FMR work gets accumulated for more efficient flushing. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 2ac78c9879ea..e596dfb76038 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -737,7 +737,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate) /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || - atomic_read(&pool->dirty_count) >= pool->max_items / 10) + atomic_read(&pool->dirty_count) >= pool->max_items / 5) queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); if (invalidate) { -- cgit v1.2.3 From 272412141908c40517cc89d5bb2eb074a2ec1474 Mon Sep 17 00:00:00 2001 From: santosh.shilimkar@oracle.com Date: Tue, 25 Aug 2015 12:02:03 -0700 Subject: RDS: remove superfluous from rds_ib_alloc_fmr() Memory allocated for 'ibmr' uses kzalloc_node() which already initialises the memory to zero. There is no need to do memset() 0 on that memory. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index e596dfb76038..251d1ce0b7c7 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -360,8 +360,6 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) goto out_no_cigar; } - memset(ibmr, 0, sizeof(*ibmr)); - ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | -- cgit v1.2.3 From 3c645621b79828be7a46fb2694eb423b343b4bbe Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 25 Aug 2015 20:06:31 -0700 Subject: net_sched: make tcf_hash_destroy() static tcf_hash_destroy() used once. Make it static. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/act_api.h | 1 - net/sched/act_api.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 4519c81304bd..9d446f136607 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -111,7 +111,6 @@ struct tc_action_ops { }; int tcf_hash_search(struct tc_action *a, u32 index); -void tcf_hash_destroy(struct tc_action *a); u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo); int tcf_hash_check(u32 index, struct tc_action *a, int bind); int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a, diff --git a/net/sched/act_api.c b/net/sched/act_api.c index b087087ccfa9..06e7c4a37245 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -36,7 +36,7 @@ static void free_tcf(struct rcu_head *head) kfree(p); } -void tcf_hash_destroy(struct tc_action *a) +static void tcf_hash_destroy(struct tc_action *a) { struct tcf_common *p = a->priv; struct tcf_hashinfo *hinfo = a->ops->hinfo; @@ -52,7 +52,6 @@ void tcf_hash_destroy(struct tc_action *a) */ call_rcu(&p->tcfc_rcu, free_tcf); } -EXPORT_SYMBOL(tcf_hash_destroy); int __tcf_hash_release(struct tc_action *a, bool bind, bool strict) { -- cgit v1.2.3 From faa54be4c78da6aa2148c539659867afdecd3e8d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 25 Aug 2015 20:06:32 -0700 Subject: net_sched: act_bpf: remove unnecessary copy Fix harmless typo and avoid unnecessary copy of empty 'prog' into unused 'strcut tcf_bpf_cfg old'. Fixes: f4eaed28c783 ("act_bpf: fix memory leaks when replacing bpf programs") Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/sched/act_bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 1b97dabc621a..458cf647e698 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -327,7 +327,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, prog = to_bpf(act); spin_lock_bh(&prog->tcf_lock); - if (ret != ACT_P_CREATED) + if (res != ACT_P_CREATED) tcf_bpf_prog_fill_cfg(prog, &old); prog->bpf_ops = cfg.bpf_ops; -- cgit v1.2.3 From ed7aa879ce1a0ff6468c5a6d3eb4e31169b95c4a Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 25 Aug 2015 20:06:33 -0700 Subject: net_sched: convert tcindex to call tcf_exts_destroy from rcu callback Adjust destroy path of cls_tcindex to call tcf_exts_destroy() after rcu grace period. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/sched/cls_tcindex.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index a557dbaf5afe..944c8ff45055 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -27,6 +27,7 @@ struct tcindex_filter_result { struct tcf_exts exts; struct tcf_result res; + struct rcu_head rcu; }; struct tcindex_filter { @@ -133,8 +134,23 @@ static int tcindex_init(struct tcf_proto *tp) return 0; } -static int -tcindex_delete(struct tcf_proto *tp, unsigned long arg) +static void tcindex_destroy_rexts(struct rcu_head *head) +{ + struct tcindex_filter_result *r; + + r = container_of(head, struct tcindex_filter_result, rcu); + tcf_exts_destroy(&r->exts); +} + +static void tcindex_destroy_fexts(struct rcu_head *head) +{ + struct tcindex_filter *f = container_of(head, struct tcindex_filter, rcu); + + tcf_exts_destroy(&f->result.exts); + kfree(f); +} + +static int tcindex_delete(struct tcf_proto *tp, unsigned long arg) { struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; @@ -162,9 +178,14 @@ found: rcu_assign_pointer(*walk, rtnl_dereference(f->next)); } tcf_unbind_filter(tp, &r->res); - tcf_exts_destroy(&r->exts); + /* all classifiers are required to call tcf_exts_destroy() after rcu + * grace period, since converted-to-rcu actions are relying on that + * in cleanup() callback + */ if (f) - kfree_rcu(f, rcu); + call_rcu(&f->rcu, tcindex_destroy_fexts); + else + call_rcu(&r->rcu, tcindex_destroy_rexts); return 0; } -- cgit v1.2.3 From 9e528d89154b602209fda2fb92e89927bbffc976 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 25 Aug 2015 20:06:34 -0700 Subject: net_sched: convert rsvp to call tcf_exts_destroy from rcu callback Adjust destroy path of cls_rsvp to call tcf_exts_destroy() after rcu grace period. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/sched/cls_rsvp.h | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 02fa82792dab..f9c9fc075fe6 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -283,12 +283,22 @@ static int rsvp_init(struct tcf_proto *tp) return -ENOBUFS; } -static void -rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) +static void rsvp_delete_filter_rcu(struct rcu_head *head) { - tcf_unbind_filter(tp, &f->res); + struct rsvp_filter *f = container_of(head, struct rsvp_filter, rcu); + tcf_exts_destroy(&f->exts); - kfree_rcu(f, rcu); + kfree(f); +} + +static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) +{ + tcf_unbind_filter(tp, &f->res); + /* all classifiers are required to call tcf_exts_destroy() after rcu + * grace period, since converted-to-rcu actions are relying on that + * in cleanup() callback + */ + call_rcu(&f->rcu, rsvp_delete_filter_rcu); } static bool rsvp_destroy(struct tcf_proto *tp, bool force) -- cgit v1.2.3 From cff82457c5584f6a96d2b85d1a88b81ba304a330 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 25 Aug 2015 20:06:35 -0700 Subject: net_sched: act_bpf: remove spinlock in fast path Similar to act_gact/act_mirred, act_bpf can be lockless in packet processing with extra care taken to free bpf programs after rcu grace period. Replacement of existing act_bpf (very rare) is done with synchronize_rcu() and final destruction is done from tc_action_ops->cleanup() callback that is called from tcf_exts_destroy()->tcf_action_destroy()->__tcf_hash_release() when bind and refcnt reach zero which is only possible when classifier is destroyed. Previous two patches fixed the last two classifiers (tcindex and rsvp) to call tcf_exts_destroy() from rcu callback. Similar to gact/mirred there is a race between prog->filter and prog->tcf_action. Meaning that the program being replaced may use previous default action if it happened to return TC_ACT_UNSPEC. act_mirred race betwen tcf_action and tcfm_dev is similar. In all cases the race is harmless. Long term we may want to improve the situation by replacing the whole tc_action->priv as single pointer instead of updating inner fields one by one. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/net/tc_act/tc_bpf.h | 2 +- net/sched/act_bpf.c | 36 +++++++++++++++++++----------------- 2 files changed, 20 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/include/net/tc_act/tc_bpf.h b/include/net/tc_act/tc_bpf.h index a152e9858b2c..958d69cfb19c 100644 --- a/include/net/tc_act/tc_bpf.h +++ b/include/net/tc_act/tc_bpf.h @@ -15,7 +15,7 @@ struct tcf_bpf { struct tcf_common common; - struct bpf_prog *filter; + struct bpf_prog __rcu *filter; union { u32 bpf_fd; u16 bpf_num_ops; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 458cf647e698..559bfa011bda 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -37,25 +37,24 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, struct tcf_result *res) { struct tcf_bpf *prog = act->priv; + struct bpf_prog *filter; int action, filter_res; bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS; if (unlikely(!skb_mac_header_was_set(skb))) return TC_ACT_UNSPEC; - spin_lock(&prog->tcf_lock); - - prog->tcf_tm.lastuse = jiffies; - bstats_update(&prog->tcf_bstats, skb); + tcf_lastuse_update(&prog->tcf_tm); + bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb); - /* Needed here for accessing maps. */ rcu_read_lock(); + filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); - filter_res = BPF_PROG_RUN(prog->filter, skb); + filter_res = BPF_PROG_RUN(filter, skb); __skb_pull(skb, skb->mac_len); } else { - filter_res = BPF_PROG_RUN(prog->filter, skb); + filter_res = BPF_PROG_RUN(filter, skb); } rcu_read_unlock(); @@ -77,7 +76,7 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, break; case TC_ACT_SHOT: action = filter_res; - prog->tcf_qstats.drops++; + qstats_drop_inc(this_cpu_ptr(prog->common.cpu_qstats)); break; case TC_ACT_UNSPEC: action = prog->tcf_action; @@ -87,7 +86,6 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act, break; } - spin_unlock(&prog->tcf_lock); return action; } @@ -263,7 +261,10 @@ static void tcf_bpf_prog_fill_cfg(const struct tcf_bpf *prog, struct tcf_bpf_cfg *cfg) { cfg->is_ebpf = tcf_bpf_is_ebpf(prog); - cfg->filter = prog->filter; + /* updates to prog->filter are prevented, since it's called either + * with rtnl lock or during final cleanup in rcu callback + */ + cfg->filter = rcu_dereference_protected(prog->filter, 1); cfg->bpf_ops = prog->bpf_ops; cfg->bpf_name = prog->bpf_name; @@ -294,7 +295,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (!tcf_hash_check(parm->index, act, bind)) { ret = tcf_hash_create(parm->index, est, act, - sizeof(*prog), bind, false); + sizeof(*prog), bind, true); if (ret < 0) return ret; @@ -325,7 +326,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, goto out; prog = to_bpf(act); - spin_lock_bh(&prog->tcf_lock); + ASSERT_RTNL(); if (res != ACT_P_CREATED) tcf_bpf_prog_fill_cfg(prog, &old); @@ -339,14 +340,15 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, prog->bpf_fd = cfg.bpf_fd; prog->tcf_action = parm->action; - prog->filter = cfg.filter; - - spin_unlock_bh(&prog->tcf_lock); + rcu_assign_pointer(prog->filter, cfg.filter); - if (res == ACT_P_CREATED) + if (res == ACT_P_CREATED) { tcf_hash_insert(act); - else + } else { + /* make sure the program being replaced is no longer executing */ + synchronize_rcu(); tcf_bpf_cfg_cleanup(&old); + } return res; out: -- cgit v1.2.3 From 1afe839e6b31a85fc53adbf8757d6373908d414d Mon Sep 17 00:00:00 2001 From: Andreas Herz Date: Fri, 21 Aug 2015 11:31:32 +0200 Subject: netfilter: ip6t_REJECT: added missing icmpv6 codes RFC 4443 added two new codes values for ICMPv6 type 1: 5 - Source address failed ingress/egress policy 6 - Reject route to destination And RFC 7084 states in L-14 that IPv6 Router MUST send ICMPv6 Destination Unreachable with code 5 for packets forwarded to it that use an address from a prefix that has been invalidated. Codes 5 and 6 are more informative subsets of code 1. Signed-off-by: Andreas Herz Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter_ipv6/ip6t_REJECT.h | 4 +++- net/ipv6/netfilter/ip6t_REJECT.c | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/uapi/linux/netfilter_ipv6/ip6t_REJECT.h b/include/uapi/linux/netfilter_ipv6/ip6t_REJECT.h index 205ed62e4605..cd2e940c8bf5 100644 --- a/include/uapi/linux/netfilter_ipv6/ip6t_REJECT.h +++ b/include/uapi/linux/netfilter_ipv6/ip6t_REJECT.h @@ -10,7 +10,9 @@ enum ip6t_reject_with { IP6T_ICMP6_ADDR_UNREACH, IP6T_ICMP6_PORT_UNREACH, IP6T_ICMP6_ECHOREPLY, - IP6T_TCP_RESET + IP6T_TCP_RESET, + IP6T_ICMP6_POLICY_FAIL, + IP6T_ICMP6_REJECT_ROUTE }; struct ip6t_reject_info { diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 567367a75172..0ed841a3fa33 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -63,6 +63,12 @@ reject_tg6(struct sk_buff *skb, const struct xt_action_param *par) case IP6T_TCP_RESET: nf_send_reset6(net, skb, par->hooknum); break; + case IP6T_ICMP6_POLICY_FAIL: + nf_send_unreach6(net, skb, ICMPV6_POLICY_FAIL, par->hooknum); + break; + case IP6T_ICMP6_REJECT_ROUTE: + nf_send_unreach6(net, skb, ICMPV6_REJECT_ROUTE, par->hooknum); + break; } return NF_DROP; -- cgit v1.2.3 From 1dd34b5ad8aebaff17b625fc0126e18243008a3f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 26 Aug 2015 15:57:38 -0700 Subject: bpf: fix bpf_skb_set_tunnel_key() helper Make sure to indicate to tunnel driver that key.tun_id is set, otherwise gre won't recognize the metadata. Fixes: d3aa45ce6b94 ("bpf: add helpers to access tunnel metadata") Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index b4adc961413f..66500d490995 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1528,6 +1528,7 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) info = &md->u.tun_info; info->mode = IP_TUNNEL_INFO_TX; + info->key.tun_flags = TUNNEL_KEY; info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); -- cgit v1.2.3 From f729dc70dafec281e524f51ae496a72ea4f8e319 Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Sun, 26 Jul 2015 04:37:15 +0800 Subject: batman-adv: move hardif refcount inc to batadv_neigh_node_new() The batadv_neigh_node cleanup function 'batadv_neigh_node_free_rcu()' takes care of reducing the hardif refcounter, hence it's only logical to assume the creating function of that same object 'batadv_neigh_node_new()' takes care of increasing the same refcounter. Signed-off-by: Marek Lindner Acked-by: Simon Wunderlich Signed-off-by: Antonio Quartulli --- net/batman-adv/bat_iv_ogm.c | 6 ------ net/batman-adv/originator.c | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 5c122000688f..b9b8b333b363 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -303,12 +303,6 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, if (!neigh_node) goto out; - if (!atomic_inc_not_zero(&hard_iface->refcount)) { - kfree(neigh_node); - neigh_node = NULL; - goto out; - } - neigh_node->orig_node = orig_neigh; neigh_node->if_incoming = hard_iface; diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 610620aa8d26..f8317c1db427 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -461,6 +461,12 @@ batadv_neigh_node_new(struct batadv_hard_iface *hard_iface, if (!neigh_node) goto out; + if (!atomic_inc_not_zero(&hard_iface->refcount)) { + kfree(neigh_node); + neigh_node = NULL; + goto out; + } + INIT_HLIST_NODE(&neigh_node->list); INIT_HLIST_HEAD(&neigh_node->ifinfo_list); spin_lock_init(&neigh_node->ifinfo_lock); -- cgit v1.2.3 From 39bf7618f038474a0ccbeb0be173f11e147bd083 Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Sun, 26 Jul 2015 04:37:47 +0800 Subject: batman-adv: remove redundant hard_iface assignment The batadv_neigh_node_new() function already sets the hard_iface pointer. Signed-off-by: Marek Lindner Acked-by: Simon Wunderlich Signed-off-by: Antonio Quartulli --- net/batman-adv/bat_iv_ogm.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index b9b8b333b363..b18184e0c497 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -304,7 +304,6 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, goto out; neigh_node->orig_node = orig_neigh; - neigh_node->if_incoming = hard_iface; spin_lock_bh(&orig_node->neigh_list_lock); tmp_neigh_node = batadv_neigh_node_get(orig_node, hard_iface, -- cgit v1.2.3 From 741aa06bfb0ab731086d258a1838152fe2502b5f Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Sun, 26 Jul 2015 04:57:43 +0800 Subject: batman-adv: move neigh_node list add into batadv_neigh_node_new() All batadv_neigh_node_* functions expect the neigh_node list item to be part of the orig_node->neigh_list, therefore the constructor of said list item should be adding the newly created neigh_node to the respective list. Signed-off-by: Marek Lindner Acked-by: Simon Wunderlich Signed-off-by: Antonio Quartulli --- net/batman-adv/bat_iv_ogm.c | 21 +-------------------- net/batman-adv/originator.c | 12 ++++++++++++ 2 files changed, 13 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index b18184e0c497..5e93af4cb97f 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -296,8 +296,7 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, struct batadv_orig_node *orig_node, struct batadv_orig_node *orig_neigh) { - struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface); - struct batadv_neigh_node *neigh_node, *tmp_neigh_node; + struct batadv_neigh_node *neigh_node; neigh_node = batadv_neigh_node_new(hard_iface, neigh_addr, orig_node); if (!neigh_node) @@ -305,24 +304,6 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, neigh_node->orig_node = orig_neigh; - spin_lock_bh(&orig_node->neigh_list_lock); - tmp_neigh_node = batadv_neigh_node_get(orig_node, hard_iface, - neigh_addr); - if (!tmp_neigh_node) { - hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); - } else { - kfree(neigh_node); - batadv_hardif_free_ref(hard_iface); - neigh_node = tmp_neigh_node; - } - spin_unlock_bh(&orig_node->neigh_list_lock); - - if (!tmp_neigh_node) - batadv_dbg(BATADV_DBG_BATMAN, bat_priv, - "Creating new neighbor %pM for orig_node %pM on interface %s\n", - neigh_addr, orig_node->orig, - hard_iface->net_dev->name); - out: return neigh_node; } diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index f8317c1db427..f7517757d435 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -457,6 +457,10 @@ batadv_neigh_node_new(struct batadv_hard_iface *hard_iface, { struct batadv_neigh_node *neigh_node; + neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr); + if (neigh_node) + goto out; + neigh_node = kzalloc(sizeof(*neigh_node), GFP_ATOMIC); if (!neigh_node) goto out; @@ -478,6 +482,14 @@ batadv_neigh_node_new(struct batadv_hard_iface *hard_iface, /* extra reference for return */ atomic_set(&neigh_node->refcount, 2); + spin_lock_bh(&orig_node->neigh_list_lock); + hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list); + spin_unlock_bh(&orig_node->neigh_list_lock); + + batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv, + "Creating new neighbor %pM for orig_node %pM on interface %s\n", + neigh_addr, orig_node->orig, hard_iface->net_dev->name); + out: return neigh_node; } -- cgit v1.2.3 From bd3524c14bd02f94a4fa33e700883e01182f5ed5 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Mon, 3 Aug 2015 19:13:58 +0200 Subject: batman-adv: remove obsolete deleted attribute for gateway node With rcu, the gateway node deleted attribute is not needed anymore. In fact, it may delay the free of the gateway node and its referenced structures. Therefore remove it altogether and simplify purging as well. Signed-off-by: Simon Wunderlich Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/gateway_client.c | 48 +++++++++++------------------------------ net/batman-adv/gateway_client.h | 2 +- net/batman-adv/main.c | 2 +- net/batman-adv/originator.c | 1 - net/batman-adv/types.h | 2 -- 5 files changed, 14 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index d7ca2144e62c..634c7e3b4e89 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -161,9 +161,6 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv) rcu_read_lock(); hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { - if (gw_node->deleted) - continue; - orig_node = gw_node->orig_node; router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT); if (!router) @@ -473,9 +470,6 @@ batadv_gw_node_get(struct batadv_priv *bat_priv, if (gw_node_tmp->orig_node != orig_node) continue; - if (gw_node_tmp->deleted) - continue; - if (!atomic_inc_not_zero(&gw_node_tmp->refcount)) continue; @@ -525,9 +519,7 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, gw_node->bandwidth_down = ntohl(gateway->bandwidth_down); gw_node->bandwidth_up = ntohl(gateway->bandwidth_up); - gw_node->deleted = 0; if (ntohl(gateway->bandwidth_down) == 0) { - gw_node->deleted = jiffies; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Gateway %pM removed from gateway list\n", orig_node->orig); @@ -535,14 +527,21 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, /* Note: We don't need a NULL check here, since curr_gw never * gets dereferenced. */ + spin_lock_bh(&bat_priv->gw.list_lock); + hlist_del_init_rcu(&gw_node->list); + spin_unlock_bh(&bat_priv->gw.list_lock); + + batadv_gw_node_free_ref(gw_node); + curr_gw = batadv_gw_get_selected_gw_node(bat_priv); if (gw_node == curr_gw) batadv_gw_reselect(bat_priv); + + if (curr_gw) + batadv_gw_node_free_ref(curr_gw); } out: - if (curr_gw) - batadv_gw_node_free_ref(curr_gw); if (gw_node) batadv_gw_node_free_ref(gw_node); } @@ -558,39 +557,19 @@ void batadv_gw_node_delete(struct batadv_priv *bat_priv, batadv_gw_node_update(bat_priv, orig_node, &gateway); } -void batadv_gw_node_purge(struct batadv_priv *bat_priv) +void batadv_gw_node_free(struct batadv_priv *bat_priv) { - struct batadv_gw_node *gw_node, *curr_gw; + struct batadv_gw_node *gw_node; struct hlist_node *node_tmp; - unsigned long timeout = msecs_to_jiffies(2 * BATADV_PURGE_TIMEOUT); - int do_reselect = 0; - - curr_gw = batadv_gw_get_selected_gw_node(bat_priv); spin_lock_bh(&bat_priv->gw.list_lock); - hlist_for_each_entry_safe(gw_node, node_tmp, &bat_priv->gw.list, list) { - if (((!gw_node->deleted) || - (time_before(jiffies, gw_node->deleted + timeout))) && - atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE) - continue; - if (curr_gw == gw_node) - do_reselect = 1; - - hlist_del_rcu(&gw_node->list); + hlist_del_init_rcu(&gw_node->list); batadv_gw_node_free_ref(gw_node); } - spin_unlock_bh(&bat_priv->gw.list_lock); - - /* gw_reselect() needs to acquire the gw_list_lock */ - if (do_reselect) - batadv_gw_reselect(bat_priv); - - if (curr_gw) - batadv_gw_node_free_ref(curr_gw); } /* fails if orig_node has no router */ @@ -654,9 +633,6 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset) rcu_read_lock(); hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) { - if (gw_node->deleted) - continue; - /* fails if orig_node has no router */ if (batadv_write_buffer_text(bat_priv, seq, gw_node) < 0) continue; diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index ef4d7e336651..fa9527785ed3 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -38,7 +38,7 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv, struct batadv_tvlv_gateway_data *gateway); void batadv_gw_node_delete(struct batadv_priv *bat_priv, struct batadv_orig_node *orig_node); -void batadv_gw_node_purge(struct batadv_priv *bat_priv); +void batadv_gw_node_free(struct batadv_priv *bat_priv); int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset); bool batadv_gw_out_of_range(struct batadv_priv *bat_priv, struct sk_buff *skb); enum batadv_dhcp_recipient diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index e61c5f3633d0..d7f17c1aa4a4 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -199,7 +199,7 @@ void batadv_mesh_free(struct net_device *soft_iface) batadv_purge_outstanding_packets(bat_priv, NULL); - batadv_gw_node_purge(bat_priv); + batadv_gw_node_free(bat_priv); batadv_nc_mesh_free(bat_priv); batadv_dat_free(bat_priv); batadv_bla_free(bat_priv); diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index f7517757d435..d6d9809fee66 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1028,7 +1028,6 @@ static void _batadv_purge_orig(struct batadv_priv *bat_priv) spin_unlock_bh(list_lock); } - batadv_gw_node_purge(bat_priv); batadv_gw_election(bat_priv); } diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 2f5e6c39f913..d260efd70499 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -328,7 +328,6 @@ enum batadv_orig_capabilities { * @orig_node: pointer to corresponding orig node * @bandwidth_down: advertised uplink download bandwidth * @bandwidth_up: advertised uplink upload bandwidth - * @deleted: this struct is scheduled for deletion * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner */ @@ -337,7 +336,6 @@ struct batadv_gw_node { struct batadv_orig_node *orig_node; u32 bandwidth_down; u32 bandwidth_up; - unsigned long deleted; atomic_t refcount; struct rcu_head rcu; }; -- cgit v1.2.3 From 3f32f8a6874ae2515c8894588a5c60dd65ecc7e5 Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Sun, 26 Jul 2015 04:59:15 +0800 Subject: batman-adv: rearrange batadv_neigh_node_new() arguments to follow convention Signed-off-by: Marek Lindner Acked-by: Simon Wunderlich Signed-off-by: Antonio Quartulli --- net/batman-adv/bat_iv_ogm.c | 2 +- net/batman-adv/originator.c | 7 ++++--- net/batman-adv/originator.h | 5 +++-- 3 files changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 5e93af4cb97f..912d9c36fb1c 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -298,7 +298,7 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface *hard_iface, { struct batadv_neigh_node *neigh_node; - neigh_node = batadv_neigh_node_new(hard_iface, neigh_addr, orig_node); + neigh_node = batadv_neigh_node_new(orig_node, hard_iface, neigh_addr); if (!neigh_node) goto out; diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index d6d9809fee66..099a84afcd61 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -444,16 +444,17 @@ out: /** * batadv_neigh_node_new - create and init a new neigh_node object + * @orig_node: originator object representing the neighbour * @hard_iface: the interface where the neighbour is connected to * @neigh_addr: the mac address of the neighbour interface - * @orig_node: originator object representing the neighbour * * Allocates a new neigh_node object and initialises all the generic fields. * Returns the new object or NULL on failure. */ struct batadv_neigh_node * -batadv_neigh_node_new(struct batadv_hard_iface *hard_iface, - const u8 *neigh_addr, struct batadv_orig_node *orig_node) +batadv_neigh_node_new(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *hard_iface, + const u8 *neigh_addr) { struct batadv_neigh_node *neigh_node; diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index 3fc76f6f710c..fde34385c62c 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -46,8 +46,9 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node, const struct batadv_hard_iface *hard_iface, const u8 *addr); struct batadv_neigh_node * -batadv_neigh_node_new(struct batadv_hard_iface *hard_iface, - const u8 *neigh_addr, struct batadv_orig_node *orig_node); +batadv_neigh_node_new(struct batadv_orig_node *orig_node, + struct batadv_hard_iface *hard_iface, + const u8 *neigh_addr); void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node); struct batadv_neigh_node * batadv_orig_router_get(struct batadv_orig_node *orig_node, -- cgit v1.2.3 From 1e3b4669e79253748073b0ee95270f92f0372b20 Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Tue, 4 Aug 2015 15:44:06 +0200 Subject: batman-adv: fix gateway client style issues commit 0511575c4d03 ("batman-adv: remove obsolete deleted attribute for gateway node") incorrectly added an empy line and forgot to remove an include. Signed-off-by: Simon Wunderlich Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/gateway_client.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 634c7e3b4e89..e6c8382c79ba 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -565,7 +564,6 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv) spin_lock_bh(&bat_priv->gw.list_lock); hlist_for_each_entry_safe(gw_node, node_tmp, &bat_priv->gw.list, list) { - hlist_del_init_rcu(&gw_node->list); batadv_gw_node_free_ref(gw_node); } -- cgit v1.2.3 From 07c48eca1661decbd52393ef535f0c97e5313c4e Mon Sep 17 00:00:00 2001 From: Simon Wunderlich Date: Tue, 4 Aug 2015 14:43:16 +0200 Subject: batman-adv: Start new development cycle Signed-off-by: Simon Wunderlich Signed-off-by: Antonio Quartulli --- net/batman-adv/main.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 78500ac725d6..ebd8af0a1eb0 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -24,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2015.1" +#define BATADV_SOURCE_VERSION "2015.2" #endif /* B.A.T.M.A.N. parameters */ -- cgit v1.2.3 From a5256f7e74d85d7ae60ac3bd557d5fe3444be810 Mon Sep 17 00:00:00 2001 From: Antonio Quartulli Date: Tue, 4 Aug 2015 22:26:19 +0200 Subject: batman-adv: don't access unregistered net_device object In batadv_hardif_disable_interface() there is a call to batadv_softif_destroy_sysfs() which in turns invokes unregister_netdevice() on the soft_iface. After this point we cannot rely on the soft_iface object anymore because it might get free'd by the netdev periodic routine at any time. For this reason the netdev_upper_dev_unlink(.., soft_iface) call is moved before the invocation of batadv_softif_destroy_sysfs() so that we can be sure that the soft_iface object is still valid. Signed-off-by: Antonio Quartulli Signed-off-by: Marek Lindner --- net/batman-adv/hard-interface.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index f4a15d2e5eaf..0565b20bfa6d 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -528,6 +528,8 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_purge_outstanding_packets(bat_priv, hard_iface); dev_put(hard_iface->soft_iface); + netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface); + /* nobody uses this interface anymore */ if (!bat_priv->num_ifaces) { batadv_gw_check_client_stop(bat_priv); @@ -536,7 +538,6 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, batadv_softif_destroy_sysfs(hard_iface->soft_iface); } - netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface); hard_iface->soft_iface = NULL; batadv_hardif_free_ref(hard_iface); -- cgit v1.2.3 From 7bca68c7844b1642868809a5ef4387c1f099ab1d Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 7 Aug 2015 19:28:42 +0200 Subject: batman-adv: Add lower layer needed_(head|tail)room to own ones The maximum of hard_header_len and maximum of all needed_(head|tail)room of all slave interfaces of a batman-adv device must be used to define the batman-adv device needed_(head|tail)room. This is required to avoid too small buffer problems when these slave devices try to send the encapsulated packet in a tx path without the possibility to resize the skbuff. Signed-off-by: Sven Eckelmann Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/hard-interface.c | 41 +++++++++++++++++++++++++++++++++++++++++ net/batman-adv/soft-interface.c | 2 -- 2 files changed, 41 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 0565b20bfa6d..f11345e163d7 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -252,6 +252,44 @@ static void batadv_check_known_mac_addr(const struct net_device *net_dev) rcu_read_unlock(); } +/** + * batadv_hardif_recalc_extra_skbroom() - Recalculate skbuff extra head/tailroom + * @soft_iface: netdev struct of the mesh interface + */ +static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface) +{ + const struct batadv_hard_iface *hard_iface; + unsigned short lower_header_len = ETH_HLEN; + unsigned short lower_headroom = 0; + unsigned short lower_tailroom = 0; + unsigned short needed_headroom; + + rcu_read_lock(); + list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { + if (hard_iface->if_status == BATADV_IF_NOT_IN_USE) + continue; + + if (hard_iface->soft_iface != soft_iface) + continue; + + lower_header_len = max_t(unsigned short, lower_header_len, + hard_iface->net_dev->hard_header_len); + + lower_headroom = max_t(unsigned short, lower_headroom, + hard_iface->net_dev->needed_headroom); + + lower_tailroom = max_t(unsigned short, lower_tailroom, + hard_iface->net_dev->needed_tailroom); + } + rcu_read_unlock(); + + needed_headroom = lower_headroom + (lower_header_len - ETH_HLEN); + needed_headroom += batadv_max_header_len(); + + soft_iface->needed_headroom = needed_headroom; + soft_iface->needed_tailroom = lower_tailroom; +} + int batadv_hardif_min_mtu(struct net_device *soft_iface) { struct batadv_priv *bat_priv = netdev_priv(soft_iface); @@ -474,6 +512,8 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, "Not using interface %s (retrying later): interface not active\n", hard_iface->net_dev->name); + batadv_hardif_recalc_extra_skbroom(soft_iface); + /* begin scheduling originator messages on that interface */ batadv_schedule_bat_ogm(hard_iface); @@ -529,6 +569,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface, dev_put(hard_iface->soft_iface); netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface); + batadv_hardif_recalc_extra_skbroom(hard_iface->soft_iface); /* nobody uses this interface anymore */ if (!bat_priv->num_ifaces) { diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index d5c5ad93a611..ac4d08de5df4 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -947,8 +947,6 @@ static void batadv_softif_init_early(struct net_device *dev) * have not been initialized yet */ dev->mtu = ETH_DATA_LEN; - /* reserve more space in the skbuff for our header */ - dev->hard_header_len = batadv_max_header_len(); /* generate random address */ eth_hw_addr_random(dev); -- cgit v1.2.3 From ed29266347025a19ee689807b07d121f0a7441f1 Mon Sep 17 00:00:00 2001 From: Marek Lindner Date: Tue, 4 Aug 2015 23:31:44 +0800 Subject: batman-adv: turn batadv_neigh_node_get() into local function commit c214ebe1eb29 ("batman-adv: move neigh_node list add into batadv_neigh_node_new()") removed external calls to batadv_neigh_node_get(). Signed-off-by: Marek Lindner Signed-off-by: Antonio Quartulli --- net/batman-adv/originator.c | 72 ++++++++++++++++++++++----------------------- net/batman-adv/originator.h | 4 --- 2 files changed, 36 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 099a84afcd61..7486df9ed48d 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -442,6 +442,42 @@ out: return neigh_ifinfo; } +/** + * batadv_neigh_node_get - retrieve a neighbour from the list + * @orig_node: originator which the neighbour belongs to + * @hard_iface: the interface where this neighbour is connected to + * @addr: the address of the neighbour + * + * Looks for and possibly returns a neighbour belonging to this originator list + * which is connected through the provided hard interface. + * Returns NULL if the neighbour is not found. + */ +static struct batadv_neigh_node * +batadv_neigh_node_get(const struct batadv_orig_node *orig_node, + const struct batadv_hard_iface *hard_iface, + const u8 *addr) +{ + struct batadv_neigh_node *tmp_neigh_node, *res = NULL; + + rcu_read_lock(); + hlist_for_each_entry_rcu(tmp_neigh_node, &orig_node->neigh_list, list) { + if (!batadv_compare_eth(tmp_neigh_node->addr, addr)) + continue; + + if (tmp_neigh_node->if_incoming != hard_iface) + continue; + + if (!atomic_inc_not_zero(&tmp_neigh_node->refcount)) + continue; + + res = tmp_neigh_node; + break; + } + rcu_read_unlock(); + + return res; +} + /** * batadv_neigh_node_new - create and init a new neigh_node object * @orig_node: originator object representing the neighbour @@ -495,42 +531,6 @@ out: return neigh_node; } -/** - * batadv_neigh_node_get - retrieve a neighbour from the list - * @orig_node: originator which the neighbour belongs to - * @hard_iface: the interface where this neighbour is connected to - * @addr: the address of the neighbour - * - * Looks for and possibly returns a neighbour belonging to this originator list - * which is connected through the provided hard interface. - * Returns NULL if the neighbour is not found. - */ -struct batadv_neigh_node * -batadv_neigh_node_get(const struct batadv_orig_node *orig_node, - const struct batadv_hard_iface *hard_iface, - const u8 *addr) -{ - struct batadv_neigh_node *tmp_neigh_node, *res = NULL; - - rcu_read_lock(); - hlist_for_each_entry_rcu(tmp_neigh_node, &orig_node->neigh_list, list) { - if (!batadv_compare_eth(tmp_neigh_node->addr, addr)) - continue; - - if (tmp_neigh_node->if_incoming != hard_iface) - continue; - - if (!atomic_inc_not_zero(&tmp_neigh_node->refcount)) - continue; - - res = tmp_neigh_node; - break; - } - rcu_read_unlock(); - - return res; -} - /** * batadv_orig_ifinfo_free_rcu - free the orig_ifinfo object * @rcu: rcu pointer of the orig_ifinfo object diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index fde34385c62c..fa18f9bf266b 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -42,10 +42,6 @@ void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node); struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, const u8 *addr); struct batadv_neigh_node * -batadv_neigh_node_get(const struct batadv_orig_node *orig_node, - const struct batadv_hard_iface *hard_iface, - const u8 *addr); -struct batadv_neigh_node * batadv_neigh_node_new(struct batadv_orig_node *orig_node, struct batadv_hard_iface *hard_iface, const u8 *neigh_addr); -- cgit v1.2.3 From 8e2fed1c0cfbb29995a4301060acc0ef4ee84420 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 26 Aug 2015 11:31:44 -0700 Subject: openvswitch: Serialize acts with original netlink len Previously, we used the kernel-internal netlink actions length to calculate the size of messages to serialize back to userspace. However,the sw_flow_actions may not be formatted exactly the same as the actions on the wire, so store the original actions length when de-serializing and re-use the original length when serializing. Signed-off-by: Joe Stringer Acked-by: Pravin B Shelar Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/openvswitch/datapath.c | 2 +- net/openvswitch/flow.h | 1 + net/openvswitch/flow_netlink.c | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ffe984f5b95c..d5b547375887 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -713,7 +713,7 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts, /* OVS_FLOW_ATTR_ACTIONS */ if (should_fill_actions(ufid_flags)) - len += nla_total_size(acts->actions_len); + len += nla_total_size(acts->orig_len); return len + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index b62cdb3e3589..082a87bac819 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -144,6 +144,7 @@ struct sw_flow_id { struct sw_flow_actions { struct rcu_head rcu; + size_t orig_len; /* From flow_cmd_new netlink actions size */ u32 actions_len; struct nlattr actions[]; }; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 4e7a3f7facc2..c182b28c0884 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1619,6 +1619,7 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len); acts->actions_len = (*sfa)->actions_len; + acts->orig_len = (*sfa)->orig_len; kfree(*sfa); *sfa = acts; @@ -2223,6 +2224,7 @@ int ovs_nla_copy_actions(const struct nlattr *attr, if (IS_ERR(*sfa)) return PTR_ERR(*sfa); + (*sfa)->orig_len = nla_len(attr); err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type, key->eth.tci, log); if (err) -- cgit v1.2.3 From be26b9a88fcee570796c67701f50800039e25aec Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 26 Aug 2015 11:31:45 -0700 Subject: openvswitch: Move MASKED* macros to datapath.h This will allow the ovs-conntrack code to reuse these macros. Signed-off-by: Joe Stringer Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 52 ++++++++++++++++++++++------------------------ net/openvswitch/datapath.h | 4 ++++ 2 files changed, 29 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 4f4200717bef..520438b77dc8 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -185,10 +185,6 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, return 0; } -/* 'KEY' must not have any bits set outside of the 'MASK' */ -#define MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK))) -#define SET_MASKED(OLD, KEY, MASK) ((OLD) = MASKED(OLD, KEY, MASK)) - static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, const __be32 *mpls_lse, const __be32 *mask) { @@ -201,7 +197,7 @@ static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, return err; stack = (__be32 *)skb_mpls_header(skb); - lse = MASKED(*stack, *mpls_lse, *mask); + lse = OVS_MASKED(*stack, *mpls_lse, *mask); if (skb->ip_summed == CHECKSUM_COMPLETE) { __be32 diff[] = { ~(*stack), lse }; @@ -244,9 +240,9 @@ static void ether_addr_copy_masked(u8 *dst_, const u8 *src_, const u8 *mask_) const u16 *src = (const u16 *)src_; const u16 *mask = (const u16 *)mask_; - SET_MASKED(dst[0], src[0], mask[0]); - SET_MASKED(dst[1], src[1], mask[1]); - SET_MASKED(dst[2], src[2], mask[2]); + OVS_SET_MASKED(dst[0], src[0], mask[0]); + OVS_SET_MASKED(dst[1], src[1], mask[1]); + OVS_SET_MASKED(dst[2], src[2], mask[2]); } static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, @@ -338,10 +334,10 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, static void mask_ipv6_addr(const __be32 old[4], const __be32 addr[4], const __be32 mask[4], __be32 masked[4]) { - masked[0] = MASKED(old[0], addr[0], mask[0]); - masked[1] = MASKED(old[1], addr[1], mask[1]); - masked[2] = MASKED(old[2], addr[2], mask[2]); - masked[3] = MASKED(old[3], addr[3], mask[3]); + masked[0] = OVS_MASKED(old[0], addr[0], mask[0]); + masked[1] = OVS_MASKED(old[1], addr[1], mask[1]); + masked[2] = OVS_MASKED(old[2], addr[2], mask[2]); + masked[3] = OVS_MASKED(old[3], addr[3], mask[3]); } static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, @@ -358,15 +354,15 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask) { /* Bits 21-24 are always unmasked, so this retains their values. */ - SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16)); - SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8)); - SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask); + OVS_SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16)); + OVS_SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8)); + OVS_SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask); } static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl, u8 mask) { - new_ttl = MASKED(nh->ttl, new_ttl, mask); + new_ttl = OVS_MASKED(nh->ttl, new_ttl, mask); csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8)); nh->ttl = new_ttl; @@ -392,7 +388,7 @@ static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key, * makes sense to check if the value actually changed. */ if (mask->ipv4_src) { - new_addr = MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src); + new_addr = OVS_MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src); if (unlikely(new_addr != nh->saddr)) { set_ip_addr(skb, nh, &nh->saddr, new_addr); @@ -400,7 +396,7 @@ static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key, } } if (mask->ipv4_dst) { - new_addr = MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst); + new_addr = OVS_MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst); if (unlikely(new_addr != nh->daddr)) { set_ip_addr(skb, nh, &nh->daddr, new_addr); @@ -488,7 +484,8 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); } if (mask->ipv6_hlimit) { - SET_MASKED(nh->hop_limit, key->ipv6_hlimit, mask->ipv6_hlimit); + OVS_SET_MASKED(nh->hop_limit, key->ipv6_hlimit, + mask->ipv6_hlimit); flow_key->ip.ttl = nh->hop_limit; } return 0; @@ -517,8 +514,8 @@ static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key, uh = udp_hdr(skb); /* Either of the masks is non-zero, so do not bother checking them. */ - src = MASKED(uh->source, key->udp_src, mask->udp_src); - dst = MASKED(uh->dest, key->udp_dst, mask->udp_dst); + src = OVS_MASKED(uh->source, key->udp_src, mask->udp_src); + dst = OVS_MASKED(uh->dest, key->udp_dst, mask->udp_dst); if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) { if (likely(src != uh->source)) { @@ -558,12 +555,12 @@ static int set_tcp(struct sk_buff *skb, struct sw_flow_key *flow_key, return err; th = tcp_hdr(skb); - src = MASKED(th->source, key->tcp_src, mask->tcp_src); + src = OVS_MASKED(th->source, key->tcp_src, mask->tcp_src); if (likely(src != th->source)) { set_tp_port(skb, &th->source, src, &th->check); flow_key->tp.src = src; } - dst = MASKED(th->dest, key->tcp_dst, mask->tcp_dst); + dst = OVS_MASKED(th->dest, key->tcp_dst, mask->tcp_dst); if (likely(dst != th->dest)) { set_tp_port(skb, &th->dest, dst, &th->check); flow_key->tp.dst = dst; @@ -590,8 +587,8 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, old_csum = sh->checksum; old_correct_csum = sctp_compute_cksum(skb, sctphoff); - sh->source = MASKED(sh->source, key->sctp_src, mask->sctp_src); - sh->dest = MASKED(sh->dest, key->sctp_dst, mask->sctp_dst); + sh->source = OVS_MASKED(sh->source, key->sctp_src, mask->sctp_src); + sh->dest = OVS_MASKED(sh->dest, key->sctp_dst, mask->sctp_dst); new_csum = sctp_compute_cksum(skb, sctphoff); @@ -770,12 +767,13 @@ static int execute_masked_set_action(struct sk_buff *skb, switch (nla_type(a)) { case OVS_KEY_ATTR_PRIORITY: - SET_MASKED(skb->priority, nla_get_u32(a), *get_mask(a, u32 *)); + OVS_SET_MASKED(skb->priority, nla_get_u32(a), + *get_mask(a, u32 *)); flow_key->phy.priority = skb->priority; break; case OVS_KEY_ATTR_SKB_MARK: - SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *)); + OVS_SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *)); flow_key->phy.skb_mark = skb->mark; break; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 6b28c5cedb23..487a85f7d967 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -200,6 +200,10 @@ void ovs_dp_notify_wq(struct work_struct *work); int action_fifos_init(void); void action_fifos_exit(void); +/* 'KEY' must not have any bits set outside of the 'MASK' */ +#define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK))) +#define OVS_SET_MASKED(OLD, KEY, MASK) ((OLD) = OVS_MASKED(OLD, KEY, MASK)) + #define OVS_NLERR(logging_allowed, fmt, ...) \ do { \ if (logging_allowed && net_ratelimit()) \ -- cgit v1.2.3 From 5b490047240f7b986228de968334ddd7a341e1fe Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 26 Aug 2015 11:31:46 -0700 Subject: ipv6: Export nf_ct_frag6_gather() Signed-off-by: Joe Stringer Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 6d02498172c1..701cd2bae0a9 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -633,6 +633,7 @@ ret_orig: kfree_skb(clone); return skb; } +EXPORT_SYMBOL_GPL(nf_ct_frag6_gather); void nf_ct_frag6_consume_orig(struct sk_buff *skb) { -- cgit v1.2.3 From 7f8a436eaa2c3ddd8e1ff2fbca267e6275085536 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 26 Aug 2015 11:31:48 -0700 Subject: openvswitch: Add conntrack action Expose the kernel connection tracker via OVS. Userspace components can make use of the CT action to populate the connection state (ct_state) field for a flow. This state can be subsequently matched. Exposed connection states are OVS_CS_F_*: - NEW (0x01) - Beginning of a new connection. - ESTABLISHED (0x02) - Part of an existing connection. - RELATED (0x04) - Related to an established connection. - INVALID (0x20) - Could not track the connection for this packet. - REPLY_DIR (0x40) - This packet is in the reply direction for the flow. - TRACKED (0x80) - This packet has been sent through conntrack. When the CT action is executed by itself, it will send the packet through the connection tracker and populate the ct_state field with one or more of the connection state flags above. The CT action will always set the TRACKED bit. When the COMMIT flag is passed to the conntrack action, this specifies that information about the connection should be stored. This allows subsequent packets for the same (or related) connections to be correlated with this connection. Sending subsequent packets for the connection through conntrack allows the connection tracker to consider the packets as ESTABLISHED, RELATED, and/or REPLY_DIR. The CT action may optionally take a zone to track the flow within. This allows connections with the same 5-tuple to be kept logically separate from connections in other zones. If the zone is specified, then the "ct_zone" match field will be subsequently populated with the zone id. IP fragments are handled by transparently assembling them as part of the CT action. The maximum received unit (MRU) size is tracked so that refragmentation can occur during output. IP frag handling contributed by Andy Zhou. Based on original design by Justin Pettit. Signed-off-by: Joe Stringer Signed-off-by: Justin Pettit Signed-off-by: Andy Zhou Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 40 ++++ net/openvswitch/Kconfig | 11 + net/openvswitch/Makefile | 2 + net/openvswitch/actions.c | 175 ++++++++++++++- net/openvswitch/conntrack.c | 454 +++++++++++++++++++++++++++++++++++++++ net/openvswitch/conntrack.h | 78 +++++++ net/openvswitch/datapath.c | 66 ++++-- net/openvswitch/datapath.h | 6 + net/openvswitch/flow.c | 2 + net/openvswitch/flow.h | 6 + net/openvswitch/flow_netlink.c | 69 ++++-- net/openvswitch/flow_netlink.h | 4 +- net/openvswitch/vport.c | 1 + 13 files changed, 877 insertions(+), 37 deletions(-) create mode 100644 net/openvswitch/conntrack.c create mode 100644 net/openvswitch/conntrack.h (limited to 'net') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index d6b885460187..55f599792673 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -164,6 +164,9 @@ enum ovs_packet_cmd { * %OVS_USERSPACE_ATTR_EGRESS_TUN_PORT attribute, which is sent only if the * output port is actually a tunnel port. Contains the output tunnel key * extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes. + * @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and + * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment + * size. * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_PACKET_* commands. @@ -180,6 +183,7 @@ enum ovs_packet_attr { OVS_PACKET_ATTR_UNUSED2, OVS_PACKET_ATTR_PROBE, /* Packet operation is a feature probe, error logging should be suppressed. */ + OVS_PACKET_ATTR_MRU, /* Maximum received IP fragment size. */ __OVS_PACKET_ATTR_MAX }; @@ -319,6 +323,8 @@ enum ovs_key_attr { OVS_KEY_ATTR_MPLS, /* array of struct ovs_key_mpls. * The implementation may restrict * the accepted length of the array. */ + OVS_KEY_ATTR_CT_STATE, /* u8 bitmask of OVS_CS_F_* */ + OVS_KEY_ATTR_CT_ZONE, /* u16 connection tracking zone. */ #ifdef __KERNEL__ OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */ @@ -431,6 +437,15 @@ struct ovs_key_nd { __u8 nd_tll[ETH_ALEN]; }; +/* OVS_KEY_ATTR_CT_STATE flags */ +#define OVS_CS_F_NEW 0x01 /* Beginning of a new connection. */ +#define OVS_CS_F_ESTABLISHED 0x02 /* Part of an existing connection. */ +#define OVS_CS_F_RELATED 0x04 /* Related to an established + * connection. */ +#define OVS_CS_F_INVALID 0x20 /* Could not track connection. */ +#define OVS_CS_F_REPLY_DIR 0x40 /* Flow is in the reply direction. */ +#define OVS_CS_F_TRACKED 0x80 /* Conntrack has occurred. */ + /** * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands. * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow @@ -594,6 +609,28 @@ struct ovs_action_hash { uint32_t hash_basis; }; +/** + * enum ovs_ct_attr - Attributes for %OVS_ACTION_ATTR_CT action. + * @OVS_CT_ATTR_FLAGS: u32 connection tracking flags. + * @OVS_CT_ATTR_ZONE: u16 connection tracking zone. + */ +enum ovs_ct_attr { + OVS_CT_ATTR_UNSPEC, + OVS_CT_ATTR_FLAGS, /* u8 bitmask of OVS_CT_F_*. */ + OVS_CT_ATTR_ZONE, /* u16 zone id. */ + __OVS_CT_ATTR_MAX +}; + +#define OVS_CT_ATTR_MAX (__OVS_CT_ATTR_MAX - 1) + +/* + * OVS_CT_ATTR_FLAGS flags - bitmask of %OVS_CT_F_* + * @OVS_CT_F_COMMIT: Commits the flow to the conntrack table. This allows + * future packets for the same connection to be identified as 'established' + * or 'related'. + */ +#define OVS_CT_F_COMMIT 0x01 + /** * enum ovs_action_attr - Action types. * @@ -623,6 +660,8 @@ struct ovs_action_hash { * indicate the new packet contents. This could potentially still be * %ETH_P_MPLS if the resulting MPLS label stack is not empty. If there * is no MPLS label stack, as determined by ethertype, no action is taken. + * @OVS_ACTION_ATTR_CT: Track the connection. Populate the conntrack-related + * entries in the flow key. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -648,6 +687,7 @@ enum ovs_action_attr { * data immediately followed by a mask. * The data must be zero for the unmasked * bits. */ + OVS_ACTION_ATTR_CT, /* One nested OVS_CT_ATTR_* . */ __OVS_ACTION_ATTR_MAX, /* Nothing past this will be accepted * from userspace. */ diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 422dc0567de9..98f343d0d6dd 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -31,6 +31,17 @@ config OPENVSWITCH If unsure, say N. +config OPENVSWITCH_CONNTRACK + bool "Open vSwitch conntrack action support" + depends on OPENVSWITCH + depends on NF_CONNTRACK + default OPENVSWITCH + ---help--- + If you say Y here, then Open vSwitch module will be able to pass + packets through conntrack. + + Say N to exclude this support and reduce the binary size. + config OPENVSWITCH_GRE tristate "Open vSwitch GRE tunneling support" depends on OPENVSWITCH diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 6e1701de04d8..5b5913b06f54 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -15,6 +15,8 @@ openvswitch-y := \ vport-internal_dev.o \ vport-netdev.o +openvswitch-$(CONFIG_OPENVSWITCH_CONNTRACK) += conntrack.o + obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 520438b77dc8..72ca2c491b0a 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,7 @@ #include #include +#include #include #include #include @@ -38,6 +40,7 @@ #include "datapath.h" #include "flow.h" +#include "conntrack.h" #include "vport.h" static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, @@ -52,6 +55,20 @@ struct deferred_action { struct sw_flow_key pkt_key; }; +#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN) +struct ovs_frag_data { + unsigned long dst; + struct vport *vport; + struct ovs_skb_cb cb; + __be16 inner_protocol; + __u16 vlan_tci; + __be16 vlan_proto; + unsigned int l2_len; + u8 l2_data[MAX_L2_LEN]; +}; + +static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage); + #define DEFERRED_ACTION_FIFO_SIZE 10 struct action_fifo { int head; @@ -602,14 +619,145 @@ static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } -static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port) +static int ovs_vport_output(struct sock *sock, struct sk_buff *skb) +{ + struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage); + struct vport *vport = data->vport; + + if (skb_cow_head(skb, data->l2_len) < 0) { + kfree_skb(skb); + return -ENOMEM; + } + + __skb_dst_copy(skb, data->dst); + *OVS_CB(skb) = data->cb; + skb->inner_protocol = data->inner_protocol; + skb->vlan_tci = data->vlan_tci; + skb->vlan_proto = data->vlan_proto; + + /* Reconstruct the MAC header. */ + skb_push(skb, data->l2_len); + memcpy(skb->data, &data->l2_data, data->l2_len); + ovs_skb_postpush_rcsum(skb, skb->data, data->l2_len); + skb_reset_mac_header(skb); + + ovs_vport_send(vport, skb); + return 0; +} + +static unsigned int +ovs_dst_get_mtu(const struct dst_entry *dst) +{ + return dst->dev->mtu; +} + +static struct dst_ops ovs_dst_ops = { + .family = AF_UNSPEC, + .mtu = ovs_dst_get_mtu, +}; + +/* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is + * ovs_vport_output(), which is called once per fragmented packet. + */ +static void prepare_frag(struct vport *vport, struct sk_buff *skb) +{ + unsigned int hlen = skb_network_offset(skb); + struct ovs_frag_data *data; + + data = this_cpu_ptr(&ovs_frag_data_storage); + data->dst = skb->_skb_refdst; + data->vport = vport; + data->cb = *OVS_CB(skb); + data->inner_protocol = skb->inner_protocol; + data->vlan_tci = skb->vlan_tci; + data->vlan_proto = skb->vlan_proto; + data->l2_len = hlen; + memcpy(&data->l2_data, skb->data, hlen); + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + skb_pull(skb, hlen); +} + +static void ovs_fragment(struct vport *vport, struct sk_buff *skb, u16 mru, + __be16 ethertype) +{ + if (skb_network_offset(skb) > MAX_L2_LEN) { + OVS_NLERR(1, "L2 header too long to fragment"); + return; + } + + if (ethertype == htons(ETH_P_IP)) { + struct dst_entry ovs_dst; + unsigned long orig_dst; + + prepare_frag(vport, skb); + dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1, + DST_OBSOLETE_NONE, DST_NOCOUNT); + ovs_dst.dev = vport->dev; + + orig_dst = skb->_skb_refdst; + skb_dst_set_noref(skb, &ovs_dst); + IPCB(skb)->frag_max_size = mru; + + ip_do_fragment(skb->sk, skb, ovs_vport_output); + refdst_drop(orig_dst); + } else if (ethertype == htons(ETH_P_IPV6)) { + const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops(); + unsigned long orig_dst; + struct rt6_info ovs_rt; + + if (!v6ops) { + kfree_skb(skb); + return; + } + + prepare_frag(vport, skb); + memset(&ovs_rt, 0, sizeof(ovs_rt)); + dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1, + DST_OBSOLETE_NONE, DST_NOCOUNT); + ovs_rt.dst.dev = vport->dev; + + orig_dst = skb->_skb_refdst; + skb_dst_set_noref(skb, &ovs_rt.dst); + IP6CB(skb)->frag_max_size = mru; + + v6ops->fragment(skb->sk, skb, ovs_vport_output); + refdst_drop(orig_dst); + } else { + WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.", + ovs_vport_name(vport), ntohs(ethertype), mru, + vport->dev->mtu); + kfree_skb(skb); + } +} + +static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, + struct sw_flow_key *key) { struct vport *vport = ovs_vport_rcu(dp, out_port); - if (likely(vport)) - ovs_vport_send(vport, skb); - else + if (likely(vport)) { + u16 mru = OVS_CB(skb)->mru; + + if (likely(!mru || (skb->len <= mru + ETH_HLEN))) { + ovs_vport_send(vport, skb); + } else if (mru <= vport->dev->mtu) { + __be16 ethertype = key->eth.type; + + if (!is_flow_key_valid(key)) { + if (eth_p_mpls(skb->protocol)) + ethertype = skb->inner_protocol; + else + ethertype = vlan_get_protocol(skb); + } + + ovs_fragment(vport, skb, mru, ethertype); + } else { + kfree_skb(skb); + } + } else { kfree_skb(skb); + } } static int output_userspace(struct datapath *dp, struct sk_buff *skb, @@ -623,6 +771,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_ACTION; + upcall.mru = OVS_CB(skb)->mru; for (a = nla_data(attr), rem = nla_len(attr); rem > 0; a = nla_next(a, &rem)) { @@ -816,6 +965,11 @@ static int execute_masked_set_action(struct sk_buff *skb, err = set_mpls(skb, flow_key, nla_data(a), get_mask(a, __be32 *)); break; + + case OVS_KEY_ATTR_CT_STATE: + case OVS_KEY_ATTR_CT_ZONE: + err = -EINVAL; + break; } return err; @@ -885,7 +1039,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC); if (out_skb) - do_output(dp, out_skb, prev_port); + do_output(dp, out_skb, prev_port, key); prev_port = -1; } @@ -942,6 +1096,15 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, case OVS_ACTION_ATTR_SAMPLE: err = sample(dp, skb, key, a, attr, len); break; + + case OVS_ACTION_ATTR_CT: + err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key, + nla_data(a)); + + /* Hide stolen IP fragments from user space. */ + if (err == -EINPROGRESS) + return 0; + break; } if (unlikely(err)) { @@ -951,7 +1114,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, } if (prev_port != -1) - do_output(dp, skb, prev_port); + do_output(dp, skb, prev_port, key); else consume_skb(skb); diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c new file mode 100644 index 000000000000..1189fd50f1cf --- /dev/null +++ b/net/openvswitch/conntrack.c @@ -0,0 +1,454 @@ +/* + * Copyright (c) 2015 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include + +#include "datapath.h" +#include "conntrack.h" +#include "flow.h" +#include "flow_netlink.h" + +struct ovs_ct_len_tbl { + size_t maxlen; + size_t minlen; +}; + +/* Conntrack action context for execution. */ +struct ovs_conntrack_info { + struct nf_conntrack_zone zone; + struct nf_conn *ct; + u32 flags; + u16 family; +}; + +static u16 key_to_nfproto(const struct sw_flow_key *key) +{ + switch (ntohs(key->eth.type)) { + case ETH_P_IP: + return NFPROTO_IPV4; + case ETH_P_IPV6: + return NFPROTO_IPV6; + default: + return NFPROTO_UNSPEC; + } +} + +/* Map SKB connection state into the values used by flow definition. */ +static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) +{ + u8 ct_state = OVS_CS_F_TRACKED; + + switch (ctinfo) { + case IP_CT_ESTABLISHED_REPLY: + case IP_CT_RELATED_REPLY: + case IP_CT_NEW_REPLY: + ct_state |= OVS_CS_F_REPLY_DIR; + break; + default: + break; + } + + switch (ctinfo) { + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED_REPLY: + ct_state |= OVS_CS_F_ESTABLISHED; + break; + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + ct_state |= OVS_CS_F_RELATED; + break; + case IP_CT_NEW: + case IP_CT_NEW_REPLY: + ct_state |= OVS_CS_F_NEW; + break; + default: + break; + } + + return ct_state; +} + +static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, + const struct nf_conntrack_zone *zone) +{ + key->ct.state = state; + key->ct.zone = zone->id; +} + +/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has + * previously sent the packet to conntrack via the ct action. + */ +static void ovs_ct_update_key(const struct sk_buff *skb, + struct sw_flow_key *key, bool post_ct) +{ + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u8 state = 0; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + state = ovs_ct_get_state(ctinfo); + if (ct->master) + state |= OVS_CS_F_RELATED; + zone = nf_ct_zone(ct); + } else if (post_ct) { + state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; + } + __ovs_ct_update_key(key, state, zone); +} + +void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) +{ + ovs_ct_update_key(skb, key, false); +} + +int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) +{ + if (nla_put_u8(skb, OVS_KEY_ATTR_CT_STATE, key->ct.state)) + return -EMSGSIZE; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone)) + return -EMSGSIZE; + + return 0; +} + +static int handle_fragments(struct net *net, struct sw_flow_key *key, + u16 zone, struct sk_buff *skb) +{ + struct ovs_skb_cb ovs_cb = *OVS_CB(skb); + + if (key->eth.type == htons(ETH_P_IP)) { + enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; + int err; + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + err = ip_defrag(skb, user); + if (err) + return err; + + ovs_cb.mru = IPCB(skb)->frag_max_size; + } else if (key->eth.type == htons(ETH_P_IPV6)) { +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) + enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; + struct sk_buff *reasm; + + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + reasm = nf_ct_frag6_gather(skb, user); + if (!reasm) + return -EINPROGRESS; + + if (skb == reasm) + return -EINVAL; + + key->ip.proto = ipv6_hdr(reasm)->nexthdr; + skb_morph(skb, reasm); + consume_skb(reasm); + ovs_cb.mru = IP6CB(skb)->frag_max_size; +#else + return -EPFNOSUPPORT; +#endif + } else { + return -EPFNOSUPPORT; + } + + key->ip.frag = OVS_FRAG_TYPE_NONE; + skb_clear_hash(skb); + skb->ignore_df = 1; + *OVS_CB(skb) = ovs_cb; + + return 0; +} + +static struct nf_conntrack_expect * +ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, + u16 proto, const struct sk_buff *skb) +{ + struct nf_conntrack_tuple tuple; + + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, &tuple)) + return NULL; + return __nf_ct_expect_find(net, zone, &tuple); +} + +/* Determine whether skb->nfct is equal to the result of conntrack lookup. */ +static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, + const struct ovs_conntrack_info *info) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return false; + if (!net_eq(net, read_pnet(&ct->ct_net))) + return false; + if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct))) + return false; + + return true; +} + +static int __ovs_ct_lookup(struct net *net, const struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + /* If we are recirculating packets to match on conntrack fields and + * committing with a separate conntrack action, then we don't need to + * actually run the packet through conntrack twice unless it's for a + * different zone. + */ + if (!skb_nfct_cached(net, skb, info)) { + struct nf_conn *tmpl = info->ct; + + /* Associate skb with specified zone. */ + if (tmpl) { + if (skb->nfct) + nf_conntrack_put(skb->nfct); + nf_conntrack_get(&tmpl->ct_general); + skb->nfct = &tmpl->ct_general; + skb->nfctinfo = IP_CT_NEW; + } + + if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING, + skb) != NF_ACCEPT) + return -ENOENT; + } + + return 0; +} + +/* Lookup connection and read fields into key. */ +static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + struct nf_conntrack_expect *exp; + + exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); + if (exp) { + u8 state; + + state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; + __ovs_ct_update_key(key, state, &info->zone); + } else { + int err; + + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; + + ovs_ct_update_key(skb, key, true); + } + + return 0; +} + +/* Lookup connection and confirm if unconfirmed. */ +static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + u8 state; + int err; + + state = key->ct.state; + if (key->ct.zone == info->zone.id && + ((state & OVS_CS_F_TRACKED) && !(state & OVS_CS_F_NEW))) { + /* Previous lookup has shown that this connection is already + * tracked and committed. Skip committing. + */ + return 0; + } + + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; + if (nf_conntrack_confirm(skb) != NF_ACCEPT) + return -EINVAL; + + ovs_ct_update_key(skb, key, true); + + return 0; +} + +int ovs_ct_execute(struct net *net, struct sk_buff *skb, + struct sw_flow_key *key, + const struct ovs_conntrack_info *info) +{ + int nh_ofs; + int err; + + /* The conntrack module expects to be working at L3. */ + nh_ofs = skb_network_offset(skb); + skb_pull(skb, nh_ofs); + + if (key->ip.frag != OVS_FRAG_TYPE_NONE) { + err = handle_fragments(net, key, info->zone.id, skb); + if (err) + return err; + } + + if (info->flags & OVS_CT_F_COMMIT) + err = ovs_ct_commit(net, key, info, skb); + else + err = ovs_ct_lookup(net, key, info, skb); + + skb_push(skb, nh_ofs); + return err; +} + +static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { + [OVS_CT_ATTR_FLAGS] = { .minlen = sizeof(u32), + .maxlen = sizeof(u32) }, + [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), + .maxlen = sizeof(u16) }, +}; + +static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, + bool log) +{ + struct nlattr *a; + int rem; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + int maxlen = ovs_ct_attr_lens[type].maxlen; + int minlen = ovs_ct_attr_lens[type].minlen; + + if (type > OVS_CT_ATTR_MAX) { + OVS_NLERR(log, + "Unknown conntrack attr (type=%d, max=%d)", + type, OVS_CT_ATTR_MAX); + return -EINVAL; + } + if (nla_len(a) < minlen || nla_len(a) > maxlen) { + OVS_NLERR(log, + "Conntrack attr type has unexpected length (type=%d, length=%d, expected=%d)", + type, nla_len(a), maxlen); + return -EINVAL; + } + + switch (type) { + case OVS_CT_ATTR_FLAGS: + info->flags = nla_get_u32(a); + break; +#ifdef CONFIG_NF_CONNTRACK_ZONES + case OVS_CT_ATTR_ZONE: + info->zone.id = nla_get_u16(a); + break; +#endif + default: + OVS_NLERR(log, "Unknown conntrack attr (%d)", + type); + return -EINVAL; + } + } + + if (rem > 0) { + OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem); + return -EINVAL; + } + + return 0; +} + +bool ovs_ct_verify(enum ovs_key_attr attr) +{ + if (attr == OVS_KEY_ATTR_CT_STATE) + return true; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + attr == OVS_KEY_ATTR_CT_ZONE) + return true; + + return false; +} + +int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, bool log) +{ + struct ovs_conntrack_info ct_info; + u16 family; + int err; + + family = key_to_nfproto(key); + if (family == NFPROTO_UNSPEC) { + OVS_NLERR(log, "ct family unspecified"); + return -EINVAL; + } + + memset(&ct_info, 0, sizeof(ct_info)); + ct_info.family = family; + + nf_ct_zone_init(&ct_info.zone, NF_CT_DEFAULT_ZONE_ID, + NF_CT_DEFAULT_ZONE_DIR, 0); + + err = parse_ct(attr, &ct_info, log); + if (err) + return err; + + /* Set up template for tracking connections in specific zones. */ + ct_info.ct = nf_ct_tmpl_alloc(net, &ct_info.zone, GFP_KERNEL); + if (!ct_info.ct) { + OVS_NLERR(log, "Failed to allocate conntrack template"); + return -ENOMEM; + } + + err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info, + sizeof(ct_info), log); + if (err) + goto err_free_ct; + + __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); + nf_conntrack_get(&ct_info.ct->ct_general); + return 0; +err_free_ct: + nf_conntrack_free(ct_info.ct); + return err; +} + +int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, + struct sk_buff *skb) +{ + struct nlattr *start; + + start = nla_nest_start(skb, OVS_ACTION_ATTR_CT); + if (!start) + return -EMSGSIZE; + + if (nla_put_u32(skb, OVS_CT_ATTR_FLAGS, ct_info->flags)) + return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id)) + return -EMSGSIZE; + + nla_nest_end(skb, start); + + return 0; +} + +void ovs_ct_free_action(const struct nlattr *a) +{ + struct ovs_conntrack_info *ct_info = nla_data(a); + + if (ct_info->ct) + nf_ct_put(ct_info->ct); +} diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h new file mode 100644 index 000000000000..e812ee64a718 --- /dev/null +++ b/net/openvswitch/conntrack.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2015 Nicira, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OVS_CONNTRACK_H +#define OVS_CONNTRACK_H 1 + +#include "flow.h" + +struct ovs_conntrack_info; +enum ovs_key_attr; + +#if defined(CONFIG_OPENVSWITCH_CONNTRACK) +bool ovs_ct_verify(enum ovs_key_attr attr); +int ovs_ct_copy_action(struct net *, const struct nlattr *, + const struct sw_flow_key *, struct sw_flow_actions **, + bool log); +int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, struct sk_buff *); + +int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *, + const struct ovs_conntrack_info *); + +void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key); +int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb); +void ovs_ct_free_action(const struct nlattr *a); +#else +#include + +static inline bool ovs_ct_verify(int attr) +{ + return false; +} + +static inline int ovs_ct_copy_action(struct net *net, const struct nlattr *nla, + const struct sw_flow_key *key, + struct sw_flow_actions **acts, bool log) +{ + return -ENOTSUPP; +} + +static inline int ovs_ct_action_to_attr(const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + return -ENOTSUPP; +} + +static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb, + struct sw_flow_key *key, + const struct ovs_conntrack_info *info) +{ + return -ENOTSUPP; +} + +static inline void ovs_ct_fill_key(const struct sk_buff *skb, + struct sw_flow_key *key) +{ + key->ct.state = 0; + key->ct.zone = 0; +} + +static inline int ovs_ct_put_key(const struct sw_flow_key *key, + struct sk_buff *skb) +{ + return 0; +} + +static inline void ovs_ct_free_action(const struct nlattr *a) { } +#endif +#endif /* ovs_conntrack.h */ diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index d5b547375887..72e63726efa0 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -275,6 +275,7 @@ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; upcall.portid = ovs_vport_find_upcall_portid(p, skb); + upcall.mru = OVS_CB(skb)->mru; error = ovs_dp_upcall(dp, skb, key, &upcall); if (unlikely(error)) kfree_skb(skb); @@ -400,9 +401,23 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, if (upcall_info->actions_len) size += nla_total_size(upcall_info->actions_len); + /* OVS_PACKET_ATTR_MRU */ + if (upcall_info->mru) + size += nla_total_size(sizeof(upcall_info->mru)); + return size; } +static void pad_packet(struct datapath *dp, struct sk_buff *skb) +{ + if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { + size_t plen = NLA_ALIGN(skb->len) - skb->len; + + if (plen > 0) + memset(skb_put(skb, plen), 0, plen); + } +} + static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info) @@ -492,6 +507,16 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, nla_nest_cancel(user_skb, nla); } + /* Add OVS_PACKET_ATTR_MRU */ + if (upcall_info->mru) { + if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, + upcall_info->mru)) { + err = -ENOBUFS; + goto out; + } + pad_packet(dp, user_skb); + } + /* Only reserve room for attribute header, packet data is added * in skb_zerocopy() */ if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { @@ -505,12 +530,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, goto out; /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */ - if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { - size_t plen = NLA_ALIGN(user_skb->len) - user_skb->len; - - if (plen > 0) - memset(skb_put(user_skb, plen), 0, plen); - } + pad_packet(dp, user_skb); ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; @@ -527,6 +547,7 @@ out: static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = info->userhdr; + struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct sw_flow_actions *acts; struct sk_buff *packet; @@ -535,6 +556,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct ethhdr *eth; struct vport *input_vport; + u16 mru = 0; int len; int err; bool log = !a[OVS_PACKET_ATTR_PROBE]; @@ -564,6 +586,13 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) else packet->protocol = htons(ETH_P_802_2); + /* Set packet's mru */ + if (a[OVS_PACKET_ATTR_MRU]) { + mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]); + packet->ignore_df = 1; + } + OVS_CB(packet)->mru = mru; + /* Build an sw_flow for sending this packet. */ flow = ovs_flow_alloc(); err = PTR_ERR(flow); @@ -575,7 +604,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (err) goto err_flow_free; - err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], + err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS], &flow->key, &acts, log); if (err) goto err_flow_free; @@ -586,7 +615,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) packet->mark = flow->key.phy.skb_mark; rcu_read_lock(); - dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp_rcu(net, ovs_header->dp_ifindex); err = -ENODEV; if (!dp) goto err_unlock; @@ -598,6 +627,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (!input_vport) goto err_unlock; + packet->dev = input_vport->dev; OVS_CB(packet)->input_vport = input_vport; sf_acts = rcu_dereference(flow->sf_acts); @@ -624,6 +654,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG }, + [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 }, }; static const struct genl_ops dp_packet_genl_ops[] = { @@ -880,6 +911,7 @@ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) { + struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; struct sw_flow *flow = NULL, *new_flow; @@ -929,8 +961,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) goto err_kfree_flow; /* Validate actions. */ - error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, - &acts, log); + error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], + &new_flow->key, &acts, log); if (error) { OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); goto err_kfree_flow; @@ -944,7 +976,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } ovs_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp(net, ovs_header->dp_ifindex); if (unlikely(!dp)) { error = -ENODEV; goto err_unlock_ovs; @@ -1038,7 +1070,8 @@ error: } /* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */ -static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, +static struct sw_flow_actions *get_flow_actions(struct net *net, + const struct nlattr *a, const struct sw_flow_key *key, const struct sw_flow_mask *mask, bool log) @@ -1048,7 +1081,7 @@ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, int error; ovs_flow_mask_key(&masked_key, key, mask); - error = ovs_nla_copy_actions(a, &masked_key, &acts, log); + error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log); if (error) { OVS_NLERR(log, "Actions may not be safe on all matching packets"); @@ -1060,6 +1093,7 @@ static struct sw_flow_actions *get_flow_actions(const struct nlattr *a, static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) { + struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; struct sw_flow_key key; @@ -1091,8 +1125,8 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) /* Validate actions. */ if (a[OVS_FLOW_ATTR_ACTIONS]) { - acts = get_flow_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &mask, - log); + acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key, + &mask, log); if (IS_ERR(acts)) { error = PTR_ERR(acts); goto error; @@ -1108,7 +1142,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) } ovs_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp(net, ovs_header->dp_ifindex); if (unlikely(!dp)) { error = -ENODEV; goto err_unlock_ovs; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 487a85f7d967..d24ba98024be 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -27,6 +27,7 @@ #include #include +#include "conntrack.h" #include "flow.h" #include "flow_table.h" #include "vport.h" @@ -97,10 +98,13 @@ struct datapath { * NULL if the packet is not being tunneled. * @input_vport: The original vport packet came in on. This value is cached * when a packet is received by OVS. + * @mru: The maximum received fragement size; 0 if the packet is not + * fragmented. */ struct ovs_skb_cb { struct ip_tunnel_info *egress_tun_info; struct vport *input_vport; + u16 mru; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) @@ -113,6 +117,7 @@ struct ovs_skb_cb { * then no packet is sent and the packet is accounted in the datapath's @n_lost * counter. * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY. + * @mru: If not zero, Maximum received IP fragment size. */ struct dp_upcall_info { const struct ip_tunnel_info *egress_tun_info; @@ -121,6 +126,7 @@ struct dp_upcall_info { int actions_len; u32 portid; u8 cmd; + u16 mru; }; /** diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 8db22ef73626..376ca8738fd4 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -49,6 +49,7 @@ #include "datapath.h" #include "flow.h" #include "flow_netlink.h" +#include "conntrack.h" u64 ovs_flow_used_time(unsigned long flow_jiffies) { @@ -707,6 +708,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, key->phy.priority = skb->priority; key->phy.in_port = OVS_CB(skb)->input_vport->port_no; key->phy.skb_mark = skb->mark; + ovs_ct_fill_key(skb, key); key->ovs_flow_hash = 0; key->recirc_id = 0; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 082a87bac819..312c7d755b9b 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -111,6 +111,12 @@ struct sw_flow_key { } nd; } ipv6; }; + struct { + /* Connection tracking fields. */ + u16 zone; + u8 state; + } ct; + } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ struct sw_flow_key_range { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index c182b28c0884..4e795b289eb7 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -281,7 +281,7 @@ size_t ovs_key_attr_size(void) /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. */ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 22); + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 24); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -290,6 +290,8 @@ size_t ovs_key_attr_size(void) + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + nla_total_size(4) /* OVS_KEY_ATTR_DP_HASH */ + nla_total_size(4) /* OVS_KEY_ATTR_RECIRC_ID */ + + nla_total_size(1) /* OVS_KEY_ATTR_CT_STATE */ + + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ @@ -339,6 +341,8 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED, .next = ovs_tunnel_key_lens, }, [OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) }, + [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u8) }, + [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, }; static bool is_all_zero(const u8 *fp, size_t size) @@ -768,6 +772,21 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, return -EINVAL; *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); } + + if (*attrs & (1 << OVS_KEY_ATTR_CT_STATE) && + ovs_ct_verify(OVS_KEY_ATTR_CT_STATE)) { + u8 ct_state = nla_get_u8(a[OVS_KEY_ATTR_CT_STATE]); + + SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE); + } + if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) && + ovs_ct_verify(OVS_KEY_ATTR_CT_ZONE)) { + u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]); + + SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE); + } return 0; } @@ -1266,6 +1285,7 @@ int ovs_nla_get_flow_metadata(const struct nlattr *attr, memset(&match, 0, sizeof(match)); match.key = key; + memset(&key->ct, 0, sizeof(key->ct)); key->phy.in_port = DP_MAX_PORTS; return metadata_from_nlattrs(&match, &attrs, a, false, log); @@ -1314,6 +1334,9 @@ static int __ovs_nla_put_key(const struct sw_flow_key *swkey, if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) goto nla_put_failure; + if (ovs_ct_put_key(output, skb)) + goto nla_put_failure; + nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); if (!nla) goto nla_put_failure; @@ -1574,6 +1597,9 @@ void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) case OVS_ACTION_ATTR_SET: ovs_nla_free_set_action(a); break; + case OVS_ACTION_ATTR_CT: + ovs_ct_free_action(a); + break; } } @@ -1647,8 +1673,8 @@ static struct nlattr *__add_action(struct sw_flow_actions **sfa, return a; } -static int add_action(struct sw_flow_actions **sfa, int attrtype, - void *data, int len, bool log) +int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype, void *data, + int len, bool log) { struct nlattr *a; @@ -1663,7 +1689,7 @@ static inline int add_nested_action_start(struct sw_flow_actions **sfa, int used = (*sfa)->actions_len; int err; - err = add_action(sfa, attrtype, NULL, 0, log); + err = ovs_nla_add_action(sfa, attrtype, NULL, 0, log); if (err) return err; @@ -1679,12 +1705,12 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, a->nla_len = sfa->actions_len - st_offset; } -static int __ovs_nla_copy_actions(const struct nlattr *attr, +static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, int depth, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool log); -static int validate_and_copy_sample(const struct nlattr *attr, +static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, int depth, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool log) @@ -1716,15 +1742,15 @@ static int validate_and_copy_sample(const struct nlattr *attr, start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE, log); if (start < 0) return start; - err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, - nla_data(probability), sizeof(u32), log); + err = ovs_nla_add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, + nla_data(probability), sizeof(u32), log); if (err) return err; st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS, log); if (st_acts < 0) return st_acts; - err = __ovs_nla_copy_actions(actions, key, depth + 1, sfa, + err = __ovs_nla_copy_actions(net, actions, key, depth + 1, sfa, eth_type, vlan_tci, log); if (err) return err; @@ -2058,7 +2084,7 @@ static int copy_action(const struct nlattr *from, return 0; } -static int __ovs_nla_copy_actions(const struct nlattr *attr, +static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, int depth, struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool log) @@ -2082,7 +2108,8 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, [OVS_ACTION_ATTR_SET] = (u32)-1, [OVS_ACTION_ATTR_SET_MASKED] = (u32)-1, [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, - [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash) + [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), + [OVS_ACTION_ATTR_CT] = (u32)-1, }; const struct ovs_action_push_vlan *vlan; int type = nla_type(a); @@ -2189,13 +2216,20 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, break; case OVS_ACTION_ATTR_SAMPLE: - err = validate_and_copy_sample(a, key, depth, sfa, + err = validate_and_copy_sample(net, a, key, depth, sfa, eth_type, vlan_tci, log); if (err) return err; skip_copy = true; break; + case OVS_ACTION_ATTR_CT: + err = ovs_ct_copy_action(net, a, key, sfa, log); + if (err) + return err; + skip_copy = true; + break; + default: OVS_NLERR(log, "Unknown Action type %d", type); return -EINVAL; @@ -2214,7 +2248,7 @@ static int __ovs_nla_copy_actions(const struct nlattr *attr, } /* 'key' must be the masked key. */ -int ovs_nla_copy_actions(const struct nlattr *attr, +int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, bool log) { @@ -2225,7 +2259,7 @@ int ovs_nla_copy_actions(const struct nlattr *attr, return PTR_ERR(*sfa); (*sfa)->orig_len = nla_len(attr); - err = __ovs_nla_copy_actions(attr, key, 0, sfa, key->eth.type, + err = __ovs_nla_copy_actions(net, attr, key, 0, sfa, key->eth.type, key->eth.tci, log); if (err) ovs_nla_free_flow_actions(*sfa); @@ -2350,6 +2384,13 @@ int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) if (err) return err; break; + + case OVS_ACTION_ATTR_CT: + err = ovs_ct_action_to_attr(nla_data(a), skb); + if (err) + return err; + break; + default: if (nla_put(skb, type, nla_len(a), nla_data(a))) return -EMSGSIZE; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index acd074408f0a..c0b484b237c9 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -62,9 +62,11 @@ int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, const struct sw_flow_key *key, bool log); u32 ovs_nla_get_ufid_flags(const struct nlattr *attr); -int ovs_nla_copy_actions(const struct nlattr *attr, +int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, struct sw_flow_actions **sfa, bool log); +int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype, + void *data, int len, bool log); int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index d73e5a16e7ca..e2dc9dac59e6 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -484,6 +484,7 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, OVS_CB(skb)->input_vport = vport; OVS_CB(skb)->egress_tun_info = NULL; + OVS_CB(skb)->mru = 0; /* Extract flow from 'skb' into 'key'. */ error = ovs_flow_key_extract(tun_info, skb, &key); if (unlikely(error)) { -- cgit v1.2.3 From 182e3042e15de759e81618d11fe4f62f5259d982 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 26 Aug 2015 11:31:49 -0700 Subject: openvswitch: Allow matching on conntrack mark Allow matching and setting the ct_mark field. As with ct_state and ct_zone, these fields are populated when the CT action is executed. To write to this field, a value and mask can be specified as a nested attribute under the CT action. This data is stored with the conntrack entry, and is executed after the lookup occurs for the CT action. The conntrack entry itself must be committed using the COMMIT flag in the CT action flags for this change to persist. Signed-off-by: Justin Pettit Signed-off-by: Joe Stringer Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 5 +++ net/openvswitch/actions.c | 1 + net/openvswitch/conntrack.c | 67 ++++++++++++++++++++++++++++++++++++++-- net/openvswitch/conntrack.h | 1 + net/openvswitch/flow.h | 1 + net/openvswitch/flow_netlink.c | 12 ++++++- 6 files changed, 83 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 55f599792673..7a185b554343 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -325,6 +325,7 @@ enum ovs_key_attr { * the accepted length of the array. */ OVS_KEY_ATTR_CT_STATE, /* u8 bitmask of OVS_CS_F_* */ OVS_KEY_ATTR_CT_ZONE, /* u16 connection tracking zone. */ + OVS_KEY_ATTR_CT_MARK, /* u32 connection tracking mark */ #ifdef __KERNEL__ OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */ @@ -613,11 +614,15 @@ struct ovs_action_hash { * enum ovs_ct_attr - Attributes for %OVS_ACTION_ATTR_CT action. * @OVS_CT_ATTR_FLAGS: u32 connection tracking flags. * @OVS_CT_ATTR_ZONE: u16 connection tracking zone. + * @OVS_CT_ATTR_MARK: u32 value followed by u32 mask. For each bit set in the + * mask, the corresponding bit in the value is copied to the connection + * tracking mark field in the connection. */ enum ovs_ct_attr { OVS_CT_ATTR_UNSPEC, OVS_CT_ATTR_FLAGS, /* u8 bitmask of OVS_CT_F_*. */ OVS_CT_ATTR_ZONE, /* u16 zone id. */ + OVS_CT_ATTR_MARK, /* mark to associate with this connection. */ __OVS_CT_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 72ca2c491b0a..9741d2c703df 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -968,6 +968,7 @@ static int execute_masked_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_CT_STATE: case OVS_KEY_ATTR_CT_ZONE: + case OVS_KEY_ATTR_CT_MARK: err = -EINVAL; break; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 1189fd50f1cf..e53dafccf21f 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -28,12 +28,19 @@ struct ovs_ct_len_tbl { size_t minlen; }; +/* Metadata mark for masked write to conntrack mark */ +struct md_mark { + u32 value; + u32 mask; +}; + /* Conntrack action context for execution. */ struct ovs_conntrack_info { struct nf_conntrack_zone zone; struct nf_conn *ct; u32 flags; u16 family; + struct md_mark mark; }; static u16 key_to_nfproto(const struct sw_flow_key *key) @@ -84,10 +91,12 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) } static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, - const struct nf_conntrack_zone *zone) + const struct nf_conntrack_zone *zone, + const struct nf_conn *ct) { key->ct.state = state; key->ct.zone = zone->id; + key->ct.mark = ct ? ct->mark : 0; } /* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has @@ -110,7 +119,7 @@ static void ovs_ct_update_key(const struct sk_buff *skb, } else if (post_ct) { state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; } - __ovs_ct_update_key(key, state, zone); + __ovs_ct_update_key(key, state, zone, ct); } void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key) @@ -127,6 +136,35 @@ int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, key->ct.zone)) return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark)) + return -EMSGSIZE; + + return 0; +} + +static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, + u32 ct_mark, u32 mask) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u32 new_mark; + + if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) + return -ENOTSUPP; + + /* The connection could be invalid, in which case set_mark is no-op. */ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return 0; + + new_mark = ct_mark | (ct->mark & ~(mask)); + if (ct->mark != new_mark) { + ct->mark = new_mark; + nf_conntrack_event_cache(IPCT_MARK, ct); + key->ct.mark = new_mark; + } + return 0; } @@ -247,7 +285,7 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, u8 state; state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; - __ovs_ct_update_key(key, state, &info->zone); + __ovs_ct_update_key(key, state, &info->zone, exp->master); } else { int err; @@ -310,7 +348,13 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, err = ovs_ct_commit(net, key, info, skb); else err = ovs_ct_lookup(net, key, info, skb); + if (err) + goto err; + if (info->mark.mask) + err = ovs_ct_set_mark(skb, key, info->mark.value, + info->mark.mask); +err: skb_push(skb, nh_ofs); return err; } @@ -320,6 +364,8 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { .maxlen = sizeof(u32) }, [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), .maxlen = sizeof(u16) }, + [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark), + .maxlen = sizeof(struct md_mark) }, }; static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, @@ -354,6 +400,14 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, case OVS_CT_ATTR_ZONE: info->zone.id = nla_get_u16(a); break; +#endif +#ifdef CONFIG_NF_CONNTRACK_MARK + case OVS_CT_ATTR_MARK: { + struct md_mark *mark = nla_data(a); + + info->mark = *mark; + break; + } #endif default: OVS_NLERR(log, "Unknown conntrack attr (%d)", @@ -377,6 +431,9 @@ bool ovs_ct_verify(enum ovs_key_attr attr) if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && attr == OVS_KEY_ATTR_CT_ZONE) return true; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + attr == OVS_KEY_ATTR_CT_MARK) + return true; return false; } @@ -439,6 +496,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id)) return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + nla_put(skb, OVS_CT_ATTR_MARK, sizeof(ct_info->mark), + &ct_info->mark)) + return -EMSGSIZE; nla_nest_end(skb, start); diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index e812ee64a718..87b289c58978 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -65,6 +65,7 @@ static inline void ovs_ct_fill_key(const struct sk_buff *skb, { key->ct.state = 0; key->ct.zone = 0; + key->ct.mark = 0; } static inline int ovs_ct_put_key(const struct sw_flow_key *key, diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 312c7d755b9b..e05e69711ce1 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -114,6 +114,7 @@ struct sw_flow_key { struct { /* Connection tracking fields. */ u16 zone; + u32 mark; u8 state; } ct; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 4e795b289eb7..b17a4ec348f9 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -281,7 +281,7 @@ size_t ovs_key_attr_size(void) /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. */ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 24); + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 25); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -292,6 +292,7 @@ size_t ovs_key_attr_size(void) + nla_total_size(4) /* OVS_KEY_ATTR_RECIRC_ID */ + nla_total_size(1) /* OVS_KEY_ATTR_CT_STATE */ + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */ + + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ @@ -343,6 +344,7 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_MPLS] = { .len = sizeof(struct ovs_key_mpls) }, [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u8) }, [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, + [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, }; static bool is_all_zero(const u8 *fp, size_t size) @@ -787,6 +789,13 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask); *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE); } + if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) && + ovs_ct_verify(OVS_KEY_ATTR_CT_MARK)) { + u32 mark = nla_get_u32(a[OVS_KEY_ATTR_CT_MARK]); + + SW_FLOW_KEY_PUT(match, ct.mark, mark, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_MARK); + } return 0; } @@ -1919,6 +1928,7 @@ static int validate_set(const struct nlattr *a, case OVS_KEY_ATTR_PRIORITY: case OVS_KEY_ATTR_SKB_MARK: + case OVS_KEY_ATTR_CT_MARK: case OVS_KEY_ATTR_ETHERNET: break; -- cgit v1.2.3 From 55e5713f2b5cefed98e3674017e994d688e47b85 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 26 Aug 2015 11:31:50 -0700 Subject: netfilter: Always export nf_connlabels_replace() The following patches will reuse this code from OVS. Signed-off-by: Joe Stringer Acked-by: Pravin B Shelar Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_labels.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c index bb53f120e79c..daa7c1383bec 100644 --- a/net/netfilter/nf_conntrack_labels.c +++ b/net/netfilter/nf_conntrack_labels.c @@ -48,7 +48,6 @@ int nf_connlabel_set(struct nf_conn *ct, u16 bit) } EXPORT_SYMBOL_GPL(nf_connlabel_set); -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) static void replace_u32(u32 *address, u32 mask, u32 new) { u32 old, tmp; @@ -89,7 +88,6 @@ int nf_connlabels_replace(struct nf_conn *ct, return 0; } EXPORT_SYMBOL_GPL(nf_connlabels_replace); -#endif static struct nf_ct_ext_type labels_extend __read_mostly = { .len = sizeof(struct nf_conn_labels), -- cgit v1.2.3 From 86ca02e77408bb58ba596c1a411ec7f631733690 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 26 Aug 2015 11:31:51 -0700 Subject: netfilter: connlabels: Export setting connlabel length Add functions to change connlabel length into nf_conntrack_labels.c so they may be reused by other modules like OVS and nftables without needing to jump through xt_match_check() hoops. Suggested-by: Florian Westphal Signed-off-by: Joe Stringer Acked-by: Florian Westphal Acked-by: Thomas Graf Signed-off-by: David S. Miller --- include/net/netfilter/nf_conntrack_labels.h | 4 ++++ net/netfilter/nf_conntrack_labels.c | 32 +++++++++++++++++++++++++++++ net/netfilter/xt_connlabel.c | 16 ++++----------- 3 files changed, 40 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h index dec6336bf850..7e2b1d025f50 100644 --- a/include/net/netfilter/nf_conntrack_labels.h +++ b/include/net/netfilter/nf_conntrack_labels.h @@ -54,7 +54,11 @@ int nf_connlabels_replace(struct nf_conn *ct, #ifdef CONFIG_NF_CONNTRACK_LABELS int nf_conntrack_labels_init(void); void nf_conntrack_labels_fini(void); +int nf_connlabels_get(struct net *net, unsigned int n_bits); +void nf_connlabels_put(struct net *net); #else static inline int nf_conntrack_labels_init(void) { return 0; } static inline void nf_conntrack_labels_fini(void) {} +static inline int nf_connlabels_get(struct net *net, unsigned int n_bits) { return 0; } +static inline void nf_connlabels_put(struct net *net) {} #endif diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c index daa7c1383bec..3ce5c314ea4b 100644 --- a/net/netfilter/nf_conntrack_labels.c +++ b/net/netfilter/nf_conntrack_labels.c @@ -14,6 +14,8 @@ #include #include +static spinlock_t nf_connlabels_lock; + static unsigned int label_bits(const struct nf_conn_labels *l) { unsigned int longs = l->words; @@ -89,6 +91,35 @@ int nf_connlabels_replace(struct nf_conn *ct, } EXPORT_SYMBOL_GPL(nf_connlabels_replace); +int nf_connlabels_get(struct net *net, unsigned int n_bits) +{ + size_t words; + + if (n_bits > (NF_CT_LABELS_MAX_SIZE * BITS_PER_BYTE)) + return -ERANGE; + + words = BITS_TO_LONGS(n_bits); + + spin_lock(&nf_connlabels_lock); + net->ct.labels_used++; + if (words > net->ct.label_words) + net->ct.label_words = words; + spin_unlock(&nf_connlabels_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_connlabels_get); + +void nf_connlabels_put(struct net *net) +{ + spin_lock(&nf_connlabels_lock); + net->ct.labels_used--; + if (net->ct.labels_used == 0) + net->ct.label_words = 0; + spin_unlock(&nf_connlabels_lock); +} +EXPORT_SYMBOL_GPL(nf_connlabels_put); + static struct nf_ct_ext_type labels_extend __read_mostly = { .len = sizeof(struct nf_conn_labels), .align = __alignof__(struct nf_conn_labels), @@ -97,6 +128,7 @@ static struct nf_ct_ext_type labels_extend __read_mostly = { int nf_conntrack_labels_init(void) { + spin_lock_init(&nf_connlabels_lock); return nf_ct_extend_register(&labels_extend); } diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c index 9f8719df2001..bb9cbeb18868 100644 --- a/net/netfilter/xt_connlabel.c +++ b/net/netfilter/xt_connlabel.c @@ -42,10 +42,6 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par) XT_CONNLABEL_OP_SET; struct xt_connlabel_mtinfo *info = par->matchinfo; int ret; - size_t words; - - if (info->bit > XT_CONNLABEL_MAXBIT) - return -ERANGE; if (info->options & ~options) { pr_err("Unknown options in mask %x\n", info->options); @@ -59,19 +55,15 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par) return ret; } - par->net->ct.labels_used++; - words = BITS_TO_LONGS(info->bit+1); - if (words > par->net->ct.label_words) - par->net->ct.label_words = words; - + ret = nf_connlabels_get(par->net, info->bit + 1); + if (ret < 0) + nf_ct_l3proto_module_put(par->family); return ret; } static void connlabel_mt_destroy(const struct xt_mtdtor_param *par) { - par->net->ct.labels_used--; - if (par->net->ct.labels_used == 0) - par->net->ct.label_words = 0; + nf_connlabels_put(par->net); nf_ct_l3proto_module_put(par->family); } -- cgit v1.2.3 From c2ac667358708d7cce64c78f58af6adf4c1e848b Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 26 Aug 2015 11:31:52 -0700 Subject: openvswitch: Allow matching on conntrack label Allow matching and setting the ct_label field. As with ct_mark, this is populated by executing the CT action. The label field may be modified by specifying a label and mask nested under the CT action. It is stored as metadata attached to the connection. Label modification occurs after lookup, and will only persist when the conntrack entry is committed by providing the COMMIT flag to the CT action. Labels are currently fixed to 128 bits in size. Signed-off-by: Joe Stringer Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 10 +++ net/openvswitch/actions.c | 1 + net/openvswitch/conntrack.c | 128 ++++++++++++++++++++++++++++++++++++++- net/openvswitch/conntrack.h | 11 +++- net/openvswitch/datapath.c | 18 +++--- net/openvswitch/datapath.h | 3 + net/openvswitch/flow.c | 4 +- net/openvswitch/flow.h | 3 +- net/openvswitch/flow_netlink.c | 46 +++++++++----- net/openvswitch/flow_netlink.h | 9 +-- 10 files changed, 199 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 7a185b554343..9d52058a9330 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -326,6 +326,7 @@ enum ovs_key_attr { OVS_KEY_ATTR_CT_STATE, /* u8 bitmask of OVS_CS_F_* */ OVS_KEY_ATTR_CT_ZONE, /* u16 connection tracking zone. */ OVS_KEY_ATTR_CT_MARK, /* u32 connection tracking mark */ + OVS_KEY_ATTR_CT_LABEL, /* 16-octet connection tracking label */ #ifdef __KERNEL__ OVS_KEY_ATTR_TUNNEL_INFO, /* struct ip_tunnel_info */ @@ -438,6 +439,11 @@ struct ovs_key_nd { __u8 nd_tll[ETH_ALEN]; }; +#define OVS_CT_LABEL_LEN 16 +struct ovs_key_ct_label { + __u8 ct_label[OVS_CT_LABEL_LEN]; +}; + /* OVS_KEY_ATTR_CT_STATE flags */ #define OVS_CS_F_NEW 0x01 /* Beginning of a new connection. */ #define OVS_CS_F_ESTABLISHED 0x02 /* Part of an existing connection. */ @@ -617,12 +623,16 @@ struct ovs_action_hash { * @OVS_CT_ATTR_MARK: u32 value followed by u32 mask. For each bit set in the * mask, the corresponding bit in the value is copied to the connection * tracking mark field in the connection. + * @OVS_CT_ATTR_LABEL: %OVS_CT_LABEL_LEN value followed by %OVS_CT_LABEL_LEN + * mask. For each bit set in the mask, the corresponding bit in the value is + * copied to the connection tracking label field in the connection. */ enum ovs_ct_attr { OVS_CT_ATTR_UNSPEC, OVS_CT_ATTR_FLAGS, /* u8 bitmask of OVS_CT_F_*. */ OVS_CT_ATTR_ZONE, /* u16 zone id. */ OVS_CT_ATTR_MARK, /* mark to associate with this connection. */ + OVS_CT_ATTR_LABEL, /* label to associate with this connection. */ __OVS_CT_ATTR_MAX }; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 9741d2c703df..736a113a75c3 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -969,6 +969,7 @@ static int execute_masked_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_CT_STATE: case OVS_KEY_ATTR_CT_ZONE: case OVS_KEY_ATTR_CT_MARK: + case OVS_KEY_ATTR_CT_LABEL: err = -EINVAL; break; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e53dafccf21f..a0417fb33d1d 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -34,6 +35,12 @@ struct md_mark { u32 mask; }; +/* Metadata label for masked write to conntrack label. */ +struct md_label { + struct ovs_key_ct_label value; + struct ovs_key_ct_label mask; +}; + /* Conntrack action context for execution. */ struct ovs_conntrack_info { struct nf_conntrack_zone zone; @@ -41,6 +48,7 @@ struct ovs_conntrack_info { u32 flags; u16 family; struct md_mark mark; + struct md_label label; }; static u16 key_to_nfproto(const struct sw_flow_key *key) @@ -90,6 +98,24 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) return ct_state; } +static void ovs_ct_get_label(const struct nf_conn *ct, + struct ovs_key_ct_label *label) +{ + struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL; + + if (cl) { + size_t len = cl->words * sizeof(long); + + if (len > OVS_CT_LABEL_LEN) + len = OVS_CT_LABEL_LEN; + else if (len < OVS_CT_LABEL_LEN) + memset(label, 0, OVS_CT_LABEL_LEN); + memcpy(label, cl->bits, len); + } else { + memset(label, 0, OVS_CT_LABEL_LEN); + } +} + static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, const struct nf_conntrack_zone *zone, const struct nf_conn *ct) @@ -97,6 +123,7 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, key->ct.state = state; key->ct.zone = zone->id; key->ct.mark = ct ? ct->mark : 0; + ovs_ct_get_label(ct, &key->ct.label); } /* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has @@ -140,6 +167,11 @@ int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark)) return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABEL) && + nla_put(skb, OVS_KEY_ATTR_CT_LABEL, sizeof(key->ct.label), + &key->ct.label)) + return -EMSGSIZE; + return 0; } @@ -168,6 +200,40 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, return 0; } +static int ovs_ct_set_label(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_key_ct_label *label, + const struct ovs_key_ct_label *mask) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn_labels *cl; + struct nf_conn *ct; + int err; + + if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) + return -ENOTSUPP; + + /* The connection could be invalid, in which case set_label is no-op.*/ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return 0; + + cl = nf_ct_labels_find(ct); + if (!cl) { + nf_ct_labels_ext_add(ct); + cl = nf_ct_labels_find(ct); + } + if (!cl || cl->words * sizeof(long) < OVS_CT_LABEL_LEN) + return -ENOSPC; + + err = nf_connlabels_replace(ct, (u32 *)label, (u32 *)mask, + OVS_CT_LABEL_LEN / sizeof(u32)); + if (err) + return err; + + ovs_ct_get_label(ct, &key->ct.label); + return 0; +} + static int handle_fragments(struct net *net, struct sw_flow_key *key, u16 zone, struct sk_buff *skb) { @@ -327,6 +393,17 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, return 0; } +static bool label_nonzero(const struct ovs_key_ct_label *label) +{ + size_t i; + + for (i = 0; i < sizeof(*label); i++) + if (label->ct_label[i]) + return true; + + return false; +} + int ovs_ct_execute(struct net *net, struct sk_buff *skb, struct sw_flow_key *key, const struct ovs_conntrack_info *info) @@ -351,9 +428,15 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb, if (err) goto err; - if (info->mark.mask) + if (info->mark.mask) { err = ovs_ct_set_mark(skb, key, info->mark.value, info->mark.mask); + if (err) + goto err; + } + if (label_nonzero(&info->label.mask)) + err = ovs_ct_set_label(skb, key, &info->label.value, + &info->label.mask); err: skb_push(skb, nh_ofs); return err; @@ -366,6 +449,8 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { .maxlen = sizeof(u16) }, [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark), .maxlen = sizeof(struct md_mark) }, + [OVS_CT_ATTR_LABEL] = { .minlen = sizeof(struct md_label), + .maxlen = sizeof(struct md_label) }, }; static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, @@ -408,6 +493,14 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, info->mark = *mark; break; } +#endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case OVS_CT_ATTR_LABEL: { + struct md_label *label = nla_data(a); + + info->label = *label; + break; + } #endif default: OVS_NLERR(log, "Unknown conntrack attr (%d)", @@ -424,7 +517,7 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, return 0; } -bool ovs_ct_verify(enum ovs_key_attr attr) +bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr) { if (attr == OVS_KEY_ATTR_CT_STATE) return true; @@ -434,6 +527,12 @@ bool ovs_ct_verify(enum ovs_key_attr attr) if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && attr == OVS_KEY_ATTR_CT_MARK) return true; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + attr == OVS_KEY_ATTR_CT_LABEL) { + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + return ovs_net->xt_label; + } return false; } @@ -500,6 +599,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, nla_put(skb, OVS_CT_ATTR_MARK, sizeof(ct_info->mark), &ct_info->mark)) return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + nla_put(skb, OVS_CT_ATTR_LABEL, sizeof(ct_info->label), + &ct_info->label)) + return -EMSGSIZE; nla_nest_end(skb, start); @@ -513,3 +616,24 @@ void ovs_ct_free_action(const struct nlattr *a) if (ct_info->ct) nf_ct_put(ct_info->ct); } + +void ovs_ct_init(struct net *net) +{ + unsigned int n_bits = sizeof(struct ovs_key_ct_label) * BITS_PER_BYTE; + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + if (nf_connlabels_get(net, n_bits)) { + ovs_net->xt_label = false; + OVS_NLERR(true, "Failed to set connlabel length"); + } else { + ovs_net->xt_label = true; + } +} + +void ovs_ct_exit(struct net *net) +{ + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + if (ovs_net->xt_label) + nf_connlabels_put(net); +} diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h index 87b289c58978..3cb30667a7dc 100644 --- a/net/openvswitch/conntrack.h +++ b/net/openvswitch/conntrack.h @@ -20,7 +20,9 @@ struct ovs_conntrack_info; enum ovs_key_attr; #if defined(CONFIG_OPENVSWITCH_CONNTRACK) -bool ovs_ct_verify(enum ovs_key_attr attr); +void ovs_ct_init(struct net *); +void ovs_ct_exit(struct net *); +bool ovs_ct_verify(struct net *, enum ovs_key_attr attr); int ovs_ct_copy_action(struct net *, const struct nlattr *, const struct sw_flow_key *, struct sw_flow_actions **, bool log); @@ -35,7 +37,11 @@ void ovs_ct_free_action(const struct nlattr *a); #else #include -static inline bool ovs_ct_verify(int attr) +static inline void ovs_ct_init(struct net *net) { } + +static inline void ovs_ct_exit(struct net *net) { } + +static inline bool ovs_ct_verify(struct net *net, int attr) { return false; } @@ -66,6 +72,7 @@ static inline void ovs_ct_fill_key(const struct sk_buff *skb, key->ct.state = 0; key->ct.zone = 0; key->ct.mark = 0; + memset(&key->ct.label, 0, sizeof(key->ct.label)); } static inline int ovs_ct_put_key(const struct sw_flow_key *key, diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 72e63726efa0..ec0f8d9cee73 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -599,8 +599,8 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(flow)) goto err_kfree_skb; - err = ovs_flow_key_extract_userspace(a[OVS_PACKET_ATTR_KEY], packet, - &flow->key, log); + err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY], + packet, &flow->key, log); if (err) goto err_flow_free; @@ -947,7 +947,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) /* Extract key. */ ovs_match_init(&match, &key, &mask); - error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], + error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto err_kfree_flow; @@ -1118,7 +1118,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); ovs_match_init(&match, &key, &mask); - error = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], + error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) goto error; @@ -1208,6 +1208,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; + struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; struct sw_flow *flow; @@ -1222,7 +1223,7 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(&match, &key, NULL); - err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, + err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, log); } else if (!ufid_present) { OVS_NLERR(log, @@ -1266,6 +1267,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct ovs_header *ovs_header = info->userhdr; + struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; struct sw_flow *flow = NULL; @@ -1280,8 +1282,8 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); if (a[OVS_FLOW_ATTR_KEY]) { ovs_match_init(&match, &key, NULL); - err = ovs_nla_get_match(&match, a[OVS_FLOW_ATTR_KEY], NULL, - log); + err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], + NULL, log); if (unlikely(err)) return err; } @@ -2237,6 +2239,7 @@ static int __net_init ovs_init_net(struct net *net) INIT_LIST_HEAD(&ovs_net->dps); INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); + ovs_ct_init(net); return 0; } @@ -2271,6 +2274,7 @@ static void __net_exit ovs_exit_net(struct net *dnet) struct net *net; LIST_HEAD(head); + ovs_ct_exit(dnet); ovs_lock(); list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) __dp_destroy(dp); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index d24ba98024be..4e785ab88973 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -138,6 +138,9 @@ struct ovs_net { struct list_head dps; struct work_struct dp_notify_work; struct vport_net vport_net; + + /* Module reference for configuring conntrack. */ + bool xt_label; }; extern int ovs_net_id; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 376ca8738fd4..5a3195e538ce 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -715,7 +715,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, return key_extract(skb, key); } -int ovs_flow_key_extract_userspace(const struct nlattr *attr, +int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, struct sk_buff *skb, struct sw_flow_key *key, bool log) { @@ -724,7 +724,7 @@ int ovs_flow_key_extract_userspace(const struct nlattr *attr, memset(key, 0, OVS_SW_FLOW_KEY_METADATA_SIZE); /* Extract metadata from netlink attributes. */ - err = ovs_nla_get_flow_metadata(attr, key, log); + err = ovs_nla_get_flow_metadata(net, attr, key, log); if (err) return err; diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index e05e69711ce1..fe527d2dd4b7 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -116,6 +116,7 @@ struct sw_flow_key { u16 zone; u32 mark; u8 state; + struct ovs_key_ct_label label; } ct; } __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ @@ -220,7 +221,7 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key); /* Extract key from packet coming from userspace. */ -int ovs_flow_key_extract_userspace(const struct nlattr *attr, +int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, struct sk_buff *skb, struct sw_flow_key *key, bool log); diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index b17a4ec348f9..e22c5bfe8575 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -281,7 +281,7 @@ size_t ovs_key_attr_size(void) /* Whenever adding new OVS_KEY_ FIELDS, we should consider * updating this function. */ - BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 25); + BUILD_BUG_ON(OVS_KEY_ATTR_TUNNEL_INFO != 26); return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ @@ -293,6 +293,7 @@ size_t ovs_key_attr_size(void) + nla_total_size(1) /* OVS_KEY_ATTR_CT_STATE */ + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */ + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ + + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABEL */ + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ @@ -345,6 +346,7 @@ static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u8) }, [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_CT_LABEL] = { .len = sizeof(struct ovs_key_ct_label) }, }; static bool is_all_zero(const u8 *fp, size_t size) @@ -721,9 +723,9 @@ int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, egress_tun_info->options_len); } -static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, - const struct nlattr **a, bool is_mask, - bool log) +static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, + u64 *attrs, const struct nlattr **a, + bool is_mask, bool log) { if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) { u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]); @@ -776,36 +778,45 @@ static int metadata_from_nlattrs(struct sw_flow_match *match, u64 *attrs, } if (*attrs & (1 << OVS_KEY_ATTR_CT_STATE) && - ovs_ct_verify(OVS_KEY_ATTR_CT_STATE)) { + ovs_ct_verify(net, OVS_KEY_ATTR_CT_STATE)) { u8 ct_state = nla_get_u8(a[OVS_KEY_ATTR_CT_STATE]); SW_FLOW_KEY_PUT(match, ct.state, ct_state, is_mask); *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE); } if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) && - ovs_ct_verify(OVS_KEY_ATTR_CT_ZONE)) { + ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) { u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]); SW_FLOW_KEY_PUT(match, ct.zone, ct_zone, is_mask); *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE); } if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) && - ovs_ct_verify(OVS_KEY_ATTR_CT_MARK)) { + ovs_ct_verify(net, OVS_KEY_ATTR_CT_MARK)) { u32 mark = nla_get_u32(a[OVS_KEY_ATTR_CT_MARK]); SW_FLOW_KEY_PUT(match, ct.mark, mark, is_mask); *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_MARK); } + if (*attrs & (1 << OVS_KEY_ATTR_CT_LABEL) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_LABEL)) { + const struct ovs_key_ct_label *cl; + + cl = nla_data(a[OVS_KEY_ATTR_CT_LABEL]); + SW_FLOW_KEY_MEMCPY(match, ct.label, cl->ct_label, + sizeof(*cl), is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABEL); + } return 0; } -static int ovs_key_from_nlattrs(struct sw_flow_match *match, u64 attrs, - const struct nlattr **a, bool is_mask, - bool log) +static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, + u64 attrs, const struct nlattr **a, + bool is_mask, bool log) { int err; - err = metadata_from_nlattrs(match, &attrs, a, is_mask, log); + err = metadata_from_nlattrs(net, match, &attrs, a, is_mask, log); if (err) return err; @@ -1057,6 +1068,7 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val) * mask. In case the 'mask' is NULL, the flow is treated as exact match * flow. Otherwise, it is treated as a wildcarded flow, except the mask * does not include any don't care bit. + * @net: Used to determine per-namespace field support. * @match: receives the extracted flow match information. * @key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute * sequence. The fields should of the packet that triggered the creation @@ -1067,7 +1079,7 @@ static void mask_set_nlattr(struct nlattr *attr, u8 val) * probing for feature compatibility this should be passed in as false to * suppress unnecessary error logging. */ -int ovs_nla_get_match(struct sw_flow_match *match, +int ovs_nla_get_match(struct net *net, struct sw_flow_match *match, const struct nlattr *nla_key, const struct nlattr *nla_mask, bool log) @@ -1117,7 +1129,7 @@ int ovs_nla_get_match(struct sw_flow_match *match, } } - err = ovs_key_from_nlattrs(match, key_attrs, a, false, log); + err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log); if (err) return err; @@ -1197,7 +1209,8 @@ int ovs_nla_get_match(struct sw_flow_match *match, } } - err = ovs_key_from_nlattrs(match, mask_attrs, a, true, log); + err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true, + log); if (err) goto free_newmask; } @@ -1278,7 +1291,7 @@ u32 ovs_nla_get_ufid_flags(const struct nlattr *attr) * extracted from the packet itself. */ -int ovs_nla_get_flow_metadata(const struct nlattr *attr, +int ovs_nla_get_flow_metadata(struct net *net, const struct nlattr *attr, struct sw_flow_key *key, bool log) { @@ -1297,7 +1310,7 @@ int ovs_nla_get_flow_metadata(const struct nlattr *attr, memset(&key->ct, 0, sizeof(key->ct)); key->phy.in_port = DP_MAX_PORTS; - return metadata_from_nlattrs(&match, &attrs, a, false, log); + return metadata_from_nlattrs(net, &match, &attrs, a, false, log); } static int __ovs_nla_put_key(const struct sw_flow_key *swkey, @@ -1929,6 +1942,7 @@ static int validate_set(const struct nlattr *a, case OVS_KEY_ATTR_PRIORITY: case OVS_KEY_ATTR_SKB_MARK: case OVS_KEY_ATTR_CT_MARK: + case OVS_KEY_ATTR_CT_LABEL: case OVS_KEY_ATTR_ETHERNET: break; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index c0b484b237c9..07878e22e783 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -45,15 +45,16 @@ void ovs_match_init(struct sw_flow_match *match, int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, int attr, bool is_mask, struct sk_buff *); -int ovs_nla_get_flow_metadata(const struct nlattr *, struct sw_flow_key *, - bool log); +int ovs_nla_get_flow_metadata(struct net *, const struct nlattr *, + struct sw_flow_key *, bool log); int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb); int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb); -int ovs_nla_get_match(struct sw_flow_match *, const struct nlattr *key, - const struct nlattr *mask, bool log); +int ovs_nla_get_match(struct net *, struct sw_flow_match *, + const struct nlattr *key, const struct nlattr *mask, + bool log); int ovs_nla_put_egress_tunnel_key(struct sk_buff *, const struct ip_tunnel_info *); -- cgit v1.2.3 From cae3a2627520c3795b54533c5328b77af3405dbe Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Wed, 26 Aug 2015 11:31:53 -0700 Subject: openvswitch: Allow attaching helpers to ct action Add support for using conntrack helpers to assist protocol detection. The new OVS_CT_ATTR_HELPER attribute of the CT action specifies a helper to be used for this connection. If no helper is specified, then helpers will be automatically applied as per the sysctl configuration of net.netfilter.nf_conntrack_helper. The helper may be specified as part of the conntrack action, eg: ct(helper=ftp). Initial packets for related connections should be committed to allow later packets for the flow to be considered established. Example ovs-ofctl flows allowing FTP connections from ports 1->2: in_port=1,tcp,action=ct(helper=ftp,commit),2 in_port=2,tcp,ct_state=-trk,action=ct(recirc) in_port=2,tcp,ct_state=+trk-new+est,action=1 in_port=2,tcp,ct_state=+trk+rel,action=1 Signed-off-by: Joe Stringer Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 3 ++ net/openvswitch/conntrack.c | 109 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 110 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 9d52058a9330..32e07d8cbaf4 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -626,6 +626,7 @@ struct ovs_action_hash { * @OVS_CT_ATTR_LABEL: %OVS_CT_LABEL_LEN value followed by %OVS_CT_LABEL_LEN * mask. For each bit set in the mask, the corresponding bit in the value is * copied to the connection tracking label field in the connection. + * @OVS_CT_ATTR_HELPER: variable length string defining conntrack ALG. */ enum ovs_ct_attr { OVS_CT_ATTR_UNSPEC, @@ -633,6 +634,8 @@ enum ovs_ct_attr { OVS_CT_ATTR_ZONE, /* u16 zone id. */ OVS_CT_ATTR_MARK, /* mark to associate with this connection. */ OVS_CT_ATTR_LABEL, /* label to associate with this connection. */ + OVS_CT_ATTR_HELPER, /* netlink helper to assist detection of + related connections. */ __OVS_CT_ATTR_MAX }; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index a0417fb33d1d..890d3eedb447 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,7 @@ struct md_label { /* Conntrack action context for execution. */ struct ovs_conntrack_info { + struct nf_conntrack_helper *helper; struct nf_conntrack_zone zone; struct nf_conn *ct; u32 flags; @@ -234,6 +236,51 @@ static int ovs_ct_set_label(struct sk_buff *skb, struct sw_flow_key *key, return 0; } +/* 'skb' should already be pulled to nh_ofs. */ +static int ovs_ct_helper(struct sk_buff *skb, u16 proto) +{ + const struct nf_conntrack_helper *helper; + const struct nf_conn_help *help; + enum ip_conntrack_info ctinfo; + unsigned int protoff; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; + + help = nfct_help(ct); + if (!help) + return NF_ACCEPT; + + helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + + switch (proto) { + case NFPROTO_IPV4: + protoff = ip_hdrlen(skb); + break; + case NFPROTO_IPV6: { + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + __be16 frag_off; + + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + return NF_ACCEPT; + } + break; + } + default: + WARN_ONCE(1, "helper invoked on non-IP family!"); + return NF_DROP; + } + + return helper->help(skb, protoff, ct, ctinfo); +} + static int handle_fragments(struct net *net, struct sw_flow_key *key, u16 zone, struct sk_buff *skb) { @@ -306,6 +353,13 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb, return false; if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct))) return false; + if (info->helper) { + struct nf_conn_help *help; + + help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER); + if (help && rcu_access_pointer(help->helper) != info->helper) + return false; + } return true; } @@ -334,6 +388,11 @@ static int __ovs_ct_lookup(struct net *net, const struct sw_flow_key *key, if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING, skb) != NF_ACCEPT) return -ENOENT; + + if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) { + WARN_ONCE(1, "helper rejected packet"); + return -EINVAL; + } } return 0; @@ -442,6 +501,30 @@ err: return err; } +static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, + const struct sw_flow_key *key, bool log) +{ + struct nf_conntrack_helper *helper; + struct nf_conn_help *help; + + helper = nf_conntrack_helper_try_module_get(name, info->family, + key->ip.proto); + if (!helper) { + OVS_NLERR(log, "Unknown helper \"%s\"", name); + return -EINVAL; + } + + help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL); + if (!help) { + module_put(helper->me); + return -ENOMEM; + } + + rcu_assign_pointer(help->helper, helper); + info->helper = helper; + return 0; +} + static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { [OVS_CT_ATTR_FLAGS] = { .minlen = sizeof(u32), .maxlen = sizeof(u32) }, @@ -451,10 +534,12 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { .maxlen = sizeof(struct md_mark) }, [OVS_CT_ATTR_LABEL] = { .minlen = sizeof(struct md_label), .maxlen = sizeof(struct md_label) }, + [OVS_CT_ATTR_HELPER] = { .minlen = 1, + .maxlen = NF_CT_HELPER_NAME_LEN } }; static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, - bool log) + const char **helper, bool log) { struct nlattr *a; int rem; @@ -502,6 +587,13 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, break; } #endif + case OVS_CT_ATTR_HELPER: + *helper = nla_data(a); + if (!memchr(*helper, '\0', nla_len(a))) { + OVS_NLERR(log, "Invalid conntrack helper"); + return -EINVAL; + } + break; default: OVS_NLERR(log, "Unknown conntrack attr (%d)", type); @@ -542,6 +634,7 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, struct sw_flow_actions **sfa, bool log) { struct ovs_conntrack_info ct_info; + const char *helper = NULL; u16 family; int err; @@ -557,7 +650,7 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, nf_ct_zone_init(&ct_info.zone, NF_CT_DEFAULT_ZONE_ID, NF_CT_DEFAULT_ZONE_DIR, 0); - err = parse_ct(attr, &ct_info, log); + err = parse_ct(attr, &ct_info, &helper, log); if (err) return err; @@ -567,6 +660,11 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, OVS_NLERR(log, "Failed to allocate conntrack template"); return -ENOMEM; } + if (helper) { + err = ovs_ct_add_helper(&ct_info, helper, key, log); + if (err) + goto err_free_ct; + } err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info, sizeof(ct_info), log); @@ -603,6 +701,11 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, nla_put(skb, OVS_CT_ATTR_LABEL, sizeof(ct_info->label), &ct_info->label)) return -EMSGSIZE; + if (ct_info->helper) { + if (nla_put_string(skb, OVS_CT_ATTR_HELPER, + ct_info->helper->name)) + return -EMSGSIZE; + } nla_nest_end(skb, start); @@ -613,6 +716,8 @@ void ovs_ct_free_action(const struct nlattr *a) { struct ovs_conntrack_info *ct_info = nla_data(a); + if (ct_info->helper) + module_put(ct_info->helper->me); if (ct_info->ct) nf_ct_put(ct_info->ct); } -- cgit v1.2.3 From 3b3ae880266d148bf73a573a766bc9b78c08d805 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 26 Aug 2015 23:00:06 +0200 Subject: net: sched: consolidate tc_classify{,_compat} For classifiers getting invoked via tc_classify(), we always need an extra function call into tc_classify_compat(), as both are being exported as symbols and tc_classify() itself doesn't do much except handling of reclassifications when tp->classify() returned with TC_ACT_RECLASSIFY. CBQ and ATM are the only qdiscs that directly call into tc_classify_compat(), all others use tc_classify(). When tc actions are being configured out in the kernel, tc_classify() effectively does nothing besides delegating. We could spare this layer and consolidate both functions. pktgen on single CPU constantly pushing skbs directly into the netif_receive_skb() path with a dummy classifier on ingress qdisc attached, improves slightly from 22.3Mpps to 23.1Mpps. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 4 +--- net/core/dev.c | 2 +- net/sched/sch_api.c | 55 ++++++++++++++++++++++-------------------------- net/sched/sch_atm.c | 2 +- net/sched/sch_cbq.c | 2 +- net/sched/sch_choke.c | 2 +- net/sched/sch_drr.c | 2 +- net/sched/sch_dsmark.c | 2 +- net/sched/sch_fq_codel.c | 2 +- net/sched/sch_hfsc.c | 2 +- net/sched/sch_htb.c | 2 +- net/sched/sch_multiq.c | 2 +- net/sched/sch_prio.c | 2 +- net/sched/sch_qfq.c | 2 +- net/sched/sch_sfb.c | 2 +- net/sched/sch_sfq.c | 2 +- 16 files changed, 40 insertions(+), 47 deletions(-) (limited to 'net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index 2342bf12cb78..401038d2f9b8 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -110,10 +110,8 @@ static inline void qdisc_run(struct Qdisc *q) __qdisc_run(q); } -int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res); int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res); + struct tcf_result *res, bool compat_mode); static inline __be16 tc_skb_protocol(const struct sk_buff *skb) { diff --git a/net/core/dev.c b/net/core/dev.c index b1f3f4844e60..7bb24f1879b8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3657,7 +3657,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); qdisc_bstats_cpu_update(cl->q, skb); - switch (tc_classify(skb, cl, &cl_res)) { + switch (tc_classify(skb, cl, &cl_res, false)) { case TC_ACT_OK: case TC_ACT_RECLASSIFY: skb->tc_index = TC_H_MIN(cl_res.classid); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index f06aa01d60fd..59c227f26b56 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1806,51 +1806,46 @@ done: * to this qdisc, (optionally) tests for protocol and asks * specific classifiers. */ -int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) +int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res, bool compat_mode) { __be16 protocol = tc_skb_protocol(skb); - int err; +#ifdef CONFIG_NET_CLS_ACT + const struct tcf_proto *old_tp = tp; + int limit = 0; +reclassify: +#endif for (; tp; tp = rcu_dereference_bh(tp->next)) { + int err; + if (tp->protocol != protocol && tp->protocol != htons(ETH_P_ALL)) continue; - err = tp->classify(skb, tp, res); + err = tp->classify(skb, tp, res); +#ifdef CONFIG_NET_CLS_ACT + if (unlikely(err == TC_ACT_RECLASSIFY && + !compat_mode)) + goto reset; +#endif if (err >= 0) return err; } - return -1; -} -EXPORT_SYMBOL(tc_classify_compat); -int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) -{ - int err = 0; -#ifdef CONFIG_NET_CLS_ACT - const struct tcf_proto *otp = tp; - int limit = 0; -reclassify: -#endif - - err = tc_classify_compat(skb, tp, res); + return -1; #ifdef CONFIG_NET_CLS_ACT - if (err == TC_ACT_RECLASSIFY) { - tp = otp; - - if (unlikely(limit++ >= MAX_REC_LOOP)) { - net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n", - tp->q->ops->id, - tp->prio & 0xffff, - ntohs(tp->protocol)); - return TC_ACT_SHOT; - } - goto reclassify; +reset: + if (unlikely(limit++ >= MAX_REC_LOOP)) { + net_notice_ratelimited("%s: reclassify loop, rule prio %u, " + "protocol %02x\n", tp->q->ops->id, + tp->prio & 0xffff, ntohs(tp->protocol)); + return TC_ACT_SHOT; } + + tp = old_tp; + goto reclassify; #endif - return err; } EXPORT_SYMBOL(tc_classify); diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index e3e2cc5fd068..1911af3ca7c0 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -375,7 +375,7 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch) list_for_each_entry(flow, &p->flows, list) { fl = rcu_dereference_bh(flow->filter_list); if (fl) { - result = tc_classify_compat(skb, fl, &res); + result = tc_classify(skb, fl, &res, true); if (result < 0) continue; flow = (struct atm_flow_data *)res.class; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index beeb75f80fdb..c538d9e4a8f6 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -240,7 +240,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) /* * Step 2+n. Apply classifier. */ - result = tc_classify_compat(skb, fl, &res); + result = tc_classify(skb, fl, &res, true); if (!fl || result < 0) goto fallback; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 6a783afe4960..665bde07916b 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -201,7 +201,7 @@ static bool choke_classify(struct sk_buff *skb, int result; fl = rcu_dereference_bh(q->filter_list); - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 338706092c27..f26bdea875c1 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -331,7 +331,7 @@ static struct drr_class *drr_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 66700a6116aa..c4d45fd8c551 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -230,7 +230,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch) else { struct tcf_result res; struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); - int result = tc_classify(skb, fl, &res); + int result = tc_classify(skb, fl, &res, false); pr_debug("result %d class 0x%04x\n", result, res.classid); diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index a9ba030435a2..4c834e93dafb 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -92,7 +92,7 @@ static unsigned int fq_codel_classify(struct sk_buff *skb, struct Qdisc *sch, return fq_codel_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, filter, &res); + result = tc_classify(skb, filter, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index e6c7416d0332..b7ebe2c87586 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1165,7 +1165,7 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; head = &q->root; tcf = rcu_dereference_bh(q->root.filter_list); - while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { + while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index cf4b0f865d1b..15ccd7f8fb2a 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -229,7 +229,7 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, } *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { + while (tcf && (result = tc_classify(skb, tcf, &res, false)) >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_QUEUED: diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 42dd218871e0..4e904ca0af9d 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -46,7 +46,7 @@ multiq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - err = tc_classify(skb, fl, &res); + err = tc_classify(skb, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 8e5cd34aaa74..ba6487f2741f 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -42,7 +42,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { fl = rcu_dereference_bh(q->filter_list); - err = tc_classify(skb, fl, &res); + err = tc_classify(skb, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index ffaeea63d473..3dc3a6e56052 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -717,7 +717,7 @@ static struct qfq_class *qfq_classify(struct sk_buff *skb, struct Qdisc *sch, *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; fl = rcu_dereference_bh(q->filter_list); - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index dcdff5c769a1..5bbb6332ec57 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -258,7 +258,7 @@ static bool sfb_classify(struct sk_buff *skb, struct tcf_proto *fl, struct tcf_result res; int result; - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 52f75a5473e1..3abab534eb5c 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -179,7 +179,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, return sfq_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - result = tc_classify(skb, fl, &res); + result = tc_classify(skb, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { -- cgit v1.2.3 From d2d427b3927bd7a0348fc7f323d0e291f79a2779 Mon Sep 17 00:00:00 2001 From: Toshiaki Makita Date: Thu, 27 Aug 2015 15:32:26 +0900 Subject: bridge: Add netlink support for vlan_protocol attribute This enables bridge vlan_protocol to be configured through netlink. When CONFIG_BRIDGE_VLAN_FILTERING is disabled, kernel behaves the same way as this feature is not implemented. Signed-off-by: Toshiaki Makita Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_netlink.c | 34 ++++++++++++++++++++++++++++++++++ net/bridge/br_private.h | 1 + net/bridge/br_vlan.c | 35 +++++++++++++++++++++-------------- 4 files changed, 57 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 313c305fd1ad..2d13dd44ecaa 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -231,6 +231,7 @@ enum { IFLA_BR_STP_STATE, IFLA_BR_PRIORITY, IFLA_BR_VLAN_FILTERING, + IFLA_BR_VLAN_PROTOCOL, __IFLA_BR_MAX, }; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index dbcb1949ea58..af5e187553fd 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -673,6 +673,21 @@ static int br_validate(struct nlattr *tb[], struct nlattr *data[]) return -EADDRNOTAVAIL; } + if (!data) + return 0; + +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + if (data[IFLA_BR_VLAN_PROTOCOL]) { + switch (nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL])) { + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): + break; + default: + return -EPROTONOSUPPORT; + } + } +#endif + return 0; } @@ -729,6 +744,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_STP_STATE] = { .type = NLA_U32 }, [IFLA_BR_PRIORITY] = { .type = NLA_U16 }, [IFLA_BR_VLAN_FILTERING] = { .type = NLA_U8 }, + [IFLA_BR_VLAN_PROTOCOL] = { .type = NLA_U16 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -784,6 +800,16 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], return err; } +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + if (data[IFLA_BR_VLAN_PROTOCOL]) { + __be16 vlan_proto = nla_get_be16(data[IFLA_BR_VLAN_PROTOCOL]); + + err = __br_vlan_set_proto(br, vlan_proto); + if (err) + return err; + } +#endif + return 0; } @@ -796,6 +822,9 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(u32)) + /* IFLA_BR_STP_STATE */ nla_total_size(sizeof(u16)) + /* IFLA_BR_PRIORITY */ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_FILTERING */ +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */ +#endif 0; } @@ -819,6 +848,11 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) nla_put_u8(skb, IFLA_BR_VLAN_FILTERING, vlan_enabled)) return -EMSGSIZE; +#ifdef CONFIG_BRIDGE_VLAN_FILTERING + if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto)) + return -EMSGSIZE; +#endif + return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 3d95647039d0..19e8f79b6b99 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -616,6 +616,7 @@ bool br_vlan_find(struct net_bridge *br, u16 vid); void br_recalculate_fwd_mask(struct net_bridge *br); int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); +int __br_vlan_set_proto(struct net_bridge *br, __be16 proto); int br_vlan_set_proto(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 3cef6892c0bb..3cd8cc9e804b 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -492,23 +492,16 @@ int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val) return 0; } -int br_vlan_set_proto(struct net_bridge *br, unsigned long val) +int __br_vlan_set_proto(struct net_bridge *br, __be16 proto) { int err = 0; struct net_bridge_port *p; struct net_port_vlans *pv; - __be16 proto, oldproto; + __be16 oldproto; u16 vid, errvid; - if (val != ETH_P_8021Q && val != ETH_P_8021AD) - return -EPROTONOSUPPORT; - - if (!rtnl_trylock()) - return restart_syscall(); - - proto = htons(val); if (br->vlan_proto == proto) - goto unlock; + return 0; /* Add VLANs for the new proto to the device filter. */ list_for_each_entry(p, &br->port_list, list) { @@ -539,9 +532,7 @@ int br_vlan_set_proto(struct net_bridge *br, unsigned long val) vlan_vid_del(p->dev, oldproto, vid); } -unlock: - rtnl_unlock(); - return err; + return 0; err_filt: errvid = vid; @@ -557,7 +548,23 @@ err_filt: vlan_vid_del(p->dev, proto, vid); } - goto unlock; + return err; +} + +int br_vlan_set_proto(struct net_bridge *br, unsigned long val) +{ + int err; + + if (val != ETH_P_8021Q && val != ETH_P_8021AD) + return -EPROTONOSUPPORT; + + if (!rtnl_trylock()) + return restart_syscall(); + + err = __br_vlan_set_proto(br, htons(val)); + rtnl_unlock(); + + return err; } static bool vlan_default_pvid(struct net_port_vlans *pv, u16 vid) -- cgit v1.2.3 From c29a70d2cadfea443c027d23481f820530b70057 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Wed, 26 Aug 2015 23:46:50 -0700 Subject: tunnel: introduce udp_tun_rx_dst() Introduce function udp_tun_rx_dst() to initialize tunnel dst on receive path. Signed-off-by: Pravin B Shelar Reviewed-by: Jesse Gross Acked-by: Thomas Graf Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 29 +++------------------- include/net/dst_metadata.h | 61 ++++++++++++++++++++++++++++++++++++++++++++++ include/net/udp_tunnel.h | 4 +++ net/ipv4/ip_gre.c | 21 ++++------------ net/ipv4/udp_tunnel.c | 25 ++++++++++++++++++- 5 files changed, 97 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 61b457b9ec00..5b4cf66e632e 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1264,36 +1264,13 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) } if (vxlan_collect_metadata(vs)) { - tun_dst = metadata_dst_alloc(sizeof(*md), GFP_ATOMIC); + tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY, + cpu_to_be64(vni >> 8), sizeof(*md)); + if (!tun_dst) goto drop; info = &tun_dst->u.tun_info; - if (vxlan_get_sk_family(vs) == AF_INET) { - const struct iphdr *iph = ip_hdr(skb); - - info->key.u.ipv4.src = iph->saddr; - info->key.u.ipv4.dst = iph->daddr; - info->key.tos = iph->tos; - info->key.ttl = iph->ttl; - } else { - const struct ipv6hdr *ip6h = ipv6_hdr(skb); - - info->key.u.ipv6.src = ip6h->saddr; - info->key.u.ipv6.dst = ip6h->daddr; - info->key.tos = ipv6_get_dsfield(ip6h); - info->key.ttl = ip6h->hop_limit; - } - - info->key.tp_src = udp_hdr(skb)->source; - info->key.tp_dst = udp_hdr(skb)->dest; - - info->mode = IP_TUNNEL_INFO_RX; - info->key.tun_flags = TUNNEL_KEY; - info->key.tun_id = cpu_to_be64(vni >> 8); - if (udp_hdr(skb)->check != 0) - info->key.tun_flags |= TUNNEL_CSUM; - md = ip_tunnel_info_opts(info, sizeof(*md)); } else { memset(md, 0, sizeof(*md)); diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 2cb52d562272..60c03326c087 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -48,4 +48,65 @@ static inline bool skb_valid_dst(const struct sk_buff *skb) struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags); struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags); +static inline struct metadata_dst *tun_rx_dst(__be16 flags, + __be64 tunnel_id, int md_size) +{ + struct metadata_dst *tun_dst; + struct ip_tunnel_info *info; + + tun_dst = metadata_dst_alloc(md_size, GFP_ATOMIC); + if (!tun_dst) + return NULL; + + info = &tun_dst->u.tun_info; + info->mode = IP_TUNNEL_INFO_RX; + info->key.tun_flags = flags; + info->key.tun_id = tunnel_id; + info->key.tp_src = 0; + info->key.tp_dst = 0; + return tun_dst; +} + +static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, + __be16 flags, + __be64 tunnel_id, + int md_size) +{ + const struct iphdr *iph = ip_hdr(skb); + struct metadata_dst *tun_dst; + struct ip_tunnel_info *info; + + tun_dst = tun_rx_dst(flags, tunnel_id, md_size); + if (!tun_dst) + return NULL; + + info = &tun_dst->u.tun_info; + info->key.u.ipv4.src = iph->saddr; + info->key.u.ipv4.dst = iph->daddr; + info->key.tos = iph->tos; + info->key.ttl = iph->ttl; + return tun_dst; +} + +static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, + __be16 flags, + __be64 tunnel_id, + int md_size) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct metadata_dst *tun_dst; + struct ip_tunnel_info *info; + + tun_dst = tun_rx_dst(flags, tunnel_id, md_size); + if (!tun_dst) + return NULL; + + info = &tun_dst->u.tun_info; + info->key.u.ipv6.src = ip6h->saddr; + info->key.u.ipv6.dst = ip6h->daddr; + info->key.tos = ipv6_get_dsfield(ip6h); + info->key.ttl = ip6h->hop_limit; + return tun_dst; +} + #endif /* __NET_DST_METADATA_H */ diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index c491c1221606..35041d0fc21e 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -93,6 +93,10 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, void udp_tunnel_sock_release(struct socket *sock); +struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, + __be16 flags, __be64 tunnel_id, + int md_size); + static inline struct sk_buff *udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) { diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 1bf328182697..faf1cde6f8da 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -400,25 +400,14 @@ static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi) if (tunnel) { skb_pop_mac_header(skb); if (tunnel->collect_md) { - struct ip_tunnel_info *info; + __be16 flags; + __be64 tun_id; - tun_dst = metadata_dst_alloc(0, GFP_ATOMIC); + flags = tpi->flags & (TUNNEL_CSUM | TUNNEL_KEY); + tun_id = key_to_tunnel_id(tpi->key); + tun_dst = ip_tun_rx_dst(skb, flags, tun_id, 0); if (!tun_dst) return PACKET_REJECT; - - info = &tun_dst->u.tun_info; - info->key.u.ipv4.src = iph->saddr; - info->key.u.ipv4.dst = iph->daddr; - info->key.tos = iph->tos; - info->key.ttl = iph->ttl; - - info->mode = IP_TUNNEL_INFO_RX; - info->key.tun_flags = tpi->flags & - (TUNNEL_CSUM | TUNNEL_KEY); - info->key.tun_id = key_to_tunnel_id(tpi->key); - - info->key.tp_src = 0; - info->key.tp_dst = 0; } ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 933ea903f7b8..aba428626b52 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -4,9 +4,10 @@ #include #include #include +#include +#include #include #include -#include int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) @@ -103,4 +104,26 @@ void udp_tunnel_sock_release(struct socket *sock) } EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); +struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, + __be16 flags, __be64 tunnel_id, int md_size) +{ + struct metadata_dst *tun_dst; + struct ip_tunnel_info *info; + + if (family == AF_INET) + tun_dst = ip_tun_rx_dst(skb, flags, tunnel_id, md_size); + else + tun_dst = ipv6_tun_rx_dst(skb, flags, tunnel_id, md_size); + if (!tun_dst) + return NULL; + + info = &tun_dst->u.tun_info; + info->key.tp_src = udp_hdr(skb)->source; + info->key.tp_dst = udp_hdr(skb)->dest; + if (udp_hdr(skb)->check) + info->key.tun_flags |= TUNNEL_CSUM; + return tun_dst; +} +EXPORT_SYMBOL_GPL(udp_tun_rx_dst); + MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 6b001e682e90d7edf21f93687f5c3b39d412ad6c Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Wed, 26 Aug 2015 23:46:53 -0700 Subject: openvswitch: Use Geneve device. With help of tunnel metadata mode OVS can directly use Geneve devices to implement Geneve tunnels. This patch removes all of the OVS specific Geneve code and make OVS use a Geneve net_device. Basic geneve vport is still there to handle compatibility with current userspace application. Signed-off-by: Pravin B Shelar Reviewed-by: Jesse Gross Acked-by: Thomas Graf Signed-off-by: David S. Miller --- net/openvswitch/Kconfig | 2 +- net/openvswitch/vport-geneve.c | 179 ++++++++--------------------------------- 2 files changed, 33 insertions(+), 148 deletions(-) (limited to 'net') diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 98f343d0d6dd..af7cdef42066 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -70,7 +70,7 @@ config OPENVSWITCH_VXLAN config OPENVSWITCH_GENEVE tristate "Open vSwitch Geneve tunneling support" depends on OPENVSWITCH - depends on GENEVE_CORE + depends on GENEVE default OPENVSWITCH ---help--- If you say Y here, then the Open vSwitch will be able create geneve vport. diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index d01bd6360970..fa37c95f7339 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -26,95 +26,44 @@ #include "datapath.h" #include "vport.h" +#include "vport-netdev.h" static struct vport_ops ovs_geneve_vport_ops; - /** * struct geneve_port - Keeps track of open UDP ports - * @gs: The socket created for this port number. - * @name: vport name. + * @dst_port: destination port. */ struct geneve_port { - struct geneve_sock *gs; - char name[IFNAMSIZ]; + u16 port_no; }; -static LIST_HEAD(geneve_ports); - static inline struct geneve_port *geneve_vport(const struct vport *vport) { return vport_priv(vport); } -/* Convert 64 bit tunnel ID to 24 bit VNI. */ -static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni) -{ -#ifdef __BIG_ENDIAN - vni[0] = (__force __u8)(tun_id >> 16); - vni[1] = (__force __u8)(tun_id >> 8); - vni[2] = (__force __u8)tun_id; -#else - vni[0] = (__force __u8)((__force u64)tun_id >> 40); - vni[1] = (__force __u8)((__force u64)tun_id >> 48); - vni[2] = (__force __u8)((__force u64)tun_id >> 56); -#endif -} - -/* Convert 24 bit VNI to 64 bit tunnel ID. */ -static __be64 vni_to_tunnel_id(const __u8 *vni) -{ -#ifdef __BIG_ENDIAN - return (vni[0] << 16) | (vni[1] << 8) | vni[2]; -#else - return (__force __be64)(((__force u64)vni[0] << 40) | - ((__force u64)vni[1] << 48) | - ((__force u64)vni[2] << 56)); -#endif -} - -static void geneve_rcv(struct geneve_sock *gs, struct sk_buff *skb) -{ - struct vport *vport = gs->rcv_data; - struct genevehdr *geneveh = geneve_hdr(skb); - int opts_len; - struct ip_tunnel_info tun_info; - __be64 key; - __be16 flags; - - opts_len = geneveh->opt_len * 4; - - flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT | - (udp_hdr(skb)->check != 0 ? TUNNEL_CSUM : 0) | - (geneveh->oam ? TUNNEL_OAM : 0) | - (geneveh->critical ? TUNNEL_CRIT_OPT : 0); - - key = vni_to_tunnel_id(geneveh->vni); - - ip_tunnel_info_init(&tun_info, ip_hdr(skb), - udp_hdr(skb)->source, udp_hdr(skb)->dest, - key, flags, geneveh->options, opts_len); - - ovs_vport_receive(vport, skb, &tun_info); -} - static int geneve_get_options(const struct vport *vport, struct sk_buff *skb) { struct geneve_port *geneve_port = geneve_vport(vport); - struct inet_sock *sk = inet_sk(geneve_port->gs->sock->sk); - if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(sk->inet_sport))) + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, geneve_port->port_no)) return -EMSGSIZE; return 0; } -static void geneve_tnl_destroy(struct vport *vport) +static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, + struct ip_tunnel_info *egress_tun_info) { struct geneve_port *geneve_port = geneve_vport(vport); + struct net *net = ovs_dp_get_net(vport->dp); + __be16 dport = htons(geneve_port->port_no); + __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); - geneve_sock_release(geneve_port->gs); - - ovs_vport_deferred_free(vport); + return ovs_tunnel_get_egress_info(egress_tun_info, + ovs_dp_get_net(vport->dp), + OVS_CB(skb)->egress_tun_info, + IPPROTO_UDP, skb->mark, sport, dport); } static struct vport *geneve_tnl_create(const struct vport_parms *parms) @@ -122,11 +71,11 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) struct net *net = ovs_dp_get_net(parms->dp); struct nlattr *options = parms->options; struct geneve_port *geneve_port; - struct geneve_sock *gs; + struct net_device *dev; struct vport *vport; struct nlattr *a; - int err; u16 dst_port; + int err; if (!options) { err = -EINVAL; @@ -148,104 +97,40 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) return vport; geneve_port = geneve_vport(vport); - strncpy(geneve_port->name, parms->name, IFNAMSIZ); + geneve_port->port_no = dst_port; - gs = geneve_sock_add(net, htons(dst_port), geneve_rcv, vport, true, 0); - if (IS_ERR(gs)) { + rtnl_lock(); + dev = geneve_dev_create_fb(net, parms->name, NET_NAME_USER, dst_port); + if (IS_ERR(dev)) { + rtnl_unlock(); ovs_vport_free(vport); - return (void *)gs; + return ERR_CAST(dev); } - geneve_port->gs = gs; + dev_change_flags(dev, dev->flags | IFF_UP); + rtnl_unlock(); return vport; error: return ERR_PTR(err); } -static int geneve_tnl_send(struct vport *vport, struct sk_buff *skb) +static struct vport *geneve_create(const struct vport_parms *parms) { - const struct ip_tunnel_key *tun_key; - struct ip_tunnel_info *tun_info; - struct net *net = ovs_dp_get_net(vport->dp); - struct geneve_port *geneve_port = geneve_vport(vport); - __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; - __be16 sport; - struct rtable *rt; - struct flowi4 fl; - u8 vni[3], opts_len, *opts; - __be16 df; - int err; - - tun_info = OVS_CB(skb)->egress_tun_info; - if (unlikely(!tun_info)) { - err = -EINVAL; - goto error; - } - - tun_key = &tun_info->key; - rt = ovs_tunnel_route_lookup(net, tun_key, skb->mark, &fl, IPPROTO_UDP); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto error; - } - - df = tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); - tunnel_id_to_vni(tun_key->tun_id, vni); - skb->ignore_df = 1; - - if (tun_key->tun_flags & TUNNEL_GENEVE_OPT) { - opts = (u8 *)tun_info->options; - opts_len = tun_info->options_len; - } else { - opts = NULL; - opts_len = 0; - } - - err = geneve_xmit_skb(geneve_port->gs, rt, skb, fl.saddr, - tun_key->u.ipv4.dst, tun_key->tos, - tun_key->ttl, df, sport, dport, - tun_key->tun_flags, vni, opts_len, opts, - !!(tun_key->tun_flags & TUNNEL_CSUM), false); - if (err < 0) - ip_rt_put(rt); - return err; - -error: - kfree_skb(skb); - return err; -} - -static const char *geneve_get_name(const struct vport *vport) -{ - struct geneve_port *geneve_port = geneve_vport(vport); - - return geneve_port->name; -} + struct vport *vport; -static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ip_tunnel_info *egress_tun_info) -{ - struct geneve_port *geneve_port = geneve_vport(vport); - struct net *net = ovs_dp_get_net(vport->dp); - __be16 dport = inet_sk(geneve_port->gs->sock->sk)->inet_sport; - __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); + vport = geneve_tnl_create(parms); + if (IS_ERR(vport)) + return vport; - /* Get tp_src and tp_dst, refert to geneve_build_header(). - */ - return ovs_tunnel_get_egress_info(egress_tun_info, - ovs_dp_get_net(vport->dp), - OVS_CB(skb)->egress_tun_info, - IPPROTO_UDP, skb->mark, sport, dport); + return ovs_netdev_link(vport, parms->name); } static struct vport_ops ovs_geneve_vport_ops = { .type = OVS_VPORT_TYPE_GENEVE, - .create = geneve_tnl_create, - .destroy = geneve_tnl_destroy, - .get_name = geneve_get_name, + .create = geneve_create, + .destroy = ovs_netdev_tunnel_destroy, .get_options = geneve_get_options, - .send = geneve_tnl_send, + .send = ovs_netdev_send, .owner = THIS_MODULE, .get_egress_tun_info = geneve_get_egress_tun_info, }; -- cgit v1.2.3 From 371bd1061d29562e6423435073623add8c475ee2 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Wed, 26 Aug 2015 23:46:54 -0700 Subject: geneve: Consolidate Geneve functionality in single module. geneve_core module handles send and receive functionality. This way OVS could use the Geneve API. Now with use of tunnel meatadata mode OVS can directly use Geneve netdevice. So there is no need for separate module for Geneve. Following patch consolidates Geneve protocol processing in single module. Signed-off-by: Pravin B Shelar Reviewed-by: Jesse Gross Acked-by: John W. Linville Signed-off-by: David S. Miller --- drivers/net/Kconfig | 4 +- drivers/net/geneve.c | 507 ++++++++++++++++++++++++++++++++++++++++--------- include/net/geneve.h | 34 ---- net/ipv4/Kconfig | 14 -- net/ipv4/Makefile | 1 - net/ipv4/geneve_core.c | 447 ------------------------------------------- 6 files changed, 421 insertions(+), 586 deletions(-) delete mode 100644 net/ipv4/geneve_core.c (limited to 'net') diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 770483b31d62..d18eb607bee6 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -180,8 +180,8 @@ config VXLAN will be called vxlan. config GENEVE - tristate "Generic Network Virtualization Encapsulation netdev" - depends on INET && GENEVE_CORE + tristate "Generic Network Virtualization Encapsulation" + depends on INET && NET_UDP_TUNNEL select NET_IP_TUNNEL ---help--- This allows one to create geneve virtual interfaces that provide diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index d05150cc25d4..90d4d433f1c9 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -18,6 +18,7 @@ #include #include #include +#include #define GENEVE_NETDEV_VER "0.6" @@ -33,13 +34,18 @@ static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); +#define GENEVE_VER 0 +#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr)) + /* per-network namespace private data for this module */ struct geneve_net { - struct list_head geneve_list; - struct hlist_head vni_list[VNI_HASH_SIZE]; - struct geneve_dev __rcu *collect_md_tun; + struct list_head geneve_list; + struct hlist_head vni_list[VNI_HASH_SIZE]; + struct list_head sock_list; }; +static int geneve_net_id; + /* Pseudo network device */ struct geneve_dev { struct hlist_node hlist; /* vni hash table */ @@ -55,7 +61,15 @@ struct geneve_dev { bool collect_md; }; -static int geneve_net_id; +struct geneve_sock { + bool collect_md; + struct geneve_net *gn; + struct list_head list; + struct socket *sock; + struct rcu_head rcu; + int refcnt; + struct udp_offload udp_offloads; +}; static inline __u32 geneve_net_vni_hash(u8 vni[3]) { @@ -76,51 +90,62 @@ static __be64 vni_to_tunnel_id(const __u8 *vni) #endif } -static struct geneve_dev *geneve_lookup(struct geneve_net *gn, - struct geneve_sock *gs, - struct iphdr *iph, - struct genevehdr *gnvh) +static struct geneve_dev *geneve_lookup(struct geneve_net *gn, __be16 port, + __be32 addr, u8 vni[]) { - struct inet_sock *sk = inet_sk(gs->sock->sk); struct hlist_head *vni_list_head; struct geneve_dev *geneve; __u32 hash; - geneve = rcu_dereference(gn->collect_md_tun); - if (geneve) - return geneve; - /* Find the device for this VNI */ - hash = geneve_net_vni_hash(gnvh->vni); + hash = geneve_net_vni_hash(vni); vni_list_head = &gn->vni_list[hash]; hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) { - if (!memcmp(gnvh->vni, geneve->vni, sizeof(geneve->vni)) && - iph->saddr == geneve->remote.sin_addr.s_addr && - sk->inet_sport == geneve->dst_port) { + if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) && + addr == geneve->remote.sin_addr.s_addr && + port == geneve->dst_port) { return geneve; } } return NULL; } +static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) +{ + return (struct genevehdr *)(udp_hdr(skb) + 1); +} + /* geneve receive/decap routine */ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) { + struct inet_sock *sk = inet_sk(gs->sock->sk); struct genevehdr *gnvh = geneve_hdr(skb); + struct geneve_net *gn = gs->gn; struct metadata_dst *tun_dst = NULL; struct geneve_dev *geneve = NULL; struct pcpu_sw_netstats *stats; - struct geneve_net *gn; struct iphdr *iph; + u8 *vni; + __be32 addr; int err; iph = ip_hdr(skb); /* Still outer IP header... */ - gn = gs->rcv_data; - geneve = geneve_lookup(gn, gs, iph, gnvh); + + if (gs->collect_md) { + static u8 zero_vni[3]; + + vni = zero_vni; + addr = 0; + } else { + vni = gnvh->vni; + addr = iph->saddr; + } + + geneve = geneve_lookup(gn, sk->inet_sport, addr, vni); if (!geneve) goto drop; - if (ip_tunnel_collect_metadata() || geneve->collect_md) { + if (ip_tunnel_collect_metadata() || gs->collect_md) { __be16 flags; void *opts; @@ -201,31 +226,326 @@ static void geneve_uninit(struct net_device *dev) free_percpu(dev->tstats); } +/* Callback from net/ipv4/udp.c to receive packets */ +static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) +{ + struct genevehdr *geneveh; + struct geneve_sock *gs; + int opts_len; + + /* Need Geneve and inner Ethernet header to be present */ + if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) + goto error; + + /* Return packets with reserved bits set */ + geneveh = geneve_hdr(skb); + if (unlikely(geneveh->ver != GENEVE_VER)) + goto error; + + if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) + goto error; + + opts_len = geneveh->opt_len * 4; + if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, + htons(ETH_P_TEB))) + goto drop; + + gs = rcu_dereference_sk_user_data(sk); + if (!gs) + goto drop; + + geneve_rx(gs, skb); + return 0; + +drop: + /* Consume bad packet */ + kfree_skb(skb); + return 0; + +error: + /* Let the UDP layer deal with the skb */ + return 1; +} + +static struct socket *geneve_create_sock(struct net *net, bool ipv6, + __be16 port) +{ + struct socket *sock; + struct udp_port_cfg udp_conf; + int err; + + memset(&udp_conf, 0, sizeof(udp_conf)); + + if (ipv6) { + udp_conf.family = AF_INET6; + } else { + udp_conf.family = AF_INET; + udp_conf.local_ip.s_addr = htonl(INADDR_ANY); + } + + udp_conf.local_udp_port = port; + + /* Open UDP socket */ + err = udp_sock_create(net, &udp_conf, &sock); + if (err < 0) + return ERR_PTR(err); + + return sock; +} + +static void geneve_notify_add_rx_port(struct geneve_sock *gs) +{ + struct sock *sk = gs->sock->sk; + sa_family_t sa_family = sk->sk_family; + int err; + + if (sa_family == AF_INET) { + err = udp_add_offload(&gs->udp_offloads); + if (err) + pr_warn("geneve: udp_add_offload failed with status %d\n", + err); + } +} + +static int geneve_hlen(struct genevehdr *gh) +{ + return sizeof(*gh) + gh->opt_len * 4; +} + +static struct sk_buff **geneve_gro_receive(struct sk_buff **head, + struct sk_buff *skb, + struct udp_offload *uoff) +{ + struct sk_buff *p, **pp = NULL; + struct genevehdr *gh, *gh2; + unsigned int hlen, gh_len, off_gnv; + const struct packet_offload *ptype; + __be16 type; + int flush = 1; + + off_gnv = skb_gro_offset(skb); + hlen = off_gnv + sizeof(*gh); + gh = skb_gro_header_fast(skb, off_gnv); + if (skb_gro_header_hard(skb, hlen)) { + gh = skb_gro_header_slow(skb, hlen, off_gnv); + if (unlikely(!gh)) + goto out; + } + + if (gh->ver != GENEVE_VER || gh->oam) + goto out; + gh_len = geneve_hlen(gh); + + hlen = off_gnv + gh_len; + if (skb_gro_header_hard(skb, hlen)) { + gh = skb_gro_header_slow(skb, hlen, off_gnv); + if (unlikely(!gh)) + goto out; + } + + flush = 0; + + for (p = *head; p; p = p->next) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + gh2 = (struct genevehdr *)(p->data + off_gnv); + if (gh->opt_len != gh2->opt_len || + memcmp(gh, gh2, gh_len)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + } + + type = gh->proto_type; + + rcu_read_lock(); + ptype = gro_find_receive_by_type(type); + if (!ptype) { + flush = 1; + goto out_unlock; + } + + skb_gro_pull(skb, gh_len); + skb_gro_postpull_rcsum(skb, gh, gh_len); + pp = ptype->callbacks.gro_receive(head, skb); + +out_unlock: + rcu_read_unlock(); +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} + +static int geneve_gro_complete(struct sk_buff *skb, int nhoff, + struct udp_offload *uoff) +{ + struct genevehdr *gh; + struct packet_offload *ptype; + __be16 type; + int gh_len; + int err = -ENOSYS; + + udp_tunnel_gro_complete(skb, nhoff); + + gh = (struct genevehdr *)(skb->data + nhoff); + gh_len = geneve_hlen(gh); + type = gh->proto_type; + + rcu_read_lock(); + ptype = gro_find_complete_by_type(type); + if (ptype) + err = ptype->callbacks.gro_complete(skb, nhoff + gh_len); + + rcu_read_unlock(); + return err; +} + +/* Create new listen socket if needed */ +static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, + bool ipv6) +{ + struct geneve_net *gn = net_generic(net, geneve_net_id); + struct geneve_sock *gs; + struct socket *sock; + struct udp_tunnel_sock_cfg tunnel_cfg; + + gs = kzalloc(sizeof(*gs), GFP_KERNEL); + if (!gs) + return ERR_PTR(-ENOMEM); + + sock = geneve_create_sock(net, ipv6, port); + if (IS_ERR(sock)) { + kfree(gs); + return ERR_CAST(sock); + } + + gs->sock = sock; + gs->refcnt = 1; + gs->gn = gn; + + /* Initialize the geneve udp offloads structure */ + gs->udp_offloads.port = port; + gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive; + gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete; + geneve_notify_add_rx_port(gs); + + /* Mark socket as an encapsulation socket */ + tunnel_cfg.sk_user_data = gs; + tunnel_cfg.encap_type = 1; + tunnel_cfg.encap_rcv = geneve_udp_encap_recv; + tunnel_cfg.encap_destroy = NULL; + setup_udp_tunnel_sock(net, sock, &tunnel_cfg); + + list_add(&gs->list, &gn->sock_list); + return gs; +} + +static void geneve_notify_del_rx_port(struct geneve_sock *gs) +{ + struct sock *sk = gs->sock->sk; + sa_family_t sa_family = sk->sk_family; + + if (sa_family == AF_INET) + udp_del_offload(&gs->udp_offloads); +} + +static void geneve_sock_release(struct geneve_sock *gs) +{ + if (--gs->refcnt) + return; + + list_del(&gs->list); + geneve_notify_del_rx_port(gs); + udp_tunnel_sock_release(gs->sock); + kfree_rcu(gs, rcu); +} + +static struct geneve_sock *geneve_find_sock(struct geneve_net *gn, + __be16 dst_port) +{ + struct geneve_sock *gs; + + list_for_each_entry(gs, &gn->sock_list, list) { + if (inet_sk(gs->sock->sk)->inet_sport == dst_port && + inet_sk(gs->sock->sk)->sk.sk_family == AF_INET) { + return gs; + } + } + return NULL; +} + static int geneve_open(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); struct net *net = geneve->net; - struct geneve_net *gn = net_generic(geneve->net, geneve_net_id); + struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_sock *gs; - gs = geneve_sock_add(net, geneve->dst_port, geneve_rx, gn, - false, false); + gs = geneve_find_sock(gn, geneve->dst_port); + if (gs) { + gs->refcnt++; + goto out; + } + + gs = geneve_socket_create(net, geneve->dst_port, false); if (IS_ERR(gs)) return PTR_ERR(gs); +out: + gs->collect_md = geneve->collect_md; geneve->sock = gs; - return 0; } static int geneve_stop(struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); - struct geneve_sock *gs = geneve->sock; - geneve_sock_release(gs); + geneve_sock_release(geneve->sock); + return 0; +} + +static int geneve_build_skb(struct rtable *rt, struct sk_buff *skb, + __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, + bool csum) +{ + struct genevehdr *gnvh; + int min_headroom; + int err; + min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len + + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr); + err = skb_cow_head(skb, min_headroom); + if (unlikely(err)) { + kfree_skb(skb); + goto free_rt; + } + + skb = udp_tunnel_handle_offloads(skb, csum); + if (IS_ERR(skb)) { + err = PTR_ERR(skb); + goto free_rt; + } + + gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); + gnvh->ver = GENEVE_VER; + gnvh->opt_len = opt_len / 4; + gnvh->oam = !!(tun_flags & TUNNEL_OAM); + gnvh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); + gnvh->rsvd1 = 0; + memcpy(gnvh->vni, vni, 3); + gnvh->proto_type = htons(ETH_P_TEB); + gnvh->rsvd2 = 0; + memcpy(gnvh->options, opt, opt_len); + + skb_set_inner_protocol(skb, htons(ETH_P_TEB)); return 0; + +free_rt: + ip_rt_put(rt); + return err; } static struct rtable *geneve_get_rt(struct sk_buff *skb, @@ -269,7 +589,6 @@ static struct rtable *geneve_get_rt(struct sk_buff *skb, ip_rt_put(rt); return ERR_PTR(-EINVAL); } - return rt; } @@ -293,15 +612,13 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) struct geneve_sock *gs = geneve->sock; struct ip_tunnel_info *info = NULL; struct rtable *rt = NULL; - const struct iphdr *iip; /* interior IP header */ struct flowi4 fl4; __u8 tos, ttl; __be16 sport; - bool xnet; + bool udp_csum; + __be16 df; int err; - sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); - if (geneve->collect_md) { info = skb_tunnel_info(skb); if (unlikely(info && info->mode != IP_TUNNEL_INFO_TX)) { @@ -316,52 +633,57 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) dev->stats.tx_carrier_errors++; goto tx_error; } + + sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); skb_reset_mac_header(skb); - xnet = !net_eq(geneve->net, dev_net(geneve->dev)); if (info) { const struct ip_tunnel_key *key = &info->key; - bool udp_csum; u8 *opts = NULL; u8 vni[3]; - __be16 df; tunnel_id_to_vni(key->tun_id, vni); - df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; - udp_csum = !!(key->tun_flags & TUNNEL_CSUM); - if (key->tun_flags & TUNNEL_GENEVE_OPT) opts = ip_tunnel_info_opts(info, info->options_len); - err = geneve_xmit_skb(gs, rt, skb, fl4.saddr, fl4.daddr, - key->tos, key->ttl, df, - sport, geneve->dst_port, - key->tun_flags, vni, - info->options_len, opts, udp_csum, xnet); + udp_csum = !!(key->tun_flags & TUNNEL_CSUM); + err = geneve_build_skb(rt, skb, key->tun_flags, vni, + info->options_len, opts, udp_csum); + if (unlikely(err)) + goto err; + + tos = key->tos; + ttl = key->ttl; + df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; } else { + const struct iphdr *iip; /* interior IP header */ + + udp_csum = false; + err = geneve_build_skb(rt, skb, 0, geneve->vni, + 0, NULL, udp_csum); + if (unlikely(err)) + goto err; + iip = ip_hdr(skb); tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, iip, skb); - ttl = geneve->ttl; if (!ttl && IN_MULTICAST(ntohl(fl4.daddr))) ttl = 1; - ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); - - /* no need to handle local destination and encap bypass...yet... */ - err = geneve_xmit_skb(gs, rt, skb, fl4.saddr, fl4.daddr, tos, - ttl, 0, sport, geneve->dst_port, 0, - geneve->vni, 0, NULL, false, xnet); + df = 0; } - if (err < 0) - ip_rt_put(rt); + err = udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, fl4.saddr, fl4.daddr, + tos, ttl, df, sport, geneve->dst_port, + !net_eq(geneve->net, dev_net(geneve->dev)), + !udp_csum); iptunnel_xmit_stats(err, &dev->stats, dev->tstats); return NETDEV_TX_OK; tx_error: - dev->stats.tx_errors++; dev_kfree_skb(skb); +err: + dev->stats.tx_errors++; return NETDEV_TX_OK; } @@ -454,25 +776,44 @@ static int geneve_validate(struct nlattr *tb[], struct nlattr *data[]) return 0; } +static struct geneve_dev *geneve_find_dev(struct geneve_net *gn, + __be16 dst_port, + __be32 rem_addr, + u8 vni[], + bool *tun_on_same_port, + bool *tun_collect_md) +{ + struct geneve_dev *geneve, *t; + + *tun_on_same_port = false; + *tun_collect_md = false; + t = NULL; + list_for_each_entry(geneve, &gn->geneve_list, next) { + if (geneve->dst_port == dst_port) { + *tun_collect_md = geneve->collect_md; + *tun_on_same_port = true; + } + if (!memcmp(vni, geneve->vni, sizeof(geneve->vni)) && + rem_addr == geneve->remote.sin_addr.s_addr && + dst_port == geneve->dst_port) + t = geneve; + } + return t; +} + static int geneve_configure(struct net *net, struct net_device *dev, __be32 rem_addr, __u32 vni, __u8 ttl, __u8 tos, __u16 dst_port, bool metadata) { struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_dev *dummy, *geneve = netdev_priv(dev); - struct hlist_head *vni_list_head; - struct sockaddr_in remote; /* IPv4 address for link partner */ + struct geneve_dev *t, *geneve = netdev_priv(dev); + bool tun_collect_md, tun_on_same_port; __u32 hash; int err; if (metadata) { - if (rtnl_dereference(gn->collect_md_tun)) - return -EEXIST; - if (!list_empty(&gn->geneve_list)) - return -EPERM; - } else { - if (rtnl_dereference(gn->collect_md_tun)) - return -EPERM; + if (rem_addr || vni || tos || ttl) + return -EINVAL; } geneve->net = net; @@ -486,36 +827,31 @@ static int geneve_configure(struct net *net, struct net_device *dev, if (IN_MULTICAST(ntohl(geneve->remote.sin_addr.s_addr))) return -EINVAL; - remote = geneve->remote; - if (metadata) { - if (rem_addr || vni || tos || ttl) - return -EINVAL; - } - - hash = geneve_net_vni_hash(geneve->vni); - vni_list_head = &gn->vni_list[hash]; - hlist_for_each_entry_rcu(dummy, vni_list_head, hlist) { - if (!memcmp(geneve->vni, dummy->vni, sizeof(dummy->vni)) && - !memcmp(&remote, &dummy->remote, sizeof(dummy->remote)) && - htons(dst_port) == dummy->dst_port) { - return -EBUSY; - } - } - geneve->ttl = ttl; geneve->tos = tos; geneve->dst_port = htons(dst_port); geneve->collect_md = metadata; + t = geneve_find_dev(gn, htons(dst_port), rem_addr, geneve->vni, + &tun_on_same_port, &tun_collect_md); + if (t) + return -EBUSY; + + if (metadata) { + if (tun_on_same_port) + return -EPERM; + } else { + if (tun_collect_md) + return -EPERM; + } + err = register_netdevice(dev); if (err) return err; list_add(&geneve->next, &gn->geneve_list); + hash = geneve_net_vni_hash(geneve->vni); hlist_add_head_rcu(&geneve->hlist, &gn->vni_list[hash]); - - if (geneve->collect_md) - rcu_assign_pointer(gn->collect_md_tun, geneve); return 0; } @@ -554,12 +890,6 @@ static void geneve_dellink(struct net_device *dev, struct list_head *head) { struct geneve_dev *geneve = netdev_priv(dev); - if (geneve->collect_md) { - struct geneve_net *gn = net_generic(geneve->net, geneve_net_id); - - rcu_assign_pointer(gn->collect_md_tun, NULL); - } - if (!hlist_unhashed(&geneve->hlist)) hlist_del_rcu(&geneve->hlist); @@ -651,6 +981,7 @@ static __net_init int geneve_init_net(struct net *net) INIT_LIST_HEAD(&gn->geneve_list); + INIT_LIST_HEAD(&gn->sock_list); for (h = 0; h < VNI_HASH_SIZE; ++h) INIT_HLIST_HEAD(&gn->vni_list[h]); diff --git a/include/net/geneve.h b/include/net/geneve.h index 4245e1d23b9b..3106ed6eae0d 100644 --- a/include/net/geneve.h +++ b/include/net/geneve.h @@ -62,41 +62,7 @@ struct genevehdr { struct geneve_opt options[]; }; -static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) -{ - return (struct genevehdr *)(udp_hdr(skb) + 1); -} - #ifdef CONFIG_INET -struct geneve_sock; - -typedef void (geneve_rcv_t)(struct geneve_sock *gs, struct sk_buff *skb); - -struct geneve_sock { - struct list_head list; - geneve_rcv_t *rcv; - void *rcv_data; - struct socket *sock; - struct rcu_head rcu; - int refcnt; - struct udp_offload udp_offloads; -}; - -#define GENEVE_VER 0 -#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr)) - -struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, - bool no_share, bool ipv6); - -void geneve_sock_release(struct geneve_sock *vs); - -int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, - struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, - __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, - bool csum, bool xnet); - struct net_device *geneve_dev_create_fb(struct net *net, const char *name, u8 name_assign_type, u16 dst_port); #endif /*ifdef CONFIG_INET */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 6fb3c90ad726..416dfa004cfb 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -331,20 +331,6 @@ config NET_FOU_IP_TUNNELS When this option is enabled IP tunnels can be configured to use FOU or GUE encapsulation. -config GENEVE_CORE - tristate "Generic Network Virtualization Encapsulation library" - depends on INET - select NET_UDP_TUNNEL - ---help--- - This allows one to create Geneve virtual interfaces that provide - Layer 2 Networks over Layer 3 Networks. Geneve is often used - to tunnel virtual network infrastructure in virtualized environments. - For more information see: - http://tools.ietf.org/html/draft-gross-geneve-01 - - To compile this driver as a module, choose M here: the module - - config INET_AH tristate "IP: AH transformation" select XFRM_ALGO diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index efc43f300b8c..89aacb630a53 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -57,7 +57,6 @@ obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o -obj-$(CONFIG_GENEVE_CORE) += geneve_core.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o xfrm4_protocol.o diff --git a/net/ipv4/geneve_core.c b/net/ipv4/geneve_core.c deleted file mode 100644 index 311a4ba6950a..000000000000 --- a/net/ipv4/geneve_core.c +++ /dev/null @@ -1,447 +0,0 @@ -/* - * Geneve: Generic Network Virtualization Encapsulation - * - * Copyright (c) 2014 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if IS_ENABLED(CONFIG_IPV6) -#include -#include -#include -#include -#endif - -/* Protects sock_list and refcounts. */ -static DEFINE_MUTEX(geneve_mutex); - -/* per-network namespace private data for this module */ -struct geneve_net { - struct list_head sock_list; -}; - -static int geneve_net_id; - -static struct geneve_sock *geneve_find_sock(struct net *net, - sa_family_t family, __be16 port) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_sock *gs; - - list_for_each_entry(gs, &gn->sock_list, list) { - if (inet_sk(gs->sock->sk)->inet_sport == port && - inet_sk(gs->sock->sk)->sk.sk_family == family) - return gs; - } - - return NULL; -} - -static void geneve_build_header(struct genevehdr *geneveh, - __be16 tun_flags, u8 vni[3], - u8 options_len, u8 *options) -{ - geneveh->ver = GENEVE_VER; - geneveh->opt_len = options_len / 4; - geneveh->oam = !!(tun_flags & TUNNEL_OAM); - geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); - geneveh->rsvd1 = 0; - memcpy(geneveh->vni, vni, 3); - geneveh->proto_type = htons(ETH_P_TEB); - geneveh->rsvd2 = 0; - - memcpy(geneveh->options, options, options_len); -} - -/* Transmit a fully formatted Geneve frame. - * - * When calling this function. The skb->data should point - * to the geneve header which is fully formed. - * - * This function will add other UDP tunnel headers. - */ -int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, - struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, - __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, - bool csum, bool xnet) -{ - struct genevehdr *gnvh; - int min_headroom; - int err; - - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) - + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - - err = skb_cow_head(skb, min_headroom); - if (unlikely(err)) { - kfree_skb(skb); - return err; - } - - skb = vlan_hwaccel_push_inside(skb); - if (unlikely(!skb)) - return -ENOMEM; - - skb = udp_tunnel_handle_offloads(skb, csum); - if (IS_ERR(skb)) - return PTR_ERR(skb); - - gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); - geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); - - skb_set_inner_protocol(skb, htons(ETH_P_TEB)); - - return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst, - tos, ttl, df, src_port, dst_port, xnet, - !csum); -} -EXPORT_SYMBOL_GPL(geneve_xmit_skb); - -static int geneve_hlen(struct genevehdr *gh) -{ - return sizeof(*gh) + gh->opt_len * 4; -} - -static struct sk_buff **geneve_gro_receive(struct sk_buff **head, - struct sk_buff *skb, - struct udp_offload *uoff) -{ - struct sk_buff *p, **pp = NULL; - struct genevehdr *gh, *gh2; - unsigned int hlen, gh_len, off_gnv; - const struct packet_offload *ptype; - __be16 type; - int flush = 1; - - off_gnv = skb_gro_offset(skb); - hlen = off_gnv + sizeof(*gh); - gh = skb_gro_header_fast(skb, off_gnv); - if (skb_gro_header_hard(skb, hlen)) { - gh = skb_gro_header_slow(skb, hlen, off_gnv); - if (unlikely(!gh)) - goto out; - } - - if (gh->ver != GENEVE_VER || gh->oam) - goto out; - gh_len = geneve_hlen(gh); - - hlen = off_gnv + gh_len; - if (skb_gro_header_hard(skb, hlen)) { - gh = skb_gro_header_slow(skb, hlen, off_gnv); - if (unlikely(!gh)) - goto out; - } - - flush = 0; - - for (p = *head; p; p = p->next) { - if (!NAPI_GRO_CB(p)->same_flow) - continue; - - gh2 = (struct genevehdr *)(p->data + off_gnv); - if (gh->opt_len != gh2->opt_len || - memcmp(gh, gh2, gh_len)) { - NAPI_GRO_CB(p)->same_flow = 0; - continue; - } - } - - type = gh->proto_type; - - rcu_read_lock(); - ptype = gro_find_receive_by_type(type); - if (!ptype) { - flush = 1; - goto out_unlock; - } - - skb_gro_pull(skb, gh_len); - skb_gro_postpull_rcsum(skb, gh, gh_len); - pp = ptype->callbacks.gro_receive(head, skb); - -out_unlock: - rcu_read_unlock(); -out: - NAPI_GRO_CB(skb)->flush |= flush; - - return pp; -} - -static int geneve_gro_complete(struct sk_buff *skb, int nhoff, - struct udp_offload *uoff) -{ - struct genevehdr *gh; - struct packet_offload *ptype; - __be16 type; - int gh_len; - int err = -ENOSYS; - - udp_tunnel_gro_complete(skb, nhoff); - - gh = (struct genevehdr *)(skb->data + nhoff); - gh_len = geneve_hlen(gh); - type = gh->proto_type; - - rcu_read_lock(); - ptype = gro_find_complete_by_type(type); - if (ptype) - err = ptype->callbacks.gro_complete(skb, nhoff + gh_len); - - rcu_read_unlock(); - return err; -} - -static void geneve_notify_add_rx_port(struct geneve_sock *gs) -{ - struct sock *sk = gs->sock->sk; - sa_family_t sa_family = sk->sk_family; - int err; - - if (sa_family == AF_INET) { - err = udp_add_offload(&gs->udp_offloads); - if (err) - pr_warn("geneve: udp_add_offload failed with status %d\n", - err); - } -} - -static void geneve_notify_del_rx_port(struct geneve_sock *gs) -{ - struct sock *sk = gs->sock->sk; - sa_family_t sa_family = sk->sk_family; - - if (sa_family == AF_INET) - udp_del_offload(&gs->udp_offloads); -} - -/* Callback from net/ipv4/udp.c to receive packets */ -static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) -{ - struct genevehdr *geneveh; - struct geneve_sock *gs; - int opts_len; - - /* Need Geneve and inner Ethernet header to be present */ - if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) - goto error; - - /* Return packets with reserved bits set */ - geneveh = geneve_hdr(skb); - - if (unlikely(geneveh->ver != GENEVE_VER)) - goto error; - - if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) - goto error; - - opts_len = geneveh->opt_len * 4; - if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, - htons(ETH_P_TEB))) - goto drop; - - gs = rcu_dereference_sk_user_data(sk); - if (!gs) - goto drop; - - gs->rcv(gs, skb); - return 0; - -drop: - /* Consume bad packet */ - kfree_skb(skb); - return 0; - -error: - /* Let the UDP layer deal with the skb */ - return 1; -} - -static struct socket *geneve_create_sock(struct net *net, bool ipv6, - __be16 port) -{ - struct socket *sock; - struct udp_port_cfg udp_conf; - int err; - - memset(&udp_conf, 0, sizeof(udp_conf)); - - if (ipv6) { - udp_conf.family = AF_INET6; - } else { - udp_conf.family = AF_INET; - udp_conf.local_ip.s_addr = htonl(INADDR_ANY); - } - - udp_conf.local_udp_port = port; - - /* Open UDP socket */ - err = udp_sock_create(net, &udp_conf, &sock); - if (err < 0) - return ERR_PTR(err); - - return sock; -} - -/* Create new listen socket if needed */ -static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, - bool ipv6) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - struct geneve_sock *gs; - struct socket *sock; - struct udp_tunnel_sock_cfg tunnel_cfg; - - gs = kzalloc(sizeof(*gs), GFP_KERNEL); - if (!gs) - return ERR_PTR(-ENOMEM); - - sock = geneve_create_sock(net, ipv6, port); - if (IS_ERR(sock)) { - kfree(gs); - return ERR_CAST(sock); - } - - gs->sock = sock; - gs->refcnt = 1; - gs->rcv = rcv; - gs->rcv_data = data; - - /* Initialize the geneve udp offloads structure */ - gs->udp_offloads.port = port; - gs->udp_offloads.callbacks.gro_receive = geneve_gro_receive; - gs->udp_offloads.callbacks.gro_complete = geneve_gro_complete; - geneve_notify_add_rx_port(gs); - - /* Mark socket as an encapsulation socket */ - tunnel_cfg.sk_user_data = gs; - tunnel_cfg.encap_type = 1; - tunnel_cfg.encap_rcv = geneve_udp_encap_recv; - tunnel_cfg.encap_destroy = NULL; - setup_udp_tunnel_sock(net, sock, &tunnel_cfg); - - list_add(&gs->list, &gn->sock_list); - - return gs; -} - -struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, - geneve_rcv_t *rcv, void *data, - bool no_share, bool ipv6) -{ - struct geneve_sock *gs; - - mutex_lock(&geneve_mutex); - - gs = geneve_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port); - if (gs) { - if (!no_share && gs->rcv == rcv) - gs->refcnt++; - else - gs = ERR_PTR(-EBUSY); - } else { - gs = geneve_socket_create(net, port, rcv, data, ipv6); - } - - mutex_unlock(&geneve_mutex); - - return gs; -} -EXPORT_SYMBOL_GPL(geneve_sock_add); - -void geneve_sock_release(struct geneve_sock *gs) -{ - mutex_lock(&geneve_mutex); - - if (--gs->refcnt) - goto unlock; - - list_del(&gs->list); - geneve_notify_del_rx_port(gs); - udp_tunnel_sock_release(gs->sock); - kfree_rcu(gs, rcu); - -unlock: - mutex_unlock(&geneve_mutex); -} -EXPORT_SYMBOL_GPL(geneve_sock_release); - -static __net_init int geneve_init_net(struct net *net) -{ - struct geneve_net *gn = net_generic(net, geneve_net_id); - - INIT_LIST_HEAD(&gn->sock_list); - - return 0; -} - -static struct pernet_operations geneve_net_ops = { - .init = geneve_init_net, - .id = &geneve_net_id, - .size = sizeof(struct geneve_net), -}; - -static int __init geneve_init_module(void) -{ - int rc; - - rc = register_pernet_subsys(&geneve_net_ops); - if (rc) - return rc; - - pr_info("Geneve core logic\n"); - - return 0; -} -module_init(geneve_init_module); - -static void __exit geneve_cleanup_module(void) -{ - unregister_pernet_subsys(&geneve_net_ops); -} -module_exit(geneve_cleanup_module); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Jesse Gross "); -MODULE_DESCRIPTION("Driver library for GENEVE encapsulated traffic"); -- cgit v1.2.3 From 0e4ead9d7b3655d76371604abb9b0dcc4e79bb7d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 27 Aug 2015 09:31:18 +0200 Subject: net: introduce change upper device notifier change info Add info that is passed along with NETDEV_CHANGEUPPER event. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 7 +++++++ net/core/dev.c | 16 ++++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6abe0d6f1e1d..39f30daac483 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2127,6 +2127,13 @@ struct netdev_notifier_change_info { unsigned int flags_changed; }; +struct netdev_notifier_changeupper_info { + struct netdev_notifier_info info; /* must be first */ + struct net_device *upper_dev; /* new upper dev */ + bool master; /* is upper dev master */ + bool linking; /* is the nofication for link or unlink */ +}; + static inline void netdev_notifier_info_init(struct netdev_notifier_info *info, struct net_device *dev) { diff --git a/net/core/dev.c b/net/core/dev.c index 7bb24f1879b8..a8e6cf4298d3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5311,6 +5311,7 @@ static int __netdev_upper_dev_link(struct net_device *dev, struct net_device *upper_dev, bool master, void *private) { + struct netdev_notifier_changeupper_info changeupper_info; struct netdev_adjacent *i, *j, *to_i, *to_j; int ret = 0; @@ -5329,6 +5330,10 @@ static int __netdev_upper_dev_link(struct net_device *dev, if (master && netdev_master_upper_dev_get(dev)) return -EBUSY; + changeupper_info.upper_dev = upper_dev; + changeupper_info.master = master; + changeupper_info.linking = true; + ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private, master); if (ret) @@ -5367,7 +5372,8 @@ static int __netdev_upper_dev_link(struct net_device *dev, goto rollback_lower_mesh; } - call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + &changeupper_info.info); return 0; rollback_lower_mesh: @@ -5462,9 +5468,14 @@ EXPORT_SYMBOL(netdev_master_upper_dev_link_private); void netdev_upper_dev_unlink(struct net_device *dev, struct net_device *upper_dev) { + struct netdev_notifier_changeupper_info changeupper_info; struct netdev_adjacent *i, *j; ASSERT_RTNL(); + changeupper_info.upper_dev = upper_dev; + changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev; + changeupper_info.linking = false; + __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev); /* Here is the tricky part. We must remove all dev's lower @@ -5484,7 +5495,8 @@ void netdev_upper_dev_unlink(struct net_device *dev, list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) __netdev_adjacent_dev_unlink(dev, i->dev); - call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev); + call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev, + &changeupper_info.info); } EXPORT_SYMBOL(netdev_upper_dev_unlink); -- cgit v1.2.3 From 35d4e1725202e6656fcfa8b88447327ad3ae0c0c Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Thu, 27 Aug 2015 09:31:20 +0200 Subject: net: add netif_is_ovs_master helper with IFF_OPENVSWITCH private flag Add this helper so code can easily figure out if netdev is openswitch. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 ++++++++ net/openvswitch/vport-internal_dev.c | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index be625f4754b9..6656a43c072d 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1264,6 +1264,7 @@ struct net_device_ops { * @IFF_MACVLAN: Macvlan device * @IFF_VRF_MASTER: device is a VRF master * @IFF_NO_QUEUE: device can run without qdisc attached + * @IFF_OPENVSWITCH: device is a Open vSwitch master */ enum netdev_priv_flags { IFF_802_1Q_VLAN = 1<<0, @@ -1293,6 +1294,7 @@ enum netdev_priv_flags { IFF_IPVLAN_SLAVE = 1<<24, IFF_VRF_MASTER = 1<<25, IFF_NO_QUEUE = 1<<26, + IFF_OPENVSWITCH = 1<<27, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1322,6 +1324,7 @@ enum netdev_priv_flags { #define IFF_IPVLAN_SLAVE IFF_IPVLAN_SLAVE #define IFF_VRF_MASTER IFF_VRF_MASTER #define IFF_NO_QUEUE IFF_NO_QUEUE +#define IFF_OPENVSWITCH IFF_OPENVSWITCH /** * struct net_device - The DEVICE structure. @@ -3853,6 +3856,11 @@ static inline bool netif_is_bridge_master(const struct net_device *dev) return dev->priv_flags & IFF_EBRIDGE; } +static inline bool netif_is_ovs_master(const struct net_device *dev) +{ + return dev->priv_flags & IFF_OPENVSWITCH; +} + static inline bool netif_index_is_vrf(struct net *net, int ifindex) { bool rc = false; diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index c058bbf876c3..80b3e12ec882 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -135,7 +135,7 @@ static void do_setup(struct net_device *netdev) netdev->netdev_ops = &internal_dev_netdev_ops; netdev->priv_flags &= ~IFF_TX_SKB_SHARING; - netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH; netdev->destructor = internal_dev_destructor; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; -- cgit v1.2.3 From 7b85b4dff2caa5e52726093fd058f87bbc14f156 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Thu, 27 Aug 2015 15:25:46 -0700 Subject: openvswitch: Include ip6_fib.h. kbuild test robot reports that certain configurations will not automatically pick up on the "struct rt6_info" definition, so explicitly include the header for this structure. Fixes: 7f8a436 "openvswitch: Add conntrack action" Signed-off-by: Joe Stringer Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 736a113a75c3..4487543806bb 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From b22fbf22f8469d8cacb6fcf5d266426826e1137d Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 27 Aug 2015 14:19:20 -0700 Subject: bridge: fdb: rearrange net_bridge_fdb_entry While looking into fixing the local entries scalability issue I noticed that the structure is badly arranged because vlan_id would fall in a second cache line while keeping rcu which is used only when deleting in the first, so re-arrange the structure and push rcu to the end so we can get 16 bytes which can be used for other fields (by pushing rcu fully in the second 64 byte chunk). With this change all the core necessary information when doing fdb lookups will be available in a single cache line. pahole before (note vlan_id): struct net_bridge_fdb_entry { struct hlist_node hlist; /* 0 16 */ struct net_bridge_port * dst; /* 16 8 */ struct callback_head rcu; /* 24 16 */ long unsigned int updated; /* 40 8 */ long unsigned int used; /* 48 8 */ mac_addr addr; /* 56 6 */ unsigned char is_local:1; /* 62: 7 1 */ unsigned char is_static:1; /* 62: 6 1 */ unsigned char added_by_user:1; /* 62: 5 1 */ unsigned char added_by_external_learn:1; /* 62: 4 1 */ /* XXX 4 bits hole, try to pack */ /* XXX 1 byte hole, try to pack */ /* --- cacheline 1 boundary (64 bytes) --- */ __u16 vlan_id; /* 64 2 */ /* size: 72, cachelines: 2, members: 11 */ /* sum members: 65, holes: 1, sum holes: 1 */ /* bit holes: 1, sum bit holes: 4 bits */ /* padding: 6 */ /* last cacheline: 8 bytes */ } pahole after (note vlan_id): struct net_bridge_fdb_entry { struct hlist_node hlist; /* 0 16 */ struct net_bridge_port * dst; /* 16 8 */ long unsigned int updated; /* 24 8 */ long unsigned int used; /* 32 8 */ mac_addr addr; /* 40 6 */ __u16 vlan_id; /* 46 2 */ unsigned char is_local:1; /* 48: 7 1 */ unsigned char is_static:1; /* 48: 6 1 */ unsigned char added_by_user:1; /* 48: 5 1 */ unsigned char added_by_external_learn:1; /* 48: 4 1 */ /* XXX 4 bits hole, try to pack */ /* XXX 7 bytes hole, try to pack */ struct callback_head rcu; /* 56 16 */ /* --- cacheline 1 boundary (64 bytes) was 8 bytes ago --- */ /* size: 72, cachelines: 2, members: 11 */ /* sum members: 65, holes: 1, sum holes: 7 */ /* bit holes: 1, sum bit holes: 4 bits */ /* last cacheline: 8 bytes */ } Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/bridge/br_private.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 19e8f79b6b99..213baf7aaa93 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -95,15 +95,15 @@ struct net_bridge_fdb_entry struct hlist_node hlist; struct net_bridge_port *dst; - struct rcu_head rcu; unsigned long updated; unsigned long used; mac_addr addr; + __u16 vlan_id; unsigned char is_local:1, is_static:1, added_by_user:1, added_by_external_learn:1; - __u16 vlan_id; + struct rcu_head rcu; }; struct net_bridge_port_group { -- cgit v1.2.3 From 69dba9bbc50609f19ee89d62d5199c81fcbc74b2 Mon Sep 17 00:00:00 2001 From: Jean Sacren Date: Thu, 27 Aug 2015 18:05:49 -0600 Subject: sock: fix kernel doc error The symbol '__sk_reclaim' is not present in the current tree. Apparently '__sk_reclaim' was meant to be '__sk_mem_reclaim', so fix it with the right symbol name for the kernel doc. Signed-off-by: Jean Sacren Cc: Hideo Aoki Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 193901d09757..ca2984afe16e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2078,7 +2078,7 @@ suppress_allocation: EXPORT_SYMBOL(__sk_mem_schedule); /** - * __sk_reclaim - reclaim memory_allocated + * __sk_mem_reclaim - reclaim memory_allocated * @sk: socket * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) */ -- cgit v1.2.3 From f84bb1eac0275283ccd76455e20f926e186ea8c8 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 27 Aug 2015 21:21:36 +0200 Subject: net: fix IFF_NO_QUEUE for drivers using alloc_netdev Printing a warning in alloc_netdev_mqs() if tx_queue_len is zero and IFF_NO_QUEUE not set is not appropriate since drivers may use one of the alloc_netdev* macros instead of alloc_etherdev*, thereby not intentionally leaving tx_queue_len uninitialized. Instead check here if tx_queue_len is zero and set IFF_NO_QUEUE, so the value of tx_queue_len can be ignored in net/sched_generic.c. Fixes: 906470c ("net: warn if drivers set tx_queue_len = 0") Signed-off-by: Phil Sutter Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index a8e6cf4298d3..877c84834d81 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7010,7 +7010,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, setup(dev); if (!dev->tx_queue_len) - printk(KERN_WARNING "%s uses DEPRECATED zero tx_queue_len - convert driver to use IFF_NO_QUEUE instead.\n", name); + dev->priv_flags |= IFF_NO_QUEUE; dev->num_tx_queues = txqs; dev->real_num_tx_queues = txqs; -- cgit v1.2.3 From db4094bca7a5746bc8e36db0557e8732963e88f0 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 27 Aug 2015 21:21:37 +0200 Subject: net: sched: ignore tx_queue_len when assigning default qdisc Since alloc_netdev_mqs() sets IFF_NO_QUEUE for drivers not initializing tx_queue_len, it is safe to assume that if tx_queue_len is zero, dev->priv flags always contains IFF_NO_QUEUE. Signed-off-by: Phil Sutter Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 942fea8405a4..f501b7409320 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -735,7 +735,7 @@ static void attach_one_default_qdisc(struct net_device *dev, { struct Qdisc *qdisc = &noqueue_qdisc; - if (dev->tx_queue_len && !(dev->priv_flags & IFF_NO_QUEUE)) { + if (!(dev->priv_flags & IFF_NO_QUEUE)) { qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops, TC_H_ROOT); if (!qdisc) { @@ -756,7 +756,6 @@ static void attach_default_qdiscs(struct net_device *dev) txq = netdev_get_tx_queue(dev, 0); if (!netif_is_multiqueue(dev) || - dev->tx_queue_len == 0 || dev->priv_flags & IFF_NO_QUEUE) { netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL); dev->qdisc = txq->qdisc_sleeping; -- cgit v1.2.3 From d66d6c3152e8d5a6db42a56bf7ae1c6cae87ba48 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 27 Aug 2015 21:21:38 +0200 Subject: net: sched: register noqueue qdisc This way users can attach noqueue just like any other qdisc using tc without having to mess with tx_queue_len first. Signed-off-by: Phil Sutter Signed-off-by: David S. Miller --- include/net/sch_generic.h | 1 + net/sched/sch_api.c | 1 + net/sched/sch_generic.c | 12 +++++++++++- 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 2eab08c38e32..444faa89a55f 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -340,6 +340,7 @@ extern struct Qdisc noop_qdisc; extern struct Qdisc_ops noop_qdisc_ops; extern struct Qdisc_ops pfifo_fast_ops; extern struct Qdisc_ops mq_qdisc_ops; +extern struct Qdisc_ops noqueue_qdisc_ops; extern const struct Qdisc_ops *default_qdisc_ops; struct Qdisc_class_common { diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 59c227f26b56..a3c70a18a764 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1942,6 +1942,7 @@ static int __init pktsched_init(void) register_qdisc(&bfifo_qdisc_ops); register_qdisc(&pfifo_head_drop_qdisc_ops); register_qdisc(&mq_qdisc_ops); + register_qdisc(&noqueue_qdisc_ops); rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL); rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index f501b7409320..d5c7c0d88786 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -416,9 +416,19 @@ struct Qdisc noop_qdisc = { }; EXPORT_SYMBOL(noop_qdisc); -static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { +static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt) +{ + /* register_qdisc() assigns a default of noop_enqueue if unset, + * but __dev_queue_xmit() treats noqueue only as such + * if this is NULL - so clear it here. */ + qdisc->enqueue = NULL; + return 0; +} + +struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { .id = "noqueue", .priv_size = 0, + .init = noqueue_init, .enqueue = noop_enqueue, .dequeue = noop_dequeue, .peek = noop_dequeue, -- cgit v1.2.3 From 3e692f21532a54eeb5e6fc0ccc214cfa56fe23d3 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 27 Aug 2015 21:21:39 +0200 Subject: net: sched: simplify attach_one_default_qdisc() Now that noqueue qdisc can be attached just like any other qdisc, no special treatment is necessary anymore when attaching it as default qdisc. This change has the added benefit that 'tc qdisc show' prints noqueue instead of nothing for devices defaulting to noqueue. Signed-off-by: Phil Sutter Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 41 ++++++++++++----------------------------- 1 file changed, 12 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index d5c7c0d88786..cb5d4ad32946 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -435,24 +435,6 @@ struct Qdisc_ops noqueue_qdisc_ops __read_mostly = { .owner = THIS_MODULE, }; -static struct Qdisc noqueue_qdisc; -static struct netdev_queue noqueue_netdev_queue = { - .qdisc = &noqueue_qdisc, - .qdisc_sleeping = &noqueue_qdisc, -}; - -static struct Qdisc noqueue_qdisc = { - .enqueue = NULL, - .dequeue = noop_dequeue, - .flags = TCQ_F_BUILTIN, - .ops = &noqueue_qdisc_ops, - .list = LIST_HEAD_INIT(noqueue_qdisc.list), - .q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock), - .dev_queue = &noqueue_netdev_queue, - .busylock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock), -}; - - static const u8 prio2band[TC_PRIO_MAX + 1] = { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; @@ -743,18 +725,19 @@ static void attach_one_default_qdisc(struct net_device *dev, struct netdev_queue *dev_queue, void *_unused) { - struct Qdisc *qdisc = &noqueue_qdisc; + struct Qdisc *qdisc; + const struct Qdisc_ops *ops = default_qdisc_ops; - if (!(dev->priv_flags & IFF_NO_QUEUE)) { - qdisc = qdisc_create_dflt(dev_queue, - default_qdisc_ops, TC_H_ROOT); - if (!qdisc) { - netdev_info(dev, "activation failed\n"); - return; - } - if (!netif_is_multiqueue(dev)) - qdisc->flags |= TCQ_F_ONETXQUEUE; + if (dev->priv_flags & IFF_NO_QUEUE) + ops = &noqueue_qdisc_ops; + + qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT); + if (!qdisc) { + netdev_info(dev, "activation failed\n"); + return; } + if (!netif_is_multiqueue(dev)) + qdisc->flags |= TCQ_F_ONETXQUEUE; dev_queue->qdisc_sleeping = qdisc; } @@ -790,7 +773,7 @@ static void transition_one_qdisc(struct net_device *dev, clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state); rcu_assign_pointer(dev_queue->qdisc, new_qdisc); - if (need_watchdog_p && new_qdisc != &noqueue_qdisc) { + if (need_watchdog_p) { dev_queue->trans_start = 0; *need_watchdog_p = 1; } -- cgit v1.2.3 From df945360ce07ca592464e44fdd2ce61ee1536e1e Mon Sep 17 00:00:00 2001 From: Nicholas Krause Date: Tue, 18 Aug 2015 21:23:01 -0400 Subject: Bluetooth: Make the function sco_conn_del have a return type of void This makes the function sco_conn_del have a return type of void now due to this function always running successfully and thus never needing to signal its caller when a non recoverable internal failure occurs by returning a error code to its respective caller. Signed-off-by: Nicholas Krause Signed-off-by: Marcel Holtmann --- net/bluetooth/sco.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 688a040c5626..f315c8d0e43b 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -154,13 +154,13 @@ static void sco_chan_del(struct sock *sk, int err) sock_set_flag(sk, SOCK_ZAPPED); } -static int sco_conn_del(struct hci_conn *hcon, int err) +static void sco_conn_del(struct hci_conn *hcon, int err) { struct sco_conn *conn = hcon->sco_data; struct sock *sk; if (!conn) - return 0; + return; BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); @@ -179,7 +179,6 @@ static int sco_conn_del(struct hci_conn *hcon, int err) hcon->sco_data = NULL; kfree(conn); - return 0; } static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent) -- cgit v1.2.3 From 4e1795de10903ae561872c54a72fa1bcf78f49cd Mon Sep 17 00:00:00 2001 From: Stefan Schmidt Date: Thu, 20 Aug 2015 12:09:47 +0200 Subject: nl802154: stricter input checking for boolean inputs So far we handled boolean input by forcing them with !! and assigning them into a bool. This allowed userspace to send values > 1 which were used as 1. We should be stricter here and return -EINVAL for all but 0 or 1. Signed-off-by: Stefan Schmidt Acked-by: Alexander Aring Signed-off-by: Marcel Holtmann --- net/ieee802154/nl802154.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 1b00a14850cb..3f89c0abdab1 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -1034,7 +1034,7 @@ static int nl802154_set_lbt_mode(struct sk_buff *skb, struct genl_info *info) struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; - bool mode; + int mode; if (netif_running(dev)) return -EBUSY; @@ -1042,7 +1042,11 @@ static int nl802154_set_lbt_mode(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL802154_ATTR_LBT_MODE]) return -EINVAL; - mode = !!nla_get_u8(info->attrs[NL802154_ATTR_LBT_MODE]); + mode = nla_get_u8(info->attrs[NL802154_ATTR_LBT_MODE]); + + if (mode != 0 && mode != 1) + return -EINVAL; + if (!wpan_phy_supported_bool(mode, rdev->wpan_phy.supported.lbt)) return -EINVAL; @@ -1055,7 +1059,7 @@ nl802154_set_ackreq_default(struct sk_buff *skb, struct genl_info *info) struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; - bool ackreq; + int ackreq; if (netif_running(dev)) return -EBUSY; @@ -1063,7 +1067,11 @@ nl802154_set_ackreq_default(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL802154_ATTR_ACKREQ_DEFAULT]) return -EINVAL; - ackreq = !!nla_get_u8(info->attrs[NL802154_ATTR_ACKREQ_DEFAULT]); + ackreq = nla_get_u8(info->attrs[NL802154_ATTR_ACKREQ_DEFAULT]); + + if (ackreq != 0 && ackreq != 1) + return -EINVAL; + return rdev_set_ackreq_default(rdev, wpan_dev, ackreq); } -- cgit v1.2.3 From 618353b1f34947b3a399d6f51934f10df40e42ff Mon Sep 17 00:00:00 2001 From: Kuba Pawlak Date: Fri, 28 Aug 2015 13:05:22 +0100 Subject: Bluetooth: Fix SCO link type handling on connection complete Synchronous connections are initially created with type eSCO. Link manager may reject proposed link parameters, which triggers connection setup retry with a different set. Link type embedded in responses should be disregarded until Synchronous Connect Complete returns Success (0x00). Current code updates link type every time which creates an issue when link type changes to SCO and back to eSCO on further attepts. Issue happens with BlackBerry 9100 and 9700 with Intel WilkinsPeak on third connection setup attept 2015-05-18 01:27:57.332242 < HCI Command: Setup Synchronous Connection (0x01|0x0028) plen 17 handle 256 voice setting 0x0060 ptype 0x0380 2015-05-18 01:27:57.333604 > HCI Event: Command Status (0x0f) plen 4 Setup Synchronous Connection (0x01|0x0028) status 0x00 ncmd 1 2015-05-18 01:27:57.334614 > HCI Event: Synchronous Connect Complete (0x2c) plen 17 status 0x1a handle 0 bdaddr 30:7C:30:B3:A8:86 type SCO Error: Unsupported Remote Feature / Unsupported LMP Feature 2015-05-18 01:27:57.334895 < HCI Command: Setup Synchronous Connection (0x01|0x0028) plen 17 handle 256 voice setting 0x0060 ptype 0x0380 2015-05-18 01:27:57.335601 > HCI Event: Command Status (0x0f) plen 4 Setup Synchronous Connection (0x01|0x0028) status 0x00 ncmd 1 2015-05-18 01:27:57.336610 > HCI Event: Synchronous Connect Complete (0x2c) plen 17 status 0x1a handle 0 bdaddr 30:7C:30:B3:A8:86 type SCO Error: Unsupported Remote Feature / Unsupported LMP Feature 2015-05-18 01:27:57.336685 < HCI Command: Setup Synchronous Connection (0x01|0x0028) plen 17 handle 256 voice setting 0x0060 ptype 0x03c8 2015-05-18 01:27:57.337603 > HCI Event: Command Status (0x0f) plen 4 Setup Synchronous Connection (0x01|0x0028) status 0x00 ncmd 1 2015-05-18 01:27:57.342608 > HCI Event: Max Slots Change (0x1b) plen 3 handle 256 slots 1 2015-05-18 01:27:57.377631 > HCI Event: Synchronous Connect Complete (0x2c) plen 17 status 0x00 handle 257 bdaddr 30:7C:30:B3:A8:86 type eSCO Air mode: CVSD Signed-off-by: Kuba Pawlak Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 7ba35a9ba6b7..186041866315 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -3726,17 +3726,25 @@ static void hci_sync_conn_complete_evt(struct hci_dev *hdev, if (ev->link_type == ESCO_LINK) goto unlock; + /* When the link type in the event indicates SCO connection + * and lookup of the connection object fails, then check + * if an eSCO connection object exists. + * + * The core limits the synchronous connections to either + * SCO or eSCO. The eSCO connection is preferred and tried + * to be setup first and until successfully established, + * the link type will be hinted as eSCO. + */ conn = hci_conn_hash_lookup_ba(hdev, ESCO_LINK, &ev->bdaddr); if (!conn) goto unlock; - - conn->type = SCO_LINK; } switch (ev->status) { case 0x00: conn->handle = __le16_to_cpu(ev->handle); conn->state = BT_CONNECTED; + conn->type = ev->link_type; hci_debugfs_create_conn(conn); hci_conn_add_sysfs(conn); -- cgit v1.2.3 From 98dbbfc3f1a555194e784304d930d2aafde3e2a3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 26 Aug 2015 23:20:51 +0200 Subject: Revert "netfilter: xtables: compute exact size needed for jumpstack" This reverts commit 98d1bd802cdbc8f56868fae51edec13e86b59515. mark_source_chains will not re-visit chains, so *filter :INPUT ACCEPT [365:25776] :FORWARD ACCEPT [0:0] :OUTPUT ACCEPT [217:45832] :t1 - [0:0] :t2 - [0:0] :t3 - [0:0] :t4 - [0:0] -A t1 -i lo -j t2 -A t2 -i lo -j t3 -A t3 -i lo -j t4 # -A INPUT -j t4 # -A INPUT -j t3 # -A INPUT -j t2 -A INPUT -j t1 COMMIT Will compute a chain depth of 2 if the comments are removed. Revert back to counting the number of chains for the time being. Reported-by: Cong Wang Reported-by: Hannes Frederic Sowa Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/ipv4/netfilter/arp_tables.c | 19 +++++++------------ net/ipv4/netfilter/ip_tables.c | 28 ++++++++++------------------ net/ipv6/netfilter/ip6_tables.c | 23 ++++++++--------------- 3 files changed, 25 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index c416cb355cb0..8f87fc38ccde 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -367,13 +367,10 @@ static inline bool unconditional(const struct arpt_arp *arp) /* Figures out from what hook each rule can be called: returns 0 if * there are loops. Puts hook bitmask in comefrom. - * - * Keeps track of largest call depth seen and stores it in newinfo->stacksize. */ -static int mark_source_chains(struct xt_table_info *newinfo, +static int mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0) { - unsigned int calldepth, max_calldepth = 0; unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset @@ -389,7 +386,6 @@ static int mark_source_chains(struct xt_table_info *newinfo, /* Set initial back pointer. */ e->counters.pcnt = pos; - calldepth = 0; for (;;) { const struct xt_standard_target *t @@ -444,8 +440,6 @@ static int mark_source_chains(struct xt_table_info *newinfo, (entry0 + pos + size); e->counters.pcnt = pos; pos += size; - if (calldepth > 0) - --calldepth; } else { int newpos = t->verdict; @@ -460,10 +454,6 @@ static int mark_source_chains(struct xt_table_info *newinfo, return 0; } - if (entry0 + newpos != arpt_next_entry(e) && - ++calldepth > max_calldepth) - max_calldepth = calldepth; - /* This a jump; chase it. */ duprintf("Jump rule %u -> %u\n", pos, newpos); @@ -480,7 +470,6 @@ static int mark_source_chains(struct xt_table_info *newinfo, next: duprintf("Finished chain %u\n", hook); } - newinfo->stacksize = max_calldepth; return 1; } @@ -670,6 +659,9 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, if (ret != 0) break; ++i; + if (strcmp(arpt_get_target(iter)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); if (ret != 0) @@ -1442,6 +1434,9 @@ static int translate_compat_table(const char *name, break; } ++i; + if (strcmp(arpt_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } if (ret) { /* diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 787f99ed55e2..b0a86e73451c 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -443,15 +443,11 @@ ipt_do_table(struct sk_buff *skb, } /* Figures out from what hook each rule can be called: returns 0 if - * there are loops. Puts hook bitmask in comefrom. - * - * Keeps track of largest call depth seen and stores it in newinfo->stacksize. - */ + there are loops. Puts hook bitmask in comefrom. */ static int -mark_source_chains(struct xt_table_info *newinfo, +mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0) { - unsigned int calldepth, max_calldepth = 0; unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset @@ -465,7 +461,6 @@ mark_source_chains(struct xt_table_info *newinfo, /* Set initial back pointer. */ e->counters.pcnt = pos; - calldepth = 0; for (;;) { const struct xt_standard_target *t @@ -527,9 +522,6 @@ mark_source_chains(struct xt_table_info *newinfo, (entry0 + pos + size); e->counters.pcnt = pos; pos += size; - WARN_ON_ONCE(calldepth == 0); - if (calldepth > 0) - --calldepth; } else { int newpos = t->verdict; @@ -543,14 +535,9 @@ mark_source_chains(struct xt_table_info *newinfo, newpos); return 0; } - if (entry0 + newpos != ipt_next_entry(e) && - !(e->ip.flags & IPT_F_GOTO) && - ++calldepth > max_calldepth) - max_calldepth = calldepth; - /* This a jump; chase it. */ - duprintf("Jump rule %u -> %u, calldepth %d\n", - pos, newpos, calldepth); + duprintf("Jump rule %u -> %u\n", + pos, newpos); } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; @@ -564,7 +551,6 @@ mark_source_chains(struct xt_table_info *newinfo, next: duprintf("Finished chain %u\n", hook); } - newinfo->stacksize = max_calldepth; return 1; } @@ -844,6 +830,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (ret != 0) return ret; ++i; + if (strcmp(ipt_get_target(iter)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } if (i != repl->num_entries) { @@ -1759,6 +1748,9 @@ translate_compat_table(struct net *net, if (ret != 0) break; ++i; + if (strcmp(ipt_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } if (ret) { /* diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 4e21f80228be..0771991ed812 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -455,15 +455,11 @@ ip6t_do_table(struct sk_buff *skb, } /* Figures out from what hook each rule can be called: returns 0 if - * there are loops. Puts hook bitmask in comefrom. - * - * Keeps track of largest call depth seen and stores it in newinfo->stacksize. - */ + there are loops. Puts hook bitmask in comefrom. */ static int -mark_source_chains(struct xt_table_info *newinfo, +mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0) { - unsigned int calldepth, max_calldepth = 0; unsigned int hook; /* No recursion; use packet counter to save back ptrs (reset @@ -477,7 +473,6 @@ mark_source_chains(struct xt_table_info *newinfo, /* Set initial back pointer. */ e->counters.pcnt = pos; - calldepth = 0; for (;;) { const struct xt_standard_target *t @@ -539,8 +534,6 @@ mark_source_chains(struct xt_table_info *newinfo, (entry0 + pos + size); e->counters.pcnt = pos; pos += size; - if (calldepth > 0) - --calldepth; } else { int newpos = t->verdict; @@ -554,11 +547,6 @@ mark_source_chains(struct xt_table_info *newinfo, newpos); return 0; } - if (entry0 + newpos != ip6t_next_entry(e) && - !(e->ipv6.flags & IP6T_F_GOTO) && - ++calldepth > max_calldepth) - max_calldepth = calldepth; - /* This a jump; chase it. */ duprintf("Jump rule %u -> %u\n", pos, newpos); @@ -575,7 +563,6 @@ mark_source_chains(struct xt_table_info *newinfo, next: duprintf("Finished chain %u\n", hook); } - newinfo->stacksize = max_calldepth; return 1; } @@ -855,6 +842,9 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, if (ret != 0) return ret; ++i; + if (strcmp(ip6t_get_target(iter)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } if (i != repl->num_entries) { @@ -1767,6 +1757,9 @@ translate_compat_table(struct net *net, if (ret != 0) break; ++i; + if (strcmp(ip6t_get_target(iter1)->u.user.name, + XT_ERROR_TARGET) == 0) + ++newinfo->stacksize; } if (ret) { /* -- cgit v1.2.3 From 851345c5bbb4644911f7c351c042559a71f57d19 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 28 Aug 2015 00:16:21 +0200 Subject: netfilter: reduce sparse warnings bridge/netfilter/ebtables.c:290:26: warning: incorrect type in assignment (different modifiers) -> remove __pure annotation. ipv6/netfilter/ip6t_SYNPROXY.c:240:27: warning: cast from restricted __be16 -> switch ntohs to htons and vice versa. netfilter/core.c:391:30: warning: symbol 'nfq_ct_nat_hook' was not declared. Should it be static? -> delete it, got removed net/netfilter/nf_synproxy_core.c:221:48: warning: cast to restricted __be32 -> Use __be32 instead of u32. Tested with objdiff that these changes do not affect generated code. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 2 +- net/ipv6/netfilter/ip6t_SYNPROXY.c | 2 +- net/netfilter/core.c | 3 --- net/netfilter/nf_synproxy_core.c | 6 +++--- 4 files changed, 5 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 18ca4b24c418..48b6b01295de 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -176,7 +176,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, return 0; } -static inline __pure +static inline struct ebt_entry *ebt_next_entry(const struct ebt_entry *entry) { return (void *)entry + entry->next_offset; diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index ebbb754c2111..1e4bf99ed16e 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -237,7 +237,7 @@ synproxy_send_client_ack(const struct synproxy_net *snet, nth->ack_seq = th->ack_seq; tcp_flag_word(nth) = TCP_FLAG_ACK; nth->doff = tcp_hdr_size / 4; - nth->window = ntohs(htons(th->window) >> opts->wscale); + nth->window = htons(ntohs(th->window) >> opts->wscale); nth->check = 0; nth->urg_ptr = 0; diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 2a5a0704245c..0b939b7ad724 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -388,9 +388,6 @@ EXPORT_SYMBOL(nf_conntrack_destroy); struct nfq_ct_hook __rcu *nfq_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nfq_ct_hook); -struct nfq_ct_nat_hook __rcu *nfq_ct_nat_hook __read_mostly; -EXPORT_SYMBOL_GPL(nfq_ct_nat_hook); - #endif /* CONFIG_NF_CONNTRACK */ #ifdef CONFIG_NF_NAT_NEEDED diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 8fbbdb09826e..888b9558415e 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -188,7 +188,7 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, const struct nf_conn_synproxy *synproxy) { unsigned int optoff, optend; - u32 *ptr, old; + __be32 *ptr, old; if (synproxy->tsoff == 0) return 1; @@ -216,12 +216,12 @@ unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, if (op[0] == TCPOPT_TIMESTAMP && op[1] == TCPOLEN_TIMESTAMP) { if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { - ptr = (u32 *)&op[2]; + ptr = (__be32 *)&op[2]; old = *ptr; *ptr = htonl(ntohl(*ptr) - synproxy->tsoff); } else { - ptr = (u32 *)&op[6]; + ptr = (__be32 *)&op[6]; old = *ptr; *ptr = htonl(ntohl(*ptr) + synproxy->tsoff); -- cgit v1.2.3 From df2cf4a78e488d26728590cb3c6b4fe4c4862c77 Mon Sep 17 00:00:00 2001 From: Philip Downey Date: Thu, 27 Aug 2015 16:46:26 +0100 Subject: IGMP: Inhibit reports for local multicast groups The range of addresses between 224.0.0.0 and 224.0.0.255 inclusive, is reserved for the use of routing protocols and other low-level topology discovery or maintenance protocols, such as gateway discovery and group membership reporting. Multicast routers should not forward any multicast datagram with destination addresses in this range, regardless of its TTL. Currently, IGMP reports are generated for this reserved range of addresses even though a router will ignore this information since it has no purpose. However, the presence of reserved group addresses in an IGMP membership report uses up network bandwidth and can also obscure addresses of interest when inspecting membership reports using packet inspection or debug messages. Although the RFCs for the various version of IGMP (e.g.RFC 3376 for v3) do not specify that the reserved addresses be excluded from membership reports, it should do no harm in doing so. In particular there should be no adverse effect in any IGMP snooping functionality since 224.0.0.x is specifically excluded as per RFC 4541 (IGMP and MLD Snooping Switches Considerations) section 2.1.2. Data Forwarding Rules: 2) Packets with a destination IP (DIP) address in the 224.0.0.X range which are not IGMP must be forwarded on all ports. IGMP reports for local multicast groups can now be optionally inhibited by means of a system control variable (by setting the value to zero) e.g.: echo 0 > /proc/sys/net/ipv4/igmp_link_local_mcast_reports To retain backwards compatibility the previous behaviour is retained by default on system boot or reverted by setting the value back to non-zero e.g.: echo 1 > /proc/sys/net/ipv4/igmp_link_local_mcast_reports Signed-off-by: Philip Downey Signed-off-by: David S. Miller --- include/linux/igmp.h | 1 + net/ipv4/igmp.c | 26 +++++++++++++++++++++++++- net/ipv4/sysctl_net_ipv4.c | 7 +++++++ 3 files changed, 33 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 193ad488d3e2..908429216d9f 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -37,6 +37,7 @@ static inline struct igmpv3_query * return (struct igmpv3_query *)skb_transport_header(skb); } +extern int sysctl_igmp_llm_reports; extern int sysctl_igmp_max_memberships; extern int sysctl_igmp_max_msf; extern int sysctl_igmp_qrv; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 9fdfd9deac11..d38b8b61eaee 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -110,6 +110,9 @@ #define IP_MAX_MEMBERSHIPS 20 #define IP_MAX_MSF 10 +/* IGMP reports for link-local multicast groups are enabled by default */ +int sysctl_igmp_llm_reports __read_mostly = 1; + #ifdef CONFIG_IP_MULTICAST /* Parameter names and values are taken from igmp-v2-06 draft */ @@ -437,6 +440,8 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, if (pmc->multiaddr == IGMP_ALL_HOSTS) return skb; + if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports) + return skb; isquery = type == IGMPV3_MODE_IS_INCLUDE || type == IGMPV3_MODE_IS_EXCLUDE; @@ -545,6 +550,9 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) for_each_pmc_rcu(in_dev, pmc) { if (pmc->multiaddr == IGMP_ALL_HOSTS) continue; + if (ipv4_is_local_multicast(pmc->multiaddr) && + !sysctl_igmp_llm_reports) + continue; spin_lock_bh(&pmc->lock); if (pmc->sfcount[MCAST_EXCLUDE]) type = IGMPV3_MODE_IS_EXCLUDE; @@ -678,7 +686,11 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) return igmpv3_send_report(in_dev, pmc); - else if (type == IGMP_HOST_LEAVE_MESSAGE) + + if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) + return 0; + + if (type == IGMP_HOST_LEAVE_MESSAGE) dst = IGMP_ALL_ROUTER; else dst = group; @@ -851,6 +863,8 @@ static bool igmp_heard_report(struct in_device *in_dev, __be32 group) if (group == IGMP_ALL_HOSTS) return false; + if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports) + return false; rcu_read_lock(); for_each_pmc_rcu(in_dev, im) { @@ -957,6 +971,9 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, continue; if (im->multiaddr == IGMP_ALL_HOSTS) continue; + if (ipv4_is_local_multicast(im->multiaddr) && + !sysctl_igmp_llm_reports) + continue; spin_lock_bh(&im->lock); if (im->tm_running) im->gsquery = im->gsquery && mark; @@ -1181,6 +1198,8 @@ static void igmp_group_dropped(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; + if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) + return; reporter = im->reporter; igmp_stop_timer(im); @@ -1213,6 +1232,8 @@ static void igmp_group_added(struct ip_mc_list *im) #ifdef CONFIG_IP_MULTICAST if (im->multiaddr == IGMP_ALL_HOSTS) return; + if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports) + return; if (in_dev->dead) return; @@ -1518,6 +1539,9 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev) for_each_pmc_rtnl(in_dev, im) { if (im->multiaddr == IGMP_ALL_HOSTS) continue; + if (ipv4_is_local_multicast(im->multiaddr) && + !sysctl_igmp_llm_reports) + continue; /* a failover is happening and switches * must be notified immediately diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 879bdc5c95b1..894da3a70aff 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -929,6 +929,13 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "igmp_link_local_mcast_reports", + .data = &sysctl_igmp_llm_reports, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, { } }; -- cgit v1.2.3 From 72afa352d6a3d4da7783b5ddee02b94be49e051a Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 27 Aug 2015 16:06:59 -0700 Subject: net: Introduce ipv4_addr_hash and use it for tcp metrics Refactors a common line into helper function. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/ip.h | 5 +++++ net/ipv4/tcp_metrics.c | 12 ++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index bee5f3582e38..7b9e1c782aa3 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -458,6 +458,11 @@ static __inline__ void inet_reset_saddr(struct sock *sk) #endif +static inline unsigned int ipv4_addr_hash(__be32 ip) +{ + return (__force unsigned int) ip; +} + bool ip_call_ra_chain(struct sk_buff *skb); /* diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index b3d64f61d922..3a4289268f97 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -249,7 +249,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, case AF_INET: saddr.addr.a4 = inet_rsk(req)->ir_loc_addr; daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr; - hash = (__force unsigned int) daddr.addr.a4; + hash = ipv4_addr_hash(inet_rsk(req)->ir_rmt_addr); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: @@ -289,7 +289,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock saddr.addr.a4 = tw->tw_rcv_saddr; daddr.family = AF_INET; daddr.addr.a4 = tw->tw_daddr; - hash = (__force unsigned int) daddr.addr.a4; + hash = ipv4_addr_hash(tw->tw_daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (tw->tw_family == AF_INET6) { @@ -298,7 +298,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock saddr.addr.a4 = tw->tw_rcv_saddr; daddr.family = AF_INET; daddr.addr.a4 = tw->tw_daddr; - hash = (__force unsigned int) daddr.addr.a4; + hash = ipv4_addr_hash(tw->tw_daddr); } else { saddr.family = AF_INET6; saddr.addr.in6 = tw->tw_v6_rcv_saddr; @@ -339,7 +339,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, saddr.addr.a4 = inet_sk(sk)->inet_saddr; daddr.family = AF_INET; daddr.addr.a4 = inet_sk(sk)->inet_daddr; - hash = (__force unsigned int) daddr.addr.a4; + hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { @@ -348,7 +348,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, saddr.addr.a4 = inet_sk(sk)->inet_saddr; daddr.family = AF_INET; daddr.addr.a4 = inet_sk(sk)->inet_daddr; - hash = (__force unsigned int) daddr.addr.a4; + hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr); } else { saddr.family = AF_INET6; saddr.addr.in6 = sk->sk_v6_rcv_saddr; @@ -959,7 +959,7 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, addr->family = AF_INET; addr->addr.a4 = nla_get_in_addr(a); if (hash) - *hash = (__force unsigned int) addr->addr.a4; + *hash = ipv4_addr_hash(addr->addr.a4); return 0; } a = info->attrs[v6]; -- cgit v1.2.3 From 3abef286cf2f138de353fb0b54453621de961043 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 27 Aug 2015 16:07:00 -0700 Subject: net: Add set,get helpers for inetpeer addresses Use inetpeer set,get helpers in tcp_metrics rather than peeking into the inetpeer_addr struct. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/inetpeer.h | 23 ++++++++++++++++++ net/ipv4/tcp_metrics.c | 65 +++++++++++++++++++++----------------------------- 2 files changed, 50 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 002f0bd27001..f75b9e7036a2 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -71,6 +71,29 @@ void inet_initpeers(void) __init; #define INETPEER_METRICS_NEW (~(u32) 0) +static inline void inetpeer_set_addr_v4(struct inetpeer_addr *iaddr, __be32 ip) +{ + iaddr->addr.a4 = ip; + iaddr->family = AF_INET; +} + +static inline __be32 inetpeer_get_addr_v4(struct inetpeer_addr *iaddr) +{ + return iaddr->addr.a4; +} + +static inline void inetpeer_set_addr_v6(struct inetpeer_addr *iaddr, + struct in6_addr *in6) +{ + iaddr->addr.in6 = *in6; + iaddr->family = AF_INET6; +} + +static inline struct in6_addr *inetpeer_get_addr_v6(struct inetpeer_addr *iaddr) +{ + return &iaddr->addr.in6; +} + /* can be called with or without local BH being disabled */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, const struct inetpeer_addr *daddr, diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 3a4289268f97..4ef4dd4bf38c 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -247,14 +247,14 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, daddr.family = req->rsk_ops->family; switch (daddr.family) { case AF_INET: - saddr.addr.a4 = inet_rsk(req)->ir_loc_addr; - daddr.addr.a4 = inet_rsk(req)->ir_rmt_addr; + inetpeer_set_addr_v4(&saddr, inet_rsk(req)->ir_loc_addr); + inetpeer_set_addr_v4(&daddr, inet_rsk(req)->ir_rmt_addr); hash = ipv4_addr_hash(inet_rsk(req)->ir_rmt_addr); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - saddr.addr.in6 = inet_rsk(req)->ir_v6_loc_addr; - daddr.addr.in6 = inet_rsk(req)->ir_v6_rmt_addr; + inetpeer_set_addr_v6(&saddr, &inet_rsk(req)->ir_v6_loc_addr); + inetpeer_set_addr_v6(&daddr, &inet_rsk(req)->ir_v6_rmt_addr); hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr); break; #endif @@ -285,25 +285,19 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock struct net *net; if (tw->tw_family == AF_INET) { - saddr.family = AF_INET; - saddr.addr.a4 = tw->tw_rcv_saddr; - daddr.family = AF_INET; - daddr.addr.a4 = tw->tw_daddr; + inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr); + inetpeer_set_addr_v4(&daddr, tw->tw_daddr); hash = ipv4_addr_hash(tw->tw_daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (tw->tw_family == AF_INET6) { if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) { - saddr.family = AF_INET; - saddr.addr.a4 = tw->tw_rcv_saddr; - daddr.family = AF_INET; - daddr.addr.a4 = tw->tw_daddr; + inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr); + inetpeer_set_addr_v4(&daddr, tw->tw_daddr); hash = ipv4_addr_hash(tw->tw_daddr); } else { - saddr.family = AF_INET6; - saddr.addr.in6 = tw->tw_v6_rcv_saddr; - daddr.family = AF_INET6; - daddr.addr.in6 = tw->tw_v6_daddr; + inetpeer_set_addr_v6(&saddr, &tw->tw_v6_rcv_saddr); + inetpeer_set_addr_v6(&daddr, &tw->tw_v6_daddr); hash = ipv6_addr_hash(&tw->tw_v6_daddr); } } @@ -335,25 +329,19 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, struct net *net; if (sk->sk_family == AF_INET) { - saddr.family = AF_INET; - saddr.addr.a4 = inet_sk(sk)->inet_saddr; - daddr.family = AF_INET; - daddr.addr.a4 = inet_sk(sk)->inet_daddr; + inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr); + inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr); hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { - saddr.family = AF_INET; - saddr.addr.a4 = inet_sk(sk)->inet_saddr; - daddr.family = AF_INET; - daddr.addr.a4 = inet_sk(sk)->inet_daddr; + inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr); + inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr); hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr); } else { - saddr.family = AF_INET6; - saddr.addr.in6 = sk->sk_v6_rcv_saddr; - daddr.family = AF_INET6; - daddr.addr.in6 = sk->sk_v6_daddr; + inetpeer_set_addr_v6(&saddr, &sk->sk_v6_rcv_saddr); + inetpeer_set_addr_v6(&daddr, &sk->sk_v6_daddr); hash = ipv6_addr_hash(&sk->sk_v6_daddr); } } @@ -796,18 +784,18 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, switch (tm->tcpm_daddr.family) { case AF_INET: if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4, - tm->tcpm_daddr.addr.a4) < 0) + inetpeer_get_addr_v4(&tm->tcpm_daddr)) < 0) goto nla_put_failure; if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4, - tm->tcpm_saddr.addr.a4) < 0) + inetpeer_get_addr_v4(&tm->tcpm_saddr)) < 0) goto nla_put_failure; break; case AF_INET6: if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6, - &tm->tcpm_daddr.addr.in6) < 0) + inetpeer_get_addr_v6(&tm->tcpm_daddr)) < 0) goto nla_put_failure; if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6, - &tm->tcpm_saddr.addr.in6) < 0) + inetpeer_get_addr_v6(&tm->tcpm_saddr)) < 0) goto nla_put_failure; break; default: @@ -956,20 +944,21 @@ static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, a = info->attrs[v4]; if (a) { - addr->family = AF_INET; - addr->addr.a4 = nla_get_in_addr(a); + inetpeer_set_addr_v4(addr, nla_get_in_addr(a)); if (hash) - *hash = ipv4_addr_hash(addr->addr.a4); + *hash = ipv4_addr_hash(inetpeer_get_addr_v4(addr)); return 0; } a = info->attrs[v6]; if (a) { + struct in6_addr in6; + if (nla_len(a) != sizeof(struct in6_addr)) return -EINVAL; - addr->family = AF_INET6; - addr->addr.in6 = nla_get_in6_addr(a); + in6 = nla_get_in6_addr(a); + inetpeer_set_addr_v6(addr, &in6); if (hash) - *hash = ipv6_addr_hash(&addr->addr.in6); + *hash = ipv6_addr_hash(inetpeer_get_addr_v6(addr)); return 0; } return optional ? 1 : -EAFNOSUPPORT; -- cgit v1.2.3 From d39d14ffa24cca9f0e44aa4a63315f4c44c56a93 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 27 Aug 2015 16:07:01 -0700 Subject: net: Add helper function to compare inetpeer addresses tcp_metrics and inetpeer both have functions to compare inetpeer addresses. Consolidate into 1 version. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/inetpeer.h | 16 ++++++++++++++++ net/ipv4/inetpeer.c | 20 ++------------------ net/ipv4/tcp_metrics.c | 6 +----- 3 files changed, 19 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index f75b9e7036a2..9d9b3446731d 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -121,6 +121,22 @@ static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base, return inet_getpeer(base, &daddr, create); } +static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a, + const struct inetpeer_addr *b) +{ + int i, n = (a->family == AF_INET ? 1 : 4); + + for (i = 0; i < n; i++) { + if (a->addr.a6[i] == b->addr.a6[i]) + continue; + if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i]) + return -1; + return 1; + } + + return 0; +} + /* can be called from BH context or outside */ void inet_putpeer(struct inet_peer *p); bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 241afd743d2c..86fa45809540 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -157,22 +157,6 @@ void __init inet_initpeers(void) INIT_DEFERRABLE_WORK(&gc_work, inetpeer_gc_worker); } -static int addr_compare(const struct inetpeer_addr *a, - const struct inetpeer_addr *b) -{ - int i, n = (a->family == AF_INET ? 1 : 4); - - for (i = 0; i < n; i++) { - if (a->addr.a6[i] == b->addr.a6[i]) - continue; - if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i]) - return -1; - return 1; - } - - return 0; -} - #define rcu_deref_locked(X, BASE) \ rcu_dereference_protected(X, lockdep_is_held(&(BASE)->lock.lock)) @@ -188,7 +172,7 @@ static int addr_compare(const struct inetpeer_addr *a, *stackptr++ = &_base->root; \ for (u = rcu_deref_locked(_base->root, _base); \ u != peer_avl_empty;) { \ - int cmp = addr_compare(_daddr, &u->daddr); \ + int cmp = inetpeer_addr_cmp(_daddr, &u->daddr); \ if (cmp == 0) \ break; \ if (cmp == -1) \ @@ -215,7 +199,7 @@ static struct inet_peer *lookup_rcu(const struct inetpeer_addr *daddr, int count = 0; while (u != peer_avl_empty) { - int cmp = addr_compare(daddr, &u->daddr); + int cmp = inetpeer_addr_cmp(daddr, &u->daddr); if (cmp == 0) { /* Before taking a reference, check if this entry was * deleted (refcnt=-1) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 4ef4dd4bf38c..c8cbc2b4b792 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -81,11 +81,7 @@ static void tcp_metric_set(struct tcp_metrics_block *tm, static bool addr_same(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { - if (a->family != b->family) - return false; - if (a->family == AF_INET) - return a->addr.a4 == b->addr.a4; - return ipv6_addr_equal(&a->addr.in6, &b->addr.in6); + return inetpeer_addr_cmp(a, b) == 0; } struct tcpm_hash_bucket { -- cgit v1.2.3 From 192132b9a034d87566294be0fba5f8f75c2cf16b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 27 Aug 2015 16:07:03 -0700 Subject: net: Add support for VRFs to inetpeer cache inetpeer caches based on address only, so duplicate IP addresses within a namespace return the same cached entry. Enhance the ipv4 address key to contain both the IPv4 address and VRF device index. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/inetpeer.h | 17 ++++++++++++----- net/ipv4/icmp.c | 3 ++- net/ipv4/ip_fragment.c | 3 ++- net/ipv4/route.c | 7 +++++-- 4 files changed, 21 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index e34f98aa93b1..4a6009d4486b 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -15,11 +15,17 @@ #include #include +/* IPv4 address key for cache lookups */ +struct ipv4_addr_key { + __be32 addr; + int vif; +}; + #define INETPEER_MAXKEYSZ (sizeof(struct in6_addr) / sizeof(u32)) struct inetpeer_addr { union { - __be32 a4; + struct ipv4_addr_key a4; struct in6_addr a6; u32 key[INETPEER_MAXKEYSZ]; }; @@ -71,13 +77,13 @@ void inet_initpeers(void) __init; static inline void inetpeer_set_addr_v4(struct inetpeer_addr *iaddr, __be32 ip) { - iaddr->a4 = ip; + iaddr->a4.addr = ip; iaddr->family = AF_INET; } static inline __be32 inetpeer_get_addr_v4(struct inetpeer_addr *iaddr) { - return iaddr->a4; + return iaddr->a4.addr; } static inline void inetpeer_set_addr_v6(struct inetpeer_addr *iaddr, @@ -99,11 +105,12 @@ struct inet_peer *inet_getpeer(struct inet_peer_base *base, static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base, __be32 v4daddr, - int create) + int vif, int create) { struct inetpeer_addr daddr; - daddr.a4 = v4daddr; + daddr.a4.addr = v4daddr; + daddr.a4.vif = vif; daddr.family = AF_INET; return inet_getpeer(base, &daddr, create); } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index f16488efa1c8..79fe05befcae 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -309,9 +309,10 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, rc = false; if (icmp_global_allow()) { + int vif = vrf_master_ifindex(dst->dev); struct inet_peer *peer; - peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, 1); + peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, vif, 1); rc = inet_peer_xrlim_allow(peer, net->ipv4.sysctl_icmp_ratelimit); if (peer) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 15762e758861..fa7f15305f9a 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -151,7 +151,8 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a) qp->vif = arg->vif; qp->user = arg->user; qp->peer = sysctl_ipfrag_max_dist ? - inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, 1) : NULL; + inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : + NULL; } static void ip4_frag_free(struct inet_frag_queue *q) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f3087aaa6dd8..6b91879e9cbe 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -838,6 +838,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) struct inet_peer *peer; struct net *net; int log_martians; + int vif; rcu_read_lock(); in_dev = __in_dev_get_rcu(rt->dst.dev); @@ -846,10 +847,11 @@ void ip_rt_send_redirect(struct sk_buff *skb) return; } log_martians = IN_DEV_LOG_MARTIANS(in_dev); + vif = vrf_master_ifindex_rcu(rt->dst.dev); rcu_read_unlock(); net = dev_net(rt->dst.dev); - peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1); if (!peer) { icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt_nexthop(rt, ip_hdr(skb)->daddr)); @@ -938,7 +940,8 @@ static int ip_error(struct sk_buff *skb) break; } - peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); + peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, + vrf_master_ifindex(skb->dev), 1); send = true; if (peer) { -- cgit v1.2.3 From 9723e6abc70a472c726d5d5ac6402a2d1bb10882 Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Fri, 28 Aug 2015 10:39:56 +0200 Subject: openswitch: fix typo CONFIG_NF_CONNTRACK_LABEL Fix typo in conntrack.c s/CONFIG_NF_CONNTRACK_LABEL/CONFIG_NF_CONNTRACK_LABELS/ Signed-off-by: Valentin Rothberg Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 890d3eedb447..886bd2758502 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -169,7 +169,7 @@ int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark)) return -EMSGSIZE; - if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABEL) && + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && nla_put(skb, OVS_KEY_ATTR_CT_LABEL, sizeof(key->ct.label), &key->ct.label)) return -EMSGSIZE; -- cgit v1.2.3 From c1b3b19923a371c9e099c30372d376b02fe66088 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 28 Aug 2015 18:46:39 +0200 Subject: net: sched: don't break line in tc_classify loop notification Just some minor noise follow-up to address some stylistic issues of commit 3b3ae880266d ("net: sched: consolidate tc_classify{,_compat}"). Accidentally v1 instead of v2 of that commit got applied, so this patch adds the relative diff. Suggested-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/sched/sch_api.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index a3c70a18a764..f43c8f33f09e 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1825,8 +1825,7 @@ reclassify: err = tp->classify(skb, tp, res); #ifdef CONFIG_NET_CLS_ACT - if (unlikely(err == TC_ACT_RECLASSIFY && - !compat_mode)) + if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) goto reset; #endif if (err >= 0) @@ -1837,9 +1836,9 @@ reclassify: #ifdef CONFIG_NET_CLS_ACT reset: if (unlikely(limit++ >= MAX_REC_LOOP)) { - net_notice_ratelimited("%s: reclassify loop, rule prio %u, " - "protocol %02x\n", tp->q->ops->id, - tp->prio & 0xffff, ntohs(tp->protocol)); + net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n", + tp->q->ops->id, tp->prio & 0xffff, + ntohs(tp->protocol)); return TC_ACT_SHOT; } -- cgit v1.2.3 From c9fd56b34efd06a031cbc918d288f09e38173ff9 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 28 Aug 2015 15:44:25 -0700 Subject: netpoll: warn on netpoll_send_udp users who haven't disabled irqs Make sure we catch future netpoll_send_udp users who use it without disabling irqs and also as a hint for poll_controller users. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/core/netpoll.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c126a878c47c..6aa3db8dfc3b 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -380,6 +380,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len) static atomic_t ip_ident; struct ipv6hdr *ip6h; + WARN_ON_ONCE(!irqs_disabled()); + udp_len = len + sizeof(*udph); if (np->ipv6) ip_len = udp_len + sizeof(*ip6h); -- cgit v1.2.3 From 0d5cdef8d5dd0a6819fd85305adb448f5ba56f24 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Fri, 28 Aug 2015 19:22:11 -0700 Subject: openvswitch: Fix conntrack compilation without mark. Fix build with !CONFIG_NF_CONNTRACK_MARK && CONFIG_OPENVSWITCH_CONNTRACK Fixes: 182e304 ("openvswitch: Allow matching on conntrack mark") Reported-by: Simon Horman Signed-off-by: Joe Stringer Tested-by: Simon Horman Signed-off-by: David S. Miller --- net/openvswitch/conntrack.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 886bd2758502..e8e524ad8a01 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -100,6 +100,15 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) return ct_state; } +static u32 ovs_ct_get_mark(const struct nf_conn *ct) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + return ct ? ct->mark : 0; +#else + return 0; +#endif +} + static void ovs_ct_get_label(const struct nf_conn *ct, struct ovs_key_ct_label *label) { @@ -124,7 +133,7 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, { key->ct.state = state; key->ct.zone = zone->id; - key->ct.mark = ct ? ct->mark : 0; + key->ct.mark = ovs_ct_get_mark(ct); ovs_ct_get_label(ct, &key->ct.label); } @@ -180,12 +189,11 @@ int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb) static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, u32 ct_mark, u32 mask) { +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) enum ip_conntrack_info ctinfo; struct nf_conn *ct; u32 new_mark; - if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) - return -ENOTSUPP; /* The connection could be invalid, in which case set_mark is no-op. */ ct = nf_ct_get(skb, &ctinfo); @@ -200,6 +208,9 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key, } return 0; +#else + return -ENOTSUPP; +#endif } static int ovs_ct_set_label(struct sk_buff *skb, struct sw_flow_key *key, -- cgit v1.2.3 From 0a6a3a23ea6efde079a5b77688541a98bf202721 Mon Sep 17 00:00:00 2001 From: Christophe Ricard Date: Fri, 28 Aug 2015 07:07:48 +0200 Subject: netlink: add NETLINK_CAP_ACK socket option Since commit c05cdb1b864f ("netlink: allow large data transfers from user-space"), the kernel may fail to allocate the necessary room for the acknowledgment message back to userspace. This patch introduces a new socket option that trims off the payload of the original netlink message. The netlink message header is still included, so the user can guess from the sequence number what is the message that has triggered the acknowledgment. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Christophe Ricard Signed-off-by: David S. Miller --- include/uapi/linux/netlink.h | 1 + net/netlink/af_netlink.c | 27 ++++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index cf6a65cccbdf..6f3fe16cd22a 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -110,6 +110,7 @@ struct nlmsgerr { #define NETLINK_TX_RING 7 #define NETLINK_LISTEN_ALL_NSID 8 #define NETLINK_LIST_MEMBERSHIPS 9 +#define NETLINK_CAP_ACK 10 struct nl_pktinfo { __u32 group; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index a774985489e2..3eea0b2a3239 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -84,6 +84,7 @@ struct listeners { #define NETLINK_F_BROADCAST_SEND_ERROR 0x4 #define NETLINK_F_RECV_NO_ENOBUFS 0x8 #define NETLINK_F_LISTEN_ALL_NSID 0x10 +#define NETLINK_F_CAP_ACK 0x20 static inline int netlink_is_kernel(struct sock *sk) { @@ -2258,6 +2259,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID; err = 0; break; + case NETLINK_CAP_ACK: + if (val) + nlk->flags |= NETLINK_F_CAP_ACK; + else + nlk->flags &= ~NETLINK_F_CAP_ACK; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2332,6 +2340,16 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, netlink_table_ungrab(); break; } + case NETLINK_CAP_ACK: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_F_CAP_ACK ? 1 : 0; + if (put_user(len, optlen) || + put_user(val, optval)) + return -EFAULT; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2873,9 +2891,12 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) struct nlmsghdr *rep; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg); + struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk); - /* error messages get the original request appened */ - if (err) + /* Error messages get the original request appened, unless the user + * requests to cap the error message. + */ + if (!(nlk->flags & NETLINK_F_CAP_ACK) && err) payload += nlmsg_len(nlh); skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload), @@ -2898,7 +2919,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) NLMSG_ERROR, payload, 0); errmsg = nlmsg_data(rep); errmsg->error = err; - memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(*nlh)); + memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh)); netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT); } EXPORT_SYMBOL(netlink_ack); -- cgit v1.2.3 From 7084a315897715776d1764f5fd9250609e515beb Mon Sep 17 00:00:00 2001 From: Ken-ichirou MATSUZAWA Date: Fri, 28 Aug 2015 16:05:20 +0900 Subject: netlink: mmap: fix lookup frame position __netlink_lookup_frame() was always called with the same "pos" value in netlink_forward_ring(). It will look at the same ring entry header over and over again, every time through this loop. Then cycle through the whole ring, advancing ring->head, not "pos" until it equals the "ring->head != head" loop test fails. Signed-off-by: Ken-ichirou MATSUZAWA Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 3eea0b2a3239..7965ca7c461d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -611,11 +611,11 @@ static void netlink_increment_head(struct netlink_ring *ring) static void netlink_forward_ring(struct netlink_ring *ring) { - unsigned int head = ring->head, pos = head; + unsigned int head = ring->head; const struct nl_mmap_hdr *hdr; do { - hdr = __netlink_lookup_frame(ring, pos); + hdr = __netlink_lookup_frame(ring, ring->head); if (hdr->nm_status == NL_MMAP_STATUS_UNUSED) break; if (hdr->nm_status != NL_MMAP_STATUS_SKIP) -- cgit v1.2.3 From 7c5a9461812b21e1a1b7a09a83e97530b8867459 Mon Sep 17 00:00:00 2001 From: lucien Date: Fri, 28 Aug 2015 17:45:58 +0800 Subject: sctp: ASCONF-ACK with Unresolvable Address should be sent RFC 5061: This is an opaque integer assigned by the sender to identify each request parameter. The receiver of the ASCONF Chunk will copy this 32-bit value into the ASCONF Response Correlation ID field of the ASCONF-ACK response parameter. The sender of the ASCONF can use this same value in the ASCONF-ACK to find which request the response is for. Note that the receiver MUST NOT change this 32-bit value. Address Parameter: TLV This field contains an IPv4 or IPv6 address parameter, as described in Section 3.3.2.1 of [RFC4960]. ASCONF chunk with Error Cause Indication Parameter (Unresolvable Address) should be sent if the Delete IP Address is not part of the association. Endpoint A Endpoint B (ESTABLISHED) (ESTABLISHED) ASCONF -----------------> (Delete IP Address) <----------------- ASCONF-ACK (Unresolvable Address) Signed-off-by: Xin Long Acked-by: Vlad Yasevich Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- net/sctp/sm_make_chunk.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index a655ddc3f353..7954c52e1794 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3090,8 +3090,19 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, sctp_assoc_set_primary(asoc, asconf->transport); sctp_assoc_del_nonprimary_peers(asoc, asconf->transport); - } else - sctp_assoc_del_peer(asoc, &addr); + return SCTP_ERROR_NO_ERROR; + } + + /* If the address is not part of the association, the + * ASCONF-ACK with Error Cause Indication Parameter + * which including cause of Unresolvable Address should + * be sent. + */ + peer = sctp_assoc_lookup_paddr(asoc, &addr); + if (!peer) + return SCTP_ERROR_DNS_FAILED; + + sctp_assoc_rm_peer(asoc, peer); break; case SCTP_PARAM_SET_PRIMARY: /* ADDIP Section 4.2.4 -- cgit v1.2.3 From 73e6742027f5cb5a7c747d9abab5351b01fd4c74 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Fri, 28 Aug 2015 21:23:39 -0400 Subject: sctp: Do not try to search for the transport twice When removing an non-primary transport during ASCONF processing, we end up traversing the transport list twice: once in sctp_cmd_del_non_primary, and once in sctp_assoc_del_peer. We can avoid the second search and call sctp_assoc_rm_peer() instead. Found by code inspection during code reviews. Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- net/sctp/sm_sideeffect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 85e6f03aeb70..35df1266bf07 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -954,7 +954,7 @@ static void sctp_cmd_del_non_primary(struct sctp_association *asoc) t = list_entry(pos, struct sctp_transport, transports); if (!sctp_cmp_addr_exact(&t->ipaddr, &asoc->peer.primary_addr)) { - sctp_assoc_del_peer(asoc, &t->ipaddr); + sctp_assoc_rm_peer(asoc, t); } } } -- cgit v1.2.3 From f6d3c19274c74ff17174df8078e0a14df003667f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 28 Aug 2015 08:42:09 -0700 Subject: net: FIB tracepoints A few useful tracepoints developing VRF driver. Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/trace/events/fib.h | 111 +++++++++++++++++++++++++++++++++++++++++++++ net/core/net-traces.c | 1 + net/ipv4/fib_frontend.c | 3 ++ net/ipv4/fib_trie.c | 5 ++ 4 files changed, 120 insertions(+) create mode 100644 include/trace/events/fib.h (limited to 'net') diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h new file mode 100644 index 000000000000..4030f75410d7 --- /dev/null +++ b/include/trace/events/fib.h @@ -0,0 +1,111 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM fib + +#if !defined(_TRACE_FIB_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_FIB_H + +#include +#include +#include +#include + +TRACE_EVENT(fib_table_lookup, + + TP_PROTO(int tb_id, const struct flowi4 *flp), + + TP_ARGS(tb_id, flp), + + TP_STRUCT__entry( + __field( int, tb_id ) + __field( int, oif ) + __field( int, iif ) + __field( __u8, tos ) + __field( __u8, scope ) + __field( __u8, flags ) + __array( __u8, src, 4 ) + __array( __u8, dst, 4 ) + ), + + TP_fast_assign( + __be32 *p32; + + __entry->tb_id = tb_id; + __entry->oif = flp->flowi4_oif; + __entry->iif = flp->flowi4_iif; + __entry->tos = flp->flowi4_tos; + __entry->scope = flp->flowi4_scope; + __entry->flags = flp->flowi4_flags; + + p32 = (__be32 *) __entry->src; + *p32 = flp->saddr; + + p32 = (__be32 *) __entry->dst; + *p32 = flp->daddr; + ), + + TP_printk("table %d oif %d iif %d src %pI4 dst %pI4 tos %d scope %d flags %x", + __entry->tb_id, __entry->oif, __entry->iif, + __entry->src, __entry->dst, __entry->tos, __entry->scope, + __entry->flags) +); + +TRACE_EVENT(fib_table_lookup_nh, + + TP_PROTO(const struct fib_nh *nh), + + TP_ARGS(nh), + + TP_STRUCT__entry( + __string( name, nh->nh_dev->name) + __field( int, oif ) + __array( __u8, src, 4 ) + ), + + TP_fast_assign( + __be32 *p32 = (__be32 *) __entry->src; + + __assign_str(name, nh->nh_dev ? nh->nh_dev->name : "not set"); + __entry->oif = nh->nh_oif; + *p32 = nh->nh_saddr; + ), + + TP_printk("nexthop dev %s oif %d src %pI4", + __get_str(name), __entry->oif, __entry->src) +); + +TRACE_EVENT(fib_validate_source, + + TP_PROTO(const struct net_device *dev, const struct flowi4 *flp), + + TP_ARGS(dev, flp), + + TP_STRUCT__entry( + __string( name, dev->name ) + __field( int, oif ) + __field( int, iif ) + __array( __u8, src, 4 ) + __array( __u8, dst, 4 ) + ), + + TP_fast_assign( + __be32 *p32; + + __assign_str(name, dev ? dev->name : "not set"); + __entry->oif = flp->flowi4_oif; + __entry->iif = flp->flowi4_iif; + + p32 = (__be32 *) __entry->src; + *p32 = flp->saddr; + + p32 = (__be32 *) __entry->dst; + *p32 = flp->daddr; + ), + + TP_printk("dev %s oif %d iif %d src %pI4 dst %pI4", + __get_str(name), __entry->oif, __entry->iif, + __entry->src, __entry->dst) +); +#endif /* _TRACE_FIB_H */ + +/* This part must be outside protection */ +#include diff --git a/net/core/net-traces.c b/net/core/net-traces.c index ba3c0120786c..adef015b2f41 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -31,6 +31,7 @@ #include #include #include +#include EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 7fa277176c33..4036c94dfbe1 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -46,6 +46,7 @@ #include #include #include +#include #ifndef CONFIG_IP_MULTIPLE_TABLES @@ -344,6 +345,8 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0; + trace_fib_validate_source(dev, &fl4); + net = dev_net(dev); if (fib_lookup(net, &fl4, &res, 0)) goto last_resort; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5154f81c5326..26d6ffb6d23c 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -81,6 +81,7 @@ #include #include #include +#include #include "fib_lookup.h" #define MAX_STAT_DEPTH 32 @@ -1278,6 +1279,8 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, unsigned long index; t_key cindex; + trace_fib_table_lookup(tb->tb_id, flp); + pn = t->kv; cindex = 0; @@ -1442,6 +1445,8 @@ found: #ifdef CONFIG_IP_FIB_TRIE_STATS this_cpu_inc(stats->semantic_match_passed); #endif + trace_fib_table_lookup_nh(nh); + return err; } } -- cgit v1.2.3 From 46fa062ad63146dd138ec0f017e71224471e8ea5 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 28 Aug 2015 20:48:19 +0200 Subject: ip_tunnels: convert the mode field of ip_tunnel_info to flags The mode field holds a single bit of information only (whether the ip_tunnel_info struct is for rx or tx). Change the mode field to bit flags. This allows more mode flags to be added. Signed-off-by: Jiri Benc Acked-by: Alexei Starovoitov Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/geneve.c | 2 +- drivers/net/vxlan.c | 2 +- include/net/dst_metadata.h | 1 - include/net/ip_tunnels.h | 9 ++------- net/ipv4/ip_gre.c | 2 +- net/ipv4/route.c | 2 +- net/ipv6/route.c | 2 +- 7 files changed, 7 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 4357bae732d7..4a39c09f144c 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -623,7 +623,7 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) if (geneve->collect_md) { info = skb_tunnel_info(skb); - if (unlikely(info && info->mode != IP_TUNNEL_INFO_TX)) { + if (unlikely(info && !(info->mode & IP_TUNNEL_INFO_TX))) { netdev_dbg(dev, "no tunnel metadata\n"); goto tx_error; } diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 30e56cb58884..bd1b8cdf2bf6 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2113,7 +2113,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } if (vxlan->flags & VXLAN_F_COLLECT_METADATA && - info && info->mode == IP_TUNNEL_INFO_TX) { + info && info->mode & IP_TUNNEL_INFO_TX) { vxlan_xmit_one(skb, dev, NULL, false); return NETDEV_TX_OK; } diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 60c03326c087..2b83f0d232e0 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -59,7 +59,6 @@ static inline struct metadata_dst *tun_rx_dst(__be16 flags, return NULL; info = &tun_dst->u.tun_info; - info->mode = IP_TUNNEL_INFO_RX; info->key.tun_flags = flags; info->key.tun_id = tunnel_id; info->key.tp_src = 0; diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 224e4ecec91b..9bdb3948798f 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -50,13 +50,8 @@ struct ip_tunnel_key { __be16 tp_dst; }; -/* Indicates whether the tunnel info structure represents receive - * or transmit tunnel parameters. - */ -enum { - IP_TUNNEL_INFO_RX, - IP_TUNNEL_INFO_TX, -}; +/* Flags for ip_tunnel_info mode. */ +#define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */ struct ip_tunnel_info { struct ip_tunnel_key key; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index faf1cde6f8da..1e813a9f9378 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -511,7 +511,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) int err; tun_info = skb_tunnel_info(skb); - if (unlikely(!tun_info || tun_info->mode != IP_TUNNEL_INFO_TX)) + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX))) goto err_free_skb; key = &tun_info->key; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6b91879e9cbe..5f4a5565ad8b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1696,7 +1696,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, */ tun_info = skb_tunnel_info(skb); - if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX) + if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; else fl4.flowi4_tun_key.tun_id = 0; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index df3e353a012d..308dd5f9158f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1174,7 +1174,7 @@ void ip6_route_input(struct sk_buff *skb) }; tun_info = skb_tunnel_info(skb); - if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX) + if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id; skb_dst_drop(skb); skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags)); -- cgit v1.2.3 From 7f9562a1f405306eacb97f95d78cb996e33f27f5 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 28 Aug 2015 20:48:20 +0200 Subject: ip_tunnels: record IP version in tunnel info There's currently nothing preventing directing packets with IPv6 encapsulation data to IPv4 tunnels (and vice versa). If this happens, IPv6 addresses are incorrectly interpreted as IPv4 ones. Track whether the given ip_tunnel_key contains IPv4 or IPv6 data. Store this in ip_tunnel_info. Reject packets at appropriate places if they are supposed to be encapsulated into an incompatible protocol. Signed-off-by: Jiri Benc Acked-by: Alexei Starovoitov Acked-by: Thomas Graf Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- drivers/net/geneve.c | 2 ++ drivers/net/vxlan.c | 2 ++ include/net/dst_metadata.h | 1 + include/net/ip_tunnels.h | 10 ++++++++++ net/core/filter.c | 2 ++ net/ipv4/ip_gre.c | 3 ++- net/ipv4/ip_tunnel_core.c | 2 +- net/openvswitch/flow.c | 2 ++ net/openvswitch/vport.c | 2 ++ 9 files changed, 24 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 4a39c09f144c..3908a22f23d1 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -627,6 +627,8 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) netdev_dbg(dev, "no tunnel metadata\n"); goto tx_error; } + if (info && ip_tunnel_info_af(info) != AF_INET) + goto tx_error; } rt = geneve_get_rt(skb, dev, &fl4, info); diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index bd1b8cdf2bf6..e3adfe0ef66b 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1903,6 +1903,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, dev->name); goto drop; } + if (family != ip_tunnel_info_af(info)) + goto drop; dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; vni = be64_to_cpu(info->key.tun_id); diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 2b83f0d232e0..d32f49cc621d 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -105,6 +105,7 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, info->key.u.ipv6.dst = ip6h->daddr; info->key.tos = ipv6_get_dsfield(ip6h); info->key.ttl = ip6h->hop_limit; + info->mode = IP_TUNNEL_INFO_IPV6; return tun_dst; } diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 9bdb3948798f..2b4fa06e91bd 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,7 @@ struct ip_tunnel_key { /* Flags for ip_tunnel_info mode. */ #define IP_TUNNEL_INFO_TX 0x01 /* represents tx tunnel parameters */ +#define IP_TUNNEL_INFO_IPV6 0x02 /* key contains IPv6 addresses */ struct ip_tunnel_info { struct ip_tunnel_key key; @@ -208,6 +210,8 @@ static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info, tun_info->options = opts; tun_info->options_len = opts_len; + + tun_info->mode = 0; } static inline void ip_tunnel_info_init(struct ip_tunnel_info *tun_info, @@ -221,6 +225,12 @@ static inline void ip_tunnel_info_init(struct ip_tunnel_info *tun_info, tun_id, tun_flags, opts, opts_len); } +static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info + *tun_info) +{ + return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET; +} + #ifdef CONFIG_INET int ip_tunnel_init(struct net_device *dev); diff --git a/net/core/filter.c b/net/core/filter.c index 66500d490995..13079f03902e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1493,6 +1493,8 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info)) return -EINVAL; + if (ip_tunnel_info_af(info) != AF_INET) + return -EINVAL; to->tunnel_id = be64_to_cpu(info->key.tun_id); to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 1e813a9f9378..bd0679d90519 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -511,7 +511,8 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev) int err; tun_info = skb_tunnel_info(skb); - if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX))) + if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) || + ip_tunnel_info_af(tun_info) != AF_INET)) goto err_free_skb; key = &tun_info->key; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 934f2ac8ad61..0c756ade1cf7 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -356,7 +356,7 @@ static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, if (tb[LWTUNNEL_IP6_FLAGS]) tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]); - tun_info->mode = IP_TUNNEL_INFO_TX; + tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6; tun_info->options = NULL; tun_info->options_len = 0; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 5a3195e538ce..9760dc43bdb9 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -688,6 +688,8 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, { /* Extract metadata from packet. */ if (tun_info) { + if (ip_tunnel_info_af(tun_info) != AF_INET) + return -EINVAL; memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key)); if (tun_info->options) { diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index e2dc9dac59e6..40164037928e 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -587,6 +587,8 @@ int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, if (unlikely(!tun_info)) return -EINVAL; + if (ip_tunnel_info_af(tun_info) != AF_INET) + return -EINVAL; tun_key = &tun_info->key; -- cgit v1.2.3 From b9b6695cf0e1afebc207e28c7e9350c90547a426 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 28 Aug 2015 20:48:21 +0200 Subject: fou: reject IPv6 config fou does not really support IPv6 encapsulation. After an UDP socket is created in fou_create, the encap_rcv callback is set either to fou_udp_recv or to gue_udp_recv. Both of those unconditionally assume that the received packet has an IPv4 header and access the data at network_header as it was an IPv4 header. This leads to IPv6 flow label being interpreted as IP packet length, etc. Disallow fou tunnel to be configured as IPv6 until real IPv6 support is added to fou. CC: Tom Herbert Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- net/ipv4/fou.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 2d1646cff057..e0fcbbbcfe54 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -566,7 +566,7 @@ static int parse_nl_config(struct genl_info *info, if (info->attrs[FOU_ATTR_AF]) { u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]); - if (family != AF_INET && family != AF_INET6) + if (family != AF_INET) return -EINVAL; cfg->udp_config.family = family; -- cgit v1.2.3 From a43a9ef6a2e510fec61176ff2c34fab3e7d581da Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Fri, 28 Aug 2015 20:48:22 +0200 Subject: vxlan: do not receive IPv4 packets on IPv6 socket By default (subject to the sysctl settings), IPv6 sockets listen also for IPv4 traffic. Vxlan is not prepared for that and expects IPv6 header in packets received through an IPv6 socket. In addition, it's currently not possible to have both IPv4 and IPv6 vxlan tunnel on the same port (unless bindv6only sysctl is enabled), as it's not possible to create and bind both IPv4 and IPv6 vxlan interfaces and there's no way to specify both IPv4 and IPv6 remote/group IP addresses. Set IPV6_V6ONLY on vxlan sockets to fix both of these issues. This is not done globally in udp_tunnel, as l2tp and tipc seems to work okay when receiving IPv4 packets on IPv6 socket and people may rely on this behavior. The other tunnels (geneve and fou) do not support IPv6. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 1 + include/net/udp_tunnel.h | 3 ++- net/ipv6/ip6_udp_tunnel.c | 9 +++++++++ 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index e3adfe0ef66b..6c5269aea544 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -2530,6 +2530,7 @@ static struct socket *vxlan_create_sock(struct net *net, bool ipv6, udp_conf.family = AF_INET6; udp_conf.use_udp6_rx_checksums = !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX); + udp_conf.ipv6_v6only = 1; } else { udp_conf.family = AF_INET; } diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 35041d0fc21e..cb2f89f20f5c 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -31,7 +31,8 @@ struct udp_port_cfg { __be16 peer_udp_port; unsigned int use_udp_checksums:1, use_udp6_tx_checksums:1, - use_udp6_rx_checksums:1; + use_udp6_rx_checksums:1, + ipv6_v6only:1; }; int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index e1a1136bda7c..14dacf1df529 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -23,6 +23,15 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, if (err < 0) goto error; + if (cfg->ipv6_v6only) { + int val = 1; + + err = kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, + (char *) &val, sizeof(val)); + if (err < 0) + goto error; + } + udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, sizeof(udp6_addr.sin6_addr)); -- cgit v1.2.3 From c30da497893718abc6cec4f1d34d35875200edee Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sat, 29 Aug 2015 09:02:21 +0900 Subject: openvswitch: retain parsed IPv6 header fields in flow on error skipping extension headers When an error occurs skipping IPv6 extension headers retain the already parsed IP protocol and IPv6 addresses in the flow. Also assume that the packet is not a fragment in the absence of information to the contrary; that is always use the frag_off value set by ipv6_skip_exthdr(). This allows matching on the IP protocol and IPv6 addresses of packets with malformed extension headers. Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- net/openvswitch/flow.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 9760dc43bdb9..9db87331e211 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -272,8 +272,6 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) key->ipv6.addr.dst = nh->daddr; payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off); - if (unlikely(payload_ofs < 0)) - return -EINVAL; if (frag_off) { if (frag_off & htons(~0x7)) @@ -284,6 +282,13 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) key->ip.frag = OVS_FRAG_TYPE_NONE; } + /* Delayed handling of error in ipv6_skip_exthdr() as it + * always sets frag_off to a valid value which may be + * used to set key->ip.frag above. + */ + if (unlikely(payload_ofs < 0)) + return -EPROTO; + nh_len = payload_ofs - nh_ofs; skb_set_transport_header(skb, nh_ofs + nh_len); key->ip.proto = nexthdr; @@ -623,12 +628,16 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) nh_len = parse_ipv6hdr(skb, key); if (unlikely(nh_len < 0)) { - memset(&key->ip, 0, sizeof(key->ip)); - memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr)); - if (nh_len == -EINVAL) { + switch (nh_len) { + case -EINVAL: + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr)); + /* fall-through */ + case -EPROTO: skb->transport_header = skb->network_header; error = 0; - } else { + break; + default: error = nh_len; } return error; -- cgit v1.2.3 From 24d43f32d86026a0e318fee736380e164d85eefa Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Sat, 29 Aug 2015 17:44:05 -0700 Subject: openvswitch: Remove vport get_name() Remove unused get_name() function pointer from vport ops. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/vport.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index b88b3ee86f07..2f352446b6a8 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -157,7 +157,6 @@ struct vport_parms { * @get_options: Appends vport-specific attributes for the configuration of an * existing vport to a &struct sk_buff. May be %NULL for a vport that does not * have any configuration. - * @get_name: Get the device's name. * @send: Send a packet on the device. Returns the length of the packet sent, * zero for dropped packets or negative for error. * @get_egress_tun_info: Get the egress tunnel 5-tuple and other info for @@ -173,9 +172,6 @@ struct vport_ops { int (*set_options)(struct vport *, struct nlattr *); int (*get_options)(const struct vport *, struct sk_buff *); - /* Called with rcu_read_lock or ovs_mutex. */ - const char *(*get_name)(const struct vport *); - int (*send)(struct vport *, struct sk_buff *); int (*get_egress_tun_info)(struct vport *, struct sk_buff *, struct ip_tunnel_info *); @@ -239,7 +235,7 @@ static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, static inline const char *ovs_vport_name(struct vport *vport) { - return vport->dev ? vport->dev->name : vport->ops->get_name(vport); + return vport->dev->name; } int ovs_vport_ops_register(struct vport_ops *ops); -- cgit v1.2.3 From 3eedb41fb43461b5fde3f72fd00a7706f0b90103 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Sat, 29 Aug 2015 17:44:06 -0700 Subject: openvswitch: Remove egress_tun_info. tun info is passed using skb-dst pointer. Now we have converted all vports to netdev based implementation so Now we can remove redundant pointer to tun-info from OVS_CB. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 5 ----- net/openvswitch/datapath.c | 1 - net/openvswitch/datapath.h | 3 --- net/openvswitch/vport-geneve.c | 3 +-- net/openvswitch/vport-gre.c | 3 +-- net/openvswitch/vport-vxlan.c | 3 +-- net/openvswitch/vport.c | 6 +++--- net/openvswitch/vport.h | 3 +-- 8 files changed, 7 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 4487543806bb..090d9e3a460c 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -896,10 +896,6 @@ static int execute_set_action(struct sk_buff *skb, skb_dst_drop(skb); dst_hold((struct dst_entry *)tun->tun_dst); skb_dst_set(skb, (struct dst_entry *)tun->tun_dst); - - /* FIXME: Remove when all vports have been converted */ - OVS_CB(skb)->egress_tun_info = &tun->tun_dst->u.tun_info; - return 0; } @@ -1159,7 +1155,6 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, int err; this_cpu_inc(exec_actions_level); - OVS_CB(skb)->egress_tun_info = NULL; err = do_execute_actions(dp, skb, key, acts->actions, acts->actions_len); diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ec0f8d9cee73..60c2ab8e6bc3 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -610,7 +610,6 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) goto err_flow_free; rcu_assign_pointer(flow->sf_acts, acts); - OVS_CB(packet)->egress_tun_info = NULL; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 4e785ab88973..da15fd3f3c34 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -94,15 +94,12 @@ struct datapath { /** * struct ovs_skb_cb - OVS data in skb CB - * @egress_tun_key: Tunnel information about this packet on egress path. - * NULL if the packet is not being tunneled. * @input_vport: The original vport packet came in on. This value is cached * when a packet is received by OVS. * @mru: The maximum received fragement size; 0 if the packet is not * fragmented. */ struct ovs_skb_cb { - struct ip_tunnel_info *egress_tun_info; struct vport *input_vport; u16 mru; }; diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index fa37c95f7339..24c56e56fedd 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -62,8 +62,7 @@ static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, return ovs_tunnel_get_egress_info(egress_tun_info, ovs_dp_get_net(vport->dp), - OVS_CB(skb)->egress_tun_info, - IPPROTO_UDP, skb->mark, sport, dport); + skb, IPPROTO_UDP, sport, dport); } static struct vport *geneve_tnl_create(const struct vport_parms *parms) diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 871801d2ac23..36c39843607e 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -89,8 +89,7 @@ static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, { return ovs_tunnel_get_egress_info(egress_tun_info, ovs_dp_get_net(vport->dp), - OVS_CB(skb)->egress_tun_info, - IPPROTO_GRE, skb->mark, 0, 0); + skb, IPPROTO_GRE, 0, 0); } static struct vport_ops ovs_gre_vport_ops = { diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 1e8b00a23a23..ed7b23f443ec 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -160,8 +160,7 @@ static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, src_port = udp_flow_src_port(net, skb, 0, 0, true); return ovs_tunnel_get_egress_info(egress_tun_info, net, - OVS_CB(skb)->egress_tun_info, - IPPROTO_UDP, skb->mark, + skb, IPPROTO_UDP, src_port, dst_port); } diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 40164037928e..e658439a0bdd 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -483,7 +483,6 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, u64_stats_update_end(&stats->syncp); OVS_CB(skb)->input_vport = vport; - OVS_CB(skb)->egress_tun_info = NULL; OVS_CB(skb)->mru = 0; /* Extract flow from 'skb' into 'key'. */ error = ovs_flow_key_extract(tun_info, skb, &key); @@ -575,13 +574,14 @@ EXPORT_SYMBOL_GPL(ovs_vport_deferred_free); int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, struct net *net, - const struct ip_tunnel_info *tun_info, + struct sk_buff *skb, u8 ipproto, - u32 skb_mark, __be16 tp_src, __be16 tp_dst) { + const struct ip_tunnel_info *tun_info = skb_tunnel_info(skb); const struct ip_tunnel_key *tun_key; + u32 skb_mark = skb->mark; struct rtable *rt; struct flowi4 fl; diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 2f352446b6a8..a36aef22af7f 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -61,9 +61,8 @@ int ovs_vport_send(struct vport *, struct sk_buff *); int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, struct net *net, - const struct ip_tunnel_info *tun_info, + struct sk_buff *, u8 ipproto, - u32 skb_mark, __be16 tp_src, __be16 tp_dst); int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, -- cgit v1.2.3 From 8c876639c98501b049269b554bf4ecbb8f9c012f Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Sat, 29 Aug 2015 17:44:07 -0700 Subject: openvswitch: Remove vport stats. Since all vport types are now backed by netdev, we can directly use netdev stats. Following patch removes redundant stat from vport. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/vport-internal_dev.c | 57 ++++++++---------- net/openvswitch/vport-netdev.c | 22 +++---- net/openvswitch/vport-netdev.h | 2 +- net/openvswitch/vport.c | 109 ++++------------------------------- net/openvswitch/vport.h | 36 ++++-------- 5 files changed, 56 insertions(+), 170 deletions(-) (limited to 'net') diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 80b3e12ec882..388b8a6bf112 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -43,35 +43,26 @@ static struct internal_dev *internal_dev_priv(struct net_device *netdev) return netdev_priv(netdev); } -/* This function is only called by the kernel network layer.*/ -static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netdev, - struct rtnl_link_stats64 *stats) -{ - struct vport *vport = ovs_internal_dev_get_vport(netdev); - struct ovs_vport_stats vport_stats; - - ovs_vport_get_stats(vport, &vport_stats); - - /* The tx and rx stats need to be swapped because the - * switch and host OS have opposite perspectives. */ - stats->rx_packets = vport_stats.tx_packets; - stats->tx_packets = vport_stats.rx_packets; - stats->rx_bytes = vport_stats.tx_bytes; - stats->tx_bytes = vport_stats.rx_bytes; - stats->rx_errors = vport_stats.tx_errors; - stats->tx_errors = vport_stats.rx_errors; - stats->rx_dropped = vport_stats.tx_dropped; - stats->tx_dropped = vport_stats.rx_dropped; - - return stats; -} - /* Called with rcu_read_lock_bh. */ static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) { + int len, err; + + len = skb->len; rcu_read_lock(); - ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); + err = ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); rcu_read_unlock(); + + if (likely(!err)) { + struct pcpu_sw_netstats *tstats = this_cpu_ptr(netdev->tstats); + + u64_stats_update_begin(&tstats->syncp); + tstats->tx_bytes += len; + tstats->tx_packets++; + u64_stats_update_end(&tstats->syncp); + } else { + netdev->stats.tx_errors++; + } return 0; } @@ -121,7 +112,6 @@ static const struct net_device_ops internal_dev_netdev_ops = { .ndo_start_xmit = internal_dev_xmit, .ndo_set_mac_address = eth_mac_addr, .ndo_change_mtu = internal_dev_change_mtu, - .ndo_get_stats64 = internal_dev_get_stats, }; static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { @@ -212,18 +202,17 @@ static void internal_dev_destroy(struct vport *vport) rtnl_unlock(); } -static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) +static void internal_dev_recv(struct vport *vport, struct sk_buff *skb) { struct net_device *netdev = vport->dev; - int len; + struct pcpu_sw_netstats *stats; if (unlikely(!(netdev->flags & IFF_UP))) { kfree_skb(skb); - return 0; + netdev->stats.rx_dropped++; + return; } - len = skb->len; - skb_dst_drop(skb); nf_reset(skb); secpath_reset(skb); @@ -233,9 +222,13 @@ static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) skb->protocol = eth_type_trans(skb, netdev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); - netif_rx(skb); + stats = this_cpu_ptr(netdev->tstats); + u64_stats_update_begin(&stats->syncp); + stats->rx_packets++; + stats->rx_bytes += skb->len; + u64_stats_update_end(&stats->syncp); - return len; + netif_rx(skb); } static struct vport_ops ovs_internal_vport_ops = { diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index a75011505039..f7e8dcce7ada 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -39,8 +39,11 @@ static struct vport_ops ovs_netdev_vport_ops; /* Must be called with rcu_read_lock. */ -static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) +static void netdev_port_receive(struct sk_buff *skb) { + struct vport *vport; + + vport = ovs_netdev_get_vport(skb->dev); if (unlikely(!vport)) goto error; @@ -56,10 +59,8 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) skb_push(skb, ETH_HLEN); ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); - ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; - error: kfree_skb(skb); } @@ -68,15 +69,11 @@ error: static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; - struct vport *vport; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; - vport = ovs_netdev_get_vport(skb->dev); - - netdev_port_receive(vport, skb); - + netdev_port_receive(skb); return RX_HANDLER_CONSUMED; } @@ -203,27 +200,24 @@ static unsigned int packet_length(const struct sk_buff *skb) return length; } -int ovs_netdev_send(struct vport *vport, struct sk_buff *skb) +void ovs_netdev_send(struct vport *vport, struct sk_buff *skb) { int mtu = vport->dev->mtu; - int len; if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", vport->dev->name, packet_length(skb), mtu); + vport->dev->stats.tx_errors++; goto drop; } skb->dev = vport->dev; - len = skb->len; dev_queue_xmit(skb); - - return len; + return; drop: kfree_skb(skb); - return 0; } EXPORT_SYMBOL_GPL(ovs_netdev_send); diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h index 497cc81f1aca..bf22fcedbc69 100644 --- a/net/openvswitch/vport-netdev.h +++ b/net/openvswitch/vport-netdev.h @@ -27,7 +27,7 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev); struct vport *ovs_netdev_link(struct vport *vport, const char *name); -int ovs_netdev_send(struct vport *vport, struct sk_buff *skb); +void ovs_netdev_send(struct vport *vport, struct sk_buff *skb); void ovs_netdev_detach_dev(struct vport *); int __init ovs_netdev_init(void); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index e658439a0bdd..1679dea7c6bc 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -34,9 +34,6 @@ #include "vport.h" #include "vport-internal_dev.h" -static void ovs_vport_record_error(struct vport *, - enum vport_err_type err_type); - static LIST_HEAD(vport_ops_list); /* Protected by RCU read lock for reading, ovs_mutex for writing. */ @@ -157,12 +154,6 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, return ERR_PTR(-EINVAL); } - vport->percpu_stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); - if (!vport->percpu_stats) { - kfree(vport); - return ERR_PTR(-ENOMEM); - } - return vport; } EXPORT_SYMBOL_GPL(ovs_vport_alloc); @@ -183,7 +174,6 @@ void ovs_vport_free(struct vport *vport) * it is safe to use raw dereference. */ kfree(rcu_dereference_raw(vport->upcall_portids)); - free_percpu(vport->percpu_stats); kfree(vport); } EXPORT_SYMBOL_GPL(ovs_vport_free); @@ -290,30 +280,24 @@ void ovs_vport_del(struct vport *vport) */ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) { + struct net_device *dev = vport->dev; int i; memset(stats, 0, sizeof(*stats)); + stats->rx_errors = dev->stats.rx_errors; + stats->tx_errors = dev->stats.tx_errors; + stats->tx_dropped = dev->stats.tx_dropped; + stats->rx_dropped = dev->stats.rx_dropped; - /* We potentially have 2 sources of stats that need to be combined: - * those we have collected (split into err_stats and percpu_stats) from - * set_stats() and device error stats from netdev->get_stats() (for - * errors that happen downstream and therefore aren't reported through - * our vport_record_error() function). - * Stats from first source are reported by ovs (OVS_VPORT_ATTR_STATS). - * netdev-stats can be directly read over netlink-ioctl. - */ - - stats->rx_errors = atomic_long_read(&vport->err_stats.rx_errors); - stats->tx_errors = atomic_long_read(&vport->err_stats.tx_errors); - stats->tx_dropped = atomic_long_read(&vport->err_stats.tx_dropped); - stats->rx_dropped = atomic_long_read(&vport->err_stats.rx_dropped); + stats->rx_dropped += atomic_long_read(&dev->rx_dropped); + stats->tx_dropped += atomic_long_read(&dev->tx_dropped); for_each_possible_cpu(i) { const struct pcpu_sw_netstats *percpu_stats; struct pcpu_sw_netstats local_stats; unsigned int start; - percpu_stats = per_cpu_ptr(vport->percpu_stats, i); + percpu_stats = per_cpu_ptr(dev->tstats, i); do { start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); @@ -468,94 +452,25 @@ u32 ovs_vport_find_upcall_portid(const struct vport *vport, struct sk_buff *skb) * Must be called with rcu_read_lock. The packet cannot be shared and * skb->data should point to the Ethernet header. */ -void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, - const struct ip_tunnel_info *tun_info) +int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, + const struct ip_tunnel_info *tun_info) { - struct pcpu_sw_netstats *stats; struct sw_flow_key key; int error; - stats = this_cpu_ptr(vport->percpu_stats); - u64_stats_update_begin(&stats->syncp); - stats->rx_packets++; - stats->rx_bytes += skb->len + - (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); - u64_stats_update_end(&stats->syncp); - OVS_CB(skb)->input_vport = vport; OVS_CB(skb)->mru = 0; /* Extract flow from 'skb' into 'key'. */ error = ovs_flow_key_extract(tun_info, skb, &key); if (unlikely(error)) { kfree_skb(skb); - return; + return error; } ovs_dp_process_packet(skb, &key); + return 0; } EXPORT_SYMBOL_GPL(ovs_vport_receive); -/** - * ovs_vport_send - send a packet on a device - * - * @vport: vport on which to send the packet - * @skb: skb to send - * - * Sends the given packet and returns the length of data sent. Either ovs - * lock or rcu_read_lock must be held. - */ -int ovs_vport_send(struct vport *vport, struct sk_buff *skb) -{ - int sent = vport->ops->send(vport, skb); - - if (likely(sent > 0)) { - struct pcpu_sw_netstats *stats; - - stats = this_cpu_ptr(vport->percpu_stats); - - u64_stats_update_begin(&stats->syncp); - stats->tx_packets++; - stats->tx_bytes += sent; - u64_stats_update_end(&stats->syncp); - } else if (sent < 0) { - ovs_vport_record_error(vport, VPORT_E_TX_ERROR); - } else { - ovs_vport_record_error(vport, VPORT_E_TX_DROPPED); - } - return sent; -} - -/** - * ovs_vport_record_error - indicate device error to generic stats layer - * - * @vport: vport that encountered the error - * @err_type: one of enum vport_err_type types to indicate the error type - * - * If using the vport generic stats layer indicate that an error of the given - * type has occurred. - */ -static void ovs_vport_record_error(struct vport *vport, - enum vport_err_type err_type) -{ - switch (err_type) { - case VPORT_E_RX_DROPPED: - atomic_long_inc(&vport->err_stats.rx_dropped); - break; - - case VPORT_E_RX_ERROR: - atomic_long_inc(&vport->err_stats.rx_errors); - break; - - case VPORT_E_TX_DROPPED: - atomic_long_inc(&vport->err_stats.tx_dropped); - break; - - case VPORT_E_TX_ERROR: - atomic_long_inc(&vport->err_stats.tx_errors); - break; - } - -} - static void free_vport_rcu(struct rcu_head *rcu) { struct vport *vport = container_of(rcu, struct vport, rcu); diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index a36aef22af7f..30615edc084b 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -57,8 +57,6 @@ int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids); int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *); u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); -int ovs_vport_send(struct vport *, struct sk_buff *); - int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, struct net *net, struct sk_buff *, @@ -68,14 +66,6 @@ int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, struct ip_tunnel_info *info); -/* The following definitions are for implementers of vport devices: */ - -struct vport_err_stats { - atomic_long_t rx_dropped; - atomic_long_t rx_errors; - atomic_long_t tx_dropped; - atomic_long_t tx_errors; -}; /** * struct vport_portids - array of netlink portids of a vport. * must be protected by rcu. @@ -101,8 +91,6 @@ struct vport_portids { * @hash_node: Element in @dev_table hash table in vport.c. * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. * @ops: Class structure. - * @percpu_stats: Points to per-CPU statistics used and maintained by vport - * @err_stats: Points to error statistics used and maintained by vport * @detach_list: list used for detaching vport in net-exit call. */ struct vport { @@ -115,9 +103,6 @@ struct vport { struct hlist_node dp_hash_node; const struct vport_ops *ops; - struct pcpu_sw_netstats __percpu *percpu_stats; - - struct vport_err_stats err_stats; struct list_head detach_list; struct rcu_head rcu; }; @@ -156,7 +141,7 @@ struct vport_parms { * @get_options: Appends vport-specific attributes for the configuration of an * existing vport to a &struct sk_buff. May be %NULL for a vport that does not * have any configuration. - * @send: Send a packet on the device. Returns the length of the packet sent, + * @send: Send a packet on the device. * zero for dropped packets or negative for error. * @get_egress_tun_info: Get the egress tunnel 5-tuple and other info for * a packet. @@ -171,7 +156,7 @@ struct vport_ops { int (*set_options)(struct vport *, struct nlattr *); int (*get_options)(const struct vport *, struct sk_buff *); - int (*send)(struct vport *, struct sk_buff *); + void (*send)(struct vport *, struct sk_buff *); int (*get_egress_tun_info)(struct vport *, struct sk_buff *, struct ip_tunnel_info *); @@ -179,13 +164,6 @@ struct vport_ops { struct list_head list; }; -enum vport_err_type { - VPORT_E_RX_DROPPED, - VPORT_E_RX_ERROR, - VPORT_E_TX_DROPPED, - VPORT_E_TX_ERROR, -}; - struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, const struct vport_parms *); void ovs_vport_free(struct vport *); @@ -222,8 +200,8 @@ static inline struct vport *vport_from_priv(void *priv) return (struct vport *)((u8 *)priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); } -void ovs_vport_receive(struct vport *, struct sk_buff *, - const struct ip_tunnel_info *); +int ovs_vport_receive(struct vport *, struct sk_buff *, + const struct ip_tunnel_info *); static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len) @@ -258,4 +236,10 @@ static inline struct rtable *ovs_tunnel_route_lookup(struct net *net, rt = ip_route_output_key(net, fl); return rt; } + +static inline void ovs_vport_send(struct vport *vport, struct sk_buff *skb) +{ + vport->ops->send(vport, skb); +} + #endif /* vport.h */ -- cgit v1.2.3 From a581b96dbfc52b6b498552525ec929aa635680ed Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Sat, 29 Aug 2015 17:44:08 -0700 Subject: openvswitch: Remove vport-net This structure is not used anymore. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/openvswitch/datapath.h | 2 -- net/openvswitch/flow.c | 3 ++- net/openvswitch/vport.h | 4 ---- 3 files changed, 2 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index da15fd3f3c34..c05b7d9e7bf2 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -30,7 +30,6 @@ #include "conntrack.h" #include "flow.h" #include "flow_table.h" -#include "vport.h" #define DP_MAX_PORTS USHRT_MAX #define DP_VPORT_HASH_BUCKETS 1024 @@ -134,7 +133,6 @@ struct dp_upcall_info { struct ovs_net { struct list_head dps; struct work_struct dp_notify_work; - struct vport_net vport_net; /* Module reference for configuring conntrack. */ bool xt_label; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 9db87331e211..bed8d09230cd 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -46,10 +46,11 @@ #include #include +#include "conntrack.h" #include "datapath.h" #include "flow.h" #include "flow_netlink.h" -#include "conntrack.h" +#include "vport.h" u64 ovs_flow_used_time(unsigned long flow_jiffies) { diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 30615edc084b..4b6f4a5296c3 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -36,10 +36,6 @@ struct vport_parms; /* The following definitions are for users of the vport subsytem: */ -struct vport_net { - struct vport __rcu *gre_vport; -}; - int ovs_vport_init(void); void ovs_vport_exit(void); -- cgit v1.2.3 From c4c6bc314618f60ba69b0cbf93e506e4c38a11d2 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Sun, 30 Aug 2015 11:29:41 +0530 Subject: net: Introduce helper functions to get the per cpu data Signed-off-by: Raghavendra K T Signed-off-by: David S. Miller --- include/net/ip.h | 10 ++++++++++ net/ipv4/af_inet.c | 41 +++++++++++++++++++++++++++-------------- 2 files changed, 37 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index 7b9e1c782aa3..9b9ca2839399 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -202,10 +202,20 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, #define NET_ADD_STATS_BH(net, field, adnd) SNMP_ADD_STATS_BH((net)->mib.net_statistics, field, adnd) #define NET_ADD_STATS_USER(net, field, adnd) SNMP_ADD_STATS_USER((net)->mib.net_statistics, field, adnd) +u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offct); unsigned long snmp_fold_field(void __percpu *mib, int offt); #if BITS_PER_LONG==32 +u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct, + size_t syncp_offset); u64 snmp_fold_field64(void __percpu *mib, int offt, size_t sync_off); #else +static inline u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct, + size_t syncp_offset) +{ + return snmp_get_cpu_field(mib, cpu, offct); + +} + static inline u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_off) { return snmp_fold_field(mib, offt); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 675e88cac2b4..0c69c0bbe1a1 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1452,38 +1452,51 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, } EXPORT_SYMBOL_GPL(inet_ctl_sock_create); +u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt) +{ + return *(((unsigned long *)per_cpu_ptr(mib, cpu)) + offt); +} +EXPORT_SYMBOL_GPL(snmp_get_cpu_field); + unsigned long snmp_fold_field(void __percpu *mib, int offt) { unsigned long res = 0; int i; for_each_possible_cpu(i) - res += *(((unsigned long *) per_cpu_ptr(mib, i)) + offt); + res += snmp_get_cpu_field(mib, i, offt); return res; } EXPORT_SYMBOL_GPL(snmp_fold_field); #if BITS_PER_LONG==32 +u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct, + size_t syncp_offset) +{ + void *bhptr; + struct u64_stats_sync *syncp; + u64 v; + unsigned int start; + + bhptr = per_cpu_ptr(mib, cpu); + syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); + do { + start = u64_stats_fetch_begin_irq(syncp); + v = *(((u64 *)bhptr) + offt); + } while (u64_stats_fetch_retry_irq(syncp, start)); + + return v; +} +EXPORT_SYMBOL_GPL(snmp_get_cpu_field64); + u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) { u64 res = 0; int cpu; for_each_possible_cpu(cpu) { - void *bhptr; - struct u64_stats_sync *syncp; - u64 v; - unsigned int start; - - bhptr = per_cpu_ptr(mib, cpu); - syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); - do { - start = u64_stats_fetch_begin_irq(syncp); - v = *(((u64 *) bhptr) + offt); - } while (u64_stats_fetch_retry_irq(syncp, start)); - - res += v; + res += snmp_get_cpu_field(mib, cpu, offct, syncp_offset); } return res; } -- cgit v1.2.3 From a3a773726c9f9ba2e87fd8ad8e36feff5f6ffd8e Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Sun, 30 Aug 2015 11:29:42 +0530 Subject: net: Optimize snmp stat aggregation by walking all the percpu data at once Docker container creation linearly increased from around 1.6 sec to 7.5 sec (at 1000 containers) and perf data showed 50% ovehead in snmp_fold_field. reason: currently __snmp6_fill_stats64 calls snmp_fold_field that walks through per cpu data of an item (iteratively for around 36 items). idea: This patch tries to aggregate the statistics by going through all the items of each cpu sequentially which is reducing cache misses. Docker creation got faster by more than 2x after the patch. Result: Before After Docker creation time 6.836s 3.25s cache miss 2.7% 1.41% perf before: 50.73% docker [kernel.kallsyms] [k] snmp_fold_field 9.07% swapper [kernel.kallsyms] [k] snooze_loop 3.49% docker [kernel.kallsyms] [k] veth_stats_one 2.85% swapper [kernel.kallsyms] [k] _raw_spin_lock perf after: 10.57% docker docker [.] scanblock 8.37% swapper [kernel.kallsyms] [k] snooze_loop 6.91% docker [kernel.kallsyms] [k] snmp_get_cpu_field 6.67% docker [kernel.kallsyms] [k] veth_stats_one changes/ideas suggested: Using buffer in stack (Eric), Usage of memset (David), Using memcpy in place of unaligned_put (Joe). Signed-off-by: Raghavendra K T Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 0f08d3b9e238..99c0f2b843f0 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4726,18 +4726,24 @@ static inline void __snmp6_fill_statsdev(u64 *stats, atomic_long_t *mib, } static inline void __snmp6_fill_stats64(u64 *stats, void __percpu *mib, - int items, int bytes, size_t syncpoff) + int bytes, size_t syncpoff) { - int i; - int pad = bytes - sizeof(u64) * items; + int i, c; + u64 buff[IPSTATS_MIB_MAX]; + int pad = bytes - sizeof(u64) * IPSTATS_MIB_MAX; + BUG_ON(pad < 0); - /* Use put_unaligned() because stats may not be aligned for u64. */ - put_unaligned(items, &stats[0]); - for (i = 1; i < items; i++) - put_unaligned(snmp_fold_field64(mib, i, syncpoff), &stats[i]); + memset(buff, 0, sizeof(buff)); + buff[0] = IPSTATS_MIB_MAX; - memset(&stats[items], 0, pad); + for_each_possible_cpu(c) { + for (i = 1; i < IPSTATS_MIB_MAX; i++) + buff[i] += snmp_get_cpu_field64(mib, c, i, syncpoff); + } + + memcpy(stats, buff, IPSTATS_MIB_MAX * sizeof(u64)); + memset(&stats[IPSTATS_MIB_MAX], 0, pad); } static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, @@ -4745,8 +4751,8 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype, { switch (attrtype) { case IFLA_INET6_STATS: - __snmp6_fill_stats64(stats, idev->stats.ipv6, - IPSTATS_MIB_MAX, bytes, offsetof(struct ipstats_mib, syncp)); + __snmp6_fill_stats64(stats, idev->stats.ipv6, bytes, + offsetof(struct ipstats_mib, syncp)); break; case IFLA_INET6_ICMP6STATS: __snmp6_fill_statsdev(stats, idev->stats.icmpv6dev->mibs, ICMP6_MIB_MAX, bytes); -- cgit v1.2.3 From 0ef707700f1cef2357ce655fc86a4de5e41fa4b5 Mon Sep 17 00:00:00 2001 From: Ken-ichirou MATSUZAWA Date: Mon, 31 Aug 2015 07:54:49 +0900 Subject: netlink: rx mmap: fix POLLIN condition Poll() returns immediately after setting the kernel current frame (ring->head) to SKIP from user space even though there is no new frame. And in a case of all frames is VALID, user space program unintensionally sets (only) kernel current frame to UNUSED, then calls poll(), it will not return immediately even though there are VALID frames. To avoid situations like above, I think we need to scan all frames to find VALID frames at poll() like netlink_alloc_skb(), netlink_forward_ring() finding an UNUSED frame at skb allocation. Signed-off-by: Ken-ichirou MATSUZAWA Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 7965ca7c461d..50889be1517d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -594,16 +594,6 @@ netlink_current_frame(const struct netlink_ring *ring, return netlink_lookup_frame(ring, ring->head, status); } -static struct nl_mmap_hdr * -netlink_previous_frame(const struct netlink_ring *ring, - enum nl_mmap_status status) -{ - unsigned int prev; - - prev = ring->head ? ring->head - 1 : ring->frame_max; - return netlink_lookup_frame(ring, prev, status); -} - static void netlink_increment_head(struct netlink_ring *ring) { ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0; @@ -624,6 +614,21 @@ static void netlink_forward_ring(struct netlink_ring *ring) } while (ring->head != head); } +static bool netlink_has_valid_frame(struct netlink_ring *ring) +{ + unsigned int head = ring->head, pos = head; + const struct nl_mmap_hdr *hdr; + + do { + hdr = __netlink_lookup_frame(ring, pos); + if (hdr->nm_status == NL_MMAP_STATUS_VALID) + return true; + pos = pos != 0 ? pos - 1 : ring->frame_max; + } while (pos != head); + + return false; +} + static bool netlink_dump_space(struct netlink_sock *nlk) { struct netlink_ring *ring = &nlk->rx_ring; @@ -671,8 +676,7 @@ static unsigned int netlink_poll(struct file *file, struct socket *sock, spin_lock_bh(&sk->sk_receive_queue.lock); if (nlk->rx_ring.pg_vec) { - netlink_forward_ring(&nlk->rx_ring); - if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED)) + if (netlink_has_valid_frame(&nlk->rx_ring)) mask |= POLLIN | POLLRDNORM; } spin_unlock_bh(&sk->sk_receive_queue.lock); -- cgit v1.2.3 From 80ec1927b102480f89436e7d7f961a263e916a44 Mon Sep 17 00:00:00 2001 From: David S. Miller Date: Sun, 30 Aug 2015 22:40:44 -0700 Subject: ipv4: Fix 32-bit build. net/ipv4/af_inet.c: In function 'snmp_get_cpu_field64': >> net/ipv4/af_inet.c:1486:26: error: 'offt' undeclared (first use in this function) v = *(((u64 *)bhptr) + offt); ^ net/ipv4/af_inet.c:1486:26: note: each undeclared identifier is reported only once for each function it appears in net/ipv4/af_inet.c: In function 'snmp_fold_field64': >> net/ipv4/af_inet.c:1499:39: error: 'offct' undeclared (first use in this function) res += snmp_get_cpu_field(mib, cpu, offct, syncp_offset); ^ >> net/ipv4/af_inet.c:1499:10: error: too many arguments to function 'snmp_get_cpu_field' res += snmp_get_cpu_field(mib, cpu, offct, syncp_offset); ^ net/ipv4/af_inet.c:1455:5: note: declared here u64 snmp_get_cpu_field(void __percpu *mib, int cpu, int offt) ^ Reported-by: kbuild test robot Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 0c69c0bbe1a1..c2d0ebc628d7 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1471,7 +1471,7 @@ EXPORT_SYMBOL_GPL(snmp_fold_field); #if BITS_PER_LONG==32 -u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offct, +u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt, size_t syncp_offset) { void *bhptr; @@ -1496,7 +1496,7 @@ u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) int cpu; for_each_possible_cpu(cpu) { - res += snmp_get_cpu_field(mib, cpu, offct, syncp_offset); + res += snmp_get_cpu_field(mib, cpu, offt, syncp_offset); } return res; } -- cgit v1.2.3 From d1bfc62591a0a5144dc380976e737fbbb4f40f4f Mon Sep 17 00:00:00 2001 From: Madalin Bucur Date: Mon, 31 Aug 2015 15:46:07 +0300 Subject: ipv4: fix 32b build Address remaining issue after 80ec192. Signed-off-by: Madalin Bucur Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index c2d0ebc628d7..96773a2f95a7 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1496,7 +1496,7 @@ u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) int cpu; for_each_possible_cpu(cpu) { - res += snmp_get_cpu_field(mib, cpu, offt, syncp_offset); + res += snmp_get_cpu_field64(mib, cpu, offt, syncp_offset); } return res; } -- cgit v1.2.3 From 4c22279848c531fc7f555d463daf3d0df963bd41 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Sun, 30 Aug 2015 18:09:38 -0700 Subject: ip-tunnel: Use API to access tunnel metadata options. Currently tun-info options pointer is used in few cases to pass options around. But tunnel options can be accessed using ip_tunnel_info_opts() API without using the pointer. Following patch removes the redundant pointer and consistently make use of API. Signed-off-by: Pravin B Shelar Acked-by: Thomas Graf Reviewed-by: Jesse Gross Signed-off-by: David S. Miller --- drivers/net/geneve.c | 9 ++---- drivers/net/vxlan.c | 4 +-- include/net/dst_metadata.h | 31 +++++++++---------- include/net/ip_tunnels.h | 67 +++++++++++++++++++----------------------- net/ipv4/ip_tunnel_core.c | 2 -- net/openvswitch/actions.c | 8 +++-- net/openvswitch/datapath.c | 3 +- net/openvswitch/datapath.h | 3 +- net/openvswitch/flow.c | 7 +++-- net/openvswitch/flow_netlink.c | 27 +++++++---------- net/openvswitch/flow_netlink.h | 3 +- net/openvswitch/vport-geneve.c | 5 ++-- net/openvswitch/vport-gre.c | 5 ++-- net/openvswitch/vport-vxlan.c | 4 +-- net/openvswitch/vport.c | 27 +++++++++-------- net/openvswitch/vport.h | 7 +++-- 16 files changed, 100 insertions(+), 112 deletions(-) (limited to 'net') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 68b0f0325fc7..da3259ce7c8d 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -143,7 +143,6 @@ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) if (ip_tunnel_collect_metadata() || gs->collect_md) { __be16 flags; - void *opts; flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT | (gnvh->oam ? TUNNEL_OAM : 0) | @@ -154,11 +153,9 @@ static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb) gnvh->opt_len * 4); if (!tun_dst) goto drop; - /* Update tunnel dst according to Geneve options. */ - opts = ip_tunnel_info_opts(&tun_dst->u.tun_info, - gnvh->opt_len * 4); - memcpy(opts, gnvh->options, gnvh->opt_len * 4); + ip_tunnel_info_opts_set(&tun_dst->u.tun_info, + gnvh->options, gnvh->opt_len * 4); } else { /* Drop packets w/ critical options, * since we don't support any... @@ -663,7 +660,7 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev) tunnel_id_to_vni(key->tun_id, vni); if (key->tun_flags & TUNNEL_GENEVE_OPT) - opts = ip_tunnel_info_opts(info, info->options_len); + opts = ip_tunnel_info_opts(info); udp_csum = !!(key->tun_flags & TUNNEL_CSUM); err = geneve_build_skb(rt, skb, key->tun_flags, vni, diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 6c5269aea544..ce988fd01b34 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -1271,7 +1271,7 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) goto drop; info = &tun_dst->u.tun_info; - md = ip_tunnel_info_opts(info, sizeof(*md)); + md = ip_tunnel_info_opts(info); } else { memset(md, 0, sizeof(*md)); } @@ -1948,7 +1948,7 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, tos = info->key.tos; if (info->options_len) - md = ip_tunnel_info_opts(info, sizeof(*md)); + md = ip_tunnel_info_opts(info); } else { md->gbp = skb->mark; } diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index d32f49cc621d..547ab8241593 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -48,21 +48,16 @@ static inline bool skb_valid_dst(const struct sk_buff *skb) struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags); struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags); -static inline struct metadata_dst *tun_rx_dst(__be16 flags, - __be64 tunnel_id, int md_size) +static inline struct metadata_dst *tun_rx_dst(int md_size) { struct metadata_dst *tun_dst; - struct ip_tunnel_info *info; tun_dst = metadata_dst_alloc(md_size, GFP_ATOMIC); if (!tun_dst) return NULL; - info = &tun_dst->u.tun_info; - info->key.tun_flags = flags; - info->key.tun_id = tunnel_id; - info->key.tp_src = 0; - info->key.tp_dst = 0; + tun_dst->u.tun_info.options_len = 0; + tun_dst->u.tun_info.mode = 0; return tun_dst; } @@ -73,17 +68,14 @@ static inline struct metadata_dst *ip_tun_rx_dst(struct sk_buff *skb, { const struct iphdr *iph = ip_hdr(skb); struct metadata_dst *tun_dst; - struct ip_tunnel_info *info; - tun_dst = tun_rx_dst(flags, tunnel_id, md_size); + tun_dst = tun_rx_dst(md_size); if (!tun_dst) return NULL; - info = &tun_dst->u.tun_info; - info->key.u.ipv4.src = iph->saddr; - info->key.u.ipv4.dst = iph->daddr; - info->key.tos = iph->tos; - info->key.ttl = iph->ttl; + ip_tunnel_key_init(&tun_dst->u.tun_info.key, + iph->saddr, iph->daddr, iph->tos, iph->ttl, + 0, 0, tunnel_id, flags); return tun_dst; } @@ -96,16 +88,21 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct sk_buff *skb, struct metadata_dst *tun_dst; struct ip_tunnel_info *info; - tun_dst = tun_rx_dst(flags, tunnel_id, md_size); + tun_dst = tun_rx_dst(md_size); if (!tun_dst) return NULL; info = &tun_dst->u.tun_info; + info->mode = IP_TUNNEL_INFO_IPV6; + info->key.tun_flags = flags; + info->key.tun_id = tunnel_id; + info->key.tp_src = 0; + info->key.tp_dst = 0; + info->key.u.ipv6.src = ip6h->saddr; info->key.u.ipv6.dst = ip6h->daddr; info->key.tos = ipv6_get_dsfield(ip6h); info->key.ttl = ip6h->hop_limit; - info->mode = IP_TUNNEL_INFO_IPV6; return tun_dst; } diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 2b4fa06e91bd..9a6a3ba888e8 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -57,7 +57,6 @@ struct ip_tunnel_key { struct ip_tunnel_info { struct ip_tunnel_key key; - const void *options; u8 options_len; u8 mode; }; @@ -180,49 +179,32 @@ int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *op, int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *op, unsigned int num); -static inline void __ip_tunnel_info_init(struct ip_tunnel_info *tun_info, - __be32 saddr, __be32 daddr, - u8 tos, u8 ttl, - __be16 tp_src, __be16 tp_dst, - __be64 tun_id, __be16 tun_flags, - const void *opts, u8 opts_len) +static inline void ip_tunnel_key_init(struct ip_tunnel_key *key, + __be32 saddr, __be32 daddr, + u8 tos, u8 ttl, + __be16 tp_src, __be16 tp_dst, + __be64 tun_id, __be16 tun_flags) { - tun_info->key.tun_id = tun_id; - tun_info->key.u.ipv4.src = saddr; - tun_info->key.u.ipv4.dst = daddr; - memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_IPV4_PAD, + key->tun_id = tun_id; + key->u.ipv4.src = saddr; + key->u.ipv4.dst = daddr; + memset((unsigned char *)key + IP_TUNNEL_KEY_IPV4_PAD, 0, IP_TUNNEL_KEY_IPV4_PAD_LEN); - tun_info->key.tos = tos; - tun_info->key.ttl = ttl; - tun_info->key.tun_flags = tun_flags; + key->tos = tos; + key->ttl = ttl; + key->tun_flags = tun_flags; /* For the tunnel types on the top of IPsec, the tp_src and tp_dst of * the upper tunnel are used. * E.g: GRE over IPSEC, the tp_src and tp_port are zero. */ - tun_info->key.tp_src = tp_src; - tun_info->key.tp_dst = tp_dst; + key->tp_src = tp_src; + key->tp_dst = tp_dst; /* Clear struct padding. */ - if (sizeof(tun_info->key) != IP_TUNNEL_KEY_SIZE) - memset((unsigned char *)&tun_info->key + IP_TUNNEL_KEY_SIZE, - 0, sizeof(tun_info->key) - IP_TUNNEL_KEY_SIZE); - - tun_info->options = opts; - tun_info->options_len = opts_len; - - tun_info->mode = 0; -} - -static inline void ip_tunnel_info_init(struct ip_tunnel_info *tun_info, - const struct iphdr *iph, - __be16 tp_src, __be16 tp_dst, - __be64 tun_id, __be16 tun_flags, - const void *opts, u8 opts_len) -{ - __ip_tunnel_info_init(tun_info, iph->saddr, iph->daddr, - iph->tos, iph->ttl, tp_src, tp_dst, - tun_id, tun_flags, opts, opts_len); + if (sizeof(*key) != IP_TUNNEL_KEY_SIZE) + memset((unsigned char *)key + IP_TUNNEL_KEY_SIZE, + 0, sizeof(*key) - IP_TUNNEL_KEY_SIZE); } static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info @@ -317,11 +299,24 @@ static inline void iptunnel_xmit_stats(int err, } } -static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info, size_t n) +static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info) { return info + 1; } +static inline void ip_tunnel_info_opts_get(void *to, + const struct ip_tunnel_info *info) +{ + memcpy(to, info + 1, info->options_len); +} + +static inline void ip_tunnel_info_opts_set(struct ip_tunnel_info *info, + const void *from, int len) +{ + memcpy(ip_tunnel_info_opts(info), from, len); + info->options_len = len; +} + static inline struct ip_tunnel_info *lwt_tun_info(struct lwtunnel_state *lwtstate) { return (struct ip_tunnel_info *)lwtstate->data; diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 0c756ade1cf7..29ed6c5a5185 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -249,7 +249,6 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr, tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP_FLAGS]); tun_info->mode = IP_TUNNEL_INFO_TX; - tun_info->options = NULL; tun_info->options_len = 0; *ts = new_state; @@ -357,7 +356,6 @@ static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr, tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]); tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6; - tun_info->options = NULL; tun_info->options_len = 0; *ts = new_state; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 090d9e3a460c..315f5330b6e5 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -793,11 +793,13 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, if (vport) { int err; + upcall.egress_tun_info = &info; err = ovs_vport_get_egress_tun_info(vport, skb, - &info); - if (!err) - upcall.egress_tun_info = &info; + &upcall); + if (err) + upcall.egress_tun_info = NULL; } + break; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 60c2ab8e6bc3..6fbd2decb19e 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -491,7 +491,8 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, if (upcall_info->egress_tun_info) { nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY); err = ovs_nla_put_egress_tunnel_key(user_skb, - upcall_info->egress_tun_info); + upcall_info->egress_tun_info, + upcall_info->egress_tun_opts); BUG_ON(err); nla_nest_end(user_skb, nla); } diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index c05b7d9e7bf2..f88038a99f44 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -116,7 +116,8 @@ struct ovs_skb_cb { * @mru: If not zero, Maximum received IP fragment size. */ struct dp_upcall_info { - const struct ip_tunnel_info *egress_tun_info; + struct ip_tunnel_info *egress_tun_info; + const void *egress_tun_opts; const struct nlattr *userdata; const struct nlattr *actions; int actions_len; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index bed8d09230cd..c8db44ab2ee7 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -702,12 +702,13 @@ int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, return -EINVAL; memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key)); - if (tun_info->options) { + if (tun_info->options_len) { BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) * 8)) - 1 > sizeof(key->tun_opts)); - memcpy(TUN_METADATA_OPTS(key, tun_info->options_len), - tun_info->options, tun_info->options_len); + + ip_tunnel_info_opts_get(TUN_METADATA_OPTS(key, tun_info->options_len), + tun_info); key->tun_opts_len = tun_info->options_len; } else { key->tun_opts_len = 0; diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index e22c5bfe8575..c92d6a262bc5 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -716,10 +716,11 @@ static int ipv4_tun_to_nlattr(struct sk_buff *skb, } int ovs_nla_put_egress_tunnel_key(struct sk_buff *skb, - const struct ip_tunnel_info *egress_tun_info) + const struct ip_tunnel_info *egress_tun_info, + const void *egress_tun_opts) { return __ipv4_tun_to_nlattr(skb, &egress_tun_info->key, - egress_tun_info->options, + egress_tun_opts, egress_tun_info->options_len); } @@ -1876,20 +1877,14 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, tun_info = &tun_dst->u.tun_info; tun_info->mode = IP_TUNNEL_INFO_TX; tun_info->key = key.tun_key; - tun_info->options_len = key.tun_opts_len; - - if (tun_info->options_len) { - /* We need to store the options in the action itself since - * everything else will go away after flow setup. We can append - * it to tun_info and then point there. - */ - memcpy((tun_info + 1), - TUN_METADATA_OPTS(&key, key.tun_opts_len), key.tun_opts_len); - tun_info->options = (tun_info + 1); - } else { - tun_info->options = NULL; - } + /* We need to store the options in the action itself since + * everything else will go away after flow setup. We can append + * it to tun_info and then point there. + */ + ip_tunnel_info_opts_set(tun_info, + TUN_METADATA_OPTS(&key, key.tun_opts_len), + key.tun_opts_len); add_nested_action_end(*sfa, start); return err; @@ -2345,7 +2340,7 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) err = ipv4_tun_to_nlattr(skb, &tun_info->key, tun_info->options_len ? - tun_info->options : NULL, + ip_tunnel_info_opts(tun_info) : NULL, tun_info->options_len); if (err) return err; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 07878e22e783..6ca3f0baf449 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -56,7 +56,8 @@ int ovs_nla_get_match(struct net *, struct sw_flow_match *, const struct nlattr *key, const struct nlattr *mask, bool log); int ovs_nla_put_egress_tunnel_key(struct sk_buff *, - const struct ip_tunnel_info *); + const struct ip_tunnel_info *, + const void *egress_tun_opts); bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log); int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 24c56e56fedd..2735e9c4a3b8 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -53,15 +53,14 @@ static int geneve_get_options(const struct vport *vport, } static int geneve_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ip_tunnel_info *egress_tun_info) + struct dp_upcall_info *upcall) { struct geneve_port *geneve_port = geneve_vport(vport); struct net *net = ovs_dp_get_net(vport->dp); __be16 dport = htons(geneve_port->port_no); __be16 sport = udp_flow_src_port(net, skb, 1, USHRT_MAX, true); - return ovs_tunnel_get_egress_info(egress_tun_info, - ovs_dp_get_net(vport->dp), + return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp), skb, IPPROTO_UDP, sport, dport); } diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 36c39843607e..4d24481669c9 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -85,10 +85,9 @@ static struct vport *gre_create(const struct vport_parms *parms) } static int gre_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ip_tunnel_info *egress_tun_info) + struct dp_upcall_info *upcall) { - return ovs_tunnel_get_egress_info(egress_tun_info, - ovs_dp_get_net(vport->dp), + return ovs_tunnel_get_egress_info(upcall, ovs_dp_get_net(vport->dp), skb, IPPROTO_GRE, 0, 0); } diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index ed7b23f443ec..c11413d5075f 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -147,7 +147,7 @@ static struct vport *vxlan_create(const struct vport_parms *parms) } static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ip_tunnel_info *egress_tun_info) + struct dp_upcall_info *upcall) { struct vxlan_dev *vxlan = netdev_priv(vport->dev); struct net *net = ovs_dp_get_net(vport->dp); @@ -159,7 +159,7 @@ static int vxlan_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, inet_get_local_port_range(net, &port_min, &port_max); src_port = udp_flow_src_port(net, skb, 0, 0, true); - return ovs_tunnel_get_egress_info(egress_tun_info, net, + return ovs_tunnel_get_egress_info(upcall, net, skb, IPPROTO_UDP, src_port, dst_port); } diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index 1679dea7c6bc..dc81dc619aa2 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -487,13 +487,14 @@ void ovs_vport_deferred_free(struct vport *vport) } EXPORT_SYMBOL_GPL(ovs_vport_deferred_free); -int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, +int ovs_tunnel_get_egress_info(struct dp_upcall_info *upcall, struct net *net, struct sk_buff *skb, u8 ipproto, __be16 tp_src, __be16 tp_dst) { + struct ip_tunnel_info *egress_tun_info = upcall->egress_tun_info; const struct ip_tunnel_info *tun_info = skb_tunnel_info(skb); const struct ip_tunnel_key *tun_key; u32 skb_mark = skb->mark; @@ -520,26 +521,26 @@ int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, /* Generate egress_tun_info based on tun_info, * saddr, tp_src and tp_dst */ - __ip_tunnel_info_init(egress_tun_info, - fl.saddr, tun_key->u.ipv4.dst, - tun_key->tos, - tun_key->ttl, - tp_src, tp_dst, - tun_key->tun_id, - tun_key->tun_flags, - tun_info->options, - tun_info->options_len); - + ip_tunnel_key_init(&egress_tun_info->key, + fl.saddr, tun_key->u.ipv4.dst, + tun_key->tos, + tun_key->ttl, + tp_src, tp_dst, + tun_key->tun_id, + tun_key->tun_flags); + egress_tun_info->options_len = tun_info->options_len; + egress_tun_info->mode = tun_info->mode; + upcall->egress_tun_opts = ip_tunnel_info_opts(egress_tun_info); return 0; } EXPORT_SYMBOL_GPL(ovs_tunnel_get_egress_info); int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ip_tunnel_info *info) + struct dp_upcall_info *upcall) { /* get_egress_tun_info() is only implemented on tunnel ports. */ if (unlikely(!vport->ops->get_egress_tun_info)) return -EINVAL; - return vport->ops->get_egress_tun_info(vport, skb, info); + return vport->ops->get_egress_tun_info(vport, skb, upcall); } diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 4b6f4a5296c3..a413f3ae6a7b 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -53,14 +53,15 @@ int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids); int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *); u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); -int ovs_tunnel_get_egress_info(struct ip_tunnel_info *egress_tun_info, +int ovs_tunnel_get_egress_info(struct dp_upcall_info *upcall, struct net *net, struct sk_buff *, u8 ipproto, __be16 tp_src, __be16 tp_dst); + int ovs_vport_get_egress_tun_info(struct vport *vport, struct sk_buff *skb, - struct ip_tunnel_info *info); + struct dp_upcall_info *upcall); /** * struct vport_portids - array of netlink portids of a vport. @@ -154,7 +155,7 @@ struct vport_ops { void (*send)(struct vport *, struct sk_buff *); int (*get_egress_tun_info)(struct vport *, struct sk_buff *, - struct ip_tunnel_info *); + struct dp_upcall_info *upcall); struct module *owner; struct list_head list; -- cgit v1.2.3 From 6cf9dfd3bd62edfff69f11c0f111bc261166e4c7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 31 Aug 2015 15:58:44 +0200 Subject: net: fib: move metrics parsing to a helper fib_create_info() is already quite large, so before adding more code to the metrics section move that to a helper, similar to ip6_convert_metrics. Suggested-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 71 ++++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 1b2d01170a4d..88afbae893f0 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -876,6 +876,44 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) return true; } +static int +fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) +{ + struct nlattr *nla; + int remaining; + + if (!cfg->fc_mx) + return 0; + + nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { + int type = nla_type(nla); + u32 val; + + if (!type) + continue; + if (type > RTAX_MAX) + return -EINVAL; + + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; + + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(tmp); + if (val == TCP_CA_UNSPEC) + return -EINVAL; + } else { + val = nla_get_u32(nla); + } + if (type == RTAX_ADVMSS && val > 65535 - 40) + val = 65535 - 40; + if (type == RTAX_MTU && val > 65535 - 15) + val = 65535 - 15; + fi->fib_metrics[type - 1] = val; + } + + return 0; +} + struct fib_info *fib_create_info(struct fib_config *cfg) { int err; @@ -948,36 +986,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto failure; } endfor_nexthops(fi) - if (cfg->fc_mx) { - struct nlattr *nla; - int remaining; - - nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { - int type = nla_type(nla); - - if (type) { - u32 val; - - if (type > RTAX_MAX) - goto err_inval; - if (type == RTAX_CC_ALGO) { - char tmp[TCP_CA_NAME_MAX]; - - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp); - if (val == TCP_CA_UNSPEC) - goto err_inval; - } else { - val = nla_get_u32(nla); - } - if (type == RTAX_ADVMSS && val > 65535 - 40) - val = 65535 - 40; - if (type == RTAX_MTU && val > 65535 - 15) - val = 65535 - 15; - fi->fib_metrics[type - 1] = val; - } - } - } + err = fib_convert_metrics(fi, cfg); + if (err) + goto failure; if (cfg->fc_mp) { #ifdef CONFIG_IP_ROUTE_MULTIPATH -- cgit v1.2.3 From 1bb14807bc761a88bb9d319e7bf519eebf4c82ec Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 31 Aug 2015 15:58:45 +0200 Subject: net: fib6: reduce identation in ip6_convert_metrics Reduce the identation a bit, there's no need to artificically have it increased. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/ipv6/route.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 308dd5f9158f..0261b721b34b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1711,26 +1711,26 @@ static int ip6_convert_metrics(struct mx6_config *mxc, nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) { int type = nla_type(nla); + u32 val; - if (type) { - u32 val; - - if (unlikely(type > RTAX_MAX)) - goto err; - if (type == RTAX_CC_ALGO) { - char tmp[TCP_CA_NAME_MAX]; + if (!type) + continue; + if (unlikely(type > RTAX_MAX)) + goto err; - nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp); - if (val == TCP_CA_UNSPEC) - goto err; - } else { - val = nla_get_u32(nla); - } + if (type == RTAX_CC_ALGO) { + char tmp[TCP_CA_NAME_MAX]; - mp[type - 1] = val; - __set_bit(type - 1, mxc->mx_valid); + nla_strlcpy(tmp, nla, sizeof(tmp)); + val = tcp_ca_get_key_by_name(tmp); + if (val == TCP_CA_UNSPEC) + goto err; + } else { + val = nla_get_u32(nla); } + + mp[type - 1] = val; + __set_bit(type - 1, mxc->mx_valid); } mxc->mx = mp; -- cgit v1.2.3 From b8d3e4163a3562d7cba486687904383e78e7dd6a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 31 Aug 2015 15:58:46 +0200 Subject: fib, fib6: reject invalid feature bits Feature bits that are invalid should not be accepted by the kernel, only the lower 4 bits may be configured, but not the remaining ones. Even from these 4, 2 of them are unused. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 11 +++++++---- net/ipv4/fib_semantics.c | 2 ++ net/ipv6/route.c | 2 ++ 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 0d3d3cc43356..702024769c74 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -418,10 +418,13 @@ enum { #define RTAX_MAX (__RTAX_MAX - 1) -#define RTAX_FEATURE_ECN 0x00000001 -#define RTAX_FEATURE_SACK 0x00000002 -#define RTAX_FEATURE_TIMESTAMP 0x00000004 -#define RTAX_FEATURE_ALLFRAG 0x00000008 +#define RTAX_FEATURE_ECN (1 << 0) +#define RTAX_FEATURE_SACK (1 << 1) +#define RTAX_FEATURE_TIMESTAMP (1 << 2) +#define RTAX_FEATURE_ALLFRAG (1 << 3) + +#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG) struct rta_session { __u8 proto; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 88afbae893f0..115a08e70d43 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -908,6 +908,8 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) val = 65535 - 40; if (type == RTAX_MTU && val > 65535 - 15) val = 65535 - 15; + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + return -EINVAL; fi->fib_metrics[type - 1] = val; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0261b721b34b..8771530df45e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1728,6 +1728,8 @@ static int ip6_convert_metrics(struct mx6_config *mxc, } else { val = nla_get_u32(nla); } + if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK)) + goto err; mp[type - 1] = val; __set_bit(type - 1, mxc->mx_valid); -- cgit v1.2.3 From c3a8d9474684d391b0afc3970d9b249add15ec07 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 31 Aug 2015 15:58:47 +0200 Subject: tcp: use dctcp if enabled on the route to the initiator Currently, the following case doesn't use DCTCP, even if it should: A responder has f.e. Cubic as system wide default, but for a specific route to the initiating host, DCTCP is being set in RTAX_CC_ALGO. The initiating host then uses DCTCP as congestion control, but since the initiator sets ECT(0), tcp_ecn_create_request() doesn't set ecn_ok, and we have to fall back to Reno after 3WHS completes. We were thinking on how to solve this in a minimal, non-intrusive way without bloating tcp_ecn_create_request() needlessly: lets cache the CA ecn option flag in RTAX_FEATURES. In other words, when ECT(0) is set on the SYN packet, set ecn_ok=1 iff route RTAX_FEATURES contains the unexposed (internal-only) DST_FEATURE_ECN_CA. This allows to only do a single metric feature lookup inside tcp_ecn_create_request(). Joint work with Florian Westphal. Signed-off-by: Daniel Borkmann Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/dst.h | 6 ++++++ include/net/tcp.h | 2 +- net/core/rtnetlink.c | 6 ++++++ net/ipv4/fib_semantics.c | 6 +++++- net/ipv4/tcp_cong.c | 9 ++++++--- net/ipv4/tcp_input.c | 7 +++++-- net/ipv6/route.c | 9 +++++++-- 7 files changed, 36 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/dst.h b/include/net/dst.h index 4c4801645371..9261d928303d 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -207,6 +207,12 @@ static inline void dst_metric_set(struct dst_entry *dst, int metric, u32 val) p[metric-1] = val; } +/* Kernel-internal feature bits that are unallocated in user space. */ +#define DST_FEATURE_ECN_CA (1 << 31) + +#define DST_FEATURE_MASK (DST_FEATURE_ECN_CA) +#define DST_FEATURE_ECN_MASK (DST_FEATURE_ECN_CA | RTAX_FEATURE_ECN) + static inline u32 dst_feature(const struct dst_entry *dst, u32 feature) { diff --git a/include/net/tcp.h b/include/net/tcp.h index 4a7b03947a38..0cab28cd43a9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -888,7 +888,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find_key(u32 key); -u32 tcp_ca_get_key_by_name(const char *name); +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 788ceed39463..a466821d1441 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -678,6 +678,12 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) continue; if (nla_put_string(skb, i + 1, name)) goto nla_put_failure; + } else if (i == RTAX_FEATURES - 1) { + u32 user_features = metrics[i] & RTAX_FEATURE_MASK; + + BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); + if (nla_put_u32(skb, i + 1, user_features)) + goto nla_put_failure; } else { if (nla_put_u32(skb, i + 1, metrics[i])) goto nla_put_failure; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 115a08e70d43..992a9597daf8 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -879,6 +879,7 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) static int fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) { + bool ecn_ca = false; struct nlattr *nla; int remaining; @@ -898,7 +899,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp); + val = tcp_ca_get_key_by_name(tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) return -EINVAL; } else { @@ -913,6 +914,9 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) fi->fib_metrics[type - 1] = val; } + if (ecn_ca) + fi->fib_metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + return 0; } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index a2ed23c595cf..93c4dc3ab23f 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -114,16 +114,19 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) } EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); -u32 tcp_ca_get_key_by_name(const char *name) +u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) { const struct tcp_congestion_ops *ca; - u32 key; + u32 key = TCP_CA_UNSPEC; might_sleep(); rcu_read_lock(); ca = __tcp_ca_find_autoload(name); - key = ca ? ca->key : TCP_CA_UNSPEC; + if (ca) { + key = ca->key; + *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; + } rcu_read_unlock(); return key; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dc08e2352665..a8f515bb19c4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6003,14 +6003,17 @@ static void tcp_ecn_create_request(struct request_sock *req, const struct net *net = sock_net(listen_sk); bool th_ecn = th->ece && th->cwr; bool ect, ecn_ok; + u32 ecn_ok_dst; if (!th_ecn) return; ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); - ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN); + ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); + ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst; - if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk)) + if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || + (ecn_ok_dst & DST_FEATURE_ECN_CA)) inet_rsk(req)->ecn_ok = 1; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8771530df45e..f45cac6f8356 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1698,6 +1698,7 @@ out: static int ip6_convert_metrics(struct mx6_config *mxc, const struct fib6_config *cfg) { + bool ecn_ca = false; struct nlattr *nla; int remaining; u32 *mp; @@ -1722,7 +1723,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc, char tmp[TCP_CA_NAME_MAX]; nla_strlcpy(tmp, nla, sizeof(tmp)); - val = tcp_ca_get_key_by_name(tmp); + val = tcp_ca_get_key_by_name(tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) goto err; } else { @@ -1735,8 +1736,12 @@ static int ip6_convert_metrics(struct mx6_config *mxc, __set_bit(type - 1, mxc->mx_valid); } - mxc->mx = mp; + if (ecn_ca) { + __set_bit(RTAX_FEATURES - 1, mxc->mx_valid); + mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA; + } + mxc->mx = mp; return 0; err: kfree(mp); -- cgit v1.2.3 From 6ea3c9d5b042edf14eac1e21af21c41f81f3491e Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Mon, 31 Aug 2015 10:44:19 -0700 Subject: mpls: fix mpls_net_init memory leak Fix a memory leak in the mpls netns init function in case of failure. If register_net_sysctl fails then we need to free the ctl_table. Fixes: 7720c01f3f59 ("mpls: Add a sysctl to control the size of the mpls label table") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/mpls/af_mpls.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 8c5707db53c5..bb185a28de98 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1177,8 +1177,10 @@ static int mpls_net_init(struct net *net) table[0].data = net; net->mpls.ctl = register_net_sysctl(net, "net/mpls", table); - if (net->mpls.ctl == NULL) + if (net->mpls.ctl == NULL) { + kfree(table); return -ENOMEM; + } return 0; } -- cgit v1.2.3 From 39b0c705195e9409dc8a40cc82b11d81405a4a4b Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 31 Aug 2015 15:56:49 +0200 Subject: net: dsa: Allow configuration of CPU & DSA port speeds/duplex By default, DSA and CPU ports are configured to the maximum speed the switch supports. However there can be use cases where the peer devices port is slower. Allow a fixed-link property to be used with the DSA and CPU port in the device tree, and use this information to configure the port. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 053eb2b8e682..afff17341b73 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -176,6 +176,35 @@ __ATTRIBUTE_GROUPS(dsa_hwmon); #endif /* CONFIG_NET_DSA_HWMON */ /* basic switch operations **************************************************/ +static int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct net_device *master) +{ + struct dsa_chip_data *cd = ds->pd; + struct device_node *port_dn; + struct phy_device *phydev; + int ret, port; + + for (port = 0; port < DSA_MAX_PORTS; port++) { + if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) + continue; + + port_dn = cd->port_dn[port]; + if (of_phy_is_fixed_link(port_dn)) { + ret = of_phy_register_fixed_link(port_dn); + if (ret) { + netdev_err(master, + "failed to register fixed PHY\n"); + return ret; + } + phydev = of_phy_find_device(port_dn); + genphy_config_init(phydev); + genphy_read_status(phydev); + if (ds->drv->adjust_link) + ds->drv->adjust_link(ds, port, phydev); + } + } + return 0; +} + static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) { struct dsa_switch_driver *drv = ds->drv; @@ -297,6 +326,14 @@ static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) } } + /* Perform configuration of the CPU and DSA ports */ + ret = dsa_cpu_dsa_setup(ds, dst->master_netdev); + if (ret < 0) { + netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n", + index); + ret = 0; + } + #ifdef CONFIG_NET_DSA_HWMON /* If the switch provides a temperature sensor, * register with hardware monitoring subsystem. -- cgit v1.2.3 From e44853466844c20d8b5b16de187f63ddc50710dd Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 31 Aug 2015 15:56:50 +0200 Subject: net: dsa: Allow DSA and CPU ports to have a phy-mode property It can be useful for DSA and CPU ports to have a phy-mode property, in particular to specify RGMII delays. Parse the property and set it in the fixed-link phydev. Signed-off-by: Andrew Lunn Acked-by: Florian Fainelli Signed-off-by: David S. Miller --- net/dsa/dsa.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index afff17341b73..76e3800765f8 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -181,7 +181,7 @@ static int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct net_device *master) struct dsa_chip_data *cd = ds->pd; struct device_node *port_dn; struct phy_device *phydev; - int ret, port; + int ret, port, mode; for (port = 0; port < DSA_MAX_PORTS; port++) { if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) @@ -196,6 +196,12 @@ static int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct net_device *master) return ret; } phydev = of_phy_find_device(port_dn); + + mode = of_get_phy_mode(port_dn); + if (mode < 0) + mode = PHY_INTERFACE_MODE_NA; + phydev->interface = mode; + genphy_config_init(phydev); genphy_read_status(phydev); if (ds->drv->adjust_link) -- cgit v1.2.3 From a394eef562d781f37a50d99cf1dfe596dc1ed96d Mon Sep 17 00:00:00 2001 From: Marius Tomaschewski Date: Mon, 31 Aug 2015 15:59:22 +0200 Subject: ipv6: send NEWLINK on RA managed/otherconf changes The kernel is applying the RA managed/otherconf flags silently and forgets to send ifinfo notify to inform about their change when the router provides a zero reachable_time and retrans_timer as dnsmasq and many routers send it, which just means unspecified by this router and the host should continue using whatever value it is already using. Userspace may monitor the ifinfo notifications to activate dhcpv6. Signed-off-by: Marius Tomaschewski Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 13d3c2beb93e..12fff091e043 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1078,6 +1078,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) struct ndisc_options ndopts; int optlen; unsigned int pref = 0; + __u32 old_if_flags; __u8 *opt = (__u8 *)(ra_msg + 1); @@ -1148,6 +1149,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) * Remember the managed/otherconf flags from most recently * received RA message (RFC 2462) -- yoshfuji */ + old_if_flags = in6_dev->if_flags; in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED | IF_RA_OTHERCONF)) | (ra_msg->icmph.icmp6_addrconf_managed ? @@ -1155,6 +1157,9 @@ static void ndisc_router_discovery(struct sk_buff *skb) (ra_msg->icmph.icmp6_addrconf_other ? IF_RA_OTHERCONF : 0); + if (old_if_flags != in6_dev->if_flags) + inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + if (!in6_dev->cnf.accept_ra_defrtr) { ND_PRINTK(2, info, "RA: %s, defrtr is false for dev: %s\n", -- cgit v1.2.3 From 2053aeb69a53224717296db31b13d5b45b4f1a0e Mon Sep 17 00:00:00 2001 From: Marius Tomaschewski Date: Tue, 1 Sep 2015 01:57:30 +0200 Subject: ipv6: send only one NEWLINK when RA causes changes Signed-off-by: Marius Tomaschewski Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 12fff091e043..64a71354b069 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1079,6 +1079,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) int optlen; unsigned int pref = 0; __u32 old_if_flags; + bool send_ifinfo_notify = false; __u8 *opt = (__u8 *)(ra_msg + 1); @@ -1158,7 +1159,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) IF_RA_OTHERCONF : 0); if (old_if_flags != in6_dev->if_flags) - inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + send_ifinfo_notify = true; if (!in6_dev->cnf.accept_ra_defrtr) { ND_PRINTK(2, info, @@ -1261,7 +1262,7 @@ skip_defrtr: rtime = HZ/10; NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime); in6_dev->tstamp = jiffies; - inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + send_ifinfo_notify = true; } rtime = ntohl(ra_msg->reachable_time); @@ -1278,11 +1279,17 @@ skip_defrtr: GC_STALETIME, 3 * rtime); in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime); in6_dev->tstamp = jiffies; - inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + send_ifinfo_notify = true; } } } + /* + * Send a notify if RA changed managed/otherconf flags or timer settings + */ + if (send_ifinfo_notify) + inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + skip_linkparms: /* -- cgit v1.2.3 From 63b6c13dbb7d3e36f031629f7e4e86dacfcab8cf Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Mon, 31 Aug 2015 20:05:57 -0700 Subject: tun_dst: Remove opts_size opts_size is only written and never read. Following patch removes this unused variable. Signed-off-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/net/dst_metadata.h | 1 - net/core/dst.c | 1 - 2 files changed, 2 deletions(-) (limited to 'net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index 547ab8241593..af9d5382f6cb 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -7,7 +7,6 @@ struct metadata_dst { struct dst_entry dst; - size_t opts_len; union { struct ip_tunnel_info tun_info; } u; diff --git a/net/core/dst.c b/net/core/dst.c index 477035ed7903..0771c8cb9307 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -378,7 +378,6 @@ static void __metadata_dst_init(struct metadata_dst *md_dst, u8 optslen) dst->output = dst_md_discard_sk; memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); - md_dst->opts_len = optslen; } struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags) -- cgit v1.2.3 From 9b8ff51822893e743eee09350c1928daa3ef503f Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 1 Sep 2015 14:26:35 -0600 Subject: net: Make table id type u32 A number of VRF patches used 'int' for table id. It should be u32 to be consistent with the rest of the stack. Fixes: 4e3c89920cd3a ("net: Introduce VRF related flags and helpers") 15be405eb2ea9 ("net: Add inet_addr lookup by table") 30bbaa1950055 ("net: Fix up inet_addr_type checks") 021dd3b8a142d ("net: Add routes to the table associated with the device") dc028da54ed35 ("inet: Move VRF table lookup to inlined function") f6d3c19274c74 ("net: FIB tracepoints") Signed-off-by: David Ahern Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/net/route.h | 2 +- include/net/vrf.h | 24 ++++++++++++------------ include/trace/events/fib.h | 6 +++--- net/ipv4/af_inet.c | 2 +- net/ipv4/fib_frontend.c | 10 +++++----- net/ipv4/fib_semantics.c | 2 +- 6 files changed, 23 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/net/route.h b/include/net/route.h index 395d79bb556c..cc61cb95f059 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -188,7 +188,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk); void ip_rt_send_redirect(struct sk_buff *skb); unsigned int inet_addr_type(struct net *net, __be32 addr); -unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id); +unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr); unsigned int inet_addr_type_dev_table(struct net *net, diff --git a/include/net/vrf.h b/include/net/vrf.h index 5bfb16237fd7..593e6094ddd4 100644 --- a/include/net/vrf.h +++ b/include/net/vrf.h @@ -66,9 +66,9 @@ static inline int vrf_master_ifindex(const struct net_device *dev) } /* called with rcu_read_lock */ -static inline int vrf_dev_table_rcu(const struct net_device *dev) +static inline u32 vrf_dev_table_rcu(const struct net_device *dev) { - int tb_id = 0; + u32 tb_id = 0; if (dev) { struct net_vrf_dev *vrf_ptr; @@ -80,9 +80,9 @@ static inline int vrf_dev_table_rcu(const struct net_device *dev) return tb_id; } -static inline int vrf_dev_table(const struct net_device *dev) +static inline u32 vrf_dev_table(const struct net_device *dev) { - int tb_id; + u32 tb_id; rcu_read_lock(); tb_id = vrf_dev_table_rcu(dev); @@ -91,10 +91,10 @@ static inline int vrf_dev_table(const struct net_device *dev) return tb_id; } -static inline int vrf_dev_table_ifindex(struct net *net, int ifindex) +static inline u32 vrf_dev_table_ifindex(struct net *net, int ifindex) { struct net_device *dev; - int tb_id = 0; + u32 tb_id = 0; if (!ifindex) return 0; @@ -111,9 +111,9 @@ static inline int vrf_dev_table_ifindex(struct net *net, int ifindex) } /* called with rtnl */ -static inline int vrf_dev_table_rtnl(const struct net_device *dev) +static inline u32 vrf_dev_table_rtnl(const struct net_device *dev) { - int tb_id = 0; + u32 tb_id = 0; if (dev) { struct net_vrf_dev *vrf_ptr; @@ -149,22 +149,22 @@ static inline int vrf_master_ifindex(const struct net_device *dev) return 0; } -static inline int vrf_dev_table_rcu(const struct net_device *dev) +static inline u32 vrf_dev_table_rcu(const struct net_device *dev) { return 0; } -static inline int vrf_dev_table(const struct net_device *dev) +static inline u32 vrf_dev_table(const struct net_device *dev) { return 0; } -static inline int vrf_dev_table_ifindex(struct net *net, int ifindex) +static inline u32 vrf_dev_table_ifindex(struct net *net, int ifindex) { return 0; } -static inline int vrf_dev_table_rtnl(const struct net_device *dev) +static inline u32 vrf_dev_table_rtnl(const struct net_device *dev) { return 0; } diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h index acd1d22571a2..833cfcb6750d 100644 --- a/include/trace/events/fib.h +++ b/include/trace/events/fib.h @@ -11,12 +11,12 @@ TRACE_EVENT(fib_table_lookup, - TP_PROTO(int tb_id, const struct flowi4 *flp), + TP_PROTO(u32 tb_id, const struct flowi4 *flp), TP_ARGS(tb_id, flp), TP_STRUCT__entry( - __field( int, tb_id ) + __field( u32, tb_id ) __field( int, oif ) __field( int, iif ) __field( __u8, tos ) @@ -43,7 +43,7 @@ TRACE_EVENT(fib_table_lookup, *p32 = flp->daddr; ), - TP_printk("table %d oif %d iif %d src %pI4 dst %pI4 tos %d scope %d flags %x", + TP_printk("table %u oif %d iif %d src %pI4 dst %pI4 tos %d scope %d flags %x", __entry->tb_id, __entry->oif, __entry->iif, __entry->src, __entry->dst, __entry->tos, __entry->scope, __entry->flags) diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 96773a2f95a7..1d0c3adb6f34 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -428,7 +428,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct net *net = sock_net(sk); unsigned short snum; int chk_addr_ret; - int tb_id = RT_TABLE_LOCAL; + u32 tb_id = RT_TABLE_LOCAL; int err; /* If the socket has its own bind function then use it. (RAW) */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 4036c94dfbe1..6fcbd215cdbc 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -213,7 +213,7 @@ void fib_flush_external(struct net *net) */ static inline unsigned int __inet_dev_addr_type(struct net *net, const struct net_device *dev, - __be32 addr, int tb_id) + __be32 addr, u32 tb_id) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; @@ -240,7 +240,7 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, return ret; } -unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id) +unsigned int inet_addr_type_table(struct net *net, __be32 addr, u32 tb_id) { return __inet_dev_addr_type(net, NULL, addr, tb_id); } @@ -255,7 +255,7 @@ EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { - int rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL; + u32 rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, dev, addr, rt_table); } @@ -268,7 +268,7 @@ unsigned int inet_addr_type_dev_table(struct net *net, const struct net_device *dev, __be32 addr) { - int rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL; + u32 rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL; return __inet_dev_addr_type(net, NULL, addr, rt_table); } @@ -803,7 +803,7 @@ out: static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) { struct net *net = dev_net(ifa->ifa_dev->dev); - int tb_id = vrf_dev_table_rtnl(ifa->ifa_dev->dev); + u32 tb_id = vrf_dev_table_rtnl(ifa->ifa_dev->dev); struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 992a9597daf8..064bd3caaa4f 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -863,7 +863,7 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) { if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || fib_prefsrc != cfg->fc_dst) { - int tb_id = cfg->fc_table; + u32 tb_id = cfg->fc_table; if (tb_id == RT_TABLE_MAIN) tb_id = RT_TABLE_LOCAL; -- cgit v1.2.3 From bcc83839ffdb063dd2b0370cd85c4f825761fc59 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:24 -0700 Subject: skbuff: Make __skb_set_sw_hash a general function Move __skb_set_sw_hash to skbuff.h and add __skb_set_hash which is a common method (between __skb_set_sw_hash and skb_set_hash) to set the hash in an skbuff. Also, move skb_clear_hash to be closer to __skb_set_hash. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 45 ++++++++++++++++++++++++++++---------------- include/net/flow_dissector.h | 5 +++++ net/core/flow_dissector.c | 18 ++++++------------ 3 files changed, 40 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8a697c673b58..5d2c812e725b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -937,14 +937,40 @@ enum pkt_hash_types { PKT_HASH_TYPE_L4, /* Input: src_IP, dst_IP, src_port, dst_port */ }; -static inline void -skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type) +static inline void skb_clear_hash(struct sk_buff *skb) { - skb->l4_hash = (type == PKT_HASH_TYPE_L4); + skb->hash = 0; skb->sw_hash = 0; + skb->l4_hash = 0; +} + +static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb) +{ + if (!skb->l4_hash) + skb_clear_hash(skb); +} + +static inline void +__skb_set_hash(struct sk_buff *skb, __u32 hash, bool is_sw, bool is_l4) +{ + skb->l4_hash = is_l4; + skb->sw_hash = is_sw; skb->hash = hash; } +static inline void +skb_set_hash(struct sk_buff *skb, __u32 hash, enum pkt_hash_types type) +{ + /* Used by drivers to set hash from HW */ + __skb_set_hash(skb, hash, false, type == PKT_HASH_TYPE_L4); +} + +static inline void +__skb_set_sw_hash(struct sk_buff *skb, __u32 hash, bool is_l4) +{ + __skb_set_hash(skb, hash, true, is_l4); +} + void __skb_get_hash(struct sk_buff *skb); u32 skb_get_poff(const struct sk_buff *skb); u32 __skb_get_poff(const struct sk_buff *skb, void *data, @@ -1027,19 +1053,6 @@ static inline __u32 skb_get_hash_raw(const struct sk_buff *skb) return skb->hash; } -static inline void skb_clear_hash(struct sk_buff *skb) -{ - skb->hash = 0; - skb->sw_hash = 0; - skb->l4_hash = 0; -} - -static inline void skb_clear_hash_if_not_l4(struct sk_buff *skb) -{ - if (!skb->l4_hash) - skb_clear_hash(skb); -} - static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) { to->hash = from->hash; diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 6777a84c6f94..af76c496f7db 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -167,4 +167,9 @@ struct flow_keys_digest { void make_flow_keys_digest(struct flow_keys_digest *digest, const struct flow_keys *flow); +static inline bool flow_keys_have_l4(struct flow_keys *keys) +{ + return (keys->ports.ports || keys->tags.flow_label); +} + #endif diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 11e6540fa386..151b6e48b81f 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -590,15 +590,6 @@ void make_flow_keys_digest(struct flow_keys_digest *digest, } EXPORT_SYMBOL(make_flow_keys_digest); -static inline void __skb_set_sw_hash(struct sk_buff *skb, u32 hash, - struct flow_keys *keys) -{ - if (keys->ports.ports) - skb->l4_hash = 1; - skb->sw_hash = 1; - skb->hash = hash; -} - /** * __skb_get_hash: calculate a flow hash * @skb: sk_buff to calculate flow hash from @@ -619,7 +610,8 @@ void __skb_get_hash(struct sk_buff *skb) if (!hash) return; - __skb_set_sw_hash(skb, hash, &keys); + __skb_set_sw_hash(skb, hash, + flow_keys_have_l4(&keys)); } EXPORT_SYMBOL(__skb_get_hash); @@ -648,7 +640,8 @@ __u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) keys.tags.flow_label = (__force u32)fl6->flowlabel; keys.basic.ip_proto = fl6->flowi6_proto; - __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys); + __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), + flow_keys_have_l4(&keys)); return skb->hash; } @@ -668,7 +661,8 @@ __u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4) keys.keyid.keyid = fl4->fl4_gre_key; keys.basic.ip_proto = fl4->flowi4_proto; - __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), &keys); + __skb_set_sw_hash(skb, flow_hash_from_keys(&keys), + flow_keys_have_l4(&keys)); return skb->hash; } -- cgit v1.2.3 From c6cc1ca7f4d70cbb3ea3a5ca163c5dabaf155cdb Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:25 -0700 Subject: flowi: Abstract out functions to get flow hash based on flowi Create __get_hash_from_flowi6 and __get_hash_from_flowi4 to get the flow keys and hash based on flowi structures. These are called by __skb_get_hash_flowi6 and __skb_get_hash_flowi4. Also, created get_hash_from_flowi6 and get_hash_from_flowi4 which can be called when just the hash value for a flowi is needed. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/linux/skbuff.h | 16 ++++++++++++---- include/net/flow.h | 19 +++++++++++++++++++ include/net/flow_dissector.h | 2 ++ net/core/flow.c | 36 ++++++++++++++++++++++++++++++++++++ 4 files changed, 69 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5d2c812e725b..bbe41bccfc5f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1030,8 +1030,12 @@ __u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6); static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) { - if (!skb->l4_hash && !skb->sw_hash) - __skb_get_hash_flowi6(skb, fl6); + if (!skb->l4_hash && !skb->sw_hash) { + struct flow_keys keys; + + __skb_set_sw_hash(skb, __get_hash_from_flowi6(fl6, &keys), + flow_keys_have_l4(&keys)); + } return skb->hash; } @@ -1040,8 +1044,12 @@ __u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl); static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4) { - if (!skb->l4_hash && !skb->sw_hash) - __skb_get_hash_flowi4(skb, fl4); + if (!skb->l4_hash && !skb->sw_hash) { + struct flow_keys keys; + + __skb_set_sw_hash(skb, __get_hash_from_flowi4(fl4, &keys), + flow_keys_have_l4(&keys)); + } return skb->hash; } diff --git a/include/net/flow.h b/include/net/flow.h index 9e0297c4c11d..dafe97c3c048 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -10,6 +10,7 @@ #include #include #include +#include /* * ifindex generation is per-net namespace, and loopback is @@ -243,4 +244,22 @@ void flow_cache_flush(struct net *net); void flow_cache_flush_deferred(struct net *net); extern atomic_t flow_cache_genid; +__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys); + +static inline __u32 get_hash_from_flowi6(struct flowi6 *fl6) +{ + struct flow_keys keys; + + return __get_hash_from_flowi6(fl6, &keys); +} + +__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys); + +static inline __u32 get_hash_from_flowi4(struct flowi4 *fl4) +{ + struct flow_keys keys; + + return __get_hash_from_flowi4(fl4, &keys); +} + #endif diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index af76c496f7db..74fe160f0b05 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -172,4 +172,6 @@ static inline bool flow_keys_have_l4(struct flow_keys *keys) return (keys->ports.ports || keys->tags.flow_label); } +u32 flow_hash_from_keys(struct flow_keys *keys); + #endif diff --git a/net/core/flow.c b/net/core/flow.c index 1033725be40b..61930bb0eb59 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -509,3 +510,38 @@ void flow_cache_fini(struct net *net) fc->percpu = NULL; } EXPORT_SYMBOL(flow_cache_fini); + +__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys) +{ + memset(keys, 0, sizeof(*keys)); + + memcpy(&keys->addrs.v6addrs.src, &fl6->saddr, + sizeof(keys->addrs.v6addrs.src)); + memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr, + sizeof(keys->addrs.v6addrs.dst)); + keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + keys->ports.src = fl6->fl6_sport; + keys->ports.dst = fl6->fl6_dport; + keys->keyid.keyid = fl6->fl6_gre_key; + keys->tags.flow_label = (__force u32)fl6->flowlabel; + keys->basic.ip_proto = fl6->flowi6_proto; + + return flow_hash_from_keys(keys); +} +EXPORT_SYMBOL(__get_hash_from_flowi6); + +__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys) +{ + memset(keys, 0, sizeof(*keys)); + + keys->addrs.v4addrs.src = fl4->saddr; + keys->addrs.v4addrs.dst = fl4->daddr; + keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + keys->ports.src = fl4->fl4_sport; + keys->ports.dst = fl4->fl4_dport; + keys->keyid.keyid = fl4->fl4_gre_key; + keys->basic.ip_proto = fl4->flowi4_proto; + + return flow_hash_from_keys(keys); +} +EXPORT_SYMBOL(__get_hash_from_flowi4); -- cgit v1.2.3 From a6e544b0a88b53114bfa5a57e21b7be7a8dfc9d0 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:26 -0700 Subject: flow_dissector: Jump to exit code in __skb_flow_dissect Instead of returning immediately (on a parsing failure for instance) we jump to cleanup code. This always sets protocol values in key_control (even on a failure there is still valid information in the key_tags that was set before the problem was hit). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 51 +++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 151b6e48b81f..22f3d768b459 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -130,6 +130,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_keyid *key_keyid; u8 ip_proto = 0; + bool ret = false; if (!data) { data = skb->data; @@ -171,7 +172,7 @@ again: ip: iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph || iph->ihl < 5) - return false; + goto out_bad; nhoff += iph->ihl * 4; ip_proto = iph->protocol; @@ -197,7 +198,7 @@ ip: ipv6: iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph); if (!iph) - return false; + goto out_bad; ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); @@ -234,7 +235,7 @@ ipv6: vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan); if (!vlan) - return false; + goto out_bad; if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_VLANID)) { @@ -256,7 +257,7 @@ ipv6: } *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) - return false; + goto out_bad; proto = hdr->proto; nhoff += PPPOE_SES_HLEN; switch (proto) { @@ -265,7 +266,7 @@ ipv6: case htons(PPP_IPV6): goto ipv6; default: - return false; + goto out_bad; } } case htons(ETH_P_TIPC): { @@ -275,9 +276,7 @@ ipv6: } *hdr, _hdr; hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) - return false; - key_basic->n_proto = proto; - key_control->thoff = (u16)nhoff; + goto out_bad; if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_TIPC_ADDRS)) { @@ -287,7 +286,7 @@ ipv6: key_addrs->tipcaddrs.srcnode = hdr->srcnode; key_control->addr_type = FLOW_DISSECTOR_KEY_TIPC_ADDRS; } - return true; + goto out_good; } case htons(ETH_P_MPLS_UC): @@ -297,7 +296,7 @@ mpls: hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) - return false; + goto out_bad; if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) { @@ -310,21 +309,17 @@ mpls: htonl(MPLS_LS_LABEL_MASK); } - key_basic->n_proto = proto; - key_basic->ip_proto = ip_proto; - key_control->thoff = (u16)nhoff; - - return true; + goto out_good; } - return true; + goto out_good; } case htons(ETH_P_FCOE): key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN); /* fall through */ default: - return false; + goto out_bad; } ip_proto_again: @@ -337,7 +332,7 @@ ip_proto_again: hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) - return false; + goto out_bad; /* * Only look inside GRE if version zero and no * routing @@ -357,7 +352,7 @@ ip_proto_again: data, hlen, &_keyid); if (!keyid) - return false; + goto out_bad; if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID)) { @@ -378,7 +373,7 @@ ip_proto_again: sizeof(_eth), data, hlen, &_eth); if (!eth) - return false; + goto out_bad; proto = eth->h_proto; nhoff += sizeof(*eth); } @@ -395,7 +390,7 @@ ip_proto_again: opthdr = __skb_header_pointer(skb, nhoff, sizeof(_opthdr), data, hlen, &_opthdr); if (!opthdr) - return false; + goto out_bad; ip_proto = opthdr[0]; nhoff += (opthdr[1] + 1) << 3; @@ -415,10 +410,6 @@ ip_proto_again: break; } - key_basic->n_proto = proto; - key_basic->ip_proto = ip_proto; - key_control->thoff = (u16)nhoff; - if (skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { key_ports = skb_flow_dissector_target(flow_dissector, @@ -428,7 +419,15 @@ ip_proto_again: data, hlen); } - return true; +out_good: + ret = true; + +out_bad: + key_basic->n_proto = proto; + key_basic->ip_proto = ip_proto; + key_control->thoff = (u16)nhoff; + + return ret; } EXPORT_SYMBOL(__skb_flow_dissect); -- cgit v1.2.3 From cd79a2382aa5dcefa6e21a7c59bb1bb19e53b74d Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:27 -0700 Subject: flow_dissector: Add flags argument to skb_flow_dissector functions The flags argument will allow control of the dissection process (for instance whether to parse beyond L3). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 2 +- drivers/net/ethernet/cisco/enic/enic_clsf.c | 2 +- drivers/net/hyperv/netvsc_drv.c | 2 +- include/linux/skbuff.h | 19 +++++++++++-------- net/core/flow_dissector.c | 7 ++++--- net/ethernet/eth.c | 2 +- net/sched/cls_flow.c | 2 +- net/sched/cls_flower.c | 2 +- net/sched/sch_choke.c | 4 ++-- 9 files changed, 23 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 06e2d01f0b4e..771a449d2f56 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3095,7 +3095,7 @@ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, int noff, proto = -1; if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) - return skb_flow_dissect_flow_keys(skb, fk); + return skb_flow_dissect_flow_keys(skb, fk, 0); fk->ports.ports = 0; noff = skb_network_offset(skb); diff --git a/drivers/net/ethernet/cisco/enic/enic_clsf.c b/drivers/net/ethernet/cisco/enic/enic_clsf.c index d106186f4f4a..3c677ed3c29e 100644 --- a/drivers/net/ethernet/cisco/enic/enic_clsf.c +++ b/drivers/net/ethernet/cisco/enic/enic_clsf.c @@ -177,7 +177,7 @@ int enic_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb, int res, i; enic = netdev_priv(dev); - res = skb_flow_dissect_flow_keys(skb, &keys); + res = skb_flow_dissect_flow_keys(skb, &keys, 0); if (!res || keys.basic.n_proto != htons(ETH_P_IP) || (keys.basic.ip_proto != IPPROTO_TCP && keys.basic.ip_proto != IPPROTO_UDP)) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 2990024b90f9..409b48e1e589 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -239,7 +239,7 @@ static bool netvsc_set_hash(u32 *hash, struct sk_buff *skb) struct flow_keys flow; int data_len; - if (!skb_flow_dissect_flow_keys(skb, &flow) || + if (!skb_flow_dissect_flow_keys(skb, &flow, 0) || !(flow.basic.n_proto == htons(ETH_P_IP) || flow.basic.n_proto == htons(ETH_P_IPV6))) return false; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index bbe41bccfc5f..9e62687c70f3 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -991,31 +991,34 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, - void *data, __be16 proto, int nhoff, int hlen); + void *data, __be16 proto, int nhoff, int hlen, + unsigned int flags); static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container) + void *target_container, unsigned int flags) { return __skb_flow_dissect(skb, flow_dissector, target_container, - NULL, 0, 0, 0); + NULL, 0, 0, 0, flags); } static inline bool skb_flow_dissect_flow_keys(const struct sk_buff *skb, - struct flow_keys *flow) + struct flow_keys *flow, + unsigned int flags) { memset(flow, 0, sizeof(*flow)); return __skb_flow_dissect(skb, &flow_keys_dissector, flow, - NULL, 0, 0, 0); + NULL, 0, 0, 0, flags); } static inline bool skb_flow_dissect_flow_keys_buf(struct flow_keys *flow, void *data, __be16 proto, - int nhoff, int hlen) + int nhoff, int hlen, + unsigned int flags) { memset(flow, 0, sizeof(*flow)); return __skb_flow_dissect(NULL, &flow_keys_buf_dissector, flow, - data, proto, nhoff, hlen); + data, proto, nhoff, hlen, flags); } static inline __u32 skb_get_hash(struct sk_buff *skb) @@ -2046,7 +2049,7 @@ static inline void skb_probe_transport_header(struct sk_buff *skb, if (skb_transport_header_was_set(skb)) return; - else if (skb_flow_dissect_flow_keys(skb, &keys)) + else if (skb_flow_dissect_flow_keys(skb, &keys, 0)) skb_set_transport_header(skb, keys.control.thoff); else skb_set_transport_header(skb, offset_hint); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 22f3d768b459..c3d9807cb34e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -121,7 +121,8 @@ EXPORT_SYMBOL(__skb_flow_get_ports); bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, - void *data, __be16 proto, int nhoff, int hlen) + void *data, __be16 proto, int nhoff, int hlen, + unsigned int flags) { struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; @@ -556,7 +557,7 @@ EXPORT_SYMBOL(flow_hash_from_keys); static inline u32 ___skb_get_hash(const struct sk_buff *skb, struct flow_keys *keys, u32 keyval) { - if (!skb_flow_dissect_flow_keys(skb, keys)) + if (!skb_flow_dissect_flow_keys(skb, keys, 0)) return 0; return __flow_hash_from_keys(keys, keyval); @@ -726,7 +727,7 @@ u32 skb_get_poff(const struct sk_buff *skb) { struct flow_keys keys; - if (!skb_flow_dissect_flow_keys(skb, &keys)) + if (!skb_flow_dissect_flow_keys(skb, &keys, 0)) return 0; return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 217127c3a3ef..d850fdc828f9 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -132,7 +132,7 @@ u32 eth_get_headlen(void *data, unsigned int len) /* parse any remaining L2/L3 headers, check for L4 */ if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, - sizeof(*eth), len)) + sizeof(*eth), len, 0)) return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index bb2a0f529c1f..536838b657bf 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -301,7 +301,7 @@ static int flow_classify(struct sk_buff *skb, const struct tcf_proto *tp, keymask = f->keymask; if (keymask & FLOW_KEYS_NEEDED) - skb_flow_dissect_flow_keys(skb, &flow_keys); + skb_flow_dissect_flow_keys(skb, &flow_keys, 0); for (n = 0; n < f->nkeys; n++) { key = ffs(keymask) - 1; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 2f3d03f99487..57692947ebbe 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -129,7 +129,7 @@ static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, * so do it rather here. */ skb_key.basic.n_proto = skb->protocol; - skb_flow_dissect(skb, &head->dissector, &skb_key); + skb_flow_dissect(skb, &head->dissector, &skb_key, 0); fl_set_masked_key(&skb_mkey, &skb_key, &head->mask); diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 665bde07916b..02bfd3d1c4f0 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -170,13 +170,13 @@ static bool choke_match_flow(struct sk_buff *skb1, if (!choke_skb_cb(skb1)->keys_valid) { choke_skb_cb(skb1)->keys_valid = 1; - skb_flow_dissect_flow_keys(skb1, &temp); + skb_flow_dissect_flow_keys(skb1, &temp, 0); make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp); } if (!choke_skb_cb(skb2)->keys_valid) { choke_skb_cb(skb2)->keys_valid = 1; - skb_flow_dissect_flow_keys(skb2, &temp); + skb_flow_dissect_flow_keys(skb2, &temp, 0); make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp); } -- cgit v1.2.3 From 807e165dc44fd93f9d378f861f0540a158d7343a Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:28 -0700 Subject: flow_dissector: Add control/reporting of fragmentation Add an input flag to flow dissector on rather dissection should be attempted on a first fragment. Also add key_control flags to indicate that a packet is a fragment or first fragment. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 4 ++++ net/core/flow_dissector.c | 15 +++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 74fe160f0b05..34102270b086 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -12,6 +12,8 @@ struct flow_dissector_key_control { u16 thoff; u16 addr_type; + u32 is_fragment:1; + u32 first_frag:1; }; /** @@ -122,6 +124,8 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_MAX, }; +#define FLOW_DISSECTOR_F_PARSE_1ST_FRAG BIT(0) + struct flow_dissector_key { enum flow_dissector_key_id key_id; size_t offset; /* offset of struct flow_dissector_key_* diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index c3d9807cb34e..7536a4669029 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -177,8 +177,6 @@ ip: nhoff += iph->ihl * 4; ip_proto = iph->protocol; - if (ip_is_fragment(iph)) - ip_proto = 0; if (!skb_flow_dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) @@ -189,6 +187,19 @@ ip: memcpy(&key_addrs->v4addrs, &iph->saddr, sizeof(key_addrs->v4addrs)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + + if (ip_is_fragment(iph)) { + key_control->is_fragment = 1; + + if (iph->frag_off & htons(IP_OFFSET)) { + goto out_good; + } else { + key_control->first_frag = 1; + if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) + goto out_good; + } + } + break; } case htons(ETH_P_IPV6): { -- cgit v1.2.3 From b840f28b908da0239c8c5d9c8cae362ad21cda97 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:29 -0700 Subject: flow_dissector: Support IPv6 fragment header Parse NEXTHDR_FRAGMENT. When seen account for it in the fragment bits of key_control. Also, check if first fragment should be parsed. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 7536a4669029..907de2f68b1f 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -409,6 +409,31 @@ ip_proto_again: goto ip_proto_again; } + case NEXTHDR_FRAGMENT: { + struct frag_hdr _fh, *fh; + + if (proto != htons(ETH_P_IPV6)) + break; + + fh = __skb_header_pointer(skb, nhoff, sizeof(_fh), + data, hlen, &_fh); + + if (!fh) + goto out_bad; + + key_control->is_fragment = 1; + + nhoff += sizeof(_fh); + + if (!(fh->frag_off & htons(IP6_OFFSET))) { + key_control->first_frag = 1; + if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { + ip_proto = fh->nexthdr; + goto ip_proto_again; + } + } + goto out_good; + } case IPPROTO_IPIP: proto = htons(ETH_P_IP); goto ip; -- cgit v1.2.3 From 8306b688f1a6621b9efe3b0d827e26750528b12a Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:30 -0700 Subject: flow_dissector: Add flag to stop parsing at L3 Add an input flag to flow dissector on rather dissection should be stopped when an L3 packet is encountered. This would be useful if a caller just wanted to get IP addresses of the outermost header (e.g. to do an L3 hash). Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 1 + net/core/flow_dissector.c | 6 ++++++ 2 files changed, 7 insertions(+) (limited to 'net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 34102270b086..bb81e3b576e7 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -125,6 +125,7 @@ enum flow_dissector_key_id { }; #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG BIT(0) +#define FLOW_DISSECTOR_F_STOP_AT_L3 BIT(1) struct flow_dissector_key { enum flow_dissector_key_id key_id; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 907de2f68b1f..94fd841f341f 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -200,6 +200,9 @@ ip: } } + if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) + goto out_good; + break; } case htons(ETH_P_IPV6): { @@ -238,6 +241,9 @@ ipv6: } } + if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) + goto out_good; + break; } case htons(ETH_P_8021AD): -- cgit v1.2.3 From 872b1abb1ed47a691f465fb3d285f6cf6bcd8663 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:31 -0700 Subject: flow_dissector: Add flag to stop parsing when an IPv6 flow label is seen Add an input flag to flow dissector on rather dissection should be stopped when a flow label is encountered. Presumably, the flow label is derived from a sufficient hash of an inner transport packet so further dissection is not needed (that is ports are not included in the flow hash). Using the flow label instead of ports has the additional benefit that packet fragments should hash to same value as non-fragments for a flow (assuming that the same flow label is used). We set this flag by default in for skb_get_hash. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 1 + net/core/flow_dissector.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index bb81e3b576e7..66dbc3498efb 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -126,6 +126,7 @@ enum flow_dissector_key_id { #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG BIT(0) #define FLOW_DISSECTOR_F_STOP_AT_L3 BIT(1) +#define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL BIT(2) struct flow_dissector_key { enum flow_dissector_key_id key_id; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 94fd841f341f..094e34354627 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -239,6 +239,8 @@ ipv6: target_container); key_tags->flow_label = ntohl(flow_label); } + if (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL) + goto out_good; } if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) @@ -599,7 +601,8 @@ EXPORT_SYMBOL(flow_hash_from_keys); static inline u32 ___skb_get_hash(const struct sk_buff *skb, struct flow_keys *keys, u32 keyval) { - if (!skb_flow_dissect_flow_keys(skb, keys, 0)) + if (!skb_flow_dissect_flow_keys(skb, keys, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) return 0; return __flow_hash_from_keys(keys, keyval); -- cgit v1.2.3 From 823b96939578eae67b9d6c0e33a39d6a7b6401e7 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:32 -0700 Subject: flow_dissector: Add control/reporting of encapsulation Add an input flag to flow dissector on rather dissection should stop when encapsulation is detected (IP/IP or GRE). Also, add a key_control flag that indicates encapsulation was encountered during the dissection. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 2 ++ net/core/flow_dissector.c | 15 +++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 66dbc3498efb..bddd1089dbce 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -14,6 +14,7 @@ struct flow_dissector_key_control { u16 addr_type; u32 is_fragment:1; u32 first_frag:1; + u32 encapsulation:1; }; /** @@ -127,6 +128,7 @@ enum flow_dissector_key_id { #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG BIT(0) #define FLOW_DISSECTOR_F_STOP_AT_L3 BIT(1) #define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL BIT(2) +#define FLOW_DISSECTOR_F_STOP_AT_ENCAP BIT(3) struct flow_dissector_key { enum flow_dissector_key_id key_id; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 094e34354627..8d890132e2d7 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -397,6 +397,11 @@ ip_proto_again: proto = eth->h_proto; nhoff += sizeof(*eth); } + + key_control->encapsulation = 1; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + goto out_good; + goto again; } case NEXTHDR_HOP: @@ -444,9 +449,19 @@ ip_proto_again: } case IPPROTO_IPIP: proto = htons(ETH_P_IP); + + key_control->encapsulation = 1; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + goto out_good; + goto ip; case IPPROTO_IPV6: proto = htons(ETH_P_IPV6); + + key_control->encapsulation = 1; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + goto out_good; + goto ipv6; case IPPROTO_MPLS: proto = htons(ETH_P_MPLS_UC); -- cgit v1.2.3 From 6db61d79c1e1b2346e2142d6c950a8d2e8380b82 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Tue, 1 Sep 2015 09:24:33 -0700 Subject: flow_dissector: Ignore flow dissector return value from ___skb_get_hash In ___skb_get_hash ignore return value from skb_flow_dissect_flow_keys. A failure in that function likely means that there was a parse error, so we may as well use whatever fields were found before the error was hit. This is also good because it means we won't keep trying to derive the hash on subsequent calls to skb_get_hash for the same packet. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 8d890132e2d7..b563339436d0 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -616,9 +616,8 @@ EXPORT_SYMBOL(flow_hash_from_keys); static inline u32 ___skb_get_hash(const struct sk_buff *skb, struct flow_keys *keys, u32 keyval) { - if (!skb_flow_dissect_flow_keys(skb, keys, - FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) - return 0; + skb_flow_dissect_flow_keys(skb, keys, + FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL); return __flow_hash_from_keys(keys, keyval); } @@ -662,15 +661,10 @@ EXPORT_SYMBOL(make_flow_keys_digest); void __skb_get_hash(struct sk_buff *skb) { struct flow_keys keys; - u32 hash; __flow_hash_secret_init(); - hash = ___skb_get_hash(skb, &keys, hashrnd); - if (!hash) - return; - - __skb_set_sw_hash(skb, hash, + __skb_set_sw_hash(skb, ___skb_get_hash(skb, &keys, hashrnd), flow_keys_have_l4(&keys)); } EXPORT_SYMBOL(__skb_get_hash); -- cgit v1.2.3 From 4b36993d3df0834eff3b4172962de0343a4d9123 Mon Sep 17 00:00:00 2001 From: David S. Miller Date: Tue, 1 Sep 2015 16:46:08 -0700 Subject: flow_dissector: Don't use bit fields. Just have a flags member instead. In file included from include/linux/linkage.h:4:0, from include/linux/kernel.h:6, from net/core/flow_dissector.c:1: In function 'flow_keys_hash_start', inlined from 'flow_hash_from_keys' at net/core/flow_dissector.c:553:34: >> include/linux/compiler.h:447:38: error: call to '__compiletime_assert_459' declared with attribute error: BUILD_BUG_ON failed: FLOW_KEYS_HASH_OFFSET % sizeof(u32) Reported-by: kbuild test robot Signed-off-by: David S. Miller --- include/net/flow_dissector.h | 8 +++++--- net/core/flow_dissector.c | 14 +++++++------- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index bddd1089dbce..8c8548cf5888 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -12,11 +12,13 @@ struct flow_dissector_key_control { u16 thoff; u16 addr_type; - u32 is_fragment:1; - u32 first_frag:1; - u32 encapsulation:1; + u32 flags; }; +#define FLOW_DIS_IS_FRAGMENT BIT(0) +#define FLOW_DIS_FIRST_FRAG BIT(1) +#define FLOW_DIS_ENCAPSULATION BIT(2) + /** * struct flow_dissector_key_basic: * @thoff: Transport header offset diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b563339436d0..8d32020303c6 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -189,12 +189,12 @@ ip: key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (ip_is_fragment(iph)) { - key_control->is_fragment = 1; + key_control->flags |= FLOW_DIS_IS_FRAGMENT; if (iph->frag_off & htons(IP_OFFSET)) { goto out_good; } else { - key_control->first_frag = 1; + key_control->flags |= FLOW_DIS_FIRST_FRAG; if (!(flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)) goto out_good; } @@ -398,7 +398,7 @@ ip_proto_again: nhoff += sizeof(*eth); } - key_control->encapsulation = 1; + key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) goto out_good; @@ -434,12 +434,12 @@ ip_proto_again: if (!fh) goto out_bad; - key_control->is_fragment = 1; + key_control->flags |= FLOW_DIS_IS_FRAGMENT; nhoff += sizeof(_fh); if (!(fh->frag_off & htons(IP6_OFFSET))) { - key_control->first_frag = 1; + key_control->flags |= FLOW_DIS_FIRST_FRAG; if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) { ip_proto = fh->nexthdr; goto ip_proto_again; @@ -450,7 +450,7 @@ ip_proto_again: case IPPROTO_IPIP: proto = htons(ETH_P_IP); - key_control->encapsulation = 1; + key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) goto out_good; @@ -458,7 +458,7 @@ ip_proto_again: case IPPROTO_IPV6: proto = htons(ETH_P_IPV6); - key_control->encapsulation = 1; + key_control->flags |= FLOW_DIS_ENCAPSULATION; if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) goto out_good; -- cgit v1.2.3 From a17ace95b0f08ccbcf24946db5673c4b5ee8fcae Mon Sep 17 00:00:00 2001 From: David S. Miller Date: Tue, 1 Sep 2015 17:00:24 -0700 Subject: flow: Move __get_hash_from_flowi{4,6} into flow_dissector.c These cannot live in net/core/flow.c which only builds when XFRM is enabled. Reported-by: kbuild test robot Signed-off-by: David S. Miller --- net/core/flow.c | 36 ------------------------------------ net/core/flow_dissector.c | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/core/flow.c b/net/core/flow.c index 61930bb0eb59..1033725be40b 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -510,38 +509,3 @@ void flow_cache_fini(struct net *net) fc->percpu = NULL; } EXPORT_SYMBOL(flow_cache_fini); - -__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys) -{ - memset(keys, 0, sizeof(*keys)); - - memcpy(&keys->addrs.v6addrs.src, &fl6->saddr, - sizeof(keys->addrs.v6addrs.src)); - memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr, - sizeof(keys->addrs.v6addrs.dst)); - keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; - keys->ports.src = fl6->fl6_sport; - keys->ports.dst = fl6->fl6_dport; - keys->keyid.keyid = fl6->fl6_gre_key; - keys->tags.flow_label = (__force u32)fl6->flowlabel; - keys->basic.ip_proto = fl6->flowi6_proto; - - return flow_hash_from_keys(keys); -} -EXPORT_SYMBOL(__get_hash_from_flowi6); - -__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys) -{ - memset(keys, 0, sizeof(*keys)); - - keys->addrs.v4addrs.src = fl4->saddr; - keys->addrs.v4addrs.dst = fl4->daddr; - keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; - keys->ports.src = fl4->fl4_sport; - keys->ports.dst = fl4->fl4_dport; - keys->keyid.keyid = fl4->fl4_gre_key; - keys->basic.ip_proto = fl4->flowi4_proto; - - return flow_hash_from_keys(keys); -} -EXPORT_SYMBOL(__get_hash_from_flowi4); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 8d32020303c6..345a0408cfe4 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -787,6 +787,41 @@ u32 skb_get_poff(const struct sk_buff *skb) return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); } +__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys) +{ + memset(keys, 0, sizeof(*keys)); + + memcpy(&keys->addrs.v6addrs.src, &fl6->saddr, + sizeof(keys->addrs.v6addrs.src)); + memcpy(&keys->addrs.v6addrs.dst, &fl6->daddr, + sizeof(keys->addrs.v6addrs.dst)); + keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + keys->ports.src = fl6->fl6_sport; + keys->ports.dst = fl6->fl6_dport; + keys->keyid.keyid = fl6->fl6_gre_key; + keys->tags.flow_label = (__force u32)fl6->flowlabel; + keys->basic.ip_proto = fl6->flowi6_proto; + + return flow_hash_from_keys(keys); +} +EXPORT_SYMBOL(__get_hash_from_flowi6); + +__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys) +{ + memset(keys, 0, sizeof(*keys)); + + keys->addrs.v4addrs.src = fl4->saddr; + keys->addrs.v4addrs.dst = fl4->daddr; + keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + keys->ports.src = fl4->fl4_sport; + keys->ports.dst = fl4->fl4_dport; + keys->keyid.keyid = fl4->fl4_gre_key; + keys->basic.ip_proto = fl4->flowi4_proto; + + return flow_hash_from_keys(keys); +} +EXPORT_SYMBOL(__get_hash_from_flowi4); + static const struct flow_dissector_key flow_keys_dissector_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, -- cgit v1.2.3 From 20a17bf6c04e3eca8824c930ecc55ab832558e3b Mon Sep 17 00:00:00 2001 From: David S. Miller Date: Tue, 1 Sep 2015 21:19:17 -0700 Subject: flow_dissector: Use 'const' where possible. Signed-off-by: David S. Miller --- include/linux/skbuff.h | 8 ++--- include/net/flow.h | 8 ++--- net/core/flow_dissector.c | 79 ++++++++++++++++++++++++----------------------- 3 files changed, 49 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index eabfb810bc62..2738d355cdf9 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1029,9 +1029,9 @@ static inline __u32 skb_get_hash(struct sk_buff *skb) return skb->hash; } -__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6); +__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6); -static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) +static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6) { if (!skb->l4_hash && !skb->sw_hash) { struct flow_keys keys; @@ -1043,9 +1043,9 @@ static inline __u32 skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) return skb->hash; } -__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl); +__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl); -static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4) +static inline __u32 skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4) { if (!skb->l4_hash && !skb->sw_hash) { struct flow_keys keys; diff --git a/include/net/flow.h b/include/net/flow.h index dafe97c3c048..acd6a096250e 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -244,18 +244,18 @@ void flow_cache_flush(struct net *net); void flow_cache_flush_deferred(struct net *net); extern atomic_t flow_cache_genid; -__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys); +__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys); -static inline __u32 get_hash_from_flowi6(struct flowi6 *fl6) +static inline __u32 get_hash_from_flowi6(const struct flowi6 *fl6) { struct flow_keys keys; return __get_hash_from_flowi6(fl6, &keys); } -__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys); +__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys); -static inline __u32 get_hash_from_flowi4(struct flowi4 *fl4) +static inline __u32 get_hash_from_flowi4(const struct flowi4 *fl4) { struct flow_keys keys; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 345a0408cfe4..d79699c9d1b9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -19,14 +19,14 @@ #include #include -static bool skb_flow_dissector_uses_key(struct flow_dissector *flow_dissector, - enum flow_dissector_key_id key_id) +static bool dissector_uses_key(const struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) { return flow_dissector->used_keys & (1 << key_id); } -static void skb_flow_dissector_set_key(struct flow_dissector *flow_dissector, - enum flow_dissector_key_id key_id) +static void dissector_set_key(struct flow_dissector *flow_dissector, + enum flow_dissector_key_id key_id) { flow_dissector->used_keys |= (1 << key_id); } @@ -51,20 +51,20 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, * boundaries of unsigned short. */ BUG_ON(key->offset > USHRT_MAX); - BUG_ON(skb_flow_dissector_uses_key(flow_dissector, - key->key_id)); + BUG_ON(dissector_uses_key(flow_dissector, + key->key_id)); - skb_flow_dissector_set_key(flow_dissector, key->key_id); + dissector_set_key(flow_dissector, key->key_id); flow_dissector->offset[key->key_id] = key->offset; } /* Ensure that the dissector always includes control and basic key. * That way we are able to avoid handling lack of these in fast path. */ - BUG_ON(!skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_CONTROL)); - BUG_ON(!skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_BASIC)); + BUG_ON(!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL)); + BUG_ON(!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC)); } EXPORT_SYMBOL(skb_flow_dissector_init); @@ -154,8 +154,8 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_ETH_ADDRS)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); struct flow_dissector_key_eth_addrs *key_eth_addrs; @@ -178,8 +178,8 @@ ip: ip_proto = iph->protocol; - if (!skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV4_ADDRS)) + if (!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS)) break; key_addrs = skb_flow_dissector_target(flow_dissector, @@ -218,8 +218,8 @@ ipv6: ip_proto = iph->nexthdr; nhoff += sizeof(struct ipv6hdr); - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs; key_ipv6_addrs = skb_flow_dissector_target(flow_dissector, @@ -232,8 +232,8 @@ ipv6: flow_label = ip6_flowlabel(iph); if (flow_label) { - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_FLOW_LABEL)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_FLOW_LABEL)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_FLOW_LABEL, target_container); @@ -257,8 +257,8 @@ ipv6: if (!vlan) goto out_bad; - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_VLANID)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_VLANID)) { key_tags = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_VLANID, target_container); @@ -298,8 +298,8 @@ ipv6: if (!hdr) goto out_bad; - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_TIPC_ADDRS)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_TIPC_ADDRS)) { key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_TIPC_ADDRS, target_container); @@ -320,8 +320,8 @@ mpls: if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) { - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY, target_container); @@ -374,8 +374,8 @@ ip_proto_again: if (!keyid) goto out_bad; - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_GRE_KEYID)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID)) { key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_GRE_KEYID, target_container); @@ -470,8 +470,8 @@ ip_proto_again: break; } - if (skb_flow_dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_PORTS)) { + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS)) { key_ports = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_PORTS, target_container); @@ -497,18 +497,21 @@ static __always_inline void __flow_hash_secret_init(void) net_get_random_once(&hashrnd, sizeof(hashrnd)); } -static __always_inline u32 __flow_hash_words(u32 *words, u32 length, u32 keyval) +static __always_inline u32 __flow_hash_words(const u32 *words, u32 length, + u32 keyval) { return jhash2(words, length, keyval); } -static inline void *flow_keys_hash_start(struct flow_keys *flow) +static inline const u32 *flow_keys_hash_start(const struct flow_keys *flow) { + const void *p = flow; + BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % sizeof(u32)); - return (void *)flow + FLOW_KEYS_HASH_OFFSET; + return (const u32 *)(p + FLOW_KEYS_HASH_OFFSET); } -static inline size_t flow_keys_hash_length(struct flow_keys *flow) +static inline size_t flow_keys_hash_length(const struct flow_keys *flow) { size_t diff = FLOW_KEYS_HASH_OFFSET + sizeof(flow->addrs); BUILD_BUG_ON((sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) % sizeof(u32)); @@ -598,7 +601,7 @@ static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval) __flow_hash_consistentify(keys); - hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys), + hash = __flow_hash_words(flow_keys_hash_start(keys), flow_keys_hash_length(keys), keyval); if (!hash) hash = 1; @@ -677,7 +680,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb) } EXPORT_SYMBOL(skb_get_hash_perturb); -__u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) +__u32 __skb_get_hash_flowi6(struct sk_buff *skb, const struct flowi6 *fl6) { struct flow_keys keys; @@ -701,7 +704,7 @@ __u32 __skb_get_hash_flowi6(struct sk_buff *skb, struct flowi6 *fl6) } EXPORT_SYMBOL(__skb_get_hash_flowi6); -__u32 __skb_get_hash_flowi4(struct sk_buff *skb, struct flowi4 *fl4) +__u32 __skb_get_hash_flowi4(struct sk_buff *skb, const struct flowi4 *fl4) { struct flow_keys keys; @@ -787,7 +790,7 @@ u32 skb_get_poff(const struct sk_buff *skb) return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb)); } -__u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys) +__u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys) { memset(keys, 0, sizeof(*keys)); @@ -806,7 +809,7 @@ __u32 __get_hash_from_flowi6(struct flowi6 *fl6, struct flow_keys *keys) } EXPORT_SYMBOL(__get_hash_from_flowi6); -__u32 __get_hash_from_flowi4(struct flowi4 *fl4, struct flow_keys *keys) +__u32 __get_hash_from_flowi4(const struct flowi4 *fl4, struct flow_keys *keys) { memset(keys, 0, sizeof(*keys)); -- cgit v1.2.3 From b382c08656000c12a146723a153b85b13a855b49 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 2 Sep 2015 14:00:36 +0200 Subject: sock, diag: fix panic in sock_diag_put_filterinfo diag socket's sock_diag_put_filterinfo() dumps classic BPF programs upon request to user space (ss -0 -b). However, native eBPF programs attached to sockets (SO_ATTACH_BPF) cannot be dumped with this method: Their orig_prog is always NULL. However, sock_diag_put_filterinfo() unconditionally tries to access its filter length resp. wants to copy the filter insns from there. Internal cBPF to eBPF transformations attached to sockets don't have this issue, as orig_prog state is kept. It's currently only used by packet sockets. If we would want to add native eBPF support in the future, this needs to be done through a different attribute than PACKET_DIAG_FILTER to not confuse possible user space disassemblers that work on diag data. Fixes: 89aa075832b0 ("net: sock: allow eBPF programs to be attached to sockets") Signed-off-by: Daniel Borkmann Acked-by: Nicolas Dichtel Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/sock_diag.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index d79866c5f8bc..817622f3dbb7 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -90,6 +90,9 @@ int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, goto out; fprog = filter->prog->orig_prog; + if (!fprog) + goto out; + flen = bpf_classic_proglen(fprog); attr = nla_reserve(skb, attrtype, flen); -- cgit v1.2.3 From e41b0bedba0293b9e1e8d1e8ed553104b9693656 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Sep 2015 00:29:07 +0200 Subject: ipv6: fix exthdrs offload registration in out_rt path We previously register IPPROTO_ROUTING offload under inet6_add_offload(), but in error path, we try to unregister it with inet_del_offload(). This doesn't seem correct, it should actually be inet6_del_offload(), also ipv6_exthdrs_offload_exit() from that commit seems rather incorrect (it also uses rthdr_offload twice), but it got removed entirely later on. Fixes: 3336288a9fea ("ipv6: Switch to using new offload infrastructure.") Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/ipv6/exthdrs_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/exthdrs_offload.c b/net/ipv6/exthdrs_offload.c index 447a7fbd1bb6..f5e2ba1c18bf 100644 --- a/net/ipv6/exthdrs_offload.c +++ b/net/ipv6/exthdrs_offload.c @@ -36,6 +36,6 @@ out: return ret; out_rt: - inet_del_offload(&rthdr_offload, IPPROTO_ROUTING); + inet6_del_offload(&rthdr_offload, IPPROTO_ROUTING); goto out; } -- cgit v1.2.3 From a82b0e63917f597c546cd479acc938e08ac54f2d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 2 Sep 2015 20:54:02 +0200 Subject: netfilter: nf_dup{4, 6}: fix build error when nf_conntrack disabled While testing various Kconfig options on another issue, I found that the following one triggers as well on allmodconfig and nf_conntrack disabled: net/ipv4/netfilter/nf_dup_ipv4.c: In function ‘nf_dup_ipv4’: net/ipv4/netfilter/nf_dup_ipv4.c:72:20: error: ‘nf_skb_duplicated’ undeclared (first use in this function) if (this_cpu_read(nf_skb_duplicated)) [...] net/ipv6/netfilter/nf_dup_ipv6.c: In function ‘nf_dup_ipv6’: net/ipv6/netfilter/nf_dup_ipv6.c:66:20: error: ‘nf_skb_duplicated’ undeclared (first use in this function) if (this_cpu_read(nf_skb_duplicated)) Fix it by including directly the header where it is defined. Fixes: bbde9fc1824a ("netfilter: factor out packet duplication for IPv4/IPv6") Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/ipv4/netfilter/nf_dup_ipv4.c | 1 + net/ipv6/netfilter/nf_dup_ipv6.c | 1 + 2 files changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index b5bb37564b0e..2d79e6e8d934 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index c5c87e921ccd..c8ab626556a0 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 62da98656b62a5ca57f22263705175af8ded5aa1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 3 Sep 2015 01:26:07 +0200 Subject: netfilter: nf_conntrack: make nf_ct_zone_dflt built-in Fengguang reported, that some randconfig generated the following linker issue with nf_ct_zone_dflt object involved: [...] CC init/version.o LD init/built-in.o net/built-in.o: In function `ipv4_conntrack_defrag': nf_defrag_ipv4.c:(.text+0x93e95): undefined reference to `nf_ct_zone_dflt' net/built-in.o: In function `ipv6_defrag': nf_defrag_ipv6_hooks.c:(.text+0xe3ffe): undefined reference to `nf_ct_zone_dflt' make: *** [vmlinux] Error 1 Given that configurations exist where we have a built-in part, which is accessing nf_ct_zone_dflt such as the two handlers nf_ct_defrag_user() and nf_ct6_defrag_user(), and a part that configures nf_conntrack as a module, we must move nf_ct_zone_dflt into a fixed, guaranteed built-in area when netfilter is configured in general. Therefore, split the more generic parts into a common header under include/linux/netfilter/ and move nf_ct_zone_dflt into the built-in section that already holds parts related to CONFIG_NF_CONNTRACK in the netfilter core. This fixes the issue on my side. Fixes: 308ac9143ee2 ("netfilter: nf_conntrack: push zone object into functions") Reported-by: Fengguang Wu Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/netfilter.h | 2 ++ .../linux/netfilter/nf_conntrack_zones_common.h | 23 ++++++++++++++++++++++ include/net/netfilter/nf_conntrack_zones.h | 19 +----------------- net/netfilter/core.c | 6 ++++++ net/netfilter/nf_conntrack_core.c | 7 ------- 5 files changed, 32 insertions(+), 25 deletions(-) create mode 100644 include/linux/netfilter/nf_conntrack_zones_common.h (limited to 'net') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index d788ce62d826..36a652531791 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -368,6 +368,8 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family) #endif /*CONFIG_NETFILTER*/ #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#include + extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu; void nf_ct_attach(struct sk_buff *, const struct sk_buff *); extern void (*nf_ct_destroy)(struct nf_conntrack *) __rcu; diff --git a/include/linux/netfilter/nf_conntrack_zones_common.h b/include/linux/netfilter/nf_conntrack_zones_common.h new file mode 100644 index 000000000000..5d7cf36d4766 --- /dev/null +++ b/include/linux/netfilter/nf_conntrack_zones_common.h @@ -0,0 +1,23 @@ +#ifndef _NF_CONNTRACK_ZONES_COMMON_H +#define _NF_CONNTRACK_ZONES_COMMON_H + +#include + +#define NF_CT_DEFAULT_ZONE_ID 0 + +#define NF_CT_ZONE_DIR_ORIG (1 << IP_CT_DIR_ORIGINAL) +#define NF_CT_ZONE_DIR_REPL (1 << IP_CT_DIR_REPLY) + +#define NF_CT_DEFAULT_ZONE_DIR (NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL) + +#define NF_CT_FLAG_MARK 1 + +struct nf_conntrack_zone { + u16 id; + u8 flags; + u8 dir; +}; + +extern const struct nf_conntrack_zone nf_ct_zone_dflt; + +#endif /* _NF_CONNTRACK_ZONES_COMMON_H */ diff --git a/include/net/netfilter/nf_conntrack_zones.h b/include/net/netfilter/nf_conntrack_zones.h index 5316c7b3a374..4e32512cef32 100644 --- a/include/net/netfilter/nf_conntrack_zones.h +++ b/include/net/netfilter/nf_conntrack_zones.h @@ -1,24 +1,7 @@ #ifndef _NF_CONNTRACK_ZONES_H #define _NF_CONNTRACK_ZONES_H -#include - -#define NF_CT_DEFAULT_ZONE_ID 0 - -#define NF_CT_ZONE_DIR_ORIG (1 << IP_CT_DIR_ORIGINAL) -#define NF_CT_ZONE_DIR_REPL (1 << IP_CT_DIR_REPLY) - -#define NF_CT_DEFAULT_ZONE_DIR (NF_CT_ZONE_DIR_ORIG | NF_CT_ZONE_DIR_REPL) - -#define NF_CT_FLAG_MARK 1 - -struct nf_conntrack_zone { - u16 id; - u8 flags; - u8 dir; -}; - -extern const struct nf_conntrack_zone nf_ct_zone_dflt; +#include #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 0b939b7ad724..8e47f8113495 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -388,6 +388,12 @@ EXPORT_SYMBOL(nf_conntrack_destroy); struct nfq_ct_hook __rcu *nfq_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nfq_ct_hook); +/* Built-in default zone used e.g. by modules. */ +const struct nf_conntrack_zone nf_ct_zone_dflt = { + .id = NF_CT_DEFAULT_ZONE_ID, + .dir = NF_CT_DEFAULT_ZONE_DIR, +}; +EXPORT_SYMBOL_GPL(nf_ct_zone_dflt); #endif /* CONFIG_NF_CONNTRACK */ #ifdef CONFIG_NF_NAT_NEEDED diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ac3be9b0629b..eedf0495f11f 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1286,13 +1286,6 @@ bool __nf_ct_kill_acct(struct nf_conn *ct, } EXPORT_SYMBOL_GPL(__nf_ct_kill_acct); -/* Built-in default zone used e.g. by modules. */ -const struct nf_conntrack_zone nf_ct_zone_dflt = { - .id = NF_CT_DEFAULT_ZONE_ID, - .dir = NF_CT_DEFAULT_ZONE_DIR, -}; -EXPORT_SYMBOL_GPL(nf_ct_zone_dflt); - #ifdef CONFIG_NF_CONNTRACK_ZONES static struct nf_ct_ext_type nf_ct_zone_extend __read_mostly = { .len = sizeof(struct nf_conntrack_zone), -- cgit v1.2.3