From e050dbeb0d1e19072d0d656b51f06af1af860f19 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 15 Aug 2014 10:19:39 +0200 Subject: batman-adv: Fix parameter order of hlist_add_behind 1d023284c31a4e40a94d5bbcb7dbb7a35ee0bcbc ("list: fix order of arguments for hlist_add_after(_rcu)") was incorrectly rebased on top of d9124268d84a836f14a6ead54ff9d8eee4c43be5 ("batman-adv: Fix out-of-order fragmentation support"). The parameter order change of the rebased patch was not re-applied as expected. This causes a memory leak and can cause crashes when out-of-order packets are received. hlist_add_behind will try to access the uninitalized list pointers of frag_entry_new to find the previous/next entry and may modify/read random memory locations. Signed-off-by: Sven Eckelmann Cc: Andrew Morton Signed-off-by: David S. Miller --- net/batman-adv/fragmentation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 52c43f904220..fc1835c6bb40 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -188,7 +188,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, /* Reached the end of the list, so insert after 'frag_entry_last'. */ if (likely(frag_entry_last)) { - hlist_add_behind(&frag_entry_last->list, &frag_entry_new->list); + hlist_add_behind(&frag_entry_new->list, &frag_entry_last->list); chain->size += skb->len - hdr_size; chain->timestamp = jiffies; ret = true; -- cgit v1.2.3 From ac32c7f705692b92fe12dcbe88fe87136fdfff6f Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Fri, 15 Aug 2014 16:44:35 +0200 Subject: tipc: fix message importance range check Commit 3b4f302d8578 ("tipc: eliminate redundant locking") introduced a bug by removing the sanity check for message importance, allowing programs to assign any value to the msg_user field. This will mess up the packet reception logic and may cause random link resets. Signed-off-by: Erik Hugne Signed-off-by: David S. Miller --- net/tipc/port.h | 4 +++- net/tipc/socket.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/port.h b/net/tipc/port.h index 3f93454592b6..a69118ff018b 100644 --- a/net/tipc/port.h +++ b/net/tipc/port.h @@ -179,8 +179,10 @@ static inline int tipc_port_importance(struct tipc_port *port) return msg_importance(&port->phdr); } -static inline void tipc_port_set_importance(struct tipc_port *port, int imp) +static inline int tipc_port_set_importance(struct tipc_port *port, int imp) { + if (imp > TIPC_CRITICAL_IMPORTANCE) + return -EINVAL msg_set_importance(&port->phdr, (u32)imp); } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 7d423ee10897..ff8c8118d56e 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1973,7 +1973,7 @@ static int tipc_setsockopt(struct socket *sock, int lvl, int opt, switch (opt) { case TIPC_IMPORTANCE: - tipc_port_set_importance(port, value); + res = tipc_port_set_importance(port, value); break; case TIPC_SRC_DROPPABLE: if (sock->type != SOCK_STREAM) -- cgit v1.2.3 From 73d0f37ac4ee5b60e6b9c1b3ccb8766bade9d9c5 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 14 Aug 2014 12:27:47 +0400 Subject: cbq: incorrectly low bandwidth setting blocks limited traffic Mainstream commit f0f6ee1f70c4 ("cbq: incorrect processing of high limits") have side effect: if cbq bandwidth setting is less than real interface throughput non-limited traffic can delay limited traffic for a very long time. This happen because of q->now changes incorrectly in cbq_dequeue(): in described scenario L2T is much greater than real time delay, and q->now gets an extra boost for each transmitted packet. Accumulated boost prevents update q->now, and blocked class can wait very long time until (q->now >= cl->undertime) will be true again. To fix the problem the patch updates q->now on each cbq_update() call. L2T-related pre-modification q->now was moved to cbq_update(). My testing confirmed that it fixes the problem and did not discover any side-effects Fixes: f0f6ee1f70c4 ("cbq: incorrect processing of high limits") Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- net/sched/sch_cbq.c | 37 +++++++++++++------------------------ 1 file changed, 13 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index ead526467cca..550be9504fff 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -700,8 +700,13 @@ cbq_update(struct cbq_sched_data *q) struct cbq_class *this = q->tx_class; struct cbq_class *cl = this; int len = q->tx_len; + psched_time_t now; q->tx_class = NULL; + /* Time integrator. We calculate EOS time + * by adding expected packet transmission time. + */ + now = q->now + L2T(&q->link, len); for ( ; cl; cl = cl->share) { long avgidle = cl->avgidle; @@ -717,7 +722,7 @@ cbq_update(struct cbq_sched_data *q) * idle = (now - last) - last_pktlen/rate */ - idle = q->now - cl->last; + idle = now - cl->last; if ((unsigned long)idle > 128*1024*1024) { avgidle = cl->maxidle; } else { @@ -761,7 +766,7 @@ cbq_update(struct cbq_sched_data *q) idle -= L2T(&q->link, len); idle += L2T(cl, len); - cl->undertime = q->now + idle; + cl->undertime = now + idle; } else { /* Underlimit */ @@ -771,7 +776,8 @@ cbq_update(struct cbq_sched_data *q) else cl->avgidle = avgidle; } - cl->last = q->now; + if ((s64)(now - cl->last) > 0) + cl->last = now; } cbq_update_toplevel(q, this, q->tx_borrowed); @@ -943,30 +949,13 @@ cbq_dequeue(struct Qdisc *sch) struct sk_buff *skb; struct cbq_sched_data *q = qdisc_priv(sch); psched_time_t now; - psched_tdiff_t incr; now = psched_get_time(); - incr = now - q->now_rt; - - if (q->tx_class) { - psched_tdiff_t incr2; - /* Time integrator. We calculate EOS time - * by adding expected packet transmission time. - * If real time is greater, we warp artificial clock, - * so that: - * - * cbq_time = max(real_time, work); - */ - incr2 = L2T(&q->link, q->tx_len); - q->now += incr2; + + if (q->tx_class) cbq_update(q); - if ((incr -= incr2) < 0) - incr = 0; - q->now += incr; - } else { - if (now > q->now) - q->now = now; - } + + q->now = now; q->now_rt = now; for (;;) { -- cgit v1.2.3 From 7201c1ddf774c12daa2dd5da098b8929db53f047 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 14 Aug 2014 12:27:59 +0400 Subject: cbq: now_rt removal Now q->now_rt is identical to q->now and is not required anymore. Signed-off-by: Vasily Averin Signed-off-by: David S. Miller --- net/sched/sch_cbq.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 550be9504fff..762a04bb8f6d 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -159,7 +159,6 @@ struct cbq_sched_data { struct cbq_class *tx_borrowed; int tx_len; psched_time_t now; /* Cached timestamp */ - psched_time_t now_rt; /* Cached real time */ unsigned int pmask; struct hrtimer delay_timer; @@ -353,12 +352,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) int toplevel = q->toplevel; if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) { - psched_time_t now; - psched_tdiff_t incr; - - now = psched_get_time(); - incr = now - q->now_rt; - now = q->now + incr; + psched_time_t now = psched_get_time(); do { if (cl->undertime < now) { @@ -956,7 +950,6 @@ cbq_dequeue(struct Qdisc *sch) cbq_update(q); q->now = now; - q->now_rt = now; for (;;) { q->wd_expires = 0; @@ -1212,7 +1205,6 @@ cbq_reset(struct Qdisc *sch) hrtimer_cancel(&q->delay_timer); q->toplevel = TC_CBQ_MAXLEVEL; q->now = psched_get_time(); - q->now_rt = q->now; for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) q->active[prio] = NULL; @@ -1396,7 +1388,6 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) q->delay_timer.function = cbq_undelay; q->toplevel = TC_CBQ_MAXLEVEL; q->now = psched_get_time(); - q->now_rt = q->now; cbq_link_class(&q->link); -- cgit v1.2.3 From 02784f1b05b8f241c8180af88869e717e2758593 Mon Sep 17 00:00:00 2001 From: David S. Miller Date: Tue, 19 Aug 2014 11:14:02 -0700 Subject: tipc: Fix build. Missing semicolon in range check fix. Signed-off-by: David S. Miller --- net/tipc/port.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/port.h b/net/tipc/port.h index a69118ff018b..3087da39ee47 100644 --- a/net/tipc/port.h +++ b/net/tipc/port.h @@ -182,8 +182,9 @@ static inline int tipc_port_importance(struct tipc_port *port) static inline int tipc_port_set_importance(struct tipc_port *port, int imp) { if (imp > TIPC_CRITICAL_IMPORTANCE) - return -EINVAL + return -EINVAL; msg_set_importance(&port->phdr, (u32)imp); + return 0; } #endif -- cgit v1.2.3 From 6df378d2d1f87a249a88ac4a8c7a14861d9c9474 Mon Sep 17 00:00:00 2001 From: chas williams - CONTRACTOR Date: Thu, 14 Aug 2014 09:19:47 -0400 Subject: lec: Use rtnl lock/unlock when updating MTU The LECS response contains the MTU that should be used. Correctly synchronize with other layers when updating. Signed-off-by: Chas Williams - CONTRACTOR Signed-off-by: David S. Miller --- net/atm/lec.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/atm/lec.c b/net/atm/lec.c index e4853b50cf40..4b98f897044a 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -410,9 +410,11 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) priv->lane2_ops = NULL; if (priv->lane_version > 1) priv->lane2_ops = &lane2_ops; + rtnl_lock(); if (dev_set_mtu(dev, mesg->content.config.mtu)) pr_info("%s: change_mtu to %d failed\n", dev->name, mesg->content.config.mtu); + rtnl_unlock(); priv->is_proxy = mesg->content.config.is_proxy; break; case l_flush_tran_id: -- cgit v1.2.3 From dc808110bb62b64a448696ecac3938902c92e1ab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Aug 2014 09:16:04 -0700 Subject: packet: handle too big packets for PACKET_V3 af_packet can currently overwrite kernel memory by out of bound accesses, because it assumed a [new] block can always hold one frame. This is not generally the case, even if most existing tools do it right. This patch clamps too long frames as API permits, and issue a one time error on syslog. [ 394.357639] tpacket_rcv: packet too big, clamped from 5042 to 3966. macoff=82 In this example, packet header tp_snaplen was set to 3966, and tp_len was set to 5042 (skb->len) Signed-off-by: Eric Dumazet Fixes: f6fb8f100b80 ("af-packet: TPACKET_V3 flexible buffer implementation.") Acked-by: Daniel Borkmann Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/packet/af_packet.c | 17 +++++++++++++++++ net/packet/internal.h | 1 + 2 files changed, 18 insertions(+) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8d9f8042705a..93896d2092f6 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -632,6 +632,7 @@ static void init_prb_bdqc(struct packet_sock *po, p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; + p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); prb_setup_retire_blk_timer(po, tx_ring); prb_open_block(p1, pbd); @@ -1942,6 +1943,18 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, if ((int)snaplen < 0) snaplen = 0; } + } else if (unlikely(macoff + snaplen > + GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) { + u32 nval; + + nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff; + pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n", + snaplen, nval, macoff); + snaplen = nval; + if (unlikely((int)snaplen < 0)) { + snaplen = 0; + macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; + } } spin_lock(&sk->sk_receive_queue.lock); h.raw = packet_current_rx_frame(po, skb, @@ -3783,6 +3796,10 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, goto out; if (unlikely(req->tp_block_size & (PAGE_SIZE - 1))) goto out; + if (po->tp_version >= TPACKET_V3 && + (int)(req->tp_block_size - + BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0) + goto out; if (unlikely(req->tp_frame_size < po->tp_hdrlen + po->tp_reserve)) goto out; diff --git a/net/packet/internal.h b/net/packet/internal.h index eb9580a6b25f..cdddf6a30399 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -29,6 +29,7 @@ struct tpacket_kbdq_core { char *pkblk_start; char *pkblk_end; int kblk_size; + unsigned int max_frame_len; unsigned int knum_blocks; uint64_t knxt_seq_num; char *prev; -- cgit v1.2.3 From 061079ac0b9be7a578dcd09f7865c2c0d6ac894a Mon Sep 17 00:00:00 2001 From: zhuyj Date: Wed, 20 Aug 2014 17:31:43 +0800 Subject: sctp: not send SCTP_PEER_ADDR_CHANGE notifications with failed probe Since the transport has always been in state SCTP_UNCONFIRMED, it therefore wasn't active before and hasn't been used before, and it always has been, so it is unnecessary to bug the user with a notification. Reported-by: Deepak Khandelwal Suggested-by: Vlad Yasevich Suggested-by: Michael Tuexen Suggested-by: Daniel Borkmann Signed-off-by: Zhu Yanjun Acked-by: Vlad Yasevich Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/sctp/associola.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 06a9ee6b2d3a..aaafb3250c6a 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -813,6 +813,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, else { dst_release(transport->dst); transport->dst = NULL; + ulp_notify = false; } spc_state = SCTP_ADDR_UNREACHABLE; -- cgit v1.2.3 From 793c3b4000a1ef611ae7e5c89bd2a9c6b776cb5e Mon Sep 17 00:00:00 2001 From: Benjamin Block Date: Thu, 21 Aug 2014 19:37:48 +0200 Subject: net: ipv6: fib: don't sleep inside atomic lock The function fib6_commit_metrics() allocates a piece of memory in mode GFP_KERNEL while holding an atomic lock from higher up in the stack, in the function __ip6_ins_rt(). This produces the following BUG: > BUG: sleeping function called from invalid context at mm/slub.c:1250 > in_atomic(): 1, irqs_disabled(): 0, pid: 2909, name: dhcpcd > 2 locks held by dhcpcd/2909: > #0: (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0x17/0x20 > #1: (&tb->tb6_lock){++--+.}, at: [] ip6_route_add+0x65a/0x800 > CPU: 1 PID: 2909 Comm: dhcpcd Not tainted 3.17.0-rc1 #1 > Hardware name: ASUS All Series/Q87T, BIOS 0216 10/16/2013 > 0000000000000008 ffff8800c8f13858 ffffffff81af135a 0000000000000000 > ffff880212202430 ffff8800c8f13878 ffffffff810f8d3a ffff880212202c98 > 0000000000000010 ffff8800c8f138c8 ffffffff8121ad0e 0000000000000001 > Call Trace: > [] dump_stack+0x4e/0x68 > [] __might_sleep+0x10a/0x120 > [] kmem_cache_alloc_trace+0x4e/0x190 > [] ? fib6_commit_metrics+0x66/0x110 > [] fib6_commit_metrics+0x66/0x110 > [] fib6_add+0x883/0xa80 > [] ? ip6_route_add+0x65a/0x800 > [] ip6_route_add+0x675/0x800 > [] ? ip6_route_add+0x6a/0x800 > [] inet6_rtm_newroute+0x5c/0x80 > [] rtnetlink_rcv_msg+0x211/0x260 > [] ? rtnl_lock+0x17/0x20 > [] ? lock_release_holdtime+0x28/0x180 > [] ? rtnl_lock+0x17/0x20 > [] ? __rtnl_unlock+0x20/0x20 > [] netlink_rcv_skb+0x6e/0xd0 > [] rtnetlink_rcv+0x25/0x40 > [] netlink_unicast+0xd9/0x180 > [] netlink_sendmsg+0x700/0x770 > [] ? local_clock+0x25/0x30 > [] sock_sendmsg+0x6c/0x90 > [] ? might_fault+0xa3/0xb0 > [] ? verify_iovec+0x7d/0xf0 > [] ___sys_sendmsg+0x37e/0x3b0 > [] ? trace_hardirqs_on_caller+0x185/0x220 > [] ? mutex_unlock+0xe/0x10 > [] ? netlink_insert+0xbc/0xe0 > [] ? netlink_autobind.isra.30+0x125/0x150 > [] ? netlink_autobind.isra.30+0x60/0x150 > [] ? netlink_bind+0x159/0x230 > [] ? might_fault+0x5a/0xb0 > [] ? SYSC_bind+0x7e/0xd0 > [] __sys_sendmsg+0x4d/0x80 > [] SyS_sendmsg+0x12/0x20 > [] system_call_fastpath+0x16/0x1b Fixing this by replacing the mode GFP_KERNEL with GFP_ATOMIC. Signed-off-by: Benjamin Block Acked-by: David Rientjes Acked-by: Hannes Frederic Sowa Signed-off-by: David S. Miller --- net/ipv6/ip6_fib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index cb4459bd1d29..76b7f5ee8f4c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -643,7 +643,7 @@ static int fib6_commit_metrics(struct dst_entry *dst, if (dst->flags & DST_HOST) { mp = dst_metrics_write_ptr(dst); } else { - mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL); + mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_ATOMIC); if (!mp) return -ENOMEM; dst_init_metrics(dst, mp, 0); -- cgit v1.2.3 From 2ba5af42a7b59ef01f9081234d8855140738defd Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Thu, 21 Aug 2014 21:33:44 +0200 Subject: openvswitch: fix panic with multiple vlan headers When there are multiple vlan headers present in a received frame, the first one is put into vlan_tci and protocol is set to ETH_P_8021Q. Anything in the skb beyond the VLAN TPID may be still non-linear, including the inner TCI and ethertype. While ovs_flow_extract takes care of IP and IPv6 headers, it does nothing with ETH_P_8021Q. Later, if OVS_ACTION_ATTR_POP_VLAN is executed, __pop_vlan_tci pulls the next vlan header into vlan_tci. This leads to two things: 1. Part of the resulting ethernet header is in the non-linear part of the skb. When eth_type_trans is called later as the result of OVS_ACTION_ATTR_OUTPUT, kernel BUGs in __skb_pull. Also, __pop_vlan_tci is in fact accessing random data when it reads past the TPID. 2. network_header points into the ethernet header instead of behind it. mac_len is set to a wrong value (10), too. Reported-by: Yulong Pei Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- net/openvswitch/actions.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index fe5cda0deb39..5231652a95d9 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -42,6 +42,9 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, static int make_writable(struct sk_buff *skb, int write_len) { + if (!pskb_may_pull(skb, write_len)) + return -ENOMEM; + if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) return 0; @@ -70,6 +73,8 @@ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) vlan_set_encap_proto(skb, vhdr); skb->mac_header += VLAN_HLEN; + if (skb_network_offset(skb) < ETH_HLEN) + skb_set_network_header(skb, ETH_HLEN); skb_reset_mac_len(skb); return 0; -- cgit v1.2.3 From ea4f19c1f81d4bf709c74e3789ec785828bc6e51 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Aug 2014 13:03:29 +0200 Subject: net: sctp: spare unnecessary comparison in sctp_trans_elect_best When both transports are the same, we don't have to go down that road only to realize that we will return the very same transport. We are guaranteed that curr is always non-NULL. Therefore, just short-circuit this special case. Signed-off-by: Daniel Borkmann Acked-by: Neil Horman Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/associola.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index aaafb3250c6a..104fae489ad4 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1245,7 +1245,7 @@ static struct sctp_transport *sctp_trans_elect_best(struct sctp_transport *curr, { u8 score_curr, score_best; - if (best == NULL) + if (best == NULL || curr == best) return curr; score_curr = sctp_trans_score(curr); -- cgit v1.2.3 From aa4a83ee8bbc08342c4acfd59ef234cac51a1eef Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 22 Aug 2014 13:03:30 +0200 Subject: net: sctp: fix suboptimal edge-case on non-active active/retrans path selection In SCTP, selection of active (T.ACT) and retransmission (T.RET) transports is being done whenever transport control operations (UP, DOWN, PF, ...) are engaged through sctp_assoc_control_transport(). Commits 4c47af4d5eb2 ("net: sctp: rework multihoming retransmission path selection to rfc4960") and a7288c4dd509 ("net: sctp: improve sctp_select_active_and_retran_path selection") have both improved it towards a more fine-grained and optimal path selection. Currently, the selection algorithm for T.ACT and T.RET is as follows: 1) Elect the two most recently used ACTIVE transports T1, T2 for T.ACT, T.RET, where T.ACT<-T1 and T1 is most recently used 2) In case primary path T.PRI not in {T1, T2} but ACTIVE, set T.ACT<-T.PRI and T.RET<-T1 3) If only T1 is ACTIVE from the set, set T.ACT<-T1 and T.RET<-T1 4) If none is ACTIVE, set T.ACT<-best(T.PRI, T.RET, T3) where T3 is the most recently used (if avail) in PF, set T.RET<-T.PRI Prior to above commits, 4) was simply a camp on T.ACT<-T.PRI and T.RET<-T.PRI, ignoring possible paths in PF. Camping on T.PRI is still slightly suboptimal as it can lead to the following scenario: Setup: T1: p1p1 (10.0.10.10) <==> .'`) <==> p1p1 (10.0.10.12) <= T.PRI T2: p1p2 (10.0.10.20) <==> (_ . ) <==> p1p2 (10.0.10.22) net.sctp.rto_min = 1000 net.sctp.path_max_retrans = 2 net.sctp.pf_retrans = 0 net.sctp.hb_interval = 1000 T.PRI is permanently down, T2 is put briefly into PF state (e.g. due to link flapping). Here, the first time transmission is sent over PF path T2 as it's the only non-INACTIVE path, but the retransmitted data-chunks are sent over the INACTIVE path T1 (T.PRI), which is not good. After the patch, it's choosing better transports in both cases by modifying step 4): 4) If none is ACTIVE, set T.ACT_new<-best(T.ACT_old, T3) where T3 is the most recently used (if avail) in PF, set T.RET<-T.ACT_new This will still select a best possible path in PF if available (which can also include T.PRI/T.RET), and set both T.ACT/T.RET to it. In case sctp_assoc_control_transport() *just* put T.ACT_old into INACTIVE as it transitioned from ACTIVE->PF->INACTIVE and stays in INACTIVE just for a very short while before going back ACTIVE, it will guarantee that this path will be reselected for T.ACT/T.RET since T3 (PF) is not available. Previously, this was not possible, as we would only select between T.PRI and T.RET, and a possible T3 would be NULL due to the fact that we have just transitioned T3 in sctp_assoc_control_transport() from PF->INACTIVE and would select a suboptimal path when T.PRI/T.RET have worse properties. In the case that T.ACT_old permanently went to INACTIVE during this transition and there's no PF path available, plus T.PRI and T.RET are INACTIVE as well, we would now camp on T.ACT_old, but if everything is being INACTIVE there's really not much we can do except hoping for a successful HB to bring one of the transports back up again and, thus cause a new selection through sctp_assoc_control_transport(). Now both tests work fine: Case 1: 1. T1 S(ACTIVE) T.ACT T2 S(ACTIVE) T.RET 2. T1 S(ACTIVE) T.ACT, T.RET T2 S(PF) 3. T1 S(ACTIVE) T.ACT, T.RET T2 S(INACTIVE) 5. T1 S(PF) T.ACT, T.RET T2 S(INACTIVE) [ 5.1 T1 S(INACTIVE) T.ACT, T.RET T2 S(INACTIVE) ] 6. T1 S(ACTIVE) T.ACT, T.RET T2 S(INACTIVE) 7. T1 S(ACTIVE) T.ACT T2 S(ACTIVE) T.RET Case 2: 1. T1 S(ACTIVE) T.ACT T2 S(ACTIVE) T.RET 2. T1 S(PF) T2 S(ACTIVE) T.ACT, T.RET 3. T1 S(INACTIVE) T2 S(ACTIVE) T.ACT, T.RET 5. T1 S(INACTIVE) T2 S(PF) T.ACT, T.RET [ 5.1 T1 S(INACTIVE) T2 S(INACTIVE) T.ACT, T.RET ] 6. T1 S(INACTIVE) T2 S(ACTIVE) T.ACT, T.RET 7. T1 S(ACTIVE) T.ACT T2 S(ACTIVE) T.RET Signed-off-by: Daniel Borkmann Acked-by: Neil Horman Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/associola.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 104fae489ad4..a88b8524846e 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1356,14 +1356,11 @@ static void sctp_select_active_and_retran_path(struct sctp_association *asoc) trans_sec = trans_pri; /* If we failed to find a usable transport, just camp on the - * primary or retran, even if they are inactive, if possible - * pick a PF iff it's the better choice. + * active or pick a PF iff it's the better choice. */ if (trans_pri == NULL) { - trans_pri = sctp_trans_elect_best(asoc->peer.primary_path, - asoc->peer.retran_path); - trans_pri = sctp_trans_elect_best(trans_pri, trans_pf); - trans_sec = asoc->peer.primary_path; + trans_pri = sctp_trans_elect_best(asoc->peer.active_path, trans_pf); + trans_sec = trans_pri; } /* Set the active and retran transports. */ -- cgit v1.2.3