From 9b8805a325591cf5b6b9df71200de25a2bd721fd Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 6 Feb 2017 23:14:11 +0200 Subject: sock: add sk_dst_pending_confirm flag Add new sock flag to allow sockets to confirm neighbour. When same struct dst_entry can be used for many different neighbours we can not use it for pending confirmations. As not all call paths lock the socket use full word for the flag. Add sk_dst_confirm as replacement for dst_confirm when called for received packets. Signed-off-by: Julian Anastasov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 12 ++++++++++++ net/core/sock.c | 2 ++ 2 files changed, 14 insertions(+) diff --git a/include/net/sock.h b/include/net/sock.h index 94e65fd70354..85d856b94b4b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -240,6 +240,7 @@ struct sock_common { * @sk_wq: sock wait queue and async head * @sk_rx_dst: receive input route used by early demux * @sk_dst_cache: destination cache + * @sk_dst_pending_confirm: need to confirm neighbour * @sk_policy: flow policy * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed @@ -393,6 +394,8 @@ struct sock { struct sk_buff_head sk_write_queue; __s32 sk_peek_off; int sk_write_pending; + __u32 sk_dst_pending_confirm; + /* Note: 32bit hole on 64bit arches */ long sk_sndtimeo; struct timer_list sk_timer; __u32 sk_priority; @@ -1764,6 +1767,7 @@ static inline void dst_negative_advice(struct sock *sk) if (ndst != dst) { rcu_assign_pointer(sk->sk_dst_cache, ndst); sk_tx_queue_clear(sk); + sk->sk_dst_pending_confirm = 0; } } } @@ -1774,6 +1778,7 @@ __sk_dst_set(struct sock *sk, struct dst_entry *dst) struct dst_entry *old_dst; sk_tx_queue_clear(sk); + sk->sk_dst_pending_confirm = 0; /* * This can be called while sk is owned by the caller only, * with no state that can be checked in a rcu_dereference_check() cond @@ -1789,6 +1794,7 @@ sk_dst_set(struct sock *sk, struct dst_entry *dst) struct dst_entry *old_dst; sk_tx_queue_clear(sk); + sk->sk_dst_pending_confirm = 0; old_dst = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst); dst_release(old_dst); } @@ -1809,6 +1815,12 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); +static inline void sk_dst_confirm(struct sock *sk) +{ + if (!sk->sk_dst_pending_confirm) + sk->sk_dst_pending_confirm = 1; +} + bool sk_mc_loop(struct sock *sk); static inline bool sk_can_gso(const struct sock *sk) diff --git a/net/core/sock.c b/net/core/sock.c index 8b35debfe454..b74356535559 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -502,6 +502,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { sk_tx_queue_clear(sk); + sk->sk_dst_pending_confirm = 0; RCU_INIT_POINTER(sk->sk_dst_cache, NULL); dst_release(dst); return NULL; @@ -1519,6 +1520,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) af_family_clock_key_strings[newsk->sk_family]); newsk->sk_dst_cache = NULL; + newsk->sk_dst_pending_confirm = 0; newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; atomic_set(&newsk->sk_drops, 0); -- cgit v1.2.3 From 4ff0620354f2b39b9fe2a91c22c4de9d1fba0c8e Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 6 Feb 2017 23:14:12 +0200 Subject: net: add dst_pending_confirm flag to skbuff Add new skbuff flag to allow protocols to confirm neighbour. When same struct dst_entry can be used for many different neighbours we can not use it for pending confirmations. Add sock_confirm_neigh() helper to confirm the neighbour and use it for IPv4, IPv6 and VRF before dst_neigh_output. Signed-off-by: Julian Anastasov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/vrf.c | 5 ++++- include/linux/skbuff.h | 12 ++++++++++++ include/net/sock.h | 14 ++++++++++++++ net/ipv4/ip_output.c | 5 ++++- net/ipv6/ip6_output.c | 1 + 5 files changed, 35 insertions(+), 2 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 264fc1585b3c..630eafdb79e8 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -378,6 +378,7 @@ static int vrf_finish_output6(struct net *net, struct sock *sk, if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); if (!IS_ERR(neigh)) { + sock_confirm_neigh(skb, neigh); ret = dst_neigh_output(dst, neigh, skb); rcu_read_unlock_bh(); return ret; @@ -574,8 +575,10 @@ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *s neigh = __ipv4_neigh_lookup_noref(dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); - if (!IS_ERR(neigh)) + if (!IS_ERR(neigh)) { + sock_confirm_neigh(skb, neigh); ret = dst_neigh_output(dst, neigh, skb); + } rcu_read_unlock_bh(); err: diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c6a78e1892b6..f1adddc1c5ac 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -612,6 +612,7 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1, * @wifi_acked_valid: wifi_acked was set * @wifi_acked: whether frame was acked on wifi or not * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS + * @dst_pending_confirm: need to confirm neighbour * @napi_id: id of the NAPI struct this skb came from * @secmark: security marking * @mark: Generic packet mark @@ -741,6 +742,7 @@ struct sk_buff { __u8 csum_level:2; __u8 csum_bad:1; + __u8 dst_pending_confirm:1; #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif @@ -3698,6 +3700,16 @@ static inline bool skb_rx_queue_recorded(const struct sk_buff *skb) return skb->queue_mapping != 0; } +static inline void skb_set_dst_pending_confirm(struct sk_buff *skb, u32 val) +{ + skb->dst_pending_confirm = val; +} + +static inline bool skb_get_dst_pending_confirm(const struct sk_buff *skb) +{ + return skb->dst_pending_confirm != 0; +} + static inline struct sec_path *skb_sec_path(struct sk_buff *skb) { #ifdef CONFIG_XFRM diff --git a/include/net/sock.h b/include/net/sock.h index 85d856b94b4b..6f83e78eaa5a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1821,6 +1821,20 @@ static inline void sk_dst_confirm(struct sock *sk) sk->sk_dst_pending_confirm = 1; } +static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n) +{ + if (skb_get_dst_pending_confirm(skb)) { + struct sock *sk = skb->sk; + unsigned long now = jiffies; + + /* avoid dirtying neighbour */ + if (n->confirmed != now) + n->confirmed = now; + if (sk && sk->sk_dst_pending_confirm) + sk->sk_dst_pending_confirm = 0; + } +} + bool sk_mc_loop(struct sock *sk); static inline bool sk_can_gso(const struct sock *sk) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index b67719f45953..c9fc32fa3272 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -222,7 +222,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); if (!IS_ERR(neigh)) { - int res = dst_neigh_output(dst, neigh, skb); + int res; + + sock_confirm_neigh(skb, neigh); + res = dst_neigh_output(dst, neigh, skb); rcu_read_unlock_bh(); return res; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index b6a94ff0bbd0..14d99fbf102e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -119,6 +119,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); if (!IS_ERR(neigh)) { + sock_confirm_neigh(skb, neigh); ret = dst_neigh_output(dst, neigh, skb); rcu_read_unlock_bh(); return ret; -- cgit v1.2.3 From c86a773c78025f5b825bacd7b846f4fa60dc0317 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 6 Feb 2017 23:14:13 +0200 Subject: sctp: add dst_pending_confirm flag Add new transport flag to allow sockets to confirm neighbour. When same struct dst_entry can be used for many different neighbours we can not use it for pending confirmations. The flag is propagated from transport to every packet. It is reset when cached dst is reset. Reported-by: YueHaibing Fixes: 5110effee8fd ("net: Do delayed neigh confirmation.") Fixes: f2bb4bedf35d ("ipv4: Cache output routes in fib_info nexthops.") Signed-off-by: Julian Anastasov Acked-by: Eric Dumazet Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 6 ++---- include/net/sctp/structs.h | 4 ++++ net/sctp/associola.c | 3 +-- net/sctp/output.c | 10 +++++++++- net/sctp/outqueue.c | 2 +- net/sctp/sm_make_chunk.c | 6 ++---- net/sctp/sm_sideeffect.c | 2 +- net/sctp/socket.c | 4 ++-- net/sctp/transport.c | 16 +++++++++++++++- 9 files changed, 37 insertions(+), 16 deletions(-) diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 3cfd365bcfbc..480b65a24aff 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -593,10 +593,8 @@ static inline void sctp_v4_map_v6(union sctp_addr *addr) */ static inline struct dst_entry *sctp_transport_dst_check(struct sctp_transport *t) { - if (t->dst && !dst_check(t->dst, t->dst_cookie)) { - dst_release(t->dst); - t->dst = NULL; - } + if (t->dst && !dst_check(t->dst, t->dst_cookie)) + sctp_transport_dst_release(t); return t->dst; } diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 231fa9ac50bd..6a685049f67f 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -804,6 +804,8 @@ struct sctp_transport { __u32 burst_limited; /* Holds old cwnd when max.burst is applied */ + __u32 dst_pending_confirm; /* need to confirm neighbour */ + /* Destination */ struct dst_entry *dst; /* Source address. */ @@ -950,6 +952,8 @@ unsigned long sctp_transport_timeout(struct sctp_transport *); void sctp_transport_reset(struct sctp_transport *); void sctp_transport_update_pmtu(struct sock *, struct sctp_transport *, u32); void sctp_transport_immediate_rtx(struct sctp_transport *); +void sctp_transport_dst_release(struct sctp_transport *t); +void sctp_transport_dst_confirm(struct sctp_transport *t); /* This is the structure we use to queue packets as they come into diff --git a/net/sctp/associola.c b/net/sctp/associola.c index e50dc6d7543f..2a6835b4562b 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -832,8 +832,7 @@ void sctp_assoc_control_transport(struct sctp_association *asoc, if (transport->state != SCTP_UNCONFIRMED) transport->state = SCTP_INACTIVE; else { - dst_release(transport->dst); - transport->dst = NULL; + sctp_transport_dst_release(transport); ulp_notify = false; } diff --git a/net/sctp/output.c b/net/sctp/output.c index 07ab5062e541..814eac047467 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -546,6 +546,7 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) struct sctp_association *asoc = tp->asoc; struct sctp_chunk *chunk, *tmp; int pkt_count, gso = 0; + int confirm; struct dst_entry *dst; struct sk_buff *head; struct sctphdr *sh; @@ -624,7 +625,14 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp) asoc->peer.last_sent_to = tp; } head->ignore_df = packet->ipfragok; - tp->af_specific->sctp_xmit(head, tp); + confirm = tp->dst_pending_confirm; + if (confirm) + skb_set_dst_pending_confirm(head, 1); + /* neighbour should be confirmed on successful transmission or + * positive error + */ + if (tp->af_specific->sctp_xmit(head, tp) >= 0 && confirm) + tp->dst_pending_confirm = 0; out: list_for_each_entry_safe(chunk, tmp, &packet->chunk_list, list) { diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 65abe22d8691..db352e5d61f8 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1654,7 +1654,7 @@ static void sctp_check_transmitted(struct sctp_outq *q, if (forward_progress) { if (transport->dst) - dst_confirm(transport->dst); + sctp_transport_dst_confirm(transport); } } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index ad3445b3408e..c7d3249f88ec 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3333,8 +3333,7 @@ static void sctp_asconf_param_success(struct sctp_association *asoc, local_bh_enable(); list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { - dst_release(transport->dst); - transport->dst = NULL; + sctp_transport_dst_release(transport); } break; case SCTP_PARAM_DEL_IP: @@ -3348,8 +3347,7 @@ static void sctp_asconf_param_success(struct sctp_association *asoc, local_bh_enable(); list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { - dst_release(transport->dst); - transport->dst = NULL; + sctp_transport_dst_release(transport); } break; default: diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index a4552712b882..51abcc90fe75 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -755,7 +755,7 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, * forward progress. */ if (t->dst) - dst_confirm(t->dst); + sctp_transport_dst_confirm(t); /* The receiver of the HEARTBEAT ACK should also perform an * RTT measurement for that destination transport address diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 5fc7122c76de..a4609a0be76d 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -592,7 +592,7 @@ static int sctp_send_asconf_add_ip(struct sock *sk, list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) { /* Clear the source and route cache */ - dst_release(trans->dst); + sctp_transport_dst_release(trans); trans->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380)); trans->ssthresh = asoc->peer.i.a_rwnd; @@ -843,7 +843,7 @@ skip_mkasconf: */ list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { - dst_release(transport->dst); + sctp_transport_dst_release(transport); sctp_transport_route(transport, NULL, sctp_sk(asoc->base.sk)); } diff --git a/net/sctp/transport.c b/net/sctp/transport.c index baa1ac00d7b5..5b63ceb3bf37 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -240,7 +240,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) { /* If we don't have a fresh route, look one up */ if (!transport->dst || transport->dst->obsolete) { - dst_release(transport->dst); + sctp_transport_dst_release(transport); transport->af_specific->get_dst(transport, &transport->saddr, &transport->fl, sk); } @@ -672,3 +672,17 @@ void sctp_transport_immediate_rtx(struct sctp_transport *t) sctp_transport_hold(t); } } + +/* Drop dst */ +void sctp_transport_dst_release(struct sctp_transport *t) +{ + dst_release(t->dst); + t->dst = NULL; + t->dst_pending_confirm = 0; +} + +/* Schedule neighbour confirm */ +void sctp_transport_dst_confirm(struct sctp_transport *t) +{ + t->dst_pending_confirm = 1; +} -- cgit v1.2.3 From c3a2e8370534f810cac6050169db0ed3e0f94f0b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 6 Feb 2017 23:14:14 +0200 Subject: tcp: replace dst_confirm with sk_dst_confirm When same struct dst_entry can be used for many different neighbours we can not use it for pending confirmations. Use the new sk_dst_confirm() helper to propagate the indication from received packets to sock_confirm_neigh(). Reported-by: YueHaibing Fixes: 5110effee8fd ("net: Do delayed neigh confirmation.") Fixes: f2bb4bedf35d ("ipv4: Cache output routes in fib_info nexthops.") Tested-by: YueHaibing Signed-off-by: Julian Anastasov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 12 +++--------- net/ipv4/tcp_metrics.c | 7 ++----- net/ipv4/tcp_output.c | 2 ++ 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 27c95acbb52f..2c0ff327b6df 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3644,11 +3644,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); - if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { - struct dst_entry *dst = __sk_dst_get(sk); - if (dst) - dst_confirm(dst); - } + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) + sk_dst_confirm(sk); if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); @@ -5995,7 +5992,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) break; case TCP_FIN_WAIT1: { - struct dst_entry *dst; int tmo; /* If we enter the TCP_FIN_WAIT1 state and we are a @@ -6022,9 +6018,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_set_state(sk, TCP_FIN_WAIT2); sk->sk_shutdown |= SEND_SHUTDOWN; - dst = __sk_dst_get(sk); - if (dst) - dst_confirm(dst); + sk_dst_confirm(sk); if (!sock_flag(sk, SOCK_DEAD)) { /* Wake up lingering close() */ diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index b9ed0d50aead..0f46e5fe31ad 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -375,12 +375,10 @@ void tcp_update_metrics(struct sock *sk) u32 val; int m; + sk_dst_confirm(sk); if (sysctl_tcp_nometrics_save || !dst) return; - if (dst->flags & DST_HOST) - dst_confirm(dst); - rcu_read_lock(); if (icsk->icsk_backoff || !tp->srtt_us) { /* This session failed to estimate rtt. Why? @@ -493,11 +491,10 @@ void tcp_init_metrics(struct sock *sk) struct tcp_metrics_block *tm; u32 val, crtt = 0; /* cached RTT scaled by 8 */ + sk_dst_confirm(sk); if (!dst) goto reset; - dst_confirm(dst); - rcu_read_lock(); tm = tcp_get_metrics(sk, dst, true); if (!tm) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ccf1ef4dcba4..61f272a99a49 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -980,6 +980,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, skb_set_hash_from_sk(skb, sk); atomic_add(skb->truesize, &sk->sk_wmem_alloc); + skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm); + /* Build TCP header and checksum it. */ th = (struct tcphdr *)skb->data; th->source = inet->inet_sport; -- cgit v1.2.3 From 63fca65d08632fbec9d9b655f671cf08aa1aeeb8 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 6 Feb 2017 23:14:15 +0200 Subject: net: add confirm_neigh method to dst_ops Add confirm_neigh method to dst_ops and use it from IPv4 and IPv6 to lookup and confirm the neighbour. Its usage via the new helper dst_confirm_neigh() should be restricted to MSG_PROBE users for performance reasons. For XFRM prefer the last tunnel address, if present. With help from Steffen Klassert. Signed-off-by: Julian Anastasov Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- include/net/arp.h | 16 ++++++++++++++++ include/net/dst.h | 7 +++++++ include/net/dst_ops.h | 2 ++ include/net/ndisc.h | 17 +++++++++++++++++ net/ipv4/route.c | 19 +++++++++++++++++++ net/ipv6/route.c | 16 ++++++++++++++++ net/xfrm/xfrm_policy.c | 19 +++++++++++++++++++ 7 files changed, 96 insertions(+) diff --git a/include/net/arp.h b/include/net/arp.h index 5e0f891d476c..65619a2de6f4 100644 --- a/include/net/arp.h +++ b/include/net/arp.h @@ -35,6 +35,22 @@ static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 return n; } +static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key) +{ + struct neighbour *n; + + rcu_read_lock_bh(); + n = __ipv4_neigh_lookup_noref(dev, key); + if (n) { + unsigned long now = jiffies; + + /* avoid dirtying neighbour */ + if (n->confirmed != now) + n->confirmed = now; + } + rcu_read_unlock_bh(); +} + void arp_init(void); int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg); void arp_send(int type, int ptype, __be32 dest_ip, diff --git a/include/net/dst.h b/include/net/dst.h index 6835d224d47b..3a3b34b83b00 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -477,6 +477,13 @@ static inline struct neighbour *dst_neigh_lookup_skb(const struct dst_entry *dst return IS_ERR(n) ? NULL : n; } +static inline void dst_confirm_neigh(const struct dst_entry *dst, + const void *daddr) +{ + if (dst->ops->confirm_neigh) + dst->ops->confirm_neigh(dst, daddr); +} + static inline void dst_link_failure(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 8a2b66d8d78d..c84b3287e38b 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -33,6 +33,8 @@ struct dst_ops { struct neighbour * (*neigh_lookup)(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); + void (*confirm_neigh)(const struct dst_entry *dst, + const void *daddr); struct kmem_cache *kmem_cachep; diff --git a/include/net/ndisc.h b/include/net/ndisc.h index d562a2fe4860..8a0214654b6b 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -391,6 +391,23 @@ static inline struct neighbour *__ipv6_neigh_lookup(struct net_device *dev, cons return n; } +static inline void __ipv6_confirm_neigh(struct net_device *dev, + const void *pkey) +{ + struct neighbour *n; + + rcu_read_lock_bh(); + n = __ipv6_neigh_lookup_noref(dev, pkey); + if (n) { + unsigned long now = jiffies; + + /* avoid dirtying neighbour */ + if (n->confirmed != now) + n->confirmed = now; + } + rcu_read_unlock_bh(); +} + int ndisc_init(void); int ndisc_late_init(void); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 4b7c231c1aef..cb494a5050f7 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -154,6 +154,7 @@ static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); +static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr); static struct dst_ops ipv4_dst_ops = { .family = AF_INET, @@ -168,6 +169,7 @@ static struct dst_ops ipv4_dst_ops = { .redirect = ip_do_redirect, .local_out = __ip_local_out, .neigh_lookup = ipv4_neigh_lookup, + .confirm_neigh = ipv4_confirm_neigh, }; #define ECN_OR_COST(class) TC_PRIO_##class @@ -461,6 +463,23 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, return neigh_create(&arp_tbl, pkey, dev); } +static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) +{ + struct net_device *dev = dst->dev; + const __be32 *pkey = daddr; + const struct rtable *rt; + + rt = (const struct rtable *)dst; + if (rt->rt_gateway) + pkey = (const __be32 *)&rt->rt_gateway; + else if (!daddr || + (rt->rt_flags & + (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) + return; + + __ipv4_confirm_neigh(dev, *(__force u32 *)pkey); +} + #define IP_IDENTS_SZ 2048u static atomic_t *ip_idents __read_mostly; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8ffa24cc8899..98b183f1bc8b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -223,6 +223,21 @@ static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, return neigh_create(&nd_tbl, daddr, dst->dev); } +static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) +{ + struct net_device *dev = dst->dev; + struct rt6_info *rt = (struct rt6_info *)dst; + + daddr = choose_neigh_daddr(rt, NULL, daddr); + if (!daddr) + return; + if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) + return; + if (ipv6_addr_is_multicast((const struct in6_addr *)daddr)) + return; + __ipv6_confirm_neigh(dev, daddr); +} + static struct dst_ops ip6_dst_ops_template = { .family = AF_INET6, .gc = ip6_dst_gc, @@ -239,6 +254,7 @@ static struct dst_ops ip6_dst_ops_template = { .redirect = rt6_do_redirect, .local_out = __ip6_local_out, .neigh_lookup = ip6_neigh_lookup, + .confirm_neigh = ip6_confirm_neigh, }; static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 99ad1af2927f..0a0f63d3cc96 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2856,6 +2856,23 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, return dst->path->ops->neigh_lookup(dst, skb, daddr); } +static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr) +{ + const struct dst_entry *path = dst->path; + + for (; dst != path; dst = dst->child) { + const struct xfrm_state *xfrm = dst->xfrm; + + if (xfrm->props.mode == XFRM_MODE_TRANSPORT) + continue; + if (xfrm->type->flags & XFRM_TYPE_REMOTE_COADDR) + daddr = xfrm->coaddr; + else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR)) + daddr = &xfrm->id.daddr; + } + path->ops->confirm_neigh(path, daddr); +} + int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) { int err = 0; @@ -2882,6 +2899,8 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) dst_ops->link_failure = xfrm_link_failure; if (likely(dst_ops->neigh_lookup == NULL)) dst_ops->neigh_lookup = xfrm_neigh_lookup; + if (likely(!dst_ops->confirm_neigh)) + dst_ops->confirm_neigh = xfrm_confirm_neigh; if (likely(afinfo->garbage_collect == NULL)) afinfo->garbage_collect = xfrm_garbage_collect_deferred; rcu_assign_pointer(xfrm_policy_afinfo[afinfo->family], afinfo); -- cgit v1.2.3 From 0dec879f636f11b0ffda1cb5fd96a1754c59ead3 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 6 Feb 2017 23:14:16 +0200 Subject: net: use dst_confirm_neigh for UDP, RAW, ICMP, L2TP When same struct dst_entry can be used for many different neighbours we can not use it for pending confirmations. The datagram protocols can use MSG_CONFIRM to confirm the neighbour. When used with MSG_PROBE we do not reach the code where neighbour is confirmed, so we have to do the same slow lookup by using the dst_confirm_neigh() helper. When MSG_PROBE is not used, ip_append_data/ip6_append_data will set the skb flag dst_pending_confirm. Reported-by: YueHaibing Fixes: 5110effee8fd ("net: Do delayed neigh confirmation.") Fixes: f2bb4bedf35d ("ipv4: Cache output routes in fib_info nexthops.") Signed-off-by: Julian Anastasov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 6 ++++++ net/ipv4/ping.c | 3 ++- net/ipv4/raw.c | 6 +++++- net/ipv4/udp.c | 3 ++- net/ipv6/ip6_output.c | 6 ++++++ net/ipv6/raw.c | 6 +++++- net/ipv6/route.c | 27 ++++++++++++++------------- net/ipv6/udp.c | 3 ++- net/l2tp/l2tp_ip6.c | 3 ++- 9 files changed, 44 insertions(+), 19 deletions(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c9fc32fa3272..7a719f1ae556 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -889,6 +889,9 @@ static inline int ip_ufo_append_data(struct sock *sk, skb->csum = 0; + if (flags & MSG_CONFIRM) + skb_set_dst_pending_confirm(skb, 1); + __skb_queue_tail(queue, skb); } else if (skb_is_gso(skb)) { goto append; @@ -1089,6 +1092,9 @@ alloc_new_skb: exthdrlen = 0; csummode = CHECKSUM_NONE; + if ((flags & MSG_CONFIRM) && !skb_prev) + skb_set_dst_pending_confirm(skb, 1); + /* * Put the packet on the pending queue. */ diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 592db6a3a0e9..6ee792d83d5b 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -848,7 +848,8 @@ out: return err; do_confirm: - dst_confirm(&rt->dst); + if (msg->msg_flags & MSG_PROBE) + dst_confirm_neigh(&rt->dst, &fl4.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 4e49e5cb001c..8119e1f66e03 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -383,6 +383,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, sock_tx_timestamp(sk, sockc->tsflags, &skb_shinfo(skb)->tx_flags); + if (flags & MSG_CONFIRM) + skb_set_dst_pending_confirm(skb, 1); + skb->transport_header = skb->network_header; err = -EFAULT; if (memcpy_from_msg(iph, msg, length)) @@ -666,7 +669,8 @@ out: return len; do_confirm: - dst_confirm(&rt->dst); + if (msg->msg_flags & MSG_PROBE) + dst_confirm_neigh(&rt->dst, &fl4.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cf6ba3387401..4a1ba04565d1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1101,7 +1101,8 @@ out: return err; do_confirm: - dst_confirm(&rt->dst); + if (msg->msg_flags & MSG_PROBE) + dst_confirm_neigh(&rt->dst, &fl4->daddr); if (!(msg->msg_flags&MSG_PROBE) || len) goto back_from_confirm; err = 0; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 14d99fbf102e..d299040613a0 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1145,6 +1145,9 @@ static inline int ip6_ufo_append_data(struct sock *sk, skb->protocol = htons(ETH_P_IPV6); skb->csum = 0; + if (flags & MSG_CONFIRM) + skb_set_dst_pending_confirm(skb, 1); + __skb_queue_tail(queue, skb); } else if (skb_is_gso(skb)) { goto append; @@ -1517,6 +1520,9 @@ alloc_new_skb: exthdrlen = 0; dst_exthdrlen = 0; + if ((flags & MSG_CONFIRM) && !skb_prev) + skb_set_dst_pending_confirm(skb, 1); + /* * Put the packet on the pending queue */ diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ea89073c8247..f174e76e6505 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -654,6 +654,9 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, skb->ip_summed = CHECKSUM_NONE; + if (flags & MSG_CONFIRM) + skb_set_dst_pending_confirm(skb, 1); + skb->transport_header = skb->network_header; err = memcpy_from_msg(iph, msg, length); if (err) @@ -934,7 +937,8 @@ out: txopt_put(opt_to_free); return err < 0 ? err : len; do_confirm: - dst_confirm(dst); + if (msg->msg_flags & MSG_PROBE) + dst_confirm_neigh(dst, &fl6.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 98b183f1bc8b..f54f4265b37f 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1381,6 +1381,7 @@ static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, const struct ipv6hdr *iph, u32 mtu) { + const struct in6_addr *daddr, *saddr; struct rt6_info *rt6 = (struct rt6_info *)dst; if (rt6->rt6i_flags & RTF_LOCAL) @@ -1389,26 +1390,26 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (dst_metric_locked(dst, RTAX_MTU)) return; - dst_confirm(dst); + if (iph) { + daddr = &iph->daddr; + saddr = &iph->saddr; + } else if (sk) { + daddr = &sk->sk_v6_daddr; + saddr = &inet6_sk(sk)->saddr; + } else { + daddr = NULL; + saddr = NULL; + } + dst_confirm_neigh(dst, daddr); mtu = max_t(u32, mtu, IPV6_MIN_MTU); if (mtu >= dst_mtu(dst)) return; if (!rt6_cache_allowed_for_pmtu(rt6)) { rt6_do_update_pmtu(rt6, mtu); - } else { - const struct in6_addr *daddr, *saddr; + } else if (daddr) { struct rt6_info *nrt6; - if (iph) { - daddr = &iph->daddr; - saddr = &iph->saddr; - } else if (sk) { - daddr = &sk->sk_v6_daddr; - saddr = &inet6_sk(sk)->saddr; - } else { - return; - } nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr); if (nrt6) { rt6_do_update_pmtu(nrt6, mtu); @@ -2332,7 +2333,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu * Look, redirects are sent only in response to data packets, * so that this nexthop apparently is reachable. --ANK */ - dst_confirm(&rt->dst); + dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr); neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1); if (!neigh) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b4c6516a3a0c..51346fa70298 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1308,7 +1308,8 @@ out: return err; do_confirm: - dst_confirm(dst); + if (msg->msg_flags & MSG_PROBE) + dst_confirm_neigh(dst, &fl6.daddr); if (!(msg->msg_flags&MSG_PROBE) || len) goto back_from_confirm; err = 0; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 4b06eb415f68..734798a00ca0 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -658,7 +658,8 @@ out: return err < 0 ? err : len; do_confirm: - dst_confirm(dst); + if (msg->msg_flags & MSG_PROBE) + dst_confirm_neigh(dst, &fl6.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; -- cgit v1.2.3 From 51ce8bd4d17a761e1a90a34a1b5c9b762cce7553 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 6 Feb 2017 23:14:17 +0200 Subject: net: pending_confirm is not used anymore When same struct dst_entry can be used for many different neighbours we can not use it for pending confirmations. As last step, we can remove the pending_confirm flag. Reported-by: YueHaibing Fixes: 5110effee8fd ("net: Do delayed neigh confirmation.") Fixes: f2bb4bedf35d ("ipv4: Cache output routes in fib_info nexthops.") Signed-off-by: Julian Anastasov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/dst.h | 14 ++------------ net/core/dst.c | 1 - 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/include/net/dst.h b/include/net/dst.h index 3a3b34b83b00..84a1043dd6a1 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -59,8 +59,6 @@ struct dst_entry { #define DST_XFRM_QUEUE 0x0100 #define DST_METADATA 0x0200 - unsigned short pending_confirm; - short error; /* A non-zero value of dst->obsolete forces by-hand validation @@ -78,6 +76,8 @@ struct dst_entry { #define DST_OBSOLETE_KILL -2 unsigned short header_len; /* more space at head required */ unsigned short trailer_len; /* space to reserve at tail */ + unsigned short __pad3; + #ifdef CONFIG_IP_ROUTE_CLASSID __u32 tclassid; #else @@ -440,7 +440,6 @@ static inline void dst_rcu_free(struct rcu_head *head) static inline void dst_confirm(struct dst_entry *dst) { - dst->pending_confirm = 1; } static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n, @@ -448,15 +447,6 @@ static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n, { const struct hh_cache *hh; - if (dst->pending_confirm) { - unsigned long now = jiffies; - - dst->pending_confirm = 0; - /* avoid dirtying neighbour */ - if (n->confirmed != now) - n->confirmed = now; - } - hh = &n->hh; if ((n->nud_state & NUD_CONNECTED) && hh->hh_len) return neigh_hh_output(hh, skb); diff --git a/net/core/dst.c b/net/core/dst.c index b5cbbe07f786..960e503b5a52 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -190,7 +190,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; - dst->pending_confirm = 0; dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); -- cgit v1.2.3